# biotex_wrapper.py


import os,glob,shutil,sys,time
from pathlib import Path

import pandas as pd

# Absolute path of the current user's home directory
# (used to locate the TreeTagger installation by default).
home = str(Path.home())


class BiotexWrapper():
    """
    Wrapper to execute and return the result from the Biotex program.

    See more about Biotex at: http://tubo.lirmm.fr:8080/biotex/index.jsp
    """

    def __init__(self,
        biotex_jar_path = os.path.join(os.path.dirname(__file__),"Biotex.jar"),
        pattern_path = os.path.join(os.path.dirname(__file__),"patterns"),
        dataset_src = os.path.join(os.path.dirname(__file__),"dataSetReference"),
        stopwords_src = os.path.join(os.path.dirname(__file__),"stopWords"),
        treetagger_src = os.path.join(str(Path.home()),".tree-tagger/"),
        type_of_terms = "all",
        language = "french",
        score = "F-TFIDF-C_M",
        patron_number = "3"):
        """
        Constructor.

        Writes the ``configuration.txt`` file read by the Biotex jar.

        Parameters
        ----------
        biotex_jar_path : str, optional
            Filepath of the Biotex jar, by default ``Biotex.jar`` next to this module.
        pattern_path : str, optional
            Directory that contains pre-defined patterns, by default ``patterns`` next to this module.
        dataset_src : str, optional
            Filepath of the reference datasets used by Biotex, by default ``dataSetReference`` next to this module.
        stopwords_src : str, optional
            Directory that contains stopwords for each language, by default ``stopWords`` next to this module.
        treetagger_src : str, optional
            Directory that contains TreeTagger, by default ``~/.tree-tagger/``.
        type_of_terms : str, optional
            Type of terms you want to extract, by default "all".
        language : str, optional
            Language of the data, by default "french".
        score : str, optional
            Score used to sort the extracted terms, by default "F-TFIDF-C_M".
        patron_number : str, optional
            Number of patterns used to extract terms, by default "3".
        """
        self.biotexJarPath = biotex_jar_path
        # Keys must match the option names expected by the Biotex jar.
        self.configuration = {
            "patternsSrc": pattern_path,
            "datasetSrc": dataset_src,
            "stopwordsSrc": stopwords_src,
            "treetaggerSrc": treetagger_src,
            "typeOfTerms": type_of_terms,
            "language": language,
            "score": score,
            "patronNumber": patron_number,
        }

        self.write_conf(self.configuration)
        self.output_data = None

    def create_corpus_from_txt(self, list_of_text: list, outputfn="output.txt"):
        """
        Transform a corpus (a list of str) into the Biotex input format.

        Parameters
        ----------
        list_of_text : list
            List of str, one entry per document.
        outputfn : str, optional
            Output filename (the default is "output.txt").
        """
        sep = "\n##########END##########\n"
        # Biotex expects a separator line after every document; the final
        # trailing newline is stripped (matches the historical output).
        with open(outputfn, 'w') as output:
            output.write((sep.join(list_of_text) + sep).strip())

    def write_conf(self, confDict):
        """
        Create the ``configuration.txt`` file used to execute Biotex.

        Parameters
        ----------
        confDict : dict
            Mapping of Biotex option name to value, written as ``key=value`` lines.
        """
        with open("configuration.txt", 'w') as conf:
            for key, value in confDict.items():
                conf.write("{0}={1}\n".format(key, value))

    def extract_terminology(self, inputFile, nbGram="ALL"):
        """
        Execute Biotex on ``inputFile`` and parse the result it produces.

        Parameters
        ----------
        inputFile : str
            Filename of a corpus in Biotex input format.
        nbGram : str or int, optional
            Gram size to extract: 1-4, or "ALL" (the default).

        Returns
        -------
        list or bool
            List of ``[term, in_umls, rank]`` rows, or False on error.
        """
        if isinstance(nbGram, str) and nbGram != "ALL":
            print("Error : Except 'ALL' value, nbGram args in extractTerminology method can't take string arg !!!\nAvailable values: 'ALL',1,2,3,4")
            return False
        if isinstance(nbGram, int) and (nbGram > 4 or nbGram < 0):
            print("Error : nbGram value : {0} is forbidden!\nAvailable values: 'ALL',1,2,3,4 ".format(nbGram))
            return False

        debut = time.time()
        status = os.system("java -Xms6g -Xmx10g -jar {0} {1}".format(self.biotexJarPath, inputFile))
        print("Done in {0} sec".format(time.time() - debut))
        # os.system() returns the raw wait status: on POSIX an exit code of 1
        # arrives as 256, so the original `status == 1` check never fired.
        # Any non-zero status means the java process failed.
        if status != 0:
            print("Biotex java program has crashed !")
            return False
        if not os.path.exists("output"):
            os.makedirs("output")

        # Biotex writes one file per gram size plus an aggregate ALL file.
        if isinstance(nbGram, int):
            result_fn = "output/t{0}gram.txt".format(nbGram)
        else:
            result_fn = "output/ALL_gram.txt"
        with open(result_fn, 'r') as result:
            output = result.read()

        data = []
        for line in output.split("\n"):
            parsed = line.split(";")
            if len(parsed) == 3:
                parsed[1] = int(parsed[1])    # in_umls flag
                parsed[2] = float(parsed[2])  # ranking score
                data.append(parsed)

        # Clean up the artefacts produced by the java program.
        shutil.rmtree('output')
        for f in glob.glob("to_tag_*.txt"):
            os.remove(f)
        self.output_data = data
        return self.output_data

    def terminology(self, corpus):
        """
        Extract a terminology from a corpus and return it as a DataFrame.

        Parameters
        ----------
        corpus : list
            List of str, one entry per document.

        Returns
        -------
        pandas.DataFrame
            Columns: term, in_umls, rank.
        """
        try:
            self.create_corpus_from_txt(corpus)
        except (OSError, TypeError) as err:
            # Narrowed from a bare `except:` that hid the real cause.
            raise Exception("Error while creating file !") from err
        return pd.DataFrame(self.extract_terminology("output.txt"),
                            columns="term in_umls rank".split())


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="Your Biotex input filename")
    parser.add_argument('-s', "--sizeOfGram", help="Gram size of the term you want to extract")
    parser.add_argument('-o', "--output", help="Output filename")
    parser.add_argument('-d', "--debug", action="store_true", help="debug activated")

    args = parser.parse_args()
    if args.debug:
        print(args)

    wrap = BiotexWrapper()

    # Normalise the requested gram size: an int when possible, the literal
    # "ALL" otherwise.  (The original code left sGram undefined when
    # --sizeOfGram was exactly 'ALL', raising NameError further down.)
    if args.sizeOfGram and args.sizeOfGram != 'ALL':
        try:
            sGram = int(args.sizeOfGram)
        except ValueError:
            sGram = args.sizeOfGram
    else:
        sGram = "ALL"

    data = wrap.extract_terminology(args.input, nbGram=sGram)
    if data is False:
        # extract_terminology already printed the reason; fail loudly
        # instead of crashing while iterating over `False`.
        raise SystemExit(1)

    with open(args.output if args.output else "GRAM_ALL.txt", 'w') as out_:
        for row in data:
            out_.write("\t".join(map(str, row)) + "\n")