# biotex_wrapper.py

import glob
import os
import shutil
import subprocess
import sys
import time
from pathlib import Path
home = str(Path.home())
class BiotexWrapper():
    """
    Wrapper to execute and return the result from the Biotex program.
    See more about Biotex at: http://tubo.lirmm.fr:8080/biotex/index.jsp

    Creating an instance writes ``configuration.txt`` into the current
    working directory (see ``write_conf``).
    """

    # Marker written after every document of a Biotex corpus file.
    _DOC_SEPARATOR = "\n##########END##########\n"

    def __init__(self,**kwargs):
        """
        Build the Biotex configuration and write it to ``configuration.txt``.

        Parameters
        ----------
        biotex_jar_path : str, optional
            path of the Biotex jar (default: ``Biotex.jar`` next to this module)
        configuration : dict, optional
            complete configuration; when given it is used as-is and the
            individual options below are ignored
        pattern_path, datasetRef_path, stop_words_path : str, optional
            values of the ``patternsSrc``, ``datasetSrc`` and ``stopwordsSrc``
            entries (defaults: directories next to this module)
        treetagger_bin_path : str, optional
            value of ``treetaggerSrc`` (default: ``~/.tree-tagger/``)
        type_of_terms, lang, score, patronNumber : optional
            values of ``typeOfTerms`` ("all"), ``language`` ("french"),
            ``score`` ("F-TFIDF-C_M") and ``patronNumber`` (3)
        """
        module_dir=os.path.dirname(__file__)
        self.biotexJarPath=kwargs.get("biotex_jar_path",os.path.join(module_dir,"Biotex.jar"))

        self.configuration=kwargs.get("configuration",{
            "patternsSrc":kwargs.get("pattern_path",os.path.join(module_dir,"patterns")),
            "datasetSrc":kwargs.get("datasetRef_path",os.path.join(module_dir,"dataSetReference")),
            "stopwordsSrc":kwargs.get("stop_words_path",os.path.join(module_dir,"stopWords")),
            "treetaggerSrc":kwargs.get("treetagger_bin_path",os.path.join(home,".tree-tagger/")),
            "typeOfTerms":kwargs.get("type_of_terms","all"),
            "language":kwargs.get("lang","french"),
            "score":kwargs.get("score","F-TFIDF-C_M"),
            "patronNumber":kwargs.get("patronNumber",3)
        })

        self.write_conf(self.configuration)

        # Result of the last successful extract_terminology() call.
        self.output_data=None


    def create_corpus_from_txt_files(self,list_of_text_filename : list, outputfn="output.txt"):
        """
        Format corpus made of multiple file into Biotex Corpus Format

        Parameters
        ----------
        list_of_text_filename : list
            list of filenames of the corpus
        outputfn : str, optional
            output filename (the default is "output.txt")

        Returns
        -------
        str
            output filename
        """
        documents=[]
        for filename in list_of_text_filename:
            # Context manager so each input handle is closed right away.
            with open(filename,'r') as text_file:
                documents.append(text_file.read())
        return self.create_corpus_from_txt(documents,outputfn=outputfn)

    def create_corpus_from_txt(self,list_of_text : list, outputfn="output.txt"):
        """
        Transform a corpus (based on a list of str) into BiotexFormat

        Parameters
        ----------
        list_of_text : list
            list of str, one per document
        outputfn : str, optional
            output filename (the default is "output.txt")

        Returns
        -------
        str
            output filename
        """
        sep=self._DOC_SEPARATOR
        # strip() drops the newline that follows the last separator (and any
        # whitespace surrounding the corpus as a whole).
        content=(sep.join(list_of_text)+sep).strip()
        with open(outputfn,'w') as corpus_file:
            corpus_file.write(content)
        return outputfn


    def write_conf(self,confDict):
        """
        Create the configuration file to execute Biotex

        Writes one ``key=value`` line per entry of ``confDict`` into
        ``configuration.txt`` in the current working directory.
        """
        with open("configuration.txt",'w') as conf_file:
            for key,value in confDict.items():
                conf_file.write("{0}={1}\n".format(key,value))

    def extract_terminology(self,inputFile,nbGram="ALL"):
        """
        Execute and extract the result returned by Biotex

        Parameters
        ----------
        inputFile : str
            corpus file passed to the Biotex jar
        nbGram : str or int, optional
            "ALL" (the default) or a gram size among 1, 2, 3, 4

        Returns
        -------
        list or bool
            list of ``[field, int, float]`` rows parsed from the ';'-separated
            result file, or False when ``nbGram`` is invalid, the Java program
            cannot be started or fails, or the result file is missing
        """
        if isinstance(nbGram,str):
            if nbGram != "ALL":
                print("Error : Except 'ALL' value, nbGram args in extractTerminology method can't take string arg !!!\nAvailable values: 'ALL',1,2,3,4")
                return False
        elif isinstance(nbGram,bool) or not isinstance(nbGram,int):
            # bool is a subclass of int, so it has to be rejected explicitly.
            print("Error: Wrong args type :{0}!\nAvailable values: 'ALL',1,2,3,4 ".format(type(nbGram)))
            return False
        elif nbGram > 4 or nbGram < 1:
            print("Error : nbGram value : {0} is forbidden!\nAvailable values: 'ALL',1,2,3,4 ".format(nbGram))
            return False

        debut=time.time()
        try:
            # Argument list instead of a shell string: paths containing
            # spaces or shell metacharacters are passed through untouched.
            completed=subprocess.run(
                ["java","-Xms6g","-Xmx10g","-jar",str(self.biotexJarPath),str(inputFile)]
            )
        except OSError as err:
            print("Biotex java program could not be started: {0}".format(err))
            return False
        print("Done in {0} sec".format(time.time()-debut))
        # Any non-zero exit code is a failure (os.system() returned a wait
        # status on Unix, so comparing it with 1 never matched there).
        if completed.returncode != 0:
            print("Biotex java program has crashed !")
            return False

        if isinstance(nbGram,int):
            result_path="output/t{0}gram.txt".format(nbGram)
        else:
            result_path="output/ALL_gram.txt"
        if not os.path.isfile(result_path):
            print("Biotex did not produce the expected result file: {0}".format(result_path))
            return False

        data=[]
        with open(result_path,'r') as result_file:
            for line in result_file:
                parsed=line.rstrip("\n").split(";")
                # Only lines with exactly three fields are result rows.
                if len(parsed) == 3:
                    parsed[1]=int(parsed[1])
                    parsed[2]=float(parsed[2])
                    data.append(parsed)
        # The result directory is only a transient exchange area.
        shutil.rmtree('output')
        self.output_data=data
        return self.output_data

def parse_gram_size(raw_size):
    """
    Convert the ``--sizeOfGram`` command-line value into an ``nbGram`` value.

    Parameters
    ----------
    raw_size : str or None
        raw option value; None or "" when the option was not given

    Returns
    -------
    str or int
        "ALL" when the option is missing or equal to "ALL", the integer value
        when the text is an integer, otherwise the text unchanged so that
        ``extract_terminology`` reports the invalid value itself
    """
    if not raw_size or raw_size == "ALL":
        return "ALL"
    try:
        return int(raw_size)
    except ValueError:
        return raw_size


if __name__ == '__main__':
    import argparse
    parser= argparse.ArgumentParser()
    parser.add_argument("input",help="Your Biotex input filename")
    parser.add_argument('-s',"--sizeOfGram",help="Gram size of the term you want to extreact")
    parser.add_argument('-o',"--output",help="Output filename")
    parser.add_argument('-d',"--debug",action="store_true",help="debug activated")

    args=parser.parse_args()
    if args.debug:
        print(args)

    wrap=BiotexWrapper()
    data=wrap.extract_terminology(args.input,nbGram=parse_gram_size(args.sizeOfGram))
    # extract_terminology() signals failure with False (after printing the
    # reason); stop here instead of iterating over a bool.
    if data is False:
        sys.exit(1)

    # One tab-separated line per extracted row.
    with open((args.output if args.output else "GRAM_ALL.txt"),'w') as out_:
        for d in data:
            out_.write("\t".join(map(str, d))+"\n")