simplify wrapper

696d4787 · Fize Jacques · 2e4c22c9 · 696d4787
Commit 696d4787 authored 5 years ago by Fize Jacques
Hide whitespace changes
Inline Side-by-side

Showing

with 62 additions and 37 deletions
+62 -37
--- a/biotex/biotex_wrapper.py
+++ b/biotex/biotex_wrapper.py
 import os,glob,shutil,sys,time
 from pathlib import Path
+import pandas as pd
 home = str(Path.home())
 class BiotexWrapper():
    """
    Wrapper to execute the Biotex program and return its results
    See more about Biotex at: http://tubo.lirmm.fr:8080/biotex/index.jsp
    """
-    def __init__(self,**kwargs):
+    def __init__(self,
-        self.biotexJarPath=kwargs.get("biotex_jar_path",os.path.join(os.path.dirname(__file__),"Biotex.jar"))
+        biotex_jar_path = os.path.join(os.path.dirname(__file__),"Biotex.jar"),
+        pattern_path = os.path.join(os.path.dirname(__file__),"patterns"),
-        self.configuration=kwargs.get("configuration",{
+        dataset_src = os.path.join(os.path.dirname(__file__),"dataSetReference") ,
-            "patternsSrc":kwargs.get("pattern_path",os.path.join(os.path.dirname(__file__),"patterns")),
+        stopwords_src = os.path.join(os.path.dirname(__file__),"stopWords"),
-            "datasetSrc":kwargs.get("datasetRef_path",os.path.join(os.path.dirname(__file__),"dataSetReference")),
+        treetagger_src = os.path.join(home,".tree-tagger/"),
-            "stopwordsSrc":kwargs.get("stop_words_path",os.path.join(os.path.dirname(__file__),"stopWords")),
+        type_of_terms = "all",
-            "treetaggerSrc":kwargs.get("treetagger_bin_path",os.path.join(home,".tree-tagger/")),
+        language = "french",
-            "typeOfTerms":kwargs.get("type_of_terms","all"),
+        score = "F-TFIDF-C_M",
-            "language":kwargs.get("lang","french"),
+        patron_number = "3"):
-            "score":kwargs.get("score","F-TFIDF-C_M"),
-	    "patronNumber":kwargs.get("patronNumber",3)
-        })
-        self.write_conf(self.configuration)
-        self.output_data=None
-    def create_corpus_from_txt_files(self,list_of_text_filename : list, outputfn="output.txt"):
        """
-        Format corpus made of multiple file into Biotex Corpus Format
+        Constructor
        Parameters
        ----------
-        list_of_text_filename : list
+        biotex_jar_path : str, optional
-            list of filenames ot the corpus
+            Filepath of Biotex jar, by default os.path.join(os.path.dirname(__file__),"Biotex.jar")
-        outputfn : str, optional
+        pattern_path : str, optional
-            output filename (the default is "output.txt")
+            Directory that contains pre-defined patterns, by default os.path.join(os.path.dirname(__file__),"patterns")
+        dataset_src : str, optional
-        Returns
+            FilePath of datasets used by Biotex, by default os.path.join(os.path.dirname(__file__),"dataSetReference")
-        -------
+        stopwords_src : str, optional
-        str
+            Path of the directory that contains stopwords for each language, by default os.path.join(os.path.dirname(__file__),"stopWords")
-            output filename
+        treetagger_src : str, optional
+            Path of the directory that contains TreeTagger, by default os.path.join(home,".tree-tagger/")
+        type_of_terms : str, optional
+            number of terms you want to extract, by default "all"
+        language : str, optional
+            language of the data, by default "french"
+        score : str, optional
+            score used to sort the extracted terms, by default "F-TFIDF-C_M"
+        patron_number : str, optional
+            number of patterns used to extract terms, by default "3"
        """
-        corpus=[]
-        sep="\n##########END##########\n"
+        self.biotexJarPath=biotex_jar_path
-        for file in list_of_text_filename:
+        self.configuration={
-            corpus.append(open(file,'r').read())
+            "patternsSrc":pattern_path,
-        open(outputfn,'w').write((sep.join(corpus)+sep).strip())
+            "datasetSrc":dataset_src,
-        return outputfn
+            "stopwordsSrc":stopwords_src,
+            "treetaggerSrc":treetagger_src,
+            "typeOfTerms":type_of_terms,
+            "language":language,
+            "score":score,
+	        "patronNumber":patron_number
+        }
+        self.write_conf(self.configuration)
+        self.output_data=None
    def create_corpus_from_txt(self,list_of_text : list, outputfn="output.txt"):
        """
@@ -115,14 +130,24 @@ class BiotexWrapper():
                parsed[2]=float(parsed[2])
                data.append(parsed)
        shutil.rmtree('output')
+        for f in glob.glob("to_tag_*.txt"):
+            os.remove(f)
        self.output_data=data
        return self.output_data
+    def terminology(self, corpus):
+        try:
+            self.create_corpus_from_txt(corpus)
+        except:
+            raise Exception("Error while creating file !")
+        return pd.DataFrame(self.extract_terminology("output.txt"), columns = "term in_umls rank".split())
 if __name__ == '__main__':
    import argparse
    parser= argparse.ArgumentParser()
    parser.add_argument("input",help="Your Biotex input filename")
-    parser.add_argument('-s',"--sizeOfGram",help="Gram size of the term you want to extreact")
+    parser.add_argument('-s',"--sizeOfGram",help="Gram size of the term you want to extract")
    parser.add_argument('-o',"--output",help="Output filename")
    parser.add_argument('-d',"--debug",action="store_true",help="debug activated")