Commit 696d4787 authored by Fize Jacques's avatar Fize Jacques
Browse files

simplify wrapper

parent 2e4c22c9
No related merge requests found
Showing with 62 additions and 37 deletions
+62 -37
import os,glob,shutil,sys,time import os,glob,shutil,sys,time
from pathlib import Path from pathlib import Path
import pandas as pd
home = str(Path.home()) home = str(Path.home())
class BiotexWrapper(): class BiotexWrapper():
""" """
Wrapper to execute and return the result from the Biotex program Wrapper to execute and return the result from the Biotex program
See more about Biotex at: http://tubo.lirmm.fr:8080/biotex/index.jsp See more about Biotex at: http://tubo.lirmm.fr:8080/biotex/index.jsp
""" """
def __init__(self,**kwargs): def __init__(self,
self.biotexJarPath=kwargs.get("biotex_jar_path",os.path.join(os.path.dirname(__file__),"Biotex.jar")) biotex_jar_path = os.path.join(os.path.dirname(__file__),"Biotex.jar"),
pattern_path = os.path.join(os.path.dirname(__file__),"patterns"),
self.configuration=kwargs.get("configuration",{ dataset_src = os.path.join(os.path.dirname(__file__),"dataSetReference") ,
"patternsSrc":kwargs.get("pattern_path",os.path.join(os.path.dirname(__file__),"patterns")), stopwords_src = os.path.join(os.path.dirname(__file__),"stopWords"),
"datasetSrc":kwargs.get("datasetRef_path",os.path.join(os.path.dirname(__file__),"dataSetReference")), treetagger_src = os.path.join(home,".tree-tagger/"),
"stopwordsSrc":kwargs.get("stop_words_path",os.path.join(os.path.dirname(__file__),"stopWords")), type_of_terms = "all",
"treetaggerSrc":kwargs.get("treetagger_bin_path",os.path.join(home,".tree-tagger/")), language = "french",
"typeOfTerms":kwargs.get("type_of_terms","all"), score = "F-TFIDF-C_M",
"language":kwargs.get("lang","french"), patron_number = "3"):
"score":kwargs.get("score","F-TFIDF-C_M"),
"patronNumber":kwargs.get("patronNumber",3)
})
self.write_conf(self.configuration)
self.output_data=None
def create_corpus_from_txt_files(self,list_of_text_filename : list, outputfn="output.txt"):
""" """
Format corpus made of multiple file into Biotex Corpus Format Constructor
Parameters Parameters
---------- ----------
list_of_text_filename : list biotex_jar_path : str, optional
list of filenames of the corpus Filepath of Biotex jar, by default os.path.join(os.path.dirname(__file__),"Biotex.jar")
outputfn : str, optional pattern_path : str, optional
output filename (the default is "output.txt") Directory that contains pre-defined patterns, by default os.path.join(os.path.dirname(__file__),"patterns")
dataset_src : str, optional
Returns FilePath of datasets used by Biotex, by default os.path.join(os.path.dirname(__file__),"dataSetReference")
------- stopwords_src : str, optional
str Path of the directory that contains stopwords for each language, by default os.path.join(os.path.dirname(__file__),"stopWords")
output filename treetagger_src : str, optional
Path of the directory that contains TreeTagger, by default os.path.join(home,".tree-tagger/")
type_of_terms : str, optional
number of terms you want to extract, by default "all"
language : str, optional
language of the data, by default "french"
score : str, optional
score used to sort the extracted term, by default "F-TFIDF-C_M"
patron_number : str, optional
number of pattern used to extract terms, by default "3"
""" """
corpus=[]
sep="\n##########END##########\n" self.biotexJarPath=biotex_jar_path
for file in list_of_text_filename: self.configuration={
corpus.append(open(file,'r').read()) "patternsSrc":pattern_path,
open(outputfn,'w').write((sep.join(corpus)+sep).strip()) "datasetSrc":dataset_src,
return outputfn "stopwordsSrc":stopwords_src,
"treetaggerSrc":treetagger_src,
"typeOfTerms":type_of_terms,
"language":language,
"score":score,
"patronNumber":patron_number
}
self.write_conf(self.configuration)
self.output_data=None
def create_corpus_from_txt(self,list_of_text : list, outputfn="output.txt"): def create_corpus_from_txt(self,list_of_text : list, outputfn="output.txt"):
""" """
...@@ -115,14 +130,24 @@ class BiotexWrapper(): ...@@ -115,14 +130,24 @@ class BiotexWrapper():
parsed[2]=float(parsed[2]) parsed[2]=float(parsed[2])
data.append(parsed) data.append(parsed)
shutil.rmtree('output') shutil.rmtree('output')
for f in glob.glob("to_tag_*.txt"):
os.remove(f)
self.output_data=data self.output_data=data
return self.output_data return self.output_data
def terminology(self, corpus):
    """
    Extract the terminology of a corpus and return it as a DataFrame.

    The corpus is first written to disk through ``create_corpus_from_txt``
    (using its default output filename, "output.txt"); that file is then
    handed to ``extract_terminology``.

    Parameters
    ----------
    corpus : list
        texts of the corpus, forwarded unchanged to ``create_corpus_from_txt``

    Returns
    -------
    pd.DataFrame
        the rows returned by ``extract_terminology``, with columns
        "term", "in_umls" and "rank"

    Raises
    ------
    Exception
        if the corpus file could not be created; the underlying error is
        available as ``__cause__``
    """
    try:
        self.create_corpus_from_txt(corpus)
    except Exception as err:
        # Catch Exception instead of using a bare except so that
        # KeyboardInterrupt/SystemExit still propagate, and chain the
        # original error so the real cause is not lost.
        raise Exception("Error while creating file !") from err
    return pd.DataFrame(
        self.extract_terminology("output.txt"),
        columns=["term", "in_umls", "rank"],
    )
if __name__ == '__main__': if __name__ == '__main__':
import argparse import argparse
parser= argparse.ArgumentParser() parser= argparse.ArgumentParser()
parser.add_argument("input",help="Your Biotex input filename") parser.add_argument("input",help="Your Biotex input filename")
parser.add_argument('-s',"--sizeOfGram",help="Gram size of the term you want to extreact") parser.add_argument('-s',"--sizeOfGram",help="Gram size of the term you want to extract")
parser.add_argument('-o',"--output",help="Output filename") parser.add_argument('-o',"--output",help="Output filename")
parser.add_argument('-d',"--debug",action="store_true",help="debug activated") parser.add_argument('-d',"--debug",action="store_true",help="debug activated")
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment