Commit 696d4787 authored by Fize Jacques
Browse files

simplify wrapper

parent 2e4c22c9
No related merge requests found
Showing with 62 additions and 37 deletions
+62 -37
import os,glob,shutil,sys,time
from pathlib import Path
import pandas as pd
home = str(Path.home())
class BiotexWrapper():
"""
Wrapper to execute the Biotex program and return its result
See more about Biotex at: http://tubo.lirmm.fr:8080/biotex/index.jsp
"""
def __init__(self, **kwargs):
    """
    Build the wrapper from optional keyword arguments.

    Recognised keys: ``biotex_jar_path``, ``configuration``, ``pattern_path``,
    ``datasetRef_path``, ``stop_words_path``, ``treetagger_bin_path``,
    ``type_of_terms``, ``lang``, ``score`` and ``patronNumber``.

    When ``configuration`` is supplied it is used as-is and the individual
    setting keys are ignored; otherwise a configuration dict is assembled
    from the individual keys, falling back to resources located next to
    this module (and ``~/.tree-tagger/`` for TreeTagger).

    The resulting configuration is handed to ``self.write_conf`` and
    ``self.output_data`` is reset to ``None``.
    """
    module_dir = os.path.dirname(__file__)

    jar_default = os.path.join(module_dir, "Biotex.jar")
    self.biotexJarPath = kwargs.get("biotex_jar_path", jar_default)

    # Configuration assembled from the individual keyword arguments; only
    # used when the caller did not pass a ready-made "configuration" dict.
    fallback_configuration = {
        "patternsSrc": kwargs.get("pattern_path", os.path.join(module_dir, "patterns")),
        "datasetSrc": kwargs.get("datasetRef_path", os.path.join(module_dir, "dataSetReference")),
        "stopwordsSrc": kwargs.get("stop_words_path", os.path.join(module_dir, "stopWords")),
        "treetaggerSrc": kwargs.get("treetagger_bin_path", os.path.join(home, ".tree-tagger/")),
        "typeOfTerms": kwargs.get("type_of_terms", "all"),
        "language": kwargs.get("lang", "french"),
        "score": kwargs.get("score", "F-TFIDF-C_M"),
        "patronNumber": kwargs.get("patronNumber", 3),
    }
    self.configuration = kwargs.get("configuration", fallback_configuration)

    self.write_conf(self.configuration)
    self.output_data = None
def create_corpus_from_txt_files(self, list_of_text_filename: list, outputfn="output.txt"):
    """
    Format a corpus made of multiple files into the Biotex corpus format.

    Parameters
    ----------
    list_of_text_filename : list
        list of filenames of the corpus
    outputfn : str, optional
        output filename (the default is "output.txt")

    Returns
    -------
    str
        output filename
    """
    sep = "\n##########END##########\n"
    corpus = []
    for filename in list_of_text_filename:
        # Context manager so every input handle is closed deterministically.
        with open(filename, 'r') as source:
            corpus.append(source.read())
    # Documents are joined (and terminated) by the separator, then the
    # surrounding whitespace of the whole payload is stripped.
    with open(outputfn, 'w') as target:
        target.write((sep.join(corpus) + sep).strip())
    return outputfn


def __init__(self,
             biotex_jar_path = os.path.join(os.path.dirname(__file__),"Biotex.jar"),
             pattern_path = os.path.join(os.path.dirname(__file__),"patterns"),
             dataset_src = os.path.join(os.path.dirname(__file__),"dataSetReference") ,
             stopwords_src = os.path.join(os.path.dirname(__file__),"stopWords"),
             treetagger_src = os.path.join(home,".tree-tagger/"),
             type_of_terms = "all",
             language = "french",
             score = "F-TFIDF-C_M",
             patron_number = "3"):
    """
    Constructor

    Stores the jar path, builds the configuration dict from the arguments,
    passes it to ``self.write_conf`` and resets ``self.output_data`` to None.

    Parameters
    ----------
    biotex_jar_path : str, optional
        Filepath of Biotex jar, by default os.path.join(os.path.dirname(__file__),"Biotex.jar")
    pattern_path : str, optional
        Directory that contains pre-defined patterns, by default os.path.join(os.path.dirname(__file__),"patterns")
    dataset_src : str, optional
        FilePath of datasets used by Biotex, by default os.path.join(os.path.dirname(__file__),"dataSetReference")
    stopwords_src : str, optional
        Path of the directory that contains stopwords for each language, by default os.path.join(os.path.dirname(__file__),"stopWords")
    treetagger_src : str, optional
        Path of the directory that contains TreeTagger, by default os.path.join(home,".tree-tagger/")
    type_of_terms : str, optional
        number of terms you want to extract, by default "all"
    language : str, optional
        language of the data, by default "french"
    score : str, optional
        score used to sort the extracted term, by default "F-TFIDF-C_M"
    patron_number : str, optional
        number of pattern used to extract terms, by default "3"
    """
    self.biotexJarPath = biotex_jar_path
    # Keys are the option names written out by write_conf.
    self.configuration = {
        "patternsSrc": pattern_path,
        "datasetSrc": dataset_src,
        "stopwordsSrc": stopwords_src,
        "treetaggerSrc": treetagger_src,
        "typeOfTerms": type_of_terms,
        "language": language,
        "score": score,
        "patronNumber": patron_number
    }
    self.write_conf(self.configuration)
    self.output_data = None
def create_corpus_from_txt(self,list_of_text : list, outputfn="output.txt"):
"""
......@@ -115,14 +130,24 @@ class BiotexWrapper():
parsed[2]=float(parsed[2])
data.append(parsed)
shutil.rmtree('output')
for f in glob.glob("to_tag_*.txt"):
os.remove(f)
self.output_data=data
return self.output_data
def terminology(self, corpus):
    """
    Extract the terminology of a corpus and return it as a DataFrame.

    The corpus is first written with ``create_corpus_from_txt`` (using its
    default output filename), then ``extract_terminology`` is run on
    "output.txt".

    Parameters
    ----------
    corpus : list
        texts forwarded unchanged to ``create_corpus_from_txt``

    Returns
    -------
    pd.DataFrame
        one row per extracted term, with columns "term", "in_umls" and "rank"

    Raises
    ------
    Exception
        with the message "Error while creating file !" when the corpus file
        cannot be created; the underlying error is attached as ``__cause__``
    """
    try:
        self.create_corpus_from_txt(corpus)
    except Exception as err:
        # Narrower than a bare ``except`` so KeyboardInterrupt/SystemExit
        # propagate untouched; chaining keeps the root cause in the traceback.
        raise Exception("Error while creating file !") from err
    return pd.DataFrame(self.extract_terminology("output.txt"), columns=["term", "in_umls", "rank"])
if __name__ == '__main__':
import argparse
parser= argparse.ArgumentParser()
parser.add_argument("input",help="Your Biotex input filename")
parser.add_argument('-s',"--sizeOfGram",help="Gram size of the term you want to extreact")
parser.add_argument('-s',"--sizeOfGram",help="Gram size of the term you want to extract")
parser.add_argument('-o',"--output",help="Output filename")
parser.add_argument('-d',"--debug",action="store_true",help="debug activated")
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment