From 696d47875550c0ea04d98dbd870b7afd633693bb Mon Sep 17 00:00:00 2001 From: Fize Jacques <jacques.fize@cirad.fr> Date: Tue, 15 Oct 2019 10:54:58 +0200 Subject: [PATCH] simplify wrapper --- biotex/biotex_wrapper.py | 99 +++++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 37 deletions(-) diff --git a/biotex/biotex_wrapper.py b/biotex/biotex_wrapper.py index 9a5de7c..6aa15d4 100644 --- a/biotex/biotex_wrapper.py +++ b/biotex/biotex_wrapper.py @@ -1,53 +1,68 @@ + + import os,glob,shutil,sys,time from pathlib import Path + +import pandas as pd + home = str(Path.home()) + + class BiotexWrapper(): """ Wrapper to execute and returned the result from the Biotex program See more about Biotex at: http://tubo.lirmm.fr:8080/biotex/index.jsp """ - def __init__(self,**kwargs): - self.biotexJarPath=kwargs.get("biotex_jar_path",os.path.join(os.path.dirname(__file__),"Biotex.jar")) - - self.configuration=kwargs.get("configuration",{ - "patternsSrc":kwargs.get("pattern_path",os.path.join(os.path.dirname(__file__),"patterns")), - "datasetSrc":kwargs.get("datasetRef_path",os.path.join(os.path.dirname(__file__),"dataSetReference")), - "stopwordsSrc":kwargs.get("stop_words_path",os.path.join(os.path.dirname(__file__),"stopWords")), - "treetaggerSrc":kwargs.get("treetagger_bin_path",os.path.join(home,".tree-tagger/")), - "typeOfTerms":kwargs.get("type_of_terms","all"), - "language":kwargs.get("lang","french"), - "score":kwargs.get("score","F-TFIDF-C_M"), - "patronNumber":kwargs.get("patronNumber",3) - }) - - self.write_conf(self.configuration) - - self.output_data=None - - - def create_corpus_from_txt_files(self,list_of_text_filename : list, outputfn="output.txt"): + def __init__(self, + biotex_jar_path = os.path.join(os.path.dirname(__file__),"Biotex.jar"), + pattern_path = os.path.join(os.path.dirname(__file__),"patterns"), + dataset_src = os.path.join(os.path.dirname(__file__),"dataSetReference") , + stopwords_src = 
os.path.join(os.path.dirname(__file__),"stopWords"), + treetagger_src = os.path.join(home,".tree-tagger/"), + type_of_terms = "all", + language = "french", + score = "F-TFIDF-C_M", + patron_number = "3"): """ - Format corpus made of multiple file into Biotex Corpus Format + Constructor Parameters ---------- - list_of_text_filename : list - list of filenames ot the corpus - outputfn : str, optional - output filename (the default is "output.txt") - - Returns - ------- - str - output filename + biotex_jar_path : str, optional + Filepath of Biotex jar, by default os.path.join(os.path.dirname(__file__),"Biotex.jar") + pattern_path : str, optional + Directory that contains pre-defined patterns, by default os.path.join(os.path.dirname(__file__),"patterns") + dataset_src : str, optional + FilePath of datasets used by Biotex, by default os.path.join(os.path.dirname(__file__),"dataSetReference") + stopwords_src : str, optional + Path of the directory that contains stopwords for each language, by default os.path.join(os.path.dirname(__file__),"stopWords") + treetagger_src : str, optional + Path of the directory that contains TreeTagger, by default os.path.join(home,".tree-tagger/") + type_of_terms : str, optional + number of terms you want to extract, by default "all" + language : str, optional + language of the data, by default "french" + score : str, optional + score used to sort the extracted terms, by default "F-TFIDF-C_M" + patron_number : str, optional + number of patterns used to extract terms, by default "3" """ - corpus=[] - sep="\n##########END##########\n" - for file in list_of_text_filename: - corpus.append(open(file,'r').read()) - open(outputfn,'w').write((sep.join(corpus)+sep).strip()) - return outputfn + + self.biotexJarPath=biotex_jar_path + self.configuration={ + "patternsSrc":pattern_path, + "datasetSrc":dataset_src, + "stopwordsSrc":stopwords_src, + "treetaggerSrc":treetagger_src, + "typeOfTerms":type_of_terms, + "language":language, + "score":score, +
"patronNumber":patron_number + } + + self.write_conf(self.configuration) + self.output_data=None def create_corpus_from_txt(self,list_of_text : list, outputfn="output.txt"): """ @@ -115,14 +130,24 @@ class BiotexWrapper(): parsed[2]=float(parsed[2]) data.append(parsed) shutil.rmtree('output') + for f in glob.glob("to_tag_*.txt"): + os.remove(f) self.output_data=data return self.output_data + def terminology(self, corpus): + try: + self.create_corpus_from_txt(corpus) + except: + raise Exception("Error while creating file !") + return pd.DataFrame(self.extract_terminology("output.txt"), columns = "term in_umls rank".split()) + + if __name__ == '__main__': import argparse parser= argparse.ArgumentParser() parser.add_argument("input",help="Your Biotex input filename") - parser.add_argument('-s',"--sizeOfGram",help="Gram size of the term you want to extreact") + parser.add_argument('-s',"--sizeOfGram",help="Gram size of the term you want to extract") parser.add_argument('-o',"--output",help="Output filename") parser.add_argument('-d',"--debug",action="store_true",help="debug activated") -- GitLab