An error occurred while loading the file. Please try again.
-
Dorchies David authored
Refs #108
ae70dd4b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import os,glob,shutil,sys,time
from pathlib import Path
import pandas as pd
# Home directory of the current user; used to build the default TreeTagger path.
home = str(Path.home())


class BiotexWrapper():
    """
    Wrapper that runs the Biotex Java program and returns its results.

    See more about Biotex at: http://tubo.lirmm.fr:8080/biotex/index.jsp
    """

    def __init__(self,
                 biotex_jar_path = os.path.join(os.path.dirname(__file__),"Biotex.jar"),
                 pattern_path = os.path.join(os.path.dirname(__file__),"patterns"),
                 dataset_src = os.path.join(os.path.dirname(__file__),"dataSetReference") ,
                 stopwords_src = os.path.join(os.path.dirname(__file__),"stopWords"),
                 treetagger_src = os.path.join(home,".tree-tagger/"),
                 type_of_terms = "all",
                 language = "french",
                 score = "F-TFIDF-C_M",
                 patron_number = "3"):
        """
        Store the Biotex settings and write them to ``configuration.txt``.

        The configuration file is written in the current working directory
        as a side effect of constructing the wrapper.

        Parameters
        ----------
        biotex_jar_path : str, optional
            Filepath of Biotex jar, by default os.path.join(os.path.dirname(__file__),"Biotex.jar")
        pattern_path : str, optional
            Directory that contains pre-defined patterns, by default os.path.join(os.path.dirname(__file__),"patterns")
        dataset_src : str, optional
            FilePath of datasets used by Biotex, by default os.path.join(os.path.dirname(__file__),"dataSetReference")
        stopwords_src : str, optional
            Path of the directory that contains stopwords for each language, by default os.path.join(os.path.dirname(__file__),"stopWords")
        treetagger_src : str, optional
            Path of the directory that contains TreeTagger, by default os.path.join(home,".tree-tagger/")
        type_of_terms : str, optional
            value written to the ``typeOfTerms`` configuration key, by default "all"
        language : str, optional
            language of the data, by default "french"
        score : str, optional
            score used to sort the extracted term, by default "F-TFIDF-C_M"
        patron_number : str, optional
            number of pattern used to extract terms, by default "3"
        """
        self.biotexJarPath = biotex_jar_path
        self.configuration = {
            "patternsSrc": pattern_path,
            "datasetSrc": dataset_src,
            "stopwordsSrc": stopwords_src,
            "treetaggerSrc": treetagger_src,
            "typeOfTerms": type_of_terms,
            "language": language,
            "score": score,
            "patronNumber": patron_number,
        }
        self.write_conf(self.configuration)
        self.output_data = None

    def create_corpus_from_txt(self, list_of_text: list, outputfn="output.txt"):
        """
        Transform a corpus (based on a list of str) into the Biotex format.

        Documents are joined with the ``##########END##########`` separator
        line and the result is written to ``outputfn``.

        Parameters
        ----------
        list_of_text : list
            list of str, one entry per document
        outputfn : str, optional
            output filename (the default is "output.txt")
        """
        sep = "\n##########END##########\n"
        corpus = sep.join(list_of_text)
        # The context manager guarantees the corpus is flushed and closed
        # before Biotex is asked to read it.
        with open(outputfn, 'w') as corpus_file:
            corpus_file.write((corpus + sep).strip())

    def write_conf(self, confDict):
        """
        Create the configuration file used to execute Biotex.

        Each entry of ``confDict`` is written as a ``key=value`` line in
        ``configuration.txt`` in the current working directory.

        Parameters
        ----------
        confDict : dict
            configuration keys and their values
        """
        with open("configuration.txt", 'w') as conf_file:
            for key, value in confDict.items():
                conf_file.write("{0}={1}\n".format(key, value))

    @staticmethod
    def _parse_output(raw_output):
        """
        Parse the text of a Biotex result file.

        Only lines made of exactly three ``;``-separated fields are kept;
        the second field is converted to int and the third to float.

        Parameters
        ----------
        raw_output : str
            full content of a Biotex result file

        Returns
        -------
        list
            list of ``[str, int, float]`` rows
        """
        data = []
        for line in raw_output.split("\n"):
            parsed = line.split(";")
            if len(parsed) == 3:
                parsed[1] = int(parsed[1])
                parsed[2] = float(parsed[2])
                data.append(parsed)
        return data

    def extract_terminology(self, inputFile, nbGram="ALL"):
        """
        Execute Biotex on ``inputFile`` and return the parsed result.

        The ``output`` directory and the ``to_tag_*.txt`` files found in the
        current working directory are removed once the result has been read.

        Parameters
        ----------
        inputFile : str
            path of the corpus file given to Biotex
        nbGram : str or int, optional
            "ALL" or a gram size between 1 and 4 (the default is "ALL")

        Returns
        -------
        list or bool
            list of ``[str, int, float]`` rows, or False when ``nbGram`` is
            invalid or the Java program could not be run successfully
        """
        # Local import: keeps the file's top-level import block untouched.
        import subprocess

        if isinstance(nbGram, str) and nbGram != "ALL":
            print("Error : Except 'ALL' value, nbGram args in extractTerminology method can't take string arg !!!\nAvailable values: 'ALL',1,2,3,4")
            return False
        # 0 is rejected too: the only per-size values advertised are 1 to 4.
        if isinstance(nbGram, int) and not 1 <= nbGram <= 4:
            print("Error : nbGram value : {0} is forbidden!\nAvailable values: 'ALL',1,2,3,4 ".format(nbGram))
            return False

        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact.
        command = [
            "java", "-Xms6g", "-Xmx10g", "-jar",
            str(self.biotexJarPath), str(inputFile),
        ]
        debut = time.time()
        try:
            status = subprocess.run(command).returncode
        except OSError as err:
            # Raised when the java executable cannot be started at all.
            print("Unable to launch java: {0}".format(err))
            return False
        print("Done in {0} sec".format(time.time() - debut))
        # Any non-zero exit code is a failure, not only the value 1.
        if status != 0:
            print("Biotex java program has crashed !")
            return False

        if isinstance(nbGram, int):
            result_path = "output/t{0}gram.txt".format(nbGram)
        else:
            result_path = "output/ALL_gram.txt"

        try:
            with open(result_path, 'r') as result_file:
                data = self._parse_output(result_file.read())
        finally:
            # Remove the files left behind by the run, even if reading or
            # parsing the result failed.
            shutil.rmtree('output', ignore_errors=True)
            for f in glob.glob("to_tag_*.txt"):
                os.remove(f)

        self.output_data = data
        return self.output_data

    def terminology(self, corpus):
        """
        Extract the terminology of a corpus as a pandas DataFrame.

        The corpus is written to ``output.txt`` and then processed with
        :meth:`extract_terminology` using its default ``nbGram``.

        Parameters
        ----------
        corpus : list
            list of str, one entry per document

        Returns
        -------
        pandas.DataFrame
            one row per extracted term, with the columns ``term``,
            ``in_umls`` and ``rank``

        Raises
        ------
        Exception
            if the corpus file cannot be written
        ValueError
            if the extraction failed
        """
        try:
            self.create_corpus_from_txt(corpus)
        except Exception as err:
            # Chain the original error so the real cause stays visible.
            raise Exception("Error while creating file !") from err
        extracted = self.extract_terminology("output.txt")
        if extracted is False:
            # Without this guard pandas raises an opaque ValueError when it
            # is handed False instead of rows.
            raise ValueError("Biotex failed to extract the terminology of the corpus")
        return pd.DataFrame(extracted, columns = "term in_umls rank".split())
if __name__ == '__main__':
    # Command-line entry point: run Biotex on an input file and write the
    # extracted terms to a tab-separated text file.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="Your Biotex input filename")
    parser.add_argument('-s', "--sizeOfGram", help="Gram size of the term you want to extract")
    parser.add_argument('-o', "--output", help="Output filename")
    parser.add_argument('-d', "--debug", action="store_true", help="debug activated")
    args = parser.parse_args()
    if args.debug:
        print(args)

    wrap = BiotexWrapper()

    # Default to "ALL" so sGram is always bound, whether -s is omitted or
    # explicitly set to 'ALL'.
    sGram = "ALL"
    if args.sizeOfGram and args.sizeOfGram != 'ALL':
        try:
            sGram = int(args.sizeOfGram)
        except ValueError:
            # Keep the raw string: extract_terminology prints the error
            # message for unsupported string values and returns False.
            sGram = args.sizeOfGram

    data = wrap.extract_terminology(args.input, nbGram=sGram)
    if data is False:
        # The failure has already been reported on stdout; stop before
        # creating an empty output file.
        sys.exit(1)

    with open((args.output if args.output else "GRAM_ALL.txt"), 'w') as out_:
        for d in data:
            out_.write("\t".join(map(str, d)) + "\n")