Failed to fetch fork details. Try again later.
-
Fize Jacques authored3ecbe3e1
Forked from
Fize Jacques / biotex_python
Source project has a limited visibility.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import glob
import os
import shutil
import subprocess
import sys
import time
from pathlib import Path
home = str(Path.home())
class BiotexWrapper():
    """
    Wrapper that executes the Biotex program and returns its result.

    See more about Biotex at: http://tubo.lirmm.fr:8080/biotex/index.jsp

    Creating an instance writes ``configuration.txt`` in the current working
    directory (see ``write_conf``).
    """

    # Marker written after every document of a Biotex-formatted corpus.
    _DOC_SEPARATOR = "\n##########END##########\n"

    def __init__(self, **kwargs):
        """
        Build the Biotex configuration and write it to ``configuration.txt``.

        Parameters
        ----------
        biotex_jar_path : str, optional
            path of the Biotex jar (the default is "Biotex.jar" next to this module)
        configuration : dict, optional
            complete configuration; when given it is used as is and the
            keyword arguments listed below are ignored
        pattern_path : str, optional
            value of "patternsSrc" (the default is "patterns" next to this module)
        datasetRef_path : str, optional
            value of "datasetSrc" (the default is "dataSetReference" next to this module)
        stop_words_path : str, optional
            value of "stopwordsSrc" (the default is "stopWords" next to this module)
        treetagger_bin_path : str, optional
            value of "treetaggerSrc" (the default is ".tree-tagger/" in the home directory)
        type_of_terms : str, optional
            value of "typeOfTerms" (the default is "all")
        lang : str, optional
            value of "language" (the default is "french")
        score : str, optional
            value of "score" (the default is "F-TFIDF-C_M")
        patronNumber : int, optional
            value of "patronNumber" (the default is 3)
        """
        module_dir = os.path.dirname(__file__)
        self.biotexJarPath = kwargs.get("biotex_jar_path", os.path.join(module_dir, "Biotex.jar"))
        self.configuration = kwargs.get("configuration", {
            "patternsSrc": kwargs.get("pattern_path", os.path.join(module_dir, "patterns")),
            "datasetSrc": kwargs.get("datasetRef_path", os.path.join(module_dir, "dataSetReference")),
            "stopwordsSrc": kwargs.get("stop_words_path", os.path.join(module_dir, "stopWords")),
            "treetaggerSrc": kwargs.get("treetagger_bin_path", os.path.join(home, ".tree-tagger/")),
            "typeOfTerms": kwargs.get("type_of_terms", "all"),
            "language": kwargs.get("lang", "french"),
            "score": kwargs.get("score", "F-TFIDF-C_M"),
            "patronNumber": kwargs.get("patronNumber", 3),
        })
        self.write_conf(self.configuration)
        self.output_data = None

    @classmethod
    def _write_corpus(cls, texts, outputfn):
        """Join ``texts`` with the Biotex document separator and write them to ``outputfn``."""
        sep = cls._DOC_SEPARATOR
        # strip() drops the newline that ends the last separator, as well as
        # any whitespace at the very beginning of the corpus.
        content = (sep.join(texts) + sep).strip()
        with open(outputfn, 'w') as corpus_file:
            corpus_file.write(content)
        return outputfn

    @staticmethod
    def _parse_terms(raw):
        """
        Parse the text of a Biotex result file.

        Only lines made of exactly three ";"-separated fields are kept; the
        second field is converted to int and the third one to float.
        """
        data = []
        for line in raw.split("\n"):
            parsed = line.split(";")
            if len(parsed) == 3:
                parsed[1] = int(parsed[1])
                parsed[2] = float(parsed[2])
                data.append(parsed)
        return data

    def create_corpus_from_txt_files(self, list_of_text_filename: list, outputfn="output.txt"):
        """
        Format a corpus made of multiple files into the Biotex corpus format.

        Parameters
        ----------
        list_of_text_filename : list
            list of filenames of the corpus
        outputfn : str, optional
            output filename (the default is "output.txt")

        Returns
        -------
        str
            output filename
        """
        corpus = []
        for filename in list_of_text_filename:
            with open(filename, 'r') as text_file:
                corpus.append(text_file.read())
        return self._write_corpus(corpus, outputfn)

    def create_corpus_from_txt(self, list_of_text: list, outputfn="output.txt"):
        """
        Transform a corpus (given as a list of str) into the Biotex corpus format.

        Parameters
        ----------
        list_of_text : list
            list of str, one per document
        outputfn : str, optional
            output filename (the default is "output.txt")

        Returns
        -------
        str
            output filename
        """
        return self._write_corpus(list_of_text, outputfn)

    def write_conf(self, confDict):
        """
        Create the configuration file used to execute Biotex.

        Every entry of ``confDict`` is written as a ``key=value`` line in
        "configuration.txt", in the current working directory.
        """
        with open("configuration.txt", 'w') as conf_file:
            for key, value in confDict.items():
                conf_file.write("{0}={1}\n".format(key, value))

    def extract_terminology(self, inputFile, nbGram="ALL"):
        """
        Execute Biotex on ``inputFile`` and extract the result it produced.

        Parameters
        ----------
        inputFile : str
            filename of a corpus in the Biotex format
        nbGram : str or int, optional
            "ALL" or a gram size among 1, 2, 3, 4 (the default is "ALL").
            A value that is neither a str nor an int is not validated and
            is handled like "ALL".

        Returns
        -------
        list or bool
            list of ``[str, int, float]`` entries read from the Biotex result
            file, or False when ``nbGram`` is invalid or Biotex failed.
            The list is also stored in ``self.output_data``.

        Raises
        ------
        FileNotFoundError
            if Biotex exited normally but the expected result file is missing
        """
        if isinstance(nbGram, str):
            if nbGram != "ALL":
                print("Error : Except 'ALL' value, nbGram args in extractTerminology method can't take string arg !!!\nAvailable values: 'ALL',1,2,3,4")
                return False
        if isinstance(nbGram, int):
            # 0 used to be accepted although only 1..4 are valid gram sizes.
            if not 1 <= nbGram <= 4:
                print("Error : nbGram value : {0} is forbidden!\nAvailable values: 'ALL',1,2,3,4 ".format(nbGram))
                return False

        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed to java untouched.
        # -Xms6g / -Xmx10g: initial and maximum JVM heap size.
        command = ["java", "-Xms6g", "-Xmx10g", "-jar", str(self.biotexJarPath), str(inputFile)]
        debut = time.time()
        try:
            status = subprocess.run(command).returncode
        except OSError as error:
            # Typically raised when the java executable cannot be found.
            print("Unable to run Biotex java program : {0}".format(error))
            return False
        print("Done in {0} sec".format(time.time() - debut))
        # Any non-zero exit code is a failure. The former "status == 1" test
        # never matched on POSIX, where os.system returns an encoded wait status.
        if status != 0:
            print("Biotex java program has crashed !")
            return False

        os.makedirs("output", exist_ok=True)
        if isinstance(nbGram, int):
            result_path = "output/t{0}gram.txt".format(nbGram)
        else:
            result_path = "output/ALL_gram.txt"
        with open(result_path, 'r') as result_file:
            raw_output = result_file.read()

        data = self._parse_terms(raw_output)
        # The whole "output" directory is removed once the result is parsed.
        shutil.rmtree('output')
        self.output_data = data
        return self.output_data
if __name__ == '__main__':
    # Command-line entry point: run Biotex on one corpus file and dump the
    # extracted entries, one per line, with tab-separated fields.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="Your Biotex input filename")
    parser.add_argument('-s', "--sizeOfGram", help="Gram size of the term you want to extreact")
    parser.add_argument('-o', "--output", help="Output filename")
    parser.add_argument('-d', "--debug", action="store_true", help="debug activated")
    args = parser.parse_args()
    if args.debug:
        print(args)

    wrap = BiotexWrapper()

    # Without -s (or with an explicit "ALL") every gram size is requested;
    # previously sGram was left undefined when -s was omitted.
    sGram = "ALL"
    if args.sizeOfGram and args.sizeOfGram != 'ALL':
        try:
            sGram = int(args.sizeOfGram)
        except ValueError:
            # Not a number: forward the raw string so that
            # extract_terminology reports the invalid value itself.
            sGram = args.sizeOfGram

    data = wrap.extract_terminology(args.input, nbGram=sGram)
    if data is False:
        # extract_terminology already printed the reason; do not create or
        # truncate the output file when there is nothing to write.
        sys.exit(1)

    with open((args.output if args.output else "GRAM_ALL.txt"), 'w') as out_:
        for d in data:
            out_.write("\t".join(map(str, d)) + "\n")