# coding: utf-8
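"""
Build STR (Spatial Textual Representation) graphs from a directory of CSV
annotation files: each file lists the spatial entities (GID + surface form)
found in one document. The script writes one GEXF graph per document plus a
JSON metadata file with the surface forms and per-document entity counts.
"""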
import argparse, glob, json, os, re, string, time

import networkx as nx
import pandas as pd
from progressbar import Bar, Counter, ETA, ProgressBar, Timer

from strpython.models.str import STR
from strpython.nlp.disambiguator.share_prop import *
from strpython.pipeline import *
def filter_nonprintable(text):
    """Strip non-printable ASCII characters from `text`."""
    # ASCII characters that are not part of string.printable
    nonprintable = set(chr(i) for i in range(128)).difference(string.printable)
    # translate() deletes every character mapped to None
    return text.translate({ord(character): None for character in nonprintable})
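# Example: filter_nonprintable("abc\x07def") returns "abcdef"; non-ASCII
# characters such as "é" lie outside the ASCII range and are kept as-is.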
parser = argparse.ArgumentParser()
parser.add_argument("csv_input_dir")
parser.add_argument("graphs_output_dir")
parser.add_argument("metadata_output_fn")

subparsers = parser.add_subparsers(help='commands')

normal = subparsers.add_parser(
    'normal', help='Basic STR generation. No arguments are necessary!')
normal.set_defaults(which="norm")

gen_parser = subparsers.add_parser(
    'generalisation', help='Apply a generalisation transformation to the generated STRs')
gen_parser.set_defaults(which="gene")
gen_parser.add_argument(
    '-t', '--type_gen', help='Type of generalisation', default="all")
gen_parser.add_argument(
    '-n', help='Number of generalisation levels', default=1)
gen_parser.add_argument(
    '-b', '--bound', help='If the generalisation is bounded, this argument '
                          'corresponds to the maximal spatial level', default="country")

ext_parser = subparsers.add_parser(
    'extension', help='Apply an extension process to the STRs')
ext_parser.set_defaults(which="ext")
ext_parser.add_argument(
    '-d', '--distance', help='Radius distance', default=150)
ext_parser.add_argument(
    '-u', '--unit', help='Unit used for the radius distance', default="km")
ext_parser.add_argument(
    '-a', '--adjacent_count', help='Number of adjacent SEs added to the STR', default=1)

args = parser.parse_args()
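# Example invocations (the script filename `generate_str.py` is assumed here;
# positional args: input CSV dir, output graph dir, metadata filename):
#   python generate_str.py corpus_csv/ graphs/ metadata.json normal
#   python generate_str.py corpus_csv/ graphs/ metadata.json generalisation -t all -b country
#   python generate_str.py corpus_csv/ graphs/ metadata.json extension -d 150 -u km -a 1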
if "which" in args:
if args.which =="gene":
args.type_trans="gen"
elif args.which =="ext":
args.type_trans="ext"
print("Parameters entered : ",args)
# Collect the input files: prefer *.csv, fall back to *.txt
if os.path.exists(args.csv_input_dir):
    files_glob = glob.glob(args.csv_input_dir + "/*.csv")
    if not files_glob:
        files_glob = glob.glob(args.csv_input_dir + "/*.txt")
else:
    exit("Input directory not found: {0}".format(args.csv_input_dir))

if not os.path.exists(args.graphs_output_dir):
    os.makedirs(args.graphs_output_dir)
start = time.time()

# First pass: for every document, record how often each spatial entity (GID)
# occurs and keep one surface form ("text") per GID.
associated_es = {}
count_per_doc = {}

with ProgressBar(max_value=len(files_glob),
                 widgets=[' [', Timer(), '] ', Bar(), '(', Counter(), ')', '(', ETA(), ')']) as pg:
    for i, fn in enumerate(files_glob):
        id_ = int(re.findall(r"\d+", fn)[-1])  # document id = last number in the filename
        df = pd.read_csv(fn)
        # Drop rows whose GID marks an unresolved entity
        df = df[~df["GID"].isin(['0', 'o', 'NR', 'O'])]
        try:
            count_per_doc[id_] = json.loads(df.groupby("GID").GID.count().to_json())
            associated_es[id_] = df[["GID", "text"]].groupby("GID", as_index=False) \
                .max().set_index('GID').to_dict()["text"]
        except Exception:
            count_per_doc[id_] = {}
            associated_es[id_] = {}
        pg.update(i)
#logging.info("Fetch list of spatial entities available !")
all_es=set([])
for k,v in associated_es.items():
for k2 in v:
all_es.add(k2)
def foo_(x):
    """Return the English label of spatial entity `x`, or None if lookup fails."""
    try:
        return get_data(x)["en"]
    except Exception:
        print(x)  # trace entities whose label could not be resolved
        return None
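# Second pass: reload each document, attach English labels to its entities,
# build the STR graph and serialise it as GEXF.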
with ProgressBar(max_value=len(files_glob),
                 widgets=[' [', Timer(), '] ', Bar(), '(', Counter(), ')', '(', ETA(), ')']) as pg:
    for i, fn in enumerate(files_glob):
        id_ = int(re.findall(r"\d+", fn)[-1])
        df = pd.read_csv(fn)
        df = df.fillna("O")
        df = df[~df["GID"].isin(['0', 'o', 'NR', 'O'])]
        df["label"] = df.GID.apply(foo_)
        df = df.rename(columns={"GID": "id"})
        str_ = STR.from_pandas(df, []).build()
        nx.write_gexf(str_, args.graphs_output_dir + "/{0}.gexf".format(id_))
        pg.update(i + 1)
# Save metadata: surface forms and per-document entity counts
with open(os.path.join(args.graphs_output_dir, args.metadata_output_fn), 'w') as meta_file:
    meta_file.write(json.dumps([associated_es, count_per_doc], indent=4))

print("--- %s seconds ---" % (time.time() - start))