An error occurred while loading the file. Please try again.
-
Fize Jacques authored
Reorder files, debug geodict helpers, add disambiguation module + shell script, cleanup str models, clean wikicooc disambiguation, and minor changes
fd045e4c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# coding: utf-8
import argparse
import sys
import numpy as np
from numpy import inf
import glob,re,sys,os,json
import pandas as pd
from strpython.eval.disambiguation import *
import logging
# Let only CRITICAL records through for these third-party libraries' loggers.
for noisy_name in ("boto", "elasticsearch", "urllib3"):
    noisy_logger = logging.getLogger(noisy_name)
    noisy_logger.setLevel(logging.CRITICAL)
# Command-line interface:
#   corpus_name  which corpus to evaluate (selects the data directory below)
#   measure      which performance measure is passed to the efficiency functions
#   -k           K value forwarded to the efficiency functions (accuracy@k)
parser = argparse.ArgumentParser()
# nargs="?" makes the two positionals optional so that their declared defaults
# actually apply; without it argparse requires both and never uses `default`.
# Invocations that pass both positionals behave exactly as before.
parser.add_argument(
    "corpus_name",
    nargs="?",
    default="padiweb",
    help="Corpus you want to evaluate",
    choices=["padiweb", "agromada"],
)
parser.add_argument(
    "measure",
    nargs="?",
    default="accuracy",
    help="Performance measure you want to compute",
    choices=["accuracy", "accuracy_k", "mean_distance_error"],
)
parser.add_argument(
    "-k",
    type=float,
    default=1,
    help="K value for the accuracy@k computation",
)
args = parser.parse_args()
# Pick the corpus directory and the JSON file that maps document ids to the
# language value handed to the efficiency functions.
if args.corpus_name == "padiweb":
    corpus_dir = "data/disambiguation_data/padiweb_disambiguation"
    data_lang_path = "{0}/data_lang.json".format(corpus_dir)
else:
    corpus_dir = "data/disambiguation_data/mada_disambiguisation"
    # NOTE(review): machine-specific absolute path; this branch only works on
    # the original author's workstation. Left unchanged on purpose -- confirm
    # where this file should live before relocating it.
    data_lang_path = "/Users/jacquesfize/LOD_DATASETS/raw_bvlac/associated_lang.json"

# Use a context manager so the file handle is closed deterministically.
with open(data_lang_path) as lang_file:
    data_lang = json.load(lang_file)

# JSON object keys are always strings, while the ids extracted from the file
# names are ints, so the keys are converted once here.
data_lang = {int(k): v for k, v in data_lang.items()}

# One CSV file per annotated document.
corpus_files = glob.glob("{0}/*.csv".format(corpus_dir))
# Per-document scores of the three disambiguation strategies. The three lists
# are kept aligned: a document contributes to all of them or to none.
acc_MC, acc_GEO, acc_wiki = [], [], []
i = 0
for fn in corpus_files:
    i += 1
    try:
        # The document id is the last run of digits found in the file path.
        id_ = int(re.findall(r"\d+", fn)[-1])
        df = pd.read_csv(fn)
        lang = data_lang[id_]
        # Compute every score before storing any of them, so that a failure
        # in a later scorer cannot leave the lists with different lengths.
        score_mc = efficiencyMostCommon(df, lang, args.measure, args.k)
        score_geo = efficiencyGeodict(df, lang, args.measure, args.k)
        score_wiki = efficiencyWiki(df, lang, args.measure, args.k)
    except Exception as e:
        # Best effort: report the offending file and carry on with the rest.
        print("{0}: {1}".format(fn, e))
    else:
        acc_MC.append(score_mc)
        acc_GEO.append(score_geo)
        acc_wiki.append(score_wiki)

    # Infinite Geodict scores are counted as 0.
    # NOTE(review): only acc_GEO gets this treatment; infinities in the other
    # two lists are turned into very large finite numbers by nan_to_num below.
    # Confirm that this asymmetry is intended.
    acc_GEO = np.array(acc_GEO)
    acc_GEO[acc_GEO == inf] = 0
    acc_GEO = acc_GEO.tolist()

    # Running means, rewritten in place on a single console line.
    sys.stdout.write(
        "\r{0}/{1} -- {5}Wiki : {2} | {5}MC : {3} | {5}GEO : {4}".format(
            i,
            len(corpus_files),
            np.mean(np.nan_to_num(acc_wiki)),
            np.mean(np.nan_to_num(acc_MC)),
            np.mean(np.nan_to_num(acc_GEO)),
            args.measure,
        )
    )
    # No newline is written, so flush explicitly to make the progress visible.
    sys.stdout.flush()
# Final summary: mean score of each strategy over the evaluated documents
# (NaN entries count as 0 via nan_to_num). The leading newline ends the
# in-place progress line written during the loop.
for summary_label, summary_scores in (
    ("\naccGEO", acc_GEO),
    ("acc_MC", acc_MC),
    ("accWiki", acc_wiki),
):
    print(summary_label, np.mean(np.nan_to_num(summary_scores)))