# eval_disambiguation.py 2.17 KiB
# -*- coding: utf-8 -*-

import argparse
import sys
import numpy as np
from numpy import inf
import glob,re,sys,os,json
import pandas as pd
from strpython.eval.disambiguation import *
import logging
# Restrict the boto, elasticsearch and urllib3 loggers to CRITICAL records.
silenced_loggers = ("boto", "elasticsearch", "urllib3")
for logger_name in silenced_loggers:
    third_party_logger = logging.getLogger(logger_name)
    third_party_logger.setLevel(logging.CRITICAL)


# Command-line interface.
#
# Both positionals use nargs="?" so that their declared defaults really apply:
# argparse ignores `default` on a positional that is required. Invocations that
# pass both values behave exactly as before. Positionals are filled left to
# right, so giving a single value sets `corpus_name`, not `measure`.
parser = argparse.ArgumentParser()

parser.add_argument(
    "corpus_name",
    nargs="?",
    default="padiweb",
    help="Corpus you want to evaluate",
    choices=["padiweb", "agromada"],
)
parser.add_argument(
    "measure",
    nargs="?",
    default="accuracy",
    help="Performance measure you want to compute",
    choices=["accuracy", "accuracy_k", "mean_distance_error"],
)
# Passed as the last argument to every efficiency* call in the evaluation loop.
parser.add_argument(
    "-k",
    type=float,
    default=1,
    help="K value for the accuracy@k computation",
)

args = parser.parse_args()

# Pick the corpus directory (holding the per-document CSV files) and the JSON
# file that maps each document id to its language.
if args.corpus_name == "padiweb":
    corpus_dir = "data/disambiguation_data/padiweb_disambiguation"
    data_lang_path = "data/disambiguation_data/padiweb_disambiguation/data_lang.json"
else:
    corpus_dir = "data/disambiguation_data/mada_disambiguisation"
    # NOTE(review): absolute path to one developer's machine; this branch only
    # works where that file exists. Consider moving it next to the corpus.
    data_lang_path = "/Users/jacquesfize/LOD_DATASETS/raw_bvlac/associated_lang.json"

# Use a context manager so the file handle is closed once the JSON is parsed.
with open(data_lang_path) as data_lang_file:
    data_lang = json.load(data_lang_file)

# JSON object keys are always strings; the evaluation loop looks documents up
# by integer id, so convert the keys once here.
data_lang = {int(k): v for k, v in data_lang.items()}

corpus_files = glob.glob("{0}/*.csv".format(corpus_dir))

# Per-document scores of the three disambiguation methods. The lists are kept
# index-aligned: a document contributes to all three lists or to none of them,
# so the reported means are always computed over the same set of documents.
acc_MC, acc_GEO, acc_wiki = [], [], []

for i, fn in enumerate(corpus_files, start=1):
    try:
        # The document id is the last run of digits in the file *name* (not the
        # whole path, so digits in a directory name cannot be picked up). It is
        # the key used to look up the document language in `data_lang`.
        id_ = int(re.findall(r"\d+", os.path.basename(fn))[-1])
        df = pd.read_csv(fn)
        lang = data_lang[id_]
        # Score with every method before storing anything: if one of them
        # fails, no list receives a partial entry for this document.
        mc_score = efficiencyMostCommon(df, lang, args.measure, args.k)
        geo_score = efficiencyGeodict(df, lang, args.measure, args.k)
        wiki_score = efficiencyWiki(df, lang, args.measure, args.k)
        # An infinite Geodict score counts as 0 (same rule as before, applied
        # to the new value only instead of rebuilding the whole list).
        # NOTE(review): assumes the score is a scalar - confirm.
        if geo_score == inf:
            geo_score = 0
    except Exception as e:
        # Best effort: report which file failed and keep evaluating the rest.
        print("{0}: {1}".format(fn, e))
    else:
        acc_MC.append(mc_score)
        acc_GEO.append(geo_score)
        acc_wiki.append(wiki_score)

    # Running means, rewritten in place on a single console line via "\r".
    sys.stdout.write("\r{0}/{1} -- {5}Wiki : {2} | {5}MC : {3} | {5}GEO : {4}".format(
        i,
        len(corpus_files),
        np.mean(np.nan_to_num(acc_wiki)),
        np.mean(np.nan_to_num(acc_MC)),
        np.mean(np.nan_to_num(acc_GEO)),
        args.measure
        )
    )


# Final report: mean score of each method over the evaluated documents.
# np.nan_to_num replaces NaN entries with 0 before averaging.
final_report = (
    ("\naccGEO", acc_GEO),
    ("acc_MC", acc_MC),
    ("accWiki", acc_wiki),
)
for report_label, method_scores in final_report:
    cleaned_scores = np.nan_to_num(method_scores)
    print(report_label, np.mean(cleaned_scores))