Commit daa40cdd authored by Fize Jacques

debug

parent 4d787b85
Showing with 652 additions and 621 deletions
@@ -37,23 +37,20 @@ for fn in corpus_files:
     id_=int(re.findall(r"\d+",fn)[-1])
     df=pd.read_csv(fn)
     lang=data_lang[id_]
-    acc_MC.append(efficiencyMostCommon(df,lang,args.measure,args.k))
-    acc_GEO.append(efficiencyGeodict(df,lang,args.measure,args.k))
+    #acc_MC.append(efficiencyMostCommon(df,lang,args.measure,args.k))
+    #acc_GEO.append(efficiencyGeodict(df,lang,args.measure,args.k))
     acc_wiki.append(efficiencyWiki(df,lang,args.measure,args.k))
-    acc_GEO=np.array(acc_GEO)
-    acc_GEO[acc_GEO == inf] = 0
-    acc_GEO=acc_GEO.tolist()
-    sys.stdout.write("\r{0}/{1} -- {5}Wiki : {2} | {5}MC : {3} | {5}GEO : {4}".format(
-        i,
-        len(corpus_files),
-        np.mean(np.nan_to_num(acc_wiki)),
-        np.mean(np.nan_to_num(acc_MC)),
-        np.mean(np.nan_to_num(acc_GEO)),
-        args.measure
-        )
-    )
+    #acc_GEO=np.array(acc_GEO)
+    #acc_GEO[acc_GEO == inf] = 0
+    #acc_GEO=acc_GEO.tolist()
+    # sys.stdout.write("\r{0}/{1} -- {5}Wiki : {2} | {5}MC : {3} | {5}GEO : {4}".format(
+    #     i,
+    #     len(corpus_files),
+    #     np.mean(np.nan_to_num(acc_wiki)),
+    #     np.mean(np.nan_to_num(acc_MC)),
+    #     np.mean(np.nan_to_num(acc_GEO)),
+    #     args.measure
+    #     )
+    # )
-print("\naccGEO",np.mean(np.nan_to_num(acc_GEO)))
-print("acc_MC",np.mean(np.nan_to_num(acc_MC)))
-print("accWiki",np.mean(np.nan_to_num(acc_wiki)))
+print(args,"\naccGEO",np.mean(np.nan_to_num(acc_GEO)),"acc_MC",np.mean(np.nan_to_num(acc_MC)),"accWiki",np.mean(np.nan_to_num(acc_wiki)))
\ No newline at end of file
@@ -8,8 +8,8 @@
     "database_json":"resources/database_exp_25_may.db",
     "log_file":"extract_log",
     "wiki_cooc_dis":{
-        "cooc_freq":"/Users/jacquesfize/nas_cloud/Code/str-python/strpython/resources/coocurrence_wiki.pkl",
-        "count":"/Users/jacquesfize/nas_cloud/Code/str-python/strpython/resources/count_wiki.pkl"
+        "cooc_freq":"/Users/jacquesfize/nas_cloud/Data/geodict/new_model_cooc_freqGD.json",
+        "count":"/Users/jacquesfize/nas_cloud/Data/geodict/new_model_count_GD.json"
     },
     "language_resources_path":"/Users/jacquesfize/nas_cloud/Code/str-python/strpython/resources/language_resources",
     "gazetteer":"geodict",
...
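These two entries now point the Wikipedia co-occurrence disambiguator at JSON resources instead of the previous pickles. Judging from how BigramModel indexes them further down (cooc_freq[uri1][uri2] and a per-entity count), the files are presumably shaped roughly as follows; the keys and numbers here are hypothetical, for illustration only:

    # Assumed layout of the two JSON resources (made-up ids and counts)
    cooc_freq = {"GD123": {"GD456": 12, "GD789": 3}}   # co-occurrence frequency per entity pair
    count = {"GD123": 42, "GD456": 17}                 # per-entity occurrence count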
@@ -46,8 +46,15 @@ def matcher_devdura_voc( lang):
     TerminologyMatcher
         matcher
     """
-    terminology = pd.read_csv(os.path.join(package_directory,"resources/terminology/dev_durable/Vocabulaire_du_Developement_Durable.csv"))["label"].values.tolist()
+    if lang == "fr":
+        terminology = pd.read_csv(os.path.join(package_directory, "resources/terminology/dev_durable/Vocabulaire_du_Developement_Durable.csv"),sep=";")["label"].values.tolist()
+    else:
+        old_terminology = pd.read_csv(os.path.join(package_directory,"resources/terminology/dev_durable/Vocabulaire_du_Developement_Durable.csv"),sep=";")["alt_labels"].apply(eval)
+        terminology = []
+        [terminology.extend(t) for t in old_terminology]
+        terminology = [re.sub("([|[\(]).*(]|[\)])?","",x).strip() for x in terminology]
     return TerminologyMatcher( terminology)
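The re.sub call in the non-French branch strips everything from the first '|', '[' or '(' onward, removing bracketed or parenthesised qualifiers from the alternative labels. A quick illustration with a made-up label:

    import re

    label = "sustainable development (policy)"        # hypothetical alt_labels entry
    clean = re.sub(r"([|[\(]).*(]|[\)])?", "", label).strip()
    print(clean)                                      # -> "sustainable development"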
@@ -126,7 +133,7 @@ def matcher_biotex_lda(lang):
     TerminologyMatcher
         matcher
     """
-    df=pd.read_csv(os.path.join(package_directory,"resources/terminology/mixed/lda_top1000_4topic.csv"),index_col=0)
+    df=pd.read_csv(os.path.join(package_directory,"resources/terminology/mixed/lda_top1000_4topic_1500.csv"),index_col=0)
     return TerminologyMatcher( df.term.values.tolist())
...
@@ -240,7 +240,7 @@ class Expansion(Transformation):
             if not "country" in data_.other:
                 continue
             neighbor = self.getAroundEntities(data_, median, distance, unit, n,lang=lang,stop_en=stop_en)
-            stop_en.extend(neighbor)
+            #stop_en.extend(neighbor)
             # if not neighbor:
             #     try:
             #         neighbor = [get_inclusion_chain(node, "P131")[0]]
...
@@ -74,10 +74,10 @@ class Disambiguator(object):
     def get_candidates(self,label,lang):
         candidates=[]
-        candidates.extend(gazetteer.get_by_label(label,lang))
-        candidates.extend(gazetteer.get_by_alias(label, lang,score=False))
-        candidates.extend(gazetteer.get_n_label_similar(label,lang, n=1,score=False))
-        candidates.extend(gazetteer.get_n_alias_similar(label, lang, n=1,score=False))
+        candidates.extend(gazetteer.get_by_label(label,lang,size=3,score=True))
+        candidates.extend(gazetteer.get_by_alias(label, lang,size=3,score=True))
+        candidates.extend(gazetteer.get_n_label_similar(label,lang, n=3,score=False))
+        candidates.extend(gazetteer.get_n_alias_similar(label, lang, n=3,score=False))
         return pd.DataFrame([[
             r.id,
             label,
...
@@ -36,11 +36,11 @@ class BigramModel:
         if uri1 in self.cooc_freq:
             if uri2 in self.cooc_freq[uri1]:
                 return self.cooc_freq[uri1][uri2]
-                #return (self.cooc_freq[uri1][uri2] / self.count_associated[uri1])+pr1
+                #return self.count_associated[uri1]/self.cooc_freq[uri1][uri2]
         elif uri2 in self.cooc_freq:
             if uri1 in self.cooc_freq[uri2]:
                 return self.cooc_freq[uri2][uri1]
-                #return (self.cooc_freq[uri2][uri1] / self.count_associated[uri1])+pr1
+                #return self.count_associated[uri2]/self.cooc_freq[uri2][uri1]
         return nna
...
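To make the lookup above easier to follow: it tries the entity pair in both orders and falls back to nna when no co-occurrence is recorded. A standalone sketch of the same symmetric dictionary lookup, assuming cooc_freq is a {uri: {uri: frequency}} mapping (this is an illustration, not the project's API):

    def cooccurrence(cooc_freq, uri1, uri2, nna=0.0):
        # direct order first, then reversed order, else the fallback value
        if uri1 in cooc_freq and uri2 in cooc_freq[uri1]:
            return cooc_freq[uri1][uri2]
        if uri2 in cooc_freq and uri1 in cooc_freq[uri2]:
            return cooc_freq[uri2][uri1]
        return nna

    cooc = {"GD123": {"GD456": 12}}                     # hypothetical data
    assert cooccurrence(cooc, "GD456", "GD123") == 12   # found via the reversed order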
@@ -126,16 +126,22 @@ class ShareProp(Disambiguator):
         -------
         """
+        from ...models.spatial_relation import get_spatial_relations
+        all = [cand.id for cand in spat_candidates]
+        all.extend([cand.id for cand in fixed_entities])
+        relations = get_spatial_relations(all)
         score_dc = {}
         for cand in spat_candidates:
             id_cand = cand.id
             score_dc[id_cand] = 0
             for fixed in fixed_entities:
                 id_fixed = fixed.id
-                if self.Adjacency_P47(cand, fixed):
-                    score_dc[id_cand] += 3
-                elif self.Adjacency_Hull(id_cand, id_fixed):
+                if relations.adjacency[id_cand][id_fixed] :
                     score_dc[id_cand] += 2
+                # if self.Adjacency_P47(cand, fixed):
+                #     score_dc[id_cand] += 3
+                # elif self.Adjacency_Hull(id_cand, id_fixed):
+                #     score_dc[id_cand] += 2
                 score_dc[id_cand] += self.get_inclusion_score(id_cand, id_fixed)
         m = max(score_dc, key=score_dc.get)
...
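Here get_spatial_relations replaces the per-pair P47 and convex-hull checks with one precomputed relation structure, queried as relations.adjacency[id_cand][id_fixed]; its exact return type is not visible in this diff. Assuming adjacency behaves like a nested boolean mapping keyed by entity id, the new scoring amounts to the following rough sketch (inclusion_score stands in for self.get_inclusion_score and is hypothetical):

    def score_candidates(spat_ids, fixed_ids, adjacency, inclusion_score):
        # +2 for each fixed entity the candidate is adjacent to, plus its inclusion score
        scores = {}
        for cand in spat_ids:
            scores[cand] = 0
            for fixed in fixed_ids:
                if adjacency[cand][fixed]:
                    scores[cand] += 2
                scores[cand] += inclusion_score(cand, fixed)
        return max(scores, key=scores.get)

    adjacency = {"GD1": {"GDF": True}, "GD2": {"GDF": False}}   # hypothetical ids
    assert score_candidates(["GD1", "GD2"], ["GDF"], adjacency, lambda c, f: 0) == "GD1"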
 # coding = utf-8
 import re
+import json
 from .disambiguator import Disambiguator
 from .models.bigram import BigramModel
-import pickle
+import numpy as np
 from ...config.configuration import config
-#from ...helpers.geodict_helpers_old import *
 from ...helpers.geodict_helpers import *
 from .most_common import stop_words,common_words
 import networkx as nx
 from .most_common import MostCommonDisambiguator
+from ...models.spatial_relation import get_spatial_relations
-def read_pickle(fn):
-    return pickle.load(open(fn,'rb'))
+def read_json(fn):
+    return json.load(open(fn,'r'))
 class WikipediaDisambiguator(Disambiguator):
     def __init__(self,measure="degree"):
         Disambiguator.__init__(self,context_based=True)
         # Load model
-        self.model=BigramModel(read_pickle(config.wiki_cooc_dis.cooc_freq),read_pickle(config.wiki_cooc_dis.count))
+        self.model=BigramModel(read_json(config.wiki_cooc_dis.cooc_freq), read_json(config.wiki_cooc_dis.count))
         self.measure=measure
         self.mostcommon = MostCommonDisambiguator()
+        self.geo = False
+        self.most = False
+        self.normalise = True
     def disambiguate_list(self,toponyms,lang):
         result=self.disambiguate_wiki(toponyms,lang)
@@ -47,24 +49,18 @@ class WikipediaDisambiguator(Disambiguator):
         betw_cand={} # indicate which toponym group a candidate belong to #w maybe useless ...
         group_candidate = {} #candidates per toponym
+        most_com = set([])
         for toponym in toponyms_filtered:
             candidates = self.get_candidates(toponym, lang)
             if len(candidates)<1:
                 continue
             f=False
-            for ix,c in candidates.iterrows():
-                if c.id in self.model.cooc_freq :
-                    f=True
-                for ij,c2 in candidates.iterrows():
-                    if c2.id in self.model.cooc_freq and c.id in self.model.cooc_freq[c2.id]:
-                        f=True
-            if not f:
-                candidates=self.mostcommon.disambiguate(lang,toponyms=[toponym])
-                if candidates :
-                    candidates= list(candidates.values())
             if not isinstance(candidates,list):
                 candidates = [c.id for ix,c in candidates.iterrows()]
+            most_com.add( self.mostcommon.disambiguate(lang,toponyms=[toponym])[toponym])
             group_candidate[toponym] = candidates
             betw_cand[toponym]=candidates
             for n in candidates:
@@ -83,17 +79,42 @@ class WikipediaDisambiguator(Disambiguator):
                 sc = d.score
                 # Compute probability
                 prob = self.model.get_coocurence_probability(sc, candidate, candidate2)
-                if candidate2 in betw_cand[candidate] or candidate in betw_cand[candidate2]:
-                    prob = 0.0
-                if prob < 0.0000001:
+                if candidate2 in betw_cand[candidate] or prob < 0.0000001:
                     prob = 0.0
+                if (candidate in most_com) or (candidate2 in most_com):
+                    prob = 1
                 if not candidate == candidate2:
                     # take the lowest co-occurrency between two candidates
                     if g.has_edge(candidate2, candidate) :
                         g.edges[candidate2, candidate]["weight"] += prob
                     # if g.edges[candidate2,candidate]["weight"] < prob:
                     #     continue
-                    g.add_edge(candidate, candidate2, weight=prob)
+                    else:
+                        g.add_edge(candidate, candidate2, weight=prob)
+
+        if self.geo or self.normalise:
+            if nx.get_edge_attributes(g, "weight"):
+                max_weight = np.max([val for _, val in nx.get_edge_attributes(g, "weight").items()])
+                for item in list(g.edges(data=True)):
+                    src, target, att = item
+                    g.edges[src, target]["weight"] = att["weight"] / max_weight
+
+        if self.geo:
+            if nx.get_edge_attributes(g,"weight"):
+                spatial_relations = get_spatial_relations(possible_candidates)
+                for item in list(g.edges(data=True)):
+                    src, target, att = item
+                    if spatial_relations["inclusion"][src][target] or spatial_relations["inclusion"][src][target] :
+                        g.edges[src, target]["weight"] *= 1.25
+                    if spatial_relations["adjacency"][src][target] or spatial_relations["adjacency"][src][target] :
+                        g.edges[src, target]["weight"] *= 1.5
+
+        if self.most:
+            for item in list(g.edges(data=True)):
+                src, target, att = item
+                if src in most_com or target in most_com:
+                    g.edges[src, target]["weight"] *= 1.25
+
         selected = {}
         #Take the candidates with the highest degree weighted
...
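The new post-processing normalises every co-occurrence edge weight by the graph's maximum weight (when self.geo or self.normalise is set) before the optional spatial and most-common boosts. A self-contained sketch of that normalisation step on a toy networkx graph, with made-up candidate ids:

    import networkx as nx
    import numpy as np

    g = nx.Graph()
    g.add_edge("GD1", "GD2", weight=4.0)       # hypothetical candidate ids and weights
    g.add_edge("GD2", "GD3", weight=1.0)

    weights = nx.get_edge_attributes(g, "weight")
    if weights:                                # skip empty graphs, as in the patch
        max_weight = np.max(list(weights.values()))
        for src, target, att in g.edges(data=True):
            g.edges[src, target]["weight"] = att["weight"] / max_weight

    print(nx.get_edge_attributes(g, "weight"))  # {('GD1', 'GD2'): 1.0, ('GD2', 'GD3'): 0.25}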