From 4d787b853e59d9ee7870c1f91c50c612f347161f Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Wed, 19 Jun 2019 00:38:15 +0200
Subject: [PATCH] DEBUG

---
 eval_disambiguation.py                        |  4 +-
 strpython/eval/disambiguation.py              | 11 ++++--
 strpython/models/spatial_relation.py          |  4 +-
 strpython/models/str.py                       | 38 ++++++++++---------
 strpython/models/transformation/thematic.py   |  1 +
 strpython/models/transformation/transform.py  | 24 +++++++-----
 strpython/nlp/disambiguator/disambiguator.py  |  4 +-
 strpython/nlp/disambiguator/models/bigram.py  |  1 +
 strpython/nlp/disambiguator/share_prop.py     | 36 ++++++++----------
 strpython/nlp/disambiguator/wikipedia_cooc.py | 23 ++++++++---
 strpython/pipeline.py                         |  2 +-
 11 files changed, 86 insertions(+), 62 deletions(-)

diff --git a/eval_disambiguation.py b/eval_disambiguation.py
index 4d5ca5c..dcec725 100644
--- a/eval_disambiguation.py
+++ b/eval_disambiguation.py
@@ -26,9 +26,9 @@ if args.corpus_name == "padiweb":
 else:
     corpus_dir = "data/disambiguation_data/mada_disambiguisation"
-    data_lang = json.load(open("/Users/jacquesfize/LOD_DATASETS/raw_bvlac/associated_lang.json"))
+    data_lang = json.load(open("/Users/jacquesfize/DATA_THESIS/BVLAC/raw_bvlac/associated_lang.json"))
 
-data_lang = {int(k): v for k, v in data_lang.items()}
+data_lang = {int(k): (v if v in ["fr",'en'] else "en") for k, v in data_lang.items()}
 corpus_files=glob.glob("{0}/*.csv".format(corpus_dir))
 acc_MC,acc_GEO,acc_wiki=[],[],[]
 i=0
diff --git a/strpython/eval/disambiguation.py b/strpython/eval/disambiguation.py
index ce64473..11e31e1 100644
--- a/strpython/eval/disambiguation.py
+++ b/strpython/eval/disambiguation.py
@@ -33,7 +33,12 @@ def dist(id1, id2):
 
 def efficiencyMostCommon(df, lang, score="accuracy",k=1):
     df2 = df[-df["GID"].isin(["O", "NR", "o"])][["text", "GID"]]
-    df2["disambiguation"] = df2.text.apply(lambda x: disMost_common.disambiguate_(x, lang)[0])
+    def foo(lang,x):
+        res = disMost_common.disambiguate(lang, toponyms=[x])
+        if x in res:
+            return res[x]
+        return "O"
+    df2["disambiguation"] = df2.text.apply(lambda x:foo(lang,x))
     if score == "mean_distance_error":
         df2["distance"] = df2.apply(lambda row: dist(row.GID, row.disambiguation) if "GID" in row else -1, axis=1)
         return df2["distance"][df2["distance"] >= 0].mean()
@@ -45,7 +50,7 @@ def efficiencyMostCommon(df, lang, score="accuracy",k=1):
 
 def efficiencyGeodict(df, lang, score="accuracy",k=1):
     df2 = df[-df["GID"].isin(["O", "NR", "o"])][["text", "GID"]]
-    res_dis = disGaurav.eval(df2["text"].unique(), lang)
+    res_dis = disGaurav.disambiguate(lang,toponyms=df2["text"].unique().tolist())
     df2["disambiguation"] = df2.text.apply(lambda x: res_dis[x] if x in res_dis else None)
     if score == "mean_distance_error":
         df2["distance"] = df2.apply(lambda row: dist(row.GID, row.disambiguation) if "GID" in row else -1, axis=1)
@@ -59,7 +64,7 @@ def efficiencyGeodict(df, lang, score="accuracy",k=1):
 
 def efficiencyWiki(df, lang, score="accuracy",k=1):
     df2 = df[-df["GID"].isin(["O", "NR", "o"])][["text", "GID"]]
-    res_dis = disWiki.disambiguate_wiki(df2["text"].unique(), lang)
+    res_dis = disWiki.disambiguate(lang,toponyms=df2["text"].unique().tolist())
     df2["disambiguation"] = df2.text.apply(lambda x: res_dis[x] if x in res_dis else None)
     if score == "mean_distance_error":
         df2["distance"] = df2.apply(lambda row: dist(row.GID, row.disambiguation) if "GID" in row else -1, axis=1)
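Review note on eval_disambiguation.py and strpython/eval/disambiguation.py: all three evaluation helpers now go through the shared `disambiguate(lang, toponyms=[...])` entry point, which returns a dict keyed by toponym; toponyms the model cannot resolve are simply absent from the result, which is why `foo` falls back to the "O" (no entity) tag. A minimal sketch of that defensive-lookup pattern (the helper name `safe_disambiguate` and its default argument are illustrative, not part of the patch):

    def safe_disambiguate(disambiguator, lang, toponym, default="O"):
        # disambiguate() returns {toponym: entity_id}; unresolved toponyms
        # are missing from the result, so map them onto the default tag.
        result = disambiguator.disambiguate(lang, toponyms=[toponym])
        return result.get(toponym, default)
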
diff --git a/strpython/models/spatial_relation.py b/strpython/models/spatial_relation.py
index c844c5d..f1df8b1 100644
--- a/strpython/models/spatial_relation.py
+++ b/strpython/models/spatial_relation.py
@@ -259,8 +259,8 @@ class AdjacencyMetaRelation(MetaCollector):
             for se2 in spatial_entities:
                 data_se1, data_se2 = data[se1], data[se2]
                 if data_se1 and data_se2 and "coord" in data_se1 and "coord" in data_se2:
-                    not_in_stop = len(set(data_se1.class_) & stop_class) < 1 and len(
-                        set(data_se2.class_) & stop_class) < 1
+                    not_in_stop = len(set(data_se1.class_) & stop_class) < 0.5 and len(
+                        set(data_se2.class_) & stop_class) < 0.5
                     self.distances_is_inf_to[se1][se2] = dist_all[se1][se2] < max_d and not_in_stop
                 else:
                     self.distances_is_inf_to[se1][se2] = False
diff --git a/strpython/models/str.py b/strpython/models/str.py
index d330528..1aeab90 100644
--- a/strpython/models/str.py
+++ b/strpython/models/str.py
@@ -213,7 +213,7 @@ class STR(object):
             warnings.warn("Label empty. @en label from Geo-Database will be used.")
             label = data_["en"]
         self.spatial_entities[id] = label
-        self.graph.add_node(id, label=label,type="SE")
+        self.graph.add_node(id, label=label,type="S_E")
 
     def add_spatial_entities(self, ids: list, labels: list = []):
@@ -656,33 +656,37 @@ class STR(object):
     def plot(self, title="STR", output_fn=None,se_color ="#4183d7",te_color="#d64541",inc_edge_color="r",
-             adj_edge_color="g",them_edge_color = "b",figsize=(7,7)):
+             adj_edge_color="g",them_edge_color = "b",figsize=(7,7),scale=2,node_size=700,layout_func=nx.shell_layout,dech=0):
         import matplotlib.pyplot as plt
         plt.figure(figsize=figsize)
 
         G = self.graph.copy()
-        pos = nx.shell_layout(G, scale=0.5)
+        pos = layout_func(G, scale=scale)
+        #pos = nx.layout.shell_layout(ext_2_t.graph)
 
-        nx.draw_networkx_nodes(G, pos,
-                               nodelist=[n[0] for n in list(G.nodes(data=True)) if n[1]["type"] == "S_E"],
-                               node_color=se_color, node_size=500)
+        nodes = list(G.nodes(data=True))
+        max_n_char= ([len(n[1]["label"]) * node_size for n in nodes])
 
         nx.draw_networkx_nodes(G, pos,
-                               nodelist=[n[0] for n in list(G.nodes(data=True)) if n[1]["type"] == "T_E"],
-                               node_color=te_color, node_size=500)
-
+                               nodelist=[n[0] for n in nodes if n[1]["type"] == "S_E"],
+                               node_color=se_color, node_size=max_n_char)
 
-        nx.draw_networkx_labels(G, nx.shell_layout(G, scale=0.5), labels={n[0]: n[1]["label"] for n in G.nodes(data=True)})
-        nx.draw_networkx_edges(G, pos, edgelist=[ed for ed in G.edges(data=True) if ed[2]["type_"] == "inc"],
-                               edge_color=inc_edge_color, arrows=True)
-        nx.draw_networkx_edges(G, pos, edgelist=[ed for ed in G.edges(data=True) if ed[2]["type_"] == "adj"],
-                               edge_color=adj_edge_color, arrows=True)
-        nx.draw_networkx_edges(G, pos, edgelist=[ed for ed in G.edges(data=True) if ed[2]["type_"] == "them"],
-                               edge_color=them_edge_color, arrows=True)
+        nx.draw_networkx_nodes(G, pos,
+                               nodelist=[n[0] for n in nodes if n[1]["type"] == "T_E"],
+                               node_color=te_color, node_size=max_n_char)
+
+        edges = list(G.edges(data=True))
+        nx.draw_networkx_labels(G, pos, labels={n[0]: n[1]["label"] for n in nodes},font_color='w')
+        nx.draw_networkx_edges(G, pos, edgelist=[ed for ed in edges if ed[2]["type_"] == "inc"],
+                               edge_color=inc_edge_color, arrows=True,width=1.5)
+        nx.draw_networkx_edges(G, pos, edgelist=[ed for ed in edges if ed[2]["type_"] == "adj"],
+                               edge_color=adj_edge_color, arrows=True,width=1.5)
+        nx.draw_networkx_edges(G, pos, edgelist=[ed for ed in edges if ed[2]["type_"] == "them"],
+                               edge_color=them_edge_color, arrows=True,width=1.5)
 
         plt.title(title)
         plt.axis('off')
-        plt.margins(0.1)
+        plt.margins(0.2)
         if output_fn:
             plt.savefig(output_fn, bbox_inches='tight')
         else:
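Review note on the two hunks above. In spatial_relation.py, `len(...) < 0.5` on an integer length is equivalent to the previous `< 1` (both mean "the intersection with stop_class is empty"), so that change is cosmetic. In STR.plot, networkx accepts a per-node `node_size` sequence, but it must be parallel to `nodelist`: `max_n_char` is built over all nodes while each `draw_networkx_nodes` call filters its nodelist by type, so the sizes can drift out of alignment. A sketch of keeping them paired (standalone example with an invented toy graph, not the patched method):

    import networkx as nx
    import matplotlib.pyplot as plt

    G = nx.DiGraph()
    G.add_node("SE1", label="Madagascar", type="S_E")
    G.add_node("SE2", label="Antananarivo", type="S_E")
    G.add_node("TE1", label="rice", type="T_E")

    pos = nx.shell_layout(G, scale=2)
    for type_, color in [("S_E", "#4183d7"), ("T_E", "#d64541")]:
        nodelist = [n for n, d in G.nodes(data=True) if d["type"] == type_]
        # sizes are derived from the same filtered nodelist, so each size
        # stays attached to the node it was computed for
        sizes = [len(G.nodes[n]["label"]) * 700 for n in nodelist]
        nx.draw_networkx_nodes(G, pos, nodelist=nodelist, node_color=color, node_size=sizes)
    nx.draw_networkx_labels(G, pos, labels={n: d["label"] for n, d in G.nodes(data=True)}, font_color="w")
    plt.axis("off")
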
diff --git a/strpython/models/transformation/thematic.py b/strpython/models/transformation/thematic.py
index fbc4586..a7df8db 100644
--- a/strpython/models/transformation/thematic.py
+++ b/strpython/models/transformation/thematic.py
@@ -53,6 +53,7 @@ def get_extended_with_thematic(extended_str, thematic_str):
             if not them in new_ext.thematic_entities:
                 new_ext.add_thematic_entities(them, thematic_str.thematic_entities[them])
                 new_ext.graph.add_node(them, label=thematic_str.thematic_entities[them], type="T_E")
+            print(es,them)
             new_ext.graph.add_edge(es, them, color="blue",type_="them")
             new_ext.add_thematic_relationships(es, them)
     return new_ext
diff --git a/strpython/models/transformation/transform.py b/strpython/models/transformation/transform.py
index 8cb788a..840b72e 100644
--- a/strpython/models/transformation/transform.py
+++ b/strpython/models/transformation/transform.py
@@ -159,7 +159,7 @@ class Generalisation(Transformation):
 
 class Expansion(Transformation):
 
-    def getAroundEntities(self, data, score, distance=150, unit="km", n=1):
+    def getAroundEntities(self, data, score, distance=100, unit="km", n=1,lang="fr",stop_en=[]):
         if not "coord" in data:
             return []
         hits = client.search("gazetteer", "place", {
@@ -168,13 +168,14 @@
                     "must": [
                         {"match_all": {}},
                         {"exists": {"field": "score"}},  # Get place with high score
-                        {"terms": {"class": ["P-PPL", "A-ADM4", "P-PPLC"]}},
+                        #{"terms": {"class": ["P-PPL", "A-ADM4", "P-PPLC"]}},
                         # Populated Settlement, Last administration level, Capital
                         {"range": {"score": {"gt": score}}},  # Has a higher score (PR)
-                        {"term": {"country": data.other["country"]}}  # stay in the same country
+                        #{"term": {"country": data.other["country"]}}  # stay in the same country
                     ],
                     "must_not": [
-                        {"terms": {"class": ["A-ADM3", "A-ADM2", "A-ADM1"]}}  # No region, departement, ... !
+                        {"terms": {"class": ["A-ADM3", "A-ADM2", "A-ADM1"]}},
+                        {"terms": {lang: stop_en}},
                     ],
                     "filter": {
                         "geo_distance": {
@@ -206,14 +207,15 @@
     def transform(self, str_: STR, **kwargs):
         type_ = "adjacency"
-        distance = kwargs.get("distance", 150)
-        unit = kwargs.get("unit", 150)
+        distance = kwargs.get("distance", 100)
+        unit = kwargs.get("unit", 100)
         n = kwargs.get("adjacent_count", 1)
         cp = kwargs.get("cp", True)
+        lang = kwargs.get("lang","fr")
         if type_ == "adjacency":
-            return self.transform_adj(str_, distance, unit, n, cp)
+            return self.transform_adj(str_, distance, unit, n, lang, cp)
 
-    def transform_adj(self, str_: STR, distance: int, unit: str, n: int, cp=True) -> STR:
+    def transform_adj(self, str_: STR, distance: int, unit: str, n: int,lang:str, cp=True) -> STR:
         graph = str_.graph
         median, selected_se = self.select_es(graph)
         data_se, scores_ = {}, []
@@ -230,19 +232,21 @@
         new_nodes = []
         labels = []
+        stop_en = list(str_.spatial_entities.keys())
         for node in selected_se:
             data_ = data_se[node]
             if (not "P-PPL" in data_.class_) and (not "A-ADM4" in data_.class_):
                 continue
             if not "country" in data_.other:
                 continue
-            neighbor = self.getAroundEntities(data_, median, distance, unit, n)
+            neighbor = self.getAroundEntities(data_, median, distance, unit, n,lang=lang,stop_en=stop_en)
+            stop_en.extend(neighbor)
             # if not neighbor:
             #     try:
             #         neighbor = [get_inclusion_chain(node, "P131")[0]]
             #     except:
             #         neighbor = []
-            labels.extend([gazetteer.get_by_id(n)[0].label.en for n in neighbor])
+            labels.extend([gazetteer.get_by_id(n)[0].label[lang] for n in neighbor])
             new_nodes.extend(neighbor)
 
         new_nodes = list(set(new_nodes))
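Review note on Expansion: `getAroundEntities` now receives a `stop_en` list (the entities already in the STR, extended with each batch of retrieved neighbours) and excludes them in the Elasticsearch query, while the class and same-country constraints are commented out. A minimal sketch of the query body being built, trimmed to the active clauses (a plain dict only; the `score`, `class`, `coord` and per-language label fields follow the patch, the example values are invented):

    def around_query(coord, score, stop_labels, lang="fr", distance=100, unit="km"):
        # bool query: places scoring above `score`, within `distance` of
        # `coord`, excluding regions/departments and already-used labels
        return {
            "query": {
                "bool": {
                    "must": [
                        {"exists": {"field": "score"}},
                        {"range": {"score": {"gt": score}}},
                    ],
                    "must_not": [
                        {"terms": {"class": ["A-ADM3", "A-ADM2", "A-ADM1"]}},
                        {"terms": {lang: stop_labels}},
                    ],
                    "filter": {
                        "geo_distance": {
                            "distance": "{0}{1}".format(distance, unit),
                            "coord": coord,
                        }
                    },
                }
            }
        }

    body = around_query({"lat": -18.9, "lon": 47.5}, score=0.5,
                        stop_labels=["Antananarivo"])
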
+ {"terms": {"class": ["A-ADM3", "A-ADM2", "A-ADM1"]}}, + {"terms": {lang: stop_en}}, ], "filter": { "geo_distance": { @@ -206,14 +207,15 @@ class Expansion(Transformation): def transform(self, str_: STR, **kwargs): type_ = "adjacency" - distance = kwargs.get("distance", 150) - unit = kwargs.get("unit", 150) + distance = kwargs.get("distance", 100) + unit = kwargs.get("unit", 100) n = kwargs.get("adjacent_count", 1) cp = kwargs.get("cp", True) + lang = kwargs.get("lang","fr") if type_ == "adjacency": - return self.transform_adj(str_, distance, unit, n, cp) + return self.transform_adj(str_, distance, unit, n, lang, cp) - def transform_adj(self, str_: STR, distance: int, unit: str, n: int, cp=True) -> STR: + def transform_adj(self, str_: STR, distance: int, unit: str, n: int,lang:str, cp=True) -> STR: graph = str_.graph median, selected_se = self.select_es(graph) data_se, scores_ = {}, [] @@ -230,19 +232,21 @@ class Expansion(Transformation): new_nodes = [] labels = [] + stop_en = list(str_.spatial_entities.keys()) for node in selected_se: data_ = data_se[node] if (not "P-PPL" in data_.class_) and (not "A-ADM4" in data_.class_): continue if not "country" in data_.other: continue - neighbor = self.getAroundEntities(data_, median, distance, unit, n) + neighbor = self.getAroundEntities(data_, median, distance, unit, n,lang=lang,stop_en=stop_en) + stop_en.extend(neighbor) # if not neighbor: # try: # neighbor = [get_inclusion_chain(node, "P131")[0]] # except: # neighbor = [] - labels.extend([gazetteer.get_by_id(n)[0].label.en for n in neighbor]) + labels.extend([gazetteer.get_by_id(n)[0].label[lang] for n in neighbor]) new_nodes.extend(neighbor) new_nodes = list(set(new_nodes)) diff --git a/strpython/nlp/disambiguator/disambiguator.py b/strpython/nlp/disambiguator/disambiguator.py index 7d8f354..9327a2b 100644 --- a/strpython/nlp/disambiguator/disambiguator.py +++ b/strpython/nlp/disambiguator/disambiguator.py @@ -76,8 +76,8 @@ class Disambiguator(object): candidates=[] candidates.extend(gazetteer.get_by_label(label,lang)) candidates.extend(gazetteer.get_by_alias(label, lang,score=False)) - #candidates.extend(gazetteer.get_n_label_similar(label,lang, n=1,score=False)) - #candidates.extend(gazetteer.get_n_alias_similar(label, lang, n=1,score=False)) + candidates.extend(gazetteer.get_n_label_similar(label,lang, n=1,score=False)) + candidates.extend(gazetteer.get_n_alias_similar(label, lang, n=1,score=False)) return pd.DataFrame([[ r.id, label, diff --git a/strpython/nlp/disambiguator/models/bigram.py b/strpython/nlp/disambiguator/models/bigram.py index ec146b4..c13701b 100644 --- a/strpython/nlp/disambiguator/models/bigram.py +++ b/strpython/nlp/disambiguator/models/bigram.py @@ -41,6 +41,7 @@ class BigramModel: if uri1 in self.cooc_freq[uri2]: return self.cooc_freq[uri2][uri1] #return (self.cooc_freq[uri2][uri1] / self.count_associated[uri1])+pr1 + return nna diff --git a/strpython/nlp/disambiguator/share_prop.py b/strpython/nlp/disambiguator/share_prop.py index 7637faf..40c8401 100644 --- a/strpython/nlp/disambiguator/share_prop.py +++ b/strpython/nlp/disambiguator/share_prop.py @@ -73,7 +73,7 @@ class ShareProp(Disambiguator): # return fib_no[interP131]+fib_no[interP706] return self.inclusion_log(interP131) + self.inclusion_log(interP706) - def Adjacency_P47(self, id1, id2): + def Adjacency_P47(self, es1, es2): """ Return true, if two spatial entities are found adjacent using the P47 property (share borders) from Wikidata. 
diff --git a/strpython/nlp/disambiguator/wikipedia_cooc.py b/strpython/nlp/disambiguator/wikipedia_cooc.py
index 27abd75..70391df 100644
--- a/strpython/nlp/disambiguator/wikipedia_cooc.py
+++ b/strpython/nlp/disambiguator/wikipedia_cooc.py
@@ -9,6 +9,7 @@ from ...config.configuration import config
 from ...helpers.geodict_helpers import *
 from .most_common import stop_words,common_words
 import networkx as nx
+from .most_common import MostCommonDisambiguator
 
 def read_pickle(fn):
     return pickle.load(open(fn,'rb'))
@@ -20,7 +21,7 @@ class WikipediaDisambiguator(Disambiguator):
         # Load model
         self.model=BigramModel(read_pickle(config.wiki_cooc_dis.cooc_freq),read_pickle(config.wiki_cooc_dis.count))
         self.measure=measure
-
+        self.mostcommon = MostCommonDisambiguator()
 
     def disambiguate_list(self,toponyms,lang):
         result=self.disambiguate_wiki(toponyms,lang)
@@ -50,7 +51,20 @@ class WikipediaDisambiguator(Disambiguator):
             candidates = self.get_candidates(toponym, lang)
             if len(candidates)<1:
                 continue
-            candidates = [c.id for ix,c in candidates.iterrows()]
+            f=False
+            for ix,c in candidates.iterrows():
+                if c.id in self.model.cooc_freq :
+                    f=True
+                for ij,c2 in candidates.iterrows():
+                    if c2.id in self.model.cooc_freq and c.id in self.model.cooc_freq[c2.id]:
+                        f=True
+            if not f:
+                candidates=self.mostcommon.disambiguate(lang,toponyms=[toponym])
+                if candidates :
+                    candidates= list(candidates.values())
+
+            if not isinstance(candidates,list):
+                candidates = [c.id for ix,c in candidates.iterrows()]
             group_candidate[toponym] = candidates
             betw_cand[toponym]=candidates
             for n in candidates:
@@ -61,14 +75,14 @@ class WikipediaDisambiguator(Disambiguator):
                 g.add_node(candidate, label=gazetteer.get_by_id(candidate)[0].label[lang])
 
         data_candidate={ca :gazetteer.get_by_id(ca)[0] for ca in possible_candidates}
+
         for candidate in possible_candidates:
             for candidate2 in possible_candidates:
-                # Get PageRank score
+
                 d = data_candidate[candidate]
                 sc = d.score
                 # Compute probability
                 prob = self.model.get_coocurence_probability(sc, candidate, candidate2)
-
                 if candidate2 in betw_cand[candidate] or candidate in betw_cand[candidate2]:
                     prob = 0.0
                 if prob < 0.0000001:
@@ -82,7 +96,6 @@ class WikipediaDisambiguator(Disambiguator):
                 g.add_edge(candidate, candidate2, weight=prob)
 
         selected = {}
-
         #Take the candidates with the highest degree weighted
         for gr in group_candidate:
             if self.measure == "degree":
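Review note on WikipediaDisambiguator: when none of a toponym's candidates appear in the co-occurrence table, the bigram model has no evidence to rank them, so the toponym is now delegated to `MostCommonDisambiguator`. The membership test in the hunk keeps scanning every candidate pair even after finding a hit; it can short-circuit, as in this compact equivalent (standalone helper, assuming `cooc_freq` is the nested dict used by `BigramModel`):

    def has_cooc_evidence(candidate_ids, cooc_freq):
        # True as soon as one candidate occurs in the co-occurrence table,
        # either as a key or inside another candidate's co-occurrence dict.
        for c in candidate_ids:
            if c in cooc_freq:
                return True
            for c2 in candidate_ids:
                if c2 in cooc_freq and c in cooc_freq[c2]:
                    return True
        return False
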
diff --git a/strpython/pipeline.py b/strpython/pipeline.py
index 0ecc279..a25a66a 100644
--- a/strpython/pipeline.py
+++ b/strpython/pipeline.py
@@ -145,7 +145,7 @@ class Pipeline(object):
             str_.build()
         return str_
 
-    def pipe_transform(self,strs_, cpu_count=cpu_count(),**kwargs):
+    def pipe_transform(self,strs_,**kwargs):
         str_s = [ self.transform(str_, **kwargs) for str_ in tqdm(strs_,desc="Transform STR", disable=(not self.verbose))]
         return str_s
 
-- 
GitLab
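Review note on Pipeline.pipe_transform: the unused `cpu_count` parameter is dropped; the method stays sequential and simply forwards its keyword arguments to `transform`. A hypothetical call matching the Expansion defaults above (`pipeline` and `strs_` are assumed to already exist):

    transformed = pipeline.pipe_transform(strs_, distance=100, unit="km",
                                          adjacent_count=1, lang="fr", cp=True)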