From 4d787b853e59d9ee7870c1f91c50c612f347161f Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Wed, 19 Jun 2019 00:38:15 +0200
Subject: [PATCH] DEBUG: unify disambiguator API calls, rework STR plotting and expansion

---
 eval_disambiguation.py                        |  4 +-
 strpython/eval/disambiguation.py              | 11 ++++--
 strpython/models/spatial_relation.py          |  4 +-
 strpython/models/str.py                       | 38 ++++++++++---------
 strpython/models/transformation/thematic.py   |  1 +
 strpython/models/transformation/transform.py  | 24 +++++++-----
 strpython/nlp/disambiguator/disambiguator.py  |  4 +-
 strpython/nlp/disambiguator/models/bigram.py  |  1 +
 strpython/nlp/disambiguator/share_prop.py     | 36 ++++++++----------
 strpython/nlp/disambiguator/wikipedia_cooc.py | 23 ++++++++---
 strpython/pipeline.py                         |  2 +-
 11 files changed, 86 insertions(+), 62 deletions(-)
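
Reviewer note: the evaluation helpers in strpython/eval/disambiguation.py and the
Wikipedia disambiguator now go through one shared entry point,
disambiguate(lang, toponyms=[...]), which returns a dict mapping each toponym to a
gazetteer id. A minimal usage sketch, assuming the repo is importable and a
gazetteer backend is configured; the toponyms and the "fr" language code below are
illustrative only:

    from strpython.nlp.disambiguator.most_common import MostCommonDisambiguator

    dis = MostCommonDisambiguator()
    # expected shape: {"Paris": "<gazetteer id>", "Montpellier": "<gazetteer id>"}
    result = dis.disambiguate("fr", toponyms=["Paris", "Montpellier"])
    print(result)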

diff --git a/eval_disambiguation.py b/eval_disambiguation.py
index 4d5ca5c..dcec725 100644
--- a/eval_disambiguation.py
+++ b/eval_disambiguation.py
@@ -26,9 +26,9 @@ if args.corpus_name == "padiweb":
 
 else:
     corpus_dir = "data/disambiguation_data/mada_disambiguisation"
-    data_lang = json.load(open("/Users/jacquesfize/LOD_DATASETS/raw_bvlac/associated_lang.json"))
+    data_lang = json.load(open("/Users/jacquesfize/DATA_THESIS/BVLAC/raw_bvlac/associated_lang.json"))
 
-data_lang = {int(k): v for k, v in data_lang.items()}
+data_lang = {int(k): (v if v in ["fr", "en"] else "en") for k, v in data_lang.items()}
 corpus_files=glob.glob("{0}/*.csv".format(corpus_dir))
 acc_MC,acc_GEO,acc_wiki=[],[],[]
 i=0
diff --git a/strpython/eval/disambiguation.py b/strpython/eval/disambiguation.py
index ce64473..11e31e1 100644
--- a/strpython/eval/disambiguation.py
+++ b/strpython/eval/disambiguation.py
@@ -33,7 +33,12 @@ def dist(id1, id2):
 
 def efficiencyMostCommon(df, lang, score="accuracy",k=1):
     df2 = df[-df["GID"].isin(["O", "NR", "o"])][["text", "GID"]]
-    df2["disambiguation"] = df2.text.apply(lambda x: disMost_common.disambiguate_(x, lang)[0])
+    def disambiguate_one(lang, x):
+        # "O" marks toponyms the most-common disambiguator could not resolve
+        res = disMost_common.disambiguate(lang, toponyms=[x])
+        return res[x] if x in res else "O"
+
+    df2["disambiguation"] = df2.text.apply(lambda x: disambiguate_one(lang, x))
     if score == "mean_distance_error":
         df2["distance"] = df2.apply(lambda row: dist(row.GID, row.disambiguation) if "GID" in row else -1, axis=1)
         return df2["distance"][df2["distance"] >= 0].mean()
@@ -45,7 +50,7 @@ def efficiencyMostCommon(df, lang, score="accuracy",k=1):
 
 def efficiencyGeodict(df, lang, score="accuracy",k=1):
     df2 = df[-df["GID"].isin(["O", "NR", "o"])][["text", "GID"]]
-    res_dis = disGaurav.eval(df2["text"].unique(), lang)
+    res_dis = disGaurav.disambiguate(lang,toponyms=df2["text"].unique().tolist())
     df2["disambiguation"] = df2.text.apply(lambda x: res_dis[x] if x in res_dis else None)
     if score == "mean_distance_error":
         df2["distance"] = df2.apply(lambda row: dist(row.GID, row.disambiguation) if "GID" in row else -1, axis=1)
@@ -59,7 +64,7 @@ def efficiencyGeodict(df, lang, score="accuracy",k=1):
 
 def efficiencyWiki(df, lang, score="accuracy",k=1):
     df2 = df[-df["GID"].isin(["O", "NR", "o"])][["text", "GID"]]
-    res_dis = disWiki.disambiguate_wiki(df2["text"].unique(), lang)
+    res_dis = disWiki.disambiguate(lang,toponyms=df2["text"].unique().tolist())
     df2["disambiguation"] = df2.text.apply(lambda x: res_dis[x] if x in res_dis else None)
     if score == "mean_distance_error":
         df2["distance"] = df2.apply(lambda row: dist(row.GID, row.disambiguation) if "GID" in row else -1, axis=1)
diff --git a/strpython/models/spatial_relation.py b/strpython/models/spatial_relation.py
index c844c5d..f1df8b1 100644
--- a/strpython/models/spatial_relation.py
+++ b/strpython/models/spatial_relation.py
@@ -259,8 +259,8 @@ class AdjacencyMetaRelation(MetaCollector):
             for se2 in spatial_entities:
                 data_se1, data_se2 = data[se1], data[se2]
                 if data_se1 and data_se2 and "coord" in data_se1 and "coord" in data_se2:
-                    not_in_stop = len(set(data_se1.class_) & stop_class) < 1 and len(
-                        set(data_se2.class_) & stop_class) < 1
+                    not_in_stop = len(set(data_se1.class_) & stop_class) < 0.5 and len(
+                        set(data_se2.class_) & stop_class) < 0.5
                     self.distances_is_inf_to[se1][se2] = dist_all[se1][se2] < max_d and not_in_stop
                 else:
                     self.distances_is_inf_to[se1][se2] = False
diff --git a/strpython/models/str.py b/strpython/models/str.py
index d330528..1aeab90 100644
--- a/strpython/models/str.py
+++ b/strpython/models/str.py
@@ -213,7 +213,7 @@ class STR(object):
             warnings.warn("Label empty. @en label from Geo-Database will be used.")
             label = data_["en"]
         self.spatial_entities[id] = label
-        self.graph.add_node(id, label=label,type="SE")
+        self.graph.add_node(id, label=label,type="S_E")
 
 
     def add_spatial_entities(self, ids: list, labels: list = []):
@@ -656,33 +656,37 @@ class STR(object):
 
 
     def plot(self, title="STR", output_fn=None,se_color ="#4183d7",te_color="#d64541",inc_edge_color="r",
-             adj_edge_color="g",them_edge_color = "b",figsize=(7,7)):
+             adj_edge_color="g",them_edge_color = "b",figsize=(7,7),scale=2,node_size=700,layout_func=nx.shell_layout,dech=0):
 
         import matplotlib.pyplot as plt
         plt.figure(figsize=figsize)
         G = self.graph.copy()
-        pos = nx.shell_layout(G, scale=0.5)
+        pos = layout_func(G, scale=scale)
+        #pos = nx.layout.shell_layout(ext_2_t.graph)
 
-        nx.draw_networkx_nodes(G, pos,
-                            nodelist=[n[0] for n in list(G.nodes(data=True)) if n[1]["type"] == "S_E"],
-                            node_color=se_color, node_size=500)
+        nodes = list(G.nodes(data=True))
+        max_n_char = max(len(n[1]["label"]) for n in nodes) * node_size  # size nodes to fit the longest label
 
         nx.draw_networkx_nodes(G, pos,
-                                nodelist=[n[0] for n in list(G.nodes(data=True)) if n[1]["type"] == "T_E"],
-                                node_color=te_color, node_size=500)
-
+                            nodelist=[n[0] for n in nodes if n[1]["type"] == "S_E"],
+                            node_color=se_color, node_size=max_n_char)
 
-        nx.draw_networkx_labels(G, nx.shell_layout(G, scale=0.5), labels={n[0]: n[1]["label"] for n in G.nodes(data=True)})
-        nx.draw_networkx_edges(G, pos, edgelist=[ed for ed in G.edges(data=True) if ed[2]["type_"] == "inc"],
-                               edge_color=inc_edge_color, arrows=True)
-        nx.draw_networkx_edges(G, pos, edgelist=[ed for ed in G.edges(data=True) if ed[2]["type_"] == "adj"],
-                               edge_color=adj_edge_color, arrows=True)
-        nx.draw_networkx_edges(G, pos, edgelist=[ed for ed in G.edges(data=True) if ed[2]["type_"] == "them"],
-                               edge_color=them_edge_color, arrows=True)
+        nx.draw_networkx_nodes(G, pos,
+                                nodelist=[n[0] for n in nodes if n[1]["type"] == "T_E"],
+                                node_color=te_color, node_size=max_n_char)
+
+        edges = list(G.edges(data=True))
+        nx.draw_networkx_labels(G, pos, labels={n[0]: n[1]["label"] for n in nodes},font_color='w')
+        nx.draw_networkx_edges(G, pos, edgelist=[ed for ed in edges if ed[2]["type_"] == "inc"],
+                               edge_color=inc_edge_color, arrows=True,width=1.5)
+        nx.draw_networkx_edges(G, pos, edgelist=[ed for ed in edges if ed[2]["type_"] == "adj"],
+                               edge_color=adj_edge_color, arrows=True,width=1.5)
+        nx.draw_networkx_edges(G, pos, edgelist=[ed for ed in edges if ed[2]["type_"] == "them"],
+                               edge_color=them_edge_color, arrows=True,width=1.5)
 
         plt.title(title)
         plt.axis('off')
-        plt.margins(0.1)
+        plt.margins(0.2)
         if output_fn:
             plt.savefig(output_fn, bbox_inches='tight')
         else:
diff --git a/strpython/models/transformation/thematic.py b/strpython/models/transformation/thematic.py
index fbc4586..a7df8db 100644
--- a/strpython/models/transformation/thematic.py
+++ b/strpython/models/transformation/thematic.py
@@ -53,6 +53,7 @@ def get_extended_with_thematic(extended_str, thematic_str):
             if not them in new_ext.thematic_entities:
                 new_ext.add_thematic_entities(them, thematic_str.thematic_entities[them])
                 new_ext.graph.add_node(them, label=thematic_str.thematic_entities[them], type="T_E")
+            print(es,them)
             new_ext.graph.add_edge(es, them, color="blue",type_="them")
             new_ext.add_thematic_relationships(es, them)
     return new_ext
diff --git a/strpython/models/transformation/transform.py b/strpython/models/transformation/transform.py
index 8cb788a..840b72e 100644
--- a/strpython/models/transformation/transform.py
+++ b/strpython/models/transformation/transform.py
@@ -159,7 +159,7 @@ class Generalisation(Transformation):
 
 
 class Expansion(Transformation):
-    def getAroundEntities(self, data, score, distance=150, unit="km", n=1):
+    def getAroundEntities(self, data, score, distance=100, unit="km", n=1,lang="fr",stop_en=[]):
         if not "coord" in data:
             return []
         hits = client.search("gazetteer", "place", {
@@ -168,13 +168,14 @@ class Expansion(Transformation):
                     "must": [
                         {"match_all": {}},
                         {"exists": {"field": "score"}},  # Get place with high score
-                        {"terms": {"class": ["P-PPL", "A-ADM4", "P-PPLC"]}},
+                        #{"terms": {"class": ["P-PPL", "A-ADM4", "P-PPLC"]}},
                         # Populated Settlement, Last administration level, Capital
                         {"range": {"score": {"gt": score}}},  # Has a higher score (PR)
-                        {"term": {"country": data.other["country"]}}  # stay in the same country
+                        #{"term": {"country": data.other["country"]}}  # stay in the same country
                     ],
                     "must_not": [
-                        {"terms": {"class": ["A-ADM3", "A-ADM2", "A-ADM1"]}}  # No region, departement, ... !
+                        {"terms": {"class": ["A-ADM3", "A-ADM2", "A-ADM1"]}},
+                        {"terms": {lang: stop_en}},
                     ],
                     "filter": {
                         "geo_distance": {
@@ -206,14 +207,15 @@ class Expansion(Transformation):
 
     def transform(self, str_: STR, **kwargs):
         type_ = "adjacency"
-        distance = kwargs.get("distance", 150)
-        unit = kwargs.get("unit", 150)
+        distance = kwargs.get("distance", 100)
+        unit = kwargs.get("unit", "km")
         n = kwargs.get("adjacent_count", 1)
         cp = kwargs.get("cp", True)
+        lang = kwargs.get("lang","fr")
         if type_ == "adjacency":
-            return self.transform_adj(str_, distance, unit, n, cp)
+            return self.transform_adj(str_, distance, unit, n, lang, cp)
 
-    def transform_adj(self, str_: STR, distance: int, unit: str, n: int, cp=True) -> STR:
+    def transform_adj(self, str_: STR, distance: int, unit: str, n: int,lang:str, cp=True) -> STR:
         graph = str_.graph
         median, selected_se = self.select_es(graph)
         data_se, scores_ = {}, []
@@ -230,19 +232,21 @@ class Expansion(Transformation):
 
         new_nodes = []
         labels = []
+        stop_en = list(str_.spatial_entities.keys())
         for node in selected_se:
             data_ = data_se[node]
             if (not "P-PPL" in data_.class_) and (not "A-ADM4" in data_.class_):
                 continue
             if not "country" in data_.other:
                 continue
-            neighbor = self.getAroundEntities(data_, median, distance, unit, n)
+            neighbor = self.getAroundEntities(data_, median, distance, unit, n,lang=lang,stop_en=stop_en)
+            stop_en.extend(neighbor)
             # if not neighbor:
             #     try:
             #         neighbor = [get_inclusion_chain(node, "P131")[0]]
             #     except:
             #         neighbor = []
-            labels.extend([gazetteer.get_by_id(n)[0].label.en for n in neighbor])
+            labels.extend([gazetteer.get_by_id(n)[0].label[lang] for n in neighbor])
             new_nodes.extend(neighbor)
 
         new_nodes = list(set(new_nodes))
diff --git a/strpython/nlp/disambiguator/disambiguator.py b/strpython/nlp/disambiguator/disambiguator.py
index 7d8f354..9327a2b 100644
--- a/strpython/nlp/disambiguator/disambiguator.py
+++ b/strpython/nlp/disambiguator/disambiguator.py
@@ -76,8 +76,8 @@ class Disambiguator(object):
         candidates=[]
         candidates.extend(gazetteer.get_by_label(label,lang))
         candidates.extend(gazetteer.get_by_alias(label, lang,score=False))
-        #candidates.extend(gazetteer.get_n_label_similar(label,lang, n=1,score=False))
-        #candidates.extend(gazetteer.get_n_alias_similar(label, lang, n=1,score=False))
+        candidates.extend(gazetteer.get_n_label_similar(label,lang, n=1,score=False))
+        candidates.extend(gazetteer.get_n_alias_similar(label, lang, n=1,score=False))
         return pd.DataFrame([[
             r.id,
             label,
diff --git a/strpython/nlp/disambiguator/models/bigram.py b/strpython/nlp/disambiguator/models/bigram.py
index ec146b4..c13701b 100644
--- a/strpython/nlp/disambiguator/models/bigram.py
+++ b/strpython/nlp/disambiguator/models/bigram.py
@@ -41,6 +41,7 @@ class BigramModel:
             if uri1 in self.cooc_freq[uri2]:
                 return self.cooc_freq[uri2][uri1]
                 #return (self.cooc_freq[uri2][uri1] / self.count_associated[uri1])+pr1
+
         return nna
 
 
diff --git a/strpython/nlp/disambiguator/share_prop.py b/strpython/nlp/disambiguator/share_prop.py
index 7637faf..40c8401 100644
--- a/strpython/nlp/disambiguator/share_prop.py
+++ b/strpython/nlp/disambiguator/share_prop.py
@@ -73,7 +73,7 @@ class ShareProp(Disambiguator):
         # return fib_no[interP131]+fib_no[interP706]
         return self.inclusion_log(interP131) + self.inclusion_log(interP706)
 
-    def Adjacency_P47(self, id1, id2):
+    def Adjacency_P47(self, es1, es2):
         """
         Return true, if two spatial entities are found adjacent using the P47 property (share borders) from Wikidata.
         Parameters
@@ -88,9 +88,10 @@ class ShareProp(Disambiguator):
         bool
             true if adjacent using P47
         """
-        data_1, data_2 = gazetteer.get_by_id(id1)[0], gazetteer.get_by_id(id2)[0]
-        if "P47" in data_1 and "P47" in data_2:
-            if id1 in data_2.other.P47 or id2 in data_1.other.P47:
+        # data_1, data_2 = gazetteer.get_by_id(id1)[0], gazetteer.get_by_id(id2)[0]
+
+        if "P47" in es1 and "P47" in es2:
+            if es1.id in es2.other.P47 or es2.id in es1.other.P47:
                 return True
         return False
 
@@ -130,19 +131,15 @@ class ShareProp(Disambiguator):
             id_cand = cand.id
             score_dc[id_cand] = 0
             for fixed in fixed_entities:
-                id_fixed = fixed_entities[fixed].id
-                if self.Adjacency_P47(id_cand, id_fixed):
+                id_fixed = fixed.id
+                if self.Adjacency_P47(cand, fixed):
                     score_dc[id_cand] += 3
                 elif self.Adjacency_Hull(id_cand, id_fixed):
                     score_dc[id_cand] += 2
                 score_dc[id_cand] += self.get_inclusion_score(id_cand, id_fixed)
 
         m = max(score_dc, key=score_dc.get)
-        if score_dc[m] < 4:
-            return None
-        for cand in spat_candidates:
-            if cand.id == m:
-                return cand.id
+        return m
 
 
     def disambiguate_context_based(self,toponyms,lang):
@@ -152,21 +149,20 @@ class ShareProp(Disambiguator):
         for topo in toponyms:
             request = self.get_candidates(topo,lang)
             if len(request) > 1:
-                ambiguous_entities[topo] = request
+                ambiguous_entities[topo] = request.raw.values.tolist()
             elif len(request) == 1:
                 fixed_entities[topo] = request.iloc[0].raw
-
         d_amb_results = {}
-        for amb_ent in ambiguous_entities:
-            d = self.disambiguateOne(ambiguous_entities[amb_ent], fixed_entities)
-            if not d:
-                d_amb_results[amb_ent] = get_most_common_id_v3(amb_ent, lang).id
-            else:
-                d_amb_results[amb_ent] = d
+        for topo in ambiguous_entities:
+            d = self.disambiguateOne(ambiguous_entities[topo], fixed_entities.values())
+            d_amb_results[topo] = d
 
         for k, v in fixed_entities.items():
             selected_en[k] = v.id
         for k, v in d_amb_results.items():
             selected_en[k] = v
 
-        return selected_en
\ No newline at end of file
+        return selected_en
+
+
+
diff --git a/strpython/nlp/disambiguator/wikipedia_cooc.py b/strpython/nlp/disambiguator/wikipedia_cooc.py
index 27abd75..70391df 100644
--- a/strpython/nlp/disambiguator/wikipedia_cooc.py
+++ b/strpython/nlp/disambiguator/wikipedia_cooc.py
@@ -9,6 +9,7 @@ from ...config.configuration import config
 from ...helpers.geodict_helpers import *
 from .most_common import stop_words,common_words
 import networkx as nx
+from .most_common import MostCommonDisambiguator
 
 def read_pickle(fn):
     return pickle.load(open(fn,'rb'))
@@ -20,7 +21,7 @@ class WikipediaDisambiguator(Disambiguator):
         # Load model
         self.model=BigramModel(read_pickle(config.wiki_cooc_dis.cooc_freq),read_pickle(config.wiki_cooc_dis.count))
         self.measure=measure
-
+        self.mostcommon = MostCommonDisambiguator()
 
     def disambiguate_list(self,toponyms,lang):
         result=self.disambiguate_wiki(toponyms,lang)
@@ -50,7 +51,20 @@ class WikipediaDisambiguator(Disambiguator):
             candidates = self.get_candidates(toponym, lang)
             if len(candidates)<1:
                 continue
-            candidates = [c.id for ix,c in candidates.iterrows()]
+            # check whether the co-occurrence model knows any of the candidates
+            found_in_model = False
+            for ix, c in candidates.iterrows():
+                if c.id in self.model.cooc_freq:
+                    found_in_model = True
+                for ij, c2 in candidates.iterrows():
+                    if c2.id in self.model.cooc_freq and c.id in self.model.cooc_freq[c2.id]:
+                        found_in_model = True
+            if not found_in_model:
+                # fall back to the most-common disambiguator; use an empty list if it finds nothing
+                candidates = self.mostcommon.disambiguate(lang, toponyms=[toponym])
+                candidates = list(candidates.values()) if candidates else []
+            if not isinstance(candidates, list):
+                candidates = [c.id for ix, c in candidates.iterrows()]
             group_candidate[toponym] = candidates
             betw_cand[toponym]=candidates
             for n in candidates:
@@ -61,14 +75,14 @@ class WikipediaDisambiguator(Disambiguator):
             g.add_node(candidate, label=gazetteer.get_by_id(candidate)[0].label[lang])
 
         data_candidate={ca :gazetteer.get_by_id(ca)[0] for ca in possible_candidates}
+
         for candidate in possible_candidates:
             for candidate2 in possible_candidates:
-                # Get PageRank score
+
                 d = data_candidate[candidate]
                 sc = d.score
                 # Compute probability
                 prob = self.model.get_coocurence_probability(sc, candidate, candidate2)
-
                 if candidate2 in betw_cand[candidate] or candidate in betw_cand[candidate2]:
                     prob = 0.0
                 if prob < 0.0000001:
@@ -82,7 +96,6 @@ class WikipediaDisambiguator(Disambiguator):
                     g.add_edge(candidate, candidate2, weight=prob)
 
         selected = {}
-
         #Take the candidates with the highest degree weighted
         for gr in group_candidate:
             if self.measure == "degree":
diff --git a/strpython/pipeline.py b/strpython/pipeline.py
index 0ecc279..a25a66a 100644
--- a/strpython/pipeline.py
+++ b/strpython/pipeline.py
@@ -145,7 +145,7 @@ class Pipeline(object):
         str_.build()
         return str_
 
-    def pipe_transform(self,strs_, cpu_count=cpu_count(),**kwargs):
+    def pipe_transform(self,strs_,**kwargs):
         str_s = [ self.transform(str_, **kwargs) for str_ in tqdm(strs_,desc="Transform STR", disable=(not self.verbose))]
         return str_s
 
-- 
GitLab