Commit 4d787b85 authored by Fize Jacques

DEBUG

parent 3e95bf0d
Showing with 86 additions and 62 deletions
@@ -26,9 +26,9 @@ if args.corpus_name == "padiweb":
 else:
     corpus_dir = "data/disambiguation_data/mada_disambiguisation"
-data_lang = json.load(open("/Users/jacquesfize/LOD_DATASETS/raw_bvlac/associated_lang.json"))
-data_lang = {int(k): v for k, v in data_lang.items()}
+data_lang = json.load(open("/Users/jacquesfize/DATA_THESIS/BVLAC/raw_bvlac/associated_lang.json"))
+data_lang = {int(k): (v if v in ["fr",'en'] else "en") for k, v in data_lang.items()}
 corpus_files=glob.glob("{0}/*.csv".format(corpus_dir))
 acc_MC,acc_GEO,acc_wiki=[],[],[]
 i=0
...
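
Editor's note on the data_lang change above: the new dict comprehension normalises every language tag outside fr/en to en. A standalone illustration (the sample values are made up):

    data_lang = {"1": "fr", "2": "mg", "3": "en"}  # made-up raw table
    data_lang = {int(k): (v if v in ["fr", "en"] else "en") for k, v in data_lang.items()}
    print(data_lang)  # {1: 'fr', 2: 'en', 3: 'en'}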
@@ -33,7 +33,12 @@ def dist(id1, id2):
 def efficiencyMostCommon(df, lang, score="accuracy",k=1):
     df2 = df[-df["GID"].isin(["O", "NR", "o"])][["text", "GID"]]
-    df2["disambiguation"] = df2.text.apply(lambda x: disMost_common.disambiguate_(x, lang)[0])
+    def foo(lang,x):
+        res = disMost_common.disambiguate(lang, toponyms=[x])
+        if x in res:
+            return res[x]
+        return "O"
+    df2["disambiguation"] = df2.text.apply(lambda x:foo(lang,x))
     if score == "mean_distance_error":
         df2["distance"] = df2.apply(lambda row: dist(row.GID, row.disambiguation) if "GID" in row else -1, axis=1)
         return df2["distance"][df2["distance"] >= 0].mean()
@@ -45,7 +50,7 @@ def efficiencyMostCommon(df, lang, score="accuracy",k=1):
 def efficiencyGeodict(df, lang, score="accuracy",k=1):
     df2 = df[-df["GID"].isin(["O", "NR", "o"])][["text", "GID"]]
-    res_dis = disGaurav.eval(df2["text"].unique(), lang)
+    res_dis = disGaurav.disambiguate(lang,toponyms=df2["text"].unique().tolist())
     df2["disambiguation"] = df2.text.apply(lambda x: res_dis[x] if x in res_dis else None)
     if score == "mean_distance_error":
         df2["distance"] = df2.apply(lambda row: dist(row.GID, row.disambiguation) if "GID" in row else -1, axis=1)
@@ -59,7 +64,7 @@ def efficiencyGeodict(df, lang, score="accuracy",k=1):
 def efficiencyWiki(df, lang, score="accuracy",k=1):
     df2 = df[-df["GID"].isin(["O", "NR", "o"])][["text", "GID"]]
-    res_dis = disWiki.disambiguate_wiki(df2["text"].unique(), lang)
+    res_dis = disWiki.disambiguate(lang,toponyms=df2["text"].unique().tolist())
     df2["disambiguation"] = df2.text.apply(lambda x: res_dis[x] if x in res_dis else None)
     if score == "mean_distance_error":
         df2["distance"] = df2.apply(lambda row: dist(row.GID, row.disambiguation) if "GID" in row else -1, axis=1)
...
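
The three hunks above switch every disambiguator to the same batch interface, disambiguate(lang, toponyms=[...]), which returns a dict mapping each toponym to a resolved identifier. A minimal sketch of how the evaluation code consumes it; the stub class and sample values are hypothetical, not part of the commit:

    class StubDisambiguator:
        # Mimics the shared interface of disMost_common / disGaurav / disWiki.
        def disambiguate(self, lang, toponyms=[]):
            return {t: "GD{0}".format(i) for i, t in enumerate(toponyms)}  # fake ids

    res = StubDisambiguator().disambiguate("fr", toponyms=["Paris", "Antananarivo"])
    # A toponym missing from the result means "unresolved"; efficiencyMostCommon's
    # new foo() helper maps that case to the "O" (outside) tag.
    print(res.get("Paris", "O"))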
@@ -259,8 +259,8 @@ class AdjacencyMetaRelation(MetaCollector):
             for se2 in spatial_entities:
                 data_se1, data_se2 = data[se1], data[se2]
                 if data_se1 and data_se2 and "coord" in data_se1 and "coord" in data_se2:
-                    not_in_stop = len(set(data_se1.class_) & stop_class) < 1 and len(
-                        set(data_se2.class_) & stop_class) < 1
+                    not_in_stop = len(set(data_se1.class_) & stop_class) < 0.5 and len(
+                        set(data_se2.class_) & stop_class) < 0.5
                     self.distances_is_inf_to[se1][se2] = dist_all[se1][se2] < max_d and not_in_stop
                 else:
                     self.distances_is_inf_to[se1][se2] = False
...
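
The threshold change from < 1 to < 0.5 does not alter behaviour: len(set(...) & stop_class) is a non-negative integer, so both comparisons are just an empty-intersection test. A quick standalone check (set.isdisjoint expresses the same predicate directly; the sample classes are illustrative):

    stop_class = {"A-ADM1", "A-ADM2"}  # illustrative stop classes
    class_ = ["P-PPL"]
    empty = len(set(class_) & stop_class) < 0.5
    assert empty == set(class_).isdisjoint(stop_class)  # both True here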
@@ -213,7 +213,7 @@ class STR(object):
             warnings.warn("Label empty. @en label from Geo-Database will be used.")
             label = data_["en"]
         self.spatial_entities[id] = label
-        self.graph.add_node(id, label=label,type="SE")
+        self.graph.add_node(id, label=label,type="S_E")

     def add_spatial_entities(self, ids: list, labels: list = []):
@@ -656,33 +656,37 @@ class STR(object):
     def plot(self, title="STR", output_fn=None,se_color ="#4183d7",te_color="#d64541",inc_edge_color="r",
-             adj_edge_color="g",them_edge_color = "b",figsize=(7,7)):
+             adj_edge_color="g",them_edge_color = "b",figsize=(7,7),scale=2,node_size=700,layout_func=nx.shell_layout,dech=0):
         import matplotlib.pyplot as plt
         plt.figure(figsize=figsize)
         G = self.graph.copy()
-        pos = nx.shell_layout(G, scale=0.5)
-        #pos = nx.layout.shell_layout(ext_2_t.graph)
-        nx.draw_networkx_nodes(G, pos,
-                               nodelist=[n[0] for n in list(G.nodes(data=True)) if n[1]["type"] == "S_E"],
-                               node_color=se_color, node_size=500)
+        pos = layout_func(G, scale=scale)
+
+        nodes = list(G.nodes(data=True))
+        max_n_char = ([len(n[1]["label"]) * node_size for n in nodes])
         nx.draw_networkx_nodes(G, pos,
-                               nodelist=[n[0] for n in list(G.nodes(data=True)) if n[1]["type"] == "T_E"],
-                               node_color=te_color, node_size=500)
-        nx.draw_networkx_labels(G, nx.shell_layout(G, scale=0.5), labels={n[0]: n[1]["label"] for n in G.nodes(data=True)})
-        nx.draw_networkx_edges(G, pos, edgelist=[ed for ed in G.edges(data=True) if ed[2]["type_"] == "inc"],
-                               edge_color=inc_edge_color, arrows=True)
-        nx.draw_networkx_edges(G, pos, edgelist=[ed for ed in G.edges(data=True) if ed[2]["type_"] == "adj"],
-                               edge_color=adj_edge_color, arrows=True)
-        nx.draw_networkx_edges(G, pos, edgelist=[ed for ed in G.edges(data=True) if ed[2]["type_"] == "them"],
-                               edge_color=them_edge_color, arrows=True)
+                               nodelist=[n[0] for n in nodes if n[1]["type"] == "S_E"],
+                               node_color=se_color, node_size=max_n_char)
+        nx.draw_networkx_nodes(G, pos,
+                               nodelist=[n[0] for n in nodes if n[1]["type"] == "T_E"],
+                               node_color=te_color, node_size=max_n_char)
+
+        edges = list(G.edges(data=True))
+        nx.draw_networkx_labels(G, pos, labels={n[0]: n[1]["label"] for n in nodes},font_color='w')
+        nx.draw_networkx_edges(G, pos, edgelist=[ed for ed in edges if ed[2]["type_"] == "inc"],
+                               edge_color=inc_edge_color, arrows=True,width=1.5)
+        nx.draw_networkx_edges(G, pos, edgelist=[ed for ed in edges if ed[2]["type_"] == "adj"],
+                               edge_color=adj_edge_color, arrows=True,width=1.5)
+        nx.draw_networkx_edges(G, pos, edgelist=[ed for ed in edges if ed[2]["type_"] == "them"],
+                               edge_color=them_edge_color, arrows=True,width=1.5)
         plt.title(title)
         plt.axis('off')
-        plt.margins(0.1)
+        plt.margins(0.2)
         if output_fn:
             plt.savefig(output_fn, bbox_inches='tight')
         else:
...
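
With the reworked signature, callers can now choose the layout function and node sizing. A hedged usage sketch: str_ stands for an already-built STR instance, and nx.circular_layout is just one layout that accepts (G, scale=...); all keyword values below are illustrative:

    import networkx as nx
    # str_ is an existing STR; every keyword value below is illustrative.
    str_.plot(title="Extended STR",
              layout_func=nx.circular_layout,  # any layout taking (G, scale=...)
              scale=2, node_size=700,
              output_fn="str_plot.png")        # saved instead of shown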
@@ -53,6 +53,7 @@ def get_extended_with_thematic(extended_str, thematic_str):
             if not them in new_ext.thematic_entities:
                 new_ext.add_thematic_entities(them, thematic_str.thematic_entities[them])
                 new_ext.graph.add_node(them, label=thematic_str.thematic_entities[them], type="T_E")
+            print(es,them)
             new_ext.graph.add_edge(es, them, color="blue",type_="them")
             new_ext.add_thematic_relationships(es, them)
     return new_ext
@@ -159,7 +159,7 @@ class Generalisation(Transformation):
 class Expansion(Transformation):

-    def getAroundEntities(self, data, score, distance=150, unit="km", n=1):
+    def getAroundEntities(self, data, score, distance=100, unit="km", n=1,lang="fr",stop_en=[]):
         if not "coord" in data:
             return []
         hits = client.search("gazetteer", "place", {
@@ -168,13 +168,14 @@ class Expansion(Transformation):
                 "must": [
                     {"match_all": {}},
                     {"exists": {"field": "score"}},  # Get place with high score
-                    {"terms": {"class": ["P-PPL", "A-ADM4", "P-PPLC"]}},
+                    #{"terms": {"class": ["P-PPL", "A-ADM4", "P-PPLC"]}},
                     # Populated Settlement, Last administration level, Capital
                     {"range": {"score": {"gt": score}}},  # Has a higher score (PR)
-                    {"term": {"country": data.other["country"]}}  # stay in the same country
+                    #{"term": {"country": data.other["country"]}} # stay in the same country
                 ],
                 "must_not": [
-                    {"terms": {"class": ["A-ADM3", "A-ADM2", "A-ADM1"]}}  # No region, departement, ... !
+                    {"terms": {"class": ["A-ADM3", "A-ADM2", "A-ADM1"]}},
+                    {"terms": {lang: stop_en}},
                 ],
                 "filter": {
                     "geo_distance": {
@@ -206,14 +207,15 @@ class Expansion(Transformation):
     def transform(self, str_: STR, **kwargs):
         type_ = "adjacency"
-        distance = kwargs.get("distance", 150)
-        unit = kwargs.get("unit", 150)
+        distance = kwargs.get("distance", 100)
+        unit = kwargs.get("unit", 100)
         n = kwargs.get("adjacent_count", 1)
         cp = kwargs.get("cp", True)
+        lang = kwargs.get("lang","fr")
         if type_ == "adjacency":
-            return self.transform_adj(str_, distance, unit, n, cp)
+            return self.transform_adj(str_, distance, unit, n, lang, cp)

-    def transform_adj(self, str_: STR, distance: int, unit: str, n: int, cp=True) -> STR:
+    def transform_adj(self, str_: STR, distance: int, unit: str, n: int,lang:str, cp=True) -> STR:
         graph = str_.graph
         median, selected_se = self.select_es(graph)
         data_se, scores_ = {}, []
@@ -230,19 +232,21 @@ class Expansion(Transformation):
         new_nodes = []
         labels = []
+        stop_en = list(str_.spatial_entities.keys())
         for node in selected_se:
             data_ = data_se[node]
             if (not "P-PPL" in data_.class_) and (not "A-ADM4" in data_.class_):
                 continue
             if not "country" in data_.other:
                 continue
-            neighbor = self.getAroundEntities(data_, median, distance, unit, n)
+            neighbor = self.getAroundEntities(data_, median, distance, unit, n,lang=lang,stop_en=stop_en)
+            stop_en.extend(neighbor)
             # if not neighbor:
             #     try:
             #         neighbor = [get_inclusion_chain(node, "P131")[0]]
             #     except:
             #         neighbor = []
-            labels.extend([gazetteer.get_by_id(n)[0].label.en for n in neighbor])
+            labels.extend([gazetteer.get_by_id(n)[0].label[lang] for n in neighbor])
             new_nodes.extend(neighbor)
         new_nodes = list(set(new_nodes))
...
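
The new stop_en parameter threads the STR's existing entity ids into the Elasticsearch must_not clause, so expansion never proposes an entity twice (stop_en also grows inside the loop via stop_en.extend(neighbor)). A minimal sketch of the resulting query body, assuming the must/must_not arrays sit inside a standard bool query as the hunk suggests; all values are illustrative:

    lang, stop_en, score = "fr", ["GD123", "GD456"], 50  # illustrative values
    query = {"query": {"bool": {
        "must": [
            {"match_all": {}},
            {"exists": {"field": "score"}},
            {"range": {"score": {"gt": score}}},
        ],
        "must_not": [
            {"terms": {"class": ["A-ADM3", "A-ADM2", "A-ADM1"]}},
            {"terms": {lang: stop_en}},  # skip entities already in the STR
        ],
    }}}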
@@ -76,8 +76,8 @@ class Disambiguator(object):
         candidates=[]
         candidates.extend(gazetteer.get_by_label(label,lang))
         candidates.extend(gazetteer.get_by_alias(label, lang,score=False))
-        #candidates.extend(gazetteer.get_n_label_similar(label,lang, n=1,score=False))
-        #candidates.extend(gazetteer.get_n_alias_similar(label, lang, n=1,score=False))
+        candidates.extend(gazetteer.get_n_label_similar(label,lang, n=1,score=False))
+        candidates.extend(gazetteer.get_n_alias_similar(label, lang, n=1,score=False))
         return pd.DataFrame([[
             r.id,
             label,
...
@@ -41,6 +41,7 @@ class BigramModel:
             if uri1 in self.cooc_freq[uri2]:
                 return self.cooc_freq[uri2][uri1]
                 #return (self.cooc_freq[uri2][uri1] / self.count_associated[uri1])+pr1
+
         return nna
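
For context, cooc_freq is a dict of dicts of pair counts and nna is the floor value returned when a pair was never observed. A self-contained sketch of the symmetric lookup this hunk touches (the nna default below is an assumption, not taken from the source):

    def cooc_lookup(cooc_freq, uri1, uri2, nna=0):
        # Counts may be stored under either key order, so check both.
        if uri1 in cooc_freq and uri2 in cooc_freq[uri1]:
            return cooc_freq[uri1][uri2]
        if uri2 in cooc_freq and uri1 in cooc_freq[uri2]:
            return cooc_freq[uri2][uri1]
        return nna  # fallback when the pair never co-occurred

    print(cooc_lookup({"A": {"B": 3}}, "B", "A"))  # 3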
@@ -73,7 +73,7 @@ class ShareProp(Disambiguator):
         # return fib_no[interP131]+fib_no[interP706]
         return self.inclusion_log(interP131) + self.inclusion_log(interP706)

-    def Adjacency_P47(self, id1, id2):
+    def Adjacency_P47(self, es1, es2):
         """
         Return true, if two spatial entities are found adjacent using the P47 property (share borders) from Wikidata.

         Parameters
@@ -88,9 +88,10 @@ class ShareProp(Disambiguator):
         bool
             true if adjacent using P47
         """
-        data_1, data_2 = gazetteer.get_by_id(id1)[0], gazetteer.get_by_id(id2)[0]
-        if "P47" in data_1 and "P47" in data_2:
-            if id1 in data_2.other.P47 or id2 in data_1.other.P47:
+        # data_1, data_2 = gazetteer.get_by_id(id1)[0], gazetteer.get_by_id(id2)[0]
+
+        if "P47" in es1 and "P47" in es2:
+            if es1.id in es2.other.P47 or es2.id in es1.other.P47:
                 return True
         return False
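
Adjacency_P47 now receives the full gazetteer records (es1, es2) instead of ids, saving two gazetteer.get_by_id round trips per candidate pair. A sketch of the record-based check with a hypothetical minimal record type; the real records additionally support "P47" in record and .other.P47 attribute access:

    from types import SimpleNamespace

    def adjacency_p47(es1, es2):
        # P47 = Wikidata "shares border with": a list of neighbouring ids.
        if "P47" in es1.other and "P47" in es2.other:
            return es1.id in es2.other["P47"] or es2.id in es1.other["P47"]
        return False

    a = SimpleNamespace(id="GD1", other={"P47": ["GD2"]})
    b = SimpleNamespace(id="GD2", other={"P47": []})
    print(adjacency_p47(a, b))  # True: GD1 lists GD2 as a border neighbour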
@@ -130,19 +131,15 @@ class ShareProp(Disambiguator):
             id_cand = cand.id
             score_dc[id_cand] = 0
             for fixed in fixed_entities:
-                id_fixed = fixed_entities[fixed].id
-                if self.Adjacency_P47(id_cand, id_fixed):
+                id_fixed = fixed.id
+                if self.Adjacency_P47(cand, fixed):
                     score_dc[id_cand] += 3
                 elif self.Adjacency_Hull(id_cand, id_fixed):
                     score_dc[id_cand] += 2
                 score_dc[id_cand] += self.get_inclusion_score(id_cand, id_fixed)

         m = max(score_dc, key=score_dc.get)
-        if score_dc[m] < 4:
-            return None
-        for cand in spat_candidates:
-            if cand.id == m:
-                return cand.id
+        return m

     def disambiguate_context_based(self,toponyms,lang):
@@ -152,21 +149,20 @@ class ShareProp(Disambiguator):
         for topo in toponyms:
             request = self.get_candidates(topo,lang)
             if len(request) > 1:
-                ambiguous_entities[topo] = request
+                ambiguous_entities[topo] = request.raw.values.tolist()
             elif len(request) == 1:
                 fixed_entities[topo] = request.iloc[0].raw

         d_amb_results = {}
-        for amb_ent in ambiguous_entities:
-            d = self.disambiguateOne(ambiguous_entities[amb_ent], fixed_entities)
-            if not d:
-                d_amb_results[amb_ent] = get_most_common_id_v3(amb_ent, lang).id
-            else:
-                d_amb_results[amb_ent] = d
+        for topo in ambiguous_entities:
+            d = self.disambiguateOne(ambiguous_entities[topo], fixed_entities.values())
+            d_amb_results[topo] = d

         for k, v in fixed_entities.items():
             selected_en[k] = v.id
         for k, v in d_amb_results.items():
             selected_en[k] = v

         return selected_en
\ No newline at end of file
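
Two behavioural changes land in this hunk: ambiguous toponyms now carry plain lists of raw records, and the most-common fallback is gone, so an ambiguous toponym simply gets whatever disambiguateOne returns (possibly None, since the score threshold was also dropped above). A runnable sketch of the slimmed-down loop with stubbed data; disambiguate_one is a stand-in, not the real method:

    def disambiguate_one(cands, fixed_records):
        # Stand-in: the real method scores candidates against fixed entities.
        return cands[0]["id"] if cands else None

    ambiguous = {"Paris": [{"id": "GD1"}, {"id": "GD2"}]}  # >1 candidate
    fixed = {"Antananarivo": {"id": "GD9"}}                # exactly 1 candidate
    selected_en = {k: v["id"] for k, v in fixed.items()}
    for topo, cands in ambiguous.items():
        selected_en[topo] = disambiguate_one(cands, fixed.values())
    print(selected_en)  # {'Antananarivo': 'GD9', 'Paris': 'GD1'}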
@@ -9,6 +9,7 @@ from ...config.configuration import config
 from ...helpers.geodict_helpers import *
 from .most_common import stop_words,common_words
 import networkx as nx
+from .most_common import MostCommonDisambiguator

 def read_pickle(fn):
     return pickle.load(open(fn,'rb'))
@@ -20,7 +21,7 @@ class WikipediaDisambiguator(Disambiguator):
         # Load model
         self.model=BigramModel(read_pickle(config.wiki_cooc_dis.cooc_freq),read_pickle(config.wiki_cooc_dis.count))
         self.measure=measure
-
+        self.mostcommon = MostCommonDisambiguator()

     def disambiguate_list(self,toponyms,lang):
         result=self.disambiguate_wiki(toponyms,lang)
@@ -50,7 +51,20 @@ class WikipediaDisambiguator(Disambiguator):
             candidates = self.get_candidates(toponym, lang)
             if len(candidates)<1:
                 continue
-            candidates = [c.id for ix,c in candidates.iterrows()]
+            f=False
+            for ix,c in candidates.iterrows():
+                if c.id in self.model.cooc_freq :
+                    f=True
+                for ij,c2 in candidates.iterrows():
+                    if c2.id in self.model.cooc_freq and c.id in self.model.cooc_freq[c2.id]:
+                        f=True
+            if not f:
+                candidates=self.mostcommon.disambiguate(lang,toponyms=[toponym])
+                if candidates :
+                    candidates= list(candidates.values())
+            if not isinstance(candidates,list):
+                candidates = [c.id for ix,c in candidates.iterrows()]
             group_candidate[toponym] = candidates
             betw_cand[toponym]=candidates
             for n in candidates:
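
The new guard checks whether any candidate of a toponym has co-occurrence evidence, either as a key of the bigram table or paired with a sibling candidate, and otherwise falls back to MostCommonDisambiguator. A condensed, runnable sketch of that test, assuming the same dict-of-dicts cooc_freq shape as above:

    def has_cooc_evidence(candidate_ids, cooc_freq):
        for c in candidate_ids:
            if c in cooc_freq:
                return True  # candidate is itself a key of the model
            if any(c2 in cooc_freq and c in cooc_freq[c2] for c2 in candidate_ids):
                return True  # candidate co-occurs with a sibling candidate
        return False

    cooc = {"GD1": {"GD2": 4}}
    print(has_cooc_evidence(["GD1", "GD2"], cooc))  # True
    print(has_cooc_evidence(["GD3"], cooc))         # False -> most-common fallback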
@@ -61,14 +75,14 @@ class WikipediaDisambiguator(Disambiguator):
                 g.add_node(candidate, label=gazetteer.get_by_id(candidate)[0].label[lang])
         data_candidate={ca :gazetteer.get_by_id(ca)[0] for ca in possible_candidates}
         for candidate in possible_candidates:
             for candidate2 in possible_candidates:
-                # Get PageRank score
+
                 d = data_candidate[candidate]
                 sc = d.score
                 # Compute probability
                 prob = self.model.get_coocurence_probability(sc, candidate, candidate2)
                 if candidate2 in betw_cand[candidate] or candidate in betw_cand[candidate2]:
                     prob = 0.0
                 if prob < 0.0000001:
@@ -82,7 +96,6 @@ class WikipediaDisambiguator(Disambiguator):
                 g.add_edge(candidate, candidate2, weight=prob)
-
         selected = {}
         #Take the candidates with the highest degree weighted
         for gr in group_candidate:
             if self.measure == "degree":
...
@@ -145,7 +145,7 @@ class Pipeline(object):
             str_.build()
         return str_

-    def pipe_transform(self,strs_, cpu_count=cpu_count(),**kwargs):
+    def pipe_transform(self,strs_,**kwargs):
         str_s = [ self.transform(str_, **kwargs) for str_ in tqdm(strs_,desc="Transform STR", disable=(not self.verbose))]
         return str_s
...
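
pipe_transform drops its unused cpu_count parameter; the list comprehension was already sequential. A minimal stand-in showing the surviving call shape (MiniPipeline and the dummy transform are hypothetical):

    from tqdm import tqdm

    class MiniPipeline:
        verbose = True
        def transform(self, s, **kwargs):
            return s.upper()  # dummy per-STR transformation
        def pipe_transform(self, strs_, **kwargs):
            return [self.transform(s, **kwargs)
                    for s in tqdm(strs_, desc="Transform STR", disable=not self.verbose)]

    print(MiniPipeline().pipe_transform(["a", "b"]))  # ['A', 'B']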