From e9d151de8c505f1edfb680c0c376a3ac43160a7e Mon Sep 17 00:00:00 2001 From: Fize Jacques <jacques.fize@cirad.fr> Date: Mon, 18 Mar 2019 17:03:01 +0100 Subject: [PATCH] Debug spatial relation extraction Debug disambiguator delete old disambiguator classes Add Parallelization for STR generation and Transform --- strpython/helpers/collision.py | 5 +- strpython/helpers/relation_extraction.py | 157 ++++++++++++++++++ strpython/models/spatial_relation.py | 31 ++-- strpython/models/str.py | 6 +- strpython/nlp/disambiguator/disambiguator.py | 4 +- strpython/nlp/disambiguator_old/__init__.py | 1 - .../nlp/disambiguator_old/disambiguator.py | 62 ------- .../nlp/disambiguator_old/geodict_gaurav.py | 111 ------------- .../nlp/disambiguator_old/models/__init__.py | 1 - .../nlp/disambiguator_old/models/bigram.py | 46 ----- .../nlp/disambiguator_old/most_common.py | 71 -------- .../nlp/disambiguator_old/wikipedia_cooc.py | 110 ------------ strpython/nlp/ner/__init__.py | 2 +- strpython/pipeline.py | 71 +++++--- 14 files changed, 228 insertions(+), 450 deletions(-) create mode 100644 strpython/helpers/relation_extraction.py delete mode 100644 strpython/nlp/disambiguator_old/__init__.py delete mode 100644 strpython/nlp/disambiguator_old/disambiguator.py delete mode 100644 strpython/nlp/disambiguator_old/geodict_gaurav.py delete mode 100644 strpython/nlp/disambiguator_old/models/__init__.py delete mode 100644 strpython/nlp/disambiguator_old/models/bigram.py delete mode 100644 strpython/nlp/disambiguator_old/most_common.py delete mode 100644 strpython/nlp/disambiguator_old/wikipedia_cooc.py diff --git a/strpython/helpers/collision.py b/strpython/helpers/collision.py index 25d4f95..86a9dbb 100644 --- a/strpython/helpers/collision.py +++ b/strpython/helpers/collision.py @@ -80,7 +80,7 @@ def getGEO(id_se): if "path" in data.other: return gpd.read_file(os.path.join(config.osm_boundaries_directory, data.other["path"])).convex_hull elif "coord" in data.other: - return gpd.GeoDataFrame(gpd.GeoSeries([Point(data.coord.lon, data.coord.lat).buffer(1.0)])).rename( + return gpd.GeoDataFrame(gpd.GeoSeries([Point(data.coord.lon, data.coord.lat).buffer(0.5)])).rename( columns={0: 'geometry'}) return None @@ -144,4 +144,5 @@ def collisionTwoSEBoundaries(id_se1, id_se2): __cache_adjacency[id_se1][id_se2] = True return True __cache_adjacency[id_se1][id_se2] = False - return False \ No newline at end of file + return False + diff --git a/strpython/helpers/relation_extraction.py b/strpython/helpers/relation_extraction.py new file mode 100644 index 0000000..86585e7 --- /dev/null +++ b/strpython/helpers/relation_extraction.py @@ -0,0 +1,157 @@ +# coding = utf-8 +from shapely.geometry import Point + +from .collision import collide +from .geo_relation_database import GeoRelationMatchingDatabase +from ..helpers.geodict_helpers import gazetteer + + +class RelationExtractor(): + __cache_entity_data = {} + + def __init__(self, pre_computed={}): + self.db_rel_match = pre_computed + + def is_relation(self, id_se1: str, id_se2: str): + raise NotImplementedError() + + def get_data(self, id_se): + """ + Return an gazpy.Element object containing information about a spatial entity. 
+ + Parameters + ---------- + id_se : str + Identifier of the spatial entity + + Returns + ------- + gazpy.Element + data + """ + + if id_se in RelationExtractor.__cache_entity_data: + return RelationExtractor.__cache_entity_data[id_se] + data = gazetteer.get_by_id(id_se) + if len(data) > 0: + RelationExtractor.__cache_entity_data[id_se] = data[0] + return data[0] + + def in_cache(self, id_se1: str, id_se2: str): + raise NotImplementedError() + + def add_cache(self,id_se1: str, id_se2: str, value : bool, two_way:bool = False): + if id_se1 not in self.db_rel_match: + self.db_rel_match[id_se1] = {} + if two_way and id_se2 not in self.db_rel_match: + self.db_rel_match[id_se2] = {} + + self.db_rel_match[id_se1][id_se2] = value + if two_way: + self.db_rel_match[id_se2][id_se1] = value + + +class InclusionRelation(RelationExtractor): + + def __init__(self, precomputed={}): + RelationExtractor.__init__(self, precomputed) + + def in_cache(self, id_se1: str, id_se2: str): + if id_se1 in self.db_rel_match and id_se2 in self.db_rel_match[id_se1]: + return True, self.db_rel_match[id_se1][id_se2] + return False, False + + def is_relation(self, id_se1: str, id_se2: str): + found_, value = self.in_cache(id_se1, id_se2) + if found_: + return value + + inc_chain_P131, inc_chain_P706 = self.get_inclusion_chain(id_se1, "P131"), self.get_inclusion_chain(id_se1,"P706") + inc_chain = inc_chain_P131 + inc_chain.extend(inc_chain_P706) + inc_chain = set(inc_chain) + + if id_se2 in inc_chain: + self.add_cache(id_se1, id_se2, True) + return True + + self.add_cache(id_se1, id_se2, False) + return False + + def get_inclusion_chain(self, id_, prop): + """ + For an entity return it geographical inclusion tree using a property. + """ + arr__ = [] + try: + current_entity = gazetteer.get_by_id(id_)[0] + if "inc_" + prop in current_entity.other: + arr__ = current_entity.other["inc_" + prop] + elif "inc_geoname" in current_entity.other: + arr__ = current_entity.other.inc_geoname + if isinstance(arr__, str): + arr__ = [arr__] + except: + pass + return arr__ + + +class AdjacencyRelation(RelationExtractor): + + def __init__(self, precomputed={},inc_rel_extractor=InclusionRelation()): + RelationExtractor.__init__(self, precomputed) + self.inc_rel_extractor=inc_rel_extractor + + def in_cache(self, id_se1: str, id_se2: str): + if id_se1 in self.db_rel_match and id_se2 in self.db_rel_match[id_se1]: + return True, self.db_rel_match[id_se1][id_se2] + elif id_se2 in self.db_rel_match and id_se1 in self.db_rel_match[id_se2]: + return True, self.db_rel_match[id_se2][id_se1] + return False, False + + def is_relation(self, id_se1: str, id_se2: str): + found_, value = self.in_cache(id_se1, id_se2) + if found_: + return value + + stop_class = {"A-PCLI", "A-ADM1"} + + def get_p47_adjacency_data(data): + p47se1 = [] + for el in data.other.P47: + d = gazetteer.get_by_other_id(el, "wikidata") + if not d: continue + p47se1.append(d[0].id) + return p47se1 + + + if self.inc_rel_extractor.is_relation(id_se1, id_se2) or self.inc_rel_extractor.is_relation(id_se2, id_se1): + self.add_cache(id_se1, id_se2, False, True) + return False + + data_se1, data_se2 = self.get_data(id_se1), self.get_data(id_se2) + if not data_se1 or not data_se2: + self.add_cache(id_se1, id_se2, False, True) + return False + + if "P47" in data_se2.other and id_se1 in get_p47_adjacency_data(data_se2): + self.add_cache(id_se1, id_se2, True, True) + return True + + elif "P47" in data_se1.other and id_se2 in get_p47_adjacency_data(data_se1): + self.add_cache(id_se1, id_se2, 
True,True) + return True + + if collide(id_se1, id_se2): + self.add_cache(id_se1, id_se2, True,True) + return True + + if data_se1 and data_se2 and "coord" in data_se1 and "coord" in data_se2: + if Point(data_se1.coord.lon, data_se1.coord.lat).distance( + Point(data_se2.coord.lon, data_se2.coord.lat)) < 1 and len( + set(data_se1.class_) & stop_class) < 1 and len(set(data_se2.class_) & stop_class) < 1: + self.add_cache(id_se1, id_se2, True,True) + return True + + self.add_cache(id_se1, id_se2, False,True) + return False diff --git a/strpython/models/spatial_relation.py b/strpython/models/spatial_relation.py index b3b02fc..8f3f26d 100644 --- a/strpython/models/spatial_relation.py +++ b/strpython/models/spatial_relation.py @@ -74,11 +74,11 @@ class RelationExtractor(MetaCollector): spatial_entities : list list of spatial entities identifier """ - self.spatial_entities = spatial_entities + self.spatial_entities = list(set(spatial_entities)) # Retrieve Geometries data = [[sp_id, getGEO(sp_id)] for sp_id in - tqdm(spatial_entities, desc="Retrieving Geometries...")] + tqdm(self.spatial_entities, desc="Retrieving Geometries...")] self.all_geometry = [] for i in data: @@ -116,18 +116,18 @@ class RelationExtractor(MetaCollector): except Exception as e: print(e) - corr_ = gdf_intersect.iloc[:, 2:] ^ (gdf_within.iloc[:,2:] | gdf_within.iloc[:,2:].T) # An entity cannot be related to an other entity by two type of relation - adj_ = gdf_intersect.iloc[:, 2:] & corr_ # because if include and not adjacent does not mean Adjacent ! + gdf_intersect.set_index("id", inplace=True) + gdf_within.set_index("id", inplace=True) + del gdf_intersect["geometry"] + del gdf_within["geometry"] - gdf_adjacency = gdf_within.iloc[:, :2] - gdf_adjacency = pd.concat((gdf_adjacency, adj_), axis=1) # Stuck id and geom to adjacency data + corr_ = gdf_intersect ^ (gdf_within | gdf_within.T) # An entity cannot be related to an other entity by two type of relation + adj_ = gdf_intersect & corr_ # because if include and not adjacent does not mean Adjacent ! - del gdf_adjacency["geometry"] - del gdf_within["geometry"] # Transform to dict for a fastest access ! 
- self.adjacency_geom = gdf_adjacency.set_index("id") - self.inclusion_geom = gdf_within.set_index("id") + self.adjacency_geom = adj_ + self.inclusion_geom = gdf_within def get_relation_meta_based(self): """ @@ -158,7 +158,7 @@ class RelationExtractor(MetaCollector): adj_res[se2][se1] = adj_res[se1][se2] self.adjacency_meta = pd.DataFrame.from_dict(adj_res) - self.inclusion_meta = pd.DataFrame.from_dict(inc_res) + self.inclusion_meta = pd.DataFrame.from_dict(inc_res,orient="index") def fuse_meta_and_geom(self): """ @@ -176,16 +176,15 @@ class RelationExtractor(MetaCollector): self.inclusion_meta.sort_index(inplace=True) self.adjacency_geom.sort_index(inplace=True) self.inclusion_geom.sort_index(inplace=True) - self.adjacency_meta.sort_index(axis=1, inplace=True) self.inclusion_meta.sort_index(axis=1, inplace=True) self.adjacency_geom.sort_index(axis=1, inplace=True) self.inclusion_geom.sort_index(axis=1, inplace=True) - df_adj = self.adjacency_meta.copy() - df_inc = self.inclusion_meta.copy() - df_adj.iloc[:, :] = self.adjacency_meta | self.adjacency_geom - df_inc.iloc[:, :] = self.inclusion_meta | self.inclusion_geom + df_inc = self.inclusion_meta + self.adjacency_geom = (self.adjacency_geom ^ (self.inclusion_meta | self.inclusion_meta.T)) & self.adjacency_geom + self.adjacency_meta = (self.adjacency_meta ^ (self.inclusion_meta | self.inclusion_meta.T)) & self.adjacency_meta + df_adj = (self.adjacency_geom | self.adjacency_meta) return df_adj, df_inc diff --git a/strpython/models/str.py b/strpython/models/str.py index 6612e89..ca8b46c 100644 --- a/strpython/models/str.py +++ b/strpython/models/str.py @@ -39,7 +39,7 @@ class STR(object): """ __cache_entity_data = {} # Â Store data about entity requested - def __init__(self, tagged_text, spatial_entities,toponym_first=True): + def __init__(self, tagged_text, spatial_entities,toponym_first=True, precomputed_inc={}, precomputed_adj={}): """ Constructor @@ -64,8 +64,8 @@ class STR(object): self.adjacency_relationships = {} self.inclusion_relationships = {} - self.adj_rel_db=AdjacencyRelation() - self.inc_rel_db = InclusionRelation() + self.inc_rel_db = InclusionRelation(precomputed_inc) + self.adj_rel_db = AdjacencyRelation(precomputed_adj,self.inc_rel_db) self.graph = nx.MultiDiGraph() diff --git a/strpython/nlp/disambiguator/disambiguator.py b/strpython/nlp/disambiguator/disambiguator.py index 927a70f..54defd1 100644 --- a/strpython/nlp/disambiguator/disambiguator.py +++ b/strpython/nlp/disambiguator/disambiguator.py @@ -30,8 +30,10 @@ class Disambiguator(object): dict {toponym : geodictID} """ - if isinstance(ner_output, np.ndarray) and ner_output.shape[1] == 2: + if isinstance(ner_output, np.ndarray) and len(ner_output.shape) == 2 and ner_output.shape[1] == 2: toponyms = self.parse_ner_output(ner_output) + elif len(ner_output.shape) != 2: + return {} elif not toponyms: raise ValueError("Either enter a list of toponyms or give ner_output") if self.context_based: diff --git a/strpython/nlp/disambiguator_old/__init__.py b/strpython/nlp/disambiguator_old/__init__.py deleted file mode 100644 index 950f635..0000000 --- a/strpython/nlp/disambiguator_old/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# coding = utf-8 \ No newline at end of file diff --git a/strpython/nlp/disambiguator_old/disambiguator.py b/strpython/nlp/disambiguator_old/disambiguator.py deleted file mode 100644 index ee0d899..0000000 --- a/strpython/nlp/disambiguator_old/disambiguator.py +++ /dev/null @@ -1,62 +0,0 @@ -# coding = utf-8 - -import copy -import string - -import numpy as 
np - -from ..ner.ner import NER - - -class Disambiguator(object): - - def __init__(self): - """Constructor for Disambiguator""" - pass - - def extract_se_entities(self, input): - out = Disambiguator.parse_corpus(input) - en_ = out[out[:, 1] == NER._unified_tag["place"]] - return self.toponymes_frequencies(en_[:, 0]), np.unique(en_[:, 0]) - - def toponymes_frequencies(self, ens_): - count = {} - for en in ens_: - if not en in count: count[en] = 0 - count[en] += 1 - return count - - @staticmethod - def parse_corpus(corpus): - final_corpus = [] - t = 0 - placeTag = NER._unified_tag["place"] - while t < len(corpus): - tag = copy.copy(corpus[t]) - - if tag[1] == "BEG-" + NER._unified_tag["place"] or tag[1] == placeTag: - lenw = 1 - if tag[1] == "BEG-" + placeTag: - compound_tag = tag[0] - t += 1 - while corpus[t][1] == "END-" + placeTag or corpus[t][1] == placeTag: - tag = copy.copy(corpus[t]) - if tag[0][-1] in string.punctuation or compound_tag[-1] in string.punctuation: - compound_tag += tag[0] - else: - compound_tag += " " + tag[0] - t += 1 - lenw += 1 - tag[0] = compound_tag - tag[1] = placeTag - t += 1 - else: - t += 1 - final_corpus.append(tag) - return np.array(final_corpus) - - def disambiguate(self, ner_result): - pass - - def disambiguate_list(self,toponyms,lang): - pass \ No newline at end of file diff --git a/strpython/nlp/disambiguator_old/geodict_gaurav.py b/strpython/nlp/disambiguator_old/geodict_gaurav.py deleted file mode 100644 index 3d59912..0000000 --- a/strpython/nlp/disambiguator_old/geodict_gaurav.py +++ /dev/null @@ -1,111 +0,0 @@ -# coding = utf-8 -import math - -from ...helpers.collision import * -#from ...helpers.geodict_helpers_old import * -from ...helpers.geodict_helpers import * -from .disambiguator import Disambiguator - -from ...models.str import get_inclusion_chain - - -class GauravGeodict(Disambiguator): - - def __init__(self): - Disambiguator.__init__(self) - - def fib_formula(self, n): - if n in [0, 1]: return 0 # Modifying fibonacci behaviour - golden_ratio = (1 + math.sqrt(5)) / 2 - val = (golden_ratio ** n - (1 - golden_ratio) ** n) / math.sqrt(5) - return int(round(val)) - - def inclusion_log(self, x, alpha=0.2): - if x==0: - return 1 - return math.log(x) - - def get_inclusion_tree(self, id_, prop): - """ - For an entity return it geographical inclusion tree using a property. - """ - arr = [] - current_entity = gazetteer.get_by_id(id_)[0] - while True: - if prop in current_entity: - arr.append(current_entity[prop][0]) - current_entity = gazetteer.get_by_other_id(current_entity[prop][0],"wikidata") - else: - arr.append(gazetteer.get_by_label("Earth","en")[0].id) # Earth ID - break - return arr - - def get_inclusion_score(self, id1, id2): # is it really inclusion ? 
:) - list1 = get_inclusion_chain(id1, 'P131') - list2 = get_inclusion_chain(id2, 'P131') - interP131 = len(list(set(list1).intersection(list2))) - list1 = get_inclusion_chain(id1, 'P706') - list2 = get_inclusion_chain(id2, 'P706') - interP706 = len(list(set(list1).intersection(list2))) - # return fib_no[interP131]+fib_no[interP706] - return self.inclusion_log(interP131) + self.inclusion_log(interP706) - - def Adjacency_P47(self, id1, id2): - data_1, data_2 = gazetteer.get_by_id(id1)[0], gazetteer.get_by_id(id2)[0] - if "P47" in data_1 and "P47" in data_2: - if id1 in data_2.other.P47 or id2 in data_1.other.P47: - return True - return False - - def Adjacency_Hull(self, id1, id2): - return collisionTwoSEBoundaries(id1, id2) - - def disambiguateOne(self, spat_candidates, fixed_entities): - score_dc = {} - - for cand in spat_candidates: - id_cand = cand.id - score_dc[id_cand] = 0 - for fixed in fixed_entities: - id_fixed = fixed_entities[fixed].id - if self.Adjacency_P47(id_cand, id_fixed): - score_dc[id_cand] += 3 - elif self.Adjacency_Hull(id_cand, id_fixed): - score_dc[id_cand] += 2 - score_dc[id_cand] += self.get_inclusion_score(id_cand, id_fixed) - m = max(score_dc, key=score_dc.get) - if score_dc[m] < 4: - return None - for cand in spat_candidates: - if cand.id == m: - return cand.id - - - def eval(self,se_,lang): - selected_en = {} - fixed_entities = {} - ambiguous_entities = {} - for en in se_: - request = gazetteer.get_by_label(en, lang) - if len(request) == 0: - request = gazetteer.get_by_alias(en, lang) - - if len(request) > 1: - ambiguous_entities[en] = request - elif len(request) == 1: - fixed_entities[en] = request[0] - - d_amb_results = {} - for amb_ent in ambiguous_entities: - d = self.disambiguateOne(ambiguous_entities[amb_ent], fixed_entities) - if not d: - d_amb_results[amb_ent] = get_most_common_id_v3(amb_ent, lang).id - else: - d_amb_results[amb_ent] = d - #print(fixed_entities) - for k, v in fixed_entities.items(): - fixed_entities[k] = v.id - for k, v in d_amb_results.items(): - fixed_entities[k] = v - - return fixed_entities \ No newline at end of file diff --git a/strpython/nlp/disambiguator_old/models/__init__.py b/strpython/nlp/disambiguator_old/models/__init__.py deleted file mode 100644 index 950f635..0000000 --- a/strpython/nlp/disambiguator_old/models/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# coding = utf-8 \ No newline at end of file diff --git a/strpython/nlp/disambiguator_old/models/bigram.py b/strpython/nlp/disambiguator_old/models/bigram.py deleted file mode 100644 index ec146b4..0000000 --- a/strpython/nlp/disambiguator_old/models/bigram.py +++ /dev/null @@ -1,46 +0,0 @@ -# coding = utf-8 - - -class BigramModel: - def __init__(self,freq={},count={}): - self.cooc_freq=freq - self.count_associated=count - - def append(self,uri1,uri2): - - if not uri1 in self.cooc_freq: - self.cooc_freq[uri1]={} - if not uri2 in self.cooc_freq[uri1]: - self.cooc_freq[uri1][uri2]=0 - self.cooc_freq[uri1][uri2]+=1 - - self.increment_count(uri2) - - def increment_count(self,uri): - if not uri in self.count_associated: - self.count_associated[uri]=0 - self.count_associated[uri]+=1 - - def get_coocurence_probability(self, pr1, *args): - if len(args) < 2: - print("Only one URI indicated") - return 0. - res_=1. 
- for u in range(1,len(args)): - res_*=self.get_bigram_probability(args[0],args[u],pr1) - return res_ - - - def get_bigram_probability(self,uri1,uri2,pr1=1): - nna=0.00000001 - if uri1 in self.cooc_freq: - if uri2 in self.cooc_freq[uri1]: - return self.cooc_freq[uri1][uri2] - #return (self.cooc_freq[uri1][uri2] / self.count_associated[uri1])+pr1 - elif uri2 in self.cooc_freq: - if uri1 in self.cooc_freq[uri2]: - return self.cooc_freq[uri2][uri1] - #return (self.cooc_freq[uri2][uri1] / self.count_associated[uri1])+pr1 - return nna - - diff --git a/strpython/nlp/disambiguator_old/most_common.py b/strpython/nlp/disambiguator_old/most_common.py deleted file mode 100644 index 2989325..0000000 --- a/strpython/nlp/disambiguator_old/most_common.py +++ /dev/null @@ -1,71 +0,0 @@ -# coding = utf-8 - - - -from ...helpers.geodict_helpers import * -from .disambiguator import Disambiguator -import re, json, os -from ...config.configuration import config - -from inflector import Inflector,English,Spanish,French - -inflectors= { - "en":Inflector(English()), - "fr":Inflector(French()), - "es":Inflector(Spanish()) -} -stop_words = { - "fr": set(open(os.path.join(config.language_resources_path,"stop_words_fr.txt")).read().split("\n")), - "en": set(open(os.path.join(config.language_resources_path,"stop_words_en.txt")).read().split("\n")) -} - -common_words = { - "fr": set(json.load(open(os.path.join(config.language_resources_path,"dic_fr.json")))), - "en": set(open(os.path.join(config.language_resources_path,"english_common_words_filtered.txt")).read().split("\n")) -} - - -class MostCommonDisambiguator(Disambiguator): - - def __init__(self): - Disambiguator.__init__(self) - - def disambiguate(self, ner_result, lang="en"): - count, se_ = self.extract_se_entities(ner_result) - new_count = {} - selected_en = {} - for en in se_: - id_,score=self.disambiguate_(en,lang) - if not id_ == "O" and id_: - selected_en[id_] = en - new_count[id_] = count[en] - - return new_count, selected_en - - def disambiguate_list(self,toponyms,lang): - result={} - for toponym in toponyms: - id_,_=self.disambiguate_(toponym,lang) - if id_: - result[id_]=toponym - return result - - def disambiguate_(self, label, lang='fr'): - if re.match("^\d+$", label): - return 'O', -1 - if lang in stop_words: #and lang in common_words: - if label.lower().rstrip("s") in stop_words[lang]: #or label.lower().rstrip("s") in common_words[lang]: - return 'O', -1 - - if lang in inflectors: - plural=inflectors[lang].singularize(label) - else: - plural = label.rstrip("s") + "s" - if plural.lower() in stop_words[lang]:# or plural.lower() in common_words[lang]: - return 'O', -1 - - data=get_most_common_id_v3(label, lang) - id_, score=None,0 - if data: - id_,score=data.id,data.score - return id_, score diff --git a/strpython/nlp/disambiguator_old/wikipedia_cooc.py b/strpython/nlp/disambiguator_old/wikipedia_cooc.py deleted file mode 100644 index c9a522a..0000000 --- a/strpython/nlp/disambiguator_old/wikipedia_cooc.py +++ /dev/null @@ -1,110 +0,0 @@ -# coding = utf-8 -import re - -from .disambiguator import Disambiguator -from .models.bigram import BigramModel -import pickle -from ...config.configuration import config -#from ...helpers.geodict_helpers_old import * -from ...helpers.geodict_helpers import * -from .most_common import stop_words,common_words -import networkx as nx - -def read_pickle(fn): - return pickle.load(open(fn,'rb')) - -class WikipediaDisambiguator(Disambiguator): - - def __init__(self,measure="degree"): - Disambiguator.__init__(self) - # Load 
model - self.model=BigramModel(read_pickle(config.wiki_cooc_dis.cooc_freq),read_pickle(config.wiki_cooc_dis.count)) - self.measure=measure - def disambiguate(self, ner_result, lang="en"): - count, se_ = self.extract_se_entities(ner_result) - new_count = {} - selected_en_rev = {} - selected_en = self.disambiguate_wiki(se_,lang) - for en in selected_en: - selected_en_rev[en]=selected_en[en] - #new_count[selected_en[en]] = count[en] - - return new_count, selected_en - - def disambiguate_list(self,toponyms,lang): - result=self.disambiguate_wiki(toponyms,lang) - return {k:v for k,v in result.items() if v} - - def disambiguate_wiki(self, entities, lang): - - spat_en=[] - for e in entities: - if re.match("^\d+$", e): - continue - if lang in stop_words and e.lower().rstrip("s") in stop_words[lang]:# or e.lower().rstrip("s") in common_words[lang]: - continue - - plural = e.rstrip("s") + "s" - if lang in stop_words and plural.lower() in stop_words[lang]: #or plural.lower() in common_words[lang]: - continue - spat_en.append(e) - spat_en=list(set(spat_en)) - g = nx.Graph() - - possible_candidates = [] - betw_cand={} # indicate which toponym group a candidate belong to #w maybe useless ... - group_candidate = {} #candidates per toponym - - for e in spat_en: - cand = get_top_candidate(e, lang, 5)#get_top_candidate(e, lang,4) - cand = [c.id for c in cand if c] - if not cand: - cand = [c.id for c in gazetteer.get_n_label_similar(e,lang,5) if c] - group_candidate[e] = cand - betw_cand[e]=cand - for n in cand: - betw_cand[n]=set(cand)-set(n) - possible_candidates.extend(cand) - - for cand in possible_candidates: - g.add_node(cand, label=gazetteer.get_by_id(cand)[0].label[lang]) - - data_candidate={ca :gazetteer.get_by_id(ca)[0] for ca in possible_candidates} - for cand in possible_candidates: - for cand2 in possible_candidates: - # Get PageRank score - d = data_candidate[cand] - - sc = 1 - sc=d.score - # Compute probability - prob = self.model.get_coocurence_probability(sc, cand, cand2) - - if cand2 in betw_cand[cand] or cand in betw_cand[cand2]: - prob = 0.0 - if prob < 0.0000001: - prob = 0.0 - if not cand == cand2: - # take the lowest co-occurrency between two candidates - if g.has_edge(cand2, cand) : - if g.edges[cand2,cand]["weight"] < prob: - continue - g.add_edge(cand, cand2, weight=prob) - - selected = {} - - #Take the candidates with the highest degree weighted - for gr in group_candidate: - try: - - if self.measure == "degree": - selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight')) - elif self.measure == "centrality": - selected[gr] = max(group_candidate[gr], key=lambda x: nx.closeness_centrality(g, x, distance="weight")) - else:# degree by default - selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight')) - #print(1) - except Exception as e: - selected[gr]=get_most_common_id_v3(gr,lang) - return selected - diff --git a/strpython/nlp/ner/__init__.py b/strpython/nlp/ner/__init__.py index 764d8f5..52d9924 100644 --- a/strpython/nlp/ner/__init__.py +++ b/strpython/nlp/ner/__init__.py @@ -1,5 +1,5 @@ from .spacy import Spacy from .nltk import NLTK -from .polyglot import Polyglot +#from .polyglot import Polyglot from .stanford_ner import StanfordNER from .ner import NER \ No newline at end of file diff --git a/strpython/pipeline.py b/strpython/pipeline.py index c7cd89f..db81049 100644 --- a/strpython/pipeline.py +++ b/strpython/pipeline.py @@ -2,8 +2,10 @@ import re from nltk import word_tokenize +from joblib import Parallel, delayed -from 
strpython.models.str import STR +from .models.spatial_relation import RelationExtractor +from .models.str import STR from .models.transformation.transform import Generalisation, Expansion from .nlp.disambiguator import * @@ -11,12 +13,13 @@ from .nlp.ner import * from .nlp.exception.disambiguator import NotADisambiguatorInstance from .nlp.exception.ner import NotANERInstance -from .nlp.exception.tagger import NotATaggerInstance - -from .nlp.pos_tagger.tagger import Tagger -from .nlp.pos_tagger.treetagger import TreeTagger - +from multiprocessing import cpu_count +from mytoolbox.env import in_notebook +if in_notebook(): + from tqdm._tqdm_notebook import tqdm_notebook as tqdm +else: + from tqdm import tqdm class Pipeline(object): @@ -76,28 +79,46 @@ class Pipeline(object): else: raise NotADisambiguatorInstance() - - def build(self,text,se_identified=None, **kwargs): - """ - Return the corresponding STR for a text. - :param text: - :return: STR + def extract_all_relation(self,spatial_entities): """ - toponyms= kwargs.get("toponyms", None) - stop_words=kwargs.get("stop_words",[]) - - if isinstance(toponyms,list): - se_identified = self.disambiguator.disambiguate(self.lang,toponyms=[top for top in toponyms if not top.lower() in stop_words and not len(re.findall("\d+",top)) != 0 and len(top)>3]) - input = "" + Extract relation information between spatial entities + Parameters + ---------- + spatial_entities - elif se_identified: - input, se_identified = self.parse(text) - else: - input,se_identified=self.parse(text) + Returns + ------- - str_=STR(word_tokenize(input),se_identified,toponym_first=True) - str_.build(adj=True,inc=True) - str_=self.transform(str_,**kwargs) + """ + r = RelationExtractor(spatial_entities) + r.get_relation_geometry_based() + r.get_relation_meta_based() + df_adj, df_inc = r.fuse_meta_and_geom() + dict_adj = df_adj.to_dict() + dict_inc = df_inc.to_dict() + return dict_adj, dict_inc + + def pipe_build(self,texts, cpu_count=cpu_count(),**kwargs): + + text_and_spatial_entities = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.parse)(text) for text in tqdm(texts,desc="Extract spatial entities from the texts")) + sp_es= [] + for res in text_and_spatial_entities: + sp_es.extend(list(res[1].values())) + sp_es= [es for es in sp_es if es.startswith("GD")] + print("Extract Spatial Relation for all identified spatial entities") + adj_rel_dict, inc_rel_dict = self.extract_all_relation(sp_es) + + str_s = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.build)(ext[0], ext[1], adj_rel_dict, inc_rel_dict, **kwargs) for ext in tqdm(text_and_spatial_entities, desc="Build STR")) + return str_s + + def pipe_transform(self,strs_, cpu_count=cpu_count(),**kwargs): + + str_s = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.transform)(str_, **kwargs) for str_ in tqdm(strs_,desc="Transform STR")) + return str_s + + def build(self, text_input, spatial_entities_identified, prec_adj, prec_inc): + str_ = STR(word_tokenize(text_input), spatial_entities_identified, toponym_first=True,precomputed_adj=prec_adj,precomputed_inc=prec_inc) + str_.build(adj=True, inc=True) return str_ def transform(self,str_,**kwargs): -- GitLab
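
Illustrative usage sketch (not part of the patch): the snippet below shows how the new parallel entry points introduced here, Pipeline.pipe_build() and Pipeline.pipe_transform(), might be driven. The Pipeline constructor arguments, the language code and the sample texts are assumptions for illustration only; they do not appear in this diff.

    from strpython.pipeline import Pipeline

    # Example corpus (made up for illustration).
    texts = [
        "Heavy rains flooded Montpellier and several villages of the Herault.",
        "The expedition left Kathmandu and reached the Everest base camp.",
    ]

    # Constructor signature assumed; only pipe_build/pipe_transform come from this patch.
    pipeline = Pipeline(lang="en")

    # pipe_build() parses every text in parallel (joblib, threading backend),
    # extracts all adjacency/inclusion relations once through RelationExtractor,
    # then builds one STR per text from the precomputed relation dictionaries.
    strs = pipeline.pipe_build(texts, cpu_count=4)

    # pipe_transform() applies the STR transformations to each graph in parallel;
    # extra keyword arguments are forwarded to Pipeline.transform().
    transformed = pipeline.pipe_transform(strs, cpu_count=4)

The threading backend is presumably chosen so that the precomputed adjacency/inclusion dictionaries and the gazetteer caches stay shared in memory across workers instead of being pickled into separate processes.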