Commit e9d151de authored by Fize Jacques

Debug spatial relation extraction

Debug disambiguator
delete old disambiguator classes
Add Parallelization for STR generation and Transform
parent 176a106e
Showing with 228 additions and 450 deletions
......@@ -80,7 +80,7 @@ def getGEO(id_se):
if "path" in data.other:
return gpd.read_file(os.path.join(config.osm_boundaries_directory, data.other["path"])).convex_hull
elif "coord" in data.other:
return gpd.GeoDataFrame(gpd.GeoSeries([Point(data.coord.lon, data.coord.lat).buffer(1.0)])).rename(
return gpd.GeoDataFrame(gpd.GeoSeries([Point(data.coord.lon, data.coord.lat).buffer(0.5)])).rename(
columns={0: 'geometry'})
return None
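This hunk shrinks the fallback footprint built around a point coordinate from a 1.0-degree to a 0.5-degree buffer. A minimal sketch of what that fallback geometry looks like, using shapely and geopandas with a made-up coordinate (not taken from the gazetteer):

```python
import geopandas as gpd
from shapely.geometry import Point

# Hypothetical (lon, lat) for an entity with no OSM boundary file available.
lon, lat = 2.35, 48.85

# Fallback footprint: a disc of 0.5 degrees (~55 km of latitude) around the point,
# wrapped the same way getGEO does, so the result has a 'geometry' column.
footprint = gpd.GeoDataFrame(
    gpd.GeoSeries([Point(lon, lat).buffer(0.5)])
).rename(columns={0: "geometry"})

print(footprint["geometry"].iloc[0].bounds)  # (1.85, 48.35, 2.85, 49.35)
```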
......@@ -144,4 +144,5 @@ def collisionTwoSEBoundaries(id_se1, id_se2):
__cache_adjacency[id_se1][id_se2] = True
return True
__cache_adjacency[id_se1][id_se2] = False
return False
\ No newline at end of file
return False
# coding = utf-8
from shapely.geometry import Point
from .collision import collide
from .geo_relation_database import GeoRelationMatchingDatabase
from ..helpers.geodict_helpers import gazetteer
class RelationExtractor():
__cache_entity_data = {}
def __init__(self, pre_computed={}):
self.db_rel_match = pre_computed
def is_relation(self, id_se1: str, id_se2: str):
raise NotImplementedError()
def get_data(self, id_se):
"""
Return a gazpy.Element object containing information about a spatial entity.
Parameters
----------
id_se : str
Identifier of the spatial entity
Returns
-------
gazpy.Element
data
"""
if id_se in RelationExtractor.__cache_entity_data:
return RelationExtractor.__cache_entity_data[id_se]
data = gazetteer.get_by_id(id_se)
if len(data) > 0:
RelationExtractor.__cache_entity_data[id_se] = data[0]
return data[0]
def in_cache(self, id_se1: str, id_se2: str):
raise NotImplementedError()
def add_cache(self,id_se1: str, id_se2: str, value : bool, two_way:bool = False):
if id_se1 not in self.db_rel_match:
self.db_rel_match[id_se1] = {}
if two_way and id_se2 not in self.db_rel_match:
self.db_rel_match[id_se2] = {}
self.db_rel_match[id_se1][id_se2] = value
if two_way:
self.db_rel_match[id_se2][id_se1] = value
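For reference, add_cache stores results in a nested dict keyed by the two entity identifiers, writing both directions when two_way is set. A stand-alone sketch of that cache shape (identifiers are made up; this is not the project's API):

```python
# Hypothetical identifiers; real ones come from the gazetteer (e.g. "GD1234").
cache = {}

def add_cache(cache, id_se1, id_se2, value, two_way=False):
    # Same nested shape as RelationExtractor.add_cache: cache[id_a][id_b] -> bool
    cache.setdefault(id_se1, {})[id_se2] = value
    if two_way:
        cache.setdefault(id_se2, {})[id_se1] = value

add_cache(cache, "GD1", "GD2", True, two_way=True)  # adjacency is stored symmetrically
add_cache(cache, "GD3", "GD1", True)                 # inclusion is directed
print(cache)  # {'GD1': {'GD2': True}, 'GD2': {'GD1': True}, 'GD3': {'GD1': True}}
```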
class InclusionRelation(RelationExtractor):
def __init__(self, precomputed={}):
RelationExtractor.__init__(self, precomputed)
def in_cache(self, id_se1: str, id_se2: str):
if id_se1 in self.db_rel_match and id_se2 in self.db_rel_match[id_se1]:
return True, self.db_rel_match[id_se1][id_se2]
return False, False
def is_relation(self, id_se1: str, id_se2: str):
found_, value = self.in_cache(id_se1, id_se2)
if found_:
return value
inc_chain_P131, inc_chain_P706 = self.get_inclusion_chain(id_se1, "P131"), self.get_inclusion_chain(id_se1,"P706")
inc_chain = inc_chain_P131
inc_chain.extend(inc_chain_P706)
inc_chain = set(inc_chain)
if id_se2 in inc_chain:
self.add_cache(id_se1, id_se2, True)
return True
self.add_cache(id_se1, id_se2, False)
return False
def get_inclusion_chain(self, id_, prop):
"""
For an entity, return its geographical inclusion chain using a property.
"""
arr__ = []
try:
current_entity = gazetteer.get_by_id(id_)[0]
if "inc_" + prop in current_entity.other:
arr__ = current_entity.other["inc_" + prop]
elif "inc_geoname" in current_entity.other:
arr__ = current_entity.other.inc_geoname
if isinstance(arr__, str):
arr__ = [arr__]
except:
pass
return arr__
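InclusionRelation.is_relation answers "is id_se1 inside id_se2?" by testing whether id_se2 appears in the union of id_se1's P131 (administrative) and P706 (located-in-terrain-feature) inclusion chains. A minimal restatement with stubbed chains (identifiers and chains are invented; the real ones come from the gazetteer):

```python
# Stubbed inclusion chains keyed by (entity id, property).
chains = {
    ("GD_paris", "P131"): ["GD_iledefrance", "GD_france"],
    ("GD_paris", "P706"): ["GD_paris_basin"],
}

def is_included(id_se1, id_se2):
    # Union of both chains, as in is_relation, then a simple membership test.
    chain = set(chains.get((id_se1, "P131"), [])) | set(chains.get((id_se1, "P706"), []))
    return id_se2 in chain

print(is_included("GD_paris", "GD_france"))       # True
print(is_included("GD_paris", "GD_paris_basin"))  # True
print(is_included("GD_paris", "GD_spain"))        # False
```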
class AdjacencyRelation(RelationExtractor):
def __init__(self, precomputed={},inc_rel_extractor=InclusionRelation()):
RelationExtractor.__init__(self, precomputed)
self.inc_rel_extractor=inc_rel_extractor
def in_cache(self, id_se1: str, id_se2: str):
if id_se1 in self.db_rel_match and id_se2 in self.db_rel_match[id_se1]:
return True, self.db_rel_match[id_se1][id_se2]
elif id_se2 in self.db_rel_match and id_se1 in self.db_rel_match[id_se2]:
return True, self.db_rel_match[id_se2][id_se1]
return False, False
def is_relation(self, id_se1: str, id_se2: str):
found_, value = self.in_cache(id_se1, id_se2)
if found_:
return value
stop_class = {"A-PCLI", "A-ADM1"}
def get_p47_adjacency_data(data):
p47se1 = []
for el in data.other.P47:
d = gazetteer.get_by_other_id(el, "wikidata")
if not d: continue
p47se1.append(d[0].id)
return p47se1
if self.inc_rel_extractor.is_relation(id_se1, id_se2) or self.inc_rel_extractor.is_relation(id_se2, id_se1):
self.add_cache(id_se1, id_se2, False, True)
return False
data_se1, data_se2 = self.get_data(id_se1), self.get_data(id_se2)
if not data_se1 or not data_se2:
self.add_cache(id_se1, id_se2, False, True)
return False
if "P47" in data_se2.other and id_se1 in get_p47_adjacency_data(data_se2):
self.add_cache(id_se1, id_se2, True, True)
return True
elif "P47" in data_se1.other and id_se2 in get_p47_adjacency_data(data_se1):
self.add_cache(id_se1, id_se2, True,True)
return True
if collide(id_se1, id_se2):
self.add_cache(id_se1, id_se2, True,True)
return True
if data_se1 and data_se2 and "coord" in data_se1 and "coord" in data_se2:
if Point(data_se1.coord.lon, data_se1.coord.lat).distance(
Point(data_se2.coord.lon, data_se2.coord.lat)) < 1 and len(
set(data_se1.class_) & stop_class) < 1 and len(set(data_se2.class_) & stop_class) < 1:
self.add_cache(id_se1, id_se2, True,True)
return True
self.add_cache(id_se1, id_se2, False,True)
return False
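The adjacency test above runs through a fixed cascade: cache hit, then "inclusion excludes adjacency", then a Wikidata P47 link, then a hull collision, and finally a distance check between the two points that is skipped for countries and first-level regions. A simplified restatement of that ordering (not the actual method; the inputs are pre-computed flags):

```python
def adjacency_decision(*, cached=None, included=False, p47_linked=False,
                       hulls_collide=False, point_distance_deg=None,
                       classes1=(), classes2=()):
    """Order of the checks in AdjacencyRelation.is_relation, restated."""
    stop_class = {"A-PCLI", "A-ADM1"}  # countries / first-level admin regions
    if cached is not None:
        return cached                  # 1. cached result
    if included:
        return False                   # 2. inclusion (either direction) excludes adjacency
    if p47_linked:
        return True                    # 3. Wikidata P47 "shares border with"
    if hulls_collide:
        return True                    # 4. convex hulls collide
    if (point_distance_deg is not None and point_distance_deg < 1
            and not set(classes1) & stop_class and not set(classes2) & stop_class):
        return True                    # 5. points under 1 degree apart, neither a country/region
    return False

print(adjacency_decision(p47_linked=True))                     # True
print(adjacency_decision(point_distance_deg=0.4))              # True
print(adjacency_decision(included=True, hulls_collide=True))   # False
```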
......@@ -74,11 +74,11 @@ class RelationExtractor(MetaCollector):
spatial_entities : list
list of spatial entity identifiers
"""
self.spatial_entities = spatial_entities
self.spatial_entities = list(set(spatial_entities))
# Retrieve Geometries
data = [[sp_id, getGEO(sp_id)] for sp_id in
tqdm(spatial_entities, desc="Retrieving Geometries...")]
tqdm(self.spatial_entities, desc="Retrieving Geometries...")]
self.all_geometry = []
for i in data:
......@@ -116,18 +116,18 @@ class RelationExtractor(MetaCollector):
except Exception as e:
print(e)
corr_ = gdf_intersect.iloc[:, 2:] ^ (gdf_within.iloc[:,2:] | gdf_within.iloc[:,2:].T) # An entity cannot be related to another entity by two types of relation
adj_ = gdf_intersect.iloc[:, 2:] & corr_ # intersecting pairs already related by inclusion are not counted as adjacent
gdf_intersect.set_index("id", inplace=True)
gdf_within.set_index("id", inplace=True)
del gdf_intersect["geometry"]
del gdf_within["geometry"]
gdf_adjacency = gdf_within.iloc[:, :2]
gdf_adjacency = pd.concat((gdf_adjacency, adj_), axis=1) # stick id and geom columns onto the adjacency data
corr_ = gdf_intersect ^ (gdf_within | gdf_within.T) # An entity cannot be related to another entity by two types of relation
adj_ = gdf_intersect & corr_ # intersecting pairs already related by inclusion are not counted as adjacent
del gdf_adjacency["geometry"]
del gdf_within["geometry"]
# Transform to dict for faster access
self.adjacency_geom = gdf_adjacency.set_index("id")
self.inclusion_geom = gdf_within.set_index("id")
self.adjacency_geom = adj_
self.inclusion_geom = gdf_within
def get_relation_meta_based(self):
"""
......@@ -158,7 +158,7 @@ class RelationExtractor(MetaCollector):
adj_res[se2][se1] = adj_res[se1][se2]
self.adjacency_meta = pd.DataFrame.from_dict(adj_res)
self.inclusion_meta = pd.DataFrame.from_dict(inc_res)
self.inclusion_meta = pd.DataFrame.from_dict(inc_res,orient="index")
def fuse_meta_and_geom(self):
"""
......@@ -176,16 +176,15 @@ class RelationExtractor(MetaCollector):
self.inclusion_meta.sort_index(inplace=True)
self.adjacency_geom.sort_index(inplace=True)
self.inclusion_geom.sort_index(inplace=True)
self.adjacency_meta.sort_index(axis=1, inplace=True)
self.inclusion_meta.sort_index(axis=1, inplace=True)
self.adjacency_geom.sort_index(axis=1, inplace=True)
self.inclusion_geom.sort_index(axis=1, inplace=True)
df_adj = self.adjacency_meta.copy()
df_inc = self.inclusion_meta.copy()
df_adj.iloc[:, :] = self.adjacency_meta | self.adjacency_geom
df_inc.iloc[:, :] = self.inclusion_meta | self.inclusion_geom
df_inc = self.inclusion_meta
self.adjacency_geom = (self.adjacency_geom ^ (self.inclusion_meta | self.inclusion_meta.T)) & self.adjacency_geom
self.adjacency_meta = (self.adjacency_meta ^ (self.inclusion_meta | self.inclusion_meta.T)) & self.adjacency_meta
df_adj = (self.adjacency_geom | self.adjacency_meta)
return df_adj, df_inc
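Both the geometry-based step and fuse_meta_and_geom rely on the same boolean-matrix trick: a pair only stays adjacent if it intersects and is not in an inclusion relation in either direction, hence the `x ^ (inc | inc.T)` followed by `& x` pattern. A toy pandas illustration with three invented entities:

```python
import pandas as pd

ids = ["GD1", "GD2", "GD3"]
# GD1 intersects GD2 and GD3; GD1 is also included in GD2.
intersects = pd.DataFrame([[False, True, True],
                           [True, False, False],
                           [True, False, False]], index=ids, columns=ids)
within = pd.DataFrame([[False, True, False],
                       [False, False, False],
                       [False, False, False]], index=ids, columns=ids)

corr_ = intersects ^ (within | within.T)  # pairs not already explained by inclusion
adjacency = intersects & corr_            # ... that also intersect
print(adjacency.loc["GD1", "GD2"])  # False: inclusion wins over adjacency
print(adjacency.loc["GD1", "GD3"])  # True
```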
......
......@@ -39,7 +39,7 @@ class STR(object):
"""
__cache_entity_data = {} # Store data about entity requested
def __init__(self, tagged_text, spatial_entities,toponym_first=True):
def __init__(self, tagged_text, spatial_entities,toponym_first=True, precomputed_inc={}, precomputed_adj={}):
"""
Constructor
......@@ -64,8 +64,8 @@ class STR(object):
self.adjacency_relationships = {}
self.inclusion_relationships = {}
self.adj_rel_db=AdjacencyRelation()
self.inc_rel_db = InclusionRelation()
self.inc_rel_db = InclusionRelation(precomputed_inc)
self.adj_rel_db = AdjacencyRelation(precomputed_adj,self.inc_rel_db)
self.graph = nx.MultiDiGraph()
......
......@@ -30,8 +30,10 @@ class Disambiguator(object):
dict
{toponym : geodictID}
"""
if isinstance(ner_output, np.ndarray) and ner_output.shape[1] == 2:
if isinstance(ner_output, np.ndarray) and len(ner_output.shape) == 2 and ner_output.shape[1] == 2:
toponyms = self.parse_ner_output(ner_output)
elif len(ner_output.shape) != 2:
return {}
elif not toponyms:
raise ValueError("Either enter a list of toponyms or give ner_output")
if self.context_based:
......
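The tightened check only parses ner_output when it is a 2-D array with two columns (token, tag) and now returns an empty mapping for arrays of any other dimensionality instead of failing. A small illustration of the accepted shape (the tag strings are placeholders, not necessarily the library's unified tags):

```python
import numpy as np

ner_output = np.array([["I", "O"],
                       ["visited", "O"],
                       ["Montpellier", "PLACE"]])
print(ner_output.shape)  # (3, 2) -> accepted and passed to parse_ner_output
print(len(ner_output.shape) == 2 and ner_output.shape[1] == 2)  # True

bad_output = np.array(["not", "token/tag", "pairs"])
print(len(bad_output.shape) == 2)  # False -> disambiguate() now returns {}
```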
# coding = utf-8
\ No newline at end of file
# coding = utf-8
import copy
import string
import numpy as np
from ..ner.ner import NER
class Disambiguator(object):
def __init__(self):
"""Constructor for Disambiguator"""
pass
def extract_se_entities(self, input):
out = Disambiguator.parse_corpus(input)
en_ = out[out[:, 1] == NER._unified_tag["place"]]
return self.toponymes_frequencies(en_[:, 0]), np.unique(en_[:, 0])
def toponymes_frequencies(self, ens_):
count = {}
for en in ens_:
if not en in count: count[en] = 0
count[en] += 1
return count
@staticmethod
def parse_corpus(corpus):
final_corpus = []
t = 0
placeTag = NER._unified_tag["place"]
while t < len(corpus):
tag = copy.copy(corpus[t])
if tag[1] == "BEG-" + NER._unified_tag["place"] or tag[1] == placeTag:
lenw = 1
if tag[1] == "BEG-" + placeTag:
compound_tag = tag[0]
t += 1
while corpus[t][1] == "END-" + placeTag or corpus[t][1] == placeTag:
tag = copy.copy(corpus[t])
if tag[0][-1] in string.punctuation or compound_tag[-1] in string.punctuation:
compound_tag += tag[0]
else:
compound_tag += " " + tag[0]
t += 1
lenw += 1
tag[0] = compound_tag
tag[1] = placeTag
t += 1
else:
t += 1
final_corpus.append(tag)
return np.array(final_corpus)
def disambiguate(self, ner_result):
pass
def disambiguate_list(self,toponyms,lang):
pass
\ No newline at end of file
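parse_corpus above walks the tagged token list and glues BEG-/END-tagged place tokens back into one compound toponym row. An independent sketch of that merging idea, assuming the unified place tag is the string "PLACE" (the real value lives in NER._unified_tag, and the original also handles punctuation joining and bare place tags):

```python
import numpy as np

PLACE = "PLACE"  # stand-in for NER._unified_tag["place"]

tagged = [["I", "O"], ["visited", "O"],
          ["New", "BEG-" + PLACE], ["York", "END-" + PLACE],
          ["yesterday", "O"]]

merged, i = [], 0
while i < len(tagged):
    token, tag = tagged[i]
    if tag == "BEG-" + PLACE:
        parts = [token]
        i += 1
        # consume the rest of the compound toponym
        while i < len(tagged) and tagged[i][1] in (PLACE, "END-" + PLACE):
            parts.append(tagged[i][0])
            i += 1
        merged.append([" ".join(parts), PLACE])
    else:
        merged.append([token, tag])
        i += 1

print(np.array(merged))
# [['I' 'O'] ['visited' 'O'] ['New York' 'PLACE'] ['yesterday' 'O']]
```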
# coding = utf-8
import math
from ...helpers.collision import *
#from ...helpers.geodict_helpers_old import *
from ...helpers.geodict_helpers import *
from .disambiguator import Disambiguator
from ...models.str import get_inclusion_chain
class GauravGeodict(Disambiguator):
def __init__(self):
Disambiguator.__init__(self)
def fib_formula(self, n):
if n in [0, 1]: return 0 # Modifying fibonacci behaviour
golden_ratio = (1 + math.sqrt(5)) / 2
val = (golden_ratio ** n - (1 - golden_ratio) ** n) / math.sqrt(5)
return int(round(val))
def inclusion_log(self, x, alpha=0.2):
if x==0:
return 1
return math.log(x)
def get_inclusion_tree(self, id_, prop):
"""
For an entity, return its geographical inclusion tree using a property.
"""
arr = []
current_entity = gazetteer.get_by_id(id_)[0]
while True:
if prop in current_entity:
arr.append(current_entity[prop][0])
current_entity = gazetteer.get_by_other_id(current_entity[prop][0],"wikidata")
else:
arr.append(gazetteer.get_by_label("Earth","en")[0].id) # Earth ID
break
return arr
def get_inclusion_score(self, id1, id2): # is it really inclusion ? :)
list1 = get_inclusion_chain(id1, 'P131')
list2 = get_inclusion_chain(id2, 'P131')
interP131 = len(list(set(list1).intersection(list2)))
list1 = get_inclusion_chain(id1, 'P706')
list2 = get_inclusion_chain(id2, 'P706')
interP706 = len(list(set(list1).intersection(list2)))
# return fib_no[interP131]+fib_no[interP706]
return self.inclusion_log(interP131) + self.inclusion_log(interP706)
def Adjacency_P47(self, id1, id2):
data_1, data_2 = gazetteer.get_by_id(id1)[0], gazetteer.get_by_id(id2)[0]
if "P47" in data_1 and "P47" in data_2:
if id1 in data_2.other.P47 or id2 in data_1.other.P47:
return True
return False
def Adjacency_Hull(self, id1, id2):
return collisionTwoSEBoundaries(id1, id2)
def disambiguateOne(self, spat_candidates, fixed_entities):
score_dc = {}
for cand in spat_candidates:
id_cand = cand.id
score_dc[id_cand] = 0
for fixed in fixed_entities:
id_fixed = fixed_entities[fixed].id
if self.Adjacency_P47(id_cand, id_fixed):
score_dc[id_cand] += 3
elif self.Adjacency_Hull(id_cand, id_fixed):
score_dc[id_cand] += 2
score_dc[id_cand] += self.get_inclusion_score(id_cand, id_fixed)
m = max(score_dc, key=score_dc.get)
if score_dc[m] < 4:
return None
for cand in spat_candidates:
if cand.id == m:
return cand.id
def eval(self,se_,lang):
selected_en = {}
fixed_entities = {}
ambiguous_entities = {}
for en in se_:
request = gazetteer.get_by_label(en, lang)
if len(request) == 0:
request = gazetteer.get_by_alias(en, lang)
if len(request) > 1:
ambiguous_entities[en] = request
elif len(request) == 1:
fixed_entities[en] = request[0]
d_amb_results = {}
for amb_ent in ambiguous_entities:
d = self.disambiguateOne(ambiguous_entities[amb_ent], fixed_entities)
if not d:
d_amb_results[amb_ent] = get_most_common_id_v3(amb_ent, lang).id
else:
d_amb_results[amb_ent] = d
#print(fixed_entities)
for k, v in fixed_entities.items():
fixed_entities[k] = v.id
for k, v in d_amb_results.items():
fixed_entities[k] = v
return fixed_entities
\ No newline at end of file
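disambiguateOne scores each ambiguous candidate against the already fixed entities: +3 for a Wikidata P47 link, otherwise +2 for a hull collision, plus the log-based inclusion score, and a candidate is kept only when its best score reaches 4. A small worked example of that arithmetic (overlap sizes are invented; note that inclusion_log returns 1 for an empty overlap and log(1)=0 for an overlap of one):

```python
import math

def inclusion_log(x):
    # same shape as GauravGeodict.inclusion_log
    return 1 if x == 0 else math.log(x)

# Hypothetical candidate sharing a P131 chain overlap of 3 and no P706 overlap
# with a fixed entity, and linked to it through Wikidata P47.
inclusion_score = inclusion_log(3) + inclusion_log(0)  # ~1.10 + 1
total = 3 + inclusion_score                             # +3 for the P47 adjacency
print(round(total, 2))  # 5.1 -> above the acceptance threshold of 4
```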
# coding = utf-8
\ No newline at end of file
# coding = utf-8
class BigramModel:
def __init__(self,freq={},count={}):
self.cooc_freq=freq
self.count_associated=count
def append(self,uri1,uri2):
if not uri1 in self.cooc_freq:
self.cooc_freq[uri1]={}
if not uri2 in self.cooc_freq[uri1]:
self.cooc_freq[uri1][uri2]=0
self.cooc_freq[uri1][uri2]+=1
self.increment_count(uri2)
def increment_count(self,uri):
if not uri in self.count_associated:
self.count_associated[uri]=0
self.count_associated[uri]+=1
def get_coocurence_probability(self, pr1, *args):
if len(args) < 2:
print("Only one URI indicated")
return 0.
res_=1.
for u in range(1,len(args)):
res_*=self.get_bigram_probability(args[0],args[u],pr1)
return res_
def get_bigram_probability(self,uri1,uri2,pr1=1):
nna=0.00000001
if uri1 in self.cooc_freq:
if uri2 in self.cooc_freq[uri1]:
return self.cooc_freq[uri1][uri2]
#return (self.cooc_freq[uri1][uri2] / self.count_associated[uri1])+pr1
elif uri2 in self.cooc_freq:
if uri1 in self.cooc_freq[uri2]:
return self.cooc_freq[uri2][uri1]
#return (self.cooc_freq[uri2][uri1] / self.count_associated[uri1])+pr1
return nna
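BigramModel keeps raw co-occurrence counts per ordered pair of entity URIs; as written, get_bigram_probability returns the stored count (the normalised variant is commented out) and a tiny constant for unseen pairs, and get_coocurence_probability multiplies those values over the URI list. A minimal usage sketch with invented URIs, assuming the class above is importable:

```python
model = BigramModel()
model.append("GD_paris", "GD_france")
model.append("GD_paris", "GD_france")
model.append("GD_paris", "GD_lyon")

print(model.get_bigram_probability("GD_paris", "GD_france"))  # 2 (raw count)
print(model.get_bigram_probability("GD_paris", "GD_berlin"))  # 1e-08 fallback for unseen pairs
print(model.get_coocurence_probability(1, "GD_paris", "GD_france", "GD_lyon"))  # 2.0
```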
# coding = utf-8
from ...helpers.geodict_helpers import *
from .disambiguator import Disambiguator
import re, json, os
from ...config.configuration import config
from inflector import Inflector,English,Spanish,French
inflectors= {
"en":Inflector(English()),
"fr":Inflector(French()),
"es":Inflector(Spanish())
}
stop_words = {
"fr": set(open(os.path.join(config.language_resources_path,"stop_words_fr.txt")).read().split("\n")),
"en": set(open(os.path.join(config.language_resources_path,"stop_words_en.txt")).read().split("\n"))
}
common_words = {
"fr": set(json.load(open(os.path.join(config.language_resources_path,"dic_fr.json")))),
"en": set(open(os.path.join(config.language_resources_path,"english_common_words_filtered.txt")).read().split("\n"))
}
class MostCommonDisambiguator(Disambiguator):
def __init__(self):
Disambiguator.__init__(self)
def disambiguate(self, ner_result, lang="en"):
count, se_ = self.extract_se_entities(ner_result)
new_count = {}
selected_en = {}
for en in se_:
id_,score=self.disambiguate_(en,lang)
if not id_ == "O" and id_:
selected_en[id_] = en
new_count[id_] = count[en]
return new_count, selected_en
def disambiguate_list(self,toponyms,lang):
result={}
for toponym in toponyms:
id_,_=self.disambiguate_(toponym,lang)
if id_:
result[id_]=toponym
return result
def disambiguate_(self, label, lang='fr'):
if re.match("^\d+$", label):
return 'O', -1
if lang in stop_words: #and lang in common_words:
if label.lower().rstrip("s") in stop_words[lang]: #or label.lower().rstrip("s") in common_words[lang]:
return 'O', -1
if lang in inflectors:
plural=inflectors[lang].singularize(label)
else:
plural = label.rstrip("s") + "s"
if plural.lower() in stop_words[lang]:# or plural.lower() in common_words[lang]:
return 'O', -1
data=get_most_common_id_v3(label, lang)
id_, score=None,0
if data:
id_,score=data.id,data.score
return id_, score
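disambiguate_ first discards labels that are pure numbers or stop words, singularising with the language's inflector when one is available, and only then asks the gazetteer for the most common entry. A sketch of the early-exit filtering alone, using the crude rstrip-based fallback instead of the inflector and an invented stop-word list:

```python
import re

stop_words_en = {"the", "north", "south"}  # stand-in; the real lists come from resource files

def looks_spurious(label, stop_words):
    # Mirrors the early exits of disambiguate_(): numbers and stop words map to 'O'.
    if re.match(r"^\d+$", label):
        return True
    if label.lower().rstrip("s") in stop_words:
        return True
    plural = label.rstrip("s") + "s"  # crude fallback when no inflector is configured
    return plural.lower() in stop_words

print(looks_spurious("1842", stop_words_en))         # True  -> tagged 'O'
print(looks_spurious("North", stop_words_en))        # True  -> stop word
print(looks_spurious("Montpellier", stop_words_en))  # False -> goes to the gazetteer
```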
# coding = utf-8
import re
from .disambiguator import Disambiguator
from .models.bigram import BigramModel
import pickle
from ...config.configuration import config
#from ...helpers.geodict_helpers_old import *
from ...helpers.geodict_helpers import *
from .most_common import stop_words,common_words
import networkx as nx
def read_pickle(fn):
return pickle.load(open(fn,'rb'))
class WikipediaDisambiguator(Disambiguator):
def __init__(self,measure="degree"):
Disambiguator.__init__(self)
# Load model
self.model=BigramModel(read_pickle(config.wiki_cooc_dis.cooc_freq),read_pickle(config.wiki_cooc_dis.count))
self.measure=measure
def disambiguate(self, ner_result, lang="en"):
count, se_ = self.extract_se_entities(ner_result)
new_count = {}
selected_en_rev = {}
selected_en = self.disambiguate_wiki(se_,lang)
for en in selected_en:
selected_en_rev[en]=selected_en[en]
#new_count[selected_en[en]] = count[en]
return new_count, selected_en
def disambiguate_list(self,toponyms,lang):
result=self.disambiguate_wiki(toponyms,lang)
return {k:v for k,v in result.items() if v}
def disambiguate_wiki(self, entities, lang):
spat_en=[]
for e in entities:
if re.match("^\d+$", e):
continue
if lang in stop_words and e.lower().rstrip("s") in stop_words[lang]:# or e.lower().rstrip("s") in common_words[lang]:
continue
plural = e.rstrip("s") + "s"
if lang in stop_words and plural.lower() in stop_words[lang]: #or plural.lower() in common_words[lang]:
continue
spat_en.append(e)
spat_en=list(set(spat_en))
g = nx.Graph()
possible_candidates = []
betw_cand={} # indicates which toponym group a candidate belongs to # maybe useless ...
group_candidate = {} # candidates per toponym
for e in spat_en:
cand = get_top_candidate(e, lang, 5)#get_top_candidate(e, lang,4)
cand = [c.id for c in cand if c]
if not cand:
cand = [c.id for c in gazetteer.get_n_label_similar(e,lang,5) if c]
group_candidate[e] = cand
betw_cand[e]=cand
for n in cand:
betw_cand[n]=set(cand)-set(n)
possible_candidates.extend(cand)
for cand in possible_candidates:
g.add_node(cand, label=gazetteer.get_by_id(cand)[0].label[lang])
data_candidate={ca :gazetteer.get_by_id(ca)[0] for ca in possible_candidates}
for cand in possible_candidates:
for cand2 in possible_candidates:
# Get PageRank score
d = data_candidate[cand]
sc = 1
sc=d.score
# Compute probability
prob = self.model.get_coocurence_probability(sc, cand, cand2)
if cand2 in betw_cand[cand] or cand in betw_cand[cand2]:
prob = 0.0
if prob < 0.0000001:
prob = 0.0
if not cand == cand2:
# keep the lowest co-occurrence probability between two candidates
if g.has_edge(cand2, cand) :
if g.edges[cand2,cand]["weight"] < prob:
continue
g.add_edge(cand, cand2, weight=prob)
selected = {}
# Take the candidate with the highest weighted degree per toponym group
for gr in group_candidate:
try:
if self.measure == "degree":
selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight'))
elif self.measure == "centrality":
selected[gr] = max(group_candidate[gr], key=lambda x: nx.closeness_centrality(g, x, distance="weight"))
else:# degree by default
selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight'))
#print(1)
except Exception as e:
selected[gr]=get_most_common_id_v3(gr,lang)
return selected
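Each toponym's candidates end up as nodes of a co-occurrence graph whose edge weights come from the bigram model, and the candidate with the highest weighted degree (or closeness centrality) wins its toponym group. A toy networkx illustration of the degree-based selection with invented identifiers and weights:

```python
import networkx as nx

g = nx.Graph()
# Two candidates for the toponym "Paris"; weights stand in for co-occurrence scores.
g.add_edge("GD_paris_fr", "GD_france", weight=0.9)
g.add_edge("GD_paris_fr", "GD_lyon", weight=0.7)
g.add_edge("GD_paris_tx", "GD_france", weight=0.1)

group_candidate = {"Paris": ["GD_paris_fr", "GD_paris_tx"]}
selected = {top: max(cands, key=lambda c: g.degree(c, weight="weight"))
            for top, cands in group_candidate.items()}
print(selected)  # {'Paris': 'GD_paris_fr'} (weighted degree 1.6 vs 0.1)
```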
from .spacy import Spacy
from .nltk import NLTK
from .polyglot import Polyglot
#from .polyglot import Polyglot
from .stanford_ner import StanfordNER
from .ner import NER
\ No newline at end of file
......@@ -2,8 +2,10 @@
import re
from nltk import word_tokenize
from joblib import Parallel, delayed
from strpython.models.str import STR
from .models.spatial_relation import RelationExtractor
from .models.str import STR
from .models.transformation.transform import Generalisation, Expansion
from .nlp.disambiguator import *
......@@ -11,12 +13,13 @@ from .nlp.ner import *
from .nlp.exception.disambiguator import NotADisambiguatorInstance
from .nlp.exception.ner import NotANERInstance
from .nlp.exception.tagger import NotATaggerInstance
from .nlp.pos_tagger.tagger import Tagger
from .nlp.pos_tagger.treetagger import TreeTagger
from multiprocessing import cpu_count
from mytoolbox.env import in_notebook
if in_notebook():
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
else:
from tqdm import tqdm
class Pipeline(object):
......@@ -76,28 +79,46 @@ class Pipeline(object):
else:
raise NotADisambiguatorInstance()
def build(self,text,se_identified=None, **kwargs):
"""
Return the corresponding STR for a text.
:param text:
:return: STR
"""
toponyms= kwargs.get("toponyms", None)
stop_words=kwargs.get("stop_words",[])
if isinstance(toponyms,list):
se_identified = self.disambiguator.disambiguate(self.lang,toponyms=[top for top in toponyms if not top.lower() in stop_words and not len(re.findall("\d+",top)) != 0 and len(top)>3])
input = ""
elif se_identified:
input, se_identified = self.parse(text)
else:
input,se_identified=self.parse(text)
str_=STR(word_tokenize(input),se_identified,toponym_first=True)
str_.build(adj=True,inc=True)
str_=self.transform(str_,**kwargs)
def extract_all_relation(self,spatial_entities):
"""
Extract relation information between spatial entities
Parameters
----------
spatial_entities
Returns
-------
"""
r = RelationExtractor(spatial_entities)
r.get_relation_geometry_based()
r.get_relation_meta_based()
df_adj, df_inc = r.fuse_meta_and_geom()
dict_adj = df_adj.to_dict()
dict_inc = df_inc.to_dict()
return dict_adj, dict_inc
def pipe_build(self,texts, cpu_count=cpu_count(),**kwargs):
text_and_spatial_entities = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.parse)(text) for text in tqdm(texts,desc="Extract spatial entities from the texts"))
sp_es= []
for res in text_and_spatial_entities:
sp_es.extend(list(res[1].values()))
sp_es= [es for es in sp_es if es.startswith("GD")]
print("Extract Spatial Relation for all identified spatial entities")
adj_rel_dict, inc_rel_dict = self.extract_all_relation(sp_es)
str_s = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.build)(ext[0], ext[1], adj_rel_dict, inc_rel_dict, **kwargs) for ext in tqdm(text_and_spatial_entities, desc="Build STR"))
return str_s
def pipe_transform(self,strs_, cpu_count=cpu_count(),**kwargs):
str_s = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.transform)(str_, **kwargs) for str_ in tqdm(strs_,desc="Transform STR"))
return str_s
def build(self, text_input, spatial_entities_identified, prec_adj, prec_inc):
str_ = STR(word_tokenize(text_input), spatial_entities_identified, toponym_first=True,precomputed_adj=prec_adj,precomputed_inc=prec_inc)
str_.build(adj=True, inc=True)
return str_
def transform(self,str_,**kwargs):
......
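With the new methods, pipe_build first extracts spatial entities from every text in parallel, computes the adjacency and inclusion relations once for all identified entities, and then builds each STR with those pre-computed relations handed down to AdjacencyRelation/InclusionRelation. A hedged usage sketch; the module path and constructor arguments are assumptions, since the Pipeline constructor is not shown in this diff:

```python
# Assumed import path and constructor keyword; adjust to the actual Pipeline definition.
from strpython.pipeline import Pipeline

texts = [
    "We drove from Montpellier to Nimes.",
    "Barcelona is close to the Pyrenees.",
]

pipeline = Pipeline(lang="en")                # 'lang' assumed from the self.lang usage above
strs = pipeline.pipe_build(texts)             # parallel parsing + shared relation extraction
transformed = pipeline.pipe_transform(strs)   # parallel transformation of the resulting STRs

for str_ in transformed:
    print(len(str_.graph))                    # node count of each STR's MultiDiGraph
```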