From 755998a619c54b211dd2a38e0976f9448f1ca8a0 Mon Sep 17 00:00:00 2001 From: Fize Jacques <jacques.fize@cirad.fr> Date: Tue, 12 Mar 2019 17:37:13 +0100 Subject: [PATCH] Debug, STR modif for faster generation, debug disambiguators, update pipeline,debug document selection --- depreciated/generate_data.py | 2 +- depreciated/generate_data_csv.py | 2 +- generate_annotation_file.py | 2 +- generate_str.py | 2 +- strpython/eval/disambiguation.py | 2 +- strpython/models/str.py | 681 +++++++++++++----- strpython/nlp/disambiguator/__init__.py | 7 +- strpython/nlp/disambiguator/disambiguator.py | 107 +-- strpython/nlp/disambiguator/most_common.py | 22 +- strpython/nlp/disambiguator/share_prop.py | 174 +++++ strpython/nlp/disambiguator/wikipedia_cooc.py | 76 +- strpython/nlp/disambiguator_old/__init__.py | 1 + .../nlp/disambiguator_old/disambiguator.py | 62 ++ .../geodict_gaurav.py | 0 .../nlp/disambiguator_old/models/__init__.py | 1 + .../nlp/disambiguator_old/models/bigram.py | 46 ++ .../nlp/disambiguator_old/most_common.py | 71 ++ .../nlp/disambiguator_old/wikipedia_cooc.py | 110 +++ strpython/nlp/ner/__init__.py | 5 + strpython/nlp/ner/ner.py | 40 +- strpython/nlp/ner/nltk.py | 21 +- strpython/nlp/ner/polyglot.py | 31 +- strpython/nlp/ner/spacy.py | 46 +- strpython/nlp/ner/stanford_ner.py | 75 +- strpython/pipeline.py | 68 +- 25 files changed, 1185 insertions(+), 469 deletions(-) create mode 100644 strpython/nlp/disambiguator/share_prop.py create mode 100644 strpython/nlp/disambiguator_old/__init__.py create mode 100644 strpython/nlp/disambiguator_old/disambiguator.py rename strpython/nlp/{disambiguator => disambiguator_old}/geodict_gaurav.py (100%) create mode 100644 strpython/nlp/disambiguator_old/models/__init__.py create mode 100644 strpython/nlp/disambiguator_old/models/bigram.py create mode 100644 strpython/nlp/disambiguator_old/most_common.py create mode 100644 strpython/nlp/disambiguator_old/wikipedia_cooc.py diff --git a/depreciated/generate_data.py b/depreciated/generate_data.py index a1147eb..ac2fe26 100644 --- a/depreciated/generate_data.py +++ b/depreciated/generate_data.py @@ -5,7 +5,7 @@ from concurrent.futures import ThreadPoolExecutor from langdetect import detect from progressbar import ProgressBar, Timer, Bar, ETA, Counter -from strpython.nlp.disambiguator.geodict_gaurav import * +from strpython.nlp.disambiguator.share_prop import * from strpython.pipeline import * import networkx as nx diff --git a/depreciated/generate_data_csv.py b/depreciated/generate_data_csv.py index dfcc9ee..41a5099 100644 --- a/depreciated/generate_data_csv.py +++ b/depreciated/generate_data_csv.py @@ -6,7 +6,7 @@ import argparse,glob, string,time,re from progressbar import ProgressBar, Timer, Bar, ETA, Counter from strpython.models.str import STR -from strpython.nlp.disambiguator.geodict_gaurav import * +from strpython.nlp.disambiguator.share_prop import * from strpython.pipeline import * import pandas as pd import networkx as nx diff --git a/generate_annotation_file.py b/generate_annotation_file.py index d18abc9..b4b1501 100644 --- a/generate_annotation_file.py +++ b/generate_annotation_file.py @@ -33,7 +33,7 @@ selected = json.load(open(args.selectedFile)) for fn in matrix_fns: measure = os.path.basename(fn).split("_")[0] - type_= "_".join(fn.split("_")[1:]).replace(".npy.bz2","") + type_= "_".join(os.path.basename(fn).split("_")[1:]).replace(".npy.bz2","") print("Proceeding...",measure, type_) df = matrix_to_pandas_dataframe(np.nan_to_num(read_bz2_matrix(fn)), selected, diff --git a/generate_str.py 
b/generate_str.py index 3e08e1f..3e04cbd 100644 --- a/generate_str.py +++ b/generate_str.py @@ -21,7 +21,7 @@ from strpython.nlp.ner.polyglot import Polyglot as poly_ner from strpython.nlp.ner.stanford_ner import StanfordNER as stanford_ner from strpython.nlp.disambiguator.wikipedia_cooc import WikipediaDisambiguator as wiki_d -from strpython.nlp.disambiguator.geodict_gaurav import GauravGeodict as shared_geo_d +from strpython.nlp.disambiguator.share_prop import ShareProp as shared_geo_d from strpython.nlp.disambiguator.most_common import MostCommonDisambiguator as most_common_d from mytoolbox.text.clean import * diff --git a/strpython/eval/disambiguation.py b/strpython/eval/disambiguation.py index eeed6c5..960a53d 100644 --- a/strpython/eval/disambiguation.py +++ b/strpython/eval/disambiguation.py @@ -1,7 +1,7 @@ # coding = utf-8 from shapely.geometry import Point -from ..nlp.disambiguator.geodict_gaurav import GauravGeodict +from ..nlp.disambiguator.share_prop import GauravGeodict from ..nlp.disambiguator.most_common import MostCommonDisambiguator from ..nlp.disambiguator.wikipedia_cooc import WikipediaDisambiguator diff --git a/strpython/models/str.py b/strpython/models/str.py index 6bfbed9..2d673ba 100644 --- a/strpython/models/str.py +++ b/strpython/models/str.py @@ -5,11 +5,12 @@ import os import time import warnings +from tqdm import tqdm import folium import geopandas as gpd import networkx as nx import pandas as pd -from shapely.geometry import MultiPoint,Polygon,Point,LineString +from shapely.geometry import MultiPoint, Polygon, Point, LineString from ..helpers.collision import collisionTwoSEBoundaries, add_cache_adjacency from ..helpers.geodict_helpers import gazetteer @@ -18,6 +19,7 @@ from ..eval.stats import most_common from sklearn.cluster import MeanShift, estimate_bandwidth, KMeans, dbscan import numpy as np + # logging.basicConfig(filename=config.log_file,level=logging.INFO) @@ -40,10 +42,31 @@ class STR(object): """ Str basic structure """ - __cache_inclusion = {} - def __init__(self, tagged_text, spatial_entities): + __cache_inclusion = {} # Store inclusion relations found between spaital entities + __cache_adjacency = {} # Store adjacency relations found between spaital entities + __cache_entity_data = {} #  Store data about entity requested + + def __init__(self, tagged_text, spatial_entities,toponym_first=True): + """ + Constructir + + Parameters + ---------- + tagged_text : list + Text in forms of token associated with tag (2D array 2*t where t == |tokens| ) + spatial_entities : dict + spatial entities associated with a text. Follow this structure {"<id>: <label>"} + + """ + self.tagged_text = tagged_text self.spatial_entities = spatial_entities + if toponym_first: + self.spatial_entities= {id_:topo for topo,id_ in self.spatial_entities.items()} + + for k in list(spatial_entities.keys()): + if not k[:2] == "GD": + del spatial_entities[k] self.adjacency_relationships = {} self.inclusion_relationships = {} @@ -51,11 +74,21 @@ class STR(object): @staticmethod def from_networkx_graph(g: nx.Graph, tagged_: list = []): """ - Return a STR built from a Networkx imported graph - :param g: - :param tagged_: - :return: + Build a STR based on networkx graph + + Parameters + ---------- + g : nx.Graph + input graph + tagged_ : list, optional + tagged text (the default is []). A 2D array 2*t where t == |tokens|. 
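# Illustrative sketch (invented example, not from the commit) of the `tagged_text`
# structure described above: a 2D array with one [token, tag] row per token.
# The sentence, the POS tags and the "PLACE" value are assumptions; the real
# place tag comes from NER._unified_tag["place"].
tagged_text_example = [
    ["Floods", "NNS"],
    ["hit", "VBD"],
    ["Montpellier", "PLACE"],
    ["and", "CC"],
    ["Nimes", "PLACE"],
]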
+ + Returns + ------- + STR + resulting STR """ + sp_en = {} for nod in g: try: @@ -63,44 +96,96 @@ class STR(object): except KeyError: # If no label found, grab one from the geo-database data = gazetteer.get_by_id(nod) if data: - sp_en[nod] = data[0].label + sp_en[nod] = data[0].name - str_ = STR(tagged_, sp_en) + str_ = STR(tagged_, sp_en,toponym_first=False) str_.set_graph(g) return str_ @staticmethod def from_dict(spat_ent: dict, tagged_: list = []): """ - Return a STR built from a Networkx imported graph - :param g: - :param tagged_: - :return: + Build a STR based on networkx graph + + Parameters + ---------- + spat_ent : dict + Dict of patial entities associated with a text. Follow this structure {"<id>: <label>"} + tagged_ : list, optional + tagged text (the default is []). A 2D array 2*t where t == |tokens|. + + Returns + ------- + STR + resulting STR """ sp_en = {} for id_, label in spat_ent.items(): sp_en[id_] = label - str_ = STR(tagged_, sp_en) + str_ = STR(tagged_, sp_en,toponym_first=False) str_.build() return str_ @staticmethod def from_pandas(dataf: pd.DataFrame, tagged: list = []): + """ + Build a STR from a Pandas Dataframe with two column : id and label. + + Parameters + ---------- + dataf : pd.DataFrame + dataframe containing the spatial entities + tagged : list, optional + tagged text (the default is []). A 2D array 2*t where t == |tokens|. + + Returns + ------- + STR + resulting STR + """ + return STR.from_dict(pd.Series(dataf.label.values, index=dataf.id).to_dict(), tagged) + def set_graph(self, g): + """ + Apply changes to the current STR based on Networkx Graph. + + Parameters + ---------- + g : networkx.Graph + input graph + + """ + + self.graph = g + rel_ = self.graph.edges(data=True) + for edge in rel_: + id1, id2 = edge[0], edge[1] + if edge[2]["color"] == "green": + self.add_adjacency_rel(edge[0], edge[1]) + self.add_cache__adjacency(id1, id2, True) + elif edge[2]["color"] == "red": + self.add_inclusion_rel(edge[0], edge[1]) + self.add_cache_inclusion(id1, id2, True) + def add_spatial_entity(self, id, label=None, v=True): """ - Adding a spatial entity to the current STR - :param id: - :param label: - :return: + Add a spatial entity to the current STR + + Parameters + ---------- + id : str + identifier of the spatial entity in Geodict + label : str, optional + if not available in Geodict (the default is None) + """ - data_ = gazetteer.get_by_id(id) + data_ = self.get_data(id) if not data_: warnings.warn("{0} wasn't found in Geo-Database".format(id)) return False - data_=data_[0] + data_ = data_[0] if not label and v == True: warnings.warn("Label empty. @en label from Geo-Database will be used.") label = data_["en"] @@ -110,9 +195,14 @@ class STR(object): def add_spatial_entities(self, ids: list, labels: list = []): """ Add spatial entities to the current STR - :param ids: - :param label: - :return: + + Parameters + ---------- + ids : list + list of identifiers of each spatial entity + labels : list, optional + list of labels of each spatial entity + """ if not labels: warnings.warn("Labels list is empty. @en labels from Geo-Database will be used by default") @@ -125,27 +215,120 @@ class STR(object): self.add_spatial_entity(id, label, False) # print(self.graph.nodes(data=True)) - def add_adjacency_rel(self, se1, se2,v=True): - if not se1 in self.adjacency_relationships: - self.adjacency_relationships[se1] = {} - self.adjacency_relationships[se1][se2]=v + def add_adjacency_rel(self, se1, se2): + """ + Add a adjacency relationship to the current STR. 
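# Illustrative usage sketch for the factory methods above (assumptions: the
# "GD..." identifiers and labels are invented, and a Geodict gazetteer backend
# must be configured for build() to resolve entities).
from strpython.models.str import STR

entities = {"GD123456": "Montpellier", "GD654321": "Occitanie"}
str_ = STR.from_dict(entities)          # builds the graph via STR.build()
print(str_.graph.nodes(data=True))      # nodes carry a "label" attribute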
+ + Parameters + ---------- + se1 : str + Identifier of the first spatial entity + se2 : str + Identifier of the second spatial entity + + """ - def add_inclusion_rel(self, se1, se2,v=True): + if not se1 in self.adjacency_relationships: self.adjacency_relationships[se1] = {} + if not se2 in self.adjacency_relationships: self.adjacency_relationships[se2] = {} + self.adjacency_relationships[se1][se2], self.adjacency_relationships[se2][se1] = True, True + self.add_cache__adjacency(se1, se2, True) + + def add_inclusion_rel(self, se1, se2): + """ + Add a inclusion relationship to the current STR. + + Parameters + ---------- + se1 : str + Identifier of the first spatial entity + se2 : str + Identifier of the second spatial entity + + """ if not se1 in self.inclusion_relationships: self.inclusion_relationships[se1] = {} - self.inclusion_relationships[se1][se2]=v + self.inclusion_relationships[se1][se2] = True + self.add_cache_inclusion(se1, se2, True) + + def add_cache_inclusion(self, id1, id2, v=True): + """ + Add a relation of inclusion in a cache variable + + Parameters + ---------- + id1 : str + id of the first spatial entity + id2 : str + id of the second spatial entity + v : bool, optional + if the relation exists between the two spatial entities. Default is True + + """ + + if not id1 in STR.__cache_inclusion: + STR.__cache_inclusion[id1] = {} + STR.__cache_inclusion[id1][id2] = v + + def add_cache__adjacency(self, se1, se2, v=True): + """ + Add a relation of adjacency in a cache variable + + Parameters + ---------- + id1 : str + id of the first spatial entity + id2 : str + id of the second spatial entity + v : bool, optional + if the relation exists between the two spatial entities. Default is True + + """ + if not se1 in STR.__cache_adjacency: + STR.__cache_adjacency[se1] = {} + if not se2 in STR.__cache_adjacency: + STR.__cache_adjacency[se2] = {} + STR.__cache_adjacency[se1][se2] = v + STR.__cache_adjacency[se2][se1] = v + + def get_data(self, id_se): + """ + Return an gazpy.Element object containing information about a spatial entity. + + Parameters + ---------- + id_se : str + Identifier of the spatial entity - def transform_spatial_entities(self, transform_map): + Returns + ------- + gazpy.Element + data """ - Apply transformation to a STR - :param transform_map: - :return: + + if id_se in STR.__cache_entity_data: + return STR.__cache_entity_data[id_se] + data = gazetteer.get_by_id(id_se) + if len(data) > 0: + STR.__cache_entity_data[id_se] = data[0] + + def transform_spatial_entities(self, transform_map: dict): """ + Replace or delete certain spatial entities based on a transformation map + + Parameters + ---------- + transform_map : dict + New mapping for the spatial entities in the current STR. 
Format required : {"<id of the old spatial entity>":"<id of the new spatial entity>"} + + """ + final_transform_map = {} # Erase old spatial entities new_label = {} + to_del = set([]) for old_se, new_se in transform_map.items(): - data = gazetteer.get_by_id(new_se) + data = self.get_data(new_se) + to_del.add(old_se) if data: data = data[0] final_transform_map[old_se] = new_se @@ -153,78 +336,186 @@ class STR(object): self.add_spatial_entity(new_se, data.label.en) del self.spatial_entities[old_se] + new_label[new_se] = data.label.en else: warnings.warn("{0} doesn't exists in the geo database!".format(new_se)) + self.graph = nx.relabel_nodes(self.graph, final_transform_map) + + for es in to_del: + if es in self.graph._node: + self.graph.remove_node(es) + for se_ in new_label: self.graph.nodes[se_]["label"] = new_label[se_] def update(self): """ - Method for updating links between spatial entities - :return: + Update the relationship between spatial entities in the STR. Used when transforming the STR. """ + nodes = copy.deepcopy(self.graph.nodes(data=True)) self.graph.clear() self.graph.add_nodes_from(nodes) - print("inclusion") self.get_inclusion_relationships() for se1 in self.inclusion_relationships: for se2 in self.inclusion_relationships[se1]: + if not se1 in self.graph.nodes or not se2 in self.graph.nodes: + continue if self.inclusion_relationships[se1][se2]: self.graph.add_edge(se1, se2, key=0, color="red") - print("adjacency") self.get_adjacency_relationships() for se1 in self.adjacency_relationships: for se2 in self.adjacency_relationships[se1]: + if not se1 in self.graph.nodes or not se2 in self.graph.nodes: + continue if self.adjacency_relationships[se1][se2]: self.graph.add_edge(se1, se2, key=0, color="green") - print("fin adj") - - - - def add_cache_inclusion(self,id1, id2): - if not id1 in STR.__cache_inclusion: - STR.__cache_inclusion[id1] = set([]) - STR.__cache_inclusion[id1].add(id2) def is_included_in(self, se1_id, se2_id): - global __cache_inclusion """ - Return true if the two spatial entities identified by @se1_id and @se2_id share an inclusion relationship - :param se1_id: - :param se2_id: - :return: + Return True if a spatial entity is included within another one. + + Parameters + ---------- + se1_id : str + id of the contained entity + se2_id : str + id of the entity container + + Returns + ------- + bool + if se1 included in se2 """ + if se1_id in self.inclusion_relationships: if se2_id in self.inclusion_relationships[se1_id]: return self.inclusion_relationships[se1_id][se2_id] - if se1_id in STR.__cache_inclusion: - if se2_id in STR.__cache_inclusion[se1_id]: - return True - inc_chain_P131 = get_inclusion_chain(se1_id, "P131") inc_chain_P706 = get_inclusion_chain(se1_id, "P706") inc_chain = inc_chain_P131 inc_chain.extend(inc_chain_P706) inc_chain = set(inc_chain) if se2_id in inc_chain: - self.add_cache_inclusion(se1_id,se2_id) + self.add_cache_inclusion(se1_id, se2_id, True) + return True + + return False + + def is_adjacent_cache(self, se1, se2): + """ + Return true if two spatial entities were found adjacent previously. 
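# Illustrative sketch of the class-level relation caches used here (assumptions:
# an STR built from empty inputs is only a convenient way to reach the cache
# helpers, and "GD1"/"GD2" are invented identifiers). Adjacency is cached
# symmetrically, so a relation recorded once is visible from both entities.
from strpython.models.str import STR

s = STR([], {}, toponym_first=False)
s.add_cache__adjacency("GD1", "GD2", True)
assert s.is_adjacent_cache("GD1", "GD2")
assert s.is_adjacent_cache("GD2", "GD1")   # stored in both directions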
+ + Parameters + ---------- + se1 : str + id of the first spatial entity + se2 : str + id of the second spatial entity + + Returns + ------- + bool + if se1 adjacent to se2 + """ + + if se1 in STR.__cache_adjacency: + if se2 in STR.__cache_adjacency[se1]: + return STR.__cache_adjacency[se1][se2] + if se2 in STR.__cache_adjacency: + if se1 in STR.__cache_adjacency[se2]: + return STR.__cache_adjacency[se2][se1] + return False + + def is_included_cache(self, se1, se2): + """ + Return true if a spatial entity were found included previously in an other one. + + Parameters + ---------- + se1 : str + id of the first spatial entity + se2 : str + id of the second spatial entity + + Returns + ------- + bool + if se1 included to se2 + """ + if se1 in STR.__cache_inclusion: + if se2 in STR.__cache_inclusion[se1]: + return STR.__cache_inclusion[se1][se2] + return False + + def is_adjacent(self, se1, se2, datase1=None, datase2=None): + """ + Return true if se1 is adjacent to se2. + + Parameters + ---------- + se1 : str + id of the first spatial entity + se2 : str + id of the second spatial entity + datase1 : gazpy.Element, optional + if given cached data concerning the spatial entity with id = se1 (the default is None) + datase2 : gazpy.Element, optional + if given cached data concerning the spatial entity with id = se2 (the default is None) + + Returns + ------- + bool + true if adjacent + """ + + stop_class = set(["A-PCLI", "A-ADM1"]) + + def get_p47_adjacency_data(data): + p47se1 = [] + for el in data.other.P47: + d = gazetteer.get_by_other_id(el, "wikidata") + if not d: continue + p47se1.append(d[0].id) + return p47se1 + + if self.is_adjacent_cache(se1, se2): + return False + + if self.is_included_in(se1, se2) or self.is_included_in(se2, se1): + return False + + data_se1, data_se2 = self.get_data(se1), self.get_data(se2) + + if "P47" in data_se2 and se1 in get_p47_adjacency_data(data_se2): + return True + # print("P47") + elif "P47" in data_se1 and se2 in get_p47_adjacency_data(data_se1): + return True + # print("P47") + + if collisionTwoSEBoundaries(se1, se2): return True + if "coord" in data_se1 and "coord" in data_se2: + if Point(data_se1.coord.lon, data_se1.coord.lat).distance( + Point(data_se2.coord.lon, data_se2.coord.lat)) < 1 and len( + set(data_se1.class_) & stop_class) < 1 and len(set(data_se2.class_) & stop_class) < 1: + return True return False def get_inclusion_relationships(self): """ - Return all the inclusion relationships between all the spatial entities in the STR. - :return: + Find all the inclusion relationships between the spatial entities declared in the current STR. 
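# Illustrative numeric check of the final fallback in is_adjacent() above
# (assumption: the coordinates are invented). Two entities whose points lie
# less than one degree apart are treated as adjacent, unless one of them
# belongs to the excluded classes A-PCLI (country) or A-ADM1 (first-level
# division).
from shapely.geometry import Point

montpellier, nimes = Point(3.87, 43.61), Point(4.36, 43.83)
assert montpellier.distance(nimes) < 1     # ~0.54 degrees apart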
+ """ - inclusions_ = [] - for se_ in self.spatial_entities: + + for se_ in tqdm(self.spatial_entities, desc="Extract Inclusion"): inc_chain_P131 = get_inclusion_chain(se_, "P131") inc_chain_P706 = get_inclusion_chain(se_, "P706") @@ -234,62 +525,19 @@ class STR(object): for se2_ in self.spatial_entities: if se2_ in inc_chain: - self.add_inclusion_rel(se_,se2_) - return inclusions_ - - def getP47AdjacencyData(self, data): - p47se1 = [] - for el in data.other.P47: - d = gazetteer.get_by_other_id(el,"wikidata") - if not d:continue - p47se1.append(d[0].id) - return p47se1 - - def is_adjacent(self,se1,se2,datase1=None,datase2=None): - f = False - stop_class = set(["A-PCLI", "A-ADM1"]) - if self.is_included_in(se1, se2): - return f - - elif self.is_included_in(se2, se1): - return f - - data_se1 = gazetteer.get_by_id(se1)[0] if not datase1 else datase1 # Évite de recharger à chaque fois -_- - data_se2 = gazetteer.get_by_id(se2)[0] if not datase2 else datase2 - - # print("testP47") - if "P47" in data_se2.other: - if se1 in self.getP47AdjacencyData(data_se2): - return True - # print("P47") - if not f: - if "P47" in data_se1.other: - if se2 in self.getP47AdjacencyData(data_se1): - return True - # print("P47") - if not f: - # print("test collision") - if collisionTwoSEBoundaries(se1, se2): - return True - if not f: - if "coord" in data_se1.other and "coord" in data_se2.other: - if Point(data_se1.coord.lon, data_se1.coord.lat).distance( - Point(data_se2.coord.lon, data_se2.coord.lat)) < 1 and len( - set(data_se1.class_) & stop_class) < 1 and len(set(data_se2.class_) & stop_class) < 1: - return True - return f + self.add_inclusion_rel(se_, se2_) def get_adjacency_relationships(self): """ - Return all the adjacency relationships between all the spatial entities in the STR. - :return: + Find all the adjacency relationships between the spatial entities declared in the current STR. 
""" - data={se:gazetteer.get_by_id(se)[0] for se in self.spatial_entities} - for se1 in self.spatial_entities: + + data = {se: self.get_data(se) for se in self.spatial_entities} + + for se1 in tqdm(self.spatial_entities, desc="Extract Adjacency Relationship"): data_se1 = data[se1] for se2 in self.spatial_entities: if se1 == se2: continue - # print("test adjacency") if se1 in self.adjacency_relationships: if se2 in self.adjacency_relationships[se1]: continue @@ -297,18 +545,28 @@ class STR(object): if se1 in self.adjacency_relationships[se2]: continue data_se2 = data[se2] - self.add_adjacency_rel(se1, se2, self.is_adjacent(se1,se2,data_se1,data_se2)) - - + if self.is_adjacent(se1, se2, data_se1, data_se2): + self.add_adjacency_rel(se1, se2) def build(self, inc=True, adj=True, verbose=False): """ Build the STR - :param inc: - :param adj: - :param verbose: - :return: + + Parameters + ---------- + inc : bool, optional + if inclusion relationship have to be included in the STR (the default is True) + adj : bool, optional + if adjacency relationship have to be included in the STR (the default is True) + verbose : bool, optional + Verbose mode activated (the default is False) + + Returns + ------- + networkx.Graph + graph representing the STR """ + nodes = [] for k, v in self.spatial_entities.items(): nodes.append((k, {"label": v})) @@ -317,34 +575,35 @@ class STR(object): graph.add_nodes_from(nodes) if adj: - debut=time.time() + debut = time.time() self.get_adjacency_relationships() for se1 in self.adjacency_relationships: for se2 in self.adjacency_relationships[se1]: if self.adjacency_relationships[se1][se2]: - graph.add_edge(se1,se2, key=0, color="green") + graph.add_edge(se1, se2, key=0, color="green") graph.add_edge(se2, se1, key=0, color="green") - logging.info("Extract Adjacency Rel\t{0}".format(time.time()-debut)) if inc: - debut=time.time() + debut = time.time() self.get_inclusion_relationships() for se1 in self.inclusion_relationships: for se2 in self.inclusion_relationships[se1]: if self.inclusion_relationships[se1][se2]: - graph.add_edge(se1,se2, key=0, color="red") - logging.info("Extract Inclusion Rel\t{0}".format(time.time() - debut)) + graph.add_edge(se1, se2, key=0, color="red") + self.graph = graph return graph def save_graph_fig(self, output_fn, format="svg"): """ - Save the graph graphiz reprensentation + Save the graphiz reprensentation of the STR graph. Parameters ---------- output_fn : string Output filename + format : str + Output format (svg or pdf) """ try: @@ -357,44 +616,63 @@ class STR(object): print("Error while saving STR to {0}".format(format)) def getUndirected(self): - return nx.Graph(self.graph) + """ + Return the Undirected form of a STR graph. 
- def set_graph(self, g): - self.graph = g - rel_ = self.graph.edges(data=True) - for edge in rel_: - id1, id2 = edge[0], edge[1] - if edge[2]["color"] == "green": - self.add_adjacency_rel(edge[0],edge[1]) - add_cache_adjacency(id1, id2) - elif edge[2]["color"] == "red": - self.add_inclusion_rel(edge[0], edge[1]) - self.add_cache_inclusion(id1,id2) + Returns + ------- + networkx.Graph + unidirected graph + """ + return nx.Graph(self.graph) def get_geo_data_of_se(self): - points,label,class_ = [], [], [] + """ + Return Geographical information for each spatial entities in the STR + + Returns + ------- + geopandas.GeoDataFrame + dataframe containing geographical information of each entity in the STR + """ + + points, label, class_ = [], [], [] for se in self.spatial_entities: data = gazetteer.get_by_id(se)[0] try: points.append(Point(data.coord.lon, data.coord.lat)) - label.append(data.label) + label.append(data.name) # class_.append(most_common(data["class"])) except KeyError: pass # print(len(points),len(label),len(class_)) - df=gpd.GeoDataFrame({"geometry":points,"label":label}) - df["x"]=df.geometry.apply(lambda p: p.x) + df = gpd.GeoDataFrame({"geometry": points, "label": label}) + df["x"] = df.geometry.apply(lambda p: p.x) df["y"] = df.geometry.apply(lambda p: p.y) return df - def get_cluster(self,id_=None): - if id_ and os.path.exists("./temp_cluster/{0}.geojson".format(id_)): + def get_cluster(self, id_=None): + """ + Return the cluster detected using spatial entities position. + + Parameters + ---------- + id_ : temp_file_id, optional + if cached version of geoinfo (the default is None) + + Returns + ------- + gpd.GeoDataFrame + cluster geometry + """ + + if os.path.exists("./temp_cluster/{0}.geojson".format(id_)): return gpd.read_file("./temp_cluster/{0}.geojson".format(id_)) - data=self.get_geo_data_of_se() - X=data[["x", "y"]].values - if len(X) ==0: # if zero samples return Empty GeoDataFrame + data = self.get_geo_data_of_se() + X = data[["x", "y"]].values + if len(X) == 0: # if zero samples return Empty GeoDataFrame return gpd.GeoDataFrame() try: bandwidth = estimate_bandwidth(X) @@ -402,33 +680,25 @@ class STR(object): ms.fit(X) data["cluster"] = ms.labels_ except: - samples,labels=dbscan(X) + samples, labels = dbscan(X) data["cluster"] = labels - """ - - # deuxième découpe en cluster - c=data['cluster'].value_counts().idxmax() - X=data[data["cluster"] == c] - X=X[["x","y"]] - bandwidth = estimate_bandwidth(X.values) - ms = MeanShift(bandwidth=bandwidth, bin_seeding=True) - ms.fit(X.values) - X["cluster"]=ms.labels_+(data['cluster'].max()+1) - lab=ms.labels_ - lab+=data['cluster'].max()+1 - - data["cluster"][data["cluster"] == c]=X["cluster"] - """ - geo = data.groupby("cluster").apply(to_Polygon) cluster_polybuff = gpd.GeoDataFrame(geometry=geo) if id_: cluster_polybuff.to_file("./temp_cluster/{0}.geojson".format(id_)) return cluster_polybuff - def to_folium(self): + """ + Use the folium package to project the STR on a map + + Returns + ------- + folium.Map + folium map instance + """ + points = [] for se in self.spatial_entities: data = gazetteer.get_by_id(se)[0] @@ -449,10 +719,10 @@ class STR(object): ) lines_inc = [] for se1 in self.inclusion_relationships: - data_se1 = data_se1=gazetteer.get_by_id(se1)[0] + data_se1 = data_se1 = gazetteer.get_by_id(se1)[0] for se2 in self.inclusion_relationships[se1]: if self.inclusion_relationships[se1][se2]: - data_se2 = data_se1=gazetteer.get_by_id(se2)[0] + data_se2 = data_se1 = gazetteer.get_by_id(se2)[0] lines_inc.append( 
LineString([ (data_se1.coord.lon, data_se1.coord.lat), @@ -460,45 +730,58 @@ class STR(object): ) ) - def to_fol(seris,color="#ff0000"): - df=gpd.GeoDataFrame(geometry=seris.values) - df.crs={'init' :'epsg:4326'} - return folium.features.GeoJson(df.to_json(),style_function=lambda x: {'color':color}) + def to_fol(seris, color="#ff0000"): + df = gpd.GeoDataFrame(geometry=seris.values) + df.crs = {'init': 'epsg:4326'} + return folium.features.GeoJson(df.to_json(), style_function=lambda x: {'color': color}) gjson1 = to_fol(gpd.GeoSeries(points)) - gjson2 = to_fol(gpd.GeoSeries(lines_adj),color='#00ff00') + gjson2 = to_fol(gpd.GeoSeries(lines_adj), color='#00ff00') gjson3 = to_fol(gpd.GeoSeries(lines_inc)) - map=folium.Map() + map = folium.Map() map.add_child(gjson1) map.add_child(gjson2) map.add_child(gjson3) return map + def map_projection(self, plt=False): + """ + Return a matplotlib figure of the STR + + Parameters + ---------- + plt : bool, optional + if the user wish to use the plt.show() (the default is False) + + Returns + ------- + plt.Figure + Matplotlib figure instance + """ - def map_projection(self,plt=False): import matplotlib.pyplot as plt world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')) base = world.plot(color='white', edgecolor='black', figsize=(16, 9)) - points=[] + points = [] for se in self.spatial_entities: - data=gazetteer.get_by_id(se)[0] + data = gazetteer.get_by_id(se)[0] try: - points.append(Point(data.coord.lon,data.coord.lat)) + points.append(Point(data.coord.lon, data.coord.lat)) except: pass - lines_adj=[] + lines_adj = [] for se1 in self.adjacency_relationships: - data_se1=gazetteer.get_by_id(se1)[0] + data_se1 = gazetteer.get_by_id(se1)[0] for se2 in self.adjacency_relationships[se1]: data_se2 = gazetteer.get_by_id(se2)[0] if self.adjacency_relationships[se1][se2]: lines_adj.append( - LineString([(data_se1.coord.lon,data_se1.coord.lat),(data_se2.coord.lon, data_se2.coord.lat)]) - ) - lines_inc=[] + LineString([(data_se1.coord.lon, data_se1.coord.lat), (data_se2.coord.lon, data_se2.coord.lat)]) + ) + lines_inc = [] for se1 in self.inclusion_relationships: data_se1 = gazetteer.get_by_id(se1)[0] for se2 in self.inclusion_relationships[se1]: @@ -511,7 +794,7 @@ class STR(object): ) ) - gpd.GeoSeries(points).plot(ax=base,marker='o',markersize=5,color="blue") + gpd.GeoSeries(points).plot(ax=base, marker='o', markersize=5, color="blue") gpd.GeoSeries(lines_adj).plot(ax=base, color="green") gpd.GeoSeries(lines_inc).plot(ax=base, color="red") @@ -520,17 +803,45 @@ class STR(object): plt.show() -def to_Multipoints(x): - #print(x[["x","y"]].values) - return Polygon([Point(z) for z in x[["x","y"]].values]).buffer(1) +# def to_Multipoints(x): +# """ +# Return a polygon buffered representation for a set of point + +# Parameters +# ---------- +# x : pandas.Series +# coordinates columns + +# Returns +# ------- +# shapely.geometry.Polygon +# polygon +# """ + +# #print(x[["x","y"]].values) +# return Polygon([Point(z) for z in x[["x","y"]].values]).buffer(1) def to_Polygon(x): - points = [Point(z) for z in x[["x","y"]].values] + """ + Return a polygon buffered representation for a set of points. 
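# Illustrative usage of to_Polygon() defined here (assumption: the three
# coordinates are invented). get_cluster() applies it group by group to the
# "x"/"y" columns of the geo dataframe and gets back a polygon buffered by one
# degree around the cluster's points.
import pandas as pd
from strpython.models.str import to_Polygon

cluster_rows = pd.DataFrame({"x": [3.87, 4.36, 5.37], "y": [43.61, 43.83, 43.29]})
footprint = to_Polygon(cluster_rows)
print(footprint.area)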
+ + Parameters + ---------- + x : pandas.Series + coordinates columns + + Returns + ------- + shapely.geometry.Polygon + polygon + """ + + points = [Point(z) for z in x[["x", "y"]].values] if len(points) > 2: coords = [p.coords[:][0] for p in points] poly = Polygon(coords).buffer(1) return poly - elif len(points)==1: + elif len(points) == 1: return points[0].buffer(1) else: coords = [p.coords[:][0] for p in points] diff --git a/strpython/nlp/disambiguator/__init__.py b/strpython/nlp/disambiguator/__init__.py index 950f635..bceef44 100644 --- a/strpython/nlp/disambiguator/__init__.py +++ b/strpython/nlp/disambiguator/__init__.py @@ -1 +1,6 @@ -# coding = utf-8 \ No newline at end of file +# coding = utf-8 + +from .most_common import MostCommonDisambiguator +from .share_prop import ShareProp +from .wikipedia_cooc import WikipediaDisambiguator +from .disambiguator import Disambiguator \ No newline at end of file diff --git a/strpython/nlp/disambiguator/disambiguator.py b/strpython/nlp/disambiguator/disambiguator.py index ee0d899..927a70f 100644 --- a/strpython/nlp/disambiguator/disambiguator.py +++ b/strpython/nlp/disambiguator/disambiguator.py @@ -10,53 +10,62 @@ from ..ner.ner import NER class Disambiguator(object): - def __init__(self): + def __init__(self,one_by_one=False,context_based=False): """Constructor for Disambiguator""" - pass - - def extract_se_entities(self, input): - out = Disambiguator.parse_corpus(input) - en_ = out[out[:, 1] == NER._unified_tag["place"]] - return self.toponymes_frequencies(en_[:, 0]), np.unique(en_[:, 0]) - - def toponymes_frequencies(self, ens_): - count = {} - for en in ens_: - if not en in count: count[en] = 0 - count[en] += 1 - return count - - @staticmethod - def parse_corpus(corpus): - final_corpus = [] - t = 0 - placeTag = NER._unified_tag["place"] - while t < len(corpus): - tag = copy.copy(corpus[t]) - - if tag[1] == "BEG-" + NER._unified_tag["place"] or tag[1] == placeTag: - lenw = 1 - if tag[1] == "BEG-" + placeTag: - compound_tag = tag[0] - t += 1 - while corpus[t][1] == "END-" + placeTag or corpus[t][1] == placeTag: - tag = copy.copy(corpus[t]) - if tag[0][-1] in string.punctuation or compound_tag[-1] in string.punctuation: - compound_tag += tag[0] - else: - compound_tag += " " + tag[0] - t += 1 - lenw += 1 - tag[0] = compound_tag - tag[1] = placeTag - t += 1 - else: - t += 1 - final_corpus.append(tag) - return np.array(final_corpus) - - def disambiguate(self, ner_result): - pass - - def disambiguate_list(self,toponyms,lang): - pass \ No newline at end of file + self.one_by_one= one_by_one + self.context_based=context_based + + def disambiguate(self,lang,ner_output=None,toponyms=None): + """ + Run the disambiguation on the NER output + Parameters + ---------- + ner_output : 2D numpy array + NER output + lang : str + language + + Returns + ------- + dict + {toponym : geodictID} + """ + if isinstance(ner_output, np.ndarray) and ner_output.shape[1] == 2: + toponyms = self.parse_ner_output(ner_output) + elif not toponyms: + raise ValueError("Either enter a list of toponyms or give ner_output") + if self.context_based: + return self.disambiguate_context_based(toponyms,lang) + else: + return self.disambiguate_one_by_one(toponyms,lang) + + def disambiguate_one_by_one(self, toponyms, lang): + """ + Disambiguation process when toponyms are geocoded one by one. 
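# Illustrative sketch of the reworked Disambiguator interface (assumptions: the
# toponyms are invented, and the Geodict backend plus language resources must
# be configured). A concrete class declares one_by_one or context_based in its
# constructor and implements the matching method; callers only use
# disambiguate(), passing either a NER output array or a plain toponym list.
from strpython.nlp.disambiguator import MostCommonDisambiguator

disambiguator = MostCommonDisambiguator()                  # one_by_one=True
mapping = disambiguator.disambiguate("fr", toponyms=["Paris", "Montpellier"])
# mapping is {toponym: Geodict id} for every toponym that could be resolved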
+ Parameters + ---------- + toponyms :list + toponyms + Returns + ------- + dict + {toponym : geodictID} + """ + raise NotImplementedError + + def disambiguate_context_based(self,toponyms,lang): + """ + Disambiguation process when toponyms are geocoded using each one of them + Parameters + ---------- + toponyms :list + toponyms + Returns + ------- + dict + {toponym : geodictID} + """ + raise NotImplementedError + + def parse_ner_output(self,ner_output): + return [toponym[0] if isinstance(toponym[0],str) else " ".join(toponym[0]) for toponym in ner_output[ner_output[:,1] == NER._unified_tag["place"]]] \ No newline at end of file diff --git a/strpython/nlp/disambiguator/most_common.py b/strpython/nlp/disambiguator/most_common.py index 2989325..be12646 100644 --- a/strpython/nlp/disambiguator/most_common.py +++ b/strpython/nlp/disambiguator/most_common.py @@ -28,40 +28,28 @@ common_words = { class MostCommonDisambiguator(Disambiguator): def __init__(self): - Disambiguator.__init__(self) + Disambiguator.__init__(self,one_by_one=True) - def disambiguate(self, ner_result, lang="en"): - count, se_ = self.extract_se_entities(ner_result) - new_count = {} - selected_en = {} - for en in se_: - id_,score=self.disambiguate_(en,lang) - if not id_ == "O" and id_: - selected_en[id_] = en - new_count[id_] = count[en] - - return new_count, selected_en - - def disambiguate_list(self,toponyms,lang): + def disambiguate_one_by_one(self, toponyms,lang): result={} for toponym in toponyms: id_,_=self.disambiguate_(toponym,lang) if id_: - result[id_]=toponym + result[toponym]=id_ return result def disambiguate_(self, label, lang='fr'): if re.match("^\d+$", label): return 'O', -1 if lang in stop_words: #and lang in common_words: - if label.lower().rstrip("s") in stop_words[lang]: #or label.lower().rstrip("s") in common_words[lang]: + if label.lower().rstrip("s") in stop_words[lang]: return 'O', -1 if lang in inflectors: plural=inflectors[lang].singularize(label) else: plural = label.rstrip("s") + "s" - if plural.lower() in stop_words[lang]:# or plural.lower() in common_words[lang]: + if plural.lower() in stop_words[lang]: return 'O', -1 data=get_most_common_id_v3(label, lang) diff --git a/strpython/nlp/disambiguator/share_prop.py b/strpython/nlp/disambiguator/share_prop.py new file mode 100644 index 0000000..001bc78 --- /dev/null +++ b/strpython/nlp/disambiguator/share_prop.py @@ -0,0 +1,174 @@ +# coding = utf-8 +import math + +from ...helpers.collision import * +#from ...helpers.geodict_helpers_old import * +from ...helpers.geodict_helpers import * +from .disambiguator import Disambiguator + +from ...models.str import get_inclusion_chain + + +class ShareProp(Disambiguator): + + def __init__(self): + Disambiguator.__init__(self,context_based=True) + + def fib_formula(self, n): + """ + Return the fibonacci value. + Parameters + ---------- + n : int + parameter + Returns + ------- + int + fibonnaci value + """ + if n in [0, 1]: return 0 # Modifying fibonacci behaviour + golden_ratio = (1 + math.sqrt(5)) / 2 + val = (golden_ratio ** n - (1 - golden_ratio) ** n) / math.sqrt(5) + return int(round(val)) + + def inclusion_log(self, x): + """ + Return the inclusion log + Parameters + ---------- + x : int + parameter + + Returns + ------- + int + inclusion log + """ + if x==0: + return 1 + return math.log(x) + + + def get_inclusion_score(self, id1, id2): + """ + Return the inclusion score. Compute the distance between two entities in the hierarchy. 
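# Illustrative worked example of the inclusion score defined here, using
# inclusion_log above (assumption: the intersection sizes are invented). With
# three shared ancestors in the P131 chain and none in the P706 chain,
# inclusion_log() maps 0 to 1 instead of minus infinity, so the score is
# log(3) + 1.
import math

interP131, interP706 = 3, 0
score = math.log(interP131) + 1        # ~2.10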
+ Parameters + ---------- + id1 : str + id of the first spatial entity + id2 : str + id of the second spatial entity + + Returns + ------- + int + inclusion score + """ + list1 = get_inclusion_chain(id1, 'P131') + list2 = get_inclusion_chain(id2, 'P131') + interP131 = len(list(set(list1).intersection(list2))) + list1 = get_inclusion_chain(id1, 'P706') + list2 = get_inclusion_chain(id2, 'P706') + interP706 = len(list(set(list1).intersection(list2))) + # return fib_no[interP131]+fib_no[interP706] + return self.inclusion_log(interP131) + self.inclusion_log(interP706) + + def Adjacency_P47(self, id1, id2): + """ + Return true, if two spatial entities are found adjacent using the P47 property (share borders) from Wikidata. + Parameters + ---------- + id1 : str + id of the first spatial entity + id2 : str + id of the second spatial entity + + Returns + ------- + bool + true if adjacent using P47 + """ + data_1, data_2 = gazetteer.get_by_id(id1)[0], gazetteer.get_by_id(id2)[0] + if "P47" in data_1 and "P47" in data_2: + if id1 in data_2.other.P47 or id2 in data_1.other.P47: + return True + return False + + def Adjacency_Hull(self, id1, id2): + """ + To find if two spatial entities hull "collide" + Parameters + ---------- + id1 : str + id of the first spatial entity + id2 : str + id of the second spatial entity + + Returns + ------- + bool + if collide + """ + return collisionTwoSEBoundaries(id1, id2) + + def disambiguateOne(self, spat_candidates, fixed_entities): + """ + Disambiguate one toponym + Parameters + ---------- + spat_candidates + list of candidates found in the georeferential + fixed_entities + entities with no ambiguities + + Returns + ------- + + """ + score_dc = {} + for cand in spat_candidates: + id_cand = cand.id + score_dc[id_cand] = 0 + for fixed in fixed_entities: + id_fixed = fixed_entities[fixed].id + if self.Adjacency_P47(id_cand, id_fixed): + score_dc[id_cand] += 3 + elif self.Adjacency_Hull(id_cand, id_fixed): + score_dc[id_cand] += 2 + score_dc[id_cand] += self.get_inclusion_score(id_cand, id_fixed) + + m = max(score_dc, key=score_dc.get) + if score_dc[m] < 4: + return None + for cand in spat_candidates: + if cand.id == m: + return cand.id + + + def disambiguate_context_based(self,toponyms,lang): + selected_en = {} + fixed_entities = {} + ambiguous_entities = {} + for topo in toponyms: + request = gazetteer.get_by_label(topo, lang) + if len(request) == 0: + request = gazetteer.get_by_alias(topo, lang) + if len(request) > 1: + ambiguous_entities[topo] = request + elif len(request) == 1: + fixed_entities[topo] = request[0] + + d_amb_results = {} + for amb_ent in ambiguous_entities: + d = self.disambiguateOne(ambiguous_entities[amb_ent], fixed_entities) + if not d: + d_amb_results[amb_ent] = get_most_common_id_v3(amb_ent, lang).id + else: + d_amb_results[amb_ent] = d + + for k, v in fixed_entities.items(): + selected_en[k] = v.id + for k, v in d_amb_results.items(): + selected_en[k] = v + + return selected_en \ No newline at end of file diff --git a/strpython/nlp/disambiguator/wikipedia_cooc.py b/strpython/nlp/disambiguator/wikipedia_cooc.py index c9a522a..a8dacfd 100644 --- a/strpython/nlp/disambiguator/wikipedia_cooc.py +++ b/strpython/nlp/disambiguator/wikipedia_cooc.py @@ -16,80 +16,71 @@ def read_pickle(fn): class WikipediaDisambiguator(Disambiguator): def __init__(self,measure="degree"): - Disambiguator.__init__(self) + Disambiguator.__init__(self,context_based=True) # Load model 
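# Illustrative sketch of the BigramModel loaded just below (assumptions: the
# import path follows where this patch creates the file, and the ids and counts
# are invented). Co-occurrence frequencies between candidate ids are multiplied
# into a joint score, with a small floor value when a pair was never observed.
from strpython.nlp.disambiguator_old.models.bigram import BigramModel

model = BigramModel(freq={"GD1": {"GD2": 5}}, count={"GD2": 5})
model.get_bigram_probability("GD1", "GD2")            # 5, the raw co-occurrence count
model.get_coocurence_probability(1.0, "GD1", "GD3")   # 1e-08 floor for an unseen pair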
self.model=BigramModel(read_pickle(config.wiki_cooc_dis.cooc_freq),read_pickle(config.wiki_cooc_dis.count)) self.measure=measure - def disambiguate(self, ner_result, lang="en"): - count, se_ = self.extract_se_entities(ner_result) - new_count = {} - selected_en_rev = {} - selected_en = self.disambiguate_wiki(se_,lang) - for en in selected_en: - selected_en_rev[en]=selected_en[en] - #new_count[selected_en[en]] = count[en] - return new_count, selected_en def disambiguate_list(self,toponyms,lang): result=self.disambiguate_wiki(toponyms,lang) return {k:v for k,v in result.items() if v} - def disambiguate_wiki(self, entities, lang): - - spat_en=[] - for e in entities: - if re.match("^\d+$", e): + def disambiguate_context_based(self,toponyms,lang): + toponyms_filtered=[] + for toponym in toponyms: + if re.match("^\d+$", toponym): continue - if lang in stop_words and e.lower().rstrip("s") in stop_words[lang]:# or e.lower().rstrip("s") in common_words[lang]: + if lang in stop_words and toponym.lower().rstrip("s") in stop_words[lang]:# or toponym.lower().rstrip("s") in common_words[lang]: continue - plural = e.rstrip("s") + "s" + plural = toponym.rstrip("s") + "s" if lang in stop_words and plural.lower() in stop_words[lang]: #or plural.lower() in common_words[lang]: continue - spat_en.append(e) - spat_en=list(set(spat_en)) + toponyms_filtered.append(toponym) + + toponyms_filtered=list(set(toponyms_filtered)) g = nx.Graph() possible_candidates = [] betw_cand={} # indicate which toponym group a candidate belong to #w maybe useless ... group_candidate = {} #candidates per toponym - for e in spat_en: - cand = get_top_candidate(e, lang, 5)#get_top_candidate(e, lang,4) - cand = [c.id for c in cand if c] - if not cand: - cand = [c.id for c in gazetteer.get_n_label_similar(e,lang,5) if c] - group_candidate[e] = cand - betw_cand[e]=cand - for n in cand: - betw_cand[n]=set(cand)-set(n) - possible_candidates.extend(cand) + for toponym in toponyms_filtered: + candidates = get_top_candidate(toponym, lang, 5) + candidates = [c.id for c in candidates if c] + if not candidates: + candidates = [c.id for c in gazetteer.get_n_label_similar(toponym,lang,5) if c] + group_candidate[toponym] = candidates + betw_cand[toponym]=candidates + for n in candidates: + betw_cand[n]=set(candidates)-set(n) + possible_candidates.extend(candidates) - for cand in possible_candidates: - g.add_node(cand, label=gazetteer.get_by_id(cand)[0].label[lang]) + for candidate in possible_candidates: + g.add_node(candidate, label=gazetteer.get_by_id(candidate)[0].label[lang]) data_candidate={ca :gazetteer.get_by_id(ca)[0] for ca in possible_candidates} - for cand in possible_candidates: - for cand2 in possible_candidates: + for candidate in possible_candidates: + for candidate2 in possible_candidates: # Get PageRank score - d = data_candidate[cand] + d = data_candidate[candidate] sc = 1 sc=d.score # Compute probability - prob = self.model.get_coocurence_probability(sc, cand, cand2) + prob = self.model.get_coocurence_probability(sc, candidate, candidate2) - if cand2 in betw_cand[cand] or cand in betw_cand[cand2]: + if candidate2 in betw_cand[candidate] or candidate in betw_cand[candidate2]: prob = 0.0 if prob < 0.0000001: prob = 0.0 - if not cand == cand2: + if not candidate == candidate2: # take the lowest co-occurrency between two candidates - if g.has_edge(cand2, cand) : - if g.edges[cand2,cand]["weight"] < prob: + if g.has_edge(candidate2, candidate) : + if g.edges[candidate2,candidate]["weight"] < prob: continue - g.add_edge(cand, cand2, 
weight=prob) + g.add_edge(candidate, candidate2, weight=prob) selected = {} @@ -104,7 +95,8 @@ class WikipediaDisambiguator(Disambiguator): else:# degree by default selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight')) #print(1) - except Exception as e: - selected[gr]=get_most_common_id_v3(gr,lang) + except Exception as toponym: + most_common = get_most_common_id_v3(gr, lang) + if most_common and len(most_common)>0: selected[gr]=most_common[0].id return selected diff --git a/strpython/nlp/disambiguator_old/__init__.py b/strpython/nlp/disambiguator_old/__init__.py new file mode 100644 index 0000000..950f635 --- /dev/null +++ b/strpython/nlp/disambiguator_old/__init__.py @@ -0,0 +1 @@ +# coding = utf-8 \ No newline at end of file diff --git a/strpython/nlp/disambiguator_old/disambiguator.py b/strpython/nlp/disambiguator_old/disambiguator.py new file mode 100644 index 0000000..ee0d899 --- /dev/null +++ b/strpython/nlp/disambiguator_old/disambiguator.py @@ -0,0 +1,62 @@ +# coding = utf-8 + +import copy +import string + +import numpy as np + +from ..ner.ner import NER + + +class Disambiguator(object): + + def __init__(self): + """Constructor for Disambiguator""" + pass + + def extract_se_entities(self, input): + out = Disambiguator.parse_corpus(input) + en_ = out[out[:, 1] == NER._unified_tag["place"]] + return self.toponymes_frequencies(en_[:, 0]), np.unique(en_[:, 0]) + + def toponymes_frequencies(self, ens_): + count = {} + for en in ens_: + if not en in count: count[en] = 0 + count[en] += 1 + return count + + @staticmethod + def parse_corpus(corpus): + final_corpus = [] + t = 0 + placeTag = NER._unified_tag["place"] + while t < len(corpus): + tag = copy.copy(corpus[t]) + + if tag[1] == "BEG-" + NER._unified_tag["place"] or tag[1] == placeTag: + lenw = 1 + if tag[1] == "BEG-" + placeTag: + compound_tag = tag[0] + t += 1 + while corpus[t][1] == "END-" + placeTag or corpus[t][1] == placeTag: + tag = copy.copy(corpus[t]) + if tag[0][-1] in string.punctuation or compound_tag[-1] in string.punctuation: + compound_tag += tag[0] + else: + compound_tag += " " + tag[0] + t += 1 + lenw += 1 + tag[0] = compound_tag + tag[1] = placeTag + t += 1 + else: + t += 1 + final_corpus.append(tag) + return np.array(final_corpus) + + def disambiguate(self, ner_result): + pass + + def disambiguate_list(self,toponyms,lang): + pass \ No newline at end of file diff --git a/strpython/nlp/disambiguator/geodict_gaurav.py b/strpython/nlp/disambiguator_old/geodict_gaurav.py similarity index 100% rename from strpython/nlp/disambiguator/geodict_gaurav.py rename to strpython/nlp/disambiguator_old/geodict_gaurav.py diff --git a/strpython/nlp/disambiguator_old/models/__init__.py b/strpython/nlp/disambiguator_old/models/__init__.py new file mode 100644 index 0000000..950f635 --- /dev/null +++ b/strpython/nlp/disambiguator_old/models/__init__.py @@ -0,0 +1 @@ +# coding = utf-8 \ No newline at end of file diff --git a/strpython/nlp/disambiguator_old/models/bigram.py b/strpython/nlp/disambiguator_old/models/bigram.py new file mode 100644 index 0000000..ec146b4 --- /dev/null +++ b/strpython/nlp/disambiguator_old/models/bigram.py @@ -0,0 +1,46 @@ +# coding = utf-8 + + +class BigramModel: + def __init__(self,freq={},count={}): + self.cooc_freq=freq + self.count_associated=count + + def append(self,uri1,uri2): + + if not uri1 in self.cooc_freq: + self.cooc_freq[uri1]={} + if not uri2 in self.cooc_freq[uri1]: + self.cooc_freq[uri1][uri2]=0 + self.cooc_freq[uri1][uri2]+=1 + + self.increment_count(uri2) + + 
def increment_count(self,uri): + if not uri in self.count_associated: + self.count_associated[uri]=0 + self.count_associated[uri]+=1 + + def get_coocurence_probability(self, pr1, *args): + if len(args) < 2: + print("Only one URI indicated") + return 0. + res_=1. + for u in range(1,len(args)): + res_*=self.get_bigram_probability(args[0],args[u],pr1) + return res_ + + + def get_bigram_probability(self,uri1,uri2,pr1=1): + nna=0.00000001 + if uri1 in self.cooc_freq: + if uri2 in self.cooc_freq[uri1]: + return self.cooc_freq[uri1][uri2] + #return (self.cooc_freq[uri1][uri2] / self.count_associated[uri1])+pr1 + elif uri2 in self.cooc_freq: + if uri1 in self.cooc_freq[uri2]: + return self.cooc_freq[uri2][uri1] + #return (self.cooc_freq[uri2][uri1] / self.count_associated[uri1])+pr1 + return nna + + diff --git a/strpython/nlp/disambiguator_old/most_common.py b/strpython/nlp/disambiguator_old/most_common.py new file mode 100644 index 0000000..2989325 --- /dev/null +++ b/strpython/nlp/disambiguator_old/most_common.py @@ -0,0 +1,71 @@ +# coding = utf-8 + + + +from ...helpers.geodict_helpers import * +from .disambiguator import Disambiguator +import re, json, os +from ...config.configuration import config + +from inflector import Inflector,English,Spanish,French + +inflectors= { + "en":Inflector(English()), + "fr":Inflector(French()), + "es":Inflector(Spanish()) +} +stop_words = { + "fr": set(open(os.path.join(config.language_resources_path,"stop_words_fr.txt")).read().split("\n")), + "en": set(open(os.path.join(config.language_resources_path,"stop_words_en.txt")).read().split("\n")) +} + +common_words = { + "fr": set(json.load(open(os.path.join(config.language_resources_path,"dic_fr.json")))), + "en": set(open(os.path.join(config.language_resources_path,"english_common_words_filtered.txt")).read().split("\n")) +} + + +class MostCommonDisambiguator(Disambiguator): + + def __init__(self): + Disambiguator.__init__(self) + + def disambiguate(self, ner_result, lang="en"): + count, se_ = self.extract_se_entities(ner_result) + new_count = {} + selected_en = {} + for en in se_: + id_,score=self.disambiguate_(en,lang) + if not id_ == "O" and id_: + selected_en[id_] = en + new_count[id_] = count[en] + + return new_count, selected_en + + def disambiguate_list(self,toponyms,lang): + result={} + for toponym in toponyms: + id_,_=self.disambiguate_(toponym,lang) + if id_: + result[id_]=toponym + return result + + def disambiguate_(self, label, lang='fr'): + if re.match("^\d+$", label): + return 'O', -1 + if lang in stop_words: #and lang in common_words: + if label.lower().rstrip("s") in stop_words[lang]: #or label.lower().rstrip("s") in common_words[lang]: + return 'O', -1 + + if lang in inflectors: + plural=inflectors[lang].singularize(label) + else: + plural = label.rstrip("s") + "s" + if plural.lower() in stop_words[lang]:# or plural.lower() in common_words[lang]: + return 'O', -1 + + data=get_most_common_id_v3(label, lang) + id_, score=None,0 + if data: + id_,score=data.id,data.score + return id_, score diff --git a/strpython/nlp/disambiguator_old/wikipedia_cooc.py b/strpython/nlp/disambiguator_old/wikipedia_cooc.py new file mode 100644 index 0000000..c9a522a --- /dev/null +++ b/strpython/nlp/disambiguator_old/wikipedia_cooc.py @@ -0,0 +1,110 @@ +# coding = utf-8 +import re + +from .disambiguator import Disambiguator +from .models.bigram import BigramModel +import pickle +from ...config.configuration import config +#from ...helpers.geodict_helpers_old import * +from ...helpers.geodict_helpers import * +from 
.most_common import stop_words,common_words +import networkx as nx + +def read_pickle(fn): + return pickle.load(open(fn,'rb')) + +class WikipediaDisambiguator(Disambiguator): + + def __init__(self,measure="degree"): + Disambiguator.__init__(self) + # Load model + self.model=BigramModel(read_pickle(config.wiki_cooc_dis.cooc_freq),read_pickle(config.wiki_cooc_dis.count)) + self.measure=measure + def disambiguate(self, ner_result, lang="en"): + count, se_ = self.extract_se_entities(ner_result) + new_count = {} + selected_en_rev = {} + selected_en = self.disambiguate_wiki(se_,lang) + for en in selected_en: + selected_en_rev[en]=selected_en[en] + #new_count[selected_en[en]] = count[en] + + return new_count, selected_en + + def disambiguate_list(self,toponyms,lang): + result=self.disambiguate_wiki(toponyms,lang) + return {k:v for k,v in result.items() if v} + + def disambiguate_wiki(self, entities, lang): + + spat_en=[] + for e in entities: + if re.match("^\d+$", e): + continue + if lang in stop_words and e.lower().rstrip("s") in stop_words[lang]:# or e.lower().rstrip("s") in common_words[lang]: + continue + + plural = e.rstrip("s") + "s" + if lang in stop_words and plural.lower() in stop_words[lang]: #or plural.lower() in common_words[lang]: + continue + spat_en.append(e) + spat_en=list(set(spat_en)) + g = nx.Graph() + + possible_candidates = [] + betw_cand={} # indicate which toponym group a candidate belong to #w maybe useless ... + group_candidate = {} #candidates per toponym + + for e in spat_en: + cand = get_top_candidate(e, lang, 5)#get_top_candidate(e, lang,4) + cand = [c.id for c in cand if c] + if not cand: + cand = [c.id for c in gazetteer.get_n_label_similar(e,lang,5) if c] + group_candidate[e] = cand + betw_cand[e]=cand + for n in cand: + betw_cand[n]=set(cand)-set(n) + possible_candidates.extend(cand) + + for cand in possible_candidates: + g.add_node(cand, label=gazetteer.get_by_id(cand)[0].label[lang]) + + data_candidate={ca :gazetteer.get_by_id(ca)[0] for ca in possible_candidates} + for cand in possible_candidates: + for cand2 in possible_candidates: + # Get PageRank score + d = data_candidate[cand] + + sc = 1 + sc=d.score + # Compute probability + prob = self.model.get_coocurence_probability(sc, cand, cand2) + + if cand2 in betw_cand[cand] or cand in betw_cand[cand2]: + prob = 0.0 + if prob < 0.0000001: + prob = 0.0 + if not cand == cand2: + # take the lowest co-occurrency between two candidates + if g.has_edge(cand2, cand) : + if g.edges[cand2,cand]["weight"] < prob: + continue + g.add_edge(cand, cand2, weight=prob) + + selected = {} + + #Take the candidates with the highest degree weighted + for gr in group_candidate: + try: + + if self.measure == "degree": + selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight')) + elif self.measure == "centrality": + selected[gr] = max(group_candidate[gr], key=lambda x: nx.closeness_centrality(g, x, distance="weight")) + else:# degree by default + selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight')) + #print(1) + except Exception as e: + selected[gr]=get_most_common_id_v3(gr,lang) + return selected + diff --git a/strpython/nlp/ner/__init__.py b/strpython/nlp/ner/__init__.py index e69de29..764d8f5 100644 --- a/strpython/nlp/ner/__init__.py +++ b/strpython/nlp/ner/__init__.py @@ -0,0 +1,5 @@ +from .spacy import Spacy +from .nltk import NLTK +from .polyglot import Polyglot +from .stanford_ner import StanfordNER +from .ner import NER \ No newline at end of file diff --git 
a/strpython/nlp/ner/ner.py b/strpython/nlp/ner/ner.py index 2066a24..47006b3 100644 --- a/strpython/nlp/ner/ner.py +++ b/strpython/nlp/ner/ner.py @@ -12,7 +12,43 @@ class NER: self._lang = lang def identify(self, input): - return input + """ + + Parameters + ---------- + input + + Returns + ------- + + """ + raise NotImplementedError def parse_output(self, output): - pass + """ + Parse the output of the NER + Parameters + ---------- + output: obj + ner output + Returns + ------- + 2D-array numpy + First col = Text, Second Col = Tag + """ + raise NotImplementedError + + def translate_tag(self, tag): + """ + Translate the NER tag to a unique tag use in this module. + Parameters + ---------- + tag :str + tag + + Returns + ------- + str + transformed tag + """ + raise NotImplementedError \ No newline at end of file diff --git a/strpython/nlp/ner/nltk.py b/strpython/nlp/ner/nltk.py index 265b9f1..1fdd64c 100644 --- a/strpython/nlp/ner/nltk.py +++ b/strpython/nlp/ner/nltk.py @@ -2,7 +2,7 @@ import nltk from .ner import NER - +import numpy as np class NLTK(NER): """ @@ -25,22 +25,9 @@ class NLTK(NER): for tok_ in ner_tagged: if isinstance(tok_, nltk.tree.Tree): corresponding_tag_ = self.translate_tag(tok_.label()) - if not tok_.label() in NLTK._list_of_tags_available: - for i in tok_: output.append(list(i)) - else: - if not len(tok_) > 1: - output.append([tok_[0][0], corresponding_tag_]) - else: - for i in range(len(tok_)): - if i == 0: - output.append([tok_[i][0], "BEG-" + corresponding_tag_]) - elif i + 1 == len(tok_): - output.append([tok_[i][0], "END-" + corresponding_tag_]) - else: - output.append([tok_[i][0], corresponding_tag_]) - else: - output.append(list(tok_)) - return output + if tok_.label() in NLTK._list_of_tags_available: + output.append([" ".join([t[0] for t in tok_]),self.translate_tag(tok_.label())]) + return np.array(output) def translate_tag(self, tag): if tag == "LOCATION" or tag == "GPE": diff --git a/strpython/nlp/ner/polyglot.py b/strpython/nlp/ner/polyglot.py index df8c083..005935c 100644 --- a/strpython/nlp/ner/polyglot.py +++ b/strpython/nlp/ner/polyglot.py @@ -15,37 +15,12 @@ class Polyglot(NER): self.poly_instance=None def identify(self,text): - self.poly_instance = json.loads(Text(text,hint_language_code=self._lang).to_json()) + self.poly_instance = Text(text,hint_language_code=self._lang) result_=[] - for item in self.poly_instance: - pos_t=self.parse_polyglot_output(item["entities"], item["pos_tags"]) - result_.extend(pos_t) + for en in self.poly_instance.entities: + result_.append([eval(en.__str__()),self.translate_tag(en.tag)]) return np.array(result_) - def parse_polyglot_output(self, entities_list, sentence_pos_tagged): - """ - """ - tk_pos=0 - sentence_pos_tagged=np.array(sentence_pos_tagged) - while tk_pos < len(sentence_pos_tagged): - token_=sentence_pos_tagged[tk_pos] - for entity_ in entities_list: - if token_[0] == entity_[1][0]: - if len(entity_[1]) > 1: - en_=np.array(entity_[1]) - en_in_pos_tag=sentence_pos_tagged[tk_pos:tk_pos+len(en_)][:,0] - if np.array_equal(en_ ,en_in_pos_tag): - - sentence_pos_tagged[tk_pos][1]="BEG-"+self.translate_tag(entity_[0]) - sentence_pos_tagged[tk_pos+len(en_)-1][1] = "END-" + self.translate_tag(entity_[0]) - if len(en_) >2:sentence_pos_tagged[tk_pos+1:tk_pos+len(en_)-1][:,1]=self.translate_tag(entity_[0]) - else: - sentence_pos_tagged[tk_pos][1]=self.translate_tag(entity_[0]) - tk_pos+=len(entity_[1]) - break - tk_pos+=1 - return sentence_pos_tagged - def translate_tag(self,tag): if tag == "I-PER": return 
NER._unified_tag["pers"] diff --git a/strpython/nlp/ner/spacy.py b/strpython/nlp/ner/spacy.py index 47456bb..98e9ea2 100644 --- a/strpython/nlp/ner/spacy.py +++ b/strpython/nlp/ner/spacy.py @@ -2,6 +2,7 @@ import spacy +import numpy as np from .ner import NER from ..exception.language import LanguageNotAvailable @@ -13,6 +14,7 @@ _tag_spacy = { "org": "ORG" } +all_tags=["GPE", "LOC","PERSON","ORG"] def flatten(lis): """ @@ -59,26 +61,12 @@ class Spacy(NER): import multiprocessing if len(text) > 10000: output_=[] - for t in self._ner.pipe(self.split_text(text,10000),n_threads=multiprocessing.cpu_count(),batch_size=10000,as_tuples=False,): - output_.extend([[token.text, token.pos_, token.ent_type_] for token in t]) - return self.parse_output(output_, []) + for t in self._ner.pipe(self.split_text(text,10000),n_threads=multiprocessing.cpu_count(),batch_size=100,as_tuples=False): + output_.extend([[token.text, self.translate_tag(token.label_)] for token in t.ents]) + return np.array(output_) else: - output_ = [[token.text, token.pos_, token.ent_type_] for token in self._ner(text)] - return self.parse_output(output_, []) - - def parse_output(self, output, pos_tags): - # Pre-Treatment on the output - # print(1) - tagged_ = [] - _tag_entity = flatten(list(_tag_spacy.values())) - - for token in output: - if token[-1] in _tag_entity: - tagged_.append([token[0], self.translate_tag(token[-1])]) - else: - tagged_.append([token[0], token[-2]]) - - return self.add_beg_ending_to_tag(tagged_) + output_ = [[token.text, self.translate_tag(token.label_)] for token in self._ner(text).ents if token.label_ in all_tags] + return np.array(output_) def translate_tag(self, tag): if tag == _tag_spacy["pers"]: @@ -88,23 +76,3 @@ class Spacy(NER): if tag == _tag_spacy["org"]: return NER._unified_tag["org"] - def add_beg_ending_to_tag(self, tag): - _tag_entity = list(NER._unified_tag.values()) - t = 0 - while t < len(tag): - if tag[t][1] in _tag_entity and t + 1 < len(tag): - - if tag[t + 1][1] == tag[t][1]: - tag[t][1] = "BEG-" + tag[t][1] - t += 1 - if t + 1 >= len(tag): - tag[t][1] = "END-" + tag[t][1] - - while t + 1 < len(tag): - if tag[t + 1][1] != tag[t][1]: - tag[t][1] = "END-" + tag[t][1] - break - else: - t += 1 - t += 1 - return tag diff --git a/strpython/nlp/ner/stanford_ner.py b/strpython/nlp/ner/stanford_ner.py index 776b14f..6b7d72d 100644 --- a/strpython/nlp/ner/stanford_ner.py +++ b/strpython/nlp/ner/stanford_ner.py @@ -17,10 +17,29 @@ _stanfordner_to_treetagger_lang = { "es" : "espagnol" } -_tag_stanford = { - "place": "LOCATION", - "org": "ORGANIZATION", - "pers": "PERSON" +_tag_stanford = {"en":{ + "place": "LOCATION", + "org": "ORGANIZATION", + "pers": "PERSON" + }, + "fr":{ + "place": "I-LIEU", + "org": "I-ORG", + "pers": "I-PERS" + } + +} +nlp_config={"fr" : { + "tokenize.language" : "fr", + "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger", + "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz", + "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz", + "depparse.language" : "french", + "ner.model": "/Users/jacquesfize/.services/stanford-corenlp-full-2017-06-09/eunews.fr.crf.gz", + "ssplit.newlineIsSentenceBreak": "always" + }, + "en":{} + } @@ -51,11 +70,8 @@ class StanfordNER(NER): if not self._lang in _stanfordner_available_language: print(self._lang) raise LanguageNotAvailable(self._lang, self) - self._ner= RestStanford(config.core_nlp_URL) - - self.identified = None @@ -80,15 +96,16 @@ class StanfordNER(NER): if not text: raise 
TypeError("No value found in `text` parameter.") + properties = {'annotators': 'tokenize,ssplit,pos,ner', 'outputFormat': 'json', + "tokenize.untokenizable": "noneDelete", "pipelineLanguage": self._lang} + properties.update(nlp_config[self._lang]) if len(text) < maxlen : - output_=self._ner.annotate(text,properties={'annotators': 'tokenize,ssplit,pos,ner','outputFormat':'json',"tokenize.untokenizable":"noneDelete"}) + output_=self._ner.annotate(text,properties=properties) if isinstance(output_, str): output_ = json.loads(output_, strict=False) else: texts=self.split_text(text,maxlen) - output_ = self._ner.annotate(texts[0], properties={'annotators': 'tokenize,ssplit,pos,ner', - 'outputFormat': 'json', - "tokenize.untokenizable": "noneDelete"}) + output_ = self._ner.annotate(texts[0] ,properties=properties) if isinstance(output_, str): output_ = json.loads(output_, strict=False) @@ -122,50 +139,24 @@ class StanfordNER(NER): return self.parse_output(output_, []) def parse_output(self, output, pos_tags): - # Pre-Treatment on the output - #print(1) tagged_=[] - _tag_entity = list(_tag_stanford.values()) + _tag_entity = list(_tag_stanford[self._lang].values()) for sentence in output["sentences"]: - #print(sentence.keys()) for w in sentence["tokens"]: if w["ner"] in _tag_entity: tagged_.append([w["originalText"],self.translate_tag(w["ner"])]) - else: - tagged_.append([w["originalText"], w["pos"]]) - return self.add_beg_ending_to_tag(tagged_) + return tagged_ def translate_tag(self,tag): - if tag == _tag_stanford["pers"]: + if tag == _tag_stanford[self._lang]["pers"]: return NER._unified_tag["pers"] - if tag ==_tag_stanford["place"]: + if tag ==_tag_stanford[self._lang]["place"]: return NER._unified_tag["place"] - if tag ==_tag_stanford["org"]: + if tag ==_tag_stanford[self._lang]["org"]: return NER._unified_tag["org"] - def add_beg_ending_to_tag(self, tag): - _tag_entity = list(NER._unified_tag.values()) - t = 0 - while t < len(tag): - if tag[t][1] in _tag_entity and t + 1 < len(tag): - - if tag[t + 1][1] == tag[t][1]: - tag[t][1] = "BEG-" + tag[t][1] - t += 1 - if t + 1 >= len(tag): - tag[t][1] = "END-" + tag[t][1] - - while t + 1 < len(tag): - if tag[t + 1][1] != tag[t][1]: - tag[t][1] = "END-" + tag[t][1] - break - else: - t += 1 - t += 1 - return tag - # java -mx600m -cp "*:lib\*" edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier eunews.fr.crf.gz -textFile ../ownCloud/THESE/NoteBookPython/corpus/corpus.txt > test.txt diff --git a/strpython/pipeline.py b/strpython/pipeline.py index 4db01b3..c7cd89f 100644 --- a/strpython/pipeline.py +++ b/strpython/pipeline.py @@ -1,17 +1,21 @@ # coding =utf-8 -from strpython.models.str import STR +import re + +from nltk import word_tokenize +from strpython.models.str import STR from .models.transformation.transform import Generalisation, Expansion -from .nlp.disambiguator.disambiguator import Disambiguator -from .nlp.disambiguator.most_common import MostCommonDisambiguator + +from .nlp.disambiguator import * +from .nlp.ner import * + from .nlp.exception.disambiguator import NotADisambiguatorInstance from .nlp.exception.ner import NotANERInstance from .nlp.exception.tagger import NotATaggerInstance -from .nlp.ner.ner import NER -from .nlp.ner.stanford_ner import StanfordNER + from .nlp.pos_tagger.tagger import Tagger from .nlp.pos_tagger.treetagger import TreeTagger -import json,re + @@ -21,7 +25,7 @@ class Pipeline(object): Run the whole treatement on a given text """ - def __init__(self,lang="english",**kwargs): + def __init__(self,lang="en",**kwargs): 
""" Constructor @@ -29,8 +33,7 @@ class Pipeline(object): :param kwargs: """ self.lang=lang[:2] - self.tagger=kwargs["tagger"] if "tagger" in kwargs else TreeTagger(language=lang) - self.ner = kwargs["ner"] if "ner" in kwargs else StanfordNER(lang=lang[:2]) + self.ner = kwargs["ner"] if "ner" in kwargs else Spacy(lang=lang[:2]) self.disambiguator=kwargs["disambiguator"] if "disambiguator" in kwargs else MostCommonDisambiguator() def parse(self,text,debug=False): @@ -40,30 +43,16 @@ class Pipeline(object): :rtype: list,dict """ output = text - # If specificate POS - if self.tagger.active: - output = self.tagger.tag(output) # NER output = self.ner.identify(output) - # Disambiguation - count,se_identified = self.disambiguator.disambiguate(output, self.lang) + se_identified = self.disambiguator.disambiguate(self.lang,ner_output=output) if debug: print(se_identified) - return count,output,se_identified + return text, se_identified - def set_tagger(self,tagger): - """ - Set POS tagger used in the Pipeline - :param tagger: - :return: - """ - if isinstance(tagger,Tagger): - self.tagger=tagger - else: - raise NotATaggerInstance() def set_ner(self,ner): """ @@ -94,23 +83,22 @@ class Pipeline(object): :param text: :return: STR """ - cooc= kwargs.get("cooc",False) - adj = kwargs.get("adj", True) - inc = kwargs.get("inc", True) toponyms= kwargs.get("toponyms", None) stop_words=kwargs.get("stop_words",[]) + if isinstance(toponyms,list): - se_identified = self.disambiguator.disambiguate_list([top for top in toponyms if not top.lower() in stop_words and not len(re.findall("\d+",top)) != 0 and len(top)>3],self.lang) - count,output ={},text - #print(se_identified) - elif not se_identified: - count,output, se_identified = self.parse(text) + se_identified = self.disambiguator.disambiguate(self.lang,toponyms=[top for top in toponyms if not top.lower() in stop_words and not len(re.findall("\d+",top)) != 0 and len(top)>3]) + input = "" + + elif se_identified: + input, se_identified = self.parse(text) else: - count, output, _ = self.parse(text) - str_=STR(output,se_identified) - str_.build(adj=adj,inc=inc) - str_=self.transform(str_,**kwargs) #TODO : Add count - return str_,count,str_.spatial_entities + input,se_identified=self.parse(text) + + str_=STR(word_tokenize(input),se_identified,toponym_first=True) + str_.build(adj=True,inc=True) + str_=self.transform(str_,**kwargs) + return str_ def transform(self,str_,**kwargs): if not "type_trans" in kwargs: @@ -121,7 +109,3 @@ class Pipeline(object): else: str_=Expansion().transform(str_,**kwargs) return str_ - - -if __name__ == '__main__': - pass \ No newline at end of file -- GitLab