Commit 755998a6 authored by Fize Jacques

Debug, STR modification for faster generation, debug disambiguators, update pipeline, debug document selection
parent 5da5fbd1
Showing with 1112 additions and 301 deletions
@@ -5,7 +5,7 @@ from concurrent.futures import ThreadPoolExecutor
from langdetect import detect
from progressbar import ProgressBar, Timer, Bar, ETA, Counter
from strpython.nlp.disambiguator.geodict_gaurav import *
from strpython.nlp.disambiguator.share_prop import *
from strpython.pipeline import *
import networkx as nx
......
@@ -6,7 +6,7 @@ import argparse,glob, string,time,re
from progressbar import ProgressBar, Timer, Bar, ETA, Counter
from strpython.models.str import STR
from strpython.nlp.disambiguator.geodict_gaurav import *
from strpython.nlp.disambiguator.share_prop import *
from strpython.pipeline import *
import pandas as pd
import networkx as nx
......
@@ -33,7 +33,7 @@ selected = json.load(open(args.selectedFile))
for fn in matrix_fns:
measure = os.path.basename(fn).split("_")[0]
type_= "_".join(fn.split("_")[1:]).replace(".npy.bz2","")
type_= "_".join(os.path.basename(fn).split("_")[1:]).replace(".npy.bz2","")
print("Proceeding...",measure, type_)
df = matrix_to_pandas_dataframe(np.nan_to_num(read_bz2_matrix(fn)),
selected,
......
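The fixed line above applies os.path.basename before splitting, so underscores in directory names no longer leak into the parsed fields. A minimal sketch, using a hypothetical path:

import os

fn = "results/run_2018/cosine_str_ext.npy.bz2"  # hypothetical path

# Old parsing split the full path, so directory parts polluted type_:
"_".join(fn.split("_")[1:]).replace(".npy.bz2", "")  # -> "2018/cosine_str_ext"

# Fixed parsing takes the basename first:
base = os.path.basename(fn)                           # "cosine_str_ext.npy.bz2"
measure = base.split("_")[0]                          # "cosine"
type_ = "_".join(base.split("_")[1:]).replace(".npy.bz2", "")  # "str_ext"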
@@ -21,7 +21,7 @@ from strpython.nlp.ner.polyglot import Polyglot as poly_ner
from strpython.nlp.ner.stanford_ner import StanfordNER as stanford_ner
from strpython.nlp.disambiguator.wikipedia_cooc import WikipediaDisambiguator as wiki_d
from strpython.nlp.disambiguator.geodict_gaurav import GauravGeodict as shared_geo_d
from strpython.nlp.disambiguator.share_prop import ShareProp as shared_geo_d
from strpython.nlp.disambiguator.most_common import MostCommonDisambiguator as most_common_d
from mytoolbox.text.clean import *
......
# coding = utf-8
from shapely.geometry import Point
from ..nlp.disambiguator.geodict_gaurav import GauravGeodict
from ..nlp.disambiguator.share_prop import ShareProp
from ..nlp.disambiguator.most_common import MostCommonDisambiguator
from ..nlp.disambiguator.wikipedia_cooc import WikipediaDisambiguator
......
This diff is collapsed.
# coding = utf-8
\ No newline at end of file
# coding = utf-8
from .most_common import MostCommonDisambiguator
from .share_prop import ShareProp
from .wikipedia_cooc import WikipediaDisambiguator
from .disambiguator import Disambiguator
\ No newline at end of file
@@ -10,53 +10,62 @@ from ..ner.ner import NER
class Disambiguator(object):
def __init__(self):
def __init__(self,one_by_one=False,context_based=False):
"""Constructor for Disambiguator"""
pass
def extract_se_entities(self, input):
out = Disambiguator.parse_corpus(input)
en_ = out[out[:, 1] == NER._unified_tag["place"]]
return self.toponymes_frequencies(en_[:, 0]), np.unique(en_[:, 0])
def toponymes_frequencies(self, ens_):
count = {}
for en in ens_:
if not en in count: count[en] = 0
count[en] += 1
return count
@staticmethod
def parse_corpus(corpus):
final_corpus = []
t = 0
placeTag = NER._unified_tag["place"]
while t < len(corpus):
tag = copy.copy(corpus[t])
if tag[1] == "BEG-" + NER._unified_tag["place"] or tag[1] == placeTag:
lenw = 1
if tag[1] == "BEG-" + placeTag:
compound_tag = tag[0]
t += 1
while corpus[t][1] == "END-" + placeTag or corpus[t][1] == placeTag:
tag = copy.copy(corpus[t])
if tag[0][-1] in string.punctuation or compound_tag[-1] in string.punctuation:
compound_tag += tag[0]
else:
compound_tag += " " + tag[0]
t += 1
lenw += 1
tag[0] = compound_tag
tag[1] = placeTag
t += 1
else:
t += 1
final_corpus.append(tag)
return np.array(final_corpus)
def disambiguate(self, ner_result):
pass
def disambiguate_list(self,toponyms,lang):
pass
\ No newline at end of file
self.one_by_one= one_by_one
self.context_based=context_based
def disambiguate(self,lang,ner_output=None,toponyms=None):
"""
Run the disambiguation on the NER output
Parameters
----------
lang : str
language of the input text
ner_output : 2D numpy array, optional
NER output as (token, tag) rows
toponyms : list, optional
toponyms to disambiguate, as an alternative to ner_output
Returns
-------
dict
{toponym : geodictID}
"""
if isinstance(ner_output, np.ndarray) and ner_output.shape[1] == 2:
toponyms = self.parse_ner_output(ner_output)
elif not toponyms:
raise ValueError("Either enter a list of toponyms or give ner_output")
if self.context_based:
return self.disambiguate_context_based(toponyms,lang)
else:
return self.disambiguate_one_by_one(toponyms,lang)
def disambiguate_one_by_one(self, toponyms, lang):
"""
Disambiguation process when toponyms are geocoded one by one.
Parameters
----------
toponyms : list
toponyms to geocode
lang : str
language of the toponyms
Returns
-------
dict
{toponym : geodictID}
"""
raise NotImplementedError
def disambiguate_context_based(self,toponyms,lang):
"""
Disambiguation process when toponyms are geocoded jointly, each toponym
serving as context for the others.
Parameters
----------
toponyms : list
toponyms to geocode
lang : str
language of the toponyms
Returns
-------
dict
{toponym : geodictID}
"""
raise NotImplementedError
def parse_ner_output(self,ner_output):
return [toponym[0] if isinstance(toponym[0],str) else " ".join(toponym[0]) for toponym in ner_output[ner_output[:,1] == NER._unified_tag["place"]]]
\ No newline at end of file
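The refactored base class dispatches on the one_by_one and context_based flags set by each subclass. A minimal sketch of that contract, with a hypothetical DummyDisambiguator that is not part of this commit:

import numpy as np
from strpython.nlp.disambiguator.disambiguator import Disambiguator
from strpython.nlp.ner.ner import NER

class DummyDisambiguator(Disambiguator):
    def __init__(self):
        # declare the mode; the base-class disambiguate() dispatches on it
        Disambiguator.__init__(self, one_by_one=True)

    def disambiguate_one_by_one(self, toponyms, lang):
        # toy resolution: every toponym maps to a fake id
        return {t: "GD0" for t in toponyms}

d = DummyDisambiguator()
# either pass toponyms directly...
d.disambiguate("en", toponyms=["Paris", "Montpellier"])
# ...or a 2-column NER output of (token, tag) rows:
ner = np.array([["Paris", NER._unified_tag["place"]], ["eats", "O"]])
d.disambiguate("en", ner_output=ner)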
@@ -28,40 +28,28 @@ common_words = {
class MostCommonDisambiguator(Disambiguator):
def __init__(self):
Disambiguator.__init__(self)
Disambiguator.__init__(self,one_by_one=True)
def disambiguate(self, ner_result, lang="en"):
count, se_ = self.extract_se_entities(ner_result)
new_count = {}
selected_en = {}
for en in se_:
id_,score=self.disambiguate_(en,lang)
if not id_ == "O" and id_:
selected_en[id_] = en
new_count[id_] = count[en]
return new_count, selected_en
def disambiguate_list(self,toponyms,lang):
def disambiguate_one_by_one(self, toponyms,lang):
result={}
for toponym in toponyms:
id_,_=self.disambiguate_(toponym,lang)
if id_:
result[id_]=toponym
result[toponym]=id_
return result
def disambiguate_(self, label, lang='fr'):
if re.match("^\d+$", label):
return 'O', -1
if lang in stop_words: #and lang in common_words:
if label.lower().rstrip("s") in stop_words[lang]: #or label.lower().rstrip("s") in common_words[lang]:
if label.lower().rstrip("s") in stop_words[lang]:
return 'O', -1
if lang in inflectors:
plural=inflectors[lang].singularize(label)
else:
plural = label.rstrip("s") + "s"
if plural.lower() in stop_words[lang]:# or plural.lower() in common_words[lang]:
if plural.lower() in stop_words[lang]:
return 'O', -1
data=get_most_common_id_v3(label, lang)
......
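With disambiguate_one_by_one fixed to return {toponym: id}, the result now matches the base-class contract; the old disambiguate_list returned the inverted mapping. A usage sketch with illustrative ids, assuming the refactored base-class disambiguate():

from strpython.nlp.disambiguator.most_common import MostCommonDisambiguator

d = MostCommonDisambiguator()
d.disambiguate("fr", toponyms=["Paris", "Lyon"])
# before this commit: {"<id_paris>": "Paris", "<id_lyon>": "Lyon"}
# after this commit:  {"Paris": "<id_paris>", "Lyon": "<id_lyon>"}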
# coding = utf-8
import math
from ...helpers.collision import *
#from ...helpers.geodict_helpers_old import *
from ...helpers.geodict_helpers import *
from .disambiguator import Disambiguator
from ...models.str import get_inclusion_chain
class ShareProp(Disambiguator):
def __init__(self):
Disambiguator.__init__(self,context_based=True)
def fib_formula(self, n):
"""
Return the n-th Fibonacci number.
Parameters
----------
n : int
parameter
Returns
-------
int
the n-th Fibonacci number
"""
if n in [0, 1]: return 0 # deviates from standard Fibonacci: F(1) is forced to 0
golden_ratio = (1 + math.sqrt(5)) / 2
val = (golden_ratio ** n - (1 - golden_ratio) ** n) / math.sqrt(5)
return int(round(val))
def inclusion_log(self, x):
"""
Return the inclusion log
Parameters
----------
x : int
size of an inclusion-chain intersection
Returns
-------
float
inclusion log (1 when x is 0)
"""
if x==0:
return 1
return math.log(x)
def get_inclusion_score(self, id1, id2):
"""
Return the inclusion score, which measures how much of the administrative (P131) and physical-feature (P706) inclusion chains two entities share.
Parameters
----------
id1 : str
id of the first spatial entity
id2 : str
id of the second spatial entity
Returns
-------
float
inclusion score
"""
list1 = get_inclusion_chain(id1, 'P131')
list2 = get_inclusion_chain(id2, 'P131')
interP131 = len(list(set(list1).intersection(list2)))
list1 = get_inclusion_chain(id1, 'P706')
list2 = get_inclusion_chain(id2, 'P706')
interP706 = len(list(set(list1).intersection(list2)))
# return fib_no[interP131]+fib_no[interP706]
return self.inclusion_log(interP131) + self.inclusion_log(interP706)
def Adjacency_P47(self, id1, id2):
"""
Return True if two spatial entities are adjacent according to the Wikidata P47 property ("shares border with").
Parameters
----------
id1 : str
id of the first spatial entity
id2 : str
id of the second spatial entity
Returns
-------
bool
true if adjacent using P47
"""
data_1, data_2 = gazetteer.get_by_id(id1)[0], gazetteer.get_by_id(id2)[0]
if "P47" in data_1 and "P47" in data_2:
if id1 in data_2.other.P47 or id2 in data_1.other.P47:
return True
return False
def Adjacency_Hull(self, id1, id2):
"""
Return True if the hulls of the two spatial entities "collide" (intersect).
Parameters
----------
id1 : str
id of the first spatial entity
id2 : str
id of the second spatial entity
Returns
-------
bool
True if the hulls collide
"""
return collisionTwoSEBoundaries(id1, id2)
def disambiguateOne(self, spat_candidates, fixed_entities):
"""
Disambiguate one toponym
Parameters
----------
spat_candidates : list
candidate entries found in the gazetteer
fixed_entities : dict
unambiguous entities used as disambiguation context
Returns
-------
str or None
id of the highest-scoring candidate, or None if its score is below 4
"""
score_dc = {}
for cand in spat_candidates:
id_cand = cand.id
score_dc[id_cand] = 0
for fixed in fixed_entities:
id_fixed = fixed_entities[fixed].id
if self.Adjacency_P47(id_cand, id_fixed):
score_dc[id_cand] += 3
elif self.Adjacency_Hull(id_cand, id_fixed):
score_dc[id_cand] += 2
score_dc[id_cand] += self.get_inclusion_score(id_cand, id_fixed)
m = max(score_dc, key=score_dc.get)
if score_dc[m] < 4:
return None
for cand in spat_candidates:
if cand.id == m:
return cand.id
def disambiguate_context_based(self,toponyms,lang):
selected_en = {}
fixed_entities = {}
ambiguous_entities = {}
for topo in toponyms:
request = gazetteer.get_by_label(topo, lang)
if len(request) == 0:
request = gazetteer.get_by_alias(topo, lang)
if len(request) > 1:
ambiguous_entities[topo] = request
elif len(request) == 1:
fixed_entities[topo] = request[0]
d_amb_results = {}
for amb_ent in ambiguous_entities:
d = self.disambiguateOne(ambiguous_entities[amb_ent], fixed_entities)
if not d:
mc = get_most_common_id_v3(amb_ent, lang)
d_amb_results[amb_ent] = mc.id if mc else None
else:
d_amb_results[amb_ent] = d
for k, v in fixed_entities.items():
selected_en[k] = v.id
for k, v in d_amb_results.items():
selected_en[k] = v
return selected_en
\ No newline at end of file
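For orientation: ShareProp scores each ambiguous candidate against every unambiguous entity (+3 for P47 adjacency, +2 for a hull collision, plus the inclusion score) and keeps the best candidate only when its total reaches 4. A worked sketch of the arithmetic for a single fixed entity, mirroring inclusion_log above (scores accumulate when there are several fixed entities):

import math

def inclusion_log(x):
    # mirrors ShareProp.inclusion_log: an empty intersection contributes 1
    return 1 if x == 0 else math.log(x)

# P47 adjacency, no shared inclusion chains:
3 + inclusion_log(0) + inclusion_log(0)  # = 5.0  -> >= 4, candidate selected
# hull collision only:
2 + inclusion_log(0) + inclusion_log(0)  # = 4.0  -> >= 4, candidate selected
# no adjacency, 3 shared P131 ancestors, no shared P706 chain:
inclusion_log(3) + inclusion_log(0)      # ~ 2.10 -> < 4, candidate rejected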
@@ -16,80 +16,71 @@ def read_pickle(fn):
class WikipediaDisambiguator(Disambiguator):
def __init__(self,measure="degree"):
Disambiguator.__init__(self)
Disambiguator.__init__(self,context_based=True)
# Load model
self.model=BigramModel(read_pickle(config.wiki_cooc_dis.cooc_freq),read_pickle(config.wiki_cooc_dis.count))
self.measure=measure
def disambiguate(self, ner_result, lang="en"):
count, se_ = self.extract_se_entities(ner_result)
new_count = {}
selected_en_rev = {}
selected_en = self.disambiguate_wiki(se_,lang)
for en in selected_en:
selected_en_rev[en]=selected_en[en]
#new_count[selected_en[en]] = count[en]
return new_count, selected_en
def disambiguate_list(self,toponyms,lang):
result=self.disambiguate_wiki(toponyms,lang)
return {k:v for k,v in result.items() if v}
def disambiguate_wiki(self, entities, lang):
spat_en=[]
for e in entities:
if re.match("^\d+$", e):
def disambiguate_context_based(self,toponyms,lang):
toponyms_filtered=[]
for toponym in toponyms:
if re.match("^\d+$", toponym):
continue
if lang in stop_words and e.lower().rstrip("s") in stop_words[lang]:# or e.lower().rstrip("s") in common_words[lang]:
if lang in stop_words and toponym.lower().rstrip("s") in stop_words[lang]:# or toponym.lower().rstrip("s") in common_words[lang]:
continue
plural = e.rstrip("s") + "s"
plural = toponym.rstrip("s") + "s"
if lang in stop_words and plural.lower() in stop_words[lang]: #or plural.lower() in common_words[lang]:
continue
spat_en.append(e)
spat_en=list(set(spat_en))
toponyms_filtered.append(toponym)
toponyms_filtered=list(set(toponyms_filtered))
g = nx.Graph()
possible_candidates = []
betw_cand={} # indicates which toponym group a candidate belongs to (possibly unnecessary)
group_candidate = {} #candidates per toponym
for e in spat_en:
cand = get_top_candidate(e, lang, 5)#get_top_candidate(e, lang,4)
cand = [c.id for c in cand if c]
if not cand:
cand = [c.id for c in gazetteer.get_n_label_similar(e,lang,5) if c]
group_candidate[e] = cand
betw_cand[e]=cand
for n in cand:
betw_cand[n]=set(cand)-set(n)
possible_candidates.extend(cand)
for toponym in toponyms_filtered:
candidates = get_top_candidate(toponym, lang, 5)
candidates = [c.id for c in candidates if c]
if not candidates:
candidates = [c.id for c in gazetteer.get_n_label_similar(toponym,lang,5) if c]
group_candidate[toponym] = candidates
betw_cand[toponym]=candidates
for n in candidates:
betw_cand[n]=set(candidates)-{n} # exclude the candidate itself; set(n) would be a set of characters
possible_candidates.extend(candidates)
for cand in possible_candidates:
g.add_node(cand, label=gazetteer.get_by_id(cand)[0].label[lang])
for candidate in possible_candidates:
g.add_node(candidate, label=gazetteer.get_by_id(candidate)[0].label[lang])
data_candidate={ca :gazetteer.get_by_id(ca)[0] for ca in possible_candidates}
for cand in possible_candidates:
for cand2 in possible_candidates:
for candidate in possible_candidates:
for candidate2 in possible_candidates:
# Get PageRank score
d = data_candidate[cand]
d = data_candidate[candidate]
sc = 1
sc=d.score
# Compute probability
prob = self.model.get_coocurence_probability(sc, cand, cand2)
prob = self.model.get_coocurence_probability(sc, candidate, candidate2)
if cand2 in betw_cand[cand] or cand in betw_cand[cand2]:
if candidate2 in betw_cand[candidate] or candidate in betw_cand[candidate2]:
prob = 0.0
if prob < 0.0000001:
prob = 0.0
if not cand == cand2:
if not candidate == candidate2:
# keep the lowest co-occurrence probability between two candidates
if g.has_edge(cand2, cand) :
if g.edges[cand2,cand]["weight"] < prob:
if g.has_edge(candidate2, candidate) :
if g.edges[candidate2,candidate]["weight"] < prob:
continue
g.add_edge(cand, cand2, weight=prob)
g.add_edge(candidate, candidate2, weight=prob)
selected = {}
@@ -104,7 +95,8 @@ class WikipediaDisambiguator(Disambiguator):
else:# degree by default
selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight'))
#print(1)
except Exception as e:
selected[gr]=get_most_common_id_v3(gr,lang)
except Exception:
most_common = get_most_common_id_v3(gr, lang)
if most_common and len(most_common)>0: selected[gr]=most_common[0].id
return selected
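For orientation: candidates of all toponyms become graph nodes, edge weights are co-occurrence probabilities, and for each toponym the candidate with the highest weighted degree wins. A minimal sketch with toy ids:

import networkx as nx

g = nx.Graph()
g.add_edge("paris_fr", "montpellier_fr", weight=0.9)  # hypothetical ids
g.add_edge("paris_tx", "montpellier_fr", weight=0.1)

group_candidate = {"Paris": ["paris_fr", "paris_tx"]}
selected = {topo: max(cands, key=lambda c: g.degree(c, weight="weight"))
            for topo, cands in group_candidate.items()}
# -> {"Paris": "paris_fr"}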
# coding = utf-8
\ No newline at end of file
# coding = utf-8
import copy
import string
import numpy as np
from ..ner.ner import NER
class Disambiguator(object):
def __init__(self):
"""Constructor for Disambiguator"""
pass
def extract_se_entities(self, input):
out = Disambiguator.parse_corpus(input)
en_ = out[out[:, 1] == NER._unified_tag["place"]]
return self.toponymes_frequencies(en_[:, 0]), np.unique(en_[:, 0])
def toponymes_frequencies(self, ens_):
count = {}
for en in ens_:
if not en in count: count[en] = 0
count[en] += 1
return count
@staticmethod
def parse_corpus(corpus):
final_corpus = []
t = 0
placeTag = NER._unified_tag["place"]
while t < len(corpus):
tag = copy.copy(corpus[t])
if tag[1] == "BEG-" + NER._unified_tag["place"] or tag[1] == placeTag:
lenw = 1
if tag[1] == "BEG-" + placeTag:
compound_tag = tag[0]
t += 1
while corpus[t][1] == "END-" + placeTag or corpus[t][1] == placeTag:
tag = copy.copy(corpus[t])
if tag[0][-1] in string.punctuation or compound_tag[-1] in string.punctuation:
compound_tag += tag[0]
else:
compound_tag += " " + tag[0]
t += 1
lenw += 1
tag[0] = compound_tag
tag[1] = placeTag
t += 1
else:
t += 1
final_corpus.append(tag)
return np.array(final_corpus)
def disambiguate(self, ner_result):
pass
def disambiguate_list(self,toponyms,lang):
pass
\ No newline at end of file
# coding = utf-8
\ No newline at end of file
# coding = utf-8
class BigramModel:
def __init__(self, freq=None, count=None):
# avoid shared mutable default arguments across instances
self.cooc_freq = freq if freq is not None else {}
self.count_associated = count if count is not None else {}
def append(self,uri1,uri2):
if not uri1 in self.cooc_freq:
self.cooc_freq[uri1]={}
if not uri2 in self.cooc_freq[uri1]:
self.cooc_freq[uri1][uri2]=0
self.cooc_freq[uri1][uri2]+=1
self.increment_count(uri2)
def increment_count(self,uri):
if not uri in self.count_associated:
self.count_associated[uri]=0
self.count_associated[uri]+=1
def get_coocurence_probability(self, pr1, *args):
if len(args) < 2:
print("Only one URI indicated")
return 0.
res_=1.
for u in range(1,len(args)):
res_*=self.get_bigram_probability(args[0],args[u],pr1)
return res_
def get_bigram_probability(self,uri1,uri2,pr1=1):
nna=0.00000001
if uri1 in self.cooc_freq:
if uri2 in self.cooc_freq[uri1]:
return self.cooc_freq[uri1][uri2]
#return (self.cooc_freq[uri1][uri2] / self.count_associated[uri1])+pr1
elif uri2 in self.cooc_freq:
if uri1 in self.cooc_freq[uri2]:
return self.cooc_freq[uri2][uri1]
#return (self.cooc_freq[uri2][uri1] / self.count_associated[uri1])+pr1
return nna
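A usage sketch of BigramModel with toy URIs. As written, get_bigram_probability returns the raw co-occurrence count (the normalised variant is commented out) and a small floor value for unseen pairs:

from strpython.nlp.disambiguator.models.bigram import BigramModel

m = BigramModel()
m.append("Q90", "Q142")  # toy URIs: Paris co-occurs with France
m.append("Q90", "Q142")

m.get_bigram_probability("Q90", "Q142")          # -> 2 (raw count)
m.get_bigram_probability("Q90", "Q30")           # -> 1e-08 (floor for unseen pairs)
m.get_coocurence_probability(1, "Q90", "Q142")   # -> 2 (product over pairs)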
# coding = utf-8
from ...helpers.geodict_helpers import *
from .disambiguator import Disambiguator
import re, json, os
from ...config.configuration import config
from inflector import Inflector,English,Spanish,French
inflectors= {
"en":Inflector(English()),
"fr":Inflector(French()),
"es":Inflector(Spanish())
}
stop_words = {
"fr": set(open(os.path.join(config.language_resources_path,"stop_words_fr.txt")).read().split("\n")),
"en": set(open(os.path.join(config.language_resources_path,"stop_words_en.txt")).read().split("\n"))
}
common_words = {
"fr": set(json.load(open(os.path.join(config.language_resources_path,"dic_fr.json")))),
"en": set(open(os.path.join(config.language_resources_path,"english_common_words_filtered.txt")).read().split("\n"))
}
class MostCommonDisambiguator(Disambiguator):
def __init__(self):
Disambiguator.__init__(self)
def disambiguate(self, ner_result, lang="en"):
count, se_ = self.extract_se_entities(ner_result)
new_count = {}
selected_en = {}
for en in se_:
id_,score=self.disambiguate_(en,lang)
if not id_ == "O" and id_:
selected_en[id_] = en
new_count[id_] = count[en]
return new_count, selected_en
def disambiguate_list(self,toponyms,lang):
result={}
for toponym in toponyms:
id_,_=self.disambiguate_(toponym,lang)
if id_:
result[id_]=toponym
return result
def disambiguate_(self, label, lang='fr'):
if re.match("^\d+$", label):
return 'O', -1
if lang in stop_words: #and lang in common_words:
if label.lower().rstrip("s") in stop_words[lang]: #or label.lower().rstrip("s") in common_words[lang]:
return 'O', -1
if lang in inflectors:
plural=inflectors[lang].singularize(label)
else:
plural = label.rstrip("s") + "s"
if plural.lower() in stop_words[lang]:# or plural.lower() in common_words[lang]:
return 'O', -1
data=get_most_common_id_v3(label, lang)
id_, score=None,0
if data:
id_,score=data.id,data.score
return id_, score
# coding = utf-8
import re
from .disambiguator import Disambiguator
from .models.bigram import BigramModel
import pickle
from ...config.configuration import config
#from ...helpers.geodict_helpers_old import *
from ...helpers.geodict_helpers import *
from .most_common import stop_words,common_words
import networkx as nx
def read_pickle(fn):
return pickle.load(open(fn,'rb'))
class WikipediaDisambiguator(Disambiguator):
def __init__(self,measure="degree"):
Disambiguator.__init__(self)
# Load model
self.model=BigramModel(read_pickle(config.wiki_cooc_dis.cooc_freq),read_pickle(config.wiki_cooc_dis.count))
self.measure=measure
def disambiguate(self, ner_result, lang="en"):
count, se_ = self.extract_se_entities(ner_result)
new_count = {}
selected_en_rev = {}
selected_en = self.disambiguate_wiki(se_,lang)
for en in selected_en:
selected_en_rev[en]=selected_en[en]
#new_count[selected_en[en]] = count[en]
return new_count, selected_en
def disambiguate_list(self,toponyms,lang):
result=self.disambiguate_wiki(toponyms,lang)
return {k:v for k,v in result.items() if v}
def disambiguate_wiki(self, entities, lang):
spat_en=[]
for e in entities:
if re.match("^\d+$", e):
continue
if lang in stop_words and e.lower().rstrip("s") in stop_words[lang]:# or e.lower().rstrip("s") in common_words[lang]:
continue
plural = e.rstrip("s") + "s"
if lang in stop_words and plural.lower() in stop_words[lang]: #or plural.lower() in common_words[lang]:
continue
spat_en.append(e)
spat_en=list(set(spat_en))
g = nx.Graph()
possible_candidates = []
betw_cand={} # indicates which toponym group a candidate belongs to (possibly unnecessary)
group_candidate = {} #candidates per toponym
for e in spat_en:
cand = get_top_candidate(e, lang, 5)#get_top_candidate(e, lang,4)
cand = [c.id for c in cand if c]
if not cand:
cand = [c.id for c in gazetteer.get_n_label_similar(e,lang,5) if c]
group_candidate[e] = cand
betw_cand[e]=cand
for n in cand:
betw_cand[n]=set(cand)-set(n)
possible_candidates.extend(cand)
for cand in possible_candidates:
g.add_node(cand, label=gazetteer.get_by_id(cand)[0].label[lang])
data_candidate={ca :gazetteer.get_by_id(ca)[0] for ca in possible_candidates}
for cand in possible_candidates:
for cand2 in possible_candidates:
# Get PageRank score
d = data_candidate[cand]
sc = 1
sc=d.score
# Compute probability
prob = self.model.get_coocurence_probability(sc, cand, cand2)
if cand2 in betw_cand[cand] or cand in betw_cand[cand2]:
prob = 0.0
if prob < 0.0000001:
prob = 0.0
if not cand == cand2:
# keep the lowest co-occurrence probability between two candidates
if g.has_edge(cand2, cand) :
if g.edges[cand2,cand]["weight"] < prob:
continue
g.add_edge(cand, cand2, weight=prob)
selected = {}
#Take the candidates with the highest degree weighted
for gr in group_candidate:
try:
if self.measure == "degree":
selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight'))
elif self.measure == "centrality":
selected[gr] = max(group_candidate[gr], key=lambda x: nx.closeness_centrality(g, x, distance="weight"))
else:# degree by default
selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight'))
#print(1)
except Exception as e:
selected[gr]=get_most_common_id_v3(gr,lang)
return selected
from .spacy import Spacy
from .nltk import NLTK
from .polyglot import Polyglot
from .stanford_ner import StanfordNER
from .ner import NER
\ No newline at end of file
@@ -12,7 +12,43 @@ class NER:
self._lang = lang
def identify(self, input):
return input
"""
Parameters
----------
input
Returns
-------
"""
raise NotImplementedError
def parse_output(self, output):
pass
"""
Parse the output of the NER
Parameters
----------
output: obj
ner output
Returns
-------
2D numpy array
first column = token text, second column = tag
"""
raise NotImplementedError
def translate_tag(self, tag):
"""
Translate the NER tag into the unified tag set used in this module.
Parameters
----------
tag :str
tag
Returns
-------
str
transformed tag
"""
raise NotImplementedError
\ No newline at end of file
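A minimal sketch of the contract these abstract methods document, with a hypothetical RegexNER that is not part of the module:

import numpy as np
from strpython.nlp.ner.ner import NER

class RegexNER(NER):
    def identify(self, input):
        # toy NER: tag capitalised tokens as places
        return [(tok, "PLACE" if tok[:1].isupper() else "O")
                for tok in input.split()]

    def parse_output(self, output):
        # 2D numpy array: first column = token text, second column = tag
        return np.array([(tok, self.translate_tag(tag)) for tok, tag in output])

    def translate_tag(self, tag):
        return NER._unified_tag["place"] if tag == "PLACE" else "O"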