Commit 156baa8e authored by Pokiros

Initialise Git with previous code

parent 435b84a0
Showing with 3156 additions and 0 deletions
.gitignore 0 → 100644
# Created by .ignore support plugin (hsz.mobi)
### Example user template template
### Example user template
# IntelliJ project files
.idea
*.iml
test*
{
"tree_tagger_home":"/Users/jacquesfize/.tree-tagger/cmd/",
"osm_boundaries_directory":"/Users/jacquesfize/install",
"core_nlp_URL":"http://localhost:9000"
}
\ No newline at end of file
# coding = utf-8
import json
class Configuration(object):
def __init__(self, data):
self.__dict__=data
for d in self.__dict__:
if isinstance(self.__dict__[d],dict):
self.__dict__[d]=Configuration(self.__dict__[d])
def __getitem__(self, item):
return self.__dict__[item]
config = Configuration(json.load(open("config/config.json")))
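# Minimal usage sketch (not in the original file): the keys of the
# config/config.json shown above become attributes of the module-level
# `config` object, and nested dicts are wrapped recursively, so values can be
# read either as attributes or with item access.
if __name__ == "__main__":
    print(config.tree_tagger_home)              # "/Users/jacquesfize/.tree-tagger/cmd/"
    print(config["osm_boundaries_directory"])   # "/Users/jacquesfize/install"
    print(config.core_nlp_URL)                  # "http://localhost:9000"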
# coding = utf-8
\ No newline at end of file
# coding = utf-8
from ner.ner import *
import copy
import numpy as np
class Disambiguator(object):
def __init__(self):
"""Constructor for Disambiguator"""
pass
def extract_se_entities(self, input):
out = self.parse_corpus(input)
en_ = out[out[:, 1] == NER._unified_tag["place"]]
return np.unique(en_[:, 0])
def parse_corpus(self, corpus):
final_corpus = []
t = 0
placeTag = NER._unified_tag["place"]
while t < len(corpus):
tag = copy.copy(corpus[t])
if tag[1] == "BEG-" + NER._unified_tag["place"] or tag[1] == placeTag:
lenw = 1
if tag[1] == "BEG-" + placeTag:
compound_tag = tag[0]
t += 1
while t < len(corpus) and (corpus[t][1] == "END-" + placeTag or corpus[t][1] == placeTag):
tag = copy.copy(corpus[t])
if tag[0].endswith("-") or compound_tag.endswith("-"):
compound_tag += tag[0]
else:
compound_tag += " " + tag[0]
t += 1
lenw += 1
tag[0] = compound_tag
tag[1] = placeTag
t += 1
else:
t += 1
final_corpus.append(tag)
return np.array(final_corpus)
def disambiguate(self,ner_result):
pass
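# Illustrative sketch (not in the original file): parse_corpus merges tokens
# tagged BEG-<place> ... END-<place> into a single compound entity, and
# extract_se_entities keeps the unique place strings. The exact place tag is
# whatever NER._unified_tag["place"] resolves to at runtime.
if __name__ == "__main__":
    place = NER._unified_tag["place"]
    demo = [
        ["New", "BEG-" + place],
        ["York", "END-" + place],
        ["is", "O"],
        ["huge", "O"],
    ]
    print(Disambiguator().extract_se_entities(demo))  # expected: ['New York']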
# coding = utf-8
from .disambiguator import Disambiguator
from helpers.collision_with_gazetteer_data import *
from helpers.gazeteer_helpers import *
import math
class GauravGeodict(Disambiguator):
def __init__(self):
Disambiguator.__init__(self)
def fib_formula(self,n):
if n in [0,1]: return 0 # Modifying fibonacci behaviour
golden_ratio = (1 + math.sqrt(5)) / 2
val = (golden_ratio ** n - (1 - golden_ratio) ** n) / math.sqrt(5)
return int(round(val))
def inclusion_log(self,x, alpha=0.2):
return math.log(x)
def get_inclusion_tree(self, id_, prop):
"""
For an entity, return its geographical inclusion tree using a property.
"""
arr = []
current_entity = get_data(id_)
while True:
if prop in current_entity:
arr.append(current_entity[prop][0])
current_entity = get_data(current_entity[prop][0])
else:
arr.append('Q2') # Earth ID
break
return arr
def get_inclusion_score(self,id1, id2): # is it really inclusion ? :)
list1 = self.get_inclusion_tree(id1, 'P131')
list2 = self.get_inclusion_tree(id2, 'P131')
interP131 = len(list(set(list1).intersection(list2)))
list1 = self.get_inclusion_tree(id1, 'P706')
list2 = self.get_inclusion_tree(id2, 'P706')
interP706 = len(list(set(list1).intersection(list2)))
# return fib_no[interP131]+fib_no[interP706]
return self.inclusion_log(interP131) + self.inclusion_log(interP706)
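# Worked example (hypothetical chains, not gazetteer data): if the P131 trees
# of two entities share three ancestors (same department, same region and
# 'Q2') while their P706 trees only share 'Q2', the score is
# log(3) + log(1) ≈ 1.10. Both trees always terminate with 'Q2', so the
# intersections are never empty and math.log never receives 0.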
def Adjacency_P47(self,id1, id2):
data_1, data_2 = get_data(id1), get_data(id2)
if "P47" in data_1 and "P47" in data_2:
if id1 in data_2["P47"] or id2 in data_1["P47"]:
return True
return False
def Adjacency_Hull(self,id1, id2):
return collisionTwoSEBoundaries(id1, id2)
def disambiguateOne(self,spat_candidates, fixed_entities):
score_dc = {}
for cand in spat_candidates:
id_cand = cand["id"]
score_dc[id_cand] = 0
for fixed in fixed_entities:
id_fixed = fixed_entities[fixed]["id"]
if self.Adjacency_P47(id_cand, id_fixed):
score_dc[id_cand] += 3
if self.Adjacency_Hull(id_cand, id_fixed):
score_dc[id_cand] += 2
score_dc[id_cand] += self.get_inclusion_score(id_cand, id_fixed)
m = max(score_dc, key=score_dc.get)
if score_dc[m] < 4:
return None
for cand in spat_candidates:
if cand["id"] == m:
return cand["id"]
def disambiguate(self,ner_result,lang="en"):
se_ = self.extract_se_entities(ner_result)
selected_en = {}
fixed_entities = {}
ambiguous_entities = {}
for en in se_:
request = get_by_label(en, lang)
if len(request) ==0:
request = get_by_alias(en, lang)
if len(request) > 1:
ambiguous_entities[en] = [r["_source"] for r in request]
elif len(request) == 1:
fixed_entities[en] = request[0]["_source"]
d_amb_results = {}
for amb_ent in ambiguous_entities:
d = self.disambiguateOne(ambiguous_entities[amb_ent], fixed_entities)
if not d:
d_amb_results[amb_ent] = get_most_common_id(amb_ent, lang)
else:
d_amb_results[amb_ent] = d
for k, v in fixed_entities.items():
fixed_entities[k] = v["id"]
for k, v in d_amb_results.items():
fixed_entities[k] = v
return fixed_entities
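# Usage sketch (assumes the gazetteer backend behind get_by_label /
# get_by_alias and the boundary data used by collisionTwoSEBoundaries are
# available, and that `ner_output` is a NER result in the unified
# (token, tag) format consumed by Disambiguator.parse_corpus):
#
#     ids = GauravGeodict().disambiguate(ner_output, lang="en")
#     # -> {entity label: gazetteer id, ...}
#
# Unambiguous labels are fixed first; each ambiguous candidate is then scored
# against the fixed entities (+3 for P47 adjacency, +2 for hull collision,
# plus the inclusion score). A candidate needs a score of at least 4,
# otherwise get_most_common_id is used as a fallback.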
# coding = utf-8
from .disambiguator import Disambiguator
from ner.ner import *
import copy
import numpy as np
from helpers.gazeteer_helpers import get_most_common_id, label_exists, alias_exists, get_most_common_id_alias
class PageRankDisambiguator(Disambiguator):
def __init__(self):
Disambiguator.__init__(self)
def disambiguate(self, ner_result, lang="en"):
se_ = self.extract_se_entities(ner_result)
selected_en = {}
for en in se_:
if label_exists(en, lang):
id_ = get_most_common_id(en, lang)
selected_en[id_] = en
elif alias_exists(en,lang):
id_ = get_most_common_id_alias(en, lang)
selected_en[id_] = en
return selected_en
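# Usage sketch (assumes the same gazetteer helpers as above): this
# disambiguator simply maps every recognised place label to its most common
# gazetteer id, checking labels first and aliases second.
#
#     selected = PageRankDisambiguator().disambiguate(ner_output, lang="en")
#     # -> {gazetteer id: entity label, ...}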
epidemio.json 0 → 100644
+2017 -0
This diff is collapsed.
# coding=utf-8
import json
import numpy as np
class GeneralStat(object):
"""docstring f#or GeneralStat."""
def __init__(self, doc_id2text, doc_id2label):
self.doc_id2text = doc_id2text
self.doc_id2label = doc_id2label
def similarity(self, doc1_id,doc2_id):
return 0.0
def avg_precision_at_n(self,n):
res = np.array([])
for i in self.doc_id2text.keys():
res = np.append(res,self.precision_at_n(i,n))
return np.mean(res)
def precision_at_n(self,doc_id,n):
result = []
for k in self.doc_id2text:
if k != doc_id:
result.append((k, self.similarity(doc_id,k)))
res = np.array(result, dtype=[("i", int), ("j", float)])
res = np.sort(res, order="j")[::-1][:n]
relev = 0
lab_rel = self.doc_id2label[doc_id]
for i in res:
if self.doc_id2label[i[0]] == lab_rel:
relev += 1
return relev / n
def rank_doc(self,doc_id):
result = []
for k in self.doc_id2text:
if k != doc_id:
result.append((k, self.similarity(doc_id,k)))
res = np.array(result, dtype=[("i", int), ("j", float)])
res = np.sort(res, order="j")[::-1]
for i in range(len(res)):
if self.doc_id2label[res[i][0]] == self.doc_id2label[doc_id]:
return i + 1  # 1-based rank, so MRR can use 1/rank directly
#print(111,res,self.doc_id2label[doc_id])
def MRR(self):
res = np.array([])
for i in self.doc_id2text.keys():
try:
res = np.append(res,1/self.rank_doc(i))
except Exception as e:
pass
return np.mean(res)
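# Worked example (hypothetical labels): for a query document with label "A",
# if the 5 most similar documents (by self.similarity) carry the labels
# [A, B, A, A, B], then precision_at_n(query, 5) = 3/5 = 0.6. If the first
# same-label document appears at rank 2, the query contributes 1/2 to MRR().
# Subclasses only need to override similarity(); GeneralStat itself returns
# 0.0 for every pair.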
# coding=utf-8
import numpy as np
from extractor import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from evaluate.evaluate import *
from models.str import *
class BOWStatistics(GeneralStat):
"""
Extract various statistics from a corpus using our spatial representation and similarity measures.
"""
sp_en = None
def __init__(self, corpus_name, doc_id2text, doc_id2label,lang="fr"):
"""
Constructor
Parameters
----------
doc_id2text : dict
Map linking each document id to its text
doc_id2label : dict
Map linking each document id to its label (for IR stats)
"""
super(BOWStatistics, self).__init__(doc_id2text, doc_id2label)
self.doc_id2text = doc_id2text
self.doc_id2label = doc_id2label
self.doc_id2graph, self.not_found = None, None
self.lang = lang
self.corpus_name = corpus_name
if not BOWStatistics.sp_en or BOWStatistics.sp_en.language[:2] != lang:
BOWStatistics.sp_en = SpatialEntityExtractor(language=self.lang)
self.bow_ = None
self.__spatial_e=set([])
self.extract_data()
transformer = TfidfVectorizer(smooth_idf=False,vocabulary=list(self.__spatial_e))
self.id2bow_id, self.corpus = {}, []
i = 0
for k, v in self.doc_id2text.items():
self.corpus.append(v)
self.id2bow_id[k] = i
i += 1
self.bow_ = transformer.fit_transform(self.corpus)
def extract_se(self, text):
"""
Create a spatial graph from a text.
[TODO Description]
Parameters
----------
text : string
Text content you want to transform
occ : boolean
Include cooccurrency relation in graph
adj : boolean
Include adjacency relation in graph
inc : boolean
Include inclusion relation in graph
Returns
-------
STR
graph of the spatial configuration in the text
"""
try:
text = BOWStatistics.sp_en.clean(text)
ann = BOWStatistics.sp_en.tag(text)
output = BOWStatistics.sp_en.parse_output(
ann, text, "tree_tagger")
graph = STR(text=output, lang=self.lang)
places=graph.get_place_order()
#print(places)
if places:
for p in places:self.__spatial_e.add(p)
return " ".join(places)
except Exception as e:
return None
def compute_thematic_similarity(self, index_doc1, index_doc2):
"""
Compute the cosine similarity
"""
try:
v1 = self.bow_[self.id2bow_id[index_doc1]]
v2 = self.bow_[self.id2bow_id[index_doc2]]
except Exception as e:
print("An document id don't exists in the BOW !")
return False
return cosine_similarity(v1, v2)[0][0]
def similarity(self, doc1_id, doc2_id):
try:
return self.compute_thematic_similarity(doc1_id, doc2_id)
except ZeroDivisionError as e:
return 0.0
def extract_data(self):
"""
Generate a spatial graph for each text in the corpus.
[TODO Description]
Parameters
----------
occ : boolean
Include cooccurrency relation in graph
adj : boolean
Include adjacency relation in graph
inc : boolean
Include inclusion relation in graph
"""
self.doc_id2graph = {}
del_node = []
for i in self.doc_id2text:
self.doc_id2text[i] = self.extract_se(self.doc_id2text[i])
if self.doc_id2text[i] is None:
del_node.append(i)
self.not_found = len(del_node)
for i in del_node:
del self.doc_id2label[i]
del self.doc_id2text[i]
def extract_common_statistic(self):
"""
Extract common statistics from the corpus
Statistics:
* Number of documents
* Number of documents without Spatial Entity
* Number of documents with Spatial Entity
* Average Size of a graph
* Standard Deviation of graph size
* Average Document size
* Average Edge Intersection length
* Average Node Intersection length
* Maximum Edge Intersection length
* Maximum Node Intersection length
* Average Node Jaccard Similarity
* Average Edge Jaccard Similarity
* Maximum Similarity between graphs
Returns
-------
dict
statistics dictionary
"""
stats = {}
stats["nb_doc"] = len(self.doc_id2text.keys())
stats["nb_doc_without_SP_EN"] = self.not_found
stats["nb_doc_without_graph"] = self.not_found
stats["nb_doc_with_SP_EN"] = len(
self.doc_id2graph.keys()) - stats["nb_doc_without_SP_EN"]
sim_data = {}
max_sim_V1 = 0.0
documents_size = []
for kg1 in self.doc_id2graph:
g1 = self.doc_id2graph[kg1]
if g1 is None:
continue
documents_size.append(len(self.doc_id2graph[kg1].corpus))
for kg2 in self.doc_id2graph:
g2 = self.doc_id2graph[kg2]
if g1 is None or g2 is None:
continue
if g1 != g2 and not (g1, g2) in sim_data and not (g2, g1) in sim_data:
if len(g1.graph.nodes()) == 0 or len(g2.graph.nodes()) == 0:
continue
if self.doc_id2label[kg1] == self.doc_id2label[kg2]:
sim_data[(kg1, kg2)] = self.similarity(
kg1, kg2)
if sim_data[(kg1, kg2)] > max_sim_V1 and sim_data[(kg1, kg2)] < 1:
max_sim_V1 = sim_data[(kg1, kg2)]
i, size = 0, 0
size_list = []
for kg1 in self.doc_id2graph:
g1 = self.doc_id2graph[kg1]
if g1 is not None:
size_list.append(len(g1.graph.nodes()))
# Graph Size statistics
stats["avg_size"] = np.mean(size_list)
stats["std_size"] = np.std(size_list)
# Average Document size
stats["avg_document_size"] = np.mean(documents_size)
# Max similarity value
stats["max_sim_v1"] = max_sim_V1
return stats
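# Usage sketch (assumes the extractor/STR pipeline and its external taggers
# are installed and configured as in config/config.json):
#
#     stats = BOWStatistics("epidemio", doc_id2text, doc_id2label, lang="fr")
#     print(stats.avg_precision_at_n(5))   # mean precision@5 over the corpus
#     print(stats.MRR())                   # mean reciprocal rank
#
# Each document is reduced to the bag of spatial entities found in it and
# compared with TF-IDF cosine similarity.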
# coding=utf-8
import os
import json
import networkx as nx
import numpy as np
from extractor import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from evaluate.evaluate import *
from models.str import *
from similarity.str_sim import heuristic_1
class STRStatistics(GeneralStat):
"""
Extract various statistics from a corpus using our spatial representation and similarity measures.
"""
sp_en = None
def __init__(self,corpus_name, doc_id2text, doc_id2label, sim_function="jaccard", lang="fr", thematic_=False):
"""
Constructor
Parameters
----------
doc_id2text : dict
Map linking each document id to its text
doc_id2label : dict
Map linking each document id to its label (for IR stats)
"""
super(STRStatistics, self).__init__(doc_id2text, doc_id2label)
self.doc_id2text = doc_id2text
self.doc_id2label = doc_id2label
self.doc_id2graph, self.not_found = None, None
self.lang = lang
self.corpus_name=corpus_name
if not STRStatistics.sp_en or STRStatistics.sp_en.language[:2] != lang:
STRStatistics.sp_en = SpatialEntityExtractor(language=self.lang)
self.sim_func = (sim_function if sim_function in ["hypergeo", "node2vec", "jaccard", "sim_mcs", "sim_mcs_e",
"sim_wgu","heur1"] else "jaccard")
self.bow_ = None
self.only_thematic=False
if thematic_:
transformer = TfidfVectorizer(smooth_idf=False)
self.id2bow_id, self.corpus = {}, []
i = 0
for k, v in self.doc_id2text.items():
self.corpus.append(v)
self.id2bow_id[k] = i
i += 1
self.bow_ = transformer.fit_transform(self.corpus)
def loadGraph(self, text, occ=True, adj=True, inc=True):
"""
Create a spatial graph from a text.
[TODO Description]
Parameters
----------
text : string
Text content you want to transform
occ : boolean
Include cooccurrency relation in graph
adj : boolean
Include adjacency relation in graph
inc : boolean
Include inclusion relation in graph
Returns
-------
STR
graph of the spatial configuration in the text
"""
try:
text = STRStatistics.sp_en.clean(text)
ann = STRStatistics.sp_en.tag(text)
output = STRStatistics.sp_en.parse_output(
ann, text, "tree_tagger")
graph = STR(text=output, lang=self.lang)
graph.extract_names_associated_to_place()
graph.create_place_repr_vector()
graph.create_multi_graph(occ, adj, inc, False)
if len(graph.graph.nodes()) == 0: return None
if self.sim_func == "node2vec":
graph.node2vec_models(num_walks=10, directed=True)
#graph.node2vec_model(num_walks=10, directed=True)
return graph
except Exception as e:
return None
"""
def similarity_jaccard_edge(self,g1, g2):
\"""
Compute jaccard indice between two graph using their edges.
Parameters
----------
g1 : nx.Graph
first graph
g2 : nx.Graph
second graph
Returns
-------
float
jaccard similarity value
\"""
if isinstance(g1, nx.MultiDiGraph): return self.similarity_jaccard_edge_multi(g1, g2)
ed_g1 = g1.edges()
ed_g2 = g2.edges()
union = []
for ed1 in ed_g1:
if ed1 not in union: union.append(ed1)
for ed1 in ed_g2:
if ed1 not in union: union.append(ed1)
if not union:
return 0
inter = []
for ed1 in ed_g1:
if ed1 in ed_g2 and not ed1 in inter:
# print(ed1)
inter.append(ed1)
# print(len(inter) / len(union),len(inter) , len(union))
return len(inter) / len(union)
"""
def transform_edge_data(self,data):
new_ = []
for ed1 in data:
new_.append((ed1[0], ed1[1], ed1[2]["color"]))
return new_
def similarity_jaccard_edge(self,g1, g2):
ed_g1 = self.transform_edge_data(g1.edges(data=True))
ed_g2 = self.transform_edge_data(g2.edges(data=True))
union = []
for ed1 in ed_g1:
if ed1 not in union: union.append(ed1)
for ed1 in ed_g2:
if ed1 not in union: union.append(ed1)
if not union:
return 0
inter = []
for ed1 in ed_g1:
if ed1 in ed_g2 and not ed1 in inter:
inter.append(ed1)
# print(len(inter) / len(union),len(inter) , len(union))
return len(inter) / len(union)
def similarity_jaccard_node(self, g1, g2):
"""
Compute jaccard indice between two graph using their nodes.
Parameters
----------
g1 : nx.Graph
first graph
g2 : nx.Graph
second graph
Returns
-------
float
jaccard similarity value
"""
so_g1 = g1.nodes()
so_g2 = g2.nodes()
union = set(so_g1 + so_g2)
if not union:
return 0
inter = []
for so1 in so_g1:
if so1 in so_g2:
inter.append(so1)
return len(inter) / len(union)
def union_nodes(self, g1, g2):
"""
Compute union of two graph nodes.
Parameters
----------
g1 : nx.Graph
first graph
g2 : nx.Graph
second graph
Returns
-------
list
union
"""
so_g1 = g1.nodes()
so_g2 = g2.nodes()
return set(so_g1 + so_g2)
def union_edges(self, g1, g2):
"""
Compute union of two graph edges.
Parameters
----------
g1 : nx.Graph
first graph
g2 : nx.Graph
second graph
Returns
-------
list
union
"""
ed_g1 = g1.edges()
ed_g2 = g2.edges()
return set(ed_g1 + ed_g2)
def inter_edges(self, g1, g2):
"""
Compute intersection of two graph edges.
Parameters
----------
g1 : nx.Graph
first graph
g2 : nx.Graph
second graph
Returns
-------
list
intersection
"""
ed_g1 = g1.edges()
ed_g2 = g2.edges()
inter = []
for ed1 in ed_g1:
if ed1 in ed_g2:
inter.append(ed1)
return inter
def inter_nodes(self, g1, g2):
"""
Compute intersection of two graph nodes.
Parameters
----------
g1 : nx.Graph
first graph
g2 : nx.Graph
second graph
Returns
-------
list
intersection
"""
so_g1 = g1.nodes()
so_g2 = g2.nodes()
inter = []
for so1 in so_g1:
if so1 in so_g2:
inter.append(so1)
return inter
def similarity_jaccard(self, g1, g2):
"""
Compute a "jaccard" similarity between two graph.
.. math:: \frac{|E_{G_1}\cap E_{G_2}|}{|E_{G_1}\cup E_{G_2}|} \times \frac{|N_{G_1}\cap N_{G_2}|}{|N_{G_1}\cup N_{G_2}|}
Parameters
----------
g1 : nx.Graph
first graph
g2 : nx.Graph
second graph
Returns
-------
float
similarity
"""
return self.similarity_jaccard_node(g1, g2) * self.similarity_jaccard_edge(g1, g2)
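# Worked example (hypothetical graphs): with node sets {A, B, C} and
# {B, C, D}, the node Jaccard index is 2/4 = 0.5; with (coloured) edge sets
# {(A,B), (B,C)} and {(B,C), (C,D)}, the edge index is 1/3; the combined
# similarity is 0.5 * 1/3 ≈ 0.17.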
def mcs(self, g1, g2):
res = nx.MultiDiGraph()
res.add_nodes_from(self.inter_nodes(g1, g2))
res.add_edges_from(self.inter_edges(g1, g2))
return res
def MCS(self, g1, g2):
res = nx.MultiDiGraph()
res.add_nodes_from(self.union_nodes(g1, g2))
res.add_edges_from(self.union_edges(g1, g2))
return res
def s_mcs(self, g1, g2):
"""
A graph distance metric based on the maximal common subgraph, H. Bunke and K. Shearer,
Pattern Recognition Letters, 1998
"""
return len(self.mcs(g1, g2)) / max(len(g1), len(g2))
def s_mcs_with_edge(self, g1, g2):
"""
A graph distance metric based on the maximal common subgraph, H. Bunke and K. Shearer,
Pattern Recognition Letters, 1998
"""
mcs = self.mcs(g1, g2)
len_mcs = len(mcs.nodes()) + len(mcs.edges())
len_g1 = len(g1.nodes()) + len(g1.edges())
len_g2 = len(g2.nodes()) + len(g2.edges())
return (len_mcs) / max(len_g1, len_g2)
def s_wgu(self, g1, g2):
"""
Graph distances using graph union, W.D. Wallis an P.Shoubridge and M. Kraetzl and D. Ray
Pattern Recognition Letters, 2001
"""
return len(self.mcs(g1, g2)) / (len(g1) + len(g2) - len(self.mcs(g1, g2)))
def node2vec_similarity(self, g1_, g2_):
mod1 = g1_.n2vec_models
mod2 = g2_.n2vec_models
g1,g2=g1_.graph,g2_.graph
available = []
for a in mod1:
if a in mod2: available.append(a)
moy = []
jaccard = self.similarity_jaccard_node(g1, g2)
for index in available:
sum_ = 0
l_ = 0
for vec in list(g1.nodes()):
for vec2 in list(g2.nodes()):
if vec == vec2:
sim_topo = \
(cosine_similarity(mod1[index][vec].reshape(1, -1), mod2[index][vec2].reshape(1, -1)))[0][0]
sum_ += sim_topo
l_ += 1
if l_ == 0:
moy.append(0.0)
else:
moy.append(sum_ / l_)
return np.mean(moy) * jaccard
def hyper_geo_similarity(self, g1, g2, data_returned=1):
from models.hypergeo import compareWithHyperGeom
probs = compareWithHyperGeom(g1, g2)
return probs[data_returned]
def compute_thematic_similarity(self, index_doc1, index_doc2):
"""
Compute the cosine similarity
"""
try:
v1 = self.bow_[self.id2bow_id[index_doc1]]
v2 = self.bow_[self.id2bow_id[index_doc2]]
except Exception as e:
print("An document id don't exists in the BOW !")
return False
return cosine_similarity(v1, v2)[0][0]
def similarity(self, doc1_id, doc2_id, bow_=False):
if self.only_thematic:
return self.compute_thematic_similarity(doc1_id, doc2_id)
if self.bow_ is not None and not bow_:
theme_sim = self.compute_thematic_similarity(doc1_id, doc2_id)
return theme_sim + self.similarity(doc1_id, doc2_id, True)
try:
g1, g2 = self.doc_id2graph[doc1_id], self.doc_id2graph[doc2_id]
if not g1 or not g2: return 0.0
if self.sim_func == "jaccard":
return self.similarity_jaccard(g1.graph, g2.graph)
elif self.sim_func == "heur1":
return heuristic_1(g1,g2)
elif self.sim_func == "sim_wgu":
return self.s_wgu(g1.graph, g2.graph)
elif self.sim_func == "sim_mcs_e":
return self.s_mcs_with_edge(g1.graph, g2.graph)
elif self.sim_func == "node2vec":
return self.node2vec_similarity(g1, g2)
elif self.sim_func == "hypergeo":
return self.hyper_geo_similarity(g1.graph, g2.graph)
else:
return self.s_mcs(g1.graph, g2.graph)
except ZeroDivisionError as e:
return 0.0
def generate_graphs(self, occ=True, adj=True, inc=True,min_graph_size=1):
"""
Generate a spatial graph for each text in the corpus.
[TODO Description]
Parameters
----------
occ : boolean
Include cooccurrency relation in graph
adj : boolean
Include adjacency relation in graph
inc : boolean
Include inclusion relation in graph
"""
self.doc_id2graph = {}
del_node = []
for i in self.doc_id2text:
self.doc_id2graph[i] = self.loadGraph(
self.doc_id2text[i], occ, adj, inc)
if self.doc_id2graph[i] is None \
or len(self.doc_id2graph[i].graph.nodes()) < min_graph_size:
del_node.append(i)
self.not_found = len(del_node)
for i in del_node:
del self.doc_id2label[i]
del self.doc_id2text[i]
del self.doc_id2graph[i]
if self.bow_ is not None:
del self.id2bow_id[i]
def extract_common_statistic(self):
"""
Extract common statistics from the corpus
Statistics:
* Number of documents
* Number of documents without Spatial Entity
* Number of documents with Spatial Entity
* Average Size of a graph
* Standard Deviation of graph size
* Average Document size
* Average Edge Intersection length
* Average Node Intersection length
* Maximum Edge Intersection length
* Maximum Node Intersection length
* Average Node Jaccard Similarity
* Average Edge Jaccard Similarity
* Maximum Similarity between graphs
Returns
-------
dict
statistics dictionary
"""
stats = {}
stats["nb_doc"] = len(self.doc_id2graph.keys())
stats["nb_doc_without_SP_EN"] = self.not_found
stats["nb_doc_without_graph"] = self.not_found
stats["nb_doc_with_SP_EN"] = len(
self.doc_id2graph.keys()) - stats["nb_doc_without_SP_EN"]
sim_data = {}
max_sim_V1 = 0.0
documents_size = []
for kg1 in self.doc_id2graph:
g1 = self.doc_id2graph[kg1]
if g1 is None:
continue
documents_size.append(len(self.doc_id2graph[kg1].corpus))
for kg2 in self.doc_id2graph:
g2 = self.doc_id2graph[kg2]
if g1 is None or g2 is None:
continue
if g1 != g2 and not (g1, g2) in sim_data and not (g2, g1) in sim_data:
if len(g1.graph.nodes()) == 0 or len(g2.graph.nodes()) == 0:
continue
if self.doc_id2label[kg1] == self.doc_id2label[kg2]:
sim_data[(kg1, kg2)] = self.similarity(
kg1, kg2)
if sim_data[(kg1, kg2)] > max_sim_V1 and sim_data[(kg1, kg2)] < 1:
max_sim_V1 = sim_data[(kg1, kg2)]
i, size = 0, 0
size_list = []
for kg1 in self.doc_id2graph:
g1 = self.doc_id2graph[kg1]
if g1 is not None:
size_list.append(len(g1.graph.nodes()))
# Graph Size statistics
stats["avg_size"] = np.mean(size_list)
stats["std_size"] = np.std(size_list)
# Average Document size
stats["avg_document_size"] = np.mean(documents_size)
# Max similarity value
stats["max_sim_v1"] = max_sim_V1
return stats
def save_graph_data(self,occ,adj,inc,min_graph_size):
path_temp_dir = "temp_/{4}/{0}_{1}_{2}_{3}/".format(int(occ), int(adj), int(inc), min_graph_size,
self.corpus_name)
if not os.path.exists(path_temp_dir):
os.makedirs(path_temp_dir)
os.makedirs(path_temp_dir+"text")
os.makedirs(path_temp_dir + "graph")
for i in self.doc_id2graph:
nx.write_gexf(self.doc_id2graph[i].graph,path_temp_dir+"graph/{0}".format(i))
open(path_temp_dir + "corpus.json",'w').write(json.dumps(self.doc_id2text))
open(path_temp_dir + "labels.json", 'w').write(json.dumps(self.doc_id2label))
# coding = utf-8
from termcolor import colored
class NotADisambiguatorInstance(Exception):
def __init__(self):
# Call the base class constructor with the parameters it needs
super(Exception, self).__init__(colored("Setting disambiguator: Give a Disambiguator or Disambiguator sub-class instance","red"))
# coding = utf-8
from termcolor import colored
class LanguageNotAvailable(Exception):
def __init__(self, lang, object):
# Call the base class constructor with the parameters it needs
super(Exception, self).__init__("{0} not available for {1}".format(colored(lang,"red"), colored(object.__class__.__name__,"magenta")))
#coding = utf-8
from termcolor import colored
class ClassifierNotFound(Exception):
def __init__(self, file):
# Call the base class constructor with the parameters it needs
super(Exception, self).__init__("Classifier at {0} doesn't exists. Check your configuration file !".format(colored(file,"red")))
class BinairyDirectoryNotFound(Exception):
def __init__(self, dir,object):
# Call the base class constructor with the parameters it needs
super(Exception, self).__init__("Binairies for {0} at {1} doesn't exists. Check your configuration file !".format(colored(object.__class__.__name__,"magenta"),colored(dir,"red")))
class NotANERInstance(Exception):
def __init__(self):
# Call the base class constructor with the parameters it needs
super(Exception, self).__init__(colored("Setting Named Entity Recognizer: Give a NER or NER sub-class instance","red"))
# coding = utf-8
from termcolor import colored
class NotATaggerInstance(Exception):
def __init__(self):
# Call the base class constructor with the parameters it needs
super(Exception, self).__init__(colored("Setting pos-tagger: Give a Tagger or Tagger sub-class instance","red"))
import numpy as np
"""
Source : https://hackmd.io/s/ryFmIZrsl#
"""
def is_separating_axis(o, p1, p2):
"""
Return True and the push vector if o is a separating axis of p1 and p2.
Otherwise, return False and None.
"""
min1, max1 = float('+inf'), float('-inf')
min2, max2 = float('+inf'), float('-inf')
for v in p1:
projection = np.dot(v, o)
min1 = min(min1, projection)
max1 = max(max1, projection)
for v in p2:
projection = np.dot(v, o)
min2 = min(min2, projection)
max2 = max(max2, projection)
if max1 >= min2 and max2 >= min1:
d = min(max2 - min1, max1 - min2)
# push a bit more than needed so the shapes do not overlap in future
# tests due to float precision
d_over_o_squared = d/np.dot(o, o) + 1e-10
pv = d_over_o_squared*o
return False, pv
else:
return True, None
def edges_of(vertices):
"""
Return the vectors for the edges of the polygon p.
p is a polygon.
"""
edges = []
N = len(vertices)
for i in range(N):
edge = vertices[(i + 1)%N] - vertices[i]
edges.append(edge)
return edges
def orthogonal(v):
"""
Return a 90 degree clockwise rotation of the vector v.
"""
return np.array([-v[1], v[0]])
def collide(p1, p2):
'''
Return True and the MPV if the shapes collide. Otherwise, return False and
None.
p1 and p2 are lists of ordered pairs, the vertices of the polygons in the
counterclockwise direction.
'''
p1 = [np.array(v, 'float64') for v in p1]
p2 = [np.array(v, 'float64') for v in p2]
edges = edges_of(p1)
edges += edges_of(p2)
orthogonals = [orthogonal(e) for e in edges]
push_vectors = []
for o in orthogonals:
separates, pv = is_separating_axis(o, p1, p2)
if separates:
# they do not collide and there is no push vector
return False, None
else:
push_vectors.append(pv)
# they do collide and the push_vector with the smallest length is the MPV
mpv = min(push_vectors, key=(lambda v: np.dot(v, v)))
# assert mpv pushes p1 away from p2
d = centers_displacement(p1, p2) # direction from p1 to p2
if np.dot(d, mpv) > 0: # if it's the same direction, then invert
mpv = -mpv
return True, mpv
def centers_displacement(p1, p2):
"""
Return the displacement between the geometric center of p1 and p2.
"""
# geometric center
c1 = np.mean(np.array(p1), axis=0)
c2 = np.mean(np.array(p2), axis=0)
return c2 - c1
\ No newline at end of file
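# Quick self-check (not in the original file): two overlapping unit squares
# collide and yield a minimum push vector, while a distant square does not.
# Vertices are given counterclockwise, as required by collide().
if __name__ == "__main__":
    square_a = [(0, 0), (1, 0), (1, 1), (0, 1)]
    square_b = [(0.5, 0.5), (1.5, 0.5), (1.5, 1.5), (0.5, 1.5)]
    square_c = [(5, 5), (6, 5), (6, 6), (5, 6)]
    print(collide(square_a, square_b))  # (True, minimum push vector)
    print(collide(square_a, square_c))  # (False, None)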