From 0cd549d91f9bcede5b6b653925eef158e5b9f227 Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Thu, 17 May 2018 16:26:06 +0200
Subject: [PATCH] - Cythonize Gmatch4py

- Debug disambiguation

- Debug spaCy NER API and StanfordNER API

- Add Notebooks for Evaluations
---
 generate_data_csv.py                          |   26 +-
 .../helpers/__init__.py                       |    0
 gmatch4py/helpers/networkx_parser.py          |  148 ++
 gmatch4py/jaccard.py                          |    2 +-
 gmatch4py_cython/__init__.pyx                 |    1 -
 gmatch4py_cython/gmatch4py/__init__.py        |    1 +
 .../{ => gmatch4py}/bag_of_cliques.pyx        |   22 +-
 gmatch4py_cython/{ => gmatch4py}/deltacon.pyx |    0
 .../gmatch4py/exception/__init__.py           |    1 +
 gmatch4py_cython/gmatch4py/ged/__init__.py    |    2 +
 .../{ => gmatch4py}/ged/algorithm/__init__.py |    0
 .../algorithm/abstract_graph_edit_dist.pyx    |   32 +-
 .../ged/algorithm/edge_edit_dist.pyx          |    4 +-
 .../ged/algorithm/graph_edit_dist.pyx         |    7 +-
 .../gmatch4py/ged/approximate_ged.pyx         |   20 +
 .../ged/bipartite_graph_matching_2.pyx        |  150 ++
 .../gmatch4py/ged/graph/__init__.py           |    1 +
 .../{ => gmatch4py}/ged/graph/__init__.pyx    |    0
 .../{ => gmatch4py}/ged/graph/edge_graph.pyx  |    0
 .../gmatch4py/ged/greedy_edit_distance.pyx    |   44 +
 .../gmatch4py/ged/hausdorff_edit_distance.pyx |  156 ++
 gmatch4py_cython/{ => gmatch4py}/jaccard.pyx  |   24 +-
 .../gmatch4py/kernels/__init__.py             |    1 +
 gmatch4py_cython/{ => gmatch4py}/mcs.pyx      |   10 +-
 .../{ => gmatch4py}/vertex_edge_overlap.pyx   |   17 +-
 .../{ => gmatch4py}/vertex_ranking.pyx        |   12 +-
 gmatch4py_cython/setup.py                     |   47 +
 gmatch4py_cython/utils.pyx                    |   94 -
 helpers/classic.py                            |   26 +
 helpers/gazeteer_helpers.py                   |   18 +
 nlp/disambiguator/disambiguator.py            |    4 +-
 nlp/disambiguator/geodict_gaurav.py           |   41 +-
 nlp/disambiguator/most_common.py              |   59 +
 nlp/disambiguator/pagerank.py                 |    4 +-
 nlp/ner/spacy.py                              |   56 +-
 nlp/ner/stanford_ner.py                       |   13 +-
 notebooks/Cython Enhancement on HED.ipynb     | 1311 ++++++++++
 notebooks/EvalDesambiguisationMada.ipynb      |  311 +++
 notebooks/EvalDesambiguisationPADIWEB.ipynb   |  378 +++
 notebooks/EvalTopoMadagascar.ipynb            |  719 ++++++
 notebooks/NER Evaluation.ipynb                |  612 +++--
 notebooks/StanfordMadaAgro.ipynb              |  950 +++++++
 notebooks/corpusmadahard.ipynb                | 2285 ++++++++++++++---
 pipeline.py                                   |    1 +
 temp.py                                       |  181 ++
 45 files changed, 7073 insertions(+), 718 deletions(-)
 rename gmatch4py_cython/ged/__init__.pyx => gmatch4py/helpers/__init__.py (100%)
 create mode 100644 gmatch4py/helpers/networkx_parser.py
 delete mode 100644 gmatch4py_cython/__init__.pyx
 create mode 100644 gmatch4py_cython/gmatch4py/__init__.py
 rename gmatch4py_cython/{ => gmatch4py}/bag_of_cliques.pyx (79%)
 rename gmatch4py_cython/{ => gmatch4py}/deltacon.pyx (100%)
 create mode 100644 gmatch4py_cython/gmatch4py/exception/__init__.py
 create mode 100644 gmatch4py_cython/gmatch4py/ged/__init__.py
 rename gmatch4py_cython/{ => gmatch4py}/ged/algorithm/__init__.py (100%)
 rename gmatch4py_cython/{ => gmatch4py}/ged/algorithm/abstract_graph_edit_dist.pyx (84%)
 rename gmatch4py_cython/{ => gmatch4py}/ged/algorithm/edge_edit_dist.pyx (88%)
 rename gmatch4py_cython/{ => gmatch4py}/ged/algorithm/graph_edit_dist.pyx (93%)
 create mode 100644 gmatch4py_cython/gmatch4py/ged/approximate_ged.pyx
 create mode 100644 gmatch4py_cython/gmatch4py/ged/bipartite_graph_matching_2.pyx
 create mode 100644 gmatch4py_cython/gmatch4py/ged/graph/__init__.py
 rename gmatch4py_cython/{ => gmatch4py}/ged/graph/__init__.pyx (100%)
 rename gmatch4py_cython/{ => gmatch4py}/ged/graph/edge_graph.pyx (100%)
 create mode 100644 gmatch4py_cython/gmatch4py/ged/greedy_edit_distance.pyx
 create mode 100644 gmatch4py_cython/gmatch4py/ged/hausdorff_edit_distance.pyx
 rename gmatch4py_cython/{ => gmatch4py}/jaccard.pyx (77%)
 create mode 100644 gmatch4py_cython/gmatch4py/kernels/__init__.py
 rename gmatch4py_cython/{ => gmatch4py}/mcs.pyx (86%)
 rename gmatch4py_cython/{ => gmatch4py}/vertex_edge_overlap.pyx (79%)
 rename gmatch4py_cython/{ => gmatch4py}/vertex_ranking.pyx (76%)
 create mode 100644 gmatch4py_cython/setup.py
 delete mode 100644 gmatch4py_cython/utils.pyx
 create mode 100644 helpers/classic.py
 create mode 100644 nlp/disambiguator/most_common.py
 create mode 100644 notebooks/Cython Enhancement on HED.ipynb
 create mode 100644 notebooks/EvalDesambiguisationMada.ipynb
 create mode 100644 notebooks/EvalDesambiguisationPADIWEB.ipynb
 create mode 100644 notebooks/EvalTopoMadagascar.ipynb
 create mode 100644 notebooks/StanfordMadaAgro.ipynb
 create mode 100644 temp.py

diff --git a/generate_data_csv.py b/generate_data_csv.py
index 141a0f0..fffe0ff 100644
--- a/generate_data_csv.py
+++ b/generate_data_csv.py
@@ -81,10 +81,7 @@ with ProgressBar(max_value=len(files_glob),widgets=[' [', Timer(), '] ',Bar(),'(
 
         id_=int(re.findall("\d+", fn)[-1])
         df=pd.read_csv(fn)
-        try:
-            df=df[(df["GID"]!='O') & (df.GID.notnull())]
-        except:
-            df = df[(df.GID.notnull())]
+        df = df[~df["GID"].isin(['0', 'o', 'NR', 'O'])]  # ~ inverts the boolean mask
         try:
             count_per_doc[id_]=json.loads(df.groupby("GID").GID.count().to_json())
             associated_es[id_] = df[["GID","text"]].groupby("GID",as_index=False).max().set_index('GID').to_dict()["text"]
@@ -98,22 +95,29 @@ all_es=set([])
 for k,v in associated_es.items():
     for k2 in v:
         all_es.add(k2)
+
 logging.info("Get All Shapes from Database for all ES")
 all_shapes=get_all_shapes(list(all_es))
-#print(all_shapes.keys())
+
 i=0
+def foo_(x):
+    """Return the "en" entry of get_data(x); on failure print x and return None."""
+    try:
+        return get_data(x)["en"]
+    except Exception: print(x)  # best-effort lookup; Ctrl-C is no longer swallowed
 with ProgressBar(max_value=len(files_glob),
                  widgets=[' [', Timer(), '] ', Bar(), '(', Counter(), ')', '(', ETA(), ')']) as pg:
     for fn in files_glob:
 
         id_ = int(re.findall("\d+", fn)[-1])
         df = pd.read_csv(fn)
-        try:
-            df = df[(df["GID"] != 'O') & (df.GID.notnull())]
-        except:
-            df = df[(df.GID.notnull())]
-
-        df["label"]=df.GID.apply(lambda x:get_data(x)["en"])
+        # Drop the rows whose GID is one of the listed tags (presumably the
+        # "no entity" markers); `~` is the documented boolean-mask inversion.
+        # NOTE(review): the try/except this replaces also dropped rows with a
+        # null GID. isin() does not match nulls, so such rows now reach foo_,
+        # which prints the id and yields None as label -- confirm this is wanted.
+        df = df[~df["GID"].isin(['0', 'o', 'NR', 'O'])]
+        df["label"]=df.GID.apply(foo_)
         df = df.rename(columns={"GID": "id"})
         str_=STR.from_pandas(df,[],all_shapes).build()
         nx.write_gexf(str_, args.graphs_output_dir + "/{0}.gexf".format(id_))
diff --git a/gmatch4py_cython/ged/__init__.pyx b/gmatch4py/helpers/__init__.py
similarity index 100%
rename from gmatch4py_cython/ged/__init__.pyx
rename to gmatch4py/helpers/__init__.py
diff --git a/gmatch4py/helpers/networkx_parser.py b/gmatch4py/helpers/networkx_parser.py
new file mode 100644
index 0000000..d67049a
--- /dev/null
+++ b/gmatch4py/helpers/networkx_parser.py
@@ -0,0 +1,148 @@
+# coding = utf-8
+
+import networkx as nx
+import graph_tool as gt
+
+
+
+def get_prop_type(value, key=None):
+    """
+    Pick the graph_tool PropertyMap type name matching a Python value.
+
+    The value is converted where the chosen type needs it; the key is handed
+    back exactly as it was received.
+
+    :param value: attribute value to classify
+    :param key: attribute name (optional)
+    :return: tuple (type name, converted value, key)
+    """
+    # bool goes first: it is a subclass of int and would match the next test
+    if isinstance(value, bool):
+        return 'bool', value, key
+
+    # integers are widened to float, so both numeric kinds share one type
+    if isinstance(value, int):
+        return 'float', float(value), key
+
+    # floats pass through unchanged
+    if isinstance(value, float):
+        return 'float', value, key
+
+    # dictionaries are kept as they are, under the 'object' type
+    if isinstance(value, dict):
+        return 'object', value, key
+
+    # strings, and any type not handled above, go through str()
+    return 'string', str(value), key
+
+
+def nx2gt(nxG):
+    """
+    Converts a networkx graph to a graph-tool graph.
+
+    Node ids are kept as strings in the 'id' vertex property; nxG is only read.
+    :param nxG: networkx graph to convert
+    :return: the new graph-tool graph
+    """
+    # Phase 0: Create a directed or undirected graph-tool Graph
+    gtG = gt.Graph(directed=nxG.is_directed())
+
+    # Add the Graph properties as "internal properties"
+    for key, value in nxG.graph.items():
+        # Convert the value and key into a type for graph-tool
+        tname, value, key = get_prop_type(value, key)
+
+        prop = gtG.new_graph_property(tname) # Create the PropertyMap
+        gtG.graph_properties[key] = prop     # Set the PropertyMap
+        gtG.graph_properties[key] = value    # Set the actual value
+
+    # Phase 1: Add the vertex and edge property maps
+    # Go through all nodes and edges and add seen properties
+    # Add the node properties first
+    nprops = set() # cache keys to only add properties once
+    for node, data in nxG.nodes(data=True): # nodes(data=True) works in networkx 1.x and 2.x
+        # Go through all the properties if not seen and add them.
+        for key, val in data.items():
+            if key in nprops: continue # Skip properties already added
+
+            # Convert the value and key into a type for graph-tool
+            tname, _, key  = get_prop_type(val, key)
+
+            prop = gtG.new_vertex_property(tname) # Create the PropertyMap
+            gtG.vertex_properties[key] = prop     # Set the PropertyMap
+
+            # Add the key to the already seen properties
+            nprops.add(key)
+
+    # Also add the node id: in NetworkX a node can be any hashable type, but
+    # in graph-tool node are defined as indices. So we capture any strings
+    # in a special PropertyMap called 'id' -- modify as needed!
+    gtG.vertex_properties['id'] = gtG.new_vertex_property('string')
+
+    # Add the edge properties second
+    eprops = set() # cache keys to only add properties once
+    for src, dst, data in nxG.edges(data=True): # edges(data=True) works in networkx 1.x and 2.x
+        # Go through all the edge properties if not seen and add them.
+        for key, val in data.items():
+            if key in eprops: continue # Skip properties already added
+
+            # Convert the value and key into a type for graph-tool
+            tname, _, key = get_prop_type(val, key)
+
+            prop = gtG.new_edge_property(tname) # Create the PropertyMap
+            gtG.edge_properties[key] = prop     # Set the PropertyMap
+
+            # Add the key to the already seen properties
+            eprops.add(key)
+
+    # Phase 2: Actually add all the nodes and vertices with their properties
+    # Add the nodes
+    vertices = {} # vertex mapping for tracking edges later
+    for node, data in nxG.nodes(data=True):
+        # Create the vertex and annotate for our edges later
+        v = gtG.add_vertex()
+        vertices[node] = v
+
+        # Set the vertex properties, not forgetting the id property. The id is
+        # merged into a copy so the attribute dict owned by nxG is not written to.
+        for key, value in dict(data, id=str(node)).items():
+            gtG.vp[key][v] = value # vp is short for vertex_properties
+
+    # Add the edges
+    for src, dst, data in nxG.edges(data=True):
+        # Look up the vertex structs from our vertices mapping and add edge.
+        e = gtG.add_edge(vertices[src], vertices[dst])
+
+        # Add the edge properties
+        for key, value in data.items():
+            gtG.ep[key][e] = value # ep is short for edge_properties
+
+    # Done, finally!
+    return gtG
+
+
+if __name__ == '__main__':
+    # Demo: build a small labelled graph, convert it and list its properties
+    # Create the networkx graph
+    nxG = nx.Graph(name="Undirected Graph")
+    nxG.add_node("v1", name="alpha", color="red")
+    nxG.add_node("v2", name="bravo", color="blue")
+    nxG.add_node("v3", name="charlie", color="blue")
+    nxG.add_node("v4", name="hub", color="purple")
+    nxG.add_node("v5", name="delta", color="red")
+    nxG.add_node("v6", name="echo", color="red")
+
+    nxG.add_edge("v1", "v2", weight=0.5, label="follows")
+    nxG.add_edge("v1", "v3", weight=0.25, label="follows")
+    nxG.add_edge("v2", "v4", weight=0.05, label="follows")
+    nxG.add_edge("v3", "v4", weight=0.35, label="follows")
+    nxG.add_edge("v5", "v4", weight=0.65, label="follows")
+    nxG.add_edge("v6", "v4", weight=0.53, label="follows")
+    nxG.add_edge("v5", "v6", weight=0.21, label="follows")
+    # edges(data=True) is available in networkx 1.x and 2.x (edges_iter is 1.x only)
+    for item in nxG.edges(data=True):
+        print(item)
+
+    # Convert to graph-tool graph
+    gtG = nx2gt(nxG)
+    gtG.list_properties()
\ No newline at end of file
diff --git a/gmatch4py/jaccard.py b/gmatch4py/jaccard.py
index bb4c61b..1676ff1 100644
--- a/gmatch4py/jaccard.py
+++ b/gmatch4py/jaccard.py
@@ -44,7 +44,7 @@ class Jaccard():
     def union_nodes(g1, g2):
         union=set([])
         for n in g1.nodes():union.add(n)
-        for n in g2.nodes(): union.add(n)
+        for n in g2.nodes():union.add(n)
         return union
 
     @staticmethod
diff --git a/gmatch4py_cython/__init__.pyx b/gmatch4py_cython/__init__.pyx
deleted file mode 100644
index a4e2017..0000000
--- a/gmatch4py_cython/__init__.pyx
+++ /dev/null
@@ -1 +0,0 @@
-__version__ = "0.1"
diff --git a/gmatch4py_cython/gmatch4py/__init__.py b/gmatch4py_cython/gmatch4py/__init__.py
new file mode 100644
index 0000000..950f635
--- /dev/null
+++ b/gmatch4py_cython/gmatch4py/__init__.py
@@ -0,0 +1 @@
+# coding = utf-8
\ No newline at end of file
diff --git a/gmatch4py_cython/bag_of_cliques.pyx b/gmatch4py_cython/gmatch4py/bag_of_cliques.pyx
similarity index 79%
rename from gmatch4py_cython/bag_of_cliques.pyx
rename to gmatch4py_cython/gmatch4py/bag_of_cliques.pyx
index a91827c..f297507 100644
--- a/gmatch4py_cython/bag_of_cliques.pyx
+++ b/gmatch4py_cython/gmatch4py/bag_of_cliques.pyx
@@ -5,7 +5,7 @@ from typing import Sequence
 
 import networkx as nx
 import numpy as np
-
+cimport numpy as np
 
 class BagOfCliques():
 
@@ -14,7 +14,7 @@ class BagOfCliques():
         b=BagOfCliques()
         bog=b.getBagOfCliques(graphs)
         #Compute cosine similarity
-        scores=np.dot(bog,bog.T)
+        cdef np.ndarray scores=np.dot(bog,bog.T)
         for i in range(len(scores)):
             for j in range(len(scores)):
                 scores[i,j]/=(np.sqrt(np.sum(bog[i]**2))*np.sqrt(np.sum(bog[j]**2))) # Can be computed in one line
@@ -27,9 +27,11 @@ class BagOfCliques():
         """
         tree = {}
         c_ = 0
-        clique_vocab = []
+        cdef list clique_vocab = []
+        cdef list cli_temp
+        cdef list cliques
         for g in graphs:
-            cliques = list(nx.algorithms.clique.find_cliques(nx.Graph(g)))
+            cliques = list(nx.find_cliques(nx.Graph(g)))
             for clique in cliques:
                 t = tree
                 cli_temp = copy.deepcopy(clique)
@@ -55,7 +57,7 @@ class BagOfCliques():
         return clique_vocab
 
 
-    def ifHaveMinor(self,G: nx.Graph, H: list):
+    def ifHaveMinor(self,G, list H):
         """
         If a clique (minor) H belong to a graph G
         :param H:
@@ -66,16 +68,18 @@ class BagOfCliques():
         return 0
 
 
-    def getBagOfCliques(self,graphs : Sequence[nx.Graph]):
+    def  getBagOfCliques(self,graphs ):
         """
 
         :param clique_vocab:
         :return:
         """
-        clique_vocab=self.getUniqueCliques(graphs)
+        cdef list clique_vocab=self.getUniqueCliques(graphs)
+
+        cdef int l_v=len(clique_vocab)
+        cdef np.ndarray boc = np.zeros((len(graphs), l_v))
+        cdef np.ndarray vector
 
-        l_v=len(clique_vocab)
-        boc = np.zeros((len(graphs), l_v))
         for g in range(len(graphs)):
             gr = graphs[g]
             vector = np.zeros(l_v)
diff --git a/gmatch4py_cython/deltacon.pyx b/gmatch4py_cython/gmatch4py/deltacon.pyx
similarity index 100%
rename from gmatch4py_cython/deltacon.pyx
rename to gmatch4py_cython/gmatch4py/deltacon.pyx
diff --git a/gmatch4py_cython/gmatch4py/exception/__init__.py b/gmatch4py_cython/gmatch4py/exception/__init__.py
new file mode 100644
index 0000000..950f635
--- /dev/null
+++ b/gmatch4py_cython/gmatch4py/exception/__init__.py
@@ -0,0 +1 @@
+# coding = utf-8
\ No newline at end of file
diff --git a/gmatch4py_cython/gmatch4py/ged/__init__.py b/gmatch4py_cython/gmatch4py/ged/__init__.py
new file mode 100644
index 0000000..e5c6c3c
--- /dev/null
+++ b/gmatch4py_cython/gmatch4py/ged/__init__.py
@@ -0,0 +1,2 @@
+# coding = utf-8
+
diff --git a/gmatch4py_cython/ged/algorithm/__init__.py b/gmatch4py_cython/gmatch4py/ged/algorithm/__init__.py
similarity index 100%
rename from gmatch4py_cython/ged/algorithm/__init__.py
rename to gmatch4py_cython/gmatch4py/ged/algorithm/__init__.py
diff --git a/gmatch4py_cython/ged/algorithm/abstract_graph_edit_dist.pyx b/gmatch4py_cython/gmatch4py/ged/algorithm/abstract_graph_edit_dist.pyx
similarity index 84%
rename from gmatch4py_cython/ged/algorithm/abstract_graph_edit_dist.pyx
rename to gmatch4py_cython/gmatch4py/ged/algorithm/abstract_graph_edit_dist.pyx
index e0a1d3b..481ec69 100644
--- a/gmatch4py_cython/ged/algorithm/abstract_graph_edit_dist.pyx
+++ b/gmatch4py_cython/gmatch4py/ged/algorithm/abstract_graph_edit_dist.pyx
@@ -5,9 +5,12 @@ import sys
 
 import numpy as np
 from scipy.optimize import linear_sum_assignment
+cimport numpy as np
 
 
 class AbstractGraphEditDistance(object):
+
+
     def __init__(self, g1, g2,debug=False,**kwargs):
         self.g1 = g1
         self.g2 = g2
@@ -26,12 +29,14 @@ class AbstractGraphEditDistance(object):
         return sum(opt_path)
 
     def print_operations(self,cost_matrix,row_ind,col_ind):
-        nodes1 = self.g1.nodes()
-        nodes2 = self.g2.nodes()
+        cdef list nodes1 = self.g1.nodes()
+        cdef list nodes2 = self.g2.nodes()
         dn1 = self.g1.node
         dn2 = self.g2.node
 
-        n,m=len(nodes1),len(nodes2)
+        cdef int n=len(nodes1)
+        cdef int m=len(nodes2)
+        cdef int x,y,i
         for i in range(len(row_ind)):
             y,x=row_ind[i],col_ind[i]
             val=cost_matrix[row_ind[i]][col_ind[i]]
@@ -43,7 +48,7 @@ class AbstractGraphEditDistance(object):
                 print("DEL {0} cost = {1}".format(dn1[nodes1[m-x]]["label"],val))
 
     def edit_costs(self):
-        cost_matrix = self.create_cost_matrix()
+        cdef np.ndarray cost_matrix = self.create_cost_matrix()
         if self.debug:
             np.set_printoptions(precision=3)
             print("Cost Matrix for ",str(self.__class__.__name__),"\n",cost_matrix)
@@ -51,7 +56,8 @@ class AbstractGraphEditDistance(object):
         row_ind,col_ind = linear_sum_assignment(cost_matrix)
         if self.debug:
             self.print_operations(cost_matrix,row_ind,col_ind)
-        return [cost_matrix[row_ind[i]][col_ind[i]] for i in range(len(row_ind))]
+        cdef int f=len(row_ind)
+        return [cost_matrix[row_ind[i]][col_ind[i]] for i in range(f)]
 
     def create_cost_matrix(self):
         """
@@ -67,13 +73,13 @@ class AbstractGraphEditDistance(object):
 
         The delete -> delete region is filled with zeros
         """
-        n = len(self.g1)
-        m = len(self.g2)
-        cost_matrix = np.zeros((n+m,n+m))
+        cdef int n = len(self.g1)
+        cdef int m = len(self.g2)
+        cdef np.ndarray cost_matrix = np.zeros((n+m,n+m))
         #cost_matrix = [[0 for i in range(n + m)] for j in range(n + m)]
-        nodes1 = self.g1.nodes()
-        nodes2 = self.g2.nodes()
-
+        cdef list nodes1 = self.g1.nodes()
+        cdef list nodes2 = self.g2.nodes()
+        cdef int i,j
         for i in range(n):
             for j in range(m):
                 cost_matrix[i,j] = self.substitute_cost(nodes1[i], nodes2[j])
@@ -89,10 +95,10 @@ class AbstractGraphEditDistance(object):
         self.cost_matrix = cost_matrix
         return cost_matrix
 
-    def insert_cost(self, i, j):
+    def insert_cost(self, int i, int j):
         raise NotImplementedError
 
-    def delete_cost(self, i, j):
+    def delete_cost(self, int i, int j):
         raise NotImplementedError
 
     def substitute_cost(self, nodes1, nodes2):
diff --git a/gmatch4py_cython/ged/algorithm/edge_edit_dist.pyx b/gmatch4py_cython/gmatch4py/ged/algorithm/edge_edit_dist.pyx
similarity index 88%
rename from gmatch4py_cython/ged/algorithm/edge_edit_dist.pyx
rename to gmatch4py_cython/gmatch4py/ged/algorithm/edge_edit_dist.pyx
index 684dc09..80f24e7 100644
--- a/gmatch4py_cython/ged/algorithm/edge_edit_dist.pyx
+++ b/gmatch4py_cython/gmatch4py/ged/algorithm/edge_edit_dist.pyx
@@ -13,12 +13,12 @@ class EdgeEditDistance(AbstractGraphEditDistance):
     def __init__(self, g1, g2,**kwargs):
         AbstractGraphEditDistance.__init__(self, g1, g2,**kwargs)
 
-    def insert_cost(self, i, j, nodes2):
+    def insert_cost(self, int i, int j, nodes2):
         if i == j:
             return self.edge_ins
         return sys.maxsize
 
-    def delete_cost(self, i, j, nodes1):
+    def delete_cost(self, int i, int j, nodes1):
         if i == j:
             return self.edge_del
         return sys.maxsize
diff --git a/gmatch4py_cython/ged/algorithm/graph_edit_dist.pyx b/gmatch4py_cython/gmatch4py/ged/algorithm/graph_edit_dist.pyx
similarity index 93%
rename from gmatch4py_cython/ged/algorithm/graph_edit_dist.pyx
rename to gmatch4py_cython/gmatch4py/ged/algorithm/graph_edit_dist.pyx
index 2c23758..cc38d08 100644
--- a/gmatch4py_cython/ged/algorithm/graph_edit_dist.pyx
+++ b/gmatch4py_cython/gmatch4py/ged/algorithm/graph_edit_dist.pyx
@@ -30,12 +30,12 @@ class GraphEditDistance(AbstractGraphEditDistance):
         else:
             return self.node_ins+self.node_del
 
-    def delete_cost(self, i, j, nodes1):
+    def delete_cost(self, int i, int j, nodes1):
         if i == j:
             return self.node_del+self.g1.degree(nodes1[i]) # Deleting a node implicate to delete in and out edges
         return sys.maxsize
 
-    def insert_cost(self, i, j, nodes2):
+    def insert_cost(self, int i, int j, nodes2):
         if i == j:
             deg=self.g2.degree(nodes2[j])
             if isinstance(deg,dict):deg=0
@@ -44,7 +44,7 @@ class GraphEditDistance(AbstractGraphEditDistance):
             return sys.maxsize
 
     def get_edge_multigraph(self,g,node):
-        edges=[]
+        cdef list edges=[]
         for id_,val in g.edge[node].items():
             if not 0 in val:
                 edges.append(str(id_) + val["color"])
@@ -54,6 +54,7 @@ class GraphEditDistance(AbstractGraphEditDistance):
         return edges
 
     def edge_diff(self, node1, node2):
+        cdef list edges1,edges2
         if isinstance(self.g1,nx.MultiDiGraph):
             edges1 = self.get_edge_multigraph(self.g1,node1)
             edges2 = self.get_edge_multigraph(self.g2,node2)
diff --git a/gmatch4py_cython/gmatch4py/ged/approximate_ged.pyx b/gmatch4py_cython/gmatch4py/ged/approximate_ged.pyx
new file mode 100644
index 0000000..d77f522
--- /dev/null
+++ b/gmatch4py_cython/gmatch4py/ged/approximate_ged.pyx
@@ -0,0 +1,20 @@
+# coding = utf-8
+
+import numpy as np
+
+from .algorithm.graph_edit_dist import GraphEditDistance
+
+
+class ApproximateGraphEditDistance():
+    __type__ = "dist"
+
+    @staticmethod
+    def compare(listgs,c_del_node=1,c_del_edge=1,c_ins_node=1,c_ins_edge=1):
+        """Pairwise distance matrix; each pair is computed once (i <= j) and mirrored, although AGED is not symmetric."""
+        costs = dict(node_del=c_del_node,node_ins=c_ins_node,edge_del=c_del_edge,edge_ins=c_ins_edge)
+        size = len(listgs)
+        comparison_matrix = np.zeros((size,size))
+        for i in range(size):
+            for j in range(i,size):
+                comparison_matrix[i,j] = comparison_matrix[j,i] = GraphEditDistance(listgs[i],listgs[j],False,**costs).distance()
+        return comparison_matrix
\ No newline at end of file
diff --git a/gmatch4py_cython/gmatch4py/ged/bipartite_graph_matching_2.pyx b/gmatch4py_cython/gmatch4py/ged/bipartite_graph_matching_2.pyx
new file mode 100644
index 0000000..a09627a
--- /dev/null
+++ b/gmatch4py_cython/gmatch4py/ged/bipartite_graph_matching_2.pyx
@@ -0,0 +1,150 @@
+# coding = utf-8
+import numpy as np
+cimport numpy as np
+
+cdef class BP_2():
+    """
+    Graph distance built from a greedy node assignment (see psi and bp2).
+    """
+    __type__="dist"
+
+    cdef int node_del
+    cdef int node_ins
+    cdef int edge_del
+    cdef int edge_ins
+
+    @staticmethod
+    def compare(listgs, c_del_node=1, c_del_edge=1, c_ins_node=1, c_ins_edge=1):
+        """Return the n x n matrix of bp2 distances between the graphs of listgs (each pair computed once)."""
+        cdef int n = len(listgs)
+        comparator = BP_2(c_del_node, c_ins_node, c_del_edge, c_ins_edge)
+        cdef np.ndarray comparison_matrix = np.zeros((n, n))
+        for i in range(n):
+            for j in range(i, n):
+                comparison_matrix[i, j] = comparator.bp2(listgs[i], listgs[j])
+                comparison_matrix[j, i] = comparison_matrix[i, j]
+
+        return comparison_matrix
+
+    def __init__(self, node_del=1, node_ins=1, edge_del=1, edge_ins=1):
+        """Constructor for BP_2: store the node and edge deletion/insertion costs"""
+        self.node_del = node_del
+        self.node_ins = node_ins
+        self.edge_del = edge_del
+        self.edge_ins = edge_ins
+
+    def bp2(self, g1, g2):
+        """
+        Compute the BP_2 distance between two graphs
+        :param g1: first graph
+        :param g2: second graph
+        :return: the smaller of the two summed greedy assignment costs (g1 to g2, g2 to g1)
+        """
+        return min(self.distance(self.psi(g1,g2)),self.distance(self.psi(g2,g1)))
+
+    def distance(self,e):
+        """Sum a list of edit costs"""
+        return np.sum(e)
+
+    def psi(self,g1,g2):
+        """Greedily match every node of g1 to a node of g2 and return the list of edit costs"""
+        cdef list psi_=[]
+        cdef list nodes1 = g1.nodes()
+        cdef list nodes2 = g2.nodes()
+        for u in nodes1:
+            v=None
+            for w in nodes2:
+                if 2*self.fuv(g1,g2,u,w) < self.fuv(g1,g2,u,None) + self.fuv(g1,g2,None,w)\
+                     and self.fuv(g1,g2,u,w) < self.fuv(g1,g2,u,v):
+                    v=w
+            # One cost per node of g1, recorded once the scan is over: a substitution
+            # when a match was kept, otherwise the deletion of u (also when g2 is empty)
+            psi_.append(self.fuv(g1,g2,u,v))
+            if v is not None:  # not "if v": a falsy node id such as 0 is a valid match
+                nodes2 = [other for other in nodes2 if other != v]  # order-preserving removal
+        for v in nodes2:
+            psi_.append(self.fuv(g1,g2,None,v))
+        return  psi_
+
+
+    def fuv(self, g1, g2, n1, n2):
+        """
+        Compute the Node Distance function
+        :param g1: first graph
+        :param g2: second graph
+        :param n1: node of the first graph
+        :param n2: node of the second graph
+        :return: deletion cost if n2 is None, insertion cost if n1 is None, else substitution cost
+        """
+        # The divisions use 2. because the costs are C ints: a float operand keeps
+        # the halves fractional whatever division rule the module is compiled with
+        if n2 is None:  # Del
+            return self.node_del + ((self.edge_del / 2.) * g1.degree(n1))
+        if n1 is None:  # Insert
+            return self.node_ins + ((self.edge_ins / 2.) * g2.degree(n2))
+        if n1 == n2:
+            return 0.
+        return (self.node_del + self.node_ins + self.hed_edge(g1, g2, n1, n2)) / 2.
+
+    def hed_edge(self, g1, g2, n1, n2):
+        """
+        Compute HEDistance between edges of n1 and n2, respectively in g1 and g2
+        :param g1: first graph
+        :param g2: second graph
+        :param n1: node of the first graph
+        :param n2: node of the second graph
+        :return: sum of the nearest-neighbour edge costs taken in both directions
+        """
+        # Both directions: edges of n1 matched against those of n2, then the reverse
+        return self.sum_gpq(g1, n1, g2, n2) + self.sum_gpq(g2, n2, g1, n1)
+
+    def get_edge_multigraph(self, g, node):
+        """
+        Get list of edge around a node in a Multigraph
+        :param g: multigraph
+        :param node: node in the multigraph
+        :return: list of "source-target-color" strings, one per edge touching node
+        """
+        edges = []
+        for edge in g.edges(data=True):
+            if node == edge[0] or node == edge[1]:
+                edges.append("{0}-{1}-{2}".format(edge[0],edge[1],edge[2]["color"]))
+        return edges
+
+    def sum_gpq(self, g1, n1, g2, n2):
+        """
+        Compute Nearest Neighbour Distance between edges around n1 in G1  and edges around n2 in G2
+        :param g1: first graph
+        :param n1: node in the first graph
+        :param g2: second graph
+        :param n2: node in the second graph
+        :return: sum, over the edges around n1, of the cheapest gpq cost against the edges around n2
+        """
+        cdef list edges1 = self.get_edge_multigraph(g1, n1)
+        cdef list edges2 = self.get_edge_multigraph(g2, n2)
+        # None is the "no counterpart" candidate, so every edge can fall back to edge_del
+        edges2.append(None)
+        cdef np.ndarray min_sum = np.zeros(len(edges1))
+        for i in range(len(edges1)):
+            min_i = np.zeros(len(edges2))
+            for j in range(len(edges2)):
+                min_i[j] = self.gpq(edges1[i], edges2[j])
+            min_sum[i] = np.min(min_i)
+        return np.sum(min_sum)
+
+    def gpq(self, e1, e2):
+        """
+        Compute the edge distance function
+        :param e1: edge1
+        :param e2: edge2
+        :return: edge_del if e2 is None, edge_ins if e1 is None, 0 for equal edges, else the mean of both costs
+        """
+        if e2 is None:  # Del
+            return self.edge_del
+        if e1 is None:  # Insert
+            return self.edge_ins
+        if e1 == e2:
+            return 0.
+        # float divisor: the two costs are C ints and their mean may be fractional
+        return (self.edge_del + self.edge_ins) / 2.
+
diff --git a/gmatch4py_cython/gmatch4py/ged/graph/__init__.py b/gmatch4py_cython/gmatch4py/ged/graph/__init__.py
new file mode 100644
index 0000000..950f635
--- /dev/null
+++ b/gmatch4py_cython/gmatch4py/ged/graph/__init__.py
@@ -0,0 +1 @@
+# coding = utf-8
\ No newline at end of file
diff --git a/gmatch4py_cython/ged/graph/__init__.pyx b/gmatch4py_cython/gmatch4py/ged/graph/__init__.pyx
similarity index 100%
rename from gmatch4py_cython/ged/graph/__init__.pyx
rename to gmatch4py_cython/gmatch4py/ged/graph/__init__.pyx
diff --git a/gmatch4py_cython/ged/graph/edge_graph.pyx b/gmatch4py_cython/gmatch4py/ged/graph/edge_graph.pyx
similarity index 100%
rename from gmatch4py_cython/ged/graph/edge_graph.pyx
rename to gmatch4py_cython/gmatch4py/ged/graph/edge_graph.pyx
diff --git a/gmatch4py_cython/gmatch4py/ged/greedy_edit_distance.pyx b/gmatch4py_cython/gmatch4py/ged/greedy_edit_distance.pyx
new file mode 100644
index 0000000..96478dd
--- /dev/null
+++ b/gmatch4py_cython/gmatch4py/ged/greedy_edit_distance.pyx
@@ -0,0 +1,44 @@
+# coding = utf-8
+import numpy as np
+
+from .algorithm.graph_edit_dist import GraphEditDistance
+cimport numpy as np
+
+class GreedyEditDistance(GraphEditDistance):
+    """
+    Implementation of the Greedy Edit Distance presented in :
+
+    Improved quadratic time approximation of graph edit distance by Hausdorff matching and greedy assignement
+    Andreas Fischer, Kaspar Riesen, Horst Bunke
+    2016
+    """
+    __type__ = "dist"
+    @staticmethod
+    def compare(listgs, c_del_node=1, c_del_edge=1, c_ins_node=1, c_ins_edge=1):
+        """Return the n x n matrix of greedy edit distances; pairs are computed for i <= j and mirrored"""
+        cdef int n = len(listgs)
+        cdef np.ndarray comparison_matrix = np.zeros((n, n))
+        for i in range(n):
+            for j in range(i, n):
+                comparison_matrix[i, j] = GreedyEditDistance(listgs[i], listgs[j],False, node_del=c_del_node,
+                                                            node_ins=c_ins_node, edge_del=c_del_edge,
+                                                            edge_ins=c_ins_edge).distance()
+                comparison_matrix[j, i] = comparison_matrix[i, j]
+        return comparison_matrix
+
+    def __init__(self,g1,g2,debug=False,**kwargs):
+        """Constructor for GreedyEditDistance"""
+        super().__init__(g1,g2,debug,**kwargs)
+
+    def edit_costs(self):
+        """Return the cost of each greedy assignment: every row takes its cheapest free column"""
+        cdef np.ndarray cost_matrix=self.create_cost_matrix()
+        # original indices of the columns that are still unassigned
+        cdef list free_cols=list(range(cost_matrix.shape[1]))
+        cdef list costs=[]
+        for i in range(len(cost_matrix)):
+            phi_i=int(np.argmin(cost_matrix[i,free_cols]))
+            # pop() hands back the original column index, whatever was removed before
+            costs.append(cost_matrix[i,free_cols.pop(phi_i)])
+        return costs
+
diff --git a/gmatch4py_cython/gmatch4py/ged/hausdorff_edit_distance.pyx b/gmatch4py_cython/gmatch4py/ged/hausdorff_edit_distance.pyx
new file mode 100644
index 0000000..11eb6c5
--- /dev/null
+++ b/gmatch4py_cython/gmatch4py/ged/hausdorff_edit_distance.pyx
@@ -0,0 +1,156 @@
+# coding = utf-8
+
+
+import numpy as np
+cimport numpy as np
+cdef class HED:
+    """
+    Implementation of Hausdorff Edit Distance described in
+
+    Improved quadratic time approximation of graph edit distance by Hausdorff matching and greedy assignement
+    Andreas Fischer, Kaspar Riesen, Horst Bunke
+    2016
+    """
+
+    cdef int node_del
+    cdef int node_ins
+    cdef int edge_del
+    cdef int edge_ins
+
+    __type__ = "dist"
+    @staticmethod
+    def compare(list listgs, int c_del_node=1, int c_del_edge=1, int c_ins_node=1, int c_ins_edge=1):
+        """Return the symmetric (n, n) matrix of HED values between the graphs of listgs."""
+        cdef int n = len(listgs)
+        comparator = HED(c_del_node, c_ins_node, c_del_edge, c_ins_edge)
+        cdef np.ndarray comparison_matrix = np.zeros((n, n))
+        for i in range(n):
+            for j in range(i, n):
+                comparison_matrix[i, j] = comparator.hed(listgs[i], listgs[j])
+                comparison_matrix[j, i] = comparison_matrix[i, j]
+        return comparison_matrix
+
+
+    def __init__(self, int node_del=1, int node_ins=1, int edge_del=1, int edge_ins=1):
+        """Constructor for HED: store the four edit costs"""
+        self.node_del = node_del
+        self.node_ins = node_ins
+        self.edge_del = edge_del
+        self.edge_ins = edge_ins
+
+    cpdef float hed(self, g1, g2):
+        """
+        Compute the Hausdorff Edit Distance
+        :param g1: first graph
+        :param g2: second graph
+        :return: sum of the two directed nearest-neighbour distances (g1 to g2 and g2 to g1)
+        """
+        return self.sum_fuv(g1, g2) + self.sum_fuv(g2, g1)
+
+    cdef float sum_fuv(self, g1, g2):
+        """
+        Compute Nearest Neighbour Distance between G1 and G2
+        :param g1: First Graph
+        :param g2: Second Graph
+        :return: sum, over the nodes of g1, of the cheapest fuv against a node of g2 or None
+        """
+        cdef np.ndarray min_sum = np.zeros(len(g1))
+        nodes1 = list(g1.nodes())  # list(): nodes() may be a view that cannot be indexed
+        nodes2 = list(g2.nodes())  # private copy, so appending below never touches g2
+        nodes2.append(None)  # None stands for "no counterpart" (node deletion)
+        cdef np.ndarray min_i
+        for i in range(len(nodes1)):
+            min_i = np.zeros(len(nodes2))
+            for j in range(len(nodes2)):
+                min_i[j] = self.fuv(g1, g2, nodes1[i], nodes2[j])
+            min_sum[i] = np.min(min_i)
+        return np.sum(min_sum)
+
+    cdef float fuv(self, g1, g2, n1, n2):
+        """
+        Compute the Node Distance function
+        :param g1: first graph
+        :param g2: second graph
+        :param n1: node of the first graph (None for an insertion)
+        :param n2: node of the second graph (None for a deletion)
+        :return: cost of matching n1 with n2
+        """
+        if n2 is None:  # Del
+            return self.node_del + ((self.edge_del / 2.0) * g1.degree(n1))  # 2.0: "/" on C ints may floor
+        if n1 is None:  # Insert
+            return self.node_ins + ((self.edge_ins / 2.0) * g2.degree(n2))
+        else:
+            if n1 == n2:
+                return 0
+            return (self.node_del + self.node_ins + self.hed_edge(g1, g2, n1, n2)) / 2.0
+
+    cdef float hed_edge(self, g1, g2, n1, n2):
+        """
+        Compute HEDistance between edges of n1 and n2, respectively in g1 and g2
+        :param g1: first graph
+        :param g2: second graph
+        :param n1: node of the first graph
+        :param n2: node of the second graph
+        :return: sum of both directed edge distances (n1's edges to n2's, and n2's to n1's)
+        """
+        return self.sum_gpq(g1, n1, g2, n2) + self.sum_gpq(g2, n2, g1, n1)
+
+    cdef list get_edge_multigraph(self, g, node):
+        """
+        Get list of edge around a node in a Multigraph
+        :param g: multigraph
+        :param node: node in the multigraph
+        :return: one "source-target-color" (or "source-target") string per edge touching node
+        """
+        cdef list edges = []
+        for edge in g.edges(data=True):
+            if node == edge[0] or node == edge[1]:
+                try:
+                    edges.append("{0}-{1}-{2}".format(edge[0],edge[1],edge[2]["color"]))
+                except KeyError:  # edge without a "color" attribute
+                    edges.append("{0}-{1}".format(edge[0],edge[1]))
+        return edges
+
+    cdef float  sum_gpq(self, g1, n1, g2, n2):
+        """
+        Compute Nearest Neighbour Distance between edges around n1 in G1  and edges around n2 in G2
+        :param g1: first graph
+        :param n1: node in the first graph
+        :param g2: second graph
+        :param n2: node in the second graph
+        :return: sum, over the edges around n1, of the cheapest gpq against an edge around n2 or None
+        """
+
+        # Edges are serialised to strings so that gpq can compare them with ==.
+        cdef list edges1 = self.get_edge_multigraph(g1, n1)
+        cdef list edges2 = self.get_edge_multigraph(g2, n2)
+
+        # edges2 gets a trailing None standing for "no counterpart": matching an
+        # edge of edges1 against it costs one edge deletion (see gpq).
+        # The list was freshly built above, so mutating it here is safe.
+
+        cdef np.ndarray min_sum = np.zeros(len(edges1))
+        edges2.append(None)
+        cdef np.ndarray min_i
+        for i in range(len(edges1)):
+            min_i = np.zeros(len(edges2))
+            for j in range(len(edges2)):
+                min_i[j] = self.gpq(edges1[i], edges2[j])
+            min_sum[i] = np.min(min_i)
+        return np.sum(min_sum)
+
+    cdef float gpq(self, str e1, str e2):
+        """
+        Compute the edge distance function
+        :param e1: edge1 (None for an insertion)
+        :param e2: edge2 (None for a deletion)
+        :return: cost of matching e1 with e2
+        """
+        if e2 is None:  # Del
+            return self.edge_del
+        if e1 is None:  # Insert
+            return self.edge_ins
+        else:
+            if e1 == e2:
+                return 0
+            return (self.edge_del + self.edge_ins) / 2.0  # 2.0 keeps odd cost sums from being floored
\ No newline at end of file
diff --git a/gmatch4py_cython/jaccard.pyx b/gmatch4py_cython/gmatch4py/jaccard.pyx
similarity index 77%
rename from gmatch4py_cython/jaccard.pyx
rename to gmatch4py_cython/gmatch4py/jaccard.pyx
index bb4c61b..237acd3 100644
--- a/gmatch4py_cython/jaccard.pyx
+++ b/gmatch4py_cython/gmatch4py/jaccard.pyx
@@ -3,7 +3,7 @@
 # coding = utf-8
 
 import numpy as np
-
+cimport numpy as np
 
 def intersect(a, b):
     return list(set(a) & set(b))
@@ -13,8 +13,10 @@ class Jaccard():
 
     @staticmethod
     def compare(listgs):
-        n = len(listgs)
-        comparison_matrix = np.zeros((n, n))
+        cdef int n = len(listgs)
+        cdef np.ndarray comparison_matrix = np.zeros((n, n))
+        cdef i=0
+        cdef j=0
         for i in range(n):
             for j in range(i,n):
                 g1 = listgs[i]
@@ -31,9 +33,9 @@ class Jaccard():
 
     @staticmethod
     def intersect_edges(g1,g2):
-        ed1 = Jaccard.transform_edges(g1.edges(data=True))
-        ed2 = Jaccard.transform_edges(g2.edges(data=True))
-        inter_ed=[]
+        cdef list ed1 = Jaccard.transform_edges(g1.edges(data=True))
+        cdef list ed2 = Jaccard.transform_edges(g2.edges(data=True))
+        cdef list inter_ed=[]
         for e1 in ed1:
             for e2 in ed2:
                 if e1 == e2:
@@ -42,17 +44,17 @@ class Jaccard():
 
     @staticmethod
     def union_nodes(g1, g2):
-        union=set([])
+        cdef set union=set([])
         for n in g1.nodes():union.add(n)
         for n in g2.nodes(): union.add(n)
         return union
 
     @staticmethod
     def union_edges(g1, g2):
-        ed1 = Jaccard.transform_edges(g1.edges(data=True))
-        ed2 = Jaccard.transform_edges(g2.edges(data=True))
-        union = []
-        register=set([])
+        cdef list ed1 = Jaccard.transform_edges(g1.edges(data=True))
+        cdef list ed2 = Jaccard.transform_edges(g2.edges(data=True))
+        cdef list union = []
+        cdef set register=set([])
         trans_=lambda x : "{0}-{1}:{2}".format(x[0],x[1],x[2]["color"])
         for e1 in ed1:
             if not trans_(e1) in register:
diff --git a/gmatch4py_cython/gmatch4py/kernels/__init__.py b/gmatch4py_cython/gmatch4py/kernels/__init__.py
new file mode 100644
index 0000000..950f635
--- /dev/null
+++ b/gmatch4py_cython/gmatch4py/kernels/__init__.py
@@ -0,0 +1 @@
+# coding = utf-8
\ No newline at end of file
diff --git a/gmatch4py_cython/mcs.pyx b/gmatch4py_cython/gmatch4py/mcs.pyx
similarity index 86%
rename from gmatch4py_cython/mcs.pyx
rename to gmatch4py_cython/gmatch4py/mcs.pyx
index 4b902a3..17a8fd0 100644
--- a/gmatch4py_cython/mcs.pyx
+++ b/gmatch4py_cython/gmatch4py/mcs.pyx
@@ -1,17 +1,17 @@
 # coding = utf-8
 import networkx as nx
 import numpy as np
+cimport numpy as np
 
 class MCS():
     """
     *A graph distance metric based on the maximal common subgraph, H. Bunke and K. Shearer,
     Pattern Recognition Letters, 1998*
     """
-
     @staticmethod
     def compare(listgs):
-        n = len(listgs)
-        comparison_matrix = np.zeros((n, n))
+        cdef int n = len(listgs)
+        cdef np.ndarray comparison_matrix = np.zeros((n, n))
         for i in range(n):
             for j in range(i, n):
                 g1 = listgs[i]
@@ -36,8 +36,8 @@ class MCS():
 
     @staticmethod
     def intersect_edges(g1, g2):
-        ed1 = MCS.transform_edges(g1.edges(data=True))
-        ed2 = MCS.transform_edges(g2.edges(data=True))
+        cdef list ed1 = MCS.transform_edges(g1.edges(data=True))
+        cdef list ed2 = MCS.transform_edges(g2.edges(data=True))
         inter_ed = []
         for e1 in ed1:
             for e2 in ed2:
diff --git a/gmatch4py_cython/vertex_edge_overlap.pyx b/gmatch4py_cython/gmatch4py/vertex_edge_overlap.pyx
similarity index 79%
rename from gmatch4py_cython/vertex_edge_overlap.pyx
rename to gmatch4py_cython/gmatch4py/vertex_edge_overlap.pyx
index 3fe4fc6..f0856b1 100644
--- a/gmatch4py_cython/vertex_edge_overlap.pyx
+++ b/gmatch4py_cython/gmatch4py/vertex_edge_overlap.pyx
@@ -1,9 +1,11 @@
 # coding = utf-8
 
 import numpy as np
+cimport numpy as np
 
 
-def intersect(a, b):
+
+cdef list intersect(a, b):
     return list(set(a) & set(b))
 class VertexEdgeOverlap():
     __type__ = "sim"
@@ -17,9 +19,12 @@ class VertexEdgeOverlap():
     """
 
     @staticmethod
-    def compare(listgs):
+    def compare(list listgs):
         n = len(listgs)
-        comparison_matrix = np.zeros((n, n))
+        cdef np.ndarray comparison_matrix = np.zeros((n, n))
+        cdef list inter_ver
+        cdef list inter_ed
+        cdef int denom
         for i in range(n):
             for j in range(i,n):
                 g1 = listgs[i]
@@ -36,9 +41,9 @@ class VertexEdgeOverlap():
 
     @staticmethod
     def intersect_edges(g1,g2):
-        ed1 = VertexEdgeOverlap.transform_edges(g1.edges(data=True))
-        ed2 = VertexEdgeOverlap.transform_edges(g2.edges(data=True))
-        inter_ed=[]
+        cdef list ed1 = VertexEdgeOverlap.transform_edges(g1.edges(data=True))
+        cdef list ed2 = VertexEdgeOverlap.transform_edges(g2.edges(data=True))
+        cdef list inter_ed=[]
         for e1 in ed1:
             for e2 in ed2:
                 if e1 == e2:
diff --git a/gmatch4py_cython/vertex_ranking.pyx b/gmatch4py_cython/gmatch4py/vertex_ranking.pyx
similarity index 76%
rename from gmatch4py_cython/vertex_ranking.pyx
rename to gmatch4py_cython/gmatch4py/vertex_ranking.pyx
index 5b8c8ef..8f72a4d 100644
--- a/gmatch4py_cython/vertex_ranking.pyx
+++ b/gmatch4py_cython/gmatch4py/vertex_ranking.pyx
@@ -2,6 +2,7 @@
 
 import networkx as nx
 import numpy as np
+cimport numpy as np
 from scipy.stats import spearmanr
 
 
@@ -19,10 +20,13 @@ class VertexRanking():
     """
     __type__ = "sim"
     @staticmethod
-    def compare(listgs):
-        n = len(listgs)
-        comparison_matrix = np.zeros((n,n))
-        page_r=[nx.pagerank(nx.DiGraph(g)) for g in listgs]
+    def  compare(listgs):
+        cdef int n = len(listgs)
+        cdef np.ndarray comparison_matrix = np.zeros((n,n))
+        cdef list page_r=[nx.pagerank(nx.DiGraph(g)) for g in listgs]
+        cdef list node_intersection
+        cdef list X
+        cdef list Y
         for i in range(n):
             for j in range(i,n):
                 node_intersection=intersect(list(page_r[i].keys()),list(page_r[j].keys()))
diff --git a/gmatch4py_cython/setup.py b/gmatch4py_cython/setup.py
new file mode 100644
index 0000000..c8df67b
--- /dev/null
+++ b/gmatch4py_cython/setup.py
@@ -0,0 +1,47 @@
+import sys, os
+from distutils.core import setup
+from distutils.extension import Extension
+
+# we'd better have Cython installed, or it's a no-go
+try:
+    from Cython.Distutils import build_ext
+except:
+    print("You don't seem to have Cython installed. Please get a")
+    print("copy from www.cython.org and install it")
+    sys.exit(1)
+
+
+# scan the 'dvedit' directory for extension files, converting
+# them to extension names in dotted notation
+def scandir(dir, files=[]):
+    for file in os.listdir(dir):
+        path = os.path.join(dir, file)
+        if os.path.isfile(path) and path.endswith(".pyx"):
+            files.append(path.replace(os.path.sep, ".")[:-4])
+        elif os.path.isdir(path):
+            scandir(path, files)
+    return files
+
+
+# generate an Extension object from its dotted name
+def makeExtension(extName):
+    extPath = extName.replace(".", os.path.sep)+".pyx"
+    return Extension(
+        extName,
+        [extPath],
+        extra_compile_args = ["-O3", "-Wall"]
+        )
+
+# get the list of extensions
+extNames = scandir("gmatch4py")
+
+# and build up the set of Extension objects
+extensions = [makeExtension(name) for name in extNames]
+
+# finally, we can pass all this to distutils
+setup(
+  name="gmatch4py_test",
+  packages=["gmatch4py", "gmatch4py.ged","gmatch4py.kernels"],
+  ext_modules=extensions,
+  cmdclass = {'build_ext': build_ext},
+)
\ No newline at end of file
diff --git a/gmatch4py_cython/utils.pyx b/gmatch4py_cython/utils.pyx
deleted file mode 100644
index 656e072..0000000
--- a/gmatch4py_cython/utils.pyx
+++ /dev/null
@@ -1,94 +0,0 @@
-# coding = utf-8
-
-import numpy as np
-from shapely.geometry import Point
-
-from helpers.collision_with_gazetteer_data import collisionTwoSEBoundaries
-from helpers.gazeteer_helpers import get_data
-from models.str import get_inclusion_chain
-
-_cache_distance={}
-def get_nodes_geolocalization(graph):
-    info = {}
-    for node in graph.nodes():
-        if not node in info:
-            info[node] = get_data(node)
-    return info
-
-def is_included_in(se1_id,se2_id):
-    inc_chain_P131 = get_inclusion_chain(se1_id, "P131")
-    inc_chain_P706 = get_inclusion_chain(se1_id, "P706")
-
-    print("mixDEB")
-    inc_chain = inc_chain_P131
-    inc_chain.extend(inc_chain_P706)
-    inc_chain = set(inc_chain)
-    print("mixFIN")
-    if se2_id in inc_chain:
-        return True
-    return False
-
-def are_adjacent(se1,se2):
-    if "P47" in se1:
-        if se2["id"] in se1["P47"]:
-            return True
-    elif collisionTwoSEBoundaries(se1["id"], se2["id"]):
-        return True
-    return False
-
-def geoDistance(info1,info2,n1,n2):
-    if n1 in _cache_distance:
-        if n2 in _cache_distance[n1]:
-            return _cache_distance[n1][n2]
-    if n2 in _cache_distance:
-        if n1 in _cache_distance[n2]:
-            return _cache_distance[n2][n1]
-
-
-    coord1 = (info1["coord"]["lon"], info1["coord"]["lat"])
-    coord2 = (info2["coord"]["lon"], info2["coord"]["lat"])
-    dist=Point(coord1).distance(Point(coord2))
-    if not n1 in _cache_distance:_cache_distance[n1]={}
-    if not n2 in _cache_distance[n1]:_cache_distance[n1][n2]=0.
-    _cache_distance[n1][n2] = dist
-    return dist
-
-def get_score_distance(n1,n2,all_info1,all_info2):
-    if n1 == n2 :
-        return 0
-    score = geoDistance(all_info1[n1],all_info2[n2],n1,n2)
-    return score
-    avg=[]
-    for ni in all_info1:
-        if ni != n1:
-            avg.append(geoDistance(all_info1[ni],all_info1[n1],ni,n1))
-    if len(avg)>0:
-        return score/np.mean(avg)
-    return 0
-
-def get_distance_two_entity(n1,n2,info1,info2):
-    if n1 == None or n2 == None:
-        return 0
-    score = 0
-    try:
-        dist=get_score_distance(n1,n2,info1,info2)
-    except:
-        dist=0
-    if  dist >1 and dist < 10 :
-        #print(n1,info1[n1]["fr"],info2[n2]["fr"])
-        score+=0.5
-    else:
-        score+=1
-    #if set(info1[n1]["class"]) and info2[n2]["class"]:
-     #   score-=1
-
-    # included=is_included_in(n1,n2)
-    # if not included:
-    #     included = is_included_in(n2, n1)
-    # if not included:
-    #     if not are_adjacent(info1[n1],info2[n2]):
-    #         score+=1
-    return score
-
-
-
diff --git a/helpers/classic.py b/helpers/classic.py
new file mode 100644
index 0000000..a6f269d
--- /dev/null
+++ b/helpers/classic.py
@@ -0,0 +1,26 @@
+# coding = utf-8
+import string
+
+
+def flatten(lis):
+    """Given a list, possibly nested to any level, return it flattened."""
+    new_lis = []
+    for item in lis:
+        if type(item) == type([]):
+            new_lis.extend(flatten(item))
+        else:
+            new_lis.append(item)
+    return new_lis
+
+
+def join_string(tokens):
+    ch = ""
+    for i in range(len(tokens)):
+        if i == 0:
+            ch += tokens[i]
+            continue
+        if not tokens[i - 1][-1] in string.punctuation and tokens[i] not in string.punctuation:
+            ch += " " + tokens[i]
+        else:
+            ch += tokens[i]
+    return ch
diff --git a/helpers/gazeteer_helpers.py b/helpers/gazeteer_helpers.py
index 4deab05..6de2bb1 100644
--- a/helpers/gazeteer_helpers.py
+++ b/helpers/gazeteer_helpers.py
@@ -151,3 +151,21 @@ def get_by_alias(alias, lang):
         return response['hits']['hits']
     return None
 
+
+
+def get_most_common_id_v3(label,lang='fr'):
+    id_,score=get_most_common_id_v2(label,lang)
+    if id_:
+        return id_,score
+    if not id_ and lang !='en':
+        id_,score=get_most_common_id_v2(label,'en')
+        if id_:
+            return id_,score
+    id_,score=get_most_common_id_alias_v2(label,lang)
+    if not id_ and lang !='en':
+        id_,score=get_most_common_id_v2(label,'en')
+        if id_:
+            return id_,score
+    return None,-1
+
+
diff --git a/nlp/disambiguator/disambiguator.py b/nlp/disambiguator/disambiguator.py
index e4e3bd1..95e4d85 100644
--- a/nlp/disambiguator/disambiguator.py
+++ b/nlp/disambiguator/disambiguator.py
@@ -1,10 +1,12 @@
 # coding = utf-8
 
 import copy
+import string
 
 import numpy as np
 
 from nlp.ner.ner import NER
+from helpers.classic import join_string
 
 
 class Disambiguator(object):
@@ -40,7 +42,7 @@ class Disambiguator(object):
                     t += 1
                     while corpus[t][1] == "END-" + placeTag or corpus[t][1] == placeTag:
                         tag = copy.copy(corpus[t])
-                        if tag[0].endswith("-") or compound_tag.endswith("-"):
+                        if tag[0][-1] in string.punctuation or compound_tag[-1] in string.punctuation:
                             compound_tag += tag[0]
                         else:
                             compound_tag += " " + tag[0]
diff --git a/nlp/disambiguator/geodict_gaurav.py b/nlp/disambiguator/geodict_gaurav.py
index a650dff..0f6e906 100644
--- a/nlp/disambiguator/geodict_gaurav.py
+++ b/nlp/disambiguator/geodict_gaurav.py
@@ -5,7 +5,7 @@ from helpers.collision_with_gazetteer_data import *
 from helpers.gazeteer_helpers import *
 from .disambiguator import Disambiguator
 
-
+from models.str import get_inclusion_chain
 class GauravGeodict(Disambiguator):
 
     def __init__(self):
@@ -18,6 +18,8 @@ class GauravGeodict(Disambiguator):
         return int(round(val))
 
     def inclusion_log(self, x, alpha=0.2):
+        if x==0:
+            return 1
         return math.log(x)
 
     def get_inclusion_tree(self, id_, prop):
@@ -36,11 +38,11 @@ class GauravGeodict(Disambiguator):
         return arr
 
     def get_inclusion_score(self, id1, id2):  # is it really inclusion ? :)
-        list1 = self.get_inclusion_tree(id1, 'P131')
-        list2 = self.get_inclusion_tree(id2, 'P131')
+        list1 = get_inclusion_chain(id1, 'P131')
+        list2 = get_inclusion_chain(id2, 'P131')
         interP131 = len(list(set(list1).intersection(list2)))
-        list1 = self.get_inclusion_tree(id1, 'P706')
-        list2 = self.get_inclusion_tree(id2, 'P706')
+        list1 = get_inclusion_chain(id1, 'P706')
+        list2 = get_inclusion_chain(id2, 'P706')
         interP706 = len(list(set(list1).intersection(list2)))
         # return fib_no[interP131]+fib_no[interP706]
         return self.inclusion_log(interP131) + self.inclusion_log(interP706)
@@ -104,3 +106,32 @@ class GauravGeodict(Disambiguator):
             fixed_entities[k] = v
 
         return count, fixed_entities
+
+    def eval(self,se_,lang):
+        selected_en = {}
+
+        fixed_entities = {}
+        ambiguous_entities = {}
+        for en in se_:
+            request = get_by_label(en, lang)
+            if len(request) == 0:
+                request = get_by_alias(en, lang)
+
+            if len(request) > 1:
+                ambiguous_entities[en] = [r["_source"] for r in request]
+            elif len(request) == 1:
+                fixed_entities[en] = request[0]["_source"]
+
+        d_amb_results = {}
+        for amb_ent in ambiguous_entities:
+            d = self.disambiguateOne(ambiguous_entities[amb_ent], fixed_entities)
+            if not d:
+                d_amb_results[amb_ent] = get_most_common_id_v2(amb_ent, lang)[0]
+            else:
+                d_amb_results[amb_ent] = d
+        for k, v in fixed_entities.items():
+            fixed_entities[k] = v["id"]
+        for k, v in d_amb_results.items():
+            fixed_entities[k] = v
+
+        return fixed_entities
\ No newline at end of file
diff --git a/nlp/disambiguator/most_common.py b/nlp/disambiguator/most_common.py
new file mode 100644
index 0000000..affd138
--- /dev/null
+++ b/nlp/disambiguator/most_common.py
@@ -0,0 +1,59 @@
+# coding = utf-8
+
+
+from helpers.gazeteer_helpers import label_exists, alias_exists, get_most_common_id_v2,get_most_common_id_v3, get_most_common_id_alias_v2
+from .disambiguator import Disambiguator
+import re, json, os
+
+stop_words = {
+    "fr": set(open("/Users/jacquesfize/LOD_DATASETS/language_resources/stop_words_fr.txt").read().split("\n")),
+    "en": set(open("/Users/jacquesfize/LOD_DATASETS/language_resources/stop_words_en.txt").read().split("\n"))
+}
+
+common_words = {
+    # "fr":set(open("/Users/jacquesfize/LOD_DATASETS/language_resources/french_common_words.txt").read().split("\n")),
+    "fr": set(json.load(open("/Users/jacquesfize/LOD_DATASETS/language_resources/dic_fr.json"))),
+    "en": set(
+        open("/Users/jacquesfize/LOD_DATASETS/language_resources/english_common_words_filtered.txt").read().split("\n"))
+}
+
+
+class MostCommonDisambiguator(Disambiguator):
+
+    def __init__(self):
+        Disambiguator.__init__(self)
+
+    def disambiguate(self, ner_result, lang="en"):
+        count, se_ = self.extract_se_entities(ner_result)
+        new_count = {}
+        selected_en = {}
+        for en in se_:
+            id_,score=self.disambiguate_(en,lang)
+            if not id_ =="O":
+                selected_en[id_] = en
+                new_count[id_] = count[en]
+
+        return new_count, selected_en
+
+    def disambiguate_(self, label, lang='fr'):
+        if re.match("^\d+$", label):
+            return 'O', -1
+        if label.lower() in stop_words[lang] or label.lower() in common_words[lang]:
+            return 'O', -1
+
+        plural = label.rstrip("s") + "s"
+        if plural.lower() in stop_words[lang] or plural.lower() in common_words[lang]:
+            return 'O', -1
+
+        id_, score = get_most_common_id_v3(label, lang)
+        if id_:
+            id_en, score_en = get_most_common_id_v3(label, "en")
+            if id_en and score_en:
+                if score_en > score:
+                    id_, score = id_en, score_en
+            id_alias, score_alias = get_most_common_id_alias_v2(label, lang)
+            if id_alias and score_alias:
+                if score_alias > score:
+                    id_, score = id_alias, score_alias
+
+        return id_, score
diff --git a/nlp/disambiguator/pagerank.py b/nlp/disambiguator/pagerank.py
index 05c6490..b0a7423 100644
--- a/nlp/disambiguator/pagerank.py
+++ b/nlp/disambiguator/pagerank.py
@@ -23,7 +23,7 @@ class PageRankDisambiguator(Disambiguator):
                 selected_en[en_most_common] = en
                 new_count[en_most_common] = count[en]
             else:
-                selected_en[id_] = en
-                new_count[id_] = count[en]
+                selected_en[en_most_common] = en
+                new_count[en_most_common] = count[en]
 
         return new_count, selected_en
diff --git a/nlp/ner/spacy.py b/nlp/ner/spacy.py
index f6133be..e7dcde2 100644
--- a/nlp/ner/spacy.py
+++ b/nlp/ner/spacy.py
@@ -1,23 +1,21 @@
-# coding = utf-8
-
 # coding=utf-8
 
 import spacy
 
 from helpers.deprecated import deprecated
+from helpers.classic import flatten
 from .ner import NER
 from ..exception.language import LanguageNotAvailable
 
-_spacy_available_language = ["fr", "en", "es", "de"]
+_spacy_available_language = ["fr", "en","es","de"]
 
 _tag_spacy = {
     "place": ["GPE", "LOC"],  # Petite particularité
-    "person": "PERSON",
+    "pers": "PERSON",
     "org": "ORG"
 }
 
 
-@deprecated("Not finished yet !")
 class Spacy(NER):
     """
     Python wrapper for StanfordNER
@@ -31,29 +29,43 @@ class Spacy(NER):
 
         self._ner = spacy.load(self._lang)
 
-    def identify(self, text=None):
+    def split_text(self,text,maxlen=50000):
+        texts=text.split(".")
+        phrases_given=[]
+        c=0
+        current_phrase=""
+        for t in texts:
+            if c + len(t)+1 <maxlen:
+                current_phrase+="."+t
+                c+=len(t)+1
+            elif c + len(t) > maxlen:
+                phrases_given.append(current_phrase)
+                current_phrase, c ="",0
+        if not phrases_given:
+            phrases_given=[text]
+        return phrases_given
 
-        output_ = [[token, token.pos_, token.type_ent_] for token in self.ner(text)]
-        new_output_ = []
-        for o in output_:
-            if o[-1]:
-                o[-2] = o[-1]
-                new_output_.append(o[:-1])
-        return self.parse_output(new_output_, [])
+    def identify(self, text=None):
+        if len(text) > 1000000:
+            output_=[]
+            for t in self.split_text(text,1000000):
+                output_.extend([[token.text, token.pos_, token.ent_type_] for token in self._ner(t)])
+            return self.parse_output(output_, [])
+        else:
+            output_ = [[token.text, token.pos_, token.ent_type_] for token in self._ner(text)]
+            return self.parse_output(output_, [])
 
     def parse_output(self, output, pos_tags):
         # Pre-Treatment on the output
         # print(1)
         tagged_ = []
-        _tag_entity = list(_tag_spacy.values())
-
-        for sentence in output["sentences"]:
-            # print(sentence.keys())
-            for w in sentence["tokens"]:
-                if w["ner"] in _tag_entity:
-                    tagged_.append([w["originalText"], self.translate_tag(w["ner"])])
-                else:
-                    tagged_.append([w["originalText"], w["pos"]])
+        _tag_entity = flatten(list(_tag_spacy.values()))
+
+        for token in output:
+            if token[-1] in _tag_entity:
+                tagged_.append([token[0], self.translate_tag(token[-1])])
+            else:
+                tagged_.append([token[0], token[-2]])
 
         return self.add_beg_ending_to_tag(tagged_)
 
diff --git a/nlp/ner/stanford_ner.py b/nlp/ner/stanford_ner.py
index 4a3a77f..82b382b 100644
--- a/nlp/ner/stanford_ner.py
+++ b/nlp/ner/stanford_ner.py
@@ -2,7 +2,7 @@
 
 from queue import Queue
 from threading import Thread
-
+import json
 from pycorenlp import StanfordCoreNLP as RestStanford
 
 from config.configuration import config
@@ -38,6 +38,8 @@ class NERWorker(Thread):
             self.outputs.append((id_,self.ner.annotate(text, properties={'annotators': 'tokenize,ssplit,pos,ner',
                                                         'outputFormat': 'json',"tokenize.untokenizable": "noneDelete"})))
             self.queue.task_done()
+            if self.queue.empty():
+                break
 class StanfordNER(NER):
     """
     Python wrapper for StanfordNER
@@ -69,6 +71,8 @@ class StanfordNER(NER):
             elif c + len(t) > maxlen:
                 phrases_given.append(current_phrase)
                 current_phrase, c ="",0
+        if not phrases_given:
+            phrases_given=[text]
         return phrases_given
 
     def identify(self, text=None):
@@ -97,8 +101,13 @@ class StanfordNER(NER):
                 worker=NERWorker(self._ner,queue,self._lang)
                 list_worker.append(worker)
                 list_worker[-1].daemon=True
-                list_worker[-1].start()
+                #print(type(list_worker[-1]))
+                try:
+                    list_worker[-1].start()
+                except:
+                    print("Worker {0} couldn't be activated !".format(t))
             queue.join()
+            queue.queue.clear()
             outputs=["" for i in range(len(texts)-1)]
             for worker in list_worker:
                 for id,out in worker.outputs:
diff --git a/notebooks/Cython Enhancement on HED.ipynb b/notebooks/Cython Enhancement on HED.ipynb
new file mode 100644
index 0000000..b651169
--- /dev/null
+++ b/notebooks/Cython Enhancement on HED.ipynb	
@@ -0,0 +1,1311 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:03:06.266539Z",
+     "start_time": "2018-04-20T07:03:05.937225Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "%load_ext Cython"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:03:06.281912Z",
+     "start_time": "2018-04-20T07:03:06.268809Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<!DOCTYPE html>\n",
+       "<!-- Generated by Cython 0.25.2 -->\n",
+       "<html>\n",
+       "<head>\n",
+       "    <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n",
+       "    <title>Cython: _cython_magic_fcaa11e6595f50de5667d45b47247f97.pyx</title>\n",
+       "    <style type=\"text/css\">\n",
+       "    \n",
+       "body.cython { font-family: courier; font-size: 12; }\n",
+       "\n",
+       ".cython.tag  {  }\n",
+       ".cython.line { margin: 0em }\n",
+       ".cython.code { font-size: 9; color: #444444; display: none; margin: 0px 0px 0px 8px; border-left: 8px none; }\n",
+       "\n",
+       ".cython.line .run { background-color: #B0FFB0; }\n",
+       ".cython.line .mis { background-color: #FFB0B0; }\n",
+       ".cython.code.run  { border-left: 8px solid #B0FFB0; }\n",
+       ".cython.code.mis  { border-left: 8px solid #FFB0B0; }\n",
+       "\n",
+       ".cython.code .py_c_api  { color: red; }\n",
+       ".cython.code .py_macro_api  { color: #FF7000; }\n",
+       ".cython.code .pyx_c_api  { color: #FF3000; }\n",
+       ".cython.code .pyx_macro_api  { color: #FF7000; }\n",
+       ".cython.code .refnanny  { color: #FFA000; }\n",
+       ".cython.code .trace  { color: #FFA000; }\n",
+       ".cython.code .error_goto  { color: #FFA000; }\n",
+       "\n",
+       ".cython.code .coerce  { color: #008000; border: 1px dotted #008000 }\n",
+       ".cython.code .py_attr { color: #FF0000; font-weight: bold; }\n",
+       ".cython.code .c_attr  { color: #0000FF; }\n",
+       ".cython.code .py_call { color: #FF0000; font-weight: bold; }\n",
+       ".cython.code .c_call  { color: #0000FF; }\n",
+       "\n",
+       ".cython.score-0 {background-color: #FFFFff;}\n",
+       ".cython.score-1 {background-color: #FFFFe7;}\n",
+       ".cython.score-2 {background-color: #FFFFd4;}\n",
+       ".cython.score-3 {background-color: #FFFFc4;}\n",
+       ".cython.score-4 {background-color: #FFFFb6;}\n",
+       ".cython.score-5 {background-color: #FFFFaa;}\n",
+       ".cython.score-6 {background-color: #FFFF9f;}\n",
+       ".cython.score-7 {background-color: #FFFF96;}\n",
+       ".cython.score-8 {background-color: #FFFF8d;}\n",
+       ".cython.score-9 {background-color: #FFFF86;}\n",
+       ".cython.score-10 {background-color: #FFFF7f;}\n",
+       ".cython.score-11 {background-color: #FFFF79;}\n",
+       ".cython.score-12 {background-color: #FFFF73;}\n",
+       ".cython.score-13 {background-color: #FFFF6e;}\n",
+       ".cython.score-14 {background-color: #FFFF6a;}\n",
+       ".cython.score-15 {background-color: #FFFF66;}\n",
+       ".cython.score-16 {background-color: #FFFF62;}\n",
+       ".cython.score-17 {background-color: #FFFF5e;}\n",
+       ".cython.score-18 {background-color: #FFFF5b;}\n",
+       ".cython.score-19 {background-color: #FFFF57;}\n",
+       ".cython.score-20 {background-color: #FFFF55;}\n",
+       ".cython.score-21 {background-color: #FFFF52;}\n",
+       ".cython.score-22 {background-color: #FFFF4f;}\n",
+       ".cython.score-23 {background-color: #FFFF4d;}\n",
+       ".cython.score-24 {background-color: #FFFF4b;}\n",
+       ".cython.score-25 {background-color: #FFFF48;}\n",
+       ".cython.score-26 {background-color: #FFFF46;}\n",
+       ".cython.score-27 {background-color: #FFFF44;}\n",
+       ".cython.score-28 {background-color: #FFFF43;}\n",
+       ".cython.score-29 {background-color: #FFFF41;}\n",
+       ".cython.score-30 {background-color: #FFFF3f;}\n",
+       ".cython.score-31 {background-color: #FFFF3e;}\n",
+       ".cython.score-32 {background-color: #FFFF3c;}\n",
+       ".cython.score-33 {background-color: #FFFF3b;}\n",
+       ".cython.score-34 {background-color: #FFFF39;}\n",
+       ".cython.score-35 {background-color: #FFFF38;}\n",
+       ".cython.score-36 {background-color: #FFFF37;}\n",
+       ".cython.score-37 {background-color: #FFFF36;}\n",
+       ".cython.score-38 {background-color: #FFFF35;}\n",
+       ".cython.score-39 {background-color: #FFFF34;}\n",
+       ".cython.score-40 {background-color: #FFFF33;}\n",
+       ".cython.score-41 {background-color: #FFFF32;}\n",
+       ".cython.score-42 {background-color: #FFFF31;}\n",
+       ".cython.score-43 {background-color: #FFFF30;}\n",
+       ".cython.score-44 {background-color: #FFFF2f;}\n",
+       ".cython.score-45 {background-color: #FFFF2e;}\n",
+       ".cython.score-46 {background-color: #FFFF2d;}\n",
+       ".cython.score-47 {background-color: #FFFF2c;}\n",
+       ".cython.score-48 {background-color: #FFFF2b;}\n",
+       ".cython.score-49 {background-color: #FFFF2b;}\n",
+       ".cython.score-50 {background-color: #FFFF2a;}\n",
+       ".cython.score-51 {background-color: #FFFF29;}\n",
+       ".cython.score-52 {background-color: #FFFF29;}\n",
+       ".cython.score-53 {background-color: #FFFF28;}\n",
+       ".cython.score-54 {background-color: #FFFF27;}\n",
+       ".cython.score-55 {background-color: #FFFF27;}\n",
+       ".cython.score-56 {background-color: #FFFF26;}\n",
+       ".cython.score-57 {background-color: #FFFF26;}\n",
+       ".cython.score-58 {background-color: #FFFF25;}\n",
+       ".cython.score-59 {background-color: #FFFF24;}\n",
+       ".cython.score-60 {background-color: #FFFF24;}\n",
+       ".cython.score-61 {background-color: #FFFF23;}\n",
+       ".cython.score-62 {background-color: #FFFF23;}\n",
+       ".cython.score-63 {background-color: #FFFF22;}\n",
+       ".cython.score-64 {background-color: #FFFF22;}\n",
+       ".cython.score-65 {background-color: #FFFF22;}\n",
+       ".cython.score-66 {background-color: #FFFF21;}\n",
+       ".cython.score-67 {background-color: #FFFF21;}\n",
+       ".cython.score-68 {background-color: #FFFF20;}\n",
+       ".cython.score-69 {background-color: #FFFF20;}\n",
+       ".cython.score-70 {background-color: #FFFF1f;}\n",
+       ".cython.score-71 {background-color: #FFFF1f;}\n",
+       ".cython.score-72 {background-color: #FFFF1f;}\n",
+       ".cython.score-73 {background-color: #FFFF1e;}\n",
+       ".cython.score-74 {background-color: #FFFF1e;}\n",
+       ".cython.score-75 {background-color: #FFFF1e;}\n",
+       ".cython.score-76 {background-color: #FFFF1d;}\n",
+       ".cython.score-77 {background-color: #FFFF1d;}\n",
+       ".cython.score-78 {background-color: #FFFF1c;}\n",
+       ".cython.score-79 {background-color: #FFFF1c;}\n",
+       ".cython.score-80 {background-color: #FFFF1c;}\n",
+       ".cython.score-81 {background-color: #FFFF1c;}\n",
+       ".cython.score-82 {background-color: #FFFF1b;}\n",
+       ".cython.score-83 {background-color: #FFFF1b;}\n",
+       ".cython.score-84 {background-color: #FFFF1b;}\n",
+       ".cython.score-85 {background-color: #FFFF1a;}\n",
+       ".cython.score-86 {background-color: #FFFF1a;}\n",
+       ".cython.score-87 {background-color: #FFFF1a;}\n",
+       ".cython.score-88 {background-color: #FFFF1a;}\n",
+       ".cython.score-89 {background-color: #FFFF19;}\n",
+       ".cython.score-90 {background-color: #FFFF19;}\n",
+       ".cython.score-91 {background-color: #FFFF19;}\n",
+       ".cython.score-92 {background-color: #FFFF19;}\n",
+       ".cython.score-93 {background-color: #FFFF18;}\n",
+       ".cython.score-94 {background-color: #FFFF18;}\n",
+       ".cython.score-95 {background-color: #FFFF18;}\n",
+       ".cython.score-96 {background-color: #FFFF18;}\n",
+       ".cython.score-97 {background-color: #FFFF17;}\n",
+       ".cython.score-98 {background-color: #FFFF17;}\n",
+       ".cython.score-99 {background-color: #FFFF17;}\n",
+       ".cython.score-100 {background-color: #FFFF17;}\n",
+       ".cython.score-101 {background-color: #FFFF16;}\n",
+       ".cython.score-102 {background-color: #FFFF16;}\n",
+       ".cython.score-103 {background-color: #FFFF16;}\n",
+       ".cython.score-104 {background-color: #FFFF16;}\n",
+       ".cython.score-105 {background-color: #FFFF16;}\n",
+       ".cython.score-106 {background-color: #FFFF15;}\n",
+       ".cython.score-107 {background-color: #FFFF15;}\n",
+       ".cython.score-108 {background-color: #FFFF15;}\n",
+       ".cython.score-109 {background-color: #FFFF15;}\n",
+       ".cython.score-110 {background-color: #FFFF15;}\n",
+       ".cython.score-111 {background-color: #FFFF15;}\n",
+       ".cython.score-112 {background-color: #FFFF14;}\n",
+       ".cython.score-113 {background-color: #FFFF14;}\n",
+       ".cython.score-114 {background-color: #FFFF14;}\n",
+       ".cython.score-115 {background-color: #FFFF14;}\n",
+       ".cython.score-116 {background-color: #FFFF14;}\n",
+       ".cython.score-117 {background-color: #FFFF14;}\n",
+       ".cython.score-118 {background-color: #FFFF13;}\n",
+       ".cython.score-119 {background-color: #FFFF13;}\n",
+       ".cython.score-120 {background-color: #FFFF13;}\n",
+       ".cython.score-121 {background-color: #FFFF13;}\n",
+       ".cython.score-122 {background-color: #FFFF13;}\n",
+       ".cython.score-123 {background-color: #FFFF13;}\n",
+       ".cython.score-124 {background-color: #FFFF13;}\n",
+       ".cython.score-125 {background-color: #FFFF12;}\n",
+       ".cython.score-126 {background-color: #FFFF12;}\n",
+       ".cython.score-127 {background-color: #FFFF12;}\n",
+       ".cython.score-128 {background-color: #FFFF12;}\n",
+       ".cython.score-129 {background-color: #FFFF12;}\n",
+       ".cython.score-130 {background-color: #FFFF12;}\n",
+       ".cython.score-131 {background-color: #FFFF12;}\n",
+       ".cython.score-132 {background-color: #FFFF11;}\n",
+       ".cython.score-133 {background-color: #FFFF11;}\n",
+       ".cython.score-134 {background-color: #FFFF11;}\n",
+       ".cython.score-135 {background-color: #FFFF11;}\n",
+       ".cython.score-136 {background-color: #FFFF11;}\n",
+       ".cython.score-137 {background-color: #FFFF11;}\n",
+       ".cython.score-138 {background-color: #FFFF11;}\n",
+       ".cython.score-139 {background-color: #FFFF11;}\n",
+       ".cython.score-140 {background-color: #FFFF11;}\n",
+       ".cython.score-141 {background-color: #FFFF10;}\n",
+       ".cython.score-142 {background-color: #FFFF10;}\n",
+       ".cython.score-143 {background-color: #FFFF10;}\n",
+       ".cython.score-144 {background-color: #FFFF10;}\n",
+       ".cython.score-145 {background-color: #FFFF10;}\n",
+       ".cython.score-146 {background-color: #FFFF10;}\n",
+       ".cython.score-147 {background-color: #FFFF10;}\n",
+       ".cython.score-148 {background-color: #FFFF10;}\n",
+       ".cython.score-149 {background-color: #FFFF10;}\n",
+       ".cython.score-150 {background-color: #FFFF0f;}\n",
+       ".cython.score-151 {background-color: #FFFF0f;}\n",
+       ".cython.score-152 {background-color: #FFFF0f;}\n",
+       ".cython.score-153 {background-color: #FFFF0f;}\n",
+       ".cython.score-154 {background-color: #FFFF0f;}\n",
+       ".cython.score-155 {background-color: #FFFF0f;}\n",
+       ".cython.score-156 {background-color: #FFFF0f;}\n",
+       ".cython.score-157 {background-color: #FFFF0f;}\n",
+       ".cython.score-158 {background-color: #FFFF0f;}\n",
+       ".cython.score-159 {background-color: #FFFF0f;}\n",
+       ".cython.score-160 {background-color: #FFFF0f;}\n",
+       ".cython.score-161 {background-color: #FFFF0e;}\n",
+       ".cython.score-162 {background-color: #FFFF0e;}\n",
+       ".cython.score-163 {background-color: #FFFF0e;}\n",
+       ".cython.score-164 {background-color: #FFFF0e;}\n",
+       ".cython.score-165 {background-color: #FFFF0e;}\n",
+       ".cython.score-166 {background-color: #FFFF0e;}\n",
+       ".cython.score-167 {background-color: #FFFF0e;}\n",
+       ".cython.score-168 {background-color: #FFFF0e;}\n",
+       ".cython.score-169 {background-color: #FFFF0e;}\n",
+       ".cython.score-170 {background-color: #FFFF0e;}\n",
+       ".cython.score-171 {background-color: #FFFF0e;}\n",
+       ".cython.score-172 {background-color: #FFFF0e;}\n",
+       ".cython.score-173 {background-color: #FFFF0d;}\n",
+       ".cython.score-174 {background-color: #FFFF0d;}\n",
+       ".cython.score-175 {background-color: #FFFF0d;}\n",
+       ".cython.score-176 {background-color: #FFFF0d;}\n",
+       ".cython.score-177 {background-color: #FFFF0d;}\n",
+       ".cython.score-178 {background-color: #FFFF0d;}\n",
+       ".cython.score-179 {background-color: #FFFF0d;}\n",
+       ".cython.score-180 {background-color: #FFFF0d;}\n",
+       ".cython.score-181 {background-color: #FFFF0d;}\n",
+       ".cython.score-182 {background-color: #FFFF0d;}\n",
+       ".cython.score-183 {background-color: #FFFF0d;}\n",
+       ".cython.score-184 {background-color: #FFFF0d;}\n",
+       ".cython.score-185 {background-color: #FFFF0d;}\n",
+       ".cython.score-186 {background-color: #FFFF0d;}\n",
+       ".cython.score-187 {background-color: #FFFF0c;}\n",
+       ".cython.score-188 {background-color: #FFFF0c;}\n",
+       ".cython.score-189 {background-color: #FFFF0c;}\n",
+       ".cython.score-190 {background-color: #FFFF0c;}\n",
+       ".cython.score-191 {background-color: #FFFF0c;}\n",
+       ".cython.score-192 {background-color: #FFFF0c;}\n",
+       ".cython.score-193 {background-color: #FFFF0c;}\n",
+       ".cython.score-194 {background-color: #FFFF0c;}\n",
+       ".cython.score-195 {background-color: #FFFF0c;}\n",
+       ".cython.score-196 {background-color: #FFFF0c;}\n",
+       ".cython.score-197 {background-color: #FFFF0c;}\n",
+       ".cython.score-198 {background-color: #FFFF0c;}\n",
+       ".cython.score-199 {background-color: #FFFF0c;}\n",
+       ".cython.score-200 {background-color: #FFFF0c;}\n",
+       ".cython.score-201 {background-color: #FFFF0c;}\n",
+       ".cython.score-202 {background-color: #FFFF0c;}\n",
+       ".cython.score-203 {background-color: #FFFF0b;}\n",
+       ".cython.score-204 {background-color: #FFFF0b;}\n",
+       ".cython.score-205 {background-color: #FFFF0b;}\n",
+       ".cython.score-206 {background-color: #FFFF0b;}\n",
+       ".cython.score-207 {background-color: #FFFF0b;}\n",
+       ".cython.score-208 {background-color: #FFFF0b;}\n",
+       ".cython.score-209 {background-color: #FFFF0b;}\n",
+       ".cython.score-210 {background-color: #FFFF0b;}\n",
+       ".cython.score-211 {background-color: #FFFF0b;}\n",
+       ".cython.score-212 {background-color: #FFFF0b;}\n",
+       ".cython.score-213 {background-color: #FFFF0b;}\n",
+       ".cython.score-214 {background-color: #FFFF0b;}\n",
+       ".cython.score-215 {background-color: #FFFF0b;}\n",
+       ".cython.score-216 {background-color: #FFFF0b;}\n",
+       ".cython.score-217 {background-color: #FFFF0b;}\n",
+       ".cython.score-218 {background-color: #FFFF0b;}\n",
+       ".cython.score-219 {background-color: #FFFF0b;}\n",
+       ".cython.score-220 {background-color: #FFFF0b;}\n",
+       ".cython.score-221 {background-color: #FFFF0b;}\n",
+       ".cython.score-222 {background-color: #FFFF0a;}\n",
+       ".cython.score-223 {background-color: #FFFF0a;}\n",
+       ".cython.score-224 {background-color: #FFFF0a;}\n",
+       ".cython.score-225 {background-color: #FFFF0a;}\n",
+       ".cython.score-226 {background-color: #FFFF0a;}\n",
+       ".cython.score-227 {background-color: #FFFF0a;}\n",
+       ".cython.score-228 {background-color: #FFFF0a;}\n",
+       ".cython.score-229 {background-color: #FFFF0a;}\n",
+       ".cython.score-230 {background-color: #FFFF0a;}\n",
+       ".cython.score-231 {background-color: #FFFF0a;}\n",
+       ".cython.score-232 {background-color: #FFFF0a;}\n",
+       ".cython.score-233 {background-color: #FFFF0a;}\n",
+       ".cython.score-234 {background-color: #FFFF0a;}\n",
+       ".cython.score-235 {background-color: #FFFF0a;}\n",
+       ".cython.score-236 {background-color: #FFFF0a;}\n",
+       ".cython.score-237 {background-color: #FFFF0a;}\n",
+       ".cython.score-238 {background-color: #FFFF0a;}\n",
+       ".cython.score-239 {background-color: #FFFF0a;}\n",
+       ".cython.score-240 {background-color: #FFFF0a;}\n",
+       ".cython.score-241 {background-color: #FFFF0a;}\n",
+       ".cython.score-242 {background-color: #FFFF0a;}\n",
+       ".cython.score-243 {background-color: #FFFF0a;}\n",
+       ".cython.score-244 {background-color: #FFFF0a;}\n",
+       ".cython.score-245 {background-color: #FFFF0a;}\n",
+       ".cython.score-246 {background-color: #FFFF09;}\n",
+       ".cython.score-247 {background-color: #FFFF09;}\n",
+       ".cython.score-248 {background-color: #FFFF09;}\n",
+       ".cython.score-249 {background-color: #FFFF09;}\n",
+       ".cython.score-250 {background-color: #FFFF09;}\n",
+       ".cython.score-251 {background-color: #FFFF09;}\n",
+       ".cython.score-252 {background-color: #FFFF09;}\n",
+       ".cython.score-253 {background-color: #FFFF09;}\n",
+       ".cython.score-254 {background-color: #FFFF09;}\n",
+       ".cython .hll { background-color: #ffffcc }\n",
+       ".cython  { background: #f8f8f8; }\n",
+       ".cython .c { color: #408080; font-style: italic } /* Comment */\n",
+       ".cython .err { border: 1px solid #FF0000 } /* Error */\n",
+       ".cython .k { color: #008000; font-weight: bold } /* Keyword */\n",
+       ".cython .o { color: #666666 } /* Operator */\n",
+       ".cython .ch { color: #408080; font-style: italic } /* Comment.Hashbang */\n",
+       ".cython .cm { color: #408080; font-style: italic } /* Comment.Multiline */\n",
+       ".cython .cp { color: #BC7A00 } /* Comment.Preproc */\n",
+       ".cython .cpf { color: #408080; font-style: italic } /* Comment.PreprocFile */\n",
+       ".cython .c1 { color: #408080; font-style: italic } /* Comment.Single */\n",
+       ".cython .cs { color: #408080; font-style: italic } /* Comment.Special */\n",
+       ".cython .gd { color: #A00000 } /* Generic.Deleted */\n",
+       ".cython .ge { font-style: italic } /* Generic.Emph */\n",
+       ".cython .gr { color: #FF0000 } /* Generic.Error */\n",
+       ".cython .gh { color: #000080; font-weight: bold } /* Generic.Heading */\n",
+       ".cython .gi { color: #00A000 } /* Generic.Inserted */\n",
+       ".cython .go { color: #888888 } /* Generic.Output */\n",
+       ".cython .gp { color: #000080; font-weight: bold } /* Generic.Prompt */\n",
+       ".cython .gs { font-weight: bold } /* Generic.Strong */\n",
+       ".cython .gu { color: #800080; font-weight: bold } /* Generic.Subheading */\n",
+       ".cython .gt { color: #0044DD } /* Generic.Traceback */\n",
+       ".cython .kc { color: #008000; font-weight: bold } /* Keyword.Constant */\n",
+       ".cython .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */\n",
+       ".cython .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */\n",
+       ".cython .kp { color: #008000 } /* Keyword.Pseudo */\n",
+       ".cython .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */\n",
+       ".cython .kt { color: #B00040 } /* Keyword.Type */\n",
+       ".cython .m { color: #666666 } /* Literal.Number */\n",
+       ".cython .s { color: #BA2121 } /* Literal.String */\n",
+       ".cython .na { color: #7D9029 } /* Name.Attribute */\n",
+       ".cython .nb { color: #008000 } /* Name.Builtin */\n",
+       ".cython .nc { color: #0000FF; font-weight: bold } /* Name.Class */\n",
+       ".cython .no { color: #880000 } /* Name.Constant */\n",
+       ".cython .nd { color: #AA22FF } /* Name.Decorator */\n",
+       ".cython .ni { color: #999999; font-weight: bold } /* Name.Entity */\n",
+       ".cython .ne { color: #D2413A; font-weight: bold } /* Name.Exception */\n",
+       ".cython .nf { color: #0000FF } /* Name.Function */\n",
+       ".cython .nl { color: #A0A000 } /* Name.Label */\n",
+       ".cython .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */\n",
+       ".cython .nt { color: #008000; font-weight: bold } /* Name.Tag */\n",
+       ".cython .nv { color: #19177C } /* Name.Variable */\n",
+       ".cython .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */\n",
+       ".cython .w { color: #bbbbbb } /* Text.Whitespace */\n",
+       ".cython .mb { color: #666666 } /* Literal.Number.Bin */\n",
+       ".cython .mf { color: #666666 } /* Literal.Number.Float */\n",
+       ".cython .mh { color: #666666 } /* Literal.Number.Hex */\n",
+       ".cython .mi { color: #666666 } /* Literal.Number.Integer */\n",
+       ".cython .mo { color: #666666 } /* Literal.Number.Oct */\n",
+       ".cython .sa { color: #BA2121 } /* Literal.String.Affix */\n",
+       ".cython .sb { color: #BA2121 } /* Literal.String.Backtick */\n",
+       ".cython .sc { color: #BA2121 } /* Literal.String.Char */\n",
+       ".cython .dl { color: #BA2121 } /* Literal.String.Delimiter */\n",
+       ".cython .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */\n",
+       ".cython .s2 { color: #BA2121 } /* Literal.String.Double */\n",
+       ".cython .se { color: #BB6622; font-weight: bold } /* Literal.String.Escape */\n",
+       ".cython .sh { color: #BA2121 } /* Literal.String.Heredoc */\n",
+       ".cython .si { color: #BB6688; font-weight: bold } /* Literal.String.Interpol */\n",
+       ".cython .sx { color: #008000 } /* Literal.String.Other */\n",
+       ".cython .sr { color: #BB6688 } /* Literal.String.Regex */\n",
+       ".cython .s1 { color: #BA2121 } /* Literal.String.Single */\n",
+       ".cython .ss { color: #19177C } /* Literal.String.Symbol */\n",
+       ".cython .bp { color: #008000 } /* Name.Builtin.Pseudo */\n",
+       ".cython .fm { color: #0000FF } /* Name.Function.Magic */\n",
+       ".cython .vc { color: #19177C } /* Name.Variable.Class */\n",
+       ".cython .vg { color: #19177C } /* Name.Variable.Global */\n",
+       ".cython .vi { color: #19177C } /* Name.Variable.Instance */\n",
+       ".cython .vm { color: #19177C } /* Name.Variable.Magic */\n",
+       ".cython .il { color: #666666 } /* Literal.Number.Integer.Long */\n",
+       "    </style>\n",
+       "    <script>\n",
+       "    function toggleDiv(id) {\n",
+       "        theDiv = id.nextElementSibling\n",
+       "        if (theDiv.style.display != 'block') theDiv.style.display = 'block';\n",
+       "        else theDiv.style.display = 'none';\n",
+       "    }\n",
+       "    </script>\n",
+       "</head>\n",
+       "<body class=\"cython\">\n",
+       "<p><span style=\"border-bottom: solid 1px grey;\">Generated by Cython 0.25.2</span></p>\n",
+       "<p>\n",
+       "    <span style=\"background-color: #FFFF00\">Yellow lines</span> hint at Python interaction.<br />\n",
+       "    Click on a line that starts with a \"<code>+</code>\" to see the C code that Cython generated for it.\n",
+       "</p>\n",
+       "<div class=\"cython\"><pre class=\"cython line score-17\" onclick='toggleDiv(this)'>+<span class=\"\">1</span>: <span class=\"k\">def</span> <span class=\"nf\">foo</span><span class=\"p\">():</span></pre>\n",
+       "<pre class='cython code score-17 '>/* Python wrapper */\n",
+       "static PyObject *__pyx_pw_46_cython_magic_fcaa11e6595f50de5667d45b47247f97_1foo(PyObject *__pyx_self, CYTHON_UNUSED PyObject *unused); /*proto*/\n",
+       "static PyMethodDef __pyx_mdef_46_cython_magic_fcaa11e6595f50de5667d45b47247f97_1foo = {\"foo\", (PyCFunction)__pyx_pw_46_cython_magic_fcaa11e6595f50de5667d45b47247f97_1foo, METH_NOARGS, 0};\n",
+       "static PyObject *__pyx_pw_46_cython_magic_fcaa11e6595f50de5667d45b47247f97_1foo(PyObject *__pyx_self, CYTHON_UNUSED PyObject *unused) {\n",
+       "  PyObject *__pyx_r = 0;\n",
+       "  <span class='refnanny'>__Pyx_RefNannyDeclarations</span>\n",
+       "  <span class='refnanny'>__Pyx_RefNannySetupContext</span>(\"foo (wrapper)\", 0);\n",
+       "  __pyx_r = __pyx_pf_46_cython_magic_fcaa11e6595f50de5667d45b47247f97_foo(__pyx_self);\n",
+       "\n",
+       "  /* function exit code */\n",
+       "  <span class='refnanny'>__Pyx_RefNannyFinishContext</span>();\n",
+       "  return __pyx_r;\n",
+       "}\n",
+       "\n",
+       "static PyObject *__pyx_pf_46_cython_magic_fcaa11e6595f50de5667d45b47247f97_foo(CYTHON_UNUSED PyObject *__pyx_self) {\n",
+       "  PyObject *__pyx_v_i = NULL;\n",
+       "  PyObject *__pyx_r = NULL;\n",
+       "  <span class='refnanny'>__Pyx_RefNannyDeclarations</span>\n",
+       "  <span class='refnanny'>__Pyx_RefNannySetupContext</span>(\"foo\", 0);\n",
+       "/* … */\n",
+       "  /* function exit code */\n",
+       "  __pyx_r = Py_None; <span class='pyx_macro_api'>__Pyx_INCREF</span>(Py_None);\n",
+       "  goto __pyx_L0;\n",
+       "  __pyx_L1_error:;\n",
+       "  <span class='pyx_macro_api'>__Pyx_XDECREF</span>(__pyx_t_1);\n",
+       "  <span class='pyx_macro_api'>__Pyx_XDECREF</span>(__pyx_t_2);\n",
+       "  <span class='pyx_c_api'>__Pyx_AddTraceback</span>(\"_cython_magic_fcaa11e6595f50de5667d45b47247f97.foo\", __pyx_clineno, __pyx_lineno, __pyx_filename);\n",
+       "  __pyx_r = NULL;\n",
+       "  __pyx_L0:;\n",
+       "  <span class='pyx_macro_api'>__Pyx_XDECREF</span>(__pyx_v_i);\n",
+       "  <span class='refnanny'>__Pyx_XGIVEREF</span>(__pyx_r);\n",
+       "  <span class='refnanny'>__Pyx_RefNannyFinishContext</span>();\n",
+       "  return __pyx_r;\n",
+       "}\n",
+       "/* … */\n",
+       "  __pyx_tuple__2 = <span class='py_c_api'>PyTuple_Pack</span>(1, __pyx_n_s_i); if (unlikely(!__pyx_tuple__2)) __PYX_ERR(0, 1, __pyx_L1_error)\n",
+       "  <span class='refnanny'>__Pyx_GOTREF</span>(__pyx_tuple__2);\n",
+       "  <span class='refnanny'>__Pyx_GIVEREF</span>(__pyx_tuple__2);\n",
+       "/* … */\n",
+       "  __pyx_t_1 = PyCFunction_NewEx(&amp;__pyx_mdef_46_cython_magic_fcaa11e6595f50de5667d45b47247f97_1foo, NULL, __pyx_n_s_cython_magic_fcaa11e6595f50de56); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 1, __pyx_L1_error)\n",
+       "  <span class='refnanny'>__Pyx_GOTREF</span>(__pyx_t_1);\n",
+       "  if (<span class='py_c_api'>PyDict_SetItem</span>(__pyx_d, __pyx_n_s_foo, __pyx_t_1) &lt; 0) __PYX_ERR(0, 1, __pyx_L1_error)\n",
+       "  <span class='pyx_macro_api'>__Pyx_DECREF</span>(__pyx_t_1); __pyx_t_1 = 0;\n",
+       "</pre><pre class=\"cython line score-54\" onclick='toggleDiv(this)'>+<span class=\"\">2</span>:     <span class=\"k\">for</span> <span class=\"n\">i</span> <span class=\"ow\">in</span> <span class=\"nb\">range</span><span class=\"p\">(</span><span class=\"mf\">50000</span><span class=\"p\">):</span></pre>\n",
+       "<pre class='cython code score-54 '>  __pyx_t_1 = <span class='pyx_c_api'>__Pyx_PyObject_Call</span>(__pyx_builtin_range, __pyx_tuple_, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error)\n",
+       "  <span class='refnanny'>__Pyx_GOTREF</span>(__pyx_t_1);\n",
+       "  if (likely(<span class='py_c_api'>PyList_CheckExact</span>(__pyx_t_1)) || <span class='py_c_api'>PyTuple_CheckExact</span>(__pyx_t_1)) {\n",
+       "    __pyx_t_2 = __pyx_t_1; <span class='pyx_macro_api'>__Pyx_INCREF</span>(__pyx_t_2); __pyx_t_3 = 0;\n",
+       "    __pyx_t_4 = NULL;\n",
+       "  } else {\n",
+       "    __pyx_t_3 = -1; __pyx_t_2 = <span class='py_c_api'>PyObject_GetIter</span>(__pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 2, __pyx_L1_error)\n",
+       "    <span class='refnanny'>__Pyx_GOTREF</span>(__pyx_t_2);\n",
+       "    __pyx_t_4 = Py_TYPE(__pyx_t_2)-&gt;tp_iternext; if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 2, __pyx_L1_error)\n",
+       "  }\n",
+       "  <span class='pyx_macro_api'>__Pyx_DECREF</span>(__pyx_t_1); __pyx_t_1 = 0;\n",
+       "  for (;;) {\n",
+       "    if (likely(!__pyx_t_4)) {\n",
+       "      if (likely(<span class='py_c_api'>PyList_CheckExact</span>(__pyx_t_2))) {\n",
+       "        if (__pyx_t_3 &gt;= <span class='py_macro_api'>PyList_GET_SIZE</span>(__pyx_t_2)) break;\n",
+       "        #if CYTHON_ASSUME_SAFE_MACROS &amp;&amp; !CYTHON_AVOID_BORROWED_REFS\n",
+       "        __pyx_t_1 = <span class='py_macro_api'>PyList_GET_ITEM</span>(__pyx_t_2, __pyx_t_3); <span class='pyx_macro_api'>__Pyx_INCREF</span>(__pyx_t_1); __pyx_t_3++; if (unlikely(0 &lt; 0)) __PYX_ERR(0, 2, __pyx_L1_error)\n",
+       "        #else\n",
+       "        __pyx_t_1 = <span class='py_macro_api'>PySequence_ITEM</span>(__pyx_t_2, __pyx_t_3); __pyx_t_3++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error)\n",
+       "        <span class='refnanny'>__Pyx_GOTREF</span>(__pyx_t_1);\n",
+       "        #endif\n",
+       "      } else {\n",
+       "        if (__pyx_t_3 &gt;= <span class='py_macro_api'>PyTuple_GET_SIZE</span>(__pyx_t_2)) break;\n",
+       "        #if CYTHON_ASSUME_SAFE_MACROS &amp;&amp; !CYTHON_AVOID_BORROWED_REFS\n",
+       "        __pyx_t_1 = <span class='py_macro_api'>PyTuple_GET_ITEM</span>(__pyx_t_2, __pyx_t_3); <span class='pyx_macro_api'>__Pyx_INCREF</span>(__pyx_t_1); __pyx_t_3++; if (unlikely(0 &lt; 0)) __PYX_ERR(0, 2, __pyx_L1_error)\n",
+       "        #else\n",
+       "        __pyx_t_1 = <span class='py_macro_api'>PySequence_ITEM</span>(__pyx_t_2, __pyx_t_3); __pyx_t_3++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error)\n",
+       "        <span class='refnanny'>__Pyx_GOTREF</span>(__pyx_t_1);\n",
+       "        #endif\n",
+       "      }\n",
+       "    } else {\n",
+       "      __pyx_t_1 = __pyx_t_4(__pyx_t_2);\n",
+       "      if (unlikely(!__pyx_t_1)) {\n",
+       "        PyObject* exc_type = <span class='py_c_api'>PyErr_Occurred</span>();\n",
+       "        if (exc_type) {\n",
+       "          if (likely(exc_type == PyExc_StopIteration || <span class='py_c_api'>PyErr_GivenExceptionMatches</span>(exc_type, PyExc_StopIteration))) <span class='py_c_api'>PyErr_Clear</span>();\n",
+       "          else __PYX_ERR(0, 2, __pyx_L1_error)\n",
+       "        }\n",
+       "        break;\n",
+       "      }\n",
+       "      <span class='refnanny'>__Pyx_GOTREF</span>(__pyx_t_1);\n",
+       "    }\n",
+       "    <span class='pyx_macro_api'>__Pyx_XDECREF_SET</span>(__pyx_v_i, __pyx_t_1);\n",
+       "    __pyx_t_1 = 0;\n",
+       "/* … */\n",
+       "  }\n",
+       "  <span class='pyx_macro_api'>__Pyx_DECREF</span>(__pyx_t_2); __pyx_t_2 = 0;\n",
+       "/* … */\n",
+       "  __pyx_tuple_ = <span class='py_c_api'>PyTuple_Pack</span>(1, __pyx_int_50000); if (unlikely(!__pyx_tuple_)) __PYX_ERR(0, 2, __pyx_L1_error)\n",
+       "  <span class='refnanny'>__Pyx_GOTREF</span>(__pyx_tuple_);\n",
+       "  <span class='refnanny'>__Pyx_GIVEREF</span>(__pyx_tuple_);\n",
+       "</pre><pre class=\"cython line score-6\" onclick='toggleDiv(this)'>+<span class=\"\">3</span>:         <span class=\"n\">i</span><span class=\"o\">*</span><span class=\"n\">i</span></pre>\n",
+       "<pre class='cython code score-6 '>    __pyx_t_1 = <span class='py_c_api'>PyNumber_Multiply</span>(__pyx_v_i, __pyx_v_i); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 3, __pyx_L1_error)\n",
+       "    <span class='refnanny'>__Pyx_GOTREF</span>(__pyx_t_1);\n",
+       "    <span class='pyx_macro_api'>__Pyx_DECREF</span>(__pyx_t_1); __pyx_t_1 = 0;\n",
+       "</pre></div></body></html>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%cython --annotate\n",
+    "def foo():\n",
+    "    for i in range(50000):\n",
+    "        i*i"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:03:06.289592Z",
+     "start_time": "2018-04-20T07:03:06.283966Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 1.91 ms, sys: 3 µs, total: 1.92 ms\n",
+      "Wall time: 1.93 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "%time foo()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:03:06.296109Z",
+     "start_time": "2018-04-20T07:03:06.292000Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "%%cython\n",
+    "\n",
+    "cdef list ch=[\"Le\",\"pont\",\"d'\",\"avignon\",\"est\",\"-\",\"sympa\"]\n",
+    "def foo():\n",
+    "    print([c+(\"\" if c[-1] in [\"\\'\",\"-\"] else \" \") for c in ch])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:03:06.302224Z",
+     "start_time": "2018-04-20T07:03:06.298061Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['Le ', 'pont ', \"d'\", 'avignon ', 'est ', '-', 'sympa ']\n",
+      "CPU times: user 70 µs, sys: 29 µs, total: 99 µs\n",
+      "Wall time: 83.9 µs\n"
+     ]
+    }
+   ],
+   "source": [
+    "%time foo()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:03:06.307291Z",
+     "start_time": "2018-04-20T07:03:06.304153Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "ch=[\"Le\",\"pont\",\"d'\",\"avignon\",\"est\",\"-\",\"sympa\"]\n",
+    "def foo():\n",
+    "    print([c+(\"\" if c[-1] in [\"\\'\",\"-\"] else \" \") for c in ch])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:03:06.312937Z",
+     "start_time": "2018-04-20T07:03:06.309325Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['Le ', 'pont ', \"d'\", 'avignon ', 'est ', '-', 'sympa ']\n",
+      "CPU times: user 49 µs, sys: 20 µs, total: 69 µs\n",
+      "Wall time: 59.8 µs\n"
+     ]
+    }
+   ],
+   "source": [
+    "%time foo()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:03:06.320412Z",
+     "start_time": "2018-04-20T07:03:06.315555Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "%%cython\n",
+    "\n",
+    "def cyfac(n):\n",
+    "    if n <= 1:\n",
+    "        return 1\n",
+    "    return n * cyfac(n - 1)\n",
+    "\n",
+    "def cyfac_double(double n):\n",
+    "    if n <= 1:\n",
+    "        return 1.0\n",
+    "    return n * cyfac_double(n - 1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:03:19.194254Z",
+     "start_time": "2018-04-20T07:03:06.323051Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1.58 µs ± 15.5 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "2.43290200817664e+18"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\n",
+    "\n",
+    "%timeit cyfac(20.0)\n",
+    "cyfac(20.0)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:03:27.738161Z",
+     "start_time": "2018-04-20T07:03:19.196269Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1.05 µs ± 6.72 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "2.43290200817664e+18"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%timeit cyfac_double(20.0)\n",
+    "cyfac_double(20.0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:03:27.745810Z",
+     "start_time": "2018-04-20T07:03:27.740673Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "%%cython\n",
+    "\n",
+    "cpdef double cyfac_double_fast(double n):\n",
+    "    if n <= 1:\n",
+    "        return 1.0\n",
+    "    return n * cyfac_double_fast(n - 1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:03:37.326307Z",
+     "start_time": "2018-04-20T07:03:27.748079Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "117 ns ± 4.03 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "2.43290200817664e+18"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%timeit cyfac_double_fast(20.0)\n",
+    "cyfac_double_fast(20.0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:03:37.332486Z",
+     "start_time": "2018-04-20T07:03:37.328492Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/Users/jacquesfize/nas_cloud/Code/str-python\n"
+     ]
+    }
+   ],
+   "source": [
+    "cd .."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:03:37.421880Z",
+     "start_time": "2018-04-20T07:03:37.334723Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "class PureHED():\n",
+    "    \"\"\"\n",
+    "    Implementation of Hausdorff Edit Distance described in\n",
+    "\n",
+    "    Improved quadratic time approximation of graph edit distance by Hausdorff matching and greedy assignment\n",
+    "    Andreas Fischer, Kaspar Riesen, Horst Bunke\n",
+    "    2016\n",
+    "    \"\"\"\n",
+    "    __type__ = \"dist\"\n",
+    "    @staticmethod\n",
+    "    def compare(listgs, c_del_node=1, c_del_edge=1, c_ins_node=1, c_ins_edge=1):\n",
+    "        n = len(listgs)\n",
+    "        comparator = PureHED(c_del_node, c_ins_node, c_del_edge, c_ins_edge)\n",
+    "        comparison_matrix = np.zeros((n, n))\n",
+    "        for i in range(n):\n",
+    "            for j in range(i, n):\n",
+    "                comparison_matrix[i, j] = comparator.hed(listgs[i], listgs[j])\n",
+    "                comparison_matrix[j, i] = comparison_matrix[i, j]\n",
+    "\n",
+    "        return comparison_matrix\n",
+    "\n",
+    "\n",
+    "    def __init__(self, node_del=1, node_ins=1, edge_del=1, edge_ins=1):\n",
+    "        \"\"\"Constructor for HED\"\"\"\n",
+    "        self.node_del = node_del\n",
+    "        self.node_ins = node_ins\n",
+    "        self.edge_del = edge_del\n",
+    "        self.edge_ins = edge_ins\n",
+    "\n",
+    "    def hed(self, g1, g2):\n",
+    "        \"\"\"\n",
+    "        Compute the Hausdorff Edit Distance\n",
+    "        :param g1: first graph\n",
+    "        :param g2: second graph\n",
+    "        :return:\n",
+    "        \"\"\"\n",
+    "        return self.sum_fuv(g1, g2) + self.sum_fuv(g2, g1)\n",
+    "\n",
+    "    def sum_fuv(self, g1, g2):\n",
+    "        \"\"\"\n",
+    "        Compute Nearest Neighbour Distance between G1 and G2\n",
+    "        :param g1: First Graph\n",
+    "        :param g2: Second Graph\n",
+    "        :return:\n",
+    "        \"\"\"\n",
+    "        min_sum = np.zeros(len(g1))\n",
+    "        nodes1 = g1.nodes()\n",
+    "        nodes2 = g2.nodes()\n",
+    "        nodes2.extend([None])\n",
+    "        for i in range(len(nodes1)):\n",
+    "            min_i = np.zeros(len(nodes2))\n",
+    "            for j in range(len(nodes2)):\n",
+    "                min_i[j] = self.fuv(g1, g2, nodes1[i], nodes2[j])\n",
+    "            min_sum[i] = np.min(min_i)\n",
+    "        return np.sum(min_sum)\n",
+    "\n",
+    "    def fuv(self, g1, g2, n1, n2):\n",
+    "        \"\"\"\n",
+    "        Compute the Node Distance function\n",
+    "        :param g1: first graph\n",
+    "        :param g2: second graph\n",
+    "        :param n1: node of the first graph\n",
+    "        :param n2: node of the second graph\n",
+    "        :return:\n",
+    "        \"\"\"\n",
+    "        if n2 == None:  # Del\n",
+    "            return self.node_del + ((self.edge_del / 2) * g1.degree(n1))\n",
+    "        if n1 == None:  # Insert\n",
+    "            return self.node_ins + ((self.edge_ins / 2) * g2.degree(n2))\n",
+    "        else:\n",
+    "            if n1 == n2:\n",
+    "                return 0.\n",
+    "            return (self.node_del + self.node_ins + self.hed_edge(g1, g2, n1, n2)) / 2\n",
+    "\n",
+    "    def hed_edge(self, g1, g2, n1, n2):\n",
+    "        \"\"\"\n",
+    "        Compute HEDistance between edges of n1 and n2, respectively in g1 and g2\n",
+    "        :param g1: first graph\n",
+    "        :param g2: second graph\n",
+    "        :param n1: node of the first graph\n",
+    "        :param n2: node of the second graph\n",
+    "        :return:\n",
+    "        \"\"\"\n",
+    "        return self.sum_gpq(g1, n1, g2, n2) + self.sum_gpq(g1, n1, g2, n2)\n",
+    "\n",
+    "    def get_edge_multigraph(self, g, node):\n",
+    "        \"\"\"\n",
+    "        Get the list of edges around a node in a Multigraph\n",
+    "        :param g: multigraph\n",
+    "        :param node: node in the multigraph\n",
+    "        :return:\n",
+    "        \"\"\"\n",
+    "        edges = []\n",
+    "        for edge in g.edges(data=True):\n",
+    "            if node == edge[0] or node == edge[1]:\n",
+    "                edges.append(\"{0}-{1}\".format(edge[0],edge[1]))\n",
+    "        return edges\n",
+    "\n",
+    "    def sum_gpq(self, g1, n1, g2, n2):\n",
+    "        \"\"\"\n",
+    "        Compute Nearest Neighbour Distance between edges around n1 in G1  and edges around n2 in G2\n",
+    "        :param g1: first graph\n",
+    "        :param n1: node in the first graph\n",
+    "        :param g2: second graph\n",
+    "        :param n2: node in the second graph\n",
+    "        :return:\n",
+    "        \"\"\"\n",
+    "\n",
+    "        #if isinstance(g1, nx.MultiDiGraph):\n",
+    "        edges1 = self.get_edge_multigraph(g1, n1)\n",
+    "        edges2 = self.get_edge_multigraph(g2, n2)\n",
+    "\n",
+    "        #else:\n",
+    "            #edges1 = [str(n1 + \"-\" + ef) for ef in list(g1.edge[n1].keys())]\n",
+    "            #edges2 = [str(n2 + \"-\" + ef) for ef in list(g2.edge[n2].keys())]\n",
+    "\n",
+    "        min_sum = np.zeros(len(edges1))\n",
+    "        edges2.extend([None])\n",
+    "        for i in range(len(edges1)):\n",
+    "            min_i = np.zeros(len(edges2))\n",
+    "            for j in range(len(edges2)):\n",
+    "                min_i[j] = self.gpq(edges1[i], edges2[j])\n",
+    "            min_sum[i] = np.min(min_i)\n",
+    "        return np.sum(min_sum)\n",
+    "\n",
+    "    def gpq(self, e1, e2):\n",
+    "        \"\"\"\n",
+    "        Compute the edge distance function\n",
+    "        :param e1: edge1\n",
+    "        :param e2: edge2\n",
+    "        :return:\n",
+    "        \"\"\"\n",
+    "        if e2 == None:  # Del\n",
+    "            return self.edge_del\n",
+    "        if e1 == None:  # Insert\n",
+    "            return self.edge_ins\n",
+    "        else:\n",
+    "            if e1 == e2:\n",
+    "                return 0\n",
+    "            return (self.edge_del + self.edge_ins) / 2\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:03:37.430098Z",
+     "start_time": "2018-04-20T07:03:37.424291Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "%%cython \n",
+    "import numpy as np\n",
+    "cimport numpy as np\n",
+    "cdef class HED:\n",
+    "    \"\"\"\n",
+    "    Implementation of Hausdorff Edit Distance described in\n",
+    "\n",
+    "    Improved quadratic time approximation of graph edit distance by Hausdorff matching and greedy assignment\n",
+    "    Andreas Fischer, Kaspar Riesen, Horst Bunke\n",
+    "    2016\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    cdef int node_del \n",
+    "    cdef int node_ins \n",
+    "    cdef int edge_del \n",
+    "    cdef int edge_ins \n",
+    "    \n",
+    "    __type__ = \"dist\"\n",
+    "    @staticmethod\n",
+    "    def compare(list listgs, int c_del_node=1, int c_del_edge=1, int c_ins_node=1, int c_ins_edge=1):\n",
+    "        cdef int n = len(listgs)\n",
+    "        comparator = HED(c_del_node, c_ins_node, c_del_edge, c_ins_edge)\n",
+    "        cdef np.ndarray comparison_matrix = np.zeros((n, n))\n",
+    "        for i in range(n):\n",
+    "            for j in range(i, n):\n",
+    "                comparison_matrix[i, j] = comparator.hed(listgs[i], listgs[j])\n",
+    "                comparison_matrix[j, i] = comparison_matrix[i, j]\n",
+    "\n",
+    "        return comparison_matrix\n",
+    "\n",
+    "\n",
+    "    def __init__(self, int node_del=1, int node_ins=1, int edge_del=1, int edge_ins=1):\n",
+    "        \"\"\"Constructor for HED\"\"\"\n",
+    "        self.node_del = node_del\n",
+    "        self.node_ins = node_ins\n",
+    "        self.edge_del = edge_del\n",
+    "        self.edge_ins = edge_ins\n",
+    "\n",
+    "    cpdef float hed(self, g1, g2):\n",
+    "        \"\"\"\n",
+    "        Compute the Hausdorff Edit Distance\n",
+    "        :param g1: first graph\n",
+    "        :param g2: second graph\n",
+    "        :return:\n",
+    "        \"\"\"\n",
+    "        return self.sum_fuv(g1, g2) + self.sum_fuv(g2, g1)\n",
+    "\n",
+    "    cpdef float sum_fuv(self, g1, g2):\n",
+    "        \"\"\"\n",
+    "        Compute Nearest Neighbour Distance between G1 and G2\n",
+    "        :param g1: First Graph\n",
+    "        :param g2: Second Graph\n",
+    "        :return:\n",
+    "        \"\"\"\n",
+    "        cdef np.ndarray min_sum = np.zeros(len(g1))\n",
+    "        nodes1 = g1.nodes()\n",
+    "        nodes2 = g2.nodes()\n",
+    "        nodes2.extend([None])\n",
+    "        cdef np.ndarray min_i\n",
+    "        for i in range(len(nodes1)):\n",
+    "            min_i = np.zeros(len(nodes2))\n",
+    "            for j in range(len(nodes2)):\n",
+    "                min_i[j] = self.fuv(g1, g2, nodes1[i], nodes2[j])\n",
+    "            min_sum[i] = np.min(min_i)\n",
+    "        return np.sum(min_sum)\n",
+    "\n",
+    "    cpdef float fuv(self, g1, g2, n1, n2):\n",
+    "        \"\"\"\n",
+    "        Compute the Node Distance function\n",
+    "        :param g1: first graph\n",
+    "        :param g2: second graph\n",
+    "        :param n1: node of the first graph\n",
+    "        :param n2: node of the second graph\n",
+    "        :return:\n",
+    "        \"\"\"\n",
+    "        if n2 == None:  # Del\n",
+    "            return self.node_del + ((self.edge_del / 2) * g1.degree(n1))\n",
+    "        if n1 == None:  # Insert\n",
+    "            return self.node_ins + ((self.edge_ins / 2) * g2.degree(n2))\n",
+    "        else:\n",
+    "            if n1 == n2:\n",
+    "                return 0\n",
+    "            return (self.node_del + self.node_ins + self.hed_edge(g1, g2, n1, n2)) / 2\n",
+    "\n",
+    "    cpdef float hed_edge(self, g1, g2, n1, n2):\n",
+    "        \"\"\"\n",
+    "        Compute HEDistance between edges of n1 and n2, respectively in g1 and g2\n",
+    "        :param g1: first graph\n",
+    "        :param g2: second graph\n",
+    "        :param n1: node of the first graph\n",
+    "        :param n2: node of the second graph\n",
+    "        :return:\n",
+    "        \"\"\"\n",
+    "        return self.sum_gpq(g1, n1, g2, n2) + self.sum_gpq(g1, n1, g2, n2)\n",
+    "\n",
+    "    cpdef list get_edge_multigraph(self, g, node):\n",
+    "        \"\"\"\n",
+    "        Get the list of edges around a node in a Multigraph\n",
+    "        :param g: multigraph\n",
+    "        :param node: node in the multigraph\n",
+    "        :return:\n",
+    "        \"\"\"\n",
+    "        cdef list edges = []\n",
+    "        for edge in g.edges(data=True):\n",
+    "            if node == edge[0] or node == edge[1]:\n",
+    "                edges.append(\"{0}-{1}\".format(edge[0],edge[1]))\n",
+    "        return edges\n",
+    "\n",
+    "    cpdef float  sum_gpq(self, g1, n1, g2, n2):\n",
+    "        \"\"\"\n",
+    "        Compute Nearest Neighbour Distance between edges around n1 in G1  and edges around n2 in G2\n",
+    "        :param g1: first graph\n",
+    "        :param n1: node in the first graph\n",
+    "        :param g2: second graph\n",
+    "        :param n2: node in the second graph\n",
+    "        :return:\n",
+    "        \"\"\"\n",
+    "\n",
+    "        #if isinstance(g1, nx.MultiDiGraph):\n",
+    "        cdef list edges1 = self.get_edge_multigraph(g1, n1)\n",
+    "        cdef list edges2 = self.get_edge_multigraph(g2, n2)\n",
+    "\n",
+    "        #else:\n",
+    "            #edges1 = [str(n1 + \"-\" + ef) for ef in list(g1.edge[n1].keys())]\n",
+    "            #edges2 = [str(n2 + \"-\" + ef) for ef in list(g2.edge[n2].keys())]\n",
+    "\n",
+    "        cdef np.ndarray min_sum = np.zeros(len(edges1))\n",
+    "        edges2.extend([None])\n",
+    "        cdef np.ndarray min_i\n",
+    "        for i in range(len(edges1)):\n",
+    "            min_i = np.zeros(len(edges2))\n",
+    "            for j in range(len(edges2)):\n",
+    "                min_i[j] = self.gpq(edges1[i], edges2[j])\n",
+    "            min_sum[i] = np.min(min_i)\n",
+    "        return np.sum(min_sum)\n",
+    "\n",
+    "    cpdef float gpq(self, str e1, str e2):\n",
+    "        \"\"\"\n",
+    "        Compute the edge distance function\n",
+    "        :param e1: edge1\n",
+    "        :param e2: edge2\n",
+    "        :return:\n",
+    "        \"\"\"\n",
+    "        if e2 == None:  # Del\n",
+    "            return self.edge_del\n",
+    "        if e1 == None:  # Insert\n",
+    "            return self.edge_ins\n",
+    "        else:\n",
+    "            if e1 == e2:\n",
+    "                return 0\n",
+    "            return (self.edge_del + self.edge_ins) / 2\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:03:37.434790Z",
+     "start_time": "2018-04-20T07:03:37.432514Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:03:37.736648Z",
+     "start_time": "2018-04-20T07:03:37.437089Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import networkx as nx"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:03:37.750336Z",
+     "start_time": "2018-04-20T07:03:37.738699Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "g1=nx.read_gexf(\"/Users/jacquesfize/LOD_DATASETS/exp_17_avr18/normal/2000.gexf\")\n",
+    "g2=nx.read_gexf(\"/Users/jacquesfize/LOD_DATASETS/exp_17_avr18/normal/4620.gexf\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:03:37.763916Z",
+     "start_time": "2018-04-20T07:03:37.753086Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "g1=nx.random_graphs.barabasi_albert_graph(50,25)\n",
+    "g2=nx.random_graphs.barabasi_albert_graph(50,25)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:04:01.654836Z",
+     "start_time": "2018-04-20T07:03:37.766300Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 23.8 s, sys: 191 ms, total: 24 s\n",
+      "Wall time: 23.9 s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "array([[0., 0.],\n",
+       "       [0., 0.]])"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%time PureHED.compare([g1,g2])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-04-20T07:04:14.050637Z",
+     "start_time": "2018-04-20T07:04:01.657104Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 12.4 s, sys: 41.7 ms, total: 12.4 s\n",
+      "Wall time: 12.4 s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "array([[0., 0.],\n",
+       "       [0., 0.]])"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%time HED.compare([g1,g2])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
+  },
+  "toc": {
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": "block",
+   "toc_window_display": false
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/EvalDesambiguisationMada.ipynb b/notebooks/EvalDesambiguisationMada.ipynb
new file mode 100644
index 0000000..4cb419e
--- /dev/null
+++ b/notebooks/EvalDesambiguisationMada.ipynb
@@ -0,0 +1,311 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-16T23:58:48.134280Z",
+     "start_time": "2018-05-16T23:58:47.729327Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-16T23:58:48.140894Z",
+     "start_time": "2018-05-16T23:58:48.136384Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/Users/jacquesfize/nas_cloud/Code/str-python\n"
+     ]
+    }
+   ],
+   "source": [
+    "cd .."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-16T23:58:48.150739Z",
+     "start_time": "2018-05-16T23:58:48.143107Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import glob,re,sys\n",
+    "fns=glob.glob(\"data/mada_disambiguisation/*.csv\")\n",
+    "ids_list=[int(re.findall(r\"\\d+\",fn)[-1]) for fn in fns]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-16T23:58:48.173363Z",
+     "start_time": "2018-05-16T23:58:48.153066Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "data_lang=json.load(open(\"/Users/jacquesfize/LOD_DATASETS/raw_bvlac/associated_lang.json\"))\n",
+    "data_lang={int(k):v for k,v in data_lang.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-16T23:58:48.864223Z",
+     "start_time": "2018-05-16T23:58:48.177516Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from nlp.disambiguator.geodict_gaurav import GauravGeodict\n",
+    "from nlp.disambiguator.most_common import MostCommonDisambiguator\n",
+    "disMost_common=MostCommonDisambiguator()\n",
+    "disGaurav=GauravGeodict()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T00:59:53.695102Z",
+     "start_time": "2018-05-17T00:59:53.685756Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "df=pd.read_csv(\"data/mada_disambiguisation/11.csv\")\n",
+    "\n",
+    "def accuracyMostCommon(df,lang):\n",
+    "    df2=df[-df[\"GID\"].isin([\"O\",\"NR\",\"o\"])][[\"text\",\"GID\"]]\n",
+    "    df2[\"disambiguation\"]=df2.text.apply(lambda x:disMost_common.disambiguate_(x,lang)[0])\n",
+    "    return (df2.GID == df2.disambiguation).sum()/len(df2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T00:01:52.885111Z",
+     "start_time": "2018-05-17T00:01:52.850434Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T01:00:09.181696Z",
+     "start_time": "2018-05-17T01:00:09.178578Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T00:59:55.445531Z",
+     "start_time": "2018-05-17T00:59:55.407867Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "%autoreload\n",
+    "def accuracyGeodict(df,lang):\n",
+    "    df2=df[-df[\"GID\"].isin([\"O\",\"NR\",\"o\"])][[\"text\",\"GID\"]]\n",
+    "    res_dis=disGaurav.eval(df2[\"text\"].unique(),lang)\n",
+    "    df2[\"disambiguation\"]=df2.text.apply(lambda x:res_dis[x] if x in res_dis else \"0\")\n",
+    "    return (df2.GID == df2.disambiguation).sum()/len(df2)\n",
+    "#df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T01:22:15.528864Z",
+     "start_time": "2018-05-17T01:01:01.373760Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:6: RuntimeWarning: invalid value encountered in long_scalars\n",
+      "  \n"
+     ]
+    }
+   ],
+   "source": [
+    "acc_MC,acc_GEO=[],[]\n",
+    "for fn in fns:\n",
+    "    id_=int(re.findall(r\"\\d+\",fn)[-1])\n",
+    "    \n",
+    "    df=pd.read_csv(fn)\n",
+    "    acc_MC.append(accuracyMostCommon(df,data_lang[id_]))\n",
+    "    acc_GEO.append(accuracyGeodict(df,data_lang[id_]))\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T01:22:15.574548Z",
+     "start_time": "2018-05-17T01:22:15.567387Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.6118508350166977"
+      ]
+     },
+     "execution_count": 40,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "np.mean(np.nan_to_num(acc_GEO))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T01:22:15.618633Z",
+     "start_time": "2018-05-17T01:22:15.612431Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.7694373020389706"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.mean(np.nan_to_num(acc_MC))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
+  },
+  "toc": {
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": "block",
+   "toc_window_display": false
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "position": {
+    "height": "297px",
+    "left": "914px",
+    "right": "20px",
+    "top": "120px",
+    "width": "350px"
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/EvalDesambiguisationPADIWEB.ipynb b/notebooks/EvalDesambiguisationPADIWEB.ipynb
new file mode 100644
index 0000000..ba763a8
--- /dev/null
+++ b/notebooks/EvalDesambiguisationPADIWEB.ipynb
@@ -0,0 +1,378 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T00:50:38.399698Z",
+     "start_time": "2018-05-17T00:50:38.396888Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import json"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T00:48:13.001356Z",
+     "start_time": "2018-05-17T00:48:12.994569Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/Users/jacquesfize/nas_cloud/Code/str-python\n"
+     ]
+    }
+   ],
+   "source": [
+    "cd .."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T00:54:11.406691Z",
+     "start_time": "2018-05-17T00:54:11.400933Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from elasticsearch import Elasticsearch\n",
+    "\n",
+    "from config.configuration import config\n",
+    "\n",
+    "es = Elasticsearch(config.es_server)\n",
+    "def get_data_by_geoname_id(id):\n",
+    "    # Query es (gazetteer / place) for a geonameID term match on id and hand back the\n",
+    "    # _source of the first hit; when nothing matched, the raw search response is returned.\n",
+    "    query = {\"query\": {\"bool\": {\"must\": [{\"term\": {\"geonameID\": id}}], \"must_not\": [], \"should\": []}},\n",
+    "             \"from\": 0, \"size\": 10, \"sort\": [], \"aggs\": {}}\n",
+    "    response = es.search(\"gazetteer\", \"place\", body=query)\n",
+    "    return response[\"hits\"][\"hits\"][0][\"_source\"] if response[\"hits\"][\"total\"] > 0 else response\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T01:25:05.006779Z",
+     "start_time": "2018-05-17T01:25:05.000357Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def parse_file(fn):  # -> (DataFrame of correct location annotations, detected language) or None\n",
+    "    with open(\"data/EPI_ELENA/raw_text/{0}.txt\".format(int(re.findall(r\"\\d+\",fn)[-1]))) as f:  # id = last number in fn\n",
+    "        lang=langdetect.detect(f.read())  # the with-block closes the raw text file\n",
+    "    df=pd.read_json(fn,orient=\"index\")\n",
+    "    try:\n",
+    "        df=df[(df[\"type\"]==\"location\") & (df[\"annotation\"]==\"correct\")].copy()  # copy: columns are assigned below\n",
+    "    except KeyError:  # raised when the type/annotation columns are absent\n",
+    "        return None\n",
+    "    df[\"GID\"]=df[\"info\"].apply(lambda x:get_data_by_geoname_id(x[\"id\"])[\"id\"])\n",
+    "    df[\"content\"]=df[\"content\"].apply(lambda x:re.sub(r\"\\s+\",\" \",x.strip()))\n",
+    "    return df,lang\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T00:56:26.195260Z",
+     "start_time": "2018-05-17T00:56:26.185713Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import glob,re,sys\n",
+    "fns=glob.glob(\"data/EPI_ELENA/final_annotations/*.json\")\n",
+    "ids_list=[int(re.findall(r\"\\d+\",fn)[-1]) for fn in fns]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T01:05:10.917961Z",
+     "start_time": "2018-05-17T01:05:10.915317Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import langdetect"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T00:57:28.905930Z",
+     "start_time": "2018-05-17T00:57:28.346854Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from nlp.disambiguator.geodict_gaurav import GauravGeodict\n",
+    "from nlp.disambiguator.most_common import MostCommonDisambiguator\n",
+    "disMost_common=MostCommonDisambiguator()\n",
+    "disGaurav=GauravGeodict()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T01:10:19.593778Z",
+     "start_time": "2018-05-17T01:10:19.585332Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "df=pd.read_csv(\"data/mada_disambiguisation/11.csv\")\n",
+    "\n",
+    "def accuracyMostCommon(df,lang):  # share of rows whose GID equals the first value from disMost_common.disambiguate_\n",
+    "    df2=df[~df[\"GID\"].isin([\"O\",\"NR\",\"o\"])][[\"content\",\"GID\"]].copy()  # ~ is the boolean NOT for masks; copy before adding a column\n",
+    "    df2[\"disambiguation\"]=df2.content.apply(lambda x:disMost_common.disambiguate_(x,lang)[0])\n",
+    "    return (df2.GID == df2.disambiguation).sum()/len(df2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T00:01:52.885111Z",
+     "start_time": "2018-05-17T00:01:52.850434Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T01:10:21.463216Z",
+     "start_time": "2018-05-17T01:10:21.098003Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.6666666666666666"
+      ]
+     },
+     "execution_count": 52,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df,lang=parse_file(fns[0])\n",
+    "accuracyMostCommon(df,lang)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T01:06:38.089187Z",
+     "start_time": "2018-05-17T01:06:38.080846Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "%autoreload\n",
+    "def accuracyGeodict(df,lang):\n",
+    "    # Share of rows whose GID equals the value disGaurav.eval returned for the row's content;\n",
+    "    # contents missing from the eval result get the string 0, rows with GID O / NR / o are left out.\n",
+    "    df2=df[~df[\"GID\"].isin([\"O\",\"NR\",\"o\"])][[\"content\",\"GID\"]].copy()  # ~ negates the mask; copy before adding a column\n",
+    "    res_dis=disGaurav.eval(df2[\"content\"].unique(),lang)\n",
+    "    df2[\"disambiguation\"]=df2.content.apply(lambda x:res_dis[x] if x in res_dis else \"0\")\n",
+    "    return (df2.GID == df2.disambiguation).sum()/len(df2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T01:37:02.165192Z",
+     "start_time": "2018-05-17T01:25:31.325566Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/lib/python3.6/site-packages/pandas/core/ops.py:816: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n",
+      "  result = getattr(x, name)(y)\n",
+      "/Users/jacquesfize/nas_cloud/Code/str-python/helpers/collision.py:30: RuntimeWarning: invalid value encountered in double_scalars\n",
+      "  d_over_o_squared = d/np.dot(o, o) + 1e-10\n",
+      "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:6: RuntimeWarning: invalid value encountered in long_scalars\n",
+      "  \n"
+     ]
+    }
+   ],
+   "source": [
+    "acc_MC,acc_GEO=[],[]\n",
+    "for fn in fns:\n",
+    "    try:  # both scores are computed before either list grows, so acc_MC and acc_GEO cover the same files\n",
+    "        df,lang=parse_file(fn)\n",
+    "        mc,geo=accuracyMostCommon(df,lang),accuracyGeodict(df,lang)\n",
+    "    except Exception as e:  # report the skipped file; KeyboardInterrupt is no longer swallowed\n",
+    "        print(\"skipped {0}: {1!r}\".format(fn,e),file=sys.stderr)\n",
+    "        continue\n",
+    "    acc_MC.append(mc)\n",
+    "    acc_GEO.append(geo)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T01:37:02.209200Z",
+     "start_time": "2018-05-17T01:37:02.200462Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.5139891137064413"
+      ]
+     },
+     "execution_count": 63,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "np.mean(np.nan_to_num(acc_GEO))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T01:37:02.250591Z",
+     "start_time": "2018-05-17T01:37:02.246260Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.5267050989770068"
+      ]
+     },
+     "execution_count": 64,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.mean(np.nan_to_num(acc_MC))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
+  },
+  "toc": {
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": "block",
+   "toc_window_display": false
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "position": {
+    "height": "297px",
+    "left": "914px",
+    "right": "20px",
+    "top": "120px",
+    "width": "350px"
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/EvalTopoMadagascar.ipynb b/notebooks/EvalTopoMadagascar.ipynb
new file mode 100644
index 0000000..9f6a358
--- /dev/null
+++ b/notebooks/EvalTopoMadagascar.ipynb
@@ -0,0 +1,719 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T06:15:39.543009Z",
+     "start_time": "2018-05-17T06:15:39.538598Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/Users/jacquesfize/nas_cloud/Code/str-python\n"
+     ]
+    }
+   ],
+   "source": [
+    "cd .."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T06:15:39.906690Z",
+     "start_time": "2018-05-17T06:15:39.545042Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from nlp.disambiguator.disambiguator import Disambiguator\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T06:15:41.165016Z",
+     "start_time": "2018-05-17T06:15:39.908807Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from pipeline import *\n",
+    "from nlp.pos_tagger.tagger import Tagger\n",
+    "from nlp.disambiguator.pagerank import *\n",
+    "from nlp.disambiguator.geodict_gaurav import *\n",
+    "from nlp.pos_tagger.treetagger import TreeTagger\n",
+    "from nlp.ner.stanford_ner import StanfordNER\n",
+    "from nlp.ner.polyglot import Polyglot\n",
+    "from nlp.ner.nltk import NLTK\n",
+    "from nlp.ner.gate_annie import GateAnnie\n",
+    "from nlp.ner.spacy import Spacy\n",
+    "from nlp.ner.ner import NER\n",
+    "from progressbar import ProgressBar\n",
+    "from polyglot.text import Text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T06:15:50.113793Z",
+     "start_time": "2018-05-17T06:15:41.167223Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Language may not be supported by NTLK !\n"
+     ]
+    }
+   ],
+   "source": [
+    "pipStanford={\n",
+    "    \"en\":Pipeline(lang=\"english\",tagger=Tagger(),ner=StanfordNER(lang=\"en\")),\n",
+    "    \"fr\":Pipeline(lang=\"french\",tagger=Tagger(),ner=StanfordNER(lang=\"fr\"))\n",
+    "}\n",
+    "\n",
+    "pipNLTK={\n",
+    "    \"en\":Pipeline(lang=\"english\",tagger=Tagger(),ner=NLTK(lang=\"en\")),\n",
+    "    \"fr\":Pipeline(lang=\"french\",tagger=Tagger(),ner=NLTK(lang=\"fr\"))\n",
+    "}\n",
+    "\n",
+    "pipPolyglot={  # language code -> Pipeline whose NER stage is Polyglot\n",
+    "    \"en\":Pipeline(lang=\"english\",tagger=Tagger(),ner=Polyglot(lang=\"en\")),\n",
+    "    \"fr\":Pipeline(lang=\"french\",tagger=Tagger(),ner=Polyglot(lang=\"fr\"))  # was lang=\"english\"; every other fr pipeline in this cell uses french\n",
+    "}\n",
+    "\n",
+    "pipGate={\n",
+    "    \"en\":Pipeline(lang=\"english\",tagger=Tagger(),ner=GateAnnie(lang=\"en\")),\n",
+    "    \"fr\":Pipeline(lang=\"french\",tagger=Tagger(),ner=GateAnnie(lang=\"fr\"))\n",
+    "}\n",
+    "\n",
+    "pipSpacy={\n",
+    "    \"en\":Pipeline(lang=\"english\",tagger=Tagger(),ner=Spacy(lang=\"en\")),\n",
+    "    \"fr\":Pipeline(lang=\"french\",tagger=Tagger(),ner=Spacy(lang=\"fr\"))\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T06:15:50.130340Z",
+     "start_time": "2018-05-17T06:15:50.115895Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "with open(\"/Users/jacquesfize/LOD_DATASETS/raw_bvlac/associated_lang.json\") as f:  # close the handle once parsed\n",
+    "    data_lang={int(k):v for k,v in json.load(f).items()}  # JSON object keys are strings; re-key by int"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T06:15:50.138305Z",
+     "start_time": "2018-05-17T06:15:50.132448Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import glob,re,sys\n",
+    "fns=glob.glob(\"data/mada_disambiguisation/*.csv\")\n",
+    "ids_list=[int(re.findall(r\"\\d+\",fn)[-1]) for fn in fns]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T06:15:50.143454Z",
+     "start_time": "2018-05-17T06:15:50.139829Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from ipywidgets import IntProgress\n",
+    "from IPython.display import display\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T06:15:50.169663Z",
+     "start_time": "2018-05-17T06:15:50.145641Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "input_dir=\"/Users/jacquesfize/LOD_DATASETS/raw_bvlac/\"\n",
+    "\n",
+    "def compute_precision_recall(pipeline):\n",
+    "    # Score the NER stage of each pipeline entry against the annotated CSVs in data/mada_disambiguisation.\n",
+    "    # pipeline: mapping from the language code stored in data_lang to an object whose\n",
+    "    # .ner.identify(text) result is accepted by Disambiguator.parse_corpus.\n",
+    "    # Returns (precision, recall): two lists with one value per scored document; a document\n",
+    "    # whose NER/parsing step raises is printed and contributes to neither list.\n",
+    "    precision=[]\n",
+    "    recall=[]\n",
+    "    for co,i in enumerate(ids_list):\n",
+    "        # enumerate keeps the progress counter advancing even when a document is skipped below\n",
+    "        sys.stdout.write(\"\\r{0}/{1}\".format(co,len(ids_list)))\n",
+    "        lang=data_lang[i]\n",
+    "        data_real=pd.read_csv(\"data/mada_disambiguisation/{0}.csv\".format(i))\n",
+    "        # keep rows whose GID is not O / NR / o (presumably markers for unresolved entries -- confirm)\n",
+    "        data_real=data_real[~data_real[\"GID\"].isin([\"O\",\"NR\",\"o\"])][[\"text\",\"GID\"]]\n",
+    "        with open(\"{0}/{1}.txt\".format(input_dir.rstrip(\"/\"),i)) as f:\n",
+    "            text=f.read()\n",
+    "        # best effort: a failing NER backend must not abort the whole evaluation run\n",
+    "        try:\n",
+    "            res_ner=pipeline[lang].ner.identify(text)\n",
+    "            res_ner=Disambiguator.parse_corpus(res_ner)\n",
+    "        except Exception as e:\n",
+    "            print(e)\n",
+    "            continue\n",
+    "        system_data=pd.DataFrame(res_ner,columns=[\"text\",\"pos\"])\n",
+    "        system_data=system_data[system_data[\"pos\"]==\"LOC\"]\n",
+    "        # true positives: case-insensitive overlap between distinct annotated and predicted names\n",
+    "        count_tp=len(set(data_real[\"text\"].str.lower().unique())&(set(system_data[\"text\"].str.lower().unique())))\n",
+    "        # NOTE(review): the denominators below count case-sensitive distinct names while count_tp\n",
+    "        # is case-insensitive -- confirm this asymmetry is intended.\n",
+    "        n_system=len(system_data[\"text\"].unique())\n",
+    "        n_real=len(data_real[\"text\"].unique())\n",
+    "        # an empty denominator scores 0 instead of raising ZeroDivisionError\n",
+    "        precision.append(count_tp/n_system if n_system else 0)\n",
+    "        recall.append(count_tp/n_real if n_real else 0)\n",
+    "    return precision,recall"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T06:15:50.201209Z",
+     "start_time": "2018-05-17T06:15:50.171396Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T06:27:25.917340Z",
+     "start_time": "2018-05-17T06:17:25.038572Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "117/2322\n",
+      "231/232"
+     ]
+    }
+   ],
+   "source": [
+    "%autoreload\n",
+    "prec_sp,rec_sp=compute_precision_recall(pipSpacy)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T06:43:36.230684Z",
+     "start_time": "2018-05-17T06:27:55.927495Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "3/2321\n",
+      "4/2321\n",
+      "41/2321\n",
+      "42/2321\n",
+      "43/2321\n",
+      "44/2321\n",
+      "46/2321\n",
+      "48/2321\n",
+      "51/232list index out of range\n",
+      "54/2321\n",
+      "61/2321\n",
+      "65/2321\n",
+      "76/2321\n",
+      "78/2321\n",
+      "79/2321\n",
+      "82/2321\n",
+      "83/2321\n",
+      "114/2321\n",
+      "116/2321\n",
+      "2\n",
+      "117/2321\n",
+      "156/2321\n",
+      "157/2321\n",
+      "174/2321\n",
+      "193/2321\n",
+      "194/2321\n",
+      "205/2321\n",
+      "211/2321\n",
+      "214/2321\n",
+      "215/2321\n",
+      "220/232list index out of range\n",
+      "222/2321\n",
+      "223/2321\n",
+      "229/232"
+     ]
+    }
+   ],
+   "source": [
+    "%autoreload\n",
+    "prec_st,rec_st=compute_precision_recall(pipStanford)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T06:56:10.536873Z",
+     "start_time": "2018-05-17T06:43:36.284258Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "117/2322\n",
+      "231/232"
+     ]
+    }
+   ],
+   "source": [
+    "prec_nl,rec_nl=compute_precision_recall(pipNLTK)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T07:05:03.304819Z",
+     "start_time": "2018-05-17T06:56:10.591028Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "41/232Package 'ner2.mg' not found in index\n",
+      "41/232Package 'ner2.mg' not found in index\n",
+      "67/232Package 'ner2.mg' not found in index\n",
+      "114/2321\n",
+      "2\n",
+      "228/232"
+     ]
+    }
+   ],
+   "source": [
+    "prec_po,rec_po=compute_precision_recall(pipPolyglot)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T07:19:35.445903Z",
+     "start_time": "2018-05-17T07:05:03.362992Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2/232list index out of range\n",
+      "3/232list index out of range\n",
+      "5/232list index out of range\n",
+      "8/232list index out of range\n",
+      "15/232list index out of range\n",
+      "15/232list index out of range\n",
+      "15/232list index out of range\n",
+      "15/232list index out of range\n",
+      "15/232list index out of range\n",
+      "15/232list index out of range\n",
+      "15/232list index out of range\n",
+      "16/232list index out of range\n",
+      "21/232list index out of range\n",
+      "27/232list index out of range\n",
+      "27/232list index out of range\n",
+      "27/232list index out of range\n",
+      "27/232list index out of range\n",
+      "27/232list index out of range\n",
+      "27/232list index out of range\n",
+      "28/232list index out of range\n",
+      "28/2321\n",
+      "29/232list index out of range\n",
+      "29/232list index out of range\n",
+      "34/232list index out of range\n",
+      "34/232list index out of range\n",
+      "34/232list index out of range\n",
+      "34/232list index out of range\n",
+      "34/232list index out of range\n",
+      "35/232list index out of range\n",
+      "36/232list index out of range\n",
+      "38/232list index out of range\n",
+      "38/232list index out of range\n",
+      "38/232list index out of range\n",
+      "38/232list index out of range\n",
+      "38/232list index out of range\n",
+      "44/232list index out of range\n",
+      "49/232list index out of range\n",
+      "50/232list index out of range\n",
+      "51/232list index out of range\n",
+      "51/232list index out of range\n",
+      "52/232list index out of range\n",
+      "52/232list index out of range\n",
+      "53/232list index out of range\n",
+      "54/232list index out of range\n",
+      "56/232list index out of range\n",
+      "58/232list index out of range\n",
+      "58/232list index out of range\n",
+      "60/232list index out of range\n",
+      "60/232list index out of range\n",
+      "61/2321\n",
+      "62/2321\n",
+      "63/232list index out of range\n",
+      "63/232list index out of range\n",
+      "63/232list index out of range\n",
+      "63/232list index out of range\n",
+      "64/232list index out of range\n",
+      "64/2321\n",
+      "2\n",
+      "65/2321\n",
+      "66/232list index out of range\n",
+      "66/232list index out of range\n",
+      "66/232list index out of range\n",
+      "66/232list index out of range\n",
+      "66/232list index out of range\n",
+      "72/232list index out of range\n",
+      "72/232list index out of range\n",
+      "72/232list index out of range\n",
+      "72/232list index out of range\n",
+      "72/232list index out of range\n",
+      "72/232list index out of range\n",
+      "72/232list index out of range\n",
+      "72/232list index out of range\n",
+      "73/232list index out of range\n",
+      "73/232list index out of range\n",
+      "73/232list index out of range\n",
+      "73/232list index out of range\n",
+      "73/232list index out of range\n",
+      "74/232list index out of range\n",
+      "77/232list index out of range\n",
+      "80/232list index out of range\n",
+      "80/232list index out of range\n",
+      "82/232list index out of range\n",
+      "84/232list index out of range\n",
+      "84/232list index out of range\n",
+      "89/232list index out of range\n",
+      "89/232list index out of range\n",
+      "89/232list index out of range\n",
+      "89/232list index out of range\n",
+      "89/232list index out of range\n",
+      "89/232list index out of range\n",
+      "95/232list index out of range\n",
+      "95/2321\n",
+      "96/232list index out of range\n",
+      "100/232list index out of range\n",
+      "101/232list index out of range\n",
+      "102/232list index out of range\n",
+      "102/232list index out of range\n",
+      "102/232list index out of range\n",
+      "105/232list index out of range\n",
+      "108/232"
+     ]
+    },
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[0;32m/usr/local/lib/python3.6/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m    379\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m  \u001b[0;31m# Python 2.7, use buffering of HTTP responses\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 380\u001b[0;31m                 \u001b[0mhttplib_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetresponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuffering\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    381\u001b[0m             \u001b[0;32mexcept\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m:\u001b[0m  \u001b[0;31m# Python 2.6 and older, Python 3\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mTypeError\u001b[0m: getresponse() got an unexpected keyword argument 'buffering'",
+      "\nDuring handling of the above exception, another exception occurred:\n",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-15-ddd472848dde>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mprec_ga\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mrec_ga\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcompute_precision_recall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpipGate\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m<ipython-input-8-f7e3a40e4d49>\u001b[0m in \u001b[0;36mcompute_precision_recall\u001b[0;34m(pipeline)\u001b[0m\n\u001b[1;32m     13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     14\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m             \u001b[0mres_ner\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpipeline\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mlang\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mner\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0midentify\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     16\u001b[0m             \u001b[0mres_ner\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mDisambiguator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparse_corpus\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mres_ner\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     17\u001b[0m         \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/Users/jacquesfize/nas_cloud/Code/str-python/nlp/ner/gate_annie.py\u001b[0m in \u001b[0;36midentify\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m     18\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     19\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m         \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpost\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhost\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"/ner\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"utf-8\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontent\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     21\u001b[0m         \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"utf-8\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     22\u001b[0m         \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\t\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mr\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/usr/local/lib/python3.6/site-packages/requests/api.py\u001b[0m in \u001b[0;36mpost\u001b[0;34m(url, data, json, **kwargs)\u001b[0m\n\u001b[1;32m    110\u001b[0m     \"\"\"\n\u001b[1;32m    111\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 112\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'post'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    113\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    114\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/usr/local/lib/python3.6/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m     56\u001b[0m     \u001b[0;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     57\u001b[0m     \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 58\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/usr/local/lib/python3.6/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m    506\u001b[0m         }\n\u001b[1;32m    507\u001b[0m         \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 508\u001b[0;31m         \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    509\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    510\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/usr/local/lib/python3.6/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m    616\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    617\u001b[0m         \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 618\u001b[0;31m         \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    619\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    620\u001b[0m         \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/usr/local/lib/python3.6/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m    438\u001b[0m                     \u001b[0mdecode_content\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    439\u001b[0m                     \u001b[0mretries\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_retries\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 440\u001b[0;31m                     \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    441\u001b[0m                 )\n\u001b[1;32m    442\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/usr/local/lib/python3.6/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m    599\u001b[0m                                                   \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    600\u001b[0m                                                   \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 601\u001b[0;31m                                                   chunked=chunked)\n\u001b[0m\u001b[1;32m    602\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    603\u001b[0m             \u001b[0;31m# If we're going to release the connection in ``finally:``, then\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/usr/local/lib/python3.6/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m    381\u001b[0m             \u001b[0;32mexcept\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m:\u001b[0m  \u001b[0;31m# Python 2.6 and older, Python 3\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    382\u001b[0m                 \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 383\u001b[0;31m                     \u001b[0mhttplib_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetresponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    384\u001b[0m                 \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    385\u001b[0m                     \u001b[0;31m# Remove the TypeError from the exception chain in Python 3;\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/http/client.py\u001b[0m in \u001b[0;36mgetresponse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1329\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1330\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1331\u001b[0;31m                 \u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbegin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1332\u001b[0m             \u001b[0;32mexcept\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1333\u001b[0m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/http/client.py\u001b[0m in \u001b[0;36mbegin\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    295\u001b[0m         \u001b[0;31m# read until we get a non-100 response\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    296\u001b[0m         \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 297\u001b[0;31m             \u001b[0mversion\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreason\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    298\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mstatus\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mCONTINUE\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    299\u001b[0m                 \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/http/client.py\u001b[0m in \u001b[0;36m_read_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    256\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    257\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_read_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 258\u001b[0;31m         \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadline\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_MAXLINE\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"iso-8859-1\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    259\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0m_MAXLINE\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    260\u001b[0m             \u001b[0;32mraise\u001b[0m \u001b[0mLineTooLong\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"status line\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/socket.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m    584\u001b[0m         \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    585\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 586\u001b[0;31m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    587\u001b[0m             \u001b[0;32mexcept\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    588\u001b[0m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_timeout_occurred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+     ]
+    }
+   ],
+   "source": [
+    "prec_ga,rec_ga=compute_precision_recall(pipGate)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T07:25:36.506464Z",
+     "start_time": "2018-05-17T07:25:36.496991Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "def m(x):\n",
+    "    return np.mean(np.nan_to_num(x))\n",
+    "cols=[\"NER\",\"P\",\"R\"]\n",
+    "df=pd.DataFrame(columns=cols)\n",
+    "df=pd.DataFrame([[\"StanfordNER\",m(prec_st),m(rec_st)],\n",
+    "                        [\"Polyglot\",m(prec_po),m(rec_po)],[\"NLTK\",m(prec_nl),m(rec_nl)],\n",
+    "                       [\"Spacy\",m(prec_sp),m(rec_sp)]],columns=cols)\n",
+    "df[\"F\"]= df.apply(lambda x: 2*((x[\"P\"]*x[\"R\"])/(x[\"P\"]+x[\"R\"])), axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T07:25:37.723293Z",
+     "start_time": "2018-05-17T07:25:37.713231Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>NER</th>\n",
+       "      <th>P</th>\n",
+       "      <th>R</th>\n",
+       "      <th>F</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>StanfordNER</td>\n",
+       "      <td>0.319804</td>\n",
+       "      <td>0.169799</td>\n",
+       "      <td>0.221822</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Polyglot</td>\n",
+       "      <td>0.207006</td>\n",
+       "      <td>0.356064</td>\n",
+       "      <td>0.261805</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>NLTK</td>\n",
+       "      <td>0.137581</td>\n",
+       "      <td>0.158004</td>\n",
+       "      <td>0.147087</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Spacy</td>\n",
+       "      <td>0.147053</td>\n",
+       "      <td>0.849829</td>\n",
+       "      <td>0.250722</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "           NER         P         R         F\n",
+       "0  StanfordNER  0.319804  0.169799  0.221822\n",
+       "1     Polyglot  0.207006  0.356064  0.261805\n",
+       "2         NLTK  0.137581  0.158004  0.147087\n",
+       "3        Spacy  0.147053  0.849829  0.250722"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-17T07:51:46.198366Z",
+     "start_time": "2018-05-17T07:51:46.192160Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\\begin{tabular}{llrrr}\n",
+      "\\toprule\n",
+      "{} &          NER &         P &         R &         F \\\\\n",
+      "\\midrule\n",
+      "0 &  StanfordNER &  0.319804 &  0.169799 &  0.221822 \\\\\n",
+      "1 &     Polyglot &  0.207006 &  0.356064 &  0.261805 \\\\\n",
+      "2 &         NLTK &  0.137581 &  0.158004 &  0.147087 \\\\\n",
+      "3 &        Spacy &  0.147053 &  0.849829 &  0.250722 \\\\\n",
+      "\\bottomrule\n",
+      "\\end{tabular}\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(df.to_latex())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
+  },
+  "toc": {
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": "block",
+   "toc_window_display": false
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "position": {
+    "height": "217px",
+    "left": "915px",
+    "right": "28px",
+    "top": "120px",
+    "width": "341px"
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/NER Evaluation.ipynb b/notebooks/NER Evaluation.ipynb
index 67d847a..85610ef 100644
--- a/notebooks/NER Evaluation.ipynb	
+++ b/notebooks/NER Evaluation.ipynb	
@@ -4,16 +4,17 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {
-    "collapsed": false,
-    "deletable": true,
-    "editable": true
+    "ExecuteTime": {
+     "end_time": "2018-05-08T15:19:37.341397Z",
+     "start_time": "2018-05-08T15:19:37.337211Z"
+    }
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "/Users/jacquesfize/ownCloud/THESE/Code/str-python\n"
+      "/Users/jacquesfize/nas_cloud/Code/str-python\n"
      ]
     }
    ],
@@ -23,28 +24,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 2,
    "metadata": {
-    "collapsed": false,
-    "deletable": true,
-    "editable": true
+    "ExecuteTime": {
+     "end_time": "2018-05-08T15:19:37.412267Z",
+     "start_time": "2018-05-08T15:19:37.343429Z"
+    }
    },
    "outputs": [],
    "source": [
     "import json\n",
     "import os,sys,re,glob\n",
     "from elasticsearch import Elasticsearch\n",
-    "\n",
-    "es_client=Elasticsearch(hosts=\"172.16.50.33:32773\")"
+    "es_client=Elasticsearch()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {
-    "collapsed": false,
-    "deletable": true,
-    "editable": true,
+    "ExecuteTime": {
+     "end_time": "2018-05-08T15:19:37.738728Z",
+     "start_time": "2018-05-08T15:19:37.414740Z"
+    },
     "scrolled": false
    },
    "outputs": [],
@@ -91,9 +93,10 @@
    "cell_type": "code",
    "execution_count": 4,
    "metadata": {
-    "collapsed": false,
-    "deletable": true,
-    "editable": true
+    "ExecuteTime": {
+     "end_time": "2018-05-08T15:19:37.750366Z",
+     "start_time": "2018-05-08T15:19:37.740586Z"
+    }
    },
    "outputs": [
     {
@@ -142,20 +145,14 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "deletable": true,
-    "editable": true
-   },
+   "metadata": {},
    "source": [
     "## Chargement des Données"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "deletable": true,
-    "editable": true
-   },
+   "metadata": {},
    "source": [
     "# Transformation des données\n",
     "\n",
@@ -164,11 +161,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 5,
    "metadata": {
-    "collapsed": false,
-    "deletable": true,
-    "editable": true
+    "ExecuteTime": {
+     "end_time": "2018-05-08T15:19:37.754389Z",
+     "start_time": "2018-05-08T15:19:37.752283Z"
+    }
    },
    "outputs": [],
    "source": [
@@ -177,11 +175,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 6,
    "metadata": {
-    "collapsed": false,
-    "deletable": true,
-    "editable": true
+    "ExecuteTime": {
+     "end_time": "2018-05-08T15:19:37.768916Z",
+     "start_time": "2018-05-08T15:19:37.756469Z"
+    }
    },
    "outputs": [],
    "source": [
@@ -212,11 +211,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "metadata": {
-    "collapsed": false,
-    "deletable": true,
-    "editable": true
+    "ExecuteTime": {
+     "end_time": "2018-05-08T15:19:41.186582Z",
+     "start_time": "2018-05-08T15:19:37.771576Z"
+    }
    },
    "outputs": [],
    "source": [
@@ -225,45 +225,62 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "metadata": {
-    "collapsed": false,
-    "deletable": true,
-    "editable": true
+    "ExecuteTime": {
+     "end_time": "2018-05-08T15:19:42.471682Z",
+     "start_time": "2018-05-08T15:19:41.188081Z"
+    }
    },
    "outputs": [],
    "source": [
     "from pipeline import *\n",
-    "from pos_tagger.tagger import Tagger\n",
-    "from disambiguator.pagerank import *\n",
-    "from disambiguator.geodict_gaurav import *\n",
-    "from pos_tagger.treetagger import TreeTagger\n",
-    "from ner.stanford_ner import StanfordNER\n",
-    "from ner.polyglot import Polyglot\n",
-    "from ner.nltk import NLTK\n",
-    "from ner.gate_annie import GateAnnie\n",
-    "from ner.ner import NER\n",
+    "from nlp.pos_tagger.tagger import Tagger\n",
+    "from nlp.disambiguator.pagerank import *\n",
+    "from nlp.disambiguator.geodict_gaurav import *\n",
+    "from nlp.pos_tagger.treetagger import TreeTagger\n",
+    "from nlp.ner.stanford_ner import StanfordNER\n",
+    "from nlp.ner.polyglot import Polyglot\n",
+    "from nlp.ner.nltk import NLTK\n",
+    "from nlp.ner.gate_annie import GateAnnie\n",
+    "from nlp.ner.spacy import Spacy\n",
+    "from nlp.ner.ner import NER\n",
     "from progressbar import ProgressBar\n",
     "from polyglot.text import Text\n",
     "\n",
     "\n",
-    "from disambiguator.disambiguator import Disambiguator"
+    "from nlp.disambiguator.disambiguator import Disambiguator"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "metadata": {
-    "collapsed": false,
-    "deletable": true,
-    "editable": true
+    "ExecuteTime": {
+     "end_time": "2018-05-08T15:19:43.131232Z",
+     "start_time": "2018-05-08T15:19:42.473854Z"
+    }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:5: DeprecationWarning: Call to deprecated class Spacy (Not finished yet !).\n",
+      "  \"\"\"\n",
+      "/usr/local/lib/python3.6/site-packages/msgpack_numpy.py:84: DeprecationWarning: The binary mode of fromstring is deprecated, as it behaves surprisingly on unicode inputs. Use frombuffer instead\n",
+      "  dtype=np.dtype(descr)).reshape(obj[b'shape'])\n",
+      "/usr/local/lib/python3.6/site-packages/msgpack_numpy.py:88: DeprecationWarning: The binary mode of fromstring is deprecated, as it behaves surprisingly on unicode inputs. Use frombuffer instead\n",
+      "  dtype=np.dtype(descr))[0]\n"
+     ]
+    }
+   ],
    "source": [
     "pipStanford=Pipeline(lang=\"english\",tagger=Tagger(),ner=StanfordNER(lang=\"en\"))\n",
     "pipNLTK=Pipeline(lang=\"english\",tagger=Tagger(),ner=NLTK(lang=\"en\"))\n",
     "pipPolyglot=Pipeline(lang=\"english\",tagger=Tagger(),ner=Polyglot())\n",
-    "pipGate=Pipeline(lang=\"english\",tagger=Tagger(),ner=GateAnnie(lang=\"en\"))\n"
+    "pipGate=Pipeline(lang=\"english\",tagger=Tagger(),ner=GateAnnie(lang=\"en\"))\n",
+    "pipSpacy=Pipeline(lang=\"english\",tagger=Tagger(),ner=Spacy(lang=\"en\"))"
    ]
   },
   {
@@ -281,13 +298,140 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 13,
    "metadata": {
-    "collapsed": false,
-    "deletable": true,
-    "editable": true
+    "ExecuteTime": {
+     "end_time": "2018-05-08T15:23:08.238218Z",
+     "start_time": "2018-05-08T15:23:08.179062Z"
+    }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<input>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "<ipython-input-13-b29052c515c1>:2: DeprecationWarning: invalid escape sequence \\s\n",
+      "  return re.sub(\"[!]+\",\" \",re.sub(\"\\s\",\"!!\",label.strip()))\n"
+     ]
+    }
+   ],
    "source": [
     "def parse_epi_labels(label):\n",
     "    return re.sub(\"[!]+\",\" \",re.sub(\"\\s\",\"!!\",label.strip()))\n",
@@ -302,7 +446,7 @@
     "        for i in texts:\n",
     "            if not i in ann_data or not texts[i]:continue\n",
     "            try:\n",
-    "                output,spat_entities=pipeline.parse(texts[i])\n",
+    "                _,output,spat_entities=pipeline.parse(texts[i])\n",
     "            except:\n",
     "                #print(texts[i])\n",
     "                continue\n",
@@ -328,7 +472,7 @@
     "        for i in texts:\n",
     "            if not i in ann_data or not texts[i]:continue\n",
     "            try:\n",
-    "                output,spat_entities=pipeline.parse(texts[i])\n",
+    "                _,output,spat_entities=pipeline.parse(texts[i])\n",
     "            except:\n",
     "                #print(texts[i])\n",
     "                continue\n",
@@ -359,9 +503,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "metadata": {
-    "collapsed": false
+    "ExecuteTime": {
+     "end_time": "2018-05-08T15:19:43.278565Z",
+     "start_time": "2018-05-08T15:19:43.190533Z"
+    }
    },
    "outputs": [],
    "source": [
@@ -383,7 +530,7 @@
     "            if not i in ann_data or not texts[i]:\n",
     "                continue\n",
     "            try:\n",
-    "                output, spat_entities = pipeline.parse(texts[i])\n",
+    "                _,output, spat_entities = pipeline.parse(texts[i])\n",
     "            except:\n",
     "                # print(texts[i])\n",
     "                continue\n",
@@ -431,9 +578,9 @@
     "            if not i in ann_data or not texts[i]:\n",
     "                continue\n",
     "            try:\n",
-    "                output, spat_entities = pipeline.parse(texts[i])\n",
+    "                _,output, spat_entities = pipeline.parse(texts[i])\n",
     "            except:\n",
-    "                # print(texts[i])\n",
+    "                #print(texts[i])\n",
     "                continue\n",
     "            out = Disambiguator.parse_corpus(output)\n",
     "\n",
@@ -461,17 +608,65 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 14,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-08T15:25:06.163620Z",
+     "start_time": "2018-05-08T15:23:14.468336Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100% (532 of 532) |######################| Elapsed Time: 0:00:55 Time:  0:00:55\n",
+      "N/A% (0 of 532) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "464 464\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100% (532 of 532) |######################| Elapsed Time: 0:00:55 Time:  0:00:55\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "464 501\n"
+     ]
+    }
+   ],
+   "source": [
+    "rec_spacy = ner_recall_epi(texts, ann_data, pipSpacy)\n",
+    "prec_spacy = ner_precision_epi(texts, ann_data, pipSpacy)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
    "metadata": {
-    "collapsed": false
+    "ExecuteTime": {
+     "end_time": "2018-05-08T15:37:02.477840Z",
+     "start_time": "2018-05-08T15:25:25.959366Z"
+    }
    },
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100% (532 of 532) |##############################################################################| Elapsed Time: 0:02:27 Time: 0:02:27\n",
-      "  0% (1 of 532) |                                                                                 | Elapsed Time: 0:00:00 ETA: 0:01:38"
+      "100% (532 of 532) |######################| Elapsed Time: 0:02:15 Time:  0:02:15\n",
+      "N/A% (0 of 532) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--"
      ]
     },
     {
@@ -485,25 +680,22 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100% (532 of 532) |##############################################################################| Elapsed Time: 0:02:28 Time: 0:02:28\n",
-      "  0% (2 of 532) |                                                                                 | Elapsed Time: 0:00:00 ETA: 0:00:29"
+      "100% (532 of 532) |######################| Elapsed Time: 0:02:25 Time:  0:02:25\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "457 494\n"
+      "463 500\n"
      ]
     },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "  3% (21 of 532) |###                                                                             | Elapsed Time: 0:00:02 ETA: 0:01:01Detector is not able to detect the language reliably.\n",
-      " 67% (361 of 532) |#####################################################                          | Elapsed Time: 0:00:42 ETA: 0:00:19Detector is not able to detect the language reliably.\n",
-      "100% (532 of 532) |##############################################################################| Elapsed Time: 0:00:56 Time: 0:00:56\n",
-      "  0% (2 of 532) |                                                                                 | Elapsed Time: 0:00:00 ETA: 0:00:28"
+      "100% (532 of 532) |######################| Elapsed Time: 0:01:06 Time:  0:01:06\n",
+      "N/A% (0 of 532) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--"
      ]
     },
     {
@@ -517,25 +709,22 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "  3% (21 of 532) |###                                                                             | Elapsed Time: 0:00:02 ETA: 0:01:04Detector is not able to detect the language reliably.\n",
-      " 70% (374 of 532) |#######################################################                        | Elapsed Time: 0:00:43 ETA: 0:00:19Detector is not able to detect the language reliably.\n",
-      "100% (532 of 532) |##############################################################################| Elapsed Time: 0:00:56 Time: 0:00:56\n",
-      "N/A% (0 of 532) |                                                                               | Elapsed Time: 0:00:00 ETA:  --:--:--"
+      "100% (532 of 532) |######################| Elapsed Time: 0:00:56 Time:  0:00:56\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "444 479\n"
+      "455 490\n"
      ]
     },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100% (532 of 532) |##############################################################################| Elapsed Time: 0:01:23 Time: 0:01:23\n",
-      "  0% (1 of 532) |                                                                                 | Elapsed Time: 0:00:00 ETA: 0:01:01"
+      "100% (532 of 532) |######################| Elapsed Time: 0:01:24 Time:  0:01:24\n",
+      "N/A% (0 of 532) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--"
      ]
     },
     {
@@ -549,25 +738,23 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100% (532 of 532) |##############################################################################| Elapsed Time: 0:01:25 Time: 0:01:25\n",
-      "  0% (1 of 532) |                                                                                 | Elapsed Time: 0:00:00 ETA: 0:01:05"
+      "100% (532 of 532) |######################| Elapsed Time: 0:01:24 Time:  0:01:24\n",
+      "N/A% (0 of 532) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "454 491\n"
+      "463 500\n"
      ]
     },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "  3% (21 of 532) |###                                                                             | Elapsed Time: 0:00:02 ETA: 0:01:04Detector is not able to detect the language reliably.\n",
-      " 66% (355 of 532) |####################################################                           | Elapsed Time: 0:00:36 ETA: 0:00:19Detector is not able to detect the language reliably.\n",
-      "100% (532 of 532) |##############################################################################| Elapsed Time: 0:00:47 Time: 0:00:47\n",
-      "  0% (2 of 532) |                                                                                 | Elapsed Time: 0:00:00 ETA: 0:00:32"
+      "100% (532 of 532) |######################| Elapsed Time: 0:01:00 Time:  0:01:00\n",
+      "N/A% (0 of 532) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--"
      ]
     },
     {
@@ -581,16 +768,14 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "  3% (21 of 532) |###                                                                             | Elapsed Time: 0:00:02 ETA: 0:00:51Detector is not able to detect the language reliably.\n",
-      " 68% (365 of 532) |######################################################                         | Elapsed Time: 0:00:35 ETA: 0:00:17Detector is not able to detect the language reliably.\n",
-      "100% (532 of 532) |##############################################################################| Elapsed Time: 0:00:46 Time: 0:00:46\n"
+      "100% (532 of 532) |######################| Elapsed Time: 0:00:56 Time:  0:00:56\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "430 465\n"
+      "443 478\n"
      ]
     }
    ],
@@ -605,22 +790,24 @@
     "prec_nltk = ner_precision_epi(texts, ann_data, pipNLTK)\n",
     "\n",
     "rec_gate = ner_recall_epi(texts, ann_data, pipGate)\n",
-    "prec_gate = ner_precision_epi(texts, ann_data, pipGate)"
+    "prec_gate = ner_precision_epi(texts, ann_data, pipGate)\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 16,
    "metadata": {
-    "collapsed": false
+    "ExecuteTime": {
+     "end_time": "2018-05-08T15:48:32.390113Z",
+     "start_time": "2018-05-08T15:37:09.480442Z"
+    }
    },
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100% (532 of 532) |##############################################################################| Elapsed Time: 0:02:19 Time: 0:02:19\n",
-      "  0% (1 of 532) |                                                                                 | Elapsed Time: 0:00:00 ETA: 0:01:37"
+      "100% (532 of 532) |######################| Elapsed Time: 0:02:17 Time:  0:02:17\n"
      ]
     },
     {
@@ -634,8 +821,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100% (532 of 532) |##############################################################################| Elapsed Time: 0:02:18 Time: 0:02:18\n",
-      "  0% (2 of 532) |                                                                                 | Elapsed Time: 0:00:00 ETA: 0:00:27"
+      "100% (532 of 532) |######################| Elapsed Time: 0:02:20 Time:  0:02:20\n",
+      "N/A% (0 of 532) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--"
      ]
     },
     {
@@ -649,10 +836,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "  3% (21 of 532) |###                                                                             | Elapsed Time: 0:00:02 ETA: 0:00:59Detector is not able to detect the language reliably.\n",
-      " 67% (361 of 532) |#####################################################                          | Elapsed Time: 0:00:41 ETA: 0:00:19Detector is not able to detect the language reliably.\n",
-      "100% (532 of 532) |##############################################################################| Elapsed Time: 0:00:53 Time: 0:00:53\n",
-      "  0% (2 of 532) |                                                                                 | Elapsed Time: 0:00:00 ETA: 0:00:29"
+      "100% (532 of 532) |######################| Elapsed Time: 0:00:57 Time:  0:00:57\n",
+      "N/A% (0 of 532) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--"
      ]
     },
     {
@@ -666,10 +851,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "  4% (24 of 532) |###                                                                             | Elapsed Time: 0:00:02 ETA: 0:00:54Detector is not able to detect the language reliably.\n",
-      " 71% (383 of 532) |########################################################                       | Elapsed Time: 0:00:41 ETA: 0:00:17Detector is not able to detect the language reliably.\n",
-      "100% (532 of 532) |##############################################################################| Elapsed Time: 0:00:53 Time: 0:00:53\n",
-      "  0% (1 of 532) |                                                                                 | Elapsed Time: 0:00:00 ETA: 0:01:02"
+      "100% (532 of 532) |######################| Elapsed Time: 0:00:57 Time:  0:00:57\n",
+      "N/A% (0 of 532) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--"
      ]
     },
     {
@@ -683,8 +866,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100% (532 of 532) |##############################################################################| Elapsed Time: 0:01:20 Time: 0:01:20\n",
-      "  0% (1 of 532) |                                                                                 | Elapsed Time: 0:00:00 ETA: 0:00:57"
+      "100% (532 of 532) |######################| Elapsed Time: 0:01:25 Time:  0:01:25\n",
+      "  0% (3 of 532) |                        | Elapsed Time: 0:00:00 ETA:   0:00:18"
      ]
     },
     {
@@ -698,8 +881,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100% (532 of 532) |##############################################################################| Elapsed Time: 0:01:20 Time: 0:01:20\n",
-      "  0% (4 of 532) |                                                                                 | Elapsed Time: 0:00:00 ETA: 0:00:21"
+      "100% (532 of 532) |######################| Elapsed Time: 0:01:25 Time:  0:01:25\n",
+      "  0% (4 of 532) |                        | Elapsed Time: 0:00:00 ETA:   0:00:18"
      ]
     },
     {
@@ -713,10 +896,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "  3% (21 of 532) |###                                                                             | Elapsed Time: 0:00:01 ETA: 0:00:47Detector is not able to detect the language reliably.\n",
-      " 66% (355 of 532) |####################################################                           | Elapsed Time: 0:00:34 ETA: 0:00:17Detector is not able to detect the language reliably.\n",
-      "100% (532 of 532) |##############################################################################| Elapsed Time: 0:00:45 Time: 0:00:45\n",
-      "  0% (2 of 532) |                                                                                 | Elapsed Time: 0:00:00 ETA: 0:00:32"
+      "100% (532 of 532) |######################| Elapsed Time: 0:00:57 Time:  0:00:57\n",
+      "N/A% (0 of 532) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--"
      ]
     },
     {
@@ -730,9 +911,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "  4% (22 of 532) |###                                                                             | Elapsed Time: 0:00:01 ETA: 0:00:47Detector is not able to detect the language reliably.\n",
-      " 70% (374 of 532) |#######################################################                        | Elapsed Time: 0:00:34 ETA: 0:00:15Detector is not able to detect the language reliably.\n",
-      "100% (532 of 532) |##############################################################################| Elapsed Time: 0:00:45 Time: 0:00:45\n"
+      "100% (532 of 532) |######################| Elapsed Time: 0:01:00 Time:  0:01:00\n"
      ]
     },
     {
@@ -759,9 +938,57 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 17,
    "metadata": {
-    "collapsed": false
+    "ExecuteTime": {
+     "end_time": "2018-05-08T15:50:37.503408Z",
+     "start_time": "2018-05-08T15:48:45.036943Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100% (532 of 532) |######################| Elapsed Time: 0:00:56 Time:  0:00:56\n",
+      "N/A% (0 of 532) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "464 464\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100% (532 of 532) |######################| Elapsed Time: 0:00:56 Time:  0:00:56\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "464 501\n"
+     ]
+    }
+   ],
+   "source": [
+    "rec_spacy_all = ner_recall_epi_all(texts, ann_data, pipSpacy)\n",
+    "prec_spacy_all = ner_precision_epi_all(texts, ann_data, pipSpacy)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-08T15:51:07.306398Z",
+     "start_time": "2018-05-08T15:51:07.295232Z"
+    }
    },
    "outputs": [],
    "source": [
@@ -769,22 +996,39 @@
     "df=pd.DataFrame(columns=cols)\n",
     "df=pd.DataFrame([[\"StanfordNER\",prec_SFNER,rec_SFNER,prec_SFNER_all,rec_SFNER_all],\n",
     "                        [\"Polyglot\",prec_poly,rec_poly,prec_poly_all,rec_poly_all],[\"NLTK\",prec_nltk,rec_nltk,prec_nltk_all,rec_nltk_all],\n",
-    "                       [\"GATE\",prec_gate,rec_gate,prec_gate_all,rec_gate_all]],columns=cols)\n",
+    "                       [\"GATE\",prec_gate,rec_gate,prec_gate_all,rec_gate_all],\n",
+    "                [\"Spacy\",prec_spacy,rec_spacy,prec_spacy_all,rec_spacy_all]],columns=cols)\n",
     "df[\"F-Measure(D)\"]= df.apply(lambda x: 2*((x[\"Precision(ID)\"]*x[\"Recall(ID)\"])/(x[\"Precision(ID)\"]+x[\"Recall(ID)\"])), axis=1)\n",
     "df[\"F-Measure(ALL)\"]= df.apply(lambda x: 2*((x[\"Precision(ALL)\"]*x[\"Recall(ALL)\"])/(x[\"Precision(ALL)\"]+x[\"Recall(ALL)\"])), axis=1)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 19,
    "metadata": {
-    "collapsed": false
+    "ExecuteTime": {
+     "end_time": "2018-05-08T15:51:08.070983Z",
+     "start_time": "2018-05-08T15:51:08.058901Z"
+    }
    },
    "outputs": [
     {
      "data": {
       "text/html": [
        "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
@@ -802,90 +1046,92 @@
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>StanfordNER</td>\n",
-       "      <td>0.689301</td>\n",
-       "      <td>0.737253</td>\n",
-       "      <td>0.667065</td>\n",
-       "      <td>0.720500</td>\n",
-       "      <td>0.712471</td>\n",
-       "      <td>0.692754</td>\n",
+       "      <td>0.594245</td>\n",
+       "      <td>0.771514</td>\n",
+       "      <td>0.666652</td>\n",
+       "      <td>0.718504</td>\n",
+       "      <td>0.671375</td>\n",
+       "      <td>0.691608</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>Polyglot</td>\n",
-       "      <td>0.669563</td>\n",
-       "      <td>0.703822</td>\n",
+       "      <td>0.532444</td>\n",
+       "      <td>0.724127</td>\n",
        "      <td>0.608216</td>\n",
        "      <td>0.666334</td>\n",
-       "      <td>0.686265</td>\n",
+       "      <td>0.613666</td>\n",
        "      <td>0.635950</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>NLTK</td>\n",
-       "      <td>0.561466</td>\n",
-       "      <td>0.635291</td>\n",
-       "      <td>0.488915</td>\n",
-       "      <td>0.609152</td>\n",
-       "      <td>0.596101</td>\n",
-       "      <td>0.542451</td>\n",
+       "      <td>0.429511</td>\n",
+       "      <td>0.665637</td>\n",
+       "      <td>0.497519</td>\n",
+       "      <td>0.617828</td>\n",
+       "      <td>0.522119</td>\n",
+       "      <td>0.551185</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>GATE</td>\n",
-       "      <td>0.701907</td>\n",
-       "      <td>0.617005</td>\n",
+       "      <td>0.578102</td>\n",
+       "      <td>0.626061</td>\n",
        "      <td>0.633567</td>\n",
        "      <td>0.585320</td>\n",
-       "      <td>0.656724</td>\n",
+       "      <td>0.601126</td>\n",
        "      <td>0.608488</td>\n",
        "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Spacy</td>\n",
+       "      <td>0.406803</td>\n",
+       "      <td>0.652530</td>\n",
+       "      <td>0.404245</td>\n",
+       "      <td>0.616491</td>\n",
+       "      <td>0.501167</td>\n",
+       "      <td>0.488301</td>\n",
+       "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
        "           NER  Precision(ID)  Recall(ID)  Precision(ALL)  Recall(ALL)  \\\n",
-       "0  StanfordNER       0.689301    0.737253        0.667065     0.720500   \n",
-       "1     Polyglot       0.669563    0.703822        0.608216     0.666334   \n",
-       "2         NLTK       0.561466    0.635291        0.488915     0.609152   \n",
-       "3         GATE       0.701907    0.617005        0.633567     0.585320   \n",
+       "0  StanfordNER       0.594245    0.771514        0.666652     0.718504   \n",
+       "1     Polyglot       0.532444    0.724127        0.608216     0.666334   \n",
+       "2         NLTK       0.429511    0.665637        0.497519     0.617828   \n",
+       "3         GATE       0.578102    0.626061        0.633567     0.585320   \n",
+       "4        Spacy       0.406803    0.652530        0.404245     0.616491   \n",
        "\n",
        "   F-Measure(D)  F-Measure(ALL)  \n",
-       "0      0.712471        0.692754  \n",
-       "1      0.686265        0.635950  \n",
-       "2      0.596101        0.542451  \n",
-       "3      0.656724        0.608488  "
+       "0      0.671375        0.691608  \n",
+       "1      0.613666        0.635950  \n",
+       "2      0.522119        0.551185  \n",
+       "3      0.601126        0.608488  \n",
+       "4      0.501167        0.488301  "
       ]
      },
-     "execution_count": 28,
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "df.to"
+    "df"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": null,
    "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "|    | NER         |   Precision(ID) |   Recall(ID) |   Precision(ALL) |   Recall(ALL) |   F-Measure(D) |   F-Measure(ALL) |\n",
-      "|---:|:------------|----------------:|-------------:|-----------------:|--------------:|---------------:|-----------------:|\n",
-      "|  0 | StanfordNER |        0.689301 |     0.737253 |         0.667065 |      0.7205   |       0.712471 |         0.692754 |\n",
-      "|  1 | Polyglot    |        0.669563 |     0.703822 |         0.608216 |      0.666334 |       0.686265 |         0.63595  |\n",
-      "|  2 | NLTK        |        0.561466 |     0.635291 |         0.488915 |      0.609152 |       0.596101 |         0.542451 |\n",
-      "|  3 | GATE        |        0.701907 |     0.617005 |         0.633567 |      0.58532  |       0.656724 |         0.608488 |\n"
-     ]
+    "ExecuteTime": {
+     "end_time": "2018-05-08T15:20:22.410670Z",
+     "start_time": "2018-05-08T15:19:38.062Z"
     }
-   ],
+   },
+   "outputs": [],
    "source": [
     "from tabulate import tabulate\n",
     "print(tabulate(df, headers='keys', tablefmt='pipe'))"
@@ -894,9 +1140,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": []
   }
@@ -917,13 +1161,23 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.0"
+   "version": "3.6.5"
+  },
+  "toc": {
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": "block",
+   "toc_window_display": false
   },
   "varInspector": {
    "cols": {
-    "lenName": 16.0,
-    "lenType": 16.0,
-    "lenVar": 40.0
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
    },
    "kernels_config": {
     "python": {
diff --git a/notebooks/StanfordMadaAgro.ipynb b/notebooks/StanfordMadaAgro.ipynb
new file mode 100644
index 0000000..faedc5b
--- /dev/null
+++ b/notebooks/StanfordMadaAgro.ipynb
@@ -0,0 +1,950 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-15T05:07:11.698091Z",
+     "start_time": "2018-05-15T05:07:11.253243Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-15T05:07:12.270692Z",
+     "start_time": "2018-05-15T05:07:12.257655Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "selected=pd.read_csv(\"/Users/jacquesfize/LOD_DATASETS/selected_mada.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-15T05:07:12.613016Z",
+     "start_time": "2018-05-15T05:07:12.610457Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "base_dir='/Users/jacquesfize/LOD_DATASETS/raw_bvlac/'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-15T05:07:12.920064Z",
+     "start_time": "2018-05-15T05:07:12.914272Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/Users/jacquesfize/nas_cloud/Code/str-python\n"
+     ]
+    }
+   ],
+   "source": [
+    "cd .."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-15T05:07:14.617383Z",
+     "start_time": "2018-05-15T05:07:13.522309Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from pipeline import *\n",
+    "from nlp.ner.stanford_ner import StanfordNER"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-15T05:07:14.850467Z",
+     "start_time": "2018-05-15T05:07:14.760004Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "data_lang=pd.DataFrame(data=list(\n",
+    "    json.load(open(\"/Users/jacquesfize/LOD_DATASETS/raw_bvlac/associated_lang.json\")).items()),\n",
+    "    columns=[\"id_doc\",\"lang\"]\n",
+    ")\n",
+    "data_lang[\"id_doc\"]=data_lang[\"id_doc\"].astype(int)\n",
+    "selected[\"id_doc\"]=selected[\"id_doc\"].astype(int)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-08T17:11:00.213069Z",
+     "start_time": "2018-05-08T17:11:00.208408Z"
+    }
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-15T05:07:15.813223Z",
+     "start_time": "2018-05-15T05:07:15.808854Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "data_lang=data_lang[data_lang[\"id_doc\"].isin(selected[\"id_doc\"])]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-15T05:07:16.101978Z",
+     "start_time": "2018-05-15T05:07:16.096855Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "data_lang=data_lang.set_index(\"id_doc\")\n",
+    "selected=selected.set_index(\"id_doc\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-15T05:07:16.371936Z",
+     "start_time": "2018-05-15T05:07:16.368656Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "selected[\"lang\"]=data_lang[\"lang\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-15T05:07:16.794152Z",
+     "start_time": "2018-05-15T05:07:16.775373Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['fr', 'en'], dtype=object)"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "selected[\"lang\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-15T05:07:17.265106Z",
+     "start_time": "2018-05-15T05:07:17.261840Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "pipeline= {\n",
+    "    \"en\":Pipeline(lang=\"english\",tagger=Tagger(),ner=StanfordNER(lang=\"en\")),\n",
+    "    \"fr\":Pipeline(lang=\"french\",tagger=Tagger(),ner=StanfordNER(lang=\"fr\"))\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-15T05:07:17.802814Z",
+     "start_time": "2018-05-15T05:07:17.795266Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dfEn=selected[selected.lang == \"en\"]\n",
+    "dfFr=selected[selected.lang == \"fr\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-15T05:07:18.417499Z",
+     "start_time": "2018-05-15T05:07:18.395886Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Unnamed: 0</th>\n",
+       "      <th>count</th>\n",
+       "      <th>format</th>\n",
+       "      <th>lang</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>id_doc</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>12</td>\n",
+       "      <td>txt</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1000</th>\n",
+       "      <td>3</td>\n",
+       "      <td>6</td>\n",
+       "      <td>txt</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1001</th>\n",
+       "      <td>9</td>\n",
+       "      <td>5</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1002</th>\n",
+       "      <td>15</td>\n",
+       "      <td>5</td>\n",
+       "      <td>docx</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1003</th>\n",
+       "      <td>26</td>\n",
+       "      <td>11</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1004</th>\n",
+       "      <td>37</td>\n",
+       "      <td>11</td>\n",
+       "      <td>xls</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10044</th>\n",
+       "      <td>41</td>\n",
+       "      <td>5</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1005</th>\n",
+       "      <td>47</td>\n",
+       "      <td>6</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10052</th>\n",
+       "      <td>50</td>\n",
+       "      <td>4</td>\n",
+       "      <td>docx</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1006</th>\n",
+       "      <td>58</td>\n",
+       "      <td>6</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10060</th>\n",
+       "      <td>59</td>\n",
+       "      <td>5</td>\n",
+       "      <td>docx</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10062</th>\n",
+       "      <td>61</td>\n",
+       "      <td>4</td>\n",
+       "      <td>xls</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10065</th>\n",
+       "      <td>64</td>\n",
+       "      <td>5</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1007</th>\n",
+       "      <td>69</td>\n",
+       "      <td>7</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10070</th>\n",
+       "      <td>70</td>\n",
+       "      <td>5</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10073</th>\n",
+       "      <td>73</td>\n",
+       "      <td>4</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10074</th>\n",
+       "      <td>74</td>\n",
+       "      <td>4</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10078</th>\n",
+       "      <td>78</td>\n",
+       "      <td>6</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1008</th>\n",
+       "      <td>80</td>\n",
+       "      <td>7</td>\n",
+       "      <td>xls</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10084</th>\n",
+       "      <td>85</td>\n",
+       "      <td>5</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10086</th>\n",
+       "      <td>87</td>\n",
+       "      <td>5</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10087</th>\n",
+       "      <td>88</td>\n",
+       "      <td>4</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1009</th>\n",
+       "      <td>91</td>\n",
+       "      <td>7</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10092</th>\n",
+       "      <td>94</td>\n",
+       "      <td>5</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10094</th>\n",
+       "      <td>96</td>\n",
+       "      <td>4</td>\n",
+       "      <td>xls</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10099</th>\n",
+       "      <td>101</td>\n",
+       "      <td>4</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1010</th>\n",
+       "      <td>103</td>\n",
+       "      <td>7</td>\n",
+       "      <td>docx</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1011</th>\n",
+       "      <td>114</td>\n",
+       "      <td>4</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1012</th>\n",
+       "      <td>125</td>\n",
+       "      <td>4</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10129</th>\n",
+       "      <td>135</td>\n",
+       "      <td>4</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9709</th>\n",
+       "      <td>13323</td>\n",
+       "      <td>4</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9710</th>\n",
+       "      <td>13325</td>\n",
+       "      <td>4</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9711</th>\n",
+       "      <td>13326</td>\n",
+       "      <td>4</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>973</th>\n",
+       "      <td>13340</td>\n",
+       "      <td>4</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>974</th>\n",
+       "      <td>13351</td>\n",
+       "      <td>4</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>975</th>\n",
+       "      <td>13360</td>\n",
+       "      <td>7</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>976</th>\n",
+       "      <td>13371</td>\n",
+       "      <td>7</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>977</th>\n",
+       "      <td>13380</td>\n",
+       "      <td>9</td>\n",
+       "      <td>xls</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>978</th>\n",
+       "      <td>13389</td>\n",
+       "      <td>9</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>979</th>\n",
+       "      <td>13400</td>\n",
+       "      <td>4</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>980</th>\n",
+       "      <td>13411</td>\n",
+       "      <td>4</td>\n",
+       "      <td>docx</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>981</th>\n",
+       "      <td>13418</td>\n",
+       "      <td>6</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>982</th>\n",
+       "      <td>13428</td>\n",
+       "      <td>6</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>983</th>\n",
+       "      <td>13438</td>\n",
+       "      <td>6</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>984</th>\n",
+       "      <td>13448</td>\n",
+       "      <td>6</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>985</th>\n",
+       "      <td>13458</td>\n",
+       "      <td>5</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>986</th>\n",
+       "      <td>13469</td>\n",
+       "      <td>5</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>987</th>\n",
+       "      <td>13480</td>\n",
+       "      <td>4</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>988</th>\n",
+       "      <td>13489</td>\n",
+       "      <td>4</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>989</th>\n",
+       "      <td>13499</td>\n",
+       "      <td>6</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>990</th>\n",
+       "      <td>13511</td>\n",
+       "      <td>6</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>991</th>\n",
+       "      <td>13522</td>\n",
+       "      <td>10</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>992</th>\n",
+       "      <td>13531</td>\n",
+       "      <td>10</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>993</th>\n",
+       "      <td>13542</td>\n",
+       "      <td>7</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>994</th>\n",
+       "      <td>13549</td>\n",
+       "      <td>7</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>995</th>\n",
+       "      <td>13560</td>\n",
+       "      <td>11</td>\n",
+       "      <td>docx</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>996</th>\n",
+       "      <td>13569</td>\n",
+       "      <td>11</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>997</th>\n",
+       "      <td>13578</td>\n",
+       "      <td>6</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>998</th>\n",
+       "      <td>13586</td>\n",
+       "      <td>6</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>999</th>\n",
+       "      <td>13597</td>\n",
+       "      <td>6</td>\n",
+       "      <td>doc</td>\n",
+       "      <td>fr</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5273 rows × 4 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        Unnamed: 0  count format lang\n",
+       "id_doc                               \n",
+       "1                1     12    txt   fr\n",
+       "1000             3      6    txt   fr\n",
+       "1001             9      5    pdf   fr\n",
+       "1002            15      5   docx   fr\n",
+       "1003            26     11    doc   fr\n",
+       "1004            37     11    xls   fr\n",
+       "10044           41      5    pdf   fr\n",
+       "1005            47      6    NaN   fr\n",
+       "10052           50      4   docx   fr\n",
+       "1006            58      6    doc   fr\n",
+       "10060           59      5   docx   fr\n",
+       "10062           61      4    xls   fr\n",
+       "10065           64      5    NaN   fr\n",
+       "1007            69      7    doc   fr\n",
+       "10070           70      5    doc   fr\n",
+       "10073           73      4    doc   fr\n",
+       "10074           74      4    doc   fr\n",
+       "10078           78      6    doc   fr\n",
+       "1008            80      7    xls   fr\n",
+       "10084           85      5    doc   fr\n",
+       "10086           87      5    doc   fr\n",
+       "10087           88      4    doc   fr\n",
+       "1009            91      7    doc   fr\n",
+       "10092           94      5    doc   fr\n",
+       "10094           96      4    xls   fr\n",
+       "10099          101      4    doc   fr\n",
+       "1010           103      7   docx   fr\n",
+       "1011           114      4    doc   fr\n",
+       "1012           125      4    doc   fr\n",
+       "10129          135      4    doc   fr\n",
+       "...            ...    ...    ...  ...\n",
+       "9709         13323      4    pdf   fr\n",
+       "9710         13325      4    pdf   fr\n",
+       "9711         13326      4    pdf   fr\n",
+       "973          13340      4    NaN   fr\n",
+       "974          13351      4    NaN   fr\n",
+       "975          13360      7    pdf   fr\n",
+       "976          13371      7    doc   fr\n",
+       "977          13380      9    xls   fr\n",
+       "978          13389      9    pdf   fr\n",
+       "979          13400      4    pdf   fr\n",
+       "980          13411      4   docx   fr\n",
+       "981          13418      6    pdf   fr\n",
+       "982          13428      6    doc   fr\n",
+       "983          13438      6    doc   fr\n",
+       "984          13448      6    doc   fr\n",
+       "985          13458      5    pdf   fr\n",
+       "986          13469      5    pdf   fr\n",
+       "987          13480      4    pdf   fr\n",
+       "988          13489      4    pdf   fr\n",
+       "989          13499      6    pdf   fr\n",
+       "990          13511      6    doc   fr\n",
+       "991          13522     10    doc   fr\n",
+       "992          13531     10    doc   fr\n",
+       "993          13542      7    pdf   fr\n",
+       "994          13549      7    doc   fr\n",
+       "995          13560     11   docx   fr\n",
+       "996          13569     11    doc   fr\n",
+       "997          13578      6    doc   fr\n",
+       "998          13586      6    doc   fr\n",
+       "999          13597      6    doc   fr\n",
+       "\n",
+       "[5273 rows x 4 columns]"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dfFr"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2018-05-15T05:07:19.146Z"
+    },
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9cf91ec038374d759ada26870b7760df",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "IntProgress(value=0, description='Processing', max=5273)"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import json,os\n",
+    "from ipywidgets import IntProgress\n",
+    "from IPython.display import display\n",
+    "p=IntProgress(description=\"Processing\",max=len(dfFr))\n",
+    "display(p)\n",
+    "\n",
+    "for row in dfFr.itertuples():\n",
+    "    p.value+=1\n",
+    "    id_doc=row[0]\n",
+    "    if not os.path.exists(\"/Users/jacquesfize/LOD_DATASETS/raw_bvlac_ner/{0}.csv\".format(id_doc)):\n",
+    "        try:\n",
+    "        #print(len(open(base_dir+str(id_doc)+\".txt\").read()))\n",
+    "            test=pipeline[\"fr\"].ner.identify(open(base_dir+str(id_doc)+\".txt\").read())\n",
+    "            pd.DataFrame(test,columns=[\"text\",\"pos\"]).to_csv(\"/Users/jacquesfize/LOD_DATASETS/raw_bvlac_ner/{0}.csv\".format(id_doc))\n",
+    "\n",
+    "        except Exception as e:\n",
+    "            print(e)\n",
+    "            print(id_doc,row[-2],row[-1])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-08T19:18:48.737157Z",
+     "start_time": "2018-05-08T18:43:55.338361Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5accc27c2ee9432fa8f38afe70125a7d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "IntProgress(value=0, description='Processing', max=279)"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import json\n",
+    "from ipywidgets import IntProgress\n",
+    "from IPython.display import display\n",
+    "p=IntProgress(description=\"Processing\",max=len(dfEn))\n",
+    "display(p)\n",
+    "\n",
+    "for row in dfEn.itertuples():\n",
+    "    p.value+=1\n",
+    "    id_doc=row[0]\n",
+    "    try:\n",
+    "        test=pipeline[\"en\"].ner.identify(open(base_dir+str(id_doc)+\".txt\").read())\n",
+    "        pd.DataFrame(test,columns=[\"text\",\"pos\"]).to_csv(\"/Users/jacquesfize/LOD_DATASETS/raw_bvlac_ner/{0}.csv\".format(id_doc))\n",
+    "        \n",
+    "    except:\n",
+    "        print(id_doc,row[-2],row[-1])\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-09T07:52:26.414920Z",
+     "start_time": "2018-05-09T07:52:26.410473Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "10014"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(open(base_dir+str(id_doc)+\".txt\").read())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
+  },
+  "toc": {
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": "block",
+   "toc_window_display": false
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/corpusmadahard.ipynb b/notebooks/corpusmadahard.ipynb
index 6f580bc..7bac11d 100644
--- a/notebooks/corpusmadahard.ipynb
+++ b/notebooks/corpusmadahard.ipynb
@@ -5,8 +5,8 @@
    "execution_count": 1,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T16:00:10.161616Z",
-     "start_time": "2018-04-19T16:00:10.155255Z"
+     "end_time": "2018-05-16T10:00:15.686303Z",
+     "start_time": "2018-05-16T10:00:15.680340Z"
     }
    },
    "outputs": [
@@ -24,11 +24,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 2,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-20T06:04:58.583630Z",
-     "start_time": "2018-04-20T06:04:58.202679Z"
+     "end_time": "2018-05-16T10:00:15.895344Z",
+     "start_time": "2018-05-16T10:00:15.892752Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from ipywidgets import IntProgress\n",
+    "from IPython.display import display"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-16T10:00:16.511473Z",
+     "start_time": "2018-05-16T10:00:16.102310Z"
     }
    },
    "outputs": [],
@@ -38,28 +53,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T16:00:10.589362Z",
-     "start_time": "2018-04-19T16:00:10.164034Z"
+     "end_time": "2018-05-16T10:00:17.122139Z",
+     "start_time": "2018-05-16T10:00:16.556517Z"
     }
    },
    "outputs": [],
    "source": [
     "import pandas as pd\n",
     "import os,re\n",
-    "output_dir=\"/Users/jacquesfize/LOD_DATASETS/raw_bvlac_ner/\"\n",
+    "import numpy as np\n",
+    "output_dir=\"/Users/jacquesfize/LOD_DATASETS/raw_bvlac_ner_spacy/\"\n",
     "%matplotlib inline"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T16:00:10.694797Z",
-     "start_time": "2018-04-19T16:00:10.591777Z"
+     "end_time": "2018-05-16T10:00:17.284662Z",
+     "start_time": "2018-05-16T10:00:17.167430Z"
     }
    },
    "outputs": [],
@@ -69,21 +85,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 22,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T16:00:10.712869Z",
-     "start_time": "2018-04-19T16:00:10.697137Z"
+     "end_time": "2018-05-16T20:55:10.902547Z",
+     "start_time": "2018-05-16T20:55:10.890232Z"
     }
    },
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "('GD2373613', 2363.0420701386847)"
+       "('GD13263662', -1)"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 22,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -103,16 +119,16 @@
     "        if id_:\n",
     "            return id_,score\n",
     "    return None,-1\n",
-    "get_most_common_id_v3(\"Berlin\")"
+    "get_most_common_id_v3(\"Tibet\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 7,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T16:00:10.913147Z",
-     "start_time": "2018-04-19T16:00:10.715134Z"
+     "end_time": "2018-05-16T10:00:17.624018Z",
+     "start_time": "2018-05-16T10:00:17.395144Z"
     }
    },
    "outputs": [],
@@ -158,11 +174,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 8,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-20T06:05:03.276532Z",
-     "start_time": "2018-04-20T06:05:02.384996Z"
+     "end_time": "2018-05-16T10:00:19.411617Z",
+     "start_time": "2018-05-16T10:00:19.403105Z"
     }
    },
    "outputs": [],
@@ -170,17 +186,17 @@
     "%%cython\n",
     "\n",
     "#cdef list ch=[\"Le\",\"pont\",\"d'\",\"avignon\",\"est\",\"-\",\"sympa\"]\n",
-    "def foo(list ch):\n",
-    "    print([c+(\"\" if c[-1] in [\"\\'\",\"-\"] else \" \") for c in ch])"
+    "def foo2(list ch):\n",
+    "    return [c+(\"\" if c[-1] in [\"\\'\",\"’\",\"-\"] else \" \") for c in ch]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 9,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-20T06:06:07.432292Z",
-     "start_time": "2018-04-20T06:06:07.426838Z"
+     "end_time": "2018-05-16T10:00:21.268761Z",
+     "start_time": "2018-05-16T10:00:21.226880Z"
     },
     "format": "row"
    },
@@ -194,12 +210,12 @@
     "    data[\"diff2\"]=(data[(data[\"ent_type_\"]==\"LOC\")][\"diff\"]>1).cumsum()\n",
     "    mx_=data[\"diff2\"].notnull().max()\n",
     "    def foo(x):\n",
-    "        if np.isnan(x):\n",
+    "        if pd.isnull(x).any():\n",
     "            mx_+=1\n",
     "            return mx_\n",
     "        return x\n",
     "    f={\n",
-    "        'text':lambda x: \" \".join(foo(list(map(str,x)))),\n",
+    "        'text':lambda x: \"\".join(foo2(list(map(str,x)))).rstrip(),\n",
     "        'pos_':'max',\n",
     "        'ent_type_':'max'\n",
     "\n",
@@ -210,11 +226,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 10,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T16:00:11.070182Z",
-     "start_time": "2018-04-19T16:00:10.922999Z"
+     "end_time": "2018-05-16T10:00:23.252597Z",
+     "start_time": "2018-05-16T10:00:23.056531Z"
     }
    },
    "outputs": [
@@ -235,11 +251,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 11,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T16:00:11.250132Z",
-     "start_time": "2018-04-19T16:00:11.073331Z"
+     "end_time": "2018-05-16T10:00:25.134319Z",
+     "start_time": "2018-05-16T10:00:25.080059Z"
     }
    },
    "outputs": [],
@@ -256,26 +272,45 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 12,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T16:00:11.265256Z",
-     "start_time": "2018-04-19T16:00:11.252226Z"
+     "end_time": "2018-05-16T10:00:27.099706Z",
+     "start_time": "2018-05-16T10:00:27.062820Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "data_ext=pd.DataFrame(data=list(\n",
+    "    json.load(open(\"/Users/jacquesfize/LOD_DATASETS/raw_bvlac/association.json\")).items()),\n",
+    "    columns=[\"id_doc\",\"format\"]\n",
+    ")\n",
+    "data_ext[\"format\"]=data_ext[\"format\"].apply(lambda x : x.split(\".\")[-1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-16T10:00:28.928246Z",
+     "start_time": "2018-05-16T10:00:28.908037Z"
     }
    },
    "outputs": [],
    "source": [
     "good_lang=data_lang[(data_lang[\"lang\"] == \"fr\") | (data_lang[\"lang\"] == \"en\")]\n",
-    "selected=data_count_agro[data_count_agro[\"id_doc\"].isin(good_lang[\"id_doc\"])]"
+    "selected=data_count_agro[data_count_agro[\"id_doc\"].isin(good_lang[\"id_doc\"])]\n",
+    "selected[\"format\"]=data_ext[data_ext[\"id_doc\"].isin(good_lang[\"id_doc\"])][\"format\"]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 14,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T16:00:11.271793Z",
-     "start_time": "2018-04-19T16:00:11.267370Z"
+     "end_time": "2018-05-16T10:00:30.954784Z",
+     "start_time": "2018-05-16T10:00:30.949524Z"
     }
    },
    "outputs": [],
@@ -285,29 +320,119 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 15,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T16:00:11.690969Z",
-     "start_time": "2018-04-19T16:00:11.273812Z"
+     "end_time": "2018-05-16T09:10:31.246298Z",
+     "start_time": "2018-05-16T09:10:31.220975Z"
     }
    },
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "<matplotlib.axes._subplots.AxesSubplot at 0x10af5d4a8>"
+       "90848"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "open(\"selected_mada.json\",'w').write(selected.to_csv())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-16T10:00:33.484417Z",
+     "start_time": "2018-05-16T10:00:33.477182Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "format,id_doc,count\n",
+      "doc,2163,2163\n",
+      "docx,194,194\n",
+      "html,791,791\n",
+      "pdf,598,598\n",
+      "txt,43,43\n",
+      "xls,826,826\n",
+      "xlsx,35,35\n",
+      "xml,7,7\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(selected.groupby('format').count().to_csv())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-16T10:00:35.621805Z",
+     "start_time": "2018-05-16T10:00:35.615322Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "format,id_doc\n",
+      "doc,6651\n",
+      "docx,838\n",
+      "html,3465\n",
+      "pdf,1931\n",
+      "ppt,228\n",
+      "pptx,40\n",
+      "sql,1\n",
+      "txt,157\n",
+      "xls,2544\n",
+      "xlsx,126\n",
+      "xml,29\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(data_ext.groupby('format').count().to_csv())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-16T09:10:31.667687Z",
+     "start_time": "2018-05-16T09:10:31.266005Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<matplotlib.axes._subplots.AxesSubplot at 0x114914d68>"
+      ]
+     },
+     "execution_count": 18,
      "metadata": {},
      "output_type": "execute_result"
     },
     {
      "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAD8CAYAAABgmUMCAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAFZpJREFUeJzt3X2wXHWd5/H3lyRDeBohuTELubAX19Qo1EKATGQESgklRmEMbgHD1MBEpMhWLVNoOVtM0KkNy8OWli5EqR1rshInKDOZDPKQFXaZCGHc9QkSwwohWmQ0LDcgCbk8jkKI+e4f/buhDblJH7inu+/t96uqq8/59enT33uq+n76dx5+JzITSZJadUCnC5AkjS0GhySpEoNDklSJwSFJqsTgkCRVYnBIkioxOCRJlRgckqRKDA5JUiUTO11AHfr6+nJgYKDTZUjSmLJu3brnMnPa/pYbl8ExMDDA2rVrO12GJI0pEfFkK8u5q0qSVInBIUmqxOCQJFUyLo9x7M3rr7/O4OAgr776aqdLabvJkyfT39/PpEmTOl2KpHGgZ4JjcHCQww47jIGBASKi0+W0TWayfft2BgcHOfbYYztdjqRxoGd2Vb366qtMnTq1p0IDICKYOnVqT/a0JNWjZ4ID6LnQGNarf7ekevRUcEiS3r6eOcaxp4FF94zq+jZ//pxRXd9bsWTJEhYuXMjBBx/c6VIkjWM9Gxzj0ZIlS7j44osNjh4x2j9+ND6040eswdFmt956K1/60peICE444QSuu+46PvnJT/Lcc88xbdo0vv71r3PMMcfwiU98gnPPPZfzzz8fgEMPPZRXXnmFBx98kGuuuYa+vj4ee+wxTjnlFL75zW9y88038/TTT3PmmWfS19fHmjVrOvyXjm/+01YvMzjaaMOGDVx//fV8//vfp6+vj6GhIRYsWLD7sWzZMq688kruuuuufa5n/fr1bNiwgaOOOorTTjuN733ve1x55ZXceOONrFmzhr6+vjb9ReODISBVY3C00QMPPMAFF1yw+x/7lClT+MEPfsAdd9wBwCWXXMJVV1213/XMmTOH/v5+AGbNmsXmzZs5/fTT6yu8i/hPXuo8g6NLTZw4kV27dgGwa9cuduzYsfu1Aw88cPf0hAkT2LlzZ9vrGy0GgTT2GBxtNHfuXD7+8Y/zmc98hqlTpzI0NMT73/9+VqxYwSWXXMJtt93GGWecATSGhl+3bh0XXnghq1at4vXXX9/v+g877DBefvnlrtxVZUBI40fPBkcnTp89/vjj+dznPscHPvABJkyYwEknncTNN9/MpZdeyhe/+MXdB8cBLr/8cubPn8+JJ57IvHnzOOSQQ/a7/oULFzJv3jyOOuqojh0cNyCk8S8ys9M1jLrZs2fnnjdy2rhxI+9973s7VFHntevvNzikzno7P4ojYl1mzt7fcj3b41B1hoIkcMgRSVJFPdXjyMyeHPCvyu5IexWS9qdnehyTJ09m+/btlf6JjgfD9+OYPHlyp0uRNE70TI+jv7+fwcFBtm3b1ulS2m74DoCSNBp6JjgmTZrkHfAkaRT0zK4qSdLo6Jkeh36bB8ElvVX2OCRJldQaHBGxOSIejYhHImJtaZsSEasj4onyfERpj4j4SkRsioifRMTJTetZUJZ/IiIW1FmzJGnf2tHjODMzZzVdxr4IuD8zZwL3l3mAjwAzy2Mh8FVoBA2wGHgfMAdYPBw2kqT268SuqvnA8jK9HDivqf3WbPghcHhEHAl8GFidmUOZ+TywGpjX7qIlSQ11B0cC/xgR6yJiYWmbnpnPlOlfAtPL9Azgqab3Dpa2kdp/S0QsjIi1EbG2F6/VkKR2qfusqtMzc0tEvBNYHRE/bX4xMzMiRuVS7sxcCiyFxui4o7FOSdKb1drjyMwt5XkrcCeNYxTPll1QlOetZfEtwNFNb+8vbSO1S5I6oLYeR0QcAhyQmS+X6bOBa4FVwALg8+X57vKWVcCfRcQKGgfCX8zMZyLiPuC/NB0QPxu4uq66xxuv15A02urcVTUduLOMRjsR+NvM/F8R8TCwMiI
uA54ELizL3wt8FNgE/Aq4FCAzhyLiOuDhsty1mTlUY92SpH2oLTgy8+fAiXtp3w6ctZf2BK4YYV3LgGWjXaMkqTqvHJckVWJwSJIqMTgkSZU4Ou444dlTktrFHockqRKDQ5JUicEhSarE4JAkVWJwSJIqMTgkSZUYHJKkSryOY4zxeg1JnWaPQ5JUicEhSarE4JAkVWJwSJIqMTgkSZUYHJKkSgwOSVIlBockqRIvAOwiXtwnaSywxyFJqsTgkCRVYnBIkioxOCRJlRgckqRKDA5JUiUGhySpktqDIyImRMT6iPh2mT82In4UEZsi4u8j4ndK+4FlflN5faBpHVeX9p9FxIfrrlmSNLJ29Dg+BWxsmv8CcFNmvht4HristF8GPF/abyrLERHHARcBxwPzgL+KiAltqFuStBe1BkdE9APnAF8r8wHMBW4viywHzivT88s85fWzyvLzgRWZ+Vpm/gLYBMyps25J0sjq7nEsAa4CdpX5qcALmbmzzA8CM8r0DOApgPL6i2X53e17eY8kqc1qC46IOBfYmpnr6vqMPT5vYUSsjYi127Zta8dHSlJPqnOQw9OAj0XER4HJwO8CXwYOj4iJpVfRD2wpy28BjgYGI2Ii8A5ge1P7sOb37JaZS4GlALNnz85a/qJR4mCGksay2nocmXl1ZvZn5gCNg9sPZOafAGuA88tiC4C7y/SqMk95/YHMzNJ+UTnr6lhgJvBQXXVLkvatE8Oq/wWwIiKuB9YDt5T2W4BvRMQmYIhG2JCZGyJiJfA4sBO4IjN/0/6yJUnQpuDIzAeBB8v0z9nLWVGZ+SpwwQjvvwG4ob4KJUmt8spxSVIlBockqRKDQ5JUicEhSarE4JAkVWJwSJIqMTgkSZUYHJKkSgwOSVIlBockqRKDQ5JUSUvBERH/tu5CJEljQ6s9jr+KiIci4j9ExDtqrUiS1NVaGh03M8+IiJnAJ4F1EfEQ8PXMXF1rdWOcN2ySNB61fIwjM58A/pLG/TQ+AHwlIn4aEf+uruIkSd2n1WMcJ0TETcBGYC7wh5n53jJ9U431SZK6TKs3croZ+Brw2cz89XBjZj4dEX9ZS2WSpK7UanCcA/x6+JatEXEAMDkzf5WZ36itOklS12n1GMd3gIOa5g8ubZKkHtNqcEzOzFeGZ8r0wfWUJEnqZq0Gx79ExMnDMxFxCvDrfSwvSRqnWj3G8WngHyLiaSCAfwX8UW1VSZK6VqsXAD4cEe8Bfq80/SwzX6+vLElSt2q1xwHw+8BAec/JEUFm3lpLVZKkrtVScETEN4B/AzwC/KY0J2BwSFKPabXHMRs4LjOzzmIkSd2v1bOqHqNxQFyS1ONa7XH0AY+XUXFfG27MzI/VUpUkqWu1GhzXVF1xREwGvgscWD7n9sxcHBHHAiuAqcA64JLM3BERB9I4ZnIKsB34o8zcXNZ1NXAZjeMrV2bmfVXrkSSNjpZ2VWXmPwGbgUll+mHgx/t522vA3Mw8EZgFzIuIU4EvADdl5ruB52kEAuX5+dJ+U1mOiDgOuAg4HphH46ZSE1r+CyVJo6rVYdUvB24H/ro0zQDu2td7smF4mJJJ5ZE0hmK/vbQvB84r0/PLPOX1syIiSvuKzHwtM38BbALmtFK3JGn0tXpw/ArgNOAl2H1Tp3fu700RMSEiHgG2AquBfwZeyMydZZFBGiFEeX6qrH8n8CKN3Vm72/fyHklSm7UaHK9l5o7hmYiYSKP3sE+Z+ZvMnAX00+glvOctVdmCiFgYEWsjYu22bdvq+hhJ6nmtBsc/RcRngYMi4kPAPwD/o9UPycwXgDXAHwCHl+CBRqBsKdNbgKNhdzC9g8ZB8t3te3lP82cszczZmTl72rRprZYmSaqo1eBYBGwDHgX+PXAvjfuPjygipkXE4WX6IOBDNG49uwY4vyy2ALi7TK8q85TXHygXHK4CLoqIA8sZWTOBh1qsW5I0ylod5HAX8N/Lo1VHAsvLGVAHACsz89sR8TiwIiK
uB9YDt5TlbwG+ERGbgCEaZ1KRmRsiYiXwOLATuGL4ToSSpPZrdayqX7CXYxqZ+a6R3pOZPwFO2kv7z9nLWVGZ+SpwwQjrugG4oZVaO2Fg0T2dLkGS2qbKWFXDJtP4Bz9l9MuRJHW7Vi8A3N702JKZS4Bzaq5NktSFWt1VdXLT7AE0eiBV7uUhSRonWv3n/1+bpnfSGH7kwlGvRpLU9Vo9q+rMuguRJI0Nre6q+sy+Xs/MG0enHElSt6tyVtXv07gYD+APaVyE90QdRUmSulerwdEPnJyZLwNExDXAPZl5cV2FSZK6U6tDjkwHdjTN7yhtkqQe02qP41bgoYi4s8yfxxv3zpAk9ZBWz6q6ISL+J3BGabo0M9fXV5YkqVu1uqsK4GDgpcz8MjBYRqqVJPWYVm8duxj4C+Dq0jQJ+GZdRUmSulerPY6PAx8D/gUgM58GDqurKElS92o1OHaUmyolQEQcUl9JkqRu1mpwrIyIv6Zx29fLge9Q7aZOkqRxotWzqr5U7jX+EvB7wH/KzNW1ViZJ6kr7DY5y69fvlIEODQtJ6nH73VVV7u+9KyLe0YZ6JEldrtUrx18BHo2I1ZQzqwAy88paqpIkda1Wg+OO8pAk9bh9BkdEHJOZ/y8zHZdKkgTs/xjHXcMTEfGtmmuRJI0B+wuOaJp+V52FSJLGhv0FR44wLUnqUfs7OH5iRLxEo+dxUJmmzGdm/m6t1XWZgUX3dLoESeq4fQZHZk5oVyGSpLGhyv04JEkyOCRJ1dQWHBFxdESsiYjHI2JDRHyqtE+JiNUR8UR5PqK0R0R8JSI2RcRPIuLkpnUtKMs/EREL6qpZkrR/dfY4dgJ/npnHAacCV0TEccAi4P7MnAncX+YBPgLMLI+FwFehETTAYuB9wBxg8XDYSJLar7bgyMxnMvPHZfplYCMwA5gPDF+Jvhw4r0zPB27Nhh/SuPfHkcCHgdWZOZSZz9MYoXdeXXVLkvatLcc4ImIAOAn4ETA9M58pL/0SmF6mZwBPNb1tsLSN1L7nZyyMiLURsXbbtm2jWr8k6Q21B0dEHAp8C/h0Zr7U/Frz7WjfrsxcmpmzM3P2tGnTRmOVkqS9qDU4ImISjdC4LTOHR9d9tuyCojxvLe1bgKOb3t5f2kZqlyR1QJ1nVQVwC7AxM29semkVMHxm1ALg7qb2Py1nV50KvFh2ad0HnB0RR5SD4meXNklSB7R6P4634jTgEho3gHqktH0W+DywMiIuA54ELiyv3Qt8FNgE/Aq4FCAzhyLiOuDhsty1mTlUY92SpH2oLTgy8//w26PrNjtrL8sncMUI61oGLBu96iRJb5VXjkuSKjE4JEmVGBySpEoMDklSJQaHJKkSg0OSVInBIUmqxOCQJFVicEiSKjE4JEmVGBySpEoMDklSJQaHJKkSg0OSVInBIUmqxOCQJFVicEiSKjE4JEmVGBySpEpqu+f4WDaw6J5OlyBJXcsehySpEoNDklSJwSFJqsTgkCRVYnBIkioxOCRJlRgckqRKDA5JUiW1BUdELIuIrRHxWFPblIhYHRFPlOcjSntExFciYlNE/CQiTm56z4Ky/BMRsaCueiVJramzx/E3wLw92hYB92fmTOD+Mg/wEWBmeSwEvgqNoAEWA+8D5gCLh8NGktQZtQVHZn4XGNqjeT6wvEwvB85rar81G34IHB4RRwIfBlZn5lBmPg+s5s1hJElqo3Yf45iemc+U6V8C08v0DOCppuUGS9tI7ZKkDunYwfHMTCBHa30RsTAi1kbE2m3bto3WaiVJe2h3cDxbdkFRnreW9i3A0U3L9Ze2kdrfJDOXZubszJw9bdq0US9cktTQ7uBYBQyfGbUAuLup/U/L2VWnAi+WXVr3AWdHxBHloPjZpU2S1CG13Y8jIv4O+CDQFxGDNM6O+jywMiIuA54ELiyL3wt8FNgE/Aq4FCAzhyLiOuDhsty1mbnnAXdJUhvVFhyZ+ccjvHTWXpZN4IoR1rMMWDaKpUm
S3gavHJckVWJwSJIqMTgkSZUYHJKkSgwOSVIlBockqRKDQ5JUicEhSarE4JAkVWJwSJIqMTgkSZUYHJKkSgwOSVIlBockqRKDQ5JUicEhSarE4JAkVWJwSJIqMTgkSZUYHJKkSgwOSVIlBockqRKDQ5JUicEhSarE4JAkVWJwSJIqMTgkSZUYHJKkSsZMcETEvIj4WURsiohFna5HknrVmAiOiJgA/DfgI8BxwB9HxHGdrUqSetOYCA5gDrApM3+emTuAFcD8DtckST1prATHDOCppvnB0iZJarOJnS5gtETEQmBhmX0lIn7WyXpq1Ac81+kiuoDb4Q1uiwa3AxBfeFvb4V+3stBYCY4twNFN8/2lbbfMXAosbWdRnRARazNzdqfr6DS3wxvcFg1uh4Z2bIexsqvqYWBmRBwbEb8DXASs6nBNktSTxkSPIzN3RsSfAfcBE4Blmbmhw2VJUk8aE8EBkJn3Avd2uo4uMO53x7XI7fAGt0WD26Gh9u0QmVn3Z0iSxpGxcoxDktQlDI4uFhHLImJrRDzW1DYlIlZHxBPl+YhO1tgOEXF0RKyJiMcjYkNEfKq099S2iIjJEfFQRPzfsh3+c2k/NiJ+VIbj+ftyAsm4FxETImJ9RHy7zPfcdoiIzRHxaEQ8EhFrS1vt3wuDo7v9DTBvj7ZFwP2ZORO4v8yPdzuBP8/M44BTgSvKkDO9ti1eA+Zm5onALGBeRJwKfAG4KTPfDTwPXNbBGtvpU8DGpvle3Q5nZuasplNwa/9eGBxdLDO/Cwzt0TwfWF6mlwPntbWoDsjMZzLzx2X6ZRr/LGbQY9siG14ps5PKI4G5wO2lfdxvB4CI6AfOAb5W5oMe3A4jqP17YXCMPdMz85ky/UtgeieLabeIGABOAn5ED26LsnvmEWArsBr4Z+CFzNxZFumV4XiWAFcBu8r8VHpzOyTwjxGxroyeAW34XoyZ03H1ZpmZEdEzp8VFxKHAt4BPZ+ZLjR+ZDb2yLTLzN8CsiDgcuBN4T4dLaruIOBfYmpnrIuKDna6nw07PzC0R8U5gdUT8tPnFur4X9jjGnmcj4kiA8ry1w/W0RURMohEat2XmHaW5J7cFQGa+AKwB/gA4PCKGfwS+aTieceg04GMRsZnGSNlzgS/Te9uBzNxSnrfS+CExhzZ8LwyOsWcVsKBMLwDu7mAtbVH2X98CbMzMG5te6qltERHTSk+DiDgI+BCN4z1rgPPLYuN+O2Tm1ZnZn5kDNIYfeiAz/4Qe2w4RcUhEHDY8DZwNPEYbvhdeANjFIuLvgA/SGPXzWWAxcBewEjgGeBK4MDP3PIA+rkTE6cD/Bh7ljX3an6VxnKNntkVEnEDjYOcEGj/6VmbmtRHxLhq/vKcA64GLM/O1zlXaPmVX1X/MzHN7bTuUv/fOMjsR+NvMvCEiplLz98LgkCRV4q4qSVIlBockqRKDQ5JUicEhSarE4JAkVWJwSJIqMTgkSZUYHJKkSv4/Uk1ROHVVFLoAAAAASUVORK5CYII=\n",
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAD8CAYAAABgmUMCAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAFbVJREFUeJzt3X+QXeV93/H3F0lB/IpBP6AyC13caFKggwUoMmPRsUEuFphY0AGHTAxrQ1FnSgbbTYcIO1NRjDpm7IKK27hRghyBSRSMAVGbligg4paxDZJFDbLsQbFV2IpBG4mfwSBkvv3jPisuYrV7D95z792979fMzj3nuc85+90zs/dzn/MzMhNJklp1UKcLkCRNLAaHJKkSg0OSVInBIUmqxOCQJFVicEiSKjE4JEmVGBySpEoMDklSJVM7XUAdZs2alf39/Z0uQ5ImlE2bNv19Zs4eq9+kDI7+/n42btzY6TIkaUKJiP/bSj93VUmSKjE4JEmVGBySpEom5TGOkbzxxhsMDg7y2muvdbqUtps+fTp9fX1Mmzat06VImgR6JjgGBwc54ogj6O/vJyI6XU7bZCa7du1icHCQE044odPlSJoEemZX1WuvvcbMmTN7KjQAIoKZM2f25EhLUj16JjiAnguNYb36d0uqR08FhyTpV9czxzj217/sO+O6vu1f+ti4ru/dWLlyJUuXLuXQQw/tdCmSJrGeDY7JaOXKlXzyk580OHrEeH/50eTQji+xBkeb3XbbbXzlK18hIjjllFO44YYbuPzyyxkaGmL27Nl8/etf5/jjj+dTn/oU559/PhdddBEAhx9+OK+88goPP/ww1113HbNmzeLJJ5/k9NNP5xvf+AZf/epX2bFjB2eddRazZs1iw4YNHf5LJzc/tNXLDI422rJlCytWrOCRRx5h1qxZ7N69m4GBAS677DIGBgZYvXo1V199Nffee++o69m8eTNbtmzhve99LwsXLuSRRx7h6quv5qabbmLDhg3MmjWrTX/R5GAISNUYHG300EMPcdFFF+37YJ8xYwbf+973uPvuuwG49NJLueaaa8Zcz4IFC+jr6wNg3rx5bN++nTPPPLO+wruIH/JS5xkcbZSZY54aO/z+1KlTefPNN/ctt2fPnn19Dj744H3TU6ZMYe/evTVU2x4GgTTxGBxttGjRIi688EI+97nPMXPmTHbv3s0HP/hB1q5dy6WXXsodd9yxb+TQ39/Ppk2b+MQnPsG6det44403xlz/EUccwcsvv9yVu6oMCGny6Nng6MTpsyeffDJf+MIX+NCHPsSUKVM49dRTueWWW7j88sv58pe/vO/gOMCVV17JkiVLWLBgAYsWLeKwww4bc/1Lly7l3HPPZc6cOR07OG5ASJNfZGanaxh38+fPz/0f5LR161ZOPPHEDlXUee36+w0OqbN+lS/FEbEpM+eP1a9nRxyqzlCQBN5yRJJUUU+NOFo5q2kyqrI70lGFpLH0zIhj+vTp7Nq1q9KH6GQw/DyO6dOnd7oUSZNEz4w4+vr6GBwcZGhoqNOltN3wEwAlaTz0THBMmzbNJ+BJ0jjomV1VkqTx0TMjDr2dB8ElvVuOOCRJldQaHBGxPSKeiIjHI2JjaZsREesj4qnyelRpj4i4JSK2RcSPIuK0pvUMlP5PRcRAnTVLkkbXjhHHWZk5r+ky9mXAg5k5F3iwzAOcC8wtP0uBr0EjaIDlwAeABcDy4bCRJLVfJ3ZVLQHWlOk1wAVN7bdlw/eBIyNiDvBRYH1m7s7M54H1wOJ2Fy1Jaqg7OBL464jYFBFLS9sxmfksQHk9urQfCzzTtOxgaTtQ+9tExNKI2BgRG3vxWg1Jape6z6pamJk7IuJoYH1E/GSUviPdCyRHaX97Q+YqYBU07o77boqVJI2t1hFHZu4orzuBe2gco3iu7IKivO4s3QeB45oW7wN2jNIuSeqA2kYcEXEYcFBmvlymzwGuB+4DBoAvldd1ZZH7gN+PiLU0DoS/mJnPRsQDwH9sOiB+DnBtXXVPNl6vIWm81bmr6hjgnnI32qnAX2Tm/4y
Ix4A7I+IK4Gng4tL/fuA8YBvwKvBpgMzcHRFfBB4r/a7PzN011i1JGkVtwZGZPwPeP0L7LmDRCO0JXHWAda0GVo93jZKk6rxyXJJUicEhSarE4JAkVeLdcScJz56S1C6OOCRJlRgckqRKDA5JUiUGhySpEoNDklSJwSFJqsTgkCRV4nUcE4zXa0jqNEcckqRKDA5JUiUGhySpEoNDklSJwSFJqsTgkCRVYnBIkioxOCRJlXgBYBfx4j5JE4EjDklSJQaHJKkSg0OSVInBIUmqxOCQJFVicEiSKjE4JEmV1B4cETElIjZHxLfL/AkR8YOIeCoi/ioifq20H1zmt5X3+5vWcW1p/2lEfLTumiVJB9aOEcdngK1N8zcCN2fmXOB54IrSfgXwfGb+BnBz6UdEnARcApwMLAb+OCKmtKFuSdIIag2OiOgDPgb8WZkP4GzgrtJlDXBBmV5S5invLyr9lwBrM/P1zPw5sA1YUGfdkqQDq3vEsRK4BnizzM8EXsjMvWV+EDi2TB8LPANQ3n+x9N/XPsIykqQ2qy04IuJ8YGdmbmpuHqFrjvHeaMs0/76lEbExIjYODQ1VrleS1Jo6b3K4EPh4RJwHTAd+ncYI5MiImFpGFX3AjtJ/EDgOGIyIqcB7gN1N7cOal9knM1cBqwDmz5//jmDpJt7MUNJEVtuIIzOvzcy+zOyncXD7ocz8PWADcFHpNgCsK9P3lXnK+w9lZpb2S8pZVycAc4FH66pbkjS6TtxW/Q+BtRFxA7AZuLW03wrcHhHbaIw0LgHIzC0RcSfwY2AvcFVm/rL9ZUuSoE3BkZkPAw+X6Z8xwllRmfkacPEBll8BrKivQklSq7xyXJJUicEhSarE4JAkVWJwSJIqMTgkSZUYHJKkSgwOSVIlBockqRKDQ5JUicEhSarE4JAkVdJScETEP6u7EEnSxNDqiOO/RcSjEfFvIuLIWiuSJHW1lu6Om5lnRsRc4HJgY0Q8Cnw9M9fXWt0E5wObJE1GLR/jyMyngD+i8TyNDwG3RMRPIuJf1lWcJKn7tHqM45SIuBnYCpwN/HZmnlimb66xPklSl2n1QU7/BfhT4POZ+YvhxszcERF/VEtlkqSu1GpwnAf8YviRrRFxEDA9M1/NzNtrq06S1HVaPcbxN8AhTfOHljZJUo9pNTimZ+YrwzNl+tB6SpIkdbNWg+MfIuK04ZmIOB34xSj9JUmTVKvHOD4LfDMidpT5OcDv1FOSJKmbtXoB4GMR8U+B3wQC+ElmvlFrZZKkrtTqiAPgt4D+ssypEUFm3lZLVZKkrtVScETE7cA/AR4HflmaEzA4JKnHtDrimA+clJlZZzGSpO7X6llVTwL/qM5CJEkTQ6sjjlnAj8tdcV8fbszMj9dSlSSpa7UaHNdVXXFETAe+Cxxcfs9dmbk8Ik4A1gIzgB8Cl2bmnog4mMYxk9OBXcDvZOb2sq5rgStoHF+5OjMfqFqPJGl8tLSrKjP/FtgOTCvTj9H40B/N68DZmfl+YB6wOCLOAG4Ebs7MucDzNAKB8vp8Zv4GjTvu3ggQEScBlwAnA4uBP46IKS3/hZKkcdXqbdWvBO4C/qQ0HQvcO9oy2TB8m5Jp5Sdp3Ir9rtK+BrigTC8p85T3F0VElPa1mfl6Zv4c2AYsaKVuSdL4a/Xg+FXAQuAl2PdQp6PHWigipkTE48BOYD3wd8ALmbm3dBmkEUKU12fK+vcCLwIzm9tHWEaS1GatBsfrmblneCYiptIYPYwqM3+ZmfOAPhqjhBNH6ja82gO8d6D2t4mIpRGxMSI2Dg0NjVWaJOldajU4/jYiPg8cEhH/Avgm8N9b/SWZ+QLwMHAGcGQJHmgEyvD9rwaB42BfML0H2N3cPsIyzb9jVWbOz8z5s2fPbrU0SVJFrQbHMmAIeAL418D9NJ4/fkARMTsijizThwAfofHo2Q3ARaXbALCuTN9X5invP1QuOLwPuCQiDi5nZM0FHm2xbknSOGv1Jodv0nh
07J9WWPccYE05A+og4M7M/HZE/BhYGxE3AJuBW0v/W4HbI2IbjZHGJeV3b4mIO4EfA3uBq4afRChJar9W71X1c0Y4rpCZ7zvQMpn5I+DUEdp/xghnRWXma8DFB1jXCmBFK7V2Qv+y73S6BElqmyr3qho2ncYH/IzxL0eS1O1avQBwV9PP/8vMlTSux5Ak9ZhWd1Wd1jR7EI0RyBG1VCRJ6mqt7qr6T03Te2ncfuQT416NJKnrtXpW1Vl1FyJJmhha3VX1b0d7PzNvGp9yJEndrspZVb9F42I8gN+mccv0Zw64hCRpUqryIKfTMvNlgIi4DvhmZv6rugqTJHWnVm85cjywp2l+D9A/7tVIkrpeqyOO24FHI+IeGleQX0jjaX2SpB7T6llVKyLifwD/vDR9OjM311eWJKlbtbqrCuBQ4KXM/M/AYLlTrSSpx7T66NjlwB8C15amacA36ipKktS9Wh1xXAh8HPgHgMzcgbcckaSe1Gpw7CkPVUqAiDisvpIkSd2s1eC4MyL+hMZjX68E/oZqD3WSJE0SrZ5V9ZXyrPGXgN8E/n1mrq+1MklSVxozOMqjXx/IzI8AhoUk9bgxd1WV53u/GhHvaUM9kqQu1+qV468BT0TEesqZVQCZeXUtVUmSularwfGd8iNJ6nGjBkdEHJ+ZT2fmmnYVJEnqbmMd47h3eCIivlVzLZKkCWCs4Iim6ffVWYgkaWIYKzjyANOSpB411sHx90fESzRGHoeUacp8Zuav11pdl+lf5vkBkjRqcGTmlHYVIkmaGKo8j0OSJINDklRNbcEREcdFxIaI2BoRWyLiM6V9RkSsj4inyutRpT0i4paI2BYRP4qI05rWNVD6PxURA3XVLEkaW50jjr3AH2TmicAZwFURcRKwDHgwM+cCD5Z5gHOBueVnKfA1aAQNsBz4ALAAWD4cNpKk9qstODLz2cz8YZl+GdgKHAssAYavRF8DXFCmlwC3ZcP3aTz7Yw7wUWB9Zu7OzOdp3KF3cV11S5JG15ZjHBHRD5wK/AA4JjOfhUa4AEeXbscCzzQtNljaDtS+/+9YGhEbI2Lj0NDQeP8JkqSi9uCIiMOBbwGfzcyXRus6QluO0v72hsxVmTk/M+fPnj373RUrSRpTrcEREdNohMYdmXl3aX6u7IKivO4s7YPAcU2L9wE7RmmXJHVAnWdVBXArsDUzb2p66z5g+MyoAWBdU/tl5eyqM4AXy66sB4BzIuKoclD8nNImSeqAVp/H8W4sBC6l8QCox0vb54EvAXdGxBXA08DF5b37gfOAbcCrwKcBMnN3RHwReKz0uz4zd9dYtyRpFLUFR2b+b0Y+PgGwaIT+CVx1gHWtBlaPX3WSpHfLK8clSZUYHJKkSgwOSVIlBockqRKDQ5JUicEhSarE4JAkVWJwSJIqMTgkSZUYHJKkSgwOSVIlBockqRKDQ5JUicEhSarE4JAkVWJwSJIqMTgkSZUYHJKkSgwOSVIltT1zfCLrX/adTpcgSV3LEYckqRKDQ5JUicEhSarE4JAkVWJwSJIqMTgkSZUYHJKkSgwOSVIltQVHRKyOiJ0R8WRT24yIWB8RT5XXo0p7RMQtEbEtIn4UEac1LTNQ+j8VEQN11StJak2dI44/Bxbv17YMeDAz5wIPlnmAc4G55Wcp8DVoBA2wHPgAsABYPhw2kqTOqC04MvO7wO79mpcAa8r0GuCCpvbbsuH7wJERMQf4KLA+M3dn5vPAet4ZRpKkNmr3MY5jMvNZgPJ6dGk/Fnimqd9gaTtQuySpQ7rl4HiM0JajtL9zBRFLI2JjRGwcGhoa1+IkSW9pd3A8V3ZBUV53lvZB4Limfn3AjlHa3yEzV2Xm/MycP3v27HEvXJLU0O7guA8YPjNqAFjX1H5ZObvqDODFsivrAeCciDiqHBQ/p7RJkjqktudxRMRfAh8GZkXEII2zo74E3BkRVwBPAxeX7vcD5wHbgFeBTwNk5u6I+CLwWOl3fWbuf8BdktR
GtQVHZv7uAd5aNELfBK46wHpWA6vHsTRJ0q+gWw6OS5ImCINDklSJwSFJqsTgkCRVYnBIkioxOCRJlRgckqRKDA5JUiUGhySpEoNDklSJwSFJqsTgkCRVYnBIkioxOCRJlRgckqRKDA5JUiUGhySpEoNDklSJwSFJqsTgkCRVYnBIkioxOCRJlRgckqRKDA5JUiUGhySpEoNDklSJwSFJqsTgkCRVMmGCIyIWR8RPI2JbRCzrdD2S1KsmRHBExBTgvwLnAicBvxsRJ3W2KknqTRMiOIAFwLbM/Flm7gHWAks6XJMk9aSJEhzHAs80zQ+WNklSm03tdAEtihHa8m0dIpYCS8vsKxHx09qr6oxZwN93uogu4HZ4i9uiwe0AxI2/0nb4x610mijBMQgc1zTfB+xo7pCZq4BV7SyqEyJiY2bO73QdneZ2eIvbosHt0NCO7TBRdlU9BsyNiBMi4teAS4D7OlyTJPWkCTHiyMy9EfH7wAPAFGB1Zm7pcFmS1JMmRHAAZOb9wP2drqMLTPrdcS1yO7zFbdHgdmiofTtEZo7dS5KkYqIc45AkdQmDo4tFxOqI2BkRTza1zYiI9RHxVHk9qpM1tkNEHBcRGyJia0RsiYjPlPae2hYRMT0iHo2I/1O2w38o7SdExA/KdvircgLJpBcRUyJic0R8u8z33HaIiO0R8UREPB4RG0tb7f8XBkd3+3Ng8X5ty4AHM3Mu8GCZn+z2An+QmScCZwBXlVvO9Nq2eB04OzPfD8wDFkfEGcCNwM1lOzwPXNHBGtvpM8DWpvle3Q5nZea8plNwa/+/MDi6WGZ+F9i9X/MSYE2ZXgNc0NaiOiAzn83MH5bpl2l8WBxLj22LbHilzE4rPwmcDdxV2if9dgCIiD7gY8CflfmgB7fDAdT+f2FwTDzHZOaz0PhABY7ucD1tFRH9wKnAD+jBbVF2zzwO7ATWA38HvJCZe0uXXrkdz0rgGuDNMj+T3twOCfx1RGwqd8+ANvxfTJjTcaWIOBz4FvDZzHyp8SWzt2TmL4F5EXEkcA9w4kjd2ltVe0XE+cDOzNwUER8ebh6h66TeDsXCzNwREUcD6yPiJ+34pY44Jp7nImIOQHnd2eF62iIiptEIjTsy8+7S3JPbAiAzXwAepnHM58iIGP4S+I7b8UxCC4GPR8R2GnfKPpvGCKTXtgOZuaO87qTxRWIBbfi/MDgmnvuAgTI9AKzrYC1tUfZf3wpszcybmt7qqW0REbPLSIOIOAT4CI3jPRuAi0q3Sb8dMvPazOzLzH4atx96KDN/jx7bDhFxWEQcMTwNnAM8SRv+L7wAsItFxF8CH6Zx18/ngOXAvcCdwPHA08DFmbn/AfRJJSLOBP4X8ARv7dP+PI3jHD2zLSLiFBoHO6fQ+NJ3Z2ZeHxHvo/HNewawGfhkZr7euUrbp+yq+neZeX6vbYfy995TZqcCf5GZKyJiJjX/XxgckqRK3FUlSarE4JAkVWJwSJIqMTgkSZUYHJKkSgwOSVIlBockqRKDQ5JUyf8HjzxYP5opw4cAAAAASUVORK5CYII=\n",
       "text/plain": [
-       "<matplotlib.figure.Figure at 0x10aafbd30>"
+       "<matplotlib.figure.Figure at 0x114914550>"
       ]
      },
      "metadata": {},
@@ -320,11 +445,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 17,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T16:00:11.781858Z",
-     "start_time": "2018-04-19T16:00:11.693274Z"
+     "end_time": "2018-05-16T10:01:04.739745Z",
+     "start_time": "2018-05-16T10:01:04.644633Z"
     }
    },
    "outputs": [],
@@ -337,11 +462,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 18,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T16:00:11.787422Z",
-     "start_time": "2018-04-19T16:00:11.784106Z"
+     "end_time": "2018-05-16T10:01:22.697656Z",
+     "start_time": "2018-05-16T10:01:22.693785Z"
     }
    },
    "outputs": [],
@@ -354,33 +479,49 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 19,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T16:00:11.914635Z",
-     "start_time": "2018-04-19T16:00:11.789825Z"
-    }
+     "end_time": "2018-05-16T10:01:24.995281Z",
+     "start_time": "2018-05-16T10:01:24.869933Z"
+    },
+    "scrolled": true
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "mkdir: /Users/jacquesfize/LOD_DATASETS/bv_lac_pos_ner_disambiguate_3: File exists\r\n"
+      "mkdir: /Users/jacquesfize/LOD_DATASETS/bv_lac_pos_ner_disambiguate_5: File exists\r\n"
      ]
     }
    ],
    "source": [
-    "!mkdir /Users/jacquesfize/LOD_DATASETS/bv_lac_pos_ner_disambiguate_3"
+    "!mkdir /Users/jacquesfize/LOD_DATASETS/bv_lac_pos_ner_disambiguate_5"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 22,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-20T06:16:37.554432Z",
-     "start_time": "2018-04-20T06:06:13.862194Z"
+     "end_time": "2018-05-16T09:10:31.904875Z",
+     "start_time": "2018-05-16T09:10:31.901715Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "skip=0\n",
+    "skipPercentage=0.10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-16T09:10:32.031609Z",
+     "start_time": "2018-05-16T09:10:31.907100Z"
     }
    },
    "outputs": [
@@ -388,23 +529,1520 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "\r",
-      "64/5552"
+      "mkdir: /Users/jacquesfize/LOD_DATASETS/disambiguate_1: File exists\r\n"
      ]
+    }
+   ],
+   "source": [
+    "%mkdir /Users/jacquesfize/LOD_DATASETS/disambiguate_1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-16T11:46:45.348959Z",
+     "start_time": "2018-05-16T10:02:19.180944Z"
+    },
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e1ea34fcab9149af988ed0c3327c1828",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "IntProgress(value=0, description='Processing', max=5552)"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
-     "name": "stderr",
+     "name": "stdout",
      "output_type": "stream",
      "text": [
-      "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/core/ops.py:792: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n",
-      "  result = getattr(x, name)(y)\n"
+      "64/5552\n",
+      "Empty 1029\n",
+      "110/5552\n",
+      " 1077\n",
+      "111/5552\n",
+      " 1078\n",
+      "127/5552\n",
+      " 1099\n",
+      "130/5552\n",
+      " 1100\n",
+      "145/5552\n",
+      " 1121\n",
+      "146/5552\n",
+      " 1122\n",
+      "154/5552\n",
+      " 1135\n",
+      "155/5552\n",
+      " 1136\n",
+      "165/5552\n",
+      " 11438\n",
+      "168/5552\n",
+      " 11440\n",
+      "175/5552\n",
+      " 11493\n",
+      "180/5552\n",
+      " 11500\n",
+      "246/5552\n",
+      " 11682\n",
+      "251/5552\n",
+      " 117\n",
+      "262/5552\n",
+      "Empty 11724\n",
+      "279/5552\n",
+      "Empty 11770\n",
+      "294/5552\n",
+      " 118\n",
+      "298/5552\n",
+      "Empty 11806\n",
+      "331/5552\n",
+      " 11899\n",
+      "334/5552\n",
+      " 11903\n",
+      "342/5552\n",
+      " 11910\n",
+      "368/5552\n",
+      " 11952\n",
+      "403/5552\n",
+      " 12009\n",
+      "405/5552\n",
+      " 12013\n",
+      "412/5552\n",
+      " 12020\n",
+      "437/5552\n",
+      " 12061\n",
+      "457/5552\n",
+      " 12095\n",
+      "470/5552\n",
+      " 12137\n",
+      "477/5552\n",
+      " 1217\n",
+      "478/5552\n",
+      " 1218\n",
+      "481/5552\n",
+      "Empty 12193\n",
+      "\n",
+      " 12194\n",
+      "482/5552\n",
+      "Empty 12194\n",
+      "491/5552\n",
+      " 12205\n",
+      "496/5552\n",
+      " 12211\n",
+      "508/5552\n",
+      " 12223\n",
+      "513/5552\n",
+      " 1223\n",
+      "516/5552\n",
+      " 1224\n",
+      "522/5552\n",
+      " 12247\n",
+      "541/5552\n",
+      " 12264\n",
+      "542/5552\n",
+      " 12265\n",
+      "543/5552\n",
+      " 12266\n",
+      "545/5552\n",
+      " 12268\n",
+      "553/5552\n",
+      " 12275\n",
+      "558/5552\n",
+      " 12282\n",
+      "576/5552\n",
+      " 1231\n",
+      "581/5552\n",
+      " 1232\n",
+      "586/5552\n",
+      " 12372\n",
+      "589/5552\n",
+      " 12375\n",
+      "603/5552\n",
+      " 12392\n",
+      "607/5552\n",
+      " 12407\n",
+      "615/5552\n",
+      " 12422\n",
+      "619/5552\n",
+      " 12427\n",
+      "620/5552\n",
+      " 12428\n",
+      "639/5552\n",
+      " 12496\n",
+      "646/5552\n",
+      " 12510\n",
+      "647/5552\n",
+      " 12511\n",
+      "649/5552\n",
+      " 12513\n",
+      "651/5552\n",
+      " 12515\n",
+      "652/5552\n",
+      " 12516\n",
+      "656/5552\n",
+      " 12520\n",
+      "657/5552\n",
+      " 12521\n",
+      "670/5552\n",
+      " 12547\n",
+      "672/5552\n",
+      " 12554\n",
+      "673/5552\n",
+      " 12557\n",
+      "675/5552\n",
+      " 12559\n",
+      "678/5552\n",
+      " 12562\n",
+      "679/5552\n",
+      " 12563\n",
+      "681/5552\n",
+      " 12572\n",
+      "682/5552\n",
+      " 12575\n",
+      "687/5552\n",
+      " 12595\n",
+      "688/5552\n",
+      " 12598\n",
+      "693/5552\n",
+      "Empty 12601\n",
+      "694/5552\n",
+      "Empty 12602\n",
+      "\n",
+      " 12603\n",
+      "695/5552\n",
+      "Empty 12603\n",
+      "696/5552\n",
+      " 12606\n",
+      "704/5552\n",
+      " 12614\n",
+      "708/5552\n",
+      " 12618\n",
+      "709/5552\n",
+      " 12619\n",
+      "713/5552\n",
+      " 12623\n",
+      "730/5552\n",
+      " 12651\n",
+      "732/5552\n",
+      " 12660\n",
+      "738/5552\n",
+      "Empty 12678\n",
+      "742/5552\n",
+      " 12687\n",
+      "744/5552\n",
+      "Empty 12688\n",
+      "746/5552\n",
+      " 12690\n",
+      "748/5552\n",
+      " 12692\n",
+      "752/5552\n",
+      " 12699\n",
+      "765/5552\n",
+      " 1271\n",
+      "776/5552\n",
+      " 1272\n",
+      "874/5552\n",
+      " 12810\n",
+      "876/5552\n",
+      " 12812\n",
+      "884/5552\n",
+      " 12841\n",
+      "922/5552\n",
+      " 1293\n",
+      "924/5552\n",
+      " 1294\n",
+      "928/5552\n",
+      " 1295\n",
+      "933/5552\n",
+      " 1296\n",
+      "934/5552\n",
+      " 12960\n",
+      "937/5552\n",
+      " 12964\n",
+      "939/5552\n",
+      " 12968\n",
+      "944/5552\n",
+      " 12976\n",
+      "946/5552\n",
+      " 12980\n",
+      "949/5552\n",
+      " 12985\n",
+      "950/5552\n",
+      " 12986\n",
+      "957/5552\n",
+      " 12994\n",
+      "967/5552\n",
+      " 13004\n",
+      "969/5552\n",
+      " 13006\n",
+      "970/5552\n",
+      " 13007\n",
+      "979/5552\n",
+      " 13016\n",
+      "980/5552\n",
+      " 13017\n",
+      "1002/5552\n",
+      " 13043\n",
+      "1009/5552\n",
+      " 1305\n",
+      "1015/5552\n",
+      " 1306\n",
+      "1019/5552\n",
+      " 13069\n",
+      "1024/5552\n",
+      " 13073\n",
+      "1027/5552\n",
+      " 13076\n",
+      "1028/5552\n",
+      " 13077\n",
+      "1034/5552\n",
+      "Empty 13088\n",
+      "1038/5552\n",
+      " 13096\n",
+      "1040/5552\n",
+      " 13098\n",
+      "1041/5552\n",
+      " 13099\n",
+      "1043/5552\n",
+      " 13100\n",
+      "1044/5552\n",
+      " 13101\n",
+      "1046/5552\n",
+      " 13103\n",
+      "1049/5552\n",
+      " 1311\n",
+      "1053/5552\n",
+      "Empty 13115\n",
+      "1054/5552\n",
+      " 1312\n",
+      "1055/5552\n",
+      " 13122\n",
+      "1057/5552\n",
+      " 13124\n",
+      "1059/5552\n",
+      " 13126\n",
+      "1060/5552\n",
+      " 13127\n",
+      "1062/5552\n",
+      " 13129\n",
+      "1066/5552\n",
+      " 13133\n",
+      "1070/5552\n",
+      " 13140\n",
+      "1073/5552\n",
+      " 13143\n",
+      "1076/5552\n",
+      " 13147\n",
+      "1077/5552\n",
+      " 13148\n",
+      "1082/5552\n",
+      " 13152\n",
+      "1083/5552\n",
+      " 13153\n",
+      "1085/5552\n",
+      " 13155\n",
+      "1092/5552\n",
+      " 13161\n",
+      "1096/5552\n",
+      " 13167\n",
+      "1103/5552\n",
+      " 13173\n",
+      "1105/5552\n",
+      " 13177\n",
+      "1106/5552\n",
+      " 13178\n",
+      "1172/5552\n",
+      "Empty 13293\n",
+      "1175/5552\n",
+      "Empty 13303\n",
+      "1176/5552\n",
+      "Empty 13304\n",
+      "1183/5552\n",
+      " 13319\n",
+      "1185/5552\n",
+      " 13320\n",
+      "1194/5552\n",
+      " 13336\n",
+      "1216/5552\n",
+      " 13372\n",
+      "1220/5552\n",
+      " 13377\n",
+      "1260/5552\n",
+      " 13426\n",
+      "1264/5552\n",
+      " 13430\n",
+      "1266/5552\n",
+      " 13432\n",
+      "1268/5552\n",
+      " 13434\n",
+      "1270/5552\n",
+      " 13436\n",
+      "1275/5552\n",
+      " 13440\n",
+      "1277/5552\n",
+      " 13442\n",
+      "1278/5552\n",
+      " 13443\n",
+      "1312/5552\n",
+      " 13493\n",
+      "1317/5552\n",
+      " 13499\n",
+      "1321/5552\n",
+      " 13502\n",
+      "1326/5552\n",
+      " 13507\n",
+      "1327/5552\n",
+      " 13508\n",
+      "1330/5552\n",
+      " 13511\n",
+      "1331/5552\n",
+      " 13512\n",
+      "1332/5552\n",
+      " 13513\n",
+      "1333/5552\n",
+      " 13514\n",
+      "1334/5552\n",
+      " 13517\n",
+      "1337/5552\n",
+      " 13520\n",
+      "1338/5552\n",
+      " 13521\n",
+      "1339/5552\n",
+      " 13522\n",
+      "1340/5552\n",
+      " 13523\n",
+      "1343/5552\n",
+      " 13526\n",
+      "1344/5552\n",
+      " 13527\n",
+      "1347/5552\n",
+      " 13530\n",
+      "1348/5552\n",
+      " 13531\n",
+      "1350/5552\n",
+      " 13534\n",
+      "1352/5552\n",
+      " 13536\n",
+      "1354/5552\n",
+      " 13538\n",
+      "1356/5552\n",
+      " 13540\n",
+      "1364/5552\n",
+      " 13549\n",
+      "1376/5552\n",
+      " 13561\n",
+      "1390/5552\n",
+      " 13576\n",
+      "1397/5552\n",
+      " 13582\n",
+      "1398/5552\n",
+      " 13583\n",
+      "1402/5552\n",
+      " 1359\n",
+      "1408/5552\n",
+      " 1360\n",
+      "1411/5552\n",
+      " 13613\n",
+      "1507/5552\n",
+      " 13721\n",
+      "1523/5552\n",
+      " 13742\n",
+      "1525/5552\n",
+      " 13744\n",
+      "1532/5552\n",
+      " 13750\n",
+      "1535/5552\n",
+      " 13753\n",
+      "1537/5552\n",
+      " 13756\n",
+      "1539/5552\n",
+      " 13758\n",
+      "1548/5552\n",
+      " 13766\n",
+      "1549/5552\n",
+      " 13767\n",
+      "1551/5552\n",
+      " 13769\n",
+      "1552/5552\n",
+      " 1377\n",
+      "1568/5552\n",
+      " 13786\n",
+      "1569/5552\n",
+      " 13787\n",
+      "1576/5552\n",
+      " 13794\n",
+      "1586/5552\n",
+      " 13803\n",
+      "1589/5552\n",
+      " 13808\n",
+      "1592/5552\n",
+      " 13811\n",
+      "1593/5552\n",
+      " 13812\n",
+      "1603/5552\n",
+      " 13821\n",
+      "1668/5552\n",
+      " 1391\n",
+      "1687/5552\n",
+      " 1397\n",
+      "1688/5552\n",
+      " 13975\n",
+      "1690/5552\n",
+      " 13977\n",
+      "1702/5552\n",
+      " 13997\n",
+      "1710/5552\n",
+      " 14010\n",
+      "1786/5552\n",
+      " 14165\n",
+      "1789/5552\n",
+      "Empty 14167\n",
+      "1810/5552\n",
+      " 14201\n",
+      "1813/5552\n",
+      " 14210\n",
+      "1835/5552\n",
+      "Empty 14241\n",
+      "1845/5552\n",
+      " 14258\n",
+      "1851/5552\n",
+      " 14269\n",
+      "1858/5552\n",
+      " 14280\n",
+      "1862/5552\n",
+      " 14286\n",
+      "1867/5552\n",
+      " 14294\n",
+      "1877/5552\n",
+      " 14306\n",
+      "1888/5552\n",
+      " 14324\n",
+      "1895/5552\n",
+      " 14339\n",
+      "1918/5552\n",
+      " 14376\n",
+      "1924/5552\n",
+      " 14396\n",
+      "1930/5552\n",
+      " 14402\n",
+      "1933/5552\n",
+      " 14409\n",
+      "1943/5552\n",
+      " 14423\n",
+      "1952/5552\n",
+      " 14438\n",
+      "1957/5552\n",
+      " 14446\n",
+      "1961/5552\n",
+      " 14457\n",
+      "1963/5552\n",
+      " 14460\n",
+      "1969/5552\n",
+      " 14474\n",
+      "1976/5552\n",
+      " 14486\n",
+      "1988/5552\n",
+      " 14505\n",
+      "1994/5552\n",
+      " 14517\n",
+      "1995/5552\n",
+      " 14519\n",
+      "2004/5552\n",
+      " 14536\n",
+      "2008/5552\n",
+      " 14541\n",
+      "2010/5552\n",
+      " 14547\n",
+      "2021/5552\n",
+      " 14570\n",
+      "2028/5552\n",
+      " 14584\n",
+      "2037/5552\n",
+      " 14598\n",
+      "2043/5552\n",
+      " 14610\n",
+      "2051/5552\n",
+      " 14619\n",
+      "2058/5552\n",
+      " 14634\n",
+      "2065/5552\n",
+      " 14642\n",
+      "2072/5552\n",
+      " 14659\n",
+      "2078/5552\n",
+      " 14671\n",
+      "2086/5552\n",
+      " 14683\n",
+      "2092/5552\n",
+      " 14694\n",
+      "2128/5552\n",
+      " 14754\n",
+      "2133/5552\n",
+      " 14768\n",
+      "2144/5552\n",
+      " 14791\n",
+      "2145/5552\n",
+      " 14792\n",
+      "2158/5552\n",
+      " 14831\n",
+      "2163/5552\n",
+      " 14842\n",
+      "2174/5552\n",
+      " 14862\n",
+      "2178/5552\n",
+      " 1487\n",
+      "2182/5552\n",
+      " 14874\n",
+      "2186/5552\n",
+      " 1488\n",
+      "2192/5552\n",
+      " 14888\n",
+      "2199/5552\n",
+      " 14898\n",
+      "2205/5552\n",
+      " 14904\n",
+      "2210/5552\n",
+      " 14909\n",
+      "2212/5552\n",
+      " 14910\n",
+      "2214/5552\n",
+      " 14914\n",
+      "2218/5552\n",
+      " 14919\n",
+      "2222/5552\n",
+      " 14924\n",
+      "2241/5552\n",
+      " 14945\n",
+      "2244/5552\n",
+      " 14949\n",
+      "2255/5552\n",
+      " 14961\n",
+      "2257/5552\n",
+      " 14963\n",
+      "2259/5552\n",
+      " 1497\n",
+      "2279/5552\n",
+      "Empty 14998\n",
+      "\n",
+      " 14999\n",
+      "2280/5552\n",
+      "Empty 14999\n",
+      "2283/5552\n",
+      " 15002\n",
+      "2285/5552\n",
+      " 15007\n",
+      "2298/5552\n",
+      " 1502\n",
+      "2318/5552\n",
+      " 15049\n",
+      "2338/5552\n",
+      " 15082\n",
+      "2341/5552\n",
+      " 15086\n",
+      "2342/5552\n",
+      " 15087\n",
+      "2343/5552\n",
+      " 15088\n",
+      "2351/5552\n",
+      " 15107\n",
+      "2383/5552\n",
+      " 15202\n",
+      "2399/5552\n",
+      " 15218\n",
+      "2401/5552\n",
+      " 1522\n",
+      "2411/5552\n",
+      " 15230\n",
+      "2415/5552\n",
+      " 15235\n",
+      "2418/5552\n",
+      " 15238\n",
+      "2425/5552\n",
+      " 15260\n",
+      "2426/5552\n",
+      " 15263\n",
+      "2431/5552\n",
+      " 15270\n",
+      "2444/5552\n",
+      " 15284\n",
+      "2492/5552\n",
+      " 15376\n",
+      "2493/5552\n",
+      " 15377\n",
+      "2503/5552\n",
+      " 15387\n",
+      "2518/5552\n",
+      " 15415\n",
+      "2519/5552\n",
+      " 15416\n",
+      "2552/5552\n",
+      " 15449\n",
+      "2557/5552\n",
+      " 15457\n",
+      "2560/5552\n",
+      " 15460\n",
+      "2561/5552\n",
+      " 15461\n",
+      "2562/5552\n",
+      " 15462\n",
+      "2567/5552\n",
+      " 15472\n",
+      "2573/5552\n",
+      " 15488\n",
+      "2575/5552\n",
+      " 15490\n",
+      "2582/5552\n",
+      " 15511\n",
+      "2585/5552\n",
+      " 15515\n",
+      "2588/5552\n",
+      " 15518\n",
+      "2604/5552\n",
+      " 15549\n",
+      "2605/5552\n",
+      " 1555\n",
+      "2606/5552\n",
+      " 15550\n",
+      "2616/5552\n",
+      " 15563\n",
+      "2617/5552\n",
+      " 15564\n",
+      "2623/5552\n",
+      "Empty 15577\n",
+      "2625/5552\n",
+      " 15587\n",
+      "2626/5552\n",
+      " 15588\n",
+      "2627/5552\n",
+      " 15589\n",
+      "2629/5552\n",
+      " 15591\n",
+      "2634/5552\n",
+      " 15617\n",
+      "2640/5552\n",
+      " 15630\n",
+      "2641/5552\n",
+      " 15631\n",
+      "2653/5552\n",
+      " 15643\n",
+      "2657/5552\n",
+      " 15647\n",
+      "2664/5552\n",
+      " 1566\n",
+      "2678/5552\n",
+      " 15677\n",
+      "2685/5552\n",
+      " 15690\n",
+      "2687/5552\n",
+      " 15693\n",
+      "2688/5552\n",
+      " 15694\n",
+      "2689/5552\n",
+      " 15695\n",
+      "2691/5552\n",
+      " 15697\n",
+      "2704/5552\n",
+      " 15709\n",
+      "2711/5552\n",
+      " 1572\n",
+      "2718/5552\n",
+      "Empty 15737\n",
+      "2721/5552\n",
+      " 1575\n",
+      "2729/5552\n",
+      " 1588\n",
+      "2732/5552\n",
+      " 1592\n",
+      "2735/5552\n",
+      " 1602\n",
+      "2736/5552\n",
+      " 1603\n",
+      "2737/5552\n",
+      " 1604\n",
+      "2738/5552\n",
+      " 1605\n",
+      "2748/5552\n",
+      " 1623\n",
+      "2750/5552\n",
+      " 1625\n",
+      "2755/5552\n",
+      " 1633\n",
+      "2756/5552\n",
+      " 1634\n",
+      "2757/5552\n",
+      " 1635\n",
+      "2758/5552\n",
+      " 1636\n",
+      "2759/5552\n",
+      " 1637\n",
+      "2760/5552\n",
+      " 1638\n",
+      "2768/5552\n",
+      " 1661\n",
+      "2770/5552\n",
+      " 1663\n",
+      "2775/5552\n",
+      " 1668\n",
+      "2776/5552\n",
+      " 1669\n",
+      "2785/5552\n",
+      " 1678\n",
+      "2786/5552\n",
+      " 1679\n",
+      "2793/5552\n",
+      " 1686\n",
+      "2794/5552\n",
+      " 1687\n",
+      "2806/5552\n",
+      " 17\n",
+      "2825/5552\n",
+      " 1721\n",
+      "2842/5552\n",
+      " 1748\n",
+      "2843/5552\n",
+      " 1749\n",
+      "2850/5552\n",
+      " 1762\n",
+      "2851/5552\n",
+      " 1763\n",
+      "2910/5552\n",
+      " 1892\n",
+      "2911/5552\n",
+      " 1893\n",
+      "2977/5552\n",
+      " 2038\n",
+      "2978/5552\n",
+      " 2039\n",
+      "2993/5552\n",
+      " 2068\n",
+      "2994/5552\n",
+      " 2069\n",
+      "2995/5552\n",
+      " 2072\n",
+      "2996/5552\n",
+      " 2073\n",
+      "3016/5552\n",
+      " 2116\n",
+      "3017/5552\n",
+      " 2117\n",
+      "3024/5552\n",
+      " 2128\n",
+      "3025/5552\n",
+      " 2129\n",
+      "3036/5552\n",
+      " 2144\n",
+      "3037/5552\n",
+      " 2145\n",
+      "3052/5552\n",
+      " 2168\n",
+      "3053/5552\n",
+      " 2169\n",
+      "3056/5552\n",
+      " 2176\n",
+      "3057/5552\n",
+      " 2177\n",
+      "3062/5552\n",
+      " 2184\n",
+      "3063/5552\n",
+      " 2185\n",
+      "3107/5552\n",
+      " 2280\n",
+      "3108/5552\n",
+      " 2281\n",
+      "3114/5552\n",
+      " 2296\n",
+      "3115/5552\n",
+      " 2297\n",
+      "3128/5552\n",
+      " 2317\n",
+      "3170/5552\n",
+      " 2365\n",
+      "3171/5552\n",
+      " 2366\n",
+      "3174/5552\n",
+      " 2371\n",
+      "3175/5552\n",
+      " 2372\n",
+      "3180/5552\n",
+      " 2379\n",
+      "3181/5552\n",
+      " 2380\n",
+      "3184/5552\n",
+      " 2385\n",
+      "3185/5552\n",
+      " 2386\n",
+      "3235/5552\n",
+      " 2451\n",
+      "3236/5552\n",
+      " 2452\n",
+      "3237/5552\n",
+      " 2453\n",
+      "3238/5552\n",
+      " 2454\n",
+      "3250/5552\n",
+      " 2481\n",
+      "3251/5552\n",
+      " 2482\n",
+      "3272/5552\n",
+      " 2505\n",
+      "3273/5552\n",
+      " 2506\n",
+      "3292/5552\n",
+      "Empty 2538\n",
+      "3309/5552\n",
+      " 2589\n",
+      "3311/5552\n",
+      " 2590\n",
+      "3319/5552\n",
+      "Empty 2641\n",
+      "3320/5552\n",
+      "Empty 2645\n",
+      "3321/5552\n",
+      "Empty 2646\n",
+      "3325/5552\n",
+      "Empty 2665\n",
+      "3326/5552\n",
+      "Empty 2666\n",
+      "3327/5552\n",
+      "Empty 2667\n",
+      "3328/5552\n",
+      "Empty 2668\n",
+      "3338/5552\n",
+      "Empty 2697\n",
+      "3339/5552\n",
+      "Empty 2698\n",
+      "3381/5552\n",
+      "Empty 2849\n",
+      "3383/5552\n",
+      "Empty 2854\n",
+      "3384/5552\n",
+      "Empty 2856\n",
+      "3385/5552\n",
+      "Empty 2862\n",
+      "3386/5552\n",
+      "Empty 2867\n",
+      "3390/5552\n",
+      " 2893\n",
+      "3391/5552\n",
+      " 2894\n",
+      "3392/5552\n",
+      " 2895\n",
+      "3411/5552\n",
+      " 2924\n",
+      "3414/5552\n",
+      "Empty 2936\n",
+      "3415/5552\n",
+      "Empty 2937\n",
+      "\n",
+      " 2943\n",
+      "3416/5552\n",
+      "Empty 2943\n",
+      "3417/5552\n",
+      " 2948\n",
+      "3418/5552\n",
+      " 2950\n",
+      "3419/5552\n",
+      " 2951\n",
+      "3424/5552\n",
+      "Empty 2961\n",
+      "\n",
+      " 2963\n",
+      "3425/5552\n",
+      "Empty 2963\n",
+      "3426/5552\n",
+      " 2967\n",
+      "3429/5552\n",
+      "Empty 2970\n",
+      "3430/5552\n",
+      " 2977\n",
+      "3432/5552\n",
+      " 2981\n",
+      "3435/5552\n",
+      " 2986\n",
+      "3443/5552\n",
+      " 2995\n",
+      "3444/5552\n",
+      " 2996\n",
+      "3445/5552\n",
+      " 2999\n",
+      "3448/5552\n",
+      " 3000\n",
+      "3449/5552\n",
+      " 3001\n",
+      "3450/5552\n",
+      " 3003\n",
+      "3451/5552\n",
+      " 3004\n",
+      "3452/5552\n",
+      " 3005\n",
+      "3453/5552\n",
+      " 3006\n",
+      "3454/5552\n",
+      " 3008\n",
+      "3456/5552\n",
+      " 3010\n",
+      "3457/5552\n",
+      " 3011\n",
+      "3468/5552\n",
+      " 3023\n",
+      "3478/5552\n",
+      "Empty 3035\n",
+      "3482/5552\n",
+      " 3040\n",
+      "3484/5552\n",
+      " 3042\n",
+      "3492/5552\n",
+      " 3050\n",
+      "3493/5552\n",
+      " 3051\n",
+      "3498/5552\n",
+      " 3060\n",
+      "3532/5552\n",
+      " 3266\n",
+      "3534/5552\n",
+      "Empty 3267\n",
+      "3541/5552\n",
+      " 3331\n",
+      "3573/5552\n",
+      " 3424\n",
+      "3579/5552\n",
+      "Empty 3453\n",
+      "3588/5552\n",
+      "Empty 3486\n",
+      "3620/5552\n",
+      " 3556\n",
+      "3621/5552\n",
+      " 3559\n",
+      "3649/5552\n",
+      " 3614\n",
+      "3652/5552\n",
+      " 3619\n",
+      "3663/5552\n",
+      " 3644\n",
+      "3666/5552\n",
+      "Empty 3653\n",
+      "3673/5552\n",
+      " 3677\n",
+      "3676/5552\n",
+      " 3680\n",
+      "3685/5552\n",
+      " 3690\n",
+      "3687/5552\n",
+      "Empty 3691\n",
+      "\n",
+      " 3692\n",
+      "3688/5552\n",
+      "Empty 3692\n",
+      "\n",
+      " 3693\n",
+      "3689/5552\n",
+      "Empty 3693\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "5525/5552"
+      "3696/5552\n",
+      " 3706\n",
+      "3701/5552\n",
+      " 3723\n",
+      "3704/5552\n",
+      " 3732\n",
+      "3739/5552\n",
+      " 3860\n",
+      "3740/5552\n",
+      " 3861\n",
+      "3749/5552\n",
+      "Empty 3886\n",
+      "\n",
+      " 389\n",
+      "3750/5552\n",
+      "Empty 389\n",
+      "3751/5552\n",
+      " 3891\n",
+      "3754/5552\n",
+      " 390\n",
+      "3766/5552\n",
+      " 393\n",
+      "3773/5552\n",
+      " 394\n",
+      "3777/5552\n",
+      " 3945\n",
+      "3789/5552\n",
+      " 3964\n",
+      "3796/5552\n",
+      " 3984\n",
+      "3799/5552\n",
+      "Empty 3998\n",
+      "3802/5552\n",
+      " 401\n",
+      "3808/5552\n",
+      " 402\n",
+      "3809/5552\n",
+      " 4021\n",
+      "3811/5552\n",
+      "Empty 4023\n",
+      "3812/5552\n",
+      "Empty 4025\n",
+      "3813/5552\n",
+      "Empty 4026\n",
+      "3814/5552\n",
+      " 403\n",
+      "3816/5552\n",
+      "Empty 4032\n",
+      "3817/5552\n",
+      "Empty 4034\n",
+      "3818/5552\n",
+      "Empty 4035\n",
+      "3820/5552\n",
+      " 404\n",
+      "3822/5552\n",
+      "Empty 4042\n",
+      "3827/5552\n",
+      " 4050\n",
+      "3914/5552\n",
+      " 4358\n",
+      "3921/5552\n",
+      " 4372\n",
+      "3922/5552\n",
+      " 4373\n",
+      "3956/5552\n",
+      " 4554\n",
+      "3958/5552\n",
+      "Empty 4555\n",
+      "3970/5552\n",
+      " 4621\n",
+      "3994/5552\n",
+      "Empty 4699\n",
+      "4001/5552\n",
+      " 4714\n",
+      "4010/5552\n",
+      "Empty 4743\n",
+      "4014/5552\n",
+      " 476\n",
+      "4018/5552\n",
+      " 477\n",
+      "4022/5552\n",
+      "Empty 4776\n",
+      "4051/5552\n",
+      " 4845\n",
+      "4052/5552\n",
+      " 4849\n",
+      "4078/5552\n",
+      " 4904\n",
+      "4081/5552\n",
+      " 4909\n",
+      "4090/5552\n",
+      " 4934\n",
+      "4093/5552\n",
+      "Empty 4943\n",
+      "4100/5552\n",
+      " 4967\n",
+      "4103/5552\n",
+      " 4970\n",
+      "4112/5552\n",
+      " 4980\n",
+      "4114/5552\n",
+      "Empty 4981\n",
+      "\n",
+      " 4982\n",
+      "4115/5552\n",
+      "Empty 4982\n",
+      "\n",
+      " 4983\n",
+      "4116/5552\n",
+      "Empty 4983\n",
+      "4123/5552\n",
+      " 4996\n",
+      "4131/5552\n",
+      " 5013\n",
+      "4135/5552\n",
+      " 5022\n",
+      "4145/5552\n",
+      " 504\n",
+      "4147/5552\n",
+      " 505\n",
+      "4161/5552\n",
+      " 509\n",
+      "4166/5552\n",
+      " 510\n",
+      "4176/5552\n",
+      " 5150\n",
+      "4177/5552\n",
+      " 5151\n",
+      "4186/5552\n",
+      "Empty 5176\n",
+      "4188/5552\n",
+      " 5181\n",
+      "4209/5552\n",
+      " 5235\n",
+      "4219/5552\n",
+      " 5254\n",
+      "4224/5552\n",
+      " 5274\n",
+      "4227/5552\n",
+      "Empty 5288\n",
+      "4237/5552\n",
+      " 5311\n",
+      "4239/5552\n",
+      "Empty 5315\n",
+      "4240/5552\n",
+      "Empty 5316\n",
+      "4244/5552\n",
+      "Empty 5325\n",
+      "4250/5552\n",
+      " 5333\n",
+      "4277/5552\n",
+      "Empty 539\n",
+      "4319/5552\n",
+      " 555\n",
+      "4323/5552\n",
+      "Empty 5554\n",
+      "4325/5552\n",
+      " 556\n",
+      "4350/5552\n",
+      " 5641\n",
+      "4358/5552\n",
+      " 5655\n",
+      "4359/5552\n",
+      " 5656\n",
+      "4362/5552\n",
+      " 5661\n",
+      "4363/5552\n",
+      " 5662\n",
+      "4364/5552\n",
+      " 5665\n",
+      "4366/5552\n",
+      " 5667\n",
+      "4367/5552\n",
+      " 5668\n",
+      "4368/5552\n",
+      " 5669\n",
+      "4369/5552\n",
+      " 569\n",
+      "4371/5552\n",
+      " 570\n",
+      "4380/5552\n",
+      " 571\n",
+      "4391/5552\n",
+      "Empty 5740\n",
+      "4393/5552\n",
+      " 5744\n",
+      "4405/5552\n",
+      " 5784\n",
+      "4406/5552\n",
+      " 5786\n",
+      "4413/5552\n",
+      " 5804\n",
+      "4424/5552\n",
+      " 5829\n",
+      "4428/5552\n",
+      "Empty 5838\n",
+      "4449/5552\n",
+      " 5953\n",
+      "4450/5552\n",
+      " 5955\n",
+      "4451/5552\n",
+      " 5956\n",
+      "4452/5552\n",
+      " 5957\n",
+      "4453/5552\n",
+      " 5958\n",
+      "4457/5552\n",
+      " 5966\n",
+      "4460/5552\n",
+      " 597\n",
+      "4462/5552\n",
+      " 5972\n",
+      "4463/5552\n",
+      " 5973\n",
+      "4467/5552\n",
+      " 598\n",
+      "4468/5552\n",
+      " 5980\n",
+      "4470/5552\n",
+      " 5983\n",
+      "4471/5552\n",
+      " 5985\n",
+      "4492/5552\n",
+      "Empty 6039\n",
+      "4506/5552\n",
+      "Empty 6112\n",
+      "4507/5552\n",
+      " 6118\n",
+      "4538/5552\n",
+      "Empty 6244\n",
+      "4543/5552\n",
+      " 6259\n",
+      "4550/5552\n",
+      "Empty 6288\n",
+      "4560/5552\n",
+      "Empty 6321\n",
+      "4574/5552\n",
+      "Empty 6366\n",
+      "4575/5552\n",
+      " 6370\n",
+      "4578/5552\n",
+      "Empty 6372\n",
+      "4579/5552\n",
+      " 6374\n",
+      "4586/5552\n",
+      " 6391\n",
+      "4590/5552\n",
+      " 6400\n",
+      "4609/5552\n",
+      "Empty 6469\n",
+      "4610/5552\n",
+      "Empty 6472\n",
+      "4613/5552\n",
+      " 6488\n",
+      "4614/5552\n",
+      " 6489\n",
+      "4622/5552\n",
+      " 6503\n",
+      "4624/5552\n",
+      " 6505\n",
+      "4625/5552\n",
+      " 6508\n",
+      "4626/5552\n",
+      " 6509\n",
+      "4628/5552\n",
+      " 6511\n",
+      "4644/5552\n",
+      " 6535\n",
+      "4646/5552\n",
+      " 6537\n",
+      "4656/5552\n",
+      " 6551\n",
+      "4664/5552\n",
+      " 6565\n",
+      "4669/5552\n",
+      " 6570\n",
+      "4678/5552\n",
+      " 6584\n",
+      "4692/5552\n",
+      " 6605\n",
+      "4715/5552\n",
+      " 6659\n",
+      "4719/5552\n",
+      " 6663\n",
+      "4720/5552\n",
+      " 6664\n",
+      "4727/5552\n",
+      "Empty 6681\n",
+      "4736/5552\n",
+      "Empty 6692\n",
+      "4739/5552\n",
+      "Empty 6697\n",
+      "4743/5552\n",
+      " 6706\n",
+      "4747/5552\n",
+      "Empty 6712\n",
+      "4748/5552\n",
+      " 6716\n",
+      "4764/5552\n",
+      " 675\n",
+      "4767/5552\n",
+      "Empty 6752\n",
+      "4770/5552\n",
+      " 676\n",
+      "4796/5552\n",
+      "Empty 6816\n",
+      "4799/5552\n",
+      "Empty 6821\n",
+      "4802/5552\n",
+      " 6830\n",
+      "4805/5552\n",
+      "Empty 6836\n",
+      "4807/5552\n",
+      " 6840\n",
+      "4826/5552\n",
+      "Empty 6876\n",
+      "4838/5552\n",
+      "Empty 6904\n",
+      "4851/5552\n",
+      "Empty 6932\n",
+      "4856/5552\n",
+      " 6943\n",
+      "4857/5552\n",
+      " 6944\n",
+      "4858/5552\n",
+      " 6945\n",
+      "4872/5552\n",
+      " 6961\n",
+      "4932/5552\n",
+      " 7100\n",
+      "4943/5552\n",
+      " 7152\n",
+      "4947/5552\n",
+      " 7156\n",
+      "4950/5552\n",
+      " 7159\n",
+      "4960/5552\n",
+      " 720\n",
+      "4961/5552\n",
+      " 721\n",
+      "4971/5552\n",
+      " 723\n",
+      "5005/5552\n",
+      " 728\n",
+      "5013/5552\n",
+      " 729\n",
+      "5018/5552\n",
+      " 7294\n",
+      "5019/5552\n",
+      " 7295\n",
+      "5027/5552\n",
+      " 7308\n",
+      "5036/5552\n",
+      " 733\n",
+      "5044/5552\n",
+      " 734\n",
+      "5076/5552\n",
+      "Empty 7389\n",
+      "5081/5552\n",
+      " 7401\n",
+      "5094/5552\n",
+      " 7432\n",
+      "5101/5552\n",
+      " 7439\n",
+      "5110/5552\n",
+      " 7451\n",
+      "5111/5552\n",
+      " 7452\n",
+      "5112/5552\n",
+      " 7453\n",
+      "5114/5552\n",
+      " 7455\n",
+      "5125/5552\n",
+      " 7471\n",
+      "5127/5552\n",
+      " 7473\n",
+      "5134/5552\n",
+      " 7482\n",
+      "5135/5552\n",
+      " 7483\n",
+      "5137/5552\n",
+      " 7489\n",
+      "5142/5552\n",
+      " 7493\n",
+      "5155/5552\n",
+      " 7505\n",
+      "5156/5552\n",
+      " 7506\n",
+      "5160/5552\n",
+      " 751\n",
+      "5165/5552\n",
+      " 7518\n",
+      "5166/5552\n",
+      " 7519\n",
+      "5167/5552\n",
+      " 7521\n",
+      "5168/5552\n",
+      " 7522\n",
+      "5196/5552\n",
+      " 7555\n",
+      "5223/5552\n",
+      " 7584\n",
+      "5247/5552\n",
+      " 7698\n",
+      "5248/5552\n",
+      " 7699\n",
+      "5255/5552\n",
+      "Empty 7721\n",
+      "5258/5552\n",
+      " 7758\n",
+      "5268/5552\n",
+      " 7770\n",
+      "5271/5552\n",
+      " 7774\n",
+      "5272/5552\n",
+      " 7775\n",
+      "5284/5552\n",
+      " 7799\n",
+      "5292/5552\n",
+      " 7808\n",
+      "5299/5552\n",
+      "Empty 7825\n",
+      "5303/5552\n",
+      "Empty 7843\n",
+      "5307/5552\n",
+      " 7866\n",
+      "5312/5552\n",
+      " 7878\n",
+      "5314/5552\n",
+      " 7880\n",
+      "5316/5552\n",
+      " 7884\n",
+      "5317/5552\n",
+      " 7888\n",
+      "5322/5552\n",
+      " 7894\n",
+      "5325/5552\n",
+      " 7898\n",
+      "5332/5552\n",
+      " 7909\n",
+      "5342/5552\n",
+      " 794\n",
+      "5343/5552\n",
+      " 795\n",
+      "5350/5552\n",
+      "Empty 8027\n",
+      "5351/5552\n",
+      "Empty 8028\n",
+      "5353/5552\n",
+      "Empty 8031\n",
+      "5354/5552\n",
+      "Empty 8032\n",
+      "5355/5552\n",
+      " 805\n",
+      "5373/5552\n",
+      "Empty 8216\n",
+      "5419/5552\n",
+      " 888\n",
+      "5423/5552\n",
+      "Empty 8892\n",
+      "5426/5552\n",
+      "Empty 8971\n",
+      "5427/5552\n",
+      " 901\n",
+      "5438/5552\n",
+      "Empty 9104\n",
+      "5469/5552\n",
+      " 930\n",
+      "5501/5552\n",
+      " 949\n",
+      "5503/5552\n",
+      " 950\n",
+      "5515/5552\n",
+      "Empty 9689\n",
+      "5518/5552\n",
+      "Empty 9703\n",
+      "5519/5552\n",
+      "Empty 9704\n",
+      "5520/5552\n",
+      "Empty 9705\n",
+      "5521/5552\n",
+      "Empty 9706\n",
+      "5522/5552\n",
+      "Empty 9707\n",
+      "5523/5552\n",
+      "Empty 9709\n",
+      "5524/5552\n",
+      "Empty 9710\n",
+      "5525/5552\n",
+      "Empty 9711\n",
+      "5552/5552"
      ]
     }
    ],
@@ -412,111 +2050,98 @@
     "import sys\n",
     "i=0\n",
     "n=len(selected)\n",
+    "import time\n",
+    "p=IntProgress(description=\"Processing\",max=n)\n",
+    "display(p)\n",
+    "\n",
     "for id_,row in selected.iterrows():\n",
+    "    p.value+=1\n",
     "    i+=1\n",
     "    try:\n",
     "        df=reformat_data(read_csv_ner(output_dir+\"{0}.csv\".format(row[\"id_doc\"])))\n",
-    "        sys.stdout.write(\"\\r{0}/{1}\".format(i,n))\n",
+    "    except Exception as e:\n",
+    "        print(\"\\n\",row[\"id_doc\"])\n",
+    "    #df=read_csv_ner(output_dir+\"{0}.csv\".format(row[\"id_doc\"]))\n",
+    "    #df=df[skip:]\n",
+    "    #df=reformat_data(df)\n",
+    "    #skip=int(skipPercentage*len(df))\n",
+    "    #df=df[skip:]\n",
+    "    sys.stdout.write(\"\\r{0}/{1}\".format(i,n))\n",
+    "    if df.empty:\n",
+    "        print(\"\\nEmpty\",row[\"id_doc\"])\n",
+    "        df.to_csv(\"/Users/jacquesfize/LOD_DATASETS/disambiguate_1/{0}.csv\".format(row[\"id_doc\"]))\n",
+    "        continue\n",
+    "    df[\"GID\"]=df[df[\"ent_type_\"] == \"LOC\"][\"text\"].apply(\n",
+    "        lambda x: disambiguate(x,lang=data_lang[data_lang[\"id_doc\"] == row[\"id_doc\"]][\"lang\"].values[0])[0]\n",
+    "    )\n",
+    "    df.to_csv(\"/Users/jacquesfize/LOD_DATASETS/disambiguate_1/{0}.csv\".format(row[\"id_doc\"]))\n",
     "\n",
-    "        df[\"GID\"]=df[df[\"ent_type_\"] == \"LOC\"][\"text\"].apply(\n",
-    "            lambda x: disambiguate(x,lang=data_lang[data_lang[\"id_doc\"] == row[\"id_doc\"]][\"lang\"].values[0])[0]\n",
-    "        )\n",
-    "    except:\n",
-    "        df[\"GID\"]='O'\n",
-    "    df.to_csv(\"/Users/jacquesfize/LOD_DATASETS/bv_lac_pos_ner_disambiguate_3/{0}.csv\".format(id_))\n"
+    "    \n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 27,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T17:19:13.241583Z",
-     "start_time": "2018-04-19T17:19:12.415531Z"
+     "end_time": "2018-05-16T09:13:16.096156Z",
+     "start_time": "2018-05-16T09:13:15.843266Z"
     }
    },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div style=\"width:100%;\"><div style=\"position:relative;width:100%;height:0;padding-bottom:60%;\"><iframe src=\"data:text/html;charset=utf-8;base64,\" style=\"position:absolute;width:100%;height:100%;left:0;top:0;border:none !important;\" allowfullscreen webkitallowfullscreen mozallowfullscreen></iframe></div></div>"
-      ],
-      "text/plain": [
-       "<folium.folium.Map at 0x1106b3128>"
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "import folium\n",
-    "m = folium.Map()\n",
-    "for id,row in df[df[\"ent_type_\"] == \"LOC\"].iterrows():\n",
-    "    if not row[\"GID\"] or row[\"GID\"] == \"O\":\n",
-    "        continue\n",
-    "    data=pd.Series(get_data(row[\"GID\"]))\n",
-    "    if \"coord\" in data:\n",
-    "        folium.Marker([data[\"coord\"][\"lat\"], data[\"coord\"][\"lon\"]], popup=data[\"fr\"]).add_to(m)\n",
-    "#folium.Marker([45.3311, -121.7113], popup='<b>Timberline Lodge</b>').add_to(m)\n",
-    "m"
+    "df=reformat_data(read_csv_ner(output_dir+\"{0}.csv\".format(2)))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T17:19:13.246441Z",
-     "start_time": "2018-04-19T17:19:13.243633Z"
+     "end_time": "2018-05-16T09:11:35.133829Z",
+     "start_time": "2018-05-16T09:10:29.700Z"
     }
    },
    "outputs": [],
    "source": [
     "from glob import glob\n",
     "import numpy as np\n",
-    "import sys"
+    "import sys\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T17:19:13.297655Z",
-     "start_time": "2018-04-19T17:19:13.249152Z"
+     "end_time": "2018-05-16T09:11:35.135250Z",
+     "start_time": "2018-05-16T09:10:29.702Z"
     }
    },
    "outputs": [],
    "source": [
-    "files=glob(\"/Users/jacquesfize/LOD_DATASETS/bv_lac_pos_ner_disambiguate_2/*.csv\")"
+    "files=glob(\"/Users/jacquesfize/LOD_DATASETS/bv_lac_pos_ner_disambiguate_5/*.csv\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T17:21:12.711563Z",
-     "start_time": "2018-04-19T17:19:13.300567Z"
+     "end_time": "2018-05-16T09:11:35.136229Z",
+     "start_time": "2018-05-16T09:10:29.704Z"
     }
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "5552/5552"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "i=0\n",
     "n=len(files)\n",
     "points={}\n",
+    "p=IntProgress(description=\"Processing\",max=n)\n",
+    "display(p)\n",
     "for fn in files:\n",
     "    i+=1\n",
+    "    p.value+=1\n",
     "    sys.stdout.write(\"\\r{0}/{1}\".format(i,n))\n",
     "    df=pd.read_csv(fn)\n",
     "    df=df.fillna(\"O\")\n",
@@ -531,65 +2156,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-04-19T17:21:12.844337Z",
-     "start_time": "2018-04-19T17:21:12.713671Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "README.md                      \u001b[1m\u001b[36mgraphs\u001b[m\u001b[m/\r\n",
-      "\u001b[1m\u001b[36m__pycache__\u001b[m\u001b[m/                   \u001b[1m\u001b[36mgui_graph_viewer\u001b[m\u001b[m/\r\n",
-      "\u001b[1m\u001b[36mconfig\u001b[m\u001b[m/                        \u001b[1m\u001b[36mhelpers\u001b[m\u001b[m/\r\n",
-      "\u001b[1m\u001b[36mdata\u001b[m\u001b[m/                          \u001b[1m\u001b[36mmodels\u001b[m\u001b[m/\r\n",
-      "\u001b[1m\u001b[36meval\u001b[m\u001b[m/                          \u001b[1m\u001b[36mnlp\u001b[m\u001b[m/\r\n",
-      "eval.py                        \u001b[1m\u001b[36mnotebooks\u001b[m\u001b[m/\r\n",
-      "\u001b[31mexp_17_avril.sh\u001b[m\u001b[m*               pipeline.py\r\n",
-      "\u001b[31mexp_30mars.sh\u001b[m\u001b[m*                 points_dump.txt\r\n",
-      "\u001b[31mexp_fev_18.sh\u001b[m\u001b[m*                 \u001b[31mrequirements.txt\u001b[m\u001b[m*\r\n",
-      "\u001b[31mexp_mar_12.sh\u001b[m\u001b[m*                 \u001b[1m\u001b[36mresources\u001b[m\u001b[m/\r\n",
-      "extract_log                    temp.py\r\n",
-      "generate_data.py               test.py\r\n",
-      "generate_data_csv.py           test_gmatch4py.py\r\n",
-      "generate_selected_document.py  \u001b[1m\u001b[36mtests\u001b[m\u001b[m/\r\n",
-      "generate_transform.py          tools.py\r\n",
-      "\u001b[1m\u001b[36mgmatch4py\u001b[m\u001b[m/                     \u001b[1m\u001b[36mtt4py\u001b[m\u001b[m/\r\n"
-     ]
-    }
-   ],
-   "source": [
-    "ls"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T17:22:50.320185Z",
-     "start_time": "2018-04-19T17:21:12.847627Z"
+     "end_time": "2018-05-16T09:11:35.137502Z",
+     "start_time": "2018-05-16T09:10:29.706Z"
     }
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "5552/5552"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "i=0\n",
     "n=len(files)\n",
     "count={}\n",
+    "p=IntProgress(description=\"Processing\",max=n)\n",
+    "display(p)\n",
     "for fn in files:\n",
     "    i+=1\n",
+    "    p.value+=1\n",
     "    sys.stdout.write(\"\\r{0}/{1}\".format(i,n))\n",
     "    df=pd.read_csv(fn)\n",
     "    df=df.fillna(\"O\")\n",
@@ -603,28 +2186,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T17:23:02.158693Z",
-     "start_time": "2018-04-19T17:22:50.322656Z"
+     "end_time": "2018-05-16T09:11:35.139281Z",
+     "start_time": "2018-05-16T09:10:29.708Z"
     }
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "5552/5552"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "i=0\n",
     "n=len(files)\n",
     "count_idf={}\n",
+    "p=IntProgress(description=\"Processing\",max=n)\n",
+    "display(p)\n",
     "for fn in files:\n",
     "    i+=1\n",
+    "    p.value+=1\n",
     "    sys.stdout.write(\"\\r{0}/{1}\".format(i,n))\n",
     "    df=pd.read_csv(fn)\n",
     "    df=df.fillna(\"O\")\n",
@@ -638,11 +2216,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T17:23:02.183240Z",
-     "start_time": "2018-04-19T17:23:02.160929Z"
+     "end_time": "2018-05-16T09:11:35.140723Z",
+     "start_time": "2018-05-16T09:10:29.708Z"
     }
    },
    "outputs": [],
@@ -656,107 +2234,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T17:23:02.200130Z",
-     "start_time": "2018-04-19T17:23:02.187288Z"
+     "end_time": "2018-05-16T09:11:35.142191Z",
+     "start_time": "2018-05-16T09:10:29.710Z"
     }
    },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>lat</th>\n",
-       "      <th>lon</th>\n",
-       "      <th>count</th>\n",
-       "      <th>idf</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>GD3404996</th>\n",
-       "      <td>-20.00000</td>\n",
-       "      <td>47.00000</td>\n",
-       "      <td>16408</td>\n",
-       "      <td>-3.574217</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>GD4803039</th>\n",
-       "      <td>9.80000</td>\n",
-       "      <td>38.73330</td>\n",
-       "      <td>1774</td>\n",
-       "      <td>-1.901458</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>GD4160491</th>\n",
-       "      <td>40.86677</td>\n",
-       "      <td>-74.31626</td>\n",
-       "      <td>1335</td>\n",
-       "      <td>-1.935504</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>GD12293461</th>\n",
-       "      <td>52.41000</td>\n",
-       "      <td>16.83000</td>\n",
-       "      <td>75</td>\n",
-       "      <td>0.938270</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>GD11540371</th>\n",
-       "      <td>30.47187</td>\n",
-       "      <td>-97.27472</td>\n",
-       "      <td>501</td>\n",
-       "      <td>-1.246532</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                 lat       lon  count       idf\n",
-       "GD3404996  -20.00000  47.00000  16408 -3.574217\n",
-       "GD4803039    9.80000  38.73330   1774 -1.901458\n",
-       "GD4160491   40.86677 -74.31626   1335 -1.935504\n",
-       "GD12293461  52.41000  16.83000     75  0.938270\n",
-       "GD11540371  30.47187 -97.27472    501 -1.246532"
-      ]
-     },
-     "execution_count": 24,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "df.head(5)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T17:23:07.914581Z",
-     "start_time": "2018-04-19T17:23:02.202900Z"
+     "end_time": "2018-05-16T09:11:35.143577Z",
+     "start_time": "2018-05-16T09:10:29.712Z"
     }
    },
-   "outputs": [
-    {
-     "data": {
-      "image/png": "\n",
-      "text/plain": [
-       "<matplotlib.figure.Figure at 0x10b99d5f8>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Libraries\n",
     "from mpl_toolkits.basemap import Basemap\n",
@@ -780,30 +2279,19 @@
     "m.colorbar()\n",
     "plt.title(\"Spatial Entities Occurrence in BVLAC Corpus (World Scale)\",fontdict={\"fontsize\":24})\n",
     "# Save as png\n",
-    "plt.savefig('SE_Dispersion_World.pdf', bbox_inches='tight')"
+    "plt.savefig('SE_Dispersion_World_{0}Per.pdf'.format(skipPercentage), bbox_inches='tight')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-20T06:03:14.313187Z",
-     "start_time": "2018-04-20T06:03:10.007003Z"
+     "end_time": "2018-05-16T09:11:35.145171Z",
+     "start_time": "2018-05-16T09:10:29.714Z"
     }
    },
-   "outputs": [
-    {
-     "data": {
-      "image/png": "\n",
-      "text/plain": [
-       "<matplotlib.figure.Figure at 0x10f361da0>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Libraries\n",
     "from mpl_toolkits.basemap import Basemap\n",
@@ -831,38 +2319,19 @@
     "# Save as png\n",
     "m.colorbar(location='bottom')\n",
     "plt.title(\"Spatial Entities Occurrence in BVLAC Corpus (Madagascar Scale)\",fontdict={\"fontsize\":15})\n",
-    "plt.savefig('SE_Dispersion_MADA.pdf', bbox_inches='tight')"
+    "plt.savefig('SE_Dispersion_MADA_{0}Per.pdf'.format(skipPercentage), bbox_inches='tight')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T17:23:15.457718Z",
-     "start_time": "2018-04-19T17:23:10.235688Z"
+     "end_time": "2018-05-16T09:11:35.146513Z",
+     "start_time": "2018-05-16T09:10:29.716Z"
     }
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/matplotlib/collections.py:877: RuntimeWarning: invalid value encountered in sqrt\n",
-      "  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor\n"
-     ]
-    },
-    {
-     "data": {
-      "image/png": "\n",
-      "text/plain": [
-       "<matplotlib.figure.Figure at 0x10aee8ef0>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Libraries\n",
     "from mpl_toolkits.basemap import Basemap\n",
@@ -886,38 +2355,19 @@
     "m.colorbar()\n",
     "plt.title(\"Spatial Entities IDF in BVLAC Corpus (World Scale)\",fontdict={\"fontsize\":24})\n",
     "# Save as png\n",
-    "plt.savefig('SE_Dispersion_IDF_World.pdf', bbox_inches='tight')"
+    "plt.savefig('SE_Dispersion_IDF_World_{0}Per.pdf'.format(skipPercentage), bbox_inches='tight')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-20T05:55:33.963110Z",
-     "start_time": "2018-04-20T05:55:29.673733Z"
+     "end_time": "2018-05-16T09:11:35.147911Z",
+     "start_time": "2018-05-16T09:10:29.718Z"
     }
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/matplotlib/collections.py:877: RuntimeWarning: invalid value encountered in sqrt\n",
-      "  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor\n"
-     ]
-    },
-    {
-     "data": {
-      "image/png": "\n",
-      "text/plain": [
-       "<matplotlib.figure.Figure at 0x110b92390>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Libraries\n",
     "from mpl_toolkits.basemap import Basemap\n",
@@ -945,16 +2395,16 @@
     "# Save as png\n",
     "m.colorbar(location='bottom')\n",
     "plt.title(\"Spatial Entities IDF in BVLAC Corpus (Madagascar Scale)\",fontdict={\"fontsize\":15})\n",
-    "plt.savefig('SE_Dispersion_IDF_MADA.pdf', bbox_inches='tight')"
+    "plt.savefig('SE_Dispersion_IDF_MADA_{0}Per.pdf'.format(skipPercentage), bbox_inches='tight')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-04-19T17:23:17.645281Z",
-     "start_time": "2018-04-19T17:23:17.642734Z"
+     "end_time": "2018-05-16T09:11:35.149138Z",
+     "start_time": "2018-05-16T09:10:29.720Z"
     }
    },
    "outputs": [],
@@ -963,6 +2413,147 @@
     "[c+(\"\" if c[-1] in [\"\\'\",\"-\"] else \" \") for c in ch]"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-16T09:11:35.150112Z",
+     "start_time": "2018-05-16T09:10:29.722Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "files=glob(\"/Users/jacquesfize/LOD_DATASETS/bv_lac_pos_ner_disambiguate_3/*.csv\")\n",
+    "i=0\n",
+    "n=len(files)\n",
+    "old_points={}\n",
+    "for fn in files:\n",
+    "    i+=1\n",
+    "    sys.stdout.write(\"\\r{0}/{1}\".format(i,n))\n",
+    "    df=pd.read_csv(fn)\n",
+    "    df=df.fillna(\"O\")\n",
+    "    for id,row in df.iterrows():\n",
+    "        if not row[\"GID\"] or row[\"GID\"] == \"O\":\n",
+    "            continue\n",
+    "        if not row[\"GID\"] in old_points:\n",
+    "            data=pd.Series(get_data(row[\"GID\"]))\n",
+    "            if \"coord\" in data:\n",
+    "                old_points[row[\"GID\"]]=[data[\"coord\"][\"lat\"], data[\"coord\"][\"lon\"]]\n",
+    "\n",
+    "        "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-16T09:11:35.151756Z",
+     "start_time": "2018-05-16T09:10:29.722Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "len(points),len(old_points)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-16T09:11:35.153353Z",
+     "start_time": "2018-05-16T09:10:29.724Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "diff={}\n",
+    "new_keys=list(old_points.keys())\n",
+    "for k in new_keys:\n",
+    "    if not k in points:\n",
+    "        diff[k]=old_points[k]\n",
+    "len(diff)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-16T09:11:35.155215Z",
+     "start_time": "2018-05-16T09:10:29.726Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Build a lat/lon DataFrame from the entries collected in diff,\n",
+    "# then set the dimension of the figure\n",
+    "\n",
+    "df=pd.DataFrame.from_dict(diff, orient='index')\n",
+    "df=df.rename(columns={0:\"lat\",1:\"lon\"})\n",
+    "\n",
+    "my_dpi=96\n",
+    "plt.figure(figsize=(2600/my_dpi, 1800/my_dpi), dpi=my_dpi)\n",
+    " \n",
+    "# Make the background map\n",
+    "m=Basemap(llcrnrlon=-180, llcrnrlat=-65,urcrnrlon=180,urcrnrlat=80)\n",
+    "#m.drawmapboundary(fill_color='#A6CAE0', linewidth=0)\n",
+    "m.fillcontinents(color='grey', alpha=0.3)\n",
+    "m.drawcoastlines(linewidth=0.1, color=\"#666666\")\n",
+    "#m.arcgisimage(service='ESRI_Imagery_World_2D', xpixels = 1500)\n",
+    "# Add a point per position\n",
+    "m.scatter(df['lon'], df['lat'], s=1 ,cmap=\"autumn\")\n",
+    " \n",
+    "# copyright and source data info\n",
+    "#plt.text( -170, -58,\"Répartition des entités spatiales dans le corpus BVLAC (5500 documents)\", ha='left', va='bottom', size=9, color='#555555' )\n",
+    "#m.colorbar()\n",
+    "plt.title(\"Spatial Entities IDF in BVLAC Corpus (World Scale)\",fontdict={\"fontsize\":24})\n",
+    "# Save as PDF\n",
+    "plt.savefig('SE_Dispersion_Diff_World_{0}Per.pdf'.format(skipPercentage), bbox_inches='tight')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-05-16T09:11:35.156455Z",
+     "start_time": "2018-05-16T09:10:29.728Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Libraries\n",
+    "from mpl_toolkits.basemap import Basemap\n",
+    "import matplotlib.pyplot as plt\n",
+    " \n",
+    "# Set the dimension of the figure\n",
+    "my_dpi=96\n",
+    "plt.figure(figsize=(2600/my_dpi, 1800/my_dpi), dpi=my_dpi)\n",
+    " \n",
+    "# Make the background map (reference bounding box: 43.2541870461, -25.6014344215, 50.4765368996, -12.0405567359)\n",
+    "m=Basemap(llcrnrlon=43.2541870461, llcrnrlat=-25.6,urcrnrlon=50.4765368996,urcrnrlat=-11.5,resolution=\"h\")\n",
+    "#m.drawmapboundary(fill_color='#A6CAE0', linewidth=0)\n",
+    "m.fillcontinents(color='grey', alpha=0.3)\n",
+    "m.drawcoastlines(linewidth=0.1, color=\"#666666\")\n",
+    "#m.arcgisimage(service='ESRI_Imagery_World_2D', xpixels = 1500)\n",
+    "\n",
+    "df2=df[(df['lon'] > 43.5) & (df['lon'] < 50.47) & (df['lat'] > -25.6) & (df['lat'] < -12.04) ]\n",
+    "\n",
+    "# Add a point per position\n",
+    "#m.scatter(df2['lon'], df2['lat'], s=df2['count']/6, alpha=0.4,  cmap=\"autumn\")\n",
+    "m.scatter(df2['lon'], df2['lat'],s=1,  cmap=\"YlOrRd\")\n",
+    "# copyright and source data info\n",
+    "#plt.text( -170, -58,\"Répartition des entités spatiales dans le corpus BVLAC (5500 documents)\", ha='left', va='bottom', size=9, color='#555555' )\n",
+    " \n",
+    "# Add the title and save as PDF\n",
+    "#m.colorbar(location='bottom')\n",
+    "plt.title(\"Spatial Entities IDF in BVLAC Corpus (Madagascar Scale)\",fontdict={\"fontsize\":15})\n",
+    "plt.savefig('SE_Dispersion_Diff_MADA_{0}Per.pdf'.format(skipPercentage), bbox_inches='tight')"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -987,7 +2578,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.0"
+   "version": "3.6.5"
   },
   "toc": {
    "nav_menu": {},
diff --git a/pipeline.py b/pipeline.py
index 8c47af0..264f4a0 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -12,6 +12,7 @@ from nlp.ner.ner import NER
 from nlp.ner.stanford_ner import StanfordNER
 from nlp.pos_tagger.tagger import Tagger
 from nlp.pos_tagger.treetagger import TreeTagger
+import json
 
 
 class Pipeline(object):
diff --git a/temp.py b/temp.py
new file mode 100644
index 0000000..37b907c
--- /dev/null
+++ b/temp.py
@@ -0,0 +1,181 @@
+# coding: utf-8
+import argparse
+import glob
+import logging
+import string
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+from langdetect import detect
+from progressbar import ProgressBar, Timer, Bar, ETA, Counter
+
+from nlp.disambiguator.geodict_gaurav import *
+from pipeline import *
+
+
+logging.basicConfig(format='%(asctime)s %(message)s')
+
+def filter_nonprintable(text):
+    # Collect the ASCII characters (code points 0-127) that are not in string.printable
+    nonprintable = set([chr(i) for i in range(128)]).difference(string.printable)
+    # Use translate to remove all non-printable characters
+    return text.translate({ord(character):None for character in nonprintable})
+
+parser = argparse.ArgumentParser()
+parser.add_argument("texts_input_dir")
+parser.add_argument("graphs_output_dir")
+parser.add_argument("metadata_output_fn")
+
+subparsers = parser.add_subparsers(help='commands')
+
+normal = subparsers.add_parser(
+    'normal', help='Basic STR generation. No argument are necessary !')
+normal.set_defaults(which="norm")
+
+
+gen_parser = subparsers.add_parser(
+    'generalisation', help='Apply a generalisation transformation on the generated STRs')
+gen_parser.set_defaults(which="gene")
+gen_parser.add_argument(
+    '-t','--type_gen', help='Type of generalisation',default="all")
+gen_parser.add_argument(
+    '-n', help='Language',default=1)
+gen_parser.add_argument(
+    '-b','--bound', help='If Generalisation is bounded, this arg. correspond'
+                         'to the maximal ',default="country")
+
+ext_parser = subparsers.add_parser(
+    'extension', help='Apply a extension process on the STRs')
+ext_parser.set_defaults(which="ext")
+ext_parser.add_argument(
+    '-d','--distance', help='radius distance',default=150)
+ext_parser.add_argument(
+    '-u','--unit', help='unit used for the radius distance',default="km")
+ext_parser.add_argument(
+    '-a','--adjacent_count', help='number of adjacent SE add to the STR',default=1)
+
+args = parser.parse_args()
+if "which" in args:
+    if args.which =="gene":
+        args.type_trans="gen"
+    elif args.which =="ext":
+        args.type_trans="ext"
+
+print("Parameters entered : ",args)
+
+
+start = time.time()
+class_=StanfordNER
+# Initialise one NLP pipeline per supported language (en, fr, es)
+pipeline= {
+    "en":Pipeline(lang="english",tagger=Tagger(),ner=class_(lang="en")),
+    "fr":Pipeline(lang="french",tagger=Tagger(),ner=class_(lang="fr")),
+    "es":Pipeline(lang="espagnol",tagger=Tagger(),ner=class_(lang="es"))
+}
+
+
+
+# Read Input Files
+import re
+texts_=[]
+if os.path.exists(args.texts_input_dir):
+    files_glob= glob.glob(args.texts_input_dir+"/*.txt")
+    files_=["" ]* len(files_glob)
+    for fn in files_glob:
+        id = int(re.findall("\d+", fn)[-1])
+        files_[id]=fn
+    if not files_:
+        print("No .txt files found in {0}".format(args.texts_input_dir))
+        exit()
+    for fn in files_:
+        try:
+            tex=open(fn).read()
+            #lang = detect(tex) #for bug encoding
+            texts_.append(tex)
+        except:
+            print("{0} could'nt be read ! Add Lorem Ipsum instead".format(fn))
+            texts_.append("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.")
+
+
+# Create the output directory if it doesn't exist yet
+if not os.path.exists(args.graphs_output_dir):
+    os.makedirs(args.graphs_output_dir)
+
+if not texts_:
+    print("No text files were loaded !")
+    exit()
+
+
+
+
+data={}
+n=0
+logging.info("Identify Document(s) language(s)")
+with ProgressBar(max_value=len(texts_),widgets=[' [', Timer(), '] ',Bar(),'(', Counter(),')','(', ETA(), ')']) as pg:
+    for text in range(len(texts_)):
+        pg.update(text)
+        if not text:
+            lang="en"
+        else:
+            try:
+                lang=detect(texts_[text])
+
+            except Exception as e:
+                lang="en"
+            #print(lang, text)
+        if not lang in data and lang in pipeline:
+            data[lang]=[]
+        if lang in pipeline:
+            data[lang].append(text)
+        else:
+            if not "en" in data:data["en"]=[] # Can happen: an unsupported language was detected before any "en" document
+            data["en"].append(text)
+    # except:
+        #     n+=1 # encoding error
+
+associated_es={}
+count_per_doc={}
+list_gs=[]
+i=0
+
+
+
+def workSTR(id_doc,text,count_per_doc,associated_es, list_gs,pg,lang):
+    global i
+    if not text:
+        count_per_doc[id_doc] = {}
+        associated_es[id_doc] = {}
+        g = nx.MultiDiGraph()
+        list_gs.append(g)
+
+    else:
+        t = filter_nonprintable(text)
+        # try:
+        str, count, se_identified = pipeline[lang].build(t, None, **vars(args))
+        list_gs.append(str.graph)
+        # Save Metadata
+        count_per_doc[id_doc] = count
+        associated_es[id_doc] = se_identified
+
+
+    # Save Graph structure
+    nx.write_gexf(list_gs[-1], args.graphs_output_dir + "/{0}.gexf".format(id_doc))
+    i+=1
+    pg.update(i)
+
+
+logging.info("Extracting Toponyms and Building STR...")
+queue=[]
+with  ThreadPoolExecutor(max_workers=4) as executor:
+    with ProgressBar(max_value=len(texts_),widgets=[' [', Timer(), '] ',Bar(),'(', Counter(),')','(', ETA(), ')']) as pg:
+        pg.start()
+        for lang in data:
+            for id_doc in data[lang]:
+                future = executor.submit(workSTR,id_doc,texts_[id_doc],count_per_doc,associated_es, list_gs,pg,lang)
+
+
+# Save Metadata
+open(os.path.join(args.graphs_output_dir,args.metadata_output_fn),'w').write(json.dumps([associated_es,count_per_doc],indent=4))
+
+
+print("--- %s seconds ---" % (time.time() - start))
\ No newline at end of file
-- 
GitLab