From fbcd363465474f0f8fed385282957873f3128c47 Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Fri, 29 Jun 2018 12:10:11 +0200
Subject: [PATCH] - Update Gmatch4py - new interface to annotate spatial
 similarity - New adjacency source

---
 config/config.json                                 |   4 +-
 eval.py                                            | 206 ++---
 exp_22_may.sh                                      |  55 +-
 gmatch4py/data/source/source1.txt                  |  36 -
 gmatch4py/exception/__init__.py                    |   8 +-
 .../ged/algorithm/abstract_graph_edit_dist.py      | 112 ---
 gmatch4py/ged/algorithm/edge_edit_dist.py          |  29 -
 gmatch4py/ged/algorithm/graph_edit_dist.py         |  71 --
 gmatch4py/ged/approximate_ged.py                   |  20 -
 gmatch4py/ged/bipartite_graph_matching_2.py        | 147 ----
 gmatch4py/ged/graph/edge_graph.py                  |  16 -
 gmatch4py/ged/greedy_edit_distance.py              |  44 -
 gmatch4py/ged/hausdorff_edit_distance.py           | 145 ----
 gmatch4py/helpers/__init__.py                      |   1 -
 gmatch4py/helpers/networkx_parser.py               | 148 ----
 gmatch4py/kernels/weisfeiler_lehman.py             | 136 ---
 gmatch4py_cython/gmatch4py/bag_of_cliques.pyx      | 109 ++-
 .../gmatch4py/ged/approximate_ged.pyx              |  33 +-
 .../ged/bipartite_graph_matching_2.pyx             |   2 +-
 .../gmatch4py/ged/greedy_edit_distance.pyx         |   2 +-
 .../gmatch4py/ged/hausdorff_edit_distance.pyx      |  12 +-
 gmatch4py_cython/gmatch4py/jaccard.pyx             |   7 +-
 .../gmatch4py/kernels/weisfeiler_lehman.pyx        |   2 +-
 .../gmatch4py/vertex_edge_overlap.pyx              |  30 +-
 gui_graph_viewer/db.py                             |  22 +-
 gui_graph_viewer/server.py                         |  93 ++
 gui_graph_viewer/static/css/dashboard.css          |   6 +-
 gui_graph_viewer/static/js/helpers.js              |  10 +-
 gui_graph_viewer/templates/indexv2.html            | 189 +++++
 helpers/gazeteer_helpers.py                        |  24 +-
 models/str.py                                      |  12 +-
 nlp/bow_se.py                                      |   2 +-
 nlp/disambiguator/models/bigram.py                 |  22 +-
 nlp/disambiguator/wikipedia_cooc.py                |  89 +-
 notebooks/EvalDesambiguisationMada.ipynb           | 794 +++++++++++++++++-
 notebooks/EvalDesambiguisationPADIWEB.ipynb        | 187 ++++-
 tools.py                                           |   2 +-
 37 files changed, 1616 insertions(+), 1211 deletions(-)
 delete mode 100644 gmatch4py/data/source/source1.txt
 delete mode 100644 gmatch4py/ged/algorithm/abstract_graph_edit_dist.py
 delete mode 100644 gmatch4py/ged/algorithm/edge_edit_dist.py
 delete mode 100644 gmatch4py/ged/algorithm/graph_edit_dist.py
 delete mode 100644 gmatch4py/ged/approximate_ged.py
 delete mode 100644 gmatch4py/ged/bipartite_graph_matching_2.py
 delete mode 100644 gmatch4py/ged/graph/edge_graph.py
 delete mode 100644 gmatch4py/ged/greedy_edit_distance.py
 delete mode 100644 gmatch4py/ged/hausdorff_edit_distance.py
 delete mode 100644 gmatch4py/helpers/__init__.py
 delete mode 100644 gmatch4py/helpers/networkx_parser.py
 delete mode 100644 gmatch4py/kernels/weisfeiler_lehman.py
 create mode 100644 gui_graph_viewer/templates/indexv2.html

diff --git a/config/config.json b/config/config.json
index eefac00..c96b03b 100644
--- a/config/config.json
+++ b/config/config.json
@@ -3,9 +3,9 @@
   "stanford_nlp_home":"/Users/jacquesfize/.services/stanford-corenlp-full-2017-06-09",
   "osm_boundaries_directory":"/Users/jacquesfize/install",
   "core_nlp_URL":"http://localhost:9000",
-  "es_server_old":"http://192.168.1.15:9200/",
+  "es_server_old":"http://172.16.10.11:9200/",
   "es_server":"http://localhost:9200/",
-  "database_json":"resources/database_exp_12_mars.db",
+  "database_json":"resources/database_exp_25_may.db",
   "log_file":"extract_log",
   "wiki_cooc_dis":{
     "cooc_freq":"/Users/jacquesfize/nas_cloud/Code/str-python/resources/coocurrence_wiki.pkl",
diff --git a/eval.py b/eval.py
index 48b8e81..9e809f9 100644
--- a/eval.py
+++ b/eval.py
@@ -21,112 +21,127 @@ from nlp.bow_se import BOWSE
 from pipeline import *

 # Function for output generation
-def_temp=[36,-36]
-temp=def_temp
-max_temp=-30
-dec=5
+def_temp = [36, -36]
+temp = def_temp
+max_temp = -30
+dec = 5
+
 def getLocInfo(id_):
-    global temp,dec
+    global temp, dec
     try:
-        data=get_data(id_)
+        data = get_data(id_)
         if 'coord' in data:
-            return [data["coord"]["lat"],data["coord"]["lon"]]
+            return [data["coord"]["lat"], data["coord"]["lon"]]
         else:
-            temp = [temp[0] , temp[1]+dec]
+            temp = [temp[0], temp[1] + dec]
             if temp[1] >= max_temp:
-                temp = [temp[0] +dec, def_temp[1]]
+                temp = [temp[0] + dec, def_temp[1]]
             return temp
     except:
         pass
+
 def get_associated_es(associated_es_data):
     global temp
-    new_={}
-    temp=def_temp
+    new_ = {}
+    temp = def_temp
     for id_ in associated_es_data:
         try:
-            new_[id_]={"label":get_data(id_)["en"],"coord":getLocInfo(id_)}
+            new_[id_] = {"label": get_data(id_)["en"], "coord": getLocInfo(id_)}
         except:
             new_[id_] = {"label": id_, "coord": getLocInfo(id_)}
     return new_

-def getEdges4Draw(data,edges):
-    lines=[]
+
+def getEdges4Draw(data, edges):
+    lines = []
     for ed in edges:
-        lines.append([data[ed[0]]["coord"],data[ed[1]]["coord"],ed[2]["color"]])
+        lines.append([data[ed[0]]["coord"], data[ed[1]]["coord"], ed[2]["color"]])
         if lines[-1][-1] == "cyan":
             lines[-1][-1] = "blue";
     return lines

+
 # Similarity Function between graph and a set of graphs
-def compareMCS(graphs,selected):
-    return 1-MCS.compare(graphs,selected)
+def compareMCS(graphs, selected):
+    return 1 - MCS.compare(graphs, selected)
+
+
 # GED algorithm
-def compareGED(graphs,selected):
-    return ApproximateGraphEditDistance.compare(graphs,selected)
+def compareGED(graphs, selected):
+    return ApproximateGraphEditDistance.compare(graphs, selected)
+

-def compareBP2(graphs,selected):
-    return BP_2.compare(graphs,selected)
+def compareBP2(graphs, selected):
+    return BP_2.compare(graphs, selected)
+

-def compareHED(graphs,selected):
-    return HED.compare(graphs,selected)
+def compareHED(graphs, selected):
+    return HED.compare(graphs, selected)
+

-def compareGreedy(graphs,selected):
-    return GreedyEditDistance.compare(graphs,selected)
+def compareGreedy(graphs, selected):
+    return GreedyEditDistance.compare(graphs, selected)
+

-def compareWLSubTreeKernel(graphs,selected):
-    return 1 - WeisfeleirLehmanKernel.compare(graphs,selected,h=3)
+def compareWLSubTreeKernel(graphs, selected):
+    return 1 - WeisfeleirLehmanKernel.compare(graphs, selected, h=3)
+

-def compareBOWSE(graphs,selected):
-    return 1-BOWSE.compare(graphs,selected)
+def compareBOWSE(graphs, selected):
+    return 1 - BOWSE.compare(graphs, selected)
+

-def compareBOC(graphs_array,selected):
-    return 1 - BagOfCliques.compare(graphs_array,selected)
+def compareBOC(graphs_array, selected):
+    return np.ones((len(graphs_array),len(graphs_array))) - BagOfCliques.compare(graphs_array, selected)

-def compareVEO(graphs_array,selected):
-    return 1 - VertexEdgeOverlap.compare(graphs_array,selected)

-def compareJaccard(graphs_array,selected):
-    return 1 - Jaccard.compare(graphs_array,selected)

-funcDict={
-    "MCS":compareMCS,
-    "VEO":compareVEO,
-    "GED":compareGED,
-    "BP2":compareBP2,
-    "HED":compareHED,
-    "GREEDY":compareGreedy,
-    "WLSUBTREE":compareWLSubTreeKernel,
-    "BOWSE":compareBOWSE,
-    "BOC":compareBOC,
-    "JACCARD":compareJaccard
-}
+def compareVEO(graphs_array, selected):
+    return 1 - VertexEdgeOverlap.compare(graphs_array, selected)
+
+
+def compareJaccard(graphs_array, selected):
+    return 1 - Jaccard.compare(graphs_array, selected)
+
+
+funcDict = {
+    "MCS": compareMCS,
+    "VEO": compareVEO,
+    "GED": compareGED,
+    "BP2": compareBP2,
+    "HED": compareHED,
+    "GREEDY": compareGreedy,
+    "WLSUBTREE": compareWLSubTreeKernel,
+    "BOWSE": compareBOWSE,
+    "BOC": compareBOC,
+    "JACCARD": compareJaccard
+}

 import argparse
+
 parser = argparse.ArgumentParser()
 parser.add_argument("distance")
 parser.add_argument("texts_dir")
 parser.add_argument("graphs_dir")
 parser.add_argument("metadata_fn")
 parser.add_argument("original_dir")
-parser.add_argument("-s","--selectedGraph")
-parser.add_argument("-a","--all",action="store_true")
-parser.add_argument("-o","--output",help="Output Filename")
+parser.add_argument("-s", "--selectedGraph")
+parser.add_argument("-a", "--all", action="store_true")
+parser.add_argument("-o", "--output", help="Output Filename")

 args = parser.parse_args()
-original_dir=args.original_dir
+original_dir = args.original_dir

 if not args.distance in funcDict.keys():
-    raise NotFoundDistance(args.distance,funcDict)
+    raise NotFoundDistance(args.distance, funcDict)
     exit()

 # Load all the text from the corpus
-texts=[]
+texts = []
 if os.path.exists(args.texts_dir):
-    files_glob= glob.glob(args.texts_dir+"/*.txt")
+    files_glob = glob.glob(args.texts_dir + "/*.txt")
     texts = [""] * len(files_glob)
     for fn in files_glob:
         id = int(re.findall("\d+", fn)[-1])
@@ -150,24 +165,24 @@ if not texts:

 # Load graph data and associated spatial entities of each graph
-assC=json.load(open(args.metadata_fn))
-associated_es,count_per_doc=assC[0],assC[1]
+assC = json.load(open(args.metadata_fn))
+associated_es, count_per_doc = assC[0], assC[1]

-graphs={}
-for file in glob.glob(args.graphs_dir.rstrip("/")+"/*.gexf"):
-    id=int(re.findall("\d+",file)[-1])
-    graphs[id]=nx.read_gexf(file)
+graphs = {}
+for file in glob.glob(args.graphs_dir.rstrip("/") + "/*.gexf"):
+    id = int(re.findall("\d+", file)[-1])
+    graphs[id] = nx.read_gexf(file)

-graphs_array = [None for i in range(max(graphs.keys())+1)]
-for i,g in graphs.items():
-    graphs_array[i]=g
+graphs_array = [nx.Graph() for i in range(max(graphs.keys()) + 1)]
+for i, g in graphs.items():
+    graphs_array[i] = g

 # We take 50 documents chosen randomly. Then we test, if the top-10 returned documents are relevant !
 if args.all:
-    selected_documents_=list(graphs.keys())
+    selected_documents_ = list(graphs.keys())
 elif args.selectedGraph:
-    selected_documents_=json.load(open(args.selectedGraph))
+    selected_documents_ = json.load(open(args.selectedGraph))
 # if args.all:
 #     selected_documents_=list(graphs.keys())
 # else:
@@ -186,36 +201,34 @@ elif args.selectedGraph:

 # Generating Evaluation Output
-top_ten_documents=[]
-final_data={}
+top_ten_documents = []
+final_data = {}

-deb=time.time()
+deb = time.time()
 print("Computing Similarity Matrix ...")
-similarity_matrix = funcDict[args.distance](graphs_array,selected_documents_)
-print("Similarity Matrix Computed in {0} s.".format(time.time()-deb))
-
-graphs={}
-for file in glob.glob(original_dir.rstrip("/")+"/*.gexf"):
-    id=int(re.findall("\d+",file)[-1])
-    graphs[id]=nx.read_gexf(file)
-
+similarity_matrix = funcDict[args.distance](graphs_array, selected_documents_)
+print("Similarity Matrix Computed in {0} s.".format(time.time() - deb))
+graphs = {}
+for file in glob.glob(original_dir.rstrip("/") + "/*.gexf"):
+    id = int(re.findall("\d+", file)[-1])
+    graphs[id] = nx.read_gexf(file)
 nn_ = 5
-with ProgressBar(max_value=len(selected_documents_),widgets=[' [', Timer(), '] ',Bar(),' (', ETA(), ') ',]) as pg:
-    inc=0
+with ProgressBar(max_value=len(selected_documents_), widgets=[' [', Timer(), '] ', Bar(), ' (', ETA(), ') ', ]) as pg:
+    inc = 0
     for doc_s in selected_documents_:
-        if not len(graphs[doc_s])>0:
+        if not len(graphs[doc_s]) > 0:
             continue
-        bow_score=similarity_matrix[doc_s]
+        bow_score = similarity_matrix[doc_s]
         top_docs_score = np.sort(bow_score).astype(float)
         top_docs = np.argsort(bow_score).astype(int)
-        final_data[doc_s]={
-            "sp_entities":get_associated_es(graphs[doc_s].nodes()),
-            "text":texts[doc_s],
+        final_data[doc_s] = {
+            "sp_entities": get_associated_es(graphs[doc_s].nodes()),
+            "text": texts[doc_s],
         }
-        final_data[doc_s]["edges"] = getEdges4Draw(final_data[doc_s]["sp_entities"],graphs[doc_s].edges(data=True))
+        final_data[doc_s]["edges"] = getEdges4Draw(final_data[doc_s]["sp_entities"], graphs[doc_s].edges(data=True))
         final_data[doc_s]["topk"] = []
         n_top_docs = len(top_docs)
         for d in range(n_top_docs):
@@ -223,22 +236,25 @@ with ProgressBar(max_value=len(selected_documents_),widgets=[' [', Timer(), '] '
                 continue
             if len(final_data[doc_s]["topk"]) == nn_:
                 break
-            doc_data={}
-            doc_data["score"]=top_docs_score[d]
-            doc_data["id_txt"]=int(top_docs[d])
-            doc_data["text"]=""#texts[int(top_10_docs[d])]
-            doc_data["sp_entities"]=get_associated_es(graphs[doc_data["id_txt"]].nodes())
-            doc_data["edges"]=getEdges4Draw(doc_data["sp_entities"],graphs[doc_data["id_txt"]].edges(data=True))
-            doc_data["relevant"]=None
+            doc_data = {}
+            doc_data["score"] = top_docs_score[d]
+            doc_data["id_txt"] = int(top_docs[d])
+            doc_data["text"] = ""  # texts[int(top_10_docs[d])]
+            doc_data["sp_entities"] = get_associated_es(graphs[doc_data["id_txt"]].nodes())
+            doc_data["edges"] = getEdges4Draw(doc_data["sp_entities"], graphs[doc_data["id_txt"]].edges(data=True))
+            doc_data["relevant"] = None
             final_data[doc_s]["topk"].append(doc_data)
-        inc+=1
+        inc += 1
         pg.update(inc)

 if not args.output:
     print("Saved in gui_graph_viewer/evalTop10STR_{0}.json".format(args.distance))
-    open("gui_graph_viewer/evalTop10STR_{0}.json".format(args.distance),'w').write(json.dumps(final_data,indent=4))
+    open("gui_graph_viewer/evalTop10STR_{0}.json".format(args.distance), 'w').write(json.dumps(final_data, indent=4))
 else:
-    print("Saved in {0}/evalTop10STR_{1}.json".format(args.output,args.distance))
+    print("Saved in {0}/evalTop10STR_{1}.json".format(args.output, args.distance))
     if not os.path.exists(args.output):
         os.makedirs(args.output)
-    open("{0}/evalTop10STR_{1}.json".format(args.output.rstrip("/"),args.distance), 'w').write(json.dumps(final_data, indent=4))
+    open("{0}/evalTop10STR_{1}.json".format(args.output.rstrip("/"), args.distance), 'w').write(
+        json.dumps(final_data, indent=4))
+
+
diff --git a/exp_22_may.sh b/exp_22_may.sh
index ae037f3..1ef7e44 100755
--- a/exp_22_may.sh
+++ b/exp_22_may.sh
@@ -1,6 +1,7 @@
 #!/usr/bin/env bash

 path_csv=/Users/jacquesfize/LOD_DATASETS/disambiguation
+path_texts=/Users/jacquesfize/LOD_DATASETS/raw_bvlac/
 output_dir=data/graph_exp_may_25

 if [ "$1" == "generate" ]; then
@@ -22,44 +23,48 @@ fi

 if [ "$1" == "eval" ]; then
     ## Normal STR eval
-    original=data/graph_exp_may_24/normal
+    original=$output_dir/normal;
     dir=normal;
-    mesure=("MCS" "VEO" "JACCARD" "HED" "GREEDY" "GED" "BOC" "BOWSE");
-    for me in ${mesure[@]}; do
-        echo $me" for STR "$dir;
-        python3 eval.py "$me" "$path_texts" "$output_dir/$dir" "$output_dir/$dir/asso.json" "$original" -s "$output_dir/selected.json" -o "$output_dir/result_eval/$dir/";
-    done;
+    #mesure=("MCS" "VEO" "JACCARD" "HED" "GREEDY" "GED" "BOC" "BOWSE");
+#    mesure=("BOC" "JACCARD");
+#    for me in ${mesure[@]}; do
+#        echo $me" for STR "$dir;
+#        python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
+#    done;

-    ## Generalised STR eval
-    dir=gen_all_1
-    mesure=( "MCS" "VEO" "JACCARD" "HED" "GREEDY" "GED" "BOC" "BOWSE");
-    for me in ${mesure[@]}; do
-        echo $me" for STR "$dir;
-        python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
-    done;
+#    ## Generalised STR eval
+#    dir=gen_all_1
+#    mesure=( "MCS" "VEO" "JACCARD" "HED" "GREEDY" "GED" "BOC" "BOWSE");
+#    for me in ${mesure[@]}; do
+#        echo $me" for STR "$dir;
+#        python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
+#    done;

 #    dir=gen_all_2
 #    mesure=( "MCS" "VEO" "JACCARD" "HED" "GREEDY" "BOC" "BOWSE");
 #    for me in ${mesure[@]}; do
 #        echo $me" for STR "$dir;
 #        python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
+#    done;

+    #mesure=("MCS" "VEO" "JACCARD" "HED" "GREEDY" "GED" "BOWSE");
+#    dir=gen_region
+#    for me in ${mesure[@]}; do
+#        echo $me" for STR "$dir;
+#        python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
 #    done;
-    dir=gen_region
-    for me in ${mesure[@]}; do
-        echo $me" for STR "$dir;
-        python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
-    done;
-
-    dir=gen_country
-    for me in ${mesure[@]}; do
-        echo $me" for STR "$dir;
-        python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
-    done;
+#    "BOWSE");
+    mesure=("MCS" "VEO" "JACCARD" "HED" "GREEDY" "GED" "BOC");
+#    dir=gen_country
+#    for me in ${mesure[@]}; do
+#        echo $me" for STR "$dir;
+#        python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json
-o $output_dir/result_eval/$dir/; +# done;bn ## Extended STR eval + # "BOWSE"); dir=extension_1 - mesure=( "MCS" "VEO" "JACCARD" "BOC" "WLSUBTREE" "BOWSE"); + mesure=( "MCS" "VEO" "JACCARD" "BOC" "HED" "GREEDY" "GED"); for me in ${mesure[@]}; do echo $me" for STR "$dir; python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/; diff --git a/gmatch4py/data/source/source1.txt b/gmatch4py/data/source/source1.txt deleted file mode 100644 index de26da6..0000000 --- a/gmatch4py/data/source/source1.txt +++ /dev/null @@ -1,36 +0,0 @@ -{ - "id": "source1.txt-1", - "sentenceNumber": 1, - "length": 17, - "tokens": [{ - "id": "1", - "lemma": "Haakon", - "deprel": "nsubj", - "word": "Haakon", - "rel": "4", - "pos": "NNP" - }, { - "id": "2", - "lemma": "be", - "deprel": "cop", - "word": "is", - "rel": "4", - "pos": "VBZ" - }, { - "id": "3", - "lemma": "my", - "deprel": "poss", - "word": "my", - "rel": "4", - "pos": "PRP$" - }, { - "id": "4", - "lemma": "name", - "deprel": "null", - "word": "name", - "rel": "0", - "pos": "NN" - }], - "filename": "source1.txt", - "offset": 0 -} diff --git a/gmatch4py/exception/__init__.py b/gmatch4py/exception/__init__.py index 1d997a2..950f635 100644 --- a/gmatch4py/exception/__init__.py +++ b/gmatch4py/exception/__init__.py @@ -1,7 +1 @@ -# coding = utf-8 -from termcolor import colored -class NotFoundDistance(Exception): - def __init__(self,dd,distanceFunctionDict): - # Call the base class constructor with the parameters it needs - super(Exception, self).__init__(colored("{0} is not an edit distance implemented ! Select a distance from : {1}".format(dd,",".join(distanceFunctionDict.keys())),"red")) - +# coding = utf-8 \ No newline at end of file diff --git a/gmatch4py/ged/algorithm/abstract_graph_edit_dist.py b/gmatch4py/ged/algorithm/abstract_graph_edit_dist.py deleted file mode 100644 index e0a1d3b..0000000 --- a/gmatch4py/ged/algorithm/abstract_graph_edit_dist.py +++ /dev/null @@ -1,112 +0,0 @@ -# -*- coding: UTF-8 -*- -from __future__ import print_function - -import sys - -import numpy as np -from scipy.optimize import linear_sum_assignment - - -class AbstractGraphEditDistance(object): - def __init__(self, g1, g2,debug=False,**kwargs): - self.g1 = g1 - self.g2 = g2 - self.debug=debug - - self.node_del = kwargs.get("node_del",1) - self.node_ins = kwargs.get("node_ins",1) - self.edge_del = kwargs.get("edge_del",1) - self.edge_ins = kwargs.get("edge_ins",1) - - - def distance(self): - opt_path = self.edit_costs() - if self.debug: - print("Edit path for ",str(self.__class__.__name__),"\n",opt_path) - return sum(opt_path) - - def print_operations(self,cost_matrix,row_ind,col_ind): - nodes1 = self.g1.nodes() - nodes2 = self.g2.nodes() - dn1 = self.g1.node - dn2 = self.g2.node - - n,m=len(nodes1),len(nodes2) - for i in range(len(row_ind)): - y,x=row_ind[i],col_ind[i] - val=cost_matrix[row_ind[i]][col_ind[i]] - if x<m and y<n: - print("SUB {0} to {1} cost = {2}".format(dn1[nodes1[y]]["label"],dn2[nodes2[x]]["label"],val)) - elif x <m and y>=n: - print("ADD {0} cost = {1}".format(dn2[nodes2[y-n]]["label"],val)) - elif x>=m and y<n: - print("DEL {0} cost = {1}".format(dn1[nodes1[m-x]]["label"],val)) - - def edit_costs(self): - cost_matrix = self.create_cost_matrix() - if self.debug: - np.set_printoptions(precision=3) - print("Cost Matrix for ",str(self.__class__.__name__),"\n",cost_matrix) - - row_ind,col_ind = linear_sum_assignment(cost_matrix) - if self.debug: - 
self.print_operations(cost_matrix,row_ind,col_ind) - return [cost_matrix[row_ind[i]][col_ind[i]] for i in range(len(row_ind))] - - def create_cost_matrix(self): - """ - Creates a |N+M| X |N+M| cost matrix between all nodes in - graphs g1 and g2 - Each cost represents the cost of substituting, - deleting or inserting a node - The cost matrix consists of four regions: - - substitute | insert costs - ------------------------------- - delete | delete -> delete - - The delete -> delete region is filled with zeros - """ - n = len(self.g1) - m = len(self.g2) - cost_matrix = np.zeros((n+m,n+m)) - #cost_matrix = [[0 for i in range(n + m)] for j in range(n + m)] - nodes1 = self.g1.nodes() - nodes2 = self.g2.nodes() - - for i in range(n): - for j in range(m): - cost_matrix[i,j] = self.substitute_cost(nodes1[i], nodes2[j]) - - for i in range(m): - for j in range(m): - cost_matrix[i+n,j] = self.insert_cost(i, j, nodes2) - - for i in range(n): - for j in range(n): - cost_matrix[j,i+m] = self.delete_cost(i, j, nodes1) - - self.cost_matrix = cost_matrix - return cost_matrix - - def insert_cost(self, i, j): - raise NotImplementedError - - def delete_cost(self, i, j): - raise NotImplementedError - - def substitute_cost(self, nodes1, nodes2): - raise NotImplementedError - - def print_matrix(self): - print("cost matrix:") - print(self.g1.nodes()) - print(self.g2.nodes()) - print(np.array(self.create_cost_matrix())) - for column in self.create_cost_matrix(): - for row in column: - if row == sys.maxsize: - print ("inf\t") - else: - print ("%.2f\t" % float(row)) - print("") diff --git a/gmatch4py/ged/algorithm/edge_edit_dist.py b/gmatch4py/ged/algorithm/edge_edit_dist.py deleted file mode 100644 index 74ef2e9..0000000 --- a/gmatch4py/ged/algorithm/edge_edit_dist.py +++ /dev/null @@ -1,29 +0,0 @@ -import sys - -from gmatch4py_old.ged.algorithm.abstract_graph_edit_dist import AbstractGraphEditDistance - - -class EdgeEditDistance(AbstractGraphEditDistance): - """ - Calculates the graph edit distance between two edges. - A node in this context is interpreted as a graph, - and edges are interpreted as nodes. - """ - - def __init__(self, g1, g2,**kwargs): - AbstractGraphEditDistance.__init__(self, g1, g2,**kwargs) - - def insert_cost(self, i, j, nodes2): - if i == j: - return self.edge_ins - return sys.maxsize - - def delete_cost(self, i, j, nodes1): - if i == j: - return self.edge_del - return sys.maxsize - - def substitute_cost(self, edge1, edge2): - if edge1 == edge2: - return 0. 
- return self.edge_del+self.edge_ins diff --git a/gmatch4py/ged/algorithm/graph_edit_dist.py b/gmatch4py/ged/algorithm/graph_edit_dist.py deleted file mode 100644 index 210e15e..0000000 --- a/gmatch4py/ged/algorithm/graph_edit_dist.py +++ /dev/null @@ -1,71 +0,0 @@ -# -*- coding: UTF-8 -*- - -import sys - -import networkx as nx - -from gmatch4py_old.ged.algorithm.abstract_graph_edit_dist import AbstractGraphEditDistance -from gmatch4py_old.ged.algorithm.edge_edit_dist import EdgeEditDistance -from gmatch4py_old.ged.graph.edge_graph import EdgeGraph - - -def compare(g1, g2, print_details=False): - ged = GraphEditDistance(g1, g2,print_details) - return ged.distance() - - -class GraphEditDistance(AbstractGraphEditDistance): - - def __init__(self, g1, g2,debug=False,**kwargs): - AbstractGraphEditDistance.__init__(self, g1, g2,debug,**kwargs) - - def substitute_cost(self, node1, node2): - return self.relabel_cost(node1, node2) + self.edge_diff(node1, node2) - - def relabel_cost(self, node1, node2): - if node1 == node2: - edges1=set(self.get_edge_multigraph(self.g1,node1)) - edges2=set(self.get_edge_multigraph(self.g2,node2)) - return abs(len(edges2.difference(edges1))) # Take in account if there is a different number of edges - else: - return self.node_ins+self.node_del - - def delete_cost(self, i, j, nodes1): - if i == j: - return self.node_del+self.g1.degree(nodes1[i]) # Deleting a node implicate to delete in and out edges - return sys.maxsize - - def insert_cost(self, i, j, nodes2): - if i == j: - deg=self.g2.degree(nodes2[j]) - if isinstance(deg,dict):deg=0 - return self.node_ins+deg - else: - return sys.maxsize - - def get_edge_multigraph(self,g,node): - edges=[] - for id_,val in g.edge[node].items(): - if not 0 in val: - edges.append(str(id_) + val["color"]) - else: - for _,edge in val.items(): - edges.append(str(id_)+edge["color"]) - return edges - - def edge_diff(self, node1, node2): - if isinstance(self.g1,nx.MultiDiGraph): - edges1 = self.get_edge_multigraph(self.g1,node1) - edges2 = self.get_edge_multigraph(self.g2,node2) - else: - edges1 = list(self.g1.edge[node1].keys()) - edges2 = list(self.g2.edge[node2].keys()) - if len(edges1) == 0 or len(edges2) == 0: - return max(len(edges1), len(edges2)) - - edit_edit_dist = EdgeEditDistance( - EdgeGraph(node1,edges1), - EdgeGraph(node2,edges2), - edge_del=self.edge_del,edge_ins=self.edge_ins,node_ins=self.node_ins,node_del=self.node_del - ) - return edit_edit_dist.distance() diff --git a/gmatch4py/ged/approximate_ged.py b/gmatch4py/ged/approximate_ged.py deleted file mode 100644 index d77f522..0000000 --- a/gmatch4py/ged/approximate_ged.py +++ /dev/null @@ -1,20 +0,0 @@ -# coding = utf-8 - -import numpy as np - -from .algorithm.graph_edit_dist import GraphEditDistance - - -class ApproximateGraphEditDistance(): - __type__ = "dist" - - @staticmethod - def compare(listgs,c_del_node=1,c_del_edge=1,c_ins_node=1,c_ins_edge=1): - n= len(listgs) - comparison_matrix = np.zeros((n,n)) - for i in range(n): - for j in range(i,n): - comparison_matrix[i,j]= GraphEditDistance(listgs[i],listgs[j],False,node_del=c_del_node,node_ins=c_ins_node,edge_del=c_del_edge,edge_ins=c_ins_edge).distance() - comparison_matrix[j,i]= comparison_matrix[i,j] # Unethical ! Since AGED is not a symmetric similarity measure ! 
- - return comparison_matrix \ No newline at end of file diff --git a/gmatch4py/ged/bipartite_graph_matching_2.py b/gmatch4py/ged/bipartite_graph_matching_2.py deleted file mode 100644 index e63d7ff..0000000 --- a/gmatch4py/ged/bipartite_graph_matching_2.py +++ /dev/null @@ -1,147 +0,0 @@ -# coding = utf-8 -import numpy as np - - -class BP_2(): - """ - - """ - __type__="dist" - @staticmethod - def compare(listgs, c_del_node=1, c_del_edge=1, c_ins_node=1, c_ins_edge=1): - n = len(listgs) - comparator = BP_2(c_del_node, c_ins_node, c_del_edge, c_ins_edge) - comparison_matrix = np.zeros((n, n)) - for i in range(n): - for j in range(i, n): - comparison_matrix[i, j] = comparator.bp2(listgs[i], listgs[j]) - comparison_matrix[j, i] = comparison_matrix[i, j] - - - - - return comparison_matrix - - def __init__(self, node_del=1, node_ins=1, edge_del=1, edge_ins=1): - """Constructor for HED""" - self.node_del = node_del - self.node_ins = node_ins - self.edge_del = edge_del - self.edge_ins = edge_ins - - def bp2(self, g1, g2): - """ - Compute de Hausdorff Edit Distance - :param g1: first graph - :param g2: second graph - :return: - """ - return min(self.distance(self.psi(g1,g2)),self.distance(self.psi(g2,g1))) - - def distance(self,e): - return np.sum(e) - - def psi(self,g1,g2): - psi_=[] - nodes1 = g1.nodes() - nodes2 = g2.nodes() - for u in nodes1: - v=None - for w in nodes2: - if 2*self.fuv(g1,g2,u,w) < self.fuv(g1,g2,u,None) + self.fuv(g1,g2,None,w)\ - and self.fuv(g1,g2,u,w) < self.fuv(g1,g2,u,v): - v=w - psi_.append(self.fuv(g1,g2,u,v)) - if u: - nodes1= list(set(nodes1).difference(set([u]))) - if v: - nodes2= list(set(nodes2).difference(set([v]))) - for v in nodes2: - psi_.append(self.fuv(g1,g2,None,v)) - return psi_ - - - def fuv(self, g1, g2, n1, n2): - """ - Compute the Node Distance function - :param g1: first graph - :param g2: second graph - :param n1: node of the first graph - :param n2: node of the second graph - :return: - """ - if n2 == None: # Del - return self.node_del + ((self.edge_del / 2) * g1.degree(n1)) - if n1 == None: # Insert - return self.node_ins + ((self.edge_ins / 2) * g2.degree(n2)) - else: - if n1 == n2: - return 0. 
- return (self.node_del + self.node_ins + self.hed_edge(g1, g2, n1, n2)) / 2 - - def hed_edge(self, g1, g2, n1, n2): - """ - Compute HEDistance between edges of n1 and n2, respectively in g1 and g2 - :param g1: first graph - :param g2: second graph - :param n1: node of the first graph - :param n2: node of the second graph - :return: - """ - return self.sum_gpq(g1, n1, g2, n2) + self.sum_gpq(g1, n1, g2, n2) - - def get_edge_multigraph(self, g, node): - """ - Get list of edge around a node in a Multigraph - :param g: multigraph - :param node: node in the multigraph - :return: - """ - edges = [] - for edge in g.edges(data=True): - if node == edge[0] or node == edge[1]: - edges.append("{0}-{1}-{2}".format(edge[0],edge[1],edge[2]["color"])) - return edges - - def sum_gpq(self, g1, n1, g2, n2): - """ - Compute Nearest Neighbour Distance between edges around n1 in G1 and edges around n2 in G2 - :param g1: first graph - :param n1: node in the first graph - :param g2: second graph - :param n2: node in the second graph - :return: - """ - - #if isinstance(g1, nx.MultiDiGraph): - edges1 = self.get_edge_multigraph(g1, n1) - edges2 = self.get_edge_multigraph(g2, n2) - #else: - #print(1) - #edges1 = [str(n1 + "-" + ef) for ef in list(g1.edge[n1].keys())] - #edges2 = [str(n2 + "-" + ef) for ef in list(g2.edge[n2].keys())] - edges2.extend([None]) - min_sum = np.zeros(len(edges1)) - for i in range(len(edges1)): - min_i = np.zeros(len(edges2)) - for j in range(len(edges2)): - min_i[j] = self.gpq(edges1[i], edges2[j]) - min_sum[i] = np.min(min_i) - return np.sum(min_sum) - - def gpq(self, e1, e2): - """ - Compute the edge distance function - :param e1: edge1 - :param e2: edge2 - :return: - """ - if e2 == None: # Del - return self.edge_del - if e1 == None: # Insert - return self.edge_ins - else: - if e1 == e2: - return 0. 
- return (self.edge_del + self.edge_ins) / 2 - diff --git a/gmatch4py/ged/graph/edge_graph.py b/gmatch4py/ged/graph/edge_graph.py deleted file mode 100644 index 24b8bda..0000000 --- a/gmatch4py/ged/graph/edge_graph.py +++ /dev/null @@ -1,16 +0,0 @@ -# -*- coding: UTF-8 -*- - - -class EdgeGraph(): - - def __init__(self, init_node, nodes): - self.init_node=init_node - self.nodes_ = nodes - self.edge=nodes - def nodes(self): - return self.nodes_ - - def size(self): - return len(self.nodes) - def __len__(self): - return len(self.nodes_) diff --git a/gmatch4py/ged/greedy_edit_distance.py b/gmatch4py/ged/greedy_edit_distance.py deleted file mode 100644 index a4b148e..0000000 --- a/gmatch4py/ged/greedy_edit_distance.py +++ /dev/null @@ -1,44 +0,0 @@ -# coding = utf-8 -import numpy as np - -from .algorithm.graph_edit_dist import GraphEditDistance - - -class GreedyEditDistance(GraphEditDistance): - """ - Implementation of the Greedy Edit Distance presented in : - - Improved quadratic time approximation of graph edit distance by Hausdorff matching and greedy assignement - Andreas Fischer, Kaspar Riesen, Horst Bunke - 2016 - """ - __type__ = "dist" - @staticmethod - def compare(listgs, c_del_node=1, c_del_edge=1, c_ins_node=1, c_ins_edge=1): - n = len(listgs) - comparison_matrix = np.zeros((n, n)) - for i in range(n): - for j in range(i, n): - comparison_matrix[i, j] = GreedyEditDistance(listgs[i], listgs[j],False, node_del=c_del_node, - node_ins=c_ins_node, edge_del=c_del_edge, - edge_ins=c_ins_edge).distance() - comparison_matrix[j, i] = comparison_matrix[i, j] - - - return comparison_matrix - - def __init__(self,g1,g2,debug=False,**kwargs): - """Constructor for GreedyEditDistance""" - super().__init__(g1,g2,debug,**kwargs) - - - def edit_costs(self): - cost_matrix=self.create_cost_matrix() - cost_matrix_2=cost_matrix.copy() - psi=[] - for i in range(len(cost_matrix)): - phi_i=np.argmin((cost_matrix[i])) - cost_matrix=np.delete(cost_matrix,phi_i,1) - psi.append([i,phi_i+i]) #+i to compensate the previous column deletion - return [cost_matrix_2[psi[i][0]][psi[i][1]] for i in range(len(psi))] - diff --git a/gmatch4py/ged/hausdorff_edit_distance.py b/gmatch4py/ged/hausdorff_edit_distance.py deleted file mode 100644 index e3e24c9..0000000 --- a/gmatch4py/ged/hausdorff_edit_distance.py +++ /dev/null @@ -1,145 +0,0 @@ -# coding = utf-8 - -import numpy as np - - -class HED(): - """ - Implementation of Hausdorff Edit Distance described in - - Improved quadratic time approximation of graph edit distance by Hausdorff matching and greedy assignement - Andreas Fischer, Kaspar Riesen, Horst Bunke - 2016 - """ - __type__ = "dist" - @staticmethod - def compare(listgs, c_del_node=1, c_del_edge=1, c_ins_node=1, c_ins_edge=1): - n = len(listgs) - comparator = HED(c_del_node, c_ins_node, c_del_edge, c_ins_edge) - comparison_matrix = np.zeros((n, n)) - for i in range(n): - for j in range(i, n): - comparison_matrix[i, j] = comparator.hed(listgs[i], listgs[j]) - comparison_matrix[j, i] = comparison_matrix[i, j] - - return comparison_matrix - - - def __init__(self, node_del=1, node_ins=1, edge_del=1, edge_ins=1): - """Constructor for HED""" - self.node_del = node_del - self.node_ins = node_ins - self.edge_del = edge_del - self.edge_ins = edge_ins - - def hed(self, g1, g2): - """ - Compute de Hausdorff Edit Distance - :param g1: first graph - :param g2: second graph - :return: - """ - return self.sum_fuv(g1, g2) + self.sum_fuv(g2, g1) - - def sum_fuv(self, g1, g2): - """ - Compute Nearest Neighbour Distance between G1 and 
G2 - :param g1: First Graph - :param g2: Second Graph - :return: - """ - min_sum = np.zeros(len(g1)) - nodes1 = g1.nodes() - nodes2 = g2.nodes() - nodes2.extend([None]) - for i in range(len(nodes1)): - min_i = np.zeros(len(nodes2)) - for j in range(len(nodes2)): - min_i[j] = self.fuv(g1, g2, nodes1[i], nodes2[j]) - min_sum[i] = np.min(min_i) - return np.sum(min_sum) - - def fuv(self, g1, g2, n1, n2): - """ - Compute the Node Distance function - :param g1: first graph - :param g2: second graph - :param n1: node of the first graph - :param n2: node of the second graph - :return: - """ - if n2 == None: # Del - return self.node_del + ((self.edge_del / 2) * g1.degree(n1)) - if n1 == None: # Insert - return self.node_ins + ((self.edge_ins / 2) * g2.degree(n2)) - else: - if n1 == n2: - return 0. - return (self.node_del + self.node_ins + self.hed_edge(g1, g2, n1, n2)) / 2 - - def hed_edge(self, g1, g2, n1, n2): - """ - Compute HEDistance between edges of n1 and n2, respectively in g1 and g2 - :param g1: first graph - :param g2: second graph - :param n1: node of the first graph - :param n2: node of the second graph - :return: - """ - return self.sum_gpq(g1, n1, g2, n2) + self.sum_gpq(g1, n1, g2, n2) - - def get_edge_multigraph(self, g, node): - """ - Get list of edge around a node in a Multigraph - :param g: multigraph - :param node: node in the multigraph - :return: - """ - edges = [] - for edge in g.edges(data=True): - if node == edge[0] or node == edge[1]: - edges.append("{0}-{1}-{2}".format(edge[0],edge[1],edge[2]["color"])) - return edges - - def sum_gpq(self, g1, n1, g2, n2): - """ - Compute Nearest Neighbour Distance between edges around n1 in G1 and edges around n2 in G2 - :param g1: first graph - :param n1: node in the first graph - :param g2: second graph - :param n2: node in the second graph - :return: - """ - - #if isinstance(g1, nx.MultiDiGraph): - edges1 = self.get_edge_multigraph(g1, n1) - edges2 = self.get_edge_multigraph(g2, n2) - - #else: - #edges1 = [str(n1 + "-" + ef) for ef in list(g1.edge[n1].keys())] - #edges2 = [str(n2 + "-" + ef) for ef in list(g2.edge[n2].keys())] - - min_sum = np.zeros(len(edges1)) - edges2.extend([None]) - for i in range(len(edges1)): - min_i = np.zeros(len(edges2)) - for j in range(len(edges2)): - min_i[j] = self.gpq(edges1[i], edges2[j]) - min_sum[i] = np.min(min_i) - return np.sum(min_sum) - - def gpq(self, e1, e2): - """ - Compute the edge distance function - :param e1: edge1 - :param e2: edge2 - :return: - """ - if e2 == None: # Del - return self.edge_del - if e1 == None: # Insert - return self.edge_ins - else: - if e1 == e2: - return 0 - return (self.edge_del + self.edge_ins) / 2 diff --git a/gmatch4py/helpers/__init__.py b/gmatch4py/helpers/__init__.py deleted file mode 100644 index 950f635..0000000 --- a/gmatch4py/helpers/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# coding = utf-8 \ No newline at end of file diff --git a/gmatch4py/helpers/networkx_parser.py b/gmatch4py/helpers/networkx_parser.py deleted file mode 100644 index d67049a..0000000 --- a/gmatch4py/helpers/networkx_parser.py +++ /dev/null @@ -1,148 +0,0 @@ -# coding = utf-8 - -import networkx as nx -import graph_tool as gt - - - -def get_prop_type(value, key=None): - """ - Performs typing and value conversion for the graph_tool PropertyMap class. - If a key is provided, it also ensures the key is in a format that can be - used with the PropertyMap. 
Returns a tuple, (type name, value, key) - """ - # Deal with the value - if isinstance(value, bool): - tname = 'bool' - - elif isinstance(value, int): - tname = 'float' - value = float(value) - - elif isinstance(value, float): - tname = 'float' - - elif isinstance(value, str): - tname = 'string' - value = str(value) - - elif isinstance(value, dict): - tname = 'object' - - else: - tname = 'string' - value = str(value) - - return tname, value, key - - -def nx2gt(nxG): - """ - Converts a networkx graph to a graph-tool graph. - """ - # Phase 0: Create a directed or undirected graph-tool Graph - gtG = gt.Graph(directed=nxG.is_directed()) - - # Add the Graph properties as "internal properties" - for key, value in nxG.graph.items(): - # Convert the value and key into a type for graph-tool - tname, value, key = get_prop_type(value, key) - - prop = gtG.new_graph_property(tname) # Create the PropertyMap - gtG.graph_properties[key] = prop # Set the PropertyMap - gtG.graph_properties[key] = value # Set the actual value - - # Phase 1: Add the vertex and edge property maps - # Go through all nodes and edges and add seen properties - # Add the node properties first - nprops = set() # cache keys to only add properties once - for node, data in nxG.nodes_iter(data=True): - - # Go through all the properties if not seen and add them. - for key, val in data.items(): - if key in nprops: continue # Skip properties already added - - # Convert the value and key into a type for graph-tool - tname, _, key = get_prop_type(val, key) - - prop = gtG.new_vertex_property(tname) # Create the PropertyMap - gtG.vertex_properties[key] = prop # Set the PropertyMap - - # Add the key to the already seen properties - nprops.add(key) - - # Also add the node id: in NetworkX a node can be any hashable type, but - # in graph-tool node are defined as indices. So we capture any strings - # in a special PropertyMap called 'id' -- modify as needed! - gtG.vertex_properties['id'] = gtG.new_vertex_property('string') - - # Add the edge properties second - eprops = set() # cache keys to only add properties once - for src, dst, data in nxG.edges_iter(data=True): - - # Go through all the edge properties if not seen and add them. - for key, val in data.items(): - if key in eprops: continue # Skip properties already added - - # Convert the value and key into a type for graph-tool - tname, _, key = get_prop_type(val, key) - - prop = gtG.new_edge_property(tname) # Create the PropertyMap - gtG.edge_properties[key] = prop # Set the PropertyMap - - # Add the key to the already seen properties - eprops.add(key) - - # Phase 2: Actually add all the nodes and vertices with their properties - # Add the nodes - vertices = {} # vertex mapping for tracking edges later - for node, data in nxG.nodes_iter(data=True): - - # Create the vertex and annotate for our edges later - v = gtG.add_vertex() - vertices[node] = v - - # Set the vertex properties, not forgetting the id property - data['id'] = str(node) - for key, value in data.items(): - gtG.vp[key][v] = value # vp is short for vertex_properties - - # Add the edges - for src, dst, data in nxG.edges_iter(data=True): - - # Look up the vertex structs from our vertices mapping and add edge. - e = gtG.add_edge(vertices[src], vertices[dst]) - - # Add the edge properties - for key, value in data.items(): - gtG.ep[key][e] = value # ep is short for edge_properties - - # Done, finally! 
- return gtG - - -if __name__ == '__main__': - - # Create the networkx graph - nxG = nx.Graph(name="Undirected Graph") - nxG.add_node("v1", name="alpha", color="red") - nxG.add_node("v2", name="bravo", color="blue") - nxG.add_node("v3", name="charlie", color="blue") - nxG.add_node("v4", name="hub", color="purple") - nxG.add_node("v5", name="delta", color="red") - nxG.add_node("v6", name="echo", color="red") - - nxG.add_edge("v1", "v2", weight=0.5, label="follows") - nxG.add_edge("v1", "v3", weight=0.25, label="follows") - nxG.add_edge("v2", "v4", weight=0.05, label="follows") - nxG.add_edge("v3", "v4", weight=0.35, label="follows") - nxG.add_edge("v5", "v4", weight=0.65, label="follows") - nxG.add_edge("v6", "v4", weight=0.53, label="follows") - nxG.add_edge("v5", "v6", weight=0.21, label="follows") - - for item in nxG.edges_iter(data=True): - print(item) - - # Convert to graph-tool graph - gtG = nx2gt(nxG) - gtG.list_properties() \ No newline at end of file diff --git a/gmatch4py/kernels/weisfeiler_lehman.py b/gmatch4py/kernels/weisfeiler_lehman.py deleted file mode 100644 index e7139c7..0000000 --- a/gmatch4py/kernels/weisfeiler_lehman.py +++ /dev/null @@ -1,136 +0,0 @@ -# coding = utf-8 - -"""Weisfeiler_Lehman graph kernel. - -Python implementation based on: "Weisfeiler-Lehman Graph Kernels", by: -Nino Shervashidze, Pascal Schweitzer, Erik J. van Leeuwen, Kurt -Mehlhorn, Karsten M. Borgwardt, JMLR, 2012. -http://jmlr.csail.mit.edu/papers/v12/shervashidze11a.html - -Author : Sandro Vega-Pons, Emanuele Olivetti -Source : https://github.com/emanuele/jstsp2015/blob/master/gk_weisfeiler_lehman.py -Modified by : Jacques Fizen -""" - -import copy - -import networkx as nx -import numpy as np - - -class WeisfeleirLehmanKernel(object): - __type__ = "sim" - @staticmethod - def compare(graph_list,h=2): - """Compute the all-pairs kernel values for a list of graphs. - This function can be used to directly compute the kernel - matrix for a list of graphs. The direct computation of the - kernel matrix is faster than the computation of all individual - pairwise kernel values. - Parameters - ---------- - graph_list: list - A list of graphs (list of networkx graphs) - h : interger - Number of iterations. - node_label : boolean - Whether to use original node labels. True for using node labels - saved in the attribute 'node_label'. False for using the node - degree of each node as node attribute. - Return - ------ - K: numpy.array, shape = (len(graph_list), len(graph_list)) - The similarity matrix of all graphs in graph_list. - """ - - n = len(graph_list) - k = [0] * (h + 1) - n_nodes = 0 - n_max = 0 - - # Compute adjacency lists and n_nodes, the total number of - # nodes in the dataset. - for i in range(n): - n_nodes += graph_list[i].number_of_nodes() - - # Computing the maximum number of nodes in the graphs. It - # will be used in the computation of vectorial - # representation. 
- if (n_max < graph_list[i].number_of_nodes()): - n_max = graph_list[i].number_of_nodes() - - phi = np.zeros((n_nodes, n), dtype=np.uint64) - - # INITIALIZATION: initialize the nodes labels for each graph - # with their labels or with degrees (for unlabeled graphs) - - labels = [0] * n - label_lookup = {} - label_counter = 0 - - # label_lookup is an associative array, which will contain the - # mapping from multiset labels (strings) to short labels - # (integers) - for i in range(n): - nodes = graph_list[i].nodes() - # It is assumed that the graph has an attribute - # 'node_label' - labels[i] = np.zeros(len(nodes), dtype=np.int32) - - for j in range(len(nodes)): - if not (nodes[j] in label_lookup): - label_lookup[nodes[j]] = str(label_counter) - labels[i][j] = label_counter - label_counter += 1 - else: - labels[i][j] = label_lookup[nodes[j]] - # labels are associated to a natural number - # starting with 0. - - phi[labels[i][j], i] += 1 - - graph_list[i]=nx.relabel_nodes(graph_list[i],label_lookup) - k = np.dot(phi.transpose(), phi).astype(np.float64) - - # MAIN LOOP - it = 0 - new_labels = copy.deepcopy(labels) # Can't work without it !!! - - while it < h: - # create an empty lookup table - label_lookup = {} - label_counter = 0 - - phi = np.zeros((n_nodes, n)) - for i in range(n): - nodes = graph_list[i].nodes() - for v in range(len(nodes)): - # form a multiset label of the node v of the i'th graph - # and convert it to a string - - long_label = [] - long_label.extend(nx.neighbors(graph_list[i],nodes[v])) - - long_label_string = "".join(long_label) - # if the multiset label has not yet occurred, add it to the - # lookup table and assign a number to it - if not (long_label_string in label_lookup): - label_lookup[long_label_string] = str(label_counter) - new_labels[i][v] = label_counter - label_counter += 1 - else: - new_labels[i][v] = label_lookup[long_label_string] - # fill the column for i'th graph in phi - aux = np.bincount(new_labels[i]) - phi[new_labels[i], i] += aux[new_labels[i]] - - k += np.dot(phi.transpose(), phi) - it = it + 1 - - # Compute the normalized version of the kernel - k_norm = np.zeros(k.shape) - for i in range(k.shape[0]): - for j in range(k.shape[1]): - k_norm[i, j] = k[i, j] / np.sqrt(k[i, i] * k[j, j]) - - return k_norm \ No newline at end of file diff --git a/gmatch4py_cython/gmatch4py/bag_of_cliques.pyx b/gmatch4py_cython/gmatch4py/bag_of_cliques.pyx index f297507..14c22da 100644 --- a/gmatch4py_cython/gmatch4py/bag_of_cliques.pyx +++ b/gmatch4py_cython/gmatch4py/bag_of_cliques.pyx @@ -6,18 +6,68 @@ from typing import Sequence import networkx as nx import numpy as np cimport numpy as np +import sys + +from networkit import graph +from networkit.clique import MaximalCliques + +def nx2nk(nxG, weightAttr=None): + """ + Convert a networkx.Graph to a NetworKit.Graph + :param weightAttr: the edge attribute which should be treated as the edge weight. 
+    """
+
+    # map networkx node ids to consecutive numerical node ids
+    idmap = dict((id, u) for (id, u) in zip(nxG.nodes(), range(nxG.number_of_nodes())))
+    z = max(idmap.values()) + 1
+    # print("z = {0}".format(z))
+
+    if weightAttr is not None:
+        nkG = graph.Graph(z, weighted=True, directed=nxG.is_directed())
+        for (u_, v_) in nxG.edges():
+            u, v = idmap[u_], idmap[v_]
+            w = nxG[u_][v_][weightAttr]
+            nkG.addEdge(u, v, w)
+    else:
+        nkG = graph.Graph(z, directed=nxG.is_directed())
+        for (u_, v_) in nxG.edges():
+            u, v = idmap[u_], idmap[v_]
+            # print(u_, v_, u, v)
+            assert (u < z)
+            assert (v < z)
+            nkG.addEdge(u, v)
+
+    assert (nkG.numberOfNodes() == nxG.number_of_nodes())
+    assert (nkG.numberOfEdges() == nxG.number_of_edges())
+    return nkG.removeSelfLoops(),idmap
+
+def getClique(nx_graph):
+    final_cliques=[]
+    if len(nx_graph) ==0:
+        return final_cliques
+    netkit_graph,idmap=nx2nk(nx_graph)
+    idmap={v:k for k,v in idmap.items()}
+    cliques=MaximalCliques(netkit_graph).run().getCliques()
+    for cl in cliques:
+        final_cliques.append(list(map(lambda x:idmap[x],cl)))
+    return final_cliques

 class BagOfCliques():

     @staticmethod
-    def compare(graphs):
+    def compare(graphs,selected):
         b=BagOfCliques()
-        bog=b.getBagOfCliques(graphs)
+        bog=b.getBagOfCliques(graphs).astype(np.float32)
         #Compute cosine similarity
-        cdef np.ndarray scores=np.dot(bog,bog.T)
+        cdef int n=bog.shape[0]
+        cdef double[:,:] scores = np.zeros((n,n))
+        cdef int i
         for i in range(len(scores)):
-            for j in range(len(scores)):
-                scores[i,j]/=(np.sqrt(np.sum(bog[i]**2))*np.sqrt(np.sum(bog[j]**2))) # Can be computed in one line
+            if not i in selected:
+                continue
+            for j in range(i,len(scores)):
+                scores[i,j]=(np.dot(bog[i],bog[j]))/(np.sqrt(np.sum(bog[i]**2))*np.sqrt(np.sum(bog[j]**2))) # Can be computed in one line
+                scores[j,i]=scores[i,j]
         return scores

     def getUniqueCliques(self,graphs):
@@ -25,15 +75,26 @@ class BagOfCliques():
         Return unique cliques from a population of graphs
         :return:
         """
-        tree = {}
+        t = {}
         c_ = 0
         cdef list clique_vocab = []
         cdef list cli_temp
         cdef list cliques
+        cdef int len_graphs=len(graphs)
+        cdef int km= -1
         for g in graphs:
-            cliques = list(nx.find_cliques(nx.Graph(g)))
+            km+=1
+            if not g:
+                continue
+            sys.stdout.write("\r{0}/{1} -- {2}".format(km,len_graphs,len(g)))
+            try:
+                cliques = list(getClique(nx.Graph(g)))
+            except:
+                #no clique found
+                print(nx.Graph(g).edges())
+                cliques =[]
             for clique in cliques:
-                t = tree
+
                 cli_temp = copy.deepcopy(clique)
                 new_clique = False
                 for i in range(len(clique)):
@@ -57,33 +118,51 @@ class BagOfCliques():

         return clique_vocab

-    def ifHaveMinor(self,G, list H):
+    def clique2str(self,cliques):
+        return "".join(sorted(cliques))
+
+    def transform_clique_vocab(self,clique_vocab):
+        cdef dict new_vocab={}
+        cdef int len_voc=len(clique_vocab)
+        for c in range(len_voc):
+            print(c)
+            new_vocab[self.clique2str(clique_vocab[c])]=c
+        return new_vocab
+
+
+    def ifHaveMinor(self,clique, dict mapping):
         """
         If a clique (minor) H belong to a graph G
         :param H:
         :return:
         """
-        if nx.Graph(G).subgraph(H).nodes() == H:
+        if self.clique2str(clique) in mapping:
             return 1
         return 0

-    def getBagOfCliques(self,graphs ):
+    def getBagOfCliques(self,graphs ):
         """
         :param clique_vocab:
         :return:
         """
         cdef list clique_vocab=self.getUniqueCliques(graphs)
-
+        print("DONE")
+        cdef dict map_str_cliques=self.transform_clique_vocab(clique_vocab)
+        print("DONE2")
         cdef int l_v=len(clique_vocab)
         cdef np.ndarray boc = np.zeros((len(graphs), l_v))
         cdef np.ndarray vector
-
+        cdef list cliques
         for g in range(len(graphs)):
+            sys.stdout.write("\r{0}/{1}".format(g,5552))
             gr = graphs[g]
             vector = np.zeros(l_v)
-            for m in range(l_v):
-                vector[m] = self.ifHaveMinor(gr, clique_vocab[m])
+            cliques = list(getClique(nx.Graph(gr)))
+            for clique in cliques:
+                hash=self.clique2str(clique)
+                if hash in map_str_cliques:
+                    vector[map_str_cliques[hash]] = 1
             boc[g] = vector
         return boc
\ No newline at end of file
diff --git a/gmatch4py_cython/gmatch4py/ged/approximate_ged.pyx b/gmatch4py_cython/gmatch4py/ged/approximate_ged.pyx
index 5a50e9e..27ea437 100644
--- a/gmatch4py_cython/gmatch4py/ged/approximate_ged.pyx
+++ b/gmatch4py_cython/gmatch4py/ged/approximate_ged.pyx
@@ -3,7 +3,7 @@
 import numpy as np

 from .algorithm.graph_edit_dist import GraphEditDistance
-
+from cython.parallel import prange

 class ApproximateGraphEditDistance():
     __type__ = "dist"
@@ -11,20 +11,23 @@ class ApproximateGraphEditDistance():
     @staticmethod
     def compare(listgs,selected,c_del_node=1,c_del_edge=1,c_ins_node=1,c_ins_edge=1):
         cdef int n= len(listgs)
-        comparison_matrix = np.zeros((n,n))
-        for i in range(n):
+        cdef double[:,:] comparison_matrix = np.zeros((n,n))
+        cdef int i,j
+        for i in prange(n,nogil=True):
             for j in range(i,n):
-                f=True
-                if not listgs[i] or not listgs[j]:
-                    f=False
-                elif len(listgs[i])== 0 or len(listgs[j]) == 0:
-                    f=False
-                if selected:
-                    if not i in selected:
+                with gil:
+                    f=True
+                    if not listgs[i] or not listgs[j]:
+                        f=False
+                    elif len(listgs[i])== 0 or len(listgs[j]) == 0:
                         f=False
-                if f:
-                    comparison_matrix[i, j] = GraphEditDistance(listgs[i],listgs[j],False,node_del=c_del_node,node_ins=c_ins_node,edge_del=c_del_edge,edge_ins=c_ins_edge).distance()
-                else:
-                    comparison_matrix[i, j] = 0.
-                comparison_matrix[j, i] = comparison_matrix[i, j]
+                    if selected:
+                        if not i in selected:
+                            f=False
+
+                    if f:
+                        comparison_matrix[i][j] = GraphEditDistance(listgs[i],listgs[j],False,node_del=c_del_node,node_ins=c_ins_node,edge_del=c_del_edge,edge_ins=c_ins_edge).distance()
+                    else:
+                        comparison_matrix[i][j] = np.inf
+                    comparison_matrix[j][i] = comparison_matrix[i][j]
         return comparison_matrix
\ No newline at end of file
diff --git a/gmatch4py_cython/gmatch4py/ged/bipartite_graph_matching_2.pyx b/gmatch4py_cython/gmatch4py/ged/bipartite_graph_matching_2.pyx
index 772f73a..d6d68e6 100644
--- a/gmatch4py_cython/gmatch4py/ged/bipartite_graph_matching_2.pyx
+++ b/gmatch4py_cython/gmatch4py/ged/bipartite_graph_matching_2.pyx
@@ -31,7 +31,7 @@ cdef class BP_2():
                 if f:
                     comparison_matrix[i, j] = comparator.bp2(listgs[i], listgs[j])
                 else:
-                    comparison_matrix[i, j] = 0.
+                    comparison_matrix[i, j] = np.inf
                 comparison_matrix[j, i] = comparison_matrix[i, j]

         return comparison_matrix
diff --git a/gmatch4py_cython/gmatch4py/ged/greedy_edit_distance.pyx b/gmatch4py_cython/gmatch4py/ged/greedy_edit_distance.pyx
index b297bea..7752030 100644
--- a/gmatch4py_cython/gmatch4py/ged/greedy_edit_distance.pyx
+++ b/gmatch4py_cython/gmatch4py/ged/greedy_edit_distance.pyx
@@ -32,7 +32,7 @@ class GreedyEditDistance(GraphEditDistance):
                                                              node_ins=c_ins_node, edge_del=c_del_edge,
                                                              edge_ins=c_ins_edge).distance()
                 else:
-                    comparison_matrix[i, j] = 0.
+ comparison_matrix[i, j] = np.inf comparison_matrix[j, i] = comparison_matrix[i, j] diff --git a/gmatch4py_cython/gmatch4py/ged/hausdorff_edit_distance.pyx b/gmatch4py_cython/gmatch4py/ged/hausdorff_edit_distance.pyx index a3abc77..06dc664 100644 --- a/gmatch4py_cython/gmatch4py/ged/hausdorff_edit_distance.pyx +++ b/gmatch4py_cython/gmatch4py/ged/hausdorff_edit_distance.pyx @@ -25,12 +25,18 @@ cdef class HED: cdef np.ndarray comparison_matrix = np.zeros((n, n)) for i in range(n): for j in range(i, n): + f=True if not listgs[i] or not listgs[j]: - continue + f=False + elif len(listgs[i])== 0 or len(listgs[j]) == 0: + f=False if selected: if not i in selected: - continue - comparison_matrix[i, j] = comparator.hed(listgs[i], listgs[j]) + f=False + if f: + comparison_matrix[i, j] = comparator.hed(listgs[i], listgs[j]) + else: + comparison_matrix[i, j] = np.inf comparison_matrix[j, i] = comparison_matrix[i, j] return comparison_matrix diff --git a/gmatch4py_cython/gmatch4py/jaccard.pyx b/gmatch4py_cython/gmatch4py/jaccard.pyx index 2699457..894afc6 100644 --- a/gmatch4py_cython/gmatch4py/jaccard.pyx +++ b/gmatch4py_cython/gmatch4py/jaccard.pyx @@ -31,9 +31,14 @@ class Jaccard(): f=False if f: inter_ver,inter_ed = Jaccard.intersect_graph(g1,g2) - comparison_matrix[i,j]=(len(inter_ver)/len(Jaccard.union_nodes(g1,g2)))*(len(inter_ed)/len(Jaccard.union_edges(g1,g2))) + un_ver,un_edg=Jaccard.union_nodes(g1,g2),Jaccard.union_edges(g1,g2) + if len(un_ver) == 0 or len(un_edg) == 0: + comparison_matrix[i, j] = 0. + else: + comparison_matrix[i,j]=(len(inter_ver)/len(un_ver))*(len(inter_ed)/len(un_edg)) else: comparison_matrix[i, j] = 0. + comparison_matrix[j, i] = comparison_matrix[i, j] return comparison_matrix diff --git a/gmatch4py_cython/gmatch4py/kernels/weisfeiler_lehman.pyx b/gmatch4py_cython/gmatch4py/kernels/weisfeiler_lehman.pyx index 809fb27..2731f0f 100644 --- a/gmatch4py_cython/gmatch4py/kernels/weisfeiler_lehman.pyx +++ b/gmatch4py_cython/gmatch4py/kernels/weisfeiler_lehman.pyx @@ -22,7 +22,7 @@ cimport numpy as np class WeisfeleirLehmanKernel(object): __type__ = "sim" @staticmethod - def compare(graph_list,h=2): + def compare(graph_list,selected,h=2): """Compute the all-pairs kernel values for a list of graphs. This function can be used to directly compute the kernel matrix for a list of graphs. 
The direct computation of the diff --git a/gmatch4py_cython/gmatch4py/vertex_edge_overlap.pyx b/gmatch4py_cython/gmatch4py/vertex_edge_overlap.pyx index f0856b1..7888376 100644 --- a/gmatch4py_cython/gmatch4py/vertex_edge_overlap.pyx +++ b/gmatch4py_cython/gmatch4py/vertex_edge_overlap.pyx @@ -19,7 +19,7 @@ class VertexEdgeOverlap(): """ @staticmethod - def compare(list listgs): + def compare(list listgs,selected): n = len(listgs) cdef np.ndarray comparison_matrix = np.zeros((n, n)) cdef list inter_ver @@ -27,15 +27,25 @@ class VertexEdgeOverlap(): cdef int denom for i in range(n): for j in range(i,n): - g1 = listgs[i] - g2 = listgs[j] - inter_ver,inter_ed = VertexEdgeOverlap.intersect_graph(g1,g2) - denom=len(g1)+len(g2)+len(g1.edges(data=True))+len(g2.edges(data=True)) - if denom == 0: - continue - comparison_matrix[i,j]=2*(len(inter_ver)+len(inter_ed))/denom # Data = True --> For nx.MultiDiGraph - comparison_matrix[j,i]=comparison_matrix[i,j] - + f=True + if not listgs[i] or not listgs[j]: + f=False + elif len(listgs[i])== 0 or len(listgs[j]) == 0: + f=False + if selected: + if not i in selected: + f=False + if f: + g1 = listgs[i] + g2 = listgs[j] + inter_ver,inter_ed = VertexEdgeOverlap.intersect_graph(g1,g2) + denom=len(g1)+len(g2)+len(g1.edges(data=True))+len(g2.edges(data=True)) + if denom == 0: + continue + comparison_matrix[i,j]=2*(len(inter_ver)+len(inter_ed))/denom # Data = True --> For nx.MultiDiGraph + else: + comparison_matrix[i, j] = 0. + comparison_matrix[j, i] = comparison_matrix[i, j] return comparison_matrix diff --git a/gui_graph_viewer/db.py b/gui_graph_viewer/db.py index 9c159cd..eb13f76 100644 --- a/gui_graph_viewer/db.py +++ b/gui_graph_viewer/db.py @@ -73,22 +73,22 @@ class Eval(Base): __tablename__="evals" __table_args__ = {'sqlite_autoincrement': True} id = Column(Integer, primary_key=True) - id_g1 = Column(Integer,ForeignKey('graphs.id')) - id_g2 = Column(Integer, ForeignKey('graphs.id')) - mesure = Column(Integer, ForeignKey('mesures.id')) - type = Column(String(20)) + id_g1 = Column(Integer)#,ForeignKey('graphs.id')) + id_g2 = Column(Integer)#, ForeignKey('graphs.id')) + #mesure = Column(Integer, ForeignKey('mesures.id')) + #type = Column(String(20)) id_user = Column(Integer, ForeignKey('users.id')) c1_val= Column(Boolean) c2_val = Column(Boolean) c3_val = Column(Boolean) c4_val = Column(Boolean) - def __init__(self, id_g1, id_g2, mesure,type_,id_user,c1,c2,c3,c4): + def __init__(self, id_g1, id_g2,id_user,c1,c2,c3,c4): self.id_g1 = id_g1 self.id_g2 = id_g2 - self.mesure = mesure + #self.mesure = mesure self.id_user = id_user - self.type = type_ + #self.type = type_ self.c1_val = c1 self.c2_val = c2 self.c3_val = c3 @@ -97,8 +97,8 @@ class Eval(Base): return { "id_g1":self.id_g1, "id_g2": self.id_g2, - "mesure": self.mesure, - "type": self.type, + #"mesure": self.mesure, + #"type": self.type, "id_user": self.id_user, "c1": self.c1_val, "c2": self.c2_val, @@ -148,6 +148,6 @@ if __name__ == '__main__': types=["normal","extension_1","extension_2","extension_3","gen_all_1","gen_all_2", "gen_capital","gen_country","gen_region","gen_town"] add_users(session,user_input) - add_unique(session,mesures,Mesure) - add_graphs_data(session,range(532),"graph_exp_18_fev") + #add_unique(session,mesures,Mesure) + #add_graphs_data(session,range(532),"graph_exp_18_fev") diff --git a/gui_graph_viewer/server.py b/gui_graph_viewer/server.py index 15f7076..66acd7f 100644 --- a/gui_graph_viewer/server.py +++ b/gui_graph_viewer/server.py @@ -49,6 +49,22 @@ for fn in dataFiles: 
     data_[fn.replace(all_,"").rstrip(".json")]=fn
 print("File Available",data_.keys())
+# couples to annotate
+new_data=json.load(open(os.path.join(dir_,"couple_to_annotate.json")))
+current_index=0
+graph_data=json.load(open(os.path.join(dir_,"data_graphs.json")))
+
+
+def init():
+    # Resume the annotation session at the last couple saved in the database
+    global current_index
+    last_row = sql_session.query(Eval).order_by(Eval.id.desc()).first()
+    if last_row is None:
+        return  # no annotation saved yet: start from the first couple
+    last_id_g1 = last_row.id_g1
+    last_id_g2 = last_row.id_g2
+    print(last_id_g1, last_id_g2)
+    for i in range(len(new_data)):
+        if new_data[i][1] == last_id_g1 and new_data[i][0] == last_id_g2:
+            current_index = i
+            break
 
 
 @app.route("/")
@@ -73,8 +89,39 @@ def index(gmmeasure=None):
             max_[int(k)]=0.0
         if math.isnan(min_[int(k)]):
             min_[int(k)]=0.0
+    init()
     return render_template("index.html",data=json.dumps(data),measureAvailable=list(data_.keys()),measure=gmmeasure,max=max_,min=min_,dirs=available_dir,type_=current_type)
 
+
+@app.route("/annot_2")
+@login_required
+def index2():
+    current_couple=new_data[current_index]
+    progress=(current_index/len(new_data))*100
+    return render_template(
+        "indexv2.html",
+        current_couple=current_couple,
+        graphs_data_1=graph_data[str(current_couple[0])],
+        graphs_data_2=graph_data[str(current_couple[1])],
+        progress=progress
+    )
+
+
+@app.route("/couple_moins")
+@login_required
+def couple_moins(gmmeasure=None):
+    global current_index
+    if current_index-1 >= 0:
+        current_index-=1
+    return redirect("/annot_2")
+
+@app.route("/couple_plus")
+@login_required
+def couple_plus(gmmeasure=None):
+    global current_index
+    # stop at the last couple, otherwise new_data[current_index] goes out of range
+    if current_index < len(new_data)-1:
+        current_index+=1
+    return redirect("/annot_2")
 
 @app.route("/about")
 def about():
     global available_dir
@@ -112,6 +159,52 @@ def get_assoc(g1id,g2id):
             return jsonify({})
     except:
         return jsonify({})
+
+@app.route("/get_info_2/<g1id>/<g2id>")
+def getinfo_2(g1id,g2id):
+    try:
+        info=sql_session.query(Eval).filter_by(id_g1=g1id,id_g2=g2id)
+        print(info)
+        if info.count() >0:
+            return jsonify(info.first().__repr__())
+        else:
+            return jsonify({})
+    except:
+        return jsonify({})
+
+@app.route("/get_assoc_2/<g1id>/<g2id>")
+def get_assoc_2(g1id,g2id):
+    info=sql_session.query(Eval).filter_by(id_g1=g1id,id_g2=g2id)
+    print(info)
+    if info.count() >0:
+        return jsonify(info.first().__repr__())
+    else:
+        return jsonify({})
+
+
+@app.route("/save_eval_2/<g1id>/<g2id>/<int:c1>/<int:c2>/<int:c3>/<int:c4>")
+def save_eval_2(g1id,g2id,c1,c2,c3,c4):
+    c1,c2,c3,c4=bool(c1),bool(c2),bool(c3),bool(c4)
+    eval_query = sql_session.query(Eval).filter_by(
+        id_g1=g1id,
+        id_g2=g2id,
+        id_user=current_user.id
+    )
+    if eval_query.count()< 1:
+        sql_session.add(Eval(g1id,g2id,current_user.id,c1,c2,c3,c4))
+        print("ADD",g1id, g2id, c1, c2, c3, c4)
+    else:
+        print("UPD",g1id, g2id, c1, c2, c3, c4)
+        eval_=eval_query.first()
+        eval_.c1_val = c1
+        eval_.c2_val = c2
+        eval_.c3_val = c3
+        eval_.c4_val = c4
+    sql_session.commit()
+
+    return "Oh Yeah"
+
 def getMeasureid(mesure):
     mesure_query = sql_session.query(Mesure).filter_by(
         label=mesure
diff --git a/gui_graph_viewer/static/css/dashboard.css b/gui_graph_viewer/static/css/dashboard.css
index 8ffff76..c2f0b96 100644
--- a/gui_graph_viewer/static/css/dashboard.css
+++ b/gui_graph_viewer/static/css/dashboard.css
@@ -129,9 +129,13 @@ color:white !important;
 behavior: url(PIE.htc); /* remove if you don't care about IE8 */
 padding-left: 0.3em;
-padding-right: 0.3em;
+    padding-right: 0.3em;
 background: #fff;
 border: 2px solid #666;
 color: #666;
 text-align: center;
+}
+
+.map_preview{
+    height: 400px;
 }
\ No newline at end of file
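The /save_eval_2 route above is a query-then-insert-or-update ("upsert") keyed on (id_g1, id_g2, id_user). For context — this is not part of the patch — a minimal, self-contained sketch of the same pattern, with a trimmed-down stand-in for the Eval model (one criterion column instead of four):

    # Illustrative sketch of the upsert pattern used by /save_eval_2.
    from sqlalchemy import create_engine, Column, Integer, Boolean
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy.orm import sessionmaker

    Base = declarative_base()

    class EvalRow(Base):  # hypothetical, simplified stand-in for db.Eval
        __tablename__ = "evals"
        id = Column(Integer, primary_key=True)
        id_g1 = Column(Integer)
        id_g2 = Column(Integer)
        id_user = Column(Integer)
        c1_val = Column(Boolean)

    def save_eval(session, g1, g2, user, c1):
        # insert if the (g1, g2, user) annotation is absent, update it otherwise
        row = session.query(EvalRow).filter_by(id_g1=g1, id_g2=g2, id_user=user).first()
        if row is None:
            session.add(EvalRow(id_g1=g1, id_g2=g2, id_user=user, c1_val=c1))
        else:
            row.c1_val = c1
        session.commit()

    engine = create_engine("sqlite:///:memory:")
    Base.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()
    save_eval(session, 1, 2, 0, True)   # first call inserts
    save_eval(session, 1, 2, 0, False)  # second call updates in place

Because the lookup and the write are two separate statements, concurrent annotators could still race and create duplicates; a unique constraint on (id_g1, id_g2, id_user) would make the pattern robust.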
diff --git a/gui_graph_viewer/static/js/helpers.js b/gui_graph_viewer/static/js/helpers.js
index 8c70a19..676b4a3 100644
--- a/gui_graph_viewer/static/js/helpers.js
+++ b/gui_graph_viewer/static/js/helpers.js
@@ -30,7 +30,15 @@ function getColorEdge(col){
         return "#87D37C"
     }
 }
-
+// Extract [lat, lon, label] triples for the spatial entities of a graph
+function getGeoData(data_) {
+    final_ = []
+    var data2_ = data_["sp_entities"]
+    //console.log(data_);
+    for (var i in data2_) {
+        final_.push([data2_[i]["coord"][0], data2_[i]["coord"][1], data2_[i]["label"]]);
+    }
+    return final_;
+}
 
 function generate_map(id_tiles, locations, edges) {
 /*
diff --git a/gui_graph_viewer/templates/indexv2.html b/gui_graph_viewer/templates/indexv2.html
new file mode 100644
index 0000000..7c64246
--- /dev/null
+++ b/gui_graph_viewer/templates/indexv2.html
@@ -0,0 +1,189 @@
+{% extends "skeleton.html" %} {% block body %}
+    <div class="container" style="margin-top: 1em">
+        <div class="row">
+            <div class="col-lg-12">
+                <h2 class="text-center">Graph Matching Validation</h2>
+                <div class="progress" style="margin-bottom: 0.5em">
+                    <div class="progress-bar" role="progressbar" style="width: {{progress}}%" aria-valuenow="{{progress}}" aria-valuemin="0" aria-valuemax="100"></div>
+                </div>
+                <div class="row">
+                    <div class="col-lg-6" >
+                        <h2 class="text-center">Graph N°<span id="label1"></span></h2>
+                        <div id="map1" class="map_preview"></div>
+                    </div>
+                    <div class="col-lg-6" >
+                        <h2 class="text-center">Graph N°<span id="label2"></span></h2>
+                        <div id="map2" class="map_preview"></div>
+                    </div>
+
+                    <div class="col-lg-12 offset-lg-2" style="margin-top: 1em;">
+                        <div class="row">
+                            <div class="col-lg-6">
+                                <h3 class=""> G1 → G2</h3>
+                                <div id="annot_g1_g2">
+
+                                </div>
+                            </div>
+                            <div class="col-lg-6">
+                                <h3> G1 ← G2</h3>
+                                <div id="annot_g2_g1">
+
+                                </div>
+                            </div>
+                        </div>
+                        <div>
+
+                        </div>
+                        <div class="col-lg-8 text-center">
+                            <a href="#" class="btn btn-warning" id="check_0">Different</a>
+                            <a href="#" class="btn btn-warning" id="check_1">Check the 1s</a>
+                            <a href="#" class="btn btn-warning" id="check_2">Check the 2s</a>
+                            <a href="#" class="btn btn-warning" id="check_3">Check the 3s</a>
+                            <a href="#" class="btn btn-warning" id="check_4">Check the 4s</a>
+                            <br>
+                            <br>
+                            <a href="/couple_moins" id="prev_btn" class="btn btn-success">Previous Annotation</a>
+                            <a href="/couple_plus" id="next_btn" class="btn btn-success">Next Annotation</a>
+                        </div>
+
+                    </div>
+
+                </div>
+            </div>
+        </div>
+    </div>
+
+{% endblock %} {% block script %}
+<script>
+
+    var couple={{current_couple | safe}};
+    var g1={{graphs_data_1 | safe}};
+    var g2={{graphs_data_2 | safe}};
+    var point_1=getGeoData(g1);
+    var point_2=getGeoData(g2);
+
+    generate_map("map1",point_1,g1["edges"]);
+    generate_map("map2",point_2,g2["edges"]);
+    $("#label1").text(couple[0]);
+    $("#label2").text(couple[1]);
+
+    /* Retrieve the results of previous annotations */
+    var result={};
+    var bool_val=[false,false,false,false];
+    var bool_val_2=[false,false,false,false];
+
+    function get_bool_val(couple) {
+        var result = {};
+        $.ajax({
+            url: "/get_assoc_2/"+couple[0]+"/"+couple[1]+"",
+            async: false,
+            dataType: 'json',
+            success: function (json) {
+                result = json;
+            }
+        });
+        return result
+    }
+    // In case the correspondence has already been annotated
+    var result_1=get_bool_val(couple);
+    var result_2=get_bool_val([couple[1],couple[0]]);
+
+    if (!isEmpty(result_1)){
+        bool_val=[result_1.c1,result_1.c2,result_1.c3,result_1.c4];
+    }
+    if (!isEmpty(result_2)){
+        bool_val_2=[result_2.c1,result_2.c2,result_2.c3,result_2.c4];
+    }
+    var id_=couple[0]+"_"+couple[1]
+    var id_inv=couple[1]+"_"+couple[0]
+    // End of the retrieval of previous results
+    $("#annot_g1_g2").html(
+        getValidateForm(
+            bool_val,["ESS","ESC","KER","SPR"],
+            [id_,id_,id_,id_]
+        )
+    );
+    $("#annot_g2_g1").html(
+        getValidateForm(
+            bool_val_2,["ESS","ESC","KER","SPR"],
+            [id_inv,id_inv,id_inv,id_inv]
+        )
+    );
+
+    $(".criteria-checkbox").change(function () {
+        c_val=[]
+        graph_id=this.getAttribute("name").split("_");
+        console.debug(graph_id);
+        var g1=parseInt(graph_id[0]);
+        var g2= parseInt(graph_id[1]);
+        var _url="/save_eval_2/"+g1+"/"+g2;
+        $(this).parent().parent().find('input:checkbox').each(function () {
+            checked=$(this).is(':checked')
+            _url+="/"+(checked ? "1" : "0");
+        });
+
+        console.log(_url);
+        $.ajax(_url).done(function () {
+            console.log("Saved Successfully")
+        });
+    });
+
+    // Check the first n criteria checkboxes of both annotation forms
+    function automatic_check(n){
+        n-=1;
+        var i =0;
+        $("input[name*='"+id_+"']").each(function () {
+            if (i > n){
+                return false;
+            }
+            $(this).prop("checked", true).change();
+            i+=1;
+        });
+        i=0;
+        $("input[name*='"+id_inv+"']").each(function () {
+            if (i > n){
+                return false;
+            }
+            $(this).prop("checked", true).change();
+            i+=1;
+        });
+    }
+    // "Different": toggle checkboxes on then off so that an all-false
+    // annotation is saved for both directions
+    function no_sim_check(){
+        n=1;
+        var i =0;
+        $("input[name*='"+id_+"']").each(function () {
+            if (i > n){
+                return false;
+            }
+            $(this).prop("checked", true).change();
+            $(this).prop("checked", false).change();
+            i+=1;
+        });
+        i=0;
+        $("input[name*='"+id_inv+"']").each(function () {
+            if (i > n){
+                return false;
+            }
+            $(this).prop("checked", true).change();
+            $(this).prop("checked", false).change();
+            i+=1;
+        });
+    }
+
+    $("#check_0").click(function () {
+        no_sim_check();
+    });
+    $("#check_1").click(function () {
+        automatic_check(1);
+    });
+    $("#check_2").click(function () {
+        automatic_check(2);
+    });
+    $("#check_3").click(function () {
+        automatic_check(3);
+    });
+    $("#check_4").click(function () {
+        automatic_check(4);
+    });
+</script>
+{% endblock %}
\ No newline at end of file
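The get_top_candidate helper introduced below replaces the old fuzzy lookup with an exact term query sorted by the gazetteer's precomputed importance score. A hedged sketch of the same request with elasticsearch-py is given here — it assumes the 'gazetteer'/'place' index layout used throughout helpers/gazeteer_helpers.py and an ES client version (2.x–6.x) where doc_type is still accepted, so treat it as illustrative rather than definitive:

    # Sketch of the query shape used by get_top_candidate (assumed index layout).
    from elasticsearch import Elasticsearch

    es = Elasticsearch("http://localhost:9200/")

    def top_candidates(label, lang="en", n=5):
        query = {
            "query": {"bool": {"must": [{"term": {lang: label}}]}},
            "sort": [{"score": {"order": "desc"}}],  # most important places first
            "size": n,
        }
        response = es.search(index="gazetteer", doc_type="place", body=query)
        hits = response["hits"].get("hits", [])
        return [h["_source"]["id"] for h in hits]

An exact term query is much cheaper than the fuzzy match it replaces, at the cost of missing spelling variants of the toponym.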
diff --git a/helpers/gazeteer_helpers.py b/helpers/gazeteer_helpers.py
index be39e22..3d7a2bb 100644
--- a/helpers/gazeteer_helpers.py
+++ b/helpers/gazeteer_helpers.py
@@ -174,12 +174,18 @@ def count_of_se(label, lang):
     return response["count"]
 
-
-def get_most_most_probable_candidates(label, lang="fr",n=5):
-    res = es.search("gazetteer", "place",
-                    body={"query":{"bool":{"must":[{"fuzzy":{lang:{"value":label}}}],"must_not":[],"should":[]}},"from":0,"size":n,"sort": [{'score':"desc"}],"aggs":{}})
-    results=[]
-    if res["hits"]["total"] > 0:
-        for hit in res["hits"]["hits"]:
-            results.append(hit["_source"])
-    return results
+def get_top_candidate(label, lang):
+    # Exact-match query on the requested language field, sorted by the
+    # precomputed importance score (descending), limited to 5 candidates
+    query = {"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}},
+             "sort": [
+                 {
+                     "score": {
+                         "order": "desc"
+                     }
+                 }
+             ], "size": 5}
+    # query = {"query":{"bool":{"must":[{"multi_match" :
+    #     "fuzzy":{"query":label,"fields": [ "en", "fr"
+    #     ,"es","de"]}}],"must_not":[],"should":[]}},"from":0,"size":500,"sort":[],"aggs":{}}
+    response = es.search('gazetteer', 'place', body=query)
+    if 'hits' in response['hits']:
+        return [x["_source"]["id"] for x in response['hits']['hits']]
+    return None
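The models/str.py hunk below is the "new adjacency source" announced in the commit message: when the boundary-based test fails, two spatial entities are still declared adjacent if their coordinates lie within one degree of each other and neither is a country or first-order region (classes A-PCLI and A-ADM1). A minimal, runnable sketch of that rule in isolation — the 'coord'/'class' record shape mirrors the gazetteer data, and note the threshold is Shapely's planar distance on lat/lon values (degrees), exactly as in the patch, not a geodesic distance:

    # Isolated sketch of the new adjacency fallback rule (illustrative).
    from shapely.geometry import Point

    STOP_CLASS = {"A-PCLI", "A-ADM1"}  # countries and first-order regions

    def fallback_adjacent(data1, data2, threshold=1.0):
        p1 = Point(data1["coord"]["lat"], data1["coord"]["lon"])
        p2 = Point(data2["coord"]["lat"], data2["coord"]["lon"])
        close = p1.distance(p2) < threshold  # planar distance, in degrees
        not_region = not (set(data1["class"]) & STOP_CLASS) \
                     and not (set(data2["class"]) & STOP_CLASS)
        return close and not_region

    print(fallback_adjacent(
        {"coord": {"lat": -18.9, "lon": 47.5}, "class": ["P-PPLC"]},  # Antananarivo
        {"coord": {"lat": -19.0, "lon": 47.0}, "class": ["P-PPL"]},   # nearby town
    ))  # True

Since one degree of longitude shrinks with latitude, this is a deliberately coarse proximity test; excluding country- and region-level entities keeps it from linking everything inside a country to the country itself.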
diff --git a/models/str.py b/models/str.py
index 88eef86..8ebe3c9 100644
--- a/models/str.py
+++ b/models/str.py
@@ -5,8 +5,11 @@ import warnings
 import networkx as nx
 import pandas as pd
 import logging
+
+from shapely.geometry import Point
+
 from config.configuration import config
-logging.basicConfig(filename=config.log_file,level=logging.INFO)
+#logging.basicConfig(filename=config.log_file,level=logging.INFO)
 
 from helpers.boundary import is_intersect
 
@@ -229,6 +232,7 @@ class STR(object):
         Return all the adjacency relationships between all the spatial entities in the STR.
         :return:
         """
+        stop_class=set(["A-PCLI","A-ADM1"]) # country/region entities excluded from the distance fallback
         for se1 in self.spatial_entities:
             data_se1 = get_data(se1)
             for se2 in self.spatial_entities:
@@ -263,6 +267,11 @@ class STR(object):
                     #else:
                         #if is_intersect(se1, se2, self.shapes):
                             #f = True
+                    if not f:
+                        #print(data_se1,data_se2)
+                        # Fallback: adjacency if within one degree and neither entity is a country or region
+                        if Point(data_se1["coord"]["lat"],data_se1["coord"]["lon"]).distance(Point(data_se2["coord"]["lat"], data_se2["coord"]["lon"])) < 1\
+                                and len(set(data_se1["class"]) & stop_class) < 1 and len(set(data_se2["class"]) & stop_class) < 1:
+                            f=True
                     self.add_adjacency_rel(se1, se2,f)
 
@@ -290,6 +299,7 @@ class STR(object):
             for se2 in self.adjacency_relationships[se1]:
                 if self.adjacency_relationships[se1][se2]:
                     graph.add_edge(se1,se2, key=0, color="green")
+                    graph.add_edge(se2, se1, key=0, color="green") # adjacency is symmetric
         logging.info("Extract Adjacency Rel\t{0}".format(time.time()-debut))
 
         if inc:
diff --git a/nlp/bow_se.py b/nlp/bow_se.py
index 9eee362..43e67c5 100644
--- a/nlp/bow_se.py
+++ b/nlp/bow_se.py
@@ -15,7 +15,7 @@ class BOWSE(object):
     __depreciated__ = True
 
     @staticmethod
-    def compare(graph_list, verbose=False):
+    def compare(graph_list, selected,verbose=False):
         """Compute the all-pairs kernel values for a list of graphs.
         This function can be used to directly compute
         the kernel matrix for a list of graphs. The direct computation of the
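BOWSE.compare gains the same selected parameter as the Cython measures patched earlier (HED, Jaccard, Weisfeiler-Lehman, vertex/edge overlap). The shared shape of those guarded comparison loops, extracted into a standalone sketch — pair_score is a stand-in for whichever measure is plugged in, and note the real distance measures write np.inf rather than 0 when a pair cannot be scored:

    # Generic shape of the guarded all-pairs comparison loop (illustrative).
    import numpy as np

    def compare(listgs, selected, pair_score):
        n = len(listgs)
        m = np.zeros((n, n))
        for i in range(n):
            for j in range(i, n):
                ok = bool(listgs[i]) and bool(listgs[j])  # skip empty graphs
                if selected is not None and i not in selected:
                    ok = False  # only fill rows for the selected indices
                m[i, j] = pair_score(listgs[i], listgs[j]) if ok else 0.0
                m[j, i] = m[i, j]  # the matrix is kept symmetric
        return m

    # e.g. with a trivial Jaccard score on node sets:
    graphs = [{"a", "b"}, {"b", "c"}, set()]
    jaccard = lambda g1, g2: len(g1 & g2) / len(g1 | g2)
    print(compare(graphs, None, jaccard))

Only rows of selected indices are computed, but the symmetric assignment still fills the corresponding columns, which is what makes the restriction cheap without leaving the matrix inconsistent.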
diff --git a/nlp/disambiguator/models/bigram.py b/nlp/disambiguator/models/bigram.py
index aa50f70..d9ce129 100644
--- a/nlp/disambiguator/models/bigram.py
+++ b/nlp/disambiguator/models/bigram.py
@@ -20,22 +20,24 @@ class BigramModel:
                 self.count_associated[uri]=0
             self.count_associated[uri]+=1
 
-    def get_coocurence_probability_dict(self,*args):
+    def get_coocurence_probability(self, pr1, *args):
         if len(args) <2:
             print("Only one URI indicated")
             return 0.
         res_=1.
         for u in range(1,len(args)):
-            res_*=self.get_bigram_probability(args[0],args[u])
+            res_*=self.get_bigram_probability(args[0],args[u],pr1)
         return res_
 
-    def get_bigram_probability(self,uri1,uri2):
-        try:
-            return (self.cooc_freq[uri1][uri2] / self.count_associated[uri1])
-        except Exception as e:
-            try:
-                return (self.cooc_freq[uri2][uri1] / self.count_associated[uri1])
-            except Exception as e:
-                return 0.00000000000000001
+    def get_bigram_probability(self,uri1,uri2,pr1=1):
+        # Membership tests replace the nested try/except; pr1 is an additive
+        # prior for uri1 (e.g. its PageRank-style score)
+        nna=0.00000000000000001 # floor value returned for unseen pairs
+        if uri1 in self.cooc_freq:
+            if uri2 in self.cooc_freq[uri1]:
+                return (self.cooc_freq[uri1][uri2] / self.count_associated[uri1])+pr1
+        elif uri2 in self.cooc_freq:
+            if uri1 in self.cooc_freq[uri2]:
+                return (self.cooc_freq[uri2][uri1] / self.count_associated[uri1])+pr1
+        return nna
+
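Restated outside the class, the new lookup reads as follows — a minimal sketch assuming the same structures the model holds (cooc_freq[u1][u2] is a co-occurrence count, count_associated[u1] a total count):

    # Minimal re-statement of the rewritten bigram lookup (illustrative).
    EPS = 1e-17  # the patch's floor value, 0.00000000000000001

    def bigram_probability(cooc_freq, count_associated, u1, u2, pr1=1.0):
        if u1 in cooc_freq and u2 in cooc_freq[u1]:
            return cooc_freq[u1][u2] / count_associated[u1] + pr1
        if u2 in cooc_freq and u1 in cooc_freq[u2]:
            return cooc_freq[u2][u1] / count_associated[u1] + pr1
        return EPS  # unseen pair: tiny non-zero mass so products never hit 0

    cooc = {"GD1": {"GD2": 3}}
    counts = {"GD1": 10}
    print(bigram_probability(cooc, counts, "GD1", "GD2", pr1=0.5))  # 0.8
    print(bigram_probability(cooc, counts, "GD1", "GD9"))           # 1e-17

Note that with the additive prior the returned value can exceed 1, so downstream code should treat it as a score rather than a true probability — which is how disambiguate_wiki below uses it, as an edge weight.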
diff --git a/nlp/disambiguator/wikipedia_cooc.py b/nlp/disambiguator/wikipedia_cooc.py
index d441218..29d3591 100644
--- a/nlp/disambiguator/wikipedia_cooc.py
+++ b/nlp/disambiguator/wikipedia_cooc.py
@@ -1,10 +1,13 @@
 # coding = utf-8
+import re
+
 from nlp.disambiguator.disambiguator import Disambiguator
 from nlp.disambiguator.models.bigram import BigramModel
 import pickle
 from config.configuration import config
-from helpers.gazeteer_helpers import get_data,count_of_se,get_most_common_id_v3
+from helpers.gazeteer_helpers import get_data,count_of_se,get_most_common_id_v3,get_top_candidate
 from .most_common import stop_words,common_words
+import networkx as nx
 
 def read_pickle(fn):
     return pickle.load(open(fn,'rb'))
@@ -19,27 +22,75 @@ class WikipediaDisambiguator(Disambiguator):
     def disambiguate(self, ner_result, lang="en"):
         count, se_ = self.extract_se_entities(ner_result)
         new_count = {}
-        selected_en = {}
-        fixed_se=set([])
-        for en in se_:
-            if count_of_se(en,lang) == 1 :
-                fixed_se.add(en)
-        if len(fixed_se) >0:
-            selected_en = self.part_ambiguous(list(set(se_)-fixed_se),list(fixed_se))
-        else:
-            selected_en = self.all_ambiguous(se_)
-        for en in se_:
-            id_,score=self.disambiguate_(en,lang)
-            if not id_ =="O" and id_:
-                selected_en[id_] = en
-                new_count[id_] = count[en]
+        selected_en_rev = {}
+        selected_en = self.disambiguate_wiki(se_,lang)
+        for en in selected_en:
+            selected_en_rev[selected_en[en]]=en
+            #new_count[selected_en[en]] = count[en]
         return new_count, selected_en
 
-    def all_ambiguous(self,entities):
-        pass
+    def disambiguate_wiki(self, entities, lang):
+        # Filter out numbers, stop words and common words (singular or plural)
+        spat_en=[]
+        for e in entities:
+            if re.match(r"^\d+$", e):
+                continue
+            if e.lower().rstrip("s") in stop_words[lang] or e.lower().rstrip("s") in common_words[lang]:
+                continue
+
+            plural = e.rstrip("s") + "s"
+            if plural.lower() in stop_words[lang] or plural.lower() in common_words[lang]:
+                continue
+            spat_en.append(e)
+
+        g = nx.Graph()
+
+        possible_candidates = []
+        betw_cand={} # for each candidate, the competing candidates of the same toponym
+        group_candidate = {} # candidates per toponym
+        for e in spat_en:
+            cand = get_top_candidate(e, lang)
+            group_candidate[e] = cand
+            betw_cand[e]=cand
+            for n in cand:
+                betw_cand[n]=set(cand)-{n} # exclude the candidate itself
+            possible_candidates.extend(cand)
+
+        for cand in possible_candidates:
+            g.add_node(cand, label=get_data(cand)[lang])
+
+        for cand in possible_candidates:
+            for cand2 in possible_candidates:
+                # Get the PageRank-style score of the first candidate
+                d = get_data(cand)
+                sc = 1
+                if "score" in d:
+                    sc = float(d["score"])
+
+                # Compute the co-occurrence probability
+                prob = self.model.get_coocurence_probability(sc, cand, cand2)
+                # Candidates of the same toponym never co-occur
+                if cand2 in betw_cand[cand] or cand in betw_cand[cand2]:
+                    prob = 0.0
+                if prob < 0.0000001:
+                    prob = 0.0
+                if not cand == cand2:
+                    # keep the lowest co-occurrence probability between the two candidates
+                    if (cand2, cand) in g.edges():
+                        if g.edge[cand2][cand]["weight"] < prob:
+                            continue
+                    g.add_edge(cand, cand2, weight=prob)
+
+        selected = {}
+        # Take, for each toponym, the candidate with the highest weighted degree
+        for gr in group_candidate:
+            try:
+                selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight'))
+            except:
+                # empty candidate group: fall back to the most common sense
+                selected[gr]=get_most_common_id_v3(gr,lang)
+        #print(entities,selected)
+        return selected
 
-    def part_ambiguous(self,ambiguous_entities,fixed_entities):
-        pass
\ No newline at end of file
diff --git a/notebooks/EvalDesambiguisationMada.ipynb b/notebooks/EvalDesambiguisationMada.ipynb
index 4cb419e..9045fca 100644
--- a/notebooks/EvalDesambiguisationMada.ipynb
+++ b/notebooks/EvalDesambiguisationMada.ipynb
@@ -5,8 +5,8 @@
    "execution_count": 1,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-05-16T23:58:48.134280Z",
-     "start_time": "2018-05-16T23:58:47.729327Z"
+     "end_time": "2018-06-19T13:09:12.991345Z",
+     "start_time": "2018-06-19T13:09:12.578369Z"
     }
    },
    "outputs": [],
@@ -19,8 +19,8 @@
    "execution_count": 2,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-05-16T23:58:48.140894Z",
-     "start_time": "2018-05-16T23:58:48.136384Z"
+     "end_time": "2018-06-19T13:09:13.002216Z",
+     "start_time": "2018-06-19T13:09:12.998336Z"
     }
    },
    "outputs": [
@@ -41,8 +41,8 @@
    "execution_count": 3,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-05-16T23:58:48.150739Z",
-     "start_time": "2018-05-16T23:58:48.143107Z"
+     "end_time": "2018-06-19T13:09:14.674713Z",
+     "start_time": "2018-06-19T13:09:14.668234Z"
     }
    },
    "outputs": [],
@@ -57,8 +57,8 @@
    "execution_count": 4,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-05-16T23:58:48.173363Z",
-     "start_time": "2018-05-16T23:58:48.153066Z"
+     "end_time": "2018-06-19T13:09:14.912185Z",
+     "start_time": "2018-06-19T13:09:14.895298Z"
     }
    },
    "outputs": [],
@@ -73,25 +73,36 @@
    "execution_count": 5,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-05-16T23:58:48.864223Z",
-     "start_time": "2018-05-16T23:58:48.177516Z"
+     "end_time": "2018-06-19T13:09:20.638699Z",
+     "start_time": "2018-06-19T13:09:17.343687Z"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "ERROR:root:Line magic function `%autoreload` not found.\n"
+     ]
+    }
+   ],
    "source": [
+    "%autoreload\n",
     "from nlp.disambiguator.geodict_gaurav import GauravGeodict\n",
     "from nlp.disambiguator.most_common import MostCommonDisambiguator\n",
+    "from nlp.disambiguator.wikipedia_cooc import WikipediaDisambiguator\n",
     "disMost_common=MostCommonDisambiguator()\n",
-    "disGaurav=GauravGeodict()"
+    "disGaurav=GauravGeodict()\n",
+    "disWiki=WikipediaDisambiguator()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 7,
    "metadata": {
"ExecuteTime": { - "end_time": "2018-05-17T00:59:53.695102Z", - "start_time": "2018-05-17T00:59:53.685756Z" + "end_time": "2018-06-19T13:10:29.965681Z", + "start_time": "2018-06-19T13:10:29.952223Z" } }, "outputs": [], @@ -106,11 +117,11 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T00:01:52.885111Z", - "start_time": "2018-05-17T00:01:52.850434Z" + "end_time": "2018-06-03T19:13:08.776780Z", + "start_time": "2018-06-03T19:13:08.752046Z" } }, "outputs": [], @@ -120,11 +131,11 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T01:00:09.181696Z", - "start_time": "2018-05-17T01:00:09.178578Z" + "end_time": "2018-06-03T19:13:13.030925Z", + "start_time": "2018-06-03T19:13:13.028591Z" } }, "outputs": [], @@ -134,11 +145,11 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T00:59:55.445531Z", - "start_time": "2018-05-17T00:59:55.407867Z" + "end_time": "2018-06-03T19:13:13.238647Z", + "start_time": "2018-06-03T19:13:13.212601Z" } }, "outputs": [], @@ -149,16 +160,22 @@ " res_dis=disGaurav.eval(df2[\"text\"].unique(),lang)\n", " df2[\"disambiguation\"]=df2.text.apply(lambda x:res_dis[x] if x in res_dis else \"0\")\n", " return (df2.GID == df2.disambiguation).sum()/len(df2)\n", + "\n", + "def accuracyWiki(df,lang):\n", + " df2=df[-df[\"GID\"].isin([\"O\",\"NR\",\"o\"])][[\"text\",\"GID\"]]\n", + " res_dis=disWiki.disambiguate_wiki(df2[\"text\"].unique(),lang)\n", + " df2[\"disambiguation\"]=df2.text.apply(lambda x:res_dis[x] if x in res_dis else \"0\")\n", + " return (df2.GID == df2.disambiguation).sum()/len(df2)\n", "#df" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 11, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T01:22:15.528864Z", - "start_time": "2018-05-17T01:01:01.373760Z" + "end_time": "2018-06-03T19:43:28.769834Z", + "start_time": "2018-06-03T19:15:06.598715Z" } }, "outputs": [ @@ -166,19 +183,20 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:6: RuntimeWarning: invalid value encountered in long_scalars\n", - " \n" + "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:12: RuntimeWarning: invalid value encountered in long_scalars\n", + " if sys.path[0] == '':\n" ] } ], "source": [ - "acc_MC,acc_GEO=[],[]\n", + "acc_MC,acc_GEO,acc_wiki=[],[],[]\n", "for fn in fns:\n", " id_=int(re.findall(r\"\\d+\",fn)[-1])\n", " \n", " df=pd.read_csv(fn)\n", - " acc_MC.append(accuracyMostCommon(df,data_lang[id_]))\n", - " acc_GEO.append(accuracyGeodict(df,data_lang[id_]))\n", + " acc_wiki.append(accuracyWiki(df,data_lang[id_]))\n", + " #acc_MC.append(accuracyMostCommon(df,data_lang[id_]))\n", + " #acc_GEO.append(accuracyGeodict(df,data_lang[id_]))\n", " " ] }, @@ -233,6 +251,718 @@ "np.mean(np.nan_to_num(acc_MC))" ] }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2018-06-03T19:44:42.307528Z", + "start_time": "2018-06-03T19:44:42.295687Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.740705700091002" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "np.mean(np.nan_to_num(acc_wiki))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + 
"end_time": "2018-06-19T13:12:33.632268Z", + "start_time": "2018-06-19T13:12:26.349957Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "57451 9248\n", + "6.212262110726644\n" + ] + } + ], + "source": [ + "from helpers.gazeteer_helpers import count_of_se\n", + "sum_,count=0,0\n", + "for fn in fns:\n", + " try:\n", + " id_=int(re.findall(r\"\\d+\",fn)[-1])\n", + " df=pd.read_csv(fn)\n", + " df2=df[-df[\"GID\"].isin([\"O\",\"NR\",\"o\"])][[\"text\",\"GID\"]]\n", + " counts_t=df2.text.apply(lambda x: count_of_se(x,lang=data_lang[id_]))\n", + " sum_+=counts_t.sum()\n", + " count+=len(counts_t)\n", + " except:\n", + " pass\n", + "print(sum_,count)\n", + "print(sum_/count)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2018-06-19T13:10:32.794585Z", + "start_time": "2018-06-19T13:10:32.759937Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Unnamed: 0</th>\n", + " <th>Unnamed: 0.1</th>\n", + " <th>Unnamed: 0.1.1</th>\n", + " <th>Unnamed: 0.1.1.1</th>\n", + " <th>diff2</th>\n", + " <th>text</th>\n", + " <th>pos_</th>\n", + " <th>ent_type_</th>\n", + " <th>GID</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " <td>Réunion</td>\n", + " <td>NOUN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1.0</td>\n", + " <td>Lac 2</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>2.0</td>\n", + " <td>Sud</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>3</td>\n", + " <td>3</td>\n", + " <td>3</td>\n", + " <td>3</td>\n", + " <td>3.0</td>\n", + " <td>Lac 2</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>4</td>\n", + " <td>4</td>\n", + " <td>4</td>\n", + " <td>4</td>\n", + " <td>4.0</td>\n", + " <td>BV Lac 2</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>5</td>\n", + " <td>5</td>\n", + " <td>5</td>\n", + " <td>5</td>\n", + " <td>5.0</td>\n", + " <td>BV Lac 2</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>6</td>\n", + " <td>6</td>\n", + " <td>6</td>\n", + " <td>6</td>\n", + " <td>6.0</td>\n", + " <td>Madagascar</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD3404996</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>7</td>\n", + " <td>7</td>\n", + " <td>7</td>\n", + " <td>7</td>\n", + " <td>7.0</td>\n", + " <td>Madagascar</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD3404996</td>\n", + " </tr>\n", + " <tr>\n", + " 
<th>8</th>\n", + " <td>8</td>\n", + " <td>8</td>\n", + " <td>8</td>\n", + " <td>8</td>\n", + " <td>8.0</td>\n", + " <td>–</td>\n", + " <td>PUNCT</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>9</td>\n", + " <td>9</td>\n", + " <td>9</td>\n", + " <td>9</td>\n", + " <td>9.0</td>\n", + " <td>Etat</td>\n", + " <td>NOUN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>10</td>\n", + " <td>10</td>\n", + " <td>10</td>\n", + " <td>10</td>\n", + " <td>10.0</td>\n", + " <td>Madagascar</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD3404996</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>11</td>\n", + " <td>11</td>\n", + " <td>11</td>\n", + " <td>11</td>\n", + " <td>11.0</td>\n", + " <td>Lac 2</td>\n", + " <td>SPACE</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>12</td>\n", + " <td>12</td>\n", + " <td>12</td>\n", + " <td>12</td>\n", + " <td>12.0</td>\n", + " <td>Madagascar</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD3404996</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>13</td>\n", + " <td>13</td>\n", + " <td>13</td>\n", + " <td>13</td>\n", + " <td>13.0</td>\n", + " <td>Madagascar</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD3404996</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>14</td>\n", + " <td>14</td>\n", + " <td>14</td>\n", + " <td>14</td>\n", + " <td>14.0</td>\n", + " <td>Directeur</td>\n", + " <td>NOUN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>15</td>\n", + " <td>15</td>\n", + " <td>15</td>\n", + " <td>15</td>\n", + " <td>15.0</td>\n", + " <td>Lac</td>\n", + " <td>SPACE</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>16.0</td>\n", + " <td>Lac 2</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>17</td>\n", + " <td>17</td>\n", + " <td>17</td>\n", + " <td>17</td>\n", + " <td>17.0</td>\n", + " <td>Paris</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD5400765</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>18</td>\n", + " <td>18</td>\n", + " <td>18</td>\n", + " <td>18</td>\n", + " <td>18.0</td>\n", + " <td>Antananarivo</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD3682867</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>19</td>\n", + " <td>19</td>\n", + " <td>19</td>\n", + " <td>19</td>\n", + " <td>19.0</td>\n", + " <td>Directions Régionales</td>\n", + " <td>SPACE</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>20</td>\n", + " <td>20</td>\n", + " <td>20</td>\n", + " <td>20</td>\n", + " <td>20.0</td>\n", + " <td>Centres</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>21</td>\n", + " <td>21</td>\n", + " <td>21</td>\n", + " <td>21</td>\n", + " <td>21.0</td>\n", + " <td>Services Agricoles</td>\n", + " <td>SPACE</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>22</td>\n", + " <td>22</td>\n", + " <td>22</td>\n", + " <td>22</td>\n", + " <td>22.0</td>\n", + " <td>BV Lac</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + 
" <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>23</td>\n", + " <td>23</td>\n", + " <td>23</td>\n", + " <td>23</td>\n", + " <td>23.0</td>\n", + " <td>jusqu’</td>\n", + " <td>VERB</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>24</td>\n", + " <td>24</td>\n", + " <td>24</td>\n", + " <td>24</td>\n", + " <td>24.0</td>\n", + " <td>Antananarivo</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD3682867</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>25</td>\n", + " <td>25</td>\n", + " <td>25</td>\n", + " <td>25</td>\n", + " <td>25.0</td>\n", + " <td>Suivi</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>26</td>\n", + " <td>26</td>\n", + " <td>26</td>\n", + " <td>26</td>\n", + " <td>26.0</td>\n", + " <td>Ambositra</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD6124882</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>27</td>\n", + " <td>27</td>\n", + " <td>27</td>\n", + " <td>27</td>\n", + " <td>27.0</td>\n", + " <td>Farafangana</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD2452325</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>28</td>\n", + " <td>28</td>\n", + " <td>28</td>\n", + " <td>28</td>\n", + " <td>28.0</td>\n", + " <td>du Sud</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>29</td>\n", + " <td>29</td>\n", + " <td>29</td>\n", + " <td>29</td>\n", + " <td>29.0</td>\n", + " <td>Est</td>\n", + " <td>NOUN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30</th>\n", + " <td>30</td>\n", + " <td>30</td>\n", + " <td>30</td>\n", + " <td>30</td>\n", + " <td>30.0</td>\n", + " <td>seuil</td>\n", + " <td>NOUN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31</th>\n", + " <td>31</td>\n", + " <td>31</td>\n", + " <td>31</td>\n", + " <td>31</td>\n", + " <td>31.0</td>\n", + " <td>BV Lac 2</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>32</th>\n", + " <td>32</td>\n", + " <td>32</td>\n", + " <td>32</td>\n", + " <td>32</td>\n", + " <td>32.0</td>\n", + " <td>jusqu’</td>\n", + " <td>VERB</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>33</th>\n", + " <td>33</td>\n", + " <td>33</td>\n", + " <td>33</td>\n", + " <td>33</td>\n", + " <td>33.0</td>\n", + " <td>BV Lac 2</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>34</th>\n", + " <td>34</td>\n", + " <td>34</td>\n", + " <td>34</td>\n", + " <td>34</td>\n", + " <td>34.0</td>\n", + " <td>Secrétaire</td>\n", + " <td>NOUN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>35</th>\n", + " <td>35</td>\n", + " <td>35</td>\n", + " <td>35</td>\n", + " <td>35</td>\n", + " <td>35.0</td>\n", + " <td>Alaotra</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>36</th>\n", + " <td>36</td>\n", + " <td>36</td>\n", + " <td>36</td>\n", + " <td>36</td>\n", + " <td>36.0</td>\n", + " <td>Mangoro</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD3574285</td>\n", + " </tr>\n", + " <tr>\n", + " <th>37</th>\n", + " <td>37</td>\n", + " <td>37</td>\n", + " <td>37</td>\n", + " <td>37</td>\n", + " <td>37.0</td>\n", + " <td>Directeur</td>\n", + " 
<td>NOUN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>38</th>\n", + " <td>38</td>\n", + " <td>38</td>\n", + " <td>38</td>\n", + " <td>38</td>\n", + " <td>38.0</td>\n", + " <td>Lac 2 et</td>\n", + " <td>SPACE</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>39</th>\n", + " <td>39</td>\n", + " <td>39</td>\n", + " <td>39</td>\n", + " <td>39</td>\n", + " <td>39.0</td>\n", + " <td>Sous réserve</td>\n", + " <td>VERB</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>40</th>\n", + " <td>40</td>\n", + " <td>40</td>\n", + " <td>40</td>\n", + " <td>40</td>\n", + " <td>40.0</td>\n", + " <td>Grandjean</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD3254594</td>\n", + " </tr>\n", + " <tr>\n", + " <th>41</th>\n", + " <td>41</td>\n", + " <td>41</td>\n", + " <td>41</td>\n", + " <td>41</td>\n", + " <td>41.0</td>\n", + " <td>jusqu’</td>\n", + " <td>VERB</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Unnamed: 0 Unnamed: 0.1 Unnamed: 0.1.1 Unnamed: 0.1.1.1 diff2 \\\n", + "0 0 0 0 0 0.0 \n", + "1 1 1 1 1 1.0 \n", + "2 2 2 2 2 2.0 \n", + "3 3 3 3 3 3.0 \n", + "4 4 4 4 4 4.0 \n", + "5 5 5 5 5 5.0 \n", + "6 6 6 6 6 6.0 \n", + "7 7 7 7 7 7.0 \n", + "8 8 8 8 8 8.0 \n", + "9 9 9 9 9 9.0 \n", + "10 10 10 10 10 10.0 \n", + "11 11 11 11 11 11.0 \n", + "12 12 12 12 12 12.0 \n", + "13 13 13 13 13 13.0 \n", + "14 14 14 14 14 14.0 \n", + "15 15 15 15 15 15.0 \n", + "16 16 16 16 16 16.0 \n", + "17 17 17 17 17 17.0 \n", + "18 18 18 18 18 18.0 \n", + "19 19 19 19 19 19.0 \n", + "20 20 20 20 20 20.0 \n", + "21 21 21 21 21 21.0 \n", + "22 22 22 22 22 22.0 \n", + "23 23 23 23 23 23.0 \n", + "24 24 24 24 24 24.0 \n", + "25 25 25 25 25 25.0 \n", + "26 26 26 26 26 26.0 \n", + "27 27 27 27 27 27.0 \n", + "28 28 28 28 28 28.0 \n", + "29 29 29 29 29 29.0 \n", + "30 30 30 30 30 30.0 \n", + "31 31 31 31 31 31.0 \n", + "32 32 32 32 32 32.0 \n", + "33 33 33 33 33 33.0 \n", + "34 34 34 34 34 34.0 \n", + "35 35 35 35 35 35.0 \n", + "36 36 36 36 36 36.0 \n", + "37 37 37 37 37 37.0 \n", + "38 38 38 38 38 38.0 \n", + "39 39 39 39 39 39.0 \n", + "40 40 40 40 40 40.0 \n", + "41 41 41 41 41 41.0 \n", + "\n", + " text pos_ ent_type_ GID \n", + "0 Réunion NOUN LOC O \n", + "1 Lac 2 PROPN LOC O \n", + "2 Sud PROPN LOC O \n", + "3 Lac 2 PROPN LOC O \n", + "4 BV Lac 2 PROPN LOC O \n", + "5 BV Lac 2 PROPN LOC O \n", + "6 Madagascar PROPN LOC GD3404996 \n", + "7 Madagascar PROPN LOC GD3404996 \n", + "8 – PUNCT LOC O \n", + "9 Etat NOUN LOC O \n", + "10 Madagascar PROPN LOC GD3404996 \n", + "11 Lac 2 SPACE LOC O \n", + "12 Madagascar PROPN LOC GD3404996 \n", + "13 Madagascar PROPN LOC GD3404996 \n", + "14 Directeur NOUN LOC O \n", + "15 Lac SPACE LOC O \n", + "16 Lac 2 PROPN LOC O \n", + "17 Paris PROPN LOC GD5400765 \n", + "18 Antananarivo PROPN LOC GD3682867 \n", + "19 Directions Régionales SPACE LOC O \n", + "20 Centres PROPN LOC O \n", + "21 Services Agricoles SPACE LOC O \n", + "22 BV Lac PROPN LOC O \n", + "23 jusqu’ VERB LOC O \n", + "24 Antananarivo PROPN LOC GD3682867 \n", + "25 Suivi PROPN LOC O \n", + "26 Ambositra PROPN LOC GD6124882 \n", + "27 Farafangana PROPN LOC GD2452325 \n", + "28 du Sud PROPN LOC O \n", + "29 Est NOUN LOC O \n", + "30 seuil NOUN LOC O \n", + "31 BV Lac 2 PROPN LOC O \n", + "32 jusqu’ VERB LOC O \n", + "33 BV Lac 2 PROPN LOC O \n", + "34 Secrétaire NOUN LOC O \n", + "35 Alaotra PROPN LOC O \n", + "36 Mangoro PROPN 
LOC GD3574285 \n", + "37 Directeur NOUN LOC O \n", + "38 Lac 2 et SPACE LOC O \n", + "39 Sous réserve VERB LOC O \n", + "40 Grandjean PROPN LOC GD3254594 \n", + "41 jusqu’ VERB LOC O " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/notebooks/EvalDesambiguisationPADIWEB.ipynb b/notebooks/EvalDesambiguisationPADIWEB.ipynb index ba763a8..83ab07b 100644 --- a/notebooks/EvalDesambiguisationPADIWEB.ipynb +++ b/notebooks/EvalDesambiguisationPADIWEB.ipynb @@ -2,11 +2,11 @@ "cells": [ { "cell_type": "code", - "execution_count": 7, + "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T00:50:38.399698Z", - "start_time": "2018-05-17T00:50:38.396888Z" + "end_time": "2018-06-19T12:57:56.566077Z", + "start_time": "2018-06-19T12:57:56.076820Z" } }, "outputs": [], @@ -20,8 +20,8 @@ "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T00:48:13.001356Z", - "start_time": "2018-05-17T00:48:12.994569Z" + "end_time": "2018-06-19T12:57:56.766774Z", + "start_time": "2018-06-19T12:57:56.761060Z" } }, "outputs": [ @@ -39,11 +39,11 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T00:54:11.406691Z", - "start_time": "2018-05-17T00:54:11.400933Z" + "end_time": "2018-06-19T12:58:25.165818Z", + "start_time": "2018-06-19T12:58:25.056576Z" } }, "outputs": [], @@ -64,11 +64,11 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T01:25:05.006779Z", - "start_time": "2018-05-17T01:25:05.000357Z" + "end_time": "2018-06-19T12:58:25.614490Z", + "start_time": "2018-06-19T12:58:25.607038Z" } }, "outputs": [], @@ -88,11 +88,11 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T00:56:26.195260Z", - "start_time": "2018-05-17T00:56:26.185713Z" + "end_time": "2018-06-19T13:00:51.545645Z", + "start_time": "2018-06-19T13:00:51.538149Z" } }, "outputs": [], @@ -104,11 +104,11 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T01:05:10.917961Z", - "start_time": "2018-05-17T01:05:10.915317Z" + "end_time": "2018-06-19T12:58:56.147169Z", + "start_time": "2018-06-19T12:58:56.132754Z" } }, "outputs": [], @@ -118,28 +118,31 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 17, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T00:57:28.905930Z", - "start_time": "2018-05-17T00:57:28.346854Z" + "end_time": "2018-06-03T18:46:38.252413Z", + "start_time": "2018-06-03T18:46:35.836908Z" } }, "outputs": [], "source": [ + "%autoreload\n", "from nlp.disambiguator.geodict_gaurav import GauravGeodict\n", "from nlp.disambiguator.most_common import MostCommonDisambiguator\n", + "from nlp.disambiguator.wikipedia_cooc import WikipediaDisambiguator\n", "disMost_common=MostCommonDisambiguator()\n", - "disGaurav=GauravGeodict()" + "disGaurav=GauravGeodict()\n", + "disWiki=WikipediaDisambiguator()" ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T01:10:19.593778Z", - "start_time": "2018-05-17T01:10:19.585332Z" + "end_time": "2018-06-03T18:40:57.064904Z", + "start_time": "2018-06-03T18:40:57.043921Z" } }, "outputs": [], @@ -154,11 +157,11 
@@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T00:01:52.885111Z", - "start_time": "2018-05-17T00:01:52.850434Z" + "end_time": "2018-06-03T18:40:58.360243Z", + "start_time": "2018-06-03T18:40:58.203320Z" } }, "outputs": [], @@ -168,21 +171,33 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 19, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T01:10:21.463216Z", - "start_time": "2018-05-17T01:10:21.098003Z" + "end_time": "2018-06-03T18:46:54.196478Z", + "start_time": "2018-06-03T18:46:53.863582Z" } }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rivers State GD4106855 12.73152386775468\n", + "Kano GD4103071 21.675014816832682\n", + "Kano GD4103071 21.675014816832682\n", + "Lagos GD4468122 124.6205202335819\n", + "Lagos GD4468122 124.6205202335819\n", + "Port Harcourt GD791183 15.777445058883712\n" + ] + }, { "data": { "text/plain": [ "0.6666666666666666" ] }, - "execution_count": 52, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -194,11 +209,11 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 14, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T01:06:38.089187Z", - "start_time": "2018-05-17T01:06:38.080846Z" + "end_time": "2018-06-03T18:45:45.708459Z", + "start_time": "2018-06-03T18:45:45.679984Z" } }, "outputs": [], @@ -209,17 +224,22 @@ " res_dis=disGaurav.eval(df2[\"content\"].unique(),lang)\n", " df2[\"disambiguation\"]=df2.content.apply(lambda x:res_dis[x] if x in res_dis else \"0\")\n", " return (df2.GID == df2.disambiguation).sum()/len(df2)\n", + "def accuracyWiki(df,lang):\n", + " df2=df[-df[\"GID\"].isin([\"O\",\"NR\",\"o\"])][[\"content\",\"GID\"]]\n", + " res_dis=disWiki.disambiguate(df2[\"content\"].unique(),lang)\n", + " df2[\"disambiguation\"]=df2.content.apply(lambda x:res_dis[x] if x in res_dis else \"0\")\n", + " return (df2.GID == df2.disambiguation).sum()/len(df2)\n", "#df\n", "#df" ] }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 20, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T01:37:02.165192Z", - "start_time": "2018-05-17T01:25:31.325566Z" + "end_time": "2018-06-03T18:53:53.880676Z", + "start_time": "2018-06-03T18:48:05.294472Z" } }, "outputs": [ @@ -229,21 +249,20 @@ "text": [ "/usr/local/lib/python3.6/site-packages/pandas/core/ops.py:816: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n", " result = getattr(x, name)(y)\n", - "/Users/jacquesfize/nas_cloud/Code/str-python/helpers/collision.py:30: RuntimeWarning: invalid value encountered in double_scalars\n", - " d_over_o_squared = d/np.dot(o, o) + 1e-10\n", - "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:6: RuntimeWarning: invalid value encountered in long_scalars\n", - " \n" + "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:11: RuntimeWarning: invalid value encountered in long_scalars\n", + " # This is added back by InteractiveShellApp.init_path()\n" ] } ], "source": [ - "acc_MC,acc_GEO=[],[]\n", + "acc_MC,acc_GEO,acc_wiki=[],[],[]\n", "for fn in fns:\n", " \n", " try:\n", " df,lang=parse_file(fn)\n", - " acc_MC.append(accuracyMostCommon(df,lang))\n", - " acc_GEO.append(accuracyGeodict(df,lang))\n", + " #acc_MC.append(accuracyMostCommon(df,lang))\n", + " #acc_GEO.append(accuracyGeodict(df,lang))\n", + " acc_wiki.append(accuracyWiki(df,lang))\n", " 
except:\n", " pass\n", " " @@ -300,6 +319,86 @@ "np.mean(np.nan_to_num(acc_MC))" ] }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2018-06-03T18:55:22.909028Z", + "start_time": "2018-06-03T18:55:22.904693Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.5630869832932465" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "np.mean(np.nan_to_num(acc_wiki))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2018-06-19T13:01:36.778853Z", + "start_time": "2018-06-19T13:01:36.775832Z" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2018-06-19T13:10:53.120884Z", + "start_time": "2018-06-19T13:09:52.611805Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.6/site-packages/pandas/core/ops.py:816: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n", + " result = getattr(x, name)(y)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "151959 7898\n", + "19.24018738921246\n" + ] + } + ], + "source": [ + "from helpers.gazeteer_helpers import count_of_se\n", + "sum_,count=0,0\n", + "for fn in fns:\n", + " try:\n", + " df,lang=parse_file(fn)\n", + " counts_t=df.content.apply(lambda x: count_of_se(x,lang=lang))\n", + " sum_+=counts_t.sum()\n", + " count+=len(counts_t)\n", + " except:\n", + " pass\n", + "print(sum_,count)\n", + "print(sum_/count)" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/tools.py b/tools.py index f5b1dd8..fd2dd99 100644 --- a/tools.py +++ b/tools.py @@ -4,7 +4,7 @@ import argparse from termcolor import colored -from .helpers.gazeteer_helpers import get_most_common_id, get_data, get_by_label +from helpers.gazeteer_helpers import get_most_common_id, get_data, get_by_label parser = argparse.ArgumentParser() -- GitLab
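As a closing illustration — not part of the patch — the selection step at the heart of disambiguate_wiki keeps, for each toponym, the candidate with the highest weighted degree in the co-occurrence graph. A toy, runnable sketch with made-up identifiers (real ones are gazetteer GD… ids):

    # Weighted-degree candidate selection, the core of disambiguate_wiki.
    import networkx as nx

    g = nx.Graph()
    # edge weights stand in for bigram co-occurrence scores
    g.add_edge("GD_paris_fr", "GD_antananarivo", weight=0.9)
    g.add_edge("GD_paris_tx", "GD_antananarivo", weight=0.1)

    group_candidate = {"Paris": ["GD_paris_fr", "GD_paris_tx"],
                       "Antananarivo": ["GD_antananarivo"]}

    # per toponym, keep the candidate most strongly connected to the others
    selected = {top: max(cands, key=lambda c: g.degree(c, weight="weight"))
                for top, cands in group_candidate.items()}
    print(selected)  # {'Paris': 'GD_paris_fr', 'Antananarivo': 'GD_antananarivo'}

Because edges between candidates of the same toponym are forced to zero weight, a candidate's weighted degree only accumulates support from the other toponyms in the document, which is what lets the geographically coherent reading win.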