Commit e7ec63d0 authored by Pokiros

Add graph transformations + modify graph viewer + add field in config + change gmatch4py + change count in disambiguator (geodict ID used instead of label)
parent 526352ea
Showing with 470 additions and 428 deletions
......@@ -3,5 +3,6 @@
"osm_boundaries_directory":"/Users/jacquesfize/install",
"core_nlp_URL":"http://localhost:9000",
"es_server_old":"http://192.168.1.15:9200/",
"es_server":"http://localhost:9200/"
"es_server":"http://localhost:9200/",
"database_json":"resources/database_graph_viewer.db"
}
\ No newline at end of file
......@@ -13,12 +13,16 @@ class PageRankDisambiguator(Disambiguator):
def disambiguate(self, ner_result, lang="en"):
count,se_ = self.extract_se_entities(ner_result)
new_count={}
selected_en = {}
for en in se_:
if label_exists(en, lang):
id_ = get_most_common_id(en, lang)
selected_en[id_] = en
new_count[id_]=count[en]
elif alias_exists(en,lang):
id_ = get_most_common_id_alias(en, lang)
selected_en[id_] = en
return count,selected_en
new_count[id_] = count[en]
return new_count,selected_en
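In effect, this hunk re-keys the returned counts by geodict ID rather than by surface label, matching the commit message. A hedged sketch of the before/after shapes (the IDs and labels below are invented, not from the corpus):

```python
# Old return value: counts keyed by the entity label.
count = {"Paris": 3, "Montpellier": 1}

# New return values: counts follow the disambiguated geodict IDs,
# and selected_en maps each ID back to its label.
new_count   = {"GD123": 3, "GD456": 1}
selected_en = {"GD123": "Paris", "GD456": "Montpellier"}
```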
# coding: utf-8
import glob
# Graph Edit Distance Algorithm Import
from gmatch4py.ged.geo_hed import GeoHED
from gmatch4py.ged.greedy_edit_distance import GreedyEditDistance
from gmatch4py.ged.hausdorff_edit_distance import HED
from progressbar import ProgressBar, Timer, Bar, ETA
......@@ -16,52 +11,74 @@ from gmatch4py.kernels.weisfeiler_lehman import *
from gmatch4py.kernels.weisfeiler_lehman_geo import *
from gmatch4py.kernels.weisfeiler_lehman_edge_geo import *
from gmatch4py.ged.bipartite_graph_matching_2 import BP_2
from gmatch4py.mcs import MCS
from pipeline import *
import glob,time
# Functions for output generation
def_temp=[36,-36]
temp=def_temp
max_temp=-30
dec=5
def getLocInfo(id_):
data=get_data(id_)
if 'coord' in data:
return [data["coord"]["lat"],data["coord"]["lon"]]
return [0,0]
global temp,dec
try:
data=get_data(id_)
if 'coord' in data:
return [data["coord"]["lat"],data["coord"]["lon"]]
else:
temp = [temp[0] , temp[1]+dec]
if temp[1] >= max_temp:
temp = [temp[0] +dec, def_temp[1]]
return temp
except:
pass
def get_associated_es(associated_es_data):
global temp
new_={}
temp=def_temp
for id_ in associated_es_data:
new_[id_]={"label":associated_es_data[id_],"coord":getLocInfo(id_)}
try:
new_[id_]={"label":get_data(id_)["en"],"coord":getLocInfo(id_)}
except:
new_[id_] = {"label": id_, "coord": getLocInfo(id_)}
return new_
def getEdges4Draw(associated_es,edges):
data={}
for es in associated_es:
data[es]=getLocInfo(es)
def getEdges4Draw(data,edges):
lines=[]
for ed in edges:
lines.append([data[ed[0]],data[ed[1]],ed[2]["color"]])
return lines
lines.append([data[ed[0]]["coord"],data[ed[1]]["coord"],ed[2]["color"]])
if lines[-1][-1] == "cyan":
lines[-1][-1] = "blue"
return lines
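For clarity, getEdges4Draw now takes the dictionary built by get_associated_es and emits one [source coord, target coord, color] triple per edge, remapping cyan to blue for display. A small invented example of the expected shapes:

```python
# Hypothetical input, mirroring the structure produced by get_associated_es above.
data = {
    "GD123": {"label": "Paris",       "coord": [48.85, 2.35]},
    "GD456": {"label": "Montpellier", "coord": [43.61, 3.88]},
}
edges = [("GD123", "GD456", {"color": "cyan"})]

getEdges4Draw(data, edges)
# -> [[[48.85, 2.35], [43.61, 3.88], "blue"]]
```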
# Similarity Function between graph and a set of graphs
grap_kernel_results=[]
graph_lookup={}
def compareMCS(graphs):
return MCS.compare(graphs)
# GED algorithm
def compareGED(id_,graphs):
return ApproximateGraphEditDistance.compare(graphs)[id_]
def compareGED(graphs):
return ApproximateGraphEditDistance.compare(graphs)
def compareBP2(graphs):
return BP_2.compare(graphs)
def compareBP2(id_,graphs):
return BP_2.compare(graphs)[id_]
def compareHED(graphs):
return HED.compare(graphs)
def compareHED(id_,graphs):
return HED.compare(graphs)[id_]
def compareGEOHED(graphs):
return GeoHED.compare(graphs)
def compareGreedy(id_,graphs):
return GreedyEditDistance.compare(graphs)[id_]
def compareGreedy(graphs):
return GreedyEditDistance.compare(graphs)
def compareWLSubTreeKernel(id_,graphs):
def compareWLSubTreeKernel(graphs):
global grap_kernel_results, graph_lookup
sc = np.zeros(len(graphs))
......@@ -74,9 +91,9 @@ def compareWLSubTreeKernel(id_,graphs):
grap_kernel_results=WeisfeleirLehmanKernel.compare(graphs_array,h=3)
return 1 - grap_kernel_results[id_]
return 1 - grap_kernel_results
def compareWLSubTreeKernelGeo(id_,graphs):
def compareWLSubTreeKernelGeo(graphs):
global grap_kernel_results, graph_lookup
sc = np.zeros(len(graphs))
......@@ -92,30 +109,32 @@ def compareWLSubTreeKernelGeo(id_,graphs):
grap_kernel_results=WeisfeleirLehmanKernelGEO.compare(graphs_array,h=3)
grap_kernel_results= np.nan_to_num(grap_kernel_results)
return 1-grap_kernel_results[id_]
return 1-grap_kernel_results
def compareWLSubTreeKernelEdgeGeo(id_,graphs):
def compareWLSubTreeKernelEdgeGeo(graphs):
global grap_kernel_results, graph_lookup
sc = np.zeros(len(graphs))
if len(grap_kernel_results)<1:
graphs_array = [None for i in range(max(graphs.keys()))]
graphs_array = [None for i in range(len(graphs))]
for i,g in graphs.items():
graphs_array[i]=g
grap_kernel_results=WeisfeleirLehmanKernelEdgeGeo.compare(graphs_array,h=3)
grap_kernel_results= np.nan_to_num(grap_kernel_results)
return 1-grap_kernel_results[id_]
return 1-grap_kernel_results
funcDict={
"MCS":compareMCS,
"GED":compareGED,
"BP2":compareBP2,
"HED":compareHED,
"GEOHED":compareGEOHED,
"GREEDY":compareGreedy,
"WLSUBTREE":compareWLSubTreeKernel,
"WLSUBTREEGEO":compareWLSubTreeKernelGeo,
......@@ -130,6 +149,7 @@ parser.add_argument("texts_dir")
parser.add_argument("graphs_dir")
parser.add_argument("metadata_fn")
parser.add_argument("-e","--evalEPI",action="store_true")
parser.add_argument("-a","--all",action="store_true")
parser.add_argument("-o","--output",help="Output Filename",default="GED")
args = parser.parse_args()
......@@ -137,8 +157,6 @@ args = parser.parse_args()
if not args.distance in funcDict.keys():
raise NotFoundDistance(args.distance,funcDict)
exit()
# Initialize Pipeline for Spatial Entities extraction and STR construction
# Load all the text from the corpus
texts=[]
......@@ -161,9 +179,8 @@ if not texts:
print("No text files were loaded !")
exit()
# Load graph data and associated spatial entities of each graph
# LOAD graph data and associated spatial entities of each graph
assC=json.load(open(args.metadata_fn))
associated_es,count_per_doc=assC[0],assC[1]
......@@ -172,17 +189,19 @@ for file in glob.glob(args.graphs_dir.rstrip("/")+"/*.gexf"):
id=int(re.findall("\d+",file)[0])
graphs[id]=nx.read_gexf(file)
# We take 50 documents chosen randomly. Then we test whether the top-10 returned documents are relevant !
if args.evalEPI:
selected_documents_=json.load(open("data/random_selected_doc.json"))
elif args.all:
selected_documents_=list(graphs.keys())
else:
selected_documents_ = []
ids=list(range(len(graphs)))
ids=[]
for i in range(len(graphs)):
if len(graphs[i])>1:
ids.append(i)
import random
random.shuffle(ids)
try:
......@@ -192,39 +211,38 @@ else:
# Generating Evaluation Output
top_ten_documents=[]
final_data={}
deb=time.time()
similarity_matrix = funcDict[args.distance](graphs)
print("Similarity Matrix Computed in {0} s.".format(time.time()-deb))
with ProgressBar(max_value=len(selected_documents_),widgets=[' [', Timer(), '] ',Bar(),' (', ETA(), ') ',]) as pg:
inc=0
for doc_s in selected_documents_:
if not len(graphs)>0:
pass
if not len(graphs[doc_s])>0:
continue
id_json=str(doc_s)
bow_score=funcDict[args.distance](doc_s,graphs)
bow_score=similarity_matrix[doc_s]
top_10_docs_score=np.sort(bow_score)[1:11].astype(float)
top_10_docs=np.argsort(bow_score)[1:11].astype(int)
final_data[id_json]={
"sp_entities":get_associated_es(associated_es[id_json]),
final_data[doc_s]={
"sp_entities":get_associated_es(graphs[doc_s].nodes()),
"text":texts[doc_s],
"edges":getEdges4Draw(associated_es[id_json],graphs[doc_s].edges(data=True))
}
final_data[id_json]["top_10"]=[]
final_data[doc_s]["edges"]=getEdges4Draw(final_data[doc_s]["sp_entities"],graphs[doc_s].edges(data=True))
#print(final_data[doc_s]["edges"])
final_data[doc_s]["top_10"]=[]
for d in range(len(top_10_docs)):
doc_data={}
doc_data["score"]=top_10_docs_score[d]
doc_data["id_txt"]=int(top_10_docs[d])
doc_data["text"]=texts[int(top_10_docs[d])]
doc_data["sp_entities"]=get_associated_es(associated_es[str(doc_data["id_txt"])])
doc_data["edges"]=getEdges4Draw(associated_es[str(doc_data["id_txt"])],graphs[doc_data["id_txt"]].edges(data=True))
doc_data["sp_entities"]=get_associated_es(graphs[doc_data["id_txt"]].nodes())
doc_data["edges"]=getEdges4Draw(doc_data["sp_entities"],graphs[doc_data["id_txt"]].edges(data=True))
doc_data["relevant"]=None
final_data[id_json]["top_10"].append(doc_data)
final_data[doc_s]["top_10"].append(doc_data)
inc+=1
pg.update(inc)
open("graph_viewer/evalTop10STR_{0}.json".format(args.distance),'w').write(json.dumps(final_data,indent=4))
open("gui_graph_viewer/evalTopJPT10STR_{0}.json".format(args.distance),'w').write(json.dumps(final_data,indent=4))
......@@ -35,6 +35,7 @@ pipeline= {
}
# Read Input Files
texts_=[]
......@@ -45,9 +46,12 @@ if os.path.exists(args.texts_input_dir):
exit()
for fn in files_:
try:
texts_.append(open(fn).read())
tex=open(fn).read()
lang = Detector(tex, quiet=True).language.code # run language detection here so encoding bugs surface early
texts_.append(tex)
except:
print("{0} could'nt be read !".format(fn))
print("{0} could'nt be read ! Add Lorem Ipsum instead".format(fn))
texts_.append("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.")
# If output Dir doesn't exists
......@@ -62,6 +66,7 @@ if not texts_:
data={}
n=0
for text in range(len(texts_)):
# try:
lang=Detector(texts_[text],quiet=True).language.code
......@@ -70,9 +75,10 @@ for text in range(len(texts_)):
if lang in pipeline:
data[lang].append(text)
else:
if not "en" in data:data["en"]=[] # Ca peut arriver :s :s :s !!!
data["en"].append(text)
# except:
# print("No Language Detected")
# except:
# n+=1 # encoding error
associated_es={}
count_per_doc={}
......@@ -91,11 +97,18 @@ with ProgressBar(max_value=len(texts_),widgets=[' [', Timer(), '] ',Bar(),' (',
else:
t=filter_nonprintable(texts_[id_doc])
a, b, c = pipeline[lang].parse(t)
list_gs.append(pipeline[lang].build(t).graph)
# Save Metadata
count_per_doc[id_doc] = a
associated_es[id_doc] = c
try:
a, b, c = pipeline[lang].parse(t)
list_gs.append(pipeline[lang].build(t).graph)
# Save Metadata
count_per_doc[id_doc] = a
associated_es[id_doc] = c
except: # NER Bug
count_per_doc[id_doc] = {}
associated_es[id_doc] = {}
g = nx.MultiDiGraph()
list_gs.append(g)
# Save Graph structure
nx.write_gexf(list_gs[-1], args.graphs_output_dir+"/{0}.gexf".format(id_doc))
i+=1
......
# coding: utf-8
import glob
from gmatch4py.ged.geo_bp2 import GeoBP2
# Graph Edit Distance Algorithm Import
from gmatch4py.ged.geo_ged import GeoGED
from gmatch4py.ged.geo_hed import GeoHED
from gmatch4py.ged.greedy_edit_distance import GreedyEditDistance
from gmatch4py.ged.hausdorff_edit_distance import HED
from progressbar import ProgressBar, Timer, Bar, ETA
# Disambiguator import
from disambiguator.geodict_gaurav import *
from gmatch4py.exception import NotFoundDistance
from gmatch4py.kernels.weisfeiler_lehman import *
from gmatch4py.kernels.weisfeiler_lehman_geo import *
from gmatch4py.kernels.weisfeiler_lehman_edge_geo import WeisfeleirLehmanKernelEdgeGeo
from pipeline import *
from pos_tagger.tagger import Tagger
# Similarity Function between graph and a set of graphs
grap_kernel_results=[]
graph_lookup={}
def compareGED(id_,graphs):
g=graphs[id_]
sc=np.zeros(len(graphs))
for id_,g2 in graphs.items():
score=ged.compare(g,g2)
sc[id_]=score
return sc
def compareGEOGED(id1,graphs):
g=graphs[id1]
sc=np.zeros(len(graphs))
for id_,g2 in graphs.items():
try:
if len(g2) >1:
gg=GeoGED(g,g2)
score=gg.distance()
sc[id_] = score
else:
sc[id_]=np.inf
except:
sc[id_] = np.inf
return sc
def compareBP2(id_,graphs):
bp2=BP_2()
g = graphs[id_]
sc = np.zeros(len(graphs))
for id_, g2 in graphs.items():
if len(g2) >0:
score = bp2.bp2(g, g2)
sc[id_] = score
else:
sc[id_] = np.inf
return sc
def compareSubTreeKernel(id_,graphs):
global grap_kernel_results, graph_lookup
sc = np.zeros(len(graphs))
if len(grap_kernel_results)<1:
graphs_array=[None for i in range(len(graphs))]
for i,g in graphs.items():
graphs_array[i]=g
grap_kernel_results=WeisfeleirLehmanKernel.compare(graphs_array,h=3)
return 1 - grap_kernel_results[id_]
def compareSubTreeKernelGeo(id_,graphs):
global grap_kernel_results, graph_lookup
sc = np.zeros(len(graphs))
if len(grap_kernel_results)<1:
graphs_array=[None for i in range(len(graphs))]
for i,g in graphs.items():
graphs_array[i]=g
grap_kernel_results=WeisfeleirLehmanKernelGEO.compare(graphs_array,h=3)
grap_kernel_results= np.nan_to_num(grap_kernel_results)
return 1-grap_kernel_results[id_]
def compareSubTreeKernelEdgeGeo(id_,graphs):
global grap_kernel_results, graph_lookup
sc = np.zeros(len(graphs))
if len(grap_kernel_results)<1:
graphs_array=[None for i in range(len(graphs))]
for i,g in graphs.items():
graphs_array[i]=g
grap_kernel_results=WeisfeleirLehmanKernelEdgeGeo.compare(graphs_array,h=3)
grap_kernel_results= np.nan_to_num(grap_kernel_results)
return 1-grap_kernel_results[id_]
def compareGEOBP2(id_,graphs):
bp2=GeoBP2()
g = graphs[id_]
sc = np.zeros(len(graphs))
for id_, g2 in graphs.items():
if len(g2) >0:
score = bp2.bp2(g, g2)
sc[id_] = score
else:
sc[id_] = np.inf
return sc
def compareHED(id_,graphs):
h=HED()
g = graphs[id_]
sc = np.zeros(len(graphs))
for id_, g2 in graphs.items():
if len(g2) >0:
score = h.hed(g, g2)
sc[id_] = score
else:
sc[id_]=np.inf
return sc
def compareGEOHED(id_,graphs):
h=GeoHED()
g = graphs[id_]
sc = np.zeros(len(graphs))
for id_, g2 in graphs.items():
if len(g2)>1:
score = h.hed(g, g2)
sc[id_] = score
else:
sc[id_] = np.inf
return sc
def compareGreedy(id_,graphs):
g = graphs[id_]
sc = np.zeros(len(graphs))
for id_, g2 in graphs.items():
h = GreedyEditDistance(g,g2)
score = h.distance()
sc[id_] = score
return sc
funcDict={
"GED":compareGED,
"GEOGED":compareGEOGED,
"BP2":compareBP2,
"GEOBP2":compareGEOBP2,
"HED":compareHED,
"GEOHED":compareGEOHED,
"GREEDY":compareGreedy,
"WLSUBTREE":compareSubTreeKernel,
"WLSUBTREEGEO":compareSubTreeKernelGeo,
"WLSUBTREEEDGEGEO":compareSubTreeKernelEdgeGeo
}
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("distance")
parser.add_argument("graphs_dir")
parser.add_argument("--ignore",help="Ignore Output",action="store_true")
parser.add_argument("-o","--output",help="Output Filename",default="GED")
args = parser.parse_args()
if not args.distance in funcDict.keys():
raise NotFoundDistance(args.distance,funcDict)
exit()
# Initialize Pipeline for Spatial Entities extraction and STR construction
pip=Pipeline(lang="english",tagger=Tagger(),ner=StanfordNER(lang="en"))
# Load all the text from the corpus
def get_text_data(directory):
"""
Load the Epidemiology corpus
"""
files = glob.glob(os.path.join(directory,"*.json.processed.json"))
texts={}
for filepath in files:
id_doc=int(re.findall("\d+",filepath)[-1])
data=json.load(open(filepath))["content"]
texts[id_doc]=data
return texts
__t = json.load(open("data/CorpusHeterogene21docs.txt"))
texts = {}
for i in range(len(__t)):
texts[i] = __t[i]
# Extract All spatial entities
if not os.path.exists("associated_and_count_JPT.json"):
pass
associated_es={}
count_per_doc={}
for id_,text in texts.items():
if text:
a,b,c=pip.parse(text)
count_per_doc[id_]=a
associated_es[id_]=c
else:
associated_es[id_]={}
count_per_doc[id_]={}
open("associated_and_count_JPT.json",'w').write(json.dumps([associated_es,count_per_doc],indent=4))
if not os.path.exists(args.graphs_dir):
pass
import networkx as nx
graphs={}
for t,text in texts.items():
if text:
graphs[t]=pip.buildSemSTR(text,win_size=7).graph
else:
graphs[t]=nx.MultiDiGraph()
os.mkdir(args.graphs_dir)
for t,g in graphs.items():
print(t)
nx.write_gexf(g,os.path.join(args.graphs_dir,"{0}.gexf".format(t)))
# LOAD graph data and associated spatial entities of each graph
assC=json.load(open("associated_and_count_JPT.json"))
associated_es,count_per_doc=assC[0],assC[1]
graphs={}
for file in glob.glob(args.graphs_dir.rstrip("/")+"/*.gexf"):
id=int(re.findall("\d+",file)[0])
graphs[id]=nx.read_gexf(file)
#print("TEST associated_es and graphs",ass,gra)
# We take 50 documents chosen randomly. Then we test whether the top-10 returned documents are relevant !
selected_documents_=range(len(graphs))
from gmatch4py.ged.algorithm import graph_edit_dist as ged
from gmatch4py.ged.bipartite_graph_matching_2 import BP_2
def getLocInfo(id_):
data=get_data(id_)
if 'coord' in data:
return [data["coord"]["lat"],data["coord"]["lon"]]
return [0,0]
def get_associated_es(associated_es_data):
new_={}
for id_ in associated_es_data:
new_[id_]={"label":associated_es_data[id_],"coord":getLocInfo(id_)}
return new_
def getEdges4Draw(associated_es,edges):
data={}
for es in associated_es:
data[es]=getLocInfo(es)
lines=[]
for ed in edges:
try:
lines.append([data[ed[0]],data[ed[1]],ed[2]["color"]])
except:
print(ed)
return lines
# Generating Evaluation Output
top_ten_documents=[]
final_data={}
import copy
with ProgressBar(max_value=len(selected_documents_),widgets=[' [', Timer(), '] ',Bar(),' (', ETA(), ') ',]) as pg:
inc=0
for doc_s in selected_documents_:
id_json=str(doc_s)
#print(id_json)
bow_score=funcDict[args.distance](doc_s,copy.deepcopy(graphs))
top_10_docs_score=np.sort(bow_score)[1:11].astype(float)
top_10_docs=np.argsort(bow_score)[1:11].astype(int)
#print(top_10_docs)
final_data[id_json]={
"sp_entities":get_associated_es(associated_es[id_json]),
"text":texts[doc_s],
"edges":getEdges4Draw(associated_es[id_json],graphs[doc_s].edges(data=True))
}
final_data[id_json]["top_10"]=[]
for d in range(len(top_10_docs)):
doc_data={}
doc_data["score"]=top_10_docs_score[d]
doc_data["id_txt"]=int(top_10_docs[d])
doc_data["text"]=texts[int(top_10_docs[d])]
doc_data["sp_entities"]=get_associated_es(associated_es[str(doc_data["id_txt"])])
doc_data["edges"]=getEdges4Draw(associated_es[str(doc_data["id_txt"])],graphs[doc_data["id_txt"]].edges(data=True))
doc_data["relevant"]=None
final_data[id_json]["top_10"].append(doc_data)
inc+=1
pg.update(inc)
if not args.ignore:
open("graph_viewer/evalTopJPT10STR_{0}.json".format(args.distance),'w').write(json.dumps(final_data,indent=4))
......@@ -6,7 +6,7 @@ Gmatch4py is a library dedicated to graph matching. Graph structure are stored i
* DeltaCon and DeltaCon0 (*debug needed*) [1]
* Vertex Ranking (*debug needed*) [2]
* Vertex Edge Overlap [2
* Vertex Edge Overlap [2]
* Graph kernels
* Random Walk Kernel (*debug needed*) [3]
* Geometrical
......@@ -21,7 +21,8 @@ Gmatch4py is a library dedicated to graph matching. Graph structure are stored i
* Approximated Graph Edit Distance
* Hausdorff Graph Edit Distance
* Bipartite Graph Edit Distance
* Greedy Edit Distance
* Greedy Edit Distance
* MCS [6]
## Publications associated
......@@ -31,7 +32,7 @@ Gmatch4py is a library dedicated to graph matching. Graph structure are stored i
* [3] Vishwanathan, S. V. N., Schraudolph, N. N., Kondor, R., & Borgwardt, K. M. (2010). Graph kernels. Journal of Machine Learning Research, 11(Apr), 1201-1242.
* [4] Shervashidze, N., Schweitzer, P., Leeuwen, E. J. V., Mehlhorn, K., & Borgwardt, K. M. (2011). Weisfeiler-lehman graph kernels. Journal of Machine Learning Research, 12(Sep), 2539-2561.
* [5] Fischer, A., Riesen, K., & Bunke, H. (2017). Improved quadratic time approximation of graph edit distance by combining Hausdorff matching and greedy assignment. Pattern Recognition Letters, 87, 55-62.
* [6] Bunke, H., & Shearer, K. (1998). A graph distance metric based on the maximal common subgraph. Pattern Recognition Letters, 19(3-4), 255-259.
## Authors
......@@ -39,5 +40,4 @@ Jacques Fize
## TODO
* Add MCS + Jaccard
* Debug algorithms with --> (*debug needed*)
\ No newline at end of file
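For readers of this README, a hedged usage sketch of the compare interface the algorithms above expose (the import paths are taken from this commit; the toy graphs and the assumption that each class offers a static compare method mirror how the evaluation scripts call them):

```python
import networkx as nx
from gmatch4py.mcs import MCS
from gmatch4py.ged.hausdorff_edit_distance import HED

g1 = nx.complete_graph(4)
g2 = nx.path_graph(4)

sim_matrix  = MCS.compare([g1, g2])   # 2 x 2 similarity matrix (1.0 on the diagonal)
dist_matrix = HED.compare([g1, g2])   # 2 x 2 Hausdorff edit distance matrix
```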
__version__ = "0.1"
import os
\ No newline at end of file
......@@ -12,6 +12,6 @@ class ApproximateGraphEditDistance():
for i in range(n):
for j in range(i,n):
comparison_matrix[i,j]= GraphEditDistance(listgs[i],listgs[j],False,node_del=c_del_node,node_ins=c_ins_node,edge_del=c_del_edge,edge_ins=c_ins_edge).distance()
comparison_matrix[j,i]= comparison_matrix[i,j]
comparison_matrix[j,i]= comparison_matrix[i,j] # Questionable : AGED is not a symmetric measure, so mirroring the value is only an approximation !
return comparison_matrix
\ No newline at end of file
......@@ -24,7 +24,19 @@ class GeoGED(GraphEditDistance):
else:
self.g2_info=_cache_g_info[",".join(g2.nodes())]
@staticmethod
def compare(listgs, c_del_node=1, c_del_edge=1, c_ins_node=1, c_ins_edge=1):
n = len(listgs)
comparison_matrix = np.zeros((n, n))
for i in range(n):
for j in range(i, n):
comparison_matrix[i, j] = GraphEditDistance(listgs[i], listgs[j], False, node_del=c_del_node,
node_ins=c_ins_node, edge_del=c_del_edge,
edge_ins=c_ins_edge).distance()
comparison_matrix[j, i] = comparison_matrix[i, j] # Questionable : AGED is not a symmetric measure, so mirroring the value is only an approximation !
return comparison_matrix
def insert_geo_distance(self,node2):
# If one nodes given, compute average distance
......
......@@ -11,6 +11,17 @@ class GeoHED(HED):
"""Constructor for GeoHED"""
HED.__init__(self,node_del, node_ins, edge_del, edge_ins)
@staticmethod
def compare(listgs, c_del_node=1, c_del_edge=1, c_ins_node=1, c_ins_edge=1):
n = len(listgs)
comparator = GeoHED(c_del_node, c_ins_node, c_del_edge, c_ins_edge)
comparison_matrix = np.zeros((n, n))
for i in range(n):
for j in range(i, n):
comparison_matrix[i, j] = comparator.hed(listgs[i], listgs[j])
comparison_matrix[j, i] = comparison_matrix[i, j]
return comparison_matrix
def geo_distance(self,g1,g2,node1,node2):
g1_info = get_nodes_geolocalization(g1)
......
......@@ -19,10 +19,14 @@ class GeometricRandomWalkKernel():
comparison_matrix=np.zeros((n,n))
for i in range(n):
for j in range(i,n):
if len(listgs[i]) <1 or len(listgs[j]) <1:
comparison_matrix[i, j] = 0
comparison_matrix[j, i] = 0
continue
direct_product_graph=nx.tensor_product(listgs[i],listgs[j])
Ax = nx.adjacency_matrix(direct_product_graph).todense()
try:
la = 1/ GeometricRandomWalkKernel.maxDegree(direct_product_graph) # lambda value
la = 1/ ((GeometricRandomWalkKernel.maxDegree(direct_product_graph)**2)+1) # lambda value, kept below 1/lambda_max(Ax) so the geometric series converges
except:
la= pow(10,-6) # fallback lambda
eps = pow(10,-10)
......@@ -31,21 +35,20 @@ class GeometricRandomWalkKernel():
x=I_vec.copy()
x_pre=np.zeros(Ax.shape[0])
c=0
while (np.linalg.norm(x-x_pre)) > eps:
if c > 100:
break
x_pre=x
x= I_vec + la*(np.multiply(Ax,x_pre))
x= I_vec + la*np.dot(Ax,x_pre.T)
c+=1
comparison_matrix[i,j]=np.sum(x)
comparison_matrix[j,i]=comparison_matrix[i,j]
print(comparison_matrix)
for i in range(n):
for j in range(i,n):
comparison_matrix[i,j] = (comparison_matrix[i,j]/np.sqrt(comparison_matrix[i,i]*comparison_matrix[j,j]))
if i != j:
comparison_matrix[i,j]/=100 # Why ?
comparison_matrix[j,i]=comparison_matrix[i,j]
return comparison_matrix
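For context on the changed lambda above: the geometric random walk kernel counts walks of all lengths on the direct product graph, and the fixed-point loop only converges when lambda stays below the reciprocal of the largest eigenvalue of the product adjacency matrix; since that eigenvalue is bounded by the maximum degree, 1/(max_degree^2 + 1) is a safe, if conservative, choice. A rough statement of the quantity the loop approximates (notation mine, not from the source):

```latex
k_\times(G_1, G_2) \;=\; \sum_{k \ge 0} \lambda^k \, \mathbf{1}^\top A_\times^k \mathbf{1}
\;=\; \mathbf{1}^\top (I - \lambda A_\times)^{-1} \mathbf{1},
\qquad \text{convergent for } \lambda < 1/\lambda_{\max}(A_\times).
```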
......@@ -58,13 +61,18 @@ class KStepRandomWalkKernel():
dmax = max(degree_sequence)
return dmax
@staticmethod
def compare(listgs,lambda_list=[1,2,3],k=3):
def compare(listgs,lambda_list=[1,1,1]):
k=len(lambda_list)
if not len(lambda_list) == k:
raise AttributeError
n = len(listgs)
comparison_matrix=np.zeros((n,n))
for i in range(n):
for j in range(i,n):
if len(listgs[i]) <1 or len(listgs[j]) <1:
comparison_matrix[i, j] = 0
comparison_matrix[j, i] = 0
continue
direct_product_graph=nx.tensor_product(listgs[i],listgs[j])
Ax = nx.adjacency_matrix(direct_product_graph).todense()
eps = pow(10,-10)
......@@ -75,13 +83,11 @@ class KStepRandomWalkKernel():
ax_pow *= Ax
sum_ += lambda_list[kk] * ax_pow
comparison_matrix[i, j] = np.sum(sum_)
comparison_matrix[i, j] = np.sum(sum_)/(len(listgs[i])**2 * len(listgs[j])**2)
comparison_matrix[j,i] = comparison_matrix[i,j]
for i in range(n):
for j in range(i,n):
comparison_matrix[i,j] = comparison_matrix[i,j]/np.sqrt(comparison_matrix[i,i]*comparison_matrix[j,j])
if i != j:
comparison_matrix[i,j]/=100 # Why ?
comparison_matrix[j,i]=comparison_matrix[i,j]
return comparison_matrix
\ No newline at end of file
......@@ -13,9 +13,10 @@ import numpy as np
import networkx as nx
import copy
class WeisfeleirLehmanKernelGEO(object):
__type__ = "sim"
__depreciated__=True
@staticmethod
def compare(graph_list,h=2,verbose=False):
......
# coding = utf-8
import networkx as nx
import numpy as np
class MCS():
"""
A graph distance metric based on the maximal common subgraph, H. Bunke and K. Shearer,
Pattern Recognition Letters, 1998
"""
@staticmethod
def compare(listgs):
n = len(listgs)
comparison_matrix = np.zeros((n, n))
for i in range(n):
for j in range(i, n):
g1 = listgs[i]
g2 = listgs[j]
comparison_matrix[i, j] = MCS.s_mcs(g1,g2)
comparison_matrix[j, i] = comparison_matrix[i, j]
return comparison_matrix
@staticmethod
def intersect(a, b):
return list(set(a) & set(b))
@staticmethod
def transform_edges(ed):
for e in range(len(ed)):
if "id" in ed[e][-1]:
del ed[e][-1]["id"]
return ed
@staticmethod
def intersect_edges(g1, g2):
ed1 = MCS.transform_edges(g1.edges(data=True))
ed2 = MCS.transform_edges(g2.edges(data=True))
inter_ed = []
for e1 in ed1:
for e2 in ed2:
if e1 == e2:
inter_ed.append(e1)
return inter_ed
@staticmethod
def intersect_nodes(g1, g2):
return MCS.intersect(g1.nodes(), g2.nodes())
@staticmethod
def maximum_common_subgraph(g1, g2):
"""
Extract maximum common subgraph
"""
res = nx.MultiDiGraph()
res.add_nodes_from(MCS.intersect_nodes(g1, g2))
res.add_edges_from(MCS.intersect_edges(g1, g2))
return res
@staticmethod
def s_mcs(g1, g2):
return len(MCS.maximum_common_subgraph(g1, g2)) / max(len(g1), len(g2))
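A minimal usage sketch for the MCS class added here (toy graphs; the expected values are worked out by hand from the formula above):

```python
import networkx as nx

g1 = nx.MultiDiGraph()
g1.add_edges_from([("A", "B"), ("B", "C")])
g2 = nx.MultiDiGraph()
g2.add_edges_from([("A", "B")])

# Shared nodes {A, B} give a common subgraph of size 2, so s_mcs = 2 / max(3, 2).
MCS.s_mcs(g1, g2)       # ~0.667
MCS.compare([g1, g2])   # [[1.0, 0.667], [0.667, 1.0]] (values rounded)
```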
......@@ -76,7 +76,7 @@ def get_distance_two_entity(n1,n2,info1,info2):
#print(n1,info1[n1]["fr"],info2[n2]["fr"])
score+=0.5
else:
score+=4
score+=1
#if set(info1[n1]["class"]) and info2[n2]["class"]:
# score-=1
......
# coding = utf-8
import os, json, re, datetime, random, uuid, glob
from flask import Flask,jsonify, render_template, url_for, flash, make_response, request, redirect, session, Markup, jsonify
app = Flask(__name__)
dataFiles=glob.glob("evalTop10STR_*")
data_={}
for fn in dataFiles:
data_[fn.replace("evalTop10STR_","").rstrip(".json")]=fn
print(data_.keys())
@app.route("/<gmmeasure>")
def index(gmmeasure="GED"):
if not gmmeasure in data_.keys():
gmmeasure="GED"
return render_template("index.html",data=json.dumps(json.load(open(data_[gmmeasure]))),measureAvailable=list(data_.keys()))
if __name__ == '__main__':
app.run("0.0.0.0",port=5000,debug=True)
\ No newline at end of file
File moved
{
"database_json":"../resources/database_graph_viewer.db"
}
\ No newline at end of file
# coding = utf-8
import json
class Configuration(object):
def __init__(self, data):
self.__dict__=data
for d in self.__dict__:
if isinstance(self.__dict__[d],dict):
self.__dict__[d]=Configuration(self.__dict__[d])
def __getitem__(self, item):
return self.__dict__[item]
config = Configuration(json.load(open("config/config.json")))
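A hedged sketch of how this wrapper behaves, assuming config/config.json holds the fields added earlier in this commit:

```python
config.es_server           # attribute access -> "http://localhost:9200/"
config["database_json"]    # item access works too, via __getitem__
# Nested dictionaries are themselves wrapped in Configuration objects,
# so deeper settings would read as config.section.key.
```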
# coding = utf-8
# -*- coding: utf-8 -*-
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column,Integer,String, Boolean, ForeignKey, Text
from sqlalchemy.orm import sessionmaker
from config.configuration import config
import bcrypt
engine = create_engine('sqlite:///'+config.database_json, echo=True)
Base = declarative_base()
class User(Base):
__tablename__="users"
id = Column(Integer, primary_key=True)
name = Column(String(80), unique=True)
email = Column(String(120), unique=True)
password = Column(String(120))
level = Column(Integer)
def __init__(self, name, email,password,level=1,sign_up=True):
self.name=name
self.email=email
self.password= password
if sign_up:self.password=bcrypt.hashpw(password, bcrypt.gensalt(14))
self.level=level
def check_password(self,password):
return self.password == bcrypt.hashpw(password.encode(), self.password)
def is_authenticated(self):
return True
def is_active(self):
return True
def is_anonymous(self):
return False
def get_id(self):
return str(self.id)
class Annotation(Base):
__tablename__="annotations"
id = Column(Integer, primary_key=True)
type_annotation = Column(String(120))
user_id = Column(Integer,ForeignKey('users.id'))
data=Column(Text)
finished = Column(Boolean)
def __init__(self, type_annotation,user_id,data,finished=False):
self.type_annotation=type_annotation
self.user_id=user_id
self.finished=finished
self.data=data
###################################################
# Database Population functions
###################################################
def add_users(session,data):
for d in range(len(data)):
line=data.iloc[[d]].values[0]
user=User(line[0],line[1],line[2].encode(),line[3])
session.add(user)
session.commit()
if __name__ == '__main__':
# Create tables (delete if exists)
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)
# Initialize session
Session = sessionmaker(bind=engine)
session = Session()
# Load data
user_input=pd.read_csv("user.csv",sep=";")
# Populate the database
add_users(session,user_input)
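A small hedged sketch of how the User model above is meant to be used at signup and login time (names and credentials are invented; session is a SQLAlchemy session as created in the __main__ block):

```python
# Passwords are hashed only when sign_up=True, as in User.__init__ above.
user = User("alice", "alice@example.org", "s3cret".encode(), level=1, sign_up=True)
session.add(user)
session.commit()

# Later, at login:
stored = session.query(User).filter_by(email="alice@example.org").first()
stored.check_password("s3cret")   # True: bcrypt re-hashes with the stored salt
```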
# coding = utf-8
import os, json, re, datetime, random, uuid, glob
from flask import Flask, render_template, url_for, flash, make_response, request, redirect, session, Markup, jsonify
from flask_session import Session
from flask_login import LoginManager, login_user, logout_user, current_user, login_required
from db import *
app = Flask(__name__)
Sessiona = sessionmaker(bind=engine)
sql_session = Sessiona()
login_manager = LoginManager()
login_manager.init_app(app)
"""
Load results files
"""
dataFiles=glob.glob("evalTop10STR_*")
data_={}
for fn in dataFiles:
data_[fn.replace("evalTop10STR_","").rstrip(".json")]=fn
print("File Available",data_.keys())
@app.route("/")
@app.route("/<gmmeasure>")
@login_required
def index(gmmeasure="GED"):
"""
Home Route
:param gmmeasure:
:return:
"""
if not gmmeasure in data_.keys():
gmmeasure="GED"
return render_template("index.html",data=json.dumps(json.load(open(data_[gmmeasure]))),measureAvailable=list(data_.keys()),measure=gmmeasure)
@app.route("/about")
def about():
return render_template("about.html",measureAvailable=list(data_.keys()))
@app.route("/save")
@login_required
def save():
pass
###################################################
# User Login/Signup/Logout managment
###################################################
@app.route('/login', methods=['GET', 'POST'])
def login():
"""
User login
"""
# If already logged in
if current_user.is_authenticated:
return redirect("/")
# Login page render
if request.method == 'GET':
return render_template('login.html',measureAvailable=list(data_.keys()))
# Get necessary variable
email = request.form['email']
password = request.form['password']
registered_user = sql_session.query(User).filter_by(email=email).first()
# Error message
error = Markup(
'<strong>Email</strong> or <strong>Password</strong> is invalid')
# If no user found
if registered_user is None:
flash(Markup(error), 'error')
return redirect(url_for('login'))
# If password is incorrect
if not registered_user.check_password(password):
flash(error, 'error')
return redirect(url_for('login'))
# Logged the user
login_user(registered_user)
return redirect(request.args.get('next') or url_for('index'))
@app.route('/signup', methods=['GET', 'POST'])
@login_required
def signup():
"""
User signup
"""
# If already logged in
if current_user.level != 1:
return redirect("/")
# Login page render
if request.method == 'GET':
return render_template('signup.html',measureAvailable=list(data_.keys()))
# Get necessary variable
name = request.form['name']
email = request.form['email']
password = request.form['password']
password2 = request.form['password_2']
if name and email and password:
if password == password2:
user=User(name,email,password.encode(),1)
sql_session.add(user)
sql_session.commit()
flash(Markup('Account for {0} is created !'.format(name)), 'success')
else:
error = Markup('Indicate two identical password !')
flash(error, 'danger')
else:
error = Markup('<strong>Email</strong> or <strong>Password</strong> or <strong>Name</strong> is empty')
flash(error, 'danger')
return redirect("/signup")
@app.route('/logout')
def logout():
"""
Logout page
"""
logout_user()
session.clear()
return redirect("/")
###################################################
# Login Manager Functions Overrided
###################################################
@login_manager.user_loader
def load_user(id):
return sql_session.query(User).get(int(id))
@login_manager.unauthorized_handler
def unauthorized_handler():
return redirect("/login")
if __name__ == '__main__':
app.secret_key = os.urandom(24)
app.run("0.0.0.0",port=5000,debug=True)
\ No newline at end of file