From fbcd363465474f0f8fed385282957873f3128c47 Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Fri, 29 Jun 2018 12:10:11 +0200
Subject: [PATCH] - Update Gmatch4py - new interface to annotate spatial
 similarity - New adjacency source

---
 config/config.json                                 |   4 +-
 eval.py                                            | 206 ++---
 exp_22_may.sh                                      |  55 +-
 gmatch4py/data/source/source1.txt                  |  36 -
 gmatch4py/exception/__init__.py                    |   8 +-
 .../ged/algorithm/abstract_graph_edit_dist.py      | 112 ---
 gmatch4py/ged/algorithm/edge_edit_dist.py          |  29 -
 gmatch4py/ged/algorithm/graph_edit_dist.py         |  71 --
 gmatch4py/ged/approximate_ged.py                   |  20 -
 gmatch4py/ged/bipartite_graph_matching_2.py        | 147 ----
 gmatch4py/ged/graph/edge_graph.py                  |  16 -
 gmatch4py/ged/greedy_edit_distance.py              |  44 -
 gmatch4py/ged/hausdorff_edit_distance.py           | 145 ----
 gmatch4py/helpers/__init__.py                      |   1 -
 gmatch4py/helpers/networkx_parser.py               | 148 ----
 gmatch4py/kernels/weisfeiler_lehman.py             | 136 ---
 gmatch4py_cython/gmatch4py/bag_of_cliques.pyx      | 109 ++-
 .../gmatch4py/ged/approximate_ged.pyx              |  33 +-
 .../ged/bipartite_graph_matching_2.pyx             |   2 +-
 .../gmatch4py/ged/greedy_edit_distance.pyx         |   2 +-
 .../gmatch4py/ged/hausdorff_edit_distance.pyx      |  12 +-
 gmatch4py_cython/gmatch4py/jaccard.pyx             |   7 +-
 .../gmatch4py/kernels/weisfeiler_lehman.pyx        |   2 +-
 .../gmatch4py/vertex_edge_overlap.pyx              |  30 +-
 gui_graph_viewer/db.py                             |  22 +-
 gui_graph_viewer/server.py                         |  93 ++
 gui_graph_viewer/static/css/dashboard.css          |   6 +-
 gui_graph_viewer/static/js/helpers.js              |  10 +-
 gui_graph_viewer/templates/indexv2.html            | 189 +++++
 helpers/gazeteer_helpers.py                        |  24 +-
 models/str.py                                      |  12 +-
 nlp/bow_se.py                                      |   2 +-
 nlp/disambiguator/models/bigram.py                 |  22 +-
 nlp/disambiguator/wikipedia_cooc.py                |  89 +-
 notebooks/EvalDesambiguisationMada.ipynb           | 794 +++++++++++++++++-
 notebooks/EvalDesambiguisationPADIWEB.ipynb        | 187 ++++-
 tools.py                                           |   2 +-
 37 files changed, 1616 insertions(+), 1211 deletions(-)
 delete mode 100644 gmatch4py/data/source/source1.txt
 delete mode 100644 gmatch4py/ged/algorithm/abstract_graph_edit_dist.py
 delete mode 100644 gmatch4py/ged/algorithm/edge_edit_dist.py
 delete mode 100644 gmatch4py/ged/algorithm/graph_edit_dist.py
 delete mode 100644 gmatch4py/ged/approximate_ged.py
 delete mode 100644 gmatch4py/ged/bipartite_graph_matching_2.py
 delete mode 100644 gmatch4py/ged/graph/edge_graph.py
 delete mode 100644 gmatch4py/ged/greedy_edit_distance.py
 delete mode 100644 gmatch4py/ged/hausdorff_edit_distance.py
 delete mode 100644 gmatch4py/helpers/__init__.py
 delete mode 100644 gmatch4py/helpers/networkx_parser.py
 delete mode 100644 gmatch4py/kernels/weisfeiler_lehman.py
 create mode 100644 gui_graph_viewer/templates/indexv2.html

diff --git a/config/config.json b/config/config.json
index eefac00..c96b03b 100644
--- a/config/config.json
+++ b/config/config.json
@@ -3,9 +3,9 @@
   "stanford_nlp_home":"/Users/jacquesfize/.services/stanford-corenlp-full-2017-06-09",
   "osm_boundaries_directory":"/Users/jacquesfize/install",
   "core_nlp_URL":"http://localhost:9000",
-  "es_server_old":"http://192.168.1.15:9200/",
+  "es_server_old":"http://172.16.10.11:9200/",
   "es_server":"http://localhost:9200/",
-  "database_json":"resources/database_exp_12_mars.db",
+  "database_json":"resources/database_exp_25_may.db",
   "log_file":"extract_log",
   "wiki_cooc_dis":{
     "cooc_freq":"/Users/jacquesfize/nas_cloud/Code/str-python/resources/coocurrence_wiki.pkl",
diff --git a/eval.py b/eval.py
index 48b8e81..9e809f9 100644
--- a/eval.py
+++ b/eval.py
@@ -21,112 +21,127 @@ from nlp.bow_se import BOWSE
 from pipeline import *

 # Function for output generation
-def_temp=[36,-36]
-temp=def_temp
-max_temp=-30
-dec=5
+def_temp = [36, -36]
+temp = def_temp
+max_temp = -30
+dec = 5
+
 def getLocInfo(id_):
-    global temp,dec
+    global temp, dec
     try:
-        data=get_data(id_)
+        data = get_data(id_)
         if 'coord' in data:
-            return [data["coord"]["lat"],data["coord"]["lon"]]
+            return [data["coord"]["lat"], data["coord"]["lon"]]
         else:
-            temp = [temp[0] , temp[1]+dec]
+            temp = [temp[0], temp[1] + dec]
             if temp[1] >= max_temp:
-                temp = [temp[0] +dec, def_temp[1]]
+                temp = [temp[0] + dec, def_temp[1]]
             return temp
     except:
         pass
+
 def get_associated_es(associated_es_data):
     global temp
-    new_={}
-    temp=def_temp
+    new_ = {}
+    temp = def_temp
     for id_ in associated_es_data:
         try:
-            new_[id_]={"label":get_data(id_)["en"],"coord":getLocInfo(id_)}
+            new_[id_] = {"label": get_data(id_)["en"], "coord": getLocInfo(id_)}
         except:
             new_[id_] = {"label": id_, "coord": getLocInfo(id_)}
     return new_

-def getEdges4Draw(data,edges):
-    lines=[]
+
+def getEdges4Draw(data, edges):
+    lines = []
     for ed in edges:
-        lines.append([data[ed[0]]["coord"],data[ed[1]]["coord"],ed[2]["color"]])
+        lines.append([data[ed[0]]["coord"], data[ed[1]]["coord"], ed[2]["color"]])
         if lines[-1][-1] == "cyan":
             lines[-1][-1] = "blue";
     return lines

+
 # Similarity Function between graph and a set of graphs
-def compareMCS(graphs,selected):
-    return 1-MCS.compare(graphs,selected)
+def compareMCS(graphs, selected):
+    return 1 - MCS.compare(graphs, selected)
+
+
 # GED algorithm
-def compareGED(graphs,selected):
-    return ApproximateGraphEditDistance.compare(graphs,selected)
+def compareGED(graphs, selected):
+    return ApproximateGraphEditDistance.compare(graphs, selected)
+

-def compareBP2(graphs,selected):
-    return BP_2.compare(graphs,selected)
+def compareBP2(graphs, selected):
+    return BP_2.compare(graphs, selected)
+

-def compareHED(graphs,selected):
-    return HED.compare(graphs,selected)
+def compareHED(graphs, selected):
+    return HED.compare(graphs, selected)
+

-def compareGreedy(graphs,selected):
-    return GreedyEditDistance.compare(graphs,selected)
+def compareGreedy(graphs, selected):
+    return GreedyEditDistance.compare(graphs, selected)
+

-def compareWLSubTreeKernel(graphs,selected):
-    return 1 - WeisfeleirLehmanKernel.compare(graphs,selected,h=3)
+def compareWLSubTreeKernel(graphs, selected):
+    return 1 - WeisfeleirLehmanKernel.compare(graphs, selected, h=3)
+

-def compareBOWSE(graphs,selected):
-    return 1-BOWSE.compare(graphs,selected)
+def compareBOWSE(graphs, selected):
+    return 1 - BOWSE.compare(graphs, selected)
+

-def compareBOC(graphs_array,selected):
-    return 1 - BagOfCliques.compare(graphs_array,selected)
+def compareBOC(graphs_array, selected):
+    return np.ones((len(graphs_array),len(graphs_array))) - BagOfCliques.compare(graphs_array, selected)

-def compareVEO(graphs_array,selected):
-    return 1 - VertexEdgeOverlap.compare(graphs_array,selected)

-def compareJaccard(graphs_array,selected):
-    return 1 - Jaccard.compare(graphs_array,selected)

-funcDict={
-    "MCS":compareMCS,
-    "VEO":compareVEO,
-    "GED":compareGED,
-    "BP2":compareBP2,
-    "HED":compareHED,
-    "GREEDY":compareGreedy,
-    "WLSUBTREE":compareWLSubTreeKernel,
-    "BOWSE":compareBOWSE,
-    "BOC":compareBOC,
-    "JACCARD":compareJaccard
-}
+def compareVEO(graphs_array, selected):
+    return 1 - VertexEdgeOverlap.compare(graphs_array, selected)
+
+
+def compareJaccard(graphs_array, selected):
+    return 1 - Jaccard.compare(graphs_array, selected)
+
+
+funcDict = {
+    "MCS": compareMCS,
+    "VEO": compareVEO,
+    "GED": compareGED,
+    "BP2": compareBP2,
+    "HED": compareHED,
+    "GREEDY": compareGreedy,
+    "WLSUBTREE": compareWLSubTreeKernel,
+    "BOWSE": compareBOWSE,
+    "BOC": compareBOC,
+    "JACCARD": compareJaccard
+}

 import argparse
+
 parser = argparse.ArgumentParser()
 parser.add_argument("distance")
 parser.add_argument("texts_dir")
 parser.add_argument("graphs_dir")
 parser.add_argument("metadata_fn")
 parser.add_argument("original_dir")
-parser.add_argument("-s","--selectedGraph")
-parser.add_argument("-a","--all",action="store_true")
-parser.add_argument("-o","--output",help="Output Filename")
+parser.add_argument("-s", "--selectedGraph")
+parser.add_argument("-a", "--all", action="store_true")
+parser.add_argument("-o", "--output", help="Output Filename")

 args = parser.parse_args()
-original_dir=args.original_dir
+original_dir = args.original_dir

 if not args.distance in funcDict.keys():
-    raise NotFoundDistance(args.distance,funcDict)
+    raise NotFoundDistance(args.distance, funcDict)
     exit()

 # Load all the text from the corpus
-texts=[]
+texts = []
 if os.path.exists(args.texts_dir):
-    files_glob= glob.glob(args.texts_dir+"/*.txt")
+    files_glob = glob.glob(args.texts_dir + "/*.txt")
     texts = [""] * len(files_glob)
     for fn in files_glob:
         id = int(re.findall("\d+", fn)[-1])
@@ -150,24 +165,24 @@ if not texts:

 # Load graph data and associated spatial entities of each graph
-assC=json.load(open(args.metadata_fn))
-associated_es,count_per_doc=assC[0],assC[1]
+assC = json.load(open(args.metadata_fn))
+associated_es, count_per_doc = assC[0], assC[1]

-graphs={}
-for file in glob.glob(args.graphs_dir.rstrip("/")+"/*.gexf"):
-    id=int(re.findall("\d+",file)[-1])
-    graphs[id]=nx.read_gexf(file)
+graphs = {}
+for file in glob.glob(args.graphs_dir.rstrip("/") + "/*.gexf"):
+    id = int(re.findall("\d+", file)[-1])
+    graphs[id] = nx.read_gexf(file)

-graphs_array = [None for i in range(max(graphs.keys())+1)]
-for i,g in graphs.items():
-    graphs_array[i]=g
+graphs_array = [nx.Graph() for i in range(max(graphs.keys()) + 1)]
+for i, g in graphs.items():
+    graphs_array[i] = g

 # We take 50 documents chosen randomly. Then we test, if the top-10 returned documents are relevant !
 if args.all:
-    selected_documents_=list(graphs.keys())
+    selected_documents_ = list(graphs.keys())
 elif args.selectedGraph:
-    selected_documents_=json.load(open(args.selectedGraph))
+    selected_documents_ = json.load(open(args.selectedGraph))
 # if args.all:
 #     selected_documents_=list(graphs.keys())
 # else:
@@ -186,36 +201,34 @@ elif args.selectedGraph:

 # Generating Evaluation Output
-top_ten_documents=[]
-final_data={}
+top_ten_documents = []
+final_data = {}

-deb=time.time()
+deb = time.time()
 print("Computing Similarity Matrix ...")
-similarity_matrix = funcDict[args.distance](graphs_array,selected_documents_)
-print("Similarity Matrix Computed in {0} s.".format(time.time()-deb))
-
-graphs={}
-for file in glob.glob(original_dir.rstrip("/")+"/*.gexf"):
-    id=int(re.findall("\d+",file)[-1])
-    graphs[id]=nx.read_gexf(file)
-
+similarity_matrix = funcDict[args.distance](graphs_array, selected_documents_)
+print("Similarity Matrix Computed in {0} s.".format(time.time() - deb))
+graphs = {}
+for file in glob.glob(original_dir.rstrip("/") + "/*.gexf"):
+    id = int(re.findall("\d+", file)[-1])
+    graphs[id] = nx.read_gexf(file)
 nn_ = 5
-with ProgressBar(max_value=len(selected_documents_),widgets=[' [', Timer(), '] ',Bar(),' (', ETA(), ') ',]) as pg:
-    inc=0
+with ProgressBar(max_value=len(selected_documents_), widgets=[' [', Timer(), '] ', Bar(), ' (', ETA(), ') ', ]) as pg:
+    inc = 0
     for doc_s in selected_documents_:
-        if not len(graphs[doc_s])>0:
+        if not len(graphs[doc_s]) > 0:
             continue
-        bow_score=similarity_matrix[doc_s]
+        bow_score = similarity_matrix[doc_s]
         top_docs_score = np.sort(bow_score).astype(float)
         top_docs = np.argsort(bow_score).astype(int)
-        final_data[doc_s]={
-            "sp_entities":get_associated_es(graphs[doc_s].nodes()),
-            "text":texts[doc_s],
+        final_data[doc_s] = {
+            "sp_entities": get_associated_es(graphs[doc_s].nodes()),
+            "text": texts[doc_s],
         }
-        final_data[doc_s]["edges"] = getEdges4Draw(final_data[doc_s]["sp_entities"],graphs[doc_s].edges(data=True))
+        final_data[doc_s]["edges"] = getEdges4Draw(final_data[doc_s]["sp_entities"], graphs[doc_s].edges(data=True))
         final_data[doc_s]["topk"] = []
         n_top_docs = len(top_docs)
         for d in range(n_top_docs):
@@ -223,22 +236,25 @@ with ProgressBar(max_value=len(selected_documents_),widgets=[' [', Timer(), '] '
                 continue
             if len(final_data[doc_s]["topk"]) == nn_:
                 break
-            doc_data={}
-            doc_data["score"]=top_docs_score[d]
-            doc_data["id_txt"]=int(top_docs[d])
-            doc_data["text"]=""#texts[int(top_10_docs[d])]
-            doc_data["sp_entities"]=get_associated_es(graphs[doc_data["id_txt"]].nodes())
-            doc_data["edges"]=getEdges4Draw(doc_data["sp_entities"],graphs[doc_data["id_txt"]].edges(data=True))
-            doc_data["relevant"]=None
+            doc_data = {}
+            doc_data["score"] = top_docs_score[d]
+            doc_data["id_txt"] = int(top_docs[d])
+            doc_data["text"] = ""  # texts[int(top_10_docs[d])]
+            doc_data["sp_entities"] = get_associated_es(graphs[doc_data["id_txt"]].nodes())
+            doc_data["edges"] = getEdges4Draw(doc_data["sp_entities"], graphs[doc_data["id_txt"]].edges(data=True))
+            doc_data["relevant"] = None
             final_data[doc_s]["topk"].append(doc_data)
-        inc+=1
+        inc += 1
         pg.update(inc)

 if not args.output:
     print("Saved in gui_graph_viewer/evalTop10STR_{0}.json".format(args.distance))
-    open("gui_graph_viewer/evalTop10STR_{0}.json".format(args.distance),'w').write(json.dumps(final_data,indent=4))
+    open("gui_graph_viewer/evalTop10STR_{0}.json".format(args.distance), 'w').write(json.dumps(final_data, indent=4))
 else:
-    print("Saved in {0}/evalTop10STR_{1}.json".format(args.output,args.distance))
+    print("Saved in {0}/evalTop10STR_{1}.json".format(args.output, args.distance))
     if not os.path.exists(args.output):
         os.makedirs(args.output)
-    open("{0}/evalTop10STR_{1}.json".format(args.output.rstrip("/"),args.distance), 'w').write(json.dumps(final_data, indent=4))
+    open("{0}/evalTop10STR_{1}.json".format(args.output.rstrip("/"), args.distance), 'w').write(
+        json.dumps(final_data, indent=4))
+
+
diff --git a/exp_22_may.sh b/exp_22_may.sh
index ae037f3..1ef7e44 100755
--- a/exp_22_may.sh
+++ b/exp_22_may.sh
@@ -1,6 +1,7 @@
 #!/usr/bin/env bash

 path_csv=/Users/jacquesfize/LOD_DATASETS/disambiguation
+path_texts=/Users/jacquesfize/LOD_DATASETS/raw_bvlac/
 output_dir=data/graph_exp_may_25

 if [ "$1" == "generate" ]; then
@@ -22,44 +23,48 @@ fi

 if [ "$1" == "eval" ]; then
     ## Normal STR eval
-    original=data/graph_exp_may_24/normal
+    original=$output_dir/normal;
     dir=normal;
-    mesure=("MCS" "VEO" "JACCARD" "HED" "GREEDY" "GED" "BOC" "BOWSE");
-    for me in ${mesure[@]}; do
-        echo $me" for STR "$dir;
-        python3 eval.py "$me" "$path_texts" "$output_dir/$dir" "$output_dir/$dir/asso.json" "$original" -s "$output_dir/selected.json" -o "$output_dir/result_eval/$dir/";
-    done;
+    #mesure=("MCS" "VEO" "JACCARD" "HED" "GREEDY" "GED" "BOC" "BOWSE");
+#    mesure=("BOC" "JACCARD");
+#    for me in ${mesure[@]}; do
+#        echo $me" for STR "$dir;
+#        python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
+#    done;

-    ## Generalised STR eval
-    dir=gen_all_1
-    mesure=( "MCS" "VEO" "JACCARD" "HED" "GREEDY" "GED" "BOC" "BOWSE");
-    for me in ${mesure[@]}; do
-        echo $me" for STR "$dir;
-        python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
-    done;
+#    ## Generalised STR eval
+#    dir=gen_all_1
+#    mesure=( "MCS" "VEO" "JACCARD" "HED" "GREEDY" "GED" "BOC" "BOWSE");
+#    for me in ${mesure[@]}; do
+#        echo $me" for STR "$dir;
+#        python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
+#    done;

 #    dir=gen_all_2
 #    mesure=( "MCS" "VEO" "JACCARD" "HED" "GREEDY" "BOC" "BOWSE");
 #    for me in ${mesure[@]}; do
 #        echo $me" for STR "$dir;
 #        python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
+#    done;

+    #mesure=("MCS" "VEO" "JACCARD" "HED" "GREEDY" "GED" "BOWSE");
+#    dir=gen_region
+#    for me in ${mesure[@]}; do
+#        echo $me" for STR "$dir;
+#        python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
 #    done;
-    dir=gen_region
-    for me in ${mesure[@]}; do
-        echo $me" for STR "$dir;
-        python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
-    done;
-
-    dir=gen_country
-    for me in ${mesure[@]}; do
-        echo $me" for STR "$dir;
-        python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
-    done;
+#    "BOWSE");
+    mesure=("MCS" "VEO" "JACCARD" "HED" "GREEDY" "GED" "BOC");
+#    dir=gen_country
+#    for me in ${mesure[@]}; do
+#        echo $me" for STR "$dir;
+#        python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json
-o $output_dir/result_eval/$dir/; +# done;bn ## Extended STR eval + # "BOWSE"); dir=extension_1 - mesure=( "MCS" "VEO" "JACCARD" "BOC" "WLSUBTREE" "BOWSE"); + mesure=( "MCS" "VEO" "JACCARD" "BOC" "HED" "GREEDY" "GED"); for me in ${mesure[@]}; do echo $me" for STR "$dir; python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/; diff --git a/gmatch4py/data/source/source1.txt b/gmatch4py/data/source/source1.txt deleted file mode 100644 index de26da6..0000000 --- a/gmatch4py/data/source/source1.txt +++ /dev/null @@ -1,36 +0,0 @@ -{ - "id": "source1.txt-1", - "sentenceNumber": 1, - "length": 17, - "tokens": [{ - "id": "1", - "lemma": "Haakon", - "deprel": "nsubj", - "word": "Haakon", - "rel": "4", - "pos": "NNP" - }, { - "id": "2", - "lemma": "be", - "deprel": "cop", - "word": "is", - "rel": "4", - "pos": "VBZ" - }, { - "id": "3", - "lemma": "my", - "deprel": "poss", - "word": "my", - "rel": "4", - "pos": "PRP$" - }, { - "id": "4", - "lemma": "name", - "deprel": "null", - "word": "name", - "rel": "0", - "pos": "NN" - }], - "filename": "source1.txt", - "offset": 0 -} diff --git a/gmatch4py/exception/__init__.py b/gmatch4py/exception/__init__.py index 1d997a2..950f635 100644 --- a/gmatch4py/exception/__init__.py +++ b/gmatch4py/exception/__init__.py @@ -1,7 +1 @@ -# coding = utf-8 -from termcolor import colored -class NotFoundDistance(Exception): - def __init__(self,dd,distanceFunctionDict): - # Call the base class constructor with the parameters it needs - super(Exception, self).__init__(colored("{0} is not an edit distance implemented ! Select a distance from : {1}".format(dd,",".join(distanceFunctionDict.keys())),"red")) - +# coding = utf-8 \ No newline at end of file diff --git a/gmatch4py/ged/algorithm/abstract_graph_edit_dist.py b/gmatch4py/ged/algorithm/abstract_graph_edit_dist.py deleted file mode 100644 index e0a1d3b..0000000 --- a/gmatch4py/ged/algorithm/abstract_graph_edit_dist.py +++ /dev/null @@ -1,112 +0,0 @@ -# -*- coding: UTF-8 -*- -from __future__ import print_function - -import sys - -import numpy as np -from scipy.optimize import linear_sum_assignment - - -class AbstractGraphEditDistance(object): - def __init__(self, g1, g2,debug=False,**kwargs): - self.g1 = g1 - self.g2 = g2 - self.debug=debug - - self.node_del = kwargs.get("node_del",1) - self.node_ins = kwargs.get("node_ins",1) - self.edge_del = kwargs.get("edge_del",1) - self.edge_ins = kwargs.get("edge_ins",1) - - - def distance(self): - opt_path = self.edit_costs() - if self.debug: - print("Edit path for ",str(self.__class__.__name__),"\n",opt_path) - return sum(opt_path) - - def print_operations(self,cost_matrix,row_ind,col_ind): - nodes1 = self.g1.nodes() - nodes2 = self.g2.nodes() - dn1 = self.g1.node - dn2 = self.g2.node - - n,m=len(nodes1),len(nodes2) - for i in range(len(row_ind)): - y,x=row_ind[i],col_ind[i] - val=cost_matrix[row_ind[i]][col_ind[i]] - if x<m and y<n: - print("SUB {0} to {1} cost = {2}".format(dn1[nodes1[y]]["label"],dn2[nodes2[x]]["label"],val)) - elif x <m and y>=n: - print("ADD {0} cost = {1}".format(dn2[nodes2[y-n]]["label"],val)) - elif x>=m and y<n: - print("DEL {0} cost = {1}".format(dn1[nodes1[m-x]]["label"],val)) - - def edit_costs(self): - cost_matrix = self.create_cost_matrix() - if self.debug: - np.set_printoptions(precision=3) - print("Cost Matrix for ",str(self.__class__.__name__),"\n",cost_matrix) - - row_ind,col_ind = linear_sum_assignment(cost_matrix) - if self.debug: - 
self.print_operations(cost_matrix,row_ind,col_ind) - return [cost_matrix[row_ind[i]][col_ind[i]] for i in range(len(row_ind))] - - def create_cost_matrix(self): - """ - Creates a |N+M| X |N+M| cost matrix between all nodes in - graphs g1 and g2 - Each cost represents the cost of substituting, - deleting or inserting a node - The cost matrix consists of four regions: - - substitute | insert costs - ------------------------------- - delete | delete -> delete - - The delete -> delete region is filled with zeros - """ - n = len(self.g1) - m = len(self.g2) - cost_matrix = np.zeros((n+m,n+m)) - #cost_matrix = [[0 for i in range(n + m)] for j in range(n + m)] - nodes1 = self.g1.nodes() - nodes2 = self.g2.nodes() - - for i in range(n): - for j in range(m): - cost_matrix[i,j] = self.substitute_cost(nodes1[i], nodes2[j]) - - for i in range(m): - for j in range(m): - cost_matrix[i+n,j] = self.insert_cost(i, j, nodes2) - - for i in range(n): - for j in range(n): - cost_matrix[j,i+m] = self.delete_cost(i, j, nodes1) - - self.cost_matrix = cost_matrix - return cost_matrix - - def insert_cost(self, i, j): - raise NotImplementedError - - def delete_cost(self, i, j): - raise NotImplementedError - - def substitute_cost(self, nodes1, nodes2): - raise NotImplementedError - - def print_matrix(self): - print("cost matrix:") - print(self.g1.nodes()) - print(self.g2.nodes()) - print(np.array(self.create_cost_matrix())) - for column in self.create_cost_matrix(): - for row in column: - if row == sys.maxsize: - print ("inf\t") - else: - print ("%.2f\t" % float(row)) - print("") diff --git a/gmatch4py/ged/algorithm/edge_edit_dist.py b/gmatch4py/ged/algorithm/edge_edit_dist.py deleted file mode 100644 index 74ef2e9..0000000 --- a/gmatch4py/ged/algorithm/edge_edit_dist.py +++ /dev/null @@ -1,29 +0,0 @@ -import sys - -from gmatch4py_old.ged.algorithm.abstract_graph_edit_dist import AbstractGraphEditDistance - - -class EdgeEditDistance(AbstractGraphEditDistance): - """ - Calculates the graph edit distance between two edges. - A node in this context is interpreted as a graph, - and edges are interpreted as nodes. - """ - - def __init__(self, g1, g2,**kwargs): - AbstractGraphEditDistance.__init__(self, g1, g2,**kwargs) - - def insert_cost(self, i, j, nodes2): - if i == j: - return self.edge_ins - return sys.maxsize - - def delete_cost(self, i, j, nodes1): - if i == j: - return self.edge_del - return sys.maxsize - - def substitute_cost(self, edge1, edge2): - if edge1 == edge2: - return 0. 
- return self.edge_del+self.edge_ins diff --git a/gmatch4py/ged/algorithm/graph_edit_dist.py b/gmatch4py/ged/algorithm/graph_edit_dist.py deleted file mode 100644 index 210e15e..0000000 --- a/gmatch4py/ged/algorithm/graph_edit_dist.py +++ /dev/null @@ -1,71 +0,0 @@ -# -*- coding: UTF-8 -*- - -import sys - -import networkx as nx - -from gmatch4py_old.ged.algorithm.abstract_graph_edit_dist import AbstractGraphEditDistance -from gmatch4py_old.ged.algorithm.edge_edit_dist import EdgeEditDistance -from gmatch4py_old.ged.graph.edge_graph import EdgeGraph - - -def compare(g1, g2, print_details=False): - ged = GraphEditDistance(g1, g2,print_details) - return ged.distance() - - -class GraphEditDistance(AbstractGraphEditDistance): - - def __init__(self, g1, g2,debug=False,**kwargs): - AbstractGraphEditDistance.__init__(self, g1, g2,debug,**kwargs) - - def substitute_cost(self, node1, node2): - return self.relabel_cost(node1, node2) + self.edge_diff(node1, node2) - - def relabel_cost(self, node1, node2): - if node1 == node2: - edges1=set(self.get_edge_multigraph(self.g1,node1)) - edges2=set(self.get_edge_multigraph(self.g2,node2)) - return abs(len(edges2.difference(edges1))) # Take in account if there is a different number of edges - else: - return self.node_ins+self.node_del - - def delete_cost(self, i, j, nodes1): - if i == j: - return self.node_del+self.g1.degree(nodes1[i]) # Deleting a node implicate to delete in and out edges - return sys.maxsize - - def insert_cost(self, i, j, nodes2): - if i == j: - deg=self.g2.degree(nodes2[j]) - if isinstance(deg,dict):deg=0 - return self.node_ins+deg - else: - return sys.maxsize - - def get_edge_multigraph(self,g,node): - edges=[] - for id_,val in g.edge[node].items(): - if not 0 in val: - edges.append(str(id_) + val["color"]) - else: - for _,edge in val.items(): - edges.append(str(id_)+edge["color"]) - return edges - - def edge_diff(self, node1, node2): - if isinstance(self.g1,nx.MultiDiGraph): - edges1 = self.get_edge_multigraph(self.g1,node1) - edges2 = self.get_edge_multigraph(self.g2,node2) - else: - edges1 = list(self.g1.edge[node1].keys()) - edges2 = list(self.g2.edge[node2].keys()) - if len(edges1) == 0 or len(edges2) == 0: - return max(len(edges1), len(edges2)) - - edit_edit_dist = EdgeEditDistance( - EdgeGraph(node1,edges1), - EdgeGraph(node2,edges2), - edge_del=self.edge_del,edge_ins=self.edge_ins,node_ins=self.node_ins,node_del=self.node_del - ) - return edit_edit_dist.distance() diff --git a/gmatch4py/ged/approximate_ged.py b/gmatch4py/ged/approximate_ged.py deleted file mode 100644 index d77f522..0000000 --- a/gmatch4py/ged/approximate_ged.py +++ /dev/null @@ -1,20 +0,0 @@ -# coding = utf-8 - -import numpy as np - -from .algorithm.graph_edit_dist import GraphEditDistance - - -class ApproximateGraphEditDistance(): - __type__ = "dist" - - @staticmethod - def compare(listgs,c_del_node=1,c_del_edge=1,c_ins_node=1,c_ins_edge=1): - n= len(listgs) - comparison_matrix = np.zeros((n,n)) - for i in range(n): - for j in range(i,n): - comparison_matrix[i,j]= GraphEditDistance(listgs[i],listgs[j],False,node_del=c_del_node,node_ins=c_ins_node,edge_del=c_del_edge,edge_ins=c_ins_edge).distance() - comparison_matrix[j,i]= comparison_matrix[i,j] # Unethical ! Since AGED is not a symmetric similarity measure ! 
- - return comparison_matrix \ No newline at end of file diff --git a/gmatch4py/ged/bipartite_graph_matching_2.py b/gmatch4py/ged/bipartite_graph_matching_2.py deleted file mode 100644 index e63d7ff..0000000 --- a/gmatch4py/ged/bipartite_graph_matching_2.py +++ /dev/null @@ -1,147 +0,0 @@ -# coding = utf-8 -import numpy as np - - -class BP_2(): - """ - - """ - __type__="dist" - @staticmethod - def compare(listgs, c_del_node=1, c_del_edge=1, c_ins_node=1, c_ins_edge=1): - n = len(listgs) - comparator = BP_2(c_del_node, c_ins_node, c_del_edge, c_ins_edge) - comparison_matrix = np.zeros((n, n)) - for i in range(n): - for j in range(i, n): - comparison_matrix[i, j] = comparator.bp2(listgs[i], listgs[j]) - comparison_matrix[j, i] = comparison_matrix[i, j] - - - - - return comparison_matrix - - def __init__(self, node_del=1, node_ins=1, edge_del=1, edge_ins=1): - """Constructor for HED""" - self.node_del = node_del - self.node_ins = node_ins - self.edge_del = edge_del - self.edge_ins = edge_ins - - def bp2(self, g1, g2): - """ - Compute de Hausdorff Edit Distance - :param g1: first graph - :param g2: second graph - :return: - """ - return min(self.distance(self.psi(g1,g2)),self.distance(self.psi(g2,g1))) - - def distance(self,e): - return np.sum(e) - - def psi(self,g1,g2): - psi_=[] - nodes1 = g1.nodes() - nodes2 = g2.nodes() - for u in nodes1: - v=None - for w in nodes2: - if 2*self.fuv(g1,g2,u,w) < self.fuv(g1,g2,u,None) + self.fuv(g1,g2,None,w)\ - and self.fuv(g1,g2,u,w) < self.fuv(g1,g2,u,v): - v=w - psi_.append(self.fuv(g1,g2,u,v)) - if u: - nodes1= list(set(nodes1).difference(set([u]))) - if v: - nodes2= list(set(nodes2).difference(set([v]))) - for v in nodes2: - psi_.append(self.fuv(g1,g2,None,v)) - return psi_ - - - def fuv(self, g1, g2, n1, n2): - """ - Compute the Node Distance function - :param g1: first graph - :param g2: second graph - :param n1: node of the first graph - :param n2: node of the second graph - :return: - """ - if n2 == None: # Del - return self.node_del + ((self.edge_del / 2) * g1.degree(n1)) - if n1 == None: # Insert - return self.node_ins + ((self.edge_ins / 2) * g2.degree(n2)) - else: - if n1 == n2: - return 0. 
- return (self.node_del + self.node_ins + self.hed_edge(g1, g2, n1, n2)) / 2 - - def hed_edge(self, g1, g2, n1, n2): - """ - Compute HEDistance between edges of n1 and n2, respectively in g1 and g2 - :param g1: first graph - :param g2: second graph - :param n1: node of the first graph - :param n2: node of the second graph - :return: - """ - return self.sum_gpq(g1, n1, g2, n2) + self.sum_gpq(g1, n1, g2, n2) - - def get_edge_multigraph(self, g, node): - """ - Get list of edge around a node in a Multigraph - :param g: multigraph - :param node: node in the multigraph - :return: - """ - edges = [] - for edge in g.edges(data=True): - if node == edge[0] or node == edge[1]: - edges.append("{0}-{1}-{2}".format(edge[0],edge[1],edge[2]["color"])) - return edges - - def sum_gpq(self, g1, n1, g2, n2): - """ - Compute Nearest Neighbour Distance between edges around n1 in G1 and edges around n2 in G2 - :param g1: first graph - :param n1: node in the first graph - :param g2: second graph - :param n2: node in the second graph - :return: - """ - - #if isinstance(g1, nx.MultiDiGraph): - edges1 = self.get_edge_multigraph(g1, n1) - edges2 = self.get_edge_multigraph(g2, n2) - #else: - #print(1) - #edges1 = [str(n1 + "-" + ef) for ef in list(g1.edge[n1].keys())] - #edges2 = [str(n2 + "-" + ef) for ef in list(g2.edge[n2].keys())] - edges2.extend([None]) - min_sum = np.zeros(len(edges1)) - for i in range(len(edges1)): - min_i = np.zeros(len(edges2)) - for j in range(len(edges2)): - min_i[j] = self.gpq(edges1[i], edges2[j]) - min_sum[i] = np.min(min_i) - return np.sum(min_sum) - - def gpq(self, e1, e2): - """ - Compute the edge distance function - :param e1: edge1 - :param e2: edge2 - :return: - """ - if e2 == None: # Del - return self.edge_del - if e1 == None: # Insert - return self.edge_ins - else: - if e1 == e2: - return 0. 
- return (self.edge_del + self.edge_ins) / 2 - diff --git a/gmatch4py/ged/graph/edge_graph.py b/gmatch4py/ged/graph/edge_graph.py deleted file mode 100644 index 24b8bda..0000000 --- a/gmatch4py/ged/graph/edge_graph.py +++ /dev/null @@ -1,16 +0,0 @@ -# -*- coding: UTF-8 -*- - - -class EdgeGraph(): - - def __init__(self, init_node, nodes): - self.init_node=init_node - self.nodes_ = nodes - self.edge=nodes - def nodes(self): - return self.nodes_ - - def size(self): - return len(self.nodes) - def __len__(self): - return len(self.nodes_) diff --git a/gmatch4py/ged/greedy_edit_distance.py b/gmatch4py/ged/greedy_edit_distance.py deleted file mode 100644 index a4b148e..0000000 --- a/gmatch4py/ged/greedy_edit_distance.py +++ /dev/null @@ -1,44 +0,0 @@ -# coding = utf-8 -import numpy as np - -from .algorithm.graph_edit_dist import GraphEditDistance - - -class GreedyEditDistance(GraphEditDistance): - """ - Implementation of the Greedy Edit Distance presented in : - - Improved quadratic time approximation of graph edit distance by Hausdorff matching and greedy assignement - Andreas Fischer, Kaspar Riesen, Horst Bunke - 2016 - """ - __type__ = "dist" - @staticmethod - def compare(listgs, c_del_node=1, c_del_edge=1, c_ins_node=1, c_ins_edge=1): - n = len(listgs) - comparison_matrix = np.zeros((n, n)) - for i in range(n): - for j in range(i, n): - comparison_matrix[i, j] = GreedyEditDistance(listgs[i], listgs[j],False, node_del=c_del_node, - node_ins=c_ins_node, edge_del=c_del_edge, - edge_ins=c_ins_edge).distance() - comparison_matrix[j, i] = comparison_matrix[i, j] - - - return comparison_matrix - - def __init__(self,g1,g2,debug=False,**kwargs): - """Constructor for GreedyEditDistance""" - super().__init__(g1,g2,debug,**kwargs) - - - def edit_costs(self): - cost_matrix=self.create_cost_matrix() - cost_matrix_2=cost_matrix.copy() - psi=[] - for i in range(len(cost_matrix)): - phi_i=np.argmin((cost_matrix[i])) - cost_matrix=np.delete(cost_matrix,phi_i,1) - psi.append([i,phi_i+i]) #+i to compensate the previous column deletion - return [cost_matrix_2[psi[i][0]][psi[i][1]] for i in range(len(psi))] - diff --git a/gmatch4py/ged/hausdorff_edit_distance.py b/gmatch4py/ged/hausdorff_edit_distance.py deleted file mode 100644 index e3e24c9..0000000 --- a/gmatch4py/ged/hausdorff_edit_distance.py +++ /dev/null @@ -1,145 +0,0 @@ -# coding = utf-8 - -import numpy as np - - -class HED(): - """ - Implementation of Hausdorff Edit Distance described in - - Improved quadratic time approximation of graph edit distance by Hausdorff matching and greedy assignement - Andreas Fischer, Kaspar Riesen, Horst Bunke - 2016 - """ - __type__ = "dist" - @staticmethod - def compare(listgs, c_del_node=1, c_del_edge=1, c_ins_node=1, c_ins_edge=1): - n = len(listgs) - comparator = HED(c_del_node, c_ins_node, c_del_edge, c_ins_edge) - comparison_matrix = np.zeros((n, n)) - for i in range(n): - for j in range(i, n): - comparison_matrix[i, j] = comparator.hed(listgs[i], listgs[j]) - comparison_matrix[j, i] = comparison_matrix[i, j] - - return comparison_matrix - - - def __init__(self, node_del=1, node_ins=1, edge_del=1, edge_ins=1): - """Constructor for HED""" - self.node_del = node_del - self.node_ins = node_ins - self.edge_del = edge_del - self.edge_ins = edge_ins - - def hed(self, g1, g2): - """ - Compute de Hausdorff Edit Distance - :param g1: first graph - :param g2: second graph - :return: - """ - return self.sum_fuv(g1, g2) + self.sum_fuv(g2, g1) - - def sum_fuv(self, g1, g2): - """ - Compute Nearest Neighbour Distance between G1 and 
G2 - :param g1: First Graph - :param g2: Second Graph - :return: - """ - min_sum = np.zeros(len(g1)) - nodes1 = g1.nodes() - nodes2 = g2.nodes() - nodes2.extend([None]) - for i in range(len(nodes1)): - min_i = np.zeros(len(nodes2)) - for j in range(len(nodes2)): - min_i[j] = self.fuv(g1, g2, nodes1[i], nodes2[j]) - min_sum[i] = np.min(min_i) - return np.sum(min_sum) - - def fuv(self, g1, g2, n1, n2): - """ - Compute the Node Distance function - :param g1: first graph - :param g2: second graph - :param n1: node of the first graph - :param n2: node of the second graph - :return: - """ - if n2 == None: # Del - return self.node_del + ((self.edge_del / 2) * g1.degree(n1)) - if n1 == None: # Insert - return self.node_ins + ((self.edge_ins / 2) * g2.degree(n2)) - else: - if n1 == n2: - return 0. - return (self.node_del + self.node_ins + self.hed_edge(g1, g2, n1, n2)) / 2 - - def hed_edge(self, g1, g2, n1, n2): - """ - Compute HEDistance between edges of n1 and n2, respectively in g1 and g2 - :param g1: first graph - :param g2: second graph - :param n1: node of the first graph - :param n2: node of the second graph - :return: - """ - return self.sum_gpq(g1, n1, g2, n2) + self.sum_gpq(g1, n1, g2, n2) - - def get_edge_multigraph(self, g, node): - """ - Get list of edge around a node in a Multigraph - :param g: multigraph - :param node: node in the multigraph - :return: - """ - edges = [] - for edge in g.edges(data=True): - if node == edge[0] or node == edge[1]: - edges.append("{0}-{1}-{2}".format(edge[0],edge[1],edge[2]["color"])) - return edges - - def sum_gpq(self, g1, n1, g2, n2): - """ - Compute Nearest Neighbour Distance between edges around n1 in G1 and edges around n2 in G2 - :param g1: first graph - :param n1: node in the first graph - :param g2: second graph - :param n2: node in the second graph - :return: - """ - - #if isinstance(g1, nx.MultiDiGraph): - edges1 = self.get_edge_multigraph(g1, n1) - edges2 = self.get_edge_multigraph(g2, n2) - - #else: - #edges1 = [str(n1 + "-" + ef) for ef in list(g1.edge[n1].keys())] - #edges2 = [str(n2 + "-" + ef) for ef in list(g2.edge[n2].keys())] - - min_sum = np.zeros(len(edges1)) - edges2.extend([None]) - for i in range(len(edges1)): - min_i = np.zeros(len(edges2)) - for j in range(len(edges2)): - min_i[j] = self.gpq(edges1[i], edges2[j]) - min_sum[i] = np.min(min_i) - return np.sum(min_sum) - - def gpq(self, e1, e2): - """ - Compute the edge distance function - :param e1: edge1 - :param e2: edge2 - :return: - """ - if e2 == None: # Del - return self.edge_del - if e1 == None: # Insert - return self.edge_ins - else: - if e1 == e2: - return 0 - return (self.edge_del + self.edge_ins) / 2 diff --git a/gmatch4py/helpers/__init__.py b/gmatch4py/helpers/__init__.py deleted file mode 100644 index 950f635..0000000 --- a/gmatch4py/helpers/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# coding = utf-8 \ No newline at end of file diff --git a/gmatch4py/helpers/networkx_parser.py b/gmatch4py/helpers/networkx_parser.py deleted file mode 100644 index d67049a..0000000 --- a/gmatch4py/helpers/networkx_parser.py +++ /dev/null @@ -1,148 +0,0 @@ -# coding = utf-8 - -import networkx as nx -import graph_tool as gt - - - -def get_prop_type(value, key=None): - """ - Performs typing and value conversion for the graph_tool PropertyMap class. - If a key is provided, it also ensures the key is in a format that can be - used with the PropertyMap. 
Returns a tuple, (type name, value, key) - """ - # Deal with the value - if isinstance(value, bool): - tname = 'bool' - - elif isinstance(value, int): - tname = 'float' - value = float(value) - - elif isinstance(value, float): - tname = 'float' - - elif isinstance(value, str): - tname = 'string' - value = str(value) - - elif isinstance(value, dict): - tname = 'object' - - else: - tname = 'string' - value = str(value) - - return tname, value, key - - -def nx2gt(nxG): - """ - Converts a networkx graph to a graph-tool graph. - """ - # Phase 0: Create a directed or undirected graph-tool Graph - gtG = gt.Graph(directed=nxG.is_directed()) - - # Add the Graph properties as "internal properties" - for key, value in nxG.graph.items(): - # Convert the value and key into a type for graph-tool - tname, value, key = get_prop_type(value, key) - - prop = gtG.new_graph_property(tname) # Create the PropertyMap - gtG.graph_properties[key] = prop # Set the PropertyMap - gtG.graph_properties[key] = value # Set the actual value - - # Phase 1: Add the vertex and edge property maps - # Go through all nodes and edges and add seen properties - # Add the node properties first - nprops = set() # cache keys to only add properties once - for node, data in nxG.nodes_iter(data=True): - - # Go through all the properties if not seen and add them. - for key, val in data.items(): - if key in nprops: continue # Skip properties already added - - # Convert the value and key into a type for graph-tool - tname, _, key = get_prop_type(val, key) - - prop = gtG.new_vertex_property(tname) # Create the PropertyMap - gtG.vertex_properties[key] = prop # Set the PropertyMap - - # Add the key to the already seen properties - nprops.add(key) - - # Also add the node id: in NetworkX a node can be any hashable type, but - # in graph-tool node are defined as indices. So we capture any strings - # in a special PropertyMap called 'id' -- modify as needed! - gtG.vertex_properties['id'] = gtG.new_vertex_property('string') - - # Add the edge properties second - eprops = set() # cache keys to only add properties once - for src, dst, data in nxG.edges_iter(data=True): - - # Go through all the edge properties if not seen and add them. - for key, val in data.items(): - if key in eprops: continue # Skip properties already added - - # Convert the value and key into a type for graph-tool - tname, _, key = get_prop_type(val, key) - - prop = gtG.new_edge_property(tname) # Create the PropertyMap - gtG.edge_properties[key] = prop # Set the PropertyMap - - # Add the key to the already seen properties - eprops.add(key) - - # Phase 2: Actually add all the nodes and vertices with their properties - # Add the nodes - vertices = {} # vertex mapping for tracking edges later - for node, data in nxG.nodes_iter(data=True): - - # Create the vertex and annotate for our edges later - v = gtG.add_vertex() - vertices[node] = v - - # Set the vertex properties, not forgetting the id property - data['id'] = str(node) - for key, value in data.items(): - gtG.vp[key][v] = value # vp is short for vertex_properties - - # Add the edges - for src, dst, data in nxG.edges_iter(data=True): - - # Look up the vertex structs from our vertices mapping and add edge. - e = gtG.add_edge(vertices[src], vertices[dst]) - - # Add the edge properties - for key, value in data.items(): - gtG.ep[key][e] = value # ep is short for edge_properties - - # Done, finally! 
- return gtG - - -if __name__ == '__main__': - - # Create the networkx graph - nxG = nx.Graph(name="Undirected Graph") - nxG.add_node("v1", name="alpha", color="red") - nxG.add_node("v2", name="bravo", color="blue") - nxG.add_node("v3", name="charlie", color="blue") - nxG.add_node("v4", name="hub", color="purple") - nxG.add_node("v5", name="delta", color="red") - nxG.add_node("v6", name="echo", color="red") - - nxG.add_edge("v1", "v2", weight=0.5, label="follows") - nxG.add_edge("v1", "v3", weight=0.25, label="follows") - nxG.add_edge("v2", "v4", weight=0.05, label="follows") - nxG.add_edge("v3", "v4", weight=0.35, label="follows") - nxG.add_edge("v5", "v4", weight=0.65, label="follows") - nxG.add_edge("v6", "v4", weight=0.53, label="follows") - nxG.add_edge("v5", "v6", weight=0.21, label="follows") - - for item in nxG.edges_iter(data=True): - print(item) - - # Convert to graph-tool graph - gtG = nx2gt(nxG) - gtG.list_properties() \ No newline at end of file diff --git a/gmatch4py/kernels/weisfeiler_lehman.py b/gmatch4py/kernels/weisfeiler_lehman.py deleted file mode 100644 index e7139c7..0000000 --- a/gmatch4py/kernels/weisfeiler_lehman.py +++ /dev/null @@ -1,136 +0,0 @@ -# coding = utf-8 - -"""Weisfeiler_Lehman graph kernel. - -Python implementation based on: "Weisfeiler-Lehman Graph Kernels", by: -Nino Shervashidze, Pascal Schweitzer, Erik J. van Leeuwen, Kurt -Mehlhorn, Karsten M. Borgwardt, JMLR, 2012. -http://jmlr.csail.mit.edu/papers/v12/shervashidze11a.html - -Author : Sandro Vega-Pons, Emanuele Olivetti -Source : https://github.com/emanuele/jstsp2015/blob/master/gk_weisfeiler_lehman.py -Modified by : Jacques Fizen -""" - -import copy - -import networkx as nx -import numpy as np - - -class WeisfeleirLehmanKernel(object): - __type__ = "sim" - @staticmethod - def compare(graph_list,h=2): - """Compute the all-pairs kernel values for a list of graphs. - This function can be used to directly compute the kernel - matrix for a list of graphs. The direct computation of the - kernel matrix is faster than the computation of all individual - pairwise kernel values. - Parameters - ---------- - graph_list: list - A list of graphs (list of networkx graphs) - h : interger - Number of iterations. - node_label : boolean - Whether to use original node labels. True for using node labels - saved in the attribute 'node_label'. False for using the node - degree of each node as node attribute. - Return - ------ - K: numpy.array, shape = (len(graph_list), len(graph_list)) - The similarity matrix of all graphs in graph_list. - """ - - n = len(graph_list) - k = [0] * (h + 1) - n_nodes = 0 - n_max = 0 - - # Compute adjacency lists and n_nodes, the total number of - # nodes in the dataset. - for i in range(n): - n_nodes += graph_list[i].number_of_nodes() - - # Computing the maximum number of nodes in the graphs. It - # will be used in the computation of vectorial - # representation. 
- if (n_max < graph_list[i].number_of_nodes()): - n_max = graph_list[i].number_of_nodes() - - phi = np.zeros((n_nodes, n), dtype=np.uint64) - - # INITIALIZATION: initialize the nodes labels for each graph - # with their labels or with degrees (for unlabeled graphs) - - labels = [0] * n - label_lookup = {} - label_counter = 0 - - # label_lookup is an associative array, which will contain the - # mapping from multiset labels (strings) to short labels - # (integers) - for i in range(n): - nodes = graph_list[i].nodes() - # It is assumed that the graph has an attribute - # 'node_label' - labels[i] = np.zeros(len(nodes), dtype=np.int32) - - for j in range(len(nodes)): - if not (nodes[j] in label_lookup): - label_lookup[nodes[j]] = str(label_counter) - labels[i][j] = label_counter - label_counter += 1 - else: - labels[i][j] = label_lookup[nodes[j]] - # labels are associated to a natural number - # starting with 0. - - phi[labels[i][j], i] += 1 - - graph_list[i]=nx.relabel_nodes(graph_list[i],label_lookup) - k = np.dot(phi.transpose(), phi).astype(np.float64) - - # MAIN LOOP - it = 0 - new_labels = copy.deepcopy(labels) # Can't work without it !!! - - while it < h: - # create an empty lookup table - label_lookup = {} - label_counter = 0 - - phi = np.zeros((n_nodes, n)) - for i in range(n): - nodes = graph_list[i].nodes() - for v in range(len(nodes)): - # form a multiset label of the node v of the i'th graph - # and convert it to a string - - long_label = [] - long_label.extend(nx.neighbors(graph_list[i],nodes[v])) - - long_label_string = "".join(long_label) - # if the multiset label has not yet occurred, add it to the - # lookup table and assign a number to it - if not (long_label_string in label_lookup): - label_lookup[long_label_string] = str(label_counter) - new_labels[i][v] = label_counter - label_counter += 1 - else: - new_labels[i][v] = label_lookup[long_label_string] - # fill the column for i'th graph in phi - aux = np.bincount(new_labels[i]) - phi[new_labels[i], i] += aux[new_labels[i]] - - k += np.dot(phi.transpose(), phi) - it = it + 1 - - # Compute the normalized version of the kernel - k_norm = np.zeros(k.shape) - for i in range(k.shape[0]): - for j in range(k.shape[1]): - k_norm[i, j] = k[i, j] / np.sqrt(k[i, i] * k[j, j]) - - return k_norm \ No newline at end of file diff --git a/gmatch4py_cython/gmatch4py/bag_of_cliques.pyx b/gmatch4py_cython/gmatch4py/bag_of_cliques.pyx index f297507..14c22da 100644 --- a/gmatch4py_cython/gmatch4py/bag_of_cliques.pyx +++ b/gmatch4py_cython/gmatch4py/bag_of_cliques.pyx @@ -6,18 +6,68 @@ from typing import Sequence import networkx as nx import numpy as np cimport numpy as np +import sys + +from networkit import graph +from networkit.clique import MaximalCliques + +def nx2nk(nxG, weightAttr=None): + """ + Convert a networkx.Graph to a NetworKit.Graph + :param weightAttr: the edge attribute which should be treated as the edge weight. 
+    """
+
+    # map networkx node ids to consecutive numerical node ids
+    idmap = dict((id, u) for (id, u) in zip(nxG.nodes(), range(nxG.number_of_nodes())))
+    z = max(idmap.values()) + 1
+    # print("z = {0}".format(z))
+
+    if weightAttr is not None:
+        nkG = graph.Graph(z, weighted=True, directed=nxG.is_directed())
+        for (u_, v_) in nxG.edges():
+            u, v = idmap[u_], idmap[v_]
+            w = nxG[u_][v_][weightAttr]
+            nkG.addEdge(u, v, w)
+    else:
+        nkG = graph.Graph(z, directed=nxG.is_directed())
+        for (u_, v_) in nxG.edges():
+            u, v = idmap[u_], idmap[v_]
+            # print(u_, v_, u, v)
+            assert (u < z)
+            assert (v < z)
+            nkG.addEdge(u, v)
+
+    assert (nkG.numberOfNodes() == nxG.number_of_nodes())
+    assert (nkG.numberOfEdges() == nxG.number_of_edges())
+    return nkG.removeSelfLoops(),idmap
+
+def getClique(nx_graph):
+    final_cliques=[]
+    if len(nx_graph) ==0:
+        return final_cliques
+    netkit_graph,idmap=nx2nk(nx_graph)
+    idmap={v:k for k,v in idmap.items()}
+    cliques=MaximalCliques(netkit_graph).run().getCliques()
+    for cl in cliques:
+        final_cliques.append(list(map(lambda x:idmap[x],cl)))
+    return final_cliques

 class BagOfCliques():

     @staticmethod
-    def compare(graphs):
+    def compare(graphs,selected):
         b=BagOfCliques()
-        bog=b.getBagOfCliques(graphs)
+        bog=b.getBagOfCliques(graphs).astype(np.float32)
         #Compute cosine similarity
-        cdef np.ndarray scores=np.dot(bog,bog.T)
+        cdef int n=bog.shape[0]
+        cdef double[:,:] scores = np.zeros((n,n))
+        cdef int i
         for i in range(len(scores)):
-            for j in range(len(scores)):
-                scores[i,j]/=(np.sqrt(np.sum(bog[i]**2))*np.sqrt(np.sum(bog[j]**2))) # Can be computed in one line
+            if not i in selected:
+                continue
+            for j in range(i,len(scores)):
+                scores[i,j]=(np.dot(bog[i],bog[j]))/(np.sqrt(np.sum(bog[i]**2))*np.sqrt(np.sum(bog[j]**2))) # Can be computed in one line
+                scores[j,i]=scores[i,j]
         return scores

     def getUniqueCliques(self,graphs):
@@ -25,15 +75,26 @@ class BagOfCliques():
         Return unique cliques from a population of graphs
         :return:
         """
-        tree = {}
+        t = {}
         c_ = 0
         cdef list clique_vocab = []
         cdef list cli_temp
         cdef list cliques
+        cdef int len_graphs=len(graphs)
+        cdef int km= -1
         for g in graphs:
-            cliques = list(nx.find_cliques(nx.Graph(g)))
+            km+=1
+            if not g:
+                continue
+            sys.stdout.write("\r{0}/{1} -- {2}".format(km,len_graphs,len(g)))
+            try:
+                cliques = list(getClique(nx.Graph(g)))
+            except:
+                #no clique found
+                print(nx.Graph(g).edges())
+                cliques =[]
             for clique in cliques:
-                t = tree
+
                 cli_temp = copy.deepcopy(clique)
                 new_clique = False
                 for i in range(len(clique)):
@@ -57,33 +118,51 @@ class BagOfCliques():

         return clique_vocab

-    def ifHaveMinor(self,G, list H):
+    def clique2str(self,cliques):
+        return "".join(sorted(cliques))
+
+    def transform_clique_vocab(self,clique_vocab):
+        cdef dict new_vocab={}
+        cdef int len_voc=len(clique_vocab)
+        for c in range(len_voc):
+            print(c)
+            new_vocab[self.clique2str(clique_vocab[c])]=c
+        return new_vocab
+
+
+    def ifHaveMinor(self,clique, dict mapping):
         """
         If a clique (minor) H belong to a graph G
         :param H:
         :return:
         """
-        if nx.Graph(G).subgraph(H).nodes() == H:
+        if self.clique2str(clique) in mapping:
             return 1
         return 0

-    def getBagOfCliques(self,graphs ):
+    def getBagOfCliques(self,graphs ):
         """
         :param clique_vocab:
         :return:
         """
         cdef list clique_vocab=self.getUniqueCliques(graphs)
-
+        print("DONE")
+        cdef dict map_str_cliques=self.transform_clique_vocab(clique_vocab)
+        print("DONE2")
         cdef int l_v=len(clique_vocab)
         cdef np.ndarray boc = np.zeros((len(graphs), l_v))
         cdef np.ndarray vector
-
+        cdef list cliques
         for g in range(len(graphs)):
+            sys.stdout.write("\r{0}/{1}".format(g,5552))
             gr = graphs[g]
             vector = np.zeros(l_v)
-            for m in range(l_v):
-                vector[m] = self.ifHaveMinor(gr, clique_vocab[m])
+            cliques = list(getClique(nx.Graph(gr)))
+            for clique in cliques:
+                hash=self.clique2str(clique)
+                if hash in map_str_cliques:
+                    vector[map_str_cliques[hash]] = 1
             boc[g] = vector
         return boc
\ No newline at end of file
diff --git a/gmatch4py_cython/gmatch4py/ged/approximate_ged.pyx b/gmatch4py_cython/gmatch4py/ged/approximate_ged.pyx
index 5a50e9e..27ea437 100644
--- a/gmatch4py_cython/gmatch4py/ged/approximate_ged.pyx
+++ b/gmatch4py_cython/gmatch4py/ged/approximate_ged.pyx
@@ -3,7 +3,7 @@
 import numpy as np

 from .algorithm.graph_edit_dist import GraphEditDistance
-
+from cython.parallel import prange

 class ApproximateGraphEditDistance():
     __type__ = "dist"
@@ -11,20 +11,23 @@ class ApproximateGraphEditDistance():
     @staticmethod
     def compare(listgs,selected,c_del_node=1,c_del_edge=1,c_ins_node=1,c_ins_edge=1):
         cdef int n= len(listgs)
-        comparison_matrix = np.zeros((n,n))
-        for i in range(n):
+        cdef double[:,:] comparison_matrix = np.zeros((n,n))
+        cdef int i,j
+        for i in prange(n,nogil=True):
             for j in range(i,n):
-                f=True
-                if not listgs[i] or not listgs[j]:
-                    f=False
-                elif len(listgs[i])== 0 or len(listgs[j]) == 0:
-                    f=False
-                if selected:
-                    if not i in selected:
+                with gil:
+                    f=True
+                    if not listgs[i] or not listgs[j]:
+                        f=False
+                    elif len(listgs[i])== 0 or len(listgs[j]) == 0:
                         f=False
-                if f:
-                    comparison_matrix[i, j] = GraphEditDistance(listgs[i],listgs[j],False,node_del=c_del_node,node_ins=c_ins_node,edge_del=c_del_edge,edge_ins=c_ins_edge).distance()
-                else:
-                    comparison_matrix[i, j] = 0.
-                comparison_matrix[j, i] = comparison_matrix[i, j]
+                    if selected:
+                        if not i in selected:
+                            f=False
+
+                    if f:
+                        comparison_matrix[i][j] = GraphEditDistance(listgs[i],listgs[j],False,node_del=c_del_node,node_ins=c_ins_node,edge_del=c_del_edge,edge_ins=c_ins_edge).distance()
+                    else:
+                        comparison_matrix[i][j] = np.inf
+                    comparison_matrix[j][i] = comparison_matrix[i][j]
         return comparison_matrix
\ No newline at end of file
diff --git a/gmatch4py_cython/gmatch4py/ged/bipartite_graph_matching_2.pyx b/gmatch4py_cython/gmatch4py/ged/bipartite_graph_matching_2.pyx
index 772f73a..d6d68e6 100644
--- a/gmatch4py_cython/gmatch4py/ged/bipartite_graph_matching_2.pyx
+++ b/gmatch4py_cython/gmatch4py/ged/bipartite_graph_matching_2.pyx
@@ -31,7 +31,7 @@ cdef class BP_2():
                 if f:
                     comparison_matrix[i, j] = comparator.bp2(listgs[i], listgs[j])
                 else:
-                    comparison_matrix[i, j] = 0.
+                    comparison_matrix[i, j] = np.inf
                 comparison_matrix[j, i] = comparison_matrix[i, j]

         return comparison_matrix
diff --git a/gmatch4py_cython/gmatch4py/ged/greedy_edit_distance.pyx b/gmatch4py_cython/gmatch4py/ged/greedy_edit_distance.pyx
index b297bea..7752030 100644
--- a/gmatch4py_cython/gmatch4py/ged/greedy_edit_distance.pyx
+++ b/gmatch4py_cython/gmatch4py/ged/greedy_edit_distance.pyx
@@ -32,7 +32,7 @@ class GreedyEditDistance(GraphEditDistance):
                                                              node_ins=c_ins_node, edge_del=c_del_edge,
                                                              edge_ins=c_ins_edge).distance()
                 else:
-                    comparison_matrix[i, j] = 0.
+ comparison_matrix[i, j] = np.inf comparison_matrix[j, i] = comparison_matrix[i, j] diff --git a/gmatch4py_cython/gmatch4py/ged/hausdorff_edit_distance.pyx b/gmatch4py_cython/gmatch4py/ged/hausdorff_edit_distance.pyx index a3abc77..06dc664 100644 --- a/gmatch4py_cython/gmatch4py/ged/hausdorff_edit_distance.pyx +++ b/gmatch4py_cython/gmatch4py/ged/hausdorff_edit_distance.pyx @@ -25,12 +25,18 @@ cdef class HED: cdef np.ndarray comparison_matrix = np.zeros((n, n)) for i in range(n): for j in range(i, n): + f=True if not listgs[i] or not listgs[j]: - continue + f=False + elif len(listgs[i])== 0 or len(listgs[j]) == 0: + f=False if selected: if not i in selected: - continue - comparison_matrix[i, j] = comparator.hed(listgs[i], listgs[j]) + f=False + if f: + comparison_matrix[i, j] = comparator.hed(listgs[i], listgs[j]) + else: + comparison_matrix[i, j] = np.inf comparison_matrix[j, i] = comparison_matrix[i, j] return comparison_matrix diff --git a/gmatch4py_cython/gmatch4py/jaccard.pyx b/gmatch4py_cython/gmatch4py/jaccard.pyx index 2699457..894afc6 100644 --- a/gmatch4py_cython/gmatch4py/jaccard.pyx +++ b/gmatch4py_cython/gmatch4py/jaccard.pyx @@ -31,9 +31,14 @@ class Jaccard(): f=False if f: inter_ver,inter_ed = Jaccard.intersect_graph(g1,g2) - comparison_matrix[i,j]=(len(inter_ver)/len(Jaccard.union_nodes(g1,g2)))*(len(inter_ed)/len(Jaccard.union_edges(g1,g2))) + un_ver,un_edg=Jaccard.union_nodes(g1,g2),Jaccard.union_edges(g1,g2) + if len(un_ver) == 0 or len(un_edg) == 0: + comparison_matrix[i, j] = 0. + else: + comparison_matrix[i,j]=(len(inter_ver)/len(un_ver))*(len(inter_ed)/len(un_edg)) else: comparison_matrix[i, j] = 0. + comparison_matrix[j, i] = comparison_matrix[i, j] return comparison_matrix diff --git a/gmatch4py_cython/gmatch4py/kernels/weisfeiler_lehman.pyx b/gmatch4py_cython/gmatch4py/kernels/weisfeiler_lehman.pyx index 809fb27..2731f0f 100644 --- a/gmatch4py_cython/gmatch4py/kernels/weisfeiler_lehman.pyx +++ b/gmatch4py_cython/gmatch4py/kernels/weisfeiler_lehman.pyx @@ -22,7 +22,7 @@ cimport numpy as np class WeisfeleirLehmanKernel(object): __type__ = "sim" @staticmethod - def compare(graph_list,h=2): + def compare(graph_list,selected,h=2): """Compute the all-pairs kernel values for a list of graphs. This function can be used to directly compute the kernel matrix for a list of graphs. 
The direct computation of the diff --git a/gmatch4py_cython/gmatch4py/vertex_edge_overlap.pyx b/gmatch4py_cython/gmatch4py/vertex_edge_overlap.pyx index f0856b1..7888376 100644 --- a/gmatch4py_cython/gmatch4py/vertex_edge_overlap.pyx +++ b/gmatch4py_cython/gmatch4py/vertex_edge_overlap.pyx @@ -19,7 +19,7 @@ class VertexEdgeOverlap(): """ @staticmethod - def compare(list listgs): + def compare(list listgs,selected): n = len(listgs) cdef np.ndarray comparison_matrix = np.zeros((n, n)) cdef list inter_ver @@ -27,15 +27,25 @@ class VertexEdgeOverlap(): cdef int denom for i in range(n): for j in range(i,n): - g1 = listgs[i] - g2 = listgs[j] - inter_ver,inter_ed = VertexEdgeOverlap.intersect_graph(g1,g2) - denom=len(g1)+len(g2)+len(g1.edges(data=True))+len(g2.edges(data=True)) - if denom == 0: - continue - comparison_matrix[i,j]=2*(len(inter_ver)+len(inter_ed))/denom # Data = True --> For nx.MultiDiGraph - comparison_matrix[j,i]=comparison_matrix[i,j] - + f=True + if not listgs[i] or not listgs[j]: + f=False + elif len(listgs[i])== 0 or len(listgs[j]) == 0: + f=False + if selected: + if not i in selected: + f=False + if f: + g1 = listgs[i] + g2 = listgs[j] + inter_ver,inter_ed = VertexEdgeOverlap.intersect_graph(g1,g2) + denom=len(g1)+len(g2)+len(g1.edges(data=True))+len(g2.edges(data=True)) + if denom == 0: + continue + comparison_matrix[i,j]=2*(len(inter_ver)+len(inter_ed))/denom # Data = True --> For nx.MultiDiGraph + else: + comparison_matrix[i, j] = 0. + comparison_matrix[j, i] = comparison_matrix[i, j] return comparison_matrix diff --git a/gui_graph_viewer/db.py b/gui_graph_viewer/db.py index 9c159cd..eb13f76 100644 --- a/gui_graph_viewer/db.py +++ b/gui_graph_viewer/db.py @@ -73,22 +73,22 @@ class Eval(Base): __tablename__="evals" __table_args__ = {'sqlite_autoincrement': True} id = Column(Integer, primary_key=True) - id_g1 = Column(Integer,ForeignKey('graphs.id')) - id_g2 = Column(Integer, ForeignKey('graphs.id')) - mesure = Column(Integer, ForeignKey('mesures.id')) - type = Column(String(20)) + id_g1 = Column(Integer)#,ForeignKey('graphs.id')) + id_g2 = Column(Integer)#, ForeignKey('graphs.id')) + #mesure = Column(Integer, ForeignKey('mesures.id')) + #type = Column(String(20)) id_user = Column(Integer, ForeignKey('users.id')) c1_val= Column(Boolean) c2_val = Column(Boolean) c3_val = Column(Boolean) c4_val = Column(Boolean) - def __init__(self, id_g1, id_g2, mesure,type_,id_user,c1,c2,c3,c4): + def __init__(self, id_g1, id_g2,id_user,c1,c2,c3,c4): self.id_g1 = id_g1 self.id_g2 = id_g2 - self.mesure = mesure + #self.mesure = mesure self.id_user = id_user - self.type = type_ + #self.type = type_ self.c1_val = c1 self.c2_val = c2 self.c3_val = c3 @@ -97,8 +97,8 @@ class Eval(Base): return { "id_g1":self.id_g1, "id_g2": self.id_g2, - "mesure": self.mesure, - "type": self.type, + #"mesure": self.mesure, + #"type": self.type, "id_user": self.id_user, "c1": self.c1_val, "c2": self.c2_val, @@ -148,6 +148,6 @@ if __name__ == '__main__': types=["normal","extension_1","extension_2","extension_3","gen_all_1","gen_all_2", "gen_capital","gen_country","gen_region","gen_town"] add_users(session,user_input) - add_unique(session,mesures,Mesure) - add_graphs_data(session,range(532),"graph_exp_18_fev") + #add_unique(session,mesures,Mesure) + #add_graphs_data(session,range(532),"graph_exp_18_fev") diff --git a/gui_graph_viewer/server.py b/gui_graph_viewer/server.py index 15f7076..66acd7f 100644 --- a/gui_graph_viewer/server.py +++ b/gui_graph_viewer/server.py @@ -49,6 +49,22 @@ for fn in dataFiles: 
     data_[fn.replace(all_,"").rstrip(".json")]=fn
 print("File Available",data_.keys())
+# couples to annotate
+new_data=json.load(open(os.path.join(dir_,"couple_to_annotate.json")))
+current_index=0
+graph_data=json.load(open(os.path.join(dir_,"data_graphs.json")))
+
+
+def init():
+    # Resume the annotation session at the last couple saved in the database
+    global current_index
+    last_row = sql_session.query(Eval).order_by(Eval.id.desc()).first()
+    if last_row is None:
+        return  # no annotation saved yet: start from the first couple
+    last_id_g1 = last_row.id_g1
+    last_id_g2 = last_row.id_g2
+    print(last_id_g1, last_id_g2)
+    for i in range(len(new_data)):
+        if new_data[i][1] == last_id_g1 and new_data[i][0] == last_id_g2:
+            current_index = i
+            break
 
 
 @app.route("/")
@@ -73,8 +89,39 @@ def index(gmmeasure=None):
             max_[int(k)]=0.0
         if math.isnan(min_[int(k)]):
             min_[int(k)]=0.0
+    init()
     return render_template("index.html",data=json.dumps(data),measureAvailable=list(data_.keys()),measure=gmmeasure,max=max_,min=min_,dirs=available_dir,type_=current_type)
 
+
+@app.route("/annot_2")
+@login_required
+def index2():
+    current_couple=new_data[current_index]
+    progress=(current_index/len(new_data))*100
+    return render_template(
+        "indexv2.html",
+        current_couple=current_couple,
+        graphs_data_1=graph_data[str(current_couple[0])],
+        graphs_data_2=graph_data[str(current_couple[1])],
+        progress=progress
+    )
+
+
+@app.route("/couple_moins")
+@login_required
+def couple_moins(gmmeasure=None):
+    global current_index
+    if current_index-1 >= 0:
+        current_index-=1
+    return redirect("/annot_2")
+
+@app.route("/couple_plus")
+@login_required
+def couple_plus(gmmeasure=None):
+    global current_index
+    # stop at the last couple, otherwise new_data[current_index] goes out of range
+    if current_index < len(new_data)-1:
+        current_index+=1
+    return redirect("/annot_2")
 
 @app.route("/about")
 def about():
     global available_dir
@@ -112,6 +159,52 @@ def get_assoc(g1id,g2id):
             return jsonify({})
     except:
         return jsonify({})
+
+@app.route("/get_info_2/<g1id>/<g2id>")
+def getinfo_2(g1id,g2id):
+    try:
+        info=sql_session.query(Eval).filter_by(id_g1=g1id,id_g2=g2id)
+        print(info)
+        if info.count() >0:
+            return jsonify(info.first().__repr__())
+        else:
+            return jsonify({})
+    except:
+        return jsonify({})
+
+@app.route("/get_assoc_2/<g1id>/<g2id>")
+def get_assoc_2(g1id,g2id):
+    info=sql_session.query(Eval).filter_by(id_g1=g1id,id_g2=g2id)
+    print(info)
+    if info.count() >0:
+        return jsonify(info.first().__repr__())
+    else:
+        return jsonify({})
+
+
+@app.route("/save_eval_2/<g1id>/<g2id>/<int:c1>/<int:c2>/<int:c3>/<int:c4>")
+def save_eval_2(g1id,g2id,c1,c2,c3,c4):
+    c1,c2,c3,c4=bool(c1),bool(c2),bool(c3),bool(c4)
+    eval_query = sql_session.query(Eval).filter_by(
+        id_g1=g1id,
+        id_g2=g2id,
+        id_user=current_user.id
+    )
+    if eval_query.count()< 1:
+        sql_session.add(Eval(g1id,g2id,current_user.id,c1,c2,c3,c4))
+        print("ADD",g1id, g2id, c1, c2, c3, c4)
+    else:
+        print("UPD",g1id, g2id, c1, c2, c3, c4)
+        eval_=eval_query.first()
+        eval_.c1_val = c1
+        eval_.c2_val = c2
+        eval_.c3_val = c3
+        eval_.c4_val = c4
+    sql_session.commit()
+
+    return "Oh Yeah"
+
 def getMeasureid(mesure):
     mesure_query = sql_session.query(Mesure).filter_by(
         label=mesure
diff --git a/gui_graph_viewer/static/css/dashboard.css b/gui_graph_viewer/static/css/dashboard.css
index 8ffff76..c2f0b96 100644
--- a/gui_graph_viewer/static/css/dashboard.css
+++ b/gui_graph_viewer/static/css/dashboard.css
@@ -129,9 +129,13 @@ color:white !important;
 behavior: url(PIE.htc); /* remove if you don't care about IE8 */
 padding-left: 0.3em;
-padding-right: 0.3em;
+    padding-right: 0.3em;
 background: #fff;
 border: 2px solid #666;
 color: #666;
 text-align: center;
+}
+
+.map_preview{
+    height: 400px;
 }
\ No newline at end of file
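The /save_eval_2 route above is a query-then-insert-or-update ("upsert") keyed on (id_g1, id_g2, id_user). For context — this is not part of the patch — a minimal, self-contained sketch of the same pattern, with a trimmed-down stand-in for the Eval model (one criterion column instead of four):

    # Illustrative sketch of the upsert pattern used by /save_eval_2.
    from sqlalchemy import create_engine, Column, Integer, Boolean
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy.orm import sessionmaker

    Base = declarative_base()

    class EvalRow(Base):  # hypothetical, simplified stand-in for db.Eval
        __tablename__ = "evals"
        id = Column(Integer, primary_key=True)
        id_g1 = Column(Integer)
        id_g2 = Column(Integer)
        id_user = Column(Integer)
        c1_val = Column(Boolean)

    def save_eval(session, g1, g2, user, c1):
        # insert if the (g1, g2, user) annotation is absent, update it otherwise
        row = session.query(EvalRow).filter_by(id_g1=g1, id_g2=g2, id_user=user).first()
        if row is None:
            session.add(EvalRow(id_g1=g1, id_g2=g2, id_user=user, c1_val=c1))
        else:
            row.c1_val = c1
        session.commit()

    engine = create_engine("sqlite:///:memory:")
    Base.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()
    save_eval(session, 1, 2, 0, True)   # first call inserts
    save_eval(session, 1, 2, 0, False)  # second call updates in place

Because the lookup and the write are two separate statements, concurrent annotators could still race and create duplicates; a unique constraint on (id_g1, id_g2, id_user) would make the pattern robust.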
diff --git a/gui_graph_viewer/static/js/helpers.js b/gui_graph_viewer/static/js/helpers.js
index 8c70a19..676b4a3 100644
--- a/gui_graph_viewer/static/js/helpers.js
+++ b/gui_graph_viewer/static/js/helpers.js
@@ -30,7 +30,15 @@ function getColorEdge(col){
         return "#87D37C"
     }
 }
-
+// Extract [lat, lon, label] triples for the spatial entities of a graph
+function getGeoData(data_) {
+    final_ = []
+    var data2_ = data_["sp_entities"]
+    //console.log(data_);
+    for (var i in data2_) {
+        final_.push([data2_[i]["coord"][0], data2_[i]["coord"][1], data2_[i]["label"]]);
+    }
+    return final_;
+}
 
 function generate_map(id_tiles, locations, edges) {
 /*
diff --git a/gui_graph_viewer/templates/indexv2.html b/gui_graph_viewer/templates/indexv2.html
new file mode 100644
index 0000000..7c64246
--- /dev/null
+++ b/gui_graph_viewer/templates/indexv2.html
@@ -0,0 +1,189 @@
+{% extends "skeleton.html" %} {% block body %}
+    <div class="container" style="margin-top: 1em">
+        <div class="row">
+            <div class="col-lg-12">
+                <h2 class="text-center">Graph Matching Validation</h2>
+                <div class="progress" style="margin-bottom: 0.5em">
+                    <div class="progress-bar" role="progressbar" style="width: {{progress}}%" aria-valuenow="{{progress}}" aria-valuemin="0" aria-valuemax="100"></div>
+                </div>
+                <div class="row">
+                    <div class="col-lg-6" >
+                        <h2 class="text-center">Graph N°<span id="label1"></span></h2>
+                        <div id="map1" class="map_preview"></div>
+                    </div>
+                    <div class="col-lg-6" >
+                        <h2 class="text-center">Graph N°<span id="label2"></span></h2>
+                        <div id="map2" class="map_preview"></div>
+                    </div>
+
+                    <div class="col-lg-12 offset-lg-2" style="margin-top: 1em;">
+                        <div class="row">
+                            <div class="col-lg-6">
+                                <h3 class=""> G1 → G2</h3>
+                                <div id="annot_g1_g2">
+
+                                </div>
+                            </div>
+                            <div class="col-lg-6">
+                                <h3> G1 ← G2</h3>
+                                <div id="annot_g2_g1">
+
+                                </div>
+                            </div>
+                        </div>
+                        <div>
+
+                        </div>
+                        <div class="col-lg-8 text-center">
+                            <a href="#" class="btn btn-warning" id="check_0">Different</a>
+                            <a href="#" class="btn btn-warning" id="check_1">Check the 1s</a>
+                            <a href="#" class="btn btn-warning" id="check_2">Check the 2s</a>
+                            <a href="#" class="btn btn-warning" id="check_3">Check the 3s</a>
+                            <a href="#" class="btn btn-warning" id="check_4">Check the 4s</a>
+                            <br>
+                            <br>
+                            <a href="/couple_moins" id="prev_btn" class="btn btn-success">Previous Annotation</a>
+                            <a href="/couple_plus" id="next_btn" class="btn btn-success">Next Annotation</a>
+                        </div>
+
+                    </div>
+
+                </div>
+            </div>
+        </div>
+    </div>
+
+{% endblock %} {% block script %}
+<script>
+
+    var couple={{current_couple | safe}};
+    var g1={{graphs_data_1 | safe}};
+    var g2={{graphs_data_2 | safe}};
+    var point_1=getGeoData(g1);
+    var point_2=getGeoData(g2);
+
+    generate_map("map1",point_1,g1["edges"]);
+    generate_map("map2",point_2,g2["edges"]);
+    $("#label1").text(couple[0]);
+    $("#label2").text(couple[1]);
+
+    /* Retrieve the results of previous annotations */
+    var result={};
+    var bool_val=[false,false,false,false];
+    var bool_val_2=[false,false,false,false];
+
+    function get_bool_val(couple) {
+        var result = {};
+        $.ajax({
+            url: "/get_assoc_2/"+couple[0]+"/"+couple[1]+"",
+            async: false,
+            dataType: 'json',
+            success: function (json) {
+                result = json;
+            }
+        });
+        return result
+    }
+    // In case the correspondence has already been annotated
+    var result_1=get_bool_val(couple);
+    var result_2=get_bool_val([couple[1],couple[0]]);
+
+    if (!isEmpty(result_1)){
+        bool_val=[result_1.c1,result_1.c2,result_1.c3,result_1.c4];
+    }
+    if (!isEmpty(result_2)){
+        bool_val_2=[result_2.c1,result_2.c2,result_2.c3,result_2.c4];
+    }
+    var id_=couple[0]+"_"+couple[1]
+    var id_inv=couple[1]+"_"+couple[0]
+    // End of the retrieval of previous results
+    $("#annot_g1_g2").html(
+        getValidateForm(
+            bool_val,["ESS","ESC","KER","SPR"],
+            [id_,id_,id_,id_]
+        )
+    );
+    $("#annot_g2_g1").html(
+        getValidateForm(
+            bool_val_2,["ESS","ESC","KER","SPR"],
+            [id_inv,id_inv,id_inv,id_inv]
+        )
+    );
+
+    $(".criteria-checkbox").change(function () {
+        c_val=[]
+        graph_id=this.getAttribute("name").split("_");
+        console.debug(graph_id);
+        var g1=parseInt(graph_id[0]);
+        var g2= parseInt(graph_id[1]);
+        var _url="/save_eval_2/"+g1+"/"+g2;
+        $(this).parent().parent().find('input:checkbox').each(function () {
+            checked=$(this).is(':checked')
+            _url+="/"+(checked ? "1" : "0");
+        });
+
+        console.log(_url);
+        $.ajax(_url).done(function () {
+            console.log("Saved Successfully")
+        });
+    });
+
+    // Check the first n criteria checkboxes of both annotation forms
+    function automatic_check(n){
+        n-=1;
+        var i =0;
+        $("input[name*='"+id_+"']").each(function () {
+            if (i > n){
+                return false;
+            }
+            $(this).prop("checked", true).change();
+            i+=1;
+        });
+        i=0;
+        $("input[name*='"+id_inv+"']").each(function () {
+            if (i > n){
+                return false;
+            }
+            $(this).prop("checked", true).change();
+            i+=1;
+        });
+    }
+    // "Different": toggle checkboxes on then off so that an all-false
+    // annotation is saved for both directions
+    function no_sim_check(){
+        n=1;
+        var i =0;
+        $("input[name*='"+id_+"']").each(function () {
+            if (i > n){
+                return false;
+            }
+            $(this).prop("checked", true).change();
+            $(this).prop("checked", false).change();
+            i+=1;
+        });
+        i=0;
+        $("input[name*='"+id_inv+"']").each(function () {
+            if (i > n){
+                return false;
+            }
+            $(this).prop("checked", true).change();
+            $(this).prop("checked", false).change();
+            i+=1;
+        });
+    }
+
+    $("#check_0").click(function () {
+        no_sim_check();
+    });
+    $("#check_1").click(function () {
+        automatic_check(1);
+    });
+    $("#check_2").click(function () {
+        automatic_check(2);
+    });
+    $("#check_3").click(function () {
+        automatic_check(3);
+    });
+    $("#check_4").click(function () {
+        automatic_check(4);
+    });
+</script>
+{% endblock %}
\ No newline at end of file
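The get_top_candidate helper introduced below replaces the old fuzzy lookup with an exact term query sorted by the gazetteer's precomputed importance score. A hedged sketch of the same request with elasticsearch-py is given here — it assumes the 'gazetteer'/'place' index layout used throughout helpers/gazeteer_helpers.py and an ES client version (2.x–6.x) where doc_type is still accepted, so treat it as illustrative rather than definitive:

    # Sketch of the query shape used by get_top_candidate (assumed index layout).
    from elasticsearch import Elasticsearch

    es = Elasticsearch("http://localhost:9200/")

    def top_candidates(label, lang="en", n=5):
        query = {
            "query": {"bool": {"must": [{"term": {lang: label}}]}},
            "sort": [{"score": {"order": "desc"}}],  # most important places first
            "size": n,
        }
        response = es.search(index="gazetteer", doc_type="place", body=query)
        hits = response["hits"].get("hits", [])
        return [h["_source"]["id"] for h in hits]

An exact term query is much cheaper than the fuzzy match it replaces, at the cost of missing spelling variants of the toponym.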
diff --git a/helpers/gazeteer_helpers.py b/helpers/gazeteer_helpers.py
index be39e22..3d7a2bb 100644
--- a/helpers/gazeteer_helpers.py
+++ b/helpers/gazeteer_helpers.py
@@ -174,12 +174,18 @@ def count_of_se(label, lang):
     return response["count"]
 
-
-def get_most_most_probable_candidates(label, lang="fr",n=5):
-    res = es.search("gazetteer", "place",
-                    body={"query":{"bool":{"must":[{"fuzzy":{lang:{"value":label}}}],"must_not":[],"should":[]}},"from":0,"size":n,"sort": [{'score':"desc"}],"aggs":{}})
-    results=[]
-    if res["hits"]["total"] > 0:
-        for hit in res["hits"]["hits"]:
-            results.append(hit["_source"])
-    return results
+def get_top_candidate(label, lang):
+    # Exact-match query on the requested language field, sorted by the
+    # precomputed importance score (descending), limited to 5 candidates
+    query = {"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}},
+             "sort": [
+                 {
+                     "score": {
+                         "order": "desc"
+                     }
+                 }
+             ], "size": 5}
+    # query = {"query":{"bool":{"must":[{"multi_match" :
+    #     "fuzzy":{"query":label,"fields": [ "en", "fr"
+    #     ,"es","de"]}}],"must_not":[],"should":[]}},"from":0,"size":500,"sort":[],"aggs":{}}
+    response = es.search('gazetteer', 'place', body=query)
+    if 'hits' in response['hits']:
+        return [x["_source"]["id"] for x in response['hits']['hits']]
+    return None
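The models/str.py hunk below is the "new adjacency source" announced in the commit message: when the boundary-based test fails, two spatial entities are still declared adjacent if their coordinates lie within one degree of each other and neither is a country or first-order region (classes A-PCLI and A-ADM1). A minimal, runnable sketch of that rule in isolation — the 'coord'/'class' record shape mirrors the gazetteer data, and note the threshold is Shapely's planar distance on lat/lon values (degrees), exactly as in the patch, not a geodesic distance:

    # Isolated sketch of the new adjacency fallback rule (illustrative).
    from shapely.geometry import Point

    STOP_CLASS = {"A-PCLI", "A-ADM1"}  # countries and first-order regions

    def fallback_adjacent(data1, data2, threshold=1.0):
        p1 = Point(data1["coord"]["lat"], data1["coord"]["lon"])
        p2 = Point(data2["coord"]["lat"], data2["coord"]["lon"])
        close = p1.distance(p2) < threshold  # planar distance, in degrees
        not_region = not (set(data1["class"]) & STOP_CLASS) \
                     and not (set(data2["class"]) & STOP_CLASS)
        return close and not_region

    print(fallback_adjacent(
        {"coord": {"lat": -18.9, "lon": 47.5}, "class": ["P-PPLC"]},  # Antananarivo
        {"coord": {"lat": -19.0, "lon": 47.0}, "class": ["P-PPL"]},   # nearby town
    ))  # True

Since one degree of longitude shrinks with latitude, this is a deliberately coarse proximity test; excluding country- and region-level entities keeps it from linking everything inside a country to the country itself.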
diff --git a/models/str.py b/models/str.py
index 88eef86..8ebe3c9 100644
--- a/models/str.py
+++ b/models/str.py
@@ -5,8 +5,11 @@ import warnings
 import networkx as nx
 import pandas as pd
 import logging
+
+from shapely.geometry import Point
+
 from config.configuration import config
-logging.basicConfig(filename=config.log_file,level=logging.INFO)
+#logging.basicConfig(filename=config.log_file,level=logging.INFO)
 
 from helpers.boundary import is_intersect
 
@@ -229,6 +232,7 @@ class STR(object):
         Return all the adjacency relationships between all the spatial entities in the STR.
         :return:
         """
+        stop_class=set(["A-PCLI","A-ADM1"]) # country/region entities excluded from the distance fallback
         for se1 in self.spatial_entities:
             data_se1 = get_data(se1)
             for se2 in self.spatial_entities:
@@ -263,6 +267,11 @@ class STR(object):
                     #else:
                         #if is_intersect(se1, se2, self.shapes):
                             #f = True
+                    if not f:
+                        #print(data_se1,data_se2)
+                        # Fallback: adjacency if within one degree and neither entity is a country or region
+                        if Point(data_se1["coord"]["lat"],data_se1["coord"]["lon"]).distance(Point(data_se2["coord"]["lat"], data_se2["coord"]["lon"])) < 1\
+                                and len(set(data_se1["class"]) & stop_class) < 1 and len(set(data_se2["class"]) & stop_class) < 1:
+                            f=True
                     self.add_adjacency_rel(se1, se2,f)
 
@@ -290,6 +299,7 @@ class STR(object):
             for se2 in self.adjacency_relationships[se1]:
                 if self.adjacency_relationships[se1][se2]:
                     graph.add_edge(se1,se2, key=0, color="green")
+                    graph.add_edge(se2, se1, key=0, color="green") # adjacency is symmetric
         logging.info("Extract Adjacency Rel\t{0}".format(time.time()-debut))
 
         if inc:
diff --git a/nlp/bow_se.py b/nlp/bow_se.py
index 9eee362..43e67c5 100644
--- a/nlp/bow_se.py
+++ b/nlp/bow_se.py
@@ -15,7 +15,7 @@ class BOWSE(object):
     __depreciated__ = True
 
     @staticmethod
-    def compare(graph_list, verbose=False):
+    def compare(graph_list, selected,verbose=False):
         """Compute the all-pairs kernel values for a list of graphs.
         This function can be used to directly compute
         the kernel matrix for a list of graphs. The direct computation of the
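BOWSE.compare gains the same selected parameter as the Cython measures patched earlier (HED, Jaccard, Weisfeiler-Lehman, vertex/edge overlap). The shared shape of those guarded comparison loops, extracted into a standalone sketch — pair_score is a stand-in for whichever measure is plugged in, and note the real distance measures write np.inf rather than 0 when a pair cannot be scored:

    # Generic shape of the guarded all-pairs comparison loop (illustrative).
    import numpy as np

    def compare(listgs, selected, pair_score):
        n = len(listgs)
        m = np.zeros((n, n))
        for i in range(n):
            for j in range(i, n):
                ok = bool(listgs[i]) and bool(listgs[j])  # skip empty graphs
                if selected is not None and i not in selected:
                    ok = False  # only fill rows for the selected indices
                m[i, j] = pair_score(listgs[i], listgs[j]) if ok else 0.0
                m[j, i] = m[i, j]  # the matrix is kept symmetric
        return m

    # e.g. with a trivial Jaccard score on node sets:
    graphs = [{"a", "b"}, {"b", "c"}, set()]
    jaccard = lambda g1, g2: len(g1 & g2) / len(g1 | g2)
    print(compare(graphs, None, jaccard))

Only rows of selected indices are computed, but the symmetric assignment still fills the corresponding columns, which is what makes the restriction cheap without leaving the matrix inconsistent.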
diff --git a/nlp/disambiguator/models/bigram.py b/nlp/disambiguator/models/bigram.py
index aa50f70..d9ce129 100644
--- a/nlp/disambiguator/models/bigram.py
+++ b/nlp/disambiguator/models/bigram.py
@@ -20,22 +20,24 @@ class BigramModel:
                 self.count_associated[uri]=0
             self.count_associated[uri]+=1
 
-    def get_coocurence_probability_dict(self,*args):
+    def get_coocurence_probability(self, pr1, *args):
         if len(args) <2:
             print("Only one URI indicated")
             return 0.
         res_=1.
         for u in range(1,len(args)):
-            res_*=self.get_bigram_probability(args[0],args[u])
+            res_*=self.get_bigram_probability(args[0],args[u],pr1)
         return res_
 
-    def get_bigram_probability(self,uri1,uri2):
-        try:
-            return (self.cooc_freq[uri1][uri2] / self.count_associated[uri1])
-        except Exception as e:
-            try:
-                return (self.cooc_freq[uri2][uri1] / self.count_associated[uri1])
-            except Exception as e:
-                return 0.00000000000000001
+    def get_bigram_probability(self,uri1,uri2,pr1=1):
+        # Membership tests replace the nested try/except; pr1 is an additive
+        # prior for uri1 (e.g. its PageRank-style score)
+        nna=0.00000000000000001 # floor value returned for unseen pairs
+        if uri1 in self.cooc_freq:
+            if uri2 in self.cooc_freq[uri1]:
+                return (self.cooc_freq[uri1][uri2] / self.count_associated[uri1])+pr1
+        elif uri2 in self.cooc_freq:
+            if uri1 in self.cooc_freq[uri2]:
+                return (self.cooc_freq[uri2][uri1] / self.count_associated[uri1])+pr1
+        return nna
+
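Restated outside the class, the new lookup reads as follows — a minimal sketch assuming the same structures the model holds (cooc_freq[u1][u2] is a co-occurrence count, count_associated[u1] a total count):

    # Minimal re-statement of the rewritten bigram lookup (illustrative).
    EPS = 1e-17  # the patch's floor value, 0.00000000000000001

    def bigram_probability(cooc_freq, count_associated, u1, u2, pr1=1.0):
        if u1 in cooc_freq and u2 in cooc_freq[u1]:
            return cooc_freq[u1][u2] / count_associated[u1] + pr1
        if u2 in cooc_freq and u1 in cooc_freq[u2]:
            return cooc_freq[u2][u1] / count_associated[u1] + pr1
        return EPS  # unseen pair: tiny non-zero mass so products never hit 0

    cooc = {"GD1": {"GD2": 3}}
    counts = {"GD1": 10}
    print(bigram_probability(cooc, counts, "GD1", "GD2", pr1=0.5))  # 0.8
    print(bigram_probability(cooc, counts, "GD1", "GD9"))           # 1e-17

Note that with the additive prior the returned value can exceed 1, so downstream code should treat it as a score rather than a true probability — which is how disambiguate_wiki below uses it, as an edge weight.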
diff --git a/nlp/disambiguator/wikipedia_cooc.py b/nlp/disambiguator/wikipedia_cooc.py
index d441218..29d3591 100644
--- a/nlp/disambiguator/wikipedia_cooc.py
+++ b/nlp/disambiguator/wikipedia_cooc.py
@@ -1,10 +1,13 @@
 # coding = utf-8
+import re
+
 from nlp.disambiguator.disambiguator import Disambiguator
 from nlp.disambiguator.models.bigram import BigramModel
 import pickle
 from config.configuration import config
-from helpers.gazeteer_helpers import get_data,count_of_se,get_most_common_id_v3
+from helpers.gazeteer_helpers import get_data,count_of_se,get_most_common_id_v3,get_top_candidate
 from .most_common import stop_words,common_words
+import networkx as nx
 
 def read_pickle(fn):
     return pickle.load(open(fn,'rb'))
@@ -19,27 +22,75 @@ class WikipediaDisambiguator(Disambiguator):
     def disambiguate(self, ner_result, lang="en"):
         count, se_ = self.extract_se_entities(ner_result)
         new_count = {}
-        selected_en = {}
-        fixed_se=set([])
-        for en in se_:
-            if count_of_se(en,lang) == 1 :
-                fixed_se.add(en)
-        if len(fixed_se) >0:
-            selected_en = self.part_ambiguous(list(set(se_)-fixed_se),list(fixed_se))
-        else:
-            selected_en = self.all_ambiguous(se_)
-        for en in se_:
-            id_,score=self.disambiguate_(en,lang)
-            if not id_ =="O" and id_:
-                selected_en[id_] = en
-                new_count[id_] = count[en]
+        selected_en_rev = {}
+        selected_en = self.disambiguate_wiki(se_,lang)
+        for en in selected_en:
+            selected_en_rev[selected_en[en]]=en
+            #new_count[selected_en[en]] = count[en]
         return new_count, selected_en
 
-    def all_ambiguous(self,entities):
-        pass
+    def disambiguate_wiki(self, entities, lang):
+        # Filter out numbers, stop words and common words (singular or plural)
+        spat_en=[]
+        for e in entities:
+            if re.match(r"^\d+$", e):
+                continue
+            if e.lower().rstrip("s") in stop_words[lang] or e.lower().rstrip("s") in common_words[lang]:
+                continue
+
+            plural = e.rstrip("s") + "s"
+            if plural.lower() in stop_words[lang] or plural.lower() in common_words[lang]:
+                continue
+            spat_en.append(e)
+
+        g = nx.Graph()
+
+        possible_candidates = []
+        betw_cand={} # for each candidate, the competing candidates of the same toponym
+        group_candidate = {} # candidates per toponym
+        for e in spat_en:
+            cand = get_top_candidate(e, lang)
+            group_candidate[e] = cand
+            betw_cand[e]=cand
+            for n in cand:
+                betw_cand[n]=set(cand)-{n} # exclude the candidate itself
+            possible_candidates.extend(cand)
+
+        for cand in possible_candidates:
+            g.add_node(cand, label=get_data(cand)[lang])
+
+        for cand in possible_candidates:
+            for cand2 in possible_candidates:
+                # Get the PageRank-style score of the first candidate
+                d = get_data(cand)
+                sc = 1
+                if "score" in d:
+                    sc = float(d["score"])
+
+                # Compute the co-occurrence probability
+                prob = self.model.get_coocurence_probability(sc, cand, cand2)
+                # Candidates of the same toponym never co-occur
+                if cand2 in betw_cand[cand] or cand in betw_cand[cand2]:
+                    prob = 0.0
+                if prob < 0.0000001:
+                    prob = 0.0
+                if not cand == cand2:
+                    # keep the lowest co-occurrence probability between the two candidates
+                    if (cand2, cand) in g.edges():
+                        if g.edge[cand2][cand]["weight"] < prob:
+                            continue
+                    g.add_edge(cand, cand2, weight=prob)
+
+        selected = {}
+        # Take, for each toponym, the candidate with the highest weighted degree
+        for gr in group_candidate:
+            try:
+                selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight'))
+            except:
+                # empty candidate group: fall back to the most common sense
+                selected[gr]=get_most_common_id_v3(gr,lang)
+        #print(entities,selected)
+        return selected
 
-    def part_ambiguous(self,ambiguous_entities,fixed_entities):
-        pass
\ No newline at end of file
diff --git a/notebooks/EvalDesambiguisationMada.ipynb b/notebooks/EvalDesambiguisationMada.ipynb
index 4cb419e..9045fca 100644
--- a/notebooks/EvalDesambiguisationMada.ipynb
+++ b/notebooks/EvalDesambiguisationMada.ipynb
@@ -5,8 +5,8 @@
    "execution_count": 1,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-05-16T23:58:48.134280Z",
-     "start_time": "2018-05-16T23:58:47.729327Z"
+     "end_time": "2018-06-19T13:09:12.991345Z",
+     "start_time": "2018-06-19T13:09:12.578369Z"
     }
    },
    "outputs": [],
@@ -19,8 +19,8 @@
    "execution_count": 2,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-05-16T23:58:48.140894Z",
-     "start_time": "2018-05-16T23:58:48.136384Z"
+     "end_time": "2018-06-19T13:09:13.002216Z",
+     "start_time": "2018-06-19T13:09:12.998336Z"
     }
    },
    "outputs": [
@@ -41,8 +41,8 @@
    "execution_count": 3,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-05-16T23:58:48.150739Z",
-     "start_time": "2018-05-16T23:58:48.143107Z"
+     "end_time": "2018-06-19T13:09:14.674713Z",
+     "start_time": "2018-06-19T13:09:14.668234Z"
     }
    },
    "outputs": [],
@@ -57,8 +57,8 @@
    "execution_count": 4,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-05-16T23:58:48.173363Z",
-     "start_time": "2018-05-16T23:58:48.153066Z"
+     "end_time": "2018-06-19T13:09:14.912185Z",
+     "start_time": "2018-06-19T13:09:14.895298Z"
     }
    },
    "outputs": [],
@@ -73,25 +73,36 @@
    "execution_count": 5,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-05-16T23:58:48.864223Z",
-     "start_time": "2018-05-16T23:58:48.177516Z"
+     "end_time": "2018-06-19T13:09:20.638699Z",
+     "start_time": "2018-06-19T13:09:17.343687Z"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "ERROR:root:Line magic function `%autoreload` not found.\n"
+     ]
+    }
+   ],
    "source": [
+    "%autoreload\n",
     "from nlp.disambiguator.geodict_gaurav import GauravGeodict\n",
     "from nlp.disambiguator.most_common import MostCommonDisambiguator\n",
+    "from nlp.disambiguator.wikipedia_cooc import WikipediaDisambiguator\n",
     "disMost_common=MostCommonDisambiguator()\n",
-    "disGaurav=GauravGeodict()"
+    "disGaurav=GauravGeodict()\n",
+    "disWiki=WikipediaDisambiguator()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 7,
    "metadata": {
"ExecuteTime": { - "end_time": "2018-05-17T00:59:53.695102Z", - "start_time": "2018-05-17T00:59:53.685756Z" + "end_time": "2018-06-19T13:10:29.965681Z", + "start_time": "2018-06-19T13:10:29.952223Z" } }, "outputs": [], @@ -106,11 +117,11 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T00:01:52.885111Z", - "start_time": "2018-05-17T00:01:52.850434Z" + "end_time": "2018-06-03T19:13:08.776780Z", + "start_time": "2018-06-03T19:13:08.752046Z" } }, "outputs": [], @@ -120,11 +131,11 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T01:00:09.181696Z", - "start_time": "2018-05-17T01:00:09.178578Z" + "end_time": "2018-06-03T19:13:13.030925Z", + "start_time": "2018-06-03T19:13:13.028591Z" } }, "outputs": [], @@ -134,11 +145,11 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T00:59:55.445531Z", - "start_time": "2018-05-17T00:59:55.407867Z" + "end_time": "2018-06-03T19:13:13.238647Z", + "start_time": "2018-06-03T19:13:13.212601Z" } }, "outputs": [], @@ -149,16 +160,22 @@ " res_dis=disGaurav.eval(df2[\"text\"].unique(),lang)\n", " df2[\"disambiguation\"]=df2.text.apply(lambda x:res_dis[x] if x in res_dis else \"0\")\n", " return (df2.GID == df2.disambiguation).sum()/len(df2)\n", + "\n", + "def accuracyWiki(df,lang):\n", + " df2=df[-df[\"GID\"].isin([\"O\",\"NR\",\"o\"])][[\"text\",\"GID\"]]\n", + " res_dis=disWiki.disambiguate_wiki(df2[\"text\"].unique(),lang)\n", + " df2[\"disambiguation\"]=df2.text.apply(lambda x:res_dis[x] if x in res_dis else \"0\")\n", + " return (df2.GID == df2.disambiguation).sum()/len(df2)\n", "#df" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 11, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T01:22:15.528864Z", - "start_time": "2018-05-17T01:01:01.373760Z" + "end_time": "2018-06-03T19:43:28.769834Z", + "start_time": "2018-06-03T19:15:06.598715Z" } }, "outputs": [ @@ -166,19 +183,20 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:6: RuntimeWarning: invalid value encountered in long_scalars\n", - " \n" + "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:12: RuntimeWarning: invalid value encountered in long_scalars\n", + " if sys.path[0] == '':\n" ] } ], "source": [ - "acc_MC,acc_GEO=[],[]\n", + "acc_MC,acc_GEO,acc_wiki=[],[],[]\n", "for fn in fns:\n", " id_=int(re.findall(r\"\\d+\",fn)[-1])\n", " \n", " df=pd.read_csv(fn)\n", - " acc_MC.append(accuracyMostCommon(df,data_lang[id_]))\n", - " acc_GEO.append(accuracyGeodict(df,data_lang[id_]))\n", + " acc_wiki.append(accuracyWiki(df,data_lang[id_]))\n", + " #acc_MC.append(accuracyMostCommon(df,data_lang[id_]))\n", + " #acc_GEO.append(accuracyGeodict(df,data_lang[id_]))\n", " " ] }, @@ -233,6 +251,718 @@ "np.mean(np.nan_to_num(acc_MC))" ] }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2018-06-03T19:44:42.307528Z", + "start_time": "2018-06-03T19:44:42.295687Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.740705700091002" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "np.mean(np.nan_to_num(acc_wiki))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + 
"end_time": "2018-06-19T13:12:33.632268Z", + "start_time": "2018-06-19T13:12:26.349957Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "57451 9248\n", + "6.212262110726644\n" + ] + } + ], + "source": [ + "from helpers.gazeteer_helpers import count_of_se\n", + "sum_,count=0,0\n", + "for fn in fns:\n", + " try:\n", + " id_=int(re.findall(r\"\\d+\",fn)[-1])\n", + " df=pd.read_csv(fn)\n", + " df2=df[-df[\"GID\"].isin([\"O\",\"NR\",\"o\"])][[\"text\",\"GID\"]]\n", + " counts_t=df2.text.apply(lambda x: count_of_se(x,lang=data_lang[id_]))\n", + " sum_+=counts_t.sum()\n", + " count+=len(counts_t)\n", + " except:\n", + " pass\n", + "print(sum_,count)\n", + "print(sum_/count)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2018-06-19T13:10:32.794585Z", + "start_time": "2018-06-19T13:10:32.759937Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Unnamed: 0</th>\n", + " <th>Unnamed: 0.1</th>\n", + " <th>Unnamed: 0.1.1</th>\n", + " <th>Unnamed: 0.1.1.1</th>\n", + " <th>diff2</th>\n", + " <th>text</th>\n", + " <th>pos_</th>\n", + " <th>ent_type_</th>\n", + " <th>GID</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " <td>Réunion</td>\n", + " <td>NOUN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1.0</td>\n", + " <td>Lac 2</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>2.0</td>\n", + " <td>Sud</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>3</td>\n", + " <td>3</td>\n", + " <td>3</td>\n", + " <td>3</td>\n", + " <td>3.0</td>\n", + " <td>Lac 2</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>4</td>\n", + " <td>4</td>\n", + " <td>4</td>\n", + " <td>4</td>\n", + " <td>4.0</td>\n", + " <td>BV Lac 2</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>5</td>\n", + " <td>5</td>\n", + " <td>5</td>\n", + " <td>5</td>\n", + " <td>5.0</td>\n", + " <td>BV Lac 2</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>6</td>\n", + " <td>6</td>\n", + " <td>6</td>\n", + " <td>6</td>\n", + " <td>6.0</td>\n", + " <td>Madagascar</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD3404996</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>7</td>\n", + " <td>7</td>\n", + " <td>7</td>\n", + " <td>7</td>\n", + " <td>7.0</td>\n", + " <td>Madagascar</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD3404996</td>\n", + " </tr>\n", + " <tr>\n", + " 
<th>8</th>\n", + " <td>8</td>\n", + " <td>8</td>\n", + " <td>8</td>\n", + " <td>8</td>\n", + " <td>8.0</td>\n", + " <td>–</td>\n", + " <td>PUNCT</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>9</td>\n", + " <td>9</td>\n", + " <td>9</td>\n", + " <td>9</td>\n", + " <td>9.0</td>\n", + " <td>Etat</td>\n", + " <td>NOUN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>10</td>\n", + " <td>10</td>\n", + " <td>10</td>\n", + " <td>10</td>\n", + " <td>10.0</td>\n", + " <td>Madagascar</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD3404996</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>11</td>\n", + " <td>11</td>\n", + " <td>11</td>\n", + " <td>11</td>\n", + " <td>11.0</td>\n", + " <td>Lac 2</td>\n", + " <td>SPACE</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>12</td>\n", + " <td>12</td>\n", + " <td>12</td>\n", + " <td>12</td>\n", + " <td>12.0</td>\n", + " <td>Madagascar</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD3404996</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>13</td>\n", + " <td>13</td>\n", + " <td>13</td>\n", + " <td>13</td>\n", + " <td>13.0</td>\n", + " <td>Madagascar</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD3404996</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>14</td>\n", + " <td>14</td>\n", + " <td>14</td>\n", + " <td>14</td>\n", + " <td>14.0</td>\n", + " <td>Directeur</td>\n", + " <td>NOUN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>15</td>\n", + " <td>15</td>\n", + " <td>15</td>\n", + " <td>15</td>\n", + " <td>15.0</td>\n", + " <td>Lac</td>\n", + " <td>SPACE</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>16.0</td>\n", + " <td>Lac 2</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>17</td>\n", + " <td>17</td>\n", + " <td>17</td>\n", + " <td>17</td>\n", + " <td>17.0</td>\n", + " <td>Paris</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD5400765</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>18</td>\n", + " <td>18</td>\n", + " <td>18</td>\n", + " <td>18</td>\n", + " <td>18.0</td>\n", + " <td>Antananarivo</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD3682867</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>19</td>\n", + " <td>19</td>\n", + " <td>19</td>\n", + " <td>19</td>\n", + " <td>19.0</td>\n", + " <td>Directions Régionales</td>\n", + " <td>SPACE</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>20</td>\n", + " <td>20</td>\n", + " <td>20</td>\n", + " <td>20</td>\n", + " <td>20.0</td>\n", + " <td>Centres</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>21</td>\n", + " <td>21</td>\n", + " <td>21</td>\n", + " <td>21</td>\n", + " <td>21.0</td>\n", + " <td>Services Agricoles</td>\n", + " <td>SPACE</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>22</td>\n", + " <td>22</td>\n", + " <td>22</td>\n", + " <td>22</td>\n", + " <td>22.0</td>\n", + " <td>BV Lac</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + 
" <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>23</td>\n", + " <td>23</td>\n", + " <td>23</td>\n", + " <td>23</td>\n", + " <td>23.0</td>\n", + " <td>jusqu’</td>\n", + " <td>VERB</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>24</td>\n", + " <td>24</td>\n", + " <td>24</td>\n", + " <td>24</td>\n", + " <td>24.0</td>\n", + " <td>Antananarivo</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD3682867</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>25</td>\n", + " <td>25</td>\n", + " <td>25</td>\n", + " <td>25</td>\n", + " <td>25.0</td>\n", + " <td>Suivi</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>26</td>\n", + " <td>26</td>\n", + " <td>26</td>\n", + " <td>26</td>\n", + " <td>26.0</td>\n", + " <td>Ambositra</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD6124882</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>27</td>\n", + " <td>27</td>\n", + " <td>27</td>\n", + " <td>27</td>\n", + " <td>27.0</td>\n", + " <td>Farafangana</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD2452325</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>28</td>\n", + " <td>28</td>\n", + " <td>28</td>\n", + " <td>28</td>\n", + " <td>28.0</td>\n", + " <td>du Sud</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>29</td>\n", + " <td>29</td>\n", + " <td>29</td>\n", + " <td>29</td>\n", + " <td>29.0</td>\n", + " <td>Est</td>\n", + " <td>NOUN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30</th>\n", + " <td>30</td>\n", + " <td>30</td>\n", + " <td>30</td>\n", + " <td>30</td>\n", + " <td>30.0</td>\n", + " <td>seuil</td>\n", + " <td>NOUN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31</th>\n", + " <td>31</td>\n", + " <td>31</td>\n", + " <td>31</td>\n", + " <td>31</td>\n", + " <td>31.0</td>\n", + " <td>BV Lac 2</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>32</th>\n", + " <td>32</td>\n", + " <td>32</td>\n", + " <td>32</td>\n", + " <td>32</td>\n", + " <td>32.0</td>\n", + " <td>jusqu’</td>\n", + " <td>VERB</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>33</th>\n", + " <td>33</td>\n", + " <td>33</td>\n", + " <td>33</td>\n", + " <td>33</td>\n", + " <td>33.0</td>\n", + " <td>BV Lac 2</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>34</th>\n", + " <td>34</td>\n", + " <td>34</td>\n", + " <td>34</td>\n", + " <td>34</td>\n", + " <td>34.0</td>\n", + " <td>Secrétaire</td>\n", + " <td>NOUN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>35</th>\n", + " <td>35</td>\n", + " <td>35</td>\n", + " <td>35</td>\n", + " <td>35</td>\n", + " <td>35.0</td>\n", + " <td>Alaotra</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>36</th>\n", + " <td>36</td>\n", + " <td>36</td>\n", + " <td>36</td>\n", + " <td>36</td>\n", + " <td>36.0</td>\n", + " <td>Mangoro</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD3574285</td>\n", + " </tr>\n", + " <tr>\n", + " <th>37</th>\n", + " <td>37</td>\n", + " <td>37</td>\n", + " <td>37</td>\n", + " <td>37</td>\n", + " <td>37.0</td>\n", + " <td>Directeur</td>\n", + " 
<td>NOUN</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>38</th>\n", + " <td>38</td>\n", + " <td>38</td>\n", + " <td>38</td>\n", + " <td>38</td>\n", + " <td>38.0</td>\n", + " <td>Lac 2 et</td>\n", + " <td>SPACE</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>39</th>\n", + " <td>39</td>\n", + " <td>39</td>\n", + " <td>39</td>\n", + " <td>39</td>\n", + " <td>39.0</td>\n", + " <td>Sous réserve</td>\n", + " <td>VERB</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " <tr>\n", + " <th>40</th>\n", + " <td>40</td>\n", + " <td>40</td>\n", + " <td>40</td>\n", + " <td>40</td>\n", + " <td>40.0</td>\n", + " <td>Grandjean</td>\n", + " <td>PROPN</td>\n", + " <td>LOC</td>\n", + " <td>GD3254594</td>\n", + " </tr>\n", + " <tr>\n", + " <th>41</th>\n", + " <td>41</td>\n", + " <td>41</td>\n", + " <td>41</td>\n", + " <td>41</td>\n", + " <td>41.0</td>\n", + " <td>jusqu’</td>\n", + " <td>VERB</td>\n", + " <td>LOC</td>\n", + " <td>O</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Unnamed: 0 Unnamed: 0.1 Unnamed: 0.1.1 Unnamed: 0.1.1.1 diff2 \\\n", + "0 0 0 0 0 0.0 \n", + "1 1 1 1 1 1.0 \n", + "2 2 2 2 2 2.0 \n", + "3 3 3 3 3 3.0 \n", + "4 4 4 4 4 4.0 \n", + "5 5 5 5 5 5.0 \n", + "6 6 6 6 6 6.0 \n", + "7 7 7 7 7 7.0 \n", + "8 8 8 8 8 8.0 \n", + "9 9 9 9 9 9.0 \n", + "10 10 10 10 10 10.0 \n", + "11 11 11 11 11 11.0 \n", + "12 12 12 12 12 12.0 \n", + "13 13 13 13 13 13.0 \n", + "14 14 14 14 14 14.0 \n", + "15 15 15 15 15 15.0 \n", + "16 16 16 16 16 16.0 \n", + "17 17 17 17 17 17.0 \n", + "18 18 18 18 18 18.0 \n", + "19 19 19 19 19 19.0 \n", + "20 20 20 20 20 20.0 \n", + "21 21 21 21 21 21.0 \n", + "22 22 22 22 22 22.0 \n", + "23 23 23 23 23 23.0 \n", + "24 24 24 24 24 24.0 \n", + "25 25 25 25 25 25.0 \n", + "26 26 26 26 26 26.0 \n", + "27 27 27 27 27 27.0 \n", + "28 28 28 28 28 28.0 \n", + "29 29 29 29 29 29.0 \n", + "30 30 30 30 30 30.0 \n", + "31 31 31 31 31 31.0 \n", + "32 32 32 32 32 32.0 \n", + "33 33 33 33 33 33.0 \n", + "34 34 34 34 34 34.0 \n", + "35 35 35 35 35 35.0 \n", + "36 36 36 36 36 36.0 \n", + "37 37 37 37 37 37.0 \n", + "38 38 38 38 38 38.0 \n", + "39 39 39 39 39 39.0 \n", + "40 40 40 40 40 40.0 \n", + "41 41 41 41 41 41.0 \n", + "\n", + " text pos_ ent_type_ GID \n", + "0 Réunion NOUN LOC O \n", + "1 Lac 2 PROPN LOC O \n", + "2 Sud PROPN LOC O \n", + "3 Lac 2 PROPN LOC O \n", + "4 BV Lac 2 PROPN LOC O \n", + "5 BV Lac 2 PROPN LOC O \n", + "6 Madagascar PROPN LOC GD3404996 \n", + "7 Madagascar PROPN LOC GD3404996 \n", + "8 – PUNCT LOC O \n", + "9 Etat NOUN LOC O \n", + "10 Madagascar PROPN LOC GD3404996 \n", + "11 Lac 2 SPACE LOC O \n", + "12 Madagascar PROPN LOC GD3404996 \n", + "13 Madagascar PROPN LOC GD3404996 \n", + "14 Directeur NOUN LOC O \n", + "15 Lac SPACE LOC O \n", + "16 Lac 2 PROPN LOC O \n", + "17 Paris PROPN LOC GD5400765 \n", + "18 Antananarivo PROPN LOC GD3682867 \n", + "19 Directions Régionales SPACE LOC O \n", + "20 Centres PROPN LOC O \n", + "21 Services Agricoles SPACE LOC O \n", + "22 BV Lac PROPN LOC O \n", + "23 jusqu’ VERB LOC O \n", + "24 Antananarivo PROPN LOC GD3682867 \n", + "25 Suivi PROPN LOC O \n", + "26 Ambositra PROPN LOC GD6124882 \n", + "27 Farafangana PROPN LOC GD2452325 \n", + "28 du Sud PROPN LOC O \n", + "29 Est NOUN LOC O \n", + "30 seuil NOUN LOC O \n", + "31 BV Lac 2 PROPN LOC O \n", + "32 jusqu’ VERB LOC O \n", + "33 BV Lac 2 PROPN LOC O \n", + "34 Secrétaire NOUN LOC O \n", + "35 Alaotra PROPN LOC O \n", + "36 Mangoro PROPN 
LOC GD3574285 \n", + "37 Directeur NOUN LOC O \n", + "38 Lac 2 et SPACE LOC O \n", + "39 Sous réserve VERB LOC O \n", + "40 Grandjean PROPN LOC GD3254594 \n", + "41 jusqu’ VERB LOC O " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/notebooks/EvalDesambiguisationPADIWEB.ipynb b/notebooks/EvalDesambiguisationPADIWEB.ipynb index ba763a8..83ab07b 100644 --- a/notebooks/EvalDesambiguisationPADIWEB.ipynb +++ b/notebooks/EvalDesambiguisationPADIWEB.ipynb @@ -2,11 +2,11 @@ "cells": [ { "cell_type": "code", - "execution_count": 7, + "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T00:50:38.399698Z", - "start_time": "2018-05-17T00:50:38.396888Z" + "end_time": "2018-06-19T12:57:56.566077Z", + "start_time": "2018-06-19T12:57:56.076820Z" } }, "outputs": [], @@ -20,8 +20,8 @@ "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T00:48:13.001356Z", - "start_time": "2018-05-17T00:48:12.994569Z" + "end_time": "2018-06-19T12:57:56.766774Z", + "start_time": "2018-06-19T12:57:56.761060Z" } }, "outputs": [ @@ -39,11 +39,11 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T00:54:11.406691Z", - "start_time": "2018-05-17T00:54:11.400933Z" + "end_time": "2018-06-19T12:58:25.165818Z", + "start_time": "2018-06-19T12:58:25.056576Z" } }, "outputs": [], @@ -64,11 +64,11 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T01:25:05.006779Z", - "start_time": "2018-05-17T01:25:05.000357Z" + "end_time": "2018-06-19T12:58:25.614490Z", + "start_time": "2018-06-19T12:58:25.607038Z" } }, "outputs": [], @@ -88,11 +88,11 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T00:56:26.195260Z", - "start_time": "2018-05-17T00:56:26.185713Z" + "end_time": "2018-06-19T13:00:51.545645Z", + "start_time": "2018-06-19T13:00:51.538149Z" } }, "outputs": [], @@ -104,11 +104,11 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T01:05:10.917961Z", - "start_time": "2018-05-17T01:05:10.915317Z" + "end_time": "2018-06-19T12:58:56.147169Z", + "start_time": "2018-06-19T12:58:56.132754Z" } }, "outputs": [], @@ -118,28 +118,31 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 17, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T00:57:28.905930Z", - "start_time": "2018-05-17T00:57:28.346854Z" + "end_time": "2018-06-03T18:46:38.252413Z", + "start_time": "2018-06-03T18:46:35.836908Z" } }, "outputs": [], "source": [ + "%autoreload\n", "from nlp.disambiguator.geodict_gaurav import GauravGeodict\n", "from nlp.disambiguator.most_common import MostCommonDisambiguator\n", + "from nlp.disambiguator.wikipedia_cooc import WikipediaDisambiguator\n", "disMost_common=MostCommonDisambiguator()\n", - "disGaurav=GauravGeodict()" + "disGaurav=GauravGeodict()\n", + "disWiki=WikipediaDisambiguator()" ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T01:10:19.593778Z", - "start_time": "2018-05-17T01:10:19.585332Z" + "end_time": "2018-06-03T18:40:57.064904Z", + "start_time": "2018-06-03T18:40:57.043921Z" } }, "outputs": [], @@ -154,11 +157,11 
@@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T00:01:52.885111Z", - "start_time": "2018-05-17T00:01:52.850434Z" + "end_time": "2018-06-03T18:40:58.360243Z", + "start_time": "2018-06-03T18:40:58.203320Z" } }, "outputs": [], @@ -168,21 +171,33 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 19, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T01:10:21.463216Z", - "start_time": "2018-05-17T01:10:21.098003Z" + "end_time": "2018-06-03T18:46:54.196478Z", + "start_time": "2018-06-03T18:46:53.863582Z" } }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rivers State GD4106855 12.73152386775468\n", + "Kano GD4103071 21.675014816832682\n", + "Kano GD4103071 21.675014816832682\n", + "Lagos GD4468122 124.6205202335819\n", + "Lagos GD4468122 124.6205202335819\n", + "Port Harcourt GD791183 15.777445058883712\n" + ] + }, { "data": { "text/plain": [ "0.6666666666666666" ] }, - "execution_count": 52, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -194,11 +209,11 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 14, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T01:06:38.089187Z", - "start_time": "2018-05-17T01:06:38.080846Z" + "end_time": "2018-06-03T18:45:45.708459Z", + "start_time": "2018-06-03T18:45:45.679984Z" } }, "outputs": [], @@ -209,17 +224,22 @@ " res_dis=disGaurav.eval(df2[\"content\"].unique(),lang)\n", " df2[\"disambiguation\"]=df2.content.apply(lambda x:res_dis[x] if x in res_dis else \"0\")\n", " return (df2.GID == df2.disambiguation).sum()/len(df2)\n", + "def accuracyWiki(df,lang):\n", + " df2=df[-df[\"GID\"].isin([\"O\",\"NR\",\"o\"])][[\"content\",\"GID\"]]\n", + " res_dis=disWiki.disambiguate(df2[\"content\"].unique(),lang)\n", + " df2[\"disambiguation\"]=df2.content.apply(lambda x:res_dis[x] if x in res_dis else \"0\")\n", + " return (df2.GID == df2.disambiguation).sum()/len(df2)\n", "#df\n", "#df" ] }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 20, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T01:37:02.165192Z", - "start_time": "2018-05-17T01:25:31.325566Z" + "end_time": "2018-06-03T18:53:53.880676Z", + "start_time": "2018-06-03T18:48:05.294472Z" } }, "outputs": [ @@ -229,21 +249,20 @@ "text": [ "/usr/local/lib/python3.6/site-packages/pandas/core/ops.py:816: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n", " result = getattr(x, name)(y)\n", - "/Users/jacquesfize/nas_cloud/Code/str-python/helpers/collision.py:30: RuntimeWarning: invalid value encountered in double_scalars\n", - " d_over_o_squared = d/np.dot(o, o) + 1e-10\n", - "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:6: RuntimeWarning: invalid value encountered in long_scalars\n", - " \n" + "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:11: RuntimeWarning: invalid value encountered in long_scalars\n", + " # This is added back by InteractiveShellApp.init_path()\n" ] } ], "source": [ - "acc_MC,acc_GEO=[],[]\n", + "acc_MC,acc_GEO,acc_wiki=[],[],[]\n", "for fn in fns:\n", " \n", " try:\n", " df,lang=parse_file(fn)\n", - " acc_MC.append(accuracyMostCommon(df,lang))\n", - " acc_GEO.append(accuracyGeodict(df,lang))\n", + " #acc_MC.append(accuracyMostCommon(df,lang))\n", + " #acc_GEO.append(accuracyGeodict(df,lang))\n", + " acc_wiki.append(accuracyWiki(df,lang))\n", " 
except:\n", " pass\n", " " @@ -300,6 +319,86 @@ "np.mean(np.nan_to_num(acc_MC))" ] }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2018-06-03T18:55:22.909028Z", + "start_time": "2018-06-03T18:55:22.904693Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.5630869832932465" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "np.mean(np.nan_to_num(acc_wiki))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2018-06-19T13:01:36.778853Z", + "start_time": "2018-06-19T13:01:36.775832Z" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2018-06-19T13:10:53.120884Z", + "start_time": "2018-06-19T13:09:52.611805Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.6/site-packages/pandas/core/ops.py:816: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n", + " result = getattr(x, name)(y)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "151959 7898\n", + "19.24018738921246\n" + ] + } + ], + "source": [ + "from helpers.gazeteer_helpers import count_of_se\n", + "sum_,count=0,0\n", + "for fn in fns:\n", + " try:\n", + " df,lang=parse_file(fn)\n", + " counts_t=df.content.apply(lambda x: count_of_se(x,lang=lang))\n", + " sum_+=counts_t.sum()\n", + " count+=len(counts_t)\n", + " except:\n", + " pass\n", + "print(sum_,count)\n", + "print(sum_/count)" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/tools.py b/tools.py index f5b1dd8..fd2dd99 100644 --- a/tools.py +++ b/tools.py @@ -4,7 +4,7 @@ import argparse from termcolor import colored -from .helpers.gazeteer_helpers import get_most_common_id, get_data, get_by_label +from helpers.gazeteer_helpers import get_most_common_id, get_data, get_by_label parser = argparse.ArgumentParser() -- GitLab
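As a closing illustration — not part of the patch — the selection step at the heart of disambiguate_wiki keeps, for each toponym, the candidate with the highest weighted degree in the co-occurrence graph. A toy, runnable sketch with made-up identifiers (real ones are gazetteer GD… ids):

    # Weighted-degree candidate selection, the core of disambiguate_wiki.
    import networkx as nx

    g = nx.Graph()
    # edge weights stand in for bigram co-occurrence scores
    g.add_edge("GD_paris_fr", "GD_antananarivo", weight=0.9)
    g.add_edge("GD_paris_tx", "GD_antananarivo", weight=0.1)

    group_candidate = {"Paris": ["GD_paris_fr", "GD_paris_tx"],
                       "Antananarivo": ["GD_antananarivo"]}

    # per toponym, keep the candidate most strongly connected to the others
    selected = {top: max(cands, key=lambda c: g.degree(c, weight="weight"))
                for top, cands in group_candidate.items()}
    print(selected)  # {'Paris': 'GD_paris_fr', 'Antananarivo': 'GD_antananarivo'}

Because edges between candidates of the same toponym are forced to zero weight, a candidate's weighted degree only accumulates support from the other toponyms in the document, which is what lets the geographically coherent reading win.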