Commit 8439cf8b authored by Fize Jacques's avatar Fize Jacques
Browse files

+ Initialize Cython version of gmatch4py

+ Speed up the border collision check by using shapely and loading all the shapes right at the beginning of the process

+ New experiment

+ Add notebooks
parent 87a9945f
No related merge requests found
Showing with 853 additions and 129 deletions
+853 -129
......@@ -5,5 +5,6 @@
"core_nlp_URL":"http://localhost:9000",
"es_server_old":"http://192.168.1.15:9200/",
"es_server":"http://localhost:9200/",
"database_json":"resources/database_exp_12_mars.db"
"database_json":"resources/database_exp_12_mars.db",
"log_file":"extract_log"
}
\ No newline at end of file
#!/usr/bin/env bash
# Drive the STR experiment: "generate" builds the transformed STR datasets,
# "eval" runs eval.py for every similarity measure on each dataset.
# Usage: <script> generate | eval
path_texts=/Users/jacquesfize/LOD_DATASETS/raw_bvlac
output_dir=/Users/jacquesfize/LOD_DATASETS/exp_17_avr18

# run_eval DIR MEASURE...
# Run eval.py once per MEASURE on the STRs stored in $output_dir/DIR, using
# the untransformed STRs in $original as the reference set.
run_eval() {
    local dir=$1
    shift
    local me
    for me in "$@"; do
        echo "$me for STR $dir"
        # Every expansion is quoted so that paths containing spaces survive.
        python3 eval.py "$me" "$path_texts" "$output_dir/$dir" "$output_dir/$dir/asso.json" "$original" -a -o "$output_dir/result_eval/$dir/"
    done
}

if [ "$1" == "generate" ]; then
    #python3 generate_data.py $path_texts $output_dir/normal asso.json normal;
    python3 generate_transform.py "$output_dir/normal" "$output_dir/extension_1" extension -a 1
    python3 generate_transform.py "$output_dir/normal" "$output_dir/extension_2" extension -a 2
    python3 generate_transform.py "$output_dir/normal" "$output_dir/extension_3" extension -a 3
    python3 generate_transform.py "$output_dir/normal" "$output_dir/gen_all_1" generalisation -t all -n 1
    python3 generate_transform.py "$output_dir/normal" "$output_dir/gen_all_2" generalisation -t all -n 2
    python3 generate_transform.py "$output_dir/normal" "$output_dir/gen_region" generalisation -t bounded -b region
    python3 generate_transform.py "$output_dir/normal" "$output_dir/gen_capital" generalisation -t bounded -b capital
    python3 generate_transform.py "$output_dir/normal" "$output_dir/gen_country" generalisation -t bounded -b country
fi

if [ "$1" == "eval" ]; then
    original=$output_dir/normal

    ## Normal STR eval
    mesure=("MCS" "VEO" "JACCARD" "HED" "GREEDY" "GED" "BOC" "BOWSE")
    run_eval normal "${mesure[@]}"

    ## Generalised STR eval
    mesure=("MCS" "VEO" "JACCARD" "HED" "GREEDY" "GED" "BOC" "BOWSE")
    run_eval gen_all_1 "${mesure[@]}"

    # From here on GED is left out of the measure list.
    mesure=("MCS" "VEO" "JACCARD" "HED" "GREEDY" "BOC" "BOWSE")
    run_eval gen_all_2 "${mesure[@]}"
    run_eval gen_region "${mesure[@]}"
    run_eval gen_country "${mesure[@]}"

    ## Extended STR eval
    mesure=("MCS" "VEO" "JACCARD" "BOC" "WLSUBTREE" "BOWSE")
    run_eval extension_1 "${mesure[@]}"
    run_eval extension_2 "${mesure[@]}"
fi
#!/usr/bin/env bash
#!/usr/bin/env bash
path_texts=data/MADA_LIGHT_raw
output_dir=data/graph_exp_mar_30
......
# coding = utf-8
# coding = utf-8
import argparse,glob,logging,string,time,re
from concurrent.futures import ThreadPoolExecutor
from langdetect import detect
from progressbar import ProgressBar, Timer, Bar, ETA, Counter
from helpers.boundary import get_all_shapes
from nlp.disambiguator.geodict_gaurav import *
from pipeline import *
import logging
# basicConfig() does nothing once the root logger already has a handler, so a
# second call would silently drop the format: pass every option in one call.
# The level is lowered to INFO because this script reports its progress with
# logging.info(), which the default WARNING level would discard.
logging.basicConfig(
    filename=config.log_file,
    format='%(asctime)s %(message)s',
    level=logging.INFO,
)
def filter_nonprintable(text):
    """Return *text* with every non-printable ASCII character removed.

    Only code points 0-127 that are absent from ``string.printable`` are
    dropped; characters outside the ASCII range are left untouched.
    """
    printable = set(string.printable)
    # str.translate() deletes every code point that maps to None.
    deletions = {}
    for code in range(128):
        if chr(code) not in printable:
            deletions[code] = None
    return text.translate(deletions)
# Command line interface: three positional paths followed by an optional
# sub-command selecting the kind of STR to generate.
parser = argparse.ArgumentParser()
parser.add_argument("csv_input_dir")
parser.add_argument("graphs_output_dir")
parser.add_argument("metadata_output_fn")

subparsers = parser.add_subparsers(help='commands')

normal = subparsers.add_parser(
    'normal', help='Basic STR generation. No argument are necessary !')
normal.set_defaults(which="norm")

gen_parser = subparsers.add_parser(
    'generalisation', help='Apply a generalisation transformation on the generated STRs')
gen_parser.set_defaults(which="gene")
gen_parser.add_argument(
    '-t', '--type_gen', help='Type of generalisation', default="all")
# NOTE(review): the help text says 'Language' but the default is the number 1
# -- confirm what -n is meant to describe.
gen_parser.add_argument(
    '-n', help='Language', default=1)
# The two adjacent literals are concatenated by Python; the leading space of
# the second one keeps "correspond" and "to" apart in the rendered help.
gen_parser.add_argument(
    '-b', '--bound', help='If Generalisation is bounded, this arg. correspond'
                          ' to the maximal ', default="country")

ext_parser = subparsers.add_parser(
    'extension', help='Apply a extension process on the STRs')
ext_parser.set_defaults(which="ext")
ext_parser.add_argument(
    '-d', '--distance', help='radius distance', default=150)
ext_parser.add_argument(
    '-u', '--unit', help='unit used for the radius distance', default="km")
ext_parser.add_argument(
    '-a', '--adjacent_count', help='number of adjacent SE add to the STR', default=1)

args = parser.parse_args()
# "which" only exists when a sub-command was given on the command line.
if "which" in args:
    if args.which == "gene":
        args.type_trans = "gen"
    elif args.which == "ext":
        args.type_trans = "ext"
print("Parameters entered : ", args)
# The script has nothing to do when the input directory is missing.
if os.path.exists(args.csv_input_dir):
    files_glob = glob.glob(args.csv_input_dir + "/*.csv")
else:
    exit()

start = time.time()

# First pass: for every document, record which spatial entities (GID) it
# contains, one associated text per entity and how often each entity occurs.
associated_es = {}
count_per_doc = {}
logging.info("Get associated spatial entities and ")
with ProgressBar(max_value=len(files_glob), widgets=[' [', Timer(), '] ', Bar(), '(', Counter(), ')', '(', ETA(), ')']) as pg:
    for i, fn in enumerate(files_glob):
        # The document id is the last run of digits in the file path.
        id_ = int(re.findall(r"\d+", fn)[-1])
        df = pd.read_csv(fn)
        try:
            df = df[(df["GID"] != 'O') & (df.GID.notnull())]
        except Exception:
            # Fall back to the null filter when the comparison itself fails.
            df = df[(df.GID.notnull())]
        try:
            count_per_doc[id_] = json.loads(df.groupby("GID").GID.count().to_json())
            associated_es[id_] = df[["GID", "text"]].groupby("GID", as_index=False).max().set_index('GID').to_dict()["text"]
        except Exception:
            # Best effort: a document that cannot be summarised is recorded
            # as containing no spatial entity.
            count_per_doc[id_] = {}
            associated_es[id_] = {}
        pg.update(i)
# Gather every spatial entity id seen in any document, then fetch all their
# shapes in a single call before the graphs are built.
logging.info("Fetch list of spatial entities available !")
all_es = set()
for entities in associated_es.values():
    # Updating a set from a dict adds its keys, i.e. the entity ids.
    all_es.update(entities)

logging.info("Get All Shapes from Database for all ES")
all_shapes = get_all_shapes(list(all_es))
#print(all_shapes.keys())
# Second pass: build one STR graph per document and write it as GEXF.
with ProgressBar(max_value=len(files_glob),
                 widgets=[' [', Timer(), '] ', Bar(), '(', Counter(), ')', '(', ETA(), ')']) as pg:
    # Counting from 1 makes the bar show the number of documents completed.
    for i, fn in enumerate(files_glob, start=1):
        # The document id is the last run of digits in the file path.
        id_ = int(re.findall(r"\d+", fn)[-1])
        df = pd.read_csv(fn)
        try:
            df = df[(df["GID"] != 'O') & (df.GID.notnull())]
        except Exception:
            # Fall back to the null filter when the comparison itself fails.
            df = df[(df.GID.notnull())]
        # Label each entity with the "en" entry of its data record.
        df["label"] = df.GID.apply(lambda x: get_data(x)["en"])
        df = df.rename(columns={"GID": "id"})
        str_ = STR.from_pandas(df, [], all_shapes).build()
        nx.write_gexf(str_, args.graphs_output_dir + "/{0}.gexf".format(id_))
        pg.update(i)
# Save Metadata
# The context manager closes (and flushes) the file even if serialisation
# fails; the original left the handle open until garbage collection.
with open(os.path.join(args.graphs_output_dir, args.metadata_output_fn), 'w') as metadata_file:
    metadata_file.write(json.dumps([associated_es, count_per_doc], indent=4))
print("--- %s seconds ---" % (time.time() - start))
\ No newline at end of file
......@@ -8,8 +8,8 @@ from concurrent.futures import ThreadPoolExecutor
from progressbar import ProgressBar, Timer, Bar, ETA, Counter
from .nlp.disambiguator.geodict_gaurav import *
from .pipeline import *
from nlp.disambiguator.geodict_gaurav import *
from pipeline import *
parser = argparse.ArgumentParser()
parser.add_argument("graphs_input_dir")
......
# coding = utf-8
# coding = utf-8
from gmatch4py.ged.bipartite_graph_matching_2 import BP_2
from gmatch4py.utils import *
class GeoBP2(BP_2):
    """BP_2 variant whose node substitution cost includes a geographic term.

    The term is whatever ``get_distance_two_entity`` returns for the two
    nodes; its unit and range are defined by that helper (not visible here).
    """

    def __init__(self, node_del=1, node_ins=2, edge_del=1, edge_ins=1):
        """Constructor for GeoBP2; forwards the four edit costs to BP_2."""
        BP_2.__init__(self, node_del, node_ins, edge_del, edge_ins)

    def geo_distance(self, g1, g2, node1, node2):
        """Return the geographic distance between node1 (in g1) and node2 (in g2).

        NOTE(review): the geolocalization of both graphs is recomputed on
        every call -- consider caching it if this proves to be slow.
        """
        g1_info = get_nodes_geolocalization(g1)
        g2_info = get_nodes_geolocalization(g2)
        return get_distance_two_entity(node1, node2, g1_info, g2_info)

    def fuv(self, g1, g2, n1, n2):
        """Node distance of BP_2, plus the geographic distance for substitutions.

        Insertions and deletions (one node is None) keep the base cost.
        """
        # Compare against None explicitly: a falsy node id (0, "") is still a
        # real node and must not be mistaken for an insertion or a deletion.
        if n1 is not None and n2 is not None:
            return super().fuv(g1, g2, n1, n2) + self.geo_distance(g1, g2, n1, n2)
        return super().fuv(g1, g2, n1, n2)
# coding = utf-8
from gmatch4py.ged.algorithm.graph_edit_dist import GraphEditDistance
from gmatch4py.utils import *
# Geolocalization info already computed, keyed by the comma-joined node list
# of the graph it was computed for.
_cache_g_info = {}


class GeoGED(GraphEditDistance):
    """Graph edit distance whose insertion and deletion costs add a geographic term.

    The term is an average of ``get_distance_two_entity`` values; its unit is
    defined by that helper (not visible here).
    """

    def __init__(self, g1, g2, debug=False, **kwargs):
        """Constructor for GeoGED.

        :param g1: first graph
        :param g2: second graph
        :param debug: forwarded to GraphEditDistance
        :param kwargs: edit costs (node_del, node_ins, edge_del, edge_ins)
            forwarded to GraphEditDistance
        """
        GraphEditDistance.__init__(self, g1, g2, debug, **kwargs)
        self.g1_info = self._cached_geolocalization(g1)
        self.g2_info = self._cached_geolocalization(g2)

    @staticmethod
    def _cached_geolocalization(graph):
        """Return the geolocalization info of *graph*, computing it at most once."""
        # The join requires node identifiers to be strings.
        key = ",".join(graph.nodes())
        if key not in _cache_g_info:
            _cache_g_info[key] = get_nodes_geolocalization(graph)
        return _cache_g_info[key]

    @staticmethod
    def compare(listgs, c_del_node=1, c_del_edge=1, c_ins_node=1, c_ins_edge=1):
        """Return the n x n matrix of GeoGED distances between the graphs of listgs."""
        n = len(listgs)
        comparison_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i, n):
                # Instantiate GeoGED (not the plain GraphEditDistance) so that
                # the geographic costs defined below are actually applied.
                comparison_matrix[i, j] = GeoGED(listgs[i], listgs[j], False, node_del=c_del_node,
                                                 node_ins=c_ins_node, edge_del=c_del_edge,
                                                 edge_ins=c_ins_edge).distance()
                # Only the upper triangle is computed and then mirrored, even
                # though the measure is not symmetric.
                comparison_matrix[j, i] = comparison_matrix[i, j]
        return comparison_matrix

    def insert_geo_distance(self, node2):
        """Average distance between node2 (from g2) and every node of g1; 0 if g1 is empty."""
        avg_ = []
        for node in self.g1:
            avg_.append(get_distance_two_entity(node, node2, self.g1_info, self.g2_info))
        if avg_:
            return np.mean(avg_)
        return 0

    def del_geo_distance(self, node1):
        """Average distance between node1 and every other node of g1; 0 if there is none."""
        avg_ = []
        for node in self.g1:
            if node == node1:
                continue
            avg_.append(get_distance_two_entity(node, node1, self.g1_info, self.g1_info))
        if avg_:
            return np.mean(avg_)
        return 0

    def insert_cost(self, i, j, nodes2):
        """Base insertion cost plus the geographic term of the inserted node."""
        return super().insert_cost(i, j, nodes2) + self.insert_geo_distance(nodes2[j])

    def delete_cost(self, i, j, nodes1):
        """Base deletion cost plus the geographic term of the deleted node."""
        # The base *deletion* cost must be used here: insert_cost() reads the
        # degree from g2, whereas nodes1 belongs to g1.
        return super().delete_cost(i, j, nodes1) + self.del_geo_distance(nodes1[i])
\ No newline at end of file
# coding = utf-8
from gmatch4py.ged.hausdorff_edit_distance import HED
from gmatch4py.utils import *
class GeoHED(HED):
    """HED variant whose node substitution cost includes a geographic term.

    The term is whatever ``get_distance_two_entity`` returns for the two
    nodes; its unit and range are defined by that helper (not visible here).
    """

    def __init__(self, node_del=1, node_ins=2, edge_del=1, edge_ins=1):
        """Constructor for GeoHED; forwards the four edit costs to HED."""
        HED.__init__(self, node_del, node_ins, edge_del, edge_ins)

    @staticmethod
    def compare(listgs, c_del_node=1, c_del_edge=1, c_ins_node=1, c_ins_edge=1):
        """Return the n x n matrix of GeoHED distances between the graphs of listgs.

        Only the upper triangle is computed; it is mirrored to the lower one.
        """
        n = len(listgs)
        comparator = GeoHED(c_del_node, c_ins_node, c_del_edge, c_ins_edge)
        comparison_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i, n):
                comparison_matrix[i, j] = comparator.hed(listgs[i], listgs[j])
                comparison_matrix[j, i] = comparison_matrix[i, j]
        return comparison_matrix

    def geo_distance(self, g1, g2, node1, node2):
        """Return the geographic distance between node1 (in g1) and node2 (in g2).

        NOTE(review): the geolocalization of both graphs is recomputed on
        every call -- consider caching it if this proves to be slow.
        """
        g1_info = get_nodes_geolocalization(g1)
        g2_info = get_nodes_geolocalization(g2)
        return get_distance_two_entity(node1, node2, g1_info, g2_info)

    def fuv(self, g1, g2, n1, n2):
        """Node distance of HED, plus the geographic distance for substitutions.

        Insertions and deletions (one node is None) keep the base cost.
        """
        # Compare against None explicitly: a falsy node id (0, "") is still a
        # real node and must not be mistaken for an insertion or a deletion.
        if n1 is not None and n2 is not None:
            return super().fuv(g1, g2, n1, n2) + self.geo_distance(g1, g2, n1, n2)
        return super().fuv(g1, g2, n1, n2)
# Gmatch4py: a graph matching library for Python
Gmatch4py is a library dedicated to graph matching. Graph structures are stored in NetworkX `Graph` objects.
## List of algorithms
* DeltaCon and DeltaCon0 (*debug needed*) [1]
* Vertex Ranking (*debug needed*) [2]
* Vertex Edge Overlap [2]
* Graph kernels
    * Random Walk Kernel (*debug needed*) [3]
        * Geometrical
        * K-Step
    * Shortest Path Kernel [3]
    * Weisfeiler-Lehman Kernel [4]
        * Subtree Kernel
        * Edge Kernel
        * Subtree Geo Kernel [new]
        * Edge Geo Kernel [new]
* Graph Edit Distance [5]
    * Approximated Graph Edit Distance
    * Hausdorff Graph Edit Distance
    * Bipartite Graph Edit Distance
    * Greedy Edit Distance
* MCS [6]
## Associated publications
* [1] Koutra, D., Vogelstein, J. T., & Faloutsos, C. (2013, May). Deltacon: A principled massive-graph similarity function. In Proceedings of the 2013 SIAM International Conference on Data Mining (pp. 162-170). Society for Industrial and Applied Mathematics.
* [2] Papadimitriou, P., Dasdan, A., & Garcia-Molina, H. (2010). Web graph similarity for anomaly detection. Journal of Internet Services and Applications, 1(1), 19-30.
* [3] Vishwanathan, S. V. N., Schraudolph, N. N., Kondor, R., & Borgwardt, K. M. (2010). Graph kernels. Journal of Machine Learning Research, 11(Apr), 1201-1242.
* [4] Shervashidze, N., Schweitzer, P., Leeuwen, E. J. V., Mehlhorn, K., & Borgwardt, K. M. (2011). Weisfeiler-lehman graph kernels. Journal of Machine Learning Research, 12(Sep), 2539-2561.
* [5] Fischer, A., Riesen, K., & Bunke, H. (2017). Improved quadratic time approximation of graph edit distance by combining Hausdorff matching and greedy assignment. Pattern Recognition Letters, 87, 55-62.
* [6] A graph distance metric based on the maximal common subgraph, H. Bunke and K. Shearer, Pattern Recognition Letters, 1998
## Authors
Jacques Fize
## TODO
* Debug the algorithms marked with (*debug needed*)
\ No newline at end of file
__version__ = "0.1"
# coding = utf-8
import copy
from typing import Sequence
import networkx as nx
import numpy as np
class BagOfCliques():
    """Bag-of-cliques similarity.

    Each graph is described by a binary vector telling which cliques of the
    shared vocabulary it contains; graphs are compared by cosine similarity.
    """

    @staticmethod
    def compare(graphs):
        """Return the n x n cosine similarity matrix of the bag-of-cliques vectors.

        :param graphs: sequence of graphs
        :return: numpy array; rows/columns of graphs with an all-zero vector are 0
        """
        bog = BagOfCliques().getBagOfCliques(graphs)
        norms = np.sqrt(np.sum(bog ** 2, axis=1))
        # A zero vector has a zero dot product with everything, so dividing by
        # 1 instead of 0 yields a similarity of 0 rather than NaN.
        norms[norms == 0] = 1.0
        return np.dot(bog, bog.T) / (norms[:, None] * norms[None, :])

    def getUniqueCliques(self, graphs):
        """
        Return unique cliques from a population of graphs

        Cliques are the maximal cliques of the undirected version of each
        graph. Two cliques are the same when they hold the same vertices,
        whatever the order.
        :param graphs: iterable of graphs
        :return: list of cliques (each a list of vertices), in first-seen order
        """
        seen = set()
        clique_vocab = []
        for g in graphs:
            for clique in nx.algorithms.clique.find_cliques(nx.Graph(g)):
                key = frozenset(clique)
                if key not in seen:
                    seen.add(key)
                    clique_vocab.append(clique)
        return clique_vocab

    def ifHaveMinor(self, G: nx.Graph, H: list):
        """
        If a clique (minor) H belong to a graph G

        :param G: graph to inspect
        :param H: list of vertices forming the clique
        :return: 1 if every vertex of H is in G and all of them are pairwise
            adjacent (edge direction ignored), 0 otherwise
        """
        if not all(vertex in G for vertex in H):
            return 0
        for position, u in enumerate(H):
            for v in H[position + 1:]:
                # Testing both directions ignores edge orientation without
                # having to copy G into an undirected graph.
                if not (G.has_edge(u, v) or G.has_edge(v, u)):
                    return 0
        return 1

    def getBagOfCliques(self, graphs: Sequence[nx.Graph]):
        """
        Build the bag-of-cliques matrix of a population of graphs.

        :param graphs: sequence of graphs
        :return: array of shape (number of graphs, number of unique cliques);
            entry [g, m] is 1 when graph g contains clique m, 0 otherwise
        """
        clique_vocab = self.getUniqueCliques(graphs)
        boc = np.zeros((len(graphs), len(clique_vocab)))
        for g, gr in enumerate(graphs):
            for m, clique in enumerate(clique_vocab):
                boc[g, m] = self.ifHaveMinor(gr, clique)
        return boc
\ No newline at end of file
# coding = utf-8
import networkx as nx
import numpy as np
import scipy.sparse
class DeltaCon0():
    """DeltaCon0 similarity between graphs.

    NOTE(review): compare() subtracts the affinity matrices of two graphs,
    which requires both graphs to have the same number of nodes -- confirm
    that callers guarantee this.
    """
    __type__ = "sim"

    @staticmethod
    def compare(list_gs):
        """Return the n x n similarity matrix between the graphs of list_gs.

        The score of a pair is 1 / (1 + rootED(S1, S2)), where S1 and S2 are
        the affinity matrices of the two graphs.
        """
        n = len(list_gs)
        # The affinity matrix depends on a single graph, so it is inverted
        # once per graph instead of once per pair.
        affinities = [DeltaCon0._affinity(g) for g in list_gs]
        comparison_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i, n):
                comparison_matrix[i, j] = 1 / (1 + DeltaCon0.rootED(affinities[i], affinities[j]))
                comparison_matrix[j, i] = comparison_matrix[i, j]
        return comparison_matrix

    @staticmethod
    def _affinity(G):
        """Return inv(I + epsilon^2 * D - epsilon * A) with epsilon = 1 / (1 + max degree)."""
        epsilon = 1 / (1 + DeltaCon0.maxDegree(G))
        D, A = DeltaCon0.degreeAndAdjacencyMatrix(G)
        return np.linalg.inv(np.identity(len(G)) + (epsilon ** 2) * D - epsilon * A)

    @staticmethod
    def rootED(S1, S2):
        """Return the square root of the sum of squared element-wise differences."""
        # Adding a scipy sparse matrix to a dense array can yield np.matrix,
        # for which ``** 2`` is a matrix product; plain arrays keep the
        # square element-wise.
        diff = np.asarray(S1) - np.asarray(S2)
        return np.sqrt(np.sum(diff ** 2))

    @staticmethod
    def degreeAndAdjacencyMatrix(G):
        """
        Return the Degree(D) and Adjacency Matrix(A) from a graph G.
        Inspired of nx.laplacian_matrix(G,nodelist,weight) code proposed by networkx
        :param G: graph
        :return: (D, A), both in scipy CSR format
        """
        A = nx.to_scipy_sparse_matrix(G, nodelist=G.nodes(), weight="weight",
                                      format='csr')
        n, m = A.shape
        diags = A.sum(axis=1)
        D = scipy.sparse.spdiags(diags.flatten(), [0], m, n, format='csr')
        return D, A

    @staticmethod
    def maxDegree(G):
        """Return the largest node degree of G (0 for a graph without nodes)."""
        # dict() accepts both the plain dict returned by NetworkX 1.x and the
        # degree view returned by NetworkX 2.x.
        return max(dict(nx.degree(G)).values(), default=0)
class DeltaCon():
    """DeltaCon similarity computed over randomly partitioned node groups.

    NOTE(review): several statements below rely on ``Graph.nodes()``
    returning a plain list (positional indexing, ``extend``) -- confirm the
    NetworkX version this is meant to run on.
    """
    __type__ = "sim"

    @staticmethod
    def relabel_nodes(graph_list):
        """Relabel the nodes of every graph with integers shared across all graphs.

        The entries of ``graph_list`` are replaced in place by the relabelled
        graphs, and the same list is returned.
        """
        label_lookup = {}
        label_counter = 0
        n= len(graph_list)
        # label_lookup maps each original node label to the integer id it
        # was given the first time it was seen in any graph
        for i in range(n):
            nodes = graph_list[i].nodes()
            for j in range(len(nodes)):
                if not (nodes[j] in label_lookup):
                    label_lookup[nodes[j]] = label_counter
                    label_counter += 1
            graph_list[i] = nx.relabel_nodes(graph_list[i], label_lookup)
        return graph_list

    @staticmethod
    def compare(list_gs, g=3):
        """Return the n x n similarity matrix between the graphs of list_gs.

        :param list_gs: list of graphs (its entries are replaced by relabelled graphs)
        :param g: number of groups the shuffled node ids are split into
        :return: numpy array of 1 / (1 + rootED(S1, S2)) scores
        """
        n=len(list_gs)
        list_gs=DeltaCon.relabel_nodes(list_gs)
        comparison_matrix = np.zeros((n,n))
        for i in range(n):
            for j in range(i,n):
                g1,g2=list_gs[i],list_gs[j]
                # Union of the node ids of both graphs
                V = g1.nodes()
                V.extend(g2.nodes())
                V=np.unique(V)
                # Random split of the node ids into at most g groups
                partitions=V.copy()
                np.random.shuffle(partitions)
                if len(partitions)< g:
                    partitions=np.array([partitions])
                else:
                    partitions=np.array_split(partitions,g)
                partitions_e_1 = DeltaCon.partitions2e(partitions, g1.nodes())
                partitions_e_2 = DeltaCon.partitions2e(partitions, g2.nodes())
                S1,S2=[],[]
                for k in range(len(partitions)):
                    s0k1,s0k2=partitions_e_1[k],partitions_e_2[k]
                    # S1
                    epsilon = 1/(1+DeltaCon0.maxDegree(g1))
                    D, A = DeltaCon0.degreeAndAdjacencyMatrix(g1)
                    s1k = np.linalg.inv(np.identity(len(g1))+(epsilon**2)*D -epsilon*A)
                    # NOTE(review): solve() is applied to the matrix that was
                    # already inverted on the previous line -- confirm intent
                    s1k=np.linalg.solve(s1k,s0k1).tolist()
                    # S2
                    D, A = DeltaCon0.degreeAndAdjacencyMatrix(g2)
                    epsilon = 1 / (1 + DeltaCon0.maxDegree(g2))
                    s2k= np.linalg.inv(np.identity(len(g2))+(epsilon**2)*D -epsilon*A)
                    s2k = np.linalg.solve(s2k, s0k2).tolist()
                    S1.append(s1k)
                    S2.append(s2k)
                comparison_matrix[i,j] = 1/(1+DeltaCon0.rootED(np.array(S1),np.array(S2)))
                comparison_matrix[j,i] = comparison_matrix[i,j]
        return comparison_matrix

    @staticmethod
    def partitions2e( partitions, V):
        """Return one 0/1 indicator list per partition, each of length len(V).

        NOTE(review): membership is tested with the position ``i`` in V, not
        with the node stored at that position -- confirm this is intended.
        """
        e = [ [] for i in range(len(partitions))]
        for p in range(len(partitions)):
            e[p] = []
            for i in range(len(V)):
                if i in partitions[p]:
                    e[p].append(1.0)
                else:
                    e[p].append(0.0)
        return e
\ No newline at end of file
# coding = utf-8
\ No newline at end of file
# -*- coding: UTF-8 -*-
from __future__ import print_function
import sys
import numpy as np
from scipy.optimize import linear_sum_assignment
class AbstractGraphEditDistance(object):
    """Base class of the assignment-based graph edit distances.

    Subclasses provide substitute_cost(), insert_cost() and delete_cost();
    this class builds the cost matrix and solves the assignment problem.
    """

    def __init__(self, g1, g2, debug=False, **kwargs):
        """
        :param g1: first graph
        :param g2: second graph
        :param debug: when True, print the cost matrix, the operations and the edit path
        :param kwargs: optional edit costs node_del, node_ins, edge_del, edge_ins (default 1)
        """
        self.g1 = g1
        self.g2 = g2
        self.debug = debug
        self.node_del = kwargs.get("node_del", 1)
        self.node_ins = kwargs.get("node_ins", 1)
        self.edge_del = kwargs.get("edge_del", 1)
        self.edge_ins = kwargs.get("edge_ins", 1)

    def distance(self):
        """Return the sum of the costs of the selected edit operations."""
        opt_path = self.edit_costs()
        if self.debug:
            print("Edit path for ", str(self.__class__.__name__), "\n", opt_path)
        return sum(opt_path)

    def print_operations(self, cost_matrix, row_ind, col_ind):
        """Print one SUB/ADD/DEL line per selected cell of the cost matrix."""
        # list() allows positional indexing whether nodes() returns a list or
        # a view; dict() does the same for the node attribute lookup.
        nodes1 = list(self.g1.nodes())
        nodes2 = list(self.g2.nodes())
        dn1 = dict(self.g1.nodes(data=True))
        dn2 = dict(self.g2.nodes(data=True))
        n, m = len(nodes1), len(nodes2)
        for y, x in zip(row_ind, col_ind):
            val = cost_matrix[y][x]
            if x < m and y < n:
                print("SUB {0} to {1} cost = {2}".format(dn1[nodes1[y]]["label"], dn2[nodes2[x]]["label"], val))
            elif x < m and y >= n:
                print("ADD {0} cost = {1}".format(dn2[nodes2[y - n]]["label"], val))
            elif x >= m and y < n:
                # Row y is the node of g1 being deleted.
                print("DEL {0} cost = {1}".format(dn1[nodes1[y]]["label"], val))

    def edit_costs(self):
        """Solve the assignment problem and return the list of selected costs."""
        cost_matrix = self.create_cost_matrix()
        if self.debug:
            np.set_printoptions(precision=3)
            print("Cost Matrix for ", str(self.__class__.__name__), "\n", cost_matrix)
        row_ind, col_ind = linear_sum_assignment(cost_matrix)
        if self.debug:
            self.print_operations(cost_matrix, row_ind, col_ind)
        return [cost_matrix[row][col] for row, col in zip(row_ind, col_ind)]

    def create_cost_matrix(self):
        """
        Creates a |N+M| X |N+M| cost matrix between all nodes in
        graphs g1 (N nodes) and g2 (M nodes)

        Each cost represents the cost of substituting,
        deleting or inserting a node
        The cost matrix consists of four regions:

        substitute (N x M) | delete (N x N)
        -----------------------------------
        insert (M x M)     | zeros (M x N)

        The matrix is also stored in self.cost_matrix.
        """
        n = len(self.g1)
        m = len(self.g2)
        cost_matrix = np.zeros((n + m, n + m))
        # list() allows positional indexing whether nodes() returns a list or a view.
        nodes1 = list(self.g1.nodes())
        nodes2 = list(self.g2.nodes())
        for i in range(n):
            for j in range(m):
                cost_matrix[i, j] = self.substitute_cost(nodes1[i], nodes2[j])
        for i in range(m):
            for j in range(m):
                cost_matrix[i + n, j] = self.insert_cost(i, j, nodes2)
        for i in range(n):
            for j in range(n):
                cost_matrix[j, i + m] = self.delete_cost(i, j, nodes1)
        self.cost_matrix = cost_matrix
        return cost_matrix

    # The abstract hooks take the node list as third argument because that is
    # how create_cost_matrix() calls them.
    def insert_cost(self, i, j, nodes2=None):
        """Cost stored in cell (i, j) of the insert region; nodes2 lists the nodes of g2."""
        raise NotImplementedError

    def delete_cost(self, i, j, nodes1=None):
        """Cost stored for the pair (i, j) of the delete region; nodes1 lists the nodes of g1."""
        raise NotImplementedError

    def substitute_cost(self, nodes1, nodes2):
        """Cost of substituting a node of g1 with a node of g2."""
        raise NotImplementedError

    def print_matrix(self):
        """Print the nodes of both graphs, the cost matrix and each of its cells."""
        print("cost matrix:")
        print(self.g1.nodes())
        print(self.g2.nodes())
        # Build the matrix once; it was previously rebuilt for the cell loop.
        cost_matrix = self.create_cost_matrix()
        print(np.array(cost_matrix))
        for row in cost_matrix:
            for cell in row:
                if cell == sys.maxsize:
                    print("inf\t")
                else:
                    print("%.2f\t" % float(cell))
            print("")
import sys
from gmatch4py.ged.algorithm.abstract_graph_edit_dist import AbstractGraphEditDistance
class EdgeEditDistance(AbstractGraphEditDistance):
    """
    Edit distance between the edge sets around two nodes.

    Here each "graph" stands for a node and its "nodes" are the edges
    attached to it.
    """

    def __init__(self, g1, g2, **kwargs):
        AbstractGraphEditDistance.__init__(self, g1, g2, **kwargs)

    def insert_cost(self, i, j, nodes2):
        """Edge insertion cost on the diagonal, sys.maxsize everywhere else."""
        return self.edge_ins if i == j else sys.maxsize

    def delete_cost(self, i, j, nodes1):
        """Edge deletion cost on the diagonal, sys.maxsize everywhere else."""
        return self.edge_del if i == j else sys.maxsize

    def substitute_cost(self, edge1, edge2):
        """0 for equal edges, otherwise one deletion plus one insertion."""
        return 0. if edge1 == edge2 else self.edge_del + self.edge_ins
# -*- coding: UTF-8 -*-
import sys
import networkx as nx
from gmatch4py.ged.algorithm.abstract_graph_edit_dist import AbstractGraphEditDistance
from gmatch4py.ged.algorithm.edge_edit_dist import EdgeEditDistance
from gmatch4py.ged.graph.edge_graph import EdgeGraph
def compare(g1, g2, print_details=False):
    """Return the graph edit distance between g1 and g2.

    print_details is passed on as the debug flag of GraphEditDistance.
    """
    return GraphEditDistance(g1, g2, print_details).distance()
class GraphEditDistance(AbstractGraphEditDistance):
    """Assignment-based graph edit distance between two graphs.

    Edges are identified by the neighbour id, with the edge "color"
    attribute appended wherever get_edge_multigraph() is used.
    """

    def __init__(self, g1, g2, debug=False, **kwargs):
        AbstractGraphEditDistance.__init__(self, g1, g2, debug, **kwargs)

    def substitute_cost(self, node1, node2):
        """Relabelling cost plus the edit distance between the edges of both nodes."""
        return self.relabel_cost(node1, node2) + self.edge_diff(node1, node2)

    def relabel_cost(self, node1, node2):
        """Cost of turning node1 into node2.

        For identical nodes this is the number of edges of node2 (in g2) that
        node1 (in g1) does not have; otherwise one insertion plus one deletion.
        """
        if node1 == node2:
            edges1 = set(self.get_edge_multigraph(self.g1, node1))
            edges2 = set(self.get_edge_multigraph(self.g2, node2))
            # Take in account if there is a different number of edges
            return len(edges2.difference(edges1))
        return self.node_ins + self.node_del

    def delete_cost(self, i, j, nodes1):
        """Node deletion cost plus its degree on the diagonal, sys.maxsize elsewhere."""
        if i == j:
            # Deleting a node implies deleting its in and out edges
            return self.node_del + self.g1.degree(nodes1[i])
        return sys.maxsize

    def insert_cost(self, i, j, nodes2):
        """Node insertion cost plus its degree on the diagonal, sys.maxsize elsewhere."""
        if i == j:
            deg = self.g2.degree(nodes2[j])
            # A dict result (instead of a number) is counted as degree 0.
            if isinstance(deg, dict):
                deg = 0
            return self.node_ins + deg
        return sys.maxsize

    def get_edge_multigraph(self, g, node):
        """Return the '<neighbour><color>' identifiers of the edges leaving node in g."""
        edges = []
        # g[node] is the adjacency of node on NetworkX 1.x and 2.x alike,
        # whereas the ``edge`` attribute only exists on 1.x.
        for id_, val in g[node].items():
            if 0 not in val:
                # No edge key 0: val is treated as the attribute dict of a single edge.
                edges.append(str(id_) + val["color"])
            else:
                # Multigraph: val maps each edge key to its attribute dict.
                for edge in val.values():
                    edges.append(str(id_) + edge["color"])
        return edges

    def edge_diff(self, node1, node2):
        """Return the edit distance between the edges of node1 (g1) and node2 (g2)."""
        if isinstance(self.g1, nx.MultiDiGraph):
            edges1 = self.get_edge_multigraph(self.g1, node1)
            edges2 = self.get_edge_multigraph(self.g2, node2)
        else:
            edges1 = list(self.g1[node1].keys())
            edges2 = list(self.g2[node2].keys())
        if len(edges1) == 0 or len(edges2) == 0:
            return max(len(edges1), len(edges2))

        edit_edit_dist = EdgeEditDistance(
            EdgeGraph(node1, edges1),
            EdgeGraph(node2, edges2),
            edge_del=self.edge_del, edge_ins=self.edge_ins, node_ins=self.node_ins, node_del=self.node_del
        )
        return edit_edit_dist.distance()
# coding = utf-8
import numpy as np
from .algorithm.graph_edit_dist import GraphEditDistance
class ApproximateGraphEditDistance():
    """Pairwise approximated graph edit distance over a list of graphs."""
    __type__ = "dist"

    @staticmethod
    def compare(listgs, c_del_node=1, c_del_edge=1, c_ins_node=1, c_ins_edge=1):
        """Return the n x n matrix of distances between the graphs of listgs."""
        size = len(listgs)
        costs = dict(node_del=c_del_node, node_ins=c_ins_node,
                     edge_del=c_del_edge, edge_ins=c_ins_edge)
        comparison_matrix = np.zeros((size, size))
        for row in range(size):
            for col in range(row, size):
                ged = GraphEditDistance(listgs[row], listgs[col], False, **costs)
                comparison_matrix[row, col] = ged.distance()
                # Unethical ! The lower triangle mirrors the upper one although
                # AGED is not a symmetric similarity measure.
                comparison_matrix[col, row] = comparison_matrix[row, col]
        return comparison_matrix
\ No newline at end of file
# coding = utf-8
import numpy as np
class BP_2():
    """
    BP2 graph distance: a greedy node assignment is computed in both
    directions and the cheaper of the two is kept.
    """
    __type__ = "dist"

    @staticmethod
    def compare(listgs, c_del_node=1, c_del_edge=1, c_ins_node=1, c_ins_edge=1):
        """Return the n x n matrix of BP2 distances between the graphs of listgs.

        Only the upper triangle is computed; it is mirrored to the lower one.
        """
        n = len(listgs)
        comparator = BP_2(c_del_node, c_ins_node, c_del_edge, c_ins_edge)
        comparison_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i, n):
                comparison_matrix[i, j] = comparator.bp2(listgs[i], listgs[j])
                comparison_matrix[j, i] = comparison_matrix[i, j]
        return comparison_matrix

    def __init__(self, node_del=1, node_ins=1, edge_del=1, edge_ins=1):
        """Constructor for BP_2; stores the four edit costs."""
        self.node_del = node_del
        self.node_ins = node_ins
        self.edge_del = edge_del
        self.edge_ins = edge_ins

    def bp2(self, g1, g2):
        """
        Compute the BP2 distance
        :param g1: first graph
        :param g2: second graph
        :return: the smaller of the two directed assignment costs
        """
        return min(self.distance(self.psi(g1, g2)), self.distance(self.psi(g2, g1)))

    def distance(self, e):
        """Return the sum of a list of edit costs."""
        return np.sum(e)

    def psi(self, g1, g2):
        """
        Greedily assign each node of g1 to a node of g2 (or delete it)
        :param g1: first graph
        :param g2: second graph
        :return: list of costs, one per node of g1 followed by one insertion
            cost per node of g2 left unassigned
        """
        psi_ = []
        # Keeping the unassigned nodes in a list preserves the graph order,
        # which makes tie-breaking deterministic.
        unassigned = list(g2.nodes())
        for u in g1.nodes():
            delete_cost = self.fuv(g1, g2, u, None)
            best, best_cost = None, delete_cost
            for w in unassigned:
                substitute_cost = self.fuv(g1, g2, u, w)
                # Substituting must beat both "delete u and insert w" and the
                # best option found so far.
                if 2 * substitute_cost < delete_cost + self.fuv(g1, g2, None, w) \
                        and substitute_cost < best_cost:
                    best, best_cost = w, substitute_cost
            psi_.append(best_cost)
            # Compare against None so that a falsy node id (0, "") is removed too.
            if best is not None:
                unassigned.remove(best)
        for v in unassigned:
            psi_.append(self.fuv(g1, g2, None, v))
        return psi_

    def fuv(self, g1, g2, n1, n2):
        """
        Compute the Node Distance function
        :param g1: first graph
        :param g2: second graph
        :param n1: node of the first graph (None for an insertion)
        :param n2: node of the second graph (None for a deletion)
        :return: the edit cost
        """
        if n2 is None:  # Del
            return self.node_del + ((self.edge_del / 2) * g1.degree(n1))
        if n1 is None:  # Insert
            return self.node_ins + ((self.edge_ins / 2) * g2.degree(n2))
        if n1 == n2:
            return 0.
        return (self.node_del + self.node_ins + self.hed_edge(g1, g2, n1, n2)) / 2

    def hed_edge(self, g1, g2, n1, n2):
        """
        Compute HEDistance between edges of n1 and n2, respectively in g1 and g2
        :param g1: first graph
        :param g2: second graph
        :param n1: node of the first graph
        :param n2: node of the second graph
        :return: nearest-neighbour cost of the edges of n1 plus that of the edges of n2
        """
        # The second term must run in the opposite direction; the same
        # direction used to be added twice.
        return self.sum_gpq(g1, n1, g2, n2) + self.sum_gqp(g1, n1, g2, n2)

    def get_edge_multigraph(self, g, node):
        """
        Get list of edge around a node in a Multigraph
        :param g: multigraph
        :param node: node in the multigraph
        :return: list of "source-target-color" strings
        """
        edges = []
        for edge in g.edges(data=True):
            if node == edge[0] or node == edge[1]:
                edges.append("{0}-{1}-{2}".format(edge[0], edge[1], edge[2]["color"]))
        return edges

    def sum_gpq(self, g1, n1, g2, n2):
        """
        Compute Nearest Neighbour Distance between edges around n1 in G1 and edges around n2 in G2
        :param g1: first graph
        :param n1: node in the first graph
        :param g2: second graph
        :param n2: node in the second graph
        :return: sum, over the edges of n1, of the cheapest match in n2 or a deletion
        """
        edges1 = self.get_edge_multigraph(g1, n1)
        # None stands for "no counterpart": the edge of n1 is deleted.
        edges2 = self.get_edge_multigraph(g2, n2) + [None]
        return np.sum([min(self.gpq(e1, e2) for e2 in edges2) for e1 in edges1])

    def sum_gqp(self, g1, n1, g2, n2):
        """
        Compute Nearest Neighbour Distance between edges around n2 in G2 and edges around n1 in G1
        :param g1: first graph
        :param n1: node in the first graph
        :param g2: second graph
        :param n2: node in the second graph
        :return: sum, over the edges of n2, of the cheapest match in n1 or an insertion
        """
        # None stands for "no counterpart": the edge of n2 is inserted.
        edges1 = self.get_edge_multigraph(g1, n1) + [None]
        edges2 = self.get_edge_multigraph(g2, n2)
        return np.sum([min(self.gpq(e1, e2) for e1 in edges1) for e2 in edges2])

    def gpq(self, e1, e2):
        """
        Compute the edge distance function
        :param e1: edge1 (None for an insertion)
        :param e2: edge2 (None for a deletion)
        :return: the edit cost
        """
        if e2 is None:  # Del
            return self.edge_del
        if e1 is None:  # Insert
            return self.edge_ins
        if e1 == e2:
            return 0.
        return (self.edge_del + self.edge_ins) / 2
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment