Commit 8af775b6 authored by Fize Jacques's avatar Fize Jacques

Initial Commit

parent 3402ce77
......@@ -102,3 +102,6 @@ venv.bak/
# mypy
.mypy_cache/
*.cpp
*.c
.DS_Store
\ No newline at end of file
# GMatch4py
A Python module for graph matching using NetworkX
# Gmatch4py a graph matching library for Python
Gmatch4py is a library dedicated to graph matching. Graph structures are stored in `networkx.Graph` objects.
## List of algorithms
* DeltaCon and DeltaCon0 (*debug needed*) [1]
* Vertex Ranking (*debug needed*) [2]
* Vertex Edge Overlap [2]
* Graph kernels
* Random Walk Kernel (*debug needed*) [3]
* Geometrical
* K-Step
* Shortest Path Kernel [3]
* Weisfeiler-Lehman Kernel [4]
* Subtree Kernel
* Edge Kernel
* Subtree Geo Kernel [new]
* Edge Geo Kernel [new]
* Graph Edit Distance [5]
* Approximated Graph Edit Distance
* Hausdorff Graph Edit Distance
* Bipartite Graph Edit Distance
* Greedy Edit Distance
* MCS [6]
## Associated publications
* [1] Koutra, D., Vogelstein, J. T., & Faloutsos, C. (2013, May). Deltacon: A principled massive-graph similarity function. In Proceedings of the 2013 SIAM International Conference on Data Mining (pp. 162-170). Society for Industrial and Applied Mathematics.
* [2] Papadimitriou, P., Dasdan, A., & Garcia-Molina, H. (2010). Web graph similarity for anomaly detection. Journal of Internet Services and Applications, 1(1), 19-30.
* [3] Vishwanathan, S. V. N., Schraudolph, N. N., Kondor, R., & Borgwardt, K. M. (2010). Graph kernels. Journal of Machine Learning Research, 11(Apr), 1201-1242.
* [4] Shervashidze, N., Schweitzer, P., Leeuwen, E. J. V., Mehlhorn, K., & Borgwardt, K. M. (2011). Weisfeiler-lehman graph kernels. Journal of Machine Learning Research, 12(Sep), 2539-2561.
* [5] Fischer, A., Riesen, K., & Bunke, H. (2017). Improved quadratic time approximation of graph edit distance by combining Hausdorff matching and greedy assignment. Pattern Recognition Letters, 87, 55-62.
* [6] Bunke, H., & Shearer, K. (1998). A graph distance metric based on the maximal common subgraph. Pattern Recognition Letters, 19(3-4), 255-259.
## Authors
Jacques Fize
## TODO
* Debug the algorithms marked with (*debug needed*)
\ No newline at end of file
# coding = utf-8
\ No newline at end of file
# coding = utf-8
import copy
from typing import Sequence
import networkx as nx
import numpy as np
cimport numpy as np
import sys
from networkit import graph
from networkit.clique import MaximalCliques
def nx2nk(nxG, weightAttr=None):
    """
    Convert a networkx.Graph to a NetworKit.Graph.

    :param nxG: the networkx graph to convert.
    :param weightAttr: the edge attribute which should be treated as the
        edge weight. When None, an unweighted NetworKit graph is built.
    :return: a tuple ``(nkG, idmap)``: the NetworKit graph with its
        self-loops removed, and the dict mapping every networkx node to the
        consecutive integer id used for it in ``nkG``.
    """
    # map networkx node ids to consecutive numerical node ids
    idmap = {node: index for index, node in enumerate(nxG.nodes)}
    # One slot per node. Unlike max(idmap.values()) + 1 this is also
    # defined for a graph without nodes.
    z = len(idmap)

    if weightAttr is not None:
        nkG = graph.Graph(z, weighted=True, directed=nxG.is_directed())
        for (u_, v_) in nxG.edges():
            u, v = idmap[u_], idmap[v_]
            w = nxG[u_][v_][weightAttr]
            nkG.addEdge(u, v, w)
    else:
        nkG = graph.Graph(z, directed=nxG.is_directed())
        for (u_, v_) in nxG.edges():
            u, v = idmap[u_], idmap[v_]
            assert (u < z)
            assert (v < z)
            nkG.addEdge(u, v)

    # Sanity checks on the conversion, done before self-loops are dropped.
    assert (nkG.numberOfNodes() == nxG.number_of_nodes())
    assert (nkG.numberOfEdges() == nxG.number_of_edges())

    # removeSelfLoops() works in place; its return value must not be used
    # as the graph (doing so handed None to the callers).
    nkG.removeSelfLoops()
    return nkG, idmap
def getClique(nx_graph):
    """
    List the maximal cliques of a networkx graph, computed with NetworKit.

    :param nx_graph: the networkx graph to analyse.
    :return: a list of cliques, each clique being the list of the original
        networkx nodes it contains. An empty graph yields an empty list.
    """
    if len(nx_graph) == 0:
        return []

    nk_graph, node_to_id = nx2nk(nx_graph)
    # Invert the mapping so NetworKit ids can be turned back into nodes.
    id_to_node = {index: node for node, index in node_to_id.items()}
    found = MaximalCliques(nk_graph).run().getCliques()
    return [[id_to_node[member] for member in members] for members in found]
class BagOfCliques():
    """
    Bag-of-cliques similarity between graphs.

    Every graph is described by a binary vector with one entry per clique of
    the vocabulary (the distinct cliques found over all graphs); two graphs
    are compared with the cosine similarity of their vectors.

    Cliques are keyed with :meth:`clique2str`, which joins the node names, so
    nodes are expected to be strings.
    """

    @staticmethod
    def compare(graphs,selected):
        """
        Compute the pairwise cosine similarity of the bag-of-cliques vectors.

        :param graphs: sequence of graphs (anything accepted by nx.Graph).
        :param selected: indices of the rows to compute. A falsy value
            (None, empty) means "all rows", as in the other ``compare``
            methods of this library.
        :return: an (n, n) typed memoryview of doubles. Pairs that were not
            computed, or that involve a graph without any clique, are 0.
        """
        b=BagOfCliques()
        bog=b.getBagOfCliques(graphs).astype(np.float32)

        cdef int n=bog.shape[0]
        cdef double[:,:] scores = np.zeros((n,n))
        cdef int i,j
        # The norm of each vector is needed for every pair: compute it once.
        norms = np.sqrt(np.sum(bog**2, axis=1))
        for i in range(n):
            if selected and i not in selected:
                continue
            for j in range(i,n):
                denominator = norms[i]*norms[j]
                # A graph without cliques has a null vector; leave the score
                # at 0 instead of producing NaN through 0/0.
                if denominator > 0:
                    scores[i,j] = np.dot(bog[i],bog[j])/denominator
                scores[j,i] = scores[i,j]
        return scores

    def getUniqueCliques(self,graphs):
        """
        Return unique cliques from a population of graphs.

        :param graphs: sequence of graphs; falsy entries are skipped.
        :return: list of cliques (lists of nodes) in order of first
            appearance, one per distinct :meth:`clique2str` key.
        """
        cdef list clique_vocab = []
        cdef list cliques
        cdef int len_graphs=len(graphs)
        seen = set()  # clique2str keys already present in clique_vocab
        for km, g in enumerate(graphs):
            if not g:
                continue
            sys.stdout.write("\r{0}/{1} -- {2}".format(km,len_graphs,len(g)))
            try:
                cliques = list(getClique(nx.Graph(g)))
            except Exception:
                # no clique found
                cliques = []
            for clique in cliques:
                key = self.clique2str(clique)
                if key not in seen:
                    seen.add(key)
                    clique_vocab.append(clique)
        return clique_vocab

    def clique2str(self,cliques):
        """
        Build the key of a clique: its node names, sorted and concatenated.

        :param cliques: iterable of node names (strings) forming one clique.
        :return: the concatenated string.
        """
        # NOTE(review): there is no separator, so ["ab", "c"] and
        # ["a", "bc"] share the key "abc" -- confirm node names make this
        # impossible.
        return "".join(sorted(cliques))

    def transform_clique_vocab(self,clique_vocab):
        """
        Map the key of every clique of the vocabulary to its index.

        :param clique_vocab: list of cliques.
        :return: dict ``{clique2str(clique): index}``.
        """
        cdef dict new_vocab={}
        for index, clique in enumerate(clique_vocab):
            new_vocab[self.clique2str(clique)]=index
        return new_vocab

    def ifHaveMinor(self,clique, dict mapping):
        """
        Tell whether a clique is part of a vocabulary mapping.

        :param clique: iterable of node names forming the clique.
        :param mapping: dict keyed by :meth:`clique2str` strings.
        :return: 1 if the key of the clique is in ``mapping``, else 0.
        """
        return 1 if self.clique2str(clique) in mapping else 0

    def getBagOfCliques(self,graphs ):
        """
        Build the bag-of-cliques matrix of a population of graphs.

        :param graphs: sequence of graphs (anything accepted by nx.Graph).
        :return: array of shape (len(graphs), vocabulary size) whose entry
            [g, c] is 1 when clique c was found in graph g, else 0.
        """
        cdef list clique_vocab=self.getUniqueCliques(graphs)
        cdef dict map_str_cliques=self.transform_clique_vocab(clique_vocab)
        cdef int l_v=len(clique_vocab)
        cdef np.ndarray boc = np.zeros((len(graphs), l_v))
        cdef list cliques
        for g in range(len(graphs)):
            sys.stdout.write("\r{0}/{1}".format(g,len(graphs)))
            cliques = list(getClique(nx.Graph(graphs[g])))
            for clique in cliques:
                key=self.clique2str(clique)
                if key in map_str_cliques:
                    boc[g, map_str_cliques[key]] = 1
        return boc
\ No newline at end of file
# coding = utf-8
import networkx as nx
import numpy as np
import scipy.sparse
class DeltaCon0():
    """
    DeltaCon0 graph similarity.

    Each graph G is summarised by the affinity matrix
    ``S = inv(I + eps**2 * D - eps * A)`` where D and A are its degree and
    adjacency matrices and ``eps = 1 / (1 + max degree)``. The similarity of
    two graphs is ``1 / (1 + d)`` with d the root Euclidean distance between
    their affinity matrices.
    """
    __type__ = "sim"

    @staticmethod
    def compare(list_gs,selected):
        """
        Compute the pairwise DeltaCon0 similarity matrix.

        :param list_gs: sequence of networkx graphs; falsy entries (None or
            empty graphs) get a similarity of 0 with everything.
        :param selected: indices of the rows to compute; a falsy value means
            "all rows".
        :return: symmetric (n, n) numpy array of similarities.
        """
        n=len(list_gs)
        comparison_matrix = np.zeros((n,n))
        # Each affinity matrix is needed for up to n pairs but only depends
        # on its own graph: invert once per graph, on first use.
        affinities = {}

        def affinity_of(index):
            if index not in affinities:
                affinities[index] = DeltaCon0._affinityMatrix(list_gs[index])
            return affinities[index]

        for i in range(n):
            if selected and i not in selected:
                continue
            if not list_gs[i]:
                continue
            for j in range(i,n):
                if not list_gs[j]:
                    continue
                # NOTE(review): the two affinity matrices must have the same
                # shape, i.e. both graphs need the same number of nodes --
                # graphs of different sizes are not handled here.
                score = 1/(1+DeltaCon0.rootED(affinity_of(i),affinity_of(j)))
                comparison_matrix[i,j] = score
                comparison_matrix[j,i] = score
        return comparison_matrix

    @staticmethod
    def _affinityMatrix(G):
        """Return inv(I + eps^2 * D - eps * A) for graph G as a dense array."""
        epsilon = 1/(1+DeltaCon0.maxDegree(G))
        D, A = DeltaCon0.degreeAndAdjacencyMatrix(G)
        system = np.asarray(np.identity(len(G))+(epsilon**2)*D -epsilon*A)
        return np.linalg.inv(system)

    @staticmethod
    def rootED(S1,S2):
        """
        Return the root Euclidean distance sqrt(sum((S1 - S2)^2)).

        :param S1: array-like.
        :param S2: array-like of the same shape as S1.
        :return: the distance as a float.
        """
        # Work on plain arrays: with np.matrix operands ``** 2`` would be a
        # matrix product instead of an element-wise square.
        difference = np.asarray(S1, dtype=float) - np.asarray(S2, dtype=float)
        return np.sqrt(np.sum(difference**2))

    @staticmethod
    def degreeAndAdjacencyMatrix(G):
        """
        Return the Degree(D) and Adjacency Matrix(A) from a graph G.
        Inspired by the nx.laplacian_matrix(G,nodelist,weight) code of networkx.

        :param G: networkx graph.
        :return: tuple (D, A) of scipy CSR sparse matrices, rows and columns
            following the order of ``list(G.nodes)``.
        """
        A = nx.to_scipy_sparse_matrix(G, nodelist=list(G.nodes), weight="weight",
                                      format='csr')
        n, m = A.shape
        # Row sums of the (weighted) adjacency matrix form the diagonal of D.
        diags = A.sum(axis=1)
        D = scipy.sparse.spdiags(diags.flatten(), [0], m, n, format='csr')
        return D, A

    @staticmethod
    def maxDegree(G):
        """
        Return the largest node degree of graph G.

        :param G: graph exposing ``degree()``.
        :return: the maximum degree.
        :raises ValueError: if G has no node.
        """
        # dict() accepts both the DegreeView of networkx 2.x and the plain
        # dict returned by networkx 1.x; DegreeView has no .values().
        return max(dict(G.degree()).values())
class DeltaCon():
    """
    DeltaCon graph similarity (group-based approximation of DeltaCon0).

    The nodes of the two compared graphs are randomly split into at most
    ``g`` groups; for each group the linear system
    ``(I + eps**2 * D - eps * A) s = e`` is solved, e being the indicator
    vector of the group. The similarity is ``1 / (1 + d)`` with d the root
    Euclidean distance between the stacked solutions of the two graphs.
    """
    __type__ = "sim"

    @staticmethod
    def relabel_nodes(graph_list):
        """
        Relabel the nodes of all graphs with shared consecutive integers.

        A node label appearing in several graphs gets the same integer in
        each of them. The input list is left untouched.

        :param graph_list: sequence of networkx graphs.
        :return: new list of relabelled graph copies.
        """
        # label_lookup maps every original node label to its short integer
        # label; it is shared by all the graphs.
        label_lookup = {}
        relabelled = []
        for G in graph_list:
            for node in list(G.nodes):
                if node not in label_lookup:
                    label_lookup[node] = len(label_lookup)
            relabelled.append(nx.relabel_nodes(G, label_lookup))
        return relabelled

    @staticmethod
    def _system_matrix(G):
        """Return I + eps^2 * D - eps * A for graph G as a dense array."""
        epsilon = 1/(1+DeltaCon0.maxDegree(G))
        D, A = DeltaCon0.degreeAndAdjacencyMatrix(G)
        return np.asarray(np.identity(len(G))+(epsilon**2)*D -epsilon*A)

    @staticmethod
    def compare(list_gs, g=3):
        """
        Compute the pairwise DeltaCon similarity matrix.

        :param list_gs: sequence of networkx graphs.
        :param g: maximum number of node groups.
        :return: symmetric (n, n) numpy array of similarities. The grouping
            is random (np.random.shuffle), so results vary between calls.
        """
        n=len(list_gs)
        list_gs=DeltaCon.relabel_nodes(list_gs)
        comparison_matrix = np.zeros((n,n))
        # The system matrix only depends on the graph: build it once.
        systems = [DeltaCon._system_matrix(G) for G in list_gs]
        for i in range(n):
            for j in range(i,n):
                g1,g2=list_gs[i],list_gs[j]
                nodes1,nodes2=list(g1.nodes),list(g2.nodes)

                V = np.unique(nodes1+nodes2)
                np.random.shuffle(V)
                # Kept as a list of arrays: the groups may have different
                # sizes, which cannot be stored in a regular 2-D array.
                if len(V) < g:
                    partitions = [V]
                else:
                    partitions = np.array_split(V,g)

                partitions_e_1 = DeltaCon.partitions2e(partitions, nodes1)
                partitions_e_2 = DeltaCon.partitions2e(partitions, nodes2)
                # s_k = inv(M) e_k is obtained by solving M s_k = e_k with
                # the system matrix itself (not with its inverse).
                S1 = [np.linalg.solve(systems[i], e).tolist() for e in partitions_e_1]
                S2 = [np.linalg.solve(systems[j], e).tolist() for e in partitions_e_2]

                # NOTE(review): S1 and S2 only have the same shape when both
                # graphs have the same number of nodes -- graphs of
                # different sizes are not handled here.
                comparison_matrix[i,j] = 1/(1+DeltaCon0.rootED(np.array(S1),np.array(S2)))
                comparison_matrix[j,i] = comparison_matrix[i,j]

        return comparison_matrix

    @staticmethod
    def partitions2e( partitions, V):
        """
        Build the indicator vector of every node group over the node list V.

        :param partitions: sequence of groups, each a sequence of nodes.
        :param V: list of the nodes of one graph.
        :return: list with one vector per group; entry k of a vector is 1.0
            when node V[k] belongs to the group, else 0.0.
        """
        e = []
        for partition in partitions:
            members = set(np.asarray(partition).ravel().tolist())
            # Membership is tested on the node itself, not on its position.
            e.append([1.0 if node in members else 0.0 for node in V])
        return e
\ No newline at end of file
# coding = utf-8
\ No newline at end of file
# coding = utf-8
from termcolor import colored
class NotFoundDistance(Exception):
    """
    Raised when the requested distance name is not a known distance.

    :param dd: the name that was requested.
    :param distanceFunctionDict: dict whose keys are the available distance
        names; they are listed in the error message.
    """
    def __init__(self,dd,distanceFunctionDict):
        message = "{0} is not an edit distance implemented ! Select a distance from : {1}".format(dd,",".join(distanceFunctionDict.keys()))
        # super() must name this class, not Exception, to start the lookup
        # at the direct parent.
        super(NotFoundDistance, self).__init__(colored(message,"red"))
# -*- coding: UTF-8 -*-
from __future__ import print_function
import sys
import numpy as np
from scipy.optimize import linear_sum_assignment
cimport numpy as np
class AbstractGraphEditDistance(object):
    """
    Base class of the assignment-based graph edit distances.

    A square cost matrix of node substitutions, insertions and deletions is
    built from the two graphs and solved with ``linear_sum_assignment``; the
    distance is the sum of the selected costs. Subclasses provide
    ``substitute_cost``, ``insert_cost`` and ``delete_cost``.

    :param g1: first graph (exposes ``nodes`` and ``len``).
    :param g2: second graph.
    :param debug: when True, the cost matrix, the edit operations and the
        edit path are printed.
    :param kwargs: optional unit costs ``node_del``, ``node_ins``,
        ``edge_del`` and ``edge_ins`` (each defaults to 1).
    """

    def __init__(self, g1, g2,debug=False,**kwargs):
        self.g1 = g1
        self.g2 = g2
        self.debug=debug

        self.node_del = kwargs.get("node_del",1)
        self.node_ins = kwargs.get("node_ins",1)
        self.edge_del = kwargs.get("edge_del",1)
        self.edge_ins = kwargs.get("edge_ins",1)

    def distance(self):
        """Return the edit distance: the sum of the costs of the edit path."""
        opt_path = self.edit_costs()
        if self.debug:
            print("Edit path for ",str(self.__class__.__name__),"\n",opt_path)
        return sum(opt_path)

    def print_operations(self,cost_matrix,row_ind,col_ind):
        """
        Print the edit operation behind every selected cell of the matrix.

        :param cost_matrix: matrix returned by ``create_cost_matrix``.
        :param row_ind: row indices of the assignment.
        :param col_ind: column indices of the assignment.
        """
        cdef list nodes1 = list(self.g1.nodes)
        cdef list nodes2 = list(self.g2.nodes)
        dn1 = self.g1.nodes
        dn2 = self.g2.nodes

        cdef int n=len(nodes1)
        cdef int m=len(nodes2)
        cdef int x,y,i
        for i in range(len(row_ind)):
            y,x=row_ind[i],col_ind[i]
            val=cost_matrix[y][x]
            if x<m and y<n:
                print("SUB {0} to {1} cost = {2}".format(dn1[nodes1[y]]["label"],dn2[nodes2[x]]["label"],val))
            elif x <m and y>=n:
                print("ADD {0} cost = {1}".format(dn2[nodes2[y-n]]["label"],val))
            elif x>=m and y<n:
                # In the deletion region the row is the index of the deleted
                # node of g1 (m - x is zero or negative and picked the wrong node).
                print("DEL {0} cost = {1}".format(dn1[nodes1[y]]["label"],val))

    def edit_costs(self):
        """
        Solve the assignment problem on the cost matrix.

        :return: list of the costs of the selected cells, one per row.
        """
        cdef np.ndarray cost_matrix = self.create_cost_matrix()
        if self.debug:
            np.set_printoptions(precision=3)
            print("Cost Matrix for ",str(self.__class__.__name__),"\n",cost_matrix)

        row_ind,col_ind = linear_sum_assignment(cost_matrix)
        if self.debug:
            self.print_operations(cost_matrix,row_ind,col_ind)
        return [cost_matrix[row][col] for row, col in zip(row_ind, col_ind)]

    def create_cost_matrix(self):
        """
        Creates a |N+M| X |N+M| cost matrix between all nodes in
        graphs g1 (N nodes) and g2 (M nodes).
        Each cost represents the cost of substituting,
        deleting or inserting a node.
        The cost matrix consists of four regions:

        substitute (N x M) | delete (N x N)
        -----------------------------------
        insert (M x M)     | zeros (M x N)

        The matrix is also stored in ``self.cost_matrix``.
        """
        cdef int n = len(self.g1)
        cdef int m = len(self.g2)
        cdef np.ndarray cost_matrix = np.zeros((n+m,n+m))
        cdef list nodes1 = list(self.g1.nodes)
        cdef list nodes2 = list(self.g2.nodes)
        cdef int i,j
        for i in range(n):
            for j in range(m):
                cost_matrix[i,j] = self.substitute_cost(nodes1[i], nodes2[j])

        for i in range(m):
            for j in range(m):
                cost_matrix[i+n,j] = self.insert_cost(i, j, nodes2)

        for i in range(n):
            for j in range(n):
                cost_matrix[j,i+m] = self.delete_cost(i, j, nodes1)

        self.cost_matrix = cost_matrix
        return cost_matrix

    def insert_cost(self, int i, int j, nodes=None):
        """
        Cost of cell (i, j) of the insertion region; ``nodes`` is the node
        list of g2, as passed by ``create_cost_matrix``.
        """
        raise NotImplementedError

    def delete_cost(self, int i, int j, nodes=None):
        """
        Cost of cell (i, j) of the deletion region; ``nodes`` is the node
        list of g1, as passed by ``create_cost_matrix``.
        """
        raise NotImplementedError

    def substitute_cost(self, nodes1, nodes2):
        """Cost of substituting a node of g1 with a node of g2."""
        raise NotImplementedError

    def print_matrix(self):
        """Print the node lists and the cost matrix, one matrix row per line."""
        # Build the matrix once and reuse it for both displays.
        cost_matrix = np.array(self.create_cost_matrix())
        print("cost matrix:")
        print(list(self.g1.nodes))
        print(list(self.g2.nodes))
        print(cost_matrix)
        for row in cost_matrix:
            for value in row:
                # end="" keeps the values of a row on the same line.
                if value == sys.maxsize:
                    print("inf\t", end="")
                else:
                    print("%.2f\t" % float(value), end="")
            print("")
import sys
from .abstract_graph_edit_dist import AbstractGraphEditDistance
class EdgeEditDistance(AbstractGraphEditDistance):
    """
    Edit distance between the edge sets of two nodes.

    Here the "graphs" handed to the base class are edge collections: each
    edge plays the role of a node in the assignment problem.
    """

    def __init__(self, g1, g2,**kwargs):
        super(EdgeEditDistance, self).__init__(g1, g2, **kwargs)

    def insert_cost(self, int i, int j, nodes2):
        # Only diagonal cells are real insertions; the others are forbidden.
        return self.edge_ins if i == j else sys.maxsize

    def delete_cost(self, int i, int j, nodes1):
        # Only diagonal cells are real deletions; the others are forbidden.
        return self.edge_del if i == j else sys.maxsize

    def substitute_cost(self, edge1, edge2):
        # Replacing an edge by a different one costs a deletion plus an
        # insertion; identical edges cost nothing.
        return 0. if edge1 == edge2 else self.edge_del+self.edge_ins
# -*- coding: UTF-8 -*-
import sys
import networkx as nx
from .abstract_graph_edit_dist import AbstractGraphEditDistance
from .edge_edit_dist import EdgeEditDistance
from ..graph.edge_graph import EdgeGraph
def compare(g1, g2, print_details=False):
    """
    Return the approximate graph edit distance between g1 and g2.

    :param g1: first graph.
    :param g2: second graph.
    :param print_details: passed as the ``debug`` flag of GraphEditDistance.
    :return: the value of ``GraphEditDistance.distance()``.
    """
    return GraphEditDistance(g1, g2, print_details).distance()
class GraphEditDistance(AbstractGraphEditDistance):
    """
    Approximate graph edit distance between two networkx graphs.

    Node costs come from the base-class assignment problem; the cost of
    substituting two nodes also accounts for the difference between their
    incident edges, itself computed with an EdgeEditDistance.
    """

    def __init__(self, g1, g2,debug=False,**kwargs):
        AbstractGraphEditDistance.__init__(self, g1, g2,debug,**kwargs)

    def substitute_cost(self, node1, node2):
        """Return relabel cost plus edge difference for the two nodes."""
        return self.relabel_cost(node1, node2) + self.edge_diff(node1, node2)

    def relabel_cost(self, node1, node2):
        """
        Return the cost of turning node1 (of g1) into node2 (of g2).

        Equal nodes cost the number of coloured edges of node2 that node1
        lacks; different nodes cost one insertion plus one deletion.
        """
        if node1 == node2:
            edges1=set(self.get_edge_multigraph(self.g1,node1))
            edges2=set(self.get_edge_multigraph(self.g2,node2))
            # Take into account a different number of edges
            return len(edges2.difference(edges1))
        else:
            return self.node_ins+self.node_del

    def delete_cost(self, int i, int j, nodes1):
        """Cost of cell (i, j) of the deletion region of the cost matrix."""
        if i == j:
            # Deleting a node implies deleting its in and out edges
            return self.node_del+self.g1.degree(nodes1[i])
        return sys.maxsize

    def insert_cost(self, int i, int j, nodes2):
        """Cost of cell (i, j) of the insertion region of the cost matrix."""
        if i == j:
            deg=self.g2.degree(nodes2[j])
            # A dict is not a usable degree value: count no edge then.
            if isinstance(deg,dict):deg=0
            return self.node_ins+deg
        else:
            return sys.maxsize

    def get_edge_multigraph(self,g,node):
        """
        Return the coloured edges leaving ``node`` in graph ``g``.

        :return: list of strings ``str(neighbour) + colour``, one per edge;
            every edge must carry a "color" attribute.
        """
        cdef list edges=[]
        # g[node] is the adjacency of the node: {neighbour: edge data}.
        # (g.edges[...] expects an edge, not a node.)
        for id_,val in g[node].items():
            if not 0 in val:
                # Simple graph: val is the attribute dict of the edge.
                edges.append(str(id_) + val["color"])
            else:
                # Multigraph: val maps edge keys (0, 1, ...) to attribute dicts.
                for _,edge in val.items():
                    edges.append(str(id_)+edge["color"])
        return edges

    def edge_diff(self, node1, node2):
        """
        Return the edit distance between the edges of node1 and of node2.

        Edges are the coloured edges for a MultiDiGraph, and the neighbours
        of the node otherwise.
        """
        cdef list edges1,edges2
        if isinstance(self.g1,nx.MultiDiGraph):
            edges1 = self.get_edge_multigraph(self.g1,node1)
            edges2 = self.get_edge_multigraph(self.g2,node2)
        else:
            edges1 = list(self.g1[node1].keys())
            edges2 = list(self.g2[node2].keys())
        if len(edges1) == 0 or len(edges2) == 0:
            # Nothing to match: every edge of the other node is an edit.
            return max(len(edges1), len(edges2))

        edit_edit_dist = EdgeEditDistance(
            EdgeGraph(node1,edges1),
            EdgeGraph(node2,edges2),
            edge_del=self.edge_del,edge_ins=self.edge_ins,node_ins=self.node_ins,node_del=self.node_del
        )
        return edit_edit_dist.distance()
# coding = utf-8
import numpy as np
from .algorithm.graph_edit_dist import GraphEditDistance
from cython.parallel import prange
class ApproximateGraphEditDistance():
    """
    Pairwise approximate graph edit distance over a list of graphs.
    """
    __type__ = "dist"

    @staticmethod
    def compare(listgs,selected,c_del_node=1,c_del_edge=1,c_ins_node=1,c_ins_edge=1):
        """
        Compute the pairwise distance matrix with GraphEditDistance.

        :param listgs: sequence of graphs.
        :param selected: indices of the rows to compute; a falsy value means
            "all rows".
        :param c_del_node: cost of a node deletion.
        :param c_del_edge: cost of an edge deletion.
        :param c_ins_node: cost of a node insertion.
        :param c_ins_edge: cost of an edge insertion.
        :return: symmetric (n, n) typed memoryview of doubles; pairs that
            are skipped (falsy or empty graph, row not selected) are np.inf.
        """
        cdef int n= len(listgs)
        cdef double[:,:] comparison_matrix = np.zeros((n,n))
        cdef int i,j
        for i in prange(n,nogil=True):
            for j in range(i,n):
                # Everything below handles Python objects, hence the GIL.
                with gil:
                    skipped = (not listgs[i] or not listgs[j]
                               or len(listgs[i]) == 0 or len(listgs[j]) == 0)
                    if selected and i not in selected:
                        skipped = True
                    if skipped:
                        comparison_matrix[i, j] = np.inf
                    else:
                        comparison_matrix[i, j] = GraphEditDistance(listgs[i],listgs[j],False,node_del=c_del_node,node_ins=c_ins_node,edge_del=c_del_edge,edge_ins=c_ins_edge).distance()
                    comparison_matrix[j, i] = comparison_matrix[i, j]
        return comparison_matrix
\ No newline at end of file
# coding = utf-8
import numpy as np