diff --git a/gmatch4py/alg_types.pyx b/gmatch4py/alg_types.pyx deleted file mode 100644 index 83ae12b41548a7802f08a8449120236ab440e0b2..0000000000000000000000000000000000000000 --- a/gmatch4py/alg_types.pyx +++ /dev/null @@ -1,7 +0,0 @@ -# coding = utf-8 -from enum import Enum - - -class AlgorithmType(Enum): - similarity = 0 - distance = 1 \ No newline at end of file diff --git a/gmatch4py/bag_of_cliques.pyx b/gmatch4py/bag_of_cliques.pyx index f418683eaa1ebf0a9a3482b208979f5a05ef26e5..4381672103d56a446fb713673737966d31332821 100644 --- a/gmatch4py/bag_of_cliques.pyx +++ b/gmatch4py/bag_of_cliques.pyx @@ -9,7 +9,7 @@ cimport numpy as np from scipy.sparse import csr_matrix,lil_matrix import sys -from .base cimport Base,intersection +from .base cimport Base cdef class BagOfCliques(Base): diff --git a/gmatch4py/base.pxd b/gmatch4py/base.pxd index 0f0eb7f871fffae0fdbc4a9a1df7b5cdd05144fb..9b03236261ba55b450e58ab7dc2be95c8437e9f7 100644 --- a/gmatch4py/base.pxd +++ b/gmatch4py/base.pxd @@ -17,6 +17,3 @@ cdef class Base: cpdef set_attr_graph_used(self, str node_attr_key, str edge_attr_key) - -cpdef intersection(G,H) -cpdef union_(G,H) diff --git a/gmatch4py/base.pyx b/gmatch4py/base.pyx index cf904f4d99be40fe6905a082385ff157626a5b50..bd49edbaeac45e8ecd3a7ade40c6eb8b8cc94e2b 100644 --- a/gmatch4py/base.pyx +++ b/gmatch4py/base.pyx @@ -21,85 +21,6 @@ cpdef np.ndarray minmax_scale(np.ndarray matrix): return x/(max_) - -cpdef intersection(G, H): - """ - Return a new graph that contains only the edges and nodes that exist in - both G and H. - - The node sets of H and G must be the same. - - Parameters - ---------- - G,H : graph - A NetworkX graph. G and H must have the same node sets. - - Returns - ------- - GH : A new graph with the same type as G. - - Notes - ----- - Attributes from the graph, nodes, and edges are not copied to the new - graph. 
If you want a new graph of the intersection of G and H - with the attributes (including edge data) from G use remove_nodes_from() - as follows - - >>> G=nx.path_graph(3) - >>> H=nx.path_graph(5) - >>> R=G.copy() - >>> R.remove_nodes_from(n for n in G if n not in H) - - Modified so it can be used with two graphs with different nodes set - """ - # create new graph - R = nx.create_empty_copy(G) - - if not G.is_multigraph() == H.is_multigraph(): - raise nx.NetworkXError('G and H must both be graphs or multigraphs.') - if G.number_of_edges() <= H.number_of_edges(): - if G.is_multigraph(): - edges = G.edges(keys=True) - else: - edges = G.edges() - for e in edges: - if H.has_edge(*e): - R.add_edge(*e) - else: - if H.is_multigraph(): - edges = H.edges(keys=True) - else: - edges = H.edges() - for e in edges: - if G.has_edge(*e): - R.add_edge(*e) - nodes_g=set(G.nodes()) - nodes_h=set(H.nodes()) - R.remove_nodes_from(list(nodes_g - nodes_h)) - return R - -cpdef union_(G, H): - """ - Return a graph that contains nodes and edges from both graph G and H. - - Parameters - ---------- - G : networkx.Graph - First graph - H : networkx.Graph - Second graph - - Returns - ------- - networkx.Graph - A new graph with the same type as G. - """ - R = nx.create_empty_copy(G) - R.add_nodes_from(H.nodes(data=True)) - R.add_edges_from(G.edges(data=True)) - R.add_edges_from(H.edges(data=True)) - return R - cdef class Base: """ This class define the common methods to all Graph Matching algorithm. @@ -145,10 +66,34 @@ cdef class Base: self.edge_attr_key=edge_attr_key cpdef set_attr_graph_used(self, str node_attr_key, str edge_attr_key): + """ + Set graph attribute used by the algorithm to compare graphs. 
+ Parameters + ---------- + node_attr_key : str + key of the node attribute + edge_attr_key: str + key of the edge attribute + + """ self.node_attr_key=node_attr_key self.edge_attr_key=edge_attr_key cpdef np.ndarray get_selected_array(self,selected,size_corpus): + """ + Return an array which defines which graphs will be compared in the algorithms. + Parameters + ---------- + selected : list + indices of graphs you wish to compare + size_corpus : + size of your dataset + + Returns + ------- + np.ndarray + selected vector (1 -> selected, 0 -> not selected) + """ cdef double[:] selected_test = np.zeros(size_corpus) if not selected == None: for ix in range(len(selected)): @@ -159,6 +104,20 @@ cdef class Base: cpdef np.ndarray compare_old(self,list listgs, list selected): + """ + Will soon be deprecated! Kept to store the old version of an algorithm. + Parameters + ---------- + listgs : list + list of graphs + selected + selected graphs + + Returns + ------- + np.ndarray + distance/similarity matrix + """ pass @cython.boundscheck(False) @@ -179,7 +138,7 @@ cdef class Base: the None value Returns ------- - np.array + np.ndarray distance/similarity matrix """ @@ -190,12 +149,12 @@ cdef class Base: Return a normalized distance matrix Parameters ---------- - matrix : np.array - Similarity/distance matrix you want to transform + matrix : np.ndarray + Similarity/distance matrix you wish to transform Returns ------- - np.array + np.ndarray distance matrix """ if self.type_alg == 1: @@ -212,8 +171,8 @@ cdef class Base: Return a normalized similarity matrix Parameters ---------- - matrix : np.array - Similarity/distance matrix you want to transform + matrix : np.ndarray + Similarity/distance matrix you wish to transform Returns ------- @@ -227,24 +186,6 @@ cdef class Base: matrix=np.ma.getdata(minmax_scale(matrix)) return 1-matrix - def mcs(self, G, H): - """ - Return the Most Common Subgraph of - Parameters - ---------- - G : networkx.Graph - First Graph - H : networkx.Graph - 
Second Graph - - Returns - ------- - networkx.Graph - Most common Subgrah - """ - R=G.copy() - R.remove_nodes_from(n for n in G if n not in H) - return R cpdef bint isAccepted(self,G,index,selected): """ diff --git a/gmatch4py/bon.pyx b/gmatch4py/bon.pyx index 396231e7f98677ce6c3ce5db2281979fa8e768c5..0cc7e0e14061251150739f8adbbf76d333b88aaf 100644 --- a/gmatch4py/bon.pyx +++ b/gmatch4py/bon.pyx @@ -11,7 +11,7 @@ cdef class BagOfNodes(Base): We could call this algorithm Bag of nodes """ def __init__(self): - Base.__init__(self,0,True) + Base.__init__(self,0,True) cpdef np.ndarray compare(self,list graph_list, list selected): nodes = list() diff --git a/gmatch4py/deltacon.pyx b/gmatch4py/deltacon.pyx deleted file mode 100644 index a4d01b05816bb3bacf08818f42cf954cb80dc524..0000000000000000000000000000000000000000 --- a/gmatch4py/deltacon.pyx +++ /dev/null @@ -1,153 +0,0 @@ -# coding = utf-8 - -import networkx as nx -import numpy as np -import scipy.sparse - - -class DeltaCon0(): - __type__ = "sim" - - @staticmethod - def compare(list_gs,selected): - n=len(list_gs) - - comparison_matrix = np.zeros((n,n)) - for i in range(n): - for j in range(i,n): - g1,g2=list_gs[i],list_gs[j] - f=True - if not list_gs[i] or not list_gs[j]: - f=False - elif len(list_gs[i])== 0 or len(list_gs[j]) == 0: - f=False - if selected: - if not i in selected: - f=False - if f: - # S1 - epsilon = 1/(1+DeltaCon0.maxDegree(g1)) - D, A = DeltaCon0.degreeAndAdjacencyMatrix(g1) - S1 = np.linalg.inv(np.identity(len(g1))+(epsilon**2)*D -epsilon*A) - - # S2 - D, A = DeltaCon0.degreeAndAdjacencyMatrix(g2) - epsilon = 1 / (1 + DeltaCon0.maxDegree(g2)) - S2 = np.linalg.inv(np.identity(len(g2))+(epsilon**2)*D -epsilon*A) - - - comparison_matrix[i,j] = 1/(1+DeltaCon0.rootED(S1,S2)) - comparison_matrix[j,i] = comparison_matrix[i,j] - else: - comparison_matrix[i, j] = 0. 
- comparison_matrix[j, i] = comparison_matrix[i, j] - - - return comparison_matrix - - @staticmethod - def rootED(S1,S2): - return np.sqrt(np.sum((S1-S2)**2)) # Long live numpy ! - - @staticmethod - def degreeAndAdjacencyMatrix(G): - """ - Return the Degree(D) and Adjacency Matrix(A) from a graph G. - Inspired of nx.laplacian_matrix(G,nodelist,weight) code proposed by networkx - :param G: - :return: - """ - A = nx.to_scipy_sparse_matrix(G, nodelist=list(G.nodes), weight="weight", - format='csr') - n, m = A.shape - diags = A.sum(axis=1) - D = scipy.sparse.spdiags(diags.flatten(), [0], m, n, format='csr') - - return D, A - @staticmethod - def maxDegree(G): - degree_sequence = sorted(nx.degree(G).values(), reverse=True) # degree sequence - # print "Degree sequence", degree_sequence - dmax = max(degree_sequence) - return dmax - -class DeltaCon(): - __type__ = "sim" - - @staticmethod - def relabel_nodes(graph_list): - label_lookup = {} - label_counter = 0 - n= len(graph_list) - # label_lookup is an associative array, which will contain the - # mapping from multiset labels (strings) to short labels - # (integers) - for i in range(n): - nodes = list(graph_list[i].nodes) - - for j in range(len(nodes)): - if not (nodes[j] in label_lookup): - label_lookup[nodes[j]] = label_counter - label_counter += 1 - - graph_list[i] = nx.relabel_nodes(graph_list[i], label_lookup) - return graph_list - @staticmethod - def compare(list_gs, g=3): - n=len(list_gs) - list_gs=DeltaCon.relabel_nodes(list_gs) - comparison_matrix = np.zeros((n,n)) - for i in range(n): - for j in range(i,n): - g1,g2=list_gs[i],list_gs[j] - - V = list(g1.nodes) - V.extend(list(g2.nodes)) - V=np.unique(V) - - partitions=V.copy() - np.random.shuffle(partitions) - if len(partitions)< g: - partitions=np.array([partitions]) - else: - partitions=np.array_split(partitions,g) - partitions_e_1 = DeltaCon.partitions2e(partitions, list(g1.nodes)) - partitions_e_2 = DeltaCon.partitions2e(partitions, list(g2.nodes)) - 
S1,S2=[],[] - for k in range(len(partitions)): - s0k1,s0k2=partitions_e_1[k],partitions_e_2[k] - - # S1 - epsilon = 1/(1+DeltaCon0.maxDegree(g1)) - D, A = DeltaCon0.degreeAndAdjacencyMatrix(g1) - s1k = np.linalg.inv(np.identity(len(g1))+(epsilon**2)*D -epsilon*A) - s1k=np.linalg.solve(s1k,s0k1).tolist() - - # S2 - D, A = DeltaCon0.degreeAndAdjacencyMatrix(g2) - epsilon = 1 / (1 + DeltaCon0.maxDegree(g2)) - s2k= np.linalg.inv(np.identity(len(g2))+(epsilon**2)*D -epsilon*A) - s2k = np.linalg.solve(s2k, s0k2).tolist() - - - - S1.append(s1k) - S2.append(s2k) - - comparison_matrix[i,j] = 1/(1+DeltaCon0.rootED(np.array(S1),np.array(S2))) - comparison_matrix[j,i] = comparison_matrix[i,j] - - return comparison_matrix - - - @staticmethod - def partitions2e( partitions, V): - e = [ [] for i in range(len(partitions))] - for p in range(len(partitions)): - e[p] = [] - for i in range(len(V)): - if i in partitions[p]: - e[p].append(1.0) - else: - e[p].append(0.0) - return e \ No newline at end of file diff --git a/gmatch4py/embedding/deepwalk.pyx b/gmatch4py/embedding/deepwalk.pyx index 104ab042fd94049314934460fb733badd1644218..5e91f6f87f87d33d836528e9fa3fc98ced3e2cdf 100644 --- a/gmatch4py/embedding/deepwalk.pyx +++ b/gmatch4py/embedding/deepwalk.pyx @@ -4,31 +4,31 @@ import os import sys import random -import networkx as nx + from io import open from argparse import ArgumentParser, FileType, ArgumentDefaultsHelpFormatter from collections import Counter from concurrent.futures import ProcessPoolExecutor import logging +from multiprocessing import cpu_count -import graph as graph2 -import walks as serialized_walks -from gensim.models import Word2Vec -from skipgram import Skipgram - +import networkx as nx +import numpy as np +cimport numpy as np from six import text_type as unicode from six import iteritems from six.moves import range -cimport cython +from gensim.models import Word2Vec from sklearn.metrics.pairwise import cosine_similarity -from ..base cimport Base -import numpy 
as np -cimport numpy as np - -import psutil -from multiprocessing import cpu_count from joblib import Parallel, delayed +import psutil + +cimport cython +from ..base cimport Base +import graph as graph2 +import walks as serialized_walks +from skipgram import Skipgram p = psutil.Process(os.getpid()) @@ -42,6 +42,36 @@ except AttributeError: def process(gr, number_walks = 10, walk_length = 40, window_size = 5, vertex_freq_degree = False, workers = 1, representation_size = 64, max_memory_data_size = 1000000000, seed = 0): + """ + Return a DeepWalk embedding for a graph + + Parameters + ---------- + gr : nx.Graph + graph + number_walks : int, optional + Number of walk (the default is 10) + walk_length : int, optional + Length of the random walk started at each node (the default is 40) + window_size : int, optional + Window size of skipgram model. (the default is 5) + vertex_freq_degree : bool, optional + Use vertex degree to estimate the frequency of nodes (the default is False) + workers : int, optional + Number of parallel processes (the default is 1) + representation_size : int, optional + Number of latent dimensions to learn for each node (the default is 64) + max_memory_data_size : int, optional + 'Size to start dumping walks to disk, instead of keeping them in memory. 
(the default is 1000000000) + seed : int, optional + Seed for random walk generator (the default is 0) + + Returns + ------- + np.array + DeepWalk embedding + """ + if len(gr.edges())<1: return np.zeros((1,representation_size)) G = graph2.from_networkx(gr.copy(), undirected=gr.is_directed()) @@ -115,6 +145,20 @@ cdef class DeepWalk(Base): Base.__init__(self,0,True) def extract_embedding(self, listgs): + """ + Extract DeepWalk embedding of each graph in `listgs` + + Parameters + ---------- + listgs : list + list of graphs + + Returns + ------- + list + list of embeddings + """ + from tqdm import tqdm models = Parallel(n_jobs = cpu_count())(delayed(process)(nx.Graph(g)) for g in tqdm(listgs,desc="Extracting Embeddings...")) return models diff --git a/gmatch4py/embedding/graph2vec.pyx b/gmatch4py/embedding/graph2vec.pyx index 669666240551f56eea43f356b237aab95b3b6b26..461efaf8a15e75476d5db15e83a28980502f78d2 100644 --- a/gmatch4py/embedding/graph2vec.pyx +++ b/gmatch4py/embedding/graph2vec.pyx @@ -1,13 +1,15 @@ import hashlib import json import glob + import pandas as pd import networkx as nx from tqdm import tqdm cimport numpy as np +import numpy.distutils.system_info as sysinfo + from joblib import Parallel, delayed from gensim.models.doc2vec import Doc2Vec, TaggedDocument -import numpy.distutils.system_info as sysinfo from sklearn.metrics.pairwise import cosine_similarity from ..base cimport Base @@ -21,10 +23,18 @@ class WeisfeilerLehmanMachine: def __init__(self, graph, features, iterations): """ Initialization method which executes feature extraction. - :param graph: The Nx graph object. - :param features: Feature hash table. - :param iterations: Number of WL iterations. + + Parameters + ---------- + graph : nx.Graph + graph + features : dict + Feature hash table. 
+ iterations : int + number of WL iteration + """ + self.iterations = iterations self.graph = graph self.features = features @@ -35,8 +45,13 @@ class WeisfeilerLehmanMachine: def do_a_recursion(self): """ The method does a single WL recursion. - :return new_features: The hash table with extracted WL features. + + Returns + ------- + dict + The hash table with extracted WL features. """ + new_features = {} for node in self.nodes: nebs = self.graph.neighbors(node) @@ -58,11 +73,17 @@ class WeisfeilerLehmanMachine: def dataset_reader(graph): """ - Function to read the graph and features from a json file. - :param path: The path to the graph json. - :return graph: The graph object. - :return features: Features hash table. - :return name: Name of the graph. + Function to extract features from a networkx graph + + Parameters + ---------- + graph : nx.Graph + graph + + Returns + ------- + dict + Features hash table. """ features = dict(nx.degree(graph)) @@ -70,13 +91,26 @@ def dataset_reader(graph): features = {k:v for k,v, in features.items()} return graph, features + def feature_extractor(graph, ix, rounds): """ - Function to extract WL features from a graph. - :param path: The path to the graph json. - :param rounds: Number of WL iterations. - :return doc: Document collection object. 
+ Function to extract WL features from a graph + + Parameters + ---------- + graph : nx.Graph + graph + ix : int + index of the graph in the dataset + rounds : int + number of WL iterations + + Returns + ------- + TaggedDocument + random walks """ + graph, features = dataset_reader(graph) machine = WeisfeilerLehmanMachine(graph,features,rounds) doc = TaggedDocument(words = machine.extracted_features , tags = ["g_{0}".format(ix)]) @@ -87,8 +121,32 @@ def feature_extractor(graph, ix, rounds): def generate_model(graphs, iteration = 2, dimensions = 64, min_count = 5, down_sampling = 0.0001, learning_rate = 0.0001, epochs = 10, workers = 4 ): """ Main function to read the graph list, extract features, learn the embedding and save it. - :param args: Object with the arguments. + + Parameters + ---------- + graphs : nx.Graph + Input graph + iteration : int, optional + number of iteration (the default is 2) + dimensions : int, optional + output vector dimension (the default is 64) + min_count : int, optional + min count parameter of Doc2vec model (the default is 5) + down_sampling : float, optional + Down sampling rate for frequent features. 
(the default is 0.0001) + learning_rate : float, optional + Initial learning rate (the default is 0.0001, which [default_description]) + epochs : int, optional + Number of epochs (the default is 10) + workers : int, optional + Number of workers (the default is 4) + + Returns + ------- + [type] + [description] """ + document_collections = Parallel(n_jobs = workers)(delayed(feature_extractor)(g, ix,iteration) for ix,g in tqdm(enumerate(graphs),desc="Extracting Features...")) graphs=[nx.relabel_nodes(g,{node:str(node) for node in list(g.nodes)},copy=True) for g in graphs] model = Doc2Vec(document_collections, diff --git a/gmatch4py/ged/abstract_graph_edit_dist.pyx b/gmatch4py/ged/abstract_graph_edit_dist.pyx index 157bf245aed2ff920213bf1f1934125c6b015029..95ba8d42e082c1d00b914cbe9c00028175e20862 100644 --- a/gmatch4py/ged/abstract_graph_edit_dist.pyx +++ b/gmatch4py/ged/abstract_graph_edit_dist.pyx @@ -3,8 +3,12 @@ from __future__ import print_function import sys import warnings + import numpy as np cimport numpy as np +import networkx as nx +from cython.parallel cimport prange,parallel + try: from munkres import munkres except ImportError: @@ -12,9 +16,8 @@ except ImportError: from scipy.optimize import linear_sum_assignment as munkres from ..base cimport Base -import networkx as nx from ..helpers.general import parsenx2graph -from cython.parallel cimport prange,parallel + cdef class AbstractGraphEditDistance(Base): @@ -31,8 +34,19 @@ cdef class AbstractGraphEditDistance(Base): cpdef double distance_ged(self,G,H): """ - Return the distance between G and H - :return: + Return the distance value between G and H + + Parameters + ---------- + G : gmatch4py.Graph + graph + H : gmatch4py.Graph + graph + + Returns + ------- + int + distance """ cdef list opt_path = self.edit_costs(G,H) return np.sum(opt_path) @@ -41,7 +55,18 @@ cdef class AbstractGraphEditDistance(Base): cdef list edit_costs(self, G, H): """ Return the optimal path edit cost list, to transform G into H - 
:return: + + Parameters + ---------- + G : gmatch4py.Graph + graph + H : gmatch4py.Graph + graph + + Returns + ------- + list + edit path """ cdef np.ndarray cost_matrix = self.create_cost_matrix(G,H).astype(float) return cost_matrix[munkres(cost_matrix)].tolist() @@ -59,6 +84,18 @@ cdef class AbstractGraphEditDistance(Base): delete | delete -> delete The delete -> delete region is filled with zeros + + Parameters + ---------- + G : gmatch4py.Graph + graph + H : gmatch4py.Graph + graph + + Returns + ------- + np.array + cost matrix """ cdef int n,m try: @@ -86,29 +123,38 @@ cdef class AbstractGraphEditDistance(Base): return cost_matrix cdef double insert_cost(self, int i, int j, nodesH, H): + """ + Return the insert cost of the ith node in H + + Returns + ------- + float + insert cost + """ raise NotImplementedError cdef double delete_cost(self, int i, int j, nodesG, G): + """ + Return the delete cost of the ith node in G + + Returns + ------- + float + delete cost + """ raise NotImplementedError cpdef double substitute_cost(self, node1, node2, G, H): + """ + Return the substitution cost between node1 in G and node2 in H + + Returns + ------- + float + substitution cost + """ raise NotImplementedError - cpdef np.ndarray compare_old(self,list listgs, list selected): - cdef int n = len(listgs) - cdef np.ndarray comparison_matrix = np.zeros((n, n)).astype(float) - cdef int i,j - for i in range(n): - for j in range(n): - g1,g2=listgs[i],listgs[j] - f=self.isAccepted(g1 if isinstance(g1,nx.Graph) else g1.get_nx(),i,selected) - if f: - comparison_matrix[i, j] = self.distance_ged(g1, g2) - else: - comparison_matrix[i, j] = np.inf - #comparison_matrix[j, i] = comparison_matrix[i, j] - np.fill_diagonal(comparison_matrix,0) - return comparison_matrix cpdef np.ndarray compare(self,list listgs, list selected): cdef int n = len(listgs) diff --git a/gmatch4py/ged/bipartite_graph_matching_2.pyx b/gmatch4py/ged/bipartite_graph_matching_2.pyx index 
02128920bb4b2387109653d3257252b1a8f2672d..a23a5ae27912c402d1243a6fb96a2d603d442edd 100644 --- a/gmatch4py/ged/bipartite_graph_matching_2.pyx +++ b/gmatch4py/ged/bipartite_graph_matching_2.pyx @@ -35,21 +35,6 @@ cdef class BP_2(Base): self.edge_del = edge_del self.edge_ins = edge_ins - cpdef np.ndarray compare_old(self,list listgs, list selected): - cdef int n = len(listgs) - cdef np.ndarray comparison_matrix = np.zeros((n, n)).astype(float) - cdef int i,j - for i in range(n): - for j in range(i, n): - g1,g2=listgs[i],listgs[j] - f=self.isAccepted(g1,i,selected) - if f: - comparison_matrix[i, j] = self.bp2(g1, g2) - else: - comparison_matrix[i, j] = np.inf - comparison_matrix[j, i] = comparison_matrix[i, j] - - return comparison_matrix @cython.boundscheck(False) cpdef np.ndarray compare(self,list listgs, list selected): @@ -80,9 +65,9 @@ cdef class BP_2(Base): Parameters ---------- - g1 : networkx.Graph + g1 : gmatch4py.Graph First Graph - g2 : networkx.Graph + g2 : gmatch4py.Graph Second Graph Returns @@ -143,33 +128,25 @@ cdef class BP_2(Base): return psi_ - cdef float sum_fuv(self, g1, g2): - """ - Compute Nearest Neighbour Distance between G1 and G2 - :param g1: First Graph - :param g2: Second Graph - :return: - """ - cdef np.ndarray min_sum = np.zeros(g1.size()) - cdef list nodes1 = list(g1.nodes()) - cdef list nodes2 = list(g2.nodes()) - nodes2.extend([None]) - cdef np.ndarray min_i - for i in range(g1.size()): - min_i = np.zeros(g2.size()) - for j in range(g2.size()): - min_i[j] = self.fuv(g1, g2, nodes1[i], nodes2[j]) - min_sum[i] = np.min(min_i) - return np.sum(min_sum) cdef float fuv(self, g1, g2, str n1, str n2): """ Compute the Node Distance function - :param g1: first graph - :param g2: second graph - :param n1: node of the first graph - :param n2: node of the second graph - :return: + Parameters + ---------- + g1 : gmatch4py.Graph + First graph + g2 : gmatch4py.Graph + Second graph + n1 : int or str + identifier of the first node + n2 : int or str + 
identifier of the second node + + Returns + ------- + float + node distance """ if n2 == None: # Del return self.node_del + ((self.edge_del / 2.) * g1.degree(n1)) @@ -183,11 +160,21 @@ cdef class BP_2(Base): cdef float hed_edge(self, g1, g2, str n1, str n2): """ Compute HEDistance between edges of n1 and n2, respectively in g1 and g2 - :param g1: first graph - :param g2: second graph - :param n1: node of the first graph - :param n2: node of the second graph - :return: + Parameters + ---------- + g1 : gmatch4py.Graph + First graph + g2 : gmatch4py.Graph + Second graph + n1 : int or str + identifier of the first node + n2 : int or str + identifier of the second node + + Returns + ------- + float + HEDistance between g1 and g2 """ return self.sum_gpq(g1, n1, g2, n2) + self.sum_gpq(g1, n1, g2, n2) @@ -195,11 +182,21 @@ cdef class BP_2(Base): cdef float sum_gpq(self, g1, str n1, g2, str n2): """ Compute Nearest Neighbour Distance between edges around n1 in G1 and edges around n2 in G2 - :param g1: first graph - :param n1: node in the first graph - :param g2: second graph - :param n2: node in the second graph - :return: + Parameters + ---------- + g1 : gmatch4py.Graph + First graph + g2 : gmatch4py.Graph + Second graph + n1 : int or str + identifier of the first node + n2 : int or str + identifier of the second node + + Returns + ------- + float + Nearest Neighbour Distance """ #if isinstance(g1, nx.MultiDiGraph): @@ -219,10 +216,18 @@ cdef class BP_2(Base): cdef float gpq(self, str e1, str e2): """ Compute the edge distance function - :param e1: edge1 - :param e2: edge2 - :return: + Parameters + ---------- + e1 : str + first edge identifier + e2 : str + second edge identifier + Returns + ------- + float + edge distance """ + if e2 == None: # Del return self.edge_del if e1 == None: # Insert diff --git a/gmatch4py/ged/graph_edit_dist.pyx b/gmatch4py/ged/graph_edit_dist.pyx index b1f9346cf79e3bcf96dad6d7231533a47f8fcb9f..7dd400fea21b93b0fd8c19b3e6e63df95791e896 100644 --- 
a/gmatch4py/ged/graph_edit_dist.pyx +++ b/gmatch4py/ged/graph_edit_dist.pyx @@ -6,7 +6,7 @@ import networkx as nx import numpy as np cimport numpy as np from .abstract_graph_edit_dist cimport AbstractGraphEditDistance -from ..base cimport intersection,union_ + cdef class GraphEditDistance(AbstractGraphEditDistance): diff --git a/gmatch4py/ged/hausdorff_edit_distance.pyx b/gmatch4py/ged/hausdorff_edit_distance.pyx index 2db26612d9f5d6225692257b217f407c97caa03e..67d3484512c3054dba34eb5d085521480d785eab 100644 --- a/gmatch4py/ged/hausdorff_edit_distance.pyx +++ b/gmatch4py/ged/hausdorff_edit_distance.pyx @@ -22,7 +22,20 @@ cdef class HED(Base): cdef int edge_ins def __init__(self, int node_del=1, int node_ins=1, int edge_del=1, int edge_ins=1): - """Constructor for HED""" + """ + HED Constructor + + Parameters + ---------- + node_del :int + Node deletion cost + node_ins : int + Node insertion cost + edge_del : int + Edge Deletion cost + edge_ins : int + Edge Insertion cost + """ Base.__init__(self,1,False) self.node_del = node_del self.node_ins = node_ins @@ -30,22 +43,6 @@ cdef class HED(Base): self.edge_ins = edge_ins - cpdef np.ndarray compare_old(self,list listgs, list selected): - cdef int n = len(listgs) - cdef np.ndarray comparison_matrix = np.zeros((n, n)).astype(float) - cdef int i,j - for i in range(n): - for j in range(i, n): - g1,g2=listgs[i],listgs[j] - f=self.isAccepted(g1,i,selected) - if f: - comparison_matrix[i, j] = self.hed(g1, g2) - else: - comparison_matrix[i, j] = np.inf - comparison_matrix[j, i] = comparison_matrix[i, j] - - return comparison_matrix - @cython.boundscheck(False) cpdef np.ndarray compare(self,list listgs, list selected): cdef int n = len(listgs) @@ -71,20 +68,38 @@ cdef class HED(Base): cdef float hed(self, g1, g2): """ - Compute de Hausdorff Edit Distance - :param g1: first graph - :param g2: second graph - :return: + Compute the HED similarity value between two `gmatch4py.Graph` + + Parameters + ---------- + g1 : gmatch4py.Graph 
+ First Graph + g2 : gmatch4py.Graph + Second Graph + + Returns + ------- + float + similarity value """ return self.sum_fuv(g1, g2) + self.sum_fuv(g2, g1) cdef float sum_fuv(self, g1, g2): """ Compute Nearest Neighbour Distance between G1 and G2 - :param g1: First Graph - :param g2: Second Graph - :return: + Parameters + ---------- + g1 : gmatch4py.Graph + First graph + g2 : gmatch4py.Graph + Second graph + + Returns + ------- + float + Nearest Neighbour Distance """ + cdef np.ndarray min_sum = np.zeros(g1.size()) cdef list nodes1 = list(g1.nodes()) cdef list nodes2 = list(g2.nodes()) @@ -100,11 +115,21 @@ cdef class HED(Base): cdef float fuv(self, g1, g2, str n1, str n2): """ Compute the Node Distance function - :param g1: first graph - :param g2: second graph - :param n1: node of the first graph - :param n2: node of the second graph - :return: + Parameters + ---------- + g1 : gmatch4py.Graph + First graph + g2 : gmatch4py.Graph + Second graph + n1 : int or str + identifier of the first node + n2 : int or str + identifier of the second node + + Returns + ------- + float + node distance """ if n2 == None: # Del return self.node_del + ((self.edge_del / 2.) 
* g1.degree(n1)) @@ -118,11 +143,21 @@ cdef class HED(Base): cdef float hed_edge(self, g1, g2, str n1, str n2): """ Compute HEDistance between edges of n1 and n2, respectively in g1 and g2 - :param g1: first graph - :param g2: second graph - :param n1: node of the first graph - :param n2: node of the second graph - :return: + Parameters + ---------- + g1 : gmatch4py.Graph + First graph + g2 : gmatch4py.Graph + Second graph + n1 : int or str + identifier of the first node + n2 : int or str + identifier of the second node + + Returns + ------- + float + HEDistance between g1 and g2 """ return self.sum_gpq(g1, n1, g2, n2) + self.sum_gpq(g1, n1, g2, n2) @@ -130,15 +165,25 @@ cdef class HED(Base): cdef float sum_gpq(self, g1, str n1, g2, str n2): """ Compute Nearest Neighbour Distance between edges around n1 in G1 and edges around n2 in G2 - :param g1: first graph - :param n1: node in the first graph - :param g2: second graph - :param n2: node in the second graph - :return: + Parameters + ---------- + g1 : gmatch4py.Graph + First graph + g2 : gmatch4py.Graph + Second graph + n1 : int or str + identifier of the first node + n2 : int or str + identifier of the second node + + Returns + ------- + float + Nearest Neighbour Distance """ #if isinstance(g1, nx.MultiDiGraph): - cdef list edges1 = g1.get_edges_no(n1) if n1 else [] + cdef list edges1 = g1.get_edges_no(n1) if n1 else [] # rename method ... 
cdef list edges2 = g2.get_edges_no(n2) if n2 else [] cdef np.ndarray min_sum = np.zeros(len(edges1)) @@ -154,9 +199,16 @@ cdef class HED(Base): cdef float gpq(self, str e1, str e2): """ Compute the edge distance function - :param e1: edge1 - :param e2: edge2 - :return: + Parameters + ---------- + e1 : str + first edge identifier + e2 : str + second edge identifier + Returns + ------- + float + edge distance """ if e2 == None: # Del return self.edge_del diff --git a/gmatch4py/helpers/general.pyx b/gmatch4py/helpers/general.pyx index c39560fe31152a93a48ee055b3e92e37f0b10759..0afce55a4524e7e6eaf42795cdc4a6c3d52de175 100644 --- a/gmatch4py/helpers/general.pyx +++ b/gmatch4py/helpers/general.pyx @@ -2,6 +2,22 @@ from ..graph cimport Graph import networkx as nx def parsenx2graph(list_gs,node_attr_key="",edge_attr_key=""): + """ + Parse list of Networkx graphs into Gmatch4py graph format + Parameters + ---------- + list_gs : list + list of NetworkX graphs + node_attr_key : str + node attribute used for the hash + edge_attr_key: str + edge attribute used for the hash + + Returns + ------- + list + list of gmatch4py.Graph + """ new_gs=[nx.relabel_nodes(g,{node:str(node) for node in list(g.nodes)},copy=True) for g in list_gs] new_gs=[Graph(g,node_attr_key,edge_attr_key) for g in new_gs] return new_gs diff --git a/gmatch4py/jaccard.pyx b/gmatch4py/jaccard.pyx index 6e0717c683239f5d11653db88cafd9e6258b16d1..6b0bfe74b131c8c573fd14b47677a2a594593903 100644 --- a/gmatch4py/jaccard.pyx +++ b/gmatch4py/jaccard.pyx @@ -4,7 +4,6 @@ import numpy as np cimport numpy as np from .base cimport Base -from .base cimport intersection,union_ from .helpers.general import parsenx2graph from cython.parallel cimport prange,parallel cimport cython @@ -15,31 +14,6 @@ cdef class Jaccard(Base): Base.__init__(self,0,True) - cpdef np.ndarray compare_old(self,list listgs, list selected): - cdef int n = len(listgs) - cdef np.ndarray comparison_matrix = np.zeros((n, n)) - cdef int i,j - for i in range(n): - for j in 
range(i,n): - g1,g2=listgs[i],listgs[j] - f=self.isAccepted(g1,i,selected) - if f: - inter_g=intersection(g1,g2) - union_g=union_(g1,g2) - if union_g.number_of_nodes() == 0 or union_g.number_of_edges()== 0: - comparison_matrix[i, j] = 0. - else: - comparison_matrix[i,j]=\ - ((inter_g.number_of_nodes())/(union_g.number_of_nodes()))\ - *\ - ((union_g.number_of_edges())/(union_g.number_of_edges())) - else: - comparison_matrix[i, j] = 0. - - comparison_matrix[j, i] = comparison_matrix[i, j] - - return comparison_matrix - @cython.boundscheck(False) cpdef np.ndarray compare(self,list listgs, list selected): cdef int n = len(listgs) diff --git a/gmatch4py/kernels/shortest_path_kernel.pyx b/gmatch4py/kernels/shortest_path_kernel.pyx index 4212e8d88ae083660a0d500b184094502229f29d..351b5bc1aa686e5063b3da4c6009f886c926c8e6 100644 --- a/gmatch4py/kernels/shortest_path_kernel.pyx +++ b/gmatch4py/kernels/shortest_path_kernel.pyx @@ -110,50 +110,11 @@ cdef class ShortestPathGraphKernel(Base): return np.nan_to_num(k_norm) - cpdef np.ndarray compare_single_core(self,list graph_list, list selected): - """Compute the all-pairs kernel values for a list of graphs. - This function can be used to directly compute the kernel - matrix for a list of graphs. The direct computation of the - kernel matrix is faster than the computation of all individual - pairwise kernel values. - Parameters - ---------- - graph_list: list - A list of graphs (list of networkx graphs) - Return - ------ - K: numpy.array, shape = (len(graph_list), len(graph_list)) - The similarity matrix of all graphs in graph_list. 
- """ - cdef int n = len(graph_list) - cdef double[:,:] k = np.zeros((n, n)) - - cdef list adjacency_matrices = [[None for i in range(n)]for j in range(n)] - cdef int i,j - for i in range(n): - for j in range(i, n): - adjacency_matrices[i][j] = get_adjacency(graph_list[i],graph_list[j]) - adjacency_matrices[j][i] = adjacency_matrices[i][j] - - for i in range(n): - for j in range(i, n): - if len(graph_list[i]) > 0 and len(graph_list[j]) >0: - a,b=adjacency_matrices[i][j] - k[i][j] = self.compare_two(a,b) - k[j][i] = k[i][j] - - k_norm = np.zeros((n,n)) - for i in range(n): - for j in range(i,n): - k_norm[i, j] = k[i][j] / np.sqrt(k[i][i] * k[j][j]) - k_norm[j, i] = k_norm[i, j] - - return np.nan_to_num(k_norm) -cdef class ShortestPathGraphKernelDotMatrix(ShortestPathGraphKernel): +cdef class ShortestPathGraphKernelDotCostMatrix(ShortestPathGraphKernel): """ - Shorthest path graph kernel. + Instead of just multiplying the count of distance values found between nodes of each graph, this version proposes to multiply the node distance matrix generated from each graph. """ def __init__(self): ShortestPathGraphKernel.__init__(self) diff --git a/gmatch4py/mcs.pyx b/gmatch4py/mcs.pyx index c0c117fa4051c7284337a99fb643a1472e2e10f5..574b5a7f7284d80adbd9859826249515a4ca0b9e 100644 --- a/gmatch4py/mcs.pyx +++ b/gmatch4py/mcs.pyx @@ -15,20 +15,6 @@ cdef class MCS(Base): def __init__(self): Base.__init__(self,0,True) - cpdef np.ndarray compare_old(self,list listgs, list selected): - cdef int n = len(listgs) - cdef np.ndarray comparison_matrix = np.zeros((n, n)) - for i in range(n): - for j in range(i, n): - g1,g2=listgs[i],listgs[j] - f=self.isAccepted(g1,i,selected) - if f: - comparison_matrix[i, j] = self.s_mcs(g1,g2) - else: - comparison_matrix[i, j] = 0. 
- comparison_matrix[j, i] = comparison_matrix[i, j] - return comparison_matrix - @cython.boundscheck(False) cpdef np.ndarray compare(self,list listgs, list selected): cdef int n = len(listgs) diff --git a/gmatch4py/vertex_edge_overlap.pyx b/gmatch4py/vertex_edge_overlap.pyx index a6635a43f12d559cd41294c5f10c7564f8ec7156..e9fd66a1f1422bf3e9937549e96899fc8981b3d2 100644 --- a/gmatch4py/vertex_edge_overlap.pyx +++ b/gmatch4py/vertex_edge_overlap.pyx @@ -2,11 +2,13 @@ import numpy as np cimport numpy as np -from .base cimport Base,intersection + from .graph cimport Graph from cython.parallel cimport prange,parallel from .helpers.general import parsenx2graph cimport cython +from .base cimport Base + cdef class VertexEdgeOverlap(Base): """ @@ -17,27 +19,7 @@ cdef class VertexEdgeOverlap(Base): Code Author : Jacques Fize """ def __init__(self): - Base.__init__(self,0,True) - - cpdef np.ndarray compare_old(self,list listgs, list selected): - n = len(listgs) - cdef np.ndarray comparison_matrix = np.zeros((n, n)) - cdef list inter_ver,inter_ed - cdef int denom,i,j - for i in range(n): - for j in range(i,n): - g1,g2 = listgs[i],listgs[j] - f=self.isAccepted(g1,i,selected) - if f: - inter_g= intersection(g1,g2) - denom=g1.number_of_nodes()+g2.number_of_nodes()+\ - g1.number_of_edges()+g2.number_of_edges() - if denom == 0: - continue - comparison_matrix[i,j]=(2*(inter_g.number_of_nodes() - +inter_g.number_of_edges()))/denom # Data = True --> For nx.MultiDiGraph - comparison_matrix[j, i] = comparison_matrix[i, j] - return comparison_matrix + Base.__init__(self,0,True) @cython.boundscheck(False) cpdef np.ndarray compare(self,list listgs, list selected): diff --git a/setup.py b/setup.py index 6c670844aec5a6a720269a212be5c5be469e46f4..16c333471014e20750403006906dfe5e0b107fae 100644 --- a/setup.py +++ b/setup.py @@ -49,10 +49,10 @@ def makeExtension(extName): # get the list of extensions extNames = scandir("gmatch4py") -print(extNames) + # and build up the set of Extension objects 
extensions = cythonize([makeExtension(name) for name in extNames]) -print(extensions) + from os import path this_directory = path.abspath(path.dirname(__file__)) with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f: