diff --git a/README.md b/README.md
index 63525a8f34f191f9f283369752ff437bfa010d54..3c7bf65c17f7e5909502da1470bd443681e69528 100644
--- a/README.md
+++ b/README.md
@@ -86,6 +86,7 @@ ged.set_attr_graph_used("theme","color") # Edge colors and node themes attribute
    * Graph2Vec [1]
  * Node Embedding
    * DeepWalk [7]
+   * Node2vec [8]
  * Graph kernels
    * Random Walk Kernel (*debug needed*) [3]
      * Geometrical
@@ -114,6 +115,7 @@ ged.set_attr_graph_used("theme","color") # Edge colors and node themes attribute
  * [5] Fischer, A., Riesen, K., & Bunke, H. (2017). Improved quadratic time approximation of graph edit distance by combining Hausdorff matching and greedy assignment. Pattern Recognition Letters, 87, 55-62.
  * [6] A graph distance metric based on the maximal common subgraph, H. Bunke and K. Shearer, Pattern Recognition Letters, 1998
  * [7] Perozzi, B., Al-Rfou, R., & Skiena, S. (2014, August). Deepwalk: Online learning of social representations. In Proceedings of the 20th ACM SIGKDD international conference on Knowledge discovery and data mining (pp. 701-710). ACM.
+ * [8] node2vec: Scalable Feature Learning for Networks. Aditya Grover and Jure Leskovec. Knowledge Discovery and Data Mining, 2016.
 
 ## Author(s)
 
@@ -125,6 +127,10 @@ each code is associated with a reference to the original.**
 
 ## CHANGELOG
 
+### 12.03.2019
+
+ * Add Node2vec
+
 ### 05.03.2019
 
  * Add Graph Embedding algorithms
diff --git a/gmatch4py/__init__.py b/gmatch4py/__init__.py
index 18e096c6616f9b3eb3d3aecb4d03f59d853c8023..82851c935baa6642e67c5a8fddfdc0c2bfd26fb3 100644
--- a/gmatch4py/__init__.py
+++ b/gmatch4py/__init__.py
@@ -13,6 +13,7 @@ from .kernels.shortest_path_kernel import *
 # Graph Embedding import
 from .embedding.graph2vec import *
 from .embedding.deepwalk import *
+from .embedding.node2vec import *
 # Helpers import
 from .helpers.reader import *
 from .helpers.general import *
diff --git a/gmatch4py/embedding/deepwalk.pyx b/gmatch4py/embedding/deepwalk.pyx
index 321f1082a28edf4dd0041187b4e87a8e890ea4d1..3bef7ba87cc3355e1468309632f59553cfc5a271 100644
--- a/gmatch4py/embedding/deepwalk.pyx
+++ b/gmatch4py/embedding/deepwalk.pyx
@@ -142,7 +142,7 @@ cdef class DeepWalk(Base):
     """
 
     def __init__(self):
-        Base.__init__(self,0,True)
+        Base.__init__(self,0,False)
 
     def extract_embedding(self, listgs):
         """
diff --git a/gmatch4py/embedding/graph2vec.pyx b/gmatch4py/embedding/graph2vec.pyx
index f7187b2747261ee492ac94d92c40e9e65f44b215..4300d4fec7d3116e9313cfd532ebe3cca3a201ab 100644
--- a/gmatch4py/embedding/graph2vec.pyx
+++ b/gmatch4py/embedding/graph2vec.pyx
@@ -173,7 +173,7 @@ cdef class Graph2Vec(Base):
     """
 
     def __init__(self):
-        Base.__init__(self,0,True)
+        Base.__init__(self,0,False)
 
     @cython.boundscheck(False)
     cpdef np.ndarray compare(self,list listgs, list selected):
diff --git a/gmatch4py/embedding/node2vec.pyx b/gmatch4py/embedding/node2vec.pyx
new file mode 100644
index 0000000000000000000000000000000000000000..4a188687ab7cbba32babdb7b031e99b68c870a8e
--- /dev/null
+++ b/gmatch4py/embedding/node2vec.pyx
@@ -0,0 +1,255 @@
+import random
+
+import numpy as np
+cimport numpy as np
+from gensim.models import Word2Vec
+from sklearn.metrics.pairwise import cosine_similarity
+
+from ..base cimport Base
+cimport cython
+from joblib import Parallel, delayed
+import networkx as nx
+
+
+class Graph():
+    def __init__(self, nx_G, is_directed, p, q):
+        self.G = nx_G
+        self.is_directed = is_directed
+        self.p = p
+        self.q = q
+
+    def node2vec_walk(self, walk_length, start_node):
+        '''
+        Simulate a random walk starting from start node.
+        '''
+        G = self.G
+        alias_nodes = self.alias_nodes
+        alias_edges = self.alias_edges
+
+        walk = [start_node]
+
+        while len(walk) < walk_length:
+            cur = walk[-1]
+            cur_nbrs = sorted(G.neighbors(cur))
+            if len(cur_nbrs) > 0:
+                if len(walk) == 1:
+                    walk.append(
+                        cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])])
+                else:
+                    prev = walk[-2]
+                    next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0],
+                                               alias_edges[(prev, cur)][1])]
+                    walk.append(next)
+            else:
+                break
+
+        return walk
+
+    def simulate_walks(self, num_walks, walk_length):
+        '''
+        Repeatedly simulate random walks from each node.
+        '''
+        # sys.stdout.write("\r")
+        G = self.G
+        walks = []
+        nodes = list(G.nodes)
+        for walk_iter in range(num_walks):
+            # sys.stdout.write(
+            #     '\rWalk iteration: {0}/{1}'.format(walk_iter + 1, num_walks))
+            random.shuffle(nodes)
+            for node in nodes:
+                walks.append(self.node2vec_walk(
+                    walk_length=walk_length, start_node=node))
+
+        return walks
+
+    def get_alias_edge(self, src, dst):
+        '''
+        Get the alias edge setup lists for a given edge.
+        '''
+        G = self.G
+        p = self.p
+        q = self.q
+
+        unnormalized_probs = []
+        for dst_nbr in sorted(G.neighbors(dst)):
+            if dst_nbr == src:
+                unnormalized_probs.append(G[dst][dst_nbr]['weight'] / p)
+            elif G.has_edge(dst_nbr, src):
+                unnormalized_probs.append(G[dst][dst_nbr]['weight'])
+            else:
+                unnormalized_probs.append(G[dst][dst_nbr]['weight'] / q)
+        norm_const = sum(unnormalized_probs)
+        normalized_probs = [
+            float(u_prob) / norm_const for u_prob in unnormalized_probs]
+
+        return alias_setup(normalized_probs)
+
+    def preprocess_transition_probs(self):
+        '''
+        Preprocessing of transition probabilities for guiding the random walks.
+        '''
+        G = self.G
+        is_directed = self.is_directed
+
+        alias_nodes = {}
+        for node in list(G.nodes):
+            unnormalized_probs = [G[node][nbr]['weight']
+                                  for nbr in sorted(G.neighbors(node))]
+            norm_const = sum(unnormalized_probs)
+            normalized_probs = [
+                float(u_prob) / norm_const for u_prob in unnormalized_probs]
+            alias_nodes[node] = alias_setup(normalized_probs)
+
+        alias_edges = {}
+        triads = {}
+
+        if is_directed:
+            for edge in list(G.edges()):
+                alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
+        else:
+            for edge in list(G.edges()):
+                alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
+                alias_edges[(edge[1], edge[0])] = self.get_alias_edge(
+                    edge[1], edge[0])
+
+        self.alias_nodes = alias_nodes
+        self.alias_edges = alias_edges
+
+        return
+
+
+def alias_setup(probs):
+    '''
+    Compute utility lists for non-uniform sampling from discrete distributions.
+    Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
+    for details
+    '''
+    K = len(probs)
+    q = np.zeros(K)
+    J = np.zeros(K, dtype=np.int)
+
+    smaller = []
+    larger = []
+    for kk, prob in enumerate(probs):
+        q[kk] = K * prob
+        if q[kk] < 1.0:
+            smaller.append(kk)
+        else:
+            larger.append(kk)
+
+    while len(smaller) > 0 and len(larger) > 0:
+        small = smaller.pop()
+        large = larger.pop()
+
+        J[small] = large
+        q[large] = q[large] + q[small] - 1.0
+        if q[large] < 1.0:
+            smaller.append(large)
+        else:
+            larger.append(large)
+
+    return J, q
+
+
+def alias_draw(J, q):
+    '''
+    Draw sample from a non-uniform discrete distribution using alias sampling.
+    '''
+    K = len(J)
+
+    kk = int(np.floor(np.random.rand() * K))
+    if np.random.rand() < q[kk]:
+        return kk
+    else:
+        return J[kk]
+
+
+def learn_embeddings(walks, dimensions, window_size, nb_workers, nb_iter):
+    '''
+    Learn embeddings by optimizing the Skipgram objective using SGD.
+    '''
+    walks_ = [list(map(str, walk)) for walk in walks]
+    model = Word2Vec(walks_, size=dimensions, window=window_size,
+                     min_count=0, sg=1, workers=nb_workers, iter=nb_iter)
+    return model
+
+
+def compute_graph_model(nx_graph, **kwargs):
+    '''
+    Pipeline for representational learning for all nodes in a graph.
+    @param nx_graph
+    @kwarg p: int
+    @kwarg q: int
+    '''
+    p = kwargs.get("p", 1)
+    q = kwargs.get("q", 1)
+    dimensions = kwargs.get("dimensions", 128)
+    window_size = kwargs.get("window_size", 10)
+    nb_workers = kwargs.get("nb_workers", 8)
+    nb_iter = kwargs.get("nb_iter", 1)
+    num_walks = kwargs.get("num_walks", 10)
+    walk_length = kwargs.get("walk_length", 80)
+    directed = kwargs.get("directed", False)
+
+    G = Graph(nx_graph, directed, p, q)
+    G.preprocess_transition_probs()
+    walks = G.simulate_walks(num_walks, walk_length)
+    return learn_embeddings(walks, dimensions, window_size, nb_workers, nb_iter).wv.vectors
+
+
+cdef class Node2Vec(Base):
+    """
+    Extract the Node2vec embedding of each graph in `listgs` and compare the graphs
+    through these embeddings.
+
+    Based on :
+    @inproceedings{Grover:2016:NSF:2939672.2939754,
+    author = {Grover, Aditya and Leskovec, Jure},
+    title = {Node2Vec: Scalable Feature Learning for Networks},
+    booktitle = {Proceedings of the 22Nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
+    series = {KDD '16},
+    year = {2016},
+    isbn = {978-1-4503-4232-2},
+    location = {San Francisco, California, USA},
+    pages = {855--864},
+    numpages = {10},
+    url = {http://doi.acm.org/10.1145/2939672.2939754},
+    doi = {10.1145/2939672.2939754},
+    acmid = {2939754},
+    publisher = {ACM},
+    address = {New York, NY, USA},
+    keywords = {feature learning, graph representations, information networks, node embeddings},
+    }
+
+    Original code : https://github.com/aditya-grover/node2vec
+
+    Modified by : Jacques Fize
+    """
+
+    def __init__(self):
+        Base.__init__(self,0,False)
+
+    def extract_embedding(self, listgs):
+        """
+        Extract the Node2vec embedding of each graph in `listgs`
+
+        Parameters
+        ----------
+        listgs : list
+            list of graphs
+
+        Returns
+        -------
+        list
+            list of embeddings
+        """
+        from tqdm import tqdm
+        models = Parallel(n_jobs = self.cpu_count)(delayed(compute_graph_model)(g,directed=g.is_directed()) for g in tqdm(listgs,desc="Extracting Embeddings..."))
+        return models
+
+    @cython.boundscheck(False)
+    cpdef np.ndarray compare(self,list listgs, list selected):
+        # Selected is ignored
+        [nx.set_edge_attributes(g,1,'weight') for g in listgs]
+        models = self.extract_embedding(listgs)
+        vector_matrix = np.array([mod.mean(axis=0) for mod in models])  # Average node representations
+        cs = cosine_similarity(vector_matrix)
+        return cs
diff --git a/gmatch4py/test/test.py b/gmatch4py/test/test.py
deleted file mode 100644
index 52937c4691eea6255faf78316daee41bac1183ec..0000000000000000000000000000000000000000
--- a/gmatch4py/test/test.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import pytest
-
-
-def nimport():
-    # Gmatch4py use networkx graph
-    import networkx as nx
-
-
-def test_import():
-    nimport()
-
-def ged():
-    import networkx as nx
-    # import the GED using the munkres algorithm
-
-
-def test_ged():
-    ged()
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 16c333471014e20750403006906dfe5e0b107fae..e4811db611649a30cbed72bec39ec7d6e7a6d001 100644
--- a/setup.py
+++ b/setup.py
@@ -71,7 +71,7 @@ setup(
     cmdclass={'build_ext': build_ext},
     setup_requires=requirements,
     install_requires=requirements,
-    version="0.2.4.3beta",
+    version="0.2.5b",
     classifiers=[
         "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
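
Below is a minimal usage sketch for the new `Node2Vec` comparator introduced by this patch. It assumes the Cython extension has been rebuilt with the new `node2vec` module and that `gensim`, `joblib`, `scikit-learn`, `networkx`, and `tqdm` are installed; the `gm` alias and the random toy graphs are illustrative assumptions, not part of the patch. Only `Node2Vec().compare(listgs, selected)` (with `selected` ignored) comes from the code above.

```python
# Minimal usage sketch (assumes gmatch4py was rebuilt with the new node2vec module).
import networkx as nx
import gmatch4py as gm

# Illustrative toy input: three small random graphs.
graphs = [nx.erdos_renyi_graph(30, 0.2, seed=i) for i in range(3)]

comparator = gm.Node2Vec()
# `selected` is ignored by this comparator, so None is enough.
similarity = comparator.compare(graphs, None)
print(similarity)  # cosine-similarity matrix of the averaged node embeddings
```

As implemented in the diff, `compare` assigns a unit `weight` to every edge, extracts a node2vec model per graph in parallel, averages each graph's node vectors, and returns the pairwise cosine-similarity matrix, so higher values indicate more similar graphs.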