Add Node2vec + debug + change readme

8d0f0946 · Fize Jacques · 17849210 · 8d0f0946 · 8d0f0946 · 8d0f0946
Commit 8d0f0946 authored 6 years ago by Fize Jacques
Hide whitespace changes
Inline Side-by-side

Showing

with 265 additions and 21 deletions
+265 -21
--- a/README.md
+++ b/README.md
@@ -86,6 +86,7 @@ ged.set_attr_graph_used("theme","color") # Edge colors and node themes attribute
    * Graph2Vec [1]
 * Node Embedding
    * DeepWalk [7]
+    * Node2vec [8]
 * Graph kernels
    * Random Walk Kernel (*debug needed*) [3]
        * Geometrical 
@@ -114,6 +115,7 @@ ged.set_attr_graph_used("theme","color") # Edge colors and node themes attribute
  * [5] Fischer, A., Riesen, K., & Bunke, H. (2017). Improved quadratic time approximation of graph edit distance by combining Hausdorff matching and greedy assignment. Pattern Recognition Letters, 87, 55-62.
  * [6] A graph distance metric based on the maximal common subgraph, H. Bunke and K. Shearer, Pattern Recognition Letters, 1998  
  * [7] Perozzi, B., Al-Rfou, R., & Skiena, S. (2014, August). Deepwalk: Online learning of social representations. In Proceedings of the 20th ACM SIGKDD international conference on Knowledge discovery and data mining (pp. 701-710). ACM.
+  * [8] node2vec: Scalable Feature Learning for Networks. Aditya Grover and Jure Leskovec. Knowledge Discovery and Data Mining, 2016.

 ## Author(s)

@@ -125,6 +127,10 @@ each code is associated with a reference to the original.**

 ## CHANGELOG

+### 12.03.2019
+
+ * Add Node2vec
+
 ### 05.03.2019

 * Add Graph Embedding algorithms

--- a/gmatch4py/__init__.py
+++ b/gmatch4py/__init__.py
@@ -13,6 +13,7 @@ from .kernels.shortest_path_kernel import *
 # Graph Embedding import
 from .embedding.graph2vec import *
 from .embedding.deepwalk import *
+from .embedding.node2vec import *
 # Helpers import
 from .helpers.reader import *
 from .helpers.general import *

--- a/gmatch4py/embedding/deepwalk.pyx
+++ b/gmatch4py/embedding/deepwalk.pyx
@@ -142,7 +142,7 @@ cdef class DeepWalk(Base):
    """

    def __init__(self):
-        Base.__init__(self,0,True)
+        Base.__init__(self,0,False)

    def extract_embedding(self, listgs):
        """

--- a/gmatch4py/embedding/graph2vec.pyx
+++ b/gmatch4py/embedding/graph2vec.pyx
@@ -173,7 +173,7 @@ cdef class Graph2Vec(Base):
    """

    def __init__(self):
-        Base.__init__(self,0,True)
+        Base.__init__(self,0,False)

    @cython.boundscheck(False)
    cpdef np.ndarray compare(self,list listgs, list selected):

--- a/gmatch4py/embedding/node2vec.pyx
+++ b/gmatch4py/embedding/node2vec.pyx
+import random
+
+import numpy as np
+cimport numpy as np
+from gensim.models import Word2Vec
+from sklearn.metrics.pairwise import cosine_similarity
+
+from ..base cimport Base
+cimport cython
+from joblib import Parallel, delayed
+import networkx as nx
+
+class Graph():
+    def __init__(self, nx_G, is_directed, p, q):
+        self.G = nx_G
+        self.is_directed = is_directed
+        self.p = p
+        self.q = q
+
+    def node2vec_walk(self, walk_length, start_node):
+        '''
+        Simulate a random walk starting from start node.
+        '''
+        G = self.G
+        alias_nodes = self.alias_nodes
+        alias_edges = self.alias_edges
+
+        walk = [start_node]
+
+        while len(walk) < walk_length:
+            cur = walk[-1]
+            cur_nbrs = sorted(G.neighbors(cur))
+            if len(cur_nbrs) > 0:
+                if len(walk) == 1:
+                    walk.append(
+                        cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])])
+                else:
+                    prev = walk[-2]
+                    next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0],
+                                               alias_edges[(prev, cur)][1])]
+                    walk.append(next)
+            else:
+                break
+
+        return walk
+
+    def simulate_walks(self, num_walks, walk_length):
+        '''
+        Repeatedly simulate random walks from each node.
+        '''
+        # sys.stdout.write("\r")
+        G = self.G
+        walks = []
+        nodes = list(G.nodes)
+        for walk_iter in range(num_walks):
+            # sys.stdout.write(
+            #     '\rWalk iteration: {0}/{1}'.format(walk_iter + 1, num_walks))
+            random.shuffle(nodes)
+            for node in nodes:
+                walks.append(self.node2vec_walk(
+                    walk_length=walk_length, start_node=node))
+
+        return walks
+
+    def get_alias_edge(self, src, dst):
+        '''
+        Get the alias edge setup lists for a given edge.
+        '''
+        G = self.G
+        p = self.p
+        q = self.q
+
+        unnormalized_probs = []
+        for dst_nbr in sorted(G.neighbors(dst)):
+            if dst_nbr == src:
+                unnormalized_probs.append(G[dst][dst_nbr]['weight'] / p)
+            elif G.has_edge(dst_nbr, src):
+                unnormalized_probs.append(G[dst][dst_nbr]['weight'])
+            else:
+                unnormalized_probs.append(G[dst][dst_nbr]['weight'] / q)
+        norm_const = sum(unnormalized_probs)
+        normalized_probs = [
+            float(u_prob) / norm_const for u_prob in unnormalized_probs]
+
+        return alias_setup(normalized_probs)
+
+    def preprocess_transition_probs(self):
+        '''
+        Preprocessing of transition probabilities for guiding the random walks.
+        '''
+        G = self.G
+        is_directed = self.is_directed
+
+        alias_nodes = {}
+        for node in list(G.nodes):
+            unnormalized_probs = [G[node][nbr]['weight']
+                                  for nbr in sorted(G.neighbors(node))]
+            norm_const = sum(unnormalized_probs)
+            normalized_probs = [
+                float(u_prob) / norm_const for u_prob in unnormalized_probs]
+            alias_nodes[node] = alias_setup(normalized_probs)
+
+        alias_edges = {}
+        triads = {}
+
+        if is_directed:
+            for edge in list(G.edges()):
+                alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
+        else:
+            for edge in list(G.edges()):
+                alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
+                alias_edges[(edge[1], edge[0])] = self.get_alias_edge(
+                    edge[1], edge[0])
+
+        self.alias_nodes = alias_nodes
+        self.alias_edges = alias_edges
+
+        return
+
+
+def alias_setup(probs):
+    '''
+    Compute utility lists for non-uniform sampling from discrete distributions.
+    Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
+    for details
+    '''
+    K = len(probs)
+    q = np.zeros(K)
+    J = np.zeros(K, dtype=np.int)
+
+    smaller = []
+    larger = []
+    for kk, prob in enumerate(probs):
+        q[kk] = K * prob
+        if q[kk] < 1.0:
+            smaller.append(kk)
+        else:
+            larger.append(kk)
+
+    while len(smaller) > 0 and len(larger) > 0:
+        small = smaller.pop()
+        large = larger.pop()
+
+        J[small] = large
+        q[large] = q[large] + q[small] - 1.0
+        if q[large] < 1.0:
+            smaller.append(large)
+        else:
+            larger.append(large)
+
+    return J, q
+
+
+def alias_draw(J, q):
+    '''
+    Draw sample from a non-uniform discrete distribution using alias sampling.
+    '''
+    K = len(J)
+
+    kk = int(np.floor(np.random.rand() * K))
+    if np.random.rand() < q[kk]:
+        return kk
+    else:
+        return J[kk]
+
+
+def learn_embeddings(walks, dimensions, window_size, nb_workers, nb_iter):
+    '''
+    Learn embeddings by optimizing the Skipgram objective using SGD.
+    '''
+    walks_ = [list(map(str, walk)) for walk in walks]
+    model = Word2Vec(walks_, size=dimensions, window=window_size,
+                     min_count=0, sg=1, workers=nb_workers, iter=nb_iter)
+    return model
+
+
+def compute_graph_model(nx_graph, **kwargs):
+    '''
+    Pipeline for representational learning for all nodes in a graph.
+        @param nx_graph
+        @kwarg p: int
+        @kwarg q: int
+    '''
+    p = kwargs.get("p", 1)
+    q = kwargs.get("q", 1)
+    dimensions = kwargs.get("dimensions", 128)
+    window_size = kwargs.get("window_size", 10)
+    nb_workers = kwargs.get("nb_workers", 8)
+    nb_iter = kwargs.get("nb_iter", 1)
+    num_walks = kwargs.get("num_walks", 10)
+    walk_length = kwargs.get("walk_length", 80)
+    directed = kwargs.get("directed", False)
+
+    G = Graph(nx_graph, directed, p, q)
+    G.preprocess_transition_probs()
+    walks = G.simulate_walks(num_walks, walk_length)
+    return learn_embeddings(walks, dimensions, window_size, nb_workers, nb_iter).wv.vectors
+
+cdef class Node2Vec(Base):
+    """
+    Based on :
+    Extract Node2vec embedding of each graph in `listgs`
+        @inproceedings{Grover:2016:NSF:2939672.2939754,
+             author = {Grover, Aditya and Leskovec, Jure},
+             title = {Node2Vec: Scalable Feature Learning for Networks},
+             booktitle = {Proceedings of the 22Nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
+             series = {KDD '16},
+             year = {2016},
+             isbn = {978-1-4503-4232-2},
+             location = {San Francisco, California, USA},
+             pages = {855--864},
+             numpages = {10},
+             url = {http://doi.acm.org/10.1145/2939672.2939754},
+             doi = {10.1145/2939672.2939754},
+             acmid = {2939754},
+             publisher = {ACM},
+             address = {New York, NY, USA},
+             keywords = {feature learning, graph representations, information networks, node embeddings},
+        }
+
+    Original code : https://github.com/aditya-grover/node2vec
+
+    Modified by : Jacques Fize
+    """
+
+    def __init__(self):
+        Base.__init__(self,0,False)
+
+    def extract_embedding(self, listgs):
+        """
+        Extract Node2vec embedding of each graph in `listgs`
+
+        Parameters
+        ----------
+        listgs : list
+            list of graphs
+
+        Returns
+        -------
+        list
+            list of embeddings
+        """
+
+        from tqdm import tqdm
+        models =  Parallel(n_jobs = self.cpu_count)(delayed(compute_graph_model)(g,directed=g.is_directed()) for g in tqdm(listgs,desc="Extracting Embeddings..."))
+        return models
+
+    @cython.boundscheck(False)
+    cpdef np.ndarray compare(self,list listgs, list selected):
+        # Selected is ignored
+        [nx.set_edge_attributes(g,1,'weight') for g in listgs]
+        models = self.extract_embedding(listgs)
+        vector_matrix = np.array([mod.mean(axis=0) for mod in models])   # Average nodes representations
+        cs = cosine_similarity(vector_matrix)
+        return cs
--- a/gmatch4py/test/test.py
+++ b/gmatch4py/test/test.py
-import pytest
-    
-
-def nimport():
-    # Gmatch4py use networkx graph 
-    import networkx as nx 
-
-
-def test_import():
-    nimport()
-
-def ged():
-    import networkx as nx
-    # import the GED using the munkres algorithm
-    
-
-def test_ged():
-    ged()
\ No newline at end of file
--- a/setup.py
+++ b/setup.py
@@ -71,7 +71,7 @@ setup(
    cmdclass={'build_ext': build_ext},
    setup_requires=requirements,
    install_requires=requirements,
-    version="0.2.4.3beta",
+    version="0.2.5b",
    classifiers=[
            "Programming Language :: Python :: 3",
            "License :: OSI Approved :: MIT License",