#! /usr/bin/env python
# -*- coding: utf-8 -*-

import os
import random

from multiprocessing import cpu_count

import networkx as nx
import numpy as np
cimport numpy as np
from six.moves import range

from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from joblib import Parallel, delayed
import psutil

cimport cython
from ..base cimport Base
import graph as graph2
import walks as serialized_walks
from skipgram import Skipgram


# Pin this process to all available CPUs. psutil < 2.0 exposed
# set_cpu_affinity(); psutil >= 2.0 renamed it to cpu_affinity(); on platforms
# without affinity support (e.g. macOS) neither exists, so fail silently.
p = psutil.Process(os.getpid())
try:
    p.set_cpu_affinity(list(range(cpu_count())))
except AttributeError:
    try:
        p.cpu_affinity(list(range(cpu_count())))
    except AttributeError:
        pass


def process(gr, number_walks=10, walk_length=40, window_size=5,
            vertex_freq_degree=False, workers=1, representation_size=64,
            max_memory_data_size=1000000000, seed=0):
    """
    Return a DeepWalk embedding for a graph
    
    Parameters
    ----------
    gr : nx.Graph
        graph
    number_walks : int, optional
        Number of walk (the default is 10)
    walk_length : int, optional
        Length of the random walk started at each node (the default is 40)
    window_size : int, optional
        Window size of skipgram model. (the default is 5)
    vertex_freq_degree : bool, optional
        Use vertex degree to estimate the frequency of nodes (the default is False)
    workers : int, optional
        Number of parallel processes (the default is 1)
    representation_size : int, optional
        Number of latent dimensions to learn for each node (the default is 64)
    max_memory_data_size : int, optional
        'Size to start dumping walks to disk, instead of keeping them in memory. (the default is 1000000000)
    seed : int, optional
        Seed for random walk generator (the default is 0)
    
    Returns
    -------
    np.array
        DeepWalk embedding
    """
    
    if len(gr.edges()) < 1:
        # No edges: nothing to walk on, return a single zero vector.
        return np.zeros((1, representation_size))
    # Directed inputs are symmetrized (reverse edges added) so that random
    # walks can traverse every edge in both directions; undirected inputs
    # already have a symmetric adjacency after conversion.
    G = graph2.from_networkx(gr.copy(), undirected=gr.is_directed())
    num_walks = len(G.nodes()) * number_walks

    data_size = num_walks * walk_length

    #print("Data size (walks*length): {}".format(data_size))

    if data_size < max_memory_data_size:
        #print("Walking...")
        walks = graph2.build_deepwalk_corpus(G, num_paths=number_walks,
                                             path_length=walk_length, alpha=0,
                                             rand=random.Random(seed))
        #print("Training...")
        model = Word2Vec(walks, size=representation_size, window=window_size,
                         min_count=0, sg=1, hs=1, workers=workers)
    else:
        #print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(
        #    data_size, max_memory_data_size))
        #print("Walking...")

        walks_filebase = "temp.walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G, walks_filebase, num_paths=number_walks, path_length=walk_length,
            alpha=0, rand=random.Random(seed), num_workers=workers)

        #print("Counting vertex frequency...")
        if not vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(
                walk_files, workers)
        else:
            # Use the degree distribution as vertex frequency for the
            # hierarchical-softmax tree instead of counting occurrences.
            vertex_counts = G.degree(nodes=G.iterkeys())

        #print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=representation_size, window=window_size,
                         min_count=0, trim_rule=None, workers=workers)

    return model.wv.vectors
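
# Minimal usage sketch for `process` (illustrative only; assumes this module
# has been compiled and networkx is installed):
#
#   import networkx as nx
#   emb = process(nx.karate_club_graph(), number_walks=5, walk_length=20)
#   # `emb` has one row per node and `representation_size` columns.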


cdef class DeepWalk(Base):
    """
    Based on:
    @inproceedings{Perozzi:2014:DOL:2623330.2623732,
        author = {Perozzi, Bryan and Al-Rfou, Rami and Skiena, Steven},
        title = {DeepWalk: Online Learning of Social Representations},
        booktitle = {Proceedings of the 20th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
        series = {KDD '14},
        year = {2014},
        isbn = {978-1-4503-2956-9},
        location = {New York, New York, USA},
        pages = {701--710},
        numpages = {10},
        url = {http://doi.acm.org/10.1145/2623330.2623732},
        doi = {10.1145/2623330.2623732},
        acmid = {2623732},
        publisher = {ACM},
        address = {New York, NY, USA},
        keywords = {deep learning, latent representations, learning with partial labels, network classification, online learning, social networks},
    }

    Original code: https://github.com/phanein/deepwalk

    Modified by: Jacques Fize
    """

    def __init__(self):
        Base.__init__(self, 0, True)

    def extract_embedding(self, listgs):
        """
        Extract DeepWalk embedding of each graph in `listgs`
        
        Parameters
        ----------
        listgs : list
            list of graphs
        
        Returns
        -------
        list
            list of embedding matrices (np.ndarray), one per graph
        """
        
        from tqdm import tqdm
        # Embed each graph independently, one job per CPU core.
        models = Parallel(n_jobs=cpu_count())(
            delayed(process)(nx.Graph(g))
            for g in tqdm(listgs, desc="Extracting Embeddings..."))
        return models

    @cython.boundscheck(False)
    cpdef np.ndarray compare(self, list listgs, list selected):
        # `selected` is ignored: similarity is computed for every pair.
        models = self.extract_embedding(listgs)
        # Average the node embeddings to obtain one vector per graph.
        vector_matrix = np.array([mod.mean(axis=0) for mod in models])
        cs = cosine_similarity(vector_matrix)
        return cs
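
# Usage sketch (illustrative only; `DeepWalk` must be compiled inside its
# package because of the `..base` cimport):
#
#   import networkx as nx
#   comparator = DeepWalk()
#   graphs = [nx.karate_club_graph(), nx.path_graph(10)]
#   sim = comparator.compare(graphs, selected=None)
#   # `sim[i, j]` is the cosine similarity between the mean node embeddings
#   # of graphs i and j.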