#! /usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import random
from io import open
from argparse import ArgumentParser, FileType, ArgumentDefaultsHelpFormatter
from collections import Counter
from concurrent.futures import ProcessPoolExecutor
import logging
from multiprocessing import cpu_count
import networkx as nx
import numpy as np
cimport numpy as np
from six import text_type as unicode
from six import iteritems
from six.moves import range
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from joblib import Parallel, delayed
import psutil
cimport cython
from ..base cimport Base
import graph as graph2
import walks as serialized_walks
from skipgram import Skipgram
p = psutil.Process(os.getpid())
try:
    # psutil < 2.0 exposed set_cpu_affinity(); newer versions renamed it to cpu_affinity().
    p.set_cpu_affinity(list(range(cpu_count())))
except AttributeError:
    try:
        p.cpu_affinity(list(range(cpu_count())))
    except AttributeError:
        # Platform without CPU affinity support (e.g. macOS): leave affinity untouched.
        pass

def process(gr, number_walks=10, walk_length=40, window_size=5, vertex_freq_degree=False, workers=1, representation_size=64, max_memory_data_size=1000000000, seed=0):
    """
    Return a DeepWalk embedding for a graph.

    Parameters
    ----------
    gr : nx.Graph
        graph
    number_walks : int, optional
        Number of walks started at each node (the default is 10)
    walk_length : int, optional
        Length of the random walk started at each node (the default is 40)
    window_size : int, optional
        Window size of the skipgram model (the default is 5)
    vertex_freq_degree : bool, optional
        Use vertex degree to estimate the frequency of nodes (the default is False)
    workers : int, optional
        Number of parallel processes (the default is 1)
    representation_size : int, optional
        Number of latent dimensions to learn for each node (the default is 64)
    max_memory_data_size : int, optional
        Size at which walks start being dumped to disk instead of kept in memory (the default is 1000000000)
    seed : int, optional
        Seed for the random walk generator (the default is 0)

    Returns
    -------
    np.array
        DeepWalk embedding
    """
    # A graph without edges yields no walks; return a single zero vector.
    if len(gr.edges()) < 1:
        return np.zeros((1, representation_size))

    G = graph2.from_networkx(gr.copy(), undirected=gr.is_directed())
    num_walks = len(G.nodes()) * number_walks
    data_size = num_walks * walk_length  # total number of walk steps

    if data_size < max_memory_data_size:
        # Walks fit in memory: build the corpus and train directly.
        walks = graph2.build_deepwalk_corpus(G, num_paths=number_walks,
                                             path_length=walk_length, alpha=0,
                                             rand=random.Random(seed))
        model = Word2Vec(walks, size=representation_size,
                         window=window_size, min_count=0, sg=1, hs=1, workers=workers)
    else:
        # Data size exceeds max_memory_data_size: dump walks to disk and stream them.
        walks_filebase = "temp.walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=number_walks,
                                                          path_length=walk_length, alpha=0,
                                                          rand=random.Random(seed),
                                                          num_workers=workers)
        if not vertex_freq_degree:
            # Count vertex frequencies from the serialized walk files.
            vertex_counts = serialized_walks.count_textfiles(walk_files, workers)
        else:
            # Use the degree distribution as a proxy for vertex frequency.
            vertex_counts = G.degree(nodes=G.iterkeys())
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=representation_size,
                         window=window_size, min_count=0, trim_rule=None, workers=workers)
    return model.wv.vectors
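
# A minimal usage sketch for `process` (illustrative only): it assumes this
# module has been compiled and imported, and that the installed gensim accepts
# the `size` keyword (gensim < 4.0, as used above). The graph and parameter
# values are placeholders, not project defaults.
#
#   import networkx as nx
#   g = nx.karate_club_graph()
#   emb = process(g, number_walks=5, walk_length=20, representation_size=32)
#   # `emb` has one 32-dimensional row per node kept in the Word2Vec vocabulary.
#   print(emb.shape)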

cdef class DeepWalk(Base):
    """
    DeepWalk graph embedding, based on:

    @inproceedings{Perozzi:2014:DOL:2623330.2623732,
     author = {Perozzi, Bryan and Al-Rfou, Rami and Skiena, Steven},
     title = {DeepWalk: Online Learning of Social Representations},
     booktitle = {Proceedings of the 20th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
     series = {KDD '14},
     year = {2014},
     isbn = {978-1-4503-2956-9},
     location = {New York, New York, USA},
     pages = {701--710},
     numpages = {10},
     url = {http://doi.acm.org/10.1145/2623330.2623732},
     doi = {10.1145/2623330.2623732},
     acmid = {2623732},
     publisher = {ACM},
     address = {New York, NY, USA},
     keywords = {deep learning, latent representations, learning with partial labels, network classification, online learning, social networks},
    }

    Original code : https://github.com/phanein/deepwalk
    Modified by : Jacques Fize
    """
    def __init__(self):
        Base.__init__(self, 0, True)
    def extract_embedding(self, listgs):
        """
        Extract the DeepWalk embedding of each graph in `listgs`.

        Parameters
        ----------
        listgs : list
            list of graphs

        Returns
        -------
        list
            list of node-embedding matrices, one per graph
        """
        from tqdm import tqdm
        # Each call to `process` returns the node-embedding matrix of one graph.
        models = Parallel(n_jobs=cpu_count())(delayed(process)(nx.Graph(g))
                                              for g in tqdm(listgs, desc="Extracting Embeddings..."))
        return models
    @cython.boundscheck(False)
    cpdef np.ndarray compare(self, list listgs, list selected):
        # `selected` is ignored: every pair of graphs is compared.
        models = self.extract_embedding(listgs)
        # Average the node representations to obtain one vector per graph.
        vector_matrix = np.array([mod.mean(axis=0) for mod in models])
        cs = cosine_similarity(vector_matrix)
        return cs
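
# Hedged end-to-end sketch for `DeepWalk.compare` (assumes the compiled
# extension is importable and instantiable as shown; the graphs below are
# synthetic examples, not data from this project):
#
#   import networkx as nx
#   graphs = [nx.erdos_renyi_graph(30, 0.2, seed=i) for i in range(3)]
#   sims = DeepWalk().compare(graphs, selected=None)
#   # `sims` is a 3x3 cosine-similarity matrix of mean node embeddings.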