Commit 90505e2c authored by Fize Jacques's avatar Fize Jacques

Add Graph Embedding (DeepWalk, Graph2Vec), ShortestPathKernel checked, debug...

Add Graph Embedding (DeepWalk, Graph2Vec), ShortestPathKernel checked, debug graph, add joblib in requirement
parent 35a27dee
@@ -117,7 +117,8 @@ Jacques Fize, *jacques[dot]fize[at]cirad[dot]fr*
Some algorithms from other projects were integrated to Gmatch4py. **Be assured that
each code is associated with a reference to the original.**
## CHANGELOG
### 25.02.2019
* Add new Graph class. Features: Cython extensions, precomputed values (degrees, neighbor info), hash representation of edges and nodes for faster comparison
* Some algorithms, such as the graph edit distances and Jaccard, are parallelized
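
The new embedding-based measures introduced in this commit can be used like the existing ones. A minimal sketch (assuming the package imports as `gmatch4py` and that the inputs are `networkx` graphs):

```python
import networkx as nx
import gmatch4py as gm

graphs = [nx.erdos_renyi_graph(20, 0.2) for _ in range(3)]

# Both measures embed each graph and return a pairwise cosine-similarity matrix;
# the second argument (`selected`) is ignored by these classes.
sim_deepwalk = gm.DeepWalk().compare(graphs, [])
sim_graph2vec = gm.Graph2Vec().compare(graphs, [])
```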
@@ -9,8 +9,13 @@ from .ged.hausdorff_edit_distance import *
# Kernels algorithms import
from .kernels.weisfeiler_lehman import *
from .kernels.shortest_path_kernel import *
# Graph Embedding import
from .embedding.graph2vec import *
from .embedding.deepwalk import *
# Helpers import
from .helpers.reader import *
from .helpers.general import *
# Basic algorithms import
from .bag_of_cliques import *
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import random
import networkx as nx
from io import open
from argparse import ArgumentParser, FileType, ArgumentDefaultsHelpFormatter
from collections import Counter
from concurrent.futures import ProcessPoolExecutor
import logging
import graph as graph2
import walks as serialized_walks
from gensim.models import Word2Vec
from skipgram import Skipgram
from six import text_type as unicode
from six import iteritems
from six.moves import range
cimport cython
from sklearn.metrics.pairwise import cosine_similarity
from ..base cimport Base
import numpy as np
cimport numpy as np
import psutil
from multiprocessing import cpu_count
from joblib import Parallel, delayed
# Spread the current process over all available cores; the psutil CPU-affinity
# API changed names across versions, hence the nested fallback.
p = psutil.Process(os.getpid())
try:
    p.set_cpu_affinity(list(range(cpu_count())))
except AttributeError:
    try:
        p.cpu_affinity(list(range(cpu_count())))
    except AttributeError:
        pass
def process(gr, number_walks=10, walk_length=40, window_size=5, vertex_freq_degree=False, workers=1, representation_size=64, max_memory_data_size=1000000000, seed=0):
    """Compute DeepWalk node embeddings for a networkx graph and return them as a (n_nodes, representation_size) array."""
    if len(gr.edges()) < 1:
        return np.zeros((1, representation_size))
G = graph2.from_networkx(gr.copy(), undirected=gr.is_directed())
num_walks = len(G.nodes()) * number_walks
data_size = num_walks * walk_length
#print("Data size (walks*length): {}".format(data_size))
if data_size < max_memory_data_size:
#print("Walking...")
walks = graph2.build_deepwalk_corpus(G, num_paths=number_walks,
path_length=walk_length, alpha=0, rand=random.Random(seed))
#print("Training...")
model = Word2Vec(walks, size=representation_size,
window=window_size, min_count=0, sg=1, hs=1, workers=workers)
else:
#print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(
# data_size, max_memory_data_size))
#print("Walking...")
walks_filebase = "temp.walks"
walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=number_walks,
path_length=walk_length, alpha=0, rand=random.Random(seed),
num_workers=workers)
#print("Counting vertex frequency...")
if not vertex_freq_degree:
vertex_counts = serialized_walks.count_textfiles(
walk_files, workers)
else:
# use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.keys())
#print("Training...")
walks_corpus = serialized_walks.WalksCorpus(walk_files)
model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
size=representation_size,
window=window_size, min_count=0, trim_rule=None, workers=workers)
return model.wv.vectors
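# Usage sketch (illustrative only, not executed at import time): with gensim 3.x,
# `process` returns one embedding row per node that appears in the walks, e.g.
#   g = nx.karate_club_graph()
#   emb = process(g, number_walks=5, walk_length=20, representation_size=64)
#   emb.shape  # (34, 64)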
cdef class DeepWalk(Base):
"""
Based on :
@inproceedings{Perozzi:2014:DOL:2623330.2623732,
author = {Perozzi, Bryan and Al-Rfou, Rami and Skiena, Steven},
title = {DeepWalk: Online Learning of Social Representations},
booktitle = {Proceedings of the 20th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
series = {KDD '14},
year = {2014},
isbn = {978-1-4503-2956-9},
location = {New York, New York, USA},
pages = {701--710},
numpages = {10},
url = {http://doi.acm.org/10.1145/2623330.2623732},
doi = {10.1145/2623330.2623732},
acmid = {2623732},
publisher = {ACM},
address = {New York, NY, USA},
keywords = {deep learning, latent representations, learning with partial labels, network classification, online learning, social networks},
}
    Original Code : https://github.com/phanein/deepwalk
Modified by : Jacques Fize
"""
def __init__(self):
Base.__init__(self,0,True)
def extract_embedding(self, listgs):
from tqdm import tqdm
models = Parallel(n_jobs = cpu_count())(delayed(process)(nx.Graph(g)) for g in tqdm(listgs,desc="Extracting Embeddings..."))
return models
@cython.boundscheck(False)
cpdef np.ndarray compare(self,list listgs, list selected):
# Selected is ignored
models = self.extract_embedding(listgs)
        vector_matrix = np.array([mod.mean(axis=0) for mod in models])  # average the node embeddings of each graph
cs = cosine_similarity(vector_matrix)
return cs
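# Example sketch: pairwise similarity between networkx graphs using the averaged
# DeepWalk node embeddings (values illustrative).
#   graphs = [nx.erdos_renyi_graph(20, 0.2) for _ in range(3)]
#   sim = DeepWalk().compare(graphs, [])  # `selected` is ignored -> (3, 3) cosine-similarity matrix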
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Graph utilities."""
import logging
import sys
from io import open
from os import path
from time import time
from glob import glob
from six.moves import range, zip, zip_longest
from six import iterkeys
from collections import defaultdict
from collections.abc import Iterable
import random
from random import shuffle
from itertools import product,permutations
from scipy.io import loadmat
from scipy.sparse import issparse
logger = logging.getLogger("deepwalk")
__author__ = "Bryan Perozzi"
__email__ = "bperozzi@cs.stonybrook.edu"
LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s"
class Graph(defaultdict):
"""Efficient basic implementation of nx `Graph' – Undirected graphs with self loops"""
def __init__(self):
super(Graph, self).__init__(list)
def nodes(self):
return self.keys()
def adjacency_iter(self):
return self.iteritems()
def subgraph(self, nodes={}):
subgraph = Graph()
for n in nodes:
if n in self:
subgraph[n] = [x for x in self[n] if x in nodes]
return subgraph
def make_undirected(self):
t0 = time()
for v in self.keys():
for other in self[v]:
if v != other:
self[other].append(v)
t1 = time()
        logger.info('make_undirected: added missing edges in {}s'.format(t1 - t0))
self.make_consistent()
return self
def make_consistent(self):
t0 = time()
for k in iterkeys(self):
self[k] = list(sorted(set(self[k])))
t1 = time()
logger.info('make_consistent: made consistent in {}s'.format(t1-t0))
self.remove_self_loops()
return self
def remove_self_loops(self):
removed = 0
t0 = time()
for x in self:
if x in self[x]:
self[x].remove(x)
removed += 1
t1 = time()
logger.info('remove_self_loops: removed {} loops in {}s'.format(removed, (t1-t0)))
return self
def check_self_loops(self):
for x in self:
for y in self[x]:
if x == y:
return True
return False
def has_edge(self, v1, v2):
if v2 in self[v1] or v1 in self[v2]:
return True
return False
def degree(self, nodes=None):
if isinstance(nodes, Iterable):
return {v:len(self[v]) for v in nodes}
else:
return len(self[nodes])
def order(self):
"Returns the number of nodes in the graph"
return len(self)
def number_of_edges(self):
"Returns the number of nodes in the graph"
return sum([self.degree(x) for x in self.keys()])/2
def number_of_nodes(self):
"Returns the number of nodes in the graph"
return self.order()
def random_walk(self, path_length, alpha=0, rand=random.Random(), start=None):
""" Returns a truncated random walk.
path_length: Length of the random walk.
alpha: probability of restarts.
start: the start node of the random walk.
"""
G = self
if start:
path = [start]
else:
# Sampling is uniform w.r.t V, and not w.r.t E
path = [rand.choice(list(G.keys()))]
while len(path) < path_length:
cur = path[-1]
if len(G[cur]) > 0:
if rand.random() >= alpha:
path.append(rand.choice(G[cur]))
else:
path.append(path[0])
else:
break
return [str(node) for node in path]
# TODO add build_walks in here
def build_deepwalk_corpus(G, num_paths, path_length, alpha=0,
rand=random.Random(0)):
walks = []
nodes = list(G.nodes())
for cnt in range(num_paths):
rand.shuffle(nodes)
for node in nodes:
walks.append(G.random_walk(path_length, rand=rand, alpha=alpha, start=node))
return walks
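# Sketch: building a walk corpus from a small adjacency list (values illustrative).
#   G = from_adjlist([[1, 2, 3], [2, 1], [3, 1]])
#   walks = build_deepwalk_corpus(G, num_paths=2, path_length=5)
#   len(walks) == 2 * len(G.nodes())  # one walk per node per pass, node ids stored as strings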
def build_deepwalk_corpus_iter(G, num_paths, path_length, alpha=0,
rand=random.Random(0)):
walks = []
nodes = list(G.nodes())
for cnt in range(num_paths):
rand.shuffle(nodes)
for node in nodes:
yield G.random_walk(path_length, rand=rand, alpha=alpha, start=node)
def clique(size):
return from_adjlist(permutations(range(1,size+1)))
# http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python
def grouper(n, iterable, padvalue=None):
"grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')"
return zip_longest(*[iter(iterable)]*n, fillvalue=padvalue)
def parse_adjacencylist(f):
adjlist = []
for l in f:
if l and l[0] != "#":
introw = [int(x) for x in l.strip().split()]
row = [introw[0]]
            row.extend(sorted(set(introw[1:])))
adjlist.extend([row])
return adjlist
def parse_adjacencylist_unchecked(f):
adjlist = []
for l in f:
if l and l[0] != "#":
adjlist.extend([[int(x) for x in l.strip().split()]])
return adjlist
def load_adjacencylist(file_, undirected=False, chunksize=10000, unchecked=True):
if unchecked:
parse_func = parse_adjacencylist_unchecked
convert_func = from_adjlist_unchecked
else:
parse_func = parse_adjacencylist
convert_func = from_adjlist
adjlist = []
t0 = time()
total = 0
with open(file_) as f:
for idx, adj_chunk in enumerate(map(parse_func, grouper(int(chunksize), f))):
adjlist.extend(adj_chunk)
total += len(adj_chunk)
t1 = time()
logger.info('Parsed {} edges with {} chunks in {}s'.format(total, idx, t1-t0))
t0 = time()
G = convert_func(adjlist)
t1 = time()
logger.info('Converted edges to graph in {}s'.format(t1-t0))
if undirected:
t0 = time()
G = G.make_undirected()
t1 = time()
logger.info('Made graph undirected in {}s'.format(t1-t0))
return G
def load_edgelist(file_, undirected=True):
G = Graph()
with open(file_) as f:
for l in f:
x, y = l.strip().split()[:2]
x = int(x)
y = int(y)
G[x].append(y)
if undirected:
G[y].append(x)
G.make_consistent()
return G
def load_matfile(file_, variable_name="network", undirected=True):
    mat_variables = loadmat(file_)
    mat_matrix = mat_variables[variable_name]
return from_numpy(mat_matrix, undirected)
def from_networkx(G_input, undirected=True):
G = Graph()
for _, x in enumerate(G_input):
for y in iterkeys(G_input[x]):
G[x].append(y)
if undirected:
G.make_undirected()
return G
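# Sketch: converting a networkx graph to this Graph representation, as done in
# deepwalk.pyx::process (values illustrative).
#   G = from_networkx(nx.karate_club_graph())
#   G.number_of_nodes()  # 34
#   G.number_of_edges()  # 78.0, each undirected edge counted once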
def from_numpy(x, undirected=True):
G = Graph()
if issparse(x):
cx = x.tocoo()
for i,j,v in zip(cx.row, cx.col, cx.data):
G[i].append(j)
else:
raise Exception("Dense matrices not yet supported.")
if undirected:
G.make_undirected()
G.make_consistent()
return G
def from_adjlist(adjlist):
G = Graph()
for row in adjlist:
node = row[0]
neighbors = row[1:]
G[node] = list(sorted(set(neighbors)))
return G
def from_adjlist_unchecked(adjlist):
G = Graph()
for row in adjlist:
node = row[0]
neighbors = row[1:]
G[node] = neighbors
return G
import hashlib
import json
import glob
import pandas as pd
import networkx as nx
from tqdm import tqdm
cimport numpy as np
from joblib import Parallel, delayed
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy.distutils.system_info as sysinfo
from sklearn.metrics.pairwise import cosine_similarity
from ..base cimport Base
cimport cython
class WeisfeilerLehmanMachine:
"""
Weisfeiler Lehman feature extractor class.
"""
def __init__(self, graph, features, iterations):
"""
Initialization method which executes feature extraction.
:param graph: The Nx graph object.
:param features: Feature hash table.
:param iterations: Number of WL iterations.
"""
self.iterations = iterations
self.graph = graph
self.features = features
self.nodes = self.graph.nodes()
self.extracted_features = [str(v) for k,v in features.items()]
self.do_recursions()
def do_a_recursion(self):
"""
The method does a single WL recursion.
:return new_features: The hash table with extracted WL features.
"""
new_features = {}
for node in self.nodes:
nebs = self.graph.neighbors(node)
degs = [self.features[neb] for neb in nebs]
features = "_".join([str(self.features[node])]+list(set(sorted([str(deg) for deg in degs]))))
hash_object = hashlib.md5(features.encode())
hashing = hash_object.hexdigest()
new_features[node] = hashing
self.extracted_features = self.extracted_features + list(new_features.values())
return new_features
def do_recursions(self):
"""
The method does a series of WL recursions.
"""
for iteration in range(self.iterations):
self.features = self.do_a_recursion()
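# Sketch: WL feature extraction on a toy graph (values illustrative).
#   g = nx.path_graph(3)
#   machine = WeisfeilerLehmanMachine(g, dict(nx.degree(g)), iterations=2)
#   machine.extracted_features  # 3 degree labels + 3 hashed labels per iteration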
def dataset_reader(graph):
    """
    Function to extract the feature hash table (node degrees) from a graph.
    :param graph: The Nx graph object.
    :return graph: The graph object.
    :return features: Features hash table.
    """
    features = dict(nx.degree(graph))
    features = {k: v for k, v in features.items()}
    return graph, features
def feature_extractor(graph, ix, rounds):
    """
    Function to extract WL features from a graph.
    :param graph: The Nx graph object.
    :param ix: Index of the graph, used as the document tag.
    :param rounds: Number of WL iterations.
    :return doc: TaggedDocument holding the extracted WL features.
    """
graph, features = dataset_reader(graph)
machine = WeisfeilerLehmanMachine(graph,features,rounds)
doc = TaggedDocument(words = machine.extracted_features , tags = ["g_{0}".format(ix)])
return doc
def generate_model(graphs, iteration = 2, dimensions = 64, min_count = 5, down_sampling = 0.0001, learning_rate = 0.0001, epochs = 10, workers = 4 ):
"""
Main function to read the graph list, extract features, learn the embedding and save it.
:param args: Object with the arguments.
"""
document_collections = Parallel(n_jobs = workers)(delayed(feature_extractor)(g, ix,iteration) for ix,g in tqdm(enumerate(graphs),desc="Extracting Features..."))
graphs=[nx.relabel_nodes(g,{node:str(node) for node in list(g.nodes)},copy=True) for g in graphs]
model = Doc2Vec(document_collections,
vector_size = dimensions,
window = 0,
min_count = min_count,
dm = 0,
sample = down_sampling,
workers = workers,
epochs = epochs,
alpha = learning_rate)
return model
cdef class Graph2Vec(Base):
"""
Based on :
graph2vec: Learning distributed representations of graphs.
Narayanan, Annamalai and Chandramohan, Mahinthan and Venkatesan, Rajasekar and Chen, Lihui and Liu, Yang
MLG 2017, 13th International Workshop on Mining and Learning with Graphs (MLGWorkshop 2017)
    Original Code : https://github.com/benedekrozemberczki/graph2vec
Modified by : Jacques Fize
"""
def __init__(self):
Base.__init__(self,0,True)
@cython.boundscheck(False)
cpdef np.ndarray compare(self,list listgs, list selected):
# Selected is ignored
model = generate_model(listgs)
vector_matrix = model.docvecs.vectors_docs
cs = cosine_similarity(vector_matrix)
return cs
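# Example sketch (gensim 3.x): Graph2Vec document embeddings compared with cosine similarity.
#   graphs = [nx.gnp_random_graph(15, 0.3) for _ in range(4)]
#   sim = Graph2Vec().compare(graphs, [])  # (4, 4) similarity matrix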
from collections import Counter
from collections.abc import Mapping
from concurrent.futures import ProcessPoolExecutor
import logging
from multiprocessing import cpu_count
from six import string_types
from gensim.models import Word2Vec
from gensim.models.word2vec import Vocab
logger = logging.getLogger("deepwalk")
class Skipgram(Word2Vec):
"""A subclass to allow more customization of the Word2Vec internals."""
def __init__(self, vocabulary_counts=None, **kwargs):
self.vocabulary_counts = None
kwargs["min_count"] = kwargs.get("min_count", 0)
kwargs["workers"] = kwargs.get("workers", cpu_count())
kwargs["size"] = kwargs.get("size", 128)
kwargs["sentences"] = kwargs.get("sentences", None)
kwargs["window"] = kwargs.get("window", 10)
kwargs["sg"] = 1
kwargs["hs"] = 1
        if vocabulary_counts is not None:
self.vocabulary_counts = vocabulary_counts
super(Skipgram, self).__init__(**kwargs)
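# Sketch of how deepwalk.pyx::process builds this model when walks are dumped to
# disk (parameter values illustrative):
#   corpus = serialized_walks.WalksCorpus(walk_files)
#   model = Skipgram(sentences=corpus, vocabulary_counts=vertex_counts,
#                    size=64, window=5, min_count=0, trim_rule=None, workers=2)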
import logging
from io import open
from os import path
from time import time
from multiprocessing import cpu_count
import random
from concurrent.futures import ProcessPoolExecutor