Commit fef9b4dd authored by Fize Jacques's avatar Fize Jacques

Add documentation. Clean old methods and classes.

parent c4667d6f
# coding = utf-8
from enum import Enum


class AlgorithmType(Enum):
    similarity = 0
    distance = 1
\ No newline at end of file
@@ -9,7 +9,7 @@ cimport numpy as np
from scipy.sparse import csr_matrix,lil_matrix
import sys

-from .base cimport Base,intersection
+from .base cimport Base

cdef class BagOfCliques(Base):
...
@@ -17,6 +17,3 @@ cdef class Base:
    cpdef set_attr_graph_used(self, str node_attr_key, str edge_attr_key)

-cpdef intersection(G,H)
-cpdef union_(G,H)
@@ -21,85 +21,6 @@ cpdef np.ndarray minmax_scale(np.ndarray matrix):
    return x/(max_)
-cpdef intersection(G, H):
-    """
-    Return a new graph that contains only the edges and nodes that exist in
-    both G and H. The node sets of H and G must be the same.
-
-    Parameters
-    ----------
-    G,H : graph
-        A NetworkX graph. G and H must have the same node sets.
-
-    Returns
-    -------
-    GH : A new graph with the same type as G.
-
-    Notes
-    -----
-    Attributes from the graph, nodes, and edges are not copied to the new
-    graph. If you want a new graph of the intersection of G and H
-    with the attributes (including edge data) from G, use remove_nodes_from()
-    as follows:
-
-    >>> G=nx.path_graph(3)
-    >>> H=nx.path_graph(5)
-    >>> R=G.copy()
-    >>> R.remove_nodes_from(n for n in G if n not in H)
-
-    Modified so it can be used with two graphs with different node sets.
-    """
-    # create new graph
-    R = nx.create_empty_copy(G)
-
-    if not G.is_multigraph() == H.is_multigraph():
-        raise nx.NetworkXError('G and H must both be graphs or multigraphs.')
-
-    if G.number_of_edges() <= H.number_of_edges():
-        if G.is_multigraph():
-            edges = G.edges(keys=True)
-        else:
-            edges = G.edges()
-        for e in edges:
-            if H.has_edge(*e):
-                R.add_edge(*e)
-    else:
-        if H.is_multigraph():
-            edges = H.edges(keys=True)
-        else:
-            edges = H.edges()
-        for e in edges:
-            if G.has_edge(*e):
-                R.add_edge(*e)
-
-    nodes_g = set(G.nodes())
-    nodes_h = set(H.nodes())
-    R.remove_nodes_from(list(nodes_g - nodes_h))
-    return R
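For reference, a minimal sketch of how this removed helper behaves (hypothetical session; assumes networkx imported as nx):

# Hypothetical usage of the removed intersection() helper:
G = nx.path_graph(4)        # edges 0-1, 1-2, 2-3
H = nx.cycle_graph(3)       # edges 0-1, 1-2, 2-0
R = intersection(G, H)
print(sorted(R.edges()))    # [(0, 1), (1, 2)]
print(sorted(R.nodes()))    # [0, 1, 2] (node 3 only exists in G)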
-cpdef union_(G, H):
-    """
-    Return a graph that contains nodes and edges from both graphs G and H.
-
-    Parameters
-    ----------
-    G : networkx.Graph
-        First graph
-    H : networkx.Graph
-        Second graph
-
-    Returns
-    -------
-    networkx.Graph
-        A new graph with the same type as G.
-    """
-    R = nx.create_empty_copy(G)
-    R.add_nodes_from(H.nodes(data=True))
-    R.add_edges_from(G.edges(data=True))
-    R.add_edges_from(H.edges(data=True))
-    return R
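A minimal sketch of the removed union_() helper, under the same assumptions:

# Hypothetical usage of the removed union_() helper:
G = nx.path_graph(3)        # nodes 0-2
H = nx.path_graph(5)        # nodes 0-4
U = union_(G, H)
print(sorted(U.nodes()))    # [0, 1, 2, 3, 4]
print(U.number_of_edges())  # 4 (overlapping edges are merged)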
cdef class Base:
    """
    This class defines the methods common to all graph matching algorithms.
@@ -145,10 +66,34 @@ cdef class Base:
        self.edge_attr_key=edge_attr_key

    cpdef set_attr_graph_used(self, str node_attr_key, str edge_attr_key):
"""
Set graph attribute used by the algorithm to compare graphs.
Parameters
----------
node_attr_key : str
key of the node attribute
edge_attr_key: str
key of the edge attribute
"""
        self.node_attr_key=node_attr_key
        self.edge_attr_key=edge_attr_key
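A one-line usage sketch of this setter (hypothetical; `matcher` stands for any concrete subclass of Base):

# Hypothetical: tell the matcher which attribute keys carry labels/weights
matcher.set_attr_graph_used("label", "weight")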
    cpdef np.ndarray get_selected_array(self,selected,size_corpus):
"""
Return an array which define which graph will be compared in the algorithms.
Parameters
----------
selected : list
indices of graphs you wish to compare
size_corpus :
size of your dataset
Returns
-------
np.ndarray
selected vector (1 -> selected, 0 -> not selected)
"""
        cdef double[:] selected_test = np.zeros(size_corpus)
        if not selected == None:
            for ix in range(len(selected)):
@@ -159,6 +104,20 @@ cdef class Base:
    cpdef np.ndarray compare_old(self,list listgs, list selected):
"""
Soon will be depreciated ! To store the old version of an algorithm.
Parameters
----------
listgs : list
list of graphs
selected
selected graphs
Returns
-------
np.ndarray
distance/similarity matrix
"""
        pass

    @cython.boundscheck(False)
@@ -179,7 +138,7 @@ cdef class Base:
            the None value

        Returns
        -------
-        np.array
+        np.ndarray
            distance/similarity matrix
        """
@@ -190,12 +149,12 @@ cdef class Base:
        Return a normalized distance matrix

        Parameters
        ----------
-        matrix : np.array
-            Similarity/distance matrix you want to transform
+        matrix : np.ndarray
+            Similarity/distance matrix you wish to transform

        Returns
        -------
-        np.array
+        np.ndarray
            distance matrix
        """
        if self.type_alg == 1:
@@ -212,8 +171,8 @@ cdef class Base:
        Return a normalized similarity matrix

        Parameters
        ----------
-        matrix : np.array
-            Similarity/distance matrix you want to transform
+        matrix : np.ndarray
+            Similarity/distance matrix you wish to transform

        Returns
        -------
@@ -227,24 +186,6 @@ cdef class Base:
        matrix=np.ma.getdata(minmax_scale(matrix))
        return 1-matrix
-    def mcs(self, G, H):
-        """
-        Return the maximum common subgraph of G and H.
-
-        Parameters
-        ----------
-        G : networkx.Graph
-            First Graph
-        H : networkx.Graph
-            Second Graph
-
-        Returns
-        -------
-        networkx.Graph
-            Maximum common subgraph
-        """
-        R = G.copy()
-        R.remove_nodes_from(n for n in G if n not in H)
-        return R
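A minimal sketch of the removed mcs() heuristic (hypothetical session; networkx as nx):

# Keep only the nodes of G that also appear in H, with G's surviving edges
G = nx.path_graph(5)        # nodes 0-4
H = nx.path_graph(3)        # nodes 0-2
R = G.copy()
R.remove_nodes_from(n for n in G if n not in H)
print(sorted(R.edges()))    # [(0, 1), (1, 2)]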
    cpdef bint isAccepted(self,G,index,selected):
        """
...
@@ -11,7 +11,7 @@ cdef class BagOfNodes(Base):
    We could call this algorithm Bag of nodes
    """
    def __init__(self):
        Base.__init__(self,0,True)

    cpdef np.ndarray compare(self,list graph_list, list selected):
        nodes = list()
...
# coding = utf-8
import networkx as nx
import numpy as np
import scipy.sparse


class DeltaCon0():
    __type__ = "sim"

    @staticmethod
    def compare(list_gs,selected):
        n=len(list_gs)
        comparison_matrix = np.zeros((n,n))
        for i in range(n):
            for j in range(i,n):
                g1,g2=list_gs[i],list_gs[j]
                f=True
                if not list_gs[i] or not list_gs[j]:
                    f=False
                elif len(list_gs[i])== 0 or len(list_gs[j]) == 0:
                    f=False
                if selected:
                    if not i in selected:
                        f=False
                if f:
                    # S1
                    epsilon = 1/(1+DeltaCon0.maxDegree(g1))
                    D, A = DeltaCon0.degreeAndAdjacencyMatrix(g1)
                    S1 = np.linalg.inv(np.identity(len(g1))+(epsilon**2)*D -epsilon*A)
                    # S2
                    D, A = DeltaCon0.degreeAndAdjacencyMatrix(g2)
                    epsilon = 1 / (1 + DeltaCon0.maxDegree(g2))
                    S2 = np.linalg.inv(np.identity(len(g2))+(epsilon**2)*D -epsilon*A)

                    comparison_matrix[i,j] = 1/(1+DeltaCon0.rootED(S1,S2))
                    comparison_matrix[j,i] = comparison_matrix[i,j]
                else:
                    comparison_matrix[i, j] = 0.
                    comparison_matrix[j, i] = comparison_matrix[i, j]
        return comparison_matrix
    @staticmethod
    def rootED(S1,S2):
        # root Euclidean distance between the two affinity matrices
        return np.sqrt(np.sum((S1-S2)**2))  # Long live numpy !

    @staticmethod
    def degreeAndAdjacencyMatrix(G):
        """
        Return the degree matrix (D) and adjacency matrix (A) of a graph G.
        Inspired by the nx.laplacian_matrix(G,nodelist,weight) code from networkx.

        :param G: a networkx graph
        :return: D, A
        """
        A = nx.to_scipy_sparse_matrix(G, nodelist=list(G.nodes), weight="weight",
                                      format='csr')
        n, m = A.shape
        diags = A.sum(axis=1)
        D = scipy.sparse.spdiags(diags.flatten(), [0], m, n, format='csr')

        return D, A
    @staticmethod
    def maxDegree(G):
        degree_sequence = sorted(nx.degree(G).values(), reverse=True)  # degree sequence
        # print "Degree sequence", degree_sequence
        dmax = max(degree_sequence)
        return dmax
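DeltaCon0 scores each pair of graphs by comparing their Fast Belief Propagation affinity matrices, S = inv(I + eps^2*D - eps*A), through rootED. A minimal usage sketch (assumes networkx 1.x, where nx.degree() returns a dict as maxDegree above expects):

# Hypothetical usage of DeltaCon0.compare:
graphs = [nx.path_graph(4), nx.cycle_graph(4), nx.star_graph(3)]
sim = DeltaCon0.compare(graphs, selected=None)
print(sim.shape)   # (3, 3)
print(sim[0, 0])   # 1.0 on the diagonal: identical graphs give rootED == 0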
class DeltaCon():
    __type__ = "sim"

    @staticmethod
    def relabel_nodes(graph_list):
        label_lookup = {}
        label_counter = 0
        n = len(graph_list)
        # label_lookup is an associative array, which will contain the
        # mapping from multiset labels (strings) to short labels
        # (integers)
        for i in range(n):
            nodes = list(graph_list[i].nodes)
            for j in range(len(nodes)):
                if not (nodes[j] in label_lookup):
                    label_lookup[nodes[j]] = label_counter
                    label_counter += 1
            graph_list[i] = nx.relabel_nodes(graph_list[i], label_lookup)
        return graph_list

    @staticmethod
    def compare(list_gs, g=3):
        n=len(list_gs)
        list_gs=DeltaCon.relabel_nodes(list_gs)
        comparison_matrix = np.zeros((n,n))
        for i in range(n):
            for j in range(i,n):
                g1,g2=list_gs[i],list_gs[j]

                V = list(g1.nodes)
                V.extend(list(g2.nodes))
                V=np.unique(V)

                partitions=V.copy()
                np.random.shuffle(partitions)
                if len(partitions)< g:
                    partitions=np.array([partitions])
                else:
                    partitions=np.array_split(partitions,g)

                partitions_e_1 = DeltaCon.partitions2e(partitions, list(g1.nodes))
                partitions_e_2 = DeltaCon.partitions2e(partitions, list(g2.nodes))
                S1,S2=[],[]
                for k in range(len(partitions)):
                    s0k1,s0k2=partitions_e_1[k],partitions_e_2[k]

                    # S1
                    epsilon = 1/(1+DeltaCon0.maxDegree(g1))
                    D, A = DeltaCon0.degreeAndAdjacencyMatrix(g1)
                    s1k = np.linalg.inv(np.identity(len(g1))+(epsilon**2)*D -epsilon*A)
                    s1k=np.linalg.solve(s1k,s0k1).tolist()
                    # S2
                    D, A = DeltaCon0.degreeAndAdjacencyMatrix(g2)
                    epsilon = 1 / (1 + DeltaCon0.maxDegree(g2))
                    s2k= np.linalg.inv(np.identity(len(g2))+(epsilon**2)*D -epsilon*A)
                    s2k = np.linalg.solve(s2k, s0k2).tolist()

                    S1.append(s1k)
                    S2.append(s2k)

                comparison_matrix[i,j] = 1/(1+DeltaCon0.rootED(np.array(S1),np.array(S2)))
                comparison_matrix[j,i] = comparison_matrix[i,j]

        return comparison_matrix

    @staticmethod
    def partitions2e(partitions, V):
        e = [[] for i in range(len(partitions))]
        for p in range(len(partitions)):
            e[p] = []
            for i in range(len(V)):
                if i in partitions[p]:
                    e[p].append(1.0)
                else:
                    e[p].append(0.0)
        return e
\ No newline at end of file
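The DeltaCon variant above replaces single-node indicators with g randomly partitioned node groups. A usage sketch under the same networkx 1.x assumption:

# Hypothetical usage of DeltaCon.compare:
graphs = [nx.path_graph(6), nx.cycle_graph(6)]
sim = DeltaCon.compare(graphs, g=3)
print(sim[0, 1])   # similarity in (0, 1]; higher means more similar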
@@ -4,31 +4,31 @@
import os
import sys
import random
-import networkx as nx
from io import open
from argparse import ArgumentParser, FileType, ArgumentDefaultsHelpFormatter
from collections import Counter
from concurrent.futures import ProcessPoolExecutor
import logging
-from multiprocessing import cpu_count
-import graph as graph2
-import walks as serialized_walks
-from gensim.models import Word2Vec
-from skipgram import Skipgram
+import networkx as nx
+import numpy as np
+cimport numpy as np

from six import text_type as unicode
from six import iteritems
from six.moves import range

-cimport cython
+from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
-from ..base cimport Base
-import numpy as np
-cimport numpy as np
-import psutil
+from multiprocessing import cpu_count
from joblib import Parallel, delayed
+import psutil

+cimport cython
+from ..base cimport Base
+import graph as graph2
+import walks as serialized_walks
+from skipgram import Skipgram

p = psutil.Process(os.getpid())
@@ -42,6 +42,36 @@ except AttributeError:

def process(gr, number_walks = 10, walk_length = 40, window_size = 5, vertex_freq_degree = False, workers = 1, representation_size = 64, max_memory_data_size = 1000000000, seed = 0):
"""
Return a DeepWalk embedding for a graph
Parameters
----------
gr : nx.Graph
graph
number_walks : int, optional
Number of walk (the default is 10)
walk_length : int, optional
Length of the random walk started at each node (the default is 40)
window_size : int, optional
Window size of skipgram model. (the default is 5)
vertex_freq_degree : bool, optional
Use vertex degree to estimate the frequency of nodes (the default is False)
workers : int, optional
Number of parallel processes (the default is 1)
representation_size : int, optional
Number of latent dimensions to learn for each node (the default is 64)
max_memory_data_size : int, optional
'Size to start dumping walks to disk, instead of keeping them in memory. (the default is 1000000000)
seed : int, optional
Seed for random walk generator (the default is 0)
Returns
-------
np.array
DeepWalk embedding
"""
    if len(gr.edges())<1:
        return np.zeros((1,representation_size))
    G = graph2.from_networkx(gr.copy(), undirected=gr.is_directed())
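A hypothetical call to process(); the parameter names come from the signature above, and the returned shape is only inferred from the empty-graph fallback:

# Hypothetical: embed a single graph with default DeepWalk settings
emb = process(nx.karate_club_graph(), number_walks=10, walk_length=40,
              representation_size=64)
print(emb.shape)   # assumed (n_nodes, 64): one latent vector per node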
@@ -115,6 +145,20 @@ cdef class DeepWalk(Base):
        Base.__init__(self,0,True)

    def extract_embedding(self, listgs):
"""
Extract DeepWalk embedding of each graph in `listgs`
Parameters
----------
listgs : list
list of graphs
Returns
-------
list
list of embeddings
"""
        from tqdm import tqdm
        models = Parallel(n_jobs = cpu_count())(delayed(process)(nx.Graph(g)) for g in tqdm(listgs,desc="Extracting Embeddings..."))
        return models
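A hypothetical usage sketch (assumes the no-argument constructor shown above):

# One DeepWalk embedding per input graph
dw = DeepWalk()
embeddings = dw.extract_embedding([nx.path_graph(5), nx.cycle_graph(6)])
print(len(embeddings))   # 2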
...
import hashlib
import json
import glob
import pandas as pd
import networkx as nx
from tqdm import tqdm
cimport numpy as np
-import numpy.distutils.system_info as sysinfo
from joblib import Parallel, delayed
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
+import numpy.distutils.system_info as sysinfo
from sklearn.metrics.pairwise import cosine_similarity

from ..base cimport Base
@@ -21,10 +23,18 @@ class WeisfeilerLehmanMachine:
    def __init__(self, graph, features, iterations):
        """
        Initialization method which executes feature extraction.
-        :param graph: The Nx graph object.
-        :param features: Feature hash table.
-        :param iterations: Number of WL iterations.
+
+        Parameters
+        ----------
+        graph : nx.Graph
+            graph
+        features : dict
+            Feature hash table.
+        iterations : int
+            number of WL iterations
        """
        self.iterations = iterations
        self.graph = graph
        self.features = features
@@ -35,8 +45,13 @@ class WeisfeilerLehmanMachine:
    def do_a_recursion(self):
        """
        The method does a single WL recursion.
-        :return new_features: The hash table with extracted WL features.
+
+        Returns
+        -------
+        dict
+            The hash table with extracted WL features.
        """
        new_features = {}
        for node in self.nodes:
            nebs = self.graph.neighbors(node)
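The truncated loop above hashes each node's label together with its neighbors' labels. A self-contained sketch of one such WL relabeling step (simplified; the md5 digest and its truncation to 8 hex characters are assumptions for illustration):

import hashlib
import networkx as nx

G = nx.path_graph(3)
features = dict(nx.degree(G))            # initial labels: node degrees
new_features = {}
for node in G.nodes():
    nebs = sorted(str(features[n]) for n in G.neighbors(node))
    signature = str(features[node]) + "".join(nebs)
    new_features[node] = hashlib.md5(signature.encode()).hexdigest()[:8]
print(new_features)                      # three short hash labels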
@@ -58,11 +73,17 @@ class WeisfeilerLehmanMachine:

def dataset_reader(graph):
    """
-    Function to read the graph and features from a json file.
-    :param path: The path to the graph json.
-    :return graph: The graph object.
-    :return features: Features hash table.
-    :return name: Name of the graph.
+    Function to extract features from a networkx graph
+
+    Parameters
+    ----------
+    graph : nx.Graph
+        graph
+
+    Returns
+    -------
+    tuple
+        the graph and its features hash table
    """
    features = dict(nx.degree(graph))
@@ -70,13 +91,26 @@ def dataset_reader(graph):
    features = {k:v for k,v, in features.items()}
    return graph, features
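A hypothetical call showing that the extracted features are simply node degrees:

g, feats = dataset_reader(nx.path_graph(3))
print(feats)    # {0: 1, 1: 2, 2: 1}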
def feature_extractor(graph, ix, rounds):
    """
    Function to extract WL features from a graph
-    :param path: The path to the graph json.
-    :param rounds: Number of WL iterations.
-    :return doc: Document collection object.
+
+    Parameters
+    ----------
+    graph : nx.Graph
+        graph
+    ix : int
+        index of the graph in the dataset
+    rounds : int
+        number of WL iterations
+
+    Returns
+    -------
+    TaggedDocument
+        document containing the extracted WL features
    """
    graph, features = dataset_reader(graph)
    machine = WeisfeilerLehmanMachine(graph,features,rounds)
    doc = TaggedDocument(words = machine.extracted_features , tags = ["g_{0}".format(ix)])
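A hypothetical call to feature_extractor() (TaggedDocument comes from gensim, as imported above):

doc = feature_extractor(nx.karate_club_graph(), ix=0, rounds=2)
print(doc.tags)    # ['g_0']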
@@ -87,8 +121,32 @@ def feature_extractor(graph, ix, rounds):

def generate_model(graphs, iteration = 2, dimensions = 64, min_count = 5, down_sampling = 0.0001, learning_rate = 0.0001, epochs = 10, workers = 4 ):
    """
    Main function to read the graph list, extract features, learn the embedding and save it.
-    :param args: Object with the arguments.
+
+    Parameters
+    ----------
+    graphs : list
+        list of input graphs
+    iteration : int, optional
+        number of iterations (the default is 2)
+    dimensions : int, optional
+        output vector dimension (the default is 64)
+    min_count : int, optional
+        min_count parameter of the Doc2Vec model (the default is 5)
+    down_sampling : float, optional
+        Down sampling rate for frequent features (the default is 0.0001)
+    learning_rate : float, optional
+        Initial learning rate (the default is 0.0001)
+    epochs : int, optional
+        Number of epochs (the default is 10)
+    workers : int, optional
+        Number of workers (the default is 4)