Commit fef9b4dd authored by Fize Jacques's avatar Fize Jacques

Add documentation. Clean old methods and classes.

parent c4667d6f
# coding = utf-8
from enum import Enum
class AlgorithmType(Enum):
similarity = 0
distance = 1
\ No newline at end of file
......@@ -9,7 +9,7 @@ cimport numpy as np
from scipy.sparse import csr_matrix,lil_matrix
import sys
from .base cimport Base,intersection
from .base cimport Base
cdef class BagOfCliques(Base):
......
......@@ -17,6 +17,3 @@ cdef class Base:
cpdef set_attr_graph_used(self, str node_attr_key, str edge_attr_key)
cpdef intersection(G,H)
cpdef union_(G,H)
......@@ -21,85 +21,6 @@ cpdef np.ndarray minmax_scale(np.ndarray matrix):
return x/(max_)
cpdef intersection(G, H):
"""
Return a new graph that contains only the nodes and edges that exist in
both G and H. Unlike the original NetworkX helper, G and H may have
different node sets (see Notes).
Parameters
----------
G,H : graph
A NetworkX graph.
Returns
-------
GH : A new graph with the same type as G.
Notes
-----
Attributes from the graph, nodes, and edges are not copied to the new
graph. If you want a new graph of the intersection of G and H
with the attributes (including edge data) from G use remove_nodes_from()
as follows
>>> G=nx.path_graph(3)
>>> H=nx.path_graph(5)
>>> R=G.copy()
>>> R.remove_nodes_from(n for n in G if n not in H)
Modified from the NetworkX original so it can be used with two graphs
that have different node sets; nodes of G absent from H are dropped
from the result.
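Examples
--------
A sketch of the expected behaviour with two path graphs:
>>> G = nx.path_graph(4)
>>> H = nx.path_graph(3)
>>> R = intersection(G, H)
>>> sorted(R.edges())
[(0, 1), (1, 2)]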
"""
# create new graph
R = nx.create_empty_copy(G)
if G.is_multigraph() != H.is_multigraph():
raise nx.NetworkXError('G and H must both be graphs or multigraphs.')
if G.number_of_edges() <= H.number_of_edges():
if G.is_multigraph():
edges = G.edges(keys=True)
else:
edges = G.edges()
for e in edges:
if H.has_edge(*e):
R.add_edge(*e)
else:
if H.is_multigraph():
edges = H.edges(keys=True)
else:
edges = H.edges()
for e in edges:
if G.has_edge(*e):
R.add_edge(*e)
nodes_g=set(G.nodes())
nodes_h=set(H.nodes())
R.remove_nodes_from(list(nodes_g - nodes_h))
return R
cpdef union_(G, H):
"""
Return a graph that contains the nodes and edges of both graphs G and H.
Parameters
----------
G : networkx.Graph
First graph
H : networkx.Graph
Second graph
Returns
-------
networkx.Graph
A new graph with the same type as G.
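Examples
--------
A sketch of the expected behaviour with two path graphs:
>>> G = nx.path_graph(3)
>>> H = nx.path_graph(5)
>>> R = union_(G, H)
>>> R.number_of_nodes(), R.number_of_edges()
(5, 4)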
"""
R = nx.create_empty_copy(G)
R.add_nodes_from(H.nodes(data=True))
R.add_edges_from(G.edges(data=True))
R.add_edges_from(H.edges(data=True))
return R
cdef class Base:
"""
This class defines the methods common to all graph matching algorithms.
......@@ -145,10 +66,34 @@ cdef class Base:
self.edge_attr_key=edge_attr_key
cpdef set_attr_graph_used(self, str node_attr_key, str edge_attr_key):
"""
Set the graph attributes used by the algorithm to compare graphs.
Parameters
----------
node_attr_key : str
key of the node attribute
edge_attr_key : str
key of the edge attribute
"""
self.node_attr_key=node_attr_key
self.edge_attr_key=edge_attr_key
cpdef np.ndarray get_selected_array(self,selected,size_corpus):
"""
Return an array that defines which graphs will be compared by the algorithm.
Parameters
----------
selected : list
indices of graphs you wish to compare
size_corpus : int
size of your dataset
Returns
-------
np.ndarray
selected vector (1 -> selected, 0 -> not selected)
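Examples
--------
A sketch of the expected result, assuming `algo` is an instance of a
Base subclass:
>>> algo.get_selected_array([0, 2], 4)
array([1., 0., 1., 0.])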
"""
cdef double[:] selected_test = np.zeros(size_corpus)
if selected is not None:
for ix in range(len(selected)):
......@@ -159,6 +104,20 @@ cdef class Base:
cpdef np.ndarray compare_old(self,list listgs, list selected):
"""
Will soon be deprecated! Stores the old version of an algorithm.
Parameters
----------
listgs : list
list of graphs
selected : list
selected graphs
Returns
-------
np.ndarray
distance/similarity matrix
"""
pass
@cython.boundscheck(False)
......@@ -179,7 +138,7 @@ cdef class Base:
the None value
Returns
-------
np.array
np.ndarray
distance/similarity matrix
"""
......@@ -190,12 +149,12 @@ cdef class Base:
Return a normalized distance matrix
Parameters
----------
matrix : np.array
Similarity/distance matrix you want to transform
matrix : np.ndarray
Similarity/distance matrix you wish to transform
Returns
-------
np.array
np.ndarray
distance matrix
"""
if self.type_alg == 1:
......@@ -212,8 +171,8 @@ cdef class Base:
Return a normalized similarity matrix
Parameters
----------
matrix : np.array
Similarity/distance matrix you want to transform
matrix : np.ndarray
Similarity/distance matrix you wish to transform
Returns
-------
......@@ -227,24 +186,6 @@ cdef class Base:
matrix=np.ma.getdata(minmax_scale(matrix))
return 1-matrix
def mcs(self, G, H):
"""
Return the Maximum Common Subgraph (MCS) of G and H.
Parameters
----------
G : networkx.Graph
First Graph
H : networkx.Graph
Second Graph
Returns
-------
networkx.Graph
Maximum Common Subgraph
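Examples
--------
A sketch with two path graphs, assuming `algo` is an instance of a Base
subclass (the result is the subgraph of G induced by the nodes it
shares with H):
>>> R = algo.mcs(nx.path_graph(4), nx.path_graph(3))
>>> sorted(R.nodes())
[0, 1, 2]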
"""
R=G.copy()
R.remove_nodes_from(n for n in G if n not in H)
return R
cpdef bint isAccepted(self,G,index,selected):
"""
......
......@@ -11,7 +11,7 @@ cdef class BagOfNodes(Base):
We could call this algorithm Bag of nodes
"""
def __init__(self):
Base.__init__(self,0,True)
Base.__init__(self,0,True)
cpdef np.ndarray compare(self,list graph_list, list selected):
nodes = list()
......
# coding = utf-8
import networkx as nx
import numpy as np
import scipy.sparse
class DeltaCon0():
__type__ = "sim"
@staticmethod
def compare(list_gs,selected):
n=len(list_gs)
comparison_matrix = np.zeros((n,n))
for i in range(n):
for j in range(i,n):
g1,g2=list_gs[i],list_gs[j]
f=True
if not list_gs[i] or not list_gs[j]:
f=False
elif len(list_gs[i])== 0 or len(list_gs[j]) == 0:
f=False
if selected:
if i not in selected:
f=False
if f:
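# Node-affinity matrices via Fast Belief Propagation (as computed below):
# S = inv(I + eps^2 * D - eps * A), with eps = 1 / (1 + max degree);
# the similarity is then 1 / (1 + rootED(S1, S2)).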
# S1
epsilon = 1/(1+DeltaCon0.maxDegree(g1))
D, A = DeltaCon0.degreeAndAdjacencyMatrix(g1)
S1 = np.linalg.inv(np.identity(len(g1))+(epsilon**2)*D -epsilon*A)
# S2
D, A = DeltaCon0.degreeAndAdjacencyMatrix(g2)
epsilon = 1 / (1 + DeltaCon0.maxDegree(g2))
S2 = np.linalg.inv(np.identity(len(g2))+(epsilon**2)*D -epsilon*A)
comparison_matrix[i,j] = 1/(1+DeltaCon0.rootED(S1,S2))
comparison_matrix[j,i] = comparison_matrix[i,j]
else:
comparison_matrix[i, j] = 0.
comparison_matrix[j, i] = comparison_matrix[i, j]
return comparison_matrix
@staticmethod
def rootED(S1,S2):
return np.sqrt(np.sum((S1-S2)**2)) # Long live numpy !
@staticmethod
def degreeAndAdjacencyMatrix(G):
"""
Return the Degree (D) and Adjacency (A) matrices of a graph G.
Inspired by the nx.laplacian_matrix(G, nodelist, weight) code from networkx.
:param G: a networkx graph
:return: (D, A) as scipy sparse CSR matrices
"""
A = nx.to_scipy_sparse_matrix(G, nodelist=list(G.nodes), weight="weight",
format='csr')
n, m = A.shape
diags = A.sum(axis=1)
D = scipy.sparse.spdiags(diags.flatten(), [0], m, n, format='csr')
return D, A
@staticmethod
def maxDegree(G):
degree_sequence = sorted(dict(nx.degree(G)).values(), reverse=True) # degree sequence
dmax = max(degree_sequence)
return dmax
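# Usage sketch (hypothetical input; any list of networkx graphs works):
# import networkx as nx
# sim = DeltaCon0.compare([nx.path_graph(4), nx.star_graph(3)], selected=None)
# Identical graphs give rootED == 0 and hence a similarity of 1.0.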
class DeltaCon():
__type__ = "sim"
@staticmethod
def relabel_nodes(graph_list):
label_lookup = {}
label_counter = 0
n= len(graph_list)
# label_lookup is an associative array, which will contain the
# mapping from multiset labels (strings) to short labels
# (integers)
for i in range(n):
nodes = list(graph_list[i].nodes)
for j in range(len(nodes)):
if not (nodes[j] in label_lookup):
label_lookup[nodes[j]] = label_counter
label_counter += 1
graph_list[i] = nx.relabel_nodes(graph_list[i], label_lookup)
return graph_list
@staticmethod
def compare(list_gs, g=3):
n=len(list_gs)
list_gs=DeltaCon.relabel_nodes(list_gs)
comparison_matrix = np.zeros((n,n))
for i in range(n):
for j in range(i,n):
g1,g2=list_gs[i],list_gs[j]
V = list(g1.nodes)
V.extend(list(g2.nodes))
V=np.unique(V)
partitions=V.copy()
np.random.shuffle(partitions)
if len(partitions)< g:
partitions=np.array([partitions])
else:
partitions=np.array_split(partitions,g)
partitions_e_1 = DeltaCon.partitions2e(partitions, list(g1.nodes))
partitions_e_2 = DeltaCon.partitions2e(partitions, list(g2.nodes))
S1,S2=[],[]
for k in range(len(partitions)):
s0k1,s0k2=partitions_e_1[k],partitions_e_2[k]
# S1 (solve M1 @ s1k = s0k1 directly rather than inverting M1 first)
epsilon = 1/(1+DeltaCon0.maxDegree(g1))
D, A = DeltaCon0.degreeAndAdjacencyMatrix(g1)
M1 = np.identity(len(g1)) + (epsilon**2)*D - epsilon*A
s1k = np.linalg.solve(M1, s0k1).tolist()
# S2
epsilon = 1 / (1 + DeltaCon0.maxDegree(g2))
D, A = DeltaCon0.degreeAndAdjacencyMatrix(g2)
M2 = np.identity(len(g2)) + (epsilon**2)*D - epsilon*A
s2k = np.linalg.solve(M2, s0k2).tolist()
S1.append(s1k)
S2.append(s2k)
comparison_matrix[i,j] = 1/(1+DeltaCon0.rootED(np.array(S1),np.array(S2)))
comparison_matrix[j,i] = comparison_matrix[i,j]
return comparison_matrix
@staticmethod
def partitions2e( partitions, V):
e = [[] for _ in range(len(partitions))]
for p in range(len(partitions)):
for i in range(len(V)):
if i in partitions[p]:
e[p].append(1.0)
else:
e[p].append(0.0)
return e
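# A sketch of partitions2e with hypothetical input (after integer relabeling):
# partitions2e([[0, 1], [2]], [0, 1, 2]) -> [[1.0, 1.0, 0.0], [0.0, 0.0, 1.0]]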
\ No newline at end of file
......@@ -4,31 +4,31 @@
import os
import sys
import random
import networkx as nx
from io import open
from argparse import ArgumentParser, FileType, ArgumentDefaultsHelpFormatter
from collections import Counter
from concurrent.futures import ProcessPoolExecutor
import logging
from multiprocessing import cpu_count
import graph as graph2
import walks as serialized_walks
from gensim.models import Word2Vec
from skipgram import Skipgram
import networkx as nx
import numpy as np
cimport numpy as np
from six import text_type as unicode
from six import iteritems
from six.moves import range
cimport cython
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from ..base cimport Base
import numpy as np
cimport numpy as np
import psutil
from multiprocessing import cpu_count
from joblib import Parallel, delayed
import psutil
cimport cython
from ..base cimport Base
import graph as graph2
import walks as serialized_walks
from skipgram import Skipgram
p = psutil.Process(os.getpid())
......@@ -42,6 +42,36 @@ except AttributeError:
def process(gr, number_walks = 10, walk_length = 40, window_size = 5, vertex_freq_degree = False, workers = 1, representation_size = 64, max_memory_data_size = 1000000000, seed = 0):
"""
Return a DeepWalk embedding for a graph
Parameters
----------
gr : nx.Graph
graph
number_walks : int, optional
Number of walks (the default is 10)
walk_length : int, optional
Length of the random walk started at each node (the default is 40)
window_size : int, optional
Window size of skipgram model. (the default is 5)
vertex_freq_degree : bool, optional
Use vertex degree to estimate the frequency of nodes (the default is False)
workers : int, optional
Number of parallel processes (the default is 1)
representation_size : int, optional
Number of latent dimensions to learn for each node (the default is 64)
max_memory_data_size : int, optional
Size at which walks start being dumped to disk instead of kept in memory (the default is 1000000000)
seed : int, optional
Seed for random walk generator (the default is 0)
Returns
-------
np.ndarray
DeepWalk embedding
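Examples
--------
A sketch with a hypothetical input graph (the first axis length depends
on the number of nodes):
>>> import networkx as nx
>>> emb = process(nx.karate_club_graph(), number_walks=5, walk_length=20)
>>> emb.shape[1]
64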
"""
if len(gr.edges())<1:
return np.zeros((1,representation_size))
G = graph2.from_networkx(gr.copy(), undirected=gr.is_directed())
......@@ -115,6 +145,20 @@ cdef class DeepWalk(Base):
Base.__init__(self,0,True)
def extract_embedding(self, listgs):
"""
Extract DeepWalk embedding of each graph in `listgs`
Parameters
----------
listgs : list
list of graphs
Returns
-------
list
list of embeddings
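Examples
--------
A sketch, assuming `dw` is a DeepWalk instance:
>>> embeddings = dw.extract_embedding([nx.path_graph(4), nx.cycle_graph(5)])
>>> len(embeddings)
2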
"""
from tqdm import tqdm
models = Parallel(n_jobs = cpu_count())(delayed(process)(nx.Graph(g)) for g in tqdm(listgs,desc="Extracting Embeddings..."))
return models
......
import hashlib
import json
import glob
import pandas as pd
import networkx as nx
from tqdm import tqdm
cimport numpy as np
import numpy.distutils.system_info as sysinfo
from joblib import Parallel, delayed
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy.distutils.system_info as sysinfo
from sklearn.metrics.pairwise import cosine_similarity
from ..base cimport Base
......@@ -21,10 +23,18 @@ class WeisfeilerLehmanMachine:
def __init__(self, graph, features, iterations):
"""
Initialization method which executes feature extraction.
:param graph: The Nx graph object.
:param features: Feature hash table.
:param iterations: Number of WL iterations.
Parameters
----------
graph : nx.Graph
graph
features : dict
Feature hash table.
iterations : int
number of WL iterations
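Examples
--------
A sketch, assuming `g` is a networkx graph and using degree features
as built by dataset_reader:
>>> machine = WeisfeilerLehmanMachine(g, dict(nx.degree(g)), 2)
>>> machine.extracted_features # doctest: +SKIP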
"""
self.iterations = iterations
self.graph = graph
self.features = features
......@@ -35,8 +45,13 @@ class WeisfeilerLehmanMachine:
def do_a_recursion(self):
"""
The method does a single WL recursion.
:return new_features: The hash table with extracted WL features.
Returns
-------
dict
The hash table with extracted WL features.
"""
new_features = {}
for node in self.nodes:
nebs = self.graph.neighbors(node)
......@@ -58,11 +73,17 @@ class WeisfeilerLehmanMachine:
def dataset_reader(graph):
"""
Function to read the graph and features from a json file.
:param path: The path to the graph json.
:return graph: The graph object.
:return features: Features hash table.
:return name: Name of the graph.
Function to extract features from a networkx graph
Parameters
----------
graph : nx.Graph
graph
Returns
-------
tuple
(graph, features), where features is the degree-based hash table
"""
features = dict(nx.degree(graph))
......@@ -70,13 +91,26 @@ def dataset_reader(graph):
features = {k: v for k, v in features.items()}
return graph, features
def feature_extractor(graph, ix, rounds):
"""
Function to extract WL features from a graph.
:param path: The path to the graph json.
:param rounds: Number of WL iterations.
:return doc: Document collection object.
Function to extract WL features from a graph
Parameters
----------
graph : nx.Graph
graph
ix : int
index of the graph in the dataset
rounds : int
number of WL iterations
Returns
-------
TaggedDocument
document containing the extracted WL features
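Examples
--------
A sketch with a hypothetical graph:
>>> doc = feature_extractor(nx.path_graph(3), 0, 2)
>>> doc.tags
['g_0']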
"""
graph, features = dataset_reader(graph)
machine = WeisfeilerLehmanMachine(graph,features,rounds)
doc = TaggedDocument(words = machine.extracted_features , tags = ["g_{0}".format(ix)])
......@@ -87,8 +121,32 @@ def feature_extractor(graph, ix, rounds):
def generate_model(graphs, iteration = 2, dimensions = 64, min_count = 5, down_sampling = 0.0001, learning_rate = 0.0001, epochs = 10, workers = 4 ):
"""
Main function to read the graph list, extract features, learn the embedding and save it.
:param args: Object with the arguments.
Parameters
----------
graphs : list
list of input nx.Graph
iteration : int, optional
number of WL iterations (the default is 2)
dimensions : int, optional
output vector dimension (the default is 64)
min_count : int, optional
min count parameter of Doc2vec model (the default is 5)
down_sampling : float, optional
Down sampling rate for frequent features. (the default is 0.0001)
learning_rate : float, optional
Initial learning rate (the default is 0.0001)
epochs : int, optional
Number of epochs (the default is 10)
workers : int, optional
Number of workers (the default is 4)
Returns
-------
Doc2Vec
trained Doc2Vec (graph2vec) model
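Examples
--------
A sketch (assumes the gensim 3.x docvecs API):
>>> model = generate_model([nx.path_graph(4) for _ in range(10)], dimensions=32)
>>> model.docvecs["g_0"].shape
(32,)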
"""
document_collections = Parallel(n_jobs = workers)(delayed(feature_extractor)(g, ix,iteration) for ix,g in tqdm(enumerate(graphs),desc="Extracting Features..."))
graphs=[nx.relabel_nodes(g,{node:str(node) for node in list(g.nodes)},copy=True) for g in graphs]
model = Doc2Vec(document_collections,
......
......@@ -3,8 +3,12 @@ from __future__ import print_function
import sys
import warnings
import numpy as np
cimport numpy as np
import networkx as nx
from cython.parallel cimport prange,parallel
try:
from munkres import munkres
except ImportError:
......@@ -12,9 +16,8 @@ except ImportError:
from scipy.optimize import linear_sum_assignment as munkres
from ..base cimport Base
import networkx as nx
from ..helpers.general import parsenx2graph
from cython.parallel cimport prange,parallel
cdef class AbstractGraphEditDistance(Base):
......@@ -31,8 +34,19 @@ cdef class AbstractGraphEditDistance(Base):
cpdef double distance_ged(self,G,H):
"""
Return the distance between G and H
:return:
Return the distance value between G and H
Parameters
----------
G : gmatch4py.Graph
graph
H : gmatch4py.Graph
graph
Returns
-------
double
distance
"""
cdef list opt_path = self.edit_costs(G,H)
return np.sum(opt_path)
......@@ -41,7 +55,18 @@ cdef class AbstractGraphEditDistance(Base):
cdef list edit_costs(self, G, H):
"""
Return the list of edit costs along the optimal path transforming G into H
:return:
Parameters
----------
G : gmatch4py.Graph
graph
H : gmatch4py.Graph
graph
Returns
-------
list
edit costs along the optimal path
"""
cdef np.ndarray cost_matrix = self.create_cost_matrix(G,H).astype(float)
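# Note (assumption): the cython munkres wrapper returns a boolean assignment
# mask, while scipy's linear_sum_assignment fallback returns (row_ind, col_ind);
# both are valid fancy indices into cost_matrix, selecting the optimal edit costs.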
return cost_matrix[munkres(cost_matrix)].tolist()
......@@ -59,6 +84,18 @@ cdef class AbstractGraphEditDistance(Base):
delete | delete -> delete
The delete -> delete region is filled with zeros
Parameters
----------
G : gmatch4py.Graph
graph
H : gmatch4py.Graph
graph
Returns
-------
np.ndarray
cost matrix
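A sketch of the (n+m) x (n+m) block layout, with n = |G| and m = |H|
(an assumption based on the usual bipartite GED construction):
substitute | insert
-----------+--------------------------
delete     | zeros (delete -> delete)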
"""
cdef int n,m
try:
......@@ -86,29 +123,38 @@ cdef class AbstractGraphEditDistance(Base):
return cost_matrix
cdef double insert_cost(self, int i, int j, nodesH, H):
"""
Return the cost of inserting the ith node of H.
Returns
-------
double
insert cost
"""
raise NotImplementedError
cdef double delete_cost(self, int i, int j, nodesG, G):
"""
Return the cost of deleting the ith node of G.
Returns
-------
double
delete cost
"""
raise NotImplementedError
cpdef double substitute_cost(self, node1, node2, G, H):
"""