GMatch4py · Commit fef9b4dd

Author: Fize Jacques
Date:   Mar 05, 2019
Parent: c4667d6f

    Add documentation. Clean old methods and classes.

Showing 18 changed files.
gmatch4py/alg_types.pyx (deleted, 100644 → 0)

# coding = utf-8
from enum import Enum


class AlgorithmType(Enum):
    similarity = 0
    distance = 1
gmatch4py/bag_of_cliques.pyx

...
@@ -9,7 +9,7 @@ cimport numpy as np
 from scipy.sparse import csr_matrix, lil_matrix
 import sys
-from .base cimport Base, intersection
+from .base cimport Base

 cdef class BagOfCliques(Base):
...
gmatch4py/base.pxd

...
@@ -17,6 +17,3 @@ cdef class Base:
     cpdef set_attr_graph_used(self, str node_attr_key, str edge_attr_key)
-
-cpdef intersection(G, H)
-cpdef union_(G, H)
gmatch4py/base.pyx

...
@@ -21,85 +21,6 @@ cpdef np.ndarray minmax_scale(np.ndarray matrix):
     return x/(max_)

-cpdef intersection(G, H):
-    """
-    Return a new graph that contains only the edges and nodes that exist in
-    both G and H. The node sets of G and H must be the same.
-
-    Parameters
-    ----------
-    G, H : graph
-        A NetworkX graph. G and H must have the same node sets.
-
-    Returns
-    -------
-    GH : A new graph with the same type as G.
-
-    Notes
-    -----
-    Attributes from the graph, nodes, and edges are not copied to the new
-    graph. If you want a new graph of the intersection of G and H with the
-    attributes (including edge data) from G, use remove_nodes_from() as
-    follows:
-
-    >>> G = nx.path_graph(3)
-    >>> H = nx.path_graph(5)
-    >>> R = G.copy()
-    >>> R.remove_nodes_from(n for n in G if n not in H)
-
-    Modified so it can be used with two graphs with different node sets.
-    """
-    # create new graph
-    R = nx.create_empty_copy(G)
-    if not G.is_multigraph() == H.is_multigraph():
-        raise nx.NetworkXError('G and H must both be graphs or multigraphs.')
-    if G.number_of_edges() <= H.number_of_edges():
-        if G.is_multigraph():
-            edges = G.edges(keys=True)
-        else:
-            edges = G.edges()
-        for e in edges:
-            if H.has_edge(*e):
-                R.add_edge(*e)
-    else:
-        if H.is_multigraph():
-            edges = H.edges(keys=True)
-        else:
-            edges = H.edges()
-        for e in edges:
-            if G.has_edge(*e):
-                R.add_edge(*e)
-    nodes_g = set(G.nodes())
-    nodes_h = set(H.nodes())
-    R.remove_nodes_from(list(nodes_g - nodes_h))
-    return R
-
-cpdef union_(G, H):
-    """
-    Return a graph that contains the nodes and edges of both graphs G and H.
-
-    Parameters
-    ----------
-    G : networkx.Graph
-        First graph
-    H : networkx.Graph
-        Second graph
-
-    Returns
-    -------
-    networkx.Graph
-        A new graph with the same type as G.
-    """
-    R = nx.create_empty_copy(G)
-    R.add_nodes_from(H.nodes(data=True))
-    R.add_edges_from(G.edges(data=True))
-    R.add_edges_from(H.edges(data=True))
-    return R

 cdef class Base:
     """
     This class defines the methods common to all graph matching algorithms.
...
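For reference, a minimal sketch of what the two removed helpers computed, written with plain networkx calls (the graphs here are illustrative):

    import networkx as nx

    G = nx.Graph([(0, 1), (1, 2), (2, 3)])   # path 0-1-2-3
    H = nx.Graph([(1, 2), (2, 3), (3, 4)])   # path 1-2-3-4

    # intersection(G, H): common edges, nodes restricted to those shared by G and H
    I = nx.Graph()
    I.add_nodes_from(set(G.nodes()) & set(H.nodes()))
    I.add_edges_from(e for e in G.edges() if H.has_edge(*e))
    print(sorted(I.edges()))   # [(1, 2), (2, 3)]

    # union_(G, H): every node and edge from both graphs
    U = nx.Graph()
    U.add_nodes_from(G.nodes(data=True)); U.add_nodes_from(H.nodes(data=True))
    U.add_edges_from(G.edges(data=True)); U.add_edges_from(H.edges(data=True))
    print(sorted(U.edges()))   # [(0, 1), (1, 2), (2, 3), (3, 4)]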
@@ -145,10 +66,34 @@ cdef class Base:
         self.edge_attr_key = edge_attr_key

     cpdef set_attr_graph_used(self, str node_attr_key, str edge_attr_key):
+        """
+        Set the graph attributes used by the algorithm to compare graphs.
+
+        Parameters
+        ----------
+        node_attr_key : str
+            key of the node attribute
+        edge_attr_key : str
+            key of the edge attribute
+        """
         self.node_attr_key = node_attr_key
         self.edge_attr_key = edge_attr_key

+    cpdef np.ndarray get_selected_array(self, selected, size_corpus):
+        """
+        Return an array that defines which graphs will be compared by the algorithms.
+
+        Parameters
+        ----------
+        selected : list
+            indices of the graphs you wish to compare
+        size_corpus : int
+            size of your dataset
+
+        Returns
+        -------
+        np.ndarray
+            selection vector (1 -> selected, 0 -> not selected)
+        """
+        cdef double[:] selected_test = np.zeros(size_corpus)
+        if not selected == None:
+            for ix in range(len(selected)):
...
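Assuming the truncated loop simply flags each requested index, the selection vector behaves like this pure-Python sketch (the fallback for a missing `selected` list is a guess):

    import numpy as np

    def get_selected_array(selected, size_corpus):
        selected_test = np.zeros(size_corpus)
        if selected is not None:
            for ix in range(len(selected)):
                selected_test[selected[ix]] = 1
            return selected_test
        return np.ones(size_corpus)  # assumption: no filter means compare everything

    print(get_selected_array([0, 2], 4))  # [1. 0. 1. 0.]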
@@ -159,6 +104,20 @@ cdef class Base:
+    cpdef np.ndarray compare_old(self, list listgs, list selected):
+        """
+        Will soon be deprecated! Stores the old version of an algorithm.
+
+        Parameters
+        ----------
+        listgs : list
+            list of graphs
+        selected
+            selected graphs
+
+        Returns
+        -------
+        np.ndarray
+            distance/similarity matrix
+        """
+        pass

     @cython.boundscheck(False)
...
@@ -179,7 +138,7 @@ cdef class Base:
         the None value

         Returns
         -------
-        np.array
+        np.ndarray
             distance/similarity matrix
         """
...
@@ -190,12 +149,12 @@ cdef class Base:
         Return a normalized distance matrix

         Parameters
         ----------
-        matrix : np.array
-            Similarity/distance matrix you want to transform
+        matrix : np.ndarray
+            Similarity/distance matrix you wish to transform

         Returns
         -------
-        np.array
+        np.ndarray
             distance matrix
         """
         if self.type_alg == 1:
...
@@ -212,8 +171,8 @@ cdef class Base:
         Return a normalized similarity matrix

         Parameters
         ----------
-        matrix : np.array
-            Similarity/distance matrix you want to transform
+        matrix : np.ndarray
+            Similarity/distance matrix you wish to transform

         Returns
         -------
...
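Reading distance() and similarity() together with the minmax_scale shown earlier (which reduces to division by the matrix maximum), a similarity matrix is derived from a distance matrix roughly as in this sketch:

    import numpy as np

    def similarity_from_distance(d):
        # minmax_scale above divides by the maximum; similarity() then inverts it
        return 1 - d / d.max()

    d = np.array([[0., 1.], [3., 0.]])
    print(similarity_from_distance(d))
    # [[1.         0.66666667]
    #  [0.         1.        ]]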
@@ -227,24 +186,6 @@ cdef class Base:
             matrix = np.ma.getdata(minmax_scale(matrix))
         return 1 - matrix

-    def mcs(self, G, H):
-        """
-        Return the Maximum Common Subgraph of G and H.
-
-        Parameters
-        ----------
-        G : networkx.Graph
-            First graph
-        H : networkx.Graph
-            Second graph
-
-        Returns
-        -------
-        networkx.Graph
-            Maximum common subgraph
-        """
-        R = G.copy()
-        R.remove_nodes_from(n for n in G if n not in H)
-        return R

     cpdef bint isAccepted(self, G, index, selected):
         """
...
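The removed mcs() helper kept only the part of G whose nodes also appear in H; a quick illustration:

    import networkx as nx

    G = nx.path_graph(5)        # 0-1-2-3-4
    H = nx.path_graph(3)        # 0-1-2
    R = G.copy()
    R.remove_nodes_from(n for n in G if n not in H)
    print(sorted(R.edges()))    # [(0, 1), (1, 2)]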
gmatch4py/bon.pyx

...
@@ -11,7 +11,7 @@ cdef class BagOfNodes(Base):
     We could call this algorithm Bag of nodes
     """
     def __init__(self):
-        Base.__init__(self, 0, True)
+        Base.__init__(self, 0, True)

     cpdef np.ndarray compare(self, list graph_list, list selected):
         nodes = list()
...
gmatch4py/deltacon.pyx (deleted, 100644 → 0)

# coding = utf-8
import networkx as nx
import numpy as np
import scipy.sparse


class DeltaCon0():
    __type__ = "sim"

    @staticmethod
    def compare(list_gs, selected):
        n = len(list_gs)
        comparison_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i, n):
                g1, g2 = list_gs[i], list_gs[j]
                f = True
                if not list_gs[i] or not list_gs[j]:
                    f = False
                elif len(list_gs[i]) == 0 or len(list_gs[j]) == 0:
                    f = False
                if selected:
                    if not i in selected:
                        f = False
                if f:
                    # S1
                    epsilon = 1 / (1 + DeltaCon0.maxDegree(g1))
                    D, A = DeltaCon0.degreeAndAdjacencyMatrix(g1)
                    S1 = np.linalg.inv(np.identity(len(g1)) + (epsilon ** 2) * D - epsilon * A)

                    # S2
                    D, A = DeltaCon0.degreeAndAdjacencyMatrix(g2)
                    epsilon = 1 / (1 + DeltaCon0.maxDegree(g2))
                    S2 = np.linalg.inv(np.identity(len(g2)) + (epsilon ** 2) * D - epsilon * A)

                    comparison_matrix[i, j] = 1 / (1 + DeltaCon0.rootED(S1, S2))
                    comparison_matrix[j, i] = comparison_matrix[i, j]
                else:
                    comparison_matrix[i, j] = 0.
                    comparison_matrix[j, i] = comparison_matrix[i, j]
        return comparison_matrix

    @staticmethod
    def rootED(S1, S2):
        return np.sqrt(np.sum((S1 - S2) ** 2))  # Long live numpy !

    @staticmethod
    def degreeAndAdjacencyMatrix(G):
        """
        Return the Degree (D) and Adjacency (A) matrices of a graph G.
        Inspired by the nx.laplacian_matrix(G, nodelist, weight) code proposed by networkx.
        :param G:
        :return:
        """
        A = nx.to_scipy_sparse_matrix(G, nodelist=list(G.nodes), weight="weight", format='csr')
        n, m = A.shape
        diags = A.sum(axis=1)
        D = scipy.sparse.spdiags(diags.flatten(), [0], m, n, format='csr')
        return D, A

    @staticmethod
    def maxDegree(G):
        degree_sequence = sorted(nx.degree(G).values(), reverse=True)  # degree sequence
        # print "Degree sequence", degree_sequence
        dmax = max(degree_sequence)
        return dmax
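
For orientation, DeltaCon0 scores a pair of equally sized graphs by comparing their FaBP node-affinity matrices S = (I + ε²D − εA)⁻¹. A dense, self-contained numpy sketch of the same computation:

    import networkx as nx
    import numpy as np

    def affinity(g):
        # FaBP approximation: S = (I + eps^2 * D - eps * A)^-1
        A = nx.to_numpy_array(g)
        D = np.diag(A.sum(axis=1))
        eps = 1 / (1 + max(dict(g.degree()).values()))
        return np.linalg.inv(np.identity(len(g)) + (eps ** 2) * D - eps * A)

    g1, g2 = nx.path_graph(4), nx.cycle_graph(4)   # same node count, as compare() assumes
    S1, S2 = affinity(g1), affinity(g2)
    print(1 / (1 + np.sqrt(np.sum((S1 - S2) ** 2))))  # rootED: Frobenius distance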

class DeltaCon():
    __type__ = "sim"

    @staticmethod
    def relabel_nodes(graph_list):
        label_lookup = {}
        label_counter = 0
        n = len(graph_list)

        # label_lookup is an associative array, which will contain the
        # mapping from multiset labels (strings) to short labels (integers)
        for i in range(n):
            nodes = list(graph_list[i].nodes)
            for j in range(len(nodes)):
                if not (nodes[j] in label_lookup):
                    label_lookup[nodes[j]] = label_counter
                    label_counter += 1
            graph_list[i] = nx.relabel_nodes(graph_list[i], label_lookup)
        return graph_list

    @staticmethod
    def compare(list_gs, g=3):
        n = len(list_gs)
        list_gs = DeltaCon.relabel_nodes(list_gs)
        comparison_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i, n):
                g1, g2 = list_gs[i], list_gs[j]

                V = list(g1.nodes)
                V.extend(list(g2.nodes))
                V = np.unique(V)

                partitions = V.copy()
                np.random.shuffle(partitions)
                if len(partitions) < g:
                    partitions = np.array([partitions])
                else:
                    partitions = np.array_split(partitions, g)

                partitions_e_1 = DeltaCon.partitions2e(partitions, list(g1.nodes))
                partitions_e_2 = DeltaCon.partitions2e(partitions, list(g2.nodes))
                S1, S2 = [], []
                for k in range(len(partitions)):
                    s0k1, s0k2 = partitions_e_1[k], partitions_e_2[k]

                    # S1
                    epsilon = 1 / (1 + DeltaCon0.maxDegree(g1))
                    D, A = DeltaCon0.degreeAndAdjacencyMatrix(g1)
                    s1k = np.linalg.inv(np.identity(len(g1)) + (epsilon ** 2) * D - epsilon * A)
                    s1k = np.linalg.solve(s1k, s0k1).tolist()

                    # S2
                    D, A = DeltaCon0.degreeAndAdjacencyMatrix(g2)
                    epsilon = 1 / (1 + DeltaCon0.maxDegree(g2))
                    s2k = np.linalg.inv(np.identity(len(g2)) + (epsilon ** 2) * D - epsilon * A)
                    s2k = np.linalg.solve(s2k, s0k2).tolist()

                    S1.append(s1k)
                    S2.append(s2k)

                comparison_matrix[i, j] = 1 / (1 + DeltaCon0.rootED(np.array(S1), np.array(S2)))
                comparison_matrix[j, i] = comparison_matrix[i, j]

        return comparison_matrix

    @staticmethod
    def partitions2e(partitions, V):
        e = [[] for i in range(len(partitions))]
        for p in range(len(partitions)):
            e[p] = []
            for i in range(len(V)):
                if i in partitions[p]:
                    e[p].append(1.0)
                else:
                    e[p].append(0.0)
        return e
gmatch4py/embedding/deepwalk.pyx

...
@@ -4,31 +4,31 @@
 import os
 import sys
 import random
-import networkx as nx
-from io import open
-from argparse import ArgumentParser, FileType, ArgumentDefaultsHelpFormatter
-from collections import Counter
-from concurrent.futures import ProcessPoolExecutor
-import logging
-from multiprocessing import cpu_count
-import graph as graph2
-import walks as serialized_walks
-from gensim.models import Word2Vec
-from skipgram import Skipgram
+import networkx as nx
+import numpy as np
+cimport numpy as np
+from six import text_type as unicode
+from six import iteritems
+from six.moves import range
+cimport cython
+from gensim.models import Word2Vec
+from sklearn.metrics.pairwise import cosine_similarity
+from ..base cimport Base
+import numpy as np
+cimport numpy as np
+import psutil
+from multiprocessing import cpu_count
+from joblib import Parallel, delayed
+import psutil
+cimport cython
+from ..base cimport Base
+import graph as graph2
+import walks as serialized_walks
+from skipgram import Skipgram

 p = psutil.Process(os.getpid())
...
@@ -42,6 +42,36 @@ except AttributeError:
 def process(gr, number_walks=10, walk_length=40, window_size=5, vertex_freq_degree=False, workers=1, representation_size=64, max_memory_data_size=1000000000, seed=0):
+    """
+    Return a DeepWalk embedding for a graph.
+
+    Parameters
+    ----------
+    gr : nx.Graph
+        graph
+    number_walks : int, optional
+        Number of walks (the default is 10)
+    walk_length : int, optional
+        Length of the random walk started at each node (the default is 40)
+    window_size : int, optional
+        Window size of the skipgram model (the default is 5)
+    vertex_freq_degree : bool, optional
+        Use vertex degree to estimate the frequency of nodes (the default is False)
+    workers : int, optional
+        Number of parallel processes (the default is 1)
+    representation_size : int, optional
+        Number of latent dimensions to learn for each node (the default is 64)
+    max_memory_data_size : int, optional
+        Size at which walks start being dumped to disk instead of kept in memory (the default is 1000000000)
+    seed : int, optional
+        Seed for the random walk generator (the default is 0)
+
+    Returns
+    -------
+    np.array
+        DeepWalk embedding
+    """
     if len(gr.edges()) < 1:
         return np.zeros((1, representation_size))
     G = graph2.from_networkx(gr.copy(), undirected=gr.is_directed())
...
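Since process() leans on the module's bundled graph/walks/skipgram helpers, here is a self-contained sketch of the DeepWalk idea it wraps, using only networkx and gensim (parameter names mirror the signature above):

    import random
    import networkx as nx
    from gensim.models import Word2Vec

    def simple_walks(g, number_walks=10, walk_length=40, seed=0):
        # uniform random walks started at every node, repeated number_walks times
        rng = random.Random(seed)
        walks = []
        for _ in range(number_walks):
            for start in g.nodes():
                walk = [start]
                while len(walk) < walk_length:
                    neighbors = list(g.neighbors(walk[-1]))
                    if not neighbors:
                        break
                    walk.append(rng.choice(neighbors))
                walks.append([str(n) for n in walk])
        return walks

    g = nx.karate_club_graph()
    # size= is the gensim 3.x keyword (renamed to vector_size in gensim >= 4)
    model = Word2Vec(simple_walks(g), size=64, window=5, min_count=0, sg=1, workers=1)
    embedding = model.wv.vectors   # one 64-d vector per node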
@@ -115,6 +145,20 @@ cdef class DeepWalk(Base):
         Base.__init__(self, 0, True)

     def extract_embedding(self, listgs):
+        """
+        Extract the DeepWalk embedding of each graph in `listgs`.
+
+        Parameters
+        ----------
+        listgs : list
+            list of graphs
+
+        Returns
+        -------
+        list
+            list of embeddings
+        """
         from tqdm import tqdm
         models = Parallel(n_jobs=cpu_count())(delayed(process)(nx.Graph(g)) for g in tqdm(listgs, desc="Extracting Embeddings..."))
         return models
...
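A hedged sketch of how a compare() step can turn two such embeddings into a score, mirroring the cosine_similarity import above (mean pooling is an illustrative choice, not necessarily the module's):

    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    def graph_vector(node_embedding):
        return node_embedding.mean(axis=0, keepdims=True)   # pool node vectors to (1, d)

    emb1 = np.random.rand(10, 64)   # stand-ins for two DeepWalk embeddings
    emb2 = np.random.rand(12, 64)
    score = cosine_similarity(graph_vector(emb1), graph_vector(emb2))[0, 0]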
gmatch4py/embedding/graph2vec.pyx

import hashlib
import json
import glob
import pandas as pd
import networkx as nx
from tqdm import tqdm
cimport numpy as np
import numpy.distutils.system_info as sysinfo
from joblib import Parallel, delayed
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy.distutils.system_info as sysinfo
from sklearn.metrics.pairwise import cosine_similarity
from ..base cimport Base
...
@@ -21,10 +23,18 @@ class WeisfeilerLehmanMachine:
     def __init__(self, graph, features, iterations):
         """
         Initialization method which executes feature extraction.
-        :param graph: The Nx graph object.
-        :param features: Feature hash table.
-        :param iterations: Number of WL iterations.
+
+        Parameters
+        ----------
+        graph : nx.Graph
+            graph
+        features : dict
+            Feature hash table.
+        iterations : int
+            number of WL iterations
         """
         self.iterations = iterations
         self.graph = graph
         self.features = features
...
@@ -35,8 +45,13 @@ class WeisfeilerLehmanMachine:
     def do_a_recursion(self):
         """
         The method does a single WL recursion.
-        :return new_features: The hash table with extracted WL features.
+
+        Returns
+        -------
+        dict
+            The hash table with extracted WL features.
         """
         new_features = {}
         for node in self.nodes:
             nebs = self.graph.neighbors(node)
...
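One WL recursion combines each node's feature with its neighbors' features and hashes the result into a new label; a standalone sketch (md5 is one plausible hashing choice, not confirmed from this hunk):

    import hashlib
    import networkx as nx

    def wl_round(graph, features):
        # hash each node's feature together with its sorted neighbor features
        new_features = {}
        for node in graph.nodes():
            neighbor_feats = sorted(features[n] for n in graph.neighbors(node))
            combined = str(features[node]) + "_" + "_".join(map(str, neighbor_feats))
            new_features[node] = hashlib.md5(combined.encode()).hexdigest()
        return new_features

    g = nx.path_graph(4)
    feats = dict(nx.degree(g))      # initial features: node degrees
    feats = wl_round(g, feats)      # one WL iteration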
@@ -58,11 +73,17 @@ class WeisfeilerLehmanMachine:
 def dataset_reader(graph):
     """
-    Function to read the graph and features from a json file.
-    :param path: The path to the graph json.
-    :return graph: The graph object.
-    :return features: Features hash table.
-    :return name: Name of the graph.
+    Function to extract features from a networkx graph.
+
+    Parameters
+    ----------
+    graph : nx.Graph
+        graph
+
+    Returns
+    -------
+    dict
+        Features hash table.
     """
     features = dict(nx.degree(graph))
...
@@ -70,13 +91,26 @@ def dataset_reader(graph):
     features = {k: v for k, v, in features.items()}
     return graph, features

 def feature_extractor(graph, ix, rounds):
     """
     Function to extract WL features from a graph.
-    :param path: The path to the graph json.
-    :param rounds: Number of WL iterations.
-    :return doc: Document collection object.
+
+    Parameters
+    ----------
+    graph : nx.Graph
+        graph
+    ix : int
+        index of the graph in the dataset
+    rounds : int
+        number of WL iterations
+
+    Returns
+    -------
+    TaggedDocument
+        random walks
     """
     graph, features = dataset_reader(graph)
     machine = WeisfeilerLehmanMachine(graph, features, rounds)
     doc = TaggedDocument(words=machine.extracted_features, tags=["g_{0}".format(ix)])
...
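The TaggedDocument built per graph is what generate_model() below hands to gensim's Doc2Vec; a minimal sketch of that hand-off (the feature strings are made up):

    from gensim.models.doc2vec import Doc2Vec, TaggedDocument

    docs = [
        TaggedDocument(words=["2", "1", "hash_a"], tags=["g_0"]),
        TaggedDocument(words=["2", "2", "hash_b"], tags=["g_1"]),
    ]
    model = Doc2Vec(docs, vector_size=64, min_count=1, epochs=10)
    vec = model.docvecs["g_0"]   # model.dv["g_0"] in gensim >= 4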
@@ -87,8 +121,32 @@ def feature_extractor(graph, ix, rounds):
 def generate_model(graphs, iteration=2, dimensions=64, min_count=5, down_sampling=0.0001, learning_rate=0.0001, epochs=10, workers=4):
     """
     Main function to read the graph list, extract features, learn the embedding and save it.
-    :param args: Object with the arguments.
+
+    Parameters
+    ----------
+    graphs : list
+        Input graphs
+    iteration : int, optional
+        number of iterations (the default is 2)
+    dimensions : int, optional
+        output vector dimension (the default is 64)
+    min_count : int, optional
+        min_count parameter of the Doc2Vec model (the default is 5)
+    down_sampling : float, optional
+        Down-sampling rate for frequent features (the default is 0.0001)
+    learning_rate : float, optional
+        Initial learning rate (the default is 0.0001)
+    epochs : int, optional
+        Number of epochs (the default is 10)
+    workers : int, optional
+        Number of workers (the default is 4)
+
+    Returns