Commit df18e0b3 authored by Fize Jacques's avatar Fize Jacques

- Add few docstrings

 - Change ReadMe
 - Change some function signature (mostly helpers)
 - Update requirements.txt and setup.py
 - new access to the different algorithms (change __init__.py)
parent 19d06316
......@@ -5,9 +5,10 @@ GMatch4py algorithms were implemented with Cython to enhance performance.
## Requirements
* Python3
* Python 3.x
* Cython
* networkit (for Bag of Cliques)
* networkx
* numpy
## Installation
......@@ -19,6 +20,11 @@ $ cd GMatch4py
$ python3 setup.py install
```
or
```
$ (sudo) pip3 install .
```
## Get Started
### Graph input format
......@@ -33,7 +39,7 @@ If you want to use algorithms like *graph edit distances*, here is an example:
# Gmatch4py uses networkx graphs
import networkx as nx
# import the GED using the munkres algorithm
from gmatch4py.ged.graph_edit_dist import GraphEditDistance
import gmatch4py as gm
```
In this example, we use generated graphs using `networkx` helpers:
......@@ -47,7 +53,7 @@ All graph matching algorithms in `Gmatch4py` work this way:
* Each object is associated with a `compare()` function with two parameters. The first parameter is **a list of the graphs** you want to **compare**, i.e. measure the distance/similarity (depending on the algorithm). You can also specify a sample of graphs to be compared to all the other graphs. To this end, the second parameter should be **a list containing the indices** of these graphs (based on the first parameter list). If you would rather compute the distance/similarity **between all graphs**, just use the `None` value.
```{python}
ged=GraphEditDistance(1,1,1,1) # all edit costs are equal to 1
ged=gm.GraphEditDistance(1,1,1,1) # all edit costs are equal to 1
result=ged.compare([g1,g2],None)
print(result)
```
......@@ -108,10 +114,10 @@ each code is associated with a reference to the original.**
## TODO List
* Debug algorithms with --> (*debug needed*)
* Improve code structure and performance
* Debug algorithms --> :runner:
* Improve code structure and performance :runner:
* Simplify `setup.py` :heavy_check_mark:
* Some algorithms are distance measures and others are similarity measures. The compare
methods must be changed so they can adapt to the user's needs. For example, the user may want to deal with
graph similarity rather than distance between graph.:heavy_check_mark:
* Write the documentation :see_no_evil:
\ No newline at end of file
graph similarity rather than distance between graphs. :heavy_check_mark:
* Write the documentation :runner:
\ No newline at end of file
# coding = utf-8
\ No newline at end of file
# coding = utf-8
# Graph Edit Distance algorithms import
from .ged.graph_edit_dist import *
from .ged.greedy_edit_distance import *
from .ged.bipartite_graph_matching_2 import *
from .ged.hausdorff_edit_distance import *
# Kernels algorithms import
from .kernels.weisfeiler_lehman import *
# Helpers import
from .helpers.reader import *
# Basic algorithms import
from .bag_of_cliques import *
from .mcs import *
from .vertex_edge_overlap import *
from .vertex_ranking import *
from .jaccard import *
\ No newline at end of file
......@@ -13,16 +13,24 @@ from .base cimport Base,intersection
cdef class BagOfCliques(Base):
"""
The Bag of Cliques is a representation of a graph corpus using the well-known *bag of words* model. Here, instead of
words, we use the unique cliques found in the graphs as a vocabulary. A clique is a subgraph in which every pair of vertices is connected by an edge.
The resulting representation is then used to compute a similarity value between graphs. For this purpose, we use the cosine
similarity.
"""
def __init__(self):
Base.__init__(self,0,True)
"""
Constructor of Bag Of Cliques.
"""
Base.__init__(self,0,True)
cpdef np.ndarray compare(self,list listgs, list selected):
b=BagOfCliques()
bog=b.getBagOfCliques(listgs).astype(np.float32)
print(bog.shape)
#Compute cosine similarity
bog=b.get_bag_of_cliques(listgs).astype(np.float32)
cdef int n=bog.shape[0]
cdef np.ndarray scores = np.zeros((n,n))
cdef int i
......@@ -37,10 +45,18 @@ cdef class BagOfCliques(Base):
scores[j,i]=scores[i,j]
return scores
def getUniqueCliques(self,graphs):
def get_unique_cliques(self, graphs):
"""
Return unique cliques from a population of graphs
:return:
Return the unique cliques found in a set of graphs
Parameters
----------
graphs: networkx.Graph array
list of graphs
Returns
-------
list
Cliques set
"""
t = {}
c_ = 0
......@@ -53,13 +69,8 @@ cdef class BagOfCliques(Base):
km+=1
if not g:
continue
# sys.stdout.write("\r{0}/{1} -- {2}".format(km,len_graphs,len(g)))
cliques = list(nx.find_cliques(nx.Graph(g)))
#no clique found
#print(nx.Graph(g).edges())
#cliques =[]
for clique in cliques:
cli_temp = copy.deepcopy(clique)
new_clique = False
for i in range(len(clique)):
......@@ -84,12 +95,36 @@ cdef class BagOfCliques(Base):
def clique2str(self,cliques):
    """
    Return a "hash" string of a clique

    Every member of the clique is converted to a string, the strings are
    sorted and then concatenated, so the result does not depend on the
    order in which the members are listed.

    Parameters
    ----------
    cliques: array
        members of a single clique (strings or any objects convertible with ``str``)

    Returns
    -------
    str
        hash of a clique
    """
    # Converting each member with str() up front covers string and non-string
    # members alike, so no bare `except:` fallback (which would also swallow
    # unrelated errors) is needed.
    return "".join(sorted(map(str, cliques)))
def transform_clique_vocab(self,clique_vocab):
"""
Transform cliques found in `get_unique_cliques()` in a proper format to build the "Bag of Cliques"
Parameters
----------
clique_vocab : array
contains cliques
Returns
-------
dict
new clique vocab format
"""
cdef dict new_vocab={}
cdef int len_voc=len(clique_vocab)
for c in range(len_voc):
......@@ -97,32 +132,28 @@ cdef class BagOfCliques(Base):
new_vocab[self.clique2str(clique_vocab[c])]=c
return new_vocab
def ifHaveMinor(self,clique, dict mapping):
"""
If a clique (minor) H belong to a graph G
:param H:
:return:
def get_bag_of_cliques(self, graphs):
"""
if self.clique2str(clique) in mapping:
return 1
return 0
Return the Bag of Cliques representation of a graph set.
Parameters
----------
graphs : networkx.Graph array
list of graphs
def getBagOfCliques(self,graphs ):
Returns
-------
np.ndarray
bag of cliques
"""
:param clique_vocab:
:return:
"""
cdef list clique_vocab=self.getUniqueCliques(graphs)
cdef list clique_vocab=self.get_unique_cliques(graphs)
cdef dict map_str_cliques=self.transform_clique_vocab(clique_vocab)
cdef int l_v=len(clique_vocab)
boc = np.zeros((len(graphs), l_v))
cdef np.ndarray vector
cdef list cliques
cdef str hash
#print(1)
for g in range(len(graphs)):
#sys.stdout.write("\r{0}/{1}".format(g,len(graphs)))
gr = graphs[g]
......
......@@ -2,13 +2,32 @@
import sys, os, glob, json, re
import networkx as nx
"""
The reader submodule contains high-level functions to read and store graphs from various files.
"""
# Dispatch table: graph file format name -> networkx function that reads it.
methods_read_graph = dict(
    gexf=nx.read_gexf,
    gml=nx.read_gml,
    graphml=nx.read_graphml,
)
def extract_number(fn):
def extract_index(fn):
"""
Extract index from filename
Parameters
----------
fn : str
filename
Returns
-------
int
index
"""
try:
return int(re.findall("\d+",fn)[-1])
except:
......@@ -17,16 +36,33 @@ def extract_number(fn):
def import_dir(directory,format="gexf",numbered=True):
"""
Based on a given directory, import all graphs and store them in a list/array
Parameters
----------
directory : str
directory path where graphs are stored
format : str
graph file format
numbered
if graph filename are numbered
Returns
-------
array
graphs
"""
if not os.path.exists(directory):
raise FileNotFoundError
raise FileNotFoundError("{0} does not exists".format(directory))
if not format in methods_read_graph:
raise NotImplementedError("{0} is not implemented !".format(format))
# Retrieve filename
fns = glob.glob(os.path.join(directory, "*.{0}".format(format)))
graphs=[]
if numbered:
n=max([extract_number(fn) for fn in fns])
n=max([extract_index(fn) for fn in fns])
graphs= [nx.Graph()]*(n+1)
association_map, i = {}, 0
......@@ -36,7 +72,7 @@ def import_dir(directory,format="gexf",numbered=True):
association_map[fn]=i
i+=1
else:
graphs[extract_number(fn)]=methods_read_graph[format](fn)
graphs[extract_index(fn)]=methods_read_graph[format](fn)
if not numbered:
return association_map,graphs
return graphs
......@@ -26,7 +26,20 @@ cdef class MCS(Base):
comparison_matrix[j, i] = comparison_matrix[i, j]
return comparison_matrix
def s_mcs(self,g1, g2):
def s_mcs(self,G, H):
"""
Return the MCS measure value between two graphs
Parameters
----------
G : networkx.Graph
First Graph
H : networkx.Graph
Second Graph
return len(self.mcs(g1, g2)) / float(max(len(g1), len(g2)))
Returns
-------
"""
return len(self.mcs(G, H)) / float(max(len(G), len(H)))
......@@ -45,8 +45,8 @@ setup(
packages=["gmatch4py","gmatch4py.helpers"],
ext_modules=extensions,
cmdclass={'build_ext': build_ext},
setup_requires=["numpy","networkx","networkit","scipy"],
install_requires=["numpy","networkx","networkit","scipy"],
setup_requires=["numpy","networkx","scipy"],
install_requires=["numpy","networkx","scipy"],
version="0.1"
)
#Clean cpp and compiled file
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment