Commit 0cd549d9 authored by Fize Jacques's avatar Fize Jacques
Browse files

- Cythonize Gmatch4py

- Debug disambiguation

- Debug Spacy NER API and StanfordNER API

- Add Notebooks for Evaluations
parent 8439cf8b
No related merge requests found
Showing with 421 additions and 40 deletions
+421 -40
......@@ -81,10 +81,7 @@ with ProgressBar(max_value=len(files_glob),widgets=[' [', Timer(), '] ',Bar(),'(
id_=int(re.findall("\d+", fn)[-1])
df=pd.read_csv(fn)
try:
df=df[(df["GID"]!='O') & (df.GID.notnull())]
except:
df = df[(df.GID.notnull())]
df = df[-df["GID"].isin(['0', 'o', 'NR', 'O'])]
try:
count_per_doc[id_]=json.loads(df.groupby("GID").GID.count().to_json())
associated_es[id_] = df[["GID","text"]].groupby("GID",as_index=False).max().set_index('GID').to_dict()["text"]
......@@ -98,22 +95,29 @@ all_es=set([])
for k,v in associated_es.items():
for k2 in v:
all_es.add(k2)
logging.info("Get All Shapes from Database for all ES")
all_shapes=get_all_shapes(list(all_es))
#print(all_shapes.keys())
i=0
def foo_(x):
    """
    Return the English ("en") entry of the record ``get_data`` yields for *x*.

    This is a best-effort lookup: when it fails, the offending identifier is
    printed and ``None`` is returned instead of propagating the error.

    :param x: identifier handed to ``get_data``
    :return: ``get_data(x)["en"]``, or ``None`` when the lookup raises
    """
    try:
        return get_data(x)["en"]
    except Exception:
        # Previously a bare ``except:``, which also swallowed KeyboardInterrupt
        # and SystemExit and made the run impossible to interrupt cleanly.
        print(x)
        return None
with ProgressBar(max_value=len(files_glob),
widgets=[' [', Timer(), '] ', Bar(), '(', Counter(), ')', '(', ETA(), ')']) as pg:
for fn in files_glob:
id_ = int(re.findall("\d+", fn)[-1])
df = pd.read_csv(fn)
try:
df = df[(df["GID"] != 'O') & (df.GID.notnull())]
except:
df = df[(df.GID.notnull())]
df["label"]=df.GID.apply(lambda x:get_data(x)["en"])
# try:
df= df[-df["GID"].isin(['0','o','NR','O'])]
#print(df)
# except:
# df = df[(df.GID.notnull())]
# print("BUG",df)
df["label"]=df.GID.apply(foo_)
df = df.rename(columns={"GID": "id"})
str_=STR.from_pandas(df,[],all_shapes).build()
nx.write_gexf(str_, args.graphs_output_dir + "/{0}.gexf".format(id_))
......
File moved
# coding = utf-8
import networkx as nx
import graph_tool as gt
def get_prop_type(value, key=None):
    """
    Performs typing and value conversion for the graph_tool PropertyMap class.

    :param value: attribute value to classify
    :param key: optional attribute name; a non-string key is converted with
        ``str`` so it can be used as a property name, ``None`` is kept as is
    :return: a tuple ``(type name, converted value, key)`` where the type name
        is one of ``'bool'``, ``'float'``, ``'string'`` or ``'object'``
    """
    # Deal with the key: the docstring always promised a usable key, but the
    # key used to be returned untouched.
    if key is not None and not isinstance(key, str):
        key = str(key)

    # Deal with the value
    if isinstance(value, bool):
        # Must be tested before int: bool is a subclass of int.
        tname = 'bool'
    elif isinstance(value, int):
        # Integers are stored as floats.
        tname = 'float'
        value = float(value)
    elif isinstance(value, float):
        tname = 'float'
    elif isinstance(value, dict):
        tname = 'object'
    else:
        # Strings and every other type fall back to their string form.
        tname = 'string'
        value = str(value)

    return tname, value, key
def nx2gt(nxG):
    """
    Converts a networkx graph to a graph-tool graph.

    Graph, node and edge attributes are copied into graph-tool internal
    property maps. The value type of each map is the one ``get_prop_type``
    reports for the first value seen under that key. Every vertex also gets a
    string property ``'id'`` holding ``str(node)``, which replaces any node
    attribute that is itself called ``'id'``.

    :param nxG: networkx graph to convert; it is not modified
    :return: the newly built graph-tool graph
    """
    # Phase 0: Create a directed or undirected graph-tool Graph
    gtG = gt.Graph(directed=nxG.is_directed())

    # Add the Graph properties as "internal properties"
    for key, value in nxG.graph.items():
        # Convert the value and key into a type for graph-tool
        tname, value, key = get_prop_type(value, key)
        gtG.graph_properties[key] = gtG.new_graph_property(tname)  # Set the PropertyMap
        gtG.graph_properties[key] = value  # Set the actual value

    # Phase 1: Add the vertex and edge property maps.
    # ``nodes(data=True)`` / ``edges(data=True)`` are used instead of
    # ``nodes_iter`` / ``edges_iter``: the ``*_iter`` methods were removed in
    # NetworkX 2.0, while these calls iterate the same tuples on 1.x and 2.x.
    nprops = set()  # cache keys to only add properties once
    for node, data in nxG.nodes(data=True):
        for key, val in data.items():
            tname, _, key = get_prop_type(val, key)
            if key in nprops:
                continue  # Skip properties already added
            gtG.vertex_properties[key] = gtG.new_vertex_property(tname)
            nprops.add(key)

    # In NetworkX a node can be any hashable type, but in graph-tool nodes are
    # indices, so the original node is captured as a string in the 'id' map.
    gtG.vertex_properties['id'] = gtG.new_vertex_property('string')

    eprops = set()  # cache keys to only add properties once
    for src, dst, data in nxG.edges(data=True):
        for key, val in data.items():
            tname, _, key = get_prop_type(val, key)
            if key in eprops:
                continue  # Skip properties already added
            gtG.edge_properties[key] = gtG.new_edge_property(tname)
            eprops.add(key)

    # Phase 2: Actually add all the nodes and edges with their properties
    vertices = {}  # vertex mapping for tracking edges later
    for node, data in nxG.nodes(data=True):
        # Create the vertex and remember it for the edges below
        v = gtG.add_vertex()
        vertices[node] = v

        for key, value in data.items():
            # Use the same key the property map was registered under.
            _, _, key = get_prop_type(value, key)
            if key == 'id':
                continue  # always replaced by str(node) just below
            gtG.vp[key][v] = value  # vp is short for vertex_properties
        # Write the id straight into the map. The previous code stored it in
        # ``data['id']`` first, which silently added an attribute to every
        # node of the caller's graph.
        gtG.vp['id'][v] = str(node)

    for src, dst, data in nxG.edges(data=True):
        # Look up the vertex structs from our vertices mapping and add edge.
        e = gtG.add_edge(vertices[src], vertices[dst])
        for key, value in data.items():
            _, _, key = get_prop_type(value, key)
            gtG.ep[key][e] = value  # ep is short for edge_properties

    # Done, finally!
    return gtG
if __name__ == '__main__':
    # Small demo: build a networkx graph, convert it and list the properties
    # that ended up in the graph-tool graph.

    # Create the networkx graph
    nxG = nx.Graph(name="Undirected Graph")
    nxG.add_node("v1", name="alpha", color="red")
    nxG.add_node("v2", name="bravo", color="blue")
    nxG.add_node("v3", name="charlie", color="blue")
    nxG.add_node("v4", name="hub", color="purple")
    nxG.add_node("v5", name="delta", color="red")
    nxG.add_node("v6", name="echo", color="red")

    nxG.add_edge("v1", "v2", weight=0.5, label="follows")
    nxG.add_edge("v1", "v3", weight=0.25, label="follows")
    nxG.add_edge("v2", "v4", weight=0.05, label="follows")
    nxG.add_edge("v3", "v4", weight=0.35, label="follows")
    nxG.add_edge("v5", "v4", weight=0.65, label="follows")
    nxG.add_edge("v6", "v4", weight=0.53, label="follows")
    nxG.add_edge("v5", "v6", weight=0.21, label="follows")

    # ``edges_iter`` was removed in NetworkX 2.0; ``edges(data=True)`` iterates
    # the same (source, target, attributes) triples on 1.x and 2.x.
    for item in nxG.edges(data=True):
        print(item)

    # Convert to graph-tool graph
    gtG = nx2gt(nxG)
    gtG.list_properties()
\ No newline at end of file
......@@ -44,7 +44,7 @@ class Jaccard():
def union_nodes(g1, g2):
union=set([])
for n in g1.nodes():union.add(n)
for n in g2.nodes(): union.add(n)
for n in g2.nodes():union.add(n)
return union
@staticmethod
......
__version__ = "0.1"
# coding = utf-8
\ No newline at end of file
......@@ -5,7 +5,7 @@ from typing import Sequence
import networkx as nx
import numpy as np
cimport numpy as np
class BagOfCliques():
......@@ -14,7 +14,7 @@ class BagOfCliques():
b=BagOfCliques()
bog=b.getBagOfCliques(graphs)
#Compute cosine similarity
scores=np.dot(bog,bog.T)
cdef np.ndarray scores=np.dot(bog,bog.T)
for i in range(len(scores)):
for j in range(len(scores)):
scores[i,j]/=(np.sqrt(np.sum(bog[i]**2))*np.sqrt(np.sum(bog[j]**2))) # Can be computed in one line
......@@ -27,9 +27,11 @@ class BagOfCliques():
"""
tree = {}
c_ = 0
clique_vocab = []
cdef list clique_vocab = []
cdef list cli_temp
cdef list cliques
for g in graphs:
cliques = list(nx.algorithms.clique.find_cliques(nx.Graph(g)))
cliques = list(nx.find_cliques(nx.Graph(g)))
for clique in cliques:
t = tree
cli_temp = copy.deepcopy(clique)
......@@ -55,7 +57,7 @@ class BagOfCliques():
return clique_vocab
def ifHaveMinor(self,G: nx.Graph, H: list):
def ifHaveMinor(self,G, list H):
"""
If a clique (minor) H belong to a graph G
:param H:
......@@ -66,16 +68,18 @@ class BagOfCliques():
return 0
def getBagOfCliques(self,graphs : Sequence[nx.Graph]):
def getBagOfCliques(self,graphs ):
"""
:param clique_vocab:
:return:
"""
clique_vocab=self.getUniqueCliques(graphs)
cdef list clique_vocab=self.getUniqueCliques(graphs)
cdef int l_v=len(clique_vocab)
cdef np.ndarray boc = np.zeros((len(graphs), l_v))
cdef np.ndarray vector
l_v=len(clique_vocab)
boc = np.zeros((len(graphs), l_v))
for g in range(len(graphs)):
gr = graphs[g]
vector = np.zeros(l_v)
......
File moved
# coding = utf-8
\ No newline at end of file
# coding = utf-8
......@@ -5,9 +5,12 @@ import sys
import numpy as np
from scipy.optimize import linear_sum_assignment
cimport numpy as np
class AbstractGraphEditDistance(object):
def __init__(self, g1, g2,debug=False,**kwargs):
self.g1 = g1
self.g2 = g2
......@@ -26,12 +29,14 @@ class AbstractGraphEditDistance(object):
return sum(opt_path)
def print_operations(self,cost_matrix,row_ind,col_ind):
nodes1 = self.g1.nodes()
nodes2 = self.g2.nodes()
cdef list nodes1 = self.g1.nodes()
cdef list nodes2 = self.g2.nodes()
dn1 = self.g1.node
dn2 = self.g2.node
n,m=len(nodes1),len(nodes2)
cdef int n=len(nodes1)
cdef int m=len(nodes2)
cdef int x,y,i
for i in range(len(row_ind)):
y,x=row_ind[i],col_ind[i]
val=cost_matrix[row_ind[i]][col_ind[i]]
......@@ -43,7 +48,7 @@ class AbstractGraphEditDistance(object):
print("DEL {0} cost = {1}".format(dn1[nodes1[m-x]]["label"],val))
def edit_costs(self):
cost_matrix = self.create_cost_matrix()
cdef np.ndarray cost_matrix = self.create_cost_matrix()
if self.debug:
np.set_printoptions(precision=3)
print("Cost Matrix for ",str(self.__class__.__name__),"\n",cost_matrix)
......@@ -51,7 +56,8 @@ class AbstractGraphEditDistance(object):
row_ind,col_ind = linear_sum_assignment(cost_matrix)
if self.debug:
self.print_operations(cost_matrix,row_ind,col_ind)
return [cost_matrix[row_ind[i]][col_ind[i]] for i in range(len(row_ind))]
cdef int f=len(row_ind)
return [cost_matrix[row_ind[i]][col_ind[i]] for i in range(f)]
def create_cost_matrix(self):
"""
......@@ -67,13 +73,13 @@ class AbstractGraphEditDistance(object):
The delete -> delete region is filled with zeros
"""
n = len(self.g1)
m = len(self.g2)
cost_matrix = np.zeros((n+m,n+m))
cdef int n = len(self.g1)
cdef int m = len(self.g2)
cdef np.ndarray cost_matrix = np.zeros((n+m,n+m))
#cost_matrix = [[0 for i in range(n + m)] for j in range(n + m)]
nodes1 = self.g1.nodes()
nodes2 = self.g2.nodes()
cdef list nodes1 = self.g1.nodes()
cdef list nodes2 = self.g2.nodes()
cdef int i,j
for i in range(n):
for j in range(m):
cost_matrix[i,j] = self.substitute_cost(nodes1[i], nodes2[j])
......@@ -89,10 +95,10 @@ class AbstractGraphEditDistance(object):
self.cost_matrix = cost_matrix
return cost_matrix
def insert_cost(self, i, j):
def insert_cost(self, int i, int j):
raise NotImplementedError
def delete_cost(self, i, j):
def delete_cost(self, int i, int j):
raise NotImplementedError
def substitute_cost(self, nodes1, nodes2):
......
......@@ -13,12 +13,12 @@ class EdgeEditDistance(AbstractGraphEditDistance):
def __init__(self, g1, g2,**kwargs):
AbstractGraphEditDistance.__init__(self, g1, g2,**kwargs)
def insert_cost(self, i, j, nodes2):
def insert_cost(self, int i, int j, nodes2):
if i == j:
return self.edge_ins
return sys.maxsize
def delete_cost(self, i, j, nodes1):
def delete_cost(self, int i, int j, nodes1):
if i == j:
return self.edge_del
return sys.maxsize
......
......@@ -30,12 +30,12 @@ class GraphEditDistance(AbstractGraphEditDistance):
else:
return self.node_ins+self.node_del
def delete_cost(self, i, j, nodes1):
def delete_cost(self, int i, int j, nodes1):
if i == j:
return self.node_del+self.g1.degree(nodes1[i]) # Deleting a node implicate to delete in and out edges
return sys.maxsize
def insert_cost(self, i, j, nodes2):
def insert_cost(self, int i, int j, nodes2):
if i == j:
deg=self.g2.degree(nodes2[j])
if isinstance(deg,dict):deg=0
......@@ -44,7 +44,7 @@ class GraphEditDistance(AbstractGraphEditDistance):
return sys.maxsize
def get_edge_multigraph(self,g,node):
edges=[]
cdef list edges=[]
for id_,val in g.edge[node].items():
if not 0 in val:
edges.append(str(id_) + val["color"])
......@@ -54,6 +54,7 @@ class GraphEditDistance(AbstractGraphEditDistance):
return edges
def edge_diff(self, node1, node2):
cdef list edges1,edges2
if isinstance(self.g1,nx.MultiDiGraph):
edges1 = self.get_edge_multigraph(self.g1,node1)
edges2 = self.get_edge_multigraph(self.g2,node2)
......
# coding = utf-8
import numpy as np
from .algorithm.graph_edit_dist import GraphEditDistance
class ApproximateGraphEditDistance():
    """Pairwise comparison of graphs using ``GraphEditDistance``."""

    __type__ = "dist"

    @staticmethod
    def compare(listgs,c_del_node=1,c_del_edge=1,c_ins_node=1,c_ins_edge=1):
        """
        Build the square matrix of ``GraphEditDistance`` values between all
        graphs of *listgs*.

        :param listgs: sequence of graphs to compare
        :param c_del_node: node deletion cost
        :param c_del_edge: edge deletion cost
        :param c_ins_node: node insertion cost
        :param c_ins_edge: edge insertion cost
        :return: ``len(listgs) x len(listgs)`` numpy array of distances
        """
        size = len(listgs)
        comparison_matrix = np.zeros((size, size))
        costs = dict(node_del=c_del_node, node_ins=c_ins_node,
                     edge_del=c_del_edge, edge_ins=c_ins_edge)
        for row in range(size):
            for col in range(row, size):
                dist = GraphEditDistance(listgs[row], listgs[col], False, **costs).distance()
                comparison_matrix[row, col] = dist
                # NOTE: only the upper triangle is computed and then mirrored,
                # although this approximate distance is not a symmetric measure.
                comparison_matrix[col, row] = dist
        return comparison_matrix
\ No newline at end of file
# coding = utf-8
import numpy as np
cimport numpy as np
cdef class BP_2():
    """
    Graph distance built on a greedy node assignment (``psi``) whose node costs
    come from ``fuv`` and whose edge costs come from ``gpq``.
    """

    __type__="dist"

    # Edit-operation costs, stored as C integers.
    cdef int node_del
    cdef int node_ins
    cdef int edge_del
    cdef int edge_ins

    @staticmethod
    def compare(listgs, c_del_node=1, c_del_edge=1, c_ins_node=1, c_ins_edge=1):
        """
        Build the square matrix of ``bp2`` distances between all graphs.

        Only the upper triangle is computed; it is mirrored into the lower one.

        :param listgs: sequence of graphs to compare
        :param c_del_node: node deletion cost
        :param c_del_edge: edge deletion cost
        :param c_ins_node: node insertion cost
        :param c_ins_edge: edge insertion cost
        :return: ``len(listgs) x len(listgs)`` numpy array of distances
        """
        cdef int n = len(listgs)
        cdef int i, j
        comparator = BP_2(c_del_node, c_ins_node, c_del_edge, c_ins_edge)
        cdef np.ndarray comparison_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i, n):
                comparison_matrix[i, j] = comparator.bp2(listgs[i], listgs[j])
                comparison_matrix[j, i] = comparison_matrix[i, j]
        return comparison_matrix

    def __init__(self, node_del=1, node_ins=1, edge_del=1, edge_ins=1):
        """Constructor for BP_2: stores the four edit-operation costs."""
        self.node_del = node_del
        self.node_ins = node_ins
        self.edge_del = edge_del
        self.edge_ins = edge_ins

    def bp2(self, g1, g2):
        """
        Compute the distance between two graphs.

        :param g1: first graph
        :param g2: second graph
        :return: the smaller of the summed ``psi`` costs taken in both
            directions (g1 -> g2 and g2 -> g1)
        """
        return min(self.distance(self.psi(g1,g2)),self.distance(self.psi(g2,g1)))

    def distance(self,e):
        """Sum the list of assignment costs *e*."""
        return np.sum(e)

    def psi(self,g1,g2):
        """
        Greedily assign every node of *g1* to a still-unmatched node of *g2*
        (or to nothing) and collect the cost of each decision.

        :param g1: first graph
        :param g2: second graph
        :return: list of costs: one per node of *g1*, followed by one insertion
            cost per node of *g2* that stayed unmatched
        """
        cdef list psi_=[]
        # list(...) keeps the typed assignment valid when ``nodes()`` returns
        # a view rather than a list (NetworkX >= 2).
        cdef list nodes1 = list(g1.nodes())
        cdef list nodes2 = list(g2.nodes())
        for u in nodes1:
            v=None
            # Cost of leaving u unmatched; also the bar any match has to beat.
            del_cost = self.fuv(g1,g2,u,None)
            best = del_cost
            for w in nodes2:
                sub_cost = self.fuv(g1,g2,u,w)
                if 2*sub_cost < del_cost + self.fuv(g1,g2,None,w) and sub_cost < best:
                    v=w
                    best = sub_cost
            psi_.append(best)
            # ``is not None`` instead of truthiness: a falsy node id such as 0
            # used to stay in the candidate list after being matched.
            # ``remove`` keeps the candidate order, so tie-breaking no longer
            # depends on set ordering.
            if v is not None:
                nodes2.remove(v)
        for v in nodes2:
            psi_.append(self.fuv(g1,g2,None,v))
        return psi_

    def fuv(self, g1, g2, n1, n2):
        """
        Compute the Node Distance function

        :param g1: first graph
        :param g2: second graph
        :param n1: node of the first graph (``None`` for an insertion)
        :param n2: node of the second graph (``None`` for a deletion)
        :return: cost of deleting *n1*, inserting *n2* or substituting one for
            the other
        """
        # The costs are C ints: dividing by 2.0 guarantees a true division
        # whatever division semantics the module is compiled with.
        if n2 is None: # Del
            return self.node_del + ((self.edge_del / 2.0) * g1.degree(n1))
        if n1 is None: # Insert
            return self.node_ins + ((self.edge_ins / 2.0) * g2.degree(n2))
        if n1 == n2:
            return 0.
        return (self.node_del + self.node_ins + self.hed_edge(g1, g2, n1, n2)) / 2.0

    def hed_edge(self, g1, g2, n1, n2):
        """
        Compute the edge distance between the edges of n1 and n2, respectively in g1 and g2

        :param g1: first graph
        :param g2: second graph
        :param n1: node of the first graph
        :param n2: node of the second graph
        :return: twice ``sum_gpq(g1, n1, g2, n2)``
        """
        # NOTE(review): both terms are the same call. A two-sided (Hausdorff)
        # edge distance would evaluate the second term from g2's side, with
        # insertion costs - confirm the intent before changing the results.
        return self.sum_gpq(g1, n1, g2, n2) + self.sum_gpq(g1, n1, g2, n2)

    def get_edge_multigraph(self, g, node):
        """
        Get list of edge around a node in a Multigraph

        :param g: multigraph whose edges carry a "color" attribute
        :param node: node in the multigraph
        :return: list of "source-target-color" strings, one per incident edge
        """
        edges = []
        for edge in g.edges(data=True):
            if node == edge[0] or node == edge[1]:
                edges.append("{0}-{1}-{2}".format(edge[0],edge[1],edge[2]["color"]))
        return edges

    def sum_gpq(self, g1, n1, g2, n2):
        """
        Compute Nearest Neighbour Distance between edges around n1 in G1 and edges around n2 in G2

        :param g1: first graph
        :param n1: node in the first graph
        :param g2: second graph
        :param n2: node in the second graph
        :return: sum, over the edges around n1, of the cheapest ``gpq`` cost
            against the edges around n2 or against "no edge"
        """
        cdef list edges1 = self.get_edge_multigraph(g1, n1)
        cdef list edges2 = self.get_edge_multigraph(g2, n2)
        # ``None`` stands for "no counterpart", i.e. deleting the edge.
        edges2.append(None)
        cdef np.ndarray min_sum = np.zeros(len(edges1))
        for i in range(len(edges1)):
            min_i = np.zeros(len(edges2))
            for j in range(len(edges2)):
                min_i[j] = self.gpq(edges1[i], edges2[j])
            min_sum[i] = np.min(min_i)
        return np.sum(min_sum)

    def gpq(self, e1, e2):
        """
        Compute the edge distance function

        :param e1: edge1 (``None`` for an insertion)
        :param e2: edge2 (``None`` for a deletion)
        :return: deletion cost, insertion cost, 0 for identical edges, or the
            mean of deletion and insertion cost otherwise
        """
        if e2 is None: # Del
            return self.edge_del
        if e1 is None: # Insert
            return self.edge_ins
        if e1 == e2:
            return 0.
        # 2.0 forces a true division on the C int costs.
        return (self.edge_del + self.edge_ins) / 2.0
# coding = utf-8
\ No newline at end of file
# coding = utf-8
import numpy as np
from .algorithm.graph_edit_dist import GraphEditDistance
cimport numpy as np
class GreedyEditDistance(GraphEditDistance):
    """
    Implementation of the Greedy Edit Distance presented in :

    Improved quadratic time approximation of graph edit distance by Hausdorff matching and greedy assignement
    Andreas Fischer, Kaspar Riesen, Horst Bunke
    2016
    """

    __type__ = "dist"

    @staticmethod
    def compare(listgs, c_del_node=1, c_del_edge=1, c_ins_node=1, c_ins_edge=1):
        """
        Build the square matrix of greedy edit distances between all graphs.

        Only the upper triangle is computed; it is mirrored into the lower one.

        :param listgs: sequence of graphs to compare
        :param c_del_node: node deletion cost
        :param c_del_edge: edge deletion cost
        :param c_ins_node: node insertion cost
        :param c_ins_edge: edge insertion cost
        :return: ``len(listgs) x len(listgs)`` numpy array of distances
        """
        cdef int n = len(listgs)
        cdef int i, j
        cdef np.ndarray comparison_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i, n):
                comparison_matrix[i, j] = GreedyEditDistance(listgs[i], listgs[j],False, node_del=c_del_node,
                                                             node_ins=c_ins_node, edge_del=c_del_edge,
                                                             edge_ins=c_ins_edge).distance()
                comparison_matrix[j, i] = comparison_matrix[i, j]
        return comparison_matrix

    def __init__(self,g1,g2,debug=False,**kwargs):
        """Constructor for GreedyEditDistance"""
        super().__init__(g1,g2,debug,**kwargs)

    def edit_costs(self):
        """
        Greedy assignment on the cost matrix: each row, in order, takes the
        cheapest column that no earlier row has taken.

        :return: list with the cost of the cell selected for each row
        """
        cdef np.ndarray cost_matrix=self.create_cost_matrix()
        # Original indices of the columns that are still unassigned. Tracking
        # them explicitly replaces the former "reduced index + row number"
        # compensation, which pointed at the wrong column whenever a column
        # removed earlier lay to the right of the one being picked.
        cdef list free_cols=list(range(cost_matrix.shape[1]))
        cdef list costs=[]
        cdef int i, k
        for i in range(len(cost_matrix)):
            if not free_cols:
                break  # nothing left to assign to the remaining rows
            # argmin returns the first minimum, so ties are still resolved in
            # favour of the left-most free column.
            k=int(np.argmin(cost_matrix[i, free_cols]))
            costs.append(cost_matrix[i, free_cols.pop(k)])
        return costs
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment