Commit f953a4b2 authored by Pokiros
Browse files

Add ged4py (GED Graph Edit Distance) + Gate NER integration + NER Evaluation Notebook

parent a84f34fc
No related merge requests found
Showing with 818 additions and 464 deletions
+818 -464
# -*- coding: UTF-8 -*-
from __future__ import print_function
from scipy.optimize import linear_sum_assignment
import sys
import numpy as np
class AbstractGraphEditDistance(object):
    """
    Base class for an assignment-based graph edit distance between
    two graphs g1 and g2.

    A cost matrix of node substitutions, insertions and deletions is
    built and solved with scipy's linear_sum_assignment. Subclasses
    supply the individual costs through insert_cost(), delete_cost()
    and substitute_cost().

    Both graphs must support len() and provide a nodes() method.
    """

    def __init__(self, g1, g2):
        self.g1 = g1
        self.g2 = g2

    def normalized_distance(self):
        """
        Returns the graph edit distance between graph g1 & g2
        The distance is normalized on the size of the two graphs.
        This is done to avoid favouring smaller graphs.
        Two empty graphs are at distance 0.
        """
        total_len = len(self.g1) + len(self.g2)
        if total_len == 0:
            # Nothing to edit; also avoids a division by zero below.
            return 0.0
        # 2.0 keeps the average a float under Python 2 as well.
        avg_graphlen = total_len / 2.0
        return self.distance() / avg_graphlen

    def distance(self):
        """
        Returns the sum of the costs selected by the optimal assignment.
        """
        return sum(self.edit_costs())

    def edit_costs(self):
        """
        Returns the list of cost matrix entries picked by the optimal
        row/column assignment.
        """
        cost_matrix = self.create_cost_matrix()
        row_ind, col_ind = linear_sum_assignment(cost_matrix)
        return [cost_matrix[row][col] for row, col in zip(row_ind, col_ind)]

    def create_cost_matrix(self):
        """
        Creates a |N+M| X |N+M| cost matrix between all nodes in
        graphs g1 and g2
        Each cost represents the cost of substituting,
        deleting or inserting a node
        With N = len(g1) and M = len(g2) the matrix is filled as:
        rows [0, N)   x cols [0, M)   : substitute costs
        rows [N, N+M) x cols [0, M)   : insert costs
        rows [0, N)   x cols [M, M+N) : delete costs
        The remaining region is left filled with zeros
        The matrix is also stored in self.cost_matrix.
        """
        n = len(self.g1)
        m = len(self.g2)
        cost_matrix = np.zeros((n + m, n + m))

        # Materialise the nodes so they can be indexed by position even
        # when nodes() returns a non-list container.
        nodes1 = list(self.g1.nodes())
        nodes2 = list(self.g2.nodes())

        for i in range(n):
            for j in range(m):
                cost_matrix[i, j] = self.substitute_cost(nodes1[i], nodes2[j])

        for i in range(m):
            for j in range(m):
                cost_matrix[i + n, j] = self.insert_cost(i, j, nodes2)

        for i in range(n):
            for j in range(n):
                cost_matrix[j, i + m] = self.delete_cost(i, j, nodes1)

        self.cost_matrix = cost_matrix
        return cost_matrix

    def insert_cost(self, i, j, nodes2=None):
        """
        Cost of the insertion cell (i, j); nodes2 is the node list of g2.
        Must be implemented by subclasses.
        """
        raise NotImplementedError

    def delete_cost(self, i, j, nodes1=None):
        """
        Cost of the deletion cell (i, j); nodes1 is the node list of g1.
        Must be implemented by subclasses.
        """
        raise NotImplementedError

    def substitute_cost(self, nodes1, nodes2):
        """
        Cost of substituting one node of g1 by one node of g2.
        Must be implemented by subclasses.
        """
        raise NotImplementedError

    def print_matrix(self):
        """
        Prints a freshly created cost matrix, one matrix row per line,
        with cells separated by tabs. Costs of at least sys.maxsize are
        shown as "inf".
        """
        print("cost matrix:")
        # sys.maxsize exists on Python 2.6+ and 3, unlike sys.maxint.
        forbidden = float(sys.maxsize)
        for row in self.create_cost_matrix():
            for cost in row:
                if float(cost) >= forbidden:
                    print("inf\t", end="")
                else:
                    print("%.2f\t" % float(cost), end="")
            print("")
from ged4py.algorithm.abstract_graph_edit_dist import AbstractGraphEditDistance
import sys
class EdgeEditDistance(AbstractGraphEditDistance):
    """
    Edit distance between the edge sets of two nodes.
    Here each "graph" stands for a single node, and the "nodes"
    being compared are that node's edges.
    """

    def __init__(self, g1, g2):
        super(EdgeEditDistance, self).__init__(g1, g2)

    def insert_cost(self, i, j, nodes2):
        # Only the diagonal of the insert region is a legal insertion.
        return 1 if i == j else sys.maxsize

    def delete_cost(self, i, j, nodes1):
        # Only the diagonal of the delete region is a legal deletion.
        return 1 if i == j else sys.maxsize

    def substitute_cost(self, edge1, edge2):
        # Equal edges are free to substitute, any other pair costs 1.
        return 0. if edge1 == edge2 else 1
# -*- coding: UTF-8 -*-
from __future__ import print_function
from ged4py.algorithm.abstract_graph_edit_dist import AbstractGraphEditDistance
from ged4py.algorithm.edge_edit_dist import EdgeEditDistance
from ged4py.graph.edge_graph import EdgeGraph
import sys
def compare(g1, g2, print_details=False):
    """
    Returns the normalized graph edit distance between g1 and g2.
    When print_details is true, the cost matrix is printed first.
    """
    edit_distance = GraphEditDistance(g1, g2)
    if not print_details:
        return edit_distance.normalized_distance()
    edit_distance.print_matrix()
    return edit_distance.normalized_distance()
class GraphEditDistance(AbstractGraphEditDistance):
    """
    Graph edit distance where substituting a node costs its relabel
    cost plus the normalized edit distance between the edge sets of
    the two nodes.
    """

    def __init__(self, g1, g2):
        AbstractGraphEditDistance.__init__(self, g1, g2)

    def substitute_cost(self, node1, node2):
        """
        Returns relabel_cost(node1, node2) + edge_diff(node1, node2).
        """
        return self.relabel_cost(node1, node2) + self.edge_diff(node1, node2)

    def relabel_cost(self, node1, node2):
        """
        Returns 0. when both nodes compare equal, 1. otherwise.
        """
        if node1 == node2:
            return 0.
        return 1.

    def delete_cost(self, i, j, nodes1):
        # Only the diagonal of the delete region is a legal deletion.
        if i == j:
            return 1
        return sys.maxsize

    def insert_cost(self, i, j, nodes2):
        # Only the diagonal of the insert region is a legal insertion.
        if i == j:
            return 1
        return sys.maxsize

    def pos_insdel_weight(self, node):
        """
        Returns a constant weight of 1 for any node.
        """
        return 1

    def edge_diff(self, node1, node2):
        """
        Returns the difference between the edges of node1 (in g1) and
        the edges of node2 (in g2).
        If either node has no edges the result is the larger edge
        count; otherwise it is the normalized EdgeEditDistance between
        the two edge lists.
        """
        edges1 = self._adjacent(self.g1, node1)
        edges2 = self._adjacent(self.g2, node2)
        if not edges1 or not edges2:
            return max(len(edges1), len(edges2))

        edge_edit_dist = EdgeEditDistance(EdgeGraph(node1, edges1),
                                          EdgeGraph(node2, edges2))
        return edge_edit_dist.normalized_distance()

    @staticmethod
    def _adjacent(graph, node):
        """
        Returns the keys of the adjacency mapping of node as a list.
        The mapping is read from graph.edge when the graph has that
        attribute, and from graph[node] otherwise.
        """
        # NOTE(review): `.edge` looks like the networkx 1.x adjacency
        # attribute, which newer networkx releases no longer provide;
        # graph[node] is assumed to be the equivalent lookup there.
        adjacency = getattr(graph, "edge", None)
        if adjacency is None:
            adjacency = graph
        return list(adjacency[node])
{
"id": "source1.txt-1",
"sentenceNumber": 1,
"length": 17,
"tokens": [{
"id": "1",
"lemma": "Haakon",
"deprel": "nsubj",
"word": "Haakon",
"rel": "4",
"pos": "NNP"
}, {
"id": "2",
"lemma": "be",
"deprel": "cop",
"word": "is",
"rel": "4",
"pos": "VBZ"
}, {
"id": "3",
"lemma": "my",
"deprel": "poss",
"word": "my",
"rel": "4",
"pos": "PRP$"
}, {
"id": "4",
"lemma": "name",
"deprel": "null",
"word": "name",
"rel": "0",
"pos": "NN"
}],
"filename": "source1.txt",
"offset": 0
}
# -*- coding: UTF-8 -*-
class EdgeGraph(object):
    """
    Minimal graph-like wrapper around the edges of a single node.

    init_node is the node the edges belong to; nodes is the list of
    its edges, which nodes() hands back unchanged.
    """

    def __init__(self, init_node, nodes):
        self.init_node = init_node
        self.nodes_ = nodes

    def nodes(self):
        """Returns the wrapped edge list."""
        return self.nodes_

    def size(self):
        """Returns the number of wrapped edges (same as len(self))."""
        # Must read the stored list: self.nodes is the bound method.
        return len(self.nodes_)

    def __len__(self):
        return len(self.nodes_)
# coding = utf-8
from ner.ner import *
import requests
from polyglot.text import Text,Word
class GateAnnie(NER):
    """
    NER backend that sends text to an HTTP service and keeps only the
    location ("LOC") annotations it returns.

    The text is POSTed to ``<host>/ner``. The response is read as
    newline-separated rows of tab-separated columns, where column 0 is
    the annotated text and column 1 its label.
    """

    def __init__(self, lang, host="http://localhost:4035"):
        NER.__init__(self, lang)
        self.host = host

    def identify(self, input):
        """
        Tags the locations found in `input`.

        Returns an empty list for empty input, otherwise the result of
        parse_output() on the service response.
        """
        if not input:
            return []
        # NOTE(review): no timeout or HTTP status check is applied here.
        response = requests.post(self.host + "/ner", data=input.encode("utf-8")).content
        rows = [line.split("\t") for line in response.decode("utf-8").split("\n")]
        return self.parse_output(input, rows)

    def parse_output(self, input, output):
        """
        Projects the "LOC" rows of `output` onto the POS-tagged tokens
        of `input`.

        Returns a list of [word, tag] pairs in token order. A token
        matching a single-word location is tagged "LOC"; a run of
        tokens matching a multi-word location is tagged "BEG-LOC",
        "LOC"..., "END-LOC"; every other token keeps the tag given by
        Polyglot's pos_tags.
        """
        # Only keep the LOC(ation) rows.
        locations = self._extract_locations(output)

        # Get Polyglot's POS tagging.
        tokens = Text(input).pos_tags

        # Then build our output.
        tagged = []
        position = 0
        while position < len(tokens):
            token = tokens[position]
            consumed = 0
            for location in locations:
                words = location[0]
                if isinstance(words, list) and len(words) > 1:
                    span = self._match_span(tokens, position, words)
                    if span:
                        tagged.extend(span)
                        consumed = len(span)
                        break
                elif token[0] == words:
                    tagged.append([token[0], "LOC"])
                    consumed = 1
                    break
            if not consumed:
                # No location starts here: keep the original (word, tag).
                tagged.append(list(token))
                consumed = 1
            position += consumed
        return tagged

    @staticmethod
    def _extract_locations(output):
        """
        Returns [words, remaining_columns] for every row labelled "LOC".

        `words` is a plain string for a single word and a list of
        strings for several words.
        """
        locations = []
        for row in output:
            # Rows without a label column (e.g. the blank row produced by
            # a trailing newline) cannot be locations.
            if len(row) < 2 or row[1] != "LOC":
                continue
            # NOTE(review): only the text before the first "-" is kept.
            words = row[0].split("-")[0].split(" ")
            if len(words) < 2:
                words = words[0]
            locations.append([words, row[1:]])
        return locations

    @staticmethod
    def _match_span(tokens, start, words):
        """
        Returns the tagged tokens when `words` matches the tokens
        beginning at `start`, or an empty list when it does not.
        """
        # A location that would run past the last token cannot match.
        if start + len(words) > len(tokens):
            return []
        last = len(words) - 1
        span = []
        for offset, word in enumerate(words):
            text = tokens[start + offset][0]
            if text == word:
                if offset == 0:
                    tag = "BEG-LOC"
                elif offset == last:
                    tag = "END-LOC"
                else:
                    tag = "LOC"
                span.append([text, tag])
            else:
                return []
        return span
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment