Add ged4py (GED Graph Edit Distance) + Gate NER integration + NER Evaluation Notebook

f953a4b2 · Pokiros · a84f34fc · f953a4b2 · f953a4b2 · f953a4b2
Commit f953a4b2 authored 7 years ago by Pokiros
Expand all Hide whitespace changes
Inline Side-by-side

Showing

with 818 additions and 464 deletions
+818 -464
--- a/ged4py/__init__.py
+++ b/ged4py/__init__.py
--- a/ged4py/algorithm/__init__.py
+++ b/ged4py/algorithm/__init__.py
--- a/ged4py/algorithm/abstract_graph_edit_dist.py
+++ b/ged4py/algorithm/abstract_graph_edit_dist.py
+# -*- coding: UTF-8 -*-
+from __future__ import print_function
+
+from scipy.optimize import linear_sum_assignment
+import sys
+import numpy as np
+
+
+class AbstractGraphEditDistance(object):
+    """
+    Base class for an assignment-based graph edit distance.
+
+    A square cost matrix is built from the substitution, insertion and
+    deletion costs supplied by subclasses, and the cheapest one-to-one
+    assignment of rows to columns is computed with
+    scipy.optimize.linear_sum_assignment.
+
+    g1 and g2 must support len() and expose a nodes() method.
+    """
+
+    def __init__(self, g1, g2):
+        self.g1 = g1
+        self.g2 = g2
+
+    def normalized_distance(self):
+        """
+        Returns the graph edit distance between graph g1 & g2,
+        divided by the average size of the two graphs.
+        This is done to avoid favouring smaller graphs.
+        Two empty graphs are at distance 0.
+        """
+        # 2.0 forces true division on Python 2 as well: this module only
+        # imports print_function from __future__, not division.
+        avg_graphlen = (len(self.g1) + len(self.g2)) / 2.0
+        if avg_graphlen == 0:
+            return 0.0
+        return self.distance() / avg_graphlen
+
+    def distance(self):
+        """Returns the sum of the costs selected by the optimal assignment."""
+        return sum(self.edit_costs())
+
+    def edit_costs(self):
+        """
+        Returns the list of matrix cells picked by the minimum-cost
+        assignment of rows to columns.
+        """
+        cost_matrix = self.create_cost_matrix()
+        row_ind, col_ind = linear_sum_assignment(cost_matrix)
+        return [cost_matrix[r, c] for r, c in zip(row_ind, col_ind)]
+
+    def create_cost_matrix(self):
+        """
+        Creates a |N+M| X |N+M| cost matrix between all nodes in
+        graphs g1 (N nodes) and g2 (M nodes).
+        Each cost represents the cost of substituting,
+        deleting or inserting a node.
+        The cost matrix consists of four regions:
+
+        substitute (N x M) | delete (N x N)
+        -------------------+---------------
+        insert     (M x M) | zeros  (M x N)
+
+        The matrix is also stored on self.cost_matrix.
+        """
+        n = len(self.g1)
+        m = len(self.g2)
+        cost_matrix = np.zeros((n + m, n + m))
+        # list() guarantees positional indexing whatever container nodes()
+        # returns.
+        nodes1 = list(self.g1.nodes())
+        nodes2 = list(self.g2.nodes())
+
+        # Top-left: substitute node i of g1 with node j of g2.
+        for i in range(n):
+            for j in range(m):
+                cost_matrix[i, j] = self.substitute_cost(nodes1[i], nodes2[j])
+
+        # Bottom-left: insertion costs.
+        for i in range(m):
+            for j in range(m):
+                cost_matrix[i + n, j] = self.insert_cost(i, j, nodes2)
+
+        # Top-right: deletion costs.
+        for i in range(n):
+            for j in range(n):
+                cost_matrix[j, i + m] = self.delete_cost(i, j, nodes1)
+
+        self.cost_matrix = cost_matrix
+        return cost_matrix
+
+    # The signatures below mirror the three-argument calls made by
+    # create_cost_matrix; the default keeps two-argument calls accepted.
+    def insert_cost(self, i, j, nodes2=None):
+        """Cost stored at row i, column j of the insertion region."""
+        raise NotImplementedError
+
+    def delete_cost(self, i, j, nodes1=None):
+        """Cost stored at row j, column i of the deletion region."""
+        raise NotImplementedError
+
+    def substitute_cost(self, nodes1, nodes2):
+        """Cost of substituting one node of g1 with one node of g2."""
+        raise NotImplementedError
+
+    def print_matrix(self):
+        """
+        Prints a freshly built cost matrix, one matrix row per output line.
+        Cells holding sys.maxsize or more are shown as "inf".
+        """
+        print("cost matrix:")
+        for row in self.create_cost_matrix():
+            for cell in row:
+                # sys.maxint does not exist on Python 3; >= also absorbs the
+                # rounding that happens when sys.maxsize is stored as a float.
+                if cell >= sys.maxsize:
+                    print("inf\t", end="")
+                else:
+                    print("%.2f\t" % float(cell), end="")
+            print("")
--- a/ged4py/algorithm/edge_edit_dist.py
+++ b/ged4py/algorithm/edge_edit_dist.py
+from ged4py.algorithm.abstract_graph_edit_dist import AbstractGraphEditDistance
+import sys
+
+
+class EdgeEditDistance(AbstractGraphEditDistance):
+    """
+    Edit distance specialised for comparing the edges of two nodes.
+    Here each node plays the role of a graph, and its edges play
+    the role of that graph's nodes.
+    """
+
+    def __init__(self, g1, g2):
+        AbstractGraphEditDistance.__init__(self, g1, g2)
+
+    def insert_cost(self, i, j, nodes2):
+        """Returns 1 on the diagonal (i == j), sys.maxsize elsewhere."""
+        return 1 if i == j else sys.maxsize
+
+    def delete_cost(self, i, j, nodes1):
+        """Returns 1 on the diagonal (i == j), sys.maxsize elsewhere."""
+        return 1 if i == j else sys.maxsize
+
+    def substitute_cost(self, edge1, edge2):
+        """Returns 0. for equal edges, 1 otherwise."""
+        return 0. if edge1 == edge2 else 1
--- a/ged4py/algorithm/graph_edit_dist.py
+++ b/ged4py/algorithm/graph_edit_dist.py
+# -*- coding: UTF-8 -*-
+from __future__ import print_function
+from ged4py.algorithm.abstract_graph_edit_dist import AbstractGraphEditDistance
+from ged4py.algorithm.edge_edit_dist import EdgeEditDistance
+from ged4py.graph.edge_graph import EdgeGraph
+import sys
+
+
+def compare(g1, g2, print_details=False):
+    """
+    Returns the normalized graph edit distance between g1 and g2.
+    When print_details is true, the cost matrix is printed first.
+    """
+    calculator = GraphEditDistance(g1, g2)
+    if print_details:
+        calculator.print_matrix()
+    return calculator.normalized_distance()
+
+
+class GraphEditDistance(AbstractGraphEditDistance):
+    """
+    Node-level graph edit distance.
+
+    Substituting a node costs its relabel cost plus the normalized edit
+    distance between the edge sets of the two nodes.
+    NOTE(review): g1 and g2 are assumed to be NetworkX-style graphs whose
+    g[node] yields a mapping keyed by neighbour -- confirm against callers.
+    """
+
+    def __init__(self, g1, g2):
+        AbstractGraphEditDistance.__init__(self, g1, g2)
+
+    def substitute_cost(self, node1, node2):
+        """Returns the relabel cost plus the edge difference of the nodes."""
+        return self.relabel_cost(node1, node2) + self.edge_diff(node1, node2)
+
+    def relabel_cost(self, node1, node2):
+        """Returns 0. for equal nodes, 1. otherwise."""
+        if node1 == node2:
+            return 0.
+        return 1.
+
+    def delete_cost(self, i, j, nodes1):
+        """Returns 1 on the diagonal (i == j), sys.maxsize elsewhere."""
+        if i == j:
+            return 1
+        return sys.maxsize
+
+    def insert_cost(self, i, j, nodes2):
+        """Returns 1 on the diagonal (i == j), sys.maxsize elsewhere."""
+        if i == j:
+            return 1
+        return sys.maxsize
+
+    def pos_insdel_weight(self, node):
+        """Returns a constant weight of 1 for every node."""
+        return 1
+
+    def edge_diff(self, node1, node2):
+        """
+        Returns the difference between the edges of node1 (in g1) and
+        node2 (in g2): the larger edge count when either node has no
+        edges, otherwise the normalized EdgeEditDistance between them.
+        """
+        # g[node] replaces g.edge[node]: the `edge` attribute exists only in
+        # NetworkX 1.x, while subscripting a graph returns the same
+        # adjacency mapping on both 1.x and 2.x.
+        edges1 = list(self.g1[node1].keys())
+        edges2 = list(self.g2[node2].keys())
+        if not edges1 or not edges2:
+            return max(len(edges1), len(edges2))
+
+        edge_edit_dist = EdgeEditDistance(EdgeGraph(node1, edges1), EdgeGraph(node2, edges2))
+        return edge_edit_dist.normalized_distance()
--- a/ged4py/data/source/source1.txt
+++ b/ged4py/data/source/source1.txt
+{
+  "id": "source1.txt-1",
+  "sentenceNumber": 1,
+  "length": 17,
+  "tokens": [{
+    "id": "1",
+    "lemma": "Haakon",
+    "deprel": "nsubj",
+    "word": "Haakon",
+    "rel": "4",
+    "pos": "NNP"
+  }, {
+    "id": "2",
+    "lemma": "be",
+    "deprel": "cop",
+    "word": "is",
+    "rel": "4",
+    "pos": "VBZ"
+  }, {
+    "id": "3",
+    "lemma": "my",
+    "deprel": "poss",
+    "word": "my",
+    "rel": "4",
+    "pos": "PRP$"
+  }, {
+    "id": "4",
+    "lemma": "name",
+    "deprel": "null",
+    "word": "name",
+    "rel": "0",
+    "pos": "NN"
+  }],
+  "filename": "source1.txt",
+  "offset": 0
+}
--- a/ged4py/graph/__init__.py
+++ b/ged4py/graph/__init__.py
--- a/ged4py/graph/edge_graph.py
+++ b/ged4py/graph/edge_graph.py
+# -*- coding: UTF-8 -*-
+
+
+class EdgeGraph():
+    """
+    Minimal graph-like wrapper around the edges of a single node.
+
+    init_node is the node the edges belong to; nodes is the collection
+    returned by nodes() and measured by len() and size().
+    """
+
+    def __init__(self, init_node, nodes):
+        self.init_node = init_node
+        self.nodes_ = nodes
+
+    def nodes(self):
+        """Returns the collection passed to the constructor as `nodes`."""
+        return self.nodes_
+
+    def size(self):
+        """Returns the number of stored elements (same value as len())."""
+        # Measure the stored collection, not the bound nodes() method,
+        # which has no len().
+        return len(self.nodes_)
+
+    def __len__(self):
+        return len(self.nodes_)
--- a/ner/gate_annie.py
+++ b/ner/gate_annie.py
 # coding = utf-8
+
+from ner.ner import *
+import requests
+from polyglot.text import Text,Word
+class GateAnnie(NER):
+    """
+    NER implementation that queries a GATE ANNIE service over HTTP.
+
+    Only entities labelled "LOC" in the service answer are kept; they are
+    then projected onto the Polyglot POS-tagged tokens of the input text.
+    """
+
+    def __init__(self,lang,host="http://localhost:4035"):
+        NER.__init__(self,lang)
+        self.host=host
+
+    def identify(self,input):
+        """
+        Sends `input` (UTF-8 encoded) to <host>/ner and returns the tagged
+        tokens produced by parse_output. Empty input returns [].
+
+        The answer is read as one entity per line, columns separated by
+        tabs.
+        """
+        if not input:
+            return []
+
+        response=requests.post(self.host+"/ner",data=input.encode("utf-8")).content
+        response=response.decode("utf-8").split("\n")
+        response=[r.split("\t") for r in response]
+
+        return self.parse_output(input,response)
+
+    def parse_output(self,input,output):
+        """
+        Merges the service rows in `output` with the Polyglot POS tags of
+        `input`.
+
+        Returns a list of [token, tag] pairs. A token matching a one-word
+        location is tagged "LOC"; the tokens of a multi-word location are
+        tagged "BEG-LOC", "LOC" ... "END-LOC"; every other token keeps the
+        entry produced by Polyglot.
+        """
+        # Keep only the "LOC"(ation) entities.
+        locations=[]
+        for row in output:
+            # Blank lines (e.g. after a trailing newline) give rows without
+            # a label column; skip them instead of raising IndexError.
+            if len(row) < 2 or row[1] != "LOC":
+                continue
+            # Only the text before the first "-" is used, split into words.
+            words=row[0].split("-")[0].split(" ")
+            if len(words) < 2:
+                w=words[0]
+            else:
+                w=words
+            locations.append([w,row[1:]])
+
+        # Get the POS tagging from Polyglot.
+        old=Text(input).pos_tags
+
+        # Then build the output.
+        new_=[]
+        p = 0
+        while p < len(old):
+            item = old[p]
+            flag = False
+            for l in locations:
+                if isinstance(l[0], list) and len(l[0]) > 1:
+                    if item[0] != l[0][0]:
+                        continue
+                    possibly = [[item[0], "BEG-LOC"]]
+                    j = 1
+                    while j < len(l[0]):
+                        # Running past the last token counts as a mismatch
+                        # (previously an IndexError).
+                        if p + j >= len(old) or old[p + j][0] != l[0][j]:
+                            possibly = []
+                            break
+                        if j + 1 == len(l[0]):
+                            possibly.append([old[p + j][0], "END-LOC"])
+                        else:
+                            possibly.append([old[p + j][0], "LOC"])
+                        j += 1
+                    if possibly:
+                        flag = True
+                        new_.extend(possibly)
+                        p += j
+                        break
+                elif item[0] == l[0]:
+                    flag = True
+                    new_.append([item[0], "LOC"])
+                    p += 1
+                    break
+            if not flag:
+                new_.append(list(item))
+                p += 1
+
+        return new_
+
+
--- a/notebooks/NER Evaluation.ipynb
+++ b/notebooks/NER Evaluation.ipynb