Commit 957a7e13 authored by Pokiros

Modify tt4py + add str with semantics (v1) + modify pipeline

parent 082aee05
Showing with 644 additions and 21 deletions
# coding = utf-8
"""Weisfeiler_Lehman graph kernel.
Python implementation based on: "Weisfeiler-Lehman Graph Kernels", by:
Nino Shervashidze, Pascal Schweitzer, Erik J. van Leeuwen, Kurt
Mehlhorn, Karsten M. Borgwardt, JMLR, 2012.
http://jmlr.csail.mit.edu/papers/v12/shervashidze11a.html
Author : Sandro Vega-Pons, Emanuele Olivetti
"""
import numpy as np
import networkx as nx
import copy
class GK_WL():
"""
Weisfeiler-Lehman graph kernel.
"""
def compare_list(self, graph_list, h=1, node_label=True):
"""Compute the all-pairs kernel values for a list of graphs.
This function can be used to directly compute the kernel
matrix for a list of graphs. The direct computation of the
kernel matrix is faster than the computation of all individual
pairwise kernel values.
Parameters
----------
graph_list: list
A list of graphs (list of networkx graphs)
h : integer
Number of iterations.
node_label : boolean
Whether to use original node labels. True to use the node labels
stored in the node attribute 'label'. False to use the degree
of each node as its label.
Returns
-------
K: numpy.array, shape = (len(graph_list), len(graph_list))
The similarity matrix of all graphs in graph_list.
"""
self.graphs = graph_list
n = len(graph_list)
lists = [0] * n
k = [0] * (h + 1)
n_nodes = 0
n_max = 0
# Compute adjacency lists and n_nodes, the total number of
# nodes in the dataset.
for i in range(n):
lists[i] = graph_list[i].adjacency_list()
n_nodes = n_nodes + len(graph_list[i])
# Computing the maximum number of nodes in the graphs. It
# will be used in the computation of vectorial
# representation.
if(n_max < len(graph_list[i])):
n_max = len(graph_list[i])
phi = np.zeros((n_max, n), dtype=np.uint64)
# INITIALIZATION: initialize the nodes labels for each graph
# with their labels or with degrees (for unlabeled graphs)
labels = [0] * n
label_lookup = {}
label_counter = 0
# label_lookup is an associative array, which will contain the
# mapping from multiset labels (strings) to short labels
# (integers)
if node_label is True:
for i in range(n):
l_aux = nx.get_node_attributes(graph_list[i],
'label').values()
l_aux = list(l_aux)
# It is assumed that the graph has a node attribute
# 'label'
labels[i] = np.zeros(len(l_aux), dtype=np.int32)
for j in range(len(l_aux)):
if not (l_aux[j] in label_lookup):
label_lookup[l_aux[j]] = label_counter
labels[i][j] = label_counter
label_counter += 1
else:
labels[i][j] = label_lookup[l_aux[j]]
# labels are associated to a natural number
# starting with 0.
phi[labels[i][j], i] += 1
else:
for i in range(n):
labels[i] = np.array(list(graph_list[i].degree().values()))
for j in range(len(labels[i])):
phi[labels[i][j], i] += 1
# Simplified vectorial representation of graphs (just taking
# the vectors before the kernel iterations), i.e., it is just
# the original nodes degree.
self.vectors = np.copy(phi.transpose())
k = np.dot(phi.transpose(), phi)
# MAIN LOOP
it = 0
new_labels = copy.deepcopy(labels)
while it < h:
# create an empty lookup table
label_lookup = {}
label_counter = 0
phi = np.zeros((n_nodes, n), dtype=np.uint64)
for i in range(n):
for v in range(len(lists[i])):
# form a multiset label of the node v of the i'th graph
# and convert it to a string
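# e.g. a node currently labelled 2 whose neighbours carry labels 3 and 0
# yields long_label = [2 0 3]; its string form "[2 0 3]" is the lookup key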
long_label = np.concatenate((np.array([labels[i][v]]),
np.sort(labels[i]
[lists[i][v]])))
long_label_string = str(long_label)
# if the multiset label has not yet occurred, add it to the
# lookup table and assign a number to it
if not (long_label_string in label_lookup):
label_lookup[long_label_string] = label_counter
new_labels[i][v] = label_counter
label_counter += 1
else:
new_labels[i][v] = label_lookup[long_label_string]
# fill the column for i'th graph in phi
aux = np.bincount(new_labels[i])
phi[new_labels[i], i] += aux[new_labels[i]]
k += np.dot(phi.transpose(), phi)
labels = copy.deepcopy(new_labels)
it = it + 1
# Compute the normalized version of the kernel
k_norm = np.zeros(k.shape)
for i in range(k.shape[0]):
for j in range(k.shape[1]):
k_norm[i, j] = k[i, j] / np.sqrt(k[i, i] * k[j, j])
return k_norm
def compare(self, g_1, g_2, h=1, node_label=True):
"""Compute the kernel value (similarity) between two graphs.
The kernel is normalized to [0,1] by the equation:
k_norm(g1, g2) = k(g1, g2) / sqrt(k(g1,g1) * k(g2,g2))
Parameters
----------
g_1 : networkx.Graph
First graph.
g_2 : networkx.Graph
Second graph.
h : integer
Number of iterations.
node_label : boolean
Whether to use the values stored in the node attribute 'label'
as node labels. If False, the degree of each node is used as
its label.
Returns
-------
k : The similarity value between g1 and g2.
"""
gl = [g_1, g_2]
return self.compare_list(gl, h, node_label)[0, 1]
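# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the commit): comparing two small
# labelled graphs with GK_WL. Assumes a networkx 1.x-style API, since
# compare_list relies on Graph.adjacency_list() and a dict-returning degree().
if __name__ == "__main__":
    g_a = nx.Graph([(0, 1), (1, 2)])
    g_b = nx.Graph([(0, 1), (0, 2), (1, 2)])
    for g in (g_a, g_b):
        # give every node the same label so the kernel mostly reflects structure
        nx.set_node_attributes(g, 'label', {node: 'x' for node in g.nodes()})
    wl = GK_WL()
    print(wl.compare(g_a, g_b, h=2, node_label=True))  # normalized similarity in [0, 1]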
@@ -180,13 +180,13 @@ if not os.path.exists(args.graphs_dir):
graphs={}
for t,text in texts.items():
if text:
-graphs[t]=pip.build(text).graph
+graphs[t]=pip.buildSemSTR(text,win_size=7).graph
else:
graphs[t]=nx.MultiDiGraph()
-os.mkdir(graphs_dir)
+os.mkdir(args.graphs_dir)
for t,g in graphs.items():
-nx.write_gexf(g,os.path.join(graphs_dir,"{0}.gexf".format(t)))
+print(t)
+nx.write_gexf(g,os.path.join(args.graphs_dir,"{0}.gexf".format(t)))
# LOAD graph data and associated spatial entities of each graph
assC=json.load(open("associated_and_count.json"))
......
# coding: utf-8
from ner.gate_annie import GateAnnie
from ner.nltk import NLTK
from pipeline import *
from pos_tagger.tagger import Tagger
# Disa
from disambiguator.pagerank import *
from disambiguator.geodict_gaurav import *
# Graph Edit Distance Algorithm Import
from ged4py.algorithm import graph_edit_dist as ged
from ged4py.geo_ged import GeoGED
from ged4py.geo_hed import GeoHED
from ged4py.hausdorff_edit_distance import HED
from ged4py.bipartite_graph_matching_2 import BP_2
from ged4py.greedy_edit_distance import GreedyEditDistance
from ged4py.geo_bp2 import GeoBP2
from ged4py.exception import NotFoundDistance
import numpy as np
import os, re, glob, json, argparse
from progressbar import ProgressBar,Timer,Bar,ETA
# Similarity Function between graph and a set of graphs
def compareGED(id_,graphs):
g=graphs[id_]
sc=np.zeros(len(graphs))
for id_,g2 in graphs.items():
score=ged.compare(g,g2)
sc[id_]=score
return sc
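# Note on the compare* helpers: sc[j] ends up holding the distance between graphs[id_]
# and graphs[j]; the graph ids are assumed to be the integers 0..len(graphs)-1, since
# they are used directly as indices into the numpy score vector.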
def compareGEOGED(id1,graphs):
g=graphs[id1]
sc=np.zeros(len(graphs))
for id_,g2 in graphs.items():
try:
if len(g2) >1:
gg=GeoGED(g,g2)
score=gg.distance()
sc[id_] = score
else:
sc[id_]=np.inf
except:
sc[id_] = np.inf
return sc
def compareBP2(id_,graphs):
bp2=BP_2()
g = graphs[id_]
sc = np.zeros(len(graphs))
for id_, g2 in graphs.items():
if len(g2) >0:
score = bp2.bp2(g, g2)
sc[id_] = score
else:
sc[id_] = np.inf
return sc
def compareGEOBP2(id_,graphs):
bp2=GeoBP2()
g = graphs[id_]
sc = np.zeros(len(graphs))
for id_, g2 in graphs.items():
if len(g2) >0:
score = bp2.bp2(g, g2)
sc[id_] = score
else:
sc[id_] = np.inf
return sc
def compareHED(id_,graphs):
h=HED()
g = graphs[id_]
sc = np.zeros(len(graphs))
for id_, g2 in graphs.items():
if len(g2) >0:
score = h.hed(g, g2)
sc[id_] = score
else:
sc[id_]=np.inf
return sc
def compareGEOHED(id_,graphs):
h=GeoHED()
g = graphs[id_]
sc = np.zeros(len(graphs))
for id_, g2 in graphs.items():
if len(g2)>1:
score = h.hed(g, g2)
sc[id_] = score
else:
sc[id_] = np.inf
return sc
def compareGreedy(id_,graphs):
g = graphs[id_]
sc = np.zeros(len(graphs))
for id_, g2 in graphs.items():
h = GreedyEditDistance(g,g2)
score = h.distance()
sc[id_] = score
return sc
funcDict={
"GED":compareGED,
"GEOGED":compareGEOGED,
"BP2":compareBP2,
"GEOBP2":compareGEOBP2,
"HED":compareHED,
"GEOHED":compareGEOHED,
"GREEDY":compareGreedy
}
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("distance")
parser.add_argument("graphs_dir")
parser.add_argument("--ignore",help="Ignore Output",action="store_true")
parser.add_argument("-o","--output",help="Output Filename",default="GED")
args = parser.parse_args()
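# Example invocation (script and directory names are hypothetical):
#   python compare_str_agritrop.py HED graphs_agritrop/
# args.distance selects the comparison function from funcDict; args.graphs_dir is the
# directory where the GEXF graphs are written and read back.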
if not args.distance in funcDict.keys():
raise NotFoundDistance(args.distance,funcDict)
# Initialize Pipeline for Spatial Entities extraction and STR construction
pip=Pipeline(lang="english",tagger=Tagger(),ner=StanfordNER(lang="en"))
# Load all the text from the corpus
def get_text_data(directory):
"""
Load the Epidemiology corpus
"""
files = glob.glob(os.path.join(directory,"*.txt"))
texts={}
for filepath in files:
id_doc=int(re.findall("\d+",filepath)[-1])
data=open(filepath).read()
texts[id_doc]=data
return texts
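# e.g. a file ".../text/123.txt" is stored as texts[123]: the document id is assumed
# to be the last number appearing in the file path (per re.findall above).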
texts=get_text_data("data/data_agritrop/text/")# Raw text
#print("TEST text 0 = ",texts[0])
# Extract All spatial entities
if not os.path.exists("associated_and_count_agritrop.json"):
pass
associated_es={}
count_per_doc={}
for id_,text in texts.items():
if text:
a,b,c=pip.parse(text)
count_per_doc[id_]=a
associated_es[id_]=c
else:
associated_es[id_]={}
count_per_doc[id_]={}
open("associated_and_count_agritrop.json",'w').write(json.dumps([associated_es,count_per_doc],indent=4))
if not os.path.exists(args.graphs_dir):
pass
import networkx as nx
graphs={}
for t,text in texts.items():
if text:
graphs[t]=pip.build(text).graph
else:
graphs[t]=nx.MultiDiGraph()
os.mkdir(args.graphs_dir)
for t,g in graphs.items():
print(t)
nx.write_gexf(g,os.path.join(args.graphs_dir,"{0}.gexf".format(t)))
# LOAD graph data and associated spatial entities of each graph
assC=json.load(open("associated_and_count_agritrop.json"))
associated_es,count_per_doc=assC[0],assC[1]
graphs={}
for file in glob.glob(args.graphs_dir.rstrip("/")+"/*.gexf"):
id=int(re.findall("\d+",file)[0])
graphs[id]=nx.read_gexf(file)
from ged4py.algorithm import graph_edit_dist as ged
from ged4py.bipartite_graph_matching_2 import BP_2
def getLocInfo(id_):
data=get_data(id_)
if 'coord' in data:
return [data["coord"]["lat"],data["coord"]["lon"]]
return [0,0]
def get_associated_es(associated_es_data):
new_={}
for id_ in associated_es_data:
new_[id_]={"label":associated_es_data[id_],"coord":getLocInfo(id_)}
return new_
def getEdges4Draw(associated_es,edges):
data={}
for es in associated_es:
data[es]=getLocInfo(es)
lines=[]
for ed in edges:
lines.append([data[ed[0]],data[ed[1]],ed[2]["color"]])
return lines
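# Each element returned by getEdges4Draw is [[lat1, lon1], [lat2, lon2], edge_color],
# i.e. a drawable line between the two spatial entities (assuming get_data exposes a
# 'coord' entry; entities without coordinates fall back to [0, 0]).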
# Generating Evaluation Output
top_ten_documents=[]
final_data={}
inv_table,j={},0
new_graphs={}
for i in graphs:
inv_table[j]=i
new_graphs[j]=graphs[i]
j+=1
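# inv_table maps the dense index j (0..n-1) used by the comparison functions back to
# the original document id, and new_graphs is the same graph collection re-keyed on j.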
with ProgressBar(max_value=len(texts.keys()),widgets=[' [', Timer(), '] ',Bar(),' (', ETA(), ') ',]) as pg:
inc=0
for doc_s in inv_table.keys():
id_json=str(doc_s)
real_id=inv_table[doc_s]
r_id_str=str(real_id)
#print(id_json)
_score=funcDict[args.distance](doc_s, new_graphs)
top_4_docs_score= np.sort(_score)[1:4].astype(float)
top_4_docs= np.argsort(_score)[1:4].astype(int)
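# Note: despite the "top_10" naming below, only ranks 1-3 of the sorted scores are kept
# (slice [1:4]); rank 0 is skipped since it is normally the document compared with itself.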
#print(top_10_docs)
final_data[real_id]={
"sp_entities":get_associated_es(associated_es[str(real_id)]),
"text":texts[inv_table[doc_s]],
"edges":getEdges4Draw(associated_es[str(real_id)],graphs[real_id].edges(data=True))
}
final_data[real_id]["top_10"]=[]
for d in range(len(top_4_docs)):
doc_data={}
doc_data["score"]=top_4_docs_score[d]
doc_data["id_txt"]=inv_table[int(top_4_docs[d])]
doc_data["text"]=texts[doc_data["id_txt"]]
doc_data["sp_entities"]=get_associated_es(associated_es[str(doc_data["id_txt"])])
doc_data["edges"]=getEdges4Draw(associated_es[str(doc_data["id_txt"])],graphs[doc_data["id_txt"]].edges(data=True))
doc_data["relevant"]=None
final_data[real_id]["top_10"].append(doc_data)
inc+=1
pg.update(inc)
if not args.ignore:
open("graph_viewer/evalTopJP_10STR_{0}.json".format(args.distance),'w').write(json.dumps(final_data,indent=4))
@@ -16,14 +16,46 @@ import json
class STR_SEM(STR):
""""""
-def __init__(self, tagged_text, spatial_entities,diseases,species):
-super().__init__(tagged_text, spatial_entities,diseases,species)
-labels = json.load(open("/Users/jacquesfize/Downloads/labelsEN.json"))
+def __init__(self, tagged_text, spatial_entities):
+super().__init__(tagged_text, spatial_entities)
+self.labels = json.load(open("/Users/jacquesfize/Downloads/labelsEN.json"))
self.tags=None
-def build(self, verbose=False):
-super().build(cooc=True, inc=True, adj=True,verbose=verbose)
+def build(self, win_size = 5,verbose=False):
+super().build(cooc=False, inc=True, adj=True,verbose=verbose)
search_engine = tt4py.Text(self.tagged_text)
search_engine.transform_tagged()
search_engine.tag_item_in_thesaurus(self.labels,prefix_="agrovoc")
sps_inv = {}
for k, v in self.spatial_entities.items():
sps_inv[v.lower()] = k
cleaned_=search_engine.tagged_text
self.tags = cleaned_
linked_to = {}
w = 0
while w < len(cleaned_):
curr = cleaned_[w]
if "agrovoc" in curr[1] and not "LOC" in curr[1]:
window = np.array(cleaned_[w - win_size:w + win_size])
if not window.size:
w += 1
continue
for wo in window:
if "LOC" == wo[1] and wo[0] in sps_inv:
if not curr[0] in linked_to: linked_to[curr[0]] = set([])
linked_to[curr[0]].add(wo[0])
w += 1
edges = []
register = set([])
for l, v in linked_to.items():
for vi in v:
for vj in v:
if vj != vi and vj + "-" + vi not in register:
edges.append([sps_inv[vi], sps_inv[vj], {"label": str(l), "color": "cyan"}])
register.add(vi + "-" + vj)
self.graph.add_edges_from(edges)
# load spatial entities
# find the positions of each spatial entity
# for each spatial entity position, find the neighbouring words
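# Illustrative example of the linking step above (made-up tokens): if the agrovoc-tagged
# term "wheat" occurs within win_size tokens of the spatial entities "France" and
# "Italy", both end up in linked_to["wheat"] and a cyan edge labelled "wheat" is added
# between the France and Italy nodes of the STR.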
......
@@ -9,6 +9,7 @@ from pos_tagger.treetagger import TreeTagger
from ner.stanford_ner import *
from disambiguator.pagerank import PageRankDisambiguator
from models.str import STR
from models.str_with_semantic import STR_SEM
class Pipeline(object):
@@ -100,6 +101,18 @@ class Pipeline(object):
str_.build()
return str_
def buildSemSTR(self,text,win_size=5):
"""
Return the corresponding STR for a text.
:param text:
:return: STR
"""
_,output, se_identified = self.parse(text)
str_=STR_SEM(output,se_identified)
str_.build(win_size=win_size)
return str_
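# Usage sketch (illustrative, not part of the commit); `raw_text` is a placeholder:
#   pip = Pipeline(lang="english", tagger=Tagger(), ner=StanfordNER(lang="en"))
#   str_sem = pip.buildSemSTR(raw_text, win_size=7)
#   nx.write_gexf(str_sem.graph, "doc_0.gexf")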
def build_class_variation_str(self,text):
"""
Return the corresponding STR for a text.
......
@@ -4,6 +4,12 @@ from tt4py.helpers import *
import numpy as np
from enum import Enum
from termcolor import colored
from ner.ner import NER
from nltk.stem import WordNetLemmatizer, SnowballStemmer
_wn_lem =WordNetLemmatizer()
_snowball_stemmer = SnowballStemmer("english")
class TaggedType(Enum):
POS=2
@@ -16,12 +16,19 @@ class SearchFlag(Enum):
SP_WS = lambda x : x.split(" ") # split using whitespaces
SP_P = lambda x : x.split(".") # split using point
SP_D = lambda x : x.split("-") # split using dash
WN_LEM = lambda x : _wn_lem.lemmatize(x)
SNW_STEM = lambda x : _snowball_stemmer.stem(x)
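# Rough behaviour of the added flags (illustrative):
#   SearchFlag.SP_D("foot-and-mouth")  -> ["foot", "and", "mouth"]
#   SearchFlag.WN_LEM("diseases")      -> "disease"
#   SearchFlag.SNW_STEM("running")     -> "run"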
class TaggedInputError(Exception):
def __init__(self):
super(Exception, self).__init__(
colored("Wrong input : check your input data type or the size for each token data ", "red"))
class WrongThesaurusFormatError(Exception):
def __init__(self,var):
super(Exception, self).__init__(
colored("Wrong thesaurus format: use dict format instead of {0}. Ex. {'id_1':'label'}".format(str(type(var))), "red"))
class Text(object):
def __init__(self,tagged_text,type=TaggedType.MIX_POS_TAG):
#check if 'tagged_text' is an iterable object
@@ -32,6 +45,7 @@ class Text(object):
# Convert input into numpy array
self.tagged_text=tagged_text
if isinstance(tagged_text,dict):
self.tagged_text = dict_to_array(tagged_text)
elif isinstance(tagged_text,list):
@@ -45,19 +59,13 @@
if not type.value == self.tagged_text.shape[1]:
raise TaggedInputError
self.raw_text=" ".join(self.tagged_text[:,0])
self._original=self.tagged_text.copy()
self.flag_applied = []
def is_in_text(self,string,flags=[SearchFlag.NO_CASE]):
t_1,t_2=[string],self.raw_text
# Apply the necessary transformations for the string search
for flag in flags:
t_1,t_2=np.array([flag(i) for i in t_1]).flatten(),np.array([flag(i) for i in t_2]).flatten()
if not " {0} ".format(t_1) in t_2:
return False
return True
def transform_tagged(self,flags = [SearchFlag.NO_CASE, SearchFlag.SP_WS, SearchFlag.SP_D]):
-tagged = self.tagged_text.copy().tolist()
+tagged = self._original.copy().tolist()
# Apply the necessary transformations for the string search
for flag in flags:
tagged_t=[]
@@ -71,9 +79,19 @@
else:
tagged_t.extend([[res_, token[1]]])
tagged=tagged_t
self.tagged_text = np.array(tagged)
self.flag_applied=flags
def hasSameFlags(self,flags):
for f in flags:
if not f in self.flag_applied:
return False
return True
def get_occurrences(self,string,flags = [SearchFlag.NO_CASE, SearchFlag.SP_WS, SearchFlag.SP_D]):
if not self.hasSameFlags(flags):
self.transform_tagged(flags)
positions_list=[]
t_1 = [string]
@@ -111,15 +129,118 @@
pos2=pos1
return self.tagged_text[pos1-window_size:window_size+pos2]
-def extract_token_by_tag(self,tags):
+def extract_token_by_tag(self,*tags):
res,posis_=[],[]
for tag in tags:
posis_.extend(np.argwhere(self.tagged_text[:, -1] == tag).flatten())
posis_ = sorted(posis_)
for pos in posis_:
pp=self.tagged_text[pos].tolist()
pp.append(pos)
res.append(pp)
return res
def tag_item_in_thesaurus(self,thesaurus,flags = [SearchFlag.NO_CASE, SearchFlag.SP_WS, SearchFlag.SP_D],prefix_="th_",stop_tag = ["LOC"]):
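# Sketch of what this method does (summary of the code below): for every thesaurus entry
# whose label occurs in the tokenised text, the matching token span gets its tag
# overwritten with prefix_ + id_, unless the span contains a stop_tag (e.g. "LOC") or
# fails the isWorthIt check; consecutive tokens sharing the same thesaurus tag are then
# merged back into a single multi-word token via reconstruct_str.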
if not self.hasSameFlags(flags):
self.transform_tagged(flags)
if not isinstance(thesaurus,dict):
raise WrongThesaurusFormatError(thesaurus)
t=" ".join(self.tagged_text[:,0].tolist())
for id_,element in thesaurus.items():
if element in t:
positions_ = self.get_occurrences(element)
for d_ in positions_:
f=True
x,y=d_[0],d_[1]
c=0
if not self.isWorthIt(x,y,prefix_):
break
for st in stop_tag:
if x != y and st in self.tagged_text[x:y][:,1]:
f=False
elif x == y and st in self.tagged_text[x][1]:
f=False
if f:
# rec_str_= self.tagged_text[x][0]
# if x != y:
# rec_str_ = self.reconstruct_str(self.tagged_text[x:y][:,0])
# if x - y > 1:
# self.tagged_text = np.delete(self.tagged_text,np.arange(x+1,y),0)
# else:
# self.tagged_text = np.delete(self.tagged_text,y,0)
if abs(x-y)> 0:
self.tagged_text[x:y][:,1] = prefix_ + id_
#print("AFTER",self.tagged_text[x:y],x,y)
else:
self.tagged_text[x][1] = prefix_ + id_
#print("AFTER", self.tagged_text[x], x)
new_tagged_= []
j=0
while j < len(self.tagged_text):
tag = self.tagged_text[j]
if prefix_ in tag[-1]:
curr=tag[-1]
t=1
while j+t < len(self.tagged_text):
if self.tagged_text[j+t][-1] != curr:
break
t+=1
#print(self.reconstruct_str(self.tagged_text[j:j+t][:,0]),self.tagged_text[j:j+t],j,t)
new_tagged_.append([self.reconstruct_str(self.tagged_text[j:j+t][:,0]),curr])
j+=t
else:
new_tagged_.append(tag.tolist())
j+=1
self.tagged_text=np.array(new_tagged_)
def reconstruct_str(self,list_):
res = ""
no_sp_char = ["-"]
no_sp_bf = [","]
for ch in list_:
if not ch in no_sp_char and res:
if res[-1] in no_sp_char or ch in no_sp_bf:
res+=ch
if not res:
res+=ch
else:
res+=" "+ch
return res
def isWorthIt(self, x, y,prefix):
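# Reading of the heuristic below: it counts how many tokens inside the candidate span
# [x, y], plus the immediately adjacent already-tagged tokens, already carry the
# thesaurus prefix; if the span is shorter than that count, tagging it again would only
# fragment an existing, longer annotation, so the match is rejected.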
taille = abs(x-y)
count=0
if x == y:
if prefix in self.tagged_text[x]:
count+=1
taille=1
else:
# c=None
for item in self.tagged_text[x:y]:
if prefix in item[-1]:
count+=1
# if not c: c=item[-1]
# elif c and item[-1] != c: ---> to be discussed
# return False
decx,decy=0,0
fx,fy=True,True
while fx or fy:
fx,fy=False,False
if x-(decx+1) >0:
if prefix in self.tagged_text[x-(decx+1)][-1] :
fx=True
decx += 1
if y + decy+1 < len(self.tagged_text):
if prefix in self.tagged_text[y + decy+1][-1] :
fy=True
decy += 1
#print(self.tagged_text[x:y],count,taille+decy+decx)
if taille < count+decx+decy:
return False
return True
\ No newline at end of file