Commit 156baa8e authored by Pokiros

Initialise Git with previous code

parent 435b84a0
Showing with 3156 additions and 0 deletions
.gitignore 0 → 100644
# Created by .ignore support plugin (hsz.mobi)
### Example user template template
### Example user template
# IntelliJ project files
.idea
*.iml
test*
{
"tree_tagger_home":"/Users/jacquesfize/.tree-tagger/cmd/",
"osm_boundaries_directory":"/Users/jacquesfize/install",
"core_nlp_URL":"http://localhost:9000"
}
\ No newline at end of file
# coding = utf-8
import json
class Configuration(object):
def __init__(self, data):
self.__dict__=data
for d in self.__dict__:
if isinstance(self.__dict__[d],dict):
self.__dict__[d]=Configuration(self.__dict__[d])
def __getitem__(self, item):
return self.__dict__[item]
config = Configuration(json.load(open("config/config.json")))
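# Minimal usage sketch (not in the original file): the keys of the
# config/config.json shown above become attributes of the module-level
# `config` object, and nested dicts are wrapped recursively, so values can be
# read either as attributes or with item access.
if __name__ == "__main__":
    print(config.tree_tagger_home)              # "/Users/jacquesfize/.tree-tagger/cmd/"
    print(config["osm_boundaries_directory"])   # "/Users/jacquesfize/install"
    print(config.core_nlp_URL)                  # "http://localhost:9000"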
# coding = utf-8
\ No newline at end of file
# coding = utf-8
from ner.ner import *
import copy
import numpy as np
class Disambiguator(object):
def __init__(self):
"""Constructor for Disambiguator"""
pass
def extract_se_entities(self, input):
out = self.parse_corpus(input)
en_ = out[out[:, 1] == NER._unified_tag["place"]]
return np.unique(en_[:, 0])
def parse_corpus(self, corpus):
final_corpus = []
t = 0
placeTag = NER._unified_tag["place"]
while t < len(corpus):
tag = copy.copy(corpus[t])
if tag[1] == "BEG-" + NER._unified_tag["place"] or tag[1] == placeTag:
lenw = 1
if tag[1] == "BEG-" + placeTag:
compound_tag = tag[0]
t += 1
while t < len(corpus) and (corpus[t][1] == "END-" + placeTag or corpus[t][1] == placeTag):
tag = copy.copy(corpus[t])
if tag[0].endswith("-") or compound_tag.endswith("-"):
compound_tag += tag[0]
else:
compound_tag += " " + tag[0]
t += 1
lenw += 1
tag[0] = compound_tag
tag[1] = placeTag
t += 1
else:
t += 1
final_corpus.append(tag)
return np.array(final_corpus)
def disambiguate(self,ner_result):
pass
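# Illustrative sketch (not in the original file): parse_corpus merges tokens
# tagged BEG-<place> ... END-<place> into a single compound entity, and
# extract_se_entities keeps the unique place strings. The exact place tag is
# whatever NER._unified_tag["place"] resolves to at runtime.
if __name__ == "__main__":
    place = NER._unified_tag["place"]
    demo = [
        ["New", "BEG-" + place],
        ["York", "END-" + place],
        ["is", "O"],
        ["huge", "O"],
    ]
    print(Disambiguator().extract_se_entities(demo))  # expected: ['New York']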
# coding = utf-8
from .disambiguator import Disambiguator
from helpers.collision_with_gazetteer_data import *
from helpers.gazeteer_helpers import *
import math
class GauravGeodict(Disambiguator):
def __init__(self):
Disambiguator.__init__(self)
def fib_formula(self,n):
if n in [0,1]: return 0 # Modifying fibonacci behaviour
golden_ratio = (1 + math.sqrt(5)) / 2
val = (golden_ratio ** n - (1 - golden_ratio) ** n) / math.sqrt(5)
return int(round(val))
def inclusion_log(self,x, alpha=0.2):
return math.log(x)
def get_inclusion_tree(self, id_, prop):
"""
For an entity, return its geographical inclusion tree using a property.
"""
arr = []
current_entity = get_data(id_)
while True:
if prop in current_entity:
arr.append(current_entity[prop][0])
current_entity = get_data(current_entity[prop][0])
else:
arr.append('Q2') # Earth ID
break
return arr
def get_inclusion_score(self,id1, id2): # is it really inclusion ? :)
list1 = self.get_inclusion_tree(id1, 'P131')
list2 = self.get_inclusion_tree(id2, 'P131')
interP131 = len(list(set(list1).intersection(list2)))
list1 = self.get_inclusion_tree(id1, 'P706')
list2 = self.get_inclusion_tree(id2, 'P706')
interP706 = len(list(set(list1).intersection(list2)))
# return fib_no[interP131]+fib_no[interP706]
return self.inclusion_log(interP131) + self.inclusion_log(interP706)
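# Worked example (hypothetical chains, not gazetteer data): if the P131 trees
# of two entities share three ancestors (same department, same region and
# 'Q2') while their P706 trees only share 'Q2', the score is
# log(3) + log(1) ≈ 1.10. Both trees always terminate with 'Q2', so the
# intersections are never empty and math.log never receives 0.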
def Adjacency_P47(self,id1, id2):
data_1, data_2 = get_data(id1), get_data(id2)
if "P47" in data_1 and "P47" in data_2:
if id1 in data_2["P47"] or id2 in data_1["P47"]:
return True
return False
def Adjacency_Hull(self,id1, id2):
return collisionTwoSEBoundaries(id1, id2)
def disambiguateOne(self,spat_candidates, fixed_entities):
score_dc = {}
for cand in spat_candidates:
id_cand = cand["id"]
score_dc[id_cand] = 0
for fixed in fixed_entities:
id_fixed = fixed_entities[fixed]["id"]
if self.Adjacency_P47(id_cand, id_fixed):
score_dc[id_cand] += 3
if self.Adjacency_Hull(id_cand, id_fixed):
score_dc[id_cand] += 2
score_dc[id_cand] += self.get_inclusion_score(id_cand, id_fixed)
m = max(score_dc, key=score_dc.get)
if score_dc[m] < 4:
return None
for cand in spat_candidates:
if cand["id"] == m:
return cand["id"]
def disambiguate(self,ner_result,lang="en"):
se_ = self.extract_se_entities(ner_result)
selected_en = {}
fixed_entities = {}
ambiguous_entities = {}
for en in se_:
request = get_by_label(en, lang)
if len(request) ==0:
request = get_by_alias(en, lang)
if len(request) > 1:
ambiguous_entities[en] = [r["_source"] for r in request]
elif len(request) == 1:
fixed_entities[en] = request[0]["_source"]
d_amb_results = {}
for amb_ent in ambiguous_entities:
d = self.disambiguateOne(ambiguous_entities[amb_ent], fixed_entities)
if not d:
d_amb_results[amb_ent] = get_most_common_id(amb_ent, lang)
else:
d_amb_results[amb_ent] = d
for k, v in fixed_entities.items():
fixed_entities[k] = v["id"]
for k, v in d_amb_results.items():
fixed_entities[k] = v
return fixed_entities
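# Usage sketch (assumes the gazetteer backend behind get_by_label /
# get_by_alias and the boundary data used by collisionTwoSEBoundaries are
# available, and that `ner_output` is a NER result in the unified
# (token, tag) format consumed by Disambiguator.parse_corpus):
#
#     ids = GauravGeodict().disambiguate(ner_output, lang="en")
#     # -> {entity label: gazetteer id, ...}
#
# Unambiguous labels are fixed first; each ambiguous candidate is then scored
# against the fixed entities (+3 for P47 adjacency, +2 for hull collision,
# plus the inclusion score). A candidate needs a score of at least 4,
# otherwise get_most_common_id is used as a fallback.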
# coding = utf-8
from .disambiguator import Disambiguator
from ner.ner import *
import copy
import numpy as np
from helpers.gazeteer_helpers import get_most_common_id, label_exists, alias_exists, get_most_common_id_alias
class PageRankDisambiguator(Disambiguator):
def __init__(self):
Disambiguator.__init__(self)
def disambiguate(self, ner_result, lang="en"):
se_ = self.extract_se_entities(ner_result)
selected_en = {}
for en in se_:
if label_exists(en, lang):
id_ = get_most_common_id(en, lang)
selected_en[id_] = en
elif alias_exists(en,lang):
id_ = get_most_common_id_alias(en, lang)
selected_en[id_] = en
return selected_en
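# Usage sketch (assumes the same gazetteer helpers as above): this
# disambiguator simply maps every recognised place label to its most common
# gazetteer id, checking labels first and aliases second.
#
#     selected = PageRankDisambiguator().disambiguate(ner_output, lang="en")
#     # -> {gazetteer id: entity label, ...}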
epidemio.json 0 → 100644
+2017 -0
This diff is collapsed.
# coding=utf-8
import json
import numpy as np
class GeneralStat(object):
"""docstring f#or GeneralStat."""
def __init__(self, doc_id2text, doc_id2label):
self.doc_id2text = doc_id2text
self.doc_id2label = doc_id2label
def similarity(self, doc1_id,doc2_id):
return 0.0
def avg_precision_at_n(self,n):
res = np.array([])
for i in self.doc_id2text.keys():
res = np.append(res,self.precision_at_n(i,n))
return np.mean(res)
def precision_at_n(self,doc_id,n):
result = []
for k in self.doc_id2text:
if k != doc_id:
result.append((k, self.similarity(doc_id,k)))
res = np.array(result, dtype=[("i", int), ("j", float)])
res = np.sort(res, order="j")[::-1][:n]
relev = 0
lab_rel = self.doc_id2label[doc_id]
for i in res:
if self.doc_id2label[i[0]] == lab_rel:
relev += 1
return relev / n
def rank_doc(self,doc_id):
result = []
for k in self.doc_id2text:
if k != doc_id:
result.append((k, self.similarity(doc_id,k)))
res = np.array(result, dtype=[("i", int), ("j", float)])
res = np.sort(res, order="j")[::-1]
for i in range(len(res)):
if self.doc_id2label[res[i][0]] == self.doc_id2label[doc_id]:
return i + 1  # 1-based rank, so MRR can use 1/rank directly
#print(111,res,self.doc_id2label[doc_id])
def MRR(self):
res = np.array([])
for i in self.doc_id2text.keys():
try:
res = np.append(res,1/self.rank_doc(i))
except Exception as e:
pass
return np.mean(res)
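# Worked example (hypothetical labels): for a query document with label "A",
# if the 5 most similar documents (by self.similarity) carry the labels
# [A, B, A, A, B], then precision_at_n(query, 5) = 3/5 = 0.6. If the first
# same-label document appears at rank 2, the query contributes 1/2 to MRR().
# Subclasses only need to override similarity(); GeneralStat itself returns
# 0.0 for every pair.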
# coding=utf-8
import numpy as np
from extractor import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from evaluate.evaluate import *
from models.str import *
class BOWStatistics(GeneralStat):
"""
Extract various statistics from a corpus using our spatial representation and similarity measures.
"""
sp_en = None
def __init__(self, corpus_name, doc_id2text, doc_id2label,lang="fr"):
"""
Constructor
Parameters
----------
doc_id2text : dict
Map linking each document id to its text
doc_id2label : dict
Map linking each document id to its label (for IR stats)
"""
super(BOWStatistics, self).__init__(doc_id2text, doc_id2label)
self.doc_id2text = doc_id2text
self.doc_id2label = doc_id2label
self.doc_id2graph, self.not_found = None, None
self.lang = lang
self.corpus_name = corpus_name
if not BOWStatistics.sp_en or BOWStatistics.sp_en.language[:2] != lang:
BOWStatistics.sp_en = SpatialEntityExtractor(language=self.lang)
self.bow_ = None
self.__spatial_e=set([])
self.extract_data()
transformer = TfidfVectorizer(smooth_idf=False,vocabulary=list(self.__spatial_e))
self.id2bow_id, self.corpus = {}, []
i = 0
for k, v in self.doc_id2text.items():
self.corpus.append(v)
self.id2bow_id[k] = i
i += 1
self.bow_ = transformer.fit_transform(self.corpus)
def extract_se(self, text):
"""
Create a spatial graph from a text.
[TODO Description]
Parameters
----------
text : string
Text content you want to transform
occ : boolean
Include cooccurrency relation in graph
adj : boolean
Include adjacency relation in graph
inc : boolean
Include inclusion relation in graph
Returns
-------
STR
graph of the spatial configuration in the text
"""
try:
text = BOWStatistics.sp_en.clean(text)
ann = BOWStatistics.sp_en.tag(text)
output = BOWStatistics.sp_en.parse_output(
ann, text, "tree_tagger")
graph = STR(text=output, lang=self.lang)
places=graph.get_place_order()
#print(places)
if places:
for p in places:self.__spatial_e.add(p)
return " ".join(places)
except Exception as e:
return None
def compute_thematic_similarity(self, index_doc1, index_doc2):
"""
Compute the cosine similarity
"""
try:
v1 = self.bow_[self.id2bow_id[index_doc1]]
v2 = self.bow_[self.id2bow_id[index_doc2]]
except Exception as e:
print("An document id don't exists in the BOW !")
return False
return cosine_similarity(v1, v2)[0][0]
def similarity(self, doc1_id, doc2_id):
try:
return self.compute_thematic_similarity(doc1_id, doc2_id)
except ZeroDivisionError as e:
return 0.0
def extract_data(self):
"""
Generate a spatial graph for each text in the corpus.
[TODO Description]
Parameters
----------
occ : boolean
Include cooccurrency relation in graph
adj : boolean
Include adjacency relation in graph
inc : boolean
Include inclusion relation in graph
"""
self.doc_id2graph = {}
del_node = []
for i in self.doc_id2text:
self.doc_id2text[i] = self.extract_se(self.doc_id2text[i])
if self.doc_id2text[i] is None:
del_node.append(i)
self.not_found = len(del_node)
for i in del_node:
del self.doc_id2label[i]
del self.doc_id2text[i]
def extract_common_statistic(self):
"""
Extract common statistics from the corpus
Statistics:
* Number of documents
* Number of documents without Spatial Entity
* Number of documents with Spatial Entity
* Average Size of a graph
* Standard Deviation of graph size
* Average Document size
* Average Edge Intersection length
* Average Node Intersection length
* Maximum Edge Intersection length
* Maximum Node Intersection length
* Average Node Jaccard Similarity
* Average Edge Jaccard Similarity
* Maximum Similarity between graphs
Returns
-------
dict
statistics dictionary
"""
stats = {}
stats["nb_doc"] = len(self.doc_id2text.keys())
stats["nb_doc_without_SP_EN"] = self.not_found
stats["nb_doc_without_graph"] = self.not_found
stats["nb_doc_with_SP_EN"] = len(
self.doc_id2graph.keys()) - stats["nb_doc_without_SP_EN"]
sim_data = {}
max_sim_V1 = 0.0
documents_size = []
for kg1 in self.doc_id2graph:
g1 = self.doc_id2graph[kg1]
if g1 is None:
continue
documents_size.append(len(self.doc_id2graph[kg1].corpus))
for kg2 in self.doc_id2graph:
g2 = self.doc_id2graph[kg2]
if g1 is None or g2 is None:
continue
if g1 != g2 and not (g1, g2) in sim_data and not (g2, g1) in sim_data:
if len(g1.graph.nodes()) == 0 or len(g2.graph.nodes()) == 0:
continue
if self.doc_id2label[kg1] == self.doc_id2label[kg2]:
sim_data[(kg1, kg2)] = self.similarity(
kg1, kg2)
if sim_data[(kg1, kg2)] > max_sim_V1 and sim_data[(kg1, kg2)] < 1:
max_sim_V1 = sim_data[(kg1, kg2)]
i, size = 0, 0
size_list = []
for kg1 in self.doc_id2graph:
g1 = self.doc_id2graph[kg1]
if g1 is not None:
size_list.append(len(g1.graph.nodes()))
# Graph Size statistics
stats["avg_size"] = np.mean(size_list)
stats["std_size"] = np.std(size_list)
# Average Document size
stats["avg_document_size"] = np.mean(documents_size)
# Max similarity value
stats["max_sim_v1"] = max_sim_V1
return stats
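# Usage sketch (assumes the extractor/STR pipeline and its external taggers
# are installed and configured as in config/config.json):
#
#     stats = BOWStatistics("epidemio", doc_id2text, doc_id2label, lang="fr")
#     print(stats.avg_precision_at_n(5))   # mean precision@5 over the corpus
#     print(stats.MRR())                   # mean reciprocal rank
#
# Each document is reduced to the bag of spatial entities found in it and
# compared with TF-IDF cosine similarity.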
# coding=utf-8
import os
import json
import networkx as nx
import numpy as np
from extractor import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from evaluate.evaluate import *
from models.str import *
from similarity.str_sim import heuristic_1
class STRStatistics(GeneralStat):
"""
Extract various statistics from a corpus using our spatial representation and similarity measures.
"""
sp_en = None
def __init__(self,corpus_name, doc_id2text, doc_id2label, sim_function="jaccard", lang="fr", thematic_=False):
"""
Constructor
Parameters
----------
doc_id2text : dict
Map linking each document id to its text
doc_id2label : dict
Map linking each document id to its label (for IR stats)
"""
super(STRStatistics, self).__init__(doc_id2text, doc_id2label)
self.doc_id2text = doc_id2text
self.doc_id2label = doc_id2label
self.doc_id2graph, self.not_found = None, None
self.lang = lang
self.corpus_name=corpus_name
if not STRStatistics.sp_en or STRStatistics.sp_en.language[:2] != lang:
STRStatistics.sp_en = SpatialEntityExtractor(language=self.lang)
self.sim_func = (sim_function if sim_function in ["hypergeo", "node2vec", "jaccard", "sim_mcs", "sim_mcs_e",
"sim_wgu","heur1"] else "jaccard")
self.bow_ = None
self.only_thematic=False
if thematic_:
transformer = TfidfVectorizer(smooth_idf=False)
self.id2bow_id, self.corpus = {}, []
i = 0
for k, v in self.doc_id2text.items():
self.corpus.append(v)
self.id2bow_id[k] = i
i += 1
self.bow_ = transformer.fit_transform(self.corpus)
def loadGraph(self, text, occ=True, adj=True, inc=True):
"""
Create a spatial graph from a text.
[TODO Description]
Parameters
----------
text : string
Text content you want to transform
occ : boolean
Include cooccurrency relation in graph
adj : boolean
Include adjacency relation in graph
inc : boolean
Include inclusion relation in graph
Returns
-------
STR
graph of the spatial configuration in the text
"""
try:
text = STRStatistics.sp_en.clean(text)
ann = STRStatistics.sp_en.tag(text)
output = STRStatistics.sp_en.parse_output(
ann, text, "tree_tagger")
graph = STR(text=output, lang=self.lang)
graph.extract_names_associated_to_place()
graph.create_place_repr_vector()
graph.create_multi_graph(occ, adj, inc, False)
if len(graph.graph.nodes()) == 0: return None
if self.sim_func == "node2vec":
graph.node2vec_models(num_walks=10, directed=True)
#graph.node2vec_model(num_walks=10, directed=True)
return graph
except Exception as e:
return None
"""
def similarity_jaccard_edge(self,g1, g2):
\"""
Compute jaccard indice between two graph using their edges.
Parameters
----------
g1 : nx.Graph
first graph
g2 : nx.Graph
second graph
Returns
-------
float
jaccard similarity value
\"""
if isinstance(g1, nx.MultiDiGraph): return self.similarity_jaccard_edge_multi(g1, g2)
ed_g1 = g1.edges()
ed_g2 = g2.edges()
union = []
for ed1 in ed_g1:
if ed1 not in union: union.append(ed1)
for ed1 in ed_g2:
if ed1 not in union: union.append(ed1)
if not union:
return 0
inter = []
for ed1 in ed_g1:
if ed1 in ed_g2 and not ed1 in inter:
# print(ed1)
inter.append(ed1)
# print(len(inter) / len(union),len(inter) , len(union))
return len(inter) / len(union)
"""
def transform_edge_data(self,data):
new_ = []
for ed1 in data:
new_.append((ed1[0], ed1[1], ed1[2]["color"]))
return new_
def similarity_jaccard_edge(self,g1, g2):
ed_g1 = self.transform_edge_data(g1.edges(data=True))
ed_g2 = self.transform_edge_data(g2.edges(data=True))
union = []
for ed1 in ed_g1:
if ed1 not in union: union.append(ed1)
for ed1 in ed_g2:
if ed1 not in union: union.append(ed1)
if not union:
return 0
inter = []
for ed1 in ed_g1:
if ed1 in ed_g2 and not ed1 in inter:
inter.append(ed1)
# print(len(inter) / len(union),len(inter) , len(union))
return len(inter) / len(union)
def similarity_jaccard_node(self, g1, g2):
"""
Compute jaccard indice between two graph using their nodes.
Parameters
----------
g1 : nx.Graph
first graph
g2 : nx.Graph
second graph
Returns
-------
float
jaccard similarity value
"""
so_g1 = g1.nodes()
so_g2 = g2.nodes()
union = set(so_g1 + so_g2)
if not union:
return 0
inter = []
for so1 in so_g1:
if so1 in so_g2:
inter.append(so1)
return len(inter) / len(union)
def union_nodes(self, g1, g2):
"""
Compute union of two graph nodes.
Parameters
----------
g1 : nx.Graph
first graph
g2 : nx.Graph
second graph
Returns
-------
list
union
"""
so_g1 = g1.nodes()
so_g2 = g2.nodes()
return set(so_g1 + so_g2)
def union_edges(self, g1, g2):
"""
Compute union of two graph edges.
Parameters
----------
g1 : nx.Graph
first graph
g2 : nx.Graph
second graph
Returns
-------
list
union
"""
ed_g1 = g1.edges()
ed_g2 = g2.edges()
return set(ed_g1 + ed_g2)
def inter_edges(self, g1, g2):
"""
Compute intersection of two graph edges.
Parameters
----------
g1 : nx.Graph
first graph
g2 : nx.Graph
second graph
Returns
-------
list
intersection
"""
ed_g1 = g1.edges()
ed_g2 = g2.edges()
inter = []
for ed1 in ed_g1:
if ed1 in ed_g2:
inter.append(ed1)
return inter
def inter_nodes(self, g1, g2):
"""
Compute intersection of two graph nodes.
Parameters
----------
g1 : nx.Graph
first graph
g2 : nx.Graph
second graph
Returns
-------
list
intersection
"""
so_g1 = g1.nodes()
so_g2 = g2.nodes()
inter = []
for so1 in so_g1:
if so1 in so_g2:
inter.append(so1)
return inter
def similarity_jaccard(self, g1, g2):
"""
Compute a "jaccard" similarity between two graph.
.. math:: \frac{|E_{G_1}\cap E_{G_2}|}{|E_{G_1}\cup E_{G_2}|} \times \frac{|N_{G_1}\cap N_{G_2}|}{|N_{G_1}\cup N_{G_2}|}
Parameters
----------
g1 : nx.Graph
first graph
g2 : nx.Graph
second graph
Returns
-------
float
similarity
"""
return self.similarity_jaccard_node(g1, g2) * self.similarity_jaccard_edge(g1, g2)
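# Worked example (hypothetical graphs): with node sets {A, B, C} and
# {B, C, D}, the node Jaccard index is 2/4 = 0.5; with (coloured) edge sets
# {(A,B), (B,C)} and {(B,C), (C,D)}, the edge index is 1/3; the combined
# similarity is 0.5 * 1/3 ≈ 0.17.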
def mcs(self, g1, g2):
res = nx.MultiDiGraph()
res.add_nodes_from(self.inter_nodes(g1, g2))
res.add_edges_from(self.inter_edges(g1, g2))
return res
def MCS(self, g1, g2):
res = nx.MultiDiGraph()
res.add_nodes_from(self.union_nodes(g1, g2))
res.add_edges_from(self.union_edges(g1, g2))
return res
def s_mcs(self, g1, g2):
"""
A graph distance metric based on the maximal common subgraph, H. Bunke and K. Shearer,
Pattern Recognition Letters, 1998
"""
return len(self.mcs(g1, g2)) / max(len(g1), len(g2))
def s_mcs_with_edge(self, g1, g2):
"""
A graph distance metric based on the maximal common subgraph, H. Bunke and K. Shearer,
Pattern Recognition Letters, 1998
"""
mcs = self.mcs(g1, g2)
len_mcs = len(mcs.nodes()) + len(mcs.edges())
len_g1 = len(g1.nodes()) + len(g1.edges())
len_g2 = len(g2.nodes()) + len(g2.edges())
return (len_mcs) / max(len_g1, len_g2)
def s_wgu(self, g1, g2):
"""
Graph distances using graph union, W.D. Wallis an P.Shoubridge and M. Kraetzl and D. Ray
Pattern Recognition Letters, 2001
"""
return len(self.mcs(g1, g2)) / (len(g1) + len(g2) - len(self.mcs(g1, g2)))
def node2vec_similarity(self, g1_, g2_):
mod1 = g1_.n2vec_models
mod2 = g2_.n2vec_models
g1,g2=g1_.graph,g2_.graph
available = []
for a in mod1:
if a in mod2: available.append(a)
moy = []
jaccard = self.similarity_jaccard_node(g1, g2)
for index in available:
sum_ = 0
l_ = 0
for vec in list(g1.nodes()):
for vec2 in list(g2.nodes()):
if vec == vec2:
sim_topo = \
(cosine_similarity(mod1[index][vec].reshape(1, -1), mod2[index][vec2].reshape(1, -1)))[0][0]
sum_ += sim_topo
l_ += 1
if l_ == 0:
moy.append(0.0)
else:
moy.append(sum_ / l_)
return np.mean(moy) * jaccard
def hyper_geo_similarity(self, g1, g2, data_returned=1):
from models.hypergeo import compareWithHyperGeom
probs = compareWithHyperGeom(g1, g2)
return probs[data_returned]
def compute_thematic_similarity(self, index_doc1, index_doc2):
"""
Compute the cosine similarity
"""
try:
v1 = self.bow_[self.id2bow_id[index_doc1]]
v2 = self.bow_[self.id2bow_id[index_doc2]]
except Exception as e:
print("An document id don't exists in the BOW !")
return False
return cosine_similarity(v1, v2)[0][0]
def similarity(self, doc1_id, doc2_id, bow_=False):
if self.only_thematic:
return self.compute_thematic_similarity(doc1_id, doc2_id)
if self.bow_ is not None and not bow_:
theme_sim = self.compute_thematic_similarity(doc1_id, doc2_id)
return theme_sim + self.similarity(doc1_id, doc2_id, True)
try:
g1, g2 = self.doc_id2graph[doc1_id], self.doc_id2graph[doc2_id]
if not g1 or not g2: return 0.0
if self.sim_func == "jaccard":
return self.similarity_jaccard(g1.graph, g2.graph)
elif self.sim_func == "heur1":
return heuristic_1(g1,g2)
elif self.sim_func == "sim_wgu":
return self.s_wgu(g1.graph, g2.graph)
elif self.sim_func == "sim_mcs_e":
return self.s_mcs_with_edge(g1.graph, g2.graph)
elif self.sim_func == "node2vec":
return self.node2vec_similarity(g1, g2)
elif self.sim_func == "hypergeo":
return self.hyper_geo_similarity(g1.graph, g2.graph)
else:
return self.s_mcs(g1.graph, g2.graph)
except ZeroDivisionError as e:
return 0.0
def generate_graphs(self, occ=True, adj=True, inc=True,min_graph_size=1):
"""
Generate a spatial graph for each text in the corpus.
[TODO Description]
Parameters
----------
occ : boolean
Include cooccurrency relation in graph
adj : boolean
Include adjacency relation in graph
inc : boolean
Include inclusion relation in graph
"""
self.doc_id2graph = {}
del_node = []
for i in self.doc_id2text:
self.doc_id2graph[i] = self.loadGraph(
self.doc_id2text[i], occ, adj, inc)
if self.doc_id2graph[i] is None \
or len(self.doc_id2graph[i].graph.nodes()) < min_graph_size:
del_node.append(i)
self.not_found = len(del_node)
for i in del_node:
del self.doc_id2label[i]
del self.doc_id2text[i]
del self.doc_id2graph[i]
if self.bow_ is not None:
del self.id2bow_id[i]
def extract_common_statistic(self):
"""
Extract common statistics from the corpus
Statistics:
* Number of documents
* Number of documents without Spatial Entity
* Number of documents with Spatial Entity
* Average Size of a graph
* Standard Deviation of graph size
* Average Document size
* Average Edge Intersection length
* Average Node Intersection length
* Maximum Edge Intersection length
* Maximum Node Intersection length
* Average Node Jaccard Similarity
* Average Edge Jaccard Similarity
* Maximum Similarity between graphs
Returns
-------
dict
statistics dictionary
"""
stats = {}
stats["nb_doc"] = len(self.doc_id2graph.keys())
stats["nb_doc_without_SP_EN"] = self.not_found
stats["nb_doc_without_graph"] = self.not_found
stats["nb_doc_with_SP_EN"] = len(
self.doc_id2graph.keys()) - stats["nb_doc_without_SP_EN"]
sim_data = {}
max_sim_V1 = 0.0
documents_size = []
for kg1 in self.doc_id2graph:
g1 = self.doc_id2graph[kg1]
if g1 is None:
continue
documents_size.append(len(self.doc_id2graph[kg1].corpus))
for kg2 in self.doc_id2graph:
g2 = self.doc_id2graph[kg2]
if g1 is None or g2 is None:
continue
if g1 != g2 and not (g1, g2) in sim_data and not (g2, g1) in sim_data:
if len(g1.graph.nodes()) == 0 or len(g2.graph.nodes()) == 0:
continue
if self.doc_id2label[kg1] == self.doc_id2label[kg2]:
sim_data[(kg1, kg2)] = self.similarity(
kg1, kg2)
if sim_data[(kg1, kg2)] > max_sim_V1 and sim_data[(kg1, kg2)] < 1:
max_sim_V1 = sim_data[(kg1, kg2)]
i, size = 0, 0
size_list = []
for kg1 in self.doc_id2graph:
g1 = self.doc_id2graph[kg1]
if g1 is not None:
size_list.append(len(g1.graph.nodes()))
# Graph Size statistics
stats["avg_size"] = np.mean(size_list)
stats["std_size"] = np.std(size_list)
# Average Document size
stats["avg_document_size"] = np.mean(documents_size)
# Max similarity value
stats["max_sim_v1"] = max_sim_V1
return stats
def save_graph_data(self,occ,adj,inc,min_graph_size):
path_temp_dir = "temp_/{4}/{0}_{1}_{2}_{3}/".format(int(occ), int(adj), int(inc), min_graph_size,
self.corpus_name)
if not os.path.exists(path_temp_dir):
os.makedirs(path_temp_dir)
os.makedirs(path_temp_dir+"text")
os.makedirs(path_temp_dir + "graph")
for i in self.doc_id2graph:
nx.write_gexf(self.doc_id2graph[i].graph,path_temp_dir+"graph/{0}".format(i))
open(path_temp_dir + "corpus.json",'w').write(json.dumps(self.doc_id2text))
open(path_temp_dir + "labels.json", 'w').write(json.dumps(self.doc_id2label))
# coding = utf-8
from termcolor import colored
class NotADisambiguatorInstance(Exception):
def __init__(self):
# Call the base class constructor with the parameters it needs
super(Exception, self).__init__(colored("Setting disambiguator: Give a Disambiguator or Disambiguator sub-class instance","red"))
# coding = utf-8
from termcolor import colored
class LanguageNotAvailable(Exception):
def __init__(self, lang, object):
# Call the base class constructor with the parameters it needs
super(Exception, self).__init__("{0} not available for {1}".format(colored(lang,"red"), colored(object.__class__.__name__,"magenta")))
#coding = utf-8
from termcolor import colored
class ClassifierNotFound(Exception):
def __init__(self, file):
# Call the base class constructor with the parameters it needs
super(Exception, self).__init__("Classifier at {0} doesn't exists. Check your configuration file !".format(colored(file,"red")))
class BinairyDirectoryNotFound(Exception):
def __init__(self, dir,object):
# Call the base class constructor with the parameters it needs
super(Exception, self).__init__("Binairies for {0} at {1} doesn't exists. Check your configuration file !".format(colored(object.__class__.__name__,"magenta"),colored(dir,"red")))
class NotANERInstance(Exception):
def __init__(self):
# Call the base class constructor with the parameters it needs
super(Exception, self).__init__(colored("Setting Named Entity Recognizer: Give a NER or NER sub-class instance","red"))
# coding = utf-8
from termcolor import colored
class NotATaggerInstance(Exception):
def __init__(self):
# Call the base class constructor with the parameters it needs
super(Exception, self).__init__(colored("Setting pos-tagger: Give a Tagger or Tagger sub-class instance","red"))
import numpy as np
"""
Source : https://hackmd.io/s/ryFmIZrsl#
"""
def is_separating_axis(o, p1, p2):
"""
Return True and the push vector if o is a separating axis of p1 and p2.
Otherwise, return False and None.
"""
min1, max1 = float('+inf'), float('-inf')
min2, max2 = float('+inf'), float('-inf')
for v in p1:
projection = np.dot(v, o)
min1 = min(min1, projection)
max1 = max(max1, projection)
for v in p2:
projection = np.dot(v, o)
min2 = min(min2, projection)
max2 = max(max2, projection)
if max1 >= min2 and max2 >= min1:
d = min(max2 - min1, max1 - min2)
# push a bit more than needed so the shapes do not overlap in future
# tests due to float precision
d_over_o_squared = d/np.dot(o, o) + 1e-10
pv = d_over_o_squared*o
return False, pv
else:
return True, None
def edges_of(vertices):
"""
Return the vectors for the edges of the polygon p.
p is a polygon.
"""
edges = []
N = len(vertices)
for i in range(N):
edge = vertices[(i + 1)%N] - vertices[i]
edges.append(edge)
return edges
def orthogonal(v):
"""
Return a 90 degree clockwise rotation of the vector v.
"""
return np.array([-v[1], v[0]])
def collide(p1, p2):
'''
Return True and the MPV if the shapes collide. Otherwise, return False and
None.
p1 and p2 are lists of ordered pairs, the vertices of the polygons in the
counterclockwise direction.
'''
p1 = [np.array(v, 'float64') for v in p1]
p2 = [np.array(v, 'float64') for v in p2]
edges = edges_of(p1)
edges += edges_of(p2)
orthogonals = [orthogonal(e) for e in edges]
push_vectors = []
for o in orthogonals:
separates, pv = is_separating_axis(o, p1, p2)
if separates:
# they do not collide and there is no push vector
return False, None
else:
push_vectors.append(pv)
# they do collide and the push_vector with the smallest length is the MPV
mpv = min(push_vectors, key=(lambda v: np.dot(v, v)))
# assert mpv pushes p1 away from p2
d = centers_displacement(p1, p2) # direction from p1 to p2
if np.dot(d, mpv) > 0: # if it's the same direction, then invert
mpv = -mpv
return True, mpv
def centers_displacement(p1, p2):
"""
Return the displacement between the geometric center of p1 and p2.
"""
# geometric center
c1 = np.mean(np.array(p1), axis=0)
c2 = np.mean(np.array(p2), axis=0)
return c2 - c1
\ No newline at end of file
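# Quick self-check (not in the original file): two overlapping unit squares
# collide and yield a minimum push vector, while a distant square does not.
# Vertices are given counterclockwise, as required by collide().
if __name__ == "__main__":
    square_a = [(0, 0), (1, 0), (1, 1), (0, 1)]
    square_b = [(0.5, 0.5), (1.5, 0.5), (1.5, 1.5), (0.5, 1.5)]
    square_c = [(5, 5), (6, 5), (6, 6), (5, 6)]
    print(collide(square_a, square_b))  # (True, minimum push vector)
    print(collide(square_a, square_c))  # (False, None)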