Commit e9d151de authored by Fize Jacques

Debug spatial relation extraction

Debug disambiguator
delete old disambiguator classes
Add Parallelization for STR generation and Transform
parent 176a106e
Showing with 228 additions and 450 deletions
......@@ -80,7 +80,7 @@ def getGEO(id_se):
if "path" in data.other:
return gpd.read_file(os.path.join(config.osm_boundaries_directory, data.other["path"])).convex_hull
elif "coord" in data.other:
return gpd.GeoDataFrame(gpd.GeoSeries([Point(data.coord.lon, data.coord.lat).buffer(1.0)])).rename(
return gpd.GeoDataFrame(gpd.GeoSeries([Point(data.coord.lon, data.coord.lat).buffer(0.5)])).rename(
columns={0: 'geometry'})
return None
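This hunk shrinks the fallback footprint built around a point coordinate from a 1.0-degree to a 0.5-degree buffer. A minimal sketch of what that fallback geometry looks like, using shapely and geopandas with a made-up coordinate (not taken from the gazetteer):

```python
import geopandas as gpd
from shapely.geometry import Point

# Hypothetical (lon, lat) for an entity with no OSM boundary file available.
lon, lat = 2.35, 48.85

# Fallback footprint: a disc of 0.5 degrees (~55 km of latitude) around the point,
# wrapped the same way getGEO does, so the result has a 'geometry' column.
footprint = gpd.GeoDataFrame(
    gpd.GeoSeries([Point(lon, lat).buffer(0.5)])
).rename(columns={0: "geometry"})

print(footprint["geometry"].iloc[0].bounds)  # (1.85, 48.35, 2.85, 49.35)
```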
......@@ -144,4 +144,5 @@ def collisionTwoSEBoundaries(id_se1, id_se2):
__cache_adjacency[id_se1][id_se2] = True
return True
__cache_adjacency[id_se1][id_se2] = False
return False
\ No newline at end of file
return False
# coding = utf-8
from shapely.geometry import Point
from .collision import collide
from .geo_relation_database import GeoRelationMatchingDatabase
from ..helpers.geodict_helpers import gazetteer
class RelationExtractor():
__cache_entity_data = {}
def __init__(self, pre_computed={}):
self.db_rel_match = pre_computed
def is_relation(self, id_se1: str, id_se2: str):
raise NotImplementedError()
def get_data(self, id_se):
"""
Return a gazpy.Element object containing information about a spatial entity.
Parameters
----------
id_se : str
Identifier of the spatial entity
Returns
-------
gazpy.Element
data
"""
if id_se in RelationExtractor.__cache_entity_data:
return RelationExtractor.__cache_entity_data[id_se]
data = gazetteer.get_by_id(id_se)
if len(data) > 0:
RelationExtractor.__cache_entity_data[id_se] = data[0]
return data[0]
def in_cache(self, id_se1: str, id_se2: str):
raise NotImplementedError()
def add_cache(self,id_se1: str, id_se2: str, value : bool, two_way:bool = False):
if id_se1 not in self.db_rel_match:
self.db_rel_match[id_se1] = {}
if two_way and id_se2 not in self.db_rel_match:
self.db_rel_match[id_se2] = {}
self.db_rel_match[id_se1][id_se2] = value
if two_way:
self.db_rel_match[id_se2][id_se1] = value
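For reference, add_cache stores results in a nested dict keyed by the two entity identifiers, writing both directions when two_way is set. A stand-alone sketch of that cache shape (identifiers are made up; this is not the project's API):

```python
# Hypothetical identifiers; real ones come from the gazetteer (e.g. "GD1234").
cache = {}

def add_cache(cache, id_se1, id_se2, value, two_way=False):
    # Same nested shape as RelationExtractor.add_cache: cache[id_a][id_b] -> bool
    cache.setdefault(id_se1, {})[id_se2] = value
    if two_way:
        cache.setdefault(id_se2, {})[id_se1] = value

add_cache(cache, "GD1", "GD2", True, two_way=True)  # adjacency is stored symmetrically
add_cache(cache, "GD3", "GD1", True)                 # inclusion is directed
print(cache)  # {'GD1': {'GD2': True}, 'GD2': {'GD1': True}, 'GD3': {'GD1': True}}
```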
class InclusionRelation(RelationExtractor):
def __init__(self, precomputed={}):
RelationExtractor.__init__(self, precomputed)
def in_cache(self, id_se1: str, id_se2: str):
if id_se1 in self.db_rel_match and id_se2 in self.db_rel_match[id_se1]:
return True, self.db_rel_match[id_se1][id_se2]
return False, False
def is_relation(self, id_se1: str, id_se2: str):
found_, value = self.in_cache(id_se1, id_se2)
if found_:
return value
inc_chain_P131, inc_chain_P706 = self.get_inclusion_chain(id_se1, "P131"), self.get_inclusion_chain(id_se1,"P706")
inc_chain = inc_chain_P131
inc_chain.extend(inc_chain_P706)
inc_chain = set(inc_chain)
if id_se2 in inc_chain:
self.add_cache(id_se1, id_se2, True)
return True
self.add_cache(id_se1, id_se2, False)
return False
def get_inclusion_chain(self, id_, prop):
"""
For an entity, return its geographical inclusion chain using a property.
"""
arr__ = []
try:
current_entity = gazetteer.get_by_id(id_)[0]
if "inc_" + prop in current_entity.other:
arr__ = current_entity.other["inc_" + prop]
elif "inc_geoname" in current_entity.other:
arr__ = current_entity.other.inc_geoname
if isinstance(arr__, str):
arr__ = [arr__]
except:
pass
return arr__
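InclusionRelation.is_relation answers "is id_se1 inside id_se2?" by testing whether id_se2 appears in the union of id_se1's P131 (administrative) and P706 (located-in-terrain-feature) inclusion chains. A minimal restatement with stubbed chains (identifiers and chains are invented; the real ones come from the gazetteer):

```python
# Stubbed inclusion chains keyed by (entity id, property).
chains = {
    ("GD_paris", "P131"): ["GD_iledefrance", "GD_france"],
    ("GD_paris", "P706"): ["GD_paris_basin"],
}

def is_included(id_se1, id_se2):
    # Union of both chains, as in is_relation, then a simple membership test.
    chain = set(chains.get((id_se1, "P131"), [])) | set(chains.get((id_se1, "P706"), []))
    return id_se2 in chain

print(is_included("GD_paris", "GD_france"))       # True
print(is_included("GD_paris", "GD_paris_basin"))  # True
print(is_included("GD_paris", "GD_spain"))        # False
```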
class AdjacencyRelation(RelationExtractor):
def __init__(self, precomputed={},inc_rel_extractor=InclusionRelation()):
RelationExtractor.__init__(self, precomputed)
self.inc_rel_extractor=inc_rel_extractor
def in_cache(self, id_se1: str, id_se2: str):
if id_se1 in self.db_rel_match and id_se2 in self.db_rel_match[id_se1]:
return True, self.db_rel_match[id_se1][id_se2]
elif id_se2 in self.db_rel_match and id_se1 in self.db_rel_match[id_se2]:
return True, self.db_rel_match[id_se2][id_se1]
return False, False
def is_relation(self, id_se1: str, id_se2: str):
found_, value = self.in_cache(id_se1, id_se2)
if found_:
return value
stop_class = {"A-PCLI", "A-ADM1"}
def get_p47_adjacency_data(data):
p47se1 = []
for el in data.other.P47:
d = gazetteer.get_by_other_id(el, "wikidata")
if not d: continue
p47se1.append(d[0].id)
return p47se1
if self.inc_rel_extractor.is_relation(id_se1, id_se2) or self.inc_rel_extractor.is_relation(id_se2, id_se1):
self.add_cache(id_se1, id_se2, False, True)
return False
data_se1, data_se2 = self.get_data(id_se1), self.get_data(id_se2)
if not data_se1 or not data_se2:
self.add_cache(id_se1, id_se2, False, True)
return False
if "P47" in data_se2.other and id_se1 in get_p47_adjacency_data(data_se2):
self.add_cache(id_se1, id_se2, True, True)
return True
elif "P47" in data_se1.other and id_se2 in get_p47_adjacency_data(data_se1):
self.add_cache(id_se1, id_se2, True,True)
return True
if collide(id_se1, id_se2):
self.add_cache(id_se1, id_se2, True,True)
return True
if data_se1 and data_se2 and "coord" in data_se1 and "coord" in data_se2:
if Point(data_se1.coord.lon, data_se1.coord.lat).distance(
Point(data_se2.coord.lon, data_se2.coord.lat)) < 1 and len(
set(data_se1.class_) & stop_class) < 1 and len(set(data_se2.class_) & stop_class) < 1:
self.add_cache(id_se1, id_se2, True,True)
return True
self.add_cache(id_se1, id_se2, False,True)
return False
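The adjacency test above runs through a fixed cascade: cache hit, then "inclusion excludes adjacency", then a Wikidata P47 link, then a hull collision, and finally a distance check between the two points that is skipped for countries and first-level regions. A simplified restatement of that ordering (not the actual method; the inputs are pre-computed flags):

```python
def adjacency_decision(*, cached=None, included=False, p47_linked=False,
                       hulls_collide=False, point_distance_deg=None,
                       classes1=(), classes2=()):
    """Order of the checks in AdjacencyRelation.is_relation, restated."""
    stop_class = {"A-PCLI", "A-ADM1"}  # countries / first-level admin regions
    if cached is not None:
        return cached                  # 1. cached result
    if included:
        return False                   # 2. inclusion (either direction) excludes adjacency
    if p47_linked:
        return True                    # 3. Wikidata P47 "shares border with"
    if hulls_collide:
        return True                    # 4. convex hulls collide
    if (point_distance_deg is not None and point_distance_deg < 1
            and not set(classes1) & stop_class and not set(classes2) & stop_class):
        return True                    # 5. points under 1 degree apart, neither a country/region
    return False

print(adjacency_decision(p47_linked=True))                     # True
print(adjacency_decision(point_distance_deg=0.4))              # True
print(adjacency_decision(included=True, hulls_collide=True))   # False
```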
......@@ -74,11 +74,11 @@ class RelationExtractor(MetaCollector):
spatial_entities : list
list of spatial entity identifiers
"""
self.spatial_entities = spatial_entities
self.spatial_entities = list(set(spatial_entities))
# Retrieve Geometries
data = [[sp_id, getGEO(sp_id)] for sp_id in
tqdm(spatial_entities, desc="Retrieving Geometries...")]
tqdm(self.spatial_entities, desc="Retrieving Geometries...")]
self.all_geometry = []
for i in data:
......@@ -116,18 +116,18 @@ class RelationExtractor(MetaCollector):
except Exception as e:
print(e)
corr_ = gdf_intersect.iloc[:, 2:] ^ (gdf_within.iloc[:,2:] | gdf_within.iloc[:,2:].T) # An entity cannot be related to another entity by two types of relation
adj_ = gdf_intersect.iloc[:, 2:] & corr_ # intersecting pairs already related by inclusion are not counted as adjacent
gdf_intersect.set_index("id", inplace=True)
gdf_within.set_index("id", inplace=True)
del gdf_intersect["geometry"]
del gdf_within["geometry"]
gdf_adjacency = gdf_within.iloc[:, :2]
gdf_adjacency = pd.concat((gdf_adjacency, adj_), axis=1) # stick id and geom columns onto the adjacency data
corr_ = gdf_intersect ^ (gdf_within | gdf_within.T) # An entity cannot be related to another entity by two types of relation
adj_ = gdf_intersect & corr_ # intersecting pairs already related by inclusion are not counted as adjacent
del gdf_adjacency["geometry"]
del gdf_within["geometry"]
# Transform to dict for faster access
self.adjacency_geom = gdf_adjacency.set_index("id")
self.inclusion_geom = gdf_within.set_index("id")
self.adjacency_geom = adj_
self.inclusion_geom = gdf_within
def get_relation_meta_based(self):
"""
......@@ -158,7 +158,7 @@ class RelationExtractor(MetaCollector):
adj_res[se2][se1] = adj_res[se1][se2]
self.adjacency_meta = pd.DataFrame.from_dict(adj_res)
self.inclusion_meta = pd.DataFrame.from_dict(inc_res)
self.inclusion_meta = pd.DataFrame.from_dict(inc_res,orient="index")
def fuse_meta_and_geom(self):
"""
......@@ -176,16 +176,15 @@ class RelationExtractor(MetaCollector):
self.inclusion_meta.sort_index(inplace=True)
self.adjacency_geom.sort_index(inplace=True)
self.inclusion_geom.sort_index(inplace=True)
self.adjacency_meta.sort_index(axis=1, inplace=True)
self.inclusion_meta.sort_index(axis=1, inplace=True)
self.adjacency_geom.sort_index(axis=1, inplace=True)
self.inclusion_geom.sort_index(axis=1, inplace=True)
df_adj = self.adjacency_meta.copy()
df_inc = self.inclusion_meta.copy()
df_adj.iloc[:, :] = self.adjacency_meta | self.adjacency_geom
df_inc.iloc[:, :] = self.inclusion_meta | self.inclusion_geom
df_inc = self.inclusion_meta
self.adjacency_geom = (self.adjacency_geom ^ (self.inclusion_meta | self.inclusion_meta.T)) & self.adjacency_geom
self.adjacency_meta = (self.adjacency_meta ^ (self.inclusion_meta | self.inclusion_meta.T)) & self.adjacency_meta
df_adj = (self.adjacency_geom | self.adjacency_meta)
return df_adj, df_inc
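Both the geometry-based step and fuse_meta_and_geom rely on the same boolean-matrix trick: a pair only stays adjacent if it intersects and is not in an inclusion relation in either direction, hence the `x ^ (inc | inc.T)` followed by `& x` pattern. A toy pandas illustration with three invented entities:

```python
import pandas as pd

ids = ["GD1", "GD2", "GD3"]
# GD1 intersects GD2 and GD3; GD1 is also included in GD2.
intersects = pd.DataFrame([[False, True, True],
                           [True, False, False],
                           [True, False, False]], index=ids, columns=ids)
within = pd.DataFrame([[False, True, False],
                       [False, False, False],
                       [False, False, False]], index=ids, columns=ids)

corr_ = intersects ^ (within | within.T)  # pairs not already explained by inclusion
adjacency = intersects & corr_            # ... that also intersect
print(adjacency.loc["GD1", "GD2"])  # False: inclusion wins over adjacency
print(adjacency.loc["GD1", "GD3"])  # True
```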
......
......@@ -39,7 +39,7 @@ class STR(object):
"""
__cache_entity_data = {} # Store data about entity requested
def __init__(self, tagged_text, spatial_entities,toponym_first=True):
def __init__(self, tagged_text, spatial_entities,toponym_first=True, precomputed_inc={}, precomputed_adj={}):
"""
Constructor
......@@ -64,8 +64,8 @@ class STR(object):
self.adjacency_relationships = {}
self.inclusion_relationships = {}
self.adj_rel_db=AdjacencyRelation()
self.inc_rel_db = InclusionRelation()
self.inc_rel_db = InclusionRelation(precomputed_inc)
self.adj_rel_db = AdjacencyRelation(precomputed_adj,self.inc_rel_db)
self.graph = nx.MultiDiGraph()
......
......@@ -30,8 +30,10 @@ class Disambiguator(object):
dict
{toponym : geodictID}
"""
if isinstance(ner_output, np.ndarray) and ner_output.shape[1] == 2:
if isinstance(ner_output, np.ndarray) and len(ner_output.shape) == 2 and ner_output.shape[1] == 2:
toponyms = self.parse_ner_output(ner_output)
elif len(ner_output.shape) != 2:
return {}
elif not toponyms:
raise ValueError("Either enter a list of toponyms or give ner_output")
if self.context_based:
......
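The tightened check only parses ner_output when it is a 2-D array with two columns (token, tag) and now returns an empty mapping for arrays of any other dimensionality instead of failing. A small illustration of the accepted shape (the tag strings are placeholders, not necessarily the library's unified tags):

```python
import numpy as np

ner_output = np.array([["I", "O"],
                       ["visited", "O"],
                       ["Montpellier", "PLACE"]])
print(ner_output.shape)  # (3, 2) -> accepted and passed to parse_ner_output
print(len(ner_output.shape) == 2 and ner_output.shape[1] == 2)  # True

bad_output = np.array(["not", "token/tag", "pairs"])
print(len(bad_output.shape) == 2)  # False -> disambiguate() now returns {}
```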
# coding = utf-8
\ No newline at end of file
# coding = utf-8
import copy
import string
import numpy as np
from ..ner.ner import NER
class Disambiguator(object):
def __init__(self):
"""Constructor for Disambiguator"""
pass
def extract_se_entities(self, input):
out = Disambiguator.parse_corpus(input)
en_ = out[out[:, 1] == NER._unified_tag["place"]]
return self.toponymes_frequencies(en_[:, 0]), np.unique(en_[:, 0])
def toponymes_frequencies(self, ens_):
count = {}
for en in ens_:
if not en in count: count[en] = 0
count[en] += 1
return count
@staticmethod
def parse_corpus(corpus):
final_corpus = []
t = 0
placeTag = NER._unified_tag["place"]
while t < len(corpus):
tag = copy.copy(corpus[t])
if tag[1] == "BEG-" + NER._unified_tag["place"] or tag[1] == placeTag:
lenw = 1
if tag[1] == "BEG-" + placeTag:
compound_tag = tag[0]
t += 1
while corpus[t][1] == "END-" + placeTag or corpus[t][1] == placeTag:
tag = copy.copy(corpus[t])
if tag[0][-1] in string.punctuation or compound_tag[-1] in string.punctuation:
compound_tag += tag[0]
else:
compound_tag += " " + tag[0]
t += 1
lenw += 1
tag[0] = compound_tag
tag[1] = placeTag
t += 1
else:
t += 1
final_corpus.append(tag)
return np.array(final_corpus)
def disambiguate(self, ner_result):
pass
def disambiguate_list(self,toponyms,lang):
pass
\ No newline at end of file
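parse_corpus above walks the tagged token list and glues BEG-/END-tagged place tokens back into one compound toponym row. An independent sketch of that merging idea, assuming the unified place tag is the string "PLACE" (the real value lives in NER._unified_tag, and the original also handles punctuation joining and bare place tags):

```python
import numpy as np

PLACE = "PLACE"  # stand-in for NER._unified_tag["place"]

tagged = [["I", "O"], ["visited", "O"],
          ["New", "BEG-" + PLACE], ["York", "END-" + PLACE],
          ["yesterday", "O"]]

merged, i = [], 0
while i < len(tagged):
    token, tag = tagged[i]
    if tag == "BEG-" + PLACE:
        parts = [token]
        i += 1
        # consume the rest of the compound toponym
        while i < len(tagged) and tagged[i][1] in (PLACE, "END-" + PLACE):
            parts.append(tagged[i][0])
            i += 1
        merged.append([" ".join(parts), PLACE])
    else:
        merged.append([token, tag])
        i += 1

print(np.array(merged))
# [['I' 'O'] ['visited' 'O'] ['New York' 'PLACE'] ['yesterday' 'O']]
```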
# coding = utf-8
import math
from ...helpers.collision import *
#from ...helpers.geodict_helpers_old import *
from ...helpers.geodict_helpers import *
from .disambiguator import Disambiguator
from ...models.str import get_inclusion_chain
class GauravGeodict(Disambiguator):
def __init__(self):
Disambiguator.__init__(self)
def fib_formula(self, n):
if n in [0, 1]: return 0 # Modifying fibonacci behaviour
golden_ratio = (1 + math.sqrt(5)) / 2
val = (golden_ratio ** n - (1 - golden_ratio) ** n) / math.sqrt(5)
return int(round(val))
def inclusion_log(self, x, alpha=0.2):
if x==0:
return 1
return math.log(x)
def get_inclusion_tree(self, id_, prop):
"""
For an entity, return its geographical inclusion tree using a property.
"""
arr = []
current_entity = gazetteer.get_by_id(id_)[0]
while True:
if prop in current_entity:
arr.append(current_entity[prop][0])
current_entity = gazetteer.get_by_other_id(current_entity[prop][0],"wikidata")
else:
arr.append(gazetteer.get_by_label("Earth","en")[0].id) # Earth ID
break
return arr
def get_inclusion_score(self, id1, id2): # is it really inclusion ? :)
list1 = get_inclusion_chain(id1, 'P131')
list2 = get_inclusion_chain(id2, 'P131')
interP131 = len(list(set(list1).intersection(list2)))
list1 = get_inclusion_chain(id1, 'P706')
list2 = get_inclusion_chain(id2, 'P706')
interP706 = len(list(set(list1).intersection(list2)))
# return fib_no[interP131]+fib_no[interP706]
return self.inclusion_log(interP131) + self.inclusion_log(interP706)
def Adjacency_P47(self, id1, id2):
data_1, data_2 = gazetteer.get_by_id(id1)[0], gazetteer.get_by_id(id2)[0]
if "P47" in data_1 and "P47" in data_2:
if id1 in data_2.other.P47 or id2 in data_1.other.P47:
return True
return False
def Adjacency_Hull(self, id1, id2):
return collisionTwoSEBoundaries(id1, id2)
def disambiguateOne(self, spat_candidates, fixed_entities):
score_dc = {}
for cand in spat_candidates:
id_cand = cand.id
score_dc[id_cand] = 0
for fixed in fixed_entities:
id_fixed = fixed_entities[fixed].id
if self.Adjacency_P47(id_cand, id_fixed):
score_dc[id_cand] += 3
elif self.Adjacency_Hull(id_cand, id_fixed):
score_dc[id_cand] += 2
score_dc[id_cand] += self.get_inclusion_score(id_cand, id_fixed)
m = max(score_dc, key=score_dc.get)
if score_dc[m] < 4:
return None
for cand in spat_candidates:
if cand.id == m:
return cand.id
def eval(self,se_,lang):
selected_en = {}
fixed_entities = {}
ambiguous_entities = {}
for en in se_:
request = gazetteer.get_by_label(en, lang)
if len(request) == 0:
request = gazetteer.get_by_alias(en, lang)
if len(request) > 1:
ambiguous_entities[en] = request
elif len(request) == 1:
fixed_entities[en] = request[0]
d_amb_results = {}
for amb_ent in ambiguous_entities:
d = self.disambiguateOne(ambiguous_entities[amb_ent], fixed_entities)
if not d:
d_amb_results[amb_ent] = get_most_common_id_v3(amb_ent, lang).id
else:
d_amb_results[amb_ent] = d
#print(fixed_entities)
for k, v in fixed_entities.items():
fixed_entities[k] = v.id
for k, v in d_amb_results.items():
fixed_entities[k] = v
return fixed_entities
\ No newline at end of file
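disambiguateOne scores each ambiguous candidate against the already fixed entities: +3 for a Wikidata P47 link, otherwise +2 for a hull collision, plus the log-based inclusion score, and a candidate is kept only when its best score reaches 4. A small worked example of that arithmetic (overlap sizes are invented; note that inclusion_log returns 1 for an empty overlap and log(1)=0 for an overlap of one):

```python
import math

def inclusion_log(x):
    # same shape as GauravGeodict.inclusion_log
    return 1 if x == 0 else math.log(x)

# Hypothetical candidate sharing a P131 chain overlap of 3 and no P706 overlap
# with a fixed entity, and linked to it through Wikidata P47.
inclusion_score = inclusion_log(3) + inclusion_log(0)  # ~1.10 + 1
total = 3 + inclusion_score                             # +3 for the P47 adjacency
print(round(total, 2))  # 5.1 -> above the acceptance threshold of 4
```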
# coding = utf-8
\ No newline at end of file
# coding = utf-8
class BigramModel:
def __init__(self,freq={},count={}):
self.cooc_freq=freq
self.count_associated=count
def append(self,uri1,uri2):
if not uri1 in self.cooc_freq:
self.cooc_freq[uri1]={}
if not uri2 in self.cooc_freq[uri1]:
self.cooc_freq[uri1][uri2]=0
self.cooc_freq[uri1][uri2]+=1
self.increment_count(uri2)
def increment_count(self,uri):
if not uri in self.count_associated:
self.count_associated[uri]=0
self.count_associated[uri]+=1
def get_coocurence_probability(self, pr1, *args):
if len(args) < 2:
print("Only one URI indicated")
return 0.
res_=1.
for u in range(1,len(args)):
res_*=self.get_bigram_probability(args[0],args[u],pr1)
return res_
def get_bigram_probability(self,uri1,uri2,pr1=1):
nna=0.00000001
if uri1 in self.cooc_freq:
if uri2 in self.cooc_freq[uri1]:
return self.cooc_freq[uri1][uri2]
#return (self.cooc_freq[uri1][uri2] / self.count_associated[uri1])+pr1
elif uri2 in self.cooc_freq:
if uri1 in self.cooc_freq[uri2]:
return self.cooc_freq[uri2][uri1]
#return (self.cooc_freq[uri2][uri1] / self.count_associated[uri1])+pr1
return nna
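BigramModel keeps raw co-occurrence counts per ordered pair of entity URIs; as written, get_bigram_probability returns the stored count (the normalised variant is commented out) and a tiny constant for unseen pairs, and get_coocurence_probability multiplies those values over the URI list. A minimal usage sketch with invented URIs, assuming the class above is importable:

```python
model = BigramModel()
model.append("GD_paris", "GD_france")
model.append("GD_paris", "GD_france")
model.append("GD_paris", "GD_lyon")

print(model.get_bigram_probability("GD_paris", "GD_france"))  # 2 (raw count)
print(model.get_bigram_probability("GD_paris", "GD_berlin"))  # 1e-08 fallback for unseen pairs
print(model.get_coocurence_probability(1, "GD_paris", "GD_france", "GD_lyon"))  # 2.0
```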
# coding = utf-8
from ...helpers.geodict_helpers import *
from .disambiguator import Disambiguator
import re, json, os
from ...config.configuration import config
from inflector import Inflector,English,Spanish,French
inflectors= {
"en":Inflector(English()),
"fr":Inflector(French()),
"es":Inflector(Spanish())
}
stop_words = {
"fr": set(open(os.path.join(config.language_resources_path,"stop_words_fr.txt")).read().split("\n")),
"en": set(open(os.path.join(config.language_resources_path,"stop_words_en.txt")).read().split("\n"))
}
common_words = {
"fr": set(json.load(open(os.path.join(config.language_resources_path,"dic_fr.json")))),
"en": set(open(os.path.join(config.language_resources_path,"english_common_words_filtered.txt")).read().split("\n"))
}
class MostCommonDisambiguator(Disambiguator):
def __init__(self):
Disambiguator.__init__(self)
def disambiguate(self, ner_result, lang="en"):
count, se_ = self.extract_se_entities(ner_result)
new_count = {}
selected_en = {}
for en in se_:
id_,score=self.disambiguate_(en,lang)
if not id_ == "O" and id_:
selected_en[id_] = en
new_count[id_] = count[en]
return new_count, selected_en
def disambiguate_list(self,toponyms,lang):
result={}
for toponym in toponyms:
id_,_=self.disambiguate_(toponym,lang)
if id_:
result[id_]=toponym
return result
def disambiguate_(self, label, lang='fr'):
if re.match("^\d+$", label):
return 'O', -1
if lang in stop_words: #and lang in common_words:
if label.lower().rstrip("s") in stop_words[lang]: #or label.lower().rstrip("s") in common_words[lang]:
return 'O', -1
if lang in inflectors:
plural=inflectors[lang].singularize(label)
else:
plural = label.rstrip("s") + "s"
if plural.lower() in stop_words[lang]:# or plural.lower() in common_words[lang]:
return 'O', -1
data=get_most_common_id_v3(label, lang)
id_, score=None,0
if data:
id_,score=data.id,data.score
return id_, score
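disambiguate_ first discards labels that are pure numbers or stop words, singularising with the language's inflector when one is available, and only then asks the gazetteer for the most common entry. A sketch of the early-exit filtering alone, using the crude rstrip-based fallback instead of the inflector and an invented stop-word list:

```python
import re

stop_words_en = {"the", "north", "south"}  # stand-in; the real lists come from resource files

def looks_spurious(label, stop_words):
    # Mirrors the early exits of disambiguate_(): numbers and stop words map to 'O'.
    if re.match(r"^\d+$", label):
        return True
    if label.lower().rstrip("s") in stop_words:
        return True
    plural = label.rstrip("s") + "s"  # crude fallback when no inflector is configured
    return plural.lower() in stop_words

print(looks_spurious("1842", stop_words_en))         # True  -> tagged 'O'
print(looks_spurious("North", stop_words_en))        # True  -> stop word
print(looks_spurious("Montpellier", stop_words_en))  # False -> goes to the gazetteer
```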
# coding = utf-8
import re
from .disambiguator import Disambiguator
from .models.bigram import BigramModel
import pickle
from ...config.configuration import config
#from ...helpers.geodict_helpers_old import *
from ...helpers.geodict_helpers import *
from .most_common import stop_words,common_words
import networkx as nx
def read_pickle(fn):
return pickle.load(open(fn,'rb'))
class WikipediaDisambiguator(Disambiguator):
def __init__(self,measure="degree"):
Disambiguator.__init__(self)
# Load model
self.model=BigramModel(read_pickle(config.wiki_cooc_dis.cooc_freq),read_pickle(config.wiki_cooc_dis.count))
self.measure=measure
def disambiguate(self, ner_result, lang="en"):
count, se_ = self.extract_se_entities(ner_result)
new_count = {}
selected_en_rev = {}
selected_en = self.disambiguate_wiki(se_,lang)
for en in selected_en:
selected_en_rev[en]=selected_en[en]
#new_count[selected_en[en]] = count[en]
return new_count, selected_en
def disambiguate_list(self,toponyms,lang):
result=self.disambiguate_wiki(toponyms,lang)
return {k:v for k,v in result.items() if v}
def disambiguate_wiki(self, entities, lang):
spat_en=[]
for e in entities:
if re.match("^\d+$", e):
continue
if lang in stop_words and e.lower().rstrip("s") in stop_words[lang]:# or e.lower().rstrip("s") in common_words[lang]:
continue
plural = e.rstrip("s") + "s"
if lang in stop_words and plural.lower() in stop_words[lang]: #or plural.lower() in common_words[lang]:
continue
spat_en.append(e)
spat_en=list(set(spat_en))
g = nx.Graph()
possible_candidates = []
betw_cand={} # indicates which toponym group a candidate belongs to # maybe useless ...
group_candidate = {} # candidates per toponym
for e in spat_en:
cand = get_top_candidate(e, lang, 5)#get_top_candidate(e, lang,4)
cand = [c.id for c in cand if c]
if not cand:
cand = [c.id for c in gazetteer.get_n_label_similar(e,lang,5) if c]
group_candidate[e] = cand
betw_cand[e]=cand
for n in cand:
betw_cand[n]=set(cand)-set(n)
possible_candidates.extend(cand)
for cand in possible_candidates:
g.add_node(cand, label=gazetteer.get_by_id(cand)[0].label[lang])
data_candidate={ca :gazetteer.get_by_id(ca)[0] for ca in possible_candidates}
for cand in possible_candidates:
for cand2 in possible_candidates:
# Get PageRank score
d = data_candidate[cand]
sc = 1
sc=d.score
# Compute probability
prob = self.model.get_coocurence_probability(sc, cand, cand2)
if cand2 in betw_cand[cand] or cand in betw_cand[cand2]:
prob = 0.0
if prob < 0.0000001:
prob = 0.0
if not cand == cand2:
# keep the lowest co-occurrence probability between two candidates
if g.has_edge(cand2, cand) :
if g.edges[cand2,cand]["weight"] < prob:
continue
g.add_edge(cand, cand2, weight=prob)
selected = {}
# Take the candidate with the highest weighted degree per toponym group
for gr in group_candidate:
try:
if self.measure == "degree":
selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight'))
elif self.measure == "centrality":
selected[gr] = max(group_candidate[gr], key=lambda x: nx.closeness_centrality(g, x, distance="weight"))
else:# degree by default
selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight'))
#print(1)
except Exception as e:
selected[gr]=get_most_common_id_v3(gr,lang)
return selected
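Each toponym's candidates end up as nodes of a co-occurrence graph whose edge weights come from the bigram model, and the candidate with the highest weighted degree (or closeness centrality) wins its toponym group. A toy networkx illustration of the degree-based selection with invented identifiers and weights:

```python
import networkx as nx

g = nx.Graph()
# Two candidates for the toponym "Paris"; weights stand in for co-occurrence scores.
g.add_edge("GD_paris_fr", "GD_france", weight=0.9)
g.add_edge("GD_paris_fr", "GD_lyon", weight=0.7)
g.add_edge("GD_paris_tx", "GD_france", weight=0.1)

group_candidate = {"Paris": ["GD_paris_fr", "GD_paris_tx"]}
selected = {top: max(cands, key=lambda c: g.degree(c, weight="weight"))
            for top, cands in group_candidate.items()}
print(selected)  # {'Paris': 'GD_paris_fr'} (weighted degree 1.6 vs 0.1)
```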
from .spacy import Spacy
from .nltk import NLTK
from .polyglot import Polyglot
#from .polyglot import Polyglot
from .stanford_ner import StanfordNER
from .ner import NER
\ No newline at end of file
......@@ -2,8 +2,10 @@
import re
from nltk import word_tokenize
from joblib import Parallel, delayed
from strpython.models.str import STR
from .models.spatial_relation import RelationExtractor
from .models.str import STR
from .models.transformation.transform import Generalisation, Expansion
from .nlp.disambiguator import *
......@@ -11,12 +13,13 @@ from .nlp.ner import *
from .nlp.exception.disambiguator import NotADisambiguatorInstance
from .nlp.exception.ner import NotANERInstance
from .nlp.exception.tagger import NotATaggerInstance
from .nlp.pos_tagger.tagger import Tagger
from .nlp.pos_tagger.treetagger import TreeTagger
from multiprocessing import cpu_count
from mytoolbox.env import in_notebook
if in_notebook():
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
else:
from tqdm import tqdm
class Pipeline(object):
......@@ -76,28 +79,46 @@ class Pipeline(object):
else:
raise NotADisambiguatorInstance()
def build(self,text,se_identified=None, **kwargs):
"""
Return the corresponding STR for a text.
:param text:
:return: STR
"""
toponyms= kwargs.get("toponyms", None)
stop_words=kwargs.get("stop_words",[])
if isinstance(toponyms,list):
se_identified = self.disambiguator.disambiguate(self.lang,toponyms=[top for top in toponyms if not top.lower() in stop_words and not len(re.findall("\d+",top)) != 0 and len(top)>3])
input = ""
elif se_identified:
input, se_identified = self.parse(text)
else:
input,se_identified=self.parse(text)
str_=STR(word_tokenize(input),se_identified,toponym_first=True)
str_.build(adj=True,inc=True)
str_=self.transform(str_,**kwargs)
def extract_all_relation(self,spatial_entities):
"""
Extract relation information between spatial entities
Parameters
----------
spatial_entities
Returns
-------
"""
r = RelationExtractor(spatial_entities)
r.get_relation_geometry_based()
r.get_relation_meta_based()
df_adj, df_inc = r.fuse_meta_and_geom()
dict_adj = df_adj.to_dict()
dict_inc = df_inc.to_dict()
return dict_adj, dict_inc
def pipe_build(self,texts, cpu_count=cpu_count(),**kwargs):
text_and_spatial_entities = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.parse)(text) for text in tqdm(texts,desc="Extract spatial entities from the texts"))
sp_es= []
for res in text_and_spatial_entities:
sp_es.extend(list(res[1].values()))
sp_es= [es for es in sp_es if es.startswith("GD")]
print("Extract Spatial Relation for all identified spatial entities")
adj_rel_dict, inc_rel_dict = self.extract_all_relation(sp_es)
str_s = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.build)(ext[0], ext[1], adj_rel_dict, inc_rel_dict, **kwargs) for ext in tqdm(text_and_spatial_entities, desc="Build STR"))
return str_s
def pipe_transform(self,strs_, cpu_count=cpu_count(),**kwargs):
str_s = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.transform)(str_, **kwargs) for str_ in tqdm(strs_,desc="Transform STR"))
return str_s
def build(self, text_input, spatial_entities_identified, prec_adj, prec_inc):
str_ = STR(word_tokenize(text_input), spatial_entities_identified, toponym_first=True,precomputed_adj=prec_adj,precomputed_inc=prec_inc)
str_.build(adj=True, inc=True)
return str_
def transform(self,str_,**kwargs):
......
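With the new methods, pipe_build first extracts spatial entities from every text in parallel, computes the adjacency and inclusion relations once for all identified entities, and then builds each STR with those pre-computed relations handed down to AdjacencyRelation/InclusionRelation. A hedged usage sketch; the module path and constructor arguments are assumptions, since the Pipeline constructor is not shown in this diff:

```python
# Assumed import path and constructor keyword; adjust to the actual Pipeline definition.
from strpython.pipeline import Pipeline

texts = [
    "We drove from Montpellier to Nimes.",
    "Barcelona is close to the Pyrenees.",
]

pipeline = Pipeline(lang="en")                # 'lang' assumed from the self.lang usage above
strs = pipeline.pipe_build(texts)             # parallel parsing + shared relation extraction
transformed = pipeline.pipe_transform(strs)   # parallel transformation of the resulting STRs

for str_ in transformed:
    print(len(str_.graph))                    # node count of each STR's MultiDiGraph
```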