From e9d151de8c505f1edfb680c0c376a3ac43160a7e Mon Sep 17 00:00:00 2001 From: Fize Jacques <jacques.fize@cirad.fr> Date: Mon, 18 Mar 2019 17:03:01 +0100 Subject: [PATCH] Debug spatial relation extraction Debug disambiguator delete old disambiguator classes Add Parallelization for STR generation and Transform --- strpython/helpers/collision.py | 5 +- strpython/helpers/relation_extraction.py | 157 ++++++++++++++++++ strpython/models/spatial_relation.py | 31 ++-- strpython/models/str.py | 6 +- strpython/nlp/disambiguator/disambiguator.py | 4 +- strpython/nlp/disambiguator_old/__init__.py | 1 - .../nlp/disambiguator_old/disambiguator.py | 62 ------- .../nlp/disambiguator_old/geodict_gaurav.py | 111 ------------- .../nlp/disambiguator_old/models/__init__.py | 1 - .../nlp/disambiguator_old/models/bigram.py | 46 ----- .../nlp/disambiguator_old/most_common.py | 71 -------- .../nlp/disambiguator_old/wikipedia_cooc.py | 110 ------------ strpython/nlp/ner/__init__.py | 2 +- strpython/pipeline.py | 71 +++++--- 14 files changed, 228 insertions(+), 450 deletions(-) create mode 100644 strpython/helpers/relation_extraction.py delete mode 100644 strpython/nlp/disambiguator_old/__init__.py delete mode 100644 strpython/nlp/disambiguator_old/disambiguator.py delete mode 100644 strpython/nlp/disambiguator_old/geodict_gaurav.py delete mode 100644 strpython/nlp/disambiguator_old/models/__init__.py delete mode 100644 strpython/nlp/disambiguator_old/models/bigram.py delete mode 100644 strpython/nlp/disambiguator_old/most_common.py delete mode 100644 strpython/nlp/disambiguator_old/wikipedia_cooc.py diff --git a/strpython/helpers/collision.py b/strpython/helpers/collision.py index 25d4f95..86a9dbb 100644 --- a/strpython/helpers/collision.py +++ b/strpython/helpers/collision.py @@ -80,7 +80,7 @@ def getGEO(id_se): if "path" in data.other: return gpd.read_file(os.path.join(config.osm_boundaries_directory, data.other["path"])).convex_hull elif "coord" in data.other: - return gpd.GeoDataFrame(gpd.GeoSeries([Point(data.coord.lon, data.coord.lat).buffer(1.0)])).rename( + return gpd.GeoDataFrame(gpd.GeoSeries([Point(data.coord.lon, data.coord.lat).buffer(0.5)])).rename( columns={0: 'geometry'}) return None @@ -144,4 +144,5 @@ def collisionTwoSEBoundaries(id_se1, id_se2): __cache_adjacency[id_se1][id_se2] = True return True __cache_adjacency[id_se1][id_se2] = False - return False \ No newline at end of file + return False + diff --git a/strpython/helpers/relation_extraction.py b/strpython/helpers/relation_extraction.py new file mode 100644 index 0000000..86585e7 --- /dev/null +++ b/strpython/helpers/relation_extraction.py @@ -0,0 +1,157 @@ +# coding = utf-8 +from shapely.geometry import Point + +from .collision import collide +from .geo_relation_database import GeoRelationMatchingDatabase +from ..helpers.geodict_helpers import gazetteer + + +class RelationExtractor(): + __cache_entity_data = {} + + def __init__(self, pre_computed={}): + self.db_rel_match = pre_computed + + def is_relation(self, id_se1: str, id_se2: str): + raise NotImplementedError() + + def get_data(self, id_se): + """ + Return an gazpy.Element object containing information about a spatial entity. 
+ + Parameters + ---------- + id_se : str + Identifier of the spatial entity + + Returns + ------- + gazpy.Element + data + """ + + if id_se in RelationExtractor.__cache_entity_data: + return RelationExtractor.__cache_entity_data[id_se] + data = gazetteer.get_by_id(id_se) + if len(data) > 0: + RelationExtractor.__cache_entity_data[id_se] = data[0] + return data[0] + + def in_cache(self, id_se1: str, id_se2: str): + raise NotImplementedError() + + def add_cache(self,id_se1: str, id_se2: str, value : bool, two_way:bool = False): + if id_se1 not in self.db_rel_match: + self.db_rel_match[id_se1] = {} + if two_way and id_se2 not in self.db_rel_match: + self.db_rel_match[id_se2] = {} + + self.db_rel_match[id_se1][id_se2] = value + if two_way: + self.db_rel_match[id_se2][id_se1] = value + + +class InclusionRelation(RelationExtractor): + + def __init__(self, precomputed={}): + RelationExtractor.__init__(self, precomputed) + + def in_cache(self, id_se1: str, id_se2: str): + if id_se1 in self.db_rel_match and id_se2 in self.db_rel_match[id_se1]: + return True, self.db_rel_match[id_se1][id_se2] + return False, False + + def is_relation(self, id_se1: str, id_se2: str): + found_, value = self.in_cache(id_se1, id_se2) + if found_: + return value + + inc_chain_P131, inc_chain_P706 = self.get_inclusion_chain(id_se1, "P131"), self.get_inclusion_chain(id_se1,"P706") + inc_chain = inc_chain_P131 + inc_chain.extend(inc_chain_P706) + inc_chain = set(inc_chain) + + if id_se2 in inc_chain: + self.add_cache(id_se1, id_se2, True) + return True + + self.add_cache(id_se1, id_se2, False) + return False + + def get_inclusion_chain(self, id_, prop): + """ + For an entity return it geographical inclusion tree using a property. + """ + arr__ = [] + try: + current_entity = gazetteer.get_by_id(id_)[0] + if "inc_" + prop in current_entity.other: + arr__ = current_entity.other["inc_" + prop] + elif "inc_geoname" in current_entity.other: + arr__ = current_entity.other.inc_geoname + if isinstance(arr__, str): + arr__ = [arr__] + except: + pass + return arr__ + + +class AdjacencyRelation(RelationExtractor): + + def __init__(self, precomputed={},inc_rel_extractor=InclusionRelation()): + RelationExtractor.__init__(self, precomputed) + self.inc_rel_extractor=inc_rel_extractor + + def in_cache(self, id_se1: str, id_se2: str): + if id_se1 in self.db_rel_match and id_se2 in self.db_rel_match[id_se1]: + return True, self.db_rel_match[id_se1][id_se2] + elif id_se2 in self.db_rel_match and id_se1 in self.db_rel_match[id_se2]: + return True, self.db_rel_match[id_se2][id_se1] + return False, False + + def is_relation(self, id_se1: str, id_se2: str): + found_, value = self.in_cache(id_se1, id_se2) + if found_: + return value + + stop_class = {"A-PCLI", "A-ADM1"} + + def get_p47_adjacency_data(data): + p47se1 = [] + for el in data.other.P47: + d = gazetteer.get_by_other_id(el, "wikidata") + if not d: continue + p47se1.append(d[0].id) + return p47se1 + + + if self.inc_rel_extractor.is_relation(id_se1, id_se2) or self.inc_rel_extractor.is_relation(id_se2, id_se1): + self.add_cache(id_se1, id_se2, False, True) + return False + + data_se1, data_se2 = self.get_data(id_se1), self.get_data(id_se2) + if not data_se1 or not data_se2: + self.add_cache(id_se1, id_se2, False, True) + return False + + if "P47" in data_se2.other and id_se1 in get_p47_adjacency_data(data_se2): + self.add_cache(id_se1, id_se2, True, True) + return True + + elif "P47" in data_se1.other and id_se2 in get_p47_adjacency_data(data_se1): + self.add_cache(id_se1, id_se2, 
True,True) + return True + + if collide(id_se1, id_se2): + self.add_cache(id_se1, id_se2, True,True) + return True + + if data_se1 and data_se2 and "coord" in data_se1 and "coord" in data_se2: + if Point(data_se1.coord.lon, data_se1.coord.lat).distance( + Point(data_se2.coord.lon, data_se2.coord.lat)) < 1 and len( + set(data_se1.class_) & stop_class) < 1 and len(set(data_se2.class_) & stop_class) < 1: + self.add_cache(id_se1, id_se2, True,True) + return True + + self.add_cache(id_se1, id_se2, False,True) + return False diff --git a/strpython/models/spatial_relation.py b/strpython/models/spatial_relation.py index b3b02fc..8f3f26d 100644 --- a/strpython/models/spatial_relation.py +++ b/strpython/models/spatial_relation.py @@ -74,11 +74,11 @@ class RelationExtractor(MetaCollector): spatial_entities : list list of spatial entities identifier """ - self.spatial_entities = spatial_entities + self.spatial_entities = list(set(spatial_entities)) # Retrieve Geometries data = [[sp_id, getGEO(sp_id)] for sp_id in - tqdm(spatial_entities, desc="Retrieving Geometries...")] + tqdm(self.spatial_entities, desc="Retrieving Geometries...")] self.all_geometry = [] for i in data: @@ -116,18 +116,18 @@ class RelationExtractor(MetaCollector): except Exception as e: print(e) - corr_ = gdf_intersect.iloc[:, 2:] ^ (gdf_within.iloc[:,2:] | gdf_within.iloc[:,2:].T) # An entity cannot be related to an other entity by two type of relation - adj_ = gdf_intersect.iloc[:, 2:] & corr_ # because if include and not adjacent does not mean Adjacent ! + gdf_intersect.set_index("id", inplace=True) + gdf_within.set_index("id", inplace=True) + del gdf_intersect["geometry"] + del gdf_within["geometry"] - gdf_adjacency = gdf_within.iloc[:, :2] - gdf_adjacency = pd.concat((gdf_adjacency, adj_), axis=1) # Stuck id and geom to adjacency data + corr_ = gdf_intersect ^ (gdf_within | gdf_within.T) # An entity cannot be related to an other entity by two type of relation + adj_ = gdf_intersect & corr_ # because if include and not adjacent does not mean Adjacent ! - del gdf_adjacency["geometry"] - del gdf_within["geometry"] # Transform to dict for a fastest access ! 
- self.adjacency_geom = gdf_adjacency.set_index("id") - self.inclusion_geom = gdf_within.set_index("id") + self.adjacency_geom = adj_ + self.inclusion_geom = gdf_within def get_relation_meta_based(self): """ @@ -158,7 +158,7 @@ class RelationExtractor(MetaCollector): adj_res[se2][se1] = adj_res[se1][se2] self.adjacency_meta = pd.DataFrame.from_dict(adj_res) - self.inclusion_meta = pd.DataFrame.from_dict(inc_res) + self.inclusion_meta = pd.DataFrame.from_dict(inc_res,orient="index") def fuse_meta_and_geom(self): """ @@ -176,16 +176,15 @@ class RelationExtractor(MetaCollector): self.inclusion_meta.sort_index(inplace=True) self.adjacency_geom.sort_index(inplace=True) self.inclusion_geom.sort_index(inplace=True) - self.adjacency_meta.sort_index(axis=1, inplace=True) self.inclusion_meta.sort_index(axis=1, inplace=True) self.adjacency_geom.sort_index(axis=1, inplace=True) self.inclusion_geom.sort_index(axis=1, inplace=True) - df_adj = self.adjacency_meta.copy() - df_inc = self.inclusion_meta.copy() - df_adj.iloc[:, :] = self.adjacency_meta | self.adjacency_geom - df_inc.iloc[:, :] = self.inclusion_meta | self.inclusion_geom + df_inc = self.inclusion_meta + self.adjacency_geom = (self.adjacency_geom ^ (self.inclusion_meta | self.inclusion_meta.T)) & self.adjacency_geom + self.adjacency_meta = (self.adjacency_meta ^ (self.inclusion_meta | self.inclusion_meta.T)) & self.adjacency_meta + df_adj = (self.adjacency_geom | self.adjacency_meta) return df_adj, df_inc diff --git a/strpython/models/str.py b/strpython/models/str.py index 6612e89..ca8b46c 100644 --- a/strpython/models/str.py +++ b/strpython/models/str.py @@ -39,7 +39,7 @@ class STR(object): """ __cache_entity_data = {} # Â Store data about entity requested - def __init__(self, tagged_text, spatial_entities,toponym_first=True): + def __init__(self, tagged_text, spatial_entities,toponym_first=True, precomputed_inc={}, precomputed_adj={}): """ Constructor @@ -64,8 +64,8 @@ class STR(object): self.adjacency_relationships = {} self.inclusion_relationships = {} - self.adj_rel_db=AdjacencyRelation() - self.inc_rel_db = InclusionRelation() + self.inc_rel_db = InclusionRelation(precomputed_inc) + self.adj_rel_db = AdjacencyRelation(precomputed_adj,self.inc_rel_db) self.graph = nx.MultiDiGraph() diff --git a/strpython/nlp/disambiguator/disambiguator.py b/strpython/nlp/disambiguator/disambiguator.py index 927a70f..54defd1 100644 --- a/strpython/nlp/disambiguator/disambiguator.py +++ b/strpython/nlp/disambiguator/disambiguator.py @@ -30,8 +30,10 @@ class Disambiguator(object): dict {toponym : geodictID} """ - if isinstance(ner_output, np.ndarray) and ner_output.shape[1] == 2: + if isinstance(ner_output, np.ndarray) and len(ner_output.shape) == 2 and ner_output.shape[1] == 2: toponyms = self.parse_ner_output(ner_output) + elif len(ner_output.shape) != 2: + return {} elif not toponyms: raise ValueError("Either enter a list of toponyms or give ner_output") if self.context_based: diff --git a/strpython/nlp/disambiguator_old/__init__.py b/strpython/nlp/disambiguator_old/__init__.py deleted file mode 100644 index 950f635..0000000 --- a/strpython/nlp/disambiguator_old/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# coding = utf-8 \ No newline at end of file diff --git a/strpython/nlp/disambiguator_old/disambiguator.py b/strpython/nlp/disambiguator_old/disambiguator.py deleted file mode 100644 index ee0d899..0000000 --- a/strpython/nlp/disambiguator_old/disambiguator.py +++ /dev/null @@ -1,62 +0,0 @@ -# coding = utf-8 - -import copy -import string - -import numpy as 
np - -from ..ner.ner import NER - - -class Disambiguator(object): - - def __init__(self): - """Constructor for Disambiguator""" - pass - - def extract_se_entities(self, input): - out = Disambiguator.parse_corpus(input) - en_ = out[out[:, 1] == NER._unified_tag["place"]] - return self.toponymes_frequencies(en_[:, 0]), np.unique(en_[:, 0]) - - def toponymes_frequencies(self, ens_): - count = {} - for en in ens_: - if not en in count: count[en] = 0 - count[en] += 1 - return count - - @staticmethod - def parse_corpus(corpus): - final_corpus = [] - t = 0 - placeTag = NER._unified_tag["place"] - while t < len(corpus): - tag = copy.copy(corpus[t]) - - if tag[1] == "BEG-" + NER._unified_tag["place"] or tag[1] == placeTag: - lenw = 1 - if tag[1] == "BEG-" + placeTag: - compound_tag = tag[0] - t += 1 - while corpus[t][1] == "END-" + placeTag or corpus[t][1] == placeTag: - tag = copy.copy(corpus[t]) - if tag[0][-1] in string.punctuation or compound_tag[-1] in string.punctuation: - compound_tag += tag[0] - else: - compound_tag += " " + tag[0] - t += 1 - lenw += 1 - tag[0] = compound_tag - tag[1] = placeTag - t += 1 - else: - t += 1 - final_corpus.append(tag) - return np.array(final_corpus) - - def disambiguate(self, ner_result): - pass - - def disambiguate_list(self,toponyms,lang): - pass \ No newline at end of file diff --git a/strpython/nlp/disambiguator_old/geodict_gaurav.py b/strpython/nlp/disambiguator_old/geodict_gaurav.py deleted file mode 100644 index 3d59912..0000000 --- a/strpython/nlp/disambiguator_old/geodict_gaurav.py +++ /dev/null @@ -1,111 +0,0 @@ -# coding = utf-8 -import math - -from ...helpers.collision import * -#from ...helpers.geodict_helpers_old import * -from ...helpers.geodict_helpers import * -from .disambiguator import Disambiguator - -from ...models.str import get_inclusion_chain - - -class GauravGeodict(Disambiguator): - - def __init__(self): - Disambiguator.__init__(self) - - def fib_formula(self, n): - if n in [0, 1]: return 0 # Modifying fibonacci behaviour - golden_ratio = (1 + math.sqrt(5)) / 2 - val = (golden_ratio ** n - (1 - golden_ratio) ** n) / math.sqrt(5) - return int(round(val)) - - def inclusion_log(self, x, alpha=0.2): - if x==0: - return 1 - return math.log(x) - - def get_inclusion_tree(self, id_, prop): - """ - For an entity return it geographical inclusion tree using a property. - """ - arr = [] - current_entity = gazetteer.get_by_id(id_)[0] - while True: - if prop in current_entity: - arr.append(current_entity[prop][0]) - current_entity = gazetteer.get_by_other_id(current_entity[prop][0],"wikidata") - else: - arr.append(gazetteer.get_by_label("Earth","en")[0].id) # Earth ID - break - return arr - - def get_inclusion_score(self, id1, id2): # is it really inclusion ? 
:) - list1 = get_inclusion_chain(id1, 'P131') - list2 = get_inclusion_chain(id2, 'P131') - interP131 = len(list(set(list1).intersection(list2))) - list1 = get_inclusion_chain(id1, 'P706') - list2 = get_inclusion_chain(id2, 'P706') - interP706 = len(list(set(list1).intersection(list2))) - # return fib_no[interP131]+fib_no[interP706] - return self.inclusion_log(interP131) + self.inclusion_log(interP706) - - def Adjacency_P47(self, id1, id2): - data_1, data_2 = gazetteer.get_by_id(id1)[0], gazetteer.get_by_id(id2)[0] - if "P47" in data_1 and "P47" in data_2: - if id1 in data_2.other.P47 or id2 in data_1.other.P47: - return True - return False - - def Adjacency_Hull(self, id1, id2): - return collisionTwoSEBoundaries(id1, id2) - - def disambiguateOne(self, spat_candidates, fixed_entities): - score_dc = {} - - for cand in spat_candidates: - id_cand = cand.id - score_dc[id_cand] = 0 - for fixed in fixed_entities: - id_fixed = fixed_entities[fixed].id - if self.Adjacency_P47(id_cand, id_fixed): - score_dc[id_cand] += 3 - elif self.Adjacency_Hull(id_cand, id_fixed): - score_dc[id_cand] += 2 - score_dc[id_cand] += self.get_inclusion_score(id_cand, id_fixed) - m = max(score_dc, key=score_dc.get) - if score_dc[m] < 4: - return None - for cand in spat_candidates: - if cand.id == m: - return cand.id - - - def eval(self,se_,lang): - selected_en = {} - fixed_entities = {} - ambiguous_entities = {} - for en in se_: - request = gazetteer.get_by_label(en, lang) - if len(request) == 0: - request = gazetteer.get_by_alias(en, lang) - - if len(request) > 1: - ambiguous_entities[en] = request - elif len(request) == 1: - fixed_entities[en] = request[0] - - d_amb_results = {} - for amb_ent in ambiguous_entities: - d = self.disambiguateOne(ambiguous_entities[amb_ent], fixed_entities) - if not d: - d_amb_results[amb_ent] = get_most_common_id_v3(amb_ent, lang).id - else: - d_amb_results[amb_ent] = d - #print(fixed_entities) - for k, v in fixed_entities.items(): - fixed_entities[k] = v.id - for k, v in d_amb_results.items(): - fixed_entities[k] = v - - return fixed_entities \ No newline at end of file diff --git a/strpython/nlp/disambiguator_old/models/__init__.py b/strpython/nlp/disambiguator_old/models/__init__.py deleted file mode 100644 index 950f635..0000000 --- a/strpython/nlp/disambiguator_old/models/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# coding = utf-8 \ No newline at end of file diff --git a/strpython/nlp/disambiguator_old/models/bigram.py b/strpython/nlp/disambiguator_old/models/bigram.py deleted file mode 100644 index ec146b4..0000000 --- a/strpython/nlp/disambiguator_old/models/bigram.py +++ /dev/null @@ -1,46 +0,0 @@ -# coding = utf-8 - - -class BigramModel: - def __init__(self,freq={},count={}): - self.cooc_freq=freq - self.count_associated=count - - def append(self,uri1,uri2): - - if not uri1 in self.cooc_freq: - self.cooc_freq[uri1]={} - if not uri2 in self.cooc_freq[uri1]: - self.cooc_freq[uri1][uri2]=0 - self.cooc_freq[uri1][uri2]+=1 - - self.increment_count(uri2) - - def increment_count(self,uri): - if not uri in self.count_associated: - self.count_associated[uri]=0 - self.count_associated[uri]+=1 - - def get_coocurence_probability(self, pr1, *args): - if len(args) < 2: - print("Only one URI indicated") - return 0. - res_=1. 
- for u in range(1,len(args)): - res_*=self.get_bigram_probability(args[0],args[u],pr1) - return res_ - - - def get_bigram_probability(self,uri1,uri2,pr1=1): - nna=0.00000001 - if uri1 in self.cooc_freq: - if uri2 in self.cooc_freq[uri1]: - return self.cooc_freq[uri1][uri2] - #return (self.cooc_freq[uri1][uri2] / self.count_associated[uri1])+pr1 - elif uri2 in self.cooc_freq: - if uri1 in self.cooc_freq[uri2]: - return self.cooc_freq[uri2][uri1] - #return (self.cooc_freq[uri2][uri1] / self.count_associated[uri1])+pr1 - return nna - - diff --git a/strpython/nlp/disambiguator_old/most_common.py b/strpython/nlp/disambiguator_old/most_common.py deleted file mode 100644 index 2989325..0000000 --- a/strpython/nlp/disambiguator_old/most_common.py +++ /dev/null @@ -1,71 +0,0 @@ -# coding = utf-8 - - - -from ...helpers.geodict_helpers import * -from .disambiguator import Disambiguator -import re, json, os -from ...config.configuration import config - -from inflector import Inflector,English,Spanish,French - -inflectors= { - "en":Inflector(English()), - "fr":Inflector(French()), - "es":Inflector(Spanish()) -} -stop_words = { - "fr": set(open(os.path.join(config.language_resources_path,"stop_words_fr.txt")).read().split("\n")), - "en": set(open(os.path.join(config.language_resources_path,"stop_words_en.txt")).read().split("\n")) -} - -common_words = { - "fr": set(json.load(open(os.path.join(config.language_resources_path,"dic_fr.json")))), - "en": set(open(os.path.join(config.language_resources_path,"english_common_words_filtered.txt")).read().split("\n")) -} - - -class MostCommonDisambiguator(Disambiguator): - - def __init__(self): - Disambiguator.__init__(self) - - def disambiguate(self, ner_result, lang="en"): - count, se_ = self.extract_se_entities(ner_result) - new_count = {} - selected_en = {} - for en in se_: - id_,score=self.disambiguate_(en,lang) - if not id_ == "O" and id_: - selected_en[id_] = en - new_count[id_] = count[en] - - return new_count, selected_en - - def disambiguate_list(self,toponyms,lang): - result={} - for toponym in toponyms: - id_,_=self.disambiguate_(toponym,lang) - if id_: - result[id_]=toponym - return result - - def disambiguate_(self, label, lang='fr'): - if re.match("^\d+$", label): - return 'O', -1 - if lang in stop_words: #and lang in common_words: - if label.lower().rstrip("s") in stop_words[lang]: #or label.lower().rstrip("s") in common_words[lang]: - return 'O', -1 - - if lang in inflectors: - plural=inflectors[lang].singularize(label) - else: - plural = label.rstrip("s") + "s" - if plural.lower() in stop_words[lang]:# or plural.lower() in common_words[lang]: - return 'O', -1 - - data=get_most_common_id_v3(label, lang) - id_, score=None,0 - if data: - id_,score=data.id,data.score - return id_, score diff --git a/strpython/nlp/disambiguator_old/wikipedia_cooc.py b/strpython/nlp/disambiguator_old/wikipedia_cooc.py deleted file mode 100644 index c9a522a..0000000 --- a/strpython/nlp/disambiguator_old/wikipedia_cooc.py +++ /dev/null @@ -1,110 +0,0 @@ -# coding = utf-8 -import re - -from .disambiguator import Disambiguator -from .models.bigram import BigramModel -import pickle -from ...config.configuration import config -#from ...helpers.geodict_helpers_old import * -from ...helpers.geodict_helpers import * -from .most_common import stop_words,common_words -import networkx as nx - -def read_pickle(fn): - return pickle.load(open(fn,'rb')) - -class WikipediaDisambiguator(Disambiguator): - - def __init__(self,measure="degree"): - Disambiguator.__init__(self) - # Load 
model - self.model=BigramModel(read_pickle(config.wiki_cooc_dis.cooc_freq),read_pickle(config.wiki_cooc_dis.count)) - self.measure=measure - def disambiguate(self, ner_result, lang="en"): - count, se_ = self.extract_se_entities(ner_result) - new_count = {} - selected_en_rev = {} - selected_en = self.disambiguate_wiki(se_,lang) - for en in selected_en: - selected_en_rev[en]=selected_en[en] - #new_count[selected_en[en]] = count[en] - - return new_count, selected_en - - def disambiguate_list(self,toponyms,lang): - result=self.disambiguate_wiki(toponyms,lang) - return {k:v for k,v in result.items() if v} - - def disambiguate_wiki(self, entities, lang): - - spat_en=[] - for e in entities: - if re.match("^\d+$", e): - continue - if lang in stop_words and e.lower().rstrip("s") in stop_words[lang]:# or e.lower().rstrip("s") in common_words[lang]: - continue - - plural = e.rstrip("s") + "s" - if lang in stop_words and plural.lower() in stop_words[lang]: #or plural.lower() in common_words[lang]: - continue - spat_en.append(e) - spat_en=list(set(spat_en)) - g = nx.Graph() - - possible_candidates = [] - betw_cand={} # indicate which toponym group a candidate belong to #w maybe useless ... - group_candidate = {} #candidates per toponym - - for e in spat_en: - cand = get_top_candidate(e, lang, 5)#get_top_candidate(e, lang,4) - cand = [c.id for c in cand if c] - if not cand: - cand = [c.id for c in gazetteer.get_n_label_similar(e,lang,5) if c] - group_candidate[e] = cand - betw_cand[e]=cand - for n in cand: - betw_cand[n]=set(cand)-set(n) - possible_candidates.extend(cand) - - for cand in possible_candidates: - g.add_node(cand, label=gazetteer.get_by_id(cand)[0].label[lang]) - - data_candidate={ca :gazetteer.get_by_id(ca)[0] for ca in possible_candidates} - for cand in possible_candidates: - for cand2 in possible_candidates: - # Get PageRank score - d = data_candidate[cand] - - sc = 1 - sc=d.score - # Compute probability - prob = self.model.get_coocurence_probability(sc, cand, cand2) - - if cand2 in betw_cand[cand] or cand in betw_cand[cand2]: - prob = 0.0 - if prob < 0.0000001: - prob = 0.0 - if not cand == cand2: - # take the lowest co-occurrency between two candidates - if g.has_edge(cand2, cand) : - if g.edges[cand2,cand]["weight"] < prob: - continue - g.add_edge(cand, cand2, weight=prob) - - selected = {} - - #Take the candidates with the highest degree weighted - for gr in group_candidate: - try: - - if self.measure == "degree": - selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight')) - elif self.measure == "centrality": - selected[gr] = max(group_candidate[gr], key=lambda x: nx.closeness_centrality(g, x, distance="weight")) - else:# degree by default - selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight')) - #print(1) - except Exception as e: - selected[gr]=get_most_common_id_v3(gr,lang) - return selected - diff --git a/strpython/nlp/ner/__init__.py b/strpython/nlp/ner/__init__.py index 764d8f5..52d9924 100644 --- a/strpython/nlp/ner/__init__.py +++ b/strpython/nlp/ner/__init__.py @@ -1,5 +1,5 @@ from .spacy import Spacy from .nltk import NLTK -from .polyglot import Polyglot +#from .polyglot import Polyglot from .stanford_ner import StanfordNER from .ner import NER \ No newline at end of file diff --git a/strpython/pipeline.py b/strpython/pipeline.py index c7cd89f..db81049 100644 --- a/strpython/pipeline.py +++ b/strpython/pipeline.py @@ -2,8 +2,10 @@ import re from nltk import word_tokenize +from joblib import Parallel, delayed -from 
strpython.models.str import STR +from .models.spatial_relation import RelationExtractor +from .models.str import STR from .models.transformation.transform import Generalisation, Expansion from .nlp.disambiguator import * @@ -11,12 +13,13 @@ from .nlp.ner import * from .nlp.exception.disambiguator import NotADisambiguatorInstance from .nlp.exception.ner import NotANERInstance -from .nlp.exception.tagger import NotATaggerInstance - -from .nlp.pos_tagger.tagger import Tagger -from .nlp.pos_tagger.treetagger import TreeTagger - +from multiprocessing import cpu_count +from mytoolbox.env import in_notebook +if in_notebook(): + from tqdm._tqdm_notebook import tqdm_notebook as tqdm +else: + from tqdm import tqdm class Pipeline(object): @@ -76,28 +79,46 @@ class Pipeline(object): else: raise NotADisambiguatorInstance() - - def build(self,text,se_identified=None, **kwargs): - """ - Return the corresponding STR for a text. - :param text: - :return: STR + def extract_all_relation(self,spatial_entities): """ - toponyms= kwargs.get("toponyms", None) - stop_words=kwargs.get("stop_words",[]) - - if isinstance(toponyms,list): - se_identified = self.disambiguator.disambiguate(self.lang,toponyms=[top for top in toponyms if not top.lower() in stop_words and not len(re.findall("\d+",top)) != 0 and len(top)>3]) - input = "" + Extract relation information between spatial entities + Parameters + ---------- + spatial_entities - elif se_identified: - input, se_identified = self.parse(text) - else: - input,se_identified=self.parse(text) + Returns + ------- - str_=STR(word_tokenize(input),se_identified,toponym_first=True) - str_.build(adj=True,inc=True) - str_=self.transform(str_,**kwargs) + """ + r = RelationExtractor(spatial_entities) + r.get_relation_geometry_based() + r.get_relation_meta_based() + df_adj, df_inc = r.fuse_meta_and_geom() + dict_adj = df_adj.to_dict() + dict_inc = df_inc.to_dict() + return dict_adj, dict_inc + + def pipe_build(self,texts, cpu_count=cpu_count(),**kwargs): + + text_and_spatial_entities = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.parse)(text) for text in tqdm(texts,desc="Extract spatial entities from the texts")) + sp_es= [] + for res in text_and_spatial_entities: + sp_es.extend(list(res[1].values())) + sp_es= [es for es in sp_es if es.startswith("GD")] + print("Extract Spatial Relation for all identified spatial entities") + adj_rel_dict, inc_rel_dict = self.extract_all_relation(sp_es) + + str_s = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.build)(ext[0], ext[1], adj_rel_dict, inc_rel_dict, **kwargs) for ext in tqdm(text_and_spatial_entities, desc="Build STR")) + return str_s + + def pipe_transform(self,strs_, cpu_count=cpu_count(),**kwargs): + + str_s = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.transform)(str_, **kwargs) for str_ in tqdm(strs_,desc="Transform STR")) + return str_s + + def build(self, text_input, spatial_entities_identified, prec_adj, prec_inc): + str_ = STR(word_tokenize(text_input), spatial_entities_identified, toponym_first=True,precomputed_adj=prec_adj,precomputed_inc=prec_inc) + str_.build(adj=True, inc=True) return str_ def transform(self,str_,**kwargs): -- GitLab
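
Illustrative usage sketch (not part of the patch): the snippet below shows how the new parallel entry points introduced here, Pipeline.pipe_build() and Pipeline.pipe_transform(), might be driven. The Pipeline constructor arguments, the language code and the sample texts are assumptions for illustration only; they do not appear in this diff.

    from strpython.pipeline import Pipeline

    # Example corpus (made up for illustration).
    texts = [
        "Heavy rains flooded Montpellier and several villages of the Herault.",
        "The expedition left Kathmandu and reached the Everest base camp.",
    ]

    # Constructor signature assumed; only pipe_build/pipe_transform come from this patch.
    pipeline = Pipeline(lang="en")

    # pipe_build() parses every text in parallel (joblib, threading backend),
    # extracts all adjacency/inclusion relations once through RelationExtractor,
    # then builds one STR per text from the precomputed relation dictionaries.
    strs = pipeline.pipe_build(texts, cpu_count=4)

    # pipe_transform() applies the STR transformations to each graph in parallel;
    # extra keyword arguments are forwarded to Pipeline.transform().
    transformed = pipeline.pipe_transform(strs, cpu_count=4)

The threading backend is presumably chosen so that the precomputed adjacency/inclusion dictionaries and the gazetteer caches stay shared in memory across workers instead of being pickled into separate processes.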