From a640ee46f78848f93d4284fa8d94edf349f567fa Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Wed, 13 Mar 2019 18:47:37 +0100
Subject: [PATCH] DEBUG

---
 criteria_cache.py                          |  14 +-
 generate_similarity_matrix.py              |  76 ----
 strpython/eval/automatic_annotation.py     | 208 ++++++-----
 strpython/helpers/collision.py             |  23 +-
 strpython/helpers/geo_relation_database.py |   2 +-
 strpython/helpers/geodict_helpers_old.py   | 394 ---------------------
 strpython/helpers/match_cache.py           |  17 +
 strpython/helpers/relation_extraction.py   |   2 +-
 strpython/models/str.py                    |   2 +-
 9 files changed, 127 insertions(+), 611 deletions(-)
 delete mode 100644 generate_similarity_matrix.py
 delete mode 100644 strpython/helpers/geodict_helpers_old.py
 create mode 100644 strpython/helpers/match_cache.py

diff --git a/criteria_cache.py b/criteria_cache.py
index b04c925..e424ec5 100644
--- a/criteria_cache.py
+++ b/criteria_cache.py
@@ -6,18 +6,16 @@ import networkx as nx
 import numpy as np
 
-from strpython.eval.automatic_annotation import AnnotationAutomatic,save_cache,add_cache
+from strpython.eval.automatic_annotation import AnnotationAutomatic
 from strpython.models.str import STR
 from tqdm import tqdm,TqdmSynchronisationWarning
-from joblib import Parallel, delayed
-from multiprocessing import cpu_count
 
 import warnings
 warnings.simplefilter("ignore", TqdmSynchronisationWarning)
 
 tqdm.pandas()
 
-annotater = AnnotationAutomatic()
+annotater = AnnotationAutomatic("bvlac")
 
 parser = argparse.ArgumentParser()
@@ -38,6 +36,7 @@ for fn in fns:
     all_cp.extend(cps.tolist())
 all_cp=set(all_cp)
 df = pd.DataFrame([cp.split("_") for cp in all_cp],columns="G1 G2".split())
+df = df.sort_values(by="G1 G2".split())
 
 str_graph_path = args.graph_dir
@@ -51,11 +50,11 @@ def foo(x):
     try:
         return annotater.all(strs[int(x.G1)], strs[int(x.G2)],int(x.G1), int(x.G2))
     except KeyError as e:
-        add_cache(int(x.G1), int(x.G2),[0, 0, 0, 0])
-        return [0, 0, 0, 0]
+        annotater.matching_cache.add(int(x.G1),int(x.G2),*(0, 0, 0, 0))
+        return [0,0,0,0]
 
-df["res"] = df.progress_apply(lambda x: foo(x), axis=1) #Parallel(n_jobs=4)(delayed(foo)(x) for x in tqdm(df.itertuples(),total=df.size,desc="Extracting Crit"))#
+df["res"] = df.progress_apply(lambda x: foo(x), axis=1) #Parallel(n_jobs=4,backend="threading")(delayed(foo)(x) for x in tqdm(df.itertuples(),total=df.size,desc="Extracting Crit"))
 df.res=df.res.apply(lambda x :list(map(int,x)) if x else [])
 df["c1"] = df.res.apply(lambda x: x[0] if len(x)>0 else 0)
 df["c2"] = df.res.apply(lambda x: x[1] if len(x)>0 else 0)
@@ -63,6 +62,5 @@ df["c3"] = df.res.apply(lambda x: x[2] if len(x)>0 else 0)
 df["c4"] = df.res.apply(lambda x: x[3] if len(x)>0 else 0)
 
 del df["res"]
-save_cache()
 
 df.to_csv(args.output_file)

diff --git a/generate_similarity_matrix.py b/generate_similarity_matrix.py
deleted file mode 100644
index 4c35fdf..0000000
--- a/generate_similarity_matrix.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# coding = utf-8
-import glob
-# from gmatch4py.bag_of_cliques import BagOfCliques
-from gmatch4py.helpers.reader import import_dir
-from gmatch4py.base import Base
-from gmatch4py.ged.graph_edit_dist import GraphEditDistance
-from gmatch4py.ged.bipartite_graph_matching_2 import BP_2
-from gmatch4py.ged.greedy_edit_distance import GreedyEditDistance
-from gmatch4py.ged.hausdorff_edit_distance import HED
-from gmatch4py.jaccard import Jaccard
-from gmatch4py.kernels.weisfeiler_lehman import *
-from gmatch4py.mcs import MCS
-from gmatch4py.vertex_edge_overlap import VertexEdgeOverlap
-import argparse, os, sys, re, json, logging
-import datetime
-
-logging.basicConfig(
-    filename="{0}.csv".format(datetime.datetime.now().strftime("%Y_%m_%d__%H_%M_%S")),
-    format="%(message)s,%(asctime)s",
-    level=logging.DEBUG
-)
-
-parser = argparse.ArgumentParser()
-parser.add_argument("graphs_input_dir")
-parser.add_argument("matrix_output_dir")
-parser.add_argument("-d", action="store_true", help="Return distance matrix")
-parser.add_argument("-s", action="store_true", help="Selected graph ?")
-
-args = parser.parse_args()
-if not os.path.exists(args.graphs_input_dir):
-    print("Input graph directory doesn't exist!")
-    sys.exit(1)
-
-if not os.path.exists(args.matrix_output_dir):
-    print("Output matrix directory doesn't exist!")
-    print("Creating directory")
-    os.makedirs(args.matrix_output_dir)
-    print("Directory created")
-
-logging.info(msg="L_G,BEGIN,\"\"")
-graphs = import_dir(args.graphs_input_dir)
-logging.info(msg="L_G,DONE,\"\"")
-# print(graphs)
-selected = None
-if args.s:
-    selected = json.load(open("selected.json"))
-# Compute matrices
-for class_ in [GraphEditDistance, BP_2, GreedyEditDistance, HED, Jaccard, MCS,
-               VertexEdgeOverlap]:
-    logging.info(msg="C_S,BEG,\"{0}\"".format(class_.__name__))
-    print("Computing the Similarity Matrix for {0}".format(class_.__name__))
-
-    if class_ in (GraphEditDistance, BP_2, GreedyEditDistance, HED):
-        comparator = class_(1, 1, 1, 1)
-    elif class_ == WeisfeleirLehmanKernel:
-        comparator = class_(h=2)
-    else:
-        comparator = class_()
-    matrix = comparator.compare(graphs, selected)
-    if not args.d:
-        matrix = comparator.similarity(matrix)
-    else:
-        matrix = comparator.distance(matrix)
-    logging.info(msg="C_S,DONE,\"{0}\"".format(class_.__name__))
-    output_fn = "{0}/{1}_{2}.npy".format(
-        args.matrix_output_dir.rstrip("/"),
-        class_.__name__,
-        os.path.dirname(args.graphs_input_dir).replace("/", "_")
-    )
-    logging.info(msg="M_S,BEG,\"{0}\"".format(class_.__name__))
-    np.save(output_fn, matrix)
-    logging.info(msg="M_S,DONE,\"{0}\"".format(class_.__name__))
-    print("Matrix Saved")
-
-# json.dump(mapping_files_to_graphs,open("{0}/{1}".format(args.matrix_output_dir.rstrip("/"),"metadata.json")))
-print("Done")

diff --git a/strpython/eval/automatic_annotation.py b/strpython/eval/automatic_annotation.py
index 89f9182..fcd487e 100644
--- a/strpython/eval/automatic_annotation.py
+++ b/strpython/eval/automatic_annotation.py
@@ -1,48 +1,10 @@
 # coding = utf-8
-import json
-import os
-
-from strpython.models.str import STR
-import networkx as nx
 import numpy as np
-import geopandas as gpd
-from shapely.geometry import MultiPoint,Polygon,Point,LineString
-
-
-def jsonKeys2int(x):
-    if isinstance(x, dict):
-        return {int(k):jsonKeys2int(v) for k,v in x.items() }
-    return x
-
-__cache__crit={}
-
-if os.path.exists("cache.json"):
-    try:
-        __cache__crit=json.load(open("cache.json"))
-        __cache__crit=jsonKeys2int(__cache__crit)
-    except Exception as e:
-        print(e)
-
-def save_cache():
-    global __cache__crit
-    open("cache.json", 'w').write(json.dumps(__cache__crit))
-
-def get_from_cache(id1,id2):
-    global __cache__crit
-    # try:
-    if id1 in __cache__crit:
-        if id2 in __cache__crit[id1]:
-            return __cache__crit[id1][id2]
-    elif id2 in __cache__crit:
-        if id1 in __cache__crit[id2]:
-            return __cache__crit[id2][id1]
-    return None
-
-def add_cache(id1,id2,data):
-    global __cache__crit
-    if not id1 in __cache__crit:
-        __cache__crit[id1] = {}
-    __cache__crit[id1][id2] = data
+
+from ..models.str import STR
+from ..helpers.match_cache import MatchingCache
+from ..helpers.relation_extraction import AdjacencyRelation, InclusionRelation
+
 
 class AnnotationAutomatic(object):
     """
@@ -50,110 +12,140 @@ class AnnotationAutomatic(object):
     To facilitate the annotation, this class propose an automatic annotation.
     Author : Jacques Fize
     """
-    def __init__(self):
-        pass
-
-    def all(self,str1,str2,id1=None,id2=None):
-        cache_data=get_from_cache(id1,id2)
-        if not cache_data:
-            crit_ = [self.criterion1(str1, str2), self.criterion2(str1, str2),self.criterion3(str1, str2, id1, id2),self.criterion4(str1, str2, id1, id2)]
-            add_cache(id1,id2,crit_)
-            return crit_
-        return cache_data
-
-    def criterion1(self,str1,str2):
+
+    def __init__(self, dataset):
+        self.matching_cache = MatchingCache(dataset)
+        self.adj_rel_db = AdjacencyRelation()
+        self.inc_rel_db = InclusionRelation()
+
+    def all(self, str1, str2, id1=None, id2=None):
+        """
+
+        Parameters
+        ----------
+        str1
+        str2
+        id1
+        id2
+
+        Returns
+        -------
+
+        """
+        if id1 and id2:
+            found, value = self.matching_cache.is_match(int(id1), int(id2))
+            if found:
+                return list(value)
+
+        crit_ = [self.criterion1(str1, str2), self.criterion2(str1, str2), self.criterion3(str1, str2, id1, id2),
+                 self.criterion4(str1, str2, id1, id2)]
+        self.matching_cache.add(id1, id2, *crit_)
+        return crit_
+
+    def criterion1(self, str1, str2):
         """
         Return True if both STR contains similar spatial entities.
-        :param str1: STR
-        :param str2: STR
-        :return:
+
+        Parameters
+        ----------
+        str1
+        str2
+
+        Returns
+        -------
+
         """
+        print("CRIT1")
         return int(len(set(str1.graph.nodes.keys()) & set(str2.graph.nodes.keys())) > 0)
 
-    def criterion2(self,str1 : STR,str2 : STR):
+    def criterion2(self, str1: STR, str2: STR):
         """
         Return True if two STR contains proper spatial entities that share a proximity.
-        :param str1: STR
-        :param str2: STR
-        :return:
+
+        Parameters
+        ----------
+        str1
+        str2
+
+        Returns
+        -------
+
         """
-        stop_en=set(str1.graph.nodes.keys()) & set(str2.graph.nodes.keys())
+        print("CRIT2")
+        stop_en = set(str1.graph.nodes.keys()) & set(str2.graph.nodes.keys())
         for es in str1.spatial_entities:
             for es2 in str2.spatial_entities:
-                if not es in stop_en and not es2 in stop_en:
-                    if str1.is_included_in(es,es2):
+                if not es in stop_en and not es2 in stop_en and es != es2:
+                    if self.inc_rel_db.is_relation(es, es2):
                         return 1
-                    if str1.is_adjacent(es,es2):
+                    if self.adj_rel_db.is_relation(es, es2):
                         return 1
         return 0
 
-    def criterion3(self, str1 :STR , str2: STR,id1=None,id2=None,th=0.3):
+    def criterion3(self, str1: STR, str2: STR, id1=None, id2=None):
        """
         Return True if one or multiple cluster of spatial entities have been found in both STR. Cluster are constructed
         based on low distance between spatial entities. The clustering method used is Mean-Shift as implemented in
         scikit-learn module.
-        :param str1:
-        :param str2:
-        :return:
-        """
+
+        Parameters
+        ----------
+        str1
+        str2
+        id1
+        id2
+
+        Returns
+        -------
+
+        """
+        print("CRIT3")
         try:
-            c1=str1.get_cluster(id1)
+            c1 = str1.get_cluster(id1)
         except:
-            c1 = str1.get_cluster() ## Feignasse !!!!
+            c1 = str1.get_cluster()
         try:
-            c2=str2.get_cluster(id2)
+            c2 = str2.get_cluster(id2)
         except:
             c2 = str2.get_cluster()
-        if not "geometry" in c1 or (not "geometry" in c2):
+        if ("geometry" not in c1) or ("geometry" not in c2):
             return 0
 
-        c1["area"] = c1.area
-        c2["area"] = c2.area
-        c1=c1.sort_values(by="area",ascending=False)
-        c2=c2.sort_values(by="area",ascending=False)
-        mean=np.mean(c1.area)
-        for ind,rows in c1.iterrows():
-            if rows.area <mean:
+        c1["area_"] = c1.area
+        c2["area_"] = c2.area
+        c1 = c1.sort_values(by="area_", ascending=False)
+        c2 = c2.sort_values(by="area_", ascending=False)
+        mean = np.mean(c1.area_)
+        c1 = c1[c1.area_ >= mean]
+        return int(c1.intersects(c2).any())  # debug short-circuit: the loop below is unreachable
+        for ind, rows in c1.iterrows():
+            if rows.area < mean:
                 break
-            for ind2,rows2 in c2.iterrows():
+            for ind2, rows2 in c2.iterrows():
                 if rows.geometry.intersects(rows2.geometry):
                     return 1
-                #print(gpd.GeoDataFrame(geometry=[rows.geometry]))
-                # inter = gpd.overlay(
-                #     gpd.GeoDataFrame(geometry=[rows.geometry]),
-                #     gpd.GeoDataFrame(geometry=[rows2.geometry]),
-                #     how="intersection",
-                #     use_sindex=False
-                # )
-                # a1,a2=c1.area.sum(),c2.area.sum()
-                # if "geometry" in inter:
-                #     ia=inter.area.sum()
-                #     if a1 < a2 and ia/a1 >= th:
-                #         return 1
-                #     elif a1 > a2 and ia/a2 >= th:
-                #         return 1
         return 0
 
-
-
-    def criterion4(self, str1, str2,id1=None,id2=None,):
+    def criterion4(self, str1, str2, id1=None, id2=None):
         """
         Return True if both str share the same clusters. Using the same clustering methods as in criterion3().
-        :param str1:
-        :param str2:
-        :return:
+
+        Parameters
+        ----------
+        str1
+        str2
+        id1
+        id2
+
+        Returns
+        -------
+
         """
         try:
-            c1=str1.get_cluster(id1)
+            c1 = str1.get_cluster(id1)
         except:
-            c1 = str1.get_cluster() ## Feignasse !!!!
+            c1 = str1.get_cluster()  # lazy fallback !!!!
         try:
-            c2=str2.get_cluster(id2)
+            c2 = str2.get_cluster(id2)
         except:
             c2 = str2.get_cluster()
-        if not "geometry" in c1 or (not "geometry" in c2):
+        if ("geometry" not in c1) or ("geometry" not in c2):
             return 0
         return int(c1.intersects(c2).all())
-

diff --git a/strpython/helpers/collision.py b/strpython/helpers/collision.py
index 7f4647b..dccf8b8 100644
--- a/strpython/helpers/collision.py
+++ b/strpython/helpers/collision.py
@@ -23,9 +23,6 @@ def add_cache(id_, hull):
     :return:
     """
     global __cache, __limit_cache, __cache_frequency
-    if len(__cache) > __limit_cache:
-        warnings.warn("Limit broken")
-        del __cache[min(__cache_frequency, key=__cache_frequency.get)]
     __cache[id_] = hull
     if not id_ in __cache_frequency:
         __cache_frequency[id_] = 0
     __cache_frequency[id_] += 1
@@ -81,31 +78,13 @@ def getGEO(id_se):
 
     data=data[0]
     if "path" in data.other:
-        return explode(gpd.read_file(os.path.join(config.osm_boundaries_directory, data.other["path"]))).convex_hull
+        return gpd.read_file(os.path.join(config.osm_boundaries_directory, data.other["path"])).convex_hull
     elif "coord" in data.other:
         return gpd.GeoDataFrame(gpd.GeoSeries([Point(data.coord.lon, data.coord.lat).buffer(1.0)])).rename(
             columns={0: 'geometry'})
     return None
 
-def getGEO2(id_se):
-    """
-    Get the geofootprint of a spatial entity. If found, this geofootprint is a shape extracted from OSM. If not,
-    coordinates are used.
-    :param id_se: id of the spatial entity
-    :return: geopandas.GeoSeries
-    """
-    data = gazetteer.get_by_id(id_se)
-    if not data:
-        return None
-
-    data=data[0]
-    if "path" in data:
-        return "P",explode(gpd.read_file(os.path.join(config.osm_boundaries_directory, data.other["path"]))).convex_hull
-    elif "coord" in data:
-        return "C",gpd.GeoDataFrame(gpd.GeoSeries([Point(data.coord.lon, data.coord.lat).buffer(1.0)])).rename(
-            columns={0: 'geometry'})
-    return None
 
 def collide(se1, se2):
     """

diff --git a/strpython/helpers/geo_relation_database.py b/strpython/helpers/geo_relation_database.py
index db3e630..4d9efe6 100644
--- a/strpython/helpers/geo_relation_database.py
+++ b/strpython/helpers/geo_relation_database.py
@@ -172,7 +172,7 @@ class GeoRelationMatchingDatabase():
         result_ = cursor.fetchone()
         cursor.close()
         if result_:
-            return True, tuple(map(bool, result_[-4:]))
+            return True, tuple(map(int, result_[-4:]))
         return False, False
 
 

diff --git a/strpython/helpers/geodict_helpers_old.py b/strpython/helpers/geodict_helpers_old.py
deleted file mode 100644
index dffac38..0000000
--- a/strpython/helpers/geodict_helpers_old.py
+++ /dev/null
@@ -1,394 +0,0 @@
-# coding=utf-8
-import math
-import re
-
-from elasticsearch import Elasticsearch
-from ..config.configuration import config
-import pandas as pd
-from mytoolbox.structure.objectify import objectify
-
-es = Elasticsearch(config.es_server)
-
-geo_term={
-    "fr":open(config.language_resources_path.rstrip("/")+"/geo_term_fr").read().lower().strip().split("\n"),
-    "en":open(config.language_resources_path.rstrip("/")+"/geo_term_en").read().strip().split("\n")
-}
-
-def convert_es_to_pandas(es_query_results):
-    """
-    Return a `pandas.Dataframe` object built from the elasticsearch query results
-
-    Parameters
-    ----------
-    es_query_results : dict
-        elasticsearch.search() result
-
-    Returns
-    -------
-    pandas.DataFrame
-        Dataframe of the elasticsearch query results
-    """
-    if es_query_results["hits"]["total"] == 0:
-        return None
-    df = pd.DataFrame([g["_source"] for g in es_query_results["hits"]["hits"]])
-    if "score" in df:
-        df["score"] = df["score"].apply(lambda x: float(x))
-    else:
-        df["score"] = df.apply(lambda x: 0)
-    df["score"].fillna(-1, inplace=True)
-    return df
-
-
-def parse_score(score):
-    if math.isnan(score):
-        return -1
-    else:
-        return score
-
-def parse_label2(label : str,lang):
-    if not lang in geo_term:
-        return parse_label(label)
-
-    label = re.sub("[ ]+", " ", re.sub('[(?)]+', "", label.strip()))
-    label = label.strip("'").strip("’")
-
-    parts=label.split(" ")
-    # f=False
-    # for part in parts:
-    #     if part.lower() in geo_term[lang]:
-    #         f=True
-    # if not f:
-    #     return parse_label(label)
-    new_labels=[]
-    for part in parts:
-        if not part.lower() in geo_term[lang]:
-            new_labels.append(parse_label(part).strip("/?")+"+")
-        else:
-            new_labels.append(parse_label(part).strip("/"))
-    return "/"+"[ ]?".join(new_labels)+"/"
-
-
-
-
-def parse_label(label: str):
-    """
-    Parse label/toponym to a specific regular expression that allows dissimilarity with the official toponyms/aliases.
-
-    Parameters
-    ----------
-    label : str
-        toponym
-    Returns
-    -------
-    str
-        regular expression built from the toponym
-    """
-    label = re.sub("[ ]+", " ", re.sub('[(?)]+', "", label.strip()))
-    label = label.strip("'").strip("’")
-    new_label = ""
-    for c in label:
-        if c.isupper():
-            close_par = ")" if not (new_label.endswith(")") or new_label.endswith("?")) and new_label != "" else ""
-            # if new_label.endswith("]"):
-            #     new_label = new_label[:-1] + "({0}{1}]".format(c.lower(), c)
-            # else:
-            new_label += close_par + "([{0}{1}]".format(c.lower(), c)
-            # print("upper", new_label)
-        elif c == " ":
-            new_label += ")?[ ]?"
-            # print("espace", new_label)
-        elif c == "'" or c == "’":
-            new_label += c + ")?"
-            # print("apostrophe", new_label)
-        else:
-
-            new_label += ("(" if new_label == "" else "") + ("(" if new_label.endswith("?") else "") + c
-            # print("else", new_label)
-    new_label = "/" + new_label + ")?/"
-    return new_label
-
-
-
-def most_common_label(toponym: str, lang: str):
-    """
-
-
-    Parameters
-    ----------
-    toponym : str
-        toponym
-    lang : str
-        toponym language
-    Returns
-    -------
-
-    """
-    res = es.search("gazetteer", "place",
-                    body={ "query": {"query_string": {"query": "\"{0}\"".format(toponym), "analyze_wildcard": False}},
-                          "from": 0,
-                          "size": 50,
-                          "sort": [{'score': "desc"}]})
-    res = convert_es_to_pandas(res)
-    if not isinstance(res, pd.DataFrame):
-        return None, 0
-    return res.iloc[0].id, res.iloc[0].score
-
-
-def most_common_alias(toponym: str, lang: str):
-    """
-    Return most common spatial entity by itsje
-
-    Parameters
-    ----------
-    toponym : str
-        toponym
-    lang : str
-        toponym language
-    Returns
-    -------
-
-    """
-    res = es.search("gazetteer", "place",
-                    body={"size": 1, "sort": [{"score": {"order": "desc", "unmapped_type": "boolean"}}],"query": {"bool": {"must": [{"term": {lang: toponym}}], "must_not": [], "should": []}}})
-
-    res = convert_es_to_pandas(res)
-    if not isinstance(res, pd.DataFrame):
-        return None, 0
-    return res.iloc[0].id, res.iloc[0].score
-
-
-def n_label_similar(toponym, lang, n=5, score=True):
-    body = {
-        "query": {
-            "query_string": {
-                "default_field": lang,
-                "query": parse_label2(toponym,lang)
-            }
-        },
-        "from": 0,
-        "size": n
-    }
-    if score:
-        body["sort"] = [
-            {
-                'score': "desc"
-            }
-        ]
-    try:
-        res = es.search("gazetteer", "place",
-                        body=body)
-    except:
-        return None
-    res = convert_es_to_pandas(res)
-    if not isinstance(res, pd.DataFrame):
-        return None
-    return res
-
-
-def n_alias_similar(toponym, lang, n=5, score=True):
-    body = {"query": {"nested": {"path": "aliases",
-                                 "query":
-                                     {
-                                         "query_string": {
-                                             "default_field": "aliases.{0}".format(lang),
-                                             "query": parse_label2(toponym,lang)
-                                         }
-                                     }
-                                 }},
-            "from": 0,
-            "size": n}
-    if score:
-        body["sort"] = [
-            {
-                'score': "desc"
-            }
-        ]
-    try:
-        res = es.search("gazetteer", "place",
-                        body=body)
-    except:
-        return None
-
-    res = convert_es_to_pandas(res)
-    if not isinstance(res, pd.DataFrame):
-        return None, 0
-    return res.iloc[0].id, res.iloc[0].score
-
-
-
-def get_most_common_id_v3(label, lang='fr'):
-    """
-    Return the spatial entity and its score, based on a specific label and language that obtains the highest score.
-    The difference with the V2 is that it takes special cases:
-     * english placenames in a french text
-     * alias like China which designated also a spatial entity
-    :param label:
-    :param lang:
-    :return:
-    """
-    id_, score = most_common_label(label, lang)
-    if id_:
-        # China case
-        id_2, score2 = most_common_alias(label, lang)
-        if id_2 and score2 > score:
-            id_, score = id_2, score2
-        simi=n_label_similar(label, lang)
-        if isinstance(simi,pd.DataFrame):
-            id_3, score3 = simi.iloc[0].id,simi.iloc[0].score
-            if id_2 and score2 > score:
-                id_, score = id_3, score3
-        return id_, score
-
-    # if nothing found in english, search in aliases
-    id_, score = most_common_alias(label, lang)
-    if id_:
-        return id_, score
-
-    similar_label=n_label_similar(label,lang)
-    if isinstance(similar_label,pd.DataFrame):
-        return similar_label.iloc[0].id, similar_label.iloc[0].score
-
-    similar_alias = n_alias_similar(label, lang)
-    if isinstance(similar_alias,pd.DataFrame):
-        return similar_alias.iloc[0].id, similar_alias.iloc[0].score
-
-    return None, -1
-
-
-
-
-def get_data(id):
-    """
-    Return the data asssociated to an id in Geodict
-    :param id:
-    :return:
-    """
-    res = es.search("gazetteer", "place",
-                    body={"query": {"bool": {"must": [{"term": {"id": id}}], "must_not": [], "should": []}}, "from": 0,
-                          "size": 10, "sort": [], "aggs": {}})
-    if res["hits"]["total"] > 0:
-        res = res["hits"]["hits"][0]["_source"]
-        return objectify(res)
-    return None
-
-
-def get_data_by_wikidata_id(id):
-    """
-    Return the data asssociated to a wikidata id in Geodict
-    :param id:
-    :return:
-    """
-    res = es.search("gazetteer", "place",
-                    body={"query": {"bool": {"must": [{"term": {"wikidataID": id}}], "must_not": [], "should": []}},
-                          "from": 0,
-                          "size": 10, "sort": [], "aggs": {}})
-    if res["hits"]["total"] > 0:
-        res = res["hits"]["hits"][0]["_source"]
-        return objectify(res)
-    return None
-
-
-def get_data_by_geonames_id(id):
-    """
-    Return the data asssociated to a geonames id in Geodict
-    :param id:
-    :return:
-    """
-    res = es.search("gazetteer", "place",
-                    body={"query": {"bool": {"must": [{"term": {"geonameID": id}}], "must_not": [], "should": []}},
-                          "from": 0,
-                          "size": 10, "sort": [], "aggs": {}})
-    if res["hits"]["total"] > 0:
-        res = res["hits"]["hits"][0]["_source"]
-        return objectify(res)
-    return None
-
-
-def get_by_label(label, lang):
-    """
-    A Supprimer
-    :param label:
-    :param lang:
-    :return:
-    """
-    query = {"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}, "size": 50}
-    response = es.search('gazetteer', 'place', body=query)
-    if 'hits' in response['hits']:
-        return objectify(response['hits']['hits'])
-    return None
-
-
-def get_by_alias(alias, lang):
-    """
-    A supprimer
-    :param alias:
-    :param lang:
-    :return:
-    """
-    query = {
-        "query": {"nested": {"path": "aliases", "query": {"bool": {"must": [{"match": {"aliases." + lang: alias}}]}}}}}
-    response = es.search('gazetteer', 'place', body=query)
-    if 'hits' in response['hits']:
-        return objectify(response['hits']['hits'])
-    return None
-
-
-def label_exists(label, lang):
-    """
-    Return True if a spatial entity exists with a specific label in a specific language.
-    :param label: str
-    :param lang: str
-    :return: bool
-    """
-    query = {"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}}
-    response = es.count('gazetteer', 'place', body=query)
-    if response["count"] > 0:
-        return True
-    return False
-
-
-def alias_exists(alias, lang):
-    """
-    Return True if a spatial entity exists with a specific alias in a specific language.
-    :param alias: str
-    :param lang: str
-    :return: bool
-    """
-    query = {
-        "query": {"nested": {"path": "aliases", "query": {"bool": {"must": [{"match": {"aliases." + lang: alias}}]}}}}}
-    response = es.count('gazetteer', 'place', body=query)
-    if response["count"] > 0:
-        return True
-    return False
-
-
-def count_of_se(label, lang):
-    """
-    Return the number of spatial entities associated with a specific label in a specific language.
-    :param label: str
-    :param lang: str
-    :return: int
-    """
-    query = {"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}}
-    response = es.count('gazetteer', 'place', body=query)
-    return response["count"]
-
-
-def get_top_candidate(label, lang, n=5):
-    """
-    Return the 5-top candidates for a designated label in a specific language.
-    :param label: str
-    :param lang: str
-    :return: list
-    """
-    if n<4:
-        n=4
-    query={"size": n-3, "sort": [{"score": {"order": "desc"}}],"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}}
-    query2={"size": 1, "sort": [{"score": {"order": "desc"}}],
-            "query": {"query_string": {"query": "\"{0}\"".format(label), "analyze_wildcard": False}}}
-    query3 = {"size": 1, "sort": [{"score": {"order": "desc", "unmapped_type": "boolean"}}],"query": {"bool": {"must": [{"term": {"en": "\"{0}\"".format(label)}}], "must_not": [], "should": []}}}
-    response = es.search('gazetteer', 'place', body=query)
-    res=[]
-    if 'hits' in response['hits']:
-        res=[x["_source"]["id"] for x in response['hits']['hits']]
-    res.extend([get_most_common_id_v3(label,lang)[0]])
-    return res

diff --git a/strpython/helpers/match_cache.py b/strpython/helpers/match_cache.py
new file mode 100644
index 0000000..82669a2
--- /dev/null
+++ b/strpython/helpers/match_cache.py
@@ -0,0 +1,17 @@
+# coding = utf-8
+
+from .geo_relation_database import GeoRelationMatchingDatabase
+
+
+class MatchingCache:
+
+    def __init__(self, dataset, geo_rel_match_database=GeoRelationMatchingDatabase()):
+        self.db_rel_match = geo_rel_match_database
+        self.dataset = dataset
+
+    def is_match(self, id_str1: int, id_str2: int):
+        return self.db_rel_match.get_matching(id_str1, id_str2, self.dataset)
+
+    def add(self, id_str1: int, id_str2: int, c1: int, c2: int, c3: int, c4: int):
+        if not self.is_match(id_str1, id_str2)[0]:
+            self.db_rel_match.add_matching(self.dataset, id_str1, id_str2, c1, c2, c3, c4)

diff --git a/strpython/helpers/relation_extraction.py b/strpython/helpers/relation_extraction.py
index 41f8676..6cba224 100644
--- a/strpython/helpers/relation_extraction.py
+++ b/strpython/helpers/relation_extraction.py
@@ -1,7 +1,7 @@
 # coding = utf-8
 from shapely.geometry import Point
 
-from strpython.helpers.collision import collide
+from .collision import collide
 from .geo_relation_database import GeoRelationMatchingDatabase
 from ..helpers.geodict_helpers import gazetteer
 

diff --git a/strpython/models/str.py b/strpython/models/str.py
index e2d70b9..837b327 100644
--- a/strpython/models/str.py
+++ b/strpython/models/str.py
@@ -15,7 +15,7 @@ from sklearn.cluster import MeanShift, estimate_bandwidth, dbscan
 import matplotlib.pyplot as plt
 
 from ..helpers.geodict_helpers import gazetteer
-from strpython.helpers.relation_extraction import AdjacencyRelation, InclusionRelation
+from ..helpers.relation_extraction import AdjacencyRelation, InclusionRelation
 
 
 def get_inclusion_chain(id_, prop):
--
GitLab
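
Usage note (not part of the patch): a minimal sketch of how the reworked cache path above is meant to be driven, mirroring criteria_cache.py. AnnotationAutomatic("bvlac"), all(), and MatchingCache.is_match() come from the diff; str_a, str_b and the ids 12/42 are hypothetical placeholders for STR graphs loaded elsewhere.

    # Hypothetical driver (placeholder names), assuming the patched strpython is importable.
    from strpython.eval.automatic_annotation import AnnotationAutomatic

    annotater = AnnotationAutomatic("bvlac")  # dataset name, as in criteria_cache.py

    # First call computes criterion1..criterion4 and persists the result through
    # annotater.matching_cache, which is backed by GeoRelationMatchingDatabase.
    c1, c2, c3, c4 = annotater.all(str_a, str_b, 12, 42)

    # A later call with the same ids is answered from the cache:
    # is_match() returns (found, (c1, c2, c3, c4)) with integer criteria.
    found, cached = annotater.matching_cache.is_match(12, 42)
    assert found and list(cached) == [c1, c2, c3, c4]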