diff --git a/.gitignore b/.gitignore index 4a97748395d8d5b4ee3b6fd348a997ed58c6a753..f380ba1cf8b7da69be21b3e47ca8bfce5792c48c 100644 --- a/.gitignore +++ b/.gitignore @@ -26,4 +26,5 @@ __pycache__/ *.npy *.pkl *cache.json -*.gexf \ No newline at end of file +*.gexf +temp_cluster/ diff --git a/auto_fill_annotation.py b/auto_fill_annotation.py index abcddfc4360113ace74adbd1c394de06ef2ca487..d683b7493790cfcdf8db7e7d3a2711cf62c578b2 100644 --- a/auto_fill_annotation.py +++ b/auto_fill_annotation.py @@ -5,9 +5,13 @@ import argparse, os, re, json, glob import pandas as pd import networkx as nx -from strpython.eval.automatic_annotation import AnnotationAutomatic,save_cache +from strpython.eval.automatic_annotation import AnnotationAutomatic,save_cache,add_cache from strpython.models.str import STR from tqdm import tqdm,TqdmSynchronisationWarning +from joblib import Parallel, delayed +from multiprocessing import cpu_count + + import warnings warnings.simplefilter("ignore", TqdmSynchronisationWarning) tqdm.pandas() @@ -40,10 +44,11 @@ def foo(x): return annotater.all(strs[x.G1], strs[x.G2],x.G1, x.G2) except KeyError as e: print(e) + add_cache(strs[x.G1], strs[x.G2],[0, 0, 0, 0]) return [0, 0, 0, 0] -df["res"] = df.progress_apply(lambda x: foo(x), axis=1) +df["res"] = Parallel(n_jobs=cpu_count())(delayed(foo)(x) for x in tqdm(df.itertuples()))#df.progress_apply(lambda x: foo(x), axis=1) df.res=df.res.apply(lambda x :list(map(int,x)) if x else []) df[["c1"]] = df.res.apply(lambda x: x[0] if len(x)>0 else 0) df[["c2"]] = df.res.apply(lambda x: x[1] if len(x)>0 else 0) diff --git a/strpython/config/config.json b/strpython/config/config.json index 2a86282993a809e8174fff29d63dc9559b70fc6c..5f32fb2b3283427e1dbffd579590fcf8deff8bf2 100644 --- a/strpython/config/config.json +++ b/strpython/config/config.json @@ -12,5 +12,6 @@ "count":"/Users/jacquesfize/nas_cloud/Code/str-python/strpython/resources/count_wiki.pkl" }, "language_resources_path":"/Users/jacquesfize/nas_cloud/Code/str-python/strpython/resources/language_resources", - "gazetteer":"geodict" + "gazetteer":"geodict", + "relation_db_path" : "/Users/jacquesfize/.services/relation_match.db" } \ No newline at end of file diff --git a/strpython/helpers/geodict_helpers.py b/strpython/helpers/geodict_helpers.py index 1425206f18baabc38b2c0c0ad464660726396f78..6c174d32a3fb4e404777cf48117cde32681e4329 100644 --- a/strpython/helpers/geodict_helpers.py +++ b/strpython/helpers/geodict_helpers.py @@ -5,7 +5,7 @@ import re from elasticsearch import Elasticsearch from ..config.configuration import config import pandas as pd -from ..helpers.objectify import objectify + import gazpy as ga diff --git a/strpython/helpers/geodict_helpers_old.py b/strpython/helpers/geodict_helpers_old.py index 7da757d5f8c95c3350d8544e65d8b11428415d88..dffac384bedeaee71ce638982b020831b40aa4c4 100644 --- a/strpython/helpers/geodict_helpers_old.py +++ b/strpython/helpers/geodict_helpers_old.py @@ -5,7 +5,7 @@ import re from elasticsearch import Elasticsearch from ..config.configuration import config import pandas as pd -from ..helpers.objectify import objectify +from mytoolbox.structure.objectify import objectify es = Elasticsearch(config.es_server) diff --git a/strpython/helpers/objectify.py b/strpython/helpers/objectify.py deleted file mode 100644 index 1bf4780e1a05caba16ed5336f2be2e34098aa1cc..0000000000000000000000000000000000000000 --- a/strpython/helpers/objectify.py +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env python - - -"""Scrap module. - -Just tiny bits & bolts. - -.. author: Adrian Castravete -.. modified by : Jacques Fize (Implemented for Python 3 and recursive objectification) -""" - -from functools import wraps - - -def objectify(func): - """Mimic an object given a dictionary. - - Given a dictionary, create an object and make sure that each of its - keys are accessible via attributes. - If func is a function act as decorator, otherwise just change the dictionary - and return it. - :param func: A function or another kind of object. - :returns: Either the wrapper for the decorator, or the changed value. - - Example:: - - >>> obj = {'old_key': 'old_value'} - >>> oobj = objectify(obj) - >>> oobj['new_key'] = 'new_value' - >>> print oobj['old_key'], oobj['new_key'], oobj.old_key, oobj.new_key - - >>> @objectify - ... def func(): - ... return {'old_key': 'old_value'} - >>> obj = func() - >>> obj['new_key'] = 'new_value' - >>> print obj['old_key'], obj['new_key'], obj.old_key, obj.new_key - - """ - - def create_object(value): - """Create the object. - - Given a dictionary, create an object and make sure that each of its - keys are accessible via attributes. - Ignore everything if the given value is not a dictionary. - :param value: A dictionary or another kind of object. - :returns: Either the created object or the given value. - - """ - if isinstance(value, dict): - # Build a simple generic object. - class Object(dict): - def __setitem__(self, key, val): - setattr(self, key, val) - return super(Object, self).__setitem__(key, val) - - # Create that simple generic object. - ret_obj = Object() - # Assign the attributes given the dictionary keys. - for key, val in value.items(): - if isinstance(val,dict): - ret_obj[key] = objectify(val) - else: - ret_obj[key] = val - setattr(ret_obj, key, val) - return ret_obj - else: - return value - - # If func is a function, wrap around and act like a decorator. - if hasattr(func, '__call__'): - @wraps(func) - def wrapper(*args, **kwargs): - """Wrapper function for the decorator. - - :returns: The return value of the decorated function. - - """ - value = func(*args, **kwargs) - return create_object(value) - - return wrapper - - # Else just try to objectify the value given. - else: - return create_object(func) diff --git a/strpython/models/str.py b/strpython/models/str.py index 72aa9caf1ce9cc3dc1519e99026eb2b9493a75d9..e2d70b97556a2edcec5382176e542a42f3eed3b4 100644 --- a/strpython/models/str.py +++ b/strpython/models/str.py @@ -1,26 +1,21 @@ # coding = utf-8 import copy -import logging import os import time import warnings + from tqdm import tqdm import folium import geopandas as gpd import networkx as nx import pandas as pd from shapely.geometry import MultiPoint, Polygon, Point, LineString +from sklearn.cluster import MeanShift, estimate_bandwidth, dbscan +import matplotlib.pyplot as plt -from ..helpers.collision import collisionTwoSEBoundaries, add_cache_adjacency from ..helpers.geodict_helpers import gazetteer -from ..eval.stats import most_common - -from sklearn.cluster import MeanShift, estimate_bandwidth, KMeans, dbscan -import numpy as np - - -# logging.basicConfig(filename=config.log_file,level=logging.INFO) +from strpython.helpers.relation_extraction import AdjacencyRelation, InclusionRelation def get_inclusion_chain(id_, prop): @@ -42,13 +37,11 @@ class STR(object): """ Str basic structure """ - __cache_inclusion = {} # Store inclusion relations found between spaital entities - __cache_adjacency = {} # Store adjacency relations found between spaital entities __cache_entity_data = {} # Â Store data about entity requested def __init__(self, tagged_text, spatial_entities,toponym_first=True): """ - Constructir + Constructor Parameters ---------- @@ -71,6 +64,11 @@ class STR(object): self.adjacency_relationships = {} self.inclusion_relationships = {} + self.adj_rel_db=AdjacencyRelation() + self.inc_rel_db = InclusionRelation() + + self.graph = nx.MultiDiGraph() + @staticmethod def from_networkx_graph(g: nx.Graph, tagged_: list = []): """ @@ -164,10 +162,8 @@ class STR(object): id1, id2 = edge[0], edge[1] if edge[2]["color"] == "green": self.add_adjacency_rel(edge[0], edge[1]) - self.add_cache__adjacency(id1, id2, True) elif edge[2]["color"] == "red": self.add_inclusion_rel(edge[0], edge[1]) - self.add_cache_inclusion(id1, id2, True) def add_spatial_entity(self, id, label=None, v=True): """ @@ -213,7 +209,7 @@ class STR(object): except: label = None self.add_spatial_entity(id, label, False) - # print(self.graph.nodes(data=True)) + def add_adjacency_rel(self, se1, se2): """ @@ -231,7 +227,6 @@ class STR(object): if not se1 in self.adjacency_relationships: self.adjacency_relationships[se1] = {} if not se2 in self.adjacency_relationships: self.adjacency_relationships[se2] = {} self.adjacency_relationships[se1][se2], self.adjacency_relationships[se2][se1] = True, True - self.add_cache__adjacency(se1, se2, True) def add_inclusion_rel(self, se1, se2): """ @@ -248,47 +243,9 @@ class STR(object): if not se1 in self.inclusion_relationships: self.inclusion_relationships[se1] = {} self.inclusion_relationships[se1][se2] = True - self.add_cache_inclusion(se1, se2, True) - - def add_cache_inclusion(self, id1, id2, v=True): - """ - Add a relation of inclusion in a cache variable - - Parameters - ---------- - id1 : str - id of the first spatial entity - id2 : str - id of the second spatial entity - v : bool, optional - if the relation exists between the two spatial entities. Default is True - - """ - if not id1 in STR.__cache_inclusion: - STR.__cache_inclusion[id1] = {} - STR.__cache_inclusion[id1][id2] = v - def add_cache__adjacency(self, se1, se2, v=True): - """ - Add a relation of adjacency in a cache variable - Parameters - ---------- - id1 : str - id of the first spatial entity - id2 : str - id of the second spatial entity - v : bool, optional - if the relation exists between the two spatial entities. Default is True - - """ - if not se1 in STR.__cache_adjacency: - STR.__cache_adjacency[se1] = {} - if not se2 in STR.__cache_adjacency: - STR.__cache_adjacency[se2] = {} - STR.__cache_adjacency[se1][se2] = v - STR.__cache_adjacency[se2][se1] = v def get_data(self, id_se): """ @@ -376,156 +333,15 @@ class STR(object): if self.adjacency_relationships[se1][se2]: self.graph.add_edge(se1, se2, key=0, color="green") - def is_included_in(self, se1_id, se2_id): - """ - Return True if a spatial entity is included within another one. - - Parameters - ---------- - se1_id : str - id of the contained entity - se2_id : str - id of the entity container - - Returns - ------- - bool - if se1 included in se2 - """ - - if se1_id in self.inclusion_relationships: - if se2_id in self.inclusion_relationships[se1_id]: - return self.inclusion_relationships[se1_id][se2_id] - - inc_chain_P131 = get_inclusion_chain(se1_id, "P131") - inc_chain_P706 = get_inclusion_chain(se1_id, "P706") - inc_chain = inc_chain_P131 - inc_chain.extend(inc_chain_P706) - inc_chain = set(inc_chain) - if se2_id in inc_chain: - self.add_cache_inclusion(se1_id, se2_id, True) - return True - - return False - - def is_adjacent_cache(self, se1, se2): - """ - Return true if two spatial entities were found adjacent previously. - - Parameters - ---------- - se1 : str - id of the first spatial entity - se2 : str - id of the second spatial entity - - Returns - ------- - bool - if se1 adjacent to se2 - """ - - if se1 in STR.__cache_adjacency: - if se2 in STR.__cache_adjacency[se1]: - return STR.__cache_adjacency[se1][se2] - if se2 in STR.__cache_adjacency: - if se1 in STR.__cache_adjacency[se2]: - return STR.__cache_adjacency[se2][se1] - return False - - def is_included_cache(self, se1, se2): - """ - Return true if a spatial entity were found included previously in an other one. - - Parameters - ---------- - se1 : str - id of the first spatial entity - se2 : str - id of the second spatial entity - - Returns - ------- - bool - if se1 included to se2 - """ - if se1 in STR.__cache_inclusion: - if se2 in STR.__cache_inclusion[se1]: - return STR.__cache_inclusion[se1][se2] - return False - - def is_adjacent(self, se1, se2, datase1=None, datase2=None): - """ - Return true if se1 is adjacent to se2. - - Parameters - ---------- - se1 : str - id of the first spatial entity - se2 : str - id of the second spatial entity - datase1 : gazpy.Element, optional - if given cached data concerning the spatial entity with id = se1 (the default is None) - datase2 : gazpy.Element, optional - if given cached data concerning the spatial entity with id = se2 (the default is None) - - Returns - ------- - bool - true if adjacent - """ - - stop_class = set(["A-PCLI", "A-ADM1"]) - - def get_p47_adjacency_data(data): - p47se1 = [] - for el in data.other.P47: - d = gazetteer.get_by_other_id(el, "wikidata") - if not d: continue - p47se1.append(d[0].id) - return p47se1 - - if self.is_adjacent_cache(se1, se2): - return False - - if self.is_included_in(se1, se2) or self.is_included_in(se2, se1): - return False - - data_se1, data_se2 = self.get_data(se1), self.get_data(se2) - - if "P47" in data_se2.other and se1 in get_p47_adjacency_data(data_se2): - return True - # print("P47") - elif "P47" in data_se1.other and se2 in get_p47_adjacency_data(data_se1): - return True - # print("P47") - - if collisionTwoSEBoundaries(se1, se2): - return True - - if data_se1 and data_se2 and "coord" in data_se1.other and "coord" in data_se2.other: - if Point(data_se1.coord.lon, data_se1.coord.lat).distance( - Point(data_se2.coord.lon, data_se2.coord.lat)) < 1 and len( - set(data_se1.class_) & stop_class) < 1 and len(set(data_se2.class_) & stop_class) < 1: - return True - return False def get_inclusion_relationships(self): """ Find all the inclusion relationships between the spatial entities declared in the current STR. """ - for se_ in tqdm(self.spatial_entities, desc="Extract Inclusion"): - inc_chain_P131 = get_inclusion_chain(se_, "P131") - inc_chain_P706 = get_inclusion_chain(se_, "P706") - - inc_chain = inc_chain_P131 - inc_chain.extend(inc_chain_P706) - inc_chain = set(inc_chain) - for se2_ in self.spatial_entities: - if se2_ in inc_chain: + if se_ != se2_ and self.inc_rel_db.is_relation(se_,se2_): self.add_inclusion_rel(se_, se2_) def get_adjacency_relationships(self): @@ -533,21 +349,11 @@ class STR(object): Find all the adjacency relationships between the spatial entities declared in the current STR. """ - data = {se: self.get_data(se) for se in self.spatial_entities} - for se1 in tqdm(self.spatial_entities, desc="Extract Adjacency Relationship"): - data_se1 = data[se1] for se2 in self.spatial_entities: - if se1 == se2: continue - if se1 in self.adjacency_relationships: - if se2 in self.adjacency_relationships[se1]: - continue - if se2 in self.adjacency_relationships: - if se1 in self.adjacency_relationships[se2]: - continue - data_se2 = data[se2] - if self.is_adjacent(se1, se2, data_se1, data_se2): - self.add_adjacency_rel(se1, se2) + if se1 != se2 and self.adj_rel_db.is_relation(se1, se2): + self.add_adjacency_rel(se1,se2) + def build(self, inc=True, adj=True, verbose=False): """ @@ -576,7 +382,6 @@ class STR(object): graph.add_nodes_from(nodes) if adj: - debut = time.time() self.get_adjacency_relationships() for se1 in self.adjacency_relationships: for se2 in self.adjacency_relationships[se1]: @@ -585,7 +390,6 @@ class STR(object): graph.add_edge(se2, se1, key=0, color="green") if inc: - debut = time.time() self.get_inclusion_relationships() for se1 in self.inclusion_relationships: for se2 in self.inclusion_relationships[se1]: @@ -616,7 +420,7 @@ class STR(object): except: print("Error while saving STR to {0}".format(format)) - def getUndirected(self): + def get_undirected(self,simple_graph=True): """ Return the Undirected form of a STR graph. @@ -625,8 +429,9 @@ class STR(object): networkx.Graph unidirected graph """ - - return nx.Graph(self.graph) + if simple_graph: + return nx.Graph(self.graph) + return nx.MultiGraph(self.graph) def get_geo_data_of_se(self): """ @@ -762,7 +567,7 @@ class STR(object): Matplotlib figure instance """ - import matplotlib.pyplot as plt + world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')) base = world.plot(color='white', edgecolor='black', figsize=(16, 9)) points = [] @@ -804,24 +609,6 @@ class STR(object): plt.show() -# def to_Multipoints(x): -# """ -# Return a polygon buffered representation for a set of point - -# Parameters -# ---------- -# x : pandas.Series -# coordinates columns - -# Returns -# ------- -# shapely.geometry.Polygon -# polygon -# """ - -# #print(x[["x","y"]].values) -# return Polygon([Point(z) for z in x[["x","y"]].values]).buffer(1) - def to_Polygon(x): """ Return a polygon buffered representation for a set of points.