diff --git a/.gitignore b/.gitignore index f380ba1cf8b7da69be21b3e47ca8bfce5792c48c..805fbd71f81f33a10cb43c3b70f9a8d31550e52b 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ __pycache__/ *cache.json *.gexf temp_cluster/ +agromada* \ No newline at end of file diff --git a/strpython/helpers/relation_extraction.py b/strpython/helpers/relation_extraction.py deleted file mode 100644 index 6cba2245beefbb5cfc6191b3f2831799b05dfab6..0000000000000000000000000000000000000000 --- a/strpython/helpers/relation_extraction.py +++ /dev/null @@ -1,127 +0,0 @@ -# coding = utf-8 -from shapely.geometry import Point - -from .collision import collide -from .geo_relation_database import GeoRelationMatchingDatabase -from ..helpers.geodict_helpers import gazetteer - - -class RelationExtractor(): - __cache_entity_data = {} - - def __init__(self, geo_rel_match_database=GeoRelationMatchingDatabase()): - self.db_rel_match = geo_rel_match_database - - def is_relation(self, id_se1: str, id_se2: str): - raise NotImplementedError() - - def get_data(self, id_se): - """ - Return an gazpy.Element object containing information about a spatial entity. - - Parameters - ---------- - id_se : str - Identifier of the spatial entity - - Returns - ------- - gazpy.Element - data - """ - - if id_se in RelationExtractor.__cache_entity_data: - return RelationExtractor.__cache_entity_data[id_se] - data = gazetteer.get_by_id(id_se) - if len(data) > 0: - RelationExtractor.__cache_entity_data[id_se] = data[0] - return data[0] - - -class AdjacencyRelation(RelationExtractor): - - def __init__(self): - RelationExtractor.__init__(self) - - def is_relation(self, id_se1: str, id_se2: str): - found_, value = self.db_rel_match.get_adjacency(id_se1, id_se2) - if found_: - return value - - stop_class = {"A-PCLI", "A-ADM1"} - - def get_p47_adjacency_data(data): - p47se1 = [] - for el in data.other.P47: - d = gazetteer.get_by_other_id(el, "wikidata") - if not d: continue - p47se1.append(d[0].id) - return p47se1 - - - inc = InclusionRelation() - if inc.is_relation(id_se1, id_se2) or inc.is_relation(id_se2, id_se1): - self.db_rel_match.add_adjacency(id_se1, id_se2,False) - return False - - data_se1, data_se2 = self.get_data(id_se1), self.get_data(id_se2) - - if "P47" in data_se2.other and id_se1 in get_p47_adjacency_data(data_se2): - self.db_rel_match.add_adjacency(id_se1, id_se2,False) - return True - - elif "P47" in data_se1.other and id_se2 in get_p47_adjacency_data(data_se1): - self.db_rel_match.add_adjacency(id_se1, id_se2,True) - return True - - - if collide(id_se1, id_se2): - self.db_rel_match.add_adjacency(id_se1, id_se2,True) - return True - - if data_se1 and data_se2 and "coord" in data_se1 and "coord" in data_se2: - if Point(data_se1.coord.lon, data_se1.coord.lat).distance( - Point(data_se2.coord.lon, data_se2.coord.lat)) < 1 and len( - set(data_se1.class_) & stop_class) < 1 and len(set(data_se2.class_) & stop_class) < 1: - self.db_rel_match.add_adjacency(id_se1, id_se2,True) - return True - - self.db_rel_match.add_adjacency(id_se1, id_se2,False) - return False - - -class InclusionRelation(RelationExtractor): - - def __init__(self): - RelationExtractor.__init__(self) - - def is_relation(self, id_se1: str, id_se2: str): - found_, value = self.db_rel_match.get_inclusion(id_se1, id_se2) - if found_: - return value - - inc_chain_P131, inc_chain_P706 = self.get_inclusion_chain(id_se1, "P131"), self.get_inclusion_chain(id_se1,"P706") - inc_chain = inc_chain_P131 - inc_chain.extend(inc_chain_P706) - inc_chain = set(inc_chain) - - if id_se2 in inc_chain: - self.db_rel_match.add_inclusion(id_se1, id_se2, True) - return True - - self.db_rel_match.add_inclusion(id_se1, id_se2, False) - return False - - def get_inclusion_chain(self, id_, prop): - """ - For an entity return it geographical inclusion tree using a property. - """ - arr__ = [] - current_entity = gazetteer.get_by_id(id_)[0] - if "inc_" + prop in current_entity.other: - arr__ = current_entity.other["inc_" + prop] - elif "inc_geoname" in current_entity.other: - arr__ = current_entity.other.inc_geoname - if isinstance(arr__, str): - arr__ = [arr__] - return arr__ diff --git a/strpython/models/spatial_relation.py b/strpython/models/spatial_relation.py index baa7c10ca3d1e93f1483ad98c606e5e20d7697e9..b3b02fce79f3dd14e5c2fceabd080a8c38fdd4bf 100644 --- a/strpython/models/spatial_relation.py +++ b/strpython/models/spatial_relation.py @@ -4,12 +4,8 @@ from multiprocessing import cpu_count from shapely.geometry import Point, Polygon import geopandas as gpd import pandas as pd -import numpy as np - -from joblib import Parallel, delayed from mytoolbox.env import in_notebook - if in_notebook(): from tqdm._tqdm_notebook import tqdm_notebook as tqdm else: @@ -20,6 +16,7 @@ from strpython.helpers.geodict_helpers import gazetteer from mytoolbox.structure.objectify import objectify + class MetaCollector(): __cache_entity_data = {} @@ -61,7 +58,7 @@ class MetaCollector(): Returns ------- bool - if relation exists + if a relation exists """ raise NotImplementedError() @@ -70,10 +67,18 @@ class RelationExtractor(MetaCollector): __cache_entity_data = {} def __init__(self, spatial_entities): + """ + Constructor + Parameters + ---------- + spatial_entities : list + list of spatial entities identifier + """ self.spatial_entities = spatial_entities + # Retrieve Geometries data = [[sp_id, getGEO(sp_id)] for sp_id in - tqdm(spatial_entities, desc="Retrieving Geometries...")] + tqdm(spatial_entities, desc="Retrieving Geometries...")] self.all_geometry = [] for i in data: @@ -82,10 +87,18 @@ class RelationExtractor(MetaCollector): else: self.all_geometry.append([i[0], i[1].geometry.values[0]]) - self.adjacency_geom, self.inclusion_geom = {}, {} - self.adjacency_meta, self.inclusion_meta = {}, {} + self.adjacency_geom, self.inclusion_geom = pd.DataFrame(), pd.DataFrame() + self.adjacency_meta, self.inclusion_meta = pd.DataFrame(), pd.DataFrame() def get_relation_geometry_based(self): + """ + Extract adjacency and inclusion relation based on the geometries of each spatial entities. + + Each relation is extracted based on the following logical operation: + * Adj(Sa, Sb) = (Intersect(Sa,Sb) XOR (Within(Sa,Sb) OR Within(Sb,Sa))) AND Intersect(Sa,Sb)) + * Inclusion(Sa, Sb) = Within(Sa,Sb) + + """ if not self.all_geometry: raise ValueError("No geometry extracted. Check the `spatial_entities` arg during the initialization.") @@ -103,7 +116,7 @@ class RelationExtractor(MetaCollector): except Exception as e: print(e) - corr_ = gdf_intersect.iloc[:, 2:] ^ gdf_within.iloc[:,2:] # An entity cannot be related to an other entity by two type of relation + corr_ = gdf_intersect.iloc[:, 2:] ^ (gdf_within.iloc[:,2:] | gdf_within.iloc[:,2:].T) # An entity cannot be related to an other entity by two type of relation adj_ = gdf_intersect.iloc[:, 2:] & corr_ # because if include and not adjacent does not mean Adjacent ! gdf_adjacency = gdf_within.iloc[:, :2] @@ -117,63 +130,94 @@ class RelationExtractor(MetaCollector): self.inclusion_geom = gdf_within.set_index("id") def get_relation_meta_based(self): + """ + Extract relation between spatial entities based on their meta-data. + + """ meta_adj_extractor = AdjacencyMetaRelation(self.spatial_entities) meta_inc_extractor = InclusionMetaRelation(self.spatial_entities) + inc_res = {} + for se1 in tqdm(self.spatial_entities, desc="Retrieve Inclusion based on meta_data"): + for se2 in self.spatial_entities: + if not se1 in inc_res: inc_res[se1] = {} + if meta_inc_extractor.is_relation(se1, se2): + inc_res[se1][se2] = True + else: + inc_res[se1][se2] = False + adj_res = {} for i in tqdm(range(len(self.spatial_entities)), desc="Retrieve Adjacency based on meta-data"): se1 = self.spatial_entities[i] sub_spat = self.spatial_entities[i:len(self.spatial_entities)] - res = Parallel(n_jobs=4, backend="multiprocessing")(delayed(meta_adj_extractor.is_relation)(se1, se2) for se2 in sub_spat) for j in range(len(sub_spat)): se2 = sub_spat[j] if not se1 in adj_res: adj_res[se1] = {} if not se2 in adj_res: adj_res[se2] = {} - adj_res[se1][se2] = res[j] - adj_res[se2][se1] = res[j] - - inc_res = {} - for se1 in tqdm(self.spatial_entities, desc="Retrieve Inclusion based on meta_data"): - res = Parallel(n_jobs=4, backend="multiprocessing")(delayed(meta_inc_extractor.is_relation)(se1, se2) for se2 in self.spatial_entities) - #res= [meta_inc_extractor.is_relation(se1, se2) for se2 in self.spatial_entities] - for i,se2 in enumerate(self.spatial_entities): - if not se1 in inc_res: inc_res[se1] = {} - adj_res[se1][se2] = res[j] + adj_res[se1][se2] = meta_adj_extractor.is_relation(se1, se2) + adj_res[se2][se1] = adj_res[se1][se2] self.adjacency_meta = pd.DataFrame.from_dict(adj_res) self.inclusion_meta = pd.DataFrame.from_dict(inc_res) def fuse_meta_and_geom(self): - # To apply logical combination correctly ! - self.adjacency_meta.sort_index(inplace= True) + """ + Merge relation found using two methods : + * relation found using entities geometries + * relation found using entities meta-data + + Returns + ------- + tuple(pd.DataFrame, pd.DataFrame) + adjacency relation dataframe, inclusion relation dataframe + """ + # We sort both index and columns to apply logical combination correctly ! + self.adjacency_meta.sort_index(inplace=True) self.inclusion_meta.sort_index(inplace=True) self.adjacency_geom.sort_index(inplace=True) self.inclusion_geom.sort_index(inplace=True) - self.adjacency_meta.sort_index(axis=1,inplace=True) - self.inclusion_meta.sort_index(axis=1,inplace=True) - self.adjacency_geom.sort_index(axis=1,inplace=True) - self.inclusion_geom.sort_index(axis=1,inplace=True) + self.adjacency_meta.sort_index(axis=1, inplace=True) + self.inclusion_meta.sort_index(axis=1, inplace=True) + self.adjacency_geom.sort_index(axis=1, inplace=True) + self.inclusion_geom.sort_index(axis=1, inplace=True) df_adj = self.adjacency_meta.copy() df_inc = self.inclusion_meta.copy() - df_adj.iloc[:,:] = self.adjacency_meta | self.adjacency_geom - df_inc.iloc[:,:] = self.inclusion_meta | self.inclusion_geom + df_adj.iloc[:, :] = self.adjacency_meta | self.adjacency_geom + df_inc.iloc[:, :] = self.inclusion_meta | self.inclusion_geom return df_adj, df_inc + class AdjacencyMetaRelation(MetaCollector): def __init__(self, spatial_entities): + """ + Constructor + + Parameters + ---------- + spatial_entities : list + list of spatial entities identifier + """ MetaCollector.__init__(self) self.p_47_dict = {} self.distances_is_inf_to = {} self.get_all_p47(spatial_entities) self.get_all_distances(spatial_entities) - def get_all_p47(self, ses): + def get_all_p47(self, spatial_entities): + """ + Extract P47 property value for each spatial entities in `ses` + Parameters + ---------- + spatial_entities : list + list of spatial entities identifier + + """ p47_dict = {} - for es in tqdm(ses, "Extract all P47 data..."): + for es in tqdm(spatial_entities, desc="Extract all P47 data..."): data = self.get_data(es) p47se1 = [] if "P47" in data: @@ -184,40 +228,36 @@ class AdjacencyMetaRelation(MetaCollector): p47_dict[data.id] = p47se1 self.p_47_dict = p47_dict - def compute_dist(self,data_se1, data_se2): - stop_class = {"A-PCLI", "A-ADM1"} - if data_se1["lat"] == np.inf : - return np.inf - return Point(data_se1["lon"], data_se1["lat"]).distance(Point(data_se2["lon"], data_se2["lon"])) + def get_all_distances(self, spatial_entities, max_d=1): + """ + Compute the distance between all entities. Then associate a boolean value if a distance inferior to `max_d` value. + Parameters + ---------- + spatial_entities : list + list of spatial entities - def get_all_distances(self, spatial_entities): + """ stop_class = {"A-PCLI", "A-ADM1"} # Country or First Adminstrative cut - data = {} - for sp_ in spatial_entities: - dd=self.get_data(sp_) - if "coord" in dd: - data[sp_]={"lat":dd.coord.lat,"lon":dd.coord.lon} - else: - data[sp_] = {"lat": np.inf, "lon": np.inf} + data = {es: MetaCollector().get_data(es) for es in spatial_entities} - dist_all = {} - for es in tqdm(spatial_entities): - res_ = Parallel(n_jobs=4, backend="multiprocessing")(delayed(self.compute_dist)(data[es], data[es2]) for es2 in spatial_entities) - for ix,es2 in enumerate(spatial_entities): - if not es in dist_all:dist_all[es]={} - if not es2 in dist_all: dist_all[es2] = {} - dist_all[es][es2]=res_[ix] - dist_all[es2][es] = res_[ix] + coord = [MetaCollector().get_data(es) for es in spatial_entities] + coord = [([c.id, Point(c.coord.lon, c.coord.lat)] if "coord" in c else [c.id, Point(180, 90)]) for c in coord] + + df = gpd.GeoDataFrame(coord, columns="id geometry".split()) + for row in tqdm(df.itertuples(), total=len(df)): + df["{0}".format(row.id)] = df.distance(row.geometry) + + dist_all = df.set_index("id").to_dict() for se1 in tqdm(spatial_entities, desc="Compute Distances"): if not se1 in self.distances_is_inf_to: self.distances_is_inf_to[se1] = {} for se2 in spatial_entities: - data_se1, data_se2 = data[se1], data[se2] + data_se1, data_se2 = data[se1], data[se2] if data_se1 and data_se2 and "coord" in data_se1 and "coord" in data_se2: not_in_stop = len(set(data_se1.class_) & stop_class) < 1 and len( set(data_se2.class_) & stop_class) < 1 - self.distances_is_inf_to[se1][se2] = dist_all[se1][se2] < 1 and not_in_stop + self.distances_is_inf_to[se1][se2] = dist_all[se1][se2] < max_d and not_in_stop else: self.distances_is_inf_to[se1][se2] = False @@ -236,11 +276,19 @@ class AdjacencyMetaRelation(MetaCollector): class InclusionMetaRelation(MetaCollector): _inc_chain_cache = {} - def __init__(self,spatial_entities): + def __init__(self, spatial_entities): + """ + Constructor + + Parameters + ---------- + spatial_entities : list + list of spatial entities identifier + """ MetaCollector.__init__(self) self._inc_chain_cache = {} - for se in tqdm(spatial_entities,desc="Extract Inclusion Chains"): - inc_chain_P131, inc_chain_P706 = self.get_inclusion_chain(se, "P131"), self.get_inclusion_chain(se,"P706") + for se in tqdm(spatial_entities, desc="Extract Inclusion Chains"): + inc_chain_P131, inc_chain_P706 = self.get_inclusion_chain(se, "P131"), self.get_inclusion_chain(se, "P706") inc_chain = inc_chain_P131 inc_chain.extend(inc_chain_P706) inc_chain = set(inc_chain) @@ -256,6 +304,18 @@ class InclusionMetaRelation(MetaCollector): def get_inclusion_chain(self, id_, prop): """ For an entity return it geographical inclusion tree using a property. + + Parameters + ---------- + id_ : str + spatial entity identifier + prop : str + inclusion meta-data identifier + + Returns + ------- + list + inclusion chain """ arr__ = [] current_entity = gazetteer.get_by_id(id_)[0]