From c9669cc995d377b138aa7c193d49a5675f7c98f2 Mon Sep 17 00:00:00 2001 From: Fize Jacques <jacques.fize@cirad.fr> Date: Wed, 11 Jul 2018 21:55:11 +0200 Subject: [PATCH] Debug Create server script oriented to run Geomatching --- generate_similarity_matrix.py | 74 +++++++++++++++ strpython/eval/stats.py | 31 +++++++ .../helpers/collision_with_gazetteer_data.py | 91 +++++++++---------- strpython/models/str.py | 90 ++++++++++++------ 4 files changed, 210 insertions(+), 76 deletions(-) create mode 100644 generate_similarity_matrix.py create mode 100644 strpython/eval/stats.py diff --git a/generate_similarity_matrix.py b/generate_similarity_matrix.py new file mode 100644 index 0000000..a547380 --- /dev/null +++ b/generate_similarity_matrix.py @@ -0,0 +1,74 @@ +# coding = utf-8 +import glob + +from gmatch4py.bag_of_cliques import BagOfCliques +from gmatch4py.base import Base +from gmatch4py.ged.graph_edit_dist import GraphEditDistance +from gmatch4py.ged.bipartite_graph_matching_2 import BP_2 +from gmatch4py.ged.greedy_edit_distance import GreedyEditDistance +from gmatch4py.ged.hausdorff_edit_distance import HED +from gmatch4py.jaccard import Jaccard +from gmatch4py.kernels.weisfeiler_lehman import * +from gmatch4py.mcs import MCS +from gmatch4py.vertex_edge_overlap import VertexEdgeOverlap + +import argparse, os, sys, re, json + +parser = argparse.ArgumentParser() +parser.add_argument("graphs_input_dir") +parser.add_argument("matrix_output_dir") +parser.add_argument("-d", action="store_true", help="Return distance matrix") + +args = parser.parse_args() +if not os.path.exists(args.graphs_input_dir): + print("Input graph directory doesn't exist!") + sys.exit(1) + +if not os.path.exists(args.matrix_output_dir): + print("Output matrix directory doesn't exist!") + print("Creating directory") + os.makedirs(args.matrix_output_dir) + print("Directory created") + +graphs = [] +mapping_files_to_graphs = {} + +# Loading graphs +fns = glob.glob(args.graphs_input_dir.rstrip("/") + "/*.gexf") +if not fns: + print("Input dir empty! Not .gexf file found!") + +i = 0 +for fn in fns: + graphs.append(nx.read_gexf(fn)) + mapping_files_to_graphs[i] = fn + +#print(graphs) + +# Compute matrices +for class_ in [BagOfCliques, GraphEditDistance, BP_2, GreedyEditDistance, HED, Jaccard, WeisfeleirLehmanKernel, MCS, + VertexEdgeOverlap]: + print("Computing the Similarity Matrix for {0}".format(class_.__name__)) + + if class_ in (GraphEditDistance, BP_2, GreedyEditDistance, HED): + comparator = class_(1, 1, 1, 1) + elif class_ == WeisfeleirLehmanKernel: + comparator = class_(h=2) + else: + comparator=class_() + matrix = comparator.compare(graphs, None) + if not args.d: + matrix = comparator.similarity(matrix) + else: + matrix= comparator.distance(matrix) + print("Matrix ready. Saving ...") + output_fn="{0}/{1}_{2}.npy".format( + args.matrix_output_dir.rstrip("/"), + class_.__name__, + os.path.dirname(args.graphs_input_dir).replace("/","_") + ) + np.save(output_fn,matrix) + print("Matrix Saved") + +json.dump(mapping_files_to_graphs,open("{0}/{1}".format(args.matrix_output_dir.rstrip("/"),"metadata.json"))) +print("Done") \ No newline at end of file diff --git a/strpython/eval/stats.py b/strpython/eval/stats.py new file mode 100644 index 0000000..915c554 --- /dev/null +++ b/strpython/eval/stats.py @@ -0,0 +1,31 @@ +# coding = utf-8 + +from ..helpers.gazeteer_helpers import get_data +import numpy as np + +def flattern(A): + rt = [] + for i in A: + if isinstance(i, list): + rt.extend(flattern(i)) + elif isinstance(i, np.ndarray): + rt.extend(flattern(i.tolist())) + else: + rt.append(i) + return rt + +def most_common(lst): + if len(list(set(lst))) >1 and "P-PPL" in set(lst): + lst=[x for x in lst if x != "PPL"] + return max(set(lst), key=lst.count) + +def granularity(graph): + """ + Return the granularity of a STR + :param graph: + :return: + """ + class_list=flattern([get_data(n)["class"] for n in list(graph.nodes())]) + if not class_list: + return [] + return most_common(class_list) \ No newline at end of file diff --git a/strpython/helpers/collision_with_gazetteer_data.py b/strpython/helpers/collision_with_gazetteer_data.py index 232c340..4e0665d 100644 --- a/strpython/helpers/collision_with_gazetteer_data.py +++ b/strpython/helpers/collision_with_gazetteer_data.py @@ -1,57 +1,65 @@ import json import os +import shapely from scipy.spatial import ConvexHull from shapely.geometry import Polygon, Point, shape from ..config.configuration import config from .gazeteer_helpers import get_data -from .collision import collide +#from .collision import collide +import geopandas as gpd __cache={} __cache_adjacency={} __limit_cache=400 -def parseHull(hull_object,points): - hull=[] - for simplex in hull_object.simplices: - hull.append(points[simplex[0]]) - hull.append(points[simplex[1]]) - return hull - -def getConvexHull(id): - res = get_data(id) - if not "path" in res:return [] - path = res["path"] - data=json.load(open(os.path.join(config.osm_boundaries_directory,path))) - boundaries=data["geometry"]["coordinates"] - if data["geometry"]["type"]== "Polygon": - hull = parseHull(ConvexHull(boundaries[-1]),boundaries[-1]) - return [hull] - else: - hull=[] - for bound in boundaries[-1]: - hull.append(parseHull(ConvexHull(bound),bound)) - return hull - def add_cache(id_,hull): global __cache,__limit_cache if len(__cache) > __limit_cache: __cache={} __cache[id_]=hull -def include_in(hull_1,hull_2): - n_=len(hull_1) - inside=0 - poly=Polygon(hull_2) - for p in hull_1: - if Point(p[0],p[1]).within(poly): - inside+=1 - if inside/n_ > 0.5: - return True +def getGEO(id_se): + data=get_data(id_se) + if "path" in data: + return gpd.read_file(os.path.join(config.osm_boundaries_directory, data["path"])).geometry + elif "coord" in data: + return Point(data["coord"]["lon"],data["coord"]["lat"]) + return None +def collide(se1,se2): + try: + if se1 in __cache: + data_se1=__cache[se1] + else: + data_se1 = gpd.GeoSeries(list(getGEO(se1).values[0])) + add_cache(se1,data_se1) + if se2 in __cache: + data_se2=__cache[se2] + else: + data_se2 = gpd.GeoSeries(list(getGEO(se2).values[0])) + add_cache(se2, data_se2) + except: + return False + + if type(data_se1) != type(data_se2): + if type(data_se1) == gpd.geoseries.GeoSeries: + return data_se1.intersects(data_se2).any() + else: + return data_se2.intersects(data_se1).any() + try: + if data_se1.intersects(data_se2): + return True + except: + if data_se1.intersects(data_se2).any(): + return True return False + + + + def collisionTwoSEBoundaries(id_SE1,id_SE2): global __cache,__cache_adjacency if id_SE1 in __cache_adjacency: @@ -61,24 +69,11 @@ def collisionTwoSEBoundaries(id_SE1,id_SE2): if id_SE1 in __cache_adjacency[id_SE2]: return __cache_adjacency[id_SE2][id_SE1] - if id_SE1 in __cache: - hull_1=__cache[id_SE1] - else: - hull_1 = getConvexHull(id_SE1) - add_cache(id_SE1,hull_1) - if id_SE2 in __cache: - hull_2=__cache[id_SE2] - else: - hull_2 = getConvexHull(id_SE2) - add_cache(id_SE2, hull_2) - if not id_SE1 in __cache_adjacency: __cache_adjacency[id_SE1]={} - for h1 in hull_1: - for h2 in hull_2: - if collide(h1,h2) and not include_in(h1,h2): - __cache_adjacency[id_SE1][id_SE2] = True - return True + if collide(id_SE1,id_SE2): #and not include_in(h1,h2): + __cache_adjacency[id_SE1][id_SE2] = True + return True __cache_adjacency[id_SE1][id_SE2]=False return False diff --git a/strpython/models/str.py b/strpython/models/str.py index 8d4d20a..3c255b0 100644 --- a/strpython/models/str.py +++ b/strpython/models/str.py @@ -6,7 +6,7 @@ import networkx as nx import pandas as pd import logging -from shapely.geometry import Point +from shapely.geometry import Point, MultiPoint, MultiLineString, LineString from ..config.configuration import config #logging.basicConfig(filename=config.log_file,level=logging.INFO) @@ -17,7 +17,7 @@ from ..helpers.collision_with_gazetteer_data import collisionTwoSEBoundaries from ..helpers.deprecated import deprecated from ..helpers.gazeteer_helpers import get_data, get_data_by_wikidata_id from ..nlp.ner.ner import NER - +import geopandas as gpd def get_inclusion_chain(id_, prop): """ @@ -223,7 +223,7 @@ class STR(object): p47se1 = [] for el in data["P47"]: d = get_data_by_wikidata_id(el) - if not "hits" in d: + if "id" in d: p47se1.append(d["id"]) return p47se1 @@ -233,47 +233,39 @@ class STR(object): :return: """ stop_class=set(["A-PCLI","A-ADM1"]) + for se1 in self.spatial_entities: data_se1 = get_data(se1) for se2 in self.spatial_entities: - if se1 == se2: - continue + if se1 == se2: continue + if self.is_included_in(se1,se2) or self.is_included_in(se2,se1): + continue if se1 in self.adjacency_relationships: if se2 in self.adjacency_relationships[se1]: continue if se2 in self.adjacency_relationships: if se1 in self.adjacency_relationships[se2]: continue - - if self.is_included_in(se1, se2) or self.is_included_in(se2, se1): - continue - data_se2 = get_data(se2) - f = False - if "P47" in data_se1 and "P47" in data_se2: - p47se1 = self.getP47AdjacencyData(data_se1) - if se2 in p47se1: + if "P47" in data_se2: + if se1 in self.getP47AdjacencyData(data_se2): f = True - else: - p47se2 = self.getP47AdjacencyData(data_se2) - if se1 in p47se2: + #print(data_se1["en"], data_se2["en"], "P47") + if not f: + if "P47" in data_se2: + if se2 in self.getP47AdjacencyData(data_se2): f = True + #print(data_se1["en"], data_se2["en"], "P47") if not f: - #if not self.shapes: - if collisionTwoSEBoundaries(se1, se2): - f = True - #else: - #if is_intersect(se1, se2, self.shapes): - #f = True + f = collisionTwoSEBoundaries(se1, se2) if not f: - #print(data_se1,data_se2) - if Point(data_se1["coord"]["lon"],data_se1["coord"]["lat"]).distance(Point(data_se2["coord"]["lon"], data_se2["coord"]["lat"])) < 1\ - and len(set(data_se1["class"]) & stop_class) < 1 and len(set(data_se2["class"]) & stop_class) < 1: - f=True - - self.add_adjacency_rel(se1, se2,f) + if Point(data_se1["coord"]["lon"], data_se1["coord"]["lat"]).distance( + Point(data_se2["coord"]["lon"], data_se2["coord"]["lat"])) < 1 and len( + set(data_se1["class"]) & stop_class) < 1 and len(set(data_se2["class"]) & stop_class) < 1: + f = True + self.add_adjacency_rel(se1, se2, f) @@ -346,3 +338,45 @@ class STR(object): self.add_inclusion_rel(edge[0], edge[1]) def set_all_shapes(self,shapes): self.shapes=shapes + + def map_projection(self): + import matplotlib.pyplot as plt + world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')) + base = world.plot(color='white', edgecolor='black', figsize=(16, 9)) + points=[] + for se in self.spatial_entities: + data=get_data(se) + try: + points.append(Point(data["coord"]["lon"],data["coord"]["lat"])) + except: + pass + + lines_adj=[] + for se1 in self.adjacency_relationships: + data_se1=get_data(se1) + for se2 in self.adjacency_relationships[se1]: + data_se2 = get_data(se2) + if self.adjacency_relationships[se1][se2]: + lines_adj.append( + LineString([(data_se1["coord"]["lon"],data_se1["coord"]["lat"]),(data_se2["coord"]["lon"], data_se2["coord"]["lat"])]) + ) + lines_inc=[] + for se1 in self.inclusion_relationships: + data_se1=get_data(se1) + for se2 in self.inclusion_relationships[se1]: + if self.inclusion_relationships[se1][se2]: + data_se2 = get_data(se2) + lines_inc.append( + LineString([ + (data_se1["coord"]["lon"], data_se1["coord"]["lat"]), + (data_se2["coord"]["lon"], data_se2["coord"]["lat"])] + ) + ) + + gpd.GeoSeries(points).plot(ax=base,marker='o',markersize=5,color="blue") + gpd.GeoSeries(lines_adj).plot(ax=base, color="green") + gpd.GeoSeries(lines_inc).plot(ax=base, color="red") + print("adj",gpd.GeoSeries(lines_adj)) + print("inc",gpd.GeoSeries(lines_inc)) + plt.show() + -- GitLab