Add Cache relation extraction using sqlite + Add entry for sqlite db in config.json

847ed7fb · Fize Jacques · 4da62e60 · 847ed7fb · 847ed7fb · 847ed7fb
Commit 847ed7fb authored 6 years ago by Fize Jacques
Hide whitespace changes
Inline Side-by-side

Showing 7 changed files with 33 additions and 326 deletions
+33 -326
--- a/.gitignore
+++ b/.gitignore
@@ -26,4 +26,5 @@ __pycache__/
 *.npy
 *.pkl
 *cache.json
-*.gexf
\ No newline at end of file
+*.gexf
+temp_cluster/
--- a/auto_fill_annotation.py
+++ b/auto_fill_annotation.py
@@ -5,9 +5,13 @@ import argparse, os, re, json, glob
 import pandas as pd
 import networkx as nx

-from strpython.eval.automatic_annotation import AnnotationAutomatic,save_cache
+from strpython.eval.automatic_annotation import AnnotationAutomatic,save_cache,add_cache
 from strpython.models.str import STR
 from tqdm import tqdm,TqdmSynchronisationWarning
+from joblib import Parallel, delayed
+from multiprocessing import cpu_count
+
+
 import warnings
 warnings.simplefilter("ignore", TqdmSynchronisationWarning)
 tqdm.pandas()
@@ -40,10 +44,11 @@ def foo(x):
        return annotater.all(strs[x.G1], strs[x.G2],x.G1, x.G2)
    except KeyError as e:
        print(e)
+        add_cache(strs[x.G1], strs[x.G2],[0, 0, 0, 0])
        return [0, 0, 0, 0]


-df["res"] = df.progress_apply(lambda x: foo(x), axis=1)
+df["res"] = Parallel(n_jobs=cpu_count())(delayed(foo)(x) for x in tqdm(df.itertuples()))#df.progress_apply(lambda x: foo(x), axis=1)
 df.res=df.res.apply(lambda x :list(map(int,x)) if x else [])
 df[["c1"]] = df.res.apply(lambda x: x[0] if len(x)>0 else 0)
 df[["c2"]] = df.res.apply(lambda x: x[1] if len(x)>0 else 0)

--- a/strpython/config/config.json
+++ b/strpython/config/config.json
@@ -12,5 +12,6 @@
    "count":"/Users/jacquesfize/nas_cloud/Code/str-python/strpython/resources/count_wiki.pkl"
  },
  "language_resources_path":"/Users/jacquesfize/nas_cloud/Code/str-python/strpython/resources/language_resources",
-  "gazetteer":"geodict"
+  "gazetteer":"geodict",
+  "relation_db_path" : "/Users/jacquesfize/.services/relation_match.db"
 }
\ No newline at end of file
--- a/strpython/helpers/geodict_helpers.py
+++ b/strpython/helpers/geodict_helpers.py
@@ -5,7 +5,7 @@ import re
 from elasticsearch import Elasticsearch
 from ..config.configuration import config
 import pandas as pd
-from ..helpers.objectify import objectify
+

 import gazpy as ga


--- a/strpython/helpers/geodict_helpers_old.py
+++ b/strpython/helpers/geodict_helpers_old.py
@@ -5,7 +5,7 @@ import re
 from elasticsearch import Elasticsearch
 from ..config.configuration import config
 import pandas as pd
-from ..helpers.objectify import objectify
+from mytoolbox.structure.objectify import objectify

 es = Elasticsearch(config.es_server)


--- a/strpython/helpers/objectify.py
+++ b/strpython/helpers/objectify.py
-#!/usr/bin/env python
-
-
-"""Scrap module.
-
-Just tiny bits & bolts.
-
-.. author: Adrian Castravete
-.. modified by : Jacques Fize (Implemented for Python 3 and recursive objectification)
-"""
-
-from functools import wraps
-
-
-def objectify(func):
-    """Mimic an object given a dictionary.
-
-    Given a dictionary, create an object and make sure that each of its
-    keys are accessible via attributes.
-    If func is a function act as decorator, otherwise just change the dictionary
-    and return it.
-    :param func: A function or another kind of object.
-    :returns: Either the wrapper for the decorator, or the changed value.
-
-    Example::
-
-    >>> obj = {'old_key': 'old_value'}
-    >>> oobj = objectify(obj)
-    >>> oobj['new_key'] = 'new_value'
-    >>> print oobj['old_key'], oobj['new_key'], oobj.old_key, oobj.new_key
-
-    >>> @objectify
-    ... def func():
-    ...     return {'old_key': 'old_value'}
-    >>> obj = func()
-    >>> obj['new_key'] = 'new_value'
-    >>> print obj['old_key'], obj['new_key'], obj.old_key, obj.new_key
-
-    """
-
-    def create_object(value):
-        """Create the object.
-
-        Given a dictionary, create an object and make sure that each of its
-        keys are accessible via attributes.
-        Ignore everything if the given value is not a dictionary.
-        :param value: A dictionary or another kind of object.
-        :returns: Either the created object or the given value.
-
-        """
-        if isinstance(value, dict):
-            # Build a simple generic object.
-            class Object(dict):
-                def __setitem__(self, key, val):
-                    setattr(self, key, val)
-                    return super(Object, self).__setitem__(key, val)
-
-            # Create that simple generic object.
-            ret_obj = Object()
-            # Assign the attributes given the dictionary keys.
-            for key, val in value.items():
-                if isinstance(val,dict):
-                    ret_obj[key] = objectify(val)
-                else:
-                    ret_obj[key] = val
-                setattr(ret_obj, key, val)
-            return ret_obj
-        else:
-            return value
-
-    # If func is a function, wrap around and act like a decorator.
-    if hasattr(func, '__call__'):
-        @wraps(func)
-        def wrapper(*args, **kwargs):
-            """Wrapper function for the decorator.
-
-            :returns: The return value of the decorated function.
-
-            """
-            value = func(*args, **kwargs)
-            return create_object(value)
-
-        return wrapper
-
-    # Else just try to objectify the value given.
-    else:
-        return create_object(func)
--- a/strpython/models/str.py
+++ b/strpython/models/str.py
 # coding = utf-8
 import copy
-import logging
 import os
 import time
 import warnings

+
 from tqdm import tqdm
 import folium
 import geopandas as gpd
 import networkx as nx
 import pandas as pd
 from shapely.geometry import MultiPoint, Polygon, Point, LineString
+from sklearn.cluster import MeanShift, estimate_bandwidth, dbscan
+import matplotlib.pyplot as plt

-from ..helpers.collision import collisionTwoSEBoundaries, add_cache_adjacency
 from ..helpers.geodict_helpers import gazetteer
-from ..eval.stats import most_common
-
-from sklearn.cluster import MeanShift, estimate_bandwidth, KMeans, dbscan
-import numpy as np
-
-
-# logging.basicConfig(filename=config.log_file,level=logging.INFO)
+from strpython.helpers.relation_extraction import AdjacencyRelation, InclusionRelation


 def get_inclusion_chain(id_, prop):
@@ -42,13 +37,11 @@ class STR(object):
    """
    Str basic structure
    """
-    __cache_inclusion = {}  # Store inclusion relations found between spaital entities
-    __cache_adjacency = {}  # Store adjacency relations found between spaital entities
    __cache_entity_data = {}  #  Store data about entity requested

    def __init__(self, tagged_text, spatial_entities,toponym_first=True):
        """
-        Constructir
+        Constructor

        Parameters
        ----------
@@ -71,6 +64,11 @@ class STR(object):
        self.adjacency_relationships = {}
        self.inclusion_relationships = {}

+        self.adj_rel_db=AdjacencyRelation()
+        self.inc_rel_db = InclusionRelation()
+
+        self.graph = nx.MultiDiGraph()
+
    @staticmethod
    def from_networkx_graph(g: nx.Graph, tagged_: list = []):
        """
@@ -164,10 +162,8 @@ class STR(object):
            id1, id2 = edge[0], edge[1]
            if edge[2]["color"] == "green":
                self.add_adjacency_rel(edge[0], edge[1])
-                self.add_cache__adjacency(id1, id2, True)
            elif edge[2]["color"] == "red":
                self.add_inclusion_rel(edge[0], edge[1])
-                self.add_cache_inclusion(id1, id2, True)

    def add_spatial_entity(self, id, label=None, v=True):
        """
@@ -213,7 +209,7 @@ class STR(object):
            except:
                label = None
            self.add_spatial_entity(id, label, False)
-        # print(self.graph.nodes(data=True))
+

    def add_adjacency_rel(self, se1, se2):
        """
@@ -231,7 +227,6 @@ class STR(object):
        if not se1 in self.adjacency_relationships: self.adjacency_relationships[se1] = {}
        if not se2 in self.adjacency_relationships: self.adjacency_relationships[se2] = {}
        self.adjacency_relationships[se1][se2], self.adjacency_relationships[se2][se1] = True, True
-        self.add_cache__adjacency(se1, se2, True)

    def add_inclusion_rel(self, se1, se2):
        """
@@ -248,47 +243,9 @@ class STR(object):
        if not se1 in self.inclusion_relationships:
            self.inclusion_relationships[se1] = {}
        self.inclusion_relationships[se1][se2] = True
-        self.add_cache_inclusion(se1, se2, True)
-
-    def add_cache_inclusion(self, id1, id2, v=True):
-        """
-        Add a relation of inclusion in a cache variable
-
-        Parameters
-        ----------
-        id1 : str
-            id of the first spatial entity
-        id2 : str
-            id of the second spatial entity
-        v : bool, optional
-            if the relation exists between the two spatial entities. Default is True
-
-        """

-        if not id1 in STR.__cache_inclusion:
-            STR.__cache_inclusion[id1] = {}
-        STR.__cache_inclusion[id1][id2] = v

-    def add_cache__adjacency(self, se1, se2, v=True):
-        """
-        Add a relation of adjacency in a cache variable

-        Parameters
-        ----------
-        id1 : str
-            id of the first spatial entity
-        id2 : str
-            id of the second spatial entity
-        v : bool, optional
-            if the relation exists between the two spatial entities. Default is True
-
-        """
-        if not se1 in STR.__cache_adjacency:
-            STR.__cache_adjacency[se1] = {}
-        if not se2 in STR.__cache_adjacency:
-            STR.__cache_adjacency[se2] = {}
-        STR.__cache_adjacency[se1][se2] = v
-        STR.__cache_adjacency[se2][se1] = v

    def get_data(self, id_se):
        """
@@ -376,156 +333,15 @@ class STR(object):
                if self.adjacency_relationships[se1][se2]:
                    self.graph.add_edge(se1, se2, key=0, color="green")

-    def is_included_in(self, se1_id, se2_id):
-        """
-        Return True if a spatial entity is included within another one.
-
-        Parameters
-        ----------
-        se1_id : str
-            id of the contained entity
-        se2_id : str
-            id of the entity container
-
-        Returns
-        -------
-        bool
-            if se1 included in se2
-        """
-
-        if se1_id in self.inclusion_relationships:
-            if se2_id in self.inclusion_relationships[se1_id]:
-                return self.inclusion_relationships[se1_id][se2_id]
-
-        inc_chain_P131 = get_inclusion_chain(se1_id, "P131")
-        inc_chain_P706 = get_inclusion_chain(se1_id, "P706")
-        inc_chain = inc_chain_P131
-        inc_chain.extend(inc_chain_P706)
-        inc_chain = set(inc_chain)
-        if se2_id in inc_chain:
-            self.add_cache_inclusion(se1_id, se2_id, True)
-            return True
-
-        return False
-
-    def is_adjacent_cache(self, se1, se2):
-        """
-        Return true if two spatial entities were found adjacent previously.
-
-        Parameters
-        ----------
-        se1 : str
-            id of the first spatial entity
-        se2 : str
-            id of the second spatial entity
-
-        Returns
-        -------
-        bool
-            if se1 adjacent to se2
-        """
-
-        if se1 in STR.__cache_adjacency:
-            if se2 in STR.__cache_adjacency[se1]:
-                return STR.__cache_adjacency[se1][se2]
-        if se2 in STR.__cache_adjacency:
-            if se1 in STR.__cache_adjacency[se2]:
-                return STR.__cache_adjacency[se2][se1]
-        return False
-
-    def is_included_cache(self, se1, se2):
-        """
-        Return true if a spatial entity were found included previously in an other one.
-
-        Parameters
-        ----------
-        se1 : str
-            id of the first spatial entity
-        se2 : str
-            id of the second spatial entity
-
-        Returns
-        -------
-        bool
-            if se1 included to se2
-        """
-        if se1 in STR.__cache_inclusion:
-            if se2 in STR.__cache_inclusion[se1]:
-                return STR.__cache_inclusion[se1][se2]
-        return False
-
-    def is_adjacent(self, se1, se2, datase1=None, datase2=None):
-        """
-        Return true if se1 is adjacent to se2.
-
-        Parameters
-        ----------
-        se1 : str
-            id of the first spatial entity
-        se2 : str
-            id of the second spatial entity
-        datase1 : gazpy.Element, optional
-            if given cached data concerning the spatial entity with id = se1 (the default is None)
-        datase2 : gazpy.Element, optional
-            if given cached data concerning the spatial entity with id = se2 (the default is None)
-
-        Returns
-        -------
-        bool
-            true if adjacent
-        """
-
-        stop_class = set(["A-PCLI", "A-ADM1"])
-
-        def get_p47_adjacency_data(data):
-            p47se1 = []
-            for el in data.other.P47:
-                d = gazetteer.get_by_other_id(el, "wikidata")
-                if not d: continue
-                p47se1.append(d[0].id)
-            return p47se1
-
-        if self.is_adjacent_cache(se1, se2):
-            return False
-
-        if self.is_included_in(se1, se2) or self.is_included_in(se2, se1):
-            return False
-
-        data_se1, data_se2 = self.get_data(se1), self.get_data(se2)
-
-        if "P47" in data_se2.other and se1 in get_p47_adjacency_data(data_se2):
-            return True
-            # print("P47")
-        elif "P47" in data_se1.other and se2 in get_p47_adjacency_data(data_se1):
-            return True
-            # print("P47")
-
-        if collisionTwoSEBoundaries(se1, se2):
-            return True
-
-        if data_se1 and  data_se2 and "coord" in data_se1.other and "coord" in data_se2.other:
-            if Point(data_se1.coord.lon, data_se1.coord.lat).distance(
-                    Point(data_se2.coord.lon, data_se2.coord.lat)) < 1 and len(
-                set(data_se1.class_) & stop_class) < 1 and len(set(data_se2.class_) & stop_class) < 1:
-                return True
-        return False

    def get_inclusion_relationships(self):
        """
        Find all the inclusion relationships between the spatial entities declared in the current STR.

        """
-
        for se_ in tqdm(self.spatial_entities, desc="Extract Inclusion"):
-            inc_chain_P131 = get_inclusion_chain(se_, "P131")
-            inc_chain_P706 = get_inclusion_chain(se_, "P706")
-
-            inc_chain = inc_chain_P131
-            inc_chain.extend(inc_chain_P706)
-            inc_chain = set(inc_chain)
-
            for se2_ in self.spatial_entities:
-                if se2_ in inc_chain:
+                if se_ != se2_ and self.inc_rel_db.is_relation(se_,se2_):
                    self.add_inclusion_rel(se_, se2_)

    def get_adjacency_relationships(self):
@@ -533,21 +349,11 @@ class STR(object):
        Find all the adjacency relationships between the spatial entities declared in the current STR.
        """

-        data = {se: self.get_data(se) for se in self.spatial_entities}
-
        for se1 in tqdm(self.spatial_entities, desc="Extract Adjacency Relationship"):
-            data_se1 = data[se1]
            for se2 in self.spatial_entities:
-                if se1 == se2: continue
-                if se1 in self.adjacency_relationships:
-                    if se2 in self.adjacency_relationships[se1]:
-                        continue
-                if se2 in self.adjacency_relationships:
-                    if se1 in self.adjacency_relationships[se2]:
-                        continue
-                data_se2 = data[se2]
-                if self.is_adjacent(se1, se2, data_se1, data_se2):
-                    self.add_adjacency_rel(se1, se2)
+                if se1 != se2 and self.adj_rel_db.is_relation(se1, se2):
+                    self.add_adjacency_rel(se1,se2)
+

    def build(self, inc=True, adj=True, verbose=False):
        """
@@ -576,7 +382,6 @@ class STR(object):
        graph.add_nodes_from(nodes)

        if adj:
-            debut = time.time()
            self.get_adjacency_relationships()
            for se1 in self.adjacency_relationships:
                for se2 in self.adjacency_relationships[se1]:
@@ -585,7 +390,6 @@ class STR(object):
                        graph.add_edge(se2, se1, key=0, color="green")

        if inc:
-            debut = time.time()
            self.get_inclusion_relationships()
            for se1 in self.inclusion_relationships:
                for se2 in self.inclusion_relationships[se1]:
@@ -616,7 +420,7 @@ class STR(object):
        except:
            print("Error while saving STR to {0}".format(format))

-    def getUndirected(self):
+    def get_undirected(self,simple_graph=True):
        """
        Return the Undirected form of a STR graph.

@@ -625,8 +429,9 @@ class STR(object):
        networkx.Graph
            unidirected graph
        """
-
-        return nx.Graph(self.graph)
+        if simple_graph:
+            return  nx.Graph(self.graph)
+        return nx.MultiGraph(self.graph)

    def get_geo_data_of_se(self):
        """
@@ -762,7 +567,7 @@ class STR(object):
            Matplotlib figure instance
        """

-        import matplotlib.pyplot as plt
+
        world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
        base = world.plot(color='white', edgecolor='black', figsize=(16, 9))
        points = []
@@ -804,24 +609,6 @@ class STR(object):
        plt.show()


-# def to_Multipoints(x):
-#     """
-#     Return a polygon buffered representation for a set of point
-
-#     Parameters
-#     ----------
-#     x : pandas.Series
-#         coordinates columns
-
-#     Returns
-#     -------
-#     shapely.geometry.Polygon
-#         polygon
-#     """
-
-#     #print(x[["x","y"]].values)
-#     return Polygon([Point(z) for z in x[["x","y"]].values]).buffer(1)
-
 def to_Polygon(x):
    """
    Return a polygon buffered representation for a set of points.