diff --git a/README.md b/README.md index 207a27090dcdf0dd72b93f19531217b8cadcd7de..274d6b9256260ea229a9e6a723786b9005bab18e 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,131 @@ -# STR +# `strpython` + +In [1,2], we propose a matching process based on a dedicated graph structure named Spatial Textual Representation. This structure is composed of spatial entities (places like *Paris*) and the spatial relations that connect them. This library proposes an implementation of the Spatial Textual Representation and its extensions. + +# Requirements -This repository contains all the work on STR or Spatial Textual Representation. The file -hierarchy is divided in multiple modules such as : - - * **config** which contains the configuration file and a dedicated class for loading and - interact with it - * **gmatch4py** is a module which contains implementation of various graph matching - algorithms - * **helpers** is a module which contains various helpers methods for requesting the geo database - (geodict) or collision between polygons, etc.. - * **models** contains the STR structure and its variations. - * **nlp** contains all the implementation or interface of nlp methods such as NER, POS, - Toponym disambiguation, ... + * Python 3 + * Linux or Mac OS X + * Geodict [download ES server](http://geodict.cirad.fr) + * Install `gazpy` : `sudo pip3 install git+https://github.com/Jacobe2169/gazpy.git` -## Generate STR +# Installation + +```bash +git clone <gitrepo> +cd str-python +(sudo) pip3 install . +``` + +# How-to ? + +## Using the Python API + +To generate a STR for one or many documents, you need to define the STR pipeline. To this end, instantiate a `Pipeline` object. +```python +from strpython import Pipeline, STR + +pip = Pipeline(lang="fr") +``` + +You can customize the NER you choose to use : `Spacy, Flair, Polyglot` (See `strpython.nlp.ner`). 
+ +```python +from strpython import Pipeline, STR +from strpython.nlp.ner.spacy import Spacy + +pip = Pipeline(lang="fr",ner= Spacy(lang="fr")) +``` + +You can also customize the disambiguation algorithm used in the *geocoding* step (See `strpython.nlp.disambiguator`). + +```python +from strpython import Pipeline, STR +from strpython.nlp.disambiguator.wikipedia_cooc import WikipediaDisambiguator + +dis= WikipediaDisambiguator() +pip = Pipeline(lang="fr",disambiguator=dis) +``` + +Then, to generate a STR for a document, use the `Pipeline.pipe_build([<list of text>])` + +```python +text = """EU looks to Northern Ireland-only backstop to break Brexit impasseEU trade commissioner says he believes ‘penny is finally dropping’ for Boris JohnsonDaniel BoffeyLast modified on Tue 10 Sep 2019 21.10 BST EU flagBrussels hopes Boris Johnson’s EU envoy, David Frost, will further pursue the idea at meetings later this week. Photograph: Clemens Bilan/EPAThe EU is pinning its hopes on British negotiators reverting to the Northern Ireland-only backstop previously rejected by Theresa May as a threat to the constitutional integrity of the UK.With Boris Johnson facing a choice between breaking his word and extending the UK’s membership of the EU beyond 31 October, or bringing back a tweaked deal for a last-gasp vote in parliament, officials and diplomats have expressed hope the prime minister will make a U-turn.EU sources insisted there was no other approach that could work and the negotiations were otherwise doomed to hit a “zombie stage” given the likelihood of an imminent general election.“We don’t know what mandate the prime minister has to propose something and obviously there is a strong division between the parliament and the government,” said Nathalie Loiseau, a former French minister for EU affairs.It is hoped in Brussels that Johnson’s EU envoy, David Frost, will further pursue a Northern Ireland-only backstop during meetings with the European commission’s Brexit taskforce on 
Wednesday and Friday.The newly nominated EU commissioner for trade, Phil Hogan, a former Irish minister, told the Irish Times he believed the “penny is finally dropping” in Johnson’s government over the lack of alternatives.The idea was originally rejected by May on the grounds it was unpalatable to her partners in the Democratic Unionist party, on which she relied for her working majority. At the time, she said “no British prime minister” could accept a regulatory border being drawn in the Irish Sea.No 10 insisted on Tuesday that Johnson was not pursuing the idea again in the hope of winning the support of more hardline Eurosceptics. “We are not seeking a Northern Ireland-only backstop,” a No 10 spokesman said.However, Arlene Foster, the DUP leader, was sufficiently alarmed to demand a private meeting with Johnson in Downing Street on Tuesday evening, which lasted an hour.Following the meeting, Foster said: “The prime minister rejected a Northern Ireland-only backstop in a letter to Donald Tusk on 19 August. It is undemocratic and unconstitutional and would place a tariff border between Northern Ireland and the rest of the United Kingdom. That would be unacceptable.“During today’s meeting, the prime minister confirmed his rejection of the Northern Ireland-only backstop and his commitment to securing a deal which works for the entire United Kingdom as well as our neighbours in the Republic of Ireland.”Johnson has not been specific about how he will get a new deal with Brussels, but before his meeting with Foster, he said “there is a way” to achieve one “but it will take a lot of hard work”, as he fought back against accusations that his five-week prorogation of parliament is anti-democratic. 
“Donnez-moi un break – what a load of nonsense,” he said, switching to Franglais.The prime minister has said he wants to remove the Irish backstop from the withdrawal agreement as it would tie Northern Ireland into the single market and the whole of the UK into a shared customs territory with the EU. He has described the arrangement as “undemocratic” and railed against signing a treaty that he says would be “inconsistent with the sovereignty of the UK”.But his proposal in recent days of a single all-Ireland agrifood zone has offered some hope in Brussels that the government may return to the initial EU suggestion of an arrangement that solely keeps Northern Ireland within the EU’s structures.Hogan said Johnson, who visited the Irish prime minister, Leo Varadkar, in Dublin on Monday, had offered some grounds for optimism in his recent talks.“Mr Johnson has made a proposal in the last few days talking about an all-Ireland food zone. That is certainly a clear indication of divergence between Northern Ireland and the Republic of Ireland/the EU and the rest of the UK,” he said.“This is the first time that this has been spoken about by a British prime minister where they are prepared to accept some level of divergence between Northern Ireland and the rest of the UK. If we can build on that, we certainly might get closer to one another in terms of a possible outcome.”Hogan warned, however, that the single agrifood zone was some distance from a solution to the Brexit impasse. “It would have to include all goods … in terms of any agreement,” he said.“I remain hopeful that the penny is finally dropping with the UK that there are pragmatic and practical solutions that can actually be introduced into the debate at this stage, albeit at the 11th hour, that may find some common ground between the EU and the UK. 
The taoiseach has indicated in the last 24 hours that the Northern Ireland-only backstop is quite an interesting idea to revisit.”Fabian Zuleeg, the chief executive of the European Policy Centre thinktank in Brussels, said the only point of the talks in Brussels would be to discuss an extension of article 50 beyond 31 October or the detail of a Northern Ireland-only arrangement.“But in reality I don’t believe that the UK government wants to go down this route,” he said. “So at the moment I don’t see anything of substance that is being discussed because nothing else can be opened.”After his nomination on Tuesday by the European commission’s president-designate, Ursula von der Leyen, Hogan is to take over any trade talks with the UK once the country leaves the bloc, with the former deputy chief EU negotiator Sabine Weyand as his director general.Hogan said the establishment of a new negotiating team “will take probably six to eight months once we know what the outcome of the present negotiations are … Then I expect it will take a number of years before we conclude the negotiations.”More people in France…... like you, are reading and supporting The Guardian’s independent, investigative journalism than ever before. And unlike many new organisations, we have chosen an approach that allows us to keep our journalism accessible to all, regardless of where they live or what they can afford. But we need your ongoing support to keep working as we do.The Guardian will engage with the most critical issues of our time – from the escalating climate catastrophe to widespread inequality to the influence of big tech on our lives. At a time when factual information is a necessity, we believe that each of us, around the world, deserves access to accurate reporting with integrity at its heart.Our editorial independence means we set our own agenda and voice our own opinions. 
Guardian journalism is free from commercial and political bias and not influenced by billionaire owners or shareholders. This means we can give a voice to those less heard, explore where others turn away, and rigorously challenge those in power.We need your support to keep delivering quality journalism, to maintain our openness and to protect our precious independence. Every reader contribution, big or small, is so valuable. Support The Guardian from as little as €1 – and it only takes a minute. Thank you.""" +list_strs = pip.pipe_build([text]) + +list_strs[0] +# Out[24]: +# STR +# Spatial Entities : {'GD2589931': 'United Kingdom', 'GD3978256': 'Brussels', 'GD2806921': 'Ireland', 'GD4465124': 'Dublin', 'GD3117352': 'France', 'GD5639369': 'Northern Ireland'} +# Verbose : False + +``` + +### STR Transformation + +Multiple transformations have been proposed : spatial-based and thematic-based + +#### Spatial Based +```python +from strpython.models.transformation.transform import Generalisation, Expansion + +# Region limited, Generalisation +gen_r = Generalisation().transform(str_, type_trans="gen", type_gen="bounded", bound="region") +# Country limited, Generalisation +gen_c = Generalisation().transform(str_, type_trans="gen", type_gen="bounded", bound="country") + +# Extension (n=1) +# Add n=1 entities found in a radius of 50 km around each entity extended +ext_1 = Expansion().transform(str_, type_trans="ext", adjacent_count=1, distance="50") +# Extension (n=2) +# Add n=2 entities found in a radius of 50 km around each entity extended +ext_2 = Expansion().transform(str_, type_trans="ext", adjacent_count=2, distance="50") +``` + +#### Thematic-Based + +```python +from strpython.models.thematic_str import ThematicSTR +from strpython.helpers.terminology.matcher import TerminologyMatcher +from strpython.models.transformation.thematic import * + +# Thematic entities matched +term_matcher = TerminologyMatcher("EU envoy government backstop".split()) + +# Build a Thematic STR, 
first we initialise +t_str = ThematicSTR.from_STR(str_) + +# Integrate the thematic +t_str.setup(text,term_matcher,"fr") +t_str.build() + +t_str +#Out[2]: +#STR +# Spatial Entities : {'GD2589931': 'United Kingdom', 'GD3978256': 'Brussels', 'GD2806921': 'Ireland', 'GD4465124': 'Dublin', 'GD3117352': 'France', 'GD5639369': 'Northern Ireland'} +# Verbose : False +#Thematic : {0: 'EU', 2: 'government'} + +# Apply thematic to generalized and extended versions of the STR +gen_c_t= get_generalized_with_thematic(gen_c,t_str) +gen_r_t= get_generalized_with_thematic(gen_r,t_str) + +ext_1_t = get_extended_with_thematic(ext_1,t_str) +ext_2_t = get_extended_with_thematic(ext_2,t_str) +``` + +### Plot a STR + +To visualize the STR graph, four methods are implemented : interactive map (using `folium`), static map (using `geopandas`), a network visualization (using `networkx` draw methods) and a LaTeX TikZ output (using `tikz-network` Python API). + +| **Output type** | **STR class method** | +|---------------------|-----------------------------| +| *interactive map* | `STR.to_folium()` | +| *static map* | `STR.map_projection()` | +| *network layout* | `STR.plot()` | +| *LaTeX (TikZ)* | `STR.to_latex()` | + +## Using the command line To generate STR, use the `generate_str.py`. 
@@ -40,3 +152,11 @@ optional arguments: Transformation to apply -o OUTPUT, --output OUTPUT Output Filename + +``` + +# Bibliography + +[1] Jacques Fize, Mathieu Roche, Maguelonne Teisseire *Matching heterogeneous textual data using spatial features* **Intelligent Data Analysis Journal** + +[2] Jacques Fize, Mathieu Roche, Maguelonne Teisseire *Matching heterogeneous textual data using spatial features* **13th International Workshop on Spatial and Spatiotemporal Data Mining (SSTDM-18)** \ No newline at end of file diff --git a/strpython/helpers/terminology/terminology_matcher.py b/strpython/helpers/terminology/terminology_matcher.py index a79dfacfbcb471170a29ce195ee81f93a839c16f..4699ce9aa51c6b86562a0dbf7791eb22b306a901 100644 --- a/strpython/helpers/terminology/terminology_matcher.py +++ b/strpython/helpers/terminology/terminology_matcher.py @@ -13,6 +13,7 @@ from textblob_fr import PatternTagger, PatternAnalyzer from mytoolbox.text.match import match_sequence,match_sequences + def get_lemmatizer(lang): i = importlib.import_module("spacy.lang.{0}.lemmatizer".format(lang)) return i.LOOKUP diff --git a/strpython/models/spatial_relation.py b/strpython/models/spatial_relation.py index f1df8b17bf6629bd8a007b2a1cd8112842beca2e..c01f7979a3c89cbeb3525e08d2685064a78c8c1e 100644 --- a/strpython/models/spatial_relation.py +++ b/strpython/models/spatial_relation.py @@ -307,7 +307,7 @@ class InclusionMetaRelation(MetaCollector): def get_inclusion_chain(self, id_, prop): """ - For an entity return it geographical inclusion tree using a property. + For an entity return its geographical inclusion tree using a property. 
Parameters ---------- diff --git a/strpython/models/str.py b/strpython/models/str.py index e091a886ac70372876d0482bfa3fab8742d83919..8c4ed87be4a8954c84d943c894f56ef15aad69f7 100644 --- a/strpython/models/str.py +++ b/strpython/models/str.py @@ -53,7 +53,7 @@ class STR(object): """ __cache_entity_data = {} #  Store data about entity requested - def __init__(self, spatial_entities, toponym_first=True, verbose=False): + def __init__(self, spatial_entities, toponym_first=True, verbose=False,already_built=False): """ Constructor @@ -73,7 +73,8 @@ class STR(object): if not k[:2] == "GD": del spatial_entities[k] - self.load_spatial_relations() + if not already_built: + self.load_spatial_relations() # Initialize Attributes self.adjacency_relationships = {} @@ -123,14 +124,13 @@ class STR(object): sp_en = {} for nod in g: try: - sp_en[nod] = g.nodes[nod]["label"] + sp_en[nod] = g.node[nod]["label"] except KeyError: # If no label found, grab one from the geo-database data = gazetteer.get_by_id(nod) if data: sp_en[nod] = data[0].label - str_ = STR(sp_en,toponym_first=False) - str_.load_spatial_relations() + str_ = STR(sp_en,toponym_first=False,already_built=True) str_.set_graph(g) return str_ @@ -624,7 +624,7 @@ class STR(object): return map - def map_projection(self, plt_=False): + def map_projection(self, plt_=False,figsize=(16, 9)): """ Return a matplotlib figure of the STR @@ -641,7 +641,8 @@ class STR(object): import matplotlib.pyplot as plt world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')) - base = world.plot(color='white', edgecolor='black', figsize=(16, 9)) + world.to_crs(epsg=3857) + base = world.plot(color='white',edgecolor='black', figsize=figsize) points = [] for se in self.spatial_entities: data = gazetteer.get_by_id(se)[0] @@ -720,7 +721,7 @@ class STR(object): else: return plt.gca() - def to_latex(self,to_clipboard=False): + def to_latex(self,to_clipboard=False,geo_layout=False): def get_color(x): if x == "S_E": @@ -732,10 +733,20 @@ class 
STR(object): G =self.graph.copy() fn = tempfile.NamedTemporaryFile().name + + pos = graphviz_layout(G) + + if geo_layout: + pos={} + for se in self.spatial_entities: + data = gazetteer.get_by_id(se)[0] + pos[se] = (data.coord.lon, data.coord.lat) + + plot(G, filename=fn, type="tex", - layout=graphviz_layout(G), + layout=pos, vertex_label=dict(G.nodes(data="label")), vertex_color = {k:get_color(v) for k,v in dict(G.nodes(data="type")).items()}, edge_color=[ed[-1] for ed in list(G.edges(data="color"))], diff --git a/strpython/models/thematic_str.py b/strpython/models/thematic_str.py index 3929e31a9a9132833a2fd0951632582100ebc480..ced35c0a1136e413c71f96c0af60af856c4fb510 100644 --- a/strpython/models/thematic_str.py +++ b/strpython/models/thematic_str.py @@ -2,15 +2,16 @@ import re import pandas as pd +import networkx as nx from strpython.models.str import STR -from ..helpers.th_se_matcher import ThematicSpatialEntitiesMatcher +from strpython.helpers.th_se_matcher import ThematicSpatialEntitiesMatcher class ThematicSTR(STR): - def __init__(self, text, spatial_entities, vocabulary_matcher=None, lang=None): - STR.__init__(self, spatial_entities,toponym_first=False) + def __init__(self, text, spatial_entities, vocabulary_matcher=None, lang=None,already_built=False): + STR.__init__(self, spatial_entities,toponym_first=False, already_built = already_built) self.text = text if vocabulary_matcher and lang: self.matcher = ThematicSpatialEntitiesMatcher(vocabulary_matcher, lang) @@ -21,6 +22,19 @@ class ThematicSTR(STR): def __repr__(self): return STR.__repr__(self) + "\nThematic : {0}".format(self.thematic_entities) + @staticmethod + def from_networkx_graph(g: nx.Graph, tagged_: list = []): + spatial_entities = {k:v for k,v in dict(g.nodes(data="label")).items() if k.startswith("GD")} + STR_ = ThematicSTR("", spatial_entities,already_built=True) + for src, tar, att in g.edges(data=True): + if ("type_" in att and att["type_"] == "them") or ("color" in att and att["color"] == 
"blue"): + STR_.add_thematic_relationships(src,tar) + STR_.thematic_entities[tar]=g.node[tar]["label"] + STR_.graph = g + return STR_ + + + @staticmethod def from_STR(str_): diff --git a/strpython/models/transformation/thematic.py b/strpython/models/transformation/thematic.py index a7df8dbcd0ae7d81987843090870aeac8009f9e9..e689e534ffc5f8e95e21f8b42e0bcf0cc2a0cbae 100644 --- a/strpython/models/transformation/thematic.py +++ b/strpython/models/transformation/thematic.py @@ -11,7 +11,10 @@ def get_generalized_with_thematic( generalised_str: STR, str_thematic :ThematicS thematic_rel = str_thematic.thematic_relationships - new_gen = ThematicSTR.from_STR(generalised_str) + new_gen = ThematicSTR.from_networkx_graph(generalised_str.graph) + + relation_cache.update(new_gen.adjacency_relationships, new_gen.inclusion_relationships) + relation_cache.update(str_thematic.adjacency_relationships, str_thematic.inclusion_relationships) linked_es = set(list(thematic_rel.keys())) #  Spatial entities linked to Thematic Unit all_es = list(set(generalised_str.spatial_entities.keys()) | set(str_thematic.spatial_entities.keys())) @@ -46,7 +49,7 @@ def get_generalized_with_thematic( generalised_str: STR, str_thematic :ThematicS def get_extended_with_thematic(extended_str, thematic_str): - new_ext = ThematicSTR.from_STR(extended_str) + new_ext = ThematicSTR.from_networkx_graph(extended_str.graph) thematic_rel = thematic_str.thematic_relationships for es in thematic_rel: for them in thematic_rel[es]: diff --git a/strpython/pipeline.py b/strpython/pipeline.py index b52e1320f8089e41699be4970ce6f36a07bb2d2b..ba837461b48943f51649dc7e3b2e5405d98b8314 100644 --- a/strpython/pipeline.py +++ b/strpython/pipeline.py @@ -145,7 +145,9 @@ class Pipeline(object): str_s = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.build)(ext[1]) for ext in tqdm(text_and_spatial_entities, desc="Build STR", disable=(not self.verbose))) return str_s - def build(self, spatial_entities_identified): + def 
build(self, spatial_entities_identified,extract_rel=False): + if extract_rel: + self.extract_all_relation([es_id for es_id in spatial_entities_identified]) str_ = STR(spatial_entities_identified, toponym_first=True) str_.build() return str_