From 755998a619c54b211dd2a38e0976f9448f1ca8a0 Mon Sep 17 00:00:00 2001 From: Fize Jacques <jacques.fize@cirad.fr> Date: Tue, 12 Mar 2019 17:37:13 +0100 Subject: [PATCH] Debug, STR modif for faster generation, debug disambiguators, update pipeline,debug document selection --- depreciated/generate_data.py | 2 +- depreciated/generate_data_csv.py | 2 +- generate_annotation_file.py | 2 +- generate_str.py | 2 +- strpython/eval/disambiguation.py | 2 +- strpython/models/str.py | 681 +++++++++++++----- strpython/nlp/disambiguator/__init__.py | 7 +- strpython/nlp/disambiguator/disambiguator.py | 107 +-- strpython/nlp/disambiguator/most_common.py | 22 +- strpython/nlp/disambiguator/share_prop.py | 174 +++++ strpython/nlp/disambiguator/wikipedia_cooc.py | 76 +- strpython/nlp/disambiguator_old/__init__.py | 1 + .../nlp/disambiguator_old/disambiguator.py | 62 ++ .../geodict_gaurav.py | 0 .../nlp/disambiguator_old/models/__init__.py | 1 + .../nlp/disambiguator_old/models/bigram.py | 46 ++ .../nlp/disambiguator_old/most_common.py | 71 ++ .../nlp/disambiguator_old/wikipedia_cooc.py | 110 +++ strpython/nlp/ner/__init__.py | 5 + strpython/nlp/ner/ner.py | 40 +- strpython/nlp/ner/nltk.py | 21 +- strpython/nlp/ner/polyglot.py | 31 +- strpython/nlp/ner/spacy.py | 46 +- strpython/nlp/ner/stanford_ner.py | 75 +- strpython/pipeline.py | 68 +- 25 files changed, 1185 insertions(+), 469 deletions(-) create mode 100644 strpython/nlp/disambiguator/share_prop.py create mode 100644 strpython/nlp/disambiguator_old/__init__.py create mode 100644 strpython/nlp/disambiguator_old/disambiguator.py rename strpython/nlp/{disambiguator => disambiguator_old}/geodict_gaurav.py (100%) create mode 100644 strpython/nlp/disambiguator_old/models/__init__.py create mode 100644 strpython/nlp/disambiguator_old/models/bigram.py create mode 100644 strpython/nlp/disambiguator_old/most_common.py create mode 100644 strpython/nlp/disambiguator_old/wikipedia_cooc.py diff --git a/depreciated/generate_data.py b/depreciated/generate_data.py index a1147eb..ac2fe26 100644 --- a/depreciated/generate_data.py +++ b/depreciated/generate_data.py @@ -5,7 +5,7 @@ from concurrent.futures import ThreadPoolExecutor from langdetect import detect from progressbar import ProgressBar, Timer, Bar, ETA, Counter -from strpython.nlp.disambiguator.geodict_gaurav import * +from strpython.nlp.disambiguator.share_prop import * from strpython.pipeline import * import networkx as nx diff --git a/depreciated/generate_data_csv.py b/depreciated/generate_data_csv.py index dfcc9ee..41a5099 100644 --- a/depreciated/generate_data_csv.py +++ b/depreciated/generate_data_csv.py @@ -6,7 +6,7 @@ import argparse,glob, string,time,re from progressbar import ProgressBar, Timer, Bar, ETA, Counter from strpython.models.str import STR -from strpython.nlp.disambiguator.geodict_gaurav import * +from strpython.nlp.disambiguator.share_prop import * from strpython.pipeline import * import pandas as pd import networkx as nx diff --git a/generate_annotation_file.py b/generate_annotation_file.py index d18abc9..b4b1501 100644 --- a/generate_annotation_file.py +++ b/generate_annotation_file.py @@ -33,7 +33,7 @@ selected = json.load(open(args.selectedFile)) for fn in matrix_fns: measure = os.path.basename(fn).split("_")[0] - type_= "_".join(fn.split("_")[1:]).replace(".npy.bz2","") + type_= "_".join(os.path.basename(fn).split("_")[1:]).replace(".npy.bz2","") print("Proceeding...",measure, type_) df = matrix_to_pandas_dataframe(np.nan_to_num(read_bz2_matrix(fn)), selected, diff --git a/generate_str.py 
b/generate_str.py index 3e08e1f..3e04cbd 100644 --- a/generate_str.py +++ b/generate_str.py @@ -21,7 +21,7 @@ from strpython.nlp.ner.polyglot import Polyglot as poly_ner from strpython.nlp.ner.stanford_ner import StanfordNER as stanford_ner from strpython.nlp.disambiguator.wikipedia_cooc import WikipediaDisambiguator as wiki_d -from strpython.nlp.disambiguator.geodict_gaurav import GauravGeodict as shared_geo_d +from strpython.nlp.disambiguator.share_prop import ShareProp as shared_geo_d from strpython.nlp.disambiguator.most_common import MostCommonDisambiguator as most_common_d from mytoolbox.text.clean import * diff --git a/strpython/eval/disambiguation.py b/strpython/eval/disambiguation.py index eeed6c5..960a53d 100644 --- a/strpython/eval/disambiguation.py +++ b/strpython/eval/disambiguation.py @@ -1,7 +1,7 @@ # coding = utf-8 from shapely.geometry import Point -from ..nlp.disambiguator.geodict_gaurav import GauravGeodict +from ..nlp.disambiguator.share_prop import GauravGeodict from ..nlp.disambiguator.most_common import MostCommonDisambiguator from ..nlp.disambiguator.wikipedia_cooc import WikipediaDisambiguator diff --git a/strpython/models/str.py b/strpython/models/str.py index 6bfbed9..2d673ba 100644 --- a/strpython/models/str.py +++ b/strpython/models/str.py @@ -5,11 +5,12 @@ import os import time import warnings +from tqdm import tqdm import folium import geopandas as gpd import networkx as nx import pandas as pd -from shapely.geometry import MultiPoint,Polygon,Point,LineString +from shapely.geometry import MultiPoint, Polygon, Point, LineString from ..helpers.collision import collisionTwoSEBoundaries, add_cache_adjacency from ..helpers.geodict_helpers import gazetteer @@ -18,6 +19,7 @@ from ..eval.stats import most_common from sklearn.cluster import MeanShift, estimate_bandwidth, KMeans, dbscan import numpy as np + # logging.basicConfig(filename=config.log_file,level=logging.INFO) @@ -40,10 +42,31 @@ class STR(object): """ Str basic structure """ - __cache_inclusion = {} - def __init__(self, tagged_text, spatial_entities): + __cache_inclusion = {} # Store inclusion relations found between spaital entities + __cache_adjacency = {} # Store adjacency relations found between spaital entities + __cache_entity_data = {} #  Store data about entity requested + + def __init__(self, tagged_text, spatial_entities,toponym_first=True): + """ + Constructir + + Parameters + ---------- + tagged_text : list + Text in forms of token associated with tag (2D array 2*t where t == |tokens| ) + spatial_entities : dict + spatial entities associated with a text. Follow this structure {"<id>: <label>"} + + """ + self.tagged_text = tagged_text self.spatial_entities = spatial_entities + if toponym_first: + self.spatial_entities= {id_:topo for topo,id_ in self.spatial_entities.items()} + + for k in list(spatial_entities.keys()): + if not k[:2] == "GD": + del spatial_entities[k] self.adjacency_relationships = {} self.inclusion_relationships = {} @@ -51,11 +74,21 @@ class STR(object): @staticmethod def from_networkx_graph(g: nx.Graph, tagged_: list = []): """ - Return a STR built from a Networkx imported graph - :param g: - :param tagged_: - :return: + Build a STR based on networkx graph + + Parameters + ---------- + g : nx.Graph + input graph + tagged_ : list, optional + tagged text (the default is []). A 2D array 2*t where t == |tokens|. 
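# Illustrative sketch (invented example, not from the commit) of the `tagged_text`
# structure described above: a 2D array with one [token, tag] row per token.
# The sentence, the POS tags and the "PLACE" value are assumptions; the real
# place tag comes from NER._unified_tag["place"].
tagged_text_example = [
    ["Floods", "NNS"],
    ["hit", "VBD"],
    ["Montpellier", "PLACE"],
    ["and", "CC"],
    ["Nimes", "PLACE"],
]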
+ + Returns + ------- + STR + resulting STR """ + sp_en = {} for nod in g: try: @@ -63,44 +96,96 @@ class STR(object): except KeyError: # If no label found, grab one from the geo-database data = gazetteer.get_by_id(nod) if data: - sp_en[nod] = data[0].label + sp_en[nod] = data[0].name - str_ = STR(tagged_, sp_en) + str_ = STR(tagged_, sp_en,toponym_first=False) str_.set_graph(g) return str_ @staticmethod def from_dict(spat_ent: dict, tagged_: list = []): """ - Return a STR built from a Networkx imported graph - :param g: - :param tagged_: - :return: + Build a STR based on networkx graph + + Parameters + ---------- + spat_ent : dict + Dict of patial entities associated with a text. Follow this structure {"<id>: <label>"} + tagged_ : list, optional + tagged text (the default is []). A 2D array 2*t where t == |tokens|. + + Returns + ------- + STR + resulting STR """ sp_en = {} for id_, label in spat_ent.items(): sp_en[id_] = label - str_ = STR(tagged_, sp_en) + str_ = STR(tagged_, sp_en,toponym_first=False) str_.build() return str_ @staticmethod def from_pandas(dataf: pd.DataFrame, tagged: list = []): + """ + Build a STR from a Pandas Dataframe with two column : id and label. + + Parameters + ---------- + dataf : pd.DataFrame + dataframe containing the spatial entities + tagged : list, optional + tagged text (the default is []). A 2D array 2*t where t == |tokens|. + + Returns + ------- + STR + resulting STR + """ + return STR.from_dict(pd.Series(dataf.label.values, index=dataf.id).to_dict(), tagged) + def set_graph(self, g): + """ + Apply changes to the current STR based on Networkx Graph. + + Parameters + ---------- + g : networkx.Graph + input graph + + """ + + self.graph = g + rel_ = self.graph.edges(data=True) + for edge in rel_: + id1, id2 = edge[0], edge[1] + if edge[2]["color"] == "green": + self.add_adjacency_rel(edge[0], edge[1]) + self.add_cache__adjacency(id1, id2, True) + elif edge[2]["color"] == "red": + self.add_inclusion_rel(edge[0], edge[1]) + self.add_cache_inclusion(id1, id2, True) + def add_spatial_entity(self, id, label=None, v=True): """ - Adding a spatial entity to the current STR - :param id: - :param label: - :return: + Add a spatial entity to the current STR + + Parameters + ---------- + id : str + identifier of the spatial entity in Geodict + label : str, optional + if not available in Geodict (the default is None) + """ - data_ = gazetteer.get_by_id(id) + data_ = self.get_data(id) if not data_: warnings.warn("{0} wasn't found in Geo-Database".format(id)) return False - data_=data_[0] + data_ = data_[0] if not label and v == True: warnings.warn("Label empty. @en label from Geo-Database will be used.") label = data_["en"] @@ -110,9 +195,14 @@ class STR(object): def add_spatial_entities(self, ids: list, labels: list = []): """ Add spatial entities to the current STR - :param ids: - :param label: - :return: + + Parameters + ---------- + ids : list + list of identifiers of each spatial entity + labels : list, optional + list of labels of each spatial entity + """ if not labels: warnings.warn("Labels list is empty. @en labels from Geo-Database will be used by default") @@ -125,27 +215,120 @@ class STR(object): self.add_spatial_entity(id, label, False) # print(self.graph.nodes(data=True)) - def add_adjacency_rel(self, se1, se2,v=True): - if not se1 in self.adjacency_relationships: - self.adjacency_relationships[se1] = {} - self.adjacency_relationships[se1][se2]=v + def add_adjacency_rel(self, se1, se2): + """ + Add a adjacency relationship to the current STR. 
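# Illustrative usage sketch for the factory methods above (assumptions: the
# "GD..." identifiers and labels are invented, and a Geodict gazetteer backend
# must be configured for build() to resolve entities).
from strpython.models.str import STR

entities = {"GD123456": "Montpellier", "GD654321": "Occitanie"}
str_ = STR.from_dict(entities)          # builds the graph via STR.build()
print(str_.graph.nodes(data=True))      # nodes carry a "label" attribute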
+ + Parameters + ---------- + se1 : str + Identifier of the first spatial entity + se2 : str + Identifier of the second spatial entity + + """ - def add_inclusion_rel(self, se1, se2,v=True): + if not se1 in self.adjacency_relationships: self.adjacency_relationships[se1] = {} + if not se2 in self.adjacency_relationships: self.adjacency_relationships[se2] = {} + self.adjacency_relationships[se1][se2], self.adjacency_relationships[se2][se1] = True, True + self.add_cache__adjacency(se1, se2, True) + + def add_inclusion_rel(self, se1, se2): + """ + Add a inclusion relationship to the current STR. + + Parameters + ---------- + se1 : str + Identifier of the first spatial entity + se2 : str + Identifier of the second spatial entity + + """ if not se1 in self.inclusion_relationships: self.inclusion_relationships[se1] = {} - self.inclusion_relationships[se1][se2]=v + self.inclusion_relationships[se1][se2] = True + self.add_cache_inclusion(se1, se2, True) + + def add_cache_inclusion(self, id1, id2, v=True): + """ + Add a relation of inclusion in a cache variable + + Parameters + ---------- + id1 : str + id of the first spatial entity + id2 : str + id of the second spatial entity + v : bool, optional + if the relation exists between the two spatial entities. Default is True + + """ + + if not id1 in STR.__cache_inclusion: + STR.__cache_inclusion[id1] = {} + STR.__cache_inclusion[id1][id2] = v + + def add_cache__adjacency(self, se1, se2, v=True): + """ + Add a relation of adjacency in a cache variable + + Parameters + ---------- + id1 : str + id of the first spatial entity + id2 : str + id of the second spatial entity + v : bool, optional + if the relation exists between the two spatial entities. Default is True + + """ + if not se1 in STR.__cache_adjacency: + STR.__cache_adjacency[se1] = {} + if not se2 in STR.__cache_adjacency: + STR.__cache_adjacency[se2] = {} + STR.__cache_adjacency[se1][se2] = v + STR.__cache_adjacency[se2][se1] = v + + def get_data(self, id_se): + """ + Return an gazpy.Element object containing information about a spatial entity. + + Parameters + ---------- + id_se : str + Identifier of the spatial entity - def transform_spatial_entities(self, transform_map): + Returns + ------- + gazpy.Element + data """ - Apply transformation to a STR - :param transform_map: - :return: + + if id_se in STR.__cache_entity_data: + return STR.__cache_entity_data[id_se] + data = gazetteer.get_by_id(id_se) + if len(data) > 0: + STR.__cache_entity_data[id_se] = data[0] + + def transform_spatial_entities(self, transform_map: dict): """ + Replace or delete certain spatial entities based on a transformation map + + Parameters + ---------- + transform_map : dict + New mapping for the spatial entities in the current STR. 
Format required : {"<id of the old spatial entity>":"<id of the new spatial entity>"} + + """ + final_transform_map = {} # Erase old spatial entities new_label = {} + to_del = set([]) for old_se, new_se in transform_map.items(): - data = gazetteer.get_by_id(new_se) + data = self.get_data(new_se) + to_del.add(old_se) if data: data = data[0] final_transform_map[old_se] = new_se @@ -153,78 +336,186 @@ class STR(object): self.add_spatial_entity(new_se, data.label.en) del self.spatial_entities[old_se] + new_label[new_se] = data.label.en else: warnings.warn("{0} doesn't exists in the geo database!".format(new_se)) + self.graph = nx.relabel_nodes(self.graph, final_transform_map) + + for es in to_del: + if es in self.graph._node: + self.graph.remove_node(es) + for se_ in new_label: self.graph.nodes[se_]["label"] = new_label[se_] def update(self): """ - Method for updating links between spatial entities - :return: + Update the relationship between spatial entities in the STR. Used when transforming the STR. """ + nodes = copy.deepcopy(self.graph.nodes(data=True)) self.graph.clear() self.graph.add_nodes_from(nodes) - print("inclusion") self.get_inclusion_relationships() for se1 in self.inclusion_relationships: for se2 in self.inclusion_relationships[se1]: + if not se1 in self.graph.nodes or not se2 in self.graph.nodes: + continue if self.inclusion_relationships[se1][se2]: self.graph.add_edge(se1, se2, key=0, color="red") - print("adjacency") self.get_adjacency_relationships() for se1 in self.adjacency_relationships: for se2 in self.adjacency_relationships[se1]: + if not se1 in self.graph.nodes or not se2 in self.graph.nodes: + continue if self.adjacency_relationships[se1][se2]: self.graph.add_edge(se1, se2, key=0, color="green") - print("fin adj") - - - - def add_cache_inclusion(self,id1, id2): - if not id1 in STR.__cache_inclusion: - STR.__cache_inclusion[id1] = set([]) - STR.__cache_inclusion[id1].add(id2) def is_included_in(self, se1_id, se2_id): - global __cache_inclusion """ - Return true if the two spatial entities identified by @se1_id and @se2_id share an inclusion relationship - :param se1_id: - :param se2_id: - :return: + Return True if a spatial entity is included within another one. + + Parameters + ---------- + se1_id : str + id of the contained entity + se2_id : str + id of the entity container + + Returns + ------- + bool + if se1 included in se2 """ + if se1_id in self.inclusion_relationships: if se2_id in self.inclusion_relationships[se1_id]: return self.inclusion_relationships[se1_id][se2_id] - if se1_id in STR.__cache_inclusion: - if se2_id in STR.__cache_inclusion[se1_id]: - return True - inc_chain_P131 = get_inclusion_chain(se1_id, "P131") inc_chain_P706 = get_inclusion_chain(se1_id, "P706") inc_chain = inc_chain_P131 inc_chain.extend(inc_chain_P706) inc_chain = set(inc_chain) if se2_id in inc_chain: - self.add_cache_inclusion(se1_id,se2_id) + self.add_cache_inclusion(se1_id, se2_id, True) + return True + + return False + + def is_adjacent_cache(self, se1, se2): + """ + Return true if two spatial entities were found adjacent previously. 
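# Illustrative sketch of the class-level relation caches used here (assumptions:
# an STR built from empty inputs is only a convenient way to reach the cache
# helpers, and "GD1"/"GD2" are invented identifiers). Adjacency is cached
# symmetrically, so a relation recorded once is visible from both entities.
from strpython.models.str import STR

s = STR([], {}, toponym_first=False)
s.add_cache__adjacency("GD1", "GD2", True)
assert s.is_adjacent_cache("GD1", "GD2")
assert s.is_adjacent_cache("GD2", "GD1")   # stored in both directions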
+ + Parameters + ---------- + se1 : str + id of the first spatial entity + se2 : str + id of the second spatial entity + + Returns + ------- + bool + if se1 adjacent to se2 + """ + + if se1 in STR.__cache_adjacency: + if se2 in STR.__cache_adjacency[se1]: + return STR.__cache_adjacency[se1][se2] + if se2 in STR.__cache_adjacency: + if se1 in STR.__cache_adjacency[se2]: + return STR.__cache_adjacency[se2][se1] + return False + + def is_included_cache(self, se1, se2): + """ + Return true if a spatial entity were found included previously in an other one. + + Parameters + ---------- + se1 : str + id of the first spatial entity + se2 : str + id of the second spatial entity + + Returns + ------- + bool + if se1 included to se2 + """ + if se1 in STR.__cache_inclusion: + if se2 in STR.__cache_inclusion[se1]: + return STR.__cache_inclusion[se1][se2] + return False + + def is_adjacent(self, se1, se2, datase1=None, datase2=None): + """ + Return true if se1 is adjacent to se2. + + Parameters + ---------- + se1 : str + id of the first spatial entity + se2 : str + id of the second spatial entity + datase1 : gazpy.Element, optional + if given cached data concerning the spatial entity with id = se1 (the default is None) + datase2 : gazpy.Element, optional + if given cached data concerning the spatial entity with id = se2 (the default is None) + + Returns + ------- + bool + true if adjacent + """ + + stop_class = set(["A-PCLI", "A-ADM1"]) + + def get_p47_adjacency_data(data): + p47se1 = [] + for el in data.other.P47: + d = gazetteer.get_by_other_id(el, "wikidata") + if not d: continue + p47se1.append(d[0].id) + return p47se1 + + if self.is_adjacent_cache(se1, se2): + return False + + if self.is_included_in(se1, se2) or self.is_included_in(se2, se1): + return False + + data_se1, data_se2 = self.get_data(se1), self.get_data(se2) + + if "P47" in data_se2 and se1 in get_p47_adjacency_data(data_se2): + return True + # print("P47") + elif "P47" in data_se1 and se2 in get_p47_adjacency_data(data_se1): + return True + # print("P47") + + if collisionTwoSEBoundaries(se1, se2): return True + if "coord" in data_se1 and "coord" in data_se2: + if Point(data_se1.coord.lon, data_se1.coord.lat).distance( + Point(data_se2.coord.lon, data_se2.coord.lat)) < 1 and len( + set(data_se1.class_) & stop_class) < 1 and len(set(data_se2.class_) & stop_class) < 1: + return True return False def get_inclusion_relationships(self): """ - Return all the inclusion relationships between all the spatial entities in the STR. - :return: + Find all the inclusion relationships between the spatial entities declared in the current STR. 
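# Illustrative numeric check of the final fallback in is_adjacent() above
# (assumption: the coordinates are invented). Two entities whose points lie
# less than one degree apart are treated as adjacent, unless one of them
# belongs to the excluded classes A-PCLI (country) or A-ADM1 (first-level
# division).
from shapely.geometry import Point

montpellier, nimes = Point(3.87, 43.61), Point(4.36, 43.83)
assert montpellier.distance(nimes) < 1     # ~0.54 degrees apart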
+ """ - inclusions_ = [] - for se_ in self.spatial_entities: + + for se_ in tqdm(self.spatial_entities, desc="Extract Inclusion"): inc_chain_P131 = get_inclusion_chain(se_, "P131") inc_chain_P706 = get_inclusion_chain(se_, "P706") @@ -234,62 +525,19 @@ class STR(object): for se2_ in self.spatial_entities: if se2_ in inc_chain: - self.add_inclusion_rel(se_,se2_) - return inclusions_ - - def getP47AdjacencyData(self, data): - p47se1 = [] - for el in data.other.P47: - d = gazetteer.get_by_other_id(el,"wikidata") - if not d:continue - p47se1.append(d[0].id) - return p47se1 - - def is_adjacent(self,se1,se2,datase1=None,datase2=None): - f = False - stop_class = set(["A-PCLI", "A-ADM1"]) - if self.is_included_in(se1, se2): - return f - - elif self.is_included_in(se2, se1): - return f - - data_se1 = gazetteer.get_by_id(se1)[0] if not datase1 else datase1 # Évite de recharger à chaque fois -_- - data_se2 = gazetteer.get_by_id(se2)[0] if not datase2 else datase2 - - # print("testP47") - if "P47" in data_se2.other: - if se1 in self.getP47AdjacencyData(data_se2): - return True - # print("P47") - if not f: - if "P47" in data_se1.other: - if se2 in self.getP47AdjacencyData(data_se1): - return True - # print("P47") - if not f: - # print("test collision") - if collisionTwoSEBoundaries(se1, se2): - return True - if not f: - if "coord" in data_se1.other and "coord" in data_se2.other: - if Point(data_se1.coord.lon, data_se1.coord.lat).distance( - Point(data_se2.coord.lon, data_se2.coord.lat)) < 1 and len( - set(data_se1.class_) & stop_class) < 1 and len(set(data_se2.class_) & stop_class) < 1: - return True - return f + self.add_inclusion_rel(se_, se2_) def get_adjacency_relationships(self): """ - Return all the adjacency relationships between all the spatial entities in the STR. - :return: + Find all the adjacency relationships between the spatial entities declared in the current STR. 
""" - data={se:gazetteer.get_by_id(se)[0] for se in self.spatial_entities} - for se1 in self.spatial_entities: + + data = {se: self.get_data(se) for se in self.spatial_entities} + + for se1 in tqdm(self.spatial_entities, desc="Extract Adjacency Relationship"): data_se1 = data[se1] for se2 in self.spatial_entities: if se1 == se2: continue - # print("test adjacency") if se1 in self.adjacency_relationships: if se2 in self.adjacency_relationships[se1]: continue @@ -297,18 +545,28 @@ class STR(object): if se1 in self.adjacency_relationships[se2]: continue data_se2 = data[se2] - self.add_adjacency_rel(se1, se2, self.is_adjacent(se1,se2,data_se1,data_se2)) - - + if self.is_adjacent(se1, se2, data_se1, data_se2): + self.add_adjacency_rel(se1, se2) def build(self, inc=True, adj=True, verbose=False): """ Build the STR - :param inc: - :param adj: - :param verbose: - :return: + + Parameters + ---------- + inc : bool, optional + if inclusion relationship have to be included in the STR (the default is True) + adj : bool, optional + if adjacency relationship have to be included in the STR (the default is True) + verbose : bool, optional + Verbose mode activated (the default is False) + + Returns + ------- + networkx.Graph + graph representing the STR """ + nodes = [] for k, v in self.spatial_entities.items(): nodes.append((k, {"label": v})) @@ -317,34 +575,35 @@ class STR(object): graph.add_nodes_from(nodes) if adj: - debut=time.time() + debut = time.time() self.get_adjacency_relationships() for se1 in self.adjacency_relationships: for se2 in self.adjacency_relationships[se1]: if self.adjacency_relationships[se1][se2]: - graph.add_edge(se1,se2, key=0, color="green") + graph.add_edge(se1, se2, key=0, color="green") graph.add_edge(se2, se1, key=0, color="green") - logging.info("Extract Adjacency Rel\t{0}".format(time.time()-debut)) if inc: - debut=time.time() + debut = time.time() self.get_inclusion_relationships() for se1 in self.inclusion_relationships: for se2 in self.inclusion_relationships[se1]: if self.inclusion_relationships[se1][se2]: - graph.add_edge(se1,se2, key=0, color="red") - logging.info("Extract Inclusion Rel\t{0}".format(time.time() - debut)) + graph.add_edge(se1, se2, key=0, color="red") + self.graph = graph return graph def save_graph_fig(self, output_fn, format="svg"): """ - Save the graph graphiz reprensentation + Save the graphiz reprensentation of the STR graph. Parameters ---------- output_fn : string Output filename + format : str + Output format (svg or pdf) """ try: @@ -357,44 +616,63 @@ class STR(object): print("Error while saving STR to {0}".format(format)) def getUndirected(self): - return nx.Graph(self.graph) + """ + Return the Undirected form of a STR graph. 
- def set_graph(self, g): - self.graph = g - rel_ = self.graph.edges(data=True) - for edge in rel_: - id1, id2 = edge[0], edge[1] - if edge[2]["color"] == "green": - self.add_adjacency_rel(edge[0],edge[1]) - add_cache_adjacency(id1, id2) - elif edge[2]["color"] == "red": - self.add_inclusion_rel(edge[0], edge[1]) - self.add_cache_inclusion(id1,id2) + Returns + ------- + networkx.Graph + unidirected graph + """ + return nx.Graph(self.graph) def get_geo_data_of_se(self): - points,label,class_ = [], [], [] + """ + Return Geographical information for each spatial entities in the STR + + Returns + ------- + geopandas.GeoDataFrame + dataframe containing geographical information of each entity in the STR + """ + + points, label, class_ = [], [], [] for se in self.spatial_entities: data = gazetteer.get_by_id(se)[0] try: points.append(Point(data.coord.lon, data.coord.lat)) - label.append(data.label) + label.append(data.name) # class_.append(most_common(data["class"])) except KeyError: pass # print(len(points),len(label),len(class_)) - df=gpd.GeoDataFrame({"geometry":points,"label":label}) - df["x"]=df.geometry.apply(lambda p: p.x) + df = gpd.GeoDataFrame({"geometry": points, "label": label}) + df["x"] = df.geometry.apply(lambda p: p.x) df["y"] = df.geometry.apply(lambda p: p.y) return df - def get_cluster(self,id_=None): - if id_ and os.path.exists("./temp_cluster/{0}.geojson".format(id_)): + def get_cluster(self, id_=None): + """ + Return the cluster detected using spatial entities position. + + Parameters + ---------- + id_ : temp_file_id, optional + if cached version of geoinfo (the default is None) + + Returns + ------- + gpd.GeoDataFrame + cluster geometry + """ + + if os.path.exists("./temp_cluster/{0}.geojson".format(id_)): return gpd.read_file("./temp_cluster/{0}.geojson".format(id_)) - data=self.get_geo_data_of_se() - X=data[["x", "y"]].values - if len(X) ==0: # if zero samples return Empty GeoDataFrame + data = self.get_geo_data_of_se() + X = data[["x", "y"]].values + if len(X) == 0: # if zero samples return Empty GeoDataFrame return gpd.GeoDataFrame() try: bandwidth = estimate_bandwidth(X) @@ -402,33 +680,25 @@ class STR(object): ms.fit(X) data["cluster"] = ms.labels_ except: - samples,labels=dbscan(X) + samples, labels = dbscan(X) data["cluster"] = labels - """ - - # deuxième découpe en cluster - c=data['cluster'].value_counts().idxmax() - X=data[data["cluster"] == c] - X=X[["x","y"]] - bandwidth = estimate_bandwidth(X.values) - ms = MeanShift(bandwidth=bandwidth, bin_seeding=True) - ms.fit(X.values) - X["cluster"]=ms.labels_+(data['cluster'].max()+1) - lab=ms.labels_ - lab+=data['cluster'].max()+1 - - data["cluster"][data["cluster"] == c]=X["cluster"] - """ - geo = data.groupby("cluster").apply(to_Polygon) cluster_polybuff = gpd.GeoDataFrame(geometry=geo) if id_: cluster_polybuff.to_file("./temp_cluster/{0}.geojson".format(id_)) return cluster_polybuff - def to_folium(self): + """ + Use the folium package to project the STR on a map + + Returns + ------- + folium.Map + folium map instance + """ + points = [] for se in self.spatial_entities: data = gazetteer.get_by_id(se)[0] @@ -449,10 +719,10 @@ class STR(object): ) lines_inc = [] for se1 in self.inclusion_relationships: - data_se1 = data_se1=gazetteer.get_by_id(se1)[0] + data_se1 = data_se1 = gazetteer.get_by_id(se1)[0] for se2 in self.inclusion_relationships[se1]: if self.inclusion_relationships[se1][se2]: - data_se2 = data_se1=gazetteer.get_by_id(se2)[0] + data_se2 = data_se1 = gazetteer.get_by_id(se2)[0] lines_inc.append( 
LineString([ (data_se1.coord.lon, data_se1.coord.lat), @@ -460,45 +730,58 @@ class STR(object): ) ) - def to_fol(seris,color="#ff0000"): - df=gpd.GeoDataFrame(geometry=seris.values) - df.crs={'init' :'epsg:4326'} - return folium.features.GeoJson(df.to_json(),style_function=lambda x: {'color':color}) + def to_fol(seris, color="#ff0000"): + df = gpd.GeoDataFrame(geometry=seris.values) + df.crs = {'init': 'epsg:4326'} + return folium.features.GeoJson(df.to_json(), style_function=lambda x: {'color': color}) gjson1 = to_fol(gpd.GeoSeries(points)) - gjson2 = to_fol(gpd.GeoSeries(lines_adj),color='#00ff00') + gjson2 = to_fol(gpd.GeoSeries(lines_adj), color='#00ff00') gjson3 = to_fol(gpd.GeoSeries(lines_inc)) - map=folium.Map() + map = folium.Map() map.add_child(gjson1) map.add_child(gjson2) map.add_child(gjson3) return map + def map_projection(self, plt=False): + """ + Return a matplotlib figure of the STR + + Parameters + ---------- + plt : bool, optional + if the user wish to use the plt.show() (the default is False) + + Returns + ------- + plt.Figure + Matplotlib figure instance + """ - def map_projection(self,plt=False): import matplotlib.pyplot as plt world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')) base = world.plot(color='white', edgecolor='black', figsize=(16, 9)) - points=[] + points = [] for se in self.spatial_entities: - data=gazetteer.get_by_id(se)[0] + data = gazetteer.get_by_id(se)[0] try: - points.append(Point(data.coord.lon,data.coord.lat)) + points.append(Point(data.coord.lon, data.coord.lat)) except: pass - lines_adj=[] + lines_adj = [] for se1 in self.adjacency_relationships: - data_se1=gazetteer.get_by_id(se1)[0] + data_se1 = gazetteer.get_by_id(se1)[0] for se2 in self.adjacency_relationships[se1]: data_se2 = gazetteer.get_by_id(se2)[0] if self.adjacency_relationships[se1][se2]: lines_adj.append( - LineString([(data_se1.coord.lon,data_se1.coord.lat),(data_se2.coord.lon, data_se2.coord.lat)]) - ) - lines_inc=[] + LineString([(data_se1.coord.lon, data_se1.coord.lat), (data_se2.coord.lon, data_se2.coord.lat)]) + ) + lines_inc = [] for se1 in self.inclusion_relationships: data_se1 = gazetteer.get_by_id(se1)[0] for se2 in self.inclusion_relationships[se1]: @@ -511,7 +794,7 @@ class STR(object): ) ) - gpd.GeoSeries(points).plot(ax=base,marker='o',markersize=5,color="blue") + gpd.GeoSeries(points).plot(ax=base, marker='o', markersize=5, color="blue") gpd.GeoSeries(lines_adj).plot(ax=base, color="green") gpd.GeoSeries(lines_inc).plot(ax=base, color="red") @@ -520,17 +803,45 @@ class STR(object): plt.show() -def to_Multipoints(x): - #print(x[["x","y"]].values) - return Polygon([Point(z) for z in x[["x","y"]].values]).buffer(1) +# def to_Multipoints(x): +# """ +# Return a polygon buffered representation for a set of point + +# Parameters +# ---------- +# x : pandas.Series +# coordinates columns + +# Returns +# ------- +# shapely.geometry.Polygon +# polygon +# """ + +# #print(x[["x","y"]].values) +# return Polygon([Point(z) for z in x[["x","y"]].values]).buffer(1) def to_Polygon(x): - points = [Point(z) for z in x[["x","y"]].values] + """ + Return a polygon buffered representation for a set of points. 
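# Illustrative usage of to_Polygon() defined here (assumption: the three
# coordinates are invented). get_cluster() applies it group by group to the
# "x"/"y" columns of the geo dataframe and gets back a polygon buffered by one
# degree around the cluster's points.
import pandas as pd
from strpython.models.str import to_Polygon

cluster_rows = pd.DataFrame({"x": [3.87, 4.36, 5.37], "y": [43.61, 43.83, 43.29]})
footprint = to_Polygon(cluster_rows)
print(footprint.area)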
+ + Parameters + ---------- + x : pandas.Series + coordinates columns + + Returns + ------- + shapely.geometry.Polygon + polygon + """ + + points = [Point(z) for z in x[["x", "y"]].values] if len(points) > 2: coords = [p.coords[:][0] for p in points] poly = Polygon(coords).buffer(1) return poly - elif len(points)==1: + elif len(points) == 1: return points[0].buffer(1) else: coords = [p.coords[:][0] for p in points] diff --git a/strpython/nlp/disambiguator/__init__.py b/strpython/nlp/disambiguator/__init__.py index 950f635..bceef44 100644 --- a/strpython/nlp/disambiguator/__init__.py +++ b/strpython/nlp/disambiguator/__init__.py @@ -1 +1,6 @@ -# coding = utf-8 \ No newline at end of file +# coding = utf-8 + +from .most_common import MostCommonDisambiguator +from .share_prop import ShareProp +from .wikipedia_cooc import WikipediaDisambiguator +from .disambiguator import Disambiguator \ No newline at end of file diff --git a/strpython/nlp/disambiguator/disambiguator.py b/strpython/nlp/disambiguator/disambiguator.py index ee0d899..927a70f 100644 --- a/strpython/nlp/disambiguator/disambiguator.py +++ b/strpython/nlp/disambiguator/disambiguator.py @@ -10,53 +10,62 @@ from ..ner.ner import NER class Disambiguator(object): - def __init__(self): + def __init__(self,one_by_one=False,context_based=False): """Constructor for Disambiguator""" - pass - - def extract_se_entities(self, input): - out = Disambiguator.parse_corpus(input) - en_ = out[out[:, 1] == NER._unified_tag["place"]] - return self.toponymes_frequencies(en_[:, 0]), np.unique(en_[:, 0]) - - def toponymes_frequencies(self, ens_): - count = {} - for en in ens_: - if not en in count: count[en] = 0 - count[en] += 1 - return count - - @staticmethod - def parse_corpus(corpus): - final_corpus = [] - t = 0 - placeTag = NER._unified_tag["place"] - while t < len(corpus): - tag = copy.copy(corpus[t]) - - if tag[1] == "BEG-" + NER._unified_tag["place"] or tag[1] == placeTag: - lenw = 1 - if tag[1] == "BEG-" + placeTag: - compound_tag = tag[0] - t += 1 - while corpus[t][1] == "END-" + placeTag or corpus[t][1] == placeTag: - tag = copy.copy(corpus[t]) - if tag[0][-1] in string.punctuation or compound_tag[-1] in string.punctuation: - compound_tag += tag[0] - else: - compound_tag += " " + tag[0] - t += 1 - lenw += 1 - tag[0] = compound_tag - tag[1] = placeTag - t += 1 - else: - t += 1 - final_corpus.append(tag) - return np.array(final_corpus) - - def disambiguate(self, ner_result): - pass - - def disambiguate_list(self,toponyms,lang): - pass \ No newline at end of file + self.one_by_one= one_by_one + self.context_based=context_based + + def disambiguate(self,lang,ner_output=None,toponyms=None): + """ + Run the disambiguation on the NER output + Parameters + ---------- + ner_output : 2D numpy array + NER output + lang : str + language + + Returns + ------- + dict + {toponym : geodictID} + """ + if isinstance(ner_output, np.ndarray) and ner_output.shape[1] == 2: + toponyms = self.parse_ner_output(ner_output) + elif not toponyms: + raise ValueError("Either enter a list of toponyms or give ner_output") + if self.context_based: + return self.disambiguate_context_based(toponyms,lang) + else: + return self.disambiguate_one_by_one(toponyms,lang) + + def disambiguate_one_by_one(self, toponyms, lang): + """ + Disambiguation process when toponyms are geocoded one by one. 
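# Illustrative sketch of the reworked Disambiguator interface (assumptions: the
# toponyms are invented, and the Geodict backend plus language resources must
# be configured). A concrete class declares one_by_one or context_based in its
# constructor and implements the matching method; callers only use
# disambiguate(), passing either a NER output array or a plain toponym list.
from strpython.nlp.disambiguator import MostCommonDisambiguator

disambiguator = MostCommonDisambiguator()                  # one_by_one=True
mapping = disambiguator.disambiguate("fr", toponyms=["Paris", "Montpellier"])
# mapping is {toponym: Geodict id} for every toponym that could be resolved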
+ Parameters + ---------- + toponyms :list + toponyms + Returns + ------- + dict + {toponym : geodictID} + """ + raise NotImplementedError + + def disambiguate_context_based(self,toponyms,lang): + """ + Disambiguation process when toponyms are geocoded using each one of them + Parameters + ---------- + toponyms :list + toponyms + Returns + ------- + dict + {toponym : geodictID} + """ + raise NotImplementedError + + def parse_ner_output(self,ner_output): + return [toponym[0] if isinstance(toponym[0],str) else " ".join(toponym[0]) for toponym in ner_output[ner_output[:,1] == NER._unified_tag["place"]]] \ No newline at end of file diff --git a/strpython/nlp/disambiguator/most_common.py b/strpython/nlp/disambiguator/most_common.py index 2989325..be12646 100644 --- a/strpython/nlp/disambiguator/most_common.py +++ b/strpython/nlp/disambiguator/most_common.py @@ -28,40 +28,28 @@ common_words = { class MostCommonDisambiguator(Disambiguator): def __init__(self): - Disambiguator.__init__(self) + Disambiguator.__init__(self,one_by_one=True) - def disambiguate(self, ner_result, lang="en"): - count, se_ = self.extract_se_entities(ner_result) - new_count = {} - selected_en = {} - for en in se_: - id_,score=self.disambiguate_(en,lang) - if not id_ == "O" and id_: - selected_en[id_] = en - new_count[id_] = count[en] - - return new_count, selected_en - - def disambiguate_list(self,toponyms,lang): + def disambiguate_one_by_one(self, toponyms,lang): result={} for toponym in toponyms: id_,_=self.disambiguate_(toponym,lang) if id_: - result[id_]=toponym + result[toponym]=id_ return result def disambiguate_(self, label, lang='fr'): if re.match("^\d+$", label): return 'O', -1 if lang in stop_words: #and lang in common_words: - if label.lower().rstrip("s") in stop_words[lang]: #or label.lower().rstrip("s") in common_words[lang]: + if label.lower().rstrip("s") in stop_words[lang]: return 'O', -1 if lang in inflectors: plural=inflectors[lang].singularize(label) else: plural = label.rstrip("s") + "s" - if plural.lower() in stop_words[lang]:# or plural.lower() in common_words[lang]: + if plural.lower() in stop_words[lang]: return 'O', -1 data=get_most_common_id_v3(label, lang) diff --git a/strpython/nlp/disambiguator/share_prop.py b/strpython/nlp/disambiguator/share_prop.py new file mode 100644 index 0000000..001bc78 --- /dev/null +++ b/strpython/nlp/disambiguator/share_prop.py @@ -0,0 +1,174 @@ +# coding = utf-8 +import math + +from ...helpers.collision import * +#from ...helpers.geodict_helpers_old import * +from ...helpers.geodict_helpers import * +from .disambiguator import Disambiguator + +from ...models.str import get_inclusion_chain + + +class ShareProp(Disambiguator): + + def __init__(self): + Disambiguator.__init__(self,context_based=True) + + def fib_formula(self, n): + """ + Return the fibonacci value. + Parameters + ---------- + n : int + parameter + Returns + ------- + int + fibonnaci value + """ + if n in [0, 1]: return 0 # Modifying fibonacci behaviour + golden_ratio = (1 + math.sqrt(5)) / 2 + val = (golden_ratio ** n - (1 - golden_ratio) ** n) / math.sqrt(5) + return int(round(val)) + + def inclusion_log(self, x): + """ + Return the inclusion log + Parameters + ---------- + x : int + parameter + + Returns + ------- + int + inclusion log + """ + if x==0: + return 1 + return math.log(x) + + + def get_inclusion_score(self, id1, id2): + """ + Return the inclusion score. Compute the distance between two entities in the hierarchy. 
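# Illustrative worked example of the inclusion score defined here, using
# inclusion_log above (assumption: the intersection sizes are invented). With
# three shared ancestors in the P131 chain and none in the P706 chain,
# inclusion_log() maps 0 to 1 instead of minus infinity, so the score is
# log(3) + 1.
import math

interP131, interP706 = 3, 0
score = math.log(interP131) + 1        # ~2.10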
+ Parameters + ---------- + id1 : str + id of the first spatial entity + id2 : str + id of the second spatial entity + + Returns + ------- + int + inclusion score + """ + list1 = get_inclusion_chain(id1, 'P131') + list2 = get_inclusion_chain(id2, 'P131') + interP131 = len(list(set(list1).intersection(list2))) + list1 = get_inclusion_chain(id1, 'P706') + list2 = get_inclusion_chain(id2, 'P706') + interP706 = len(list(set(list1).intersection(list2))) + # return fib_no[interP131]+fib_no[interP706] + return self.inclusion_log(interP131) + self.inclusion_log(interP706) + + def Adjacency_P47(self, id1, id2): + """ + Return true, if two spatial entities are found adjacent using the P47 property (share borders) from Wikidata. + Parameters + ---------- + id1 : str + id of the first spatial entity + id2 : str + id of the second spatial entity + + Returns + ------- + bool + true if adjacent using P47 + """ + data_1, data_2 = gazetteer.get_by_id(id1)[0], gazetteer.get_by_id(id2)[0] + if "P47" in data_1 and "P47" in data_2: + if id1 in data_2.other.P47 or id2 in data_1.other.P47: + return True + return False + + def Adjacency_Hull(self, id1, id2): + """ + To find if two spatial entities hull "collide" + Parameters + ---------- + id1 : str + id of the first spatial entity + id2 : str + id of the second spatial entity + + Returns + ------- + bool + if collide + """ + return collisionTwoSEBoundaries(id1, id2) + + def disambiguateOne(self, spat_candidates, fixed_entities): + """ + Disambiguate one toponym + Parameters + ---------- + spat_candidates + list of candidates found in the georeferential + fixed_entities + entities with no ambiguities + + Returns + ------- + + """ + score_dc = {} + for cand in spat_candidates: + id_cand = cand.id + score_dc[id_cand] = 0 + for fixed in fixed_entities: + id_fixed = fixed_entities[fixed].id + if self.Adjacency_P47(id_cand, id_fixed): + score_dc[id_cand] += 3 + elif self.Adjacency_Hull(id_cand, id_fixed): + score_dc[id_cand] += 2 + score_dc[id_cand] += self.get_inclusion_score(id_cand, id_fixed) + + m = max(score_dc, key=score_dc.get) + if score_dc[m] < 4: + return None + for cand in spat_candidates: + if cand.id == m: + return cand.id + + + def disambiguate_context_based(self,toponyms,lang): + selected_en = {} + fixed_entities = {} + ambiguous_entities = {} + for topo in toponyms: + request = gazetteer.get_by_label(topo, lang) + if len(request) == 0: + request = gazetteer.get_by_alias(topo, lang) + if len(request) > 1: + ambiguous_entities[topo] = request + elif len(request) == 1: + fixed_entities[topo] = request[0] + + d_amb_results = {} + for amb_ent in ambiguous_entities: + d = self.disambiguateOne(ambiguous_entities[amb_ent], fixed_entities) + if not d: + d_amb_results[amb_ent] = get_most_common_id_v3(amb_ent, lang).id + else: + d_amb_results[amb_ent] = d + + for k, v in fixed_entities.items(): + selected_en[k] = v.id + for k, v in d_amb_results.items(): + selected_en[k] = v + + return selected_en \ No newline at end of file diff --git a/strpython/nlp/disambiguator/wikipedia_cooc.py b/strpython/nlp/disambiguator/wikipedia_cooc.py index c9a522a..a8dacfd 100644 --- a/strpython/nlp/disambiguator/wikipedia_cooc.py +++ b/strpython/nlp/disambiguator/wikipedia_cooc.py @@ -16,80 +16,71 @@ def read_pickle(fn): class WikipediaDisambiguator(Disambiguator): def __init__(self,measure="degree"): - Disambiguator.__init__(self) + Disambiguator.__init__(self,context_based=True) # Load model 
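# Illustrative sketch of the BigramModel loaded just below (assumptions: the
# import path follows where this patch creates the file, and the ids and counts
# are invented). Co-occurrence frequencies between candidate ids are multiplied
# into a joint score, with a small floor value when a pair was never observed.
from strpython.nlp.disambiguator_old.models.bigram import BigramModel

model = BigramModel(freq={"GD1": {"GD2": 5}}, count={"GD2": 5})
model.get_bigram_probability("GD1", "GD2")            # 5, the raw co-occurrence count
model.get_coocurence_probability(1.0, "GD1", "GD3")   # 1e-08 floor for an unseen pair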
self.model=BigramModel(read_pickle(config.wiki_cooc_dis.cooc_freq),read_pickle(config.wiki_cooc_dis.count)) self.measure=measure - def disambiguate(self, ner_result, lang="en"): - count, se_ = self.extract_se_entities(ner_result) - new_count = {} - selected_en_rev = {} - selected_en = self.disambiguate_wiki(se_,lang) - for en in selected_en: - selected_en_rev[en]=selected_en[en] - #new_count[selected_en[en]] = count[en] - return new_count, selected_en def disambiguate_list(self,toponyms,lang): result=self.disambiguate_wiki(toponyms,lang) return {k:v for k,v in result.items() if v} - def disambiguate_wiki(self, entities, lang): - - spat_en=[] - for e in entities: - if re.match("^\d+$", e): + def disambiguate_context_based(self,toponyms,lang): + toponyms_filtered=[] + for toponym in toponyms: + if re.match("^\d+$", toponym): continue - if lang in stop_words and e.lower().rstrip("s") in stop_words[lang]:# or e.lower().rstrip("s") in common_words[lang]: + if lang in stop_words and toponym.lower().rstrip("s") in stop_words[lang]:# or toponym.lower().rstrip("s") in common_words[lang]: continue - plural = e.rstrip("s") + "s" + plural = toponym.rstrip("s") + "s" if lang in stop_words and plural.lower() in stop_words[lang]: #or plural.lower() in common_words[lang]: continue - spat_en.append(e) - spat_en=list(set(spat_en)) + toponyms_filtered.append(toponym) + + toponyms_filtered=list(set(toponyms_filtered)) g = nx.Graph() possible_candidates = [] betw_cand={} # indicate which toponym group a candidate belong to #w maybe useless ... group_candidate = {} #candidates per toponym - for e in spat_en: - cand = get_top_candidate(e, lang, 5)#get_top_candidate(e, lang,4) - cand = [c.id for c in cand if c] - if not cand: - cand = [c.id for c in gazetteer.get_n_label_similar(e,lang,5) if c] - group_candidate[e] = cand - betw_cand[e]=cand - for n in cand: - betw_cand[n]=set(cand)-set(n) - possible_candidates.extend(cand) + for toponym in toponyms_filtered: + candidates = get_top_candidate(toponym, lang, 5) + candidates = [c.id for c in candidates if c] + if not candidates: + candidates = [c.id for c in gazetteer.get_n_label_similar(toponym,lang,5) if c] + group_candidate[toponym] = candidates + betw_cand[toponym]=candidates + for n in candidates: + betw_cand[n]=set(candidates)-set(n) + possible_candidates.extend(candidates) - for cand in possible_candidates: - g.add_node(cand, label=gazetteer.get_by_id(cand)[0].label[lang]) + for candidate in possible_candidates: + g.add_node(candidate, label=gazetteer.get_by_id(candidate)[0].label[lang]) data_candidate={ca :gazetteer.get_by_id(ca)[0] for ca in possible_candidates} - for cand in possible_candidates: - for cand2 in possible_candidates: + for candidate in possible_candidates: + for candidate2 in possible_candidates: # Get PageRank score - d = data_candidate[cand] + d = data_candidate[candidate] sc = 1 sc=d.score # Compute probability - prob = self.model.get_coocurence_probability(sc, cand, cand2) + prob = self.model.get_coocurence_probability(sc, candidate, candidate2) - if cand2 in betw_cand[cand] or cand in betw_cand[cand2]: + if candidate2 in betw_cand[candidate] or candidate in betw_cand[candidate2]: prob = 0.0 if prob < 0.0000001: prob = 0.0 - if not cand == cand2: + if not candidate == candidate2: # take the lowest co-occurrency between two candidates - if g.has_edge(cand2, cand) : - if g.edges[cand2,cand]["weight"] < prob: + if g.has_edge(candidate2, candidate) : + if g.edges[candidate2,candidate]["weight"] < prob: continue - g.add_edge(cand, cand2, 
weight=prob) + g.add_edge(candidate, candidate2, weight=prob) selected = {} @@ -104,7 +95,8 @@ class WikipediaDisambiguator(Disambiguator): else:# degree by default selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight')) #print(1) - except Exception as e: - selected[gr]=get_most_common_id_v3(gr,lang) + except Exception as toponym: + most_common = get_most_common_id_v3(gr, lang) + if most_common and len(most_common)>0: selected[gr]=most_common[0].id return selected diff --git a/strpython/nlp/disambiguator_old/__init__.py b/strpython/nlp/disambiguator_old/__init__.py new file mode 100644 index 0000000..950f635 --- /dev/null +++ b/strpython/nlp/disambiguator_old/__init__.py @@ -0,0 +1 @@ +# coding = utf-8 \ No newline at end of file diff --git a/strpython/nlp/disambiguator_old/disambiguator.py b/strpython/nlp/disambiguator_old/disambiguator.py new file mode 100644 index 0000000..ee0d899 --- /dev/null +++ b/strpython/nlp/disambiguator_old/disambiguator.py @@ -0,0 +1,62 @@ +# coding = utf-8 + +import copy +import string + +import numpy as np + +from ..ner.ner import NER + + +class Disambiguator(object): + + def __init__(self): + """Constructor for Disambiguator""" + pass + + def extract_se_entities(self, input): + out = Disambiguator.parse_corpus(input) + en_ = out[out[:, 1] == NER._unified_tag["place"]] + return self.toponymes_frequencies(en_[:, 0]), np.unique(en_[:, 0]) + + def toponymes_frequencies(self, ens_): + count = {} + for en in ens_: + if not en in count: count[en] = 0 + count[en] += 1 + return count + + @staticmethod + def parse_corpus(corpus): + final_corpus = [] + t = 0 + placeTag = NER._unified_tag["place"] + while t < len(corpus): + tag = copy.copy(corpus[t]) + + if tag[1] == "BEG-" + NER._unified_tag["place"] or tag[1] == placeTag: + lenw = 1 + if tag[1] == "BEG-" + placeTag: + compound_tag = tag[0] + t += 1 + while corpus[t][1] == "END-" + placeTag or corpus[t][1] == placeTag: + tag = copy.copy(corpus[t]) + if tag[0][-1] in string.punctuation or compound_tag[-1] in string.punctuation: + compound_tag += tag[0] + else: + compound_tag += " " + tag[0] + t += 1 + lenw += 1 + tag[0] = compound_tag + tag[1] = placeTag + t += 1 + else: + t += 1 + final_corpus.append(tag) + return np.array(final_corpus) + + def disambiguate(self, ner_result): + pass + + def disambiguate_list(self,toponyms,lang): + pass \ No newline at end of file diff --git a/strpython/nlp/disambiguator/geodict_gaurav.py b/strpython/nlp/disambiguator_old/geodict_gaurav.py similarity index 100% rename from strpython/nlp/disambiguator/geodict_gaurav.py rename to strpython/nlp/disambiguator_old/geodict_gaurav.py diff --git a/strpython/nlp/disambiguator_old/models/__init__.py b/strpython/nlp/disambiguator_old/models/__init__.py new file mode 100644 index 0000000..950f635 --- /dev/null +++ b/strpython/nlp/disambiguator_old/models/__init__.py @@ -0,0 +1 @@ +# coding = utf-8 \ No newline at end of file diff --git a/strpython/nlp/disambiguator_old/models/bigram.py b/strpython/nlp/disambiguator_old/models/bigram.py new file mode 100644 index 0000000..ec146b4 --- /dev/null +++ b/strpython/nlp/disambiguator_old/models/bigram.py @@ -0,0 +1,46 @@ +# coding = utf-8 + + +class BigramModel: + def __init__(self,freq={},count={}): + self.cooc_freq=freq + self.count_associated=count + + def append(self,uri1,uri2): + + if not uri1 in self.cooc_freq: + self.cooc_freq[uri1]={} + if not uri2 in self.cooc_freq[uri1]: + self.cooc_freq[uri1][uri2]=0 + self.cooc_freq[uri1][uri2]+=1 + + self.increment_count(uri2) + + 
def increment_count(self,uri): + if not uri in self.count_associated: + self.count_associated[uri]=0 + self.count_associated[uri]+=1 + + def get_coocurence_probability(self, pr1, *args): + if len(args) < 2: + print("Only one URI indicated") + return 0. + res_=1. + for u in range(1,len(args)): + res_*=self.get_bigram_probability(args[0],args[u],pr1) + return res_ + + + def get_bigram_probability(self,uri1,uri2,pr1=1): + nna=0.00000001 + if uri1 in self.cooc_freq: + if uri2 in self.cooc_freq[uri1]: + return self.cooc_freq[uri1][uri2] + #return (self.cooc_freq[uri1][uri2] / self.count_associated[uri1])+pr1 + elif uri2 in self.cooc_freq: + if uri1 in self.cooc_freq[uri2]: + return self.cooc_freq[uri2][uri1] + #return (self.cooc_freq[uri2][uri1] / self.count_associated[uri1])+pr1 + return nna + + diff --git a/strpython/nlp/disambiguator_old/most_common.py b/strpython/nlp/disambiguator_old/most_common.py new file mode 100644 index 0000000..2989325 --- /dev/null +++ b/strpython/nlp/disambiguator_old/most_common.py @@ -0,0 +1,71 @@ +# coding = utf-8 + + + +from ...helpers.geodict_helpers import * +from .disambiguator import Disambiguator +import re, json, os +from ...config.configuration import config + +from inflector import Inflector,English,Spanish,French + +inflectors= { + "en":Inflector(English()), + "fr":Inflector(French()), + "es":Inflector(Spanish()) +} +stop_words = { + "fr": set(open(os.path.join(config.language_resources_path,"stop_words_fr.txt")).read().split("\n")), + "en": set(open(os.path.join(config.language_resources_path,"stop_words_en.txt")).read().split("\n")) +} + +common_words = { + "fr": set(json.load(open(os.path.join(config.language_resources_path,"dic_fr.json")))), + "en": set(open(os.path.join(config.language_resources_path,"english_common_words_filtered.txt")).read().split("\n")) +} + + +class MostCommonDisambiguator(Disambiguator): + + def __init__(self): + Disambiguator.__init__(self) + + def disambiguate(self, ner_result, lang="en"): + count, se_ = self.extract_se_entities(ner_result) + new_count = {} + selected_en = {} + for en in se_: + id_,score=self.disambiguate_(en,lang) + if not id_ == "O" and id_: + selected_en[id_] = en + new_count[id_] = count[en] + + return new_count, selected_en + + def disambiguate_list(self,toponyms,lang): + result={} + for toponym in toponyms: + id_,_=self.disambiguate_(toponym,lang) + if id_: + result[id_]=toponym + return result + + def disambiguate_(self, label, lang='fr'): + if re.match("^\d+$", label): + return 'O', -1 + if lang in stop_words: #and lang in common_words: + if label.lower().rstrip("s") in stop_words[lang]: #or label.lower().rstrip("s") in common_words[lang]: + return 'O', -1 + + if lang in inflectors: + plural=inflectors[lang].singularize(label) + else: + plural = label.rstrip("s") + "s" + if plural.lower() in stop_words[lang]:# or plural.lower() in common_words[lang]: + return 'O', -1 + + data=get_most_common_id_v3(label, lang) + id_, score=None,0 + if data: + id_,score=data.id,data.score + return id_, score diff --git a/strpython/nlp/disambiguator_old/wikipedia_cooc.py b/strpython/nlp/disambiguator_old/wikipedia_cooc.py new file mode 100644 index 0000000..c9a522a --- /dev/null +++ b/strpython/nlp/disambiguator_old/wikipedia_cooc.py @@ -0,0 +1,110 @@ +# coding = utf-8 +import re + +from .disambiguator import Disambiguator +from .models.bigram import BigramModel +import pickle +from ...config.configuration import config +#from ...helpers.geodict_helpers_old import * +from ...helpers.geodict_helpers import * +from 
.most_common import stop_words,common_words +import networkx as nx + +def read_pickle(fn): + return pickle.load(open(fn,'rb')) + +class WikipediaDisambiguator(Disambiguator): + + def __init__(self,measure="degree"): + Disambiguator.__init__(self) + # Load model + self.model=BigramModel(read_pickle(config.wiki_cooc_dis.cooc_freq),read_pickle(config.wiki_cooc_dis.count)) + self.measure=measure + def disambiguate(self, ner_result, lang="en"): + count, se_ = self.extract_se_entities(ner_result) + new_count = {} + selected_en_rev = {} + selected_en = self.disambiguate_wiki(se_,lang) + for en in selected_en: + selected_en_rev[en]=selected_en[en] + #new_count[selected_en[en]] = count[en] + + return new_count, selected_en + + def disambiguate_list(self,toponyms,lang): + result=self.disambiguate_wiki(toponyms,lang) + return {k:v for k,v in result.items() if v} + + def disambiguate_wiki(self, entities, lang): + + spat_en=[] + for e in entities: + if re.match("^\d+$", e): + continue + if lang in stop_words and e.lower().rstrip("s") in stop_words[lang]:# or e.lower().rstrip("s") in common_words[lang]: + continue + + plural = e.rstrip("s") + "s" + if lang in stop_words and plural.lower() in stop_words[lang]: #or plural.lower() in common_words[lang]: + continue + spat_en.append(e) + spat_en=list(set(spat_en)) + g = nx.Graph() + + possible_candidates = [] + betw_cand={} # indicate which toponym group a candidate belong to #w maybe useless ... + group_candidate = {} #candidates per toponym + + for e in spat_en: + cand = get_top_candidate(e, lang, 5)#get_top_candidate(e, lang,4) + cand = [c.id for c in cand if c] + if not cand: + cand = [c.id for c in gazetteer.get_n_label_similar(e,lang,5) if c] + group_candidate[e] = cand + betw_cand[e]=cand + for n in cand: + betw_cand[n]=set(cand)-set(n) + possible_candidates.extend(cand) + + for cand in possible_candidates: + g.add_node(cand, label=gazetteer.get_by_id(cand)[0].label[lang]) + + data_candidate={ca :gazetteer.get_by_id(ca)[0] for ca in possible_candidates} + for cand in possible_candidates: + for cand2 in possible_candidates: + # Get PageRank score + d = data_candidate[cand] + + sc = 1 + sc=d.score + # Compute probability + prob = self.model.get_coocurence_probability(sc, cand, cand2) + + if cand2 in betw_cand[cand] or cand in betw_cand[cand2]: + prob = 0.0 + if prob < 0.0000001: + prob = 0.0 + if not cand == cand2: + # take the lowest co-occurrency between two candidates + if g.has_edge(cand2, cand) : + if g.edges[cand2,cand]["weight"] < prob: + continue + g.add_edge(cand, cand2, weight=prob) + + selected = {} + + #Take the candidates with the highest degree weighted + for gr in group_candidate: + try: + + if self.measure == "degree": + selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight')) + elif self.measure == "centrality": + selected[gr] = max(group_candidate[gr], key=lambda x: nx.closeness_centrality(g, x, distance="weight")) + else:# degree by default + selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight')) + #print(1) + except Exception as e: + selected[gr]=get_most_common_id_v3(gr,lang) + return selected + diff --git a/strpython/nlp/ner/__init__.py b/strpython/nlp/ner/__init__.py index e69de29..764d8f5 100644 --- a/strpython/nlp/ner/__init__.py +++ b/strpython/nlp/ner/__init__.py @@ -0,0 +1,5 @@ +from .spacy import Spacy +from .nltk import NLTK +from .polyglot import Polyglot +from .stanford_ner import StanfordNER +from .ner import NER \ No newline at end of file diff --git 
a/strpython/nlp/ner/ner.py b/strpython/nlp/ner/ner.py index 2066a24..47006b3 100644 --- a/strpython/nlp/ner/ner.py +++ b/strpython/nlp/ner/ner.py @@ -12,7 +12,43 @@ class NER: self._lang = lang def identify(self, input): - return input + """ + + Parameters + ---------- + input + + Returns + ------- + + """ + raise NotImplementedError def parse_output(self, output): - pass + """ + Parse the output of the NER + Parameters + ---------- + output: obj + ner output + Returns + ------- + 2D-array numpy + First col = Text, Second Col = Tag + """ + raise NotImplementedError + + def translate_tag(self, tag): + """ + Translate the NER tag to a unique tag use in this module. + Parameters + ---------- + tag :str + tag + + Returns + ------- + str + transformed tag + """ + raise NotImplementedError \ No newline at end of file diff --git a/strpython/nlp/ner/nltk.py b/strpython/nlp/ner/nltk.py index 265b9f1..1fdd64c 100644 --- a/strpython/nlp/ner/nltk.py +++ b/strpython/nlp/ner/nltk.py @@ -2,7 +2,7 @@ import nltk from .ner import NER - +import numpy as np class NLTK(NER): """ @@ -25,22 +25,9 @@ class NLTK(NER): for tok_ in ner_tagged: if isinstance(tok_, nltk.tree.Tree): corresponding_tag_ = self.translate_tag(tok_.label()) - if not tok_.label() in NLTK._list_of_tags_available: - for i in tok_: output.append(list(i)) - else: - if not len(tok_) > 1: - output.append([tok_[0][0], corresponding_tag_]) - else: - for i in range(len(tok_)): - if i == 0: - output.append([tok_[i][0], "BEG-" + corresponding_tag_]) - elif i + 1 == len(tok_): - output.append([tok_[i][0], "END-" + corresponding_tag_]) - else: - output.append([tok_[i][0], corresponding_tag_]) - else: - output.append(list(tok_)) - return output + if tok_.label() in NLTK._list_of_tags_available: + output.append([" ".join([t[0] for t in tok_]),self.translate_tag(tok_.label())]) + return np.array(output) def translate_tag(self, tag): if tag == "LOCATION" or tag == "GPE": diff --git a/strpython/nlp/ner/polyglot.py b/strpython/nlp/ner/polyglot.py index df8c083..005935c 100644 --- a/strpython/nlp/ner/polyglot.py +++ b/strpython/nlp/ner/polyglot.py @@ -15,37 +15,12 @@ class Polyglot(NER): self.poly_instance=None def identify(self,text): - self.poly_instance = json.loads(Text(text,hint_language_code=self._lang).to_json()) + self.poly_instance = Text(text,hint_language_code=self._lang) result_=[] - for item in self.poly_instance: - pos_t=self.parse_polyglot_output(item["entities"], item["pos_tags"]) - result_.extend(pos_t) + for en in self.poly_instance.entities: + result_.append([eval(en.__str__()),self.translate_tag(en.tag)]) return np.array(result_) - def parse_polyglot_output(self, entities_list, sentence_pos_tagged): - """ - """ - tk_pos=0 - sentence_pos_tagged=np.array(sentence_pos_tagged) - while tk_pos < len(sentence_pos_tagged): - token_=sentence_pos_tagged[tk_pos] - for entity_ in entities_list: - if token_[0] == entity_[1][0]: - if len(entity_[1]) > 1: - en_=np.array(entity_[1]) - en_in_pos_tag=sentence_pos_tagged[tk_pos:tk_pos+len(en_)][:,0] - if np.array_equal(en_ ,en_in_pos_tag): - - sentence_pos_tagged[tk_pos][1]="BEG-"+self.translate_tag(entity_[0]) - sentence_pos_tagged[tk_pos+len(en_)-1][1] = "END-" + self.translate_tag(entity_[0]) - if len(en_) >2:sentence_pos_tagged[tk_pos+1:tk_pos+len(en_)-1][:,1]=self.translate_tag(entity_[0]) - else: - sentence_pos_tagged[tk_pos][1]=self.translate_tag(entity_[0]) - tk_pos+=len(entity_[1]) - break - tk_pos+=1 - return sentence_pos_tagged - def translate_tag(self,tag): if tag == "I-PER": return 
NER._unified_tag["pers"] diff --git a/strpython/nlp/ner/spacy.py b/strpython/nlp/ner/spacy.py index 47456bb..98e9ea2 100644 --- a/strpython/nlp/ner/spacy.py +++ b/strpython/nlp/ner/spacy.py @@ -2,6 +2,7 @@ import spacy +import numpy as np from .ner import NER from ..exception.language import LanguageNotAvailable @@ -13,6 +14,7 @@ _tag_spacy = { "org": "ORG" } +all_tags=["GPE", "LOC","PERSON","ORG"] def flatten(lis): """ @@ -59,26 +61,12 @@ class Spacy(NER): import multiprocessing if len(text) > 10000: output_=[] - for t in self._ner.pipe(self.split_text(text,10000),n_threads=multiprocessing.cpu_count(),batch_size=10000,as_tuples=False,): - output_.extend([[token.text, token.pos_, token.ent_type_] for token in t]) - return self.parse_output(output_, []) + for t in self._ner.pipe(self.split_text(text,10000),n_threads=multiprocessing.cpu_count(),batch_size=100,as_tuples=False): + output_.extend([[token.text, self.translate_tag(token.label_)] for token in t.ents]) + return np.array(output_) else: - output_ = [[token.text, token.pos_, token.ent_type_] for token in self._ner(text)] - return self.parse_output(output_, []) - - def parse_output(self, output, pos_tags): - # Pre-Treatment on the output - # print(1) - tagged_ = [] - _tag_entity = flatten(list(_tag_spacy.values())) - - for token in output: - if token[-1] in _tag_entity: - tagged_.append([token[0], self.translate_tag(token[-1])]) - else: - tagged_.append([token[0], token[-2]]) - - return self.add_beg_ending_to_tag(tagged_) + output_ = [[token.text, self.translate_tag(token.label_)] for token in self._ner(text).ents if token.label_ in all_tags] + return np.array(output_) def translate_tag(self, tag): if tag == _tag_spacy["pers"]: @@ -88,23 +76,3 @@ class Spacy(NER): if tag == _tag_spacy["org"]: return NER._unified_tag["org"] - def add_beg_ending_to_tag(self, tag): - _tag_entity = list(NER._unified_tag.values()) - t = 0 - while t < len(tag): - if tag[t][1] in _tag_entity and t + 1 < len(tag): - - if tag[t + 1][1] == tag[t][1]: - tag[t][1] = "BEG-" + tag[t][1] - t += 1 - if t + 1 >= len(tag): - tag[t][1] = "END-" + tag[t][1] - - while t + 1 < len(tag): - if tag[t + 1][1] != tag[t][1]: - tag[t][1] = "END-" + tag[t][1] - break - else: - t += 1 - t += 1 - return tag diff --git a/strpython/nlp/ner/stanford_ner.py b/strpython/nlp/ner/stanford_ner.py index 776b14f..6b7d72d 100644 --- a/strpython/nlp/ner/stanford_ner.py +++ b/strpython/nlp/ner/stanford_ner.py @@ -17,10 +17,29 @@ _stanfordner_to_treetagger_lang = { "es" : "espagnol" } -_tag_stanford = { - "place": "LOCATION", - "org": "ORGANIZATION", - "pers": "PERSON" +_tag_stanford = {"en":{ + "place": "LOCATION", + "org": "ORGANIZATION", + "pers": "PERSON" + }, + "fr":{ + "place": "I-LIEU", + "org": "I-ORG", + "pers": "I-PERS" + } + +} +nlp_config={"fr" : { + "tokenize.language" : "fr", + "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger", + "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz", + "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz", + "depparse.language" : "french", + "ner.model": "/Users/jacquesfize/.services/stanford-corenlp-full-2017-06-09/eunews.fr.crf.gz", + "ssplit.newlineIsSentenceBreak": "always" + }, + "en":{} + } @@ -51,11 +70,8 @@ class StanfordNER(NER): if not self._lang in _stanfordner_available_language: print(self._lang) raise LanguageNotAvailable(self._lang, self) - self._ner= RestStanford(config.core_nlp_URL) - - self.identified = None @@ -80,15 +96,16 @@ class StanfordNER(NER): if not text: raise 
TypeError("No value found in `text` parameter.") + properties = {'annotators': 'tokenize,ssplit,pos,ner', 'outputFormat': 'json', + "tokenize.untokenizable": "noneDelete", "pipelineLanguage": self._lang} + properties.update(nlp_config[self._lang]) if len(text) < maxlen : - output_=self._ner.annotate(text,properties={'annotators': 'tokenize,ssplit,pos,ner','outputFormat':'json',"tokenize.untokenizable":"noneDelete"}) + output_=self._ner.annotate(text,properties=properties) if isinstance(output_, str): output_ = json.loads(output_, strict=False) else: texts=self.split_text(text,maxlen) - output_ = self._ner.annotate(texts[0], properties={'annotators': 'tokenize,ssplit,pos,ner', - 'outputFormat': 'json', - "tokenize.untokenizable": "noneDelete"}) + output_ = self._ner.annotate(texts[0] ,properties=properties) if isinstance(output_, str): output_ = json.loads(output_, strict=False) @@ -122,50 +139,24 @@ class StanfordNER(NER): return self.parse_output(output_, []) def parse_output(self, output, pos_tags): - # Pre-Treatment on the output - #print(1) tagged_=[] - _tag_entity = list(_tag_stanford.values()) + _tag_entity = list(_tag_stanford[self._lang].values()) for sentence in output["sentences"]: - #print(sentence.keys()) for w in sentence["tokens"]: if w["ner"] in _tag_entity: tagged_.append([w["originalText"],self.translate_tag(w["ner"])]) - else: - tagged_.append([w["originalText"], w["pos"]]) - return self.add_beg_ending_to_tag(tagged_) + return tagged_ def translate_tag(self,tag): - if tag == _tag_stanford["pers"]: + if tag == _tag_stanford[self._lang]["pers"]: return NER._unified_tag["pers"] - if tag ==_tag_stanford["place"]: + if tag ==_tag_stanford[self._lang]["place"]: return NER._unified_tag["place"] - if tag ==_tag_stanford["org"]: + if tag ==_tag_stanford[self._lang]["org"]: return NER._unified_tag["org"] - def add_beg_ending_to_tag(self, tag): - _tag_entity = list(NER._unified_tag.values()) - t = 0 - while t < len(tag): - if tag[t][1] in _tag_entity and t + 1 < len(tag): - - if tag[t + 1][1] == tag[t][1]: - tag[t][1] = "BEG-" + tag[t][1] - t += 1 - if t + 1 >= len(tag): - tag[t][1] = "END-" + tag[t][1] - - while t + 1 < len(tag): - if tag[t + 1][1] != tag[t][1]: - tag[t][1] = "END-" + tag[t][1] - break - else: - t += 1 - t += 1 - return tag - # java -mx600m -cp "*:lib\*" edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier eunews.fr.crf.gz -textFile ../ownCloud/THESE/NoteBookPython/corpus/corpus.txt > test.txt diff --git a/strpython/pipeline.py b/strpython/pipeline.py index 4db01b3..c7cd89f 100644 --- a/strpython/pipeline.py +++ b/strpython/pipeline.py @@ -1,17 +1,21 @@ # coding =utf-8 -from strpython.models.str import STR +import re + +from nltk import word_tokenize +from strpython.models.str import STR from .models.transformation.transform import Generalisation, Expansion -from .nlp.disambiguator.disambiguator import Disambiguator -from .nlp.disambiguator.most_common import MostCommonDisambiguator + +from .nlp.disambiguator import * +from .nlp.ner import * + from .nlp.exception.disambiguator import NotADisambiguatorInstance from .nlp.exception.ner import NotANERInstance from .nlp.exception.tagger import NotATaggerInstance -from .nlp.ner.ner import NER -from .nlp.ner.stanford_ner import StanfordNER + from .nlp.pos_tagger.tagger import Tagger from .nlp.pos_tagger.treetagger import TreeTagger -import json,re + @@ -21,7 +25,7 @@ class Pipeline(object): Run the whole treatement on a given text """ - def __init__(self,lang="english",**kwargs): + def __init__(self,lang="en",**kwargs): 
""" Constructor @@ -29,8 +33,7 @@ class Pipeline(object): :param kwargs: """ self.lang=lang[:2] - self.tagger=kwargs["tagger"] if "tagger" in kwargs else TreeTagger(language=lang) - self.ner = kwargs["ner"] if "ner" in kwargs else StanfordNER(lang=lang[:2]) + self.ner = kwargs["ner"] if "ner" in kwargs else Spacy(lang=lang[:2]) self.disambiguator=kwargs["disambiguator"] if "disambiguator" in kwargs else MostCommonDisambiguator() def parse(self,text,debug=False): @@ -40,30 +43,16 @@ class Pipeline(object): :rtype: list,dict """ output = text - # If specificate POS - if self.tagger.active: - output = self.tagger.tag(output) # NER output = self.ner.identify(output) - # Disambiguation - count,se_identified = self.disambiguator.disambiguate(output, self.lang) + se_identified = self.disambiguator.disambiguate(self.lang,ner_output=output) if debug: print(se_identified) - return count,output,se_identified + return text, se_identified - def set_tagger(self,tagger): - """ - Set POS tagger used in the Pipeline - :param tagger: - :return: - """ - if isinstance(tagger,Tagger): - self.tagger=tagger - else: - raise NotATaggerInstance() def set_ner(self,ner): """ @@ -94,23 +83,22 @@ class Pipeline(object): :param text: :return: STR """ - cooc= kwargs.get("cooc",False) - adj = kwargs.get("adj", True) - inc = kwargs.get("inc", True) toponyms= kwargs.get("toponyms", None) stop_words=kwargs.get("stop_words",[]) + if isinstance(toponyms,list): - se_identified = self.disambiguator.disambiguate_list([top for top in toponyms if not top.lower() in stop_words and not len(re.findall("\d+",top)) != 0 and len(top)>3],self.lang) - count,output ={},text - #print(se_identified) - elif not se_identified: - count,output, se_identified = self.parse(text) + se_identified = self.disambiguator.disambiguate(self.lang,toponyms=[top for top in toponyms if not top.lower() in stop_words and not len(re.findall("\d+",top)) != 0 and len(top)>3]) + input = "" + + elif se_identified: + input, se_identified = self.parse(text) else: - count, output, _ = self.parse(text) - str_=STR(output,se_identified) - str_.build(adj=adj,inc=inc) - str_=self.transform(str_,**kwargs) #TODO : Add count - return str_,count,str_.spatial_entities + input,se_identified=self.parse(text) + + str_=STR(word_tokenize(input),se_identified,toponym_first=True) + str_.build(adj=True,inc=True) + str_=self.transform(str_,**kwargs) + return str_ def transform(self,str_,**kwargs): if not "type_trans" in kwargs: @@ -121,7 +109,3 @@ class Pipeline(object): else: str_=Expansion().transform(str_,**kwargs) return str_ - - -if __name__ == '__main__': - pass \ No newline at end of file -- GitLab