From c6185b506bf0c7a5613868e85d1f89f68bfa9329 Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Thu, 4 Jul 2019 15:42:25 +0200
Subject: [PATCH] Add Flair NER, debug agrovoc matcher, add LaTeX network
 output for STR

---
 requirements.txt                             |  2 +
 strpython/helpers/terminology/matcher.py     |  8 ++-
 strpython/models/str.py                      | 65 +++++++++++++++--
 strpython/nlp/disambiguator/disambiguator.py |  4 +-
 strpython/nlp/ner/flair.py                   | 73 ++++++++++++++++++++
 strpython/pipeline.py                        | 15 ++--
 6 files changed, 154 insertions(+), 13 deletions(-)
 create mode 100644 strpython/nlp/ner/flair.py

diff --git a/requirements.txt b/requirements.txt
index 4a432f8..ed619fe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,3 +15,5 @@ PyICU
 pycld2
 morfessor
 textblob
+pyperclip
+jellyfish
diff --git a/strpython/helpers/terminology/matcher.py b/strpython/helpers/terminology/matcher.py
index e976c26..6d83210 100644
--- a/strpython/helpers/terminology/matcher.py
+++ b/strpython/helpers/terminology/matcher.py
@@ -93,14 +93,18 @@ def matcher_agrovoc( lang):
         TerminologyMatcher
             matcher
     """
-    agrovoc_vocab = pd.read_csv(os.path.join(package_directory,"resources/terminology/agrovoc/agrovoc_cleaned.csv"))
+    agrovoc_vocab = pd.read_csv(os.path.join(package_directory,"resources/terminology/agrovoc/agrovoc_cleaned2.csv"))
+    #indexes_to_ignore = json.load(open(os.path.join(package_directory,"resources/terminology/agrovoc/entry_to_ignore.json")))
+
     agrovoc_vocab["preferred_label_new"] = agrovoc_vocab["preferred_label_new"].apply(
         lambda x: safe_execute({}, Exception, json.loads, x.replace("\'", "\"")))
     agrovoc_vocab["label_lang"] = agrovoc_vocab["preferred_label_new"].apply(
         lambda x: str(resolv_a(x[lang]) if lang in x else np.nan).strip().lower())
     agrovoc_vocab=agrovoc_vocab[~pd.isna(agrovoc_vocab["label_lang"])]
+    #agrovoc_vocab["syns"] = agrovoc_vocab.Synonyms.apply(lambda x : [] if pd.isna(x) else x.split("|"))
 
-    return TerminologyMatcher(agrovoc_vocab["label_lang"].values.tolist())
+    terminology = np.array(agrovoc_vocab["label_lang"].values.tolist())
+    return TerminologyMatcher(terminology.tolist())
 
 def matcher_biotex(lang):
     """
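A note on the matcher_agrovoc hunk above: the quote swap before json.loads is the fragile step, since the AGROVOC CSV stores per-language labels as a stringified Python dict (single quotes), which is not valid JSON. A minimal standalone sketch of what one row goes through; the sample value and the safe_execute/resolv_a stand-ins are hypothetical, only the transformation itself mirrors the hunk:

    import json
    import numpy as np

    # Hypothetical cell from agrovoc_cleaned2.csv: a stringified Python dict
    # mapping language codes to candidate labels.
    raw = "{'en': ['maize'], 'fr': ['maïs']}"

    def safe_execute(default, exception, func, *args):
        # Stand-in for strpython's safe_execute: return `default` when
        # `func(*args)` raises `exception`.
        try:
            return func(*args)
        except exception:
            return default

    def resolv_a(labels):
        # Stand-in for resolv_a: collapse a list of candidate labels to one.
        return labels[0] if isinstance(labels, list) else labels

    parsed = safe_execute({}, Exception, json.loads, raw.replace("'", '"'))
    lang = "fr"
    label = str(resolv_a(parsed[lang]) if lang in parsed else np.nan).strip().lower()
    print(label)  # -> maïs

The {} fallback is doing real work: any label containing an apostrophe (e.g. "Côte d'Ivoire") breaks the naive quote swap and the whole row degrades to {}. Note also that str(np.nan) yields the literal string "nan", which pd.isna does not flag, so the ~pd.isna filter in the hunk appears to catch only cells that were already missing before the apply.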
diff --git a/strpython/models/str.py b/strpython/models/str.py
index 1aeab90..e091a88 100644
--- a/strpython/models/str.py
+++ b/strpython/models/str.py
@@ -25,6 +25,11 @@ from .spatial_relation import get_spatial_relations
 from joblib import Parallel,delayed
 from strpython.helpers.collision import getGEO
 
+import tempfile
+from network2tikz import plot
+from networkx.drawing.nx_agraph import graphviz_layout
+import pyperclip
+
 max_int = 1e6
 
 def get_inclusion_chain(id_, prop):
@@ -215,6 +220,27 @@ class STR(object):
         self.spatial_entities[id] = label
         self.graph.add_node(id, label=label,type="S_E")
 
+    def remove_spatial_entity(self, id):
+        """
+        Remove a spatial entity from the current STR
+
+        Parameters
+        ----------
+        id : str
+            identifier of the spatial entity in Geodict;
+            the entity is dropped from both the entity
+            index and the underlying graph if present
+
+        """
+        if id in self.spatial_entities:
+            del self.spatial_entities[id]
+        if id in self.graph:
+            self.graph.remove_node(id)
+
+    def drop_zero_degree_entities(self):
+        for n in list(self.spatial_entities.keys()):
+            if self.graph.degree(n) < 1:
+                self.remove_spatial_entity(n)
 
     def add_spatial_entities(self, ids: list, labels: list = []):
         """
@@ -598,7 +624,7 @@ class STR(object):
 
         return map
 
-    def map_projection(self, plt=False):
+    def map_projection(self, plt_=False):
         """
         Return a matplotlib figure of the STR
 
@@ -649,8 +675,7 @@ class STR(object):
         gpd.GeoSeries(points).plot(ax=base, marker='o', markersize=5, color="blue")
         gpd.GeoSeries(lines_adj).plot(ax=base, color="green")
         gpd.GeoSeries(lines_inc).plot(ax=base, color="red")
-
-        if not plt:
+        if not plt_:
             return base
 
         plt.show()
@@ -661,7 +686,10 @@ class STR(object):
         import matplotlib.pyplot as plt
         plt.figure(figsize=figsize)
         G = self.graph.copy()
-        pos = layout_func(G, scale=scale)
+        try:
+            pos = layout_func(G, scale=scale)
+        except TypeError:  # some layout functions do not accept `scale`
+            pos = layout_func(G)
         #pos = nx.layout.shell_layout(ext_2_t.graph)
 
         nodes = list(G.nodes(data=True))
@@ -692,7 +720,36 @@ class STR(object):
         else:
             return plt.gca()
 
+    def to_latex(self, to_clipboard=False):
+        def get_color(x):
+            if x == "S_E":
+                return "blue_tikznetw"
+            else:
+                return "red_tikznetw"
+
+        color_data = """\\usepackage{xcolor}\n\\definecolor{red_tikznetw}{HTML}{D91E18}\n\\definecolor{blue_tikznetw}{HTML}{4183D7}"""
+
+        G = self.graph.copy()
+        fn = tempfile.NamedTemporaryFile().name
+        plot(G,
+             filename=fn,
+             type="tex",
+             layout=graphviz_layout(G),
+             vertex_label=dict(G.nodes(data="label")),
+             vertex_color={k: get_color(v) for k, v in dict(G.nodes(data="type")).items()},
+             edge_color=[ed[-1] for ed in list(G.edges(data="color"))],
+             vertex_label_position='below',
+             canvas=(10, 10),
+             node_label_style="{\\bfseries}"
+             )
+        tex_data = open(fn).read().split("\n")
+        tex_data.insert(2, color_data)
+        tex_data = "\n".join(tex_data)
+
+        if not to_clipboard:
+            return tex_data
+        pyperclip.copy(tex_data)
 
 
     def __len__(self):
         return len(self.spatial_entities)
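STR.to_latex above renders the graph with network2tikz into a temporary .tex file, then splices a color preamble into the generated source by hand. The same round trip on a bare networkx graph, as a sketch: the node and edge attributes follow the STR conventions visible in this diff, and it needs network2tikz plus pygraphviz (for graphviz_layout) installed, neither of which this patch adds to requirements.txt (only pyperclip and jellyfish are new there):

    import tempfile

    import networkx as nx
    from network2tikz import plot
    from networkx.drawing.nx_agraph import graphviz_layout

    # Toy graph mimicking an STR: labelled spatial entities, colored edges.
    G = nx.DiGraph()
    G.add_node("GD1", label="Montpellier", type="S_E")
    G.add_node("GD2", label="France", type="S_E")
    G.add_edge("GD1", "GD2", color="red")  # inclusion edges are drawn red

    fn = tempfile.NamedTemporaryFile(suffix=".tex").name
    plot(G,
         filename=fn,
         type="tex",
         layout=graphviz_layout(G),
         vertex_label=dict(G.nodes(data="label")),
         edge_color=[ed[-1] for ed in G.edges(data="color")],
         canvas=(10, 10))

    tex = open(fn).read().split("\n")
    tex.insert(2, "\\usepackage{xcolor}")  # same splice as STR.to_latex
    print("\n".join(tex))

Two caveats worth flagging in the method itself: it returns None when to_clipboard=True, so a caller cannot both copy and keep the string, and the edge_color comprehension assumes every edge carries a color attribute (edges without one yield None).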
diff --git a/strpython/nlp/disambiguator/disambiguator.py b/strpython/nlp/disambiguator/disambiguator.py
index d80eaf4..422c8fd 100644
--- a/strpython/nlp/disambiguator/disambiguator.py
+++ b/strpython/nlp/disambiguator/disambiguator.py
@@ -33,7 +33,7 @@ class Disambiguator(object):
             {toponym : geodictID}
         """
         if not toponyms:
-            if isinstance(ner_output, np.ndarray) and len(ner_output.shape) == 2 and ner_output.shape[1] == 2:
+            if isinstance(ner_output, np.ndarray) and ner_output.shape[1] == 2:
                 toponyms = self.parse_ner_output(ner_output)
             elif len(np.asarray(ner_output).shape) != 2:
                 return {}
@@ -89,4 +89,6 @@ class Disambiguator(object):
 
 
     def parse_ner_output(self,ner_output):
+        if not isinstance(ner_output,np.ndarray):
+            ner_output = np.asarray(ner_output)
         return [toponym[0] if isinstance(toponym[0],str) else " ".join(toponym[0]) for toponym in ner_output[ner_output[:,1] == NER._unified_tag["place"]]]
\ No newline at end of file
diff --git a/strpython/nlp/ner/flair.py b/strpython/nlp/ner/flair.py
new file mode 100644
index 0000000..6baefe1
--- /dev/null
+++ b/strpython/nlp/ner/flair.py
@@ -0,0 +1,73 @@
+# coding = utf-8
+"""Flair-based named entity recognition wrapper."""
+
+from flair.data import Sentence
+from flair.models import SequenceTagger
+
+import numpy as np
+
+from .ner import NER
+from ..exception.language import LanguageNotAvailable
+
+_flair_available_language = ["fr", "en", "es", "de"]
+
+_tag_flair = {
+    "place": ["LOC"],  # small quirk: "place" maps to a list of Flair tags
+    "pers": "PER",
+    "org": "ORG"
+}
+
+all_tags = ["LOC", "PER", "ORG"]
+
+
+class Flair(NER):
+    """
+    Python wrapper for the Flair sequence tagger
+    """
+
+    def __init__(self, lang="fr"):
+        NER.__init__(self, lang)
+
+        if self._lang not in _flair_available_language:
+            raise LanguageNotAvailable(self._lang, self)
+
+        self._ner = SequenceTagger.load("{0}-ner".format(self._lang))
+
+    def split_text(self, text, maxlen=50000):
+        texts = text.split(".")
+        phrases_given = []
+        c = 0
+        current_phrase = ""
+        for t in texts:
+            if c + len(t) + 1 < maxlen:
+                current_phrase += "." + t
+                c += len(t) + 1
+            else:
+                phrases_given.append(current_phrase)
+                current_phrase, c = "." + t, len(t) + 1
+        if current_phrase:
+            phrases_given.append(current_phrase)
+        return phrases_given
+
+    def identify(self, text=None):
+        # Chunk long inputs so Flair never receives a huge text at once.
+        if len(text) > 10000:
+            output_ = []
+            for t in self.split_text(text, 10000):
+                sentence = Sentence(t)
+                self._ner.predict(sentence)
+                output_.extend([[e.text, self.translate_tag(e.tag)] for e in sentence.get_spans('ner')])
+
+        else:
+            sentence = Sentence(text)
+            self._ner.predict(sentence)
+            output_ = [[e.text, self.translate_tag(e.tag)] for e in sentence.get_spans('ner')]
+        return np.array(output_)
+
+    def translate_tag(self, tag):
+        if tag == _tag_flair["pers"]:
+            return NER._unified_tag["pers"]
+        if tag in _tag_flair["place"]:
+            return NER._unified_tag["place"]
+        if tag == _tag_flair["org"]:
+            return NER._unified_tag["org"]
diff --git a/strpython/pipeline.py b/strpython/pipeline.py
index 640ca80..b52e132 100644
--- a/strpython/pipeline.py
+++ b/strpython/pipeline.py
@@ -17,7 +17,7 @@ from .nlp.ner import Spacy, NER
 from .nlp.exception.disambiguator import NotADisambiguatorInstance
 from .nlp.exception.ner import NotANERInstance
 
-from multiprocessing import cpu_count
+from multiprocessing import cpu_count as cpu_count_system
 from mytoolbox.env import in_notebook
 
 if in_notebook():
@@ -65,7 +65,8 @@ class Pipeline(object):
         # Disambiguation
         se_identified = self.disambiguator.disambiguate(self.lang,ner_output=output)
         for top_, id in list(se_identified.items()):
-            if not id.startswith("GD") or top_.lower() in stop_words:
+            if not id.startswith("GD") or top_.strip().lower() in stop_words\
+                    or re.match(r"\d+",top_.strip()) or len(re.sub(r"\d+","",top_))<3:
                 del se_identified[top_]
         if debug:
             print(se_identified)
@@ -122,11 +123,13 @@ class Pipeline(object):
         open("{0}_adj_dict.json".format(self.corpus_name),'w').write(json.dumps(data.adjacency))
         open("{0}_inc_dict.json".format(self.corpus_name),'w').write(json.dumps(data.inclusion))
 
-    def pipe_build(self,texts, cpu_count=cpu_count(), **kwargs):
+    def pipe_build(self, texts, **kwargs):
         # Extract Spatial entities
-
+        cpu_count = kwargs.get("cpu_count",cpu_count_system())
         stop_words = kwargs.get("stop_words",[])
-        text_and_spatial_entities = [self.parse(text) for text in tqdm(texts,desc="Extract spatial entities from the texts", disable=(not self.verbose))]
+        stop_words = [s.lower() for s in stop_words]
+
+        text_and_spatial_entities = [self.parse(text,stop_words=stop_words) for text in tqdm(texts,desc="Extract spatial entities from the texts", disable=(not self.verbose))]
 
         # Filter Output
         sp_es= []
@@ -139,7 +142,7 @@ class Pipeline(object):
 
         self.extract_all_relation(sp_es)
 
-        str_s = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.build)(ext[1], **kwargs) for ext in tqdm(text_and_spatial_entities, desc="Build STR", disable=(not self.verbose)))
+        str_s = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.build)(ext[1]) for ext in tqdm(text_and_spatial_entities, desc="Build STR", disable=(not self.verbose)))
         return str_s
 
     def build(self, spatial_entities_identified):
-- 
GitLab
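Taken together, the new wrapper slots in next to the existing Spacy NER. A minimal usage sketch, assuming the matching Flair model ("fr-ner" here) can be downloaded and that NER._unified_tag behaves as in the other wrappers:

    from strpython.nlp.ner.flair import Flair

    ner = Flair(lang="fr")  # loads SequenceTagger.load("fr-ner")
    output = ner.identify("Paris est la capitale de la France.")
    # `output` is an (n, 2) numpy array of [surface form, unified tag] rows;
    # Disambiguator.parse_ner_output later keeps only the place rows.
    print(output)

One behavior to keep in mind: translate_tag only maps LOC, PER and ORG, so spans tagged with anything else (e.g. MISC in some Flair models) come back with a None tag and have to be filtered upstream.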