From c6185b506bf0c7a5613868e85d1f89f68bfa9329 Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Thu, 4 Jul 2019 15:42:25 +0200
Subject: [PATCH] Add Flair NER, debug agrovoc matcher, add LaTeX network
 output for STR

---
 requirements.txt                             |  2 +
 strpython/helpers/terminology/matcher.py     |  8 ++-
 strpython/models/str.py                      | 65 +++++++++++++++--
 strpython/nlp/disambiguator/disambiguator.py |  4 +-
 strpython/nlp/ner/flair.py                   | 73 ++++++++++++++++++++
 strpython/pipeline.py                        | 15 ++--
 6 files changed, 154 insertions(+), 13 deletions(-)
 create mode 100644 strpython/nlp/ner/flair.py

diff --git a/requirements.txt b/requirements.txt
index 4a432f8..ed619fe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,3 +15,5 @@ PyICU
 pycld2
 morfessor
 textblob
+pyperclip
+jellyfish
diff --git a/strpython/helpers/terminology/matcher.py b/strpython/helpers/terminology/matcher.py
index e976c26..6d83210 100644
--- a/strpython/helpers/terminology/matcher.py
+++ b/strpython/helpers/terminology/matcher.py
@@ -93,14 +93,18 @@ def matcher_agrovoc( lang):
         TerminologyMatcher
             matcher
     """
-    agrovoc_vocab = pd.read_csv(os.path.join(package_directory,"resources/terminology/agrovoc/agrovoc_cleaned.csv"))
+    agrovoc_vocab = pd.read_csv(os.path.join(package_directory,"resources/terminology/agrovoc/agrovoc_cleaned2.csv"))
+    #indexes_to_ignore = json.load(open(os.path.join(package_directory,"resources/terminology/agrovoc/entry_to_ignore.json")))
+
     agrovoc_vocab["preferred_label_new"] = agrovoc_vocab["preferred_label_new"].apply(
         lambda x: safe_execute({}, Exception, json.loads, x.replace("\'", "\"")))
     agrovoc_vocab["label_lang"] = agrovoc_vocab["preferred_label_new"].apply(
         lambda x: str(resolv_a(x[lang]) if lang in x else np.nan).strip().lower())
     agrovoc_vocab=agrovoc_vocab[~pd.isna(agrovoc_vocab["label_lang"])]
+    #agrovoc_vocab["syns"] = agrovoc_vocab.Synonyms.apply(lambda x : [] if pd.isna(x) else x.split("|"))
 
-    return TerminologyMatcher(agrovoc_vocab["label_lang"].values.tolist())
+    terminology = np.array(agrovoc_vocab["label_lang"].values.tolist())
+    return TerminologyMatcher(terminology.tolist())
 
 def matcher_biotex(lang):
     """
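A note on the matcher_agrovoc hunk above: the quote swap before json.loads is the fragile step, since the AGROVOC CSV stores per-language labels as a stringified Python dict (single quotes), which is not valid JSON. A minimal standalone sketch of what one row goes through; the sample value and the safe_execute/resolv_a stand-ins are hypothetical, only the transformation itself mirrors the hunk:

    import json
    import numpy as np

    # Hypothetical cell from agrovoc_cleaned2.csv: a stringified Python dict
    # mapping language codes to candidate labels.
    raw = "{'en': ['maize'], 'fr': ['maïs']}"

    def safe_execute(default, exception, func, *args):
        # Stand-in for strpython's safe_execute: return `default` when
        # `func(*args)` raises `exception`.
        try:
            return func(*args)
        except exception:
            return default

    def resolv_a(labels):
        # Stand-in for resolv_a: collapse a list of candidate labels to one.
        return labels[0] if isinstance(labels, list) else labels

    parsed = safe_execute({}, Exception, json.loads, raw.replace("'", '"'))
    lang = "fr"
    label = str(resolv_a(parsed[lang]) if lang in parsed else np.nan).strip().lower()
    print(label)  # -> maïs

The {} fallback is doing real work: any label containing an apostrophe (e.g. "Côte d'Ivoire") breaks the naive quote swap and the whole row degrades to {}. Note also that str(np.nan) yields the literal string "nan", which pd.isna does not flag, so the ~pd.isna filter in the hunk appears to catch only cells that were already missing before the apply.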
diff --git a/strpython/models/str.py b/strpython/models/str.py
index 1aeab90..e091a88 100644
--- a/strpython/models/str.py
+++ b/strpython/models/str.py
@@ -25,6 +25,11 @@ from .spatial_relation import get_spatial_relations
 from joblib import Parallel,delayed
 from strpython.helpers.collision import getGEO
 
+import tempfile
+from network2tikz import plot
+from networkx.drawing.nx_agraph import graphviz_layout
+import pyperclip
+
 max_int = 1e6
 
 def get_inclusion_chain(id_, prop):
@@ -215,6 +220,27 @@ class STR(object):
         self.spatial_entities[id] = label
         self.graph.add_node(id, label=label,type="S_E")
 
+    def remove_spatial_entity(self, id):
+        """
+        Remove a spatial entity from the current STR
+
+        Parameters
+        ----------
+        id : str
+            identifier of the spatial entity in Geodict;
+            the entity is dropped from both the entity
+            index and the underlying graph if present
+
+        """
+        if id in self.spatial_entities:
+            del self.spatial_entities[id]
+        if id in self.graph:
+            self.graph.remove_node(id)
+
+    def drop_zero_degree_entities(self):
+        for n in list(self.spatial_entities.keys()):
+            if self.graph.degree(n) < 1:
+                self.remove_spatial_entity(n)
 
     def add_spatial_entities(self, ids: list, labels: list = []):
         """
@@ -598,7 +624,7 @@ class STR(object):
 
         return map
 
-    def map_projection(self, plt=False):
+    def map_projection(self, plt_=False):
         """
         Return a matplotlib figure of the STR
 
@@ -649,8 +675,7 @@ class STR(object):
         gpd.GeoSeries(points).plot(ax=base, marker='o', markersize=5, color="blue")
         gpd.GeoSeries(lines_adj).plot(ax=base, color="green")
         gpd.GeoSeries(lines_inc).plot(ax=base, color="red")
-
-        if not plt:
+        if not plt_:
             return base
 
         plt.show()
@@ -661,7 +686,10 @@ class STR(object):
         import matplotlib.pyplot as plt
         plt.figure(figsize=figsize)
         G = self.graph.copy()
-        pos = layout_func(G, scale=scale)
+        try:
+            pos = layout_func(G, scale=scale)
+        except TypeError:  # some layout functions do not accept `scale`
+            pos = layout_func(G)
         #pos = nx.layout.shell_layout(ext_2_t.graph)
 
         nodes = list(G.nodes(data=True))
@@ -692,7 +720,36 @@ class STR(object):
         else:
             return plt.gca()
 
+    def to_latex(self, to_clipboard=False):
+        def get_color(x):
+            if x == "S_E":
+                return "blue_tikznetw"
+            else:
+                return "red_tikznetw"
+
+        color_data = """\\usepackage{xcolor}\n\\definecolor{red_tikznetw}{HTML}{D91E18}\n\\definecolor{blue_tikznetw}{HTML}{4183D7}"""
+
+        G = self.graph.copy()
+        fn = tempfile.NamedTemporaryFile().name
+        plot(G,
+             filename=fn,
+             type="tex",
+             layout=graphviz_layout(G),
+             vertex_label=dict(G.nodes(data="label")),
+             vertex_color={k: get_color(v) for k, v in dict(G.nodes(data="type")).items()},
+             edge_color=[ed[-1] for ed in list(G.edges(data="color"))],
+             vertex_label_position='below',
+             canvas=(10, 10),
+             node_label_style="{\\bfseries}"
+             )
+        tex_data = open(fn).read().split("\n")
+        tex_data.insert(2, color_data)
+        tex_data = "\n".join(tex_data)
+
+        if not to_clipboard:
+            return tex_data
+        pyperclip.copy(tex_data)
 
 
     def __len__(self):
         return len(self.spatial_entities)
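STR.to_latex above renders the graph with network2tikz into a temporary .tex file, then splices a color preamble into the generated source by hand. The same round trip on a bare networkx graph, as a sketch: the node and edge attributes follow the STR conventions visible in this diff, and it needs network2tikz plus pygraphviz (for graphviz_layout) installed, neither of which this patch adds to requirements.txt (only pyperclip and jellyfish are new there):

    import tempfile

    import networkx as nx
    from network2tikz import plot
    from networkx.drawing.nx_agraph import graphviz_layout

    # Toy graph mimicking an STR: labelled spatial entities, colored edges.
    G = nx.DiGraph()
    G.add_node("GD1", label="Montpellier", type="S_E")
    G.add_node("GD2", label="France", type="S_E")
    G.add_edge("GD1", "GD2", color="red")  # inclusion edges are drawn red

    fn = tempfile.NamedTemporaryFile(suffix=".tex").name
    plot(G,
         filename=fn,
         type="tex",
         layout=graphviz_layout(G),
         vertex_label=dict(G.nodes(data="label")),
         edge_color=[ed[-1] for ed in G.edges(data="color")],
         canvas=(10, 10))

    tex = open(fn).read().split("\n")
    tex.insert(2, "\\usepackage{xcolor}")  # same splice as STR.to_latex
    print("\n".join(tex))

Two caveats worth flagging in the method itself: it returns None when to_clipboard=True, so a caller cannot both copy and keep the string, and the edge_color comprehension assumes every edge carries a color attribute (edges without one yield None).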
diff --git a/strpython/nlp/disambiguator/disambiguator.py b/strpython/nlp/disambiguator/disambiguator.py
index d80eaf4..422c8fd 100644
--- a/strpython/nlp/disambiguator/disambiguator.py
+++ b/strpython/nlp/disambiguator/disambiguator.py
@@ -33,7 +33,7 @@ class Disambiguator(object):
             {toponym : geodictID}
         """
         if not toponyms:
-            if isinstance(ner_output, np.ndarray) and len(ner_output.shape) == 2 and ner_output.shape[1] == 2:
+            if isinstance(ner_output, np.ndarray) and ner_output.shape[1] == 2:
                 toponyms = self.parse_ner_output(ner_output)
             elif len(np.asarray(ner_output).shape) != 2:
                 return {}
@@ -89,4 +89,6 @@ class Disambiguator(object):
 
 
     def parse_ner_output(self,ner_output):
+        if not isinstance(ner_output,np.ndarray):
+            ner_output = np.asarray(ner_output)
         return [toponym[0] if isinstance(toponym[0],str) else " ".join(toponym[0]) for toponym in ner_output[ner_output[:,1] == NER._unified_tag["place"]]]
\ No newline at end of file
diff --git a/strpython/nlp/ner/flair.py b/strpython/nlp/ner/flair.py
new file mode 100644
index 0000000..6baefe1
--- /dev/null
+++ b/strpython/nlp/ner/flair.py
@@ -0,0 +1,73 @@
+# coding = utf-8
+"""Flair-based named entity recognition wrapper."""
+
+from flair.data import Sentence
+from flair.models import SequenceTagger
+
+import numpy as np
+
+from .ner import NER
+from ..exception.language import LanguageNotAvailable
+
+_flair_available_language = ["fr", "en", "es", "de"]
+
+_tag_flair = {
+    "place": ["LOC"],  # small quirk: "place" maps to a list of Flair tags
+    "pers": "PER",
+    "org": "ORG"
+}
+
+all_tags = ["LOC", "PER", "ORG"]
+
+
+class Flair(NER):
+    """
+    Python wrapper for the Flair sequence tagger
+    """
+
+    def __init__(self, lang="fr"):
+        NER.__init__(self, lang)
+
+        if self._lang not in _flair_available_language:
+            raise LanguageNotAvailable(self._lang, self)
+
+        self._ner = SequenceTagger.load("{0}-ner".format(self._lang))
+
+    def split_text(self, text, maxlen=50000):
+        texts = text.split(".")
+        phrases_given = []
+        c = 0
+        current_phrase = ""
+        for t in texts:
+            if c + len(t) + 1 < maxlen:
+                current_phrase += "." + t
+                c += len(t) + 1
+            else:
+                phrases_given.append(current_phrase)
+                current_phrase, c = "." + t, len(t) + 1
+        if current_phrase:
+            phrases_given.append(current_phrase)
+        return phrases_given
+
+    def identify(self, text=None):
+        # Chunk long inputs so Flair never receives a huge text at once.
+        if len(text) > 10000:
+            output_ = []
+            for t in self.split_text(text, 10000):
+                sentence = Sentence(t)
+                self._ner.predict(sentence)
+                output_.extend([[e.text, self.translate_tag(e.tag)] for e in sentence.get_spans('ner')])
+
+        else:
+            sentence = Sentence(text)
+            self._ner.predict(sentence)
+            output_ = [[e.text, self.translate_tag(e.tag)] for e in sentence.get_spans('ner')]
+        return np.array(output_)
+
+    def translate_tag(self, tag):
+        if tag == _tag_flair["pers"]:
+            return NER._unified_tag["pers"]
+        if tag in _tag_flair["place"]:
+            return NER._unified_tag["place"]
+        if tag == _tag_flair["org"]:
+            return NER._unified_tag["org"]
diff --git a/strpython/pipeline.py b/strpython/pipeline.py
index 640ca80..b52e132 100644
--- a/strpython/pipeline.py
+++ b/strpython/pipeline.py
@@ -17,7 +17,7 @@ from .nlp.ner import Spacy, NER
 from .nlp.exception.disambiguator import NotADisambiguatorInstance
 from .nlp.exception.ner import NotANERInstance
 
-from multiprocessing import cpu_count
+from multiprocessing import cpu_count as cpu_count_system
 from mytoolbox.env import in_notebook
 
 if in_notebook():
@@ -65,7 +65,8 @@ class Pipeline(object):
         # Disambiguation
         se_identified = self.disambiguator.disambiguate(self.lang,ner_output=output)
         for top_, id in list(se_identified.items()):
-            if not id.startswith("GD") or top_.lower() in stop_words:
+            if not id.startswith("GD") or top_.strip().lower() in stop_words\
+                    or re.match(r"\d+",top_.strip()) or len(re.sub(r"\d+","",top_))<3:
                 del se_identified[top_]
         if debug:
             print(se_identified)
@@ -122,11 +123,13 @@ class Pipeline(object):
         open("{0}_adj_dict.json".format(self.corpus_name),'w').write(json.dumps(data.adjacency))
         open("{0}_inc_dict.json".format(self.corpus_name),'w').write(json.dumps(data.inclusion))
 
-    def pipe_build(self,texts, cpu_count=cpu_count(), **kwargs):
+    def pipe_build(self, texts, **kwargs):
         # Extract Spatial entities
-
+        cpu_count = kwargs.get("cpu_count",cpu_count_system())
         stop_words = kwargs.get("stop_words",[])
-        text_and_spatial_entities = [self.parse(text) for text in tqdm(texts,desc="Extract spatial entities from the texts", disable=(not self.verbose))]
+        stop_words = [s.lower() for s in stop_words]
+
+        text_and_spatial_entities = [self.parse(text,stop_words=stop_words) for text in tqdm(texts,desc="Extract spatial entities from the texts", disable=(not self.verbose))]
 
         # Filter Output
         sp_es= []
@@ -139,7 +142,7 @@ class Pipeline(object):
 
         self.extract_all_relation(sp_es)
 
-        str_s = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.build)(ext[1], **kwargs) for ext in tqdm(text_and_spatial_entities, desc="Build STR", disable=(not self.verbose)))
+        str_s = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.build)(ext[1]) for ext in tqdm(text_and_spatial_entities, desc="Build STR", disable=(not self.verbose)))
         return str_s
 
     def build(self, spatial_entities_identified):
-- 
GitLab
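Taken together, the new wrapper slots in next to the existing Spacy NER. A minimal usage sketch, assuming the matching Flair model ("fr-ner" here) can be downloaded and that NER._unified_tag behaves as in the other wrappers:

    from strpython.nlp.ner.flair import Flair

    ner = Flair(lang="fr")  # loads SequenceTagger.load("fr-ner")
    output = ner.identify("Paris est la capitale de la France.")
    # `output` is an (n, 2) numpy array of [surface form, unified tag] rows;
    # Disambiguator.parse_ner_output later keeps only the place rows.
    print(output)

One behavior to keep in mind: translate_tag only maps LOC, PER and ORG, so spans tagged with anything else (e.g. MISC in some Flair models) come back with a None tag and have to be filtered upstream.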