From 04745dce851afaf5db906fff1a68f6c677bff532 Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Thu, 31 Jan 2019 13:56:08 +0100
Subject: [PATCH] debug

---
 strpython/helpers/geodict_helpers.py | 11 ++++++--
 strpython/models/word2vec.py         | 39 ++++++++++++++++------------
 2 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/strpython/helpers/geodict_helpers.py b/strpython/helpers/geodict_helpers.py
index 9ac66f9..4d51429 100644
--- a/strpython/helpers/geodict_helpers.py
+++ b/strpython/helpers/geodict_helpers.py
@@ -65,8 +65,15 @@ def get_top_candidate(label, lang, n=5):
     if n < 4:
         n = 4
     res=gazetteer.get_by_label(label,lang,size=n-3,score=False)
-    res.extend(gazetteer.get_n_label_similar(label,lang,n=1))
-    res.extend(gazetteer.get_n_alias_similar(label, lang, n=1))
+    try:
+        res.extend(gazetteer.get_n_label_similar(label,lang,n=1))
+    except Exception:
+        pass
+    
+    try:
+        res.extend(gazetteer.get_n_alias_similar(label, lang, n=1))
+    except Exception:
+        pass
     res.append(get_most_common_id_v3(label, lang))
 
     return res
diff --git a/strpython/models/word2vec.py b/strpython/models/word2vec.py
index 31bc398..c0904f7 100644
--- a/strpython/models/word2vec.py
+++ b/strpython/models/word2vec.py
@@ -3,7 +3,7 @@
 # -*- coding: utf-8 -*-
 
 from glob import glob
-
+from tqdm import tqdm
 import numpy as np
 from gensim.models.word2vec import Word2Vec
 from polyglot.text import Text
@@ -147,25 +147,32 @@ if __name__ == "__main__":
     texts = [open(f).read() for f in files]
     sentences = []
     # Classic tokenization of sentences
-    for f in files:
-        text = open(f).read()
+    for f in tqdm(texts):
+        text = f
         if not text: continue
-        text = Text(text)
+        try:
+            text = Text(text)
 
-        for s in text.sentences:
-            tokens = []
-            for t in s.tokens: tokens.append(t.lower())
-            sentences.append(tokens)
+            for s in text.sentences:
+                tokens = []
+                for t in s.tokens: tokens.append(t.lower())
+                sentences.append(tokens)
+        except Exception:
+            pass
 
     # Add compound word version of sentences
-    for t in texts:
+    for t in tqdm(texts):
         if not t: continue
-        nlp_o = nlp.annotate(t, properties={'annotators': 'tokenize,ssplit,depparse', 'outputFormat': 'json'})
-        dependenc = filter_dependencies(getDependant(nlp_o))
-        dependenc2 = filter_dependenciesV1(getDependant(nlp_o))
-        sentences.extend(transformed_sentences(nlp_o, dependenc))  # extend compound word
-        sentences.extend(transformed_sentences(nlp_o, dependenc2))  # classic compound word
-
+        try:
+            nlp_o = nlp.annotate(t, properties={'annotators': 'tokenize,ssplit,depparse', 'outputFormat': 'json'})
+            dependenc = filter_dependencies(getDependant(nlp_o))
+            dependenc2 = filter_dependenciesV1(getDependant(nlp_o))
+            sentences.extend(transformed_sentences(nlp_o, dependenc))  # extend compound word
+            sentences.extend(transformed_sentences(nlp_o, dependenc2))  # classic compound word
+
+        except Exception:
+            pass
+        
     model = Word2Vec(sentences, min_count=10)
-    model.save("resources/w2v_model_epi.w2v")
+    model.save("w2v_model_epi.w2v")
 
-- 
GitLab