From 04745dce851afaf5db906fff1a68f6c677bff532 Mon Sep 17 00:00:00 2001 From: Fize Jacques <jacques.fize@cirad.fr> Date: Thu, 31 Jan 2019 13:56:08 +0100 Subject: [PATCH] debug --- strpython/helpers/geodict_helpers.py | 11 ++++++-- strpython/models/word2vec.py | 39 ++++++++++++++++------------ 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/strpython/helpers/geodict_helpers.py b/strpython/helpers/geodict_helpers.py index 9ac66f9..4d51429 100644 --- a/strpython/helpers/geodict_helpers.py +++ b/strpython/helpers/geodict_helpers.py @@ -65,8 +65,15 @@ def get_top_candidate(label, lang, n=5): if n < 4: n = 4 res=gazetteer.get_by_label(label,lang,size=n-3,score=False) - res.extend(gazetteer.get_n_label_similar(label,lang,n=1)) - res.extend(gazetteer.get_n_alias_similar(label, lang, n=1)) + try: + res.extend(gazetteer.get_n_label_similar(label,lang,n=1)) + except Exception: + pass + + try: + res.extend(gazetteer.get_n_alias_similar(label, lang, n=1)) + except Exception: + pass res.append(get_most_common_id_v3(label, lang)) return res diff --git a/strpython/models/word2vec.py b/strpython/models/word2vec.py index 31bc398..c0904f7 100644 --- a/strpython/models/word2vec.py +++ b/strpython/models/word2vec.py @@ -3,7 +3,7 @@ # -*- coding: utf-8 -*- from glob import glob - +from tqdm import tqdm import numpy as np from gensim.models.word2vec import Word2Vec from polyglot.text import Text @@ -147,25 +147,32 @@ if __name__ == "__main__": texts = [open(f).read() for f in files] sentences = [] # Classic tokenization of sentences - for f in files: - text = open(f).read() + for f in tqdm(texts): + text = f if not text: continue - text = Text(text) + try: + text = Text(text) - for s in text.sentences: - tokens = [] - for t in s.tokens: tokens.append(t.lower()) - sentences.append(tokens) + for s in text.sentences: + tokens = [] + for t in s.tokens: tokens.append(t.lower()) + sentences.append(tokens) + except Exception: + pass # Add compound word version of sentences - for t in texts: + for t in 
tqdm(texts): if not t: continue - nlp_o = nlp.annotate(t, properties={'annotators': 'tokenize,ssplit,depparse', 'outputFormat': 'json'}) - dependenc = filter_dependencies(getDependant(nlp_o)) - dependenc2 = filter_dependenciesV1(getDependant(nlp_o)) - sentences.extend(transformed_sentences(nlp_o, dependenc)) # extend compound word - sentences.extend(transformed_sentences(nlp_o, dependenc2)) # classic compound word - + try: + nlp_o = nlp.annotate(t, properties={'annotators': 'tokenize,ssplit,depparse', 'outputFormat': 'json'}) + dependenc = filter_dependencies(getDependant(nlp_o)) + dependenc2 = filter_dependenciesV1(getDependant(nlp_o)) + sentences.extend(transformed_sentences(nlp_o, dependenc)) # extend compound word + sentences.extend(transformed_sentences(nlp_o, dependenc2)) # classic compound word + + except Exception: + pass + model = Word2Vec(sentences, min_count=10) - model.save("resources/w2v_model_epi.w2v") + model.save("w2v_model_epi.w2v") -- GitLab