diff --git a/strpython/helpers/geodict_helpers.py b/strpython/helpers/geodict_helpers.py
index 9ac66f95646a3937745e3c71b882015939f13c87..4d514299fdc3e1417fae93ae4386ace90fedf476 100644
--- a/strpython/helpers/geodict_helpers.py
+++ b/strpython/helpers/geodict_helpers.py
@@ -65,8 +65,15 @@ def get_top_candidate(label, lang, n=5):
     if n < 4:
         n = 4
     res=gazetteer.get_by_label(label,lang,size=n-3,score=False)
-    res.extend(gazetteer.get_n_label_similar(label,lang,n=1))
-    res.extend(gazetteer.get_n_alias_similar(label, lang, n=1))
+    try:
+        res.extend(gazetteer.get_n_label_similar(label,lang,n=1))
+    except Exception:  # ignore failed label-similarity lookups
+        pass
+
+    try:
+        res.extend(gazetteer.get_n_alias_similar(label, lang, n=1))
+    except Exception:  # ignore failed alias-similarity lookups
+        pass
     res.append(get_most_common_id_v3(label, lang))
     return res
 
diff --git a/strpython/models/word2vec.py b/strpython/models/word2vec.py
index 31bc398b2df27d5b42063bce4dba77c232deb8b7..c0904f7514f64eec46f32f1371c64b5bf878ead2 100644
--- a/strpython/models/word2vec.py
+++ b/strpython/models/word2vec.py
@@ -3,7 +3,7 @@
 # -*- coding: utf-8 -*-
 
 from glob import glob
-
+from tqdm import tqdm
 import numpy as np
 from gensim.models.word2vec import Word2Vec
 from polyglot.text import Text
@@ -147,25 +147,31 @@ if __name__ == "__main__":
     texts = [open(f).read() for f in files]
     sentences = []
     # Classic tokenization of sentences
-    for f in files:
-        text = open(f).read()
+    for f in tqdm(texts):
+        text = f
         if not text:
             continue
-        text = Text(text)
-        for s in text.sentences:
-            tokens = []
-            for t in s.tokens: tokens.append(t.lower())
-            sentences.append(tokens)
+        try:
+            text = Text(text)
+            for s in text.sentences:
+                tokens = []
+                for t in s.tokens: tokens.append(t.lower())
+                sentences.append(tokens)
+        except Exception:  # skip texts polyglot cannot process
+            pass
 
     # Add compound word version of sentences
-    for t in texts:
+    for t in tqdm(texts):
         if not t:
             continue
-        nlp_o = nlp.annotate(t, properties={'annotators': 'tokenize,ssplit,depparse', 'outputFormat': 'json'})
-        dependenc = filter_dependencies(getDependant(nlp_o))
-        dependenc2 = filter_dependenciesV1(getDependant(nlp_o))
-        sentences.extend(transformed_sentences(nlp_o, dependenc))  # extend compound word
-        sentences.extend(transformed_sentences(nlp_o, dependenc2))  # classic compound word
-
+        try:
+            nlp_o = nlp.annotate(t, properties={'annotators': 'tokenize,ssplit,depparse', 'outputFormat': 'json'})
+            dependenc = filter_dependencies(getDependant(nlp_o))
+            dependenc2 = filter_dependenciesV1(getDependant(nlp_o))
+            sentences.extend(transformed_sentences(nlp_o, dependenc))  # extend compound word
+            sentences.extend(transformed_sentences(nlp_o, dependenc2))  # classic compound word
+        except Exception:  # skip texts CoreNLP fails to annotate
+            pass
+
     model = Word2Vec(sentences, min_count=10)
-    model.save("resources/w2v_model_epi.w2v")
+    model.save("w2v_model_epi.w2v")