Commit 04745dce authored by Fize Jacques

debug

parent d191d372
Showing with 32 additions and 18 deletions
@@ -65,8 +65,15 @@ def get_top_candidate(label, lang, n=5):
     if n < 4:
         n = 4
     res=gazetteer.get_by_label(label,lang,size=n-3,score=False)
-    res.extend(gazetteer.get_n_label_similar(label,lang,n=1))
-    res.extend(gazetteer.get_n_alias_similar(label, lang, n=1))
+    try:
+        res.extend(gazetteer.get_n_label_similar(label,lang,n=1))
+    except:
+        pass
+    try:
+        res.extend(gazetteer.get_n_alias_similar(label, lang, n=1))
+    except:
+        pass
     res.append(get_most_common_id_v3(label, lang))
     return res
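The hunk above makes the two similarity lookups best-effort: if either gazetteer query fails, get_top_candidate still returns the exact-label and most-common candidates. Below is a minimal sketch (not from the repository) of the same pattern factored into a helper; safe_extend, the logger, and the Exception-level catch are illustrative choices, and the gazetteer calls are only assumed from the diff.

import logging

logger = logging.getLogger(__name__)

def safe_extend(candidates, fetch, *args, **kwargs):
    # Best-effort candidate lookup: a failing query (timeout, missing index, ...)
    # is logged and skipped instead of aborting the whole retrieval.
    try:
        candidates.extend(fetch(*args, **kwargs))
    except Exception as err:  # narrower than a bare except, so KeyboardInterrupt still propagates
        logger.debug("lookup %s failed: %s", getattr(fetch, "__name__", fetch), err)
    return candidates

# Hypothetical usage inside get_top_candidate:
# safe_extend(res, gazetteer.get_n_label_similar, label, lang, n=1)
# safe_extend(res, gazetteer.get_n_alias_similar, label, lang, n=1)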
@@ -3,7 +3,7 @@
 # -*- coding: utf-8 -*-
 from glob import glob
+from tqdm import tqdm
 import numpy as np
 from gensim.models.word2vec import Word2Vec
 from polyglot.text import Text
@@ -147,25 +147,32 @@ if __name__ == "__main__":
     texts = [open(f).read() for f in files]
     sentences = []
     # Classic tokenization of sentences
-    for f in files:
-        text = open(f).read()
+    for f in tqdm(texts):
+        text = f
         if not text: continue
-        text = Text(text)
-        for s in text.sentences:
-            tokens = []
-            for t in s.tokens: tokens.append(t.lower())
-            sentences.append(tokens)
+        try:
+            text = Text(text)
+            for s in text.sentences:
+                tokens = []
+                for t in s.tokens: tokens.append(t.lower())
+                sentences.append(tokens)
+        except:
+            pass
     # Add compound word version of sentences
-    for t in texts:
+    for t in tqdm(texts):
         if not t: continue
-        nlp_o = nlp.annotate(t, properties={'annotators': 'tokenize,ssplit,depparse', 'outputFormat': 'json'})
-        dependenc = filter_dependencies(getDependant(nlp_o))
-        dependenc2 = filter_dependenciesV1(getDependant(nlp_o))
-        sentences.extend(transformed_sentences(nlp_o, dependenc)) # extend compound word
-        sentences.extend(transformed_sentences(nlp_o, dependenc2)) # classic compound word
+        try:
+            nlp_o = nlp.annotate(t, properties={'annotators': 'tokenize,ssplit,depparse', 'outputFormat': 'json'})
+            dependenc = filter_dependencies(getDependant(nlp_o))
+            dependenc2 = filter_dependenciesV1(getDependant(nlp_o))
+            sentences.extend(transformed_sentences(nlp_o, dependenc)) # extend compound word
+            sentences.extend(transformed_sentences(nlp_o, dependenc2)) # classic compound word
+        except Exception:
+            pass
     model = Word2Vec(sentences, min_count=10)
-    model.save("resources/w2v_model_epi.w2v")
+    model.save("w2v_model_epi.w2v")