Commit 4eb45b03 authored by Fize Jacques

DEBUG + Joblib

Switches the evaluation annotation loop from joblib threading to a plain serial loop for easier debugging, adds error printing in annotate_eval_sample, comments out the polyglot NER backend, names the cached adjacency/inclusion relation files after the corpus and language, and parallelises Pipeline.pipe_build with joblib's multiprocessing backend.

parent 47b65238
Showing with 22 additions and 2031 deletions
@@ -30,9 +30,9 @@ def main(dataset, matrix_sim_dir, raw_graph_dir, selected_graphs, threshold, inc
     from joblib import Parallel,delayed
     #
-    Parallel(n_jobs=4,backend="threading")(delayed(annotate_eval_sample)(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)),size_str) for fn in tqdm(glob.glob(os.path.join(first_step_output,"*.csv")),desc="Annotate sample"))
-    #for fn in tqdm(glob.glob(os.path.join(first_step_output,"*.csv")),desc="Annotate sample"):
-    #    annotate_eval_sample(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)),size_str)
+    #Parallel(n_jobs=4,backend="threading")(delayed(annotate_eval_sample)(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)),size_str) for fn in tqdm(glob.glob(os.path.join(first_step_output,"*.csv")),desc="Annotate sample"))
+    for fn in tqdm(glob.glob(os.path.join(first_step_output,"*.csv")),desc="Annotate sample"):
+        annotate_eval_sample(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)),size_str)
     min_carac_dict=None
     if min_carac_fn != "" and os.path.exists(min_carac_fn):
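For context, the hunk swaps which of two equivalent drivers is active: a joblib `Parallel` call with the threading backend, or a plain serial loop that is easier to debug. A minimal sketch of that switch, with hypothetical names (`process_file`, `DEBUG`) standing in for the script's own:

```python
import glob
import os

from joblib import Parallel, delayed
from tqdm import tqdm

DEBUG = True  # hypothetical flag; the commit simply hard-codes the serial branch

def process_file(path, out_dir):
    """Stand-in for annotate_eval_sample: read one CSV, write the result."""
    out_path = os.path.join(out_dir, os.path.basename(path))
    # ... annotate `path` and write to `out_path` ...
    return out_path

files = glob.glob(os.path.join("first_step_output", "*.csv"))
if DEBUG:
    # Serial loop: exceptions surface immediately with a readable traceback.
    results = [process_file(fn, "last_step_output")
               for fn in tqdm(files, desc="Annotate sample")]
else:
    # Threading backend: cheap to start, shares memory, but only speeds
    # things up if the work releases the GIL (I/O, C extensions).
    results = Parallel(n_jobs=4, backend="threading")(
        delayed(process_file)(fn, "last_step_output")
        for fn in tqdm(files, desc="Annotate sample")
    )
```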
@@ -163,7 +163,7 @@ def annotate_eval_sample(annotater, csv_file, output_file, size_str):
        try:
            return annotater.all(None, None, x.G1, x.G2)
        except Exception as e:
+            print("Error",e)
            return [0, 0, 0, 0,300000,0]
    df["res"] = df.apply(lambda x: foo(x), axis=1)
This diff is collapsed.
@@ -17,7 +17,7 @@ from strpython.nlp.pos_tagger.tagger import Tagger
 from strpython.models.str import STR
 from strpython.nlp.ner.spacy import Spacy as spacy_ner
-from strpython.nlp.ner.polyglot import Polyglot as poly_ner
+#from strpython.nlp.ner.polyglot import Polyglot as poly_ner
 from strpython.nlp.ner.stanford_ner import StanfordNER as stanford_ner
 from strpython.nlp.disambiguator.wikipedia_cooc import WikipediaDisambiguator as wiki_d
@@ -45,7 +45,7 @@ disambiguator_dict = {
 ner_dict = {
     "spacy": spacy_ner,
-    "polyglot":poly_ner,
+    #"polyglot":poly_ner,
     "stanford":stanford_ner
 }
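These two hunks disable the polyglot backend by commenting out both the import and its entry in `ner_dict`. If the intent is to avoid polyglot's heavy native dependencies while keeping the registry complete wherever they are installed, a guarded import is a common alternative; a sketch assuming the same module paths as above:

```python
from strpython.nlp.ner.spacy import Spacy as spacy_ner
from strpython.nlp.ner.stanford_ner import StanfordNER as stanford_ner

ner_dict = {
    "spacy": spacy_ner,
    "stanford": stanford_ner,
}

try:
    # Register polyglot only when it (and its native deps) are importable.
    from strpython.nlp.ner.polyglot import Polyglot as poly_ner
    ner_dict["polyglot"] = poly_ner
except ImportError:
    pass  # "polyglot" simply will not be an available NER choice
```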
@@ -88,6 +88,8 @@ if not os.path.exists(args.input_pkl):
 df = pd.read_pickle(args.input_pkl)
+dataset_name = args.input_pkl.replace(".pkl","")
 cols=set(df.columns)
 if not "filename" in cols or not "id_doc" in cols or not "content" in cols or not "lang" in cols:
     raise ValueError("Missing data column in input given")
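Since `cols` is already a set, the chained membership test can be replaced by a set difference that also tells the user which column is missing; a small sketch:

```python
def check_columns(df):
    """Raise a descriptive error if a required column is absent."""
    required = {"filename", "id_doc", "content", "lang"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError("Missing data column(s) in input: {0}".format(sorted(missing)))
```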
@@ -96,7 +98,7 @@ languages= np.unique(df.lang.values)
 print("Languages available in the corpus",languages)
 pipelines={
-    lang : Pipeline(lang=lang,ner=ner_dict[args.ner](lang=lang),tagger=Tagger(),disambiguator= disambiguator_dict[args.disambiguator]())
+    lang : Pipeline(lang=lang,ner=ner_dict[args.ner](lang=lang),tagger=Tagger(),disambiguator= disambiguator_dict[args.disambiguator](),corpus_name=dataset_name)
     for lang in tqdm(languages,desc="Load Pipelines model")
 }
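One caveat on the new `corpus_name=dataset_name` argument: `args.input_pkl.replace(".pkl","")` removes every occurrence of the substring and keeps any leading directories, so a path such as `corpora/news.pkl` yields `corpora/news`, and the `"{0}_adj_dict.json"` cache files built from it later inherit that directory prefix. If that is unwanted, a suffix-only, basename-based derivation is safer; a sketch:

```python
import os

def dataset_name_from(path):
    # "corpora/news.pkl" -> "news"; only the final extension is dropped
    return os.path.splitext(os.path.basename(path))[0]

dataset_name = dataset_name_from("corpora/news.pkl")
print(dataset_name)  # news
```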
@@ -11,7 +11,7 @@ from .models.str import STR
 from .models.transformation.transform import Generalisation, Expansion
 from .nlp.disambiguator import *
-from .nlp.ner import *
+from .nlp.ner import Spacy, NER
 from .nlp.exception.disambiguator import NotADisambiguatorInstance
 from .nlp.exception.ner import NotANERInstance
@@ -45,6 +45,9 @@ class Pipeline(object):
         self.dict_adj = kwargs.get("dict_adj",None)
         self.dict_inc = kwargs.get("dict_inc",None)
+        self.corpus_name = kwargs["corpus_name"] if "corpus_name" in kwargs else "no_name"
+        self.corpus_name= "{0}_{1}".format(self.corpus_name,self.lang)

     def parse(self,text,debug=False):
         """
@@ -97,10 +100,10 @@ class Pipeline(object):
         """
         if not self.dict_adj and not self.dict_inc:
-            if os.path.exists("adj_dict.json") and yes_or_no(question="Do you want to use previous adj file"):
-                self.dict_adj=json.load(open("adj_dict.json"))
-            if os.path.exists("inc_dict.json") and yes_or_no(question="Do you want to use previous inc file"):
-                self.dict_adj=json.load(open("inc_dict.json"))
+            if os.path.exists("{0}_adj_dict.json".format(self.corpus_name)) and yes_or_no(question="Do you want to use previous adj file"):
+                self.dict_adj=json.load(open("{0}_adj_dict.json".format(self.corpus_name)))
+            if os.path.exists("{0}_inc_dict.json".format(self.corpus_name)) and yes_or_no(question="Do you want to use previous inc file"):
+                self.dict_adj=json.load(open("{0}_inc_dict.json".format(self.corpus_name)))

         if not self.dict_adj and not self.dict_inc:
             r = RelationExtractor(spatial_entities)
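Note a pre-existing slip that this hunk carries over: both branches assign to `self.dict_adj`, so an accepted inclusion cache overwrites the adjacency dictionary and `self.dict_inc` stays empty. A hedged sketch of the per-corpus cache lookup with that fixed (`yes_or_no` is the project's own prompt helper, not redefined here):

```python
import json
import os

def load_relation_caches(self):
    """Load per-corpus adjacency/inclusion caches if the user accepts them."""
    adj_fn = "{0}_adj_dict.json".format(self.corpus_name)
    inc_fn = "{0}_inc_dict.json".format(self.corpus_name)
    if os.path.exists(adj_fn) and yes_or_no(question="Do you want to use previous adj file"):
        with open(adj_fn) as f:
            self.dict_adj = json.load(f)
    if os.path.exists(inc_fn) and yes_or_no(question="Do you want to use previous inc file"):
        with open(inc_fn) as f:
            self.dict_inc = json.load(f)  # the commit assigns self.dict_adj here
```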
@@ -109,11 +112,14 @@ class Pipeline(object):
         df_adj, df_inc = r.fuse_meta_and_geom()
         self.dict_adj = df_adj.to_dict()
         self.dict_inc = df_inc.to_dict()
+        # Saving
+        open("{0}_adj_dict.json".format(self.corpus_name),'w').write(json.dumps(self.dict_adj))
+        open("{0}_inc_dict.json".format(self.corpus_name),'w').write(json.dumps(self.dict_inc))

     def pipe_build(self,texts, cpu_count=cpu_count(),**kwargs):
-        text_and_spatial_entities = [self.parse(text) for text in tqdm(texts,desc="Extract spatial entities from the texts")]
+        #text_and_spatial_entities = [self.parse(text) for text in tqdm(texts,desc="Extract spatial entities from the texts")]
+        text_and_spatial_entities = Parallel(n_jobs=4,backend="multiprocessing")(delayed(self.parse)(text) for text in tqdm(texts,desc="Extract spatial entities from the texts") )
         sp_es= []
         for res in text_and_spatial_entities:
             sp_es.extend(list(res[1].values()))
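Two remarks on this last hunk. First, `open(path, 'w').write(...)` leaves closing the file to the garbage collector; a `with` block flushes and closes deterministically. Second, `pipe_build` now uses joblib's multiprocessing backend rather than the threading backend used for annotation: worker processes sidestep the GIL for CPU-bound parsing, but everything shipped to them, including the bound method `self.parse`, must be picklable, and each worker pays process start-up and model-loading costs. A small sketch of both points (the `parse` stub is illustrative):

```python
import json

from joblib import Parallel, delayed

def parse(text):
    # Stand-in for Pipeline.parse; must be picklable (a top-level function,
    # no open handles) to cross the process boundary.
    return text, {"e1": "Montpellier"}

if __name__ == "__main__":  # guard required by multiprocessing on some platforms
    texts = ["doc one", "doc two"]
    results = Parallel(n_jobs=4, backend="multiprocessing")(
        delayed(parse)(t) for t in texts
    )

    # Deterministic saving: the file is flushed and closed on block exit.
    with open("example_adj_dict.json", "w") as f:
        json.dump({"res": [r[1] for r in results]}, f)
```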