Commit 4eb45b03 authored by Fize Jacques

DEBUG + Joblib

parent 47b65238
Showing with 22 additions and 2031 deletions
@@ -30,9 +30,9 @@ def main(dataset, matrix_sim_dir, raw_graph_dir, selected_graphs, threshold, inc
     from joblib import Parallel,delayed
     #
-    Parallel(n_jobs=4,backend="threading")(delayed(annotate_eval_sample)(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)),size_str) for fn in tqdm(glob.glob(os.path.join(first_step_output,"*.csv")),desc="Annotate sample"))
-    #for fn in tqdm(glob.glob(os.path.join(first_step_output,"*.csv")),desc="Annotate sample"):
-        #annotate_eval_sample(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)),size_str)
+    #Parallel(n_jobs=4,backend="threading")(delayed(annotate_eval_sample)(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)),size_str) for fn in tqdm(glob.glob(os.path.join(first_step_output,"*.csv")),desc="Annotate sample"))
+    for fn in tqdm(glob.glob(os.path.join(first_step_output,"*.csv")),desc="Annotate sample"):
+        annotate_eval_sample(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)),size_str)
     min_carac_dict=None
     if min_carac_fn != "" and os.path.exists(min_carac_fn):
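For context, this first hunk swaps a threaded joblib Parallel call for a plain serial loop over the sampled CSV files, presumably to make errors easier to see while debugging. A minimal sketch of the two equivalent forms, with a hypothetical process_file standing in for annotate_eval_sample:

    import glob, os
    from joblib import Parallel, delayed
    from tqdm import tqdm

    def process_file(path, out_dir):
        # placeholder for annotate_eval_sample(annotater, fn, out_path, size_str)
        return os.path.join(out_dir, os.path.basename(path))

    files = glob.glob(os.path.join("first_step_output", "*.csv"))

    # serial form: exceptions and prints surface immediately, which helps debugging
    results = [process_file(fn, "last_step_output") for fn in tqdm(files, desc="Annotate sample")]

    # joblib form: the threading backend avoids pickling the arguments but shares the GIL,
    # so it mainly pays off when the job releases the GIL (I/O, native code)
    results = Parallel(n_jobs=4, backend="threading")(
        delayed(process_file)(fn, "last_step_output") for fn in tqdm(files, desc="Annotate sample")
    )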
@@ -163,7 +163,7 @@ def annotate_eval_sample(annotater, csv_file, output_file, size_str):
        try:
            return annotater.all(None, None, x.G1, x.G2)
        except Exception as e:
+            print("Error",e)
            return [0, 0, 0, 0,300000,0]
    df["res"] = df.apply(lambda x: foo(x), axis=1)
This source diff could not be displayed because it is too large.
@@ -17,7 +17,7 @@ from strpython.nlp.pos_tagger.tagger import Tagger
 from strpython.models.str import STR
 from strpython.nlp.ner.spacy import Spacy as spacy_ner
-from strpython.nlp.ner.polyglot import Polyglot as poly_ner
+#from strpython.nlp.ner.polyglot import Polyglot as poly_ner
 from strpython.nlp.ner.stanford_ner import StanfordNER as stanford_ner
 from strpython.nlp.disambiguator.wikipedia_cooc import WikipediaDisambiguator as wiki_d
@@ -45,7 +45,7 @@ disambiguator_dict = {
 ner_dict = {
     "spacy": spacy_ner,
-    "polyglot":poly_ner,
+    #"polyglot":poly_ner,
     "stanford":stanford_ner
 }
@@ -88,6 +88,8 @@ if not os.path.exists(args.input_pkl):
 df = pd.read_pickle(args.input_pkl)
+dataset_name = args.input_pkl.replace(".pkl","")
 cols=set(df.columns)
 if not "filename" in cols or not "id_doc" in cols or not "content" in cols or not "lang" in cols:
     raise ValueError("Missing data column in input given")
@@ -96,7 +98,7 @@ languages= np.unique(df.lang.values)
 print("Languages available in the corpus",languages)
 pipelines={
-    lang : Pipeline(lang=lang,ner=ner_dict[args.ner](lang=lang),tagger=Tagger(),disambiguator= disambiguator_dict[args.disambiguator]())
+    lang : Pipeline(lang=lang,ner=ner_dict[args.ner](lang=lang),tagger=Tagger(),disambiguator= disambiguator_dict[args.disambiguator](),corpus_name=dataset_name)
     for lang in tqdm(languages,desc="Load Pipelines model")
 }
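The two hunks above, together with the Pipeline hunks further down, thread a corpus identifier derived from the input pickle's filename into each per-language pipeline. A minimal sketch of that plumbing, with a hypothetical FakePipeline standing in for strpython's Pipeline:

    import numpy as np
    import pandas as pd

    class FakePipeline:
        # stand-in for strpython's Pipeline; only the corpus_name plumbing is shown
        def __init__(self, lang, corpus_name="no_name"):
            self.lang = lang
            self.corpus_name = "{0}_{1}".format(corpus_name, lang)

    input_pkl = "mydata.pkl"                      # hypothetical args.input_pkl value
    dataset_name = input_pkl.replace(".pkl", "")  # -> "mydata"

    df = pd.DataFrame({"lang": ["en", "fr", "en"]})
    languages = np.unique(df.lang.values)

    pipelines = {lang: FakePipeline(lang, corpus_name=dataset_name) for lang in languages}
    print({l: p.corpus_name for l, p in pipelines.items()})  # {'en': 'mydata_en', 'fr': 'mydata_fr'}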
@@ -11,7 +11,7 @@ from .models.str import STR
 from .models.transformation.transform import Generalisation, Expansion
 from .nlp.disambiguator import *
-from .nlp.ner import *
+from .nlp.ner import Spacy, NER
 from .nlp.exception.disambiguator import NotADisambiguatorInstance
 from .nlp.exception.ner import NotANERInstance
@@ -45,6 +45,9 @@ class Pipeline(object):
         self.dict_adj = kwargs.get("dict_adj",None)
         self.dict_inc = kwargs.get("dict_inc",None)
+        self.corpus_name = kwargs["corpus_name"] if "corpus_name" in kwargs else "no_name"
+        self.corpus_name= "{0}_{1}".format(self.corpus_name,self.lang)
     def parse(self,text,debug=False):
         """
@@ -97,10 +100,10 @@ class Pipeline(object):
         """
         if not self.dict_adj and not self.dict_inc:
-            if os.path.exists("adj_dict.json") and yes_or_no(question="Do you want to use previous adj file"):
-                self.dict_adj=json.load(open("adj_dict.json"))
-            if os.path.exists("inc_dict.json") and yes_or_no(question="Do you want to use previous inc file"):
-                self.dict_adj=json.load(open("inc_dict.json"))
+            if os.path.exists("{0}_adj_dict.json".format(self.corpus_name)) and yes_or_no(question="Do you want to use previous adj file"):
+                self.dict_adj=json.load(open("{0}_adj_dict.json".format(self.corpus_name)))
+            if os.path.exists("{0}_inc_dict.json".format(self.corpus_name)) and yes_or_no(question="Do you want to use previous inc file"):
+                self.dict_adj=json.load(open("{0}_inc_dict.json".format(self.corpus_name)))
         if not self.dict_adj and not self.dict_inc:
             r = RelationExtractor(spatial_entities)
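This hunk, together with the saving hunk below, turns the adjacency/inclusion dictionaries into a per-corpus JSON cache keyed by self.corpus_name. A standalone sketch of the load-or-compute-then-save pattern, where compute_dicts is a hypothetical stand-in for the RelationExtractor / fuse_meta_and_geom step; note that in the committed code the second json.load is still assigned to self.dict_adj, which looks like a pre-existing typo where self.dict_inc was probably intended:

    import json, os

    def load_or_compute(corpus_name, compute_dicts):
        # cache files are named "<corpus>_adj_dict.json" / "<corpus>_inc_dict.json"
        adj_path = "{0}_adj_dict.json".format(corpus_name)
        inc_path = "{0}_inc_dict.json".format(corpus_name)
        if os.path.exists(adj_path) and os.path.exists(inc_path):
            with open(adj_path) as f:
                dict_adj = json.load(f)
            with open(inc_path) as f:
                dict_inc = json.load(f)   # the committed code assigns this to dict_adj
            return dict_adj, dict_inc
        # cache miss: compute once, then persist for the next run
        dict_adj, dict_inc = compute_dicts()
        with open(adj_path, "w") as f:
            json.dump(dict_adj, f)
        with open(inc_path, "w") as f:
            json.dump(dict_inc, f)
        return dict_adj, dict_inc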
@@ -109,11 +112,14 @@ class Pipeline(object):
             df_adj, df_inc = r.fuse_meta_and_geom()
             self.dict_adj = df_adj.to_dict()
             self.dict_inc = df_inc.to_dict()
+            # Saving
+            open("{0}_adj_dict.json".format(self.corpus_name),'w').write(json.dumps(self.dict_adj))
+            open("{0}_inc_dict.json".format(self.corpus_name),'w').write(json.dumps(self.dict_inc))
     def pipe_build(self,texts, cpu_count=cpu_count(),**kwargs):
-        text_and_spatial_entities = [self.parse(text) for text in tqdm(texts,desc="Extract spatial entities from the texts")]
+        #text_and_spatial_entities = [self.parse(text) for text in tqdm(texts,desc="Extract spatial entities from the texts")]
+        text_and_spatial_entities = Parallel(n_jobs=4,backend="multiprocessing")(delayed(self.parse)(text) for text in tqdm(texts,desc="Extract spatial entities from the texts") )
         sp_es= []
        for res in text_and_spatial_entities:
            sp_es.extend(list(res[1].values()))
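Finally, pipe_build switches from a serial list comprehension to joblib's multiprocessing backend. A minimal runnable sketch of that call, using a picklable module-level function in place of Pipeline.parse (process-based backends require the callable and its arguments to be picklable, and a __main__ guard when run as a script):

    from joblib import Parallel, delayed
    from tqdm import tqdm

    def parse(text):
        # stand-in for Pipeline.parse: pretend every word is a spatial entity,
        # returning (doc, {id: spatial_entity}) as the loop below expects
        return text, {i: w for i, w in enumerate(text.split())}

    if __name__ == "__main__":
        texts = ["Paris is in France", "Montpellier is near the sea"]
        results = Parallel(n_jobs=2, backend="multiprocessing")(
            delayed(parse)(t) for t in tqdm(texts, desc="Extract spatial entities from the texts")
        )
        sp_es = []
        for res in results:
            sp_es.extend(list(res[1].values()))
        print(sp_es)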