Commit 4eb45b03 authored by Fize Jacques

DEBUG + Joblib

parent 47b65238
Showing with 22 additions and 2031 deletions
@@ -30,9 +30,9 @@ def main(dataset, matrix_sim_dir, raw_graph_dir, selected_graphs, threshold, inc
     from joblib import Parallel,delayed
     #
-    Parallel(n_jobs=4,backend="threading")(delayed(annotate_eval_sample)(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)),size_str) for fn in tqdm(glob.glob(os.path.join(first_step_output,"*.csv")),desc="Annotate sample"))
-    #for fn in tqdm(glob.glob(os.path.join(first_step_output,"*.csv")),desc="Annotate sample"):
-        #annotate_eval_sample(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)),size_str)
+    #Parallel(n_jobs=4,backend="threading")(delayed(annotate_eval_sample)(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)),size_str) for fn in tqdm(glob.glob(os.path.join(first_step_output,"*.csv")),desc="Annotate sample"))
+    for fn in tqdm(glob.glob(os.path.join(first_step_output,"*.csv")),desc="Annotate sample"):
+        annotate_eval_sample(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)),size_str)
     min_carac_dict=None
     if min_carac_fn != "" and os.path.exists(min_carac_fn):
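For context, this first hunk swaps a threaded joblib Parallel call for a plain serial loop over the sampled CSV files, presumably to make errors easier to see while debugging. A minimal sketch of the two equivalent forms, with a hypothetical process_file standing in for annotate_eval_sample:

    import glob, os
    from joblib import Parallel, delayed
    from tqdm import tqdm

    def process_file(path, out_dir):
        # placeholder for annotate_eval_sample(annotater, fn, out_path, size_str)
        return os.path.join(out_dir, os.path.basename(path))

    files = glob.glob(os.path.join("first_step_output", "*.csv"))

    # serial form: exceptions and prints surface immediately, which helps debugging
    results = [process_file(fn, "last_step_output") for fn in tqdm(files, desc="Annotate sample")]

    # joblib form: the threading backend avoids pickling the arguments but shares the GIL,
    # so it mainly pays off when the job releases the GIL (I/O, native code)
    results = Parallel(n_jobs=4, backend="threading")(
        delayed(process_file)(fn, "last_step_output") for fn in tqdm(files, desc="Annotate sample")
    )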
@@ -163,7 +163,7 @@ def annotate_eval_sample(annotater, csv_file, output_file, size_str):
        try:
            return annotater.all(None, None, x.G1, x.G2)
        except Exception as e:
+            print("Error",e)
            return [0, 0, 0, 0,300000,0]
    df["res"] = df.apply(lambda x: foo(x), axis=1)
This source diff could not be displayed because it is too large.
@@ -17,7 +17,7 @@ from strpython.nlp.pos_tagger.tagger import Tagger
 from strpython.models.str import STR
 from strpython.nlp.ner.spacy import Spacy as spacy_ner
-from strpython.nlp.ner.polyglot import Polyglot as poly_ner
+#from strpython.nlp.ner.polyglot import Polyglot as poly_ner
 from strpython.nlp.ner.stanford_ner import StanfordNER as stanford_ner
 from strpython.nlp.disambiguator.wikipedia_cooc import WikipediaDisambiguator as wiki_d
@@ -45,7 +45,7 @@ disambiguator_dict = {
 ner_dict = {
     "spacy": spacy_ner,
-    "polyglot":poly_ner,
+    #"polyglot":poly_ner,
     "stanford":stanford_ner
 }
@@ -88,6 +88,8 @@ if not os.path.exists(args.input_pkl):
 df = pd.read_pickle(args.input_pkl)
+dataset_name = args.input_pkl.replace(".pkl","")
 cols=set(df.columns)
 if not "filename" in cols or not "id_doc" in cols or not "content" in cols or not "lang" in cols:
     raise ValueError("Missing data column in input given")
@@ -96,7 +98,7 @@ languages= np.unique(df.lang.values)
 print("Languages available in the corpus",languages)
 pipelines={
-    lang : Pipeline(lang=lang,ner=ner_dict[args.ner](lang=lang),tagger=Tagger(),disambiguator= disambiguator_dict[args.disambiguator]())
+    lang : Pipeline(lang=lang,ner=ner_dict[args.ner](lang=lang),tagger=Tagger(),disambiguator= disambiguator_dict[args.disambiguator](),corpus_name=dataset_name)
     for lang in tqdm(languages,desc="Load Pipelines model")
 }
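The two hunks above, together with the Pipeline hunks further down, thread a corpus identifier derived from the input pickle's filename into each per-language pipeline. A minimal sketch of that plumbing, with a hypothetical FakePipeline standing in for strpython's Pipeline:

    import numpy as np
    import pandas as pd

    class FakePipeline:
        # stand-in for strpython's Pipeline; only the corpus_name plumbing is shown
        def __init__(self, lang, corpus_name="no_name"):
            self.lang = lang
            self.corpus_name = "{0}_{1}".format(corpus_name, lang)

    input_pkl = "mydata.pkl"                      # hypothetical args.input_pkl value
    dataset_name = input_pkl.replace(".pkl", "")  # -> "mydata"

    df = pd.DataFrame({"lang": ["en", "fr", "en"]})
    languages = np.unique(df.lang.values)

    pipelines = {lang: FakePipeline(lang, corpus_name=dataset_name) for lang in languages}
    print({l: p.corpus_name for l, p in pipelines.items()})  # {'en': 'mydata_en', 'fr': 'mydata_fr'}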
@@ -11,7 +11,7 @@ from .models.str import STR
 from .models.transformation.transform import Generalisation, Expansion
 from .nlp.disambiguator import *
-from .nlp.ner import *
+from .nlp.ner import Spacy, NER
 from .nlp.exception.disambiguator import NotADisambiguatorInstance
 from .nlp.exception.ner import NotANERInstance
@@ -45,6 +45,9 @@ class Pipeline(object):
         self.dict_adj = kwargs.get("dict_adj",None)
         self.dict_inc = kwargs.get("dict_inc",None)
+        self.corpus_name = kwargs["corpus_name"] if "corpus_name" in kwargs else "no_name"
+        self.corpus_name= "{0}_{1}".format(self.corpus_name,self.lang)
     def parse(self,text,debug=False):
         """
@@ -97,10 +100,10 @@ class Pipeline(object):
         """
         if not self.dict_adj and not self.dict_inc:
-            if os.path.exists("adj_dict.json") and yes_or_no(question="Do you want to use previous adj file"):
-                self.dict_adj=json.load(open("adj_dict.json"))
-            if os.path.exists("inc_dict.json") and yes_or_no(question="Do you want to use previous inc file"):
-                self.dict_adj=json.load(open("inc_dict.json"))
+            if os.path.exists("{0}_adj_dict.json".format(self.corpus_name)) and yes_or_no(question="Do you want to use previous adj file"):
+                self.dict_adj=json.load(open("{0}_adj_dict.json".format(self.corpus_name)))
+            if os.path.exists("{0}_inc_dict.json".format(self.corpus_name)) and yes_or_no(question="Do you want to use previous inc file"):
+                self.dict_adj=json.load(open("{0}_inc_dict.json".format(self.corpus_name)))
         if not self.dict_adj and not self.dict_inc:
             r = RelationExtractor(spatial_entities)
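This hunk, together with the saving hunk below, turns the adjacency/inclusion dictionaries into a per-corpus JSON cache keyed by self.corpus_name. A standalone sketch of the load-or-compute-then-save pattern, where compute_dicts is a hypothetical stand-in for the RelationExtractor / fuse_meta_and_geom step; note that in the committed code the second json.load is still assigned to self.dict_adj, which looks like a pre-existing typo where self.dict_inc was probably intended:

    import json, os

    def load_or_compute(corpus_name, compute_dicts):
        # cache files are named "<corpus>_adj_dict.json" / "<corpus>_inc_dict.json"
        adj_path = "{0}_adj_dict.json".format(corpus_name)
        inc_path = "{0}_inc_dict.json".format(corpus_name)
        if os.path.exists(adj_path) and os.path.exists(inc_path):
            with open(adj_path) as f:
                dict_adj = json.load(f)
            with open(inc_path) as f:
                dict_inc = json.load(f)   # the committed code assigns this to dict_adj
            return dict_adj, dict_inc
        # cache miss: compute once, then persist for the next run
        dict_adj, dict_inc = compute_dicts()
        with open(adj_path, "w") as f:
            json.dump(dict_adj, f)
        with open(inc_path, "w") as f:
            json.dump(dict_inc, f)
        return dict_adj, dict_inc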
@@ -109,11 +112,14 @@ class Pipeline(object):
             df_adj, df_inc = r.fuse_meta_and_geom()
             self.dict_adj = df_adj.to_dict()
             self.dict_inc = df_inc.to_dict()
+            # Saving
+            open("{0}_adj_dict.json".format(self.corpus_name),'w').write(json.dumps(self.dict_adj))
+            open("{0}_inc_dict.json".format(self.corpus_name),'w').write(json.dumps(self.dict_inc))
     def pipe_build(self,texts, cpu_count=cpu_count(),**kwargs):
-        text_and_spatial_entities = [self.parse(text) for text in tqdm(texts,desc="Extract spatial entities from the texts")]
+        #text_and_spatial_entities = [self.parse(text) for text in tqdm(texts,desc="Extract spatial entities from the texts")]
+        text_and_spatial_entities = Parallel(n_jobs=4,backend="multiprocessing")(delayed(self.parse)(text) for text in tqdm(texts,desc="Extract spatial entities from the texts") )
         sp_es= []
        for res in text_and_spatial_entities:
            sp_es.extend(list(res[1].values()))
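Finally, pipe_build switches from a serial list comprehension to joblib's multiprocessing backend. A minimal runnable sketch of that call, using a picklable module-level function in place of Pipeline.parse (process-based backends require the callable and its arguments to be picklable, and a __main__ guard when run as a script):

    from joblib import Parallel, delayed
    from tqdm import tqdm

    def parse(text):
        # stand-in for Pipeline.parse: pretend every word is a spatial entity,
        # returning (doc, {id: spatial_entity}) as the loop below expects
        return text, {i: w for i, w in enumerate(text.split())}

    if __name__ == "__main__":
        texts = ["Paris is in France", "Montpellier is near the sea"]
        results = Parallel(n_jobs=2, backend="multiprocessing")(
            delayed(parse)(t) for t in tqdm(texts, desc="Extract spatial entities from the texts")
        )
        sp_es = []
        for res in results:
            sp_es.extend(list(res[1].values()))
        print(sp_es)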