From 47b6523821984319871c492d18b13029ca5a93e0 Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Wed, 15 May 2019 12:55:06 +0200
Subject: [PATCH] Parallelize sample annotation, add requirements.txt, switch criterion6 to a Dice coefficient

---
 auto_fill_annotation.py                      | 26 ++++++++++++++
 requirements.txt                             | 13 ++++++++
 setup.py                                     |  1 -
 strpython/eval/automatic_annotation.py       |  6 ++--
 strpython/models/str.py                      |  2 --
 strpython/models/transformation/transform.py |  4 +--
 6 files changed, 36 insertions(+), 16 deletions(-)
 create mode 100644 requirements.txt

diff --git a/auto_fill_annotation.py b/auto_fill_annotation.py
index a06b2f2..f6b0f29 100644
--- a/auto_fill_annotation.py
+++ b/auto_fill_annotation.py
@@ -28,8 +28,11 @@ def main(dataset, matrix_sim_dir, raw_graph_dir, selected_graphs, threshold, inc
     if not os.path.exists(last_step_output):
         os.makedirs(last_step_output)
 
-    for fn in tqdm(glob.glob(os.path.join(first_step_output,"*.csv")),desc="Annotate sample"):
-         annotate_eval_sample(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)),size_str)
+    from joblib import Parallel, delayed
+
+    # Annotate each sample CSV in parallel (4 worker threads).
+    Parallel(n_jobs=4, backend="threading")(delayed(annotate_eval_sample)(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)), size_str)
+                                            for fn in tqdm(glob.glob(os.path.join(first_step_output, "*.csv")), desc="Annotate sample"))
 
     min_carac_dict=None
     if min_carac_fn != "" and os.path.exists(min_carac_fn):
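
Note on the parallel annotation above: a minimal, self-contained sketch of the joblib Parallel/delayed pattern used here, with a placeholder worker and placeholder directory names standing in for annotate_eval_sample and the script's real paths:

    import glob
    import os

    from joblib import Parallel, delayed
    from tqdm import tqdm


    def process_one(path, output_dir):
        # Placeholder worker standing in for annotate_eval_sample().
        return os.path.join(output_dir, os.path.basename(path))


    files = glob.glob(os.path.join("first_step_output", "*.csv"))
    # backend="threading" shares memory and skips pickling the arguments; the
    # default process backend is preferable when the worker is CPU-bound pure Python.
    results = Parallel(n_jobs=4, backend="threading")(
        delayed(process_one)(fn, "last_step_output") for fn in tqdm(files, desc="Annotate sample")
    )
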
@@ -63,12 +66,15 @@ def generate_annotation_dataframe(matrix_sim_dir, selected_graphs, output_dir):
         print("Proceeding...", measure, type_)
         if os.path.exists(os.path.join(output_dir, "{0}_{1}.csv".format(measure, type_))):
             continue
-        df = matrix_to_pandas_dataframe(np.nan_to_num(read_bz2_matrix(fn)),
-                                        selected_graphs,
-                                        measure, type_)
-        if not os.path.exists(output_dir):
-            os.makedirs(output_dir)
-        df.to_csv(os.path.join(output_dir, "{0}_{1}.csv".format(measure, type_)))
+        try:
+            df = matrix_to_pandas_dataframe(np.nan_to_num(read_bz2_matrix(fn)),
+                                            selected_graphs,
+                                            measure, type_)
+            if not os.path.exists(output_dir):
+                os.makedirs(output_dir)
+            df.to_csv(os.path.join(output_dir, "{0}_{1}.csv".format(measure, type_)))
+        except Exception as e:
+            print("Could not read {0}: {1}".format(fn, e))
 
 
 def extract_criteria_4_all(annotater, csv_input_dir, raw_graph_dir, dataset, threshold, output_file="temp_out.csv"):
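
Note on the guarded matrix loading above: read_bz2_matrix is project code and its on-disk format is not shown in this patch; the sketch below assumes a bz2-compressed .npy payload and a placeholder path, and catches only the errors a failed read would typically raise:

    import bz2

    import numpy as np


    def load_bz2_matrix(path):
        # Hypothetical stand-in for read_bz2_matrix(); assumes a bz2-compressed
        # .npy file, which may differ from the project's actual format.
        with bz2.open(path, "rb") as f:
            return np.load(f)


    fn = "similarity_matrix.npy.bz2"   # placeholder path
    try:
        matrix = np.nan_to_num(load_bz2_matrix(fn))
    except (OSError, ValueError) as e:
        print("Could not read {0}: {1}".format(fn, e))
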
@@ -157,6 +163,7 @@ def annotate_eval_sample(annotater, csv_file, output_file, size_str):
         try:
             return annotater.all(None, None, x.G1, x.G2)
         except Exception as e:
+            print("Error",e)
             return [0, 0, 0, 0,300000,0]
 
     df["res"] = df.apply(lambda x: foo(x), axis=1)
@@ -196,6 +203,7 @@ def synthesize(last_step_output,output_filename,min_size_G1=None,min_size_G2=Non
     if min_carac_dict and ncar_min_doc2 > 0:
         output_filename= output_filename+"_mindoc2len_{0}".format(ncar_min_doc2)
     data = []
+
     for fn in tqdm(fns,desc="Synthetise Results"):
         df = pd.read_csv(fn)
         if min_size_G1:
@@ -214,7 +222,7 @@
 
         df = df.replace([np.inf, -np.inf], 300000)
         df["c5"] = 1 - (df.c5 - df.c5.min()) / (df.c5.max() - df.c5.min())
-        df["c6"] = (df.c6 - df.c6.min()) / (df.c6.max() - df.c6.min())
+        df["c6"] = (df.c6 - df.c6.min()) / (df.c6.max() -df.c6.min())
         if len(df) <1:
             continue
         mes = np.unique(df.sim_measure)[0]
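
Note on the c5/c6 rescaling above: both expressions divide by zero when the column is constant within a file. A guarded variant, using an illustrative min_max helper rather than the script's inline expressions:

    import pandas as pd


    def min_max(series, invert=False):
        # Min-max normalise a Series; a constant column maps to 0.0 instead of NaN.
        span = series.max() - series.min()
        if span == 0:
            scaled = pd.Series(0.0, index=series.index)
        else:
            scaled = (series - series.min()) / span
        return 1 - scaled if invert else scaled


    df = pd.DataFrame({"c5": [3.0, 7.0, 5.0], "c6": [2.0, 2.0, 2.0]})
    df["c5"] = min_max(df.c5, invert=True)   # mirrors the 1 - (...) scaling above
    df["c6"] = min_max(df.c6)                # constant column would otherwise give NaN
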
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5652042
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,13 @@
+geopandas
+pandas
+nltk
+polyglot
+spacy
+git+https://github.com/Jacobe2169/my_toolbox.git
+termcolor
+pycorenlp
+git+https://github.com/Jacobe2169/Python-Inflector.git
+PyICU
+pycld2
+morfessor
+textblob
diff --git a/setup.py b/setup.py
index ac2185d..f2d7456 100644
--- a/setup.py
+++ b/setup.py
@@ -10,6 +10,5 @@ setup(
     license='MIT',
     author='Jacques Fize',
     author_email='jacques.fize@cirad.fr',
-    setup_require=['p'],
     description=''
 )
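
Note on setup.py: with the broken setup_require entry gone, the dependencies live only in requirements.txt. If they should also reach install_requires, one common approach is sketched below; the package name is assumed from the strpython directory and the VCS links are deliberately skipped, since pip handles those better from requirements.txt:

    from setuptools import setup

    # Sketch only: expose plain requirement names through install_requires,
    # leaving git+ VCS links to requirements.txt.
    with open("requirements.txt") as f:
        requires = [
            line.strip()
            for line in f
            if line.strip() and not line.strip().startswith(("#", "git+"))
        ]

    setup(
        name="strpython",          # assumed from the package directory name
        install_requires=requires,
    )
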
diff --git a/strpython/eval/automatic_annotation.py b/strpython/eval/automatic_annotation.py
index f07b2cc..2d2760a 100644
--- a/strpython/eval/automatic_annotation.py
+++ b/strpython/eval/automatic_annotation.py
@@ -219,7 +219,7 @@ class AnnotationAutomatic(object):
 
     def criterion6(self, str1, str2):
         """
-        Return True if both STR contains similar spatial entities.
+        Return the Dice coefficient between the spatial entity sets of the two STRs.
         Parameters
         ----------
         str1
@@ -229,4 +229,6 @@ class AnnotationAutomatic(object):
         -------
 
         """
-        return len(set(str1.graph.nodes.keys()) & set(str2.graph.nodes.keys()))
\ No newline at end of file
+        G = set(str1.graph.nodes.keys())
+        H = set(str2.graph.nodes.keys())
+        return 2*(len(G & H))/(len(G)+len(H))
\ No newline at end of file
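
Note on criterion6 above: the new return value is the Sørensen-Dice coefficient of the two node sets, which divides by zero when both STRs are empty. A guarded standalone version, with dice_coefficient as an illustrative helper rather than project code:

    def dice_coefficient(G, H):
        # Sørensen-Dice coefficient between two sets, in [0, 1].
        G, H = set(G), set(H)
        if not G and not H:
            return 0.0   # chosen convention for two empty sets; 1.0 is also defensible
        return 2 * len(G & H) / (len(G) + len(H))


    assert dice_coefficient({"Paris", "Lyon"}, {"Paris", "Montpellier"}) == 0.5
    assert dice_coefficient(set(), set()) == 0.0
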
diff --git a/strpython/models/str.py b/strpython/models/str.py
index 1666854..828a000 100644
--- a/strpython/models/str.py
+++ b/strpython/models/str.py
@@ -189,7 +189,6 @@ class STR(object):
         if not data_:
             warnings.warn("{0} wasn't found in Geo-Database".format(id))
             return False
-        data_ = data_[0]
         if not label and v == True:
             warnings.warn("Label empty. @en label from Geo-Database will be used.")
             label = data_["en"]
@@ -296,7 +295,6 @@ class STR(object):
             data = self.get_data(new_se)
             to_del.add(old_se)
             if data:
-                data = data[0]
                 final_transform_map[old_se] = new_se
                 if not new_se in self.spatial_entities:
                     self.add_spatial_entity(new_se, data.label.en)
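
Note on the two data_[0] removals above: they suggest the Geo-Database lookup now returns a single record instead of a one-element list. If both shapes can still occur, a small defensive helper (hypothetical, not part of the project) keeps the calling code unchanged:

    def first_record(result):
        # Normalise a lookup result that may be a single record, a one-element
        # list, or an empty list; returns None when nothing was found.
        if result is None:
            return None
        if isinstance(result, (list, tuple)):
            return result[0] if result else None
        return result


    assert first_record([]) is None
    assert first_record(["record"]) == "record"
    assert first_record("record") == "record"
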
diff --git a/strpython/models/transformation/transform.py b/strpython/models/transformation/transform.py
index df57d63..607a411 100644
--- a/strpython/models/transformation/transform.py
+++ b/strpython/models/transformation/transform.py
@@ -73,10 +73,10 @@ class Generalisation(Transformation):
             if not node in inclusion_dictionnary:
                 inc_list = []
                 data = gazetteer.get_by_id(node)
-                if not data:continue
+                if len(data) < 1: continue
                 data=data[0]
                 try:
-                    inc_list = data[0].other.inc_P131
+                    inc_list = data.other.inc_P131
                 except:
                     pass
                 if not inc_list:
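
Note on the inc_P131 lookup above: the surrounding try/except swallows every error, while a missing attribute can be handled more narrowly with getattr and a default. Sketch with stand-in records (SimpleNamespace objects, not the gazetteer's real record class):

    from types import SimpleNamespace

    # Stand-in records: a gazetteer entry whose .other block may lack inc_P131.
    entry = SimpleNamespace(other=SimpleNamespace(inc_P131=["GD123", "GD456"]))
    bare = SimpleNamespace(other=SimpleNamespace())

    inc_list = getattr(entry.other, "inc_P131", [])   # ["GD123", "GD456"]
    empty = getattr(bare.other, "inc_P131", [])       # [] instead of an AttributeError
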
-- 
GitLab