From 47b6523821984319871c492d18b13029ca5a93e0 Mon Sep 17 00:00:00 2001 From: Fize Jacques <jacques.fize@cirad.fr> Date: Wed, 15 May 2019 12:55:06 +0200 Subject: [PATCH] DEBUG --- auto_fill_annotation.py | 33 ++++++++++++++------ requirements.txt | 13 ++++++++ setup.py | 1 - strpython/eval/automatic_annotation.py | 6 ++-- strpython/models/str.py | 2 -- strpython/models/transformation/transform.py | 4 +-- 6 files changed, 43 insertions(+), 16 deletions(-) create mode 100644 requirements.txt diff --git a/auto_fill_annotation.py b/auto_fill_annotation.py index a06b2f2..f6b0f29 100644 --- a/auto_fill_annotation.py +++ b/auto_fill_annotation.py @@ -28,8 +28,11 @@ def main(dataset, matrix_sim_dir, raw_graph_dir, selected_graphs, threshold, inc if not os.path.exists(last_step_output): os.makedirs(last_step_output) - for fn in tqdm(glob.glob(os.path.join(first_step_output,"*.csv")),desc="Annotate sample"): - annotate_eval_sample(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)),size_str) + from joblib import Parallel,delayed + # + Parallel(n_jobs=4,backend="threading")(delayed(annotate_eval_sample)(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)),size_str) for fn in tqdm(glob.glob(os.path.join(first_step_output,"*.csv")),desc="Annotate sample")) + #for fn in tqdm(glob.glob(os.path.join(first_step_output,"*.csv")),desc="Annotate sample"): + #annotate_eval_sample(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)),size_str) min_carac_dict=None if min_carac_fn != "" and os.path.exists(min_carac_fn): @@ -63,12 +66,15 @@ def generate_annotation_dataframe(matrix_sim_dir, selected_graphs, output_dir): print("Proceeding...", measure, type_) if os.path.exists(os.path.join(output_dir, "{0}_{1}.csv".format(measure, type_))): continue - df = matrix_to_pandas_dataframe(np.nan_to_num(read_bz2_matrix(fn)), - selected_graphs, - measure, type_) - if not os.path.exists(output_dir): - os.makedirs(output_dir) - 
df.to_csv(os.path.join(output_dir, "{0}_{1}.csv".format(measure, type_))) + try: + df = matrix_to_pandas_dataframe(np.nan_to_num(read_bz2_matrix(fn)), + selected_graphs, + measure, type_) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + df.to_csv(os.path.join(output_dir, "{0}_{1}.csv".format(measure, type_))) + except: + print("Could'not read {0}".format(fn)) def extract_criteria_4_all(annotater, csv_input_dir, raw_graph_dir, dataset, threshold, output_file="temp_out.csv"): @@ -157,6 +163,7 @@ def annotate_eval_sample(annotater, csv_file, output_file, size_str): try: return annotater.all(None, None, x.G1, x.G2) except Exception as e: + print("Error",e) return [0, 0, 0, 0,300000,0] df["res"] = df.apply(lambda x: foo(x), axis=1) @@ -196,6 +203,14 @@ def synthesize(last_step_output,output_filename,min_size_G1=None,min_size_G2=Non if min_carac_dict and ncar_min_doc2 > 0: output_filename= output_filename+"_mindoc2len_{0}".format(ncar_min_doc2) data = [] + + # min_c5,max_c5 = 2000,0 + # min_c6, max_c6 = 2000, 0 + # for fn in tqdm(fns,desc="Synthetise Results"): + # df = pd.read_csv(fn) + # min_c5, max_c5 = min(min_c5,df.c5.min()), max(max_c5,df.c5.max()) + # min_c6, max_c6 = min(min_c6,df.c6.min()), max(max_c6,df.c6.max()) + for fn in tqdm(fns,desc="Synthetise Results"): df = pd.read_csv(fn) if min_size_G1: @@ -214,7 +229,7 @@ def synthesize(last_step_output,output_filename,min_size_G1=None,min_size_G2=Non df = df.replace([np.inf, -np.inf], 300000) df["c5"] = 1 - (df.c5 - df.c5.min()) / (df.c5.max() - df.c5.min()) - df["c6"] = (df.c6 - df.c6.min()) / (df.c6.max() - df.c6.min()) + df["c6"] = (df.c6 - df.c6.min()) / (df.c6.max() -df.c6.min()) if len(df) <1: continue mes = np.unique(df.sim_measure)[0] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..5652042 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,13 @@ +geopandas +pandas +nltk +polyglot +spacy +https://github.com/Jacobe2169/my_toolbox.git +termcolor +pycorenlp 
+https://github.com/Jacobe2169/Python-Inflector.git +PyICU +pycld2 +morfessor +textblob diff --git a/setup.py b/setup.py index ac2185d..f2d7456 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,5 @@ setup( license='MIT', author='Jacques Fize', author_email='jacques.fize@cirad.fr', - setup_require=['p'], description='' ) diff --git a/strpython/eval/automatic_annotation.py b/strpython/eval/automatic_annotation.py index f07b2cc..2d2760a 100644 --- a/strpython/eval/automatic_annotation.py +++ b/strpython/eval/automatic_annotation.py @@ -219,7 +219,7 @@ class AnnotationAutomatic(object): def criterion6(self, str1, str2): """ - Return True if both STR contains similar spatial entities. + Return the Dice coefficient between the spatial entity sets of the two STRs. Parameters ---------- str1 @@ -229,4 +229,6 @@ class AnnotationAutomatic(object): ------- """ - return len(set(str1.graph.nodes.keys()) & set(str2.graph.nodes.keys())) \ No newline at end of file + G = set(str1.graph.nodes.keys()) + H = set(str2.graph.nodes.keys()) + return 2*(len(G & H))/(len(G)+len(H)) \ No newline at end of file diff --git a/strpython/models/str.py b/strpython/models/str.py index 1666854..828a000 100644 --- a/strpython/models/str.py +++ b/strpython/models/str.py @@ -189,7 +189,6 @@ class STR(object): if not data_: warnings.warn("{0} wasn't found in Geo-Database".format(id)) return False - data_ = data_[0] if not label and v == True: warnings.warn("Label empty. 
@en label from Geo-Database will be used.") label = data_["en"] @@ -296,7 +295,6 @@ class STR(object): data = self.get_data(new_se) to_del.add(old_se) if data: - data = data[0] final_transform_map[old_se] = new_se if not new_se in self.spatial_entities: self.add_spatial_entity(new_se, data.label.en) diff --git a/strpython/models/transformation/transform.py b/strpython/models/transformation/transform.py index df57d63..607a411 100644 --- a/strpython/models/transformation/transform.py +++ b/strpython/models/transformation/transform.py @@ -73,10 +73,10 @@ class Generalisation(Transformation): if not node in inclusion_dictionnary: inc_list = [] data = gazetteer.get_by_id(node) - if not data:continue + if len(data)<1:continue data=data[0] try: - inc_list = data[0].other.inc_P131 + inc_list = data.other.inc_P131 except: pass if not inc_list: -- GitLab