From b69badf3cc60e871c164ee7c75a2ae18cf50bf19 Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Fri, 7 Sep 2018 12:13:47 +0200
Subject: [PATCH] Add automatic annotation process

---
 auto_fill_annotation.py         | 53 +++++++++++++++++++++++++
 generate_annotation_file.py     | 45 +++++++++++++++++++++
 run_automatic_annotation.py     | 36 +++++++++++++++++
 strpython/helpers/sim_matrix.py |  5 +--
 strpython/models/str.py         | 20 +++++++---
 tools.py                        | 69 ---------------------------------
 6 files changed, 150 insertions(+), 78 deletions(-)
 create mode 100644 auto_fill_annotation.py
 create mode 100644 generate_annotation_file.py
 create mode 100644 run_automatic_annotation.py
 delete mode 100644 tools.py

diff --git a/auto_fill_annotation.py b/auto_fill_annotation.py
new file mode 100644
index 0000000..e1de0cb
--- /dev/null
+++ b/auto_fill_annotation.py
@@ -0,0 +1,53 @@
+# coding: utf-8
+
+
+import argparse, os, re, glob
+import pandas as pd
+import networkx as nx
+
+from strpython.eval.automatic_annotation import AnnotationAutomatic
+from strpython.models.str import STR
+
+annotater = AnnotationAutomatic()
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("csv_file")
+parser.add_argument("graph_dir")
+parser.add_argument("output_file")
+
+args = parser.parse_args()
+
+if not os.path.exists(args.csv_file) or not os.path.exists(args.graph_dir):
+    raise FileNotFoundError("csv_file or graph_dir does not exist")
+
+df = pd.read_csv(args.csv_file, index_col=0)
+str_graph_path = args.graph_dir
+
+# Load every STR graph, keyed by the numeric id in its filename.
+strs = {}
+for file in glob.glob(os.path.join(str_graph_path, "*.gexf")):
+    id_ = int(re.findall(r"\d+", file)[-1])
+    try:
+        strs[id_] = STR.from_networkx_graph(nx.read_gexf(file))
+    except Exception:
+        strs[id_] = STR({}, [])  # unreadable graph: fall back to an empty STR
+
+
+def annotate(x):
+    try:
+        return annotater.all(strs[x.G1], strs[x.G2])
+    except Exception:
+        return [0, 0, 0, 0]
+
+
+df["res"] = df.apply(annotate, axis=1)
+df["res"] = df["res"].apply(lambda x: list(map(int, x)))
+df["c1"] = df.res.apply(lambda x: x[0])
+df["c2"] = df.res.apply(lambda x: x[1])
+df["c3"] = df.res.apply(lambda x: x[2])
+df["c4"] = df.res.apply(lambda x: x[3])
+
+del df["res"]
+
+df.to_csv(args.output_file)
diff --git a/generate_annotation_file.py b/generate_annotation_file.py
new file mode 100644
index 0000000..065a9f4
--- /dev/null
+++ b/generate_annotation_file.py
@@ -0,0 +1,45 @@
+# coding: utf-8
+import numpy as np
+import argparse, json, os, glob, time
+from strpython.helpers.sim_matrix import matrix_to_pandas_dataframe, read_bz2_matrix
+
+script_beg = time.time()
+
+
+def _path(string):
+    if os.path.exists(string):
+        return string
+    raise FileNotFoundError(string)
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("matricesDir", type=_path)
+parser.add_argument("selectedFile", type=_path)
+parser.add_argument("outputDir")
+args = parser.parse_args()
+
+if not os.path.isdir(args.outputDir):
+    try:
+        os.makedirs(args.outputDir)
+    except OSError:
+        raise OSError("Cannot create {0} dir".format(args.outputDir))
+
+matrix_fns = glob.glob(os.path.join(args.matricesDir, "*.npy.bz2"))
+selected = json.load(open(args.selectedFile))
+
+# Filenames follow <measure>_..._<type>.npy.bz2; the "extension" and "gen"
+# STR types span the last two underscore-separated fields.
+for fn in matrix_fns:
+    basename = os.path.basename(fn)
+    measure = basename.split("_")[0]
+    if basename.split("_")[-2] in ["extension", "gen"]:
+        type_ = "_".join(basename.split("_")[-2:]).replace(".npy.bz2", "")
+    else:
+        type_ = basename.split("_")[-1].replace(".npy.bz2", "")
+    print("Processing...", measure, type_)
+    df = matrix_to_pandas_dataframe(np.nan_to_num(read_bz2_matrix(fn)),
+                                    selected,
+                                    measure,
+                                    type_)
+    df.to_csv(os.path.join(args.outputDir, "{0}_{1}.csv".format(measure, type_)))
+
+print("The script took {0}s to finish".format(time.time() - script_beg))
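For context, the handoff between the two new scripts: generate_annotation_file.py writes a CSV with columns G1 G2 sim_measure type_str rank c1 c2 c3 c4, and auto_fill_annotation.py fills c1..c4 row by row. A minimal sketch of that filling step, on toy rows and with a hypothetical annotate() standing in for AnnotationAutomatic.all() (the measure/type values here are illustrative, not from the repository):

    # Sketch only: toy data; annotate() is a stand-in for the real annotator.
    import pandas as pd

    df = pd.DataFrame({"G1": [0, 0], "G2": [3, 7],
                       "sim_measure": ["mcs"] * 2, "type_str": ["normal"] * 2,
                       "rank": [1, 2],
                       "c1": [0, 0], "c2": [0, 0], "c3": [0, 0], "c4": [0, 0]})

    def annotate(row):
        # would call annotater.all(strs[row.G1], strs[row.G2]) in the real script
        return [1, 0, 0, 1]

    res = df.apply(annotate, axis=1)            # Series of 4-element lists
    for i, col in enumerate(["c1", "c2", "c3", "c4"]):
        df[col] = res.apply(lambda crits, i=i: crits[i])

The per-column split mirrors the df["c1"]..df["c4"] assignments in auto_fill_annotation.py above.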
diff --git a/run_automatic_annotation.py b/run_automatic_annotation.py
new file mode 100644
index 0000000..c1e2d45
--- /dev/null
+++ b/run_automatic_annotation.py
@@ -0,0 +1,36 @@
+# coding: utf-8
+
+import os, argparse, subprocess, glob
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("simMatrixInputDir")
+parser.add_argument("graphDataDir")
+parser.add_argument("selectedInputFile")
+parser.add_argument("outputAnnotation_dir")
+parser.add_argument("outputAnnotation2_dir")
+
+args = parser.parse_args()
+
+print("Generating Annotation File")
+# process = subprocess.run(["python3", "generate_annotation_file.py",
+#                           args.simMatrixInputDir, args.selectedInputFile,
+#                           args.outputAnnotation_dir])
+# process.check_returncode()
+
+fns = glob.glob(os.path.join(args.outputAnnotation_dir, "*.csv"))
+
+if not os.path.exists(args.outputAnnotation2_dir):
+    os.makedirs(args.outputAnnotation2_dir)
+
+for fn in fns:
+    print("Processing {0}...".format(fn))
+    # Same filename convention as in generate_annotation_file.py.
+    if os.path.basename(fn).split("_")[-2] in ["extension", "gen"]:
+        graph_dir = "_".join(os.path.basename(fn).split("_")[-2:]).replace(".csv", "")
+    else:
+        graph_dir = os.path.basename(fn).split("_")[-1].replace(".csv", "")
+    cmd = ["python3", "auto_fill_annotation.py", fn,
+           os.path.join(args.graphDataDir, graph_dir),
+           os.path.join(args.outputAnnotation2_dir, os.path.basename(fn))]
+    print(cmd)
+    process = subprocess.run(cmd)
+    # Raises CalledProcessError(returncode, cmd) on any non-zero exit.
+    process.check_returncode()
diff --git a/strpython/helpers/sim_matrix.py b/strpython/helpers/sim_matrix.py
index 5ad2669..0462de5 100644
--- a/strpython/helpers/sim_matrix.py
+++ b/strpython/helpers/sim_matrix.py
@@ -32,11 +32,10 @@ def read_and_load(file_path, selected=None, bz2=True):
 def matrix_to_pandas_dataframe(matrix, selected, sim_measure, type_str, n=5):
     sim, type_ = sim_measure, type_str
     tab_array = []
-    for line in range(len(matrix)):
+    for line in selected:
         top_n = np.argsort(matrix[line])[::-1][1:n + 1]
-        index = selected[line]
         rank = 1
         for val in top_n:
-            tab_array.append([index, val, sim, type_, rank, 0, 0, 0, 0])
+            tab_array.append([line, val, sim, type_, rank, 0, 0, 0, 0])
             rank += 1
     return pd.DataFrame(tab_array, columns="G1 G2 sim_measure type_str rank c1 c2 c3 c4".split())
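A note on the matrix_to_pandas_dataframe change just above: selected is now iterated directly, so each entry serves both as the matrix row index and as the G1 id, instead of remapping row positions through selected[line]. A small sketch of the top-n extraction under that reading, with a toy 4x4 similarity matrix:

    import numpy as np

    matrix = np.array([[1.0, 0.8, 0.1, 0.3],
                       [0.8, 1.0, 0.5, 0.2],
                       [0.1, 0.5, 1.0, 0.9],
                       [0.3, 0.2, 0.9, 1.0]])
    selected, n = [0, 2], 2                    # toy ids and neighbourhood size
    for line in selected:
        # descending sort; slot 0 is the graph itself, so keep ranks 1..n
        top_n = np.argsort(matrix[line])[::-1][1:n + 1]
        print(line, top_n.tolist())            # 0 -> [1, 3]; 2 -> [3, 1]

This assumes matrix rows are already aligned with the ids in selected; if selected instead held positions into a separate id list, the removed selected[line] lookup would still be needed.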
diff --git a/strpython/models/str.py b/strpython/models/str.py
index efe5ae4..5cde4ff 100644
--- a/strpython/models/str.py
+++ b/strpython/models/str.py
@@ -13,7 +13,9 @@
 from ..helpers.collision import collisionTwoSEBoundaries, add_cache_adjacency
 from ..helpers.geodict_helpers import get_data, get_data_by_wikidata_id
 from ..eval.stats import most_common
-from sklearn.cluster import MeanShift, estimate_bandwidth
+from sklearn.cluster import MeanShift, estimate_bandwidth, dbscan
+
+
 # logging.basicConfig(filename=config.log_file,level=logging.INFO)
@@ -377,7 +379,7 @@ class STR(object):
             label.append(data["en"])
             class_.append(most_common(data["class"]))
         except:
-            pass
+            class_.append("P-PPL")  # default class, keeps label/class lists aligned
         df=gpd.GeoDataFrame({"geometry":points,"label":label,"classe":class_})
         df["x"]=df.geometry.apply(lambda p: p.x)
         df["y"] = df.geometry.apply(lambda p: p.y)
@@ -385,10 +387,16 @@ class STR(object):
 
     def get_cluster(self):
         data=self.get_geo_data_of_se()
-        bandwidth = estimate_bandwidth(data[["x", "y"]].values)
-        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
-        ms.fit(data[["x", "y"]].values)
-        data["cluster"] = ms.labels_
+        X = data[["x", "y"]].values
+        try:
+            bandwidth = estimate_bandwidth(X)
+            ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
+            ms.fit(X)
+            data["cluster"] = ms.labels_
+        except Exception:
+            # mean-shift can fail (e.g. zero bandwidth); fall back to DBSCAN
+            core_samples, labels = dbscan(X)
+            data["cluster"] = labels
+
         """
         # second clustering pass
diff --git a/tools.py b/tools.py
deleted file mode 100644
index d417d45..0000000
--- a/tools.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# coding = utf-8
-
-import argparse
-
-from termcolor import colored
-
-from strpython.helpers.geodict_helpers import get_most_common_id_v3, get_data, get_by_label
-
-parser = argparse.ArgumentParser()
-
-subparsers = parser.add_subparsers(help='commands')
-
-# A list command
-list_parser = subparsers.add_parser(
-    'get_id', help='Return ids for a given label')
-list_parser.set_defaults(which="getid")
-list_parser.add_argument(
-    'label', help='label')
-list_parser.add_argument(
-    '-l','--language', help='Language',default="en")
-list_parser.add_argument(
-    '-c','--most_common', action='store_true', help='Take the most common based on its PR value')
-
-args = parser.parse_args()
-
-if args.which == "getid":
-    ind=0
-    label=args.label
-    lang=args.language
-    if args.most_common:
-        try:
-            data=get_data(get_most_common_id_v3(label,lang))[0]
-            print(colored("Most Common -->\t{0}\t{1}\t{2}".format(data[lang], data["id"],data["wikidataID"]), "magenta"))
-        except:
-            print(colored("No results found for \"{0}\" in \"{1}\"".format(label,lang),"red"))
-            if label[0].islower():
-                new_label=label[0].upper()+label[1:]
-                data=get_by_label(new_label,lang)
-                if data:
-                    ind+=1
-                    print("\t"*ind+"Using Capital we found this results")
-                    for d in data:
-                        d=d["_source"]
-                        print("\t"*ind+d["en"],d["aliases"],
-                              colored(d["id"],"blue"))
-                    print("\t"*ind+"...")
-                    mc = get_data(get_most_common_id_v3(new_label, lang)[0])
-                    print(colored("\t"*ind+"Most Common -->\t{0}\t{1}".format(mc[lang], mc["id"]), "magenta"))
-    else:
-        data = get_by_label(label, lang)
-        if not data:
-            print(colored("No results found for \"{0}\" in \"{1}\"".format(label, lang), "red"))
-            if label[0].islower():
-
-                new_label=label[0].upper()+label[1:]
-                data = get_by_label(new_label, lang)
-                print("Using Capital we found this results : ")
-                ind+=1
-                label=new_label
-        if not data:
-            pass
-        elif data:
-            for d in data:
-                d = d["_source"]
-                print("\t"*ind+ d["en"], d["aliases"],
-                      colored(d["id"], "blue"))
-            print("\t"*ind+"...")
-            mc = get_data(get_most_common_id(label, lang)[0])
-            print(colored("\t"*ind+"Most Common -->\t{0}\t{1}".format(mc[lang], mc["id"]), "magenta"))
--
GitLab
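One last note, on the str.py change: get_cluster now falls back to density-based clustering when mean-shift setup fails (estimate_bandwidth can return a zero bandwidth on small or degenerate coordinate sets, which makes MeanShift raise). A standalone sketch of that control flow on toy points; it mirrors the patch's use of the sklearn.cluster.dbscan function, which newer scikit-learn releases de-emphasise in favour of the DBSCAN class:

    # Sketch only: toy coordinates forming two loose groups.
    import numpy as np
    from sklearn.cluster import MeanShift, estimate_bandwidth, dbscan

    X = np.array([[0.0, 0.0], [0.1, 0.1], [0.2, 0.0], [0.1, 0.2],
                  [5.0, 5.0], [5.1, 5.2], [5.2, 5.1], [4.9, 5.1]])
    try:
        bandwidth = estimate_bandwidth(X)
        labels = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(X).labels_
    except Exception:
        # dbscan() returns (core_sample_indices, labels); with its default
        # eps/min_samples, very small inputs may come back all noise (-1)
        _, labels = dbscan(X)
    print(labels)

Whether the fallback ever triggers depends on the input coordinates, so both branches are worth exercising when reviewing.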