From b69badf3cc60e871c164ee7c75a2ae18cf50bf19 Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Fri, 7 Sep 2018 12:13:47 +0200
Subject: [PATCH] Add automatic annotation process

---
 auto_fill_annotation.py         | 56 +++++++++++++++++++++++++++
 generate_annotation_file.py     | 46 ++++++++++++++++++++++
 run_automatic_annotation.py     | 40 +++++++++++++++++++
 strpython/helpers/sim_matrix.py |  6 +++---
 strpython/models/str.py         | 22 ++++++++++++---
 tools.py                        | 69 ---------------------------------
 6 files changed, 161 insertions(+), 78 deletions(-)
 create mode 100644 auto_fill_annotation.py
 create mode 100644 generate_annotation_file.py
 create mode 100644 run_automatic_annotation.py
 delete mode 100644 tools.py

diff --git a/auto_fill_annotation.py b/auto_fill_annotation.py
new file mode 100644
index 0000000..e1de0cb
--- /dev/null
+++ b/auto_fill_annotation.py
@@ -0,0 +1,56 @@
+# coding: utf-8
+
+
+import argparse, os, re, glob
+import pandas as pd
+import networkx as nx
+
+from strpython.eval.automatic_annotation import AnnotationAutomatic
+from strpython.models.str import STR
+
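+# AnnotationAutomatic provides .all(str_a, str_b), which returns the four
+# annotation criteria stored below as c1..c4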
+annotater = AnnotationAutomatic()
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("csv_file")
+parser.add_argument("graph_dir")
+parser.add_argument("output_file")
+
+args = parser.parse_args()
+
+if not os.path.exists(args.csv_file) or not os.path.exists(args.graph_dir):
+    raise FileNotFoundError("csv_file or graph_dir does not exist")
+
+df = pd.read_csv(args.csv_file, index_col=0)
+str_graph_path = args.graph_dir
+
+
+strs = {}
+for file in glob.glob(os.path.join(str_graph_path, "*.gexf")):
+    id_ = int(re.findall(r"\d+", file)[-1])
+    try:
+        strs[id_] = STR.from_networkx_graph(nx.read_gexf(file))
+    except Exception:
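+        # fall back to an empty STR when the GEXF file cannot be read or parsed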
+        strs[id_] = STR({}, [])
+
+
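+# Annotate one dataframe row: returns the four criteria for the STR pair
+# (row.G1, row.G2), or zeros on failure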
+def annotate_pair(row):
+    try:
+        return annotater.all(strs[row.G1], strs[row.G2])
+    except Exception:
+        return [0, 0, 0, 0]
+
+
+df["res"] = df.apply(lambda x: foo(x), axis=1)
+df.res=df.res.apply(lambda x :list(map(int,x)))
+df[["c1"]] = df.res.apply(lambda x: x[0])
+df[["c2"]] = df.res.apply(lambda x: x[1])
+df[["c3"]] = df.res.apply(lambda x: x[2])
+df[["c4"]] = df.res.apply(lambda x: x[3])
+
+del df["res"]
+
+df.to_csv(args.output_file)
diff --git a/generate_annotation_file.py b/generate_annotation_file.py
new file mode 100644
index 0000000..065a9f4
--- /dev/null
+++ b/generate_annotation_file.py
@@ -0,0 +1,46 @@
+# coding: utf-8
+import numpy as np
+import argparse, json, os, glob, time
+from strpython.helpers.sim_matrix import matrix_to_pandas_dataframe, read_bz2_matrix
+
+script_beg = time.time()
+
+def _path(string):
+    if os.path.exists(string):
+        return string
+    else:
+        raise FileNotFoundError(string)
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("matricesDir", type=_path)
+parser.add_argument("selectedFile", type=_path)
+parser.add_argument("outputDir")
+args = parser.parse_args()
+
+if not os.path.isdir(args.outputDir):
+    try:
+        os.makedirs(args.outputDir)
+    except OSError:
+        raise OSError("Cannot create directory {0}".format(args.outputDir))
+
+matrix_fns = glob.glob(os.path.join(args.matricesDir, "*.npy.bz2"))
+with open(args.selectedFile) as f:
+    selected = json.load(f)
+
+
+for fn in matrix_fns:
+    measure = os.path.basename(fn).split("_")[0]
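+    # the STR type is encoded at the end of the filename; "extension" and
+    # "gen" variants span the last two underscore-separated tokens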
+    if os.path.basename(fn).split("_")[-2] in ["extension", "gen"]:
+        type_ = "_".join(os.path.basename(fn).split("_")[-2:]).replace(".npy.bz2", "")
+    else:
+        type_ = os.path.basename(fn).split("_")[-1].replace(".npy.bz2", "")
+    print("Processing...", measure, type_)
+    df = matrix_to_pandas_dataframe(np.nan_to_num(read_bz2_matrix(fn)),
+                                    selected,
+                                    measure, type_)
+    df.to_csv(os.path.join(args.outputDir, "{0}_{1}.csv".format(measure, type_)))
+
+print("The script took {0}s to finish".format(time.time() - script_beg))
diff --git a/run_automatic_annotation.py b/run_automatic_annotation.py
new file mode 100644
index 0000000..c1e2d45
--- /dev/null
+++ b/run_automatic_annotation.py
@@ -0,0 +1,40 @@
+# coding: utf-8
+
+import os, argparse, subprocess, glob
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("simMatrixInputDir")
+parser.add_argument("graphDataDir")
+parser.add_argument("selectedInputFile")
+parser.add_argument("outputAnnotation_dir")
+parser.add_argument("outputAnnotation2_dir")
+
+args = parser.parse_args()
+
+print("Generating Annotation File")
+# process = subprocess.run(["python3", "generate_annotation_file.py", args.simMatrixInputDir, args.selectedInputFile, args.outputAnnotation_dir])
+#
+# if process.returncode != 0:
+#     raise subprocess.CalledProcessError(process.returncode, process.args)
+
+fns = glob.glob(os.path.join(args.outputAnnotation_dir, "*.csv"))
+
+
+if not os.path.exists(args.outputAnnotation2_dir):
+    os.makedirs(args.outputAnnotation2_dir)
+for fn in fns:
+    print("Processing {0}...".format(fn))
+    if os.path.basename(fn).split("_")[-2] in ["extension", "gen"]:
+        graph_dir = "_".join(os.path.basename(fn).split("_")[-2:]).replace(".csv", "")
+    else:
+        graph_dir = os.path.basename(fn).split("_")[-1].replace(".csv", "")
+    cmd = ["python3", "auto_fill_annotation.py", fn,
+           os.path.join(args.graphDataDir, graph_dir),
+           os.path.join(args.outputAnnotation2_dir, os.path.basename(fn))]
+    print(cmd)
+    process = subprocess.run(cmd)
+    if process.returncode != 0:
+        raise subprocess.CalledProcessError(process.returncode, cmd)
diff --git a/strpython/helpers/sim_matrix.py b/strpython/helpers/sim_matrix.py
index 5ad2669..0462de5 100644
--- a/strpython/helpers/sim_matrix.py
+++ b/strpython/helpers/sim_matrix.py
@@ -32,11 +32,11 @@ def read_and_load(file_path, selected=None, bz2=True):
 def matrix_to_pandas_dataframe(matrix, selected, sim_measure, type_str, n=5):
     sim, type_ = sim_measure, type_str
     tab_array = []
-    for line in range(len(matrix)):
+    for line in selected:
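+        # take the n most similar graphs, skipping the first hit (the graph itself)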
         top_n = np.argsort(matrix[line])[::-1][1:n + 1]
-        index = selected[line]
         rank = 1
         for val in top_n:
-            tab_array.append([index, val, sim, type_, rank, 0, 0, 0, 0])
+            tab_array.append([line, val, sim, type_, rank, 0, 0, 0, 0])
             rank += 1
     return pd.DataFrame(tab_array, columns="G1 G2 sim_measure type_str rank c1 c2 c3 c4".split())
diff --git a/strpython/models/str.py b/strpython/models/str.py
index efe5ae4..5cde4ff 100644
--- a/strpython/models/str.py
+++ b/strpython/models/str.py
@@ -13,7 +13,9 @@ from ..helpers.collision import collisionTwoSEBoundaries, add_cache_adjacency
 from ..helpers.geodict_helpers import get_data, get_data_by_wikidata_id
 from ..eval.stats import most_common
 
-from sklearn.cluster import MeanShift, estimate_bandwidth
+from sklearn.cluster import MeanShift, estimate_bandwidth, dbscan
+
+
 # logging.basicConfig(filename=config.log_file,level=logging.INFO)
 
 
@@ -377,7 +379,8 @@
                 label.append(data["en"])
                 class_.append(most_common(data["class"]))
             except:
-                pass
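+                # use "P-PPL" as the default class when no class information is found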
+                class_.append("P-PPL")
         df=gpd.GeoDataFrame({"geometry":points,"label":label,"classe":class_})
         df["x"]=df.geometry.apply(lambda p: p.x)
         df["y"] = df.geometry.apply(lambda p: p.y)
@@ -385,10 +388,17 @@
 
     def get_cluster(self):
         data=self.get_geo_data_of_se()
-        bandwidth = estimate_bandwidth(data[["x", "y"]].values)
-        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
-        ms.fit(data[["x", "y"]].values)
-        data["cluster"] = ms.labels_
+        X=data[["x", "y"]].values
+        try:
+            bandwidth = estimate_bandwidth(X)
+            ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
+            ms.fit(X)
+            data["cluster"] = ms.labels_
+        except Exception:
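+            # MeanShift can fail here (e.g. bandwidth estimation on too few
+            # points); fall back to DBSCAN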
+            _, labels = dbscan(X)
+            data["cluster"] = labels
+
         """
 
         # second split into clusters
diff --git a/tools.py b/tools.py
deleted file mode 100644
index d417d45..0000000
--- a/tools.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# coding = utf-8
-
-import argparse
-
-from termcolor import colored
-
-from strpython.helpers.geodict_helpers import get_most_common_id_v3, get_data, get_by_label
-
-parser = argparse.ArgumentParser()
-
-subparsers = parser.add_subparsers(help='commands')
-
-# A list command
-list_parser = subparsers.add_parser(
-    'get_id', help='Return ids for a given label')
-list_parser.set_defaults(which="getid")
-list_parser.add_argument(
-    'label', help='label')
-list_parser.add_argument(
-    '-l','--language', help='Language',default="en")
-list_parser.add_argument(
-    '-c','--most_common', action='store_true', help='Take the most common based on its PR value')
-
-args = parser.parse_args()
-
-if args.which == "getid":
-    ind=0
-    label=args.label
-    lang=args.language
-    if args.most_common:
-        try:
-            data=get_data(get_most_common_id_v3(label,lang))[0]
-            print(colored("Most Common -->\t{0}\t{1}\t{2}".format(data[lang], data["id"],data["wikidataID"]), "magenta"))
-        except:
-            print(colored("No results found for \"{0}\" in \"{1}\"".format(label,lang),"red"))
-            if label[0].islower():
-                new_label=label[0].upper()+label[1:]
-                data=get_by_label(new_label,lang)
-                if data:
-                    ind+=1
-                    print("\t"*ind+"Using Capital we found this results")
-                    for d in data:
-                        d=d["_source"]
-                        print("\t"*ind+d["en"],d["aliases"],
-                              colored(d["id"],"blue"))
-                    print("\t"*ind+"...")
-                    mc = get_data(get_most_common_id_v3(new_label, lang)[0])
-                    print(colored("\t"*ind+"Most Common -->\t{0}\t{1}".format(mc[lang], mc["id"]), "magenta"))
-    else:
-        data = get_by_label(label, lang)
-        if not data:
-            print(colored("No results found for \"{0}\" in \"{1}\"".format(label, lang), "red"))
-            if label[0].islower():
-
-                new_label=label[0].upper()+label[1:]
-                data = get_by_label(new_label, lang)
-                print("Using Capital we found this results : ")
-                ind+=1
-                label=new_label
-        if not data:
-            pass
-        elif data:
-            for d in data:
-                d = d["_source"]
-                print("\t"*ind+ d["en"], d["aliases"],
-                      colored(d["id"], "blue"))
-            print("\t"*ind+"...")
-            mc = get_data(get_most_common_id(label, lang)[0])
-            print(colored("\t"*ind+"Most Common -->\t{0}\t{1}".format(mc[lang], mc["id"]), "magenta"))
-- 
GitLab