Commit b69badf3 authored by Fize Jacques

Add automatic annotation process

parent fd045e4c
# coding = utf-8
# auto_fill_annotation.py -- fill the annotation criteria (c1..c4) of each
# candidate pair using the automatic annotator.
import argparse, os, re, glob

import pandas as pd
import networkx as nx

from strpython.eval.automatic_annotation import AnnotationAutomatic
from strpython.models.str import STR

annotater = AnnotationAutomatic()

parser = argparse.ArgumentParser()
parser.add_argument("csv_file")
parser.add_argument("graph_dir")
parser.add_argument("output_file")
args = parser.parse_args()

if not os.path.exists(args.csv_file) or not os.path.exists(args.graph_dir):
    raise FileNotFoundError("Error in input")

df = pd.read_csv(args.csv_file, index_col=0)

# Load every STR graph in the directory, keyed by the numeric id in its filename.
strs = {}
for file in glob.glob(os.path.join(args.graph_dir, "*.gexf")):
    id_ = int(re.findall(r"\d+", file)[-1])
    try:
        strs[id_] = STR.from_networkx_graph(nx.read_gexf(file))
    except Exception:
        strs[id_] = STR({}, [])  # unreadable graph: fall back to an empty STR

def annotate(row):
    # Return the four annotation criteria for a (G1, G2) pair; zeros on failure.
    try:
        return annotater.all(strs[row.G1], strs[row.G2])
    except Exception:
        return [0, 0, 0, 0]

df["res"] = df.apply(annotate, axis=1)
df["res"] = df.res.apply(lambda x: list(map(int, x)))
df["c1"] = df.res.apply(lambda x: x[0])
df["c2"] = df.res.apply(lambda x: x[1])
df["c3"] = df.res.apply(lambda x: x[2])
df["c4"] = df.res.apply(lambda x: x[3])
del df["res"]
df.to_csv(args.output_file)
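For reference, a quick sketch of how the graph id is recovered from a filename here; the path is illustrative, not from the commit:

import re
# The last run of digits in the filename is taken as the graph id.
print(int(re.findall(r"\d+", "graphs/normal/42.gexf")[-1]))  # -> 42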
# coding = utf-8
# generate_annotation_file.py -- turn each similarity matrix into a CSV of
# candidate pairs to annotate.
import argparse, json, os, glob, time

import numpy as np

from strpython.helpers.sim_matrix import matrix_to_pandas_dataframe, read_bz2_matrix

script_beg = time.time()

def _path(string):
    if os.path.exists(string):
        return string
    raise FileNotFoundError(string)

parser = argparse.ArgumentParser()
parser.add_argument("matricesDir", type=_path)
parser.add_argument("selectedFile", type=_path)
parser.add_argument("outputDir")
args = parser.parse_args()

if not os.path.isdir(args.outputDir):
    try:
        os.makedirs(args.outputDir)
    except OSError:
        raise InterruptedError("Cannot create {0} dir".format(args.outputDir))

matrix_fns = glob.glob(os.path.join(args.matricesDir, "*.npy.bz2"))
selected = json.load(open(args.selectedFile))

for fn in matrix_fns:
    # Filenames look like <measure>_<type>.npy.bz2, where <type> may itself
    # contain an underscore ("extension_gen").
    measure = os.path.basename(fn).split("_")[0]
    if os.path.basename(fn).split("_")[-2] in ["extension", "gen"]:
        type_ = "_".join(os.path.basename(fn).split("_")[-2:]).replace(".npy.bz2", "")
    else:
        type_ = os.path.basename(fn).split("_")[-1].replace(".npy.bz2", "")
    print("Processing...", measure, type_)
    df = matrix_to_pandas_dataframe(np.nan_to_num(read_bz2_matrix(fn)),
                                    selected, measure, type_)
    df.to_csv(os.path.join(args.outputDir, "{0}_{1}.csv".format(measure, type_)))

print("The script took {0}s to finish".format(time.time() - script_beg))
# coding = utf-8
# Driver script: generate the annotation files, then fill them automatically.
import os, argparse, subprocess, glob

parser = argparse.ArgumentParser()
parser.add_argument("simMatrixInputDir")
parser.add_argument("graphDataDir")
parser.add_argument("selectedInputFile")
parser.add_argument("outputAnnotation_dir")
parser.add_argument("outputAnnotation2_dir")
args = parser.parse_args()

print("Generating Annotation File")
# process = subprocess.run(["python3", "generate_annotation_file.py",
#                           args.simMatrixInputDir, args.selectedInputFile,
#                           args.outputAnnotation_dir])
# if process.returncode != 0:
#     raise subprocess.CalledProcessError(process.returncode, process.args)

fns = glob.glob(os.path.join(args.outputAnnotation_dir, "*.csv"))
if not os.path.exists(args.outputAnnotation2_dir):
    os.makedirs(args.outputAnnotation2_dir)

for fn in fns:
    print("Processing {0}...".format(fn))
    # The graph directory matches the <type> suffix of the annotation CSV.
    if os.path.basename(fn).split("_")[-2] in ["extension", "gen"]:
        graph_dir = "_".join(os.path.basename(fn).split("_")[-2:]).replace(".csv", "")
    else:
        graph_dir = os.path.basename(fn).split("_")[-1].replace(".csv", "")
    cmd = ["python3", "auto_fill_annotation.py", fn,
           os.path.join(args.graphDataDir, graph_dir),
           os.path.join(args.outputAnnotation2_dir, os.path.basename(fn))]
    print(cmd)
    process = subprocess.run(cmd)
    if process.returncode != 0:
        raise subprocess.CalledProcessError(process.returncode, cmd)
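A hypothetical end-to-end run of the two steps this driver chains; every path below is illustrative:

import subprocess
# Step 1: turn each similarity matrix into a candidate-pair CSV.
subprocess.run(["python3", "generate_annotation_file.py",
                "matrices/", "selected.json", "annotations/"])
# Step 2: fill the c1..c4 criteria for one of the generated CSVs.
subprocess.run(["python3", "auto_fill_annotation.py",
                "annotations/MCS_normal.csv", "graph_data/normal",
                "annotations_filled/MCS_normal.csv"])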
strpython/helpers/sim_matrix.py
@@ -32,11 +32,10 @@ def read_and_load(file_path, selected=None, bz2=True):
 def matrix_to_pandas_dataframe(matrix, selected, sim_measure, type_str, n=5):
     sim, type_ = sim_measure, type_str
     tab_array = []
-    for line in range(len(matrix)):
+    for line in selected:
         top_n = np.argsort(matrix[line])[::-1][1:n + 1]
-        index = selected[line]
         rank = 1
         for val in top_n:
-            tab_array.append([index, val, sim, type_, rank, 0, 0, 0, 0])
+            tab_array.append([line, val, sim, type_, rank, 0, 0, 0, 0])
             rank += 1
     return pd.DataFrame(tab_array, columns="G1 G2 sim_measure type_str rank c1 c2 c3 c4".split())
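A minimal sketch of the revised behaviour, assuming `selected` now holds the row indices to keep (toy matrix and measure name are illustrative):

import numpy as np
from strpython.helpers.sim_matrix import matrix_to_pandas_dataframe

matrix = np.random.rand(6, 6)                      # toy similarity matrix
df = matrix_to_pandas_dataframe(matrix, [0, 3], "MCS", "normal", n=2)
print(df)  # G1 is 0 and 3; each gets its top-2 ranked G2 candidates, c1..c4 zeroed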
strpython/models/str.py
@@ -13,7 +13,9 @@ from ..helpers.collision import collisionTwoSEBoundaries, add_cache_adjacency
 from ..helpers.geodict_helpers import get_data, get_data_by_wikidata_id
 from ..eval.stats import most_common

-from sklearn.cluster import MeanShift, estimate_bandwidth
+from sklearn.cluster import MeanShift, estimate_bandwidth, KMeans, dbscan

 # logging.basicConfig(filename=config.log_file,level=logging.INFO)
@@ -377,7 +379,7 @@ class STR(object):
                 label.append(data["en"])
                 class_.append(most_common(data["class"]))
             except:
-                pass
+                class_.append("P-PPL")
         df=gpd.GeoDataFrame({"geometry":points,"label":label,"classe":class_})
         df["x"]=df.geometry.apply(lambda p: p.x)
         df["y"] = df.geometry.apply(lambda p: p.y)
@@ -385,10 +387,16 @@ class STR(object):
     def get_cluster(self):
         data=self.get_geo_data_of_se()
-        bandwidth = estimate_bandwidth(data[["x", "y"]].values)
-        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
-        ms.fit(data[["x", "y"]].values)
-        data["cluster"] = ms.labels_
+        X=data[["x", "y"]].values
+        try:
+            bandwidth = estimate_bandwidth(X)
+            ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
+            ms.fit(X)
+            data["cluster"] = ms.labels_
+        except:
+            samples,labels=dbscan(X)
+            data["cluster"] = labels
         """
         # second clustering pass
...
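Outside the class, the fallback this hunk adds to get_cluster amounts to the following sketch (toy coordinates; the assumed failure mode is a degenerate bandwidth estimate):

import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth, dbscan

X = np.random.rand(20, 2)                 # toy (x, y) coordinates
try:
    bandwidth = estimate_bandwidth(X)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(X)
    labels = ms.labels_
except Exception:                         # e.g. bandwidth == 0 on degenerate data
    _, labels = dbscan(X)                 # DBSCAN fallback, as in the diff
print(labels)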
# coding = utf-8
# CLI tool: look up geodict identifiers for a given label.
import argparse

from termcolor import colored

from strpython.helpers.geodict_helpers import get_most_common_id_v3, get_data, get_by_label

parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(help='commands')

# "get_id" command
list_parser = subparsers.add_parser(
    'get_id', help='Return ids for a given label')
list_parser.set_defaults(which="getid")
list_parser.add_argument(
    'label', help='label')
list_parser.add_argument(
    '-l', '--language', help='Language', default="en")
list_parser.add_argument(
    '-c', '--most_common', action='store_true',
    help='Take the most common based on its PR value')

args = parser.parse_args()

if getattr(args, "which", None) == "getid":
    ind = 0
    label = args.label
    lang = args.language
    if args.most_common:
        try:
            data = get_data(get_most_common_id_v3(label, lang)[0])
            print(colored("Most Common -->\t{0}\t{1}\t{2}".format(
                data[lang], data["id"], data["wikidataID"]), "magenta"))
        except Exception:
            print(colored("No results found for \"{0}\" in \"{1}\"".format(label, lang), "red"))
            if label[0].islower():
                # Retry with a capitalized label.
                new_label = label[0].upper() + label[1:]
                data = get_by_label(new_label, lang)
                if data:
                    ind += 1
                    print("\t" * ind + "Using a capitalized label, we found these results")
                    for d in data:
                        d = d["_source"]
                        print("\t" * ind + d["en"], d["aliases"],
                              colored(d["id"], "blue"))
                    print("\t" * ind + "...")
                    mc = get_data(get_most_common_id_v3(new_label, lang)[0])
                    print(colored("\t" * ind + "Most Common -->\t{0}\t{1}".format(mc[lang], mc["id"]), "magenta"))
    else:
        data = get_by_label(label, lang)
        if not data:
            print(colored("No results found for \"{0}\" in \"{1}\"".format(label, lang), "red"))
            if label[0].islower():
                # Retry with a capitalized label.
                new_label = label[0].upper() + label[1:]
                data = get_by_label(new_label, lang)
                print("Using a capitalized label, we found these results: ")
                ind += 1
                label = new_label
        if data:
            for d in data:
                d = d["_source"]
                print("\t" * ind + d["en"], d["aliases"],
                      colored(d["id"], "blue"))
            print("\t" * ind + "...")
            mc = get_data(get_most_common_id_v3(label, lang)[0])
            print(colored("\t" * ind + "Most Common -->\t{0}\t{1}".format(mc[lang], mc["id"]), "magenta"))
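A hypothetical lookup with this tool, assuming it is saved as geodict_cli.py (the filename is not given in the commit):

import subprocess
subprocess.run(["python3", "geodict_cli.py", "get_id", "Paris",
                "--language", "fr", "--most_common"])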