# coding: utf-8
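"""
Automatically fill annotations for STR matching samples: similarity matrices
are turned into per-measure match dataframes, the c1..c6 criteria are computed
for every matched graph pair, and the results are synthesized into summary CSVs.
"""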


import argparse
import glob
import json
import os
import re

import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

from strpython.eval.automatic_annotation import AnnotationAutomatic
from strpython.models.str import STR
from strpython.helpers.sim_matrix import matrix_to_pandas_dataframe, read_bz2_matrix


def main(dataset, matrix_sim_dir, raw_graph_dir, selected_graphs,
         threshold, inclusion_fn, adjacency_fn,
         min_carac_fn, min_size_G1, min_size_G2, n_car_min_doc1, n_car_min_doc2,
         formatG1, format_fn):
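    """
    Run the full annotation pipeline: build per-measure match dataframes from
    the similarity matrices (step 1), compute the c1..c6 criteria for every
    matched STR pair (step 2), annotate each per-measure sample (step 3), and
    synthesize the results into summary CSVs (step 4).
    """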
    annotater = AnnotationAutomatic(dataset, threshold, inclusion_fn, adjacency_fn)
    first_step_output = "output_first_step_{0}_{1}".format(dataset, threshold)
    last_step_output = "output_final_{0}_{1}".format(dataset, threshold)
    generate_annotation_dataframe(matrix_sim_dir, selected_graphs, first_step_output)
    size_str = extract_criteria_4_all(annotater, first_step_output, raw_graph_dir, dataset, threshold)

    if not os.path.exists(last_step_output):
        os.makedirs(last_step_output)


    for fn in tqdm(glob.glob(os.path.join(first_step_output, "*.csv")), desc="Annotate sample"):
        annotate_eval_sample(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)), size_str)

    min_carac_dict = None
    if min_carac_fn != "" and os.path.exists(min_carac_fn):
        with open(min_carac_fn) as f:
            min_carac_dict = json.load(f)

    format_data = None
    if format_fn and formatG1:
        with open(format_fn) as f:
            format_data = json.load(f)
        for form in formatG1.split(","):
            synthesize(last_step_output, "{0}_{1}.csv".format(dataset, threshold), min_size_G1, min_size_G2,
                       min_carac_dict, n_car_min_doc1, n_car_min_doc2, form, format_data)
    else:
        synthesize(last_step_output, "{0}_{1}.csv".format(dataset, threshold), min_size_G1, min_size_G2,
                   min_carac_dict, n_car_min_doc1, n_car_min_doc2)

def generate_annotation_dataframe(matrix_sim_dir, selected_graphs, output_dir):
    """
    First step: convert each compressed similarity matrix into a CSV of
    graph-pair similarity scores.

    Parameters
    ----------
    matrix_sim_dir : str
        Directory containing the *.npy.bz2 similarity matrices.
    selected_graphs : list
        Identifiers of the graphs covered by the matrices.
    output_dir : str
        Directory where the per-measure CSV files are written.
    """

    if not os.path.exists(matrix_sim_dir):
        raise FileNotFoundError("Similarity matrix directory not found : {0}".format(matrix_sim_dir))

    for fn in glob.glob(os.path.join(matrix_sim_dir,"*.bz2")):
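        # Matrix files are expected to be named <measure>_<type>.npy.bz2;
        # the measure and STR type are recovered from the basename below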
        measure = os.path.basename(fn).split("_")[0]

        type_ = "_".join(os.path.basename(fn).split("_")[1:]).replace(".npy.bz2", "")
        print("Proceeding...", measure, type_)
        if os.path.exists(os.path.join(output_dir, "{0}_{1}.csv".format(measure, type_))):
            continue
        try:
            df = matrix_to_pandas_dataframe(np.nan_to_num(read_bz2_matrix(fn)),
                                            selected_graphs,
                                            measure, type_, 1)
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            df.to_csv(os.path.join(output_dir, "{0}_{1}.csv".format(measure, type_)))
        except Exception as e:
            print("Could not read {0}: {1}".format(fn, e))


def extract_criteria_4_all(annotater, csv_input_dir, raw_graph_dir, dataset, threshold, output_file="temp_out.csv"):
    """
    Second step: load every STR, then compute the c1..c6 criteria for every
    graph pair found in the first-step CSV files.

    Parameters
    ----------
    annotater : AnnotationAutomatic
    csv_input_dir : str
        Directory holding the first-step CSV files.
    raw_graph_dir : str
        Directory holding the *.gexf graph files.
    dataset : str
        (currently unused)
    threshold : float
        (currently unused)
    output_file : str
        Path the criteria dataframe is written to.

    Returns
    -------
    dict
        Mapping from graph identifier to STR size.
    """
    if not os.path.exists(csv_input_dir) or not os.path.exists(raw_graph_dir):
        raise FileNotFoundError("Input directory not found: {0} or {1}".format(csv_input_dir, raw_graph_dir))

    # Extract all matches found using every combination of measure and type of STR
    all_str_matching_available = []
    for filename in glob.glob("{0}/*".format(csv_input_dir)):
        couples = pd.read_csv(filename)["G1 G2".split()].apply(lambda x: "_".join(x.values.astype(str)),
                                                               axis=1).values.tolist()
        all_str_matching_available.extend(couples)
    all_str_matching_available = set(all_str_matching_available)

    # Store in a dataframe
    matching_dataframe = pd.DataFrame([cp.split("_") for cp in all_str_matching_available], columns="G1 G2".split())
    matching_dataframe = matching_dataframe.sort_values(by="G1 G2".split())

    # Load STRs and record their sizes
    strs = {}
    size_STR = {}

    for file in tqdm(glob.glob(os.path.join(raw_graph_dir, "*.gexf")), desc="Load Graphs"):
        id_ = int(re.findall(r"\d+", file)[-1])
        strs[id_] = STR.from_networkx_graph(nx.read_gexf(file))
        size_STR[id_] = len(strs[id_])

    # Annotate a match between two STRs; on a lookup miss, store and return
    # neutral criteria values (300000 acts as a large sentinel distance for c5)
    def annotate(x):
        try:
            return annotater.all(strs[int(x.G1)], strs[int(x.G2)], int(x.G1), int(x.G2))
        except KeyError:
            annotater.matching_cache.add(int(x.G1), int(x.G2), *(0, 0, 0, 0, 300000, 0))
            return [0, 0, 0, 0, 300000, 0]

    # Annotation time
    print("Computing criteria for each match")
    matching_dataframe["res"] = matching_dataframe.progress_apply(lambda x: annotate(x), axis=1)
    matching_dataframe.res = matching_dataframe.res.apply(
        lambda x: [int(x[0]), int(x[1]), int(x[2]), int(x[3]), float(x[4]), float(x[5])] if x else [])
    for ix, col in enumerate("c1 c2 c3 c4 c5 c6".split()):
        matching_dataframe[col] = matching_dataframe.res.apply(lambda x: x[ix] if len(x) > 0 else 0)

    del matching_dataframe["res"]
    # Write the criteria dataframe, then return the STR sizes for the next step
    matching_dataframe.to_csv(output_file)
    return size_STR


def annotate_eval_sample(annotater, csv_file, output_file, size_str):
    """
    Third step: annotate one first-step sample CSV with the cached c1..c6
    criteria and the sizes of both STRs.

    Parameters
    ----------
    annotater : AnnotationAutomatic
    csv_file : str
        First-step CSV to annotate.
    output_file : str
        Path of the annotated CSV.
    size_str : dict
        Mapping from graph identifier to STR size.
    """
    if os.path.exists(output_file):
        return
    if not os.path.exists(csv_file):
        raise FileNotFoundError("Input file not found: {0}".format(csv_file))

    df = pd.read_csv(csv_file, index_col=0)

    # annotater.all is called without the STRs here, relying on the matching
    # cache filled during the second step; on failure, fall back to neutral values
    def annotate(x):
        try:
            return annotater.all(None, None, x.G1, x.G2)
        except Exception:
            return [0, 0, 0, 0, 300000, 0]

    df["res"] = df.apply(lambda x: annotate(x), axis=1)
    df.res = df.res.apply(lambda x: list(map(float, x)) if x else [])
    # Unpack the six criteria; 300000 is the sentinel default for c5
    defaults = [0, 0, 0, 0, 300000, 0]
    for ix, col in enumerate("c1 c2 c3 c4 c5 c6".split()):
        df[col] = df.res.apply(lambda x: x[ix] if len(x) > 0 else defaults[ix])
    df["size_G1"] = df.apply(lambda x: size_str.get(x.G1, 0), axis=1)
    df["size_G2"] = df.apply(lambda x: size_str.get(x.G2, 0), axis=1)
    del df["res"]

    df.to_csv(output_file)


def synthesize(last_step_output, output_filename, min_size_G1=None, min_size_G2=None,
               min_carac_dict=None, ncar_min_doc1=0, ncar_min_doc2=0,
               formatG1=None, format_data=None):
    """
    Fourth step: filter the annotated samples and aggregate the c1..c6
    criteria per measure and STR type into a single summary CSV.

    Parameters
    ----------
    last_step_output : str
        Directory containing the annotated CSV files.
    output_filename : str
        Base name of the summary CSV; filter suffixes are appended to it.
    min_size_G1, min_size_G2 : int, optional
        Minimum STR sizes for G1 and G2.
    min_carac_dict : dict, optional
        Mapping from graph identifier to document length (in characters).
    ncar_min_doc1, ncar_min_doc2 : int
        Minimum document lengths for G1 and G2.
    formatG1 : str, optional
        Keep only pairs whose G1 document has this format.
    format_data : dict, optional
        Mapping from graph identifier to document format.
    """
    fns = glob.glob(os.path.join(last_step_output, "*.csv"))
    if min_size_G1:
        output_filename = output_filename + "_ming1_{0}".format(min_size_G1)
    if min_size_G2:
        output_filename = output_filename + "_ming2_{0}".format(min_size_G2)
    if min_carac_dict and ncar_min_doc1 > 0:
        output_filename = output_filename + "_mindoc1len_{0}".format(ncar_min_doc1)
    if min_carac_dict and ncar_min_doc2 > 0:
        output_filename = output_filename + "_mindoc2len_{0}".format(ncar_min_doc2)
    if formatG1 and format_data:
        output_filename = output_filename + "_format_{0}".format(formatG1)
    data = []

    for fn in tqdm(fns, desc="Synthesize Results"):
        df = pd.read_csv(fn)
        if formatG1:
            df["formatG1"] = df.G1.apply(lambda x: format_data.get(str(x)))
        if min_size_G1:
            df = df[df.size_G1 >= min_size_G1]

        if min_size_G2:
            df = df[df.size_G2 >= min_size_G2]

        if formatG1 and format_data:
            df = df[df.formatG1 == formatG1]

        if min_carac_dict and ncar_min_doc1 > 0:
            df["len_doc1"] = df.apply(lambda x: min_carac_dict.get(str(x.G1), 0), axis=1)
            df = df[df.len_doc1 >= ncar_min_doc1]

        if min_carac_dict and ncar_min_doc2 > 0:
            df["len_doc2"] = df.apply(lambda x: min_carac_dict.get(str(x.G2), 0), axis=1)
            df = df[df.len_doc2 >= ncar_min_doc2]

        df = df.replace([np.inf, -np.inf], 300000)
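        # Invert and min-max normalize c5 (a distance) so that, like the
        # other criteria, higher values are better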
        df["c5"] = 1 - (df.c5 - df.c5.min()) / (df.c5.max() - df.c5.min())
        #df["c6"] = (df.c6 - df.c6.min()) / (df.c6.max() -df.c6.min())
        if len(df) <1:
            continue
        mes = np.unique(df.sim_measure)[0]
        type_ = np.unique(df.type_str)[0]
        val = df.groupby("G1").mean().mean()["c1 c2 c3 c4 c5 c6".split()].values.tolist()
        val.insert(0, type_)
        val.insert(0, mes)
        data.append(val)

    res = pd.DataFrame(data, columns="measure type c1 c2 c3 c4 c5 c6".split())
    res.to_csv(output_filename)
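

# A minimal CLI sketch (hypothetical): argparse is imported above but no entry
# point is defined in this file. The flag names below simply mirror main()'s
# parameters and are assumptions, not a documented interface; selected graph
# identifiers are assumed to be provided as a JSON list.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Automatically annotate STR matching samples")
    parser.add_argument("dataset")
    parser.add_argument("matrix_sim_dir")
    parser.add_argument("raw_graph_dir")
    parser.add_argument("selected_graphs", help="JSON file listing the selected graph identifiers")
    parser.add_argument("threshold", type=float)
    parser.add_argument("--inclusion_fn", default="")
    parser.add_argument("--adjacency_fn", default="")
    parser.add_argument("--min_carac_fn", default="")
    parser.add_argument("--min_size_G1", type=int, default=0)
    parser.add_argument("--min_size_G2", type=int, default=0)
    parser.add_argument("--n_car_min_doc1", type=int, default=0)
    parser.add_argument("--n_car_min_doc2", type=int, default=0)
    parser.add_argument("--formatG1", default=None)
    parser.add_argument("--format_fn", default=None)
    args = parser.parse_args()

    with open(args.selected_graphs) as f:
        selected = json.load(f)

    main(args.dataset, args.matrix_sim_dir, args.raw_graph_dir, selected,
         args.threshold, args.inclusion_fn, args.adjacency_fn,
         args.min_carac_fn, args.min_size_G1, args.min_size_G2,
         args.n_car_min_doc1, args.n_car_min_doc2,
         args.formatG1, args.format_fn)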