# compute_similarity.py
# coding: utf-8
import glob
import argparse, os, sys, re, json, logging
import datetime
import time


import numpy as np
import pandas as pd
import networkx as nx

from gmatch4py import *
from gmatch4py.helpers.reader import import_dir
from gmatch4py import GraphEditDistance as GED2
from gmatch4py.base import Base

#############
# FUNCTIONS #
#############

def _get_graphs(df,id_colsname,colsname):
    """
    Return a list of graphs indexed by graph id

    The returned list has ``max(id) + 1`` entries. The graph of each row is
    stored at the position given by its id; positions whose id does not
    appear in ``df`` hold their own, distinct, empty ``nx.Graph``.

    Parameters
    ----------
    df : pandas.Dataframe
        input
    id_colsname : str
        name of the column that contains graph's ids
        (ids are used as list indices, so they must be integer-like)
    colsname : str
        name of the column that contains the graphs. A cell is either a
        ``nx.Graph`` or an object exposing the graph through a ``.graph``
        attribute
    
    Returns
    -------
    list
        list of graphs (empty if ``df`` has no rows)
    """

    # np.max raises on an empty column; no rows simply means no graphs.
    if len(df) == 0:
        return []

    # int() so a float-typed id column can still size and index the list.
    max_id = int(np.max(df[id_colsname]))

    # One fresh graph per slot: `[nx.Graph()] * n` would put the very same
    # object in every slot, so mutating one placeholder would mutate all.
    graphs = [nx.Graph() for _ in range(max_id + 1)]

    for _, row in df.iterrows():
        cell = row[colsname]
        graphs[int(row[id_colsname])] = cell if isinstance(cell, nx.Graph) else cell.graph
    return graphs

#######################
#  PARSE ARGUMENTS    #
#######################

parser = argparse.ArgumentParser()

# Path of the pickled pandas DataFrame that holds the graphs.
parser.add_argument("input", help="Pickled pandas DataFrame containing the graphs")
# Optional JSON file; its content is handed to the comparators as the selection.
parser.add_argument("-s","--selected",default="",
    help="JSON file describing the selected graphs (optional)")
# With action="append" and no default the value would be None when the flag is
# omitted, and the script would fail later while iterating over it; make the
# flag mandatory so the user gets a proper usage error instead.
parser.add_argument("-g","--graphcol",
    help="Type of graph you want to compare",
    action="append",
    required=True)
parser.add_argument("-i","--idcol",default="id_doc",
    help="Name of the column that contains the graph ids")
# The default log name is a timestamp evaluated once, when the parser is built.
parser.add_argument("-l","--logfile",
    default="{0}.csv".format(datetime.datetime.now().strftime("%Y_%m_%d__%H_%M_%S")),
    help="Path of the log file written at the end of the run")
parser.add_argument("-o","--output",default="output/",help="Output Directory")

args = parser.parse_args()

#################
#  MAIN PROGRAM # 
#################

#Check input file existence
if not os.path.exists(args.input):
    raise FileNotFoundError("Input file doesn't exists ! {0}".format(args.input))

# LOAD INPUT FILE
# NOTE(review): unpickling can execute arbitrary code; only feed this script
# pickle files coming from a trusted source.
df = pd.read_pickle(args.input)

# Check id_col
if args.idcol not in df:
    raise KeyError("Column Id with key = {0} does not exists ! ".format(args.idcol))

# Check graph column
for col in args.graphcol:
    if col not in df:
        raise KeyError("Graph Column with key = {0} does not exists ! ".format(col))

# IF SELECTED GRAPHS
# `selected` stays None (no selection) unless a readable JSON file is given.
selected = None
if args.selected:
    if os.path.exists(args.selected):
        # Context manager so the file handle is closed once the JSON is read.
        with open(args.selected) as selected_file:
            selected = json.load(selected_file)
    else:
        # Keep going without a selection, as before, but no longer silently.
        print("Selected file not found, ignoring it : {0}".format(args.selected), file=sys.stderr)

# LOAD DATA for each type of graph
# Maps each requested graph column name to its list of graphs.
datas={type_:_get_graphs(df,args.idcol,type_) for type_ in args.graphcol}

# OUTPUT FN
matrix_output_dir=args.output

# np.save does not create missing directories; make sure the target exists.
if matrix_output_dir:
    os.makedirs(matrix_output_dir, exist_ok=True)

# One record per computed matrix: [comparator name, graph column, seconds].
output_text=[]
for str_type in args.graphcol:
    graphs=datas[str_type]
    for class_ in [BagOfNodes,WeisfeleirLehmanKernel,GraphEditDistance, BP_2, HED, GreedyEditDistance, Jaccard, MCS, VertexEdgeOverlap]:
        deb=time.time()
        print("Computing the Similarity Matrix for {0} and {1}".format(class_.__name__,str_type))
        # NOTE(review): the former `elif class_ == GED2` branch (weighted=True)
        # was unreachable: GED2 is an alias of GraphEditDistance, which is
        # already handled by the first test below. Add GED2-specific handling
        # explicitly if a weighted run was intended.
        if class_ in (GraphEditDistance, BP_2, GreedyEditDistance, HED):
            # Four unit arguments; presumably the node/edge edit costs —
            # confirm against the gmatch4py documentation.
            comparator = class_(1, 1, 1, 1)
        elif class_ == WeisfeleirLehmanKernel:
            comparator = class_(h=2)
        else:
            comparator=class_()
        # COMPARE
        matrix = comparator.compare(graphs, selected)
        matrix = comparator.similarity(matrix)

        # OUTPUT FILENAME
        output_fn=os.path.join(
            matrix_output_dir,
            "{0}_{1}.npy".format(class_.__name__, str_type)
        )
        # UPDATE LOG (timing covers the computation only, not the save below)
        output_text.append([class_.__name__,str_type,time.time()-deb])

        # SAVE RESULT
        np.save(output_fn,matrix)

        print("Matrix Saved")

# Write the collected timing records (not the last output filename) and make
# sure the log file is flushed and closed.
with open(args.logfile,'w') as log_file:
    log_file.write(json.dumps(output_text))
print("Done")