# eval.py
# coding: utf-8
import glob
import json
import os
import re
import time

from progressbar import ProgressBar, Timer, Bar, ETA

from gmatch4py.bag_of_cliques import BagOfCliques
from gmatch4py.ged.approximate_ged import ApproximateGraphEditDistance
from gmatch4py.ged.bipartite_graph_matching_2 import BP_2
from gmatch4py.ged.greedy_edit_distance import GreedyEditDistance
from gmatch4py.ged.hausdorff_edit_distance import HED
from gmatch4py.jaccard import Jaccard
from gmatch4py.kernels.weisfeiler_lehman import *
from gmatch4py.mcs import MCS
from gmatch4py.vertex_edge_overlap import VertexEdgeOverlap
from strpython.nlp.bow_se import BOWSE
from strpython.pipeline import *

# State for synthesizing fallback coordinates (see getLocInfo): `temp` walks
# a lat/lon grid starting at `def_temp`, advancing the second component by
# `dec` degrees per unknown entity and wrapping once it reaches `max_temp`.
def_temp = [36, -36]
temp = def_temp
max_temp = -30
dec = 5


def getLocInfo(id_):
    """Return [lat, lon] for the spatial entity *id_*.

    Uses the real coordinates from get_data() when present.  Otherwise a
    synthetic position is taken from the module-level `temp` grid walker
    (second component advances by `dec`, wrapping past `max_temp` onto the
    next row).  Returns None when the lookup itself fails.
    """
    global temp  # NOTE(review): `dec` was declared global but never assigned
    try:
        data = get_data(id_)
        if 'coord' in data:
            return [data["coord"]["lat"], data["coord"]["lon"]]
        # No stored coordinates: hand out the next synthetic grid position.
        temp = [temp[0], temp[1] + dec]
        if temp[1] >= max_temp:
            # Wrapped: start a new row at the default second coordinate.
            temp = [temp[0] + dec, def_temp[1]]
        return temp
    except Exception:
        # Best-effort: a failed lookup yields no coordinates rather than
        # aborting the whole evaluation run (was a silent bare except).
        return None


def get_associated_es(associated_es_data):
    """Build {id: {"label": ..., "coord": ...}} for each entity id given.

    The label comes from get_data(id_)["en"] (presumably the English label —
    confirm against strpython); on any lookup failure the raw id is used
    instead.  Resets the synthetic-coordinate walker so every document
    starts from the same grid origin.
    """
    global temp
    temp = def_temp
    new_ = {}
    for id_ in associated_es_data:
        try:
            label = get_data(id_)["en"]
        except Exception:
            # Best-effort fallback (was a silent bare except).
            label = id_
        new_[id_] = {"label": label, "coord": getLocInfo(id_)}
    return new_


def getEdges4Draw(data, edges):
    """Translate graph edges into drawable line segments.

    Each output item is [source coord, target coord, color]: coordinates
    come from `data` (as built by get_associated_es), the color from the
    edge attribute dict; "cyan" is rendered as "blue".
    """
    segments = []
    for src, dst, attrs in edges:
        color = attrs["color"]
        if color == "cyan":
            color = "blue"
        segments.append([data[src]["coord"], data[dst]["coord"], color])
    return segments


# Similarity Function between graph and a set of graphs

def compareMCS(graphs, selected):
    """Dissimilarity matrix from the Maximum Common Subgraph similarity."""
    similarity = MCS.compare(graphs, selected)
    return 1 - similarity


# GED algorithm
def compareGED(graphs, selected):
    """Approximate Graph Edit Distance matrix (no 1 - inversion: already a distance)."""
    distances = ApproximateGraphEditDistance.compare(graphs, selected)
    return distances


def compareBP2(graphs, selected):
    """Bipartite matching (BP2) edit-distance matrix."""
    distances = BP_2.compare(graphs, selected)
    return distances


def compareHED(graphs, selected):
    """Hausdorff edit-distance matrix."""
    distances = HED.compare(graphs, selected)
    return distances


def compareGreedy(graphs, selected):
    """Greedy edit-distance matrix."""
    distances = GreedyEditDistance.compare(graphs, selected)
    return distances


def compareWLSubTreeKernel(graphs, selected):
    """Dissimilarity from the Weisfeiler-Lehman subtree kernel (3 iterations)."""
    kernel = WeisfeleirLehmanKernel.compare(graphs, selected, h=3)
    return 1 - kernel


def compareBOWSE(graphs, selected):
    """Dissimilarity from the Bag-Of-Words Spatial-Entity similarity."""
    similarity = BOWSE.compare(graphs, selected)
    return 1 - similarity


def compareBOC(graphs_array, selected):
    """Dissimilarity matrix from the Bag-of-Cliques similarity.

    Written as ``1 - sim`` for consistency with the other compare* helpers;
    NumPy broadcasting makes this equivalent to the previous
    ``np.ones((n, n)) - sim`` form for an (n, n) similarity matrix.
    """
    return 1 - BagOfCliques.compare(graphs_array, selected)


def compareVEO(graphs_array, selected):
    """Dissimilarity from the Vertex/Edge Overlap similarity."""
    similarity = VertexEdgeOverlap.compare(graphs_array, selected)
    return 1 - similarity


def compareJaccard(graphs_array, selected):
    """Dissimilarity from the Jaccard similarity."""
    similarity = Jaccard.compare(graphs_array, selected)
    return 1 - similarity


# Dispatch table: CLI distance name -> matrix-producing compare function.
# Every entry returns a dissimilarity matrix (lower = more similar).
funcDict = {
    "MCS": compareMCS,
    "VEO": compareVEO,
    "GED": compareGED,
    "BP2": compareBP2,
    "HED": compareHED,
    "GREEDY": compareGreedy,
    "WLSUBTREE": compareWLSubTreeKernel,
    "BOWSE": compareBOWSE,
    "BOC": compareBOC,
    "JACCARD": compareJaccard
}

import argparse

# Command-line interface: distance name plus the input/output locations.
parser = argparse.ArgumentParser()
parser.add_argument("distance")
parser.add_argument("texts_dir")
parser.add_argument("graphs_dir")
parser.add_argument("metadata_fn")
parser.add_argument("original_dir")
parser.add_argument("-s", "--selectedGraph")
parser.add_argument("-a", "--all", action="store_true")
parser.add_argument("-o", "--output", help="Output Filename")
args = parser.parse_args()

original_dir = args.original_dir
# Validate the requested distance against the dispatch table.
# NOTE(review): the exit() that followed this raise was unreachable; removed.
if args.distance not in funcDict:
    raise NotFoundDistance(args.distance, funcDict)

# Load all the texts from the corpus, indexed by the numeric id embedded in
# each filename (the last run of digits, e.g. "doc_12.txt" -> slot 12).
# NOTE(review): assumes the ids are exactly 0..len(files)-1 — an out-of-range
# id would raise IndexError; confirm against the corpus layout.
texts = []
if os.path.exists(args.texts_dir):
    files_glob = glob.glob(args.texts_dir + "/*.txt")
    texts = [""] * len(files_glob)
    for fn in files_glob:
        idx = int(re.findall(r"\d+", fn)[-1])
        # `with` closes each handle (the originals leaked); platform-default
        # encoding kept, as upstream did not specify one.
        with open(fn) as f:
            texts[idx] = f.read()

# Abort early when either input yielded nothing to evaluate.
if not os.path.exists(args.graphs_dir):
    print("No graph files were loaded !")
    exit()
if not texts:
    print("No text files were loaded !")
    exit()

# Load graph metadata: associated spatial entities and per-document counts.
with open(args.metadata_fn) as f:
    assC = json.load(f)
associated_es, count_per_doc = assC[0], assC[1]

# Load the STR graphs, indexed by the numeric id in each .gexf filename.
graphs = {}
for gexf_fn in glob.glob(args.graphs_dir.rstrip("/") + "/*.gexf"):
    idx = int(re.findall(r"\d+", gexf_fn)[-1])
    graphs[idx] = nx.read_gexf(gexf_fn)

# Guard: max() below would crash on an empty dict with a confusing error.
if not graphs:
    print("No graph files were loaded !")
    exit()

# Dense list representation: slot i holds graph i, empty graph when missing.
graphs_array = [nx.Graph() for _ in range(max(graphs.keys()) + 1)]
for i, g in graphs.items():
    graphs_array[i] = g

# Choose which documents to evaluate: every graph, or an explicit id list
# supplied as a JSON file via --selectedGraph.
if args.all:
    selected_documents_ = list(graphs.keys())
elif args.selectedGraph:
    with open(args.selectedGraph) as f:
        selected_documents_ = json.load(f)
else:
    # Previously, giving neither flag left selected_documents_ undefined and
    # the script crashed later with a NameError; fail fast instead.
    print("No documents selected: use --all or --selectedGraph")
    exit()


# Generating Evaluation Output
top_ten_documents = []  # NOTE(review): unused in this file; kept for safety
final_data = {}

deb = time.time()
print("Computing Similarity Matrix ...")
# Dispatch to the selected distance; returns a dissimilarity matrix.
similarity_matrix = funcDict[args.distance](graphs_array, selected_documents_)
print("Similarity Matrix Computed in {0} s.".format(time.time() - deb))

# Reload the *original* (pre-transformation) graphs for display purposes.
graphs = {}
for gexf_fn in glob.glob(original_dir.rstrip("/") + "/*.gexf"):
    idx = int(re.findall(r"\d+", gexf_fn)[-1])
    graphs[idx] = nx.read_gexf(gexf_fn)

# Number of nearest neighbours kept per query document.
nn_ = 5

# Assemble the per-document evaluation payload: each selected document gets
# its spatial entities, text, drawable edges, and its nn_ most similar docs.
with ProgressBar(max_value=len(selected_documents_), widgets=[' [', Timer(), '] ', Bar(), ' (', ETA(), ') ', ]) as pg:
    inc = 0
    for doc_s in selected_documents_:
        # Skip documents whose original graph has no nodes.
        if not len(graphs[doc_s]) > 0:
            continue
        bow_score = similarity_matrix[doc_s]
        # Ascending sort of dissimilarities: most similar documents first.
        top_docs_score = np.sort(bow_score).astype(float)
        top_docs = np.argsort(bow_score).astype(int)
        final_data[doc_s] = {
            "sp_entities": get_associated_es(graphs[doc_s].nodes()),
            "text": texts[doc_s],
        }
        final_data[doc_s]["edges"] = getEdges4Draw(final_data[doc_s]["sp_entities"], graphs[doc_s].edges(data=True))
        final_data[doc_s]["topk"] = []
        n_top_docs = len(top_docs)
        for d in range(n_top_docs):
            # Skip ids without an original graph and the query itself.
            if not top_docs[d] in graphs or top_docs[d] == doc_s:
                continue
            # Stop once nn_ neighbours have been collected.
            if len(final_data[doc_s]["topk"]) == nn_:
                break
            doc_data = {}
            doc_data["score"] = top_docs_score[d]
            doc_data["id_txt"] = int(top_docs[d])
            doc_data["text"] = ""  # texts[int(top_10_docs[d])]
            doc_data["sp_entities"] = get_associated_es(graphs[doc_data["id_txt"]].nodes())
            doc_data["edges"] = getEdges4Draw(doc_data["sp_entities"], graphs[doc_data["id_txt"]].edges(data=True))
            # Relevance is left to be filled in during manual evaluation.
            doc_data["relevant"] = None
            final_data[doc_s]["topk"].append(doc_data)
        inc += 1
        pg.update(inc)

# Persist the evaluation payload as JSON, either in the default GUI folder
# or under the directory given by --output (created when missing).
if not args.output:
    out_fn = "gui_graph_viewer/evalTop10STR_{0}.json".format(args.distance)
else:
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    out_fn = "{0}/evalTop10STR_{1}.json".format(args.output.rstrip("/"), args.distance)
print("Saved in {0}".format(out_fn))
# `with` guarantees the handle is flushed and closed (originals leaked it).
with open(out_fn, 'w') as f:
    json.dump(final_data, f, indent=4)