# validationFramework.py

import ogr
import os
import numpy as np
import math
import subprocess
import platform
import sys
import warnings
import csv

from sklearn.metrics import confusion_matrix, accuracy_score, cohen_kappa_score, precision_recall_fscore_support
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from math import floor, log10
from mtdUtils import cloneVectorDataStructure, fieldToArray
from segmentationWorkflow import selectSamples

def pixelValidation(ref_shp,val_shp,ref_img,txt_out,cfield,pfield=None):
    """Run a pixel-based validation with the OTB command-line tools.

    The function calls ``otbcli_Rasterization`` on ``val_shp`` (attribute
    mode, field ``pfield``, using ``ref_img`` as ``-im``), then calls
    ``otbcli_ComputeConfusionMatrix`` on the resulting raster against the
    vector ``ref_shp`` (field ``cfield``). The text printed by the second
    tool is written to ``txt_out``.

    :param ref_shp: reference vector file passed as ``-ref.vector.in``.
    :param val_shp: vector file holding the predicted class field.
    :param ref_img: image passed as ``-im`` to the rasterization tool.
    :param txt_out: path of the text file receiving the tool output.
    :param cfield: name of the class field in ``ref_shp``.
    :param pfield: name of the predicted class field in ``val_shp``;
        defaults to ``'p' + cfield``.
    :raises subprocess.CalledProcessError: if the confusion matrix tool
        exits with a non-zero status.
    """
    # Platform dependent parameters
    system = platform.system()
    if system == 'Linux':
        sh = False
    elif system == 'Windows':
        sh = True
    else:
        sys.exit("Platform not supported!")

    if pfield is None:
        pfield = 'p' + cfield

    base = os.path.splitext(val_shp)[0]

    tmp = base + '.tif'
    cmd = ['otbcli_Rasterization', '-in', val_shp, '-im', ref_img, '-out', tmp, 'uint16', '-mode', 'attribute', '-mode.attribute.field', pfield]
    subprocess.call(cmd, shell=sh)

    tmp_cm = base + '.confmat.csv'
    cmd = ['otbcli_ComputeConfusionMatrix', '-in', tmp, '-out', tmp_cm, '-ref', 'vector', '-ref.vector.in', ref_shp, '-ref.vector.field', cfield]
    try:
        res = subprocess.check_output(cmd,shell=sh)
    finally:
        # Always drop the temporary raster, even when the tool fails; the
        # existence check avoids masking the original error.
        if os.path.exists(tmp):
            os.remove(tmp)

    # check_output returns bytes on Python 3; the report is written as text.
    if not isinstance(res, str):
        res = res.decode()

    with open(txt_out, "w") as tid:
        tid.write(res)


def surfaceValidation(ref_shp,val_shp,out,cfield,pfield=None):
    """Compute area-weighted validation figures from two polygon layers.

    Every feature of ``val_shp`` is intersected with every feature of
    ``ref_shp`` it spatially matches. Each intersection with a positive area
    is written to the shapefile ``out`` together with the ``cfield`` and
    ``pfield`` values read from the validation feature and an ``ERR`` flag
    (1 when the two values differ). The intersection areas are then used as
    sample weights for the scikit-learn metrics.

    :param ref_shp: path of the reference vector file.
    :param val_shp: path of the validation vector file; it must carry both
        ``cfield`` and ``pfield``.
    :param out: path of the shapefile to create with the intersections.
    :param cfield: name of the (true) class field.
    :param pfield: name of the predicted class field; defaults to
        ``'p' + cfield``.
    :return: tuple ``(classes, cm, acc, kappa, prf)`` where ``classes`` lists
        the sorted labels indexing ``cm`` and the per-class arrays of
        ``prf``.
    """
    if pfield is None:
        pfield = 'p' + cfield

    shpd = ogr.GetDriverByName('ESRI Shapefile')
    ref_ds = ogr.Open(ref_shp, 0)
    val_ds = ogr.Open(val_shp, 0)
    ref_ly = ref_ds.GetLayer(0)
    val_ly = val_ds.GetLayer(0)

    out_ds = shpd.CreateDataSource(out)
    out_ly = out_ds.CreateLayer(os.path.splitext(os.path.basename(out))[0],
                             srs=val_ly.GetSpatialRef(),
                             geom_type=val_ly.GetLayerDefn().GetGeomType())

    # The output schema reuses the two class field definitions of the
    # validation layer and adds an integer error flag.
    feat = val_ly.GetFeature(0)
    cidx = feat.GetFieldIndex(cfield)
    pidx = feat.GetFieldIndex(pfield)
    out_ly.CreateField(feat.GetFieldDefnRef(cidx))
    out_ly.CreateField(feat.GetFieldDefnRef(pidx))
    err_fld = ogr.FieldDefn("ERR", ogr.OFTInteger)
    out_ly.CreateField(err_fld)

    y_true = []
    y_pred = []
    y_wght = []

    for rf in ref_ly:
        rg = rf.GetGeometryRef()
        # Only visit validation features matching this reference geometry.
        val_ly.SetSpatialFilter(rg)
        val_ly.ResetReading()
        for vf in val_ly:
            vg = vf.GetGeometryRef()
            og = vg.Intersection(rg)
            if og is None:
                # Intersection could not be computed: report the area of
                # the validation geometry, as the original code did.
                print(vg.GetArea())
                continue
            area = og.GetArea()
            if area > 0:
                of = ogr.Feature(out_ly.GetLayerDefn())
                of.SetGeometry(og)
                c = vf.GetField(cidx)
                p = vf.GetField(pidx)
                e = int(c != p)
                of.SetField(0, c)
                of.SetField(1, p)
                of.SetField(2, e)
                out_ly.CreateFeature(of)
                y_true.append(int(c))
                y_pred.append(int(p))
                y_wght.append(area)

    # Dereference the datasets so that OGR flushes and closes them.
    ref_ds = None
    val_ds = None
    out_ds = None

    cm = confusion_matrix(y_true,y_pred,sample_weight=y_wght)
    acc = accuracy_score(y_true, y_pred, sample_weight=y_wght)
    kappa = cohen_kappa_score(y_true, y_pred, sample_weight=y_wght)
    prf = precision_recall_fscore_support(y_true, y_pred, sample_weight=y_wght)
    # scikit-learn labels the matrix with every class found in y_true OR
    # y_pred, so the label list must be built from both, otherwise it is
    # shorter than cm whenever a class is only ever predicted.
    classes = sorted(np.unique(y_true + y_pred))

    return classes,cm,acc,kappa,prf


def formatValidationTxt(classes,cm,acc,kappa,prf,txt_out):
    """Write a plain-text validation report to ``txt_out``.

    The report holds the confusion matrix, the per-class precision, recall
    and F-measure, the overall accuracy (printed as a percentage) and the
    kappa coefficient.

    :param classes: class codes labelling the rows and columns of ``cm``.
    :param cm: confusion matrix, iterated row by row.
    :param acc: overall accuracy as a fraction (it is multiplied by 100).
    :param kappa: kappa coefficient.
    :param prf: sequence whose first three items are the per-class
        precision, recall and F-measure values.
    :param txt_out: path of the text file to write (overwritten).
    """
    # Width of the integer part of the largest cell exactly as it will be
    # printed with two decimals. Unlike ceil(log10(max)) this is defined for
    # an all-zero matrix, never negative, and correct for powers of ten.
    cm_max = np.max(cm) if np.size(cm) else 0
    ndigit = max(1, len('%.2f' % cm_max) - 3)

    cm_fmt = '%' + str(ndigit + 3) + '.2f'
    hd_fmt = '[ %' + str(ndigit - 1) + 'd ]'
    pr_fmt = ' ' * (ndigit - 3) + '%6.4f'
    header = ' ' * (ndigit + 3) + ' ' + ' '.join([hd_fmt % x for x in classes]) + '\n\n'

    # Format text output
    with open(txt_out, "w") as tid:
        tid.write('Confusion matrix (surfaces):\n\n')
        tid.write(header)
        for i, row in enumerate(cm):
            tid.write(hd_fmt % classes[i] + ' ' + ' '.join([cm_fmt % x for x in row]) + '\n')

        tid.write('\n')

        tid.write('Per-class figures :\n\n')
        tid.write(header)
        tid.write('PREC' + ' ' * (ndigit + 3 - 4) + ' ' + ' '.join([pr_fmt % x for x in prf[0]]) + '\n')
        tid.write('RECALL' + ' ' * (ndigit + 3 - 6) + ' ' + ' '.join([pr_fmt % x for x in prf[1]]) + '\n')
        tid.write('F-MEAS' + ' ' * (ndigit + 3 - 6) + ' ' + ' '.join([pr_fmt % x for x in prf[2]]) + '\n')

        tid.write('\n')

        tid.write('Overall Accuracy: %5.2f%%\n' % (acc * 100))
        tid.write('Kappa           : %6.4f\n' % (kappa))

def genKFolds(shp,fld,k,out_fld=None):
    """Split the features of a vector file into ``k`` stratified folds.

    For every fold a training and a test shapefile are written to the
    output folder; existing files with the same name are deleted first.

    :param shp: path of the input vector file.
    :param fld: name of the class field used for the stratification.
    :param k: number of folds.
    :param out_fld: output folder. When ``None`` a ``<k>_folds`` folder is
        created next to ``shp``; when given it must already exist
        (otherwise the process exits).
    :return: tuple ``(train_out, test_out)`` of lists of file paths, one
        entry per fold.
    """
    # Read all features and store classes in a separate array
    ds = ogr.Open(shp)
    allfeat = []
    classes = []
    ly = ds.GetLayer(0)
    for f in ly:
        allfeat.append(f)
        classes.append(f.GetField(fld))
    ly.ResetReading()

    classes = np.array(classes)
    Ncl = len(np.unique(classes))

    # Generate indices for fold splitting
    kf = StratifiedKFold(n_splits=k,shuffle=True)
    train_index = []
    test_index = []
    for i, (tr, ts) in enumerate(kf.split(classes,classes), 1):
        miss_train = len(np.unique(classes[tr])) < Ncl
        miss_test = len(np.unique(classes[ts])) < Ncl
        if miss_train and miss_test:
            warnings.warn('Fold ' + str(i) + ' misses some class in both training and test set.')
        elif miss_train:
            warnings.warn('Fold ' + str(i) + ' misses some class in training set.')
        elif miss_test:
            warnings.warn('Fold ' + str(i) + ' misses some class in test set.')
        train_index.append(tr)
        test_index.append(ts)

    # Generate and fill output files
    dgt = len(str(k))  # zero-padding width of the fold number
    train_out = []
    test_out = []
    if out_fld is None:
        # os.path.join keeps the folder relative when shp has no directory
        # part (plain concatenation with '/' would point at the root).
        out_fld = os.path.join(os.path.dirname(shp), str(k) + '_folds')
        if not os.path.exists(out_fld):
            os.mkdir(out_fld)
    elif not os.path.exists(out_fld):
        sys.exit('Output folder does not exists!')

    drv = ogr.GetDriverByName('ESRI Shapefile')

    # Split the extension off once so the suffix is inserted whatever the
    # case of the extension, and only at the end of the file name.
    stem, ext = os.path.splitext(os.path.basename(shp))

    for i in range(k):
        fold_id = str(i+1).zfill(dgt)
        fn_train = os.path.join(out_fld, stem + '_train_fold_' + fold_id + ext)
        if os.path.exists(fn_train):
            drv.DeleteDataSource(fn_train)
        fn_test = os.path.join(out_fld, stem + '_test_fold_' + fold_id + ext)
        if os.path.exists(fn_test):
            drv.DeleteDataSource(fn_test)
        train_out.append(fn_train)
        test_out.append(fn_test)
        # NOTE(review): cloneVectorDataStructure presumably creates an empty
        # data source with the same structure at the given path - confirm
        # in mtdUtils.
        ds_tr = cloneVectorDataStructure(ds,fn_train)
        ds_ts = cloneVectorDataStructure(ds, fn_test)
        ds_tr_ly = ds_tr.GetLayer(0)
        ds_ts_ly = ds_ts.GetLayer(0)
        for t in train_index[i]:
            ds_tr_ly.CreateFeature(allfeat[t])
        for t in test_index[i]:
            ds_ts_ly.CreateFeature(allfeat[t])
        # Dereference so that OGR flushes and closes the fold files.
        ds_tr = None
        ds_ts = None

    ds = None

    return train_out,test_out

def kFoldRefToSamples(train_samples, test_samples, train_folds, test_folds):
    """Derive per-fold sample files from per-fold reference files.

    For each fold ``i``, ``selectSamples`` is called once with
    ``train_samples`` and ``train_folds[i]`` and once with ``test_samples``
    and ``test_folds[i]``. The output names are the sample file names with
    ``_train_fold_<n>`` / ``_test_fold_<n>`` inserted before the extension.

    :param train_samples: path of the training sample file.
    :param test_samples: path of the test sample file.
    :param train_folds: list of per-fold training reference files.
    :param test_folds: list of per-fold test reference files, at least as
        long as ``train_folds``.
    :return: tuple ``(kfold_train_samples, kfold_test_samples)`` of lists of
        output paths; two empty lists when ``train_folds`` is empty.
    """
    k = len(train_folds)
    # len(str(k)) is the zero-padding width and, unlike log10, is defined
    # for k == 0 (no folds -> nothing to do, empty result).
    dgt = len(str(k))
    # Insert the suffix before the extension only; a plain '.shp' replace
    # would leave the name unchanged (output == input) for e.g. '.SHP'.
    train_stem, train_ext = os.path.splitext(train_samples)
    test_stem, test_ext = os.path.splitext(test_samples)

    kfold_train_samples = []
    kfold_test_samples = []
    for i in range(k):
        fold_id = str(i + 1).zfill(dgt)
        fnout = train_stem + '_train_fold_' + fold_id + train_ext
        selectSamples(train_samples,train_folds[i],fnout,merging_fields=False)
        kfold_train_samples.append(fnout)
        fnout = test_stem + '_test_fold_' + fold_id + test_ext
        selectSamples(test_samples, test_folds[i], fnout,merging_fields=False)
        kfold_test_samples.append(fnout)

    return kfold_train_samples,kfold_test_samples

def kFoldReport(fscores,accs,kappas,txt_out):
    """Write the summary of a k-fold cross validation to a text file.

    For every class the mean and standard deviation of its F-scores are
    reported, followed by the mean and standard deviation of the overall
    accuracy (as a percentage) and of the kappa coefficient.

    :param fscores: dict mapping a class code to the sequence of F-scores
        obtained for that class over the folds.
    :param accs: sequence of overall accuracies (fractions), one per fold.
    :param kappas: sequence of kappa coefficients, one per fold.
    :param txt_out: path of the text file to write (overwritten).
    """
    # The column-width computation of the original code was never used and
    # failed when the largest class code was 0; it has been removed.
    classes = list(fscores.keys())
    mns = [np.mean(fscores[c]) for c in classes]
    stds = [np.std(fscores[c]) for c in classes]

    acc_mn = np.mean(accs)
    acc_std = np.std(accs)
    kap_mn = np.mean(kappas)
    kap_std = np.std(kappas)

    # Format text output
    with open(txt_out, "w") as tid:
        tid.write('Summary of ' + str(len(accs)) + '-fold cross validation.\n\n')

        tid.write('Per-class F-scores :\n\n')
        tid.write('\n'.join(['Class %8d : %6.4f +/- %6.4f' % (c,x,y) for c,x,y in zip(classes,mns,stds)]))

        tid.write('\n\n')

        tid.write('Overall Accuracy: %5.2f%% +/- %5.2f%%\n' % (acc_mn * 100,acc_std * 100))
        tid.write('Kappa           : %6.4f +/- %6.4f\n' % (kap_mn,kap_std))

def readKFoldReport(fn,cln,tag = None):
    """Parse a k-fold report and a class nomenclature file.

    The report ``fn`` is scanned for lines starting with ``Class``,
    ``Overall`` and ``Kappa``. The nomenclature ``cln`` is read as
    consecutive ``code,name`` lines; reading stops at the first line that
    does not split into exactly two comma-separated items. When ``tag`` is
    given, every line up to and including the first line equal to ``tag``
    is skipped first.

    :param fn: path of the k-fold report text file.
    :param cln: path of the nomenclature file.
    :param tag: optional line marking the start of the nomenclature section
        to use.
    :return: dict with keys ``'PerClass'`` (class name -> [mean, std] of the
        F-score), ``'OverallAcc'`` ([mean, std], in percent as written in the
        report), ``'Kappa'`` ([mean, std]) and ``'ClassDict'`` (class code ->
        class name).
    :raises KeyError: if the report mentions a class code missing from the
        nomenclature that was read.
    """
    # read nomenclature (text mode so that lines compare equal to tag)
    with open(cln, 'r') as cid:
        cidl = cid.read().splitlines()

    clnames = {}
    searching = tag is not None
    for cl in cidl:
        if searching:
            # Skip everything up to and including the tag line.
            if cl == tag:
                searching = False
            continue

        scl = cl.split(',')
        if len(scl) != 2:
            break
        clnames[int(scl[0])] = scl[1]

    # read report
    classes = []
    fsc_mean = []
    fsc_std = []
    oa_mean = None
    oa_std = None
    kc_mean = None
    kc_std = None

    with open(fn, 'r') as fid:
        for l in fid:
            line = l.split()
            if len(line) == 0:
                continue
            if line[0] == 'Class':
                classes.append(int(line[1]))
                fsc_mean.append(float(line[3]))
                fsc_std.append(float(line[5]))
            elif line[0] == 'Overall':
                # Drop the trailing '%' of both figures.
                oa_mean = float(line[2][:-1])
                oa_std = float(line[4][:-1])
            elif line[0] == 'Kappa':
                kc_mean = float(line[2])
                kc_std = float(line[4])

    out = {}
    out['PerClass'] = {}
    out['OverallAcc'] = [oa_mean,oa_std]
    out['Kappa'] = [kc_mean,kc_std]
    out['ClassDict'] = clnames

    for code, mean, std in zip(classes, fsc_mean, fsc_std):
        out['PerClass'][clnames[code]] = [mean, std]

    return out

def _kFoldScoreColor(score):
    """Return the xcolor name used to print a score given on a 0-1 scale."""
    if score < 0.3:
        return 'red'
    if score < 0.5:
        return 'orange'
    if score > 0.75:
        return 'ForestGreen'
    return 'black'


def kFoldReportToLatexTable(fn,cln,tag=None,ofn=None,mode='vertical'):
    """Turn a k-fold report into a standalone LaTeX table and compile it.

    The report is parsed with ``readKFoldReport``, a ``.tex`` file is
    written and ``pdflatex`` is run on it from the folder containing that
    file. Scores are coloured by value (red, orange, black, ForestGreen).

    :param fn: path of the k-fold report text file.
    :param cln: path of the nomenclature file.
    :param tag: optional nomenclature section tag, forwarded to
        ``readKFoldReport``.
    :param ofn: path of the ``.tex`` file to write; defaults to ``fn`` with
        ``.txt`` replaced by ``.tex``.
    :param mode: table layout; only ``'vertical'`` produces a table, any
        other value yields an empty document.
    """
    dct = readKFoldReport(fn,cln,tag)
    if ofn is None:
        ofn = fn.replace('.txt','.tex')

    # Class names in ascending class-code order. Nomenclature entries with
    # no F-score in the report are skipped instead of raising KeyError.
    pcf = [name for code, name in sorted(dct['ClassDict'].items())
           if name in dct['PerClass']]

    with open(ofn,'w') as oid:
        oid.write('\\documentclass{standalone}\n')
        oid.write('\\usepackage[dvipsnames]{xcolor}\n')
        oid.write('\\renewcommand\\familydefault{\\sfdefault}\n')
        oid.write('\\begin{document}\n')

        #def tabular
        if mode == 'vertical':
            oid.write('\\begin{tabular}{|c|c|}\n')
            oid.write('\\hline\n')
            oid.write('\\textbf{Class} & \\textbf{F-Score} \\\\\\hline\n')
            for c in pcf:
                fsc_mean, fsc_std = dct['PerClass'][c]
                clr = _kFoldScoreColor(fsc_mean)
                oid.write('\\textit{%s} & \\color{%s}%1.4f$\\pm$%1.4f \\\\\\hline\n' % (c,clr,fsc_mean,fsc_std))
            oid.write('\\hline\n')
            # The overall accuracy is stored in percent (the report prints
            # it followed by '%'), so it is rescaled to 0-1 before the
            # colour thresholds are applied.
            clr = _kFoldScoreColor(dct['OverallAcc'][0] / 100.0)
            oid.write('\\textbf{Overall Acc.} & {\\color{%s}\\textbf{%2.2f}\\%% $\\pm$ \\textbf{%2.2f}\\%%} \\\\\\hline\n' % (clr,dct['OverallAcc'][0],dct['OverallAcc'][1]))
            oid.write('\\textbf{Kappa} & \\textbf{%0.4f} $\\pm$ \\textbf{%0.4f} \\\\\\hline\n' % (dct['Kappa'][0],dct['Kappa'][1]))
            oid.write('\\end{tabular}')

        oid.write('\\end{document}\n')

    # Run pdflatex inside the output folder without changing the working
    # directory of this process: works for bare and relative file names and
    # leaves nothing to restore if pdflatex cannot be started.
    ofn_abs = os.path.abspath(ofn)
    subprocess.call(['pdflatex', os.path.basename(ofn_abs)],
                    cwd=os.path.dirname(ofn_abs))
    return

def getTrainingDataFromShapefile(shp,fields,code):
    """Read a feature matrix and a label vector from a vector file.

    One row is filled per feature of the first layer, one column per name
    in ``fields``; the label is read from the ``code`` field. Rows holding
    at least one NaN value are removed from both outputs.

    :param shp: path of the vector file.
    :param fields: sequence of field names used as variables.
    :param code: name of the field holding the class label.
    :return: tuple ``(training_set, training_labels)`` of NumPy float
        arrays of shape ``(n_samples, len(fields))`` and ``(n_samples,)``.
    """
    ds = ogr.Open(shp)
    ly = ds.GetLayer(0)
    n_feat = ly.GetFeatureCount()
    training_set = np.empty((n_feat,len(fields)))
    training_labels = np.empty(n_feat)
    to_del = set()
    for i, f in enumerate(ly):
        for j, fld in enumerate(fields):
            training_set[i,j] = f.GetField(fld)
            if np.isnan(training_set[i,j]):
                to_del.add(i)
        training_labels[i] = f.GetField(code)
    ds = None

    # delete samples containing NaN
    # (explicit int dtype: an empty index array would default to float, and
    # the np.int alias used previously no longer exists in recent NumPy)
    lst = np.array(sorted(to_del), dtype=int)
    training_set = np.delete(training_set, lst, axis=0)
    training_labels = np.delete(training_labels,lst)

    return training_set,training_labels

def getVariableImportance(shp,fields,code,out_fn, nbtrees = 100, nodesize = 25, mtry = 0, nruns = 1, field_names=None):
    """Rank variables by random forest importance and write them to a CSV.

    A ``RandomForestClassifier`` is fitted ``nruns`` times on the samples
    read with ``getTrainingDataFromShapefile``; feature importances and
    out-of-bag scores are averaged over the runs. The output file starts
    with a ``#OOB Accuracy`` line followed by one ``field,name,importance``
    row per variable, most important first.

    :param shp: path of the vector file holding the samples.
    :param fields: list of field names used as variables.
    :param code: name of the field holding the class label.
    :param out_fn: path of the CSV file to write.
    :param nbtrees: number of trees (``n_estimators``).
    :param nodesize: ``min_samples_leaf`` of the forest.
    :param mtry: number of variables tried at each split; values outside
        ``1..len(fields)`` select the square-root rule.
    :param nruns: number of fits to average.
    :param field_names: optional CSV file whose first two columns map a
        field name to a display name.
    :return: the averaged out-of-bag score (fraction).
    """
    # The original code left max_features undefined for a valid mtry.
    # 'sqrt' is what 'auto' meant for classifiers and is still accepted by
    # current scikit-learn releases.
    if 0 < mtry <= len(fields):
        mf = mtry
    else:
        mf = 'sqrt'
    rf = RandomForestClassifier(n_estimators=nbtrees, min_samples_leaf=nodesize, max_features=mf, oob_score=True)
    X,y = getTrainingDataFromShapefile(shp,fields,code)
    importance = np.zeros(len(fields)).astype(float)
    oob_score = 0.0
    for _ in range(nruns):
        rf.fit(X,y)
        importance += rf.feature_importances_
        oob_score += rf.oob_score_
    importance /= nruns
    oob_score /= nruns

    # Files are opened in binary mode, as the Python 2 csv module expects.
    if field_names is not None and os.path.exists(field_names):
        with open(field_names, mode='rb') as infile:
            reader = csv.reader(infile)
            # Ignore rows too short to hold both a field and a name.
            names_dict = {rows[0]: rows[1] for rows in reader
                          if len(rows) >= 2 and rows[0] in fields}
    else:
        names_dict = {x:x for x in fields}

    imp_dict = {fields[i]:importance[i] for i in range(len(fields))}
    # Decreasing importance, ties broken by decreasing field name.
    ranking = sorted(imp_dict.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    with open(out_fn, mode='wb') as outfile:
        outfile.write('#OOB Accuracy: %6.4f%%\n' % (100 * oob_score))
        writer = csv.writer(outfile)
        for key,val in ranking:
            # Fall back to the raw field name when the names file has no
            # entry for it.
            writer.writerow([key, names_dict.get(key, key),val])

    return oob_score