diff --git a/classify.py b/classify.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4bcf8ed63d9e03b85904581722bc340d4d077a9
--- /dev/null
+++ b/classify.py
@@ -0,0 +1,70 @@
+import numpy as np
+import sys
+from scipy import sparse
+from scipy.spatial.distance import pdist, squareform
+from sklearn.preprocessing import MinMaxScaler
+from gbssl import CAMLP
+
+
+def extractKNNGraph(knn, k):
+    """Build the mutual k-nearest-neighbor graph from a sorted neighbor index matrix."""
+    nrow, _ = knn.shape
+    G = sparse.lil_matrix((nrow, nrow))
+
+    # knn[i, 0] is the point itself, so its k nearest neighbors are columns 1..k
+    for i in range(nrow):
+        for j in knn[i, 1:k+1]:
+            G[i, j] = 1
+
+    # Mutual KNN graph: keep edge (i, j) only if both i->j and j->i exist
+    G_trans = G.transpose()
+    mKNN = np.minimum(G.todense(), G_trans.todense())
+    return sparse.lil_matrix(mKNN)
+
+
+def getKNNEucl(X):
+    """Return, for each row of X, the indices of all rows sorted by Euclidean distance."""
+    dist = squareform(pdist(X, 'euclidean'))
+    return np.argsort(dist, axis=1)
+
+
+def classify(directory, embFileName, labelFileName, numberOfNearestNeighbors):
+    X = np.load(embFileName)
+    scaler = MinMaxScaler()
+    scaler.fit(X)
+    X = scaler.transform(X)
+
+    knn = getKNNEucl(X)
+    G = extractKNNGraph(knn, numberOfNearestNeighbors)
+    nrow, _ = G.shape
+
+    # Each row of the label file is (position in data.npy, class label)
+    labeled = np.load(labelFileName)
+    id_labeled = labeled[:, 0].astype("int")
+    cl_labeled = labeled[:, 1].astype("int")
+
+    # Propagate labels over the mutual KNN graph with CAMLP
+    camlp = CAMLP(graph=G)
+    camlp.fit(id_labeled, cl_labeled)
+
+    prob_cl = camlp.predict_proba(np.arange(nrow))
+    return np.argmax(prob_cl, axis=1)
+
+
+if __name__ == "__main__":
+    # Directory in which the data are stored
+    directory = sys.argv[1]
+
+    # File that contains the new representation learned with the SESAM
+    # approach, or any other data representation
+    embFileName = sys.argv[2]
+
+    # File in the directory/labels folder with label information.
+    # It has as many rows as there are labeled examples; each row contains
+    # two values: the position of the labeled example w.r.t. the data file
+    # data.npy, and the associated label.
+    labelFileName = sys.argv[3]
+
+    numberOfNearestNeighbors = 20
+    prediction = classify(directory, embFileName, labelFileName, numberOfNearestNeighbors)
+    print(prediction)
diff --git a/sesam.py b/sesam.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3a39f047aced30732d832d306c0ee4e11f23731
--- /dev/null
+++ b/sesam.py
@@ -0,0 +1,141 @@
+from keras.layers import Input, Dense
+from keras.models import Model
+from keras import optimizers
+import keras
+
+import numpy as np
+import sys
+import os
+import math
+from random import randint
+from sklearn.preprocessing import MinMaxScaler
+
+# Optional: cap GPU memory usage (requires the TensorFlow backend)
+#import tensorflow as tf
+#from keras.backend.tensorflow_backend import set_session
+#config = tf.ConfigProto()
+#config.gpu_options.per_process_gpu_memory_fraction = 0.5
+#set_session(tf.Session(config=config))
+
+
+def deepSSAEMulti(n_dim, n_hidden1, n_hidden2, n_classes):
+    """Build an autoencoder and a semi-supervised autoencoder sharing the same encoder."""
+    input_layer = Input(shape=(n_dim,))
+    encoded = Dense(n_hidden1, activation='relu')(input_layer)
+    encoded = Dense(n_hidden2, activation='relu', name="low_dim_features")(encoded)
+    decoded = Dense(n_hidden1, activation='relu')(encoded)
+    decoded = Dense(n_dim, activation='sigmoid')(decoded)
+
+    # Classification head attached to the bottleneck layer
+    classifier = Dense(n_classes, activation='softmax')(encoded)
+
+    rmsPropOpt = optimizers.RMSprop(lr=0.0005)
+    rmsPropOpt1 = optimizers.RMSprop(lr=0.0005)
+
+    autoencoder = Model(inputs=[input_layer], outputs=[decoded])
+    autoencoder.compile(optimizer=rmsPropOpt, loss=['mse'])
+
+    ssautoencoder = Model(inputs=[input_layer], outputs=[decoded, classifier])
+    ssautoencoder.compile(optimizer=rmsPropOpt1,
+                          loss=['mse', 'categorical_crossentropy'],
+                          loss_weights=[1., 1.])
+    return [autoencoder, ssautoencoder]
+
+
+def feature_extraction(model, data, layer_name):
+    """Return the activations of the named layer for the given data."""
+    feat_extr = Model(inputs=model.input, outputs=model.get_layer(layer_name).output)
+    return feat_extr.predict(data)
+
+
+def learn_SingleReprSS(X_tot, idx_train, Y):
+    """Train one semi-supervised autoencoder and return its bottleneck features."""
+    n_classes = len(np.unique(Y))
+    idx_train = idx_train.astype("int")
+    X_train = X_tot[idx_train]
+    Y_train = Y[idx_train]
+    encoded_Y_train = keras.utils.to_categorical(Y_train, n_classes)
+    n_row, n_col = X_tot.shape
+
+    # Bounds for the randomly drawn hidden layer sizes: the first hidden
+    # layer lies in [50%, 100%) of the input dimension, the bottleneck
+    # in [25%, 50%) of it
+    perc_50 = math.ceil(n_col - 1)
+    perc_10 = math.ceil(n_col * 0.5)
+    perc_5 = math.ceil((n_col * 0.5) - 1)
+    perc_1 = math.ceil(n_col * 0.25)
+
+    n_hidden1 = randint(perc_10, perc_50)
+    n_hidden2 = randint(perc_1, perc_5)
+
+    ae, ssae = deepSSAEMulti(n_col, n_hidden1, n_hidden2, n_classes)
+    # Alternate one unsupervised epoch on all the data with one supervised
+    # epoch on the labeled subset
+    for i in range(200):
+        print("epoch: %d" % i)
+        ae.fit(X_tot, X_tot, epochs=1, batch_size=64, shuffle=True, verbose=1)
+        ssae.fit(X_train, [X_train, encoded_Y_train], epochs=1, batch_size=8, shuffle=True, verbose=1)
+    return feature_extraction(ae, X_tot, "low_dim_features")
+
+
+def learn_representationSS(X_tot, idx_train, Y, ens_size):
+    """Concatenate the embeddings produced by an ensemble of ens_size models."""
+    intermediate_reprs = np.array([])
+    for l in range(ens_size):
+        embeddings = learn_SingleReprSS(X_tot, idx_train, Y)
+        if intermediate_reprs.size == 0:
+            intermediate_reprs = embeddings
+        else:
+            intermediate_reprs = np.column_stack((intermediate_reprs, embeddings))
+    return intermediate_reprs
+
+
+def normData(data):
+    """Rescale each feature to the [0, 1] range."""
+    X = np.array(data)
+    scaler = MinMaxScaler()
+    scaler.fit(X)
+    return scaler.transform(X)
+
+
+if __name__ == "__main__":
+    # Directory in which the data are stored
+    directory = sys.argv[1]
+
+    # File in the directory/labels folder with label information.
+    # It has as many rows as there are labeled examples; each row contains
+    # two values: the position of the labeled example w.r.t. the data file
+    # data.npy, and the associated label.
+    fileName = sys.argv[2]
+
+    dataset_name = directory + "/data.npy"
+    dataset_cl_name = directory + "/class.npy"
+
+    dataset = np.load(dataset_name)
+    dataset = normData(dataset)
+
+    dataset_cl = np.load(dataset_cl_name)
+
+    # Size of the ensemble
+    ens_size = 30
+    dirEmb = "embeddings"
+    dir_path = directory + "/" + dirEmb
+
+    if not os.path.exists(dir_path):
+        os.makedirs(dir_path)
+
+    # Label files are named <run_id>_<nsamples>.npy
+    fName = fileName.split("/")[-1]
+    run_id, nsamples = fName.split(".")[0].split("_")
+    outFileName = dir_path + "/" + run_id + "_" + nsamples + ".npy"
+    if os.path.exists(outFileName):
+        print("ALREADY EXISTS %s" % outFileName)
+        sys.exit()
+    print("CREATE EMBEDDINGS for the file %s" % fileName)
+    sys.stdout.flush()
+
+    idx_cl = np.load(fileName)
+    idx_train = idx_cl[:, 0]
+
+    new_feat_ssae = learn_representationSS(dataset, idx_train, dataset_cl, ens_size)
+    np.save(outFileName, new_feat_ssae)
diff --git a/sonar/class.npy b/sonar/class.npy
new file mode 100644
index 0000000000000000000000000000000000000000..efb92ef8b687d683de7d987b91b0aeeff0ca6bc1
Binary files /dev/null and b/sonar/class.npy differ
diff --git a/sonar/data.npy b/sonar/data.npy
new file mode 100644
index 0000000000000000000000000000000000000000..9af68440d5e60e49e87bd8369525ca07a45dadf3
Binary files /dev/null and b/sonar/data.npy differ
diff --git a/sonar/labels/0_10.npy b/sonar/labels/0_10.npy
new file mode 100644
index 0000000000000000000000000000000000000000..71a46583527201eb5070a0d9da757114730f32f9
Binary files /dev/null and b/sonar/labels/0_10.npy differ
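
Usage note. sesam.py expects label files named <run_id>_<nsamples>.npy whose rows are (position in data.npy, class label) pairs, as in the bundled sonar/labels/0_10.npy. A minimal sketch of how such a file could be generated for the sonar data (the random seed and the choice of 10 labeled examples are illustrative assumptions, not part of this diff):

    import numpy as np

    Y = np.load("sonar/class.npy")
    rng = np.random.default_rng(0)                     # illustrative seed
    idx = rng.choice(len(Y), size=10, replace=False)   # positions w.r.t. data.npy
    np.save("sonar/labels/0_10.npy", np.column_stack((idx, Y[idx])))

The two scripts then run in sequence: python sesam.py sonar sonar/labels/0_10.npy writes the ensemble embedding to sonar/embeddings/0_10.npy, and python classify.py sonar sonar/embeddings/0_10.npy sonar/labels/0_10.npy prints the propagated labels.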
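
Since classify() returns a prediction for every row, accuracy can be checked on the rows whose labels were withheld (the original classify.py imported accuracy_score and f1_score for this, though it never called them). A sketch, assuming classify.py is importable (its script part sits behind the __main__ guard above) and the label file from the previous sketch exists:

    import numpy as np
    from sklearn.metrics import accuracy_score
    from classify import classify

    Y = np.load("sonar/class.npy")
    labeled = np.load("sonar/labels/0_10.npy")
    prediction = classify("sonar", "sonar/embeddings/0_10.npy",
                          "sonar/labels/0_10.npy", 20)

    # Score only the rows that were not given to the label propagation
    unlabeled = np.setdiff1d(np.arange(len(Y)), labeled[:, 0].astype(int))
    print(accuracy_score(Y[unlabeled], prediction[unlabeled]))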