diff --git a/scripts/classification_from_segmentation.py b/scripts/classification_from_segmentation.py
index cec5c1c83d3405dd6420b36bce50b1f79faf28d3..a5b7ea47734f7f36e18aaeab04ec658ac3bf2314 100644
--- a/scripts/classification_from_segmentation.py
+++ b/scripts/classification_from_segmentation.py
@@ -11,6 +11,7 @@ from sklearn.cross_validation import train_test_split
 import matplotlib.pyplot as plt
 from sklearn import tree, svm, ensemble, model_selection
 from sklearn.metrics import cohen_kappa_score
+from sklearn.preprocessing import StandardScaler
 from pprint import pprint
 import numpy as np
 try:
@@ -19,7 +20,305 @@ except:
     print 'pandas not imported'
 
 
+def my_recipe(segmentation, roi_file, raster,
+              **kwargs):
+    """
+    Given a vector segmentation with some segments labeled, and the image
+    to classify, train a model, apply it to the whole segmentation and
+    return some quality indices.
+
+    Parameters
+    ----------
+    segmentation : string or list of string,
+        Path(s) to a vector source or geo-like python objects.
+    roi_file : string,
+        Path of the vector file containing labeled samples.
+    raster : string,
+        Path to a GDAL raster source.
+    stats : list of str, optional
+        Which statistics to calculate for each zone.
+        All possible choices are ['count', 'min', 'max', 'mean', 'sum',
+        'std', 'median', 'majority', 'minority', 'unique', 'range',
+        'nodata']. Defaults to ['mean', 'std'].
+    nodata : float, optional
+        If `raster` is a GDAL source, this value overrides any NODATA
+        value specified in the file's metadata. If `None`, the file's
+        NODATA value (if any) will be used. Defaults to `None`.
+    field_id : string, optional
+        Name of the field containing the unique id of the segments.
+        Defaults to 'DN'.
+    field_class : string, optional
+        Name of the field containing the class number of segments.
+        Defaults to 'class'. 0 cannot be used as a class number.
+    field_pred : string, optional
+        Name of the field that will contain the predicted class number of
+        segments. Defaults to 'pred_class'.
+    train_size : int or float, optional
+        If float, it must be between 0 and 1 and is the proportion of
+        samples to use for training. If int, it is the absolute number of
+        samples to use for training. Defaults to 0.5.
+    target_names : list of string, optional
+        List of class names.
+    save : bool,
+        True if you want to save segmentation quality indices.
+    base : string
+        Path where the segmentation statistics are saved.
+    estimator : sklearn classifier,
+        sklearn.tree.DecisionTreeClassifier(), sklearn.svm.SVC(), or
+        sklearn.ensemble.RandomForestClassifier() with the parameters of
+        your choice. For SVC, decision_function_shape must be set to
+        'ovr' if there are more than two classes.
+    cross_validation : bool
+        If True, cross-validation parameters must be given in
+        `cross_validation_dict`.
+    update_shape_file : bool
+        If True, predictions are added to the attribute table of the
+        input segmentation file. Can be very time consuming.
+
+    Cross-validation parameters
+    ---------------------------
+    Parameters should be given in a dictionary `cross_validation_dict`:
+
+    estimator : estimator object.
+        This is assumed to implement the scikit-learn estimator interface.
+        Either the estimator provides a ``score`` function, or ``scoring``
+        must be passed. Defaults to
+        ``svm.SVC(decision_function_shape='ovr')``.
+    scoring : str, default 'accuracy'
+        Indicator used to compare classifications. Must be 'accuracy' or
+        'kappa'.
+    n_jobs : int, default 1
+        Number of parallel jobs.
+    cv : int, cross-validation generator or an iterable, optional
+        Determines the cross-validation splitting strategy.
+        Possible inputs for cv are:
+        - None, to use the default 3-fold cross validation,
+        - integer, to specify the number of folds in a `(Stratified)KFold`,
+        - an object to be used as a cross-validation generator,
+        - an iterable yielding train, test splits.
+        For integer/None inputs, if the estimator is a classifier and ``y``
+        is either binary or multiclass, :class:`StratifiedKFold` is used.
+        In all other cases, :class:`KFold` is used.
+
+        Refer to the :ref:`User Guide <cross_validation>` for the various
+        cross-validation strategies that can be used here.
+
+    param_grid : dict, defaults to::
+
+        {'kernel': ['rbf'],
+         'C': [0.01, 0.1, 1, 10, 100, 1000],
+         'gamma': [1, 0.1, 0.001, 0.0001, 0.00001]}
+    output_result_name : string,
+        Path where the cross-validation results are saved, if any.
+
+    Returns
+    -------
+    dict_output : dictionary containing the following variables:
+        cm : 2d array,
+            The confusion matrix, true labels in rows and predicted labels
+            in columns.
+        kappa : float,
+            Cohen's kappa coefficient.
+        accuracy : float,
+            Overall accuracy.
+        report : string,
+            Report of precision, recall and f1-score for each class.
+        Y_predict : 1d array,
+            Contains the predicted labels of segments used for validation.
+        Y_true : 1d array,
+            Contains the true labels of segments used for validation.
+        classif : 1d array,
+            Contains the predicted labels of all segments of
+            `vector_segmentation`.
+        FID : 1d array,
+            Contains the id of all segments of `vector_segmentation`. It
+            corresponds to the `field_id` field of the shapefile.
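+
+    Examples
+    --------
+    A minimal sketch of the intended call (all paths, field values and the
+    estimator choice below are hypothetical placeholders):
+
+    >>> out = my_recipe(['seg.shp'], 'roi.shp', 'image.tif',
+    ...                 stats=['mean', 'std'], train_size=0.5,
+    ...                 estimator=svm.SVC(decision_function_shape='ovr'),
+    ...                 target_names=['water', 'forest', 'urban'],
+    ...                 update_shape_file=False)
+    >>> out['accuracy']  # overall accuracy on the held-out test split
+    """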
+    # a = int(input('If you use several Shapefiles, make sure every single'
+    #               ' one of them has at least one sample for each class.'
+    #               ' Ready to continue ? (1 for yes / 0 for no) \n'))
+    # if a == 0:
+    #     return 'Process stopped'
+
+    # Extract parameters
+    stats = kwargs.get('stats', ['mean', 'std'])
+    field_id = kwargs.get('field_id', 'DN')
+    field_class = kwargs.get('field_class', 'class')
+    train_size = kwargs.get('train_size', 0.5)
+    nodata = kwargs.get('nodata', None)
+    target_names = kwargs.get('target_names')
+    debug = kwargs.get('debug', False)
+    field_pred = kwargs.get('field_pred', 'pred_class')
+    estimator = kwargs.get('estimator', None)
+    n_jobs = kwargs.get('n_jobs', 1)
+    save = kwargs.get('save', False)
+
+    if isinstance(segmentation, list):
+        gml = True
+        list_shp = segmentation
+    else:
+        gml = False
+        list_shp = [segmentation]
+
+    dict_mat = {}
+    dict_mat['nb_of_shp'] = len(list_shp)
+    dict_mat['gml'] = gml
+
+    X = Y = FID = X_train = X_test = Y_train = Y_test = None
+    FID_train = None
+    FID_test = None
+    # Extract labels on each shape to perform the classification on the
+    # whole zone
+    for i, a_seg_file in enumerate(list_shp):
+
+        if debug:
+            print 'Control : shp {}'.format(i+1)
+
+        base = kwargs.get('base', os.path.splitext(raster)[0])
+        X_i = np.load(base + '_stats_allsegment_X{}.npy'.format(i))
+        Y_i = np.load(base + '_stats_allsegment_Y{}.npy'.format(i))
+        FID_i = np.load(base + '_stats_allsegment_FID{}.npy'.format(i))
+
+        # Store stats, labels and FID of each shape in a dictionary
+        dict_mat.update({'X_{}'.format(i+1): X_i, 'Y_{}'.format(i+1): Y_i,
+                         'FID_{}'.format(i+1): FID_i})
+
+        # Extract stats only for segments that are labeled
+        # X_samp, Y_samp, FID_samp = cla.extract_sample(X_i, Y_i, FID_i)
+
+        X_samp = np.load(base + '_stats_sample_X{}.npy'.format(i))
+        Y_samp = np.load(base + '_stats_sample_Y{}.npy'.format(i))
+        FID_samp = np.load(base + '_stats_sample_FID{}.npy'.format(i))
+
+        # Split them into train and test samples
+        temp_result = train_test_split(X_samp, Y_samp, FID_samp,
+                                       train_size=train_size)
+        (X_train_i, X_test_i, Y_train_i, Y_test_i,
+         FID_train_i, FID_test_i) = temp_result
+
+        # Concatenate and store in a dictionary. Concatenation is for model
+        # training. Predictions, however, are performed on each individual
+        # matrix.
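+        # What cla.concat is assumed to do (illustrative sketch only, not
+        # the actual ymraster implementation): reduce each [acc, new] pair
+        # with np.concatenate, treating a None accumulator as empty, e.g.
+        #
+        #     def concat(*pairs):
+        #         return tuple(new if acc is None
+        #                      else np.concatenate([acc, new])
+        #                      for acc, new in pairs)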
+        results = cla.concat([X, X_i], [Y, Y_i], [FID, FID_i],
+                             [X_train, X_train_i], [X_test, X_test_i],
+                             [Y_train, Y_train_i], [Y_test, Y_test_i],
+                             [FID_train, FID_train_i], [FID_test, FID_test_i])
+        (X, Y, FID, X_train, X_test, Y_train, Y_test,
+         FID_train, FID_test) = results
+
+    if save:
+        base = kwargs.get('base', os.path.splitext(raster)[0])
+        X_name = base + '_X.npy'
+        Y_name = base + '_Y.npy'
+        FID_name = base + '_FID.npy'
+        X_train_name = base + '_X_train.npy'
+        X_test_name = base + '_X_test.npy'
+        Y_train_name = base + '_Y_train.npy'
+        Y_test_name = base + '_Y_test.npy'
+        FID_train_name = base + '_FID_train.npy'
+        FID_test_name = base + '_FID_test.npy'
+        np.save(X_name, X)
+        np.save(Y_name, Y)
+        np.save(FID_name, FID)
+        np.save(X_train_name, X_train)
+        np.save(X_test_name, X_test)
+        np.save(Y_train_name, Y_train)
+        np.save(Y_test_name, Y_test)
+        np.save(FID_train_name, FID_train)
+        np.save(FID_test_name, FID_test)
+
+    # Store in a dictionary
+    scaler = StandardScaler().fit(X_train)
+    dict_mat['X'], dict_mat['Y'], dict_mat['FID'] = scaler.transform(X), Y, FID
+    dict_mat['X_train'], dict_mat['X_test'] = (scaler.transform(X_train),
+                                               scaler.transform(X_test))
+    dict_mat['Y_train'], dict_mat['Y_test'] = Y_train, Y_test
+
+    # Train, then apply a classifier, on a validation set and on the whole
+    # segmentation
+    # Y_predict, classif = cla.classifier(X_train, Y_train, X_test, X)
+    if kwargs.get('cross_validation'):
+        if save:
+            base = kwargs.get('base', os.path.splitext(raster)[0])
+            base += 'cross_validation.csv'
+        else:
+            base = None
+        cross_validation_dict = kwargs.get('cross_validation_dict')
+        clf = cla.select_best_svm_classifier_sample(scaler.transform(X_train),
+                                                    Y_train,
+                                                    output_result_name=base,
+                                                    **cross_validation_dict)
+        estimator = clf.best_estimator_
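+        # select_best_svm_classifier_sample is assumed to wrap a grid
+        # search over an SVC; a rough scikit-learn equivalent of the call
+        # above (sketch only, under that assumption) would be:
+        #
+        #     from sklearn.model_selection import GridSearchCV
+        #     clf = GridSearchCV(svm.SVC(decision_function_shape='ovr'),
+        #                        param_grid, scoring='accuracy',
+        #                        n_jobs=n_jobs, cv=3)
+        #     clf.fit(scaler.transform(X_train), Y_train)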
+
+    temp_result = cla.classifier(scaler.transform(X_train), Y_train,
+                                 scaler.transform(X_test),
+                                 Y_test, scaler.transform(X), dict_mat,
+                                 estimator=estimator, n_jobs=n_jobs)
+
+    Y_predict, classif, dict_mat, classif_parameters = temp_result
+
+    print 'Control : classifier done'
+    print classif_parameters
+
+    if kwargs.get('update_shape_file'):
+        # Update the vector segmentation
+        for i, a_seg_file in enumerate(list_shp):
+            # cla.add_predict_to_shp(roi_file, classif, FID, field_pred = field_pred,
+            #                        field_id = field_id)
+            cla.add_predict_to_shp(a_seg_file, dict_mat['classif_{}'.format(i+1)],
+                                   dict_mat['FID_{}'.format(i+1)],
+                                   field_pred=field_pred, field_id=field_id)
+        print 'Control : vector updated'
+
+    # Get some quality indices
+    cm, report, accuracy = cla.pred_error_metrics(Y_predict, Y_test,
+                                                  target_names=target_names)
+
+    kappa = cohen_kappa_score(Y_predict, Y_test)
+
+    # Print the results
+    print "confusion matrix\n", cm
+    print report
+    print "OA :", accuracy
+    print "Kappa:", kappa
+
+    try:
+        # Show confusion matrix in a separate window
+        plt.matshow(cm)
+        plt.title('Confusion matrix')
+        plt.colorbar()
+        plt.ylabel('True label')
+        plt.xlabel('Predicted label')
+        plt.show()
+    except:  # in case it is called from a server without a display
+        print 'Confusion matrix cannot be displayed'
+
+    dict_output = {'cm': cm,
+                   'kappa': kappa,
+                   'accuracy': accuracy,
+                   'report': report,
+                   'Y_predict': Y_predict,
+                   'Y_true': Y_test,
+                   'classif': classif,
+                   'FID': FID,
+                   'estimator': str(classif_parameters)}
+
+    # dict_output.update(dict_mat)
+    dict_output = dict(dict_output, **dict_mat)
+
+    if save:
+        base = kwargs.get('base', os.path.splitext(raster)[0])
+        try:
+            fichier = open(base + '_classifier.txt', 'w')
+            fichier.write(dict_output['estimator'])
+            fichier.close()
+        except:
+            print 'classifier not saved'
+        save_output(dict_output, base + '_classif')
+
+    return dict_output
 
 
 #################################################
 ########## Launch classification ################
@@ -77,6 +376,9 @@ def classification_from_A_to_Z(segmentation, roi_file, raster,
                                than two classes.
     cross_validation : bool
         If True cross validation parameters are to be indicated
+    update_shape_file : bool
+        If True, predictions are added to the attribute table of the
+        input segmentation file. Can be very time consuming.
 
     cross validation parameters
     ---------------------------
@@ -185,7 +487,7 @@ def classification_from_A_to_Z(segmentation, roi_file, raster,
                                            field_class=field_class,
                                            debug=debug)
         X_i, (Y_i, FID_i) = check_for_nan_inf(X_i, Y_i, FID_i)
-        if save :
+        if save:
             base = kwargs.get('base', os.path.splitext(raster)[0])
             x_i_name = base + '_stats_allsegment_X{}.npy'.format(i)
             y_i_name = base + '_stats_allsegment_Y{}.npy'.format(i)
@@ -206,6 +508,7 @@ def classification_from_A_to_Z(segmentation, roi_file, raster,
                                                  field_id=field_id,
                                                  field_class=field_class,
                                                  debug=debug)
+        X_samp, (Y_samp, FID_samp) = check_for_nan_inf(X_samp, Y_samp, FID_samp)
 
         if save :
@@ -247,14 +550,15 @@ def classification_from_A_to_Z(segmentation, roi_file, raster,
             np.save(Y_test_name, Y_test)
 
     # Store in a dictionary
-    dict_mat['X'], dict_mat['Y'], dict_mat['FID'] = X, Y, FID
-    dict_mat['X_train'], dict_mat['X_test'] = X_train, X_test
+    scaler = StandardScaler().fit(X_train)
+    dict_mat['X'], dict_mat['Y'], dict_mat['FID'] = scaler.transform(X), Y, FID
+    dict_mat['X_train'], dict_mat['X_test'] = scaler.transform(X_train), scaler.transform(X_test)
     dict_mat['Y_train'], dict_mat['Y_test'] = Y_train, Y_test
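+    # The scaler is fitted on X_train only and then reused on X_test and
+    # X, so no information from the test segments leaks into the
+    # preprocessing. Standardization matters here because RBF-kernel SVMs
+    # are sensitive to feature scales, e.g. (illustration only):
+    #
+    #     scaler = StandardScaler().fit(X_train)  # statistics from train
+    #     X_test_std = scaler.transform(X_test)   # reused, not refitted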
 
     # Train, then apply a classifier, on a validation set and on the whole
     # segmentation
     #Y_predict, classif = cla.classifier(X_train, Y_train, X_test, X)
 
     if kwargs.get('cross_validation'):
         if save :
             base = kwargs.get('base', os.path.splitext(raster)[0])
@@ -262,12 +566,15 @@ def classification_from_A_to_Z(segmentation, roi_file, raster,
         else:
             base = None
         cross_validation_dict = kwargs.get('cross_validation_dict')
-        clf = cla.select_best_svm_classifier_sample(X_train, Y_train,
+        clf = cla.select_best_svm_classifier_sample(scaler.transform(X_train),
+                                                    Y_train,
                                                     output_result_name=base,
                                                     **cross_validation_dict)
         estimator = clf.best_estimator_
 
-    temp_result = cla.classifier(X_train, Y_train, X_test, Y_test, X, dict_mat,
+    temp_result = cla.classifier(scaler.transform(X_train), Y_train,
+                                 scaler.transform(X_test),
+                                 Y_test, scaler.transform(X), dict_mat,
                                  estimator=estimator, n_jobs=n_jobs)
 
     Y_predict, classif, dict_mat, classif_parameters = temp_result
@@ -275,14 +582,15 @@ def classification_from_A_to_Z(segmentation, roi_file, raster,
     print 'Control : classifier done'
     print classif_parameters
 
-    # Update the vector segmentation
-    for i, a_seg_file in enumerate(list_shp):
-        # cla.add_predict_to_shp(roi_file, classif, FID, field_pred = field_pred,
-        #                        field_id = field_id)
-        cla.add_predict_to_shp(a_seg_file, dict_mat['classif_{}'.format(i+1)],
-                               dict_mat['FID_{}'.format(i+1)],
-                               field_pred=field_pred, field_id=field_id)
-    print 'Control : vector updated'
+    if kwargs.get('update_shape_file'):
+        # Update the vector segmentation
+        for i, a_seg_file in enumerate(list_shp):
+            # cla.add_predict_to_shp(roi_file, classif, FID, field_pred = field_pred,
+            #                        field_id = field_id)
+            cla.add_predict_to_shp(a_seg_file, dict_mat['classif_{}'.format(i+1)],
+                                   dict_mat['FID_{}'.format(i+1)],
+                                   field_pred=field_pred, field_id=field_id)
+        print 'Control : vector updated'
 
     # Get some quality indices
     cm, report, accuracy = cla.pred_error_metrics(Y_predict, Y_test,
diff --git a/ymraster/classification.py b/ymraster/classification.py
index dd27cbbeebd860c53544ef5099d154bca3d427f0..f8cb0e67ddc6b41c7b12b92a78911ecbc30dfb9a 100755
--- a/ymraster/classification.py
+++ b/ymraster/classification.py
@@ -300,12 +300,14 @@ def classifier(X_train, Y_train, X_test, Y_test, X_img, dict_mat,
     clf = clf.fit(X_train, Y_train)
 
     # Prediction
-    for i in range(1,dict_mat['nb_of_shp']+1): #prediction for the sub-matrix (--> to modify the sub-Shapefiles)
+    #for i in range(1,dict_mat['nb_of_shp']+1): #prediction for the sub-matrix (--> to modify the sub-Shapefiles)
         #Perform the prediction on the whole label image for each shp
-        dict_mat['classif_{}'.format(i)] = clf.predict(dict_mat['X_{}'.format(i)])
+        # dict_mat['classif_{}'.format(i)] = clf.predict(dict_mat['X_{}'.format(i)])
+
     # Concatenate to get predictions for the whole matrix (--> to determine of quality indices)
-    dict_mat['classif'] = np.concatenate([dict_mat['classif_{}'.format(i)] for i in range(1,dict_mat['nb_of_shp']+1)])
+    #dict_mat['classif'] = np.concatenate([dict_mat['classif_{}'.format(i)] for i in range(1,dict_mat['nb_of_shp']+1)])
+    dict_mat['classif'] = clf.predict(dict_mat['X'])
 
     # Perform the prediction on the test sample for each shp
     dict_mat['Y_predict'] = clf.predict(dict_mat['X_test'])
@@ -400,8 +402,8 @@ def get_stats_from_shape(vector, raster, stats, nodata = None, field_id = 'DN',
 
     #get stats with rasterstats module
     my_stats = zonal_stats(vector,raster, stats = stats,
-                           band = idx_band + 1, geojson_out = True,
-                           nodata = nodata)
+                           band = idx_band + 1, geojson_out=True,
+                           nodata=nodata)
 
     #convert it to an X an Y array
     #-----------------------------
@@ -419,8 +421,8 @@ def get_stats_from_shape(vector, raster, stats, nodata = None, field_id = 'DN',
     else:
         for i,feat in enumerate(my_stats):
             if FID[i] != feat['properties'][field_id] :
-                #check if the stats of the differents bands have been
-                #computed in the same order
+                # check if the stats of the different bands have been
+                # computed in the same order
                 inconsistency += 1
             for j,s in enumerate(stats):
                 X[i,j + idx_band * len(stats)] = feat['properties'][s]