Commits (2)
Showing with 120 additions and 39 deletions
@@ -5,10 +5,14 @@ import numpy as np
 import pandas as pd
 from OBIA.OBIABase import *
-from sklearn.model_selection import StratifiedGroupKFold, GroupShuffleSplit
+from sklearn.model_selection import GroupShuffleSplit
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.metrics import confusion_matrix, accuracy_score, cohen_kappa_score, precision_recall_fscore_support
+from Learning.SampleManagement import gen_k_folds, generate_samples_from_set
+import warnings
 
 class ObjectBasedClassifier:
     def __init__(self, object_layer, time_series_list, user_feature_list,
                  reference_data=None, ref_class_field='class', ref_id_field='id'):
@@ -22,31 +26,50 @@ class ObjectBasedClassifier:
         self.obia_base.populate_ref_db()
         self.training_base = self.obia_base.get_reference_db_as_training_base(class_field=ref_class_field)
         self.training_base['folds'] = []
+        self.training_base['dummy_ids'] = []
         return
 
-    def gen_k_folds(self, k, class_field='class', n_retries=10):
+    def gen_k_folds(self, k, class_field='class', n_retries=10, augment=False):
         ok = False
         retry = 0
         while (not ok) and retry < n_retries:
-            self.training_base['folds'] = []
-            sgk = StratifiedGroupKFold(n_splits=k, shuffle=True)
-            for tr_i, ts_i in sgk.split(self.training_base['X'],
-                                        self.training_base[class_field],
-                                        self.training_base['groups']):
-                self.training_base['folds'].append((tr_i, ts_i))
-            # check if all classes are in all splits
-            n_classes = len(np.unique(self.training_base[class_field]))
-            ok = True
-            for f in self.training_base['folds']:
-                ok &= (len(np.unique(self.training_base[class_field][f[0]])) == n_classes and
-                       len(np.unique(self.training_base[class_field][f[1]])) == n_classes)
+            self.training_base['folds'], ok, problematic = gen_k_folds(self.training_base['X'],
+                                                                       self.training_base[class_field],
+                                                                       self.training_base['groups'], k)
             retry += 1
         if not ok:
-            self.training_base['folds'] = []
-            raise Exception("Not all classes are present in each fold/split.\n"
-                            "Please check that you have enough groups (e.g. 2 x n_folds) per class.")
+            if not augment:
+                raise Exception("Not all classes are present in each fold/split.\n"
+                                "Please check that you have enough polygons (e.g. 2 x n_folds) per class.")
+            else:
+                warnings.warn('Classes {} do not have enough groups to ensure sample presence in each fold. Augmenting to 2 x n_folds samples.'.format(problematic))
+                n_samples_to_add = [2*k - len(np.unique(self.training_base['groups'][self.training_base[class_field] == c])) for c in problematic]
+                curr_grp = np.max(self.training_base['groups']) + 1
+                curr_id = np.max(self.training_base['obj_id']) + 1
+                for c, n in zip(problematic, n_samples_to_add):
+                    x = self.training_base['X'][self.training_base[class_field] == c]
+                    s = generate_samples_from_set(x, n, 0.01)
+                    sc = c * np.ones(n)
+                    sg = curr_grp + np.arange(n)
+                    sid = curr_id + np.arange(n)
+                    self.training_base['X'] = np.vstack([self.training_base['X'], s])
+                    self.training_base[class_field] = np.concatenate([self.training_base[class_field], sc])
+                    self.training_base['groups'] = np.concatenate([self.training_base['groups'], sg])
+                    self.training_base['obj_id'] = np.concatenate([self.training_base['obj_id'], sid])
+                    self.training_base['dummy_ids'] = np.concatenate([self.training_base['dummy_ids'], sid])
+                    curr_grp += n
+                    curr_id += n
+                retry = 0
+                while (not ok) and retry < n_retries:
+                    self.training_base['folds'], ok, problematic = gen_k_folds(self.training_base['X'],
+                                                                               self.training_base[class_field],
+                                                                               self.training_base['groups'], k)
+                    retry += 1
+                if not ok:
+                    raise Exception("Folds are still invalid after augmentation. Please provide more samples.")
         return
 
+    # To change!
     def gen_hold_out(self, test_train_ratio=0.5, n_splits=1, class_field='class'):
         gss = GroupShuffleSplit(n_splits=n_splits, test_size=test_train_ratio)
         for tr_i, ts_i in gss.split(self.training_base['X'],
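Reviewer note: the failure mode behind the new augment path is easy to reproduce in isolation. A minimal sketch (synthetic data, not part of this MR) showing that StratifiedGroupKFold cannot place a class in every test fold when that class spans fewer groups than there are splits:

import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

# class 2 is carried by a single group, so with 3 splits it can
# appear in at most one test fold
X = np.random.rand(12, 4)
y = np.array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2])
g = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4])

sgk = StratifiedGroupKFold(n_splits=3, shuffle=True)
for tr, ts in sgk.split(X, y, g):
    print(np.unique(y[tr]), np.unique(y[ts]))
# at least two of the three test folds miss class 2, so the presence
# check fails; with augment=True, synthetic samples are drawn instead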
@@ -73,13 +96,16 @@ class ObjectBasedClassifier:
             models.append(RandomForestClassifier(n_estimators=n_estimators))
             models[-1].fit(self.training_base['X'][tr_i], self.training_base[class_field][tr_i])
             l, c = self.training_base['obj_id'][ts_i], models[-1].predict(self.training_base['X'][ts_i])
+            # Remove dummy ids and their class labels (can lead to an empty test set!)
+            c = np.delete(c, np.isin(l, self.training_base['dummy_ids']))
+            l = np.delete(l, np.isin(l, self.training_base['dummy_ids']))
             y_true, y_pred = self.obia_base.true_pred_bypixel(l, c, class_field)
             results.append(
                 {
-                    'conf_matrix': confusion_matrix(y_true, y_pred),
+                    'conf_matrix': confusion_matrix(y_true, y_pred, labels=np.unique(self.training_base[class_field])),
                     'accuracy': accuracy_score(y_true, y_pred),
                     'kappa': cohen_kappa_score(y_true, y_pred),
-                    'p_r_f1': precision_recall_fscore_support(y_true, y_pred, zero_division=0),
+                    'p_r_f1': precision_recall_fscore_support(y_true, y_pred, zero_division=0, labels=np.unique(self.training_base[class_field])),
                     'importances': models[-1].feature_importances_
                 }
             )
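Reviewer note: pinning labels= here is what keeps the per-fold confusion matrices at a fixed shape, so they can be summed or averaged across folds even when the dummy-sample removal (or an unlucky split) leaves a class out of one fold's test set. A quick standalone illustration:

import numpy as np
from sklearn.metrics import confusion_matrix

y_true, y_pred = [0, 0, 1], [0, 1, 1]   # class 2 absent from this fold
print(confusion_matrix(y_true, y_pred).shape)                    # (2, 2)
print(confusion_matrix(y_true, y_pred, labels=[0, 1, 2]).shape)  # (3, 3)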
...
Learning/SampleManagement.py (new file):
+import numpy as np
+from sklearn.model_selection import StratifiedGroupKFold
+
+def gen_k_folds(X, Y, G, k):
+    # Build k stratified, group-aware folds and report the classes
+    # (if any) that are missing from some train or test split.
+    folds = []
+    sgk = StratifiedGroupKFold(n_splits=k, shuffle=True)
+    for tr_i, ts_i in sgk.split(X, Y, G):
+        folds.append((tr_i, ts_i))
+    # check if all classes are in all splits
+    problematic = []
+    for f in folds:
+        problematic.extend([
+            np.setdiff1d(np.unique(Y), np.unique(Y[f[0]])),
+            np.setdiff1d(np.unique(Y), np.unique(Y[f[1]]))
+        ])
+    ok = all([x.size == 0 for x in problematic])
+    problematic = np.unique(np.concatenate(problematic))
+    if not ok:
+        folds = []
+    return folds, ok, problematic
+
+def generate_samples_from_set(X, num_samples=1, sigma_noise=0.0):
+    # Draw synthetic samples from a Gaussian fitted to X, optionally
+    # perturbed by isotropic noise with standard deviation sigma_noise.
+    M = np.mean(X, axis=0)
+    C = np.cov(X, rowvar=False)
+    S = np.random.multivariate_normal(M, C, size=num_samples)
+    if sigma_noise > 0.0:
+        M = np.zeros(X.shape[1])
+        C = np.diag((sigma_noise**2) * np.ones(X.shape[1]))
+        N = np.random.multivariate_normal(M, C, size=num_samples)
+        S += N
+    return S
\ No newline at end of file
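Reviewer note: a short usage sketch of the two new helpers (synthetic shapes, not part of the MR):

import numpy as np
from Learning.SampleManagement import gen_k_folds, generate_samples_from_set

X = np.random.rand(40, 5)        # 40 objects, 5 features
y = np.repeat([0, 1], 20)        # two classes
g = np.repeat(np.arange(8), 5)   # 4 groups (polygons) per class

folds, ok, problematic = gen_k_folds(X, y, g, 3)
if not ok:
    # fit a Gaussian to the features of the first problematic class
    # and draw extra samples with a small additive noise term
    extra = generate_samples_from_set(X[y == problematic[0]],
                                      num_samples=3, sigma_noise=0.01)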
@@ -43,7 +43,7 @@ def generate_report_figures(map, palette_fn, results, summary, out_dir, map_name
     of = {}
     of['conf_matrices'] = []
     for i, r in enumerate(results):
-        cm = ConfusionMatrixDisplay.from_predictions(r['true_vs_pred'][0], r['true_vs_pred'][1],
+        cm = ConfusionMatrixDisplay.from_predictions(r['true_vs_pred'][0], r['true_vs_pred'][1], labels=labels,
                                                      normalize='true', include_values=True, values_format='.2f')
         of['conf_matrices'].append('{}/conf_matrix_{}.png'.format(out_dir, str(i).zfill(2)))
         cm.ax_.set_xticklabels(class_names, rotation=45, ha='right')
@@ -55,8 +55,8 @@ def generate_report_figures(map, palette_fn, results, summary, out_dir, map_name
     of['cl_rep'] = []
     for r in results:
-        of['cl_rep'].append(classification_report(r['true_vs_pred'][0], r['true_vs_pred'][1],
-                                                  output_dict=True, target_names=class_names))
+        of['cl_rep'].append(classification_report(r['true_vs_pred'][0], r['true_vs_pred'][1], labels=labels,
+                                                  output_dict=True, target_names=class_names, zero_division=0))
     fsc = [np.array([x[c]['f1-score'] for x in of['cl_rep']]) for c in class_names]
     fsc_m = [np.mean(x) for x in fsc]
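Reviewer note: same motivation as in the classifier: labels= fixes the matrix to the full class list so the number of ticks always matches class_names, even when a class never shows up in a fold's predictions. A minimal sketch (placeholder names, mirroring the call above):

import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay

labels = [0, 1, 2]
class_names = ['water', 'forest', 'crops']   # placeholder names
y_true, y_pred = [0, 1, 2], [0, 1, 1]        # class 2 never predicted

cm = ConfusionMatrixDisplay.from_predictions(y_true, y_pred, labels=labels,
                                             normalize='true', include_values=True,
                                             values_format='.2f')
cm.ax_.set_xticklabels(class_names, rotation=45, ha='right')  # 3 ticks, 3 names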
@@ -182,9 +182,12 @@ def generate_pdf(of, out_pdf, name='output'):
                 row.cell('{:.4f}'.format(datum), align='R')
             row.cell('{}'.format(data_row[-1]), align='R')
         row = table.row()
-        row.cell('Accuracy')
-        row.cell('{:.2f}%'.format(rep['accuracy']*100), align='R')
+        if 'accuracy' in rep.keys():
+            row.cell('Accuracy')
+            row.cell('{:.2f}%'.format(rep['accuracy']*100), align='R')
+        elif 'micro avg' in rep.keys():
+            row.cell('Micro Avg. F1-Score')
+            row.cell('{:.2f}%'.format(rep['micro avg']['f1-score']*100), align='R')
     pdf.output(out_pdf)
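Reviewer note: this branch covers a scikit-learn behavior that the labels= change above can trigger: when classification_report receives an explicit labels= list that does not exactly match the labels present in the data, its output dict replaces the 'accuracy' entry with a 'micro avg' block. A quick check:

from sklearn.metrics import classification_report

y_true, y_pred = [0, 1, 1], [0, 1, 0]
rep = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
print('accuracy' in rep)    # True

rep = classification_report(y_true, y_pred, labels=[0, 1, 2],
                            output_dict=True, zero_division=0)
print('accuracy' in rep)    # False
print('micro avg' in rep)   # True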
...
@@ -36,17 +36,31 @@ def fetch(shp, dt, output_fld, credentials):
     return S2TheiaPipeline(output_fld)
 
-def fetch_eodag(shp, dt, output_fld, credentials):
+def fetch_eodag(shp, dt, output_fld, credentials, only_tiles=None):
     bbox = get_query_bbox(shp)
     dag = EODataAccessGateway(user_conf_file_path=credentials)
-    search_criteria = {
-        "productType": "S2_MSI_L2A_MAJA",
-        "start": dt.split('/')[0],
-        "end": dt.split('/')[1],
-        "geom": {"lonmin": bbox[0], "latmin": bbox[1], "lonmax": bbox[2], "latmax": bbox[3]}
-    }
-    res = dag.search_all(**search_criteria)
-    ret = dag.download_all(res, outputs_prefix=output_fld, extract=True, delete_archive=True)
+    if only_tiles is None:
+        search_criteria = {
+            "productType": "S2_MSI_L2A_MAJA",
+            "start": dt.split('/')[0],
+            "end": dt.split('/')[1],
+            "geom": {"lonmin": bbox[0], "latmin": bbox[1], "lonmax": bbox[2], "latmax": bbox[3]}
+        }
+        res = dag.search_all(**search_criteria)
+        ret = dag.download_all(res, outputs_prefix=output_fld, extract=True, delete_archive=True)
+    else:
+        # accumulate downloads across tiles so the rename loop below sees all products
+        ret = []
+        for tile in only_tiles:
+            search_criteria = {
+                "productType": "S2_MSI_L2A_MAJA",
+                "start": dt.split('/')[0],
+                "end": dt.split('/')[1],
+                "geom": {"lonmin": bbox[0], "latmin": bbox[1], "lonmax": bbox[2], "latmax": bbox[3]},
+                "tileIdentifier": tile
+            }
+            res = dag.search_all(**search_criteria)
+            ret += dag.download_all(res, outputs_prefix=output_fld, extract=True, delete_archive=True)
     for f in ret:
         im = glob.glob(f+'/*')[0]
         os.rename(im, os.path.join(os.path.dirname(f), os.path.basename(im)))
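Reviewer note: a hypothetical call of the extended function; the shapefile, output folder, credentials file and tile IDs are placeholders, and tileIdentifier is simply passed through to eodag's search as in the loop above:

# assuming this module is importable as s2theia
from s2theia import fetch_eodag

fetch_eodag('aoi.shp', '2022-01-01/2022-12-31', '/data/s2_dl',
            credentials='eodag.yml', only_tiles=['31TCJ', '31TDJ'])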
...
@@ -84,7 +84,7 @@ def train_valid_workflow(seg, ts_lst_pkl, d, m_file):
                                  reference_data=d['ref_db']['path'],
                                  ref_class_field=d['ref_db']['fields'])
-    obc.gen_k_folds(5, class_field=d['ref_db']['fields'][-1])
+    obc.gen_k_folds(5, class_field=d['ref_db']['fields'][-1], augment=d['ref_db']['augment_if_missing'])
     if 'export_training_base' in d['training'].keys() and d['training']['export_training_base'] is True:
         obc.save_training_base('{}/_side/training_base.pkl'.format(os.path.join(d['output_path'], d['chain_name'])))
...
@@ -8,7 +8,8 @@
     "ref_db" : {
         "path": "/path/to/ref/db/vector",
-        "fields": ["class_field_1", "class_field_2"]
+        "fields": ["class_field_1", "class_field_2"],
+        "augment_if_missing": false
     },
     "dem" : {
...
@@ -83,7 +83,7 @@ def preprocess_s1(in_fld, roi, out_fld, dem_fld=None, geoid=None, direction=None
     s1.compute_features()
     return s1.write_outputs(out_fld)
 
-def fetch(imagery, shp, out_fld, dt=None, auth=None):
+def fetch(imagery, shp, out_fld, dt=None, auth=None, only_tiles=None):
     assert(imagery in ['s2theia', 's2planetary', 's1grd', 's1rtc', 'planetmosaics', 'cop-dem-glo-30', 'nasadem'])
     if imagery not in ['s2planetary', 'cop-dem-glo-30', 'nasadem'] and auth is None:
         raise ValueError("Please provide authentication information.")
@@ -91,7 +91,7 @@ def fetch(imagery, shp, out_fld, dt=None, auth=None):
         raise ValueError("Please provide date range option.")
     if imagery == 's2theia':
         # temporarily switch to eodag since theia_picker is unusable
-        s2theia.fetch_eodag(shp, dt, out_fld, auth)
+        s2theia.fetch_eodag(shp, dt, out_fld, auth, only_tiles.split(';') if only_tiles is not None else None)
     elif imagery == 's2planetary':
         s2planetary.fetch(shp, dt, out_fld)
     elif imagery == 's1grd':
...
@@ -73,6 +73,7 @@ def main(args):
     fetchp.add_argument("out_folder", type=str, help="Output folder where fetched data will be downloaded.")
     fetchp.add_argument("--date_range", type=str, help="Date query in the YYYY-MM-DD/YYYY-MM-DD format.")
     fetchp.add_argument("--auth", type=str, default=None, help="Authentication information (credentials file, API key, etc.)")
+    fetchp.add_argument("--only_tiles", type=str, default=None, help="Semicolon-separated list of tiles to download (where applicable).")
 
     chain = subpar.add_parser("launch_chain", help="Launch a Moringa workflow using a JSON config file.",
                               formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -115,7 +116,7 @@ def main(args):
         preprocess_planet(arg.in_folder, arg.out_folder)
     if arg.cmd == "fetch":
-        fetch(arg.imagery, arg.roi, arg.out_folder, arg.date_range, arg.auth)
+        fetch(arg.imagery, arg.roi, arg.out_folder, arg.date_range, arg.auth, arg.only_tiles)
     if arg.cmd == "launch_chain":
         if arg.workflow == 'basic':
...
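Reviewer note: end-to-end, the new flag threads from the CLI down to fetch_eodag. The CLI form would be something like `fetch s2theia aoi.shp /data/s2_dl --date_range 2022-01-01/2022-12-31 --auth eodag.yml --only_tiles "31TCJ;31TDJ"` (paths and tile IDs are placeholders). A sketch of the equivalent programmatic call, mirroring what the dispatch above does after argument parsing:

# hypothetical: import fetch from the module that defines the dispatcher above
fetch('s2theia', 'aoi.shp', '/data/s2_dl',
      dt='2022-01-01/2022-12-31', auth='eodag.yml',
      only_tiles='31TCJ;31TDJ')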