diff --git a/decloud/models/callbacks.py b/decloud/models/callbacks.py index 16614a249d4849ee7ca7c56e7bb8abf9926896bd..04635b9836cdb94ed634eaa8103584426d898499 100644 --- a/decloud/models/callbacks.py +++ b/decloud/models/callbacks.py @@ -25,7 +25,6 @@ import os import shutil import tensorflow as tf from tensorflow import keras -from decloud.core import system from decloud.models.utils import _is_chief # Callbacks being called at the end of each epoch during training @@ -45,7 +44,7 @@ class ArchiveCheckpoint(keras.callbacks.Callback): self.backup_dir = backup_dir self.strategy = strategy - def on_epoch_end(self, epoch, logs=None): + def on_epoch_begin(self, epoch, logs=None): """ - At the end of each epoch, we save the directory of BackupAndRestore to a different name for archiving + At the beginning of each epoch, we save the directory of BackupAndRestore to a different name for archiving """ @@ -92,7 +91,7 @@ class AdditionalValidationSets(keras.callbacks.Callback): for metric, result in zip(self.model.metrics_names, results): if self.logdir: - writer = tf.summary.create_file_writer(system.pathify(self.logdir) + 'validation_{}'.format(i + 1)) + writer = tf.summary.create_file_writer(os.path.join(self.logdir, 'validation_{}'.format(i + 1))) with writer.as_default(): tf.summary.scalar('epoch_' + metric, result, step=epoch) # tensorboard adds an 'epoch_' prefix else: diff --git a/decloud/models/train_from_tfrecords.py b/decloud/models/train_from_tfrecords.py index 1cd28fd38348224b03799f14cb1bfb1189ce6f7c..eb58d4e449e841342867a547296a92d4cd4e11dc 100644 --- a/decloud/models/train_from_tfrecords.py +++ b/decloud/models/train_from_tfrecords.py @@ -152,7 +152,7 @@ def main(args): # adding the info to the SavedModel path out_savedmodel = None if params.out_savedmodel is None else \ - system.pathify(params.out_savedmodel) + expe_name + date_tag + os.path.join(params.out_savedmodel, expe_name + date_tag) # Scaling batch size and learning rate accordingly to number of workers batch_size_train = params.batch_size_train * n_workers @@ -203,17 +203,16 @@ def main(args): if
params.strategy == 'singlecpu': logging.warning('Checkpoints can not be saved while using singlecpu option. Discarding checkpoints') else: - # Create a backup - backup_dir = system.pathify(params.ckpt_dir) + params.model - callbacks.append(keras.callbacks.experimental.BackupAndRestore(backup_dir=backup_dir)) - - # Save the checkpoint to a persistent location + backup_dir = os.path.join(params.ckpt_dir, params.model) + # Backup (deleted once the model is trained the specified number of epochs) + callbacks.append(keras.callbacks.BackupAndRestore(backup_dir=backup_dir)) + # Persistent save (still here after the model is trained) callbacks.append(ArchiveCheckpoint(backup_dir, strategy)) # Define the Keras TensorBoard callback. logdir = None if params.logdir: - logdir = system.pathify(params.logdir) + "{}_{}".format(date_tag, expe_name) + logdir = os.path.join(params.logdir, f"{date_tag}_{expe_name}") tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir, profile_batch=params.profiling) callbacks.append(tensorboard_callback)