From 16a6f1e21572c2cfc3e6c1aa16d563ff6ee31280 Mon Sep 17 00:00:00 2001
From: Remi Cresson <remi.cresson@irstea.fr>
Date: Thu, 2 Jun 2022 13:13:18 +0200
Subject: [PATCH] FIX: backup/restore callbacks

---
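Notes:

* BackupAndRestore writes its own checkpoint at the end of every epoch, so
  ArchiveCheckpoint now archives at the beginning of the *next* epoch, when
  the backup directory is guaranteed to be complete.

* keras.callbacks.experimental.BackupAndRestore is replaced by
  keras.callbacks.BackupAndRestore, the stable API available since TF 2.9.

A minimal sketch of the resulting callback ordering, assuming TF >= 2.9
(the archive path naming and the copy step below are illustrative, not the
exact decloud code):

    import os
    import shutil

    from tensorflow import keras


    class ArchiveCheckpoint(keras.callbacks.Callback):
        """Copy the BackupAndRestore directory to a persistent location."""

        def __init__(self, backup_dir, strategy):
            super().__init__()
            self.backup_dir = backup_dir
            self.strategy = strategy

        def on_epoch_begin(self, epoch, logs=None):
            # When epoch N begins, BackupAndRestore has already finished
            # writing its checkpoint for epoch N-1, so the directory can
            # be copied without racing the writer.
            archive_dir = self.backup_dir + "_archive"  # hypothetical name
            if os.path.isdir(self.backup_dir):
                shutil.copytree(self.backup_dir, archive_dir,
                                dirs_exist_ok=True)

With both callbacks registered, e.g.

    model.fit(dataset, epochs=n_epochs, callbacks=[
        keras.callbacks.BackupAndRestore(backup_dir=backup_dir),
        ArchiveCheckpoint(backup_dir, strategy),
    ])

the backup directory is removed by BackupAndRestore once fit() reaches the
requested number of epochs, while the archived copy survives.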
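The system.pathify helper (which presumably returned its argument with a
trailing separator so callers could concatenate path components by hand) is
replaced everywhere by the standard os.path.join, which also lets us drop
the decloud.core.system import. A short illustration (paths are made up):

    import os

    # Before (assumed behavior of the removed helper):
    #   system.pathify("/logs") + "run1"   ->  "/logs/run1"
    # After:
    print(os.path.join("/logs", "run1"))   #  "/logs/run1"
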
 decloud/models/callbacks.py            |  7 +++----
 decloud/models/train_from_tfrecords.py | 13 ++++++-------
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/decloud/models/callbacks.py b/decloud/models/callbacks.py
index 16614a2..04635b9 100644
--- a/decloud/models/callbacks.py
+++ b/decloud/models/callbacks.py
@@ -25,7 +25,6 @@ import os
 import shutil
 import tensorflow as tf
 from tensorflow import keras
-from decloud.core import system
 from decloud.models.utils import _is_chief
 
 # Callbacks being called at the end of each epoch during training
@@ -45,7 +44,7 @@ class ArchiveCheckpoint(keras.callbacks.Callback):
         self.backup_dir = backup_dir
         self.strategy = strategy
 
-    def on_epoch_end(self, epoch, logs=None):
+    def on_epoch_begin(self, epoch, logs=None):
         """
-        At the end of each epoch, we save the directory of BackupAndRestore to a different name for archiving
+        At the beginning of each epoch, we save the directory of BackupAndRestore to a different name for archiving
         """
@@ -92,7 +91,7 @@ class AdditionalValidationSets(keras.callbacks.Callback):
 
             for metric, result in zip(self.model.metrics_names, results):
                 if self.logdir:
-                    writer = tf.summary.create_file_writer(system.pathify(self.logdir) + 'validation_{}'.format(i + 1))
+                    writer = tf.summary.create_file_writer(os.path.join(self.logdir, f'validation_{i + 1}'))
                     with writer.as_default():
                         tf.summary.scalar('epoch_' + metric, result, step=epoch)  # tensorboard adds an 'epoch_' prefix
                 else:
diff --git a/decloud/models/train_from_tfrecords.py b/decloud/models/train_from_tfrecords.py
index 1cd28fd..eb58d4e 100644
--- a/decloud/models/train_from_tfrecords.py
+++ b/decloud/models/train_from_tfrecords.py
@@ -152,7 +152,7 @@ def main(args):
 
         # adding the info to the SavedModel path
         out_savedmodel = None if params.out_savedmodel is None else \
-            system.pathify(params.out_savedmodel) + expe_name + date_tag
+            os.path.join(params.out_savedmodel, expe_name + date_tag)
 
         # Scaling batch size and learning rate accordingly to number of workers
         batch_size_train = params.batch_size_train * n_workers
@@ -203,17 +203,16 @@ def main(args):
                 if params.strategy == 'singlecpu':
                     logging.warning('Checkpoints can not be saved while using singlecpu option. Discarding checkpoints')
                 else:
-                    # Create a backup
-                    backup_dir = system.pathify(params.ckpt_dir) + params.model
-                    callbacks.append(keras.callbacks.experimental.BackupAndRestore(backup_dir=backup_dir))
-
-                    # Save the checkpoint to a persistent location
+                    backup_dir = os.path.join(params.ckpt_dir, params.model)
+                    # Backup (deleted once the model has trained for the specified number of epochs)
+                    callbacks.append(keras.callbacks.BackupAndRestore(backup_dir=backup_dir))
+                    # Persistent save (kept after training completes)
                     callbacks.append(ArchiveCheckpoint(backup_dir, strategy))
 
             # Define the Keras TensorBoard callback.
             logdir = None
             if params.logdir:
-                logdir = system.pathify(params.logdir) + "{}_{}".format(date_tag, expe_name)
+                logdir = os.path.join(params.logdir, f"{date_tag}_{expe_name}")
                 tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir,
                                                                    profile_batch=params.profiling)
                 callbacks.append(tensorboard_callback)
-- 
GitLab