From fadb5fa948444c1c0b2c941e8cdeec1e84163ca1 Mon Sep 17 00:00:00 2001
From: Thibault Hallouin <thibault.hallouin@inrae.fr>
Date: Tue, 23 Aug 2022 14:05:44 +0200
Subject: [PATCH] deal with both missing observations/predictions for
 determinist

Unlike 3de89046390def7e25171aa4a420ebe4f0ead06d, this also deals with
missing predictions. This effectively pairwise remove those time steps
where either observations or predictions are missing data. This does so
without creating copies of the input data, via the use of xfunctions
stored as class members.

The unit test for missing data is updated to include NAN also in the
predictions (with a varying number of NAN across the 5 time series).
---
 include/evalhyd/evald.hpp     |  7 +++--
 src/determinist/evaluator.hpp | 57 +++++++++++++++++++++++------------
 tests/test_determinist.cpp    | 40 ++++++++++++++++--------
 3 files changed, 69 insertions(+), 35 deletions(-)

diff --git a/include/evalhyd/evald.hpp b/include/evalhyd/evald.hpp
index a3af81b..248071d 100644
--- a/include/evalhyd/evald.hpp
+++ b/include/evalhyd/evald.hpp
@@ -35,14 +35,17 @@ namespace evalhyd
     /// :Parameters:
     ///
     ///    q_obs: ``xt::xexpression<A>``
-    ///       Streamflow observations. Time steps with missing observation
+    ///       Streamflow observations. Time steps with missing observations
     ///       must be assigned `NAN` values. Those time steps will be ignored
     ///       both in the observations and the predictions before the
     ///       *metrics* are computed.
     ///       shape: ({1+, ...}, time)
     ///
     ///    q_prd: ``xt::xexpression<A>``
-    ///       Streamflow predictions.
+    ///       Streamflow predictions. Time steps with missing predictions
+    ///       must be assigned `NAN` values. Those time steps will be ignored
+    ///       both in the observations and the predictions before the
+    ///       *metrics* are computed.
     ///       shape: ({1+, ...}, time)
     ///
     ///    metrics: ``std::vector<std::string>``
diff --git a/src/determinist/evaluator.hpp b/src/determinist/evaluator.hpp
index 95f0346..124b9f0 100644
--- a/src/determinist/evaluator.hpp
+++ b/src/determinist/evaluator.hpp
@@ -26,10 +26,20 @@ namespace evalhyd
         template <class A>
         class Evaluator
         {
+        // define type for function to be used to "mask" (i.e. yield NaNs) the
+        // time steps for which the observations/predictions are missing data
+        typedef xt::xfunction<xt::detail::conditional_ternary,
+                              xt::xfunction<xt::math::isnan_fun, const A&>,
+                              xt::xscalar<float>,
+                              const A&> nan_function;
+
         private:
             // members for input data
-            const A& q_obs;
-            const A& q_prd;
+            const A& _q_obs;
+            const A& _q_prd;
+            nan_function q_obs;
+            nan_function q_prd;
+
             // members for computational elements
             A mean_obs;
             A mean_prd;
@@ -43,17 +53,33 @@ namespace evalhyd
             // constructor method
             Evaluator(const xt::xexpression<A>& obs,
                       const xt::xexpression<A>& prd) :
-                    q_obs{obs.derived_cast()},
-                    q_prd{prd.derived_cast()}
+                    _q_obs{obs.derived_cast()},
+                    _q_prd{prd.derived_cast()},
+                    // "mask" (i.e. use NaN) observations where predictions are
+                    // missing (i.e. NaN)
+                    q_obs{xt::where(xt::isnan(prd.derived_cast()),
+                                    NAN, obs.derived_cast())},
+                    // "mask" (i.e. use NaN) predictions where observations are
+                    // missing (i.e. NaN)
+                    q_prd{xt::where(xt::isnan(obs.derived_cast()),
+                                    NAN, prd.derived_cast())}
             {
                 // check that data dimensions are compatible
-                if (q_prd.dimension() != q_obs.dimension())
+                if (_q_prd.dimension() != _q_obs.dimension())
                 {
                     throw std::runtime_error(
                             "number of dimensions of 'sim' and 'obs' "
                             "must be identical"
                     );
                 }
+                else if (_q_prd.shape(_q_prd.dimension() - 1)
+                         != _q_obs.shape( _q_obs.dimension() - 1))
+                {
+                    throw std::runtime_error(
+                            "length of time series of 'sim' and 'obs' "
+                            "must be identical"
+                    );
+                }
             };
 
             // members for evaluation metrics
@@ -103,10 +129,7 @@ namespace evalhyd
         template <class A>
         void Evaluator<A>::calc_mean_prd()
         {
-            mean_prd = xt::nanmean(
-                    xt::where(xt::isnan(q_obs), NAN, q_prd),
-                    -1, xt::keep_dims
-            );
+            mean_prd = xt::nanmean(q_prd, -1, xt::keep_dims);
         }
 
         // Compute the quadratic error between observations and predictions.
@@ -157,9 +180,7 @@ namespace evalhyd
         template <class A>
         void Evaluator<A>::calc_quad_prd()
         {
-            quad_prd = xt::square(
-                    xt::where(xt::isnan(q_obs), NAN, q_prd) - mean_prd
-            );
+            quad_prd = xt::square(q_prd - mean_prd);
         }
 
         // Compute the Pearson correlation coefficient.
@@ -191,8 +212,7 @@ namespace evalhyd
             // calculate error in timing and dynamics $r_{pearson}$
             // (Pearson's correlation coefficient)
             auto r_num = xt::nansum(
-                    (xt::where(xt::isnan(q_obs), NAN, q_prd) - mean_prd)
-                    * (q_obs - mean_obs),
+                    (q_prd - mean_prd) * (q_obs - mean_obs),
                     -1, xt::keep_dims
             );
             auto r_den = xt::sqrt(
@@ -218,8 +238,7 @@ namespace evalhyd
         void Evaluator<A>::calc_bias()
         {
             // calculate $bias$
-            bias = xt::nansum(xt::where(xt::isnan(q_obs), NAN, q_prd),
-                              -1, xt::keep_dims)
+            bias = xt::nansum(q_prd, -1, xt::keep_dims)
                    / xt::nansum(q_obs, -1, xt::keep_dims);
         }
 
@@ -287,8 +306,7 @@ namespace evalhyd
         void Evaluator<A>::calc_KGE()
         {
             // calculate error in spread of flow $alpha$
-            auto q_prd_ = xt::where(xt::isnan(q_obs), NAN, q_prd);
-            auto alpha = detail::std_dev(q_prd_, mean_prd)
+            auto alpha = detail::std_dev(q_prd, mean_prd)
                          / detail::std_dev(q_obs, mean_obs);
 
             // compute KGE
@@ -326,8 +344,7 @@ namespace evalhyd
         void Evaluator<A>::calc_KGEPRIME()
         {
             // calculate error in spread of flow $gamma$
-            auto q_prd_ = xt::where(xt::isnan(q_obs), NAN, q_prd);
-            auto gamma = (detail::std_dev(q_prd_, mean_prd) / mean_prd)
+            auto gamma = (detail::std_dev(q_prd, mean_prd) / mean_prd)
                          / (detail::std_dev(q_obs, mean_obs) / mean_obs);
 
             // compute KGE
diff --git a/tests/test_determinist.cpp b/tests/test_determinist.cpp
index 8ef2fec..89c213e 100644
--- a/tests/test_determinist.cpp
+++ b/tests/test_determinist.cpp
@@ -189,6 +189,12 @@ TEST(DeterministTests, TestMissingData)
 
     // add some missing observations artificially by assigning NaN values
     xt::view(observed, xt::all(), xt::range(0, 20)) = NAN;
+    // add some missing predictions artificially by assigning NaN values
+    xt::view(observed, 0, xt::range(20, 23)) = NAN;
+    xt::view(observed, 1, xt::range(20, 26)) = NAN;
+    xt::view(observed, 2, xt::range(20, 29)) = NAN;
+    xt::view(observed, 3, xt::range(20, 32)) = NAN;
+    xt::view(observed, 4, xt::range(20, 35)) = NAN;
 
     // compute metrics with observations containing NaN values
     std::vector<xt::xarray<double>> metrics_nan =
@@ -196,19 +202,27 @@ TEST(DeterministTests, TestMissingData)
                     observed, predicted, metrics
             );
 
-    // compute metrics on subset of observations and predictions
-    // (i.e. eliminating region with NaN in observations manually)
-    xt::xtensor<double, 2> obs = xt::view(observed, xt::all(), xt::range(20, _));
-    xt::xtensor<double, 2> prd = xt::view(predicted, xt::all(), xt::range(20, _));
-
-    std::vector<xt::xarray<double>> metrics_sbs =
-            evalhyd::evald<xt::xtensor<double, 2>>(
-                    obs, prd, metrics
-            );
-
     for (int m = 0; m < metrics.size(); m++) {
-        EXPECT_TRUE(
-                xt::allclose(metrics_nan[m], metrics_sbs[m])
-        ) << "Failure for (" << metrics[m] << ")";
+        for (int p = 0; p < predicted.shape(0); p++) {
+            // compute metrics on subset of observations and predictions
+            // (i.e. eliminating region with NaN in observations manually)
+            xt::xtensor<double, 1> obs =
+                    xt::view(observed, 0, xt::range(3*(p+1), _));
+            xt::xtensor<double, 1> prd =
+                    xt::view(predicted, p, xt::range(3*(p+1), _));
+
+            std::vector<xt::xarray<double>> metrics_sbs =
+                    evalhyd::evald<xt::xtensor<double, 1>>(
+                            obs, prd, {metrics[m]}
+                    );
+
+            // compare to check results are the same
+            EXPECT_TRUE(
+                    xt::allclose(
+                            xt::view(metrics_nan[m], p),
+                            metrics_sbs[0]
+                    )
+            ) << "Failure for (" << metrics[m] << ")";
+        }
     }
 }
\ No newline at end of file
-- 
GitLab