From 3de89046390def7e25171aa4a420ebe4f0ead06d Mon Sep 17 00:00:00 2001
From: Thibault Hallouin <thibault.hallouin@inrae.fr>
Date: Tue, 23 Aug 2022 09:55:50 +0200
Subject: [PATCH] deal with missing observations as NaN for determinist

In order to avoid copies of the input data, xfunctions are generated
to replace timesteps in predictions where they are NaN in observations.
This means that with this implementation, missing predictions are not
supported.
---
 include/evalhyd/evald.hpp     |  5 +++-
 src/determinist/evaluator.hpp | 45 +++++++++++++++++++++++------------
 tests/test_determinist.cpp    | 45 +++++++++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+), 16 deletions(-)

diff --git a/include/evalhyd/evald.hpp b/include/evalhyd/evald.hpp
index e04de65..a3af81b 100644
--- a/include/evalhyd/evald.hpp
+++ b/include/evalhyd/evald.hpp
@@ -35,7 +35,10 @@ namespace evalhyd
     /// :Parameters:
     ///
     ///    q_obs: ``xt::xexpression<A>``
-    ///       Streamflow observations.
+    ///       Streamflow observations. Time steps with missing observation
+    ///       must be assigned `NAN` values. Those time steps will be ignored
+    ///       both in the observations and the predictions before the
+    ///       *metrics* are computed.
     ///       shape: ({1+, ...}, time)
     ///
     ///    q_prd: ``xt::xexpression<A>``
diff --git a/src/determinist/evaluator.hpp b/src/determinist/evaluator.hpp
index f47060a..95f0346 100644
--- a/src/determinist/evaluator.hpp
+++ b/src/determinist/evaluator.hpp
@@ -15,7 +15,9 @@ namespace evalhyd
         template <class A1, class A2>
         inline auto std_dev(A1&& arr, A2&& mean_arr)
         {
-            return xt::sqrt(xt::mean(xt::square(arr - mean_arr), -1, xt::keep_dims));
+            return xt::sqrt(
+                    xt::nanmean(xt::square(arr - mean_arr), -1, xt::keep_dims)
+            );
         }
     }
 
@@ -53,11 +55,13 @@ namespace evalhyd
                     );
                 }
             };
+
             // members for evaluation metrics
             A RMSE;
             A NSE;
             A KGE;
             A KGEPRIME;
+
             // methods to compute elements
             void calc_mean_obs();
             void calc_mean_prd();
@@ -66,6 +70,7 @@ namespace evalhyd
             void calc_quad_prd();
             void calc_r_pearson();
             void calc_bias();
+
             // methods to compute metrics
             void calc_RMSE();
             void calc_NSE();
@@ -84,7 +89,7 @@ namespace evalhyd
         template <class A>
         void Evaluator<A>::calc_mean_obs()
         {
-            mean_obs = xt::mean(q_obs, -1, xt::keep_dims);
+            mean_obs = xt::nanmean(q_obs, -1, xt::keep_dims);
         }
 
         // Compute the mean of the predictions.
@@ -98,7 +103,10 @@ namespace evalhyd
         template <class A>
         void Evaluator<A>::calc_mean_prd()
         {
-            mean_prd = xt::mean(q_prd, -1, xt::keep_dims);
+            mean_prd = xt::nanmean(
+                    xt::where(xt::isnan(q_obs), NAN, q_prd),
+                    -1, xt::keep_dims
+            );
         }
 
         // Compute the quadratic error between observations and predictions.
@@ -149,7 +157,9 @@ namespace evalhyd
         template <class A>
         void Evaluator<A>::calc_quad_prd()
         {
-            quad_prd = xt::square(q_prd - mean_prd);
+            quad_prd = xt::square(
+                    xt::where(xt::isnan(q_obs), NAN, q_prd) - mean_prd
+            );
         }
 
         // Compute the Pearson correlation coefficient.
@@ -180,12 +190,14 @@ namespace evalhyd
         {
             // calculate error in timing and dynamics $r_{pearson}$
             // (Pearson's correlation coefficient)
-            auto r_num = xt::sum(
-                    (q_prd - mean_prd) * (q_obs - mean_obs), -1, xt::keep_dims
+            auto r_num = xt::nansum(
+                    (xt::where(xt::isnan(q_obs), NAN, q_prd) - mean_prd)
+                    * (q_obs - mean_obs),
+                    -1, xt::keep_dims
             );
             auto r_den = xt::sqrt(
-                    xt::sum(quad_prd, -1, xt::keep_dims)
-                    * xt::sum(quad_obs, -1, xt::keep_dims)
+                    xt::nansum(quad_prd, -1, xt::keep_dims)
+                    * xt::nansum(quad_obs, -1, xt::keep_dims)
             );
 
             r_pearson = r_num / r_den;
@@ -206,8 +218,9 @@ namespace evalhyd
         void Evaluator<A>::calc_bias()
         {
             // calculate $bias$
-            bias = xt::sum(q_prd, -1, xt::keep_dims)
-                   / xt::sum(q_obs, -1, xt::keep_dims);
+            bias = xt::nansum(xt::where(xt::isnan(q_obs), NAN, q_prd),
+                              -1, xt::keep_dims)
+                   / xt::nansum(q_obs, -1, xt::keep_dims);
         }
 
         // Compute the root-mean-square error (RMSE).
@@ -222,7 +235,7 @@ namespace evalhyd
         void Evaluator<A>::calc_RMSE()
         {
            // compute RMSE
-           RMSE = xt::sqrt(xt::mean(quad_err, -1, xt::keep_dims));
+           RMSE = xt::sqrt(xt::nanmean(quad_err, -1, xt::keep_dims));
         }
 
         // Compute the Nash-Sutcliffe Efficiency (NSE).
@@ -240,8 +253,8 @@ namespace evalhyd
         void Evaluator<A>::calc_NSE()
         {
             // compute squared errors operands
-            A f_num = xt::sum(quad_err, -1, xt::keep_dims);
-            A f_den = xt::sum(quad_obs, -1, xt::keep_dims);
+            A f_num = xt::nansum(quad_err, -1, xt::keep_dims);
+            A f_den = xt::nansum(quad_obs, -1, xt::keep_dims);
 
             // compute NSE
             NSE = 1 - (f_num / f_den);
@@ -274,7 +287,8 @@ namespace evalhyd
         void Evaluator<A>::calc_KGE()
         {
             // calculate error in spread of flow $alpha$
-            auto alpha = detail::std_dev(q_prd, mean_prd)
+            auto q_prd_ = xt::where(xt::isnan(q_obs), NAN, q_prd);
+            auto alpha = detail::std_dev(q_prd_, mean_prd)
                          / detail::std_dev(q_obs, mean_obs);
 
             // compute KGE
@@ -312,7 +326,8 @@ namespace evalhyd
         void Evaluator<A>::calc_KGEPRIME()
         {
             // calculate error in spread of flow $gamma$
-            auto gamma = (detail::std_dev(q_prd, mean_prd) / mean_prd)
+            auto q_prd_ = xt::where(xt::isnan(q_obs), NAN, q_prd);
+            auto gamma = (detail::std_dev(q_prd_, mean_prd) / mean_prd)
                          / (detail::std_dev(q_obs, mean_obs) / mean_obs);
 
             // compute KGE
diff --git a/tests/test_determinist.cpp b/tests/test_determinist.cpp
index 4d1c48d..8ef2fec 100644
--- a/tests/test_determinist.cpp
+++ b/tests/test_determinist.cpp
@@ -8,6 +8,8 @@
 
 #include "evalhyd/evald.hpp"
 
+using namespace xt::placeholders;  // required for `_` to work
+
 TEST(DeterministTests, TestMetrics2D)
 {
     // read in data
@@ -167,3 +169,46 @@ TEST(DeterministTests, TestTransform)
     EXPECT_TRUE(xt::allclose(metrics[0], nse_pow));
 
 }
+
+TEST(DeterministTests, TestMissingData)
+{
+    std::vector<std::string> metrics =
+            {"RMSE", "NSE", "KGE", "KGEPRIME"};
+
+    // read in data
+    std::ifstream ifs;
+    ifs.open("./data/q_obs.csv");
+    xt::xtensor<double, 2> observed =xt::load_csv<int>(ifs);
+    ifs.close();
+
+    ifs.open("./data/q_prd.csv");
+    xt::xtensor<double, 2> predicted = xt::view(
+            xt::load_csv<double>(ifs), xt::range(0, 5), xt::all()
+    );
+    ifs.close();
+
+    // add some missing observations artificially by assigning NaN values
+    xt::view(observed, xt::all(), xt::range(0, 20)) = NAN;
+
+    // compute metrics with observations containing NaN values
+    std::vector<xt::xarray<double>> metrics_nan =
+            evalhyd::evald<xt::xtensor<double, 2>>(
+                    observed, predicted, metrics
+            );
+
+    // compute metrics on subset of observations and predictions
+    // (i.e. eliminating region with NaN in observations manually)
+    xt::xtensor<double, 2> obs = xt::view(observed, xt::all(), xt::range(20, _));
+    xt::xtensor<double, 2> prd = xt::view(predicted, xt::all(), xt::range(20, _));
+
+    std::vector<xt::xarray<double>> metrics_sbs =
+            evalhyd::evald<xt::xtensor<double, 2>>(
+                    obs, prd, metrics
+            );
+
+    for (int m = 0; m < metrics.size(); m++) {
+        EXPECT_TRUE(
+                xt::allclose(metrics_nan[m], metrics_sbs[m])
+        ) << "Failure for (" << metrics[m] << ")";
+    }
+}
\ No newline at end of file
-- 
GitLab