From c8d07b82ab8676eb1d898cfefe58356ba09097dc Mon Sep 17 00:00:00 2001
From: Thibault Hallouin <thibault.hallouin@inrae.fr>
Date: Fri, 24 Mar 2023 12:56:29 +0100
Subject: [PATCH] add another version of CRPS (from empirical CDF)

new metric is called CRPS_FROM_ECDF
---
 include/evalhyd/detail/probabilist/cdf.hpp    | 200 ++++++++++++++++++
 .../evalhyd/detail/probabilist/evaluator.hpp  |  28 +++
 .../evalhyd/detail/probabilist/intervals.hpp  |   3 +-
 .../evalhyd/detail/probabilist/quantiles.hpp  |   6 +-
 include/evalhyd/evalp.hpp                     |   7 +
 tests/expected/evalp/CRPS_FROM_ECDF.csv       |   1 +
 tests/test_probabilist.cpp                    |  32 +++
 7 files changed, 273 insertions(+), 4 deletions(-)
 create mode 100644 include/evalhyd/detail/probabilist/cdf.hpp
 create mode 100644 tests/expected/evalp/CRPS_FROM_ECDF.csv

diff --git a/include/evalhyd/detail/probabilist/cdf.hpp b/include/evalhyd/detail/probabilist/cdf.hpp
new file mode 100644
index 0000000..1f5489e
--- /dev/null
+++ b/include/evalhyd/detail/probabilist/cdf.hpp
@@ -0,0 +1,200 @@
+// Copyright (c) 2023, INRAE.
+// Distributed under the terms of the GPL-3 Licence.
+// The full licence is in the file LICENCE, distributed with this software.
+
+#ifndef EVALHYD_PROBABILIST_CDF_HPP
+#define EVALHYD_PROBABILIST_CDF_HPP
+
+#include <xtensor/xtensor.hpp>
+#include <xtensor/xview.hpp>
+#include <xtensor/xmath.hpp>
+
+
+namespace evalhyd
+{
+    namespace probabilist
+    {
+        namespace intermediate
+        {
+            /// Compute the CRPS for each time step as the distance between the
+            /// observed and empirical (i.e. constructed from the ensemble
+            /// predictions) quadratic cumulative density functions (CDFs).
+            ///
+            /// \param q_obs
+            ///     Streamflow observations.
+            ///     shape: (sites, time)
+            /// \param q_qnt
+            ///     Streamflow prediction quantiles.
+            ///     shape: (sites, lead times, quantiles, time)
+            /// \param n_sit
+            ///     Number of sites.
+            /// \param n_ldt
+            ///     Number of lead times.
+            /// \param n_mbr
+            ///     Number of ensemble members.
+            /// \param n_tim
+            ///     Number of time steps.
+            /// \return
+            ///     CRPS for each time step.
+            ///     shape: (sites, lead times, time)
+            template <class XD2>
+            inline xt::xtensor<double, 3> calc_crps_from_ecdf(
+                    const XD2& q_obs,
+                    const xt::xtensor<double, 4>& q_qnt,
+                    std::size_t n_sit,
+                    std::size_t n_ldt,
+                    std::size_t n_mbr,
+                    std::size_t n_tim
+            )
+            {
+                // notations below follow Hersbach (2000)
+                // https://doi.org/10.1175/1520-0434(2000)015<0559:DOTCRP>2.0.CO;2
+
+                // declare and initialise internal variables
+                xt::xtensor<double, 4> alpha_i =
+                        xt::zeros<double>({n_mbr + 1, n_sit, n_ldt, n_tim});
+
+                xt::xtensor<double, 4> beta_i =
+                        xt::zeros<double>({n_mbr + 1, n_sit, n_ldt, n_tim});
+
+                // case x_a < x_1
+                // i.e. observation is an outlier before predictive range
+                auto x_a = xt::view(q_obs, xt::all(), xt::newaxis(), xt::all());
+                auto x_1 = xt::view(q_qnt, xt::all(), xt::all(), 0, xt::all());
+
+                auto is_before = x_a < x_1;
+                xt::view(beta_i, 0, xt::all()) = xt::where(
+                        is_before, x_1 - x_a, xt::view(beta_i, 0)
+                );
+
+                for (std::size_t m = 0; m < n_mbr - 1; m++)
+                {
+                    auto x_i = xt::view(q_qnt, xt::all(), xt::all(), m, xt::all());
+                    auto x_ip1 = xt::view(q_qnt, xt::all(), xt::all(), m + 1, xt::all());
+
+                    // case x_a < x_i
+                    // i.e. observation below given member
+                    auto is_below = x_a < x_i;
+                    xt::view(beta_i, m + 1, xt::all()) = xt::where(
+                            is_below, x_ip1 - x_i, xt::view(beta_i, m + 1)
+                    );
+
+                    // case x_i <= x_a <= x_{i+1}
+                    // i.e. observation between given member and next member
+                    auto is_between = (x_i <= x_a) && (x_a <= x_ip1);
+                    xt::view(alpha_i, m + 1, xt::all()) = xt::where(
+                            is_between, x_a - x_i,  xt::view(alpha_i, m + 1)
+                    );
+                    xt::view(beta_i, m + 1, xt::all()) = xt::where(
+                            is_between, x_ip1 - x_a, xt::view(beta_i, m + 1)
+                    );
+
+                    // case x_a > x_{i+1}
+                    // i.e. observation above next member
+                    auto is_above = x_a > x_ip1;
+                    xt::view(alpha_i, m + 1, xt::all()) = xt::where(
+                            is_above, x_ip1 - x_i, xt::view(alpha_i, m + 1)
+                    );
+                }
+
+                // case x_a > x_N
+                // i.e. observation is an outlier beyond predictive range
+                auto x_N = xt::view(q_qnt, xt::all(), xt::all(), n_mbr - 1, xt::all());
+
+                auto is_beyond = x_a > x_N;
+                xt::view(alpha_i, n_mbr, xt::all()) = xt::where(
+                        is_beyond, x_a - x_N, xt::view(alpha_i, n_mbr)
+                );
+
+                // compute crps as difference between the quadratic CDFs
+                auto p_i_2 = xt::eval(
+                        xt::view(
+                                xt::square(xt::arange<double>(n_mbr + 1) / n_mbr),
+                                xt::all(), xt::newaxis(), xt::newaxis(),
+                                xt::newaxis()
+                        )
+                );
+
+                auto crps_from_ecdf = xt::sum(
+                        (alpha_i * p_i_2) + (beta_i * (1 - p_i_2)),
+                        0
+                );
+
+                return crps_from_ecdf;
+            }
+        }
+
+        namespace metrics
+        {
+            /// Compute the continuous rank probability score based on the
+            /// integration over the quadratic difference between the observed
+            /// and empirical cumulative density functions (CRPS_FROM_ECDF).
+            ///
+            /// \param crps_from_ecdf
+            ///     CRPS for each time step.
+            ///     shape: (sites, lead times, time)
+            /// \param t_msk
+            ///     Temporal subsets of the whole record.
+            ///     shape: (sites, lead times, subsets, time)
+            /// \param b_exp
+            ///     Boostrap samples.
+            ///     shape: (samples, time slice)
+            /// \param n_sit
+            ///     Number of sites.
+            /// \param n_ldt
+            ///     Number of lead times.
+            /// \param n_tim
+            ///     Number of time steps.
+            /// \param n_msk
+            ///     Number of temporal subsets.
+            /// \param n_exp
+            ///     Number of bootstrap samples.
+            /// \return
+            ///     CRPS.
+            ///     shape: (sites, lead times, subsets, samples)
+            inline xt::xtensor<double, 4> calc_CRPS_FROM_ECDF(
+                    const xt::xtensor<double, 3>& crps_from_ecdf,
+                    const xt::xtensor<bool, 4>& t_msk,
+                    const std::vector<xt::xkeep_slice<int>>& b_exp,
+                    std::size_t n_sit,
+                    std::size_t n_ldt,
+                    std::size_t n_msk,
+                    std::size_t n_exp
+            )
+            {
+                // initialise output variable
+                xt::xtensor<double, 4> CRPS_FROM_ECDF =
+                        xt::zeros<double>({n_sit, n_ldt, n_msk, n_exp});
+
+                // compute variable one mask at a time to minimise memory imprint
+                for (std::size_t m = 0; m < n_msk; m++)
+                {
+                    // apply the mask
+                    // (using NaN workaround until reducers work on masked_view)
+                    auto crps_masked = xt::where(
+                            xt::view(t_msk, xt::all(), xt::all(), m, xt::all()),
+                            crps_from_ecdf,
+                            NAN
+                    );
+
+                    // compute variable one sample at a time
+                    for (std::size_t e = 0; e < n_exp; e++)
+                    {
+                        // apply the bootstrap sampling
+                        auto crps_masked_sampled = xt::view(
+                                crps_masked, xt::all(), xt::all(), b_exp[e]
+                        );
+
+                        // calculate the mean over the time steps
+                        xt::view(CRPS_FROM_ECDF, xt::all(), xt::all(), m, e) =
+                                xt::nanmean(crps_masked_sampled, -1);
+                    }
+                }
+
+                return CRPS_FROM_ECDF;
+            }
+        }
+    }
+}
+
+#endif //EVALHYD_PROBABILIST_CDF_HPP
\ No newline at end of file
diff --git a/include/evalhyd/detail/probabilist/evaluator.hpp b/include/evalhyd/detail/probabilist/evaluator.hpp
index e1f5b48..7e23c8e 100644
--- a/include/evalhyd/detail/probabilist/evaluator.hpp
+++ b/include/evalhyd/detail/probabilist/evaluator.hpp
@@ -14,6 +14,7 @@
 
 #include "diagnostics.hpp"
 #include "brier.hpp"
+#include "cdf.hpp"
 #include "quantiles.hpp"
 #include "contingency.hpp"
 #include "ranks.hpp"
@@ -80,6 +81,8 @@ namespace evalhyd
             // (i.e. before the reduction along the temporal axis)
             // > Brier-based
             xtl::xoptional<xt::xtensor<double, 4>, bool> bs;
+            // > CDF-based
+            xtl::xoptional<xt::xtensor<double, 3>, bool> crps_from_ecdf;
             // > Quantiles-based
             xtl::xoptional<xt::xtensor<double, 4>, bool> qs;
             xtl::xoptional<xt::xtensor<double, 3>, bool> crps_from_qs;
@@ -99,6 +102,8 @@ namespace evalhyd
             xtl::xoptional<xt::xtensor<double, 6>, bool> BS_LBD;
             xtl::xoptional<xt::xtensor<double, 5>, bool> BSS;
             xtl::xoptional<xt::xtensor<double, 4>, bool> CRPS_FROM_BS;
+            // > CDF-based
+            xtl::xoptional<xt::xtensor<double, 4>, bool> CRPS_FROM_ECDF;
             // > Quantiles-based
             xtl::xoptional<xt::xtensor<double, 5>, bool> QS;
             xtl::xoptional<xt::xtensor<double, 4>, bool> CRPS_FROM_QS;
@@ -382,6 +387,17 @@ namespace evalhyd
                 return bs.value();
             };
 
+            xt::xtensor<double, 3> get_crps_from_ecdf()
+            {
+                if (!crps_from_ecdf.has_value())
+                {
+                    crps_from_ecdf = intermediate::calc_crps_from_ecdf(
+                            q_obs, get_q_qnt(), n_sit, n_ldt, n_mbr, n_tim
+                    );
+                }
+                return crps_from_ecdf.value();
+            };
+
             xt::xtensor<double, 4> get_qs()
             {
                 if (!qs.has_value())
@@ -599,6 +615,18 @@ namespace evalhyd
                 return CRPS_FROM_BS.value();
             };
 
+            xt::xtensor<double, 4> get_CRPS_FROM_ECDF()
+            {
+                if (!CRPS_FROM_ECDF.has_value())
+                {
+                    CRPS_FROM_ECDF = metrics::calc_CRPS_FROM_ECDF(
+                            get_crps_from_ecdf(), t_msk, b_exp,
+                            n_sit, n_ldt, n_msk, n_exp
+                    );
+                }
+                return CRPS_FROM_ECDF.value();
+            };
+
             xt::xtensor<double, 5> get_QS()
             {
                 if (!QS.has_value())
diff --git a/include/evalhyd/detail/probabilist/intervals.hpp b/include/evalhyd/detail/probabilist/intervals.hpp
index 0a2f4c2..c36a235 100644
--- a/include/evalhyd/detail/probabilist/intervals.hpp
+++ b/include/evalhyd/detail/probabilist/intervals.hpp
@@ -296,7 +296,8 @@ namespace evalhyd
                         std::size_t n_itv,
                         std::size_t n_msk,
                         std::size_t n_exp
-                ) {
+                )
+                {
                     // initialise output variable
                     xt::xtensor<double, 5> METRIC =
                             xt::zeros<double>({n_sit, n_ldt, n_msk, n_exp, n_itv});
diff --git a/include/evalhyd/detail/probabilist/quantiles.hpp b/include/evalhyd/detail/probabilist/quantiles.hpp
index a25ad61..88bd528 100644
--- a/include/evalhyd/detail/probabilist/quantiles.hpp
+++ b/include/evalhyd/detail/probabilist/quantiles.hpp
@@ -186,7 +186,7 @@ namespace evalhyd
             /// Compute the continuous rank probability score based on the
             /// integration over the quantile scores (CRPS_FROM_QS).
             ///
-            /// \param crps
+            /// \param crps_from_qs
             ///     CRPS for each time step.
             ///     shape: (sites, lead times, time)
             /// \param t_msk
@@ -207,7 +207,7 @@ namespace evalhyd
             ///     CRPS.
             ///     shape: (sites, lead times, subsets, samples)
             inline xt::xtensor<double, 4> calc_CRPS_FROM_QS(
-                    const xt::xtensor<double, 3>& crps,
+                    const xt::xtensor<double, 3>& crps_from_qs,
                     const xt::xtensor<bool, 4>& t_msk,
                     const std::vector<xt::xkeep_slice<int>>& b_exp,
                     std::size_t n_sit,
@@ -227,7 +227,7 @@ namespace evalhyd
                     // (using NaN workaround until reducers work on masked_view)
                     auto crps_masked = xt::where(
                             xt::view(t_msk, xt::all(), xt::all(), m, xt::all()),
-                            crps,
+                            crps_from_qs,
                             NAN
                     );
 
diff --git a/include/evalhyd/evalp.hpp b/include/evalhyd/evalp.hpp
index 95e6f3d..e3eeb7b 100644
--- a/include/evalhyd/evalp.hpp
+++ b/include/evalhyd/evalp.hpp
@@ -225,6 +225,7 @@ namespace evalhyd
         utils::check_metrics(
                 metrics,
                 {"BS", "BSS", "BS_CRD", "BS_LBD", "REL_DIAG", "CRPS_FROM_BS",
+                 "CRPS_FROM_ECDF",
                  "QS", "CRPS_FROM_QS",
                  "POD", "POFD", "FAR", "CSI", "ROCSS",
                  "RANK_HIST", "DS", "AS",
@@ -450,6 +451,12 @@ namespace evalhyd
                         uncertainty::summarise_p(evaluator.get_CRPS_FROM_BS(), summary)
                 );
             }
+            else if ( metric == "CRPS_FROM_ECDF" )
+            {
+                r.emplace_back(
+                        uncertainty::summarise_p(evaluator.get_CRPS_FROM_ECDF(), summary)
+                );
+            }
             else if ( metric == "QS" )
             {
                 r.emplace_back(
diff --git a/tests/expected/evalp/CRPS_FROM_ECDF.csv b/tests/expected/evalp/CRPS_FROM_ECDF.csv
new file mode 100644
index 0000000..13832cc
--- /dev/null
+++ b/tests/expected/evalp/CRPS_FROM_ECDF.csv
@@ -0,0 +1 @@
+271.9578705197483
diff --git a/tests/test_probabilist.cpp b/tests/test_probabilist.cpp
index f332450..fd8f24e 100644
--- a/tests/test_probabilist.cpp
+++ b/tests/test_probabilist.cpp
@@ -31,6 +31,7 @@ using namespace xt::placeholders;  // required for `_` to work
 
 std::vector<std::string> all_metrics_p = {
         "BS", "BSS", "BS_CRD", "BS_LBD", "REL_DIAG", "CRPS_FROM_BS",
+        "CRPS_FROM_ECDF",
         "QS", "CRPS_FROM_QS",
         "POD", "POFD", "FAR", "CSI", "ROCSS",
         "RANK_HIST", "DS", "AS",
@@ -116,6 +117,37 @@ TEST(ProbabilistTests, TestBrier)
     }
 }
 
+TEST(ProbabilistTests, TestCDF)
+{
+    // read in data
+    xt::xtensor<double, 1> observed;
+    xt::xtensor<double, 2> predicted;
+    std::tie(observed, predicted) = load_data_p();
+
+    // read in expected results
+    auto expected = load_expected_p();
+
+    // compute scores
+    std::vector<std::string> metrics = {"CRPS_FROM_ECDF"};
+
+    std::vector<xt::xarray<double>> results =
+            evalhyd::evalp(
+                    // shape: (sites [1], time [t])
+                    xt::eval(xt::view(observed, xt::newaxis(), xt::all())),
+                    // shape: (sites [1], lead times [1], members [m], time [t])
+                    xt::eval(xt::view(predicted, xt::newaxis(), xt::newaxis(), xt::all(), xt::all())),
+                    metrics
+            );
+
+    // check results
+    for (std::size_t m = 0; m < metrics.size(); m++)
+    {
+        EXPECT_TRUE(xt::all(xt::isclose(
+                results[m], expected[metrics[m]], 1e-05, 1e-08, true
+        ))) << "Failure for (" << metrics[m] << ")";
+    }
+}
+
 TEST(ProbabilistTests, TestQuantiles)
 {
     // read in data
-- 
GitLab