From 9de91e03d5da204c2847e7c409595cb15df6aec4 Mon Sep 17 00:00:00 2001 From: Le Roux Erwan <erwan.le-roux@irstea.fr> Date: Tue, 2 Mar 2021 15:51:03 +0100 Subject: [PATCH] [projection swe] add non-stationary weight computer. refactor code for the weights. --- .../adamont_data/adamont_scenario.py | 4 + .../visualizer_for_projection_ensemble.py | 7 ++ .../projected_swe/abstract_weight_computer.py | 94 +++++++++++++++ .../projected_swe/main_weight_computation.py | 112 ++++-------------- .../non_stationary_weight_computer.py | 41 +++++++ projects/projected_swe/utils.py | 73 ++++++++++++ 6 files changed, 242 insertions(+), 89 deletions(-) create mode 100644 projects/projected_swe/abstract_weight_computer.py create mode 100644 projects/projected_swe/non_stationary_weight_computer.py create mode 100644 projects/projected_swe/utils.py diff --git a/extreme_data/meteo_france_data/adamont_data/adamont_scenario.py b/extreme_data/meteo_france_data/adamont_data/adamont_scenario.py index e9003913..1cbdead3 100644 --- a/extreme_data/meteo_france_data/adamont_data/adamont_scenario.py +++ b/extreme_data/meteo_france_data/adamont_data/adamont_scenario.py @@ -139,3 +139,7 @@ def scenario_to_real_scenarios(adamont_scenario): def gcm_rcm_couple_to_str(gcm_rcm_couple): return SEPARATOR_STR.join(gcm_rcm_couple) + + +def str_to_gcm_rcm_couple(str): + return tuple(str.split(SEPARATOR_STR)) diff --git a/extreme_trend/ensemble_fit/visualizer_for_projection_ensemble.py b/extreme_trend/ensemble_fit/visualizer_for_projection_ensemble.py index 9d441551..4dfb2e52 100644 --- a/extreme_trend/ensemble_fit/visualizer_for_projection_ensemble.py +++ b/extreme_trend/ensemble_fit/visualizer_for_projection_ensemble.py @@ -26,10 +26,17 @@ class VisualizerForProjectionEnsemble(object): gcm_to_year_min_and_year_max=None, ): self.altitudes_list = altitudes_list + self.scenario = scenario self.gcm_rcm_couples = gcm_rcm_couples self.massif_names = massif_names self.ensemble_fit_classes = ensemble_fit_classes + # Some checks + if gcm_to_year_min_and_year_max is not None: + for gcm, years in gcm_to_year_min_and_year_max.items(): + assert isinstance(gcm, str), gcm + assert len(years) == 2, years + # Load all studies altitude_class_to_gcm_couple_to_studies = OrderedDict() for altitudes in altitudes_list: diff --git a/projects/projected_swe/abstract_weight_computer.py b/projects/projected_swe/abstract_weight_computer.py new file mode 100644 index 00000000..dbff2a20 --- /dev/null +++ b/projects/projected_swe/abstract_weight_computer.py @@ -0,0 +1,94 @@ +from collections import OrderedDict +import pandas as pd + +import numpy as np +from scipy.special import softmax + +from extreme_data.meteo_france_data.adamont_data.adamont_scenario import gcm_rcm_couple_to_str +from extreme_data.meteo_france_data.scm_models_data.altitudes_studies import AltitudesStudies +from extreme_fit.estimator.margin_estimator.abstract_margin_estimator import compute_nllh +from extreme_trend.ensemble_fit.independent_ensemble_fit.independent_ensemble_fit import IndependentEnsembleFit +from extreme_trend.ensemble_fit.visualizer_for_projection_ensemble import VisualizerForProjectionEnsemble +from projects.projected_swe.utils import WEIGHT_COLUMN_NAME, get_csv_filepath, save_to_filepath +from spatio_temporal_dataset.coordinates.temporal_coordinates.abstract_temporal_covariate_for_fit import \ + TimeTemporalCovariate +from collections import OrderedDict + +import numpy as np +from scipy.special import softmax + +from extreme_data.meteo_france_data.scm_models_data.altitudes_studies import AltitudesStudies +from extreme_fit.estimator.margin_estimator.abstract_margin_estimator import compute_nllh +from extreme_trend.ensemble_fit.independent_ensemble_fit.independent_ensemble_fit import IndependentEnsembleFit +from extreme_trend.ensemble_fit.visualizer_for_projection_ensemble import VisualizerForProjectionEnsemble +from spatio_temporal_dataset.coordinates.temporal_coordinates.abstract_temporal_covariate_for_fit import \ + TimeTemporalCovariate + + +class AbstractWeightComputer(object): + + def __init__(self, visualizer: VisualizerForProjectionEnsemble, scm_study_class, year_min, year_max): + self.scm_study_class = scm_study_class + self.year_max = year_max + self.year_min = year_min + self.visualizer = visualizer + + def compute_weights_and_save_them(self): + column_names = [WEIGHT_COLUMN_NAME] + self.visualizer.gcm_rcm_couples + df = pd.DataFrame(index=self.visualizer.gcm_rcm_couples, columns=column_names) + for i, column_name in enumerate(column_names): + if i == 0: + gcm_rcm_couples_subset = self.visualizer.gcm_rcm_couples + gcm_rcm_couple_missing = None + else: + index_missing = i - 1 + gcm_rcm_couple_missing = self.visualizer.gcm_rcm_couples[index_missing] + gcm_rcm_couples_subset = self.visualizer.gcm_rcm_couples[:] + gcm_rcm_couples_subset.remove(gcm_rcm_couple_missing) + # Compute weights + gcm_rcm_couple_to_weight = self.compute_weight(gcm_rcm_couples_subset, gcm_rcm_couple_missing) + column_values = [gcm_rcm_couple_to_weight[c] for c in df.index] + df[column_name] = column_values + + # Save csv + save_to_filepath(df, self.visualizer.gcm_rcm_couples, self.visualizer.altitudes_list, + self.year_min, self.year_max, + self.visualizer.scenario) + + def compute_weight(self, gcm_rcm_couples_subset, gcm_rcm_couple_missing): + # Initial the dictionary + gcm_rcm_couple_to_nllh = OrderedDict() + for gcm_rcm_couple in gcm_rcm_couples_subset: + gcm_rcm_couple_to_nllh[gcm_rcm_couple] = 0 + for ensemble_fit in self.visualizer.ensemble_fits(ensemble_class=IndependentEnsembleFit): + # Load the AltitudeStudies + if gcm_rcm_couple_missing is None: + reference_studies = AltitudesStudies(self.scm_study_class, ensemble_fit.altitudes, + year_min=self.year_min, + year_max=self.year_max) + else: + reference_studies = ensemble_fit.gcm_rcm_couple_to_visualizer[gcm_rcm_couple_missing].studies + # Computer the nllh for each "station", i.e. each altitude and massif_name + for altitude, reference_study in reference_studies.altitude_to_study.items(): + for massif_name, maxima in reference_study.massif_name_to_annual_maxima.items(): + assert all([self.year_min <= year <= self.year_max for year in reference_study.ordered_years]) + gcm_rcm_couple_to_local_nllh = self.compute_gcm_rcm_couple_to_local_nllh(ensemble_fit, + gcm_rcm_couples_subset, + massif_name, + maxima, + altitude, + reference_study) + if gcm_rcm_couple_to_local_nllh is not None: + for c, nllh in gcm_rcm_couple_to_local_nllh.items(): + gcm_rcm_couple_to_nllh[c] += nllh + # Compute weights + weights = softmax(-np.array(list(gcm_rcm_couple_to_nllh.values()))) + weights = [w[0] if isinstance(w, np.ndarray) else w for w in weights] + gcm_rcm_couple_to_weight = dict(zip(gcm_rcm_couples_subset, weights)) + # Add missing weight + gcm_rcm_couple_to_weight[gcm_rcm_couple_missing] = np.nan + return gcm_rcm_couple_to_weight + + def compute_gcm_rcm_couple_to_local_nllh(self, ensemble_fit, gcm_rcm_couples_subset, massif_name, maxima, + altitude, reference_study): + raise NotImplementedError diff --git a/projects/projected_swe/main_weight_computation.py b/projects/projected_swe/main_weight_computation.py index f43222ec..579346bf 100644 --- a/projects/projected_swe/main_weight_computation.py +++ b/projects/projected_swe/main_weight_computation.py @@ -11,97 +11,28 @@ from scipy.special import softmax from extreme_data.meteo_france_data.adamont_data.adamont.adamont_safran import AdamontSnowfall from extreme_data.meteo_france_data.adamont_data.adamont_scenario import AdamontScenario, get_gcm_rcm_couples, \ - gcm_rcm_couple_to_str, SEPARATOR_STR + gcm_rcm_couple_to_str, SEPARATOR_STR, scenario_to_str from extreme_data.meteo_france_data.scm_models_data.altitudes_studies import AltitudesStudies from extreme_data.meteo_france_data.scm_models_data.safran.safran import SafranSnowfall1Day from extreme_data.meteo_france_data.scm_models_data.utils import Season from extreme_data.utils import DATA_PATH from extreme_fit.estimator.margin_estimator.abstract_margin_estimator import compute_nllh +from extreme_fit.model.margin_model.polynomial_margin_model.gev_altitudinal_models import StationaryAltitudinal +from extreme_fit.model.margin_model.polynomial_margin_model.models_based_on_pariwise_analysis.gev_with_linear_shape_wrt_altitude import \ + AltitudinalShapeLinearTimeStationary from extreme_fit.model.margin_model.polynomial_margin_model.utils import \ ALTITUDINAL_GEV_MODELS_BASED_ON_POINTWISE_ANALYSIS from extreme_trend.ensemble_fit.independent_ensemble_fit.independent_ensemble_fit import IndependentEnsembleFit from extreme_trend.ensemble_fit.visualizer_for_projection_ensemble import VisualizerForProjectionEnsemble from extreme_trend.one_fold_fit.altitude_group import altitudes_for_groups +from projects.projected_swe.abstract_weight_computer import AbstractWeightComputer +from projects.projected_swe.non_stationary_weight_computer import NllhWeightComputer +from projects.projected_swe.utils import load_gcm_rcm_couple_to_weight from spatio_temporal_dataset.coordinates.temporal_coordinates.abstract_temporal_covariate_for_fit import \ TimeTemporalCovariate from spatio_temporal_dataset.coordinates.temporal_coordinates.temperature_covariate import \ AnomalyTemperatureWithSplineTemporalCovariate -WEIGHT_COLUMN_NAME = "weight" - -WEIGHT_FOLDER = "ensemble weight" - - -def get_csv_filepath(gcm_rcm_couples, altitudes_list, year_min, year_max): - nb_gcm_rcm_couples = len(gcm_rcm_couples) - nb_altitudes_list = len(altitudes_list) - ensemble_folder_path = op.join(DATA_PATH, WEIGHT_FOLDER) - if not op.exists(ensemble_folder_path): - os.makedirs(ensemble_folder_path, exist_ok=True) - csv_filename = "weights_{}_{}_{}_{}.csv".format(nb_gcm_rcm_couples, nb_altitudes_list, year_min, year_max) - weight_csv_filepath = op.join(ensemble_folder_path, csv_filename) - return weight_csv_filepath - - -def load_gcm_rcm_couple_to_weight(gcm_rcm_couples, altitudes_list, year_min, year_max): - filepath = get_csv_filepath(gcm_rcm_couples, altitudes_list, year_min, year_max) - df = pd.read_csv(filepath, index_col=0) - d = df[WEIGHT_COLUMN_NAME].to_dict() - d = {tuple(k.split(SEPARATOR_STR)): v for k, v in d.items()} - return d - - -def save_gcm_rcm_couple_to_weight(visualizer: VisualizerForProjectionEnsemble, scm_study_class, - year_min, year_max): - gcm_rcm_couple_to_nllh_sum = OrderedDict() - for c in visualizer.gcm_rcm_couples: - gcm_rcm_couple_to_nllh_sum[c] = 0 - for ensemble_fit in visualizer.ensemble_fits(ensemble_class=IndependentEnsembleFit): - # Load the AltitudeStudies - scm_studies = AltitudesStudies(scm_study_class, ensemble_fit.altitudes, year_min=year_min,year_max=year_max) - for altitude, study in scm_studies.altitude_to_study.items(): - for massif_name, maxima in study.massif_name_to_annual_maxima.items(): - # Check that all the gcm_rcm_couple have a model for this massif_name - if condition_to_compute_nllh(ensemble_fit, massif_name, visualizer): - print(ensemble_fit.altitudes, massif_name) - coordinates = [np.array([altitude, year]) for year in study.ordered_years] - nllh_list = [] - for gcm_rcm_couple in visualizer.gcm_rcm_couples: - best_function_from_fit = get_function_from_fit(ensemble_fit, massif_name, gcm_rcm_couple) - # It is normal that it could crash, because some models where fitted with data smaller than - # the data used to compute the nllh - nllh = compute_nllh(coordinates, maxima, best_function_from_fit, - maximum_from_obs=False, assertion_for_inf=False) - nllh_list.append(nllh) - if all([not np.isinf(nllh) for nllh in nllh_list]): - for nllh, gcm_rcm_couple in zip(nllh_list, visualizer.gcm_rcm_couples): - gcm_rcm_couple_to_nllh_sum[gcm_rcm_couple] += nllh - - # Compute the final weight - print(gcm_rcm_couple_to_nllh_sum) - llh_list = -np.array(list(gcm_rcm_couple_to_nllh_sum.values())) - weights = softmax(llh_list) - couple_names = [gcm_rcm_couple_to_str(c) for c in visualizer.gcm_rcm_couples] - gcm_rcm_couple_to_normalized_weights = dict(zip(couple_names, weights)) - print(gcm_rcm_couple_to_normalized_weights) - # Save to csv - filepath = get_csv_filepath(visualizer.gcm_rcm_couples, visualizer.altitudes_list, year_min, year_max) - df = pd.DataFrame({WEIGHT_COLUMN_NAME: weights}, index=couple_names) - print(df) - df.to_csv(filepath) - - -def condition_to_compute_nllh(ensemble_fit, massif_name, visualizer): - return all( - [massif_name in ensemble_fit.gcm_rcm_couple_to_visualizer[c].massif_name_to_one_fold_fit for c in - visualizer.gcm_rcm_couples]) - - -def get_function_from_fit(ensemble_fit, massif_name, gcm_rcm_couple): - visualizer = ensemble_fit.gcm_rcm_couple_to_visualizer[gcm_rcm_couple] - one_fold_fit = visualizer.massif_name_to_one_fold_fit[massif_name] - return one_fold_fit.best_function_from_fit - def main_weight_computation(): start = time.time() @@ -111,23 +42,24 @@ def main_weight_computation(): }[study_class] ensemble_fit_classes = [IndependentEnsembleFit] temporal_covariate_for_fit = TimeTemporalCovariate - model_classes = ALTITUDINAL_GEV_MODELS_BASED_ON_POINTWISE_ANALYSIS + # model_classes = ALTITUDINAL_GEV_MODELS_BASED_ON_POINTWISE_ANALYSIS + model_classes = [StationaryAltitudinal, AltitudinalShapeLinearTimeStationary] remove_physically_implausible_models = True - scenario = AdamontScenario.histo + scenario = AdamontScenario.rcp85_extended gcm_rcm_couples = get_gcm_rcm_couples(scenario) year_min = 1982 - year_max = 2005 - # todo: maybe i should also limit the years for the fit of the model for each ensemble ? + year_max = 2019 + weight_computer_class = NllhWeightComputer - fast = None + fast = True if fast is None: massif_names = None altitudes_list = altitudes_for_groups[2:3] - gcm_rcm_couples = gcm_rcm_couples[:10] + gcm_rcm_couples = gcm_rcm_couples[:] elif fast: - massif_names = ['Pelvoux'][:1] - altitudes_list = altitudes_for_groups[2:3] - gcm_rcm_couples = gcm_rcm_couples[:2] + massif_names = ['Vanoise', 'Pelvoux', 'Chartreuse'][:1] + altitudes_list = altitudes_for_groups[1:2] + gcm_rcm_couples = gcm_rcm_couples[:3] else: massif_names = None altitudes_list = altitudes_for_groups[:] @@ -139,9 +71,11 @@ def main_weight_computation(): massif_names=massif_names, temporal_covariate_for_fit=temporal_covariate_for_fit, remove_physically_implausible_models=remove_physically_implausible_models, - gcm_to_year_min_and_year_max=None, + gcm_to_year_min_and_year_max={c[0]: (year_min, year_max) for c in gcm_rcm_couples}, ) - save_gcm_rcm_couple_to_weight(visualizer, scm_study_class, year_min, year_max) + + weight_computer = weight_computer_class(visualizer, scm_study_class, year_min, year_max) # type:AbstractWeightComputer + weight_computer.compute_weights_and_save_them() end = time.time() duration = str(datetime.timedelta(seconds=end - start)) @@ -150,5 +84,5 @@ def main_weight_computation(): if __name__ == '__main__': main_weight_computation() - # d = load_gcm_rcm_couple_to_weight(['sd', 'sdf'], [23]) - # print(d) + d = load_gcm_rcm_couple_to_weight(['sd', 'sdf'], [23], 1982, 2019, AdamontScenario.rcp85_extended) + print(d) diff --git a/projects/projected_swe/non_stationary_weight_computer.py b/projects/projected_swe/non_stationary_weight_computer.py new file mode 100644 index 00000000..c7ffaec6 --- /dev/null +++ b/projects/projected_swe/non_stationary_weight_computer.py @@ -0,0 +1,41 @@ +import numpy as np + +from extreme_fit.estimator.margin_estimator.abstract_margin_estimator import compute_nllh +from projects.projected_swe.abstract_weight_computer import AbstractWeightComputer +from spatio_temporal_dataset.coordinates.temporal_coordinates.abstract_temporal_covariate_for_fit import \ + TimeTemporalCovariate + + +class NllhWeightComputer(AbstractWeightComputer): + + def compute_gcm_rcm_couple_to_local_nllh(self, ensemble_fit, gcm_rcm_couples_subset, massif_name, maxima, + altitude, reference_study): + assert ensemble_fit.temporal_covariate_for_fit is TimeTemporalCovariate, \ + 'for temperature covariate you should transform the coordinates' + gcm_rcm_couple_to_local_nllh = {} + # Check that all the gcm_rcm_couple have a model for this massif_name + if self.condition_to_compute_nllh(ensemble_fit, massif_name, self.visualizer): + print(ensemble_fit.altitudes, massif_name) + coordinates = [np.array([altitude, year]) for year in reference_study.ordered_years] + nllh_list = [] + for gcm_rcm_couple in gcm_rcm_couples_subset: + best_function_from_fit = self.get_function_from_fit(ensemble_fit, massif_name, gcm_rcm_couple) + # It is normal that it could crash, because some models where fitted with data smaller than + # the data used to compute the nllh + + nllh = compute_nllh(coordinates, maxima, best_function_from_fit, + maximum_from_obs=False, assertion_for_inf=False) + nllh_list.append(nllh) + + if all([not np.isinf(nllh) for nllh in nllh_list]): + return dict(zip(gcm_rcm_couples_subset, nllh_list)) + + def condition_to_compute_nllh(self, ensemble_fit, massif_name, visualizer): + return all( + [massif_name in ensemble_fit.gcm_rcm_couple_to_visualizer[c].massif_name_to_one_fold_fit for c in + visualizer.gcm_rcm_couples]) + + def get_function_from_fit(self, ensemble_fit, massif_name, gcm_rcm_couple): + visualizer = ensemble_fit.gcm_rcm_couple_to_visualizer[gcm_rcm_couple] + one_fold_fit = visualizer.massif_name_to_one_fold_fit[massif_name] + return one_fold_fit.best_function_from_fit diff --git a/projects/projected_swe/utils.py b/projects/projected_swe/utils.py new file mode 100644 index 00000000..2c04c17d --- /dev/null +++ b/projects/projected_swe/utils.py @@ -0,0 +1,73 @@ +import os + +from collections import OrderedDict + +import pandas as pd +import os.path as op +import datetime +import time +import numpy as np +from scipy.special import softmax + +from extreme_data.meteo_france_data.adamont_data.adamont.adamont_safran import AdamontSnowfall +from extreme_data.meteo_france_data.adamont_data.adamont_scenario import AdamontScenario, get_gcm_rcm_couples, \ + gcm_rcm_couple_to_str, SEPARATOR_STR, scenario_to_str, str_to_gcm_rcm_couple +from extreme_data.meteo_france_data.scm_models_data.altitudes_studies import AltitudesStudies +from extreme_data.meteo_france_data.scm_models_data.safran.safran import SafranSnowfall1Day +from extreme_data.meteo_france_data.scm_models_data.utils import Season +from extreme_data.utils import DATA_PATH +from extreme_fit.estimator.margin_estimator.abstract_margin_estimator import compute_nllh +from extreme_fit.model.margin_model.polynomial_margin_model.gev_altitudinal_models import StationaryAltitudinal +from extreme_fit.model.margin_model.polynomial_margin_model.utils import \ + ALTITUDINAL_GEV_MODELS_BASED_ON_POINTWISE_ANALYSIS +from extreme_trend.ensemble_fit.independent_ensemble_fit.independent_ensemble_fit import IndependentEnsembleFit +from extreme_trend.ensemble_fit.visualizer_for_projection_ensemble import VisualizerForProjectionEnsemble +from extreme_trend.one_fold_fit.altitude_group import altitudes_for_groups +from spatio_temporal_dataset.coordinates.temporal_coordinates.abstract_temporal_covariate_for_fit import \ + TimeTemporalCovariate +from spatio_temporal_dataset.coordinates.temporal_coordinates.temperature_covariate import \ + AnomalyTemperatureWithSplineTemporalCovariate + +WEIGHT_COLUMN_NAME = "all weights" + +WEIGHT_FOLDER = "ensemble weight" + + +def get_csv_filepath(gcm_rcm_couples, altitudes_list, year_min, year_max, scenario): + nb_gcm_rcm_couples = len(gcm_rcm_couples) + nb_altitudes_list = len(altitudes_list) + ensemble_folder_path = op.join(DATA_PATH, WEIGHT_FOLDER) + if not op.exists(ensemble_folder_path): + os.makedirs(ensemble_folder_path, exist_ok=True) + scenario_str = scenario_to_str(scenario) + csv_filename = "weights_{}_{}_{}_{}_{}.csv" \ + .format(nb_gcm_rcm_couples, nb_altitudes_list, year_min, year_max, scenario_str) + weight_csv_filepath = op.join(ensemble_folder_path, csv_filename) + return weight_csv_filepath + + +def save_to_filepath(df, gcm_rcm_couples, altitudes_list, + year_min, year_max, + scenario): + filepath = get_csv_filepath(gcm_rcm_couples, altitudes_list, year_min, year_max, scenario) + df = df.round(decimals=3) + df.index = [gcm_rcm_couple_to_str(i) for i in df.index] + df.columns = [gcm_rcm_couple_to_str(i) if j > 0 else i for j, i in enumerate(df.columns)] + + # df.columns = [gcm_rcm_couple_to_str(i) for i in df.index] + print(df.head()) + df.to_csv(filepath) + + +def load_gcm_rcm_couple_to_weight(gcm_rcm_couples, altitudes_list, year_min, year_max, scenario, + gcm_rcm_couple_missing=None): + filepath = get_csv_filepath(gcm_rcm_couples, altitudes_list, year_min, year_max, scenario) + df = pd.read_csv(filepath, index_col=0) + df.index = [str_to_gcm_rcm_couple(i) for i in df.index] + df.columns = [str_to_gcm_rcm_couple(i) if j > 0 else i for j, i in enumerate(df.columns)] + if gcm_rcm_couple_missing is None: + column_name = WEIGHT_COLUMN_NAME + else: + column_name = gcm_rcm_couple_missing + d = df[column_name].to_dict() + return d -- GitLab