From 815d8196937030e93fc9da2b4a8a2c2ff6afd244 Mon Sep 17 00:00:00 2001 From: Le Roux Erwan <erwan.le-roux@irstea.fr> Date: Wed, 22 Jul 2020 16:36:32 +0200 Subject: [PATCH] [contrasting] add default pval equal to zero in gnFit (to avoid the program to crash when called by python). refactor anderson test. add anderson test by default for the one fold analysis. --- extreme_fit/distribution/gumbel/gnfit_fixed.R | 2 + extreme_fit/distribution/gumbel/gumbel_gof.py | 6 +++ extreme_trend/abstract_gev_trend_test.py | 14 +----- .../altitudes_fit/main_altitudes_studies.py | 44 +++++++++---------- ...es_visualizer_for_non_stationary_models.py | 26 ++++++++--- .../one_fold_analysis/one_fold_fit.py | 41 ++++++++++------- .../one_fold_analysis/plot_total_aic.py | 10 ++++- 7 files changed, 85 insertions(+), 58 deletions(-) diff --git a/extreme_fit/distribution/gumbel/gnfit_fixed.R b/extreme_fit/distribution/gumbel/gnfit_fixed.R index 6822300a..2a638eef 100644 --- a/extreme_fit/distribution/gumbel/gnfit_fixed.R +++ b/extreme_fit/distribution/gumbel/gnfit_fixed.R @@ -77,6 +77,7 @@ gnfit_fixed <- function (dat, dist, df = NULL, pr = NULL, threshold = NULL) pval <- exp(1.111 - 34.242 * w + 12.832 * w^2) } else { + pval <- 0 # I added that to avoid the code to crash warning("p-value is smaller than 7.37e-10") } z$Wpval <- pval @@ -96,6 +97,7 @@ gnfit_fixed <- function (dat, dist, df = NULL, pr = NULL, threshold = NULL) pval <- exp(1.2937 - 5.709 * A + 0.0186 * A^2) } else { + pval <- 0 # I added that to avoid the code to crash warning("p-value is smaller than 7.37e-10") } z$Apval <- pval diff --git a/extreme_fit/distribution/gumbel/gumbel_gof.py b/extreme_fit/distribution/gumbel/gumbel_gof.py index 35900799..41665109 100644 --- a/extreme_fit/distribution/gumbel/gumbel_gof.py +++ b/extreme_fit/distribution/gumbel/gumbel_gof.py @@ -11,5 +11,11 @@ def cramer_von_mises_and_anderson_darling_tests_pvalues_for_gumbel_distribution( return res['Wpval'], res['Apval'] +def goodness_of_fit_anderson(quantiles, significance_level=0.05): + test = cramer_von_mises_and_anderson_darling_tests_pvalues_for_gumbel_distribution(quantiles) + _, ander_darling_test_pvalue = test + return ander_darling_test_pvalue > significance_level + + if __name__ == '__main__': cramer_von_mises_and_anderson_darling_tests_pvalues_for_gumbel_distribution(np.arange(50)) diff --git a/extreme_trend/abstract_gev_trend_test.py b/extreme_trend/abstract_gev_trend_test.py index c18c003c..43a1e800 100644 --- a/extreme_trend/abstract_gev_trend_test.py +++ b/extreme_trend/abstract_gev_trend_test.py @@ -11,7 +11,7 @@ from extreme_data.eurocode_data.utils import EUROCODE_QUANTILE, YEAR_OF_INTEREST from extreme_data.meteo_france_data.scm_models_data.crocus.crocus_variables import AbstractSnowLoadVariable from extreme_fit.distribution.gev.gev_params import GevParams from extreme_fit.distribution.gumbel.gumbel_gof import \ - cramer_von_mises_and_anderson_darling_tests_pvalues_for_gumbel_distribution + cramer_von_mises_and_anderson_darling_tests_pvalues_for_gumbel_distribution, goodness_of_fit_anderson from extreme_fit.estimator.margin_estimator.utils import fitted_linear_margin_estimator from extreme_fit.model.margin_model.linear_margin_model.temporal_linear_margin_models import \ StationaryTemporalModel, GumbelTemporalModel @@ -70,18 +70,8 @@ class AbstractGevTrendTest(object): @property def goodness_of_fit_anderson_test(self): - # significance_level_to_index = dict(zip([0.25, 0.1, 0.05, 0.025, 0.01], list(range(5)))) - # print(significance_level_to_index) - # assert self.SIGNIFICANCE_LEVEL in significance_level_to_index - # # significance_level=array([25. , 10. , 5. , 2.5, 1. ])) - # index_for_significance_level_5_percent = 2 quantiles = self.compute_empirical_quantiles(estimator=self.unconstrained_estimator) - # test_res = anderson(quantiles, dist='gumbel_r') # type: AndersonResult - # return test_res.statistic < test_res.critical_values[index_for_significance_level_5_percent] - # print(quantiles) - test = cramer_von_mises_and_anderson_darling_tests_pvalues_for_gumbel_distribution(quantiles) - _, ander_darling_test_pvalue = test - return ander_darling_test_pvalue > self.SIGNIFICANCE_LEVEL + return goodness_of_fit_anderson(quantiles, self.SIGNIFICANCE_LEVEL) @property def name(self): diff --git a/projects/altitude_spatial_model/altitudes_fit/main_altitudes_studies.py b/projects/altitude_spatial_model/altitudes_fit/main_altitudes_studies.py index 097490a5..5b013a5f 100644 --- a/projects/altitude_spatial_model/altitudes_fit/main_altitudes_studies.py +++ b/projects/altitude_spatial_model/altitudes_fit/main_altitudes_studies.py @@ -2,7 +2,8 @@ import matplotlib as mpl import matplotlib.pyplot as plt import numpy as np -from projects.altitude_spatial_model.altitudes_fit.one_fold_analysis.plot_total_aic import plot_total_aic +from projects.altitude_spatial_model.altitudes_fit.one_fold_analysis.plot_total_aic import plot_total_aic, \ + plot_individual_aic mpl.rcParams['text.usetex'] = True mpl.rcParams['text.latex.preamble'] = [r'\usepackage{amsmath}'] @@ -22,21 +23,6 @@ from projects.altitude_spatial_model.altitudes_fit.one_fold_analysis.altitudes_s AltitudesStudiesVisualizerForNonStationaryModels -def plot_altitudinal_fit(studies, massif_names=None): - model_classes = ALTITUDINAL_GEV_MODELS + ALTITUDINAL_GUMBEL_MODELS - visualizer = AltitudesStudiesVisualizerForNonStationaryModels(studies=studies, - model_classes=model_classes, - massif_names=massif_names, - show=False) - # Plot the results for the model that minimizes the individual aic - OneFoldFit.best_estimator_minimizes_total_aic = False - visualizer.plot_moments() - # visualizer.plot_best_coef_maps() - visualizer.plot_shape_map() - - plot_total_aic(model_classes, visualizer) - - def plot_time_series(studies, massif_names=None): studies.plot_maxima_time_series(massif_names=massif_names) @@ -47,19 +33,33 @@ def plot_moments(studies, massif_names=None): studies.plot_mean_maxima_against_altitude(massif_names=massif_names, std=std, change=change) +def plot_altitudinal_fit(studies, massif_names=None): + model_classes = ALTITUDINAL_GEV_MODELS + ALTITUDINAL_GUMBEL_MODELS + visualizer = AltitudesStudiesVisualizerForNonStationaryModels(studies=studies, + model_classes=model_classes, + massif_names=massif_names, + show=False) + # Plot the results for the model that minimizes the individual aic + plot_individual_aic(visualizer) + # Plot the results for the model that minimizes the total aic + plot_total_aic(model_classes, visualizer) + + def main(): - # altitudes = [900, 1200, 1500, 1800, 2100, 2400, 2700, 3000][4:7] + altitudes = [900, 1200, 1500, 1800, 2100, 2400, 2700, 3000][4:7] # altitudes = [900, 1200, 1500, 1800, 2100, 2400, 2700, 3000][:] # todo: l ecart pour les saisons de l automne ou de sprint # vient probablement de certains zéros pas pris en compte pour le fit, # mais pris en compte pour le calcul de mon aic - altitudes = [900, 1200, 1500, 1800, 2100, 2400, 2700, 3000, 3300, 3600, 3900][:] + # altitudes = [900, 1200, 1500, 1800, 2100, 2400, 2700, 3000, 3300, 3600, 3900][:] study_classes = [SafranSnowfall1Day, SafranSnowfall3Days, SafranSnowfall5Days, SafranSnowfall7Days][:2] study_classes = [SafranPrecipitation1Day, SafranPrecipitation3Days, SafranPrecipitation5Days, SafranPrecipitation7Days][:] study_classes = [SafranSnowfall1Day, SafranPrecipitation1Day, - SafranSnowfall3Days, SafranPrecipitation3Days][:] - seasons = [Season.automn, Season.winter, Season.spring][::-1] + SafranSnowfall3Days, SafranPrecipitation3Days][:1] + # seasons = [Season.automn, Season.winter, Season.spring][::-1] + seasons = [Season.winter] + massif_names = None # massif_names = ['Aravis'] # massif_names = ['Chartreuse', 'Belledonne'] @@ -68,8 +68,8 @@ def main(): for study_class in study_classes: studies = AltitudesStudies(study_class, altitudes, season=season) print('inner loop', season, study_class) - plot_time_series(studies, massif_names) - plot_moments(studies, massif_names) + # plot_time_series(studies, massif_names) + # plot_moments(studies, massif_names) plot_altitudinal_fit(studies, massif_names) diff --git a/projects/altitude_spatial_model/altitudes_fit/one_fold_analysis/altitudes_studies_visualizer_for_non_stationary_models.py b/projects/altitude_spatial_model/altitudes_fit/one_fold_analysis/altitudes_studies_visualizer_for_non_stationary_models.py index 387a5acf..859e661f 100644 --- a/projects/altitude_spatial_model/altitudes_fit/one_fold_analysis/altitudes_studies_visualizer_for_non_stationary_models.py +++ b/projects/altitude_spatial_model/altitudes_fit/one_fold_analysis/altitudes_studies_visualizer_for_non_stationary_models.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Dict import matplotlib.pyplot as plt import numpy as np @@ -25,17 +25,26 @@ class AltitudesStudiesVisualizerForNonStationaryModels(StudyVisualizer): model_classes: List[AbstractSpatioTemporalPolynomialModel], show=False, massif_names=None, - fit_method=MarginFitMethod.extremes_fevd_mle): + fit_method=MarginFitMethod.extremes_fevd_mle, + display_only_model_that_pass_anderson_test=True): super().__init__(studies.study, show=show, save_to_file=not show) - self.massif_names = massif_names if massif_names is not None else self.study.study_massif_names self.studies = studies self.non_stationary_models = model_classes self.fit_method = fit_method - self.massif_name_to_one_fold_fit = {} + self.display_only_model_that_pass_anderson_test = display_only_model_that_pass_anderson_test + self.massif_names = massif_names if massif_names is not None else self.study.study_massif_names + self.massif_name_to_massif_id = {m: i for i, m in enumerate(self.massif_names)} + # Load one fold fit + self._massif_name_to_one_fold_fit = {} for massif_name in self.massif_names: dataset = studies.spatio_temporal_dataset(massif_name=massif_name) old_fold_fit = OneFoldFit(massif_name, dataset, model_classes, self.fit_method) - self.massif_name_to_one_fold_fit[massif_name] = old_fold_fit + self._massif_name_to_one_fold_fit[massif_name] = old_fold_fit + + @property + def massif_name_to_one_fold_fit(self) -> Dict[str, OneFoldFit]: + return {massif_name: old_fold_fit for massif_name, old_fold_fit in self._massif_name_to_one_fold_fit.items() + if not self.display_only_model_that_pass_anderson_test or old_fold_fit.goodness_of_fit_anderson_test} def plot_moments(self): for method_name in ['moment', 'changes_in_the_moment', 'relative_changes_in_the_moment']: @@ -46,12 +55,12 @@ class AltitudesStudiesVisualizerForNonStationaryModels(StudyVisualizer): ax = plt.gca() min_altitude, *_, max_altitude = self.studies.altitudes altitudes_plot = np.linspace(min_altitude, max_altitude, num=50) - for massif_id, massif_name in enumerate(self.massif_names): + for massif_name, one_fold_fit in self.massif_name_to_one_fold_fit.items(): massif_altitudes = self.studies.massif_name_to_altitudes[massif_name] ind = (min(massif_altitudes) <= altitudes_plot) & (altitudes_plot <= max(massif_altitudes)) massif_altitudes_plot = altitudes_plot[ind] - one_fold_fit = self.massif_name_to_one_fold_fit[massif_name] values = one_fold_fit.__getattribute__(method_name)(massif_altitudes_plot, order=order) + massif_id = self.massif_name_to_massif_id[massif_name] plot_against_altitude(massif_altitudes_plot, ax, massif_id, massif_name, values) # Plot settings ax.legend(prop={'size': 7}, ncol=3) @@ -128,3 +137,6 @@ class AltitudesStudiesVisualizerForNonStationaryModels(StudyVisualizer): label='Shape plot for {} {}'.format(SCM_STUDY_CLASS_TO_ABBREVIATION[type(self.study)], self.study.variable_unit), add_x_label=False, graduation=0.1, massif_name_to_text=self.massif_name_to_name) + + def plot_altitude_change_of_sign(self): + pass diff --git a/projects/altitude_spatial_model/altitudes_fit/one_fold_analysis/one_fold_fit.py b/projects/altitude_spatial_model/altitudes_fit/one_fold_analysis/one_fold_fit.py index d288900c..1a64a7a3 100644 --- a/projects/altitude_spatial_model/altitudes_fit/one_fold_analysis/one_fold_fit.py +++ b/projects/altitude_spatial_model/altitudes_fit/one_fold_analysis/one_fold_fit.py @@ -4,6 +4,7 @@ import rpy2 from cached_property import cached_property from scipy.stats import chi2 +from extreme_fit.distribution.gumbel.gumbel_gof import goodness_of_fit_anderson from extreme_fit.estimator.margin_estimator.utils import fitted_linear_margin_estimator_short from extreme_fit.function.param_function.polynomial_coef import PolynomialAllCoef, PolynomialCoef from extreme_fit.model.margin_model.polynomial_margin_model.gev_altitudinal_models import StationaryAltitudinal @@ -11,13 +12,14 @@ from extreme_fit.model.margin_model.polynomial_margin_model.gumbel_altitudinal_m StationaryGumbelAltitudinal, AbstractGumbelAltitudinalModel from extreme_fit.model.margin_model.utils import MarginFitMethod from root_utils import classproperty +from spatio_temporal_dataset.dataset.abstract_dataset import AbstractDataset class OneFoldFit(object): SIGNIFICANCE_LEVEL = 0.05 best_estimator_minimizes_total_aic = False - def __init__(self, massif_name, dataset, models_classes, fit_method=MarginFitMethod.extremes_fevd_mle): + def __init__(self, massif_name: str, dataset: AbstractDataset, models_classes, fit_method=MarginFitMethod.extremes_fevd_mle): self.massif_name = massif_name self.dataset = dataset self.models_classes = models_classes @@ -88,20 +90,7 @@ class OneFoldFit(object): @cached_property def sorted_estimators(self): estimators = list(self.model_class_to_estimator.values()) - # estimators_with_finite_aic = [] - # for estimator in estimators: - # # try: - # aic = estimator.aic() - # npt.assert_almost_equal(estimator.result_from_model_fit.aic, aic, decimal=5) - # print(self.massif_name, estimator.margin_model.name_str, aic) - # estimators_with_finite_aic.append(estimator) - # # except (AssertionError, rpy2.rinterface.RRuntimeError) as e: - # # raise e - # # print(self.massif_name, estimator.margin_model.name_str, 'infinite aic') - # # print(e) - # print('Summary {}: {}/{} fitted'.format(self.massif_name, len(estimators_with_finite_aic), len(estimators))) - sorted_estimators = sorted([estimator for estimator in estimators], - key=lambda e: e.aic()) + sorted_estimators = sorted([estimator for estimator in estimators], key=lambda e: e.aic()) return sorted_estimators @property @@ -168,3 +157,25 @@ class OneFoldFit(object): return False else: return self.likelihood_ratio > chi2.ppf(q=1 - self.SIGNIFICANCE_LEVEL, df=self.degree_freedom_chi2) + + @property + def goodness_of_fit_anderson_test(self): + quantiles = self.compute_empirical_quantiles(estimator=self.best_estimator) + goodness_of_fit_anderson_test = goodness_of_fit_anderson(quantiles, self.SIGNIFICANCE_LEVEL) + if not goodness_of_fit_anderson_test: + print('{} with {} does not pass the anderson test'.format(self.massif_name, self.folder_for_plots)) + return goodness_of_fit_anderson_test + + def compute_empirical_quantiles(self, estimator): + empirical_quantiles = [] + df_maxima_gev = self.dataset.observations.df_maxima_gev + df_coordinates = self.dataset.coordinates.df_coordinates() + for coordinate, maximum in zip(df_coordinates.values.copy(), df_maxima_gev.values.copy()): + gev_param = estimator.function_from_fit.get_params( + coordinate=coordinate, + is_transformed=False) + assert len(maximum) == 1 + maximum_standardized = gev_param.gumbel_standardization(maximum[0]) + empirical_quantiles.append(maximum_standardized) + empirical_quantiles = sorted(empirical_quantiles) + return empirical_quantiles diff --git a/projects/altitude_spatial_model/altitudes_fit/one_fold_analysis/plot_total_aic.py b/projects/altitude_spatial_model/altitudes_fit/one_fold_analysis/plot_total_aic.py index bd28d16e..0c9bba51 100644 --- a/projects/altitude_spatial_model/altitudes_fit/one_fold_analysis/plot_total_aic.py +++ b/projects/altitude_spatial_model/altitudes_fit/one_fold_analysis/plot_total_aic.py @@ -10,6 +10,13 @@ from projects.altitude_spatial_model.altitudes_fit.one_fold_analysis.one_fold_fi from projects.exceeding_snow_loads.utils import dpi_paper1_figure +def plot_individual_aic(visualizer): + OneFoldFit.best_estimator_minimizes_total_aic = False + visualizer.plot_moments() + # visualizer.plot_best_coef_maps() + visualizer.plot_shape_map() + + def plot_total_aic(model_classes, visualizer): # Compute the mean AIC for each model_class OneFoldFit.best_estimator_minimizes_total_aic = True @@ -35,7 +42,7 @@ def plot_total_aic(model_classes, visualizer): best_model_class_for_total_aic = sorted_model_class[0] for one_fold_fit in visualizer.massif_name_to_one_fold_fit.values(): one_fold_fit.best_estimator_class_for_total_aic = best_model_class_for_total_aic - # Plot the ranking of the + # Plot the ranking of the model based on their total aic plot_total_aic_repartition(visualizer, sorted_labels, sorted_scores) # Plot the results for the model that minimizes the mean aic visualizer.plot_moments() @@ -43,7 +50,6 @@ def plot_total_aic(model_classes, visualizer): visualizer.plot_best_coef_maps() - def plot_total_aic_repartition(visualizer, labels, scores): """ Plot a single trend curves -- GitLab