From 48f1ba1b6d1879398c2dc5b69ea6b452122c057a Mon Sep 17 00:00:00 2001 From: Le Roux Erwan <erwan.le-roux@irstea.fr> Date: Tue, 28 May 2019 17:10:52 +0200 Subject: [PATCH] [METEO FRANCE DATA][STATIONS] add final version of the first analysis of Reanalysis versus station measurements results --- .../stations_data/comparison_analysis.py | 150 ++++++++++++++---- .../margin_model/linear_margin_model.py | 8 + .../abstract_spatio_temporal_observations.py | 3 + 3 files changed, 134 insertions(+), 27 deletions(-) diff --git a/experiment/meteo_france_data/stations_data/comparison_analysis.py b/experiment/meteo_france_data/stations_data/comparison_analysis.py index 5a340873..2277d51b 100644 --- a/experiment/meteo_france_data/stations_data/comparison_analysis.py +++ b/experiment/meteo_france_data/stations_data/comparison_analysis.py @@ -4,11 +4,11 @@ from cached_property import cached_property from experiment.meteo_france_data.scm_models_data.safran.safran import SafranSnowfall from experiment.meteo_france_data.visualization.study_visualization.main_study_visualizer import \ - ALL_ALTITUDES + ALL_ALTITUDES, ALL_ALTITUDES_WITH_20_STATIONS_AT_LEAST from extreme_estimator.estimator.full_estimator.abstract_full_estimator import \ FullEstimatorInASingleStepWithSmoothMargin from extreme_estimator.extreme_models.margin_model.linear_margin_model import LinearAllParametersAllDimsMarginModel, \ - LinearLocationAllDimsMarginModel + LinearLocationAllDimsMarginModel, LinearShapeAllDimsMarginModel from extreme_estimator.extreme_models.max_stable_model.abstract_max_stable_model import CovarianceFunction from spatio_temporal_dataset.coordinates.abstract_coordinates import AbstractCoordinates from spatio_temporal_dataset.coordinates.spatial_coordinates.abstract_spatial_coordinates import \ @@ -28,31 +28,34 @@ import pandas as pd class ComparisonAnalysis(object): - def __init__(self, altitude=900): - assert altitude in [900, 1200] + def __init__(self, altitude=900, nb_border_data_to_remove=0, normalize_observations=True, margin=150, + transformation_class=BetweenZeroAndOneNormalization, exclude_some_massifs_from_the_intersection=False): + self.exclude_some_massifs_from_the_intersection = exclude_some_massifs_from_the_intersection + self.normalize_observations = normalize_observations self.altitude = altitude - self.transformation_class = BetweenZeroAndOneNormalization - self.year_min = 1958 - self.year_max = 2004 + self.margin = margin + self.transformation_class = transformation_class + self.nb_border_data_to_remove = nb_border_data_to_remove + self.year_min = 1958 + nb_border_data_to_remove + self.year_max = 2004 - nb_border_data_to_remove ##################### STATION ATTRIBUTES ############################ - def load_main_df(self): - df = pd.read_excel(DATA_PATH, sheet_name='max alpes 2500m presentes') - df = df.iloc[:78, 4:] - return df - def load_main_df_for_altitude(self): df = pd.read_excel(DATA_PATH, sheet_name='max alpes 2500m presentes') df = df.iloc[:78] - margin = 150 - ind_altitude = self.altitude - margin < df['ALTITUDE'] - ind_altitude &= df['ALTITUDE'] <= self.altitude + margin + ind_altitude = self.altitude - self.margin < df['ALTITUDE'] + ind_altitude &= df['ALTITUDE'] <= self.altitude + self.margin df = df.loc[ind_altitude] # type: pd.DataFrame # Remove dulpicate for commune, Pellafol we should keep the first, i.e. 930 which has more data than the other df.drop_duplicates(subset='COMMUNE', inplace=True) df.set_index('COMMUNE', inplace=True) df = df.iloc[:, 3:] + # Get values + df_values = self.get_values(df) + # Keep only stations who have not any Nan values + ind = ~df_values.isna().any(axis=1) + df = df.loc[ind] return df def load_main_df_for_altitude_and_good_massifs(self): @@ -60,6 +63,8 @@ class ComparisonAnalysis(object): # Keep only the massif that also belong to the study (so that the normalization are roughly comparable) ind_massif = df['MASSIF_PRA'].isin(self.intersection_massif_names) df = df.loc[ind_massif] + # Keep only one station per massif, to have the same number of points (the first by default) + df = df.drop_duplicates(subset='MASSIF_PRA') return df @property @@ -74,10 +79,17 @@ class ComparisonAnalysis(object): @property def stations_observations(self): df = self.load_main_df_for_altitude_and_good_massifs() + df = self.get_values(df) + obs = AbstractSpatioTemporalObservations(df_maxima_gev=df) + if self.normalize_observations: + obs.normalize() + return obs + + def get_values(self, df): df = df.iloc[:, 7:] df.columns = df.columns.astype(int) df = df.loc[:, self.year_min:self.year_max] - return AbstractSpatioTemporalObservations(df_maxima_gev=df) + return df @property def station_dataset(self): @@ -95,13 +107,23 @@ class ComparisonAnalysis(object): @cached_property def study(self): # Build the study for the same years - return SafranSnowfall(altitude=self.altitude, nb_consecutive_days=1, year_min=self.year_min, year_max=self.year_max+1) + return SafranSnowfall(altitude=self.altitude, nb_consecutive_days=1, year_min=self.year_min, + year_max=self.year_max + 1) @cached_property def intersection_massif_names(self): intersection_of_massif_names = list(set(self.massif_names).intersection(set(self.study.study_massif_names))) diff_due_to_wrong_names = set(self.massif_names) - set(self.study.study_massif_names) assert not diff_due_to_wrong_names, diff_due_to_wrong_names + + # remove on purpose some massifs (to understand if it the massifs that change the results or the year that were removed) + # this created big differences in the results for altitude=900m margin=150m and nb=2 + # maybe this is due to a difference between the massif coordinate and the station (that belong to the massif) coordinate + # or this might be due to a big difference between the observations + if self.exclude_some_massifs_from_the_intersection: + massifs_to_remove = ['Mercantour'] + intersection_of_massif_names = list(set(intersection_of_massif_names) - set(massifs_to_remove)) + return intersection_of_massif_names def study_coordinates(self, use_study_coordinate_with_latitude_and_longitude=True): @@ -120,23 +142,32 @@ class ComparisonAnalysis(object): observations = self.study.observations_annual_maxima maxima_gev_of_interest = observations.df_maxima_gev.loc[self.intersection_massif_names] observations.df_maxima_gev = maxima_gev_of_interest + if self.normalize_observations: + observations.normalize() return observations @property def study_dataset_latitude_longitude(self): dataset = AbstractDataset(observations=self.study_observations, - coordinates=self.study_coordinates(use_study_coordinate_with_latitude_and_longitude=True)) + coordinates=self.study_coordinates( + use_study_coordinate_with_latitude_and_longitude=True)) return dataset @property def study_dataset_lambert(self): dataset = AbstractDataset(observations=self.study_observations, - coordinates=self.study_coordinates(use_study_coordinate_with_latitude_and_longitude=False)) + coordinates=self.study_coordinates( + use_study_coordinate_with_latitude_and_longitude=False)) return dataset # After a short analysis (run df_altitude to check) we decided on the altitude # 900 and 1200 seems to be the best altitudes + def load_main_df(self): + df = pd.read_excel(DATA_PATH, sheet_name='max alpes 2500m presentes') + df = df.iloc[:78, 4:] + return df + def reduce_altitude(self, altitude=900) -> pd.Series: df = self.load_main_df() margin = 150 @@ -149,11 +180,13 @@ class ComparisonAnalysis(object): d['Nb stations'] = len(df) # Number of massifs d['Nb massifs'] = len(set(df['MASSIF_PRA'])) - # Mean number of non-Nan values + df_values = df.iloc[:, 7:] df_values_from_1958 = df_values.iloc[:, 13:] - d['Percentage of Nan'] = df_values_from_1958.isna().mean().mean() - print(df_values_from_1958.columns) + # Mean number of non-Nan values + d['% of Nan'] = df_values_from_1958.isna().mean().mean() + # Number of lines with only Nan + d['Lines w Nan'] = df_values_from_1958.isna().all(axis=1).sum() return pd.Series(d) def altitude_short_analysis(self): @@ -170,18 +203,81 @@ class ComparisonAnalysis(object): ##################### COMPARE THE TWO DATASETS BY FITTING THE SAME MODEL ############################ - def spatial_comparison(self): + def spatial_comparison(self, margin_model_class): max_stable_models = load_test_max_stable_models(default_covariance_function=CovarianceFunction.powexp) for max_stable_model in [max_stable_models[1], max_stable_models[-2]]: print('\n\n', get_display_name_from_object_type(type(max_stable_model))) - for dataset in [self.station_dataset, self.study_dataset_latitude_longitude, self.study_dataset_lambert][1:]: - margin_model = LinearLocationAllDimsMarginModel(coordinates=dataset.coordinates) + for dataset in [self.station_dataset, self.study_dataset_lambert]: + margin_model = margin_model_class(coordinates=dataset.coordinates) estimator = FullEstimatorInASingleStepWithSmoothMargin(dataset=dataset, margin_model=margin_model, max_stable_model=max_stable_model) estimator.fit() print(estimator.margin_function_fitted.coef_dict) + # print(estimato) + + +def choice_of_altitude_and_nb_border_data_to_remove_to_get_data_without_nan(): + for margin in [50, 100, 150, 200, 250, 300][2:3]: + for altitude in [900, 1200, 1800][-1:]: + for nb in range(1, 15): + s = ComparisonAnalysis(altitude=altitude, nb_border_data_to_remove=nb, margin=margin) + print(margin, altitude, nb, 'nb massifs', len(s.intersection_massif_names), 'nb stations', + len(s.stations_observations), 'nb observations', s.stations_observations.nb_obs, + s.study_observations.nb_obs, + s.stations_coordinates.index) + + +def run_comparison_for_optimal_parameters_for_altitude_900(): + for nb in [0, 1, 2][:]: + for transformation_class in [None, BetweenZeroAndOneNormalization][1:]: + comparison = ComparisonAnalysis(altitude=900, nb_border_data_to_remove=nb, margin=150, + exclude_some_massifs_from_the_intersection=nb == 2, + transformation_class=transformation_class, + normalize_observations=True) + print('nb:', nb, comparison.intersection_massif_names) + # margin_model_classes = [LinearShapeAllDimsMarginModel, LinearLocationAllDimsMarginModel, + # LinearAllParametersAllDimsMarginModel] + for margin_model_class in [LinearAllParametersAllDimsMarginModel]: + print(get_display_name_from_object_type(margin_model_class)) + comparison.spatial_comparison(margin_model_class) + + +""" +Comparaison données de re-analysis et données de stations + +J'ai utilisé le fichier "PrecipitationsAvalanches_MaxPrecipit_ParPoste_ParHiver_traites.xls" + +Après des analyses avec la fonction 'choice_of_altitude_and_nb_border_data_to_remove_to_get_data_without_nan' +j'ai choisis de lancer mes analyses avec: + -une altitude de 900m + -une margin de 150m (donc je selectionne toutes les stations entre 750m et 1050m). +Je ne choisis que des stations qui ont des observations complètes sur toute la periode d'observation. +et je m'asssure de n'avoir une seule station par massif (qui appartient à l intersection des massifs entre les study et les stations) + +Souvent les observations manquantes se situaient dans les premières ou dans les dernières années +j'ai donc ajouté un parametre nb_to_remove_border qui enlever ces observations (à la fois pour les study et les stations). +Ce parametre entrainent donc des datasets avec moins d observations, mais avec plus de masssifs/stations + +Par contre, dans le cas nb_to_remove=2, il y avait de grosses différences si j'incluais ou non le massif Mercantour +donc en tout attendant de mieux comprendre, j'ai prefere exclure ce massif dans ce cas + +Dans tous les cas, nb_to_remove de 0 à 2 +pour n'importe quel modele de marges +et pour un max stable BrownResnick ou ExtremalT +alors le signe des coefficient de marges selon les coordonées Lambert sont toujours les mêmes que l'on utilise les données +de reanalysis ou les données de stations +""" + + +""" +A way to improve the analysis would be to have another altitude of reference with a lot of data +But for the other altitude, we have data issues because there is a Nan in the middle of the data +Instead of removing on the side, I should remove the years that concerns as much station from the same altitude level +I should find the "optimal" years to remove +Then I should find a way to remove the same years in the study +""" if __name__ == '__main__': - s = ComparisonAnalysis(altitude=1200) - s.spatial_comparison() \ No newline at end of file + # run_comparison_for_optimal_parameters_for_altitude_900() + choice_of_altitude_and_nb_border_data_to_remove_to_get_data_without_nan() diff --git a/extreme_estimator/extreme_models/margin_model/linear_margin_model.py b/extreme_estimator/extreme_models/margin_model/linear_margin_model.py index 24016d13..7956be5c 100644 --- a/extreme_estimator/extreme_models/margin_model/linear_margin_model.py +++ b/extreme_estimator/extreme_models/margin_model/linear_margin_model.py @@ -97,11 +97,19 @@ class LinearMarginModelExample(LinearMarginModel): GevParams.LOC: [1], GevParams.SCALE: [0]}) + class LinearLocationAllDimsMarginModel(LinearMarginModel): def load_margin_functions(self, margin_function_class: type = None, gev_param_name_to_dims=None): super().load_margin_functions({GevParams.LOC: self.coordinates.coordinates_dims}) + +class LinearShapeAllDimsMarginModel(LinearMarginModel): + + def load_margin_functions(self, margin_function_class: type = None, gev_param_name_to_dims=None): + super().load_margin_functions({GevParams.SHAPE: self.coordinates.coordinates_dims}) + + class LinearAllParametersAllDimsMarginModel(LinearMarginModel): def load_margin_functions(self, margin_function_class: type = None, gev_param_name_to_dims=None): diff --git a/spatio_temporal_dataset/spatio_temporal_observations/abstract_spatio_temporal_observations.py b/spatio_temporal_dataset/spatio_temporal_observations/abstract_spatio_temporal_observations.py index 5825458b..a772e793 100644 --- a/spatio_temporal_dataset/spatio_temporal_observations/abstract_spatio_temporal_observations.py +++ b/spatio_temporal_dataset/spatio_temporal_observations/abstract_spatio_temporal_observations.py @@ -125,6 +125,9 @@ class AbstractSpatioTemporalObservations(object): def __str__(self) -> str: return self._df_maxima.__str__() + def __len__(self): + return self._df_maxima.__len__() + def print_summary(self): # Write a summary of observations df = self.df_maxima_gev -- GitLab