diff --git a/experiment/meteo_france_data/scm_models_data/visualization/study_visualization/study_visualizer.py b/experiment/meteo_france_data/scm_models_data/visualization/study_visualization/study_visualizer.py index ad99f02945112a21467e7052786e7a98654880ed..7b5d3c9f33379c8f804e3aa9faed5bb657c8562e 100644 --- a/experiment/meteo_france_data/scm_models_data/visualization/study_visualization/study_visualizer.py +++ b/experiment/meteo_france_data/scm_models_data/visualization/study_visualization/study_visualizer.py @@ -46,18 +46,33 @@ from utils import get_display_name_from_object_type, VERSION_TIME, float_to_str_ BLOCK_MAXIMA_DISPLAY_NAME = 'block maxima ' -class StudyVisualizer(object): +class VisualizationParameters(object): + + def __init__(self, save_to_file=False, only_one_graph=False, only_first_row=False, show=True): + self.only_first_row = only_first_row + self.only_one_graph = only_one_graph + self.save_to_file = save_to_file + + # PLOT ARGUMENTS + self.show = False if self.save_to_file else show + if self.only_one_graph: + self.figsize = (6.0, 4.0) + elif self.only_first_row: + self.figsize = (8.0, 6.0) + else: + self.figsize = (16.0, 10.0) + self.subplot_space = 0.5 + self.coef_zoom_map = 1 + + +class StudyVisualizer(VisualizationParameters): def __init__(self, study: AbstractStudy, show=True, save_to_file=False, only_one_graph=False, only_first_row=False, vertical_kde_plot=False, year_for_kde_plot=None, plot_block_maxima_quantiles=False, - temporal_non_stationarity=False, - transformation_class=None, - verbose=False, - multiprocessing=False, - complete_non_stationary_trend_analysis=False, - normalization_under_one_observations=True, + temporal_non_stationarity=False, transformation_class=None, verbose=False, multiprocessing=False, + complete_non_stationary_trend_analysis=False, normalization_under_one_observations=True, score_class=MeanScore): - + super().__init__(save_to_file, only_one_graph, only_first_row, show) self.nb_cores = 7 self.massif_id_to_smooth_maxima = {} self.temporal_non_stationarity = temporal_non_stationarity @@ -90,17 +105,6 @@ class StudyVisualizer(object): self.score_class = score_class self.score = self.score_class(self.number_of_top_values) # type: AbstractTrendScore - # PLOT ARGUMENTS - self.show = False if self.save_to_file else show - if self.only_one_graph: - self.figsize = (6.0, 4.0) - elif self.only_first_row: - self.figsize = (8.0, 6.0) - else: - self.figsize = (16.0, 10.0) - self.subplot_space = 0.5 - self.coef_zoom_map = 1 - # Modify some class attributes # Remove some assert AbstractParamFunction.OUT_OF_BOUNDS_ASSERT = False diff --git a/experiment/meteo_france_data/stations_data/comparison_analysis.py b/experiment/meteo_france_data/stations_data/comparison_analysis.py index daf5e81938bbc58c188ef92114bd3977e0b4f4f6..c02d2b8dec674c7006eb1e0c8365b1abdabc032b 100644 --- a/experiment/meteo_france_data/stations_data/comparison_analysis.py +++ b/experiment/meteo_france_data/stations_data/comparison_analysis.py @@ -5,7 +5,7 @@ from typing import List from cached_property import cached_property from experiment.meteo_france_data.scm_models_data.safran.safran import SafranSnowfall -from experiment.meteo_france_data.scm_models_data.visualization import \ +from experiment.meteo_france_data.scm_models_data.visualization.study_visualization.main_study_visualizer import \ ALL_ALTITUDES from extreme_estimator.estimator.full_estimator.abstract_full_estimator import \ FullEstimatorInASingleStepWithSmoothMargin @@ -22,6 +22,10 @@ from spatio_temporal_dataset.spatio_temporal_observations.abstract_spatio_tempor from test.test_utils import load_test_max_stable_models from utils import get_display_name_from_object_type +REANALYSE_STR = 'reanalyse' +ALTITUDE_COLUMN_NAME = 'ALTITUDE' +MASSIF_COLUMN_NAME = 'MASSIF_PRA' + DATA_PATH = r'/home/erwan/Documents/projects/spatiotemporalextremes/local/spatio_temporal_datasets/Johan_data/PrecipitationsAvalanches_MaxPrecipit_ParPoste_ParHiver_traites.xls' import pandas as pd @@ -30,7 +34,11 @@ import pandas as pd class ComparisonAnalysis(object): def __init__(self, altitude=900, nb_border_data_to_remove=0, normalize_observations=True, margin=150, - transformation_class=BetweenZeroAndOneNormalization, exclude_some_massifs_from_the_intersection=False): + transformation_class=BetweenZeroAndOneNormalization, exclude_some_massifs_from_the_intersection=False, + keep_only_station_without_nan_values=True, + one_station_per_massif=True): + self.keep_only_station_without_nan_values = keep_only_station_without_nan_values + self.one_station_per_massif = one_station_per_massif self.exclude_some_massifs_from_the_intersection = exclude_some_massifs_from_the_intersection self.normalize_observations = normalize_observations self.altitude = altitude @@ -42,28 +50,33 @@ class ComparisonAnalysis(object): ##################### STATION ATTRIBUTES ############################ - def load_main_df_for_altitude(self): + def load_main_df(self): df = pd.read_excel(DATA_PATH, sheet_name='max alpes 2500m presentes') df = df.iloc[:78] - ind_altitude = self.altitude - self.margin < df['ALTITUDE'] - ind_altitude &= df['ALTITUDE'] <= self.altitude + self.margin + + ind_altitude = self.altitude - self.margin < df[ALTITUDE_COLUMN_NAME] + ind_altitude &= df[ALTITUDE_COLUMN_NAME] <= self.altitude + self.margin df = df.loc[ind_altitude] # type: pd.DataFrame - # Remove dulpicate for commune, Pellafol we should keep the first, i.e. 930 which has more data than the other + # Remove dupicate for commune, Pellafol we should keep the first, i.e. 930 which has more data than the other df.drop_duplicates(subset='COMMUNE', inplace=True) df.set_index('COMMUNE', inplace=True) df = df.iloc[:, 3:] # Get values df_values = self.get_values(df) - # Keep only stations who have not any Nan values - ind = ~df_values.isna().any(axis=1) - df = df.loc[ind] + if self.keep_only_station_without_nan_values: + # Keep only stations who have not any Nan values + ind = ~df_values.isna().any(axis=1) + df = df.loc[ind] return df - def load_main_df_for_altitude_and_good_massifs(self): - df = self.load_main_df_for_altitude().copy() + def load_main_df_stations_that_belong_to_intersection_massifs(self): + df = self.load_main_df().copy() # Keep only the massif that also belong to the study (so that the normalization are roughly comparable) ind_massif = df['MASSIF_PRA'].isin(self.intersection_massif_names) - df = df.loc[ind_massif] + return df.loc[ind_massif] + + def load_main_df_one_station_per_massif(self): + df = self.load_main_df_stations_that_belong_to_intersection_massifs() # Keep only one station per massif, to have the same number of points (the first by default) df = df.drop_duplicates(subset='MASSIF_PRA') # Sort all the DataFrame so that the massif order correspond @@ -72,9 +85,16 @@ class ComparisonAnalysis(object): df.drop(labels='MASSIF_IDX', axis=1, inplace=True) return df + @property + def load_df_stations(self): + if self.one_station_per_massif: + return self.load_main_df_one_station_per_massif() + else: + return self.load_main_df_station_intersection_clean() + @property def stations_coordinates(self): - df = self.load_main_df_for_altitude_and_good_massifs() + df = self.load_main_df() df = df.loc[:, ['LAMBERTX', 'LAMBERTY']] df.rename({'LAMBERTX': AbstractCoordinates.COORDINATE_X, 'LAMBERTY': AbstractCoordinates.COORDINATE_Y}, axis=1, inplace=True) @@ -83,7 +103,7 @@ class ComparisonAnalysis(object): @property def stations_observations(self): - df = self.load_main_df_for_altitude_and_good_massifs() + df = self.load_main_df() df = self.get_values(df) obs = AbstractSpatioTemporalObservations(df_maxima_gev=df) if self.normalize_observations: @@ -104,7 +124,7 @@ class ComparisonAnalysis(object): @property def massif_names(self): - df = self.load_main_df_for_altitude() + df = self.load_main_df() return list(set(df['MASSIF_PRA'])) ##################### STUDY ATTRIBUTES ############################ @@ -112,9 +132,13 @@ class ComparisonAnalysis(object): @cached_property def study(self): # Build the study for the same years - return SafranSnowfall(altitude=self.altitude, nb_consecutive_days=1, year_min=self.year_min, + return SafranSnowfall(altitude=self.altitude, nb_consecutive_days=3, year_min=self.year_min, year_max=self.year_max + 1) + @property + def nb_massifs(self): + return len(self.intersection_massif_names) + @cached_property def intersection_massif_names(self): intersection_of_massif_names = list(set(self.massif_names).intersection(set(self.study.study_massif_names))) @@ -168,13 +192,9 @@ class ComparisonAnalysis(object): # After a short analysis (run df_altitude to check) we decided on the altitude # 900 and 1200 seems to be the best altitudes - def load_main_df(self): + def reduce_altitude(self, altitude=900) -> pd.Series: df = pd.read_excel(DATA_PATH, sheet_name='max alpes 2500m presentes') df = df.iloc[:78, 4:] - return df - - def reduce_altitude(self, altitude=900) -> pd.Series: - df = self.load_main_df() margin = 150 ind_altitude = altitude - margin < df['ALTITUDE'] ind_altitude &= df['ALTITUDE'] <= altitude + margin @@ -206,6 +226,35 @@ class ComparisonAnalysis(object): # Finally I might prefer the altitude 900, which seems to have less missing values print(df) + ################## MERGING SOME ATTRIBUTES ################### + def load_main_df_station_intersection_clean(self): + assert not self.one_station_per_massif + df = self.load_main_df_stations_that_belong_to_intersection_massifs().loc[:, ['MASSIF_PRA', 'ALTITUDE']] + df_coord = self.stations_coordinates.df_all_coordinates + df_obs = self.stations_observations.df_maxima_gev + return pd.concat([df, df_coord, df_obs], axis=1) + + def load_main_df_study_intersection_clean(self): + assert not self.one_station_per_massif + df_coord = self.study_dataset_lambert.coordinates.df_all_coordinates + df_obs = self.study_observations.df_maxima_gev + study_index = pd.Series(df_coord.index) + ' ' + REANALYSE_STR + df = pd.DataFrame({'MASSIF_PRA': df_coord.index.values}, index=study_index) + df['ALTITUDE'] = self.altitude + df_coord.index = study_index + df_obs.index = study_index + df_study = pd.concat([df, df_coord, df_obs], axis=1) + return df_study + + def load_main_df_merged_intersection_clean(self): + df_stations = self.load_main_df_station_intersection_clean() + df_study = self.load_main_df_study_intersection_clean() + diff = set(df_study.columns).symmetric_difference(set(df_stations.columns)) + assert not diff, diff + df = pd.concat([df_study.loc[:, df_stations.columns], df_stations], axis=0) + df = df.sort_values([MASSIF_COLUMN_NAME]) + return df + ##################### COMPARE THE TWO DATASETS BY FITTING THE SAME MODEL ############################ def spatial_comparison(self, margin_model_class, default_covariance_function=CovarianceFunction.powexp): @@ -232,7 +281,8 @@ class ComparisonAnalysis(object): print(estimator.result_from_fit.other_coef_dict) estimators.append(estimator) # Compare the sign of them margin coefficient for the estimators - coefs = [{k: v for k, v in e.result_from_fit.margin_coef_dict.items() if 'Coeff1' not in k} for e in estimators] - different_sign = [k for k, v in coefs[0].items() if np.sign(coefs[1][k]) != np.sign(v) ] - print('All linear coefficient have the same sign: {}, different_signs for: {}'.format(len(different_sign) == 0, different_sign)) - + coefs = [{k: v for k, v in e.result_from_fit.margin_coef_dict.items() if 'Coeff1' not in k} for e in + estimators] + different_sign = [k for k, v in coefs[0].items() if np.sign(coefs[1][k]) != np.sign(v)] + print('All linear coefficient have the same sign: {}, different_signs for: {}'.format( + len(different_sign) == 0, different_sign)) diff --git a/experiment/meteo_france_data/stations_data/main_station_comparison.py b/experiment/meteo_france_data/stations_data/main_station_comparison.py index bbfcfd17c077418f3ae4d3a253794e7195e27a64..18589d823ddec369b11fc4986ba8bc4eb9121737 100644 --- a/experiment/meteo_france_data/stations_data/main_station_comparison.py +++ b/experiment/meteo_france_data/stations_data/main_station_comparison.py @@ -1,7 +1,26 @@ +from experiment.meteo_france_data.scm_models_data.visualization.study_visualization.main_study_visualizer import \ + ALL_ALTITUDES_WITH_20_STATIONS_AT_LEAST from experiment.meteo_france_data.stations_data.comparison_analysis import ComparisonAnalysis +from experiment.meteo_france_data.stations_data.visualization.comparisons_visualization.comparisons_visualization import \ + ComparisonsVisualization + + +def visualize_full_comparison(): + vizu = ComparisonsVisualization(altitudes=ALL_ALTITUDES_WITH_20_STATIONS_AT_LEAST, margin=150) + vizu.visualize_maximum() + + +def visualize_fast_comparison(): + vizu = ComparisonsVisualization(altitudes=[900]) + vizu.visualize_maximum() + + +def example(): + vizu = ComparisonsVisualization(altitudes=[900]) + vizu._visualize_maximum(vizu.comparisons[0], 'Beaufortain', show=True) if __name__ == '__main__': - comparison = ComparisonAnalysis(altitude=900, nb_border_data_to_remove=nb, margin=150, - exclude_some_massifs_from_the_intersection=nb == 2, - transformation_class=transformation_class, - normalize_observations=True) \ No newline at end of file + # visualize_fast_comparison() + visualize_full_comparison() + # example() + diff --git a/experiment/meteo_france_data/stations_data/visualization/comparisons_visualization/comparisons_visualization.py b/experiment/meteo_france_data/stations_data/visualization/comparisons_visualization/comparisons_visualization.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..f399d6a2daf46bfa6c14cbab94fabb8fd6b2e258 100644 --- a/experiment/meteo_france_data/stations_data/visualization/comparisons_visualization/comparisons_visualization.py +++ b/experiment/meteo_france_data/stations_data/visualization/comparisons_visualization/comparisons_visualization.py @@ -0,0 +1,90 @@ +from typing import Dict, List + +import math +import matplotlib.pyplot as plt +import pandas as pd + +from experiment.meteo_france_data.scm_models_data.visualization.study_visualization.study_visualizer import \ + VisualizationParameters +from experiment.meteo_france_data.stations_data.comparison_analysis import ComparisonAnalysis, MASSIF_COLUMN_NAME, \ + REANALYSE_STR, ALTITUDE_COLUMN_NAME +from itertools import chain + + +class ComparisonsVisualization(VisualizationParameters): + + def __init__(self, altitudes=None, keep_only_station_without_nan_values=False, margin=150): + self.keep_only_station_without_nan_values = keep_only_station_without_nan_values + if self.keep_only_station_without_nan_values: + self.nb_columns = 5 + else: + self.nb_columns = 7 + # Load altitude_to_comparison dictionary + super().__init__() + self.altitude_to_comparison = {} # type: Dict[int, ComparisonAnalysis] + for altitude in altitudes: + comparison = ComparisonAnalysis(altitude=altitude, + normalize_observations=False, + one_station_per_massif=False, + transformation_class=None, + margin=margin, + keep_only_station_without_nan_values=keep_only_station_without_nan_values) + self.altitude_to_comparison[altitude] = comparison + + @property + def comparisons(self) -> List[ComparisonAnalysis]: + return list(self.altitude_to_comparison.values()) + + @property + def nb_plot(self): + return sum([c.nb_massifs for c in self.comparisons]) + + @property + def massifs(self): + return sorted(set(chain(*[c.intersection_massif_names for c in self.comparisons]))) + + def _visualize_main(self, visualization_ax_function, title=''): + nb_rows = math.ceil(self.nb_plot / self.nb_columns) + fig, axes = plt.subplots(nb_rows, self.nb_columns, figsize=self.figsize) + fig.subplots_adjust(hspace=self.subplot_space, wspace=self.subplot_space) + axes = axes.flatten() + + ax_idx = 0 + for massif in self.massifs: + for c in [c for c in self.comparisons if massif in c.intersection_massif_names]: + visualization_ax_function(c, massif, axes[ax_idx]) + ax_idx += 1 + plt.suptitle(title) + plt.show() + + def visualize_maximum(self): + return self._visualize_main(self._visualize_maximum, 'Recent trend of Annual maxima of snowfall') + + def _visualize_maximum(self, comparison: ComparisonAnalysis, massif, ax=None, show=False): + if ax is None: + _, ax = plt.subplots(1, 1, figsize=self.figsize) + + df = comparison.load_main_df_merged_intersection_clean() + ind = df[MASSIF_COLUMN_NAME] == massif + df.drop([MASSIF_COLUMN_NAME], axis=1, inplace=True) + assert sum(ind) > 0 + df = df.loc[ind] # type: pd.DataFrame + colors_station = ['r', 'tab:orange', 'tab:purple', 'm', 'k'] + for color, (i, s) in zip(colors_station, df.iterrows()): + label = i + label += ' ({}m)'.format(s[ALTITUDE_COLUMN_NAME]) + s_values = s.iloc[3:].to_dict() + plot_color = color if REANALYSE_STR not in label else 'g' + ax.plot(list(s_values.keys()), list(s_values.values()), label=label, color=plot_color) + ax.legend(prop={'size': 5}) + ax.set_title('{} at {}'.format(massif, comparison.altitude)) + + if show: + + plt.show() + + def visualize_gev(self): + return self._visualize_main(self._visualize_gev) + + def _visualize_gev(self): + pass