From 48f1ba1b6d1879398c2dc5b69ea6b452122c057a Mon Sep 17 00:00:00 2001
From: Le Roux Erwan <erwan.le-roux@irstea.fr>
Date: Tue, 28 May 2019 17:10:52 +0200
Subject: [PATCH] [METEO FRANCE DATA][STATIONS] add final version of the first
 analysis of Reanalysis versus station measurements results

---
 .../stations_data/comparison_analysis.py      | 150 ++++++++++++++----
 .../margin_model/linear_margin_model.py       |   8 +
 .../abstract_spatio_temporal_observations.py  |   3 +
 3 files changed, 134 insertions(+), 27 deletions(-)

diff --git a/experiment/meteo_france_data/stations_data/comparison_analysis.py b/experiment/meteo_france_data/stations_data/comparison_analysis.py
index 5a340873..2277d51b 100644
--- a/experiment/meteo_france_data/stations_data/comparison_analysis.py
+++ b/experiment/meteo_france_data/stations_data/comparison_analysis.py
@@ -4,11 +4,11 @@ from cached_property import cached_property
 
 from experiment.meteo_france_data.scm_models_data.safran.safran import SafranSnowfall
 from experiment.meteo_france_data.visualization.study_visualization.main_study_visualizer import \
-    ALL_ALTITUDES
+    ALL_ALTITUDES, ALL_ALTITUDES_WITH_20_STATIONS_AT_LEAST
 from extreme_estimator.estimator.full_estimator.abstract_full_estimator import \
     FullEstimatorInASingleStepWithSmoothMargin
 from extreme_estimator.extreme_models.margin_model.linear_margin_model import LinearAllParametersAllDimsMarginModel, \
-    LinearLocationAllDimsMarginModel
+    LinearLocationAllDimsMarginModel, LinearShapeAllDimsMarginModel
 from extreme_estimator.extreme_models.max_stable_model.abstract_max_stable_model import CovarianceFunction
 from spatio_temporal_dataset.coordinates.abstract_coordinates import AbstractCoordinates
 from spatio_temporal_dataset.coordinates.spatial_coordinates.abstract_spatial_coordinates import \
@@ -28,31 +28,34 @@ import pandas as pd
 
 class ComparisonAnalysis(object):
 
-    def __init__(self, altitude=900):
-        assert altitude in [900, 1200]
+    def __init__(self, altitude=900, nb_border_data_to_remove=0, normalize_observations=True, margin=150,
+                 transformation_class=BetweenZeroAndOneNormalization, exclude_some_massifs_from_the_intersection=False):
+        self.exclude_some_massifs_from_the_intersection = exclude_some_massifs_from_the_intersection
+        self.normalize_observations = normalize_observations
         self.altitude = altitude
-        self.transformation_class = BetweenZeroAndOneNormalization
-        self.year_min = 1958
-        self.year_max = 2004
+        self.margin = margin
+        self.transformation_class = transformation_class
+        self.nb_border_data_to_remove = nb_border_data_to_remove
+        self.year_min = 1958 + nb_border_data_to_remove
+        self.year_max = 2004 - nb_border_data_to_remove
 
     ##################### STATION ATTRIBUTES ############################
 
-    def load_main_df(self):
-        df = pd.read_excel(DATA_PATH, sheet_name='max alpes 2500m presentes')
-        df = df.iloc[:78, 4:]
-        return df
-
     def load_main_df_for_altitude(self):
         df = pd.read_excel(DATA_PATH, sheet_name='max alpes 2500m presentes')
         df = df.iloc[:78]
-        margin = 150
-        ind_altitude = self.altitude - margin < df['ALTITUDE']
-        ind_altitude &= df['ALTITUDE'] <= self.altitude + margin
+        ind_altitude = self.altitude - self.margin < df['ALTITUDE']
+        ind_altitude &= df['ALTITUDE'] <= self.altitude + self.margin
         df = df.loc[ind_altitude]  # type: pd.DataFrame
         # Remove dulpicate for commune, Pellafol we should keep the first, i.e. 930 which has more data than the other
         df.drop_duplicates(subset='COMMUNE', inplace=True)
         df.set_index('COMMUNE', inplace=True)
         df = df.iloc[:, 3:]
+        # Get values
+        df_values = self.get_values(df)
+        # Keep only the stations that do not have any Nan values
+        ind = ~df_values.isna().any(axis=1)
+        df = df.loc[ind]
         return df
 
     def load_main_df_for_altitude_and_good_massifs(self):
@@ -60,6 +63,8 @@ class ComparisonAnalysis(object):
         # Keep only the massif that also belong to the study (so that the normalization are roughly comparable)
         ind_massif = df['MASSIF_PRA'].isin(self.intersection_massif_names)
         df = df.loc[ind_massif]
+        # Keep only one station per massif, to have the same number of points (the first by default)
+        df = df.drop_duplicates(subset='MASSIF_PRA')
         return df
 
     @property
@@ -74,10 +79,17 @@ class ComparisonAnalysis(object):
     @property
     def stations_observations(self):
         df = self.load_main_df_for_altitude_and_good_massifs()
+        df = self.get_values(df)
+        obs = AbstractSpatioTemporalObservations(df_maxima_gev=df)
+        if self.normalize_observations:
+            obs.normalize()
+        return obs
+
+    def get_values(self, df):
         df = df.iloc[:, 7:]
         df.columns = df.columns.astype(int)
         df = df.loc[:, self.year_min:self.year_max]
-        return AbstractSpatioTemporalObservations(df_maxima_gev=df)
+        return df
 
     @property
     def station_dataset(self):
@@ -95,13 +107,23 @@ class ComparisonAnalysis(object):
     @cached_property
     def study(self):
         # Build the study for the same years
-        return SafranSnowfall(altitude=self.altitude, nb_consecutive_days=1, year_min=self.year_min, year_max=self.year_max+1)
+        return SafranSnowfall(altitude=self.altitude, nb_consecutive_days=1, year_min=self.year_min,
+                              year_max=self.year_max + 1)
 
     @cached_property
     def intersection_massif_names(self):
         intersection_of_massif_names = list(set(self.massif_names).intersection(set(self.study.study_massif_names)))
         diff_due_to_wrong_names = set(self.massif_names) - set(self.study.study_massif_names)
         assert not diff_due_to_wrong_names, diff_due_to_wrong_names
+
+        # remove some massifs on purpose (to understand whether it is the massifs or the removed years that change the results)
+        # this created big differences in the results for altitude=900m margin=150m and nb=2
+        # maybe this is due to a difference between the massif coordinates and the coordinates of the station (that belongs to the massif)
+        # or this might be due to a big difference between the observations
+        if self.exclude_some_massifs_from_the_intersection:
+            massifs_to_remove = ['Mercantour']
+            intersection_of_massif_names = list(set(intersection_of_massif_names) - set(massifs_to_remove))
+
         return intersection_of_massif_names
 
     def study_coordinates(self, use_study_coordinate_with_latitude_and_longitude=True):
@@ -120,23 +142,32 @@ class ComparisonAnalysis(object):
         observations = self.study.observations_annual_maxima
         maxima_gev_of_interest = observations.df_maxima_gev.loc[self.intersection_massif_names]
         observations.df_maxima_gev = maxima_gev_of_interest
+        if self.normalize_observations:
+            observations.normalize()
         return observations
 
     @property
     def study_dataset_latitude_longitude(self):
         dataset = AbstractDataset(observations=self.study_observations,
-                                  coordinates=self.study_coordinates(use_study_coordinate_with_latitude_and_longitude=True))
+                                  coordinates=self.study_coordinates(
+                                      use_study_coordinate_with_latitude_and_longitude=True))
         return dataset
 
     @property
     def study_dataset_lambert(self):
         dataset = AbstractDataset(observations=self.study_observations,
-                                  coordinates=self.study_coordinates(use_study_coordinate_with_latitude_and_longitude=False))
+                                  coordinates=self.study_coordinates(
+                                      use_study_coordinate_with_latitude_and_longitude=False))
         return dataset
 
     # After a short analysis (run df_altitude to check) we decided on the altitude
     # 900 and 1200 seems to be the best altitudes
 
+    def load_main_df(self):
+        df = pd.read_excel(DATA_PATH, sheet_name='max alpes 2500m presentes')
+        df = df.iloc[:78, 4:]
+        return df
+
     def reduce_altitude(self, altitude=900) -> pd.Series:
         df = self.load_main_df()
         margin = 150
@@ -149,11 +180,13 @@ class ComparisonAnalysis(object):
         d['Nb stations'] = len(df)
         # Number of massifs
         d['Nb massifs'] = len(set(df['MASSIF_PRA']))
-        # Mean number of non-Nan values
+
         df_values = df.iloc[:, 7:]
         df_values_from_1958 = df_values.iloc[:, 13:]
-        d['Percentage of Nan'] = df_values_from_1958.isna().mean().mean()
-        print(df_values_from_1958.columns)
+        # Mean proportion of Nan values
+        d['% of Nan'] = df_values_from_1958.isna().mean().mean()
+        # Number of lines with only Nan
+        d['Lines w Nan'] = df_values_from_1958.isna().all(axis=1).sum()
         return pd.Series(d)
 
     def altitude_short_analysis(self):
@@ -170,18 +203,81 @@ class ComparisonAnalysis(object):
 
     ##################### COMPARE THE TWO DATASETS BY FITTING THE SAME MODEL ############################
 
-    def spatial_comparison(self):
+    def spatial_comparison(self, margin_model_class):
         max_stable_models = load_test_max_stable_models(default_covariance_function=CovarianceFunction.powexp)
         for max_stable_model in [max_stable_models[1], max_stable_models[-2]]:
             print('\n\n', get_display_name_from_object_type(type(max_stable_model)))
-            for dataset in [self.station_dataset, self.study_dataset_latitude_longitude, self.study_dataset_lambert][1:]:
-                margin_model = LinearLocationAllDimsMarginModel(coordinates=dataset.coordinates)
+            for dataset in [self.station_dataset, self.study_dataset_lambert]:
+                margin_model = margin_model_class(coordinates=dataset.coordinates)
                 estimator = FullEstimatorInASingleStepWithSmoothMargin(dataset=dataset,
                                                                        margin_model=margin_model,
                                                                        max_stable_model=max_stable_model)
                 estimator.fit()
                 print(estimator.margin_function_fitted.coef_dict)
+                # print(estimato)
+
+
+def choice_of_altitude_and_nb_border_data_to_remove_to_get_data_without_nan():
+    for margin in [50, 100, 150, 200, 250, 300][2:3]:
+        for altitude in [900, 1200, 1800][-1:]:
+            for nb in range(1, 15):
+                s = ComparisonAnalysis(altitude=altitude, nb_border_data_to_remove=nb, margin=margin)
+                print(margin, altitude, nb, 'nb massifs', len(s.intersection_massif_names), 'nb stations',
+                      len(s.stations_observations), 'nb observations', s.stations_observations.nb_obs,
+                      s.study_observations.nb_obs,
+                      s.stations_coordinates.index)
+
+
+def run_comparison_for_optimal_parameters_for_altitude_900():
+    for nb in [0, 1, 2][:]:
+        for transformation_class in [None, BetweenZeroAndOneNormalization][1:]:
+            comparison = ComparisonAnalysis(altitude=900, nb_border_data_to_remove=nb, margin=150,
+                                            exclude_some_massifs_from_the_intersection=nb == 2,
+                                            transformation_class=transformation_class,
+                                            normalize_observations=True)
+            print('nb:', nb, comparison.intersection_massif_names)
+            # margin_model_classes = [LinearShapeAllDimsMarginModel, LinearLocationAllDimsMarginModel,
+            #           LinearAllParametersAllDimsMarginModel]
+            for margin_model_class in [LinearAllParametersAllDimsMarginModel]:
+                print(get_display_name_from_object_type(margin_model_class))
+                comparison.spatial_comparison(margin_model_class)
+
+
+"""
+Comparaison données de re-analysis et données de stations
+
+J'ai utilisé le fichier "PrecipitationsAvalanches_MaxPrecipit_ParPoste_ParHiver_traites.xls"
+
+Après des analyses avec la fonction 'choice_of_altitude_and_nb_border_data_to_remove_to_get_data_without_nan'
+j'ai choisi de lancer mes analyses avec:
+    -une altitude de 900m
+    -une margin de 150m (donc je sélectionne toutes les stations entre 750m et 1050m).
+Je ne choisis que des stations qui ont des observations complètes sur toute la période d'observation.
+et je m'assure de n'avoir qu'une seule station par massif (qui appartient à l'intersection des massifs entre les study et les stations)
+
+Souvent les observations manquantes se situaient dans les premières ou dans les dernières années
+j'ai donc ajouté un paramètre nb_border_data_to_remove qui enlève ces observations (à la fois pour les study et les stations).
+Ce paramètre entraîne donc des datasets avec moins d'observations, mais avec plus de massifs/stations
+
+Par contre, dans le cas nb_border_data_to_remove=2, il y avait de grosses différences si j'incluais ou non le massif Mercantour
+donc en attendant de mieux comprendre, j'ai préféré exclure ce massif dans ce cas
+
+Dans tous les cas, nb_border_data_to_remove de 0 à 2
+pour n'importe quel modèle de marges
+et pour un max stable BrownResnick ou ExtremalT
+alors les signes des coefficients de marges selon les coordonnées Lambert sont toujours les mêmes que l'on utilise les données
+de reanalysis ou les données de stations
+"""
+
+
+"""
+A way to improve the analysis would be to have another reference altitude with a lot of data.
+But for the other altitudes, we have data issues because there are Nan values in the middle of the data.
+Instead of removing years at the borders, I should remove the years that concern as many stations as possible from the same altitude level.
+I should find the "optimal" years to remove.
+Then I should find a way to remove the same years in the study.
+"""
 
 if __name__ == '__main__':
-    s = ComparisonAnalysis(altitude=1200)
-    s.spatial_comparison()
\ No newline at end of file
+    # run_comparison_for_optimal_parameters_for_altitude_900()
+    choice_of_altitude_and_nb_border_data_to_remove_to_get_data_without_nan()
diff --git a/extreme_estimator/extreme_models/margin_model/linear_margin_model.py b/extreme_estimator/extreme_models/margin_model/linear_margin_model.py
index 24016d13..7956be5c 100644
--- a/extreme_estimator/extreme_models/margin_model/linear_margin_model.py
+++ b/extreme_estimator/extreme_models/margin_model/linear_margin_model.py
@@ -97,11 +97,19 @@ class LinearMarginModelExample(LinearMarginModel):
                                        GevParams.LOC: [1],
                                        GevParams.SCALE: [0]})
 
+
 class LinearLocationAllDimsMarginModel(LinearMarginModel):
 
     def load_margin_functions(self, margin_function_class: type = None, gev_param_name_to_dims=None):
         super().load_margin_functions({GevParams.LOC: self.coordinates.coordinates_dims})
 
+
+class LinearShapeAllDimsMarginModel(LinearMarginModel):
+
+    def load_margin_functions(self, margin_function_class: type = None, gev_param_name_to_dims=None):
+        super().load_margin_functions({GevParams.SHAPE: self.coordinates.coordinates_dims})
+
+
 class LinearAllParametersAllDimsMarginModel(LinearMarginModel):
 
     def load_margin_functions(self, margin_function_class: type = None, gev_param_name_to_dims=None):
diff --git a/spatio_temporal_dataset/spatio_temporal_observations/abstract_spatio_temporal_observations.py b/spatio_temporal_dataset/spatio_temporal_observations/abstract_spatio_temporal_observations.py
index 5825458b..a772e793 100644
--- a/spatio_temporal_dataset/spatio_temporal_observations/abstract_spatio_temporal_observations.py
+++ b/spatio_temporal_dataset/spatio_temporal_observations/abstract_spatio_temporal_observations.py
@@ -125,6 +125,9 @@ class AbstractSpatioTemporalObservations(object):
     def __str__(self) -> str:
         return self._df_maxima.__str__()
 
+    def __len__(self):
+        return self._df_maxima.__len__()
+
     def print_summary(self):
         # Write a summary of observations
         df = self.df_maxima_gev
-- 
GitLab