An error occurred while loading the file. Please try again.
-
remi.clement authoreda0362f3d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
from collections import OrderedDict
from cached_property import cached_property
from experiment.meteo_france_data.scm_models_data.safran.safran import SafranSnowfall
from experiment.meteo_france_data.visualization.study_visualization.main_study_visualizer import \
ALL_ALTITUDES, ALL_ALTITUDES_WITH_20_STATIONS_AT_LEAST
from extreme_estimator.estimator.full_estimator.abstract_full_estimator import \
FullEstimatorInASingleStepWithSmoothMargin
from extreme_estimator.extreme_models.margin_model.linear_margin_model import LinearAllParametersAllDimsMarginModel, \
LinearLocationAllDimsMarginModel, LinearShapeAllDimsMarginModel
from extreme_estimator.extreme_models.max_stable_model.abstract_max_stable_model import CovarianceFunction
from extreme_estimator.extreme_models.max_stable_model.max_stable_models import ExtremalT, BrownResnick
from spatio_temporal_dataset.coordinates.abstract_coordinates import AbstractCoordinates
from spatio_temporal_dataset.coordinates.spatial_coordinates.abstract_spatial_coordinates import \
AbstractSpatialCoordinates
from spatio_temporal_dataset.coordinates.transformed_coordinates.transformation.uniform_normalization import \
BetweenZeroAndOneNormalization
from spatio_temporal_dataset.dataset.abstract_dataset import AbstractDataset
from spatio_temporal_dataset.spatio_temporal_observations.abstract_spatio_temporal_observations import \
AbstractSpatioTemporalObservations
from test.test_utils import load_test_max_stable_models
from utils import get_display_name_from_object_type
DATA_PATH = r'/home/erwan/Documents/projects/spatiotemporalextremes/local/spatio_temporal_datasets/Johan_data/PrecipitationsAvalanches_MaxPrecipit_ParPoste_ParHiver_traites.xls'
import pandas as pd
class ComparisonAnalysis(object):
def __init__(self, altitude=900, nb_border_data_to_remove=0, normalize_observations=True, margin=150,
transformation_class=BetweenZeroAndOneNormalization, exclude_some_massifs_from_the_intersection=False):
self.exclude_some_massifs_from_the_intersection = exclude_some_massifs_from_the_intersection
self.normalize_observations = normalize_observations
self.altitude = altitude
self.margin = margin
self.transformation_class = transformation_class
self.nb_border_data_to_remove = nb_border_data_to_remove
self.year_min = 1958 + nb_border_data_to_remove
self.year_max = 2004 - nb_border_data_to_remove
##################### STATION ATTRIBUTES ############################
def load_main_df_for_altitude(self):
df = pd.read_excel(DATA_PATH, sheet_name='max alpes 2500m presentes')
df = df.iloc[:78]
ind_altitude = self.altitude - self.margin < df['ALTITUDE']
ind_altitude &= df['ALTITUDE'] <= self.altitude + self.margin
df = df.loc[ind_altitude] # type: pd.DataFrame
# Remove dulpicate for commune, Pellafol we should keep the first, i.e. 930 which has more data than the other
df.drop_duplicates(subset='COMMUNE', inplace=True)
df.set_index('COMMUNE', inplace=True)
df = df.iloc[:, 3:]
# Get values
df_values = self.get_values(df)
# Keep only stations who have not any Nan values
ind = ~df_values.isna().any(axis=1)
df = df.loc[ind]
return df
def load_main_df_for_altitude_and_good_massifs(self):
df = self.load_main_df_for_altitude().copy()
# Keep only the massif that also belong to the study (so that the normalization are roughly comparable)
ind_massif = df['MASSIF_PRA'].isin(self.intersection_massif_names)
df = df.loc[ind_massif]
# Keep only one station per massif, to have the same number of points (the first by default)
df = df.drop_duplicates(subset='MASSIF_PRA')
return df
@property
def stations_coordinates(self):
df = self.load_main_df_for_altitude_and_good_massifs()
df = df.loc[:, ['LAMBERTX', 'LAMBERTY']]
df.rename({'LAMBERTX': AbstractCoordinates.COORDINATE_X,
'LAMBERTY': AbstractCoordinates.COORDINATE_Y}, axis=1, inplace=True)
coordinates = AbstractSpatialCoordinates.from_df(df, transformation_class=self.transformation_class)
return coordinates
@property
def stations_observations(self):
df = self.load_main_df_for_altitude_and_good_massifs()
df = self.get_values(df)
obs = AbstractSpatioTemporalObservations(df_maxima_gev=df)
if self.normalize_observations:
obs.normalize()
return obs
def get_values(self, df):
df = df.iloc[:, 7:]
df.columns = df.columns.astype(int)
df = df.loc[:, self.year_min:self.year_max]
return df
@property
def station_dataset(self):
dataset = AbstractDataset(observations=self.stations_observations,
coordinates=self.stations_coordinates)
return dataset
@property
def massif_names(self):
df = self.load_main_df_for_altitude()
return list(set(df['MASSIF_PRA']))
##################### STUDY ATTRIBUTES ############################
@cached_property
def study(self):
# Build the study for the same years
return SafranSnowfall(altitude=self.altitude, nb_consecutive_days=1, year_min=self.year_min,
year_max=self.year_max + 1)
@cached_property
def intersection_massif_names(self):
intersection_of_massif_names = list(set(self.massif_names).intersection(set(self.study.study_massif_names)))
diff_due_to_wrong_names = set(self.massif_names) - set(self.study.study_massif_names)
assert not diff_due_to_wrong_names, diff_due_to_wrong_names
# remove on purpose some massifs (to understand if it the massifs that change the results or the year that were removed)
# this created big differences in the results for altitude=900m margin=150m and nb=2
# maybe this is due to a difference between the massif coordinate and the station (that belong to the massif) coordinate
# or this might be due to a big difference between the observations
if self.exclude_some_massifs_from_the_intersection:
massifs_to_remove = ['Mercantour']
intersection_of_massif_names = list(set(intersection_of_massif_names) - set(massifs_to_remove))
return intersection_of_massif_names
def study_coordinates(self, use_study_coordinate_with_latitude_and_longitude=True):
# Build coordinate, from two possibles dataframes for the coordinates
if use_study_coordinate_with_latitude_and_longitude:
df = self.study.df_massifs_longitude_and_latitude
else:
df = self.study.load_df_centroid()
coordinate = AbstractSpatialCoordinates.from_df(df=df.loc[self.intersection_massif_names],
transformation_class=self.transformation_class)
return coordinate
@property
def study_observations(self):
# Get the observations
observations = self.study.observations_annual_maxima
maxima_gev_of_interest = observations.df_maxima_gev.loc[self.intersection_massif_names]
observations.df_maxima_gev = maxima_gev_of_interest
if self.normalize_observations:
observations.normalize()
return observations
@property
def study_dataset_latitude_longitude(self):
dataset = AbstractDataset(observations=self.study_observations,
coordinates=self.study_coordinates(
use_study_coordinate_with_latitude_and_longitude=True))
return dataset
@property
def study_dataset_lambert(self):
dataset = AbstractDataset(observations=self.study_observations,
coordinates=self.study_coordinates(
use_study_coordinate_with_latitude_and_longitude=False))
return dataset
# After a short analysis (run df_altitude to check) we decided on the altitude
# 900 and 1200 seems to be the best altitudes
def load_main_df(self):
df = pd.read_excel(DATA_PATH, sheet_name='max alpes 2500m presentes')
df = df.iloc[:78, 4:]
return df
def reduce_altitude(self, altitude=900) -> pd.Series:
df = self.load_main_df()
margin = 150
ind_altitude = altitude - margin < df['ALTITUDE']
ind_altitude &= df['ALTITUDE'] <= altitude + margin
df = df.loc[ind_altitude]
# Put all the result into an ordered dict
d = OrderedDict()
# Number of stations
d['Nb stations'] = len(df)
# Number of massifs
d['Nb massifs'] = len(set(df['MASSIF_PRA']))
df_values = df.iloc[:, 7:]
df_values_from_1958 = df_values.iloc[:, 13:]
# Mean number of non-Nan values
d['% of Nan'] = df_values_from_1958.isna().mean().mean()
# Number of lines with only Nan
d['Lines w Nan'] = df_values_from_1958.isna().all(axis=1).sum()
return pd.Series(d)
def altitude_short_analysis(self):
altitudes = ALL_ALTITUDES
df = pd.concat([self.reduce_altitude(altitude) for altitude in altitudes], axis=1)
df = df.transpose()
df.index = altitudes
# WIth the observation, the altitude 1200 seems the best
# 1200 nb_stations:23 nb_massifs:15
# I should try a fit
# Finally I might prefer the altitude 900, which seems to have less missing values
print(df)
##################### COMPARE THE TWO DATASETS BY FITTING THE SAME MODEL ############################
def spatial_comparison(self, margin_model_class, default_covariance_function=CovarianceFunction.powexp):
# max_stable_models = load_test_max_stable_models(default_covariance_function=CovarianceFunction.powexp)
# max_stable_models = [max_stable_models[1], max_stable_models[-2]]
max_stable_models = load_test_max_stable_models(default_covariance_function=default_covariance_function)
max_stable_models = [m for m in max_stable_models if isinstance(m, (BrownResnick, ExtremalT))]
for max_stable_model in max_stable_models:
print('\n\n', get_display_name_from_object_type(type(max_stable_model)))
if hasattr(max_stable_model, 'covariance_function'):
print(max_stable_model.covariance_function)
for dataset in [self.station_dataset, self.study_dataset_lambert]:
margin_model = margin_model_class(coordinates=dataset.coordinates)
estimator = FullEstimatorInASingleStepWithSmoothMargin(dataset=dataset,
margin_model=margin_model,
max_stable_model=max_stable_model)
estimator.fit()
print(estimator.result_from_fit.margin_coef_dict)
print(estimator.result_from_fit.other_coef_dict)
# print(estimato)
def choice_of_altitude_and_nb_border_data_to_remove_to_get_data_without_nan():
for margin in [50, 100, 150, 200, 250, 300][2:3]:
for altitude in [900, 1200, 1800][:1]:
for nb in range(1, 4):
s = ComparisonAnalysis(altitude=altitude, nb_border_data_to_remove=nb, margin=margin)
print(margin, altitude, nb, 'nb massifs', len(s.intersection_massif_names), 'nb stations',
len(s.stations_observations), 'nb observations', s.stations_observations.nb_obs,
s.study_observations.nb_obs,
s.stations_coordinates.index)
def run_comparison_for_optimal_parameters_for_altitude_900():
for nb in [0, 1, 2][:]:
for transformation_class in [None, BetweenZeroAndOneNormalization][1:]:
comparison = ComparisonAnalysis(altitude=900, nb_border_data_to_remove=nb, margin=150,
exclude_some_massifs_from_the_intersection=nb == 2,
transformation_class=transformation_class,
normalize_observations=True)
print('\n-----------\nnb:', nb, comparison.intersection_massif_names)
# margin_model_classes = [LinearShapeAllDimsMarginModel, LinearLocationAllDimsMarginModel,
# LinearAllParametersAllDimsMarginModel]
for margin_model_class in [LinearAllParametersAllDimsMarginModel]:
print(get_display_name_from_object_type(margin_model_class))
comparison.spatial_comparison(margin_model_class)
"""
Comparaison données de re-analysis et données de stations
J'ai utilisé le fichier "PrecipitationsAvalanches_MaxPrecipit_ParPoste_ParHiver_traites.xls"
Après des analyses avec la fonction 'choice_of_altitude_and_nb_border_data_to_remove_to_get_data_without_nan'
j'ai choisis de lancer mes analyses avec:
-une altitude de 900m
-une margin de 150m (donc je selectionne toutes les stations entre 750m et 1050m).
Je ne choisis que des stations qui ont des observations complètes sur toute la periode d'observation.
et je m'asssure de n'avoir une seule station par massif (qui appartient à l intersection des massifs entre les study et les stations)
Souvent les observations manquantes se situaient dans les premières ou dans les dernières années
j'ai donc ajouté un parametre nb_to_remove_border qui enlever ces observations (à la fois pour les study et les stations).
Ce parametre entrainent donc des datasets avec moins d observations, mais avec plus de masssifs/stations
Par contre, dans le cas nb_to_remove=2, il y avait de grosses différences si j'incluais ou non le massif Mercantour
donc en tout attendant de mieux comprendre, j'ai prefere exclure ce massif dans ce cas
Dans tous les cas, nb_to_remove de 0 à 2
pour n'importe quel modele de marges
et pour un max stable BrownResnick ou ExtremalT
alors le signe des coefficient de marges selon les coordonées Lambert sont toujours les mêmes que l'on utilise les données
de reanalysis ou les données de stations
"""
"""
A way to improve the analysis would be to have another altitude of reference with a lot of data
But for the other altitude, we have data issues because there is a Nan in the middle of the data
Instead of removing on the side, I should remove the years that concerns as much station from the same altitude level
I should find the "optimal" years to remove
Then I should find a way to remove the same years in the study
"""
if __name__ == '__main__':
run_comparison_for_optimal_parameters_for_altitude_900()
# choice_of_altitude_and_nb_border_data_to_remove_to_get_data_without_nan()