An error occurred while loading the file. Please try again.
-
Le Roux Erwan authoredb65db5e5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
from collections import OrderedDict
import numpy as np
from typing import List
from cached_property import cached_property
from experiment.meteo_france_data.scm_models_data.safran.safran import SafranSnowfall
from experiment.meteo_france_data.scm_models_data.visualization.study_visualization.main_study_visualizer import \
ALL_ALTITUDES
from extreme_estimator.estimator.full_estimator.abstract_full_estimator import \
FullEstimatorInASingleStepWithSmoothMargin
from extreme_estimator.extreme_models.max_stable_model.abstract_max_stable_model import CovarianceFunction
from extreme_estimator.extreme_models.max_stable_model.max_stable_models import ExtremalT, BrownResnick
from spatio_temporal_dataset.coordinates.abstract_coordinates import AbstractCoordinates
from spatio_temporal_dataset.coordinates.spatial_coordinates.abstract_spatial_coordinates import \
AbstractSpatialCoordinates
from spatio_temporal_dataset.coordinates.transformed_coordinates.transformation.uniform_normalization import \
BetweenZeroAndOneNormalization
from spatio_temporal_dataset.dataset.abstract_dataset import AbstractDataset
from spatio_temporal_dataset.spatio_temporal_observations.abstract_spatio_temporal_observations import \
AbstractSpatioTemporalObservations
from test.test_utils import load_test_max_stable_models
from utils import get_display_name_from_object_type
REANALYSE_STR = 'reanalyse'
ALTITUDE_COLUMN_NAME = 'ALTITUDE'
MASSIF_COLUMN_NAME = 'MASSIF_PRA'
DATA_PATH = r'/home/erwan/Documents/projects/spatiotemporalextremes/local/spatio_temporal_datasets/Johan_data/PrecipitationsAvalanches_MaxPrecipit_ParPoste_ParHiver_traites.xls'
import pandas as pd
class ComparisonAnalysis(object):
def __init__(self, altitude=900, nb_border_data_to_remove=0, normalize_observations=True, margin=150,
transformation_class=BetweenZeroAndOneNormalization, exclude_some_massifs_from_the_intersection=False,
keep_only_station_without_nan_values=True,
one_station_per_massif=True):
self.keep_only_station_without_nan_values = keep_only_station_without_nan_values
self.one_station_per_massif = one_station_per_massif
self.exclude_some_massifs_from_the_intersection = exclude_some_massifs_from_the_intersection
self.normalize_observations = normalize_observations
self.altitude = altitude
self.margin = margin
self.transformation_class = transformation_class
self.nb_border_data_to_remove = nb_border_data_to_remove
self.year_min = 1958 + nb_border_data_to_remove
self.year_max = 2004 - nb_border_data_to_remove
##################### STATION ATTRIBUTES ############################
def load_main_df(self):
# this sheet name: Mean metrics 24.666666666666668 for the sheet name it was worse Mean metrics 36.022222222222226
df = pd.read_excel(DATA_PATH, sheet_name='max alpes')
df = df.iloc[:78]
ind_altitude = self.altitude - self.margin < df[ALTITUDE_COLUMN_NAME]
ind_altitude &= df[ALTITUDE_COLUMN_NAME] <= self.altitude + self.margin
df = df.loc[ind_altitude] # type: pd.DataFrame
# Remove dupicate for commune, Pellafol we should keep the first, i.e. 930 which has more data than the other
df.drop_duplicates(subset='COMMUNE', inplace=True)
df.set_index('COMMUNE', inplace=True)
df = df.iloc[:, 3:]
# Get values
df_values = self.get_values(df)
if self.keep_only_station_without_nan_values:
# Keep only stations who have not any Nan values
ind = ~df_values.isna().any(axis=1)
df = df.loc[ind]
return df
def load_main_df_stations_that_belong_to_intersection_massifs(self):
df = self.load_main_df().copy()
# Keep only the massif that also belong to the study (so that the normalization are roughly comparable)
ind_massif = df['MASSIF_PRA'].isin(self.intersection_massif_names)
return df.loc[ind_massif]
def load_main_df_one_station_per_massif(self):
df = self.load_main_df_stations_that_belong_to_intersection_massifs()
# Keep only one station per massif, to have the same number of points (the first by default)
df = df.drop_duplicates(subset='MASSIF_PRA')
# Sort all the DataFrame so that the massif order correspond
df['MASSIF_IDX'] = [self.intersection_massif_names.index(m) for m in df['MASSIF_PRA']]
df = df.sort_values(['MASSIF_IDX'])
df.drop(labels='MASSIF_IDX', axis=1, inplace=True)
return df
@property
def load_df_stations(self):
if self.one_station_per_massif:
return self.load_main_df_one_station_per_massif()
else:
return self.load_main_df_station_intersection_clean()
@property
def stations_coordinates(self):
df = self.load_main_df()
df = df.loc[:, ['LAMBERTX', 'LAMBERTY']]
df.rename({'LAMBERTX': AbstractCoordinates.COORDINATE_X,
'LAMBERTY': AbstractCoordinates.COORDINATE_Y}, axis=1, inplace=True)
coordinates = AbstractSpatialCoordinates.from_df(df, transformation_class=self.transformation_class)
return coordinates
@property
def stations_observations(self):
df = self.load_main_df()
df = self.get_values(df)
obs = AbstractSpatioTemporalObservations(df_maxima_gev=df)
if self.normalize_observations:
obs.normalize()
return obs
def get_values(self, df):
df = df.iloc[:, 7:]
df.columns = df.columns.astype(int)
df = df.loc[:, self.year_min:self.year_max]
return df
@property
def station_dataset(self):
dataset = AbstractDataset(observations=self.stations_observations,
coordinates=self.stations_coordinates)
return dataset
@property
def massif_names(self):
df = self.load_main_df()
return list(set(df['MASSIF_PRA']))
##################### STUDY ATTRIBUTES ############################
@cached_property
def study(self):
# Build the study for the same years
return SafranSnowfall(altitude=self.altitude, nb_consecutive_days=3, year_min=self.year_min,
year_max=self.year_max + 1)
@property
def nb_massifs(self):
return len(self.intersection_massif_names)
@cached_property
def intersection_massif_names(self):
intersection_of_massif_names = list(set(self.massif_names).intersection(set(self.study.study_massif_names)))
diff_due_to_wrong_names = set(self.massif_names) - set(self.study.study_massif_names)
assert not diff_due_to_wrong_names, diff_due_to_wrong_names
# remove on purpose some massifs (to understand if it the massifs that change the results or the year that were removed)
# this created big differences in the results for altitude=900m margin=150m and nb=2
# maybe this is due to a difference between the massif coordinate and the station (that belong to the massif) coordinate
# or this might be due to a big difference between the observations
if self.exclude_some_massifs_from_the_intersection:
massifs_to_remove = ['Mercantour']
intersection_of_massif_names = list(set(intersection_of_massif_names) - set(massifs_to_remove))
return intersection_of_massif_names
def study_coordinates(self, use_study_coordinate_with_latitude_and_longitude=True):
# Build coordinate, from two possibles dataframes for the coordinates
if use_study_coordinate_with_latitude_and_longitude:
df = self.study.df_massifs_longitude_and_latitude
else:
df = self.study.load_df_centroid()
coordinate = AbstractSpatialCoordinates.from_df(df=df.loc[self.intersection_massif_names],
transformation_class=self.transformation_class)
return coordinate
@property
def study_observations(self):
# Get the observations
observations = self.study.observations_annual_maxima
maxima_gev_of_interest = observations.df_maxima_gev.loc[self.intersection_massif_names]
observations.df_maxima_gev = maxima_gev_of_interest
if self.normalize_observations:
observations.normalize()
return observations
@property
def study_dataset_latitude_longitude(self) -> AbstractDataset:
dataset = AbstractDataset(observations=self.study_observations,
coordinates=self.study_coordinates(
use_study_coordinate_with_latitude_and_longitude=True))
return dataset
@property
def study_dataset_lambert(self) -> AbstractDataset:
dataset = AbstractDataset(observations=self.study_observations,
coordinates=self.study_coordinates(
use_study_coordinate_with_latitude_and_longitude=False))
return dataset
# After a short analysis (run df_altitude to check) we decided on the altitude
# 900 and 1200 seems to be the best altitudes
def reduce_altitude(self, altitude=900) -> pd.Series:
df = pd.read_excel(DATA_PATH, sheet_name='max alpes 2500m presentes')
df = df.iloc[:78, 4:]
margin = 150
ind_altitude = altitude - margin < df['ALTITUDE']
ind_altitude &= df['ALTITUDE'] <= altitude + margin
df = df.loc[ind_altitude]
# Put all the result into an ordered dict
d = OrderedDict()
# Number of stations
d['Nb stations'] = len(df)
# Number of massifs
d['Nb massifs'] = len(set(df['MASSIF_PRA']))
df_values = df.iloc[:, 7:]
df_values_from_1958 = df_values.iloc[:, 13:]
# Mean number of non-Nan values
d['% of Nan'] = df_values_from_1958.isna().mean().mean()
# Number of lines with only Nan
d['Lines w Nan'] = df_values_from_1958.isna().all(axis=1).sum()
return pd.Series(d)
def altitude_short_analysis(self):
altitudes = ALL_ALTITUDES
df = pd.concat([self.reduce_altitude(altitude) for altitude in altitudes], axis=1)
df = df.transpose()
df.index = altitudes
# WIth the observation, the altitude 1200 seems the best
# 1200 nb_stations:23 nb_massifs:15
# I should try a fit
# Finally I might prefer the altitude 900, which seems to have less missing values
print(df)
################## MERGING SOME ATTRIBUTES ###################
def load_main_df_station_intersection_clean(self):
assert not self.one_station_per_massif
df = self.load_main_df_stations_that_belong_to_intersection_massifs().loc[:, ['MASSIF_PRA', 'ALTITUDE']]
df_coord = self.stations_coordinates.df_all_coordinates
df_obs = self.stations_observations.df_maxima_gev
return pd.concat([df, df_coord, df_obs], axis=1)
def load_main_df_study_intersection_clean(self):
assert not self.one_station_per_massif
df_coord = self.study_dataset_lambert.coordinates.df_all_coordinates
df_obs = self.study_observations.df_maxima_gev
study_index = pd.Series(df_coord.index) + ' ' + REANALYSE_STR
df = pd.DataFrame({'MASSIF_PRA': df_coord.index.values}, index=study_index)
df['ALTITUDE'] = self.altitude
df_coord.index = study_index
df_obs.index = study_index
df_study = pd.concat([df, df_coord, df_obs], axis=1)
return df_study
@cached_property
def df_merged_intersection_clean(self):
df_stations = self.load_main_df_station_intersection_clean()
df_study = self.load_main_df_study_intersection_clean()
diff = set(df_study.columns).symmetric_difference(set(df_stations.columns))
assert not diff, diff
df = pd.concat([df_study.loc[:, df_stations.columns], df_stations], axis=0)
df = df.sort_values([MASSIF_COLUMN_NAME])
return df
##################### COMPARE THE TWO DATASETS BY FITTING THE SAME MODEL ############################
def spatial_comparison(self, margin_model_class, default_covariance_function=CovarianceFunction.powexp):
# max_stable_models = load_test_max_stable_models(default_covariance_function=CovarianceFunction.powexp)
# max_stable_models = [max_stable_models[1], max_stable_models[-2]]
max_stable_models = load_test_max_stable_models(default_covariance_function=default_covariance_function)
max_stable_models = [m for m in max_stable_models if isinstance(m, (BrownResnick, ExtremalT))]
for max_stable_model in max_stable_models:
print('\n\n', get_display_name_from_object_type(type(max_stable_model)))
if hasattr(max_stable_model, 'covariance_function'):
print(max_stable_model.covariance_function)
estimators = []
datasets = [self.station_dataset, self.study_dataset_lambert] # type: List[AbstractDataset]
# Checks that the dataset have the same index
assert pd.Index.equals(datasets[0].observations.columns, datasets[1].observations.columns)
# assert datasets[0].observations.columns
for dataset in datasets:
margin_model = margin_model_class(coordinates=dataset.coordinates)
estimator = FullEstimatorInASingleStepWithSmoothMargin(dataset=dataset,
margin_model=margin_model,
max_stable_model=max_stable_model)
estimator.fit()
print(estimator.result_from_fit.margin_coef_dict)
print(estimator.result_from_fit.other_coef_dict)
estimators.append(estimator)
# Compare the sign of them margin coefficient for the estimators
coefs = [{k: v for k, v in e.result_from_fit.margin_coef_dict.items() if 'Coeff1' not in k} for e in
estimators]
different_sign = [k for k, v in coefs[0].items() if np.sign(coefs[1][k]) != np.sign(v)]
print('All linear coefficient have the same sign: {}, different_signs for: {}'.format(
len(different_sign) == 0, different_sign))