From 2fd079cafbec00da6ca6c44ba2faf1eb21a2b199 Mon Sep 17 00:00:00 2001
From: Le Roux Erwan <erwan.le-roux@irstea.fr>
Date: Fri, 30 Nov 2018 16:18:50 +0100
Subject: [PATCH] [EXPERIMENT] improve fit_diagnosis

---
 .../{split => fit_diagnosis}/main_split.py | 14 ++--
 .../{split => fit_diagnosis}/split_curve.py | 19 +++--
 .../visualization_margin_model.py | 5 +-
 extreme_estimator/estimator/full_estimator.py | 2 +-
 .../estimator/margin_estimator.py | 4 +-
 .../margin_model/margin_function/utils.py | 7 ++
 .../coordinates/abstract_coordinates.py | 78 +++++++++++--------
 .../alps_station_2D_coordinates.py | 2 +-
 .../transformed_coordinates.py | 4 +-
 .../dataset/abstract_dataset.py | 6 +-
 .../slicer/abstract_slicer.py | 15 ++--
 .../abstract_spatio_temporal_observations.py | 12 +++
 .../alps_precipitation_observations.py | 35 ++++++++-
 13 files changed, 142 insertions(+), 61 deletions(-)
 rename experiment/{split => fit_diagnosis}/main_split.py (87%)
 rename experiment/{split => fit_diagnosis}/split_curve.py (82%)

diff --git a/experiment/split/main_split.py b/experiment/fit_diagnosis/main_split.py
similarity index 87%
rename from experiment/split/main_split.py
rename to experiment/fit_diagnosis/main_split.py
index c56b4420..a12220bb 100644
--- a/experiment/split/main_split.py
+++ b/experiment/fit_diagnosis/main_split.py
@@ -1,6 +1,9 @@
-from experiment.split.split_curve import SplitCurve, LocFunction
+import random
+
+from experiment.fit_diagnosis.split_curve import SplitCurve, LocFunction
 from extreme_estimator.estimator.full_estimator import FullEstimatorInASingleStepWithSmoothMargin
-from extreme_estimator.extreme_models.margin_model.smooth_margin_model import ConstantMarginModel
+from extreme_estimator.extreme_models.margin_model.smooth_margin_model import ConstantMarginModel, \
+    LinearAllParametersAllDimsMarginModel
 from extreme_estimator.extreme_models.max_stable_model.max_stable_models import Smith
 from extreme_estimator.gev_params import GevParams
 from spatio_temporal_dataset.coordinates.unidimensional_coordinates.coordinates_1D import LinSpaceCoordinates
@@ -9,6 +12,7 @@ from spatio_temporal_dataset.dataset.simulation_dataset import FullSimulatedData
 from spatio_temporal_dataset.slicer.spatial_slicer import SpatialSlicer
 from spatio_temporal_dataset.slicer.spatio_temporal_slicer import SpatioTemporalSlicer
 
+random.seed(42)
 
 def load_dataset():
     nb_points = 50
@@ -17,7 +21,6 @@ def load_dataset():
 
     # MarginModel Linear with respect to the shape (from 0.01 to 0.02)
     params_sample = {
-        # (GevParams.GEV_SHAPE, 0): 0.2,
         (GevParams.GEV_LOC, 0): 10,
         (GevParams.GEV_SHAPE, 0): 1.0,
         (GevParams.GEV_SCALE, 0): 1.0,
@@ -29,17 +32,18 @@ def load_dataset():
                                                  coordinates=coordinates,
                                                  max_stable_model=max_stable_model,
                                                  train_split_ratio=0.8,
-                                                 slicer_class=SpatioTemporalSlicer)
+                                                 slicer_class=SpatialSlicer)
 
 
 def full_estimator(dataset):
     max_stable_model = Smith()
-    margin_model_for_estimator = ConstantMarginModel(dataset.coordinates)
+    margin_model_for_estimator = LinearAllParametersAllDimsMarginModel(dataset.coordinates)
     full_estimator = FullEstimatorInASingleStepWithSmoothMargin(dataset, margin_model_for_estimator, max_stable_model)
     return full_estimator
 
 
 if __name__ == '__main__':
+
     dataset = load_dataset()
     dataset.slicer.summary()
     full_estimator = full_estimator(dataset)
diff --git a/experiment/split/split_curve.py b/experiment/fit_diagnosis/split_curve.py
similarity index 82%
rename from experiment/split/split_curve.py
rename to experiment/fit_diagnosis/split_curve.py
index a9bb6f70..78c48bd4 100644
--- a/experiment/split/split_curve.py
+++ b/experiment/fit_diagnosis/split_curve.py
@@ -39,17 +39,23 @@ class SplitCurve(object):
         # Fit the estimator and get the margin_function
         self.estimator.fit()
         # todo: potentially I will do the fit several times, and retrieve the mean error
+        # there is a big variability so it would be really interesting to average, to make real
         self.margin_function_fitted = estimator.margin_function_fitted
         self.error_dict = error_dict_between_margin_functions(self.margin_function_sample,
                                                               self.margin_function_fitted)
         # todo: add quantile curve, additionally to the marginal curve
 
+    @property
+    def main_title(self):
+        return self.dataset.slicer.summary(show=False)
+
     def visualize(self):
-        fig, axes = plt.subplots(3, 2, sharex='col', sharey='row')
+        fig, axes = plt.subplots(3, 2)
         fig.subplots_adjust(hspace=0.4, wspace=0.4, )
         for i, gev_param_name in enumerate(GevParams.GEV_PARAM_NAMES):
             self.margin_graph(axes[i, 0], gev_param_name)
             self.score_graph(axes[i, 1], gev_param_name)
+        fig.suptitle(self.main_title)
         plt.show()
 
     def margin_graph(self, ax, gev_param_name):
@@ -63,9 +69,10 @@ class SplitCurve(object):
         ax.set_title(title_str)
 
     def score_graph(self, ax, gev_param_name):
-        for split in self.dataset.splits:
-            s = self.error_dict[gev_param_name]
-            data = [1.5] * 7 + [2.5] * 2 + [3.5] * 8 + [4.5] * 3 + [5.5] * 1 + [6.5] * 8
+        # todo: for the moment only the train/test are interesting (the spatio temporal isn't working yet)
         sns.set_style('whitegrid')
-        sns.kdeplot(np.array(data), bw=0.5, ax=ax)
-        print()
+        s = self.error_dict[gev_param_name]
+        for split in self.dataset.splits:
+            ind = self.dataset.coordinates_index(split)
+            data = s.loc[ind].values
+            sns.kdeplot(data, bw=0.5, ax=ax, label=split.name).set(xlim=0)
diff --git a/experiment/regression_margin/visualization_margin_model.py b/experiment/regression_margin/visualization_margin_model.py
index b5f548b3..d4f71149 100644
--- a/experiment/regression_margin/visualization_margin_model.py
+++ b/experiment/regression_margin/visualization_margin_model.py
@@ -22,12 +22,11 @@ class VisualizationMarginModel(unittest.TestCase):
     @classmethod
     def example_visualization_1D(cls):
         coordinates = LinSpaceCoordinates.from_nb_points(nb_points=cls.nb_points)
-        # MarginModel Linear with respect to the shape (from 0.01 to 0.02)
-        margin_model = cls.margin_model(coordinates=coordinates, params_sample={GevParams.GEV_SHAPE: 0.02})
+        margin_model = cls.margin_model(coordinates=coordinates, params_sample={(GevParams.GEV_SHAPE, 1): 0.02})
         if cls.DISPLAY:
             margin_model.margin_function_sample.visualize()
 
 
 if __name__ == '__main__':
     VisualizationMarginModel.example_visualization_1D()
-    VisualizationMarginModel.example_visualization_2D()
+    # VisualizationMarginModel.example_visualization_2D()
diff --git a/extreme_estimator/estimator/full_estimator.py b/extreme_estimator/estimator/full_estimator.py
index 42d870ac..ecfcb66c 100644
--- a/extreme_estimator/estimator/full_estimator.py
+++ b/extreme_estimator/estimator/full_estimator.py
@@ -43,7 +43,7 @@ class SmoothMarginalsThenUnitaryMsp(AbstractFullEstimator):
         # Compute the maxima_frech
         maxima_gev_train = self.dataset.maxima_gev(split=self.train_split)
         maxima_frech = AbstractMarginModel.gev2frech(maxima_gev=maxima_gev_train,
-                                                     coordinates_values=self.dataset.coordinates_values,
+                                                     coordinates_values=self.dataset.coordinates_values(self.train_split),
                                                      margin_function=self.margin_estimator.margin_function_fitted)
         # Update maxima frech field through the dataset object
         self.dataset.set_maxima_frech(maxima_frech, split=self.train_split)
diff --git a/extreme_estimator/estimator/margin_estimator.py b/extreme_estimator/estimator/margin_estimator.py
index 86e8e42c..e4268dde 100644
--- a/extreme_estimator/estimator/margin_estimator.py
+++ b/extreme_estimator/estimator/margin_estimator.py
@@ -32,6 +32,6 @@ class SmoothMarginEstimator(AbstractMarginEstimator):
 
     def _fit(self):
         maxima_gev = self.dataset.maxima_gev(split=self.train_split)
-        corodinate_values = self.dataset.coordinates_values
+        coordinate_values = self.dataset.coordinates_values(self.train_split)
         self._margin_function_fitted = self.margin_model.fitmargin_from_maxima_gev(maxima_gev=maxima_gev,
-                                                                                   coordinates_values=corodinate_values)
+                                                                                   coordinates_values=coordinate_values)
diff --git a/extreme_estimator/extreme_models/margin_model/margin_function/utils.py b/extreme_estimator/extreme_models/margin_model/margin_function/utils.py
index 4a653f01..441ef09a 100644
--- a/extreme_estimator/extreme_models/margin_model/margin_function/utils.py
+++ b/extreme_estimator/extreme_models/margin_model/margin_function/utils.py
@@ -10,6 +10,13 @@ def abs_error(s1, s2):
 
 
 def error_dict_between_margin_functions(margin1: AbstractMarginFunction, margin2: AbstractMarginFunction):
+    """
+    Return a series, indexed by the same index as the coordinates.
+    Each value corresponds to the error for this coordinate.
+    :param margin1:
+    :param margin2:
+    :return:
+    """
     assert margin1.coordinates == margin2.coordinates
     margin1_gev_params, margin2_gev_params = margin1.gev_params_for_coordinates, margin2.gev_params_for_coordinates
     gev_param_name_to_error_serie = {}
diff --git a/spatio_temporal_dataset/coordinates/abstract_coordinates.py b/spatio_temporal_dataset/coordinates/abstract_coordinates.py
index 8d7e3511..ff5e939d 100644
--- a/spatio_temporal_dataset/coordinates/abstract_coordinates.py
+++ b/spatio_temporal_dataset/coordinates/abstract_coordinates.py
@@ -14,16 +14,19 @@ from spatio_temporal_dataset.slicer.temporal_slicer import TemporalSlicer
 
 
 class AbstractCoordinates(object):
-    # Columns
+    # Spatial columns
     COORDINATE_X = 'coord_x'
     COORDINATE_Y = 'coord_y'
     COORDINATE_Z = 'coord_z'
     COORDINATE_NAMES = [COORDINATE_X, COORDINATE_Y, COORDINATE_Z]
-    COORDINATE_SPLIT = 'coord_split'
+    COORDINATE_SPATIAL_SPLIT = 'coord_spatial_split'
+    # Temporal columns
+    COORDINATE_T = 'coord_t'
+    COORDINATE_TEMPORAL_SPLIT = 'coord_temporal_split'
 
-    def __init__(self, df_coord: pd.DataFrame, s_split: pd.Series = None):
-        self.df_coord = df_coord  # type: pd.DataFrame
-        self.s_split = s_split  # type: pd.Series
+    def __init__(self, df_coord: pd.DataFrame, s_spatial_split: pd.Series = None):
+        self.df_all_coordinates = df_coord  # type: pd.DataFrame
+        self.s_spatial_split = s_spatial_split  # type: pd.Series
 
     # ClassMethod constructor
 
@@ -33,18 +36,18 @@ class AbstractCoordinates(object):
         assert cls.COORDINATE_X in df.columns
         # Create a split based on the train_split_ratio
         if train_split_ratio is not None:
-            assert cls.COORDINATE_SPLIT not in df.columns, "A split has already been defined"
+            assert cls.COORDINATE_SPATIAL_SPLIT not in df.columns, "A split has already been defined"
             s_split = s_split_from_ratio(index=df.index, train_split_ratio=train_split_ratio)
-            df[cls.COORDINATE_SPLIT] = s_split
+            df[cls.COORDINATE_SPATIAL_SPLIT] = s_split
         # Potentially, a split column can be specified directly in df
-        if cls.COORDINATE_SPLIT not in df.columns:
+        if cls.COORDINATE_SPATIAL_SPLIT not in df.columns:
            df_coord = df
            s_split = None
         else:
-            df_coord = df.loc[:, cls.coordinates_columns(df)]
-            s_split = df[cls.COORDINATE_SPLIT]
+            df_coord = df.loc[:, cls.coordinates_spatial_columns(df)]
+            s_split = df[cls.COORDINATE_SPATIAL_SPLIT]
             assert s_split.isin([TRAIN_SPLIT_STR, TEST_SPLIT_STR]).all()
-        return cls(df_coord=df_coord, s_split=s_split)
+        return cls(df_coord=df_coord, s_spatial_split=s_split)
 
     @classmethod
     def from_csv(cls, csv_path: str = None):
@@ -53,7 +56,7 @@ class AbstractCoordinates(object):
         df = pd.read_csv(csv_path)
         # Index correspond to the first column
         index_column_name = df.columns[0]
-        assert index_column_name not in cls.coordinates_columns(df)
+        assert index_column_name not in cls.coordinates_spatial_columns(df)
         df.set_index(index_column_name, inplace=True)
         return cls.from_df(df)
 
@@ -70,7 +73,7 @@ class AbstractCoordinates(object):
         return cls.from_df(df=df_sample, train_split_ratio=train_split_ratio)
 
     @classmethod
-    def coordinates_columns(cls, df_coord: pd.DataFrame) -> List[str]:
+    def coordinates_spatial_columns(cls, df_coord: pd.DataFrame) -> List[str]:
         coord_columns = [cls.COORDINATE_X]
         for additional_coord in [cls.COORDINATE_Y, cls.COORDINATE_Z]:
             if additional_coord in df_coord.columns:
@@ -79,7 +82,7 @@ class AbstractCoordinates(object):
 
     @property
     def columns(self):
-        return self.coordinates_columns(df_coord=self.df_coord)
+        return self.coordinates_spatial_columns(df_coord=self.df_all_coordinates)
 
     @property
     def nb_columns(self):
@@ -87,46 +90,57 @@ class AbstractCoordinates(object):
 
     @property
     def index(self):
-        return self.df_coord.index
+        # todo: this should be replaced whenever possible by coordinates_index
+        return self.df_all_coordinates.index
+
+
+    @property
     def df_merged(self) -> pd.DataFrame:
         # Merged DataFrame of df_coord and s_split
-        return self.df_coord if self.s_split is None else self.df_coord.join(self.s_split)
+        return self.df_all_coordinates if self.s_spatial_split is None else self.df_all_coordinates.join(self.s_spatial_split)
 
     def df_coordinates(self, split: Split = Split.all) -> pd.DataFrame:
-        if self.train_ind is None:
-            return self.df_coord
+        if self.ind_train_spatial is None:
+            return self.df_all_coordinates
+
         if split is Split.all:
-            return self.df_coord
+            return self.df_all_coordinates
+
         if split in [Split.train_temporal, Split.test_temporal]:
-            return self.df_coord
+            return self.df_all_coordinates
+
         elif split in [Split.train_spatial, Split.train_spatiotemporal, Split.test_spatiotemporal_temporal]:
-            return self.df_coord.loc[self.train_ind]
+            return self.df_all_coordinates.loc[self.ind_train_spatial]
+
        elif split in [Split.test_spatial, Split.test_spatiotemporal, Split.test_spatiotemporal_spatial]:
-            return self.df_coord.loc[~self.train_ind]
+            return self.df_all_coordinates.loc[~self.ind_train_spatial]
+
         else:
             raise NotImplementedError('Unknown split: {}'.format(split))
 
     def coordinates_values(self, split: Split = Split.all) -> np.ndarray:
         return self.df_coordinates(split).values
 
+    def coordinate_index(self, split: Split = Split.all) -> pd.Index:
+        return self.df_coordinates(split).index
+
     @property
     def x_coordinates(self) -> np.ndarray:
-        return self.df_coord[self.COORDINATE_X].values.copy()
+        return self.df_all_coordinates[self.COORDINATE_X].values.copy()
 
     @property
     def y_coordinates(self) -> np.ndarray:
-        return self.df_coord[self.COORDINATE_Y].values.copy()
+        return self.df_all_coordinates[self.COORDINATE_Y].values.copy()
 
     @property
-    def train_ind(self) -> pd.Series:
-        return train_ind_from_s_split(s_split=self.s_split)
+    def ind_train_spatial(self) -> pd.Series:
+        return train_ind_from_s_split(s_split=self.s_spatial_split)
 
     # Visualization
 
     def visualize(self):
-        nb_coordinates_columns = len(self.coordinates_columns(self.df_coord))
+        nb_coordinates_columns = len(self.coordinates_spatial_columns(self.df_all_coordinates))
         if nb_coordinates_columns == 1:
             self.visualization_1D()
         elif nb_coordinates_columns == 2:
@@ -135,21 +149,21 @@ class AbstractCoordinates(object):
             self.visualization_3D()
 
     def visualization_1D(self):
-        assert len(self.coordinates_columns(self.df_coord)) >= 1
+        assert len(self.coordinates_spatial_columns(self.df_all_coordinates)) >= 1
         x = self.coordinates_values()[:]
         y = np.zeros(len(x))
         plt.scatter(x, y)
         plt.show()
 
     def visualization_2D(self):
-        assert len(self.coordinates_columns(self.df_coord)) >= 2
+        assert len(self.coordinates_spatial_columns(self.df_all_coordinates)) >= 2
         coordinates_values = self.coordinates_values()
         x, y = coordinates_values[:, 0], coordinates_values[:, 1]
         plt.scatter(x, y)
         plt.show()
 
     def visualization_3D(self):
-        assert len(self.coordinates_columns(self.df_coord)) == 3
+        assert len(self.coordinates_spatial_columns(self.df_all_coordinates)) == 3
         coordinates_values = self.coordinates_values()
         x, y, z = coordinates_values[:, 0], coordinates_values[:, 1], coordinates_values[:, 2]
         fig = plt.figure()
@@ -160,10 +174,10 @@ class AbstractCoordinates(object):
     # Magic Methods
 
     def __len__(self):
-        return len(self.df_coord)
+        return len(self.df_all_coordinates)
 
     def __mul__(self, other: float):
-        self.df_coord *= other
+        self.df_all_coordinates *= other
         return self
 
     def __rmul__(self, other):
diff --git a/spatio_temporal_dataset/coordinates/spatial_coordinates/alps_station_2D_coordinates.py b/spatio_temporal_dataset/coordinates/spatial_coordinates/alps_station_2D_coordinates.py
index a69b43f9..529ac577 100644
--- a/spatio_temporal_dataset/coordinates/spatial_coordinates/alps_station_2D_coordinates.py
+++ b/spatio_temporal_dataset/coordinates/spatial_coordinates/alps_station_2D_coordinates.py
@@ -10,7 +10,7 @@ class AlpsStation2DCoordinates(AlpsStation3DCoordinates):
     def from_csv(cls, csv_file='coord-lambert2'):
         # Remove the Z coordinates from df_coord
         spatial_coordinates = super().from_csv(csv_file)  # type: AlpsStation3DCoordinates
-        spatial_coordinates.df_coord.drop(cls.COORDINATE_Z, axis=1, inplace=True)
+        spatial_coordinates.df_all_coordinates.drop(cls.COORDINATE_Z, axis=1, inplace=True)
         return spatial_coordinates
 
 
diff --git a/spatio_temporal_dataset/coordinates/transformed_coordinates/transformed_coordinates.py b/spatio_temporal_dataset/coordinates/transformed_coordinates/transformed_coordinates.py
index 5022b244..fe34e194 100644
--- a/spatio_temporal_dataset/coordinates/transformed_coordinates/transformed_coordinates.py
+++ b/spatio_temporal_dataset/coordinates/transformed_coordinates/transformed_coordinates.py
@@ -7,8 +7,8 @@ class TransformedCoordinates(AbstractCoordinates):
     @classmethod
     def from_coordinates(cls, coordinates: AbstractCoordinates,
                          transformation_function: AbstractTransformation):
-        df_coordinates_transformed = coordinates.df_coord.copy()
+        df_coordinates_transformed = coordinates.df_all_coordinates.copy()
         df_coordinates_transformed = transformation_function.transform(df_coord=df_coordinates_transformed)
-        return cls(df_coord=df_coordinates_transformed, s_split=coordinates.s_split)
+        return cls(df_coord=df_coordinates_transformed, s_spatial_split=coordinates.s_spatial_split)
 
 
diff --git a/spatio_temporal_dataset/dataset/abstract_dataset.py b/spatio_temporal_dataset/dataset/abstract_dataset.py
index 4c7a7886..ed9bc236 100644
--- a/spatio_temporal_dataset/dataset/abstract_dataset.py
+++ b/spatio_temporal_dataset/dataset/abstract_dataset.py
@@ -21,7 +21,7 @@ class AbstractDataset(object):
         assert isinstance(slicer_class, type)
         self.observations = observations
         self.coordinates = coordinates
-        self.slicer = slicer_class(coordinates_train_ind=self.coordinates.train_ind,
+        self.slicer = slicer_class(coordinates_train_ind=self.coordinates.ind_train_spatial,
                                    observations_train_ind=self.observations.train_ind)  # type: AbstractSlicer
         assert isinstance(self.slicer, AbstractSlicer)
 
@@ -61,10 +61,12 @@ class AbstractDataset(object):
 
     # Coordinates wrapper
 
-    @property
     def coordinates_values(self, split: Split = Split.all) -> np.ndarray:
         return self.coordinates.coordinates_values(split=split)
 
+    def coordinates_index(self, split: Split= Split.all) -> pd.Index:
+        return self.coordinates.coordinate_index(split=split)
+
     # Slicer wrapper
 
     @property
diff --git a/spatio_temporal_dataset/slicer/abstract_slicer.py b/spatio_temporal_dataset/slicer/abstract_slicer.py
index b45df076..ab2673cd 100644
--- a/spatio_temporal_dataset/slicer/abstract_slicer.py
+++ b/spatio_temporal_dataset/slicer/abstract_slicer.py
@@ -37,16 +37,19 @@ class AbstractSlicer(object):
     def some_required_ind_are_not_defined(self):
         pass
 
-    def summary(self):
-        print('Slicer summary: \n')
+    def summary(self, show=True):
+        msg = ''
         for s, global_name in [(self.index_train_ind, "Spatial"), (self.column_train_ind, "Temporal")]:
-            print(global_name + ' split')
+            msg += global_name + ': '
             if s is None:
-                print('Not handled by this slicer')
+                msg += 'Not handled by this slicer'
             else:
                 for f, name in [(len, 'Total'), (sum, 'train')]:
-                    print("{}: {}".format(name, f(s)))
-        print('\n')
+                    msg += "{}: {} ".format(name, f(s))
+            msg += ' / '
+        if show:
+            print(msg)
+        return msg
 
     def loc_split(self, df: pd.DataFrame, split: Split):
         # split should belong to the list of split accepted by the slicer
diff --git a/spatio_temporal_dataset/spatio_temporal_observations/abstract_spatio_temporal_observations.py b/spatio_temporal_dataset/spatio_temporal_observations/abstract_spatio_temporal_observations.py
index 40e05099..e23525b7 100644
--- a/spatio_temporal_dataset/spatio_temporal_observations/abstract_spatio_temporal_observations.py
+++ b/spatio_temporal_dataset/spatio_temporal_observations/abstract_spatio_temporal_observations.py
@@ -1,3 +1,4 @@
+import os.path as op
 import pandas as pd
 import numpy as np
 
@@ -28,6 +29,17 @@ class AbstractSpatioTemporalObservations(object):
             assert s_split.isin([TRAIN_SPLIT_STR, TEST_SPLIT_STR]).all()
         self.s_split = s_split
 
+    @classmethod
+    def from_csv(cls, csv_path: str = None):
+        assert csv_path is not None
+        assert op.exists(csv_path)
+        df = pd.read_csv(csv_path)
+        # # Index correspond to the first column
+        # index_column_name = df.columns[0]
+        # assert index_column_name not in cls.coordinates_columns(df)
+        # df.set_index(index_column_name, inplace=True)
+        return cls.from_df(df)
+
     @property
     def _df_maxima(self) -> pd.DataFrame:
         if self.df_maxima_frech is not None:
diff --git a/spatio_temporal_dataset/spatio_temporal_observations/alps_precipitation_observations.py b/spatio_temporal_dataset/spatio_temporal_observations/alps_precipitation_observations.py
index 4311bc5f..d1bf3c89 100644
--- a/spatio_temporal_dataset/spatio_temporal_observations/alps_precipitation_observations.py
+++ b/spatio_temporal_dataset/spatio_temporal_observations/alps_precipitation_observations.py
@@ -1,5 +1,38 @@
+import os.path as op
+
+import pandas as pd
+
 from spatio_temporal_dataset.spatio_temporal_observations.abstract_spatio_temporal_observations import AbstractSpatioTemporalObservations
+from utils import get_full_path
 
 
 class AlpsPrecipitationObservations(AbstractSpatioTemporalObservations):
-    pass
\ No newline at end of file
+
+    RELATIVE_PATH = r'local/spatio_temporal_datasets/Gilles - precipitations'
+    FULL_PATH = get_full_path(relative_path=RELATIVE_PATH)
+
+    @classmethod
+    def from_csv(cls, csv_file='max_precip_3j'):
+        csv_path = op.join(cls.FULL_PATH, csv_file + '.csv')
+        return super().from_csv(csv_path)
+
+    @classmethod
+    def transform_gilles_txt_into_csv(cls):
+        filepath = op.join(cls.FULL_PATH, 'original data', 'max_precip_3j.txt')
+        station_to_coordinates = {}
+        # todo: be absolutely sure of the mapping, in the txt file of Gilles we do not have the coordinate
+        # this seems rather dangerous, especially if he loaded
+        # todo: re-do the whole analysis, and extraction myself about the snow
+        with open(filepath, 'r') as f:
+            for l in f:
+                _, station_name, coordinates = l.split('"')
+                coordinates = coordinates.split()
+                print(len(coordinates))
+                assert len(coordinates) == 55
+                station_to_coordinates[station_name] = coordinates
+        # df = pd.DataFrame.from_dict(data=station_to_coordinates, orient='index',
+        #                             columns=[cls.COORDINATE_X, cls.COORDINATE_Y, cls.COORDINATE_Z])
+        # print(df.head())
+        # filepath = op.join(cls.FULL_PATH, 'max_precip_3j.csv')
+        # assert not op.exists(filepath)
+        # df.to_csv(filepath)
\ No newline at end of file
-- 
GitLab
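
Editor's note: a minimal driver sketch of how the renamed fit_diagnosis experiment appears intended to be used, based only on the functions visible in this patch. The SplitCurve constructor arguments are an assumption (its __init__ is not shown in the diff); everything else reuses names from experiment/fit_diagnosis/main_split.py and experiment/fit_diagnosis/split_curve.py.

# Hypothetical driver for the fit_diagnosis experiment (not part of the patch).
# Assumption: SplitCurve is built from the dataset and the full estimator;
# its real constructor is not visible in this diff.
import random

from experiment.fit_diagnosis.main_split import load_dataset, full_estimator
from experiment.fit_diagnosis.split_curve import SplitCurve

random.seed(42)  # mirrors the seed added by this patch, for a reproducible train/test split

if __name__ == '__main__':
    dataset = load_dataset()             # 50 LinSpace points, 80/20 split with the SpatialSlicer
    dataset.slicer.summary()             # prints e.g. "Spatial: Total: ... train: ...  / Temporal: ..."
    estimator = full_estimator(dataset)  # Smith max-stable model + LinearAllParametersAllDimsMarginModel
    curve = SplitCurve(dataset, estimator)  # assumed signature
    curve.visualize()                    # 3x2 grid: margin curve + per-split error KDE for each GEV parameter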