diff --git a/extreme_estimator/estimator/full_estimator.py b/extreme_estimator/estimator/full_estimator.py index 20eda73f26b6d9af56b9b852033d6920f33b9119..aac5229c40fbf3d7460fabfa9ec1f9397d76520f 100644 --- a/extreme_estimator/estimator/full_estimator.py +++ b/extreme_estimator/estimator/full_estimator.py @@ -8,6 +8,7 @@ from extreme_estimator.estimator.abstract_estimator import AbstractEstimator from extreme_estimator.estimator.margin_estimator import SmoothMarginEstimator from extreme_estimator.estimator.max_stable_estimator import MaxStableEstimator from spatio_temporal_dataset.dataset.abstract_dataset import AbstractDataset +from spatio_temporal_dataset.dataset.spatio_temporal_split import SpatialTemporalSplit class AbstractFullEstimator(AbstractEstimator): @@ -41,11 +42,12 @@ class SmoothMarginalsThenUnitaryMsp(AbstractFullEstimator): # Estimate the margin parameters self.margin_estimator.fit() # Compute the maxima_frech - maxima_frech = AbstractMarginModel.gev2frech(maxima_gev=self.dataset.maxima_gev, + maxima_gev_train = self.dataset.maxima_gev(split=SpatialTemporalSplit.train) + maxima_frech = AbstractMarginModel.gev2frech(maxima_gev=maxima_gev_train, coordinates_values=self.dataset.coordinates_values, margin_function=self.margin_estimator.margin_function_fitted) # Update maxima frech field through the dataset object - self.dataset.maxima_frech = maxima_frech + self.dataset.set_maxima_frech(maxima_frech, split=SpatialTemporalSplit.train) # Estimate the max stable parameters self.max_stable_estimator.fit() @@ -68,7 +70,7 @@ class FullEstimatorInASingleStepWithSmoothMargin(AbstractFullEstimator): def _fit(self): # Estimate both the margin and the max-stable structure self.full_params_fitted = self.max_stable_model.fitmaxstab( - maxima_gev=self.dataset.maxima_gev, + maxima_gev=self.dataset.maxima_gev(split=SpatialTemporalSplit.train), df_coordinates=self.dataset.df_coordinates, fit_marge=True, fit_marge_form_dict=self.linear_margin_function_to_fit.form_dict, 
diff --git a/extreme_estimator/estimator/margin_estimator.py b/extreme_estimator/estimator/margin_estimator.py index 4453d868ea19cf65434a296035ee7fb876e5586a..6941fecfe9f49c22827d1ef06623f9010f491ab3 100644 --- a/extreme_estimator/estimator/margin_estimator.py +++ b/extreme_estimator/estimator/margin_estimator.py @@ -4,13 +4,14 @@ from extreme_estimator.extreme_models.margin_model.margin_function.abstract_marg AbstractMarginFunction from extreme_estimator.extreme_models.margin_model.smooth_margin_model import LinearMarginModel from spatio_temporal_dataset.dataset.abstract_dataset import AbstractDataset +from spatio_temporal_dataset.dataset.spatio_temporal_split import SpatialTemporalSplit class AbstractMarginEstimator(AbstractEstimator): def __init__(self, dataset: AbstractDataset): super().__init__(dataset) - assert self.dataset.maxima_gev is not None + assert self.dataset.maxima_gev() is not None self._margin_function_fitted = None @property @@ -32,5 +33,7 @@ class SmoothMarginEstimator(AbstractMarginEstimator): self.margin_model = margin_model def _fit(self): - self._margin_function_fitted = self.margin_model.fitmargin_from_maxima_gev(maxima_gev=self.dataset.maxima_gev, - coordinates_values=self.dataset.coordinates_values) + maxima_gev = self.dataset.maxima_gev(split=SpatialTemporalSplit.train) + corodinate_values = self.dataset.coordinates_values + self._margin_function_fitted = self.margin_model.fitmargin_from_maxima_gev(maxima_gev=maxima_gev, + coordinates_values=corodinate_values) diff --git a/extreme_estimator/estimator/max_stable_estimator.py b/extreme_estimator/estimator/max_stable_estimator.py index d83809a3cd2a3c8107732710f234707688829cda..dff88648bc183ede968a87e43df2b92f9de63dc4 100644 --- a/extreme_estimator/estimator/max_stable_estimator.py +++ b/extreme_estimator/estimator/max_stable_estimator.py @@ -3,6 +3,8 @@ from extreme_estimator.extreme_models.max_stable_model.abstract_max_stable_model from spatio_temporal_dataset.dataset.abstract_dataset 
import AbstractDataset import numpy as np +from spatio_temporal_dataset.dataset.spatio_temporal_split import SpatialTemporalSplit + class AbstractMaxStableEstimator(AbstractEstimator): @@ -16,9 +18,9 @@ class AbstractMaxStableEstimator(AbstractEstimator): class MaxStableEstimator(AbstractMaxStableEstimator): def _fit(self): - assert self.dataset.maxima_frech is not None + assert self.dataset.maxima_frech(split=SpatialTemporalSplit.train) is not None self.max_stable_params_fitted = self.max_stable_model.fitmaxstab( - maxima_frech=self.dataset.maxima_frech, + maxima_frech=self.dataset.maxima_frech(split=SpatialTemporalSplit.train), df_coordinates=self.dataset.coordinates.df_coordinates) def _error(self, true_max_stable_params: dict): diff --git a/spatio_temporal_dataset/coordinates/abstract_coordinates.py b/spatio_temporal_dataset/coordinates/abstract_coordinates.py index 68f997d63e070c9c2d492db182bfe6e28f18ccd8..dc18803bd359032720de6f1eaf64d7c7589045e5 100644 --- a/spatio_temporal_dataset/coordinates/abstract_coordinates.py +++ b/spatio_temporal_dataset/coordinates/abstract_coordinates.py @@ -13,14 +13,16 @@ class AbstractCoordinates(object): COORDINATE_Y = 'coord_y' COORDINATE_Z = 'coord_z' COORDINATE_NAMES = [COORDINATE_X, COORDINATE_Y, COORDINATE_Z] - COORD_SPLIT = 'coord_split' - # Constants + COORDINATE_SPLIT = 'coord_split' + # Constants for the split column TRAIN_SPLIT_STR = 'train_split' TEST_SPLIT_STR = 'test_split' def __init__(self, df_coordinates: pd.DataFrame, s_split: pd.Series = None): - self.df_coordinates = df_coordinates - self.s_split = s_split + self.df_coordinates = df_coordinates # type: pd.DataFrame + self.s_split = s_split # type: pd.Series + + # ClassMethod constructor @classmethod def from_df(cls, df: pd.DataFrame): @@ -28,9 +30,30 @@ class AbstractCoordinates(object): assert cls.COORDINATE_X in df.columns df_coordinates = df.loc[:, cls.coordinates_columns(df)] # Potentially, a split column can be specified - s_split = df[cls.COORD_SPLIT] 
if cls.COORD_SPLIT in df.columns else None + s_split = df[cls.COORDINATE_SPLIT] if cls.COORDINATE_SPLIT in df.columns else None + if s_split is not None: + assert s_split.isin([cls.TRAIN_SPLIT_STR, cls.TEST_SPLIT_STR]) return cls(df_coordinates=df_coordinates, s_split=s_split) + @classmethod + def from_csv(cls, csv_path: str = None): + assert csv_path is not None + assert op.exists(csv_path) + df = pd.read_csv(csv_path) + return cls.from_df(df) + + @classmethod + def from_nb_points(cls, nb_points: int, **kwargs): + # Call the default class method from csv + coordinates = cls.from_csv() # type: AbstractCoordinates + # Sample randomly nb_points coordinates + nb_coordinates = len(coordinates) + if nb_points > nb_coordinates: + raise Exception('Nb coordinates in csv: {} < Nb points desired: {}'.format(nb_coordinates, nb_points)) + else: + df_sample = pd.DataFrame.sample(coordinates.df, n=nb_points) + return cls.from_df(df=df_sample) + @classmethod def coordinates_columns(cls, df_coord: pd.DataFrame) -> List[str]: coord_columns = [cls.COORDINATE_X] @@ -52,25 +75,6 @@ class AbstractCoordinates(object): # Merged DataFrame of df_coord and s_split return self.df_coordinates if self.s_split is None else self.df_coordinates.join(self.s_split) - @classmethod - def from_csv(cls, csv_path: str = None): - assert csv_path is not None - assert op.exists(csv_path) - df = pd.read_csv(csv_path) - return cls.from_df(df) - - @classmethod - def from_nb_points(cls, nb_points: int, **kwargs): - # Call the default class method from csv - coordinates = cls.from_csv() # type: AbstractCoordinates - # Sample randomly nb_points coordinates - nb_coordinates = len(coordinates) - if nb_points > nb_coordinates: - raise Exception('Nb coordinates in csv: {} < Nb points desired: {}'.format(nb_coordinates, nb_points)) - else: - df_sample = pd.DataFrame.sample(coordinates.df, n=nb_points) - return cls.from_df(df=df_sample) - def df_coordinates_split(self, split_str: str) -> pd.DataFrame: assert 
self.s_split is not None ind = self.s_split == split_str @@ -92,16 +96,15 @@ class AbstractCoordinates(object): return self.df_coordinates.loc[:, self.COORDINATE_Y].values.copy() @property - def coordinates_train(self) -> np.ndarray: - return self._coordinates_values(df_coordinates=self.df_coordinates_split(self.TRAIN_SPLIT_STR)) - - @property - def coordinates_test(self) -> np.ndarray: - return self._coordinates_values(df_coordinates=self.df_coordinates_split(self.TEST_SPLIT_STR)) + def index(self) -> pd.Series: + return self.df_coordinates.index @property - def index(self): - return self.df_coordinates.index + def train_ind(self) -> pd.Series: + if self.s_split is None: + return None + else: + return self.s_split.isin([self.TRAIN_SPLIT_STR]) # Visualization diff --git a/spatio_temporal_dataset/dataset/abstract_dataset.py b/spatio_temporal_dataset/dataset/abstract_dataset.py index f42453efab75cb81dd6022e294feb88f7bcaa876..f17d9610e1e95f75a4773d4688fd20c9c6558391 100644 --- a/spatio_temporal_dataset/dataset/abstract_dataset.py +++ b/spatio_temporal_dataset/dataset/abstract_dataset.py @@ -2,6 +2,8 @@ import os import numpy as np import os.path as op import pandas as pd + +from spatio_temporal_dataset.dataset.spatio_temporal_split import SpatialTemporalSplit, SpatioTemporalSlicer from spatio_temporal_dataset.temporal_observations.abstract_temporal_observations import AbstractTemporalObservations from spatio_temporal_dataset.coordinates.abstract_coordinates import AbstractCoordinates @@ -13,6 +15,8 @@ class AbstractDataset(object): # assert is_same_index.all() self.temporal_observations = temporal_observations self.coordinates = coordinates + self.spatio_temporal_slicer = SpatioTemporalSlicer(coordinate_train_ind=self.coordinates.train_ind, + observation_train_ind=self.temporal_observations.train_ind) @classmethod def from_csv(cls, csv_path: str): @@ -41,15 +45,11 @@ class AbstractDataset(object): def coordinates_values(self): return 
from enum import Enum

import pandas as pd


class SpatialTemporalSplit(Enum):
    """Sub-blocks of a DataFrame that a SpatioTemporalSlicer can select."""
    # Every row and every column
    all = 0
    # Train rows and train columns
    train = 1
    # Test rows and test columns
    test = 2
    # Train rows and test columns
    test_temporal = 3
    # Test rows and train columns
    test_spatial = 4


class SpatioTemporalSlicer(object):
    """Select a train/test sub-block of a DataFrame from two boolean masks.

    ``coordinate_train_ind`` flags the rows that belong to the train split and
    ``observation_train_ind`` flags the columns that belong to the train split.
    Both masks are either provided together or both left to None, in which case
    the slicer always hands back the full DataFrame.
    """

    def __init__(self, coordinate_train_ind: pd.Series, observation_train_ind: pd.Series):
        self.index_train_ind = coordinate_train_ind  # type: pd.Series
        self.column_train_ind = observation_train_ind  # type: pd.Series
        if self.ind_are_not_defined:
            # A half-defined split is a configuration error: either both masks or none
            assert self.index_train_ind is None and self.column_train_ind is None, "One split was not defined"

    @property
    def index_test_ind(self) -> pd.Series:
        """Boolean mask of the test rows, or None when no row split is defined."""
        if self.index_train_ind is None:
            return None
        return ~self.index_train_ind

    @property
    def column_test_ind(self) -> pd.Series:
        """Boolean mask of the test columns, or None when no column split is defined."""
        if self.column_train_ind is None:
            return None
        return ~self.column_train_ind

    @property
    def ind_are_not_defined(self) -> bool:
        """True when at least one of the two train masks is missing."""
        return self.index_train_ind is None or self.column_train_ind is None

    def loc_split(self, df: pd.DataFrame, split: SpatialTemporalSplit):
        """Return the sub-block of ``df`` that corresponds to ``split``.

        :param df: DataFrame whose index/columns must be identical to the index
            of the row mask / column mask.
        :param split: which block to extract.
        :return: ``df`` itself when no split is defined or ``split`` is ``all``;
            otherwise a new DataFrame holding the selected rows and columns.
        :raises ValueError: when ``df`` is not aligned with the masks.
        """
        assert isinstance(split, SpatialTemporalSplit)
        # By default, if one of the two split is not defined we return all the data
        if self.ind_are_not_defined or split is SpatialTemporalSplit.all:
            return df
        # `==` between two Index objects is element-wise, so asserting on it raises
        # for anything longer than one label; `equals` gives the single boolean we need
        if not df.columns.equals(self.column_train_ind.index):
            raise ValueError('DataFrame columns do not match the index of the observation split')
        if not df.index.equals(self.index_train_ind.index):
            raise ValueError('DataFrame index does not match the index of the coordinate split')
        use_train_rows = split in (SpatialTemporalSplit.train, SpatialTemporalSplit.test_temporal)
        use_train_columns = split in (SpatialTemporalSplit.train, SpatialTemporalSplit.test_spatial)
        row_mask = self.index_train_ind if use_train_rows else self.index_test_ind
        column_mask = self.column_train_ind if use_train_columns else self.column_test_ind
        # Alignment was checked above, so the raw boolean arrays can be used positionally
        return df.loc[row_mask.values, column_mask.values]
class AbstractTemporalObservations(object):
    """Hold the maxima observed at each station, on the Frechet and/or GEV scale."""

    # Constants for the split column
    TRAIN_SPLIT_STR = 'train_split'
    TEST_SPLIT_STR = 'test_split'

    def __init__(self, df_maxima_frech: pd.DataFrame = None, df_maxima_gev: pd.DataFrame = None,
                 s_split: pd.Series = None):
        """
        Main attribute of the class is the DataFrame df_maxima
        Index are stations index
        Columns are the temporal moment of the maxima

        :param df_maxima_frech: maxima on the Frechet scale (may be None)
        :param df_maxima_gev: maxima on the GEV scale (may be None)
        :param s_split: optional Series whose values are TRAIN_SPLIT_STR or TEST_SPLIT_STR
        :raises ValueError: if s_split contains any other value
        """
        if s_split is not None:
            # `isin` returns a boolean Series: it has to be reduced with `all()`,
            # asserting on the Series itself raises for any length other than one
            if not s_split.isin([self.TRAIN_SPLIT_STR, self.TEST_SPLIT_STR]).all():
                raise ValueError('s_split values must be {} or {}'.format(self.TRAIN_SPLIT_STR,
                                                                         self.TEST_SPLIT_STR))
        self.s_split = s_split
        self.df_maxima_frech = df_maxima_frech
        self.df_maxima_gev = df_maxima_gev

    # NOTE(review): the line right above this definition is not visible in the diff hunk;
    # the `cls` first parameter implies it is a @classmethod - confirm against the full file
    @classmethod
    def from_df(cls, df):
        pass

    @staticmethod
    def df_maxima(df: pd.DataFrame, split: SpatialTemporalSplit = SpatialTemporalSplit.all,
                  slicer: SpatioTemporalSlicer = None):
        """Return the part of ``df`` selected by ``split``.

        Without a slicer only the ``all`` split makes sense and ``df`` is returned as is.
        A missing DataFrame (None) is passed through as None.
        """
        if slicer is None:
            assert split is SpatialTemporalSplit.all
            return df
        if df is None:
            return None
        return slicer.loc_split(df, split)

    def maxima_gev(self, split: SpatialTemporalSplit = SpatialTemporalSplit.all, slicer: SpatioTemporalSlicer = None):
        """Values of the GEV maxima for ``split``, or None when no GEV maxima are stored."""
        df = self.df_maxima(self.df_maxima_gev, split, slicer)
        # Callers check the result against None, so do not fail on `.values` here
        return None if df is None else df.values

    def maxima_frech(self, split: SpatialTemporalSplit = SpatialTemporalSplit.all, slicer: SpatioTemporalSlicer = None):
        """Values of the Frechet maxima for ``split``, or None when no Frechet maxima are stored."""
        df = self.df_maxima(self.df_maxima_frech, split, slicer)
        return None if df is None else df.values

    def set_maxima_frech(self, maxima_frech_values: np.ndarray, split: SpatialTemporalSplit = SpatialTemporalSplit.all,
                         slicer: SpatioTemporalSlicer = None):
        """Store ``maxima_frech_values`` in the cells of df_maxima_frech selected by ``split``.

        When df_maxima_frech does not exist yet, it is created with the index and the
        columns of df_maxima_gev; cells outside of ``split`` are then left to NaN.
        When the whole frame is selected, df_maxima_frech is replaced by a new DataFrame.

        :raises ValueError: if there is no DataFrame to take the layout from, or if the
            shape of the values differs from the shape of the selected block
        """
        values = np.asarray(maxima_frech_values)
        template = self.df_maxima_frech if self.df_maxima_frech is not None else self.df_maxima_gev
        if template is None:
            raise ValueError('Neither df_maxima_frech nor df_maxima_gev is defined')
        # The selected block is only used for its shape and labels: slicing with boolean
        # masks yields a copy, so writing into it would never reach the stored DataFrame
        block = self.df_maxima(template, split, slicer)
        if values.shape != block.shape:
            raise ValueError('Shape of the values {} differs from the shape of the split {}'.format(values.shape,
                                                                                                  block.shape))
        if block.shape == template.shape:
            # The whole frame is selected: rebuild it, which also avoids any dtype clash
            self.df_maxima_frech = pd.DataFrame(data=values, index=template.index, columns=template.columns)
            return
        if self.df_maxima_frech is None:
            self.df_maxima_frech = pd.DataFrame(data=np.nan, index=template.index, columns=template.columns)
        # Write through the labels of the block (assumes index and column labels are unique)
        self.df_maxima_frech.loc[block.index, block.columns] = values

    @property
    def train_ind(self) -> pd.Series:
        """Boolean mask of the entries of s_split tagged as train, or None without split."""
        if self.s_split is None:
            return None
        else:
            return self.s_split.isin([self.TRAIN_SPLIT_STR])
a/spatio_temporal_dataset/temporal_observations/annual_maxima_observations.py b/spatio_temporal_dataset/temporal_observations/annual_maxima_observations.py index b159bbba36516e71689f7b397e88f7bb6e510904..a197687c57b32592ad3cb59fbda5d277342b1c19 100644 --- a/spatio_temporal_dataset/temporal_observations/annual_maxima_observations.py +++ b/spatio_temporal_dataset/temporal_observations/annual_maxima_observations.py @@ -40,7 +40,7 @@ class FullAnnualMaxima(MaxStableAnnualMaxima): coordinates: AbstractCoordinates, margin_model: AbstractMarginModel): max_stable_annual_maxima = super().from_sampling(nb_obs, max_stable_model, coordinates) # Compute df_maxima_gev from df_maxima_frech - maxima_gev = margin_model.rmargin_from_maxima_frech(maxima_frech=max_stable_annual_maxima.maxima_frech, + maxima_gev = margin_model.rmargin_from_maxima_frech(maxima_frech=max_stable_annual_maxima.maxima_frech(), coordinates_values=coordinates.coordinates_values) max_stable_annual_maxima.df_maxima_gev = pd.DataFrame(data=maxima_gev, index=coordinates.index) return max_stable_annual_maxima diff --git a/test/test_spatio_temporal_dataset/test_temporal_observations.py b/test/test_spatio_temporal_dataset/test_temporal_observations.py index 3e0657dac3e44e2853d2655147b8a83da4809e60..204b96060d238adce36653367bf740ea3cbabdd5 100644 --- a/test/test_spatio_temporal_dataset/test_temporal_observations.py +++ b/test/test_spatio_temporal_dataset/test_temporal_observations.py @@ -1,9 +1,22 @@ import unittest +import numpy as np + +import pandas as pd + +from spatio_temporal_dataset.temporal_observations.abstract_temporal_observations import AbstractTemporalObservations class TestTemporalObservations(unittest.TestCase): DISPLAY = False + def test_set_maxima_gev(self): + df = pd.DataFrame.from_dict({'ok': [2, 5]}) + temporal_observation = AbstractTemporalObservations(df_maxima_frech=df) + example = np.array([[3], [6]]) + temporal_observation.set_maxima_frech(maxima_frech_values=example) + maxima_frech = 
temporal_observation.maxima_frech() + self.assertTrue(np.equal(example, maxima_frech).all(), msg="{} {}".format(example, maxima_frech)) + if __name__ == '__main__': unittest.main() diff --git a/test/test_unitary/test_rmaxstab/test_rmaxstab_with_margin.py b/test/test_unitary/test_rmaxstab/test_rmaxstab_with_margin.py index fac7b39364ebf729fcac6f33d46f37d49349bc73..1db26310de084ddf2c1d8f9ad1234eee4951a5e8 100644 --- a/test/test_unitary/test_rmaxstab/test_rmaxstab_with_margin.py +++ b/test/test_unitary/test_rmaxstab/test_rmaxstab_with_margin.py @@ -51,7 +51,7 @@ class TestRMaxStabWithMarginConstant(TestUnitaryAbstract): @property def python_output(self): dataset = self.python_code() - return np.sum(dataset.maxima_gev) + return np.sum(dataset.maxima_gev()) def test_rmaxstab_with_constant_margin(self): self.compare() @@ -96,7 +96,7 @@ class TestRMaxStabWithLinearMargin(TestUnitaryAbstract): @property def python_output(self): dataset = self.python_code() - return np.sum(dataset.maxima_gev) + return np.sum(dataset.maxima_gev()) def test_rmaxstab_with_linear_margin(self): self.compare() diff --git a/test/test_unitary/test_rmaxstab/test_rmaxstab_without_margin.py b/test/test_unitary/test_rmaxstab/test_rmaxstab_without_margin.py index c97a072610c59080b26fd3708dfa5a7d94a2d6da..3d522b8194c877b976b3fcd352a61383cf9dbb29 100644 --- a/test/test_unitary/test_rmaxstab/test_rmaxstab_without_margin.py +++ b/test/test_unitary/test_rmaxstab/test_rmaxstab_without_margin.py @@ -37,7 +37,7 @@ class TestRMaxStab(TestUnitaryAbstract): coordinates, max_stable_model = self.python_code() m = MaxStableAnnualMaxima.from_sampling(nb_obs=40, max_stable_model=max_stable_model, coordinates=coordinates) # TODO: understand why the array are not in the same order - return np.sum(m.maxima_frech) + return np.sum(m.maxima_frech()) def test_rmaxstab(self): self.compare()