Commit 5ebb6469 authored by Le Roux Erwan's avatar Le Roux Erwan
Browse files

[DATASET] add spatio_temporal_split. add test. refactor code accordingly.

parent 219ab0c4
No related merge requests found
Showing with 158 additions and 76 deletions
+158 -76
......@@ -8,6 +8,7 @@ from extreme_estimator.estimator.abstract_estimator import AbstractEstimator
from extreme_estimator.estimator.margin_estimator import SmoothMarginEstimator
from extreme_estimator.estimator.max_stable_estimator import MaxStableEstimator
from spatio_temporal_dataset.dataset.abstract_dataset import AbstractDataset
from spatio_temporal_dataset.dataset.spatio_temporal_split import SpatialTemporalSplit
class AbstractFullEstimator(AbstractEstimator):
......@@ -41,11 +42,12 @@ class SmoothMarginalsThenUnitaryMsp(AbstractFullEstimator):
# Estimate the margin parameters
self.margin_estimator.fit()
# Compute the maxima_frech
maxima_frech = AbstractMarginModel.gev2frech(maxima_gev=self.dataset.maxima_gev,
maxima_gev_train = self.dataset.maxima_gev(split=SpatialTemporalSplit.train)
maxima_frech = AbstractMarginModel.gev2frech(maxima_gev=maxima_gev_train,
coordinates_values=self.dataset.coordinates_values,
margin_function=self.margin_estimator.margin_function_fitted)
# Update maxima frech field through the dataset object
self.dataset.maxima_frech = maxima_frech
self.dataset.set_maxima_frech(maxima_frech, split=SpatialTemporalSplit.train)
# Estimate the max stable parameters
self.max_stable_estimator.fit()
......@@ -68,7 +70,7 @@ class FullEstimatorInASingleStepWithSmoothMargin(AbstractFullEstimator):
def _fit(self):
# Estimate both the margin and the max-stable structure
self.full_params_fitted = self.max_stable_model.fitmaxstab(
maxima_gev=self.dataset.maxima_gev,
maxima_gev=self.dataset.maxima_gev(split=SpatialTemporalSplit.train),
df_coordinates=self.dataset.df_coordinates,
fit_marge=True,
fit_marge_form_dict=self.linear_margin_function_to_fit.form_dict,
......
......@@ -4,13 +4,14 @@ from extreme_estimator.extreme_models.margin_model.margin_function.abstract_marg
AbstractMarginFunction
from extreme_estimator.extreme_models.margin_model.smooth_margin_model import LinearMarginModel
from spatio_temporal_dataset.dataset.abstract_dataset import AbstractDataset
from spatio_temporal_dataset.dataset.spatio_temporal_split import SpatialTemporalSplit
class AbstractMarginEstimator(AbstractEstimator):
def __init__(self, dataset: AbstractDataset):
super().__init__(dataset)
assert self.dataset.maxima_gev is not None
assert self.dataset.maxima_gev() is not None
self._margin_function_fitted = None
@property
......@@ -32,5 +33,7 @@ class SmoothMarginEstimator(AbstractMarginEstimator):
self.margin_model = margin_model
def _fit(self):
self._margin_function_fitted = self.margin_model.fitmargin_from_maxima_gev(maxima_gev=self.dataset.maxima_gev,
coordinates_values=self.dataset.coordinates_values)
maxima_gev = self.dataset.maxima_gev(split=SpatialTemporalSplit.train)
corodinate_values = self.dataset.coordinates_values
self._margin_function_fitted = self.margin_model.fitmargin_from_maxima_gev(maxima_gev=maxima_gev,
coordinates_values=corodinate_values)
......@@ -3,6 +3,8 @@ from extreme_estimator.extreme_models.max_stable_model.abstract_max_stable_model
from spatio_temporal_dataset.dataset.abstract_dataset import AbstractDataset
import numpy as np
from spatio_temporal_dataset.dataset.spatio_temporal_split import SpatialTemporalSplit
class AbstractMaxStableEstimator(AbstractEstimator):
......@@ -16,9 +18,9 @@ class AbstractMaxStableEstimator(AbstractEstimator):
class MaxStableEstimator(AbstractMaxStableEstimator):
def _fit(self):
assert self.dataset.maxima_frech is not None
assert self.dataset.maxima_frech(split=SpatialTemporalSplit.train) is not None
self.max_stable_params_fitted = self.max_stable_model.fitmaxstab(
maxima_frech=self.dataset.maxima_frech,
maxima_frech=self.dataset.maxima_frech(split=SpatialTemporalSplit.train),
df_coordinates=self.dataset.coordinates.df_coordinates)
def _error(self, true_max_stable_params: dict):
......
......@@ -13,14 +13,16 @@ class AbstractCoordinates(object):
COORDINATE_Y = 'coord_y'
COORDINATE_Z = 'coord_z'
COORDINATE_NAMES = [COORDINATE_X, COORDINATE_Y, COORDINATE_Z]
COORD_SPLIT = 'coord_split'
# Constants
COORDINATE_SPLIT = 'coord_split'
# Constants for the split column
TRAIN_SPLIT_STR = 'train_split'
TEST_SPLIT_STR = 'test_split'
def __init__(self, df_coordinates: pd.DataFrame, s_split: pd.Series = None):
self.df_coordinates = df_coordinates
self.s_split = s_split
self.df_coordinates = df_coordinates # type: pd.DataFrame
self.s_split = s_split # type: pd.Series
# ClassMethod constructor
@classmethod
def from_df(cls, df: pd.DataFrame):
......@@ -28,9 +30,30 @@ class AbstractCoordinates(object):
assert cls.COORDINATE_X in df.columns
df_coordinates = df.loc[:, cls.coordinates_columns(df)]
# Potentially, a split column can be specified
s_split = df[cls.COORD_SPLIT] if cls.COORD_SPLIT in df.columns else None
s_split = df[cls.COORDINATE_SPLIT] if cls.COORDINATE_SPLIT in df.columns else None
if s_split is not None:
assert s_split.isin([cls.TRAIN_SPLIT_STR, cls.TEST_SPLIT_STR])
return cls(df_coordinates=df_coordinates, s_split=s_split)
@classmethod
def from_csv(cls, csv_path: str = None):
assert csv_path is not None
assert op.exists(csv_path)
df = pd.read_csv(csv_path)
return cls.from_df(df)
@classmethod
def from_nb_points(cls, nb_points: int, **kwargs):
# Call the default class method from csv
coordinates = cls.from_csv() # type: AbstractCoordinates
# Sample randomly nb_points coordinates
nb_coordinates = len(coordinates)
if nb_points > nb_coordinates:
raise Exception('Nb coordinates in csv: {} < Nb points desired: {}'.format(nb_coordinates, nb_points))
else:
df_sample = pd.DataFrame.sample(coordinates.df, n=nb_points)
return cls.from_df(df=df_sample)
@classmethod
def coordinates_columns(cls, df_coord: pd.DataFrame) -> List[str]:
coord_columns = [cls.COORDINATE_X]
......@@ -52,25 +75,6 @@ class AbstractCoordinates(object):
# Merged DataFrame of df_coord and s_split
return self.df_coordinates if self.s_split is None else self.df_coordinates.join(self.s_split)
@classmethod
def from_csv(cls, csv_path: str = None):
assert csv_path is not None
assert op.exists(csv_path)
df = pd.read_csv(csv_path)
return cls.from_df(df)
@classmethod
def from_nb_points(cls, nb_points: int, **kwargs):
# Call the default class method from csv
coordinates = cls.from_csv() # type: AbstractCoordinates
# Sample randomly nb_points coordinates
nb_coordinates = len(coordinates)
if nb_points > nb_coordinates:
raise Exception('Nb coordinates in csv: {} < Nb points desired: {}'.format(nb_coordinates, nb_points))
else:
df_sample = pd.DataFrame.sample(coordinates.df, n=nb_points)
return cls.from_df(df=df_sample)
def df_coordinates_split(self, split_str: str) -> pd.DataFrame:
assert self.s_split is not None
ind = self.s_split == split_str
......@@ -92,16 +96,15 @@ class AbstractCoordinates(object):
return self.df_coordinates.loc[:, self.COORDINATE_Y].values.copy()
@property
def coordinates_train(self) -> np.ndarray:
return self._coordinates_values(df_coordinates=self.df_coordinates_split(self.TRAIN_SPLIT_STR))
@property
def coordinates_test(self) -> np.ndarray:
return self._coordinates_values(df_coordinates=self.df_coordinates_split(self.TEST_SPLIT_STR))
def index(self) -> pd.Series:
return self.df_coordinates.index
@property
def index(self):
return self.df_coordinates.index
def train_ind(self) -> pd.Series:
if self.s_split is None:
return None
else:
return self.s_split.isin([self.TRAIN_SPLIT_STR])
# Visualization
......
......@@ -2,6 +2,8 @@ import os
import numpy as np
import os.path as op
import pandas as pd
from spatio_temporal_dataset.dataset.spatio_temporal_split import SpatialTemporalSplit, SpatioTemporalSlicer
from spatio_temporal_dataset.temporal_observations.abstract_temporal_observations import AbstractTemporalObservations
from spatio_temporal_dataset.coordinates.abstract_coordinates import AbstractCoordinates
......@@ -13,6 +15,8 @@ class AbstractDataset(object):
# assert is_same_index.all()
self.temporal_observations = temporal_observations
self.coordinates = coordinates
self.spatio_temporal_slicer = SpatioTemporalSlicer(coordinate_train_ind=self.coordinates.train_ind,
observation_train_ind=self.temporal_observations.train_ind)
@classmethod
def from_csv(cls, csv_path: str):
......@@ -41,15 +45,11 @@ class AbstractDataset(object):
def coordinates_values(self):
return self.coordinates.coordinates_values
@property
def maxima_gev(self) -> np.ndarray:
return self.temporal_observations.maxima_gev
@property
def maxima_frech(self):
return self.temporal_observations.maxima_frech
def maxima_gev(self, split: SpatialTemporalSplit = SpatialTemporalSplit.all) -> np.ndarray:
return self.temporal_observations.maxima_gev(split, self.spatio_temporal_slicer)
@maxima_frech.setter
def maxima_frech(self, maxima_frech_to_set):
self.temporal_observations.maxima_frech = maxima_frech_to_set
def maxima_frech(self, split: SpatialTemporalSplit = SpatialTemporalSplit.all) -> np.ndarray:
return self.temporal_observations.maxima_frech(split, self.spatio_temporal_slicer)
def set_maxima_frech(self, maxima_frech_values: np.ndarray, split: SpatialTemporalSplit = SpatialTemporalSplit.all):
self.temporal_observations.set_maxima_frech(maxima_frech_values, split, self.spatio_temporal_slicer)
\ No newline at end of file
from enum import Enum
import pandas as pd
class SpatialTemporalSplit(Enum):
all = 0
train = 1
test = 2
test_temporal = 3
test_spatial = 4
class SpatioTemporalSlicer(object):
def __init__(self, coordinate_train_ind: pd.Series, observation_train_ind: pd.Series):
self.index_train_ind = coordinate_train_ind # type: pd.Series
self.column_train_ind = observation_train_ind # type: pd.Series
if self.ind_are_not_defined:
assert self.index_train_ind is None and self.column_train_ind is None, "One split was not defined"
@property
def index_test_ind(self) -> pd.Series:
return ~self.index_train_ind
@property
def column_test_ind(self) -> pd.Series:
return ~self.column_train_ind
@property
def ind_are_not_defined(self):
return self.index_train_ind is None or self.column_train_ind is None
def loc_split(self, df: pd.DataFrame, split: SpatialTemporalSplit):
assert isinstance(split, SpatialTemporalSplit)
# By default, if one of the two split is not defined we return all the data
if self.ind_are_not_defined or split is SpatialTemporalSplit.all:
return df
assert df.columns == self.column_train_ind.index
assert df.index == self.index_train_ind.index
if split is SpatialTemporalSplit.train:
return df.loc[self.index_train_ind, self.column_train_ind]
elif split is SpatialTemporalSplit.test:
return df.loc[self.index_test_ind, self.column_test_ind]
elif split is SpatialTemporalSplit.test_spatial:
return df.loc[self.index_test_ind, self.column_train_ind]
elif split is SpatialTemporalSplit.test_temporal:
return df.loc[self.index_train_ind, self.column_test_ind]
import pandas as pd
import numpy as np
from spatio_temporal_dataset.dataset.spatio_temporal_split import SpatialTemporalSplit, SpatioTemporalSlicer
class AbstractTemporalObservations(object):
def __init__(self, df_maxima_frech: pd.DataFrame = None, df_maxima_gev: pd.DataFrame = None):
# Constants for the split column
TRAIN_SPLIT_STR = 'train_split'
TEST_SPLIT_STR = 'test_split'
def __init__(self, df_maxima_frech: pd.DataFrame = None, df_maxima_gev: pd.DataFrame = None,
s_split: pd.Series = None):
"""
Main attribute of the class is the DataFrame df_maxima
Index are stations index
Columns are the temporal moment of the maxima
"""
if s_split is not None:
assert s_split.isin([self.TRAIN_SPLIT_STR, self.TEST_SPLIT_STR])
self.s_split = s_split
self.df_maxima_frech = df_maxima_frech
self.df_maxima_gev = df_maxima_gev
......@@ -16,29 +27,29 @@ class AbstractTemporalObservations(object):
def from_df(cls, df):
pass
@property
def maxima_gev(self):
return self.df_maxima_gev.values
@staticmethod
def df_maxima(df: pd.DataFrame, split: SpatialTemporalSplit = SpatialTemporalSplit.all,
slicer: SpatioTemporalSlicer = None):
if slicer is None:
assert split is SpatialTemporalSplit.all
return df
else:
return slicer.loc_split(df, split)
@property
def maxima_frech(self):
return self.df_maxima_frech.values
def maxima_gev(self, split: SpatialTemporalSplit = SpatialTemporalSplit.all, slicer: SpatioTemporalSlicer = None):
return self.df_maxima(self.df_maxima_gev, split, slicer).values
@maxima_frech.setter
def maxima_frech(self, maxima_frech_to_set):
assert maxima_frech_to_set is not None
assert maxima_frech_to_set.shape == self.maxima_gev.shape
self.df_maxima_frech = pd.DataFrame(data=maxima_frech_to_set,
index=self.df_maxima_gev.index,
columns=self.df_maxima_gev.columns)
def maxima_frech(self, split: SpatialTemporalSplit = SpatialTemporalSplit.all, slicer: SpatioTemporalSlicer = None):
return self.df_maxima(self.df_maxima_frech, split, slicer).values
@property
def column_to_time_index(self):
pass
def set_maxima_frech(self, maxima_frech_values: np.ndarray, split: SpatialTemporalSplit = SpatialTemporalSplit.all,
slicer: SpatioTemporalSlicer = None):
df = self.df_maxima(self.df_maxima_frech, split, slicer)
df.loc[:] = maxima_frech_values
@property
def index(self):
return self.df_maxima_gev.index
def train_ind(self) -> pd.Series:
if self.s_split is None:
return None
else:
return self.s_split.isin([self.TRAIN_SPLIT_STR])
......@@ -40,7 +40,7 @@ class FullAnnualMaxima(MaxStableAnnualMaxima):
coordinates: AbstractCoordinates, margin_model: AbstractMarginModel):
max_stable_annual_maxima = super().from_sampling(nb_obs, max_stable_model, coordinates)
# Compute df_maxima_gev from df_maxima_frech
maxima_gev = margin_model.rmargin_from_maxima_frech(maxima_frech=max_stable_annual_maxima.maxima_frech,
maxima_gev = margin_model.rmargin_from_maxima_frech(maxima_frech=max_stable_annual_maxima.maxima_frech(),
coordinates_values=coordinates.coordinates_values)
max_stable_annual_maxima.df_maxima_gev = pd.DataFrame(data=maxima_gev, index=coordinates.index)
return max_stable_annual_maxima
import unittest
import numpy as np
import pandas as pd
from spatio_temporal_dataset.temporal_observations.abstract_temporal_observations import AbstractTemporalObservations
class TestTemporalObservations(unittest.TestCase):
DISPLAY = False
def test_set_maxima_gev(self):
df = pd.DataFrame.from_dict({'ok': [2, 5]})
temporal_observation = AbstractTemporalObservations(df_maxima_frech=df)
example = np.array([[3], [6]])
temporal_observation.set_maxima_frech(maxima_frech_values=example)
maxima_frech = temporal_observation.maxima_frech()
self.assertTrue(np.equal(example, maxima_frech).all(), msg="{} {}".format(example, maxima_frech))
if __name__ == '__main__':
unittest.main()
......@@ -51,7 +51,7 @@ class TestRMaxStabWithMarginConstant(TestUnitaryAbstract):
@property
def python_output(self):
dataset = self.python_code()
return np.sum(dataset.maxima_gev)
return np.sum(dataset.maxima_gev())
def test_rmaxstab_with_constant_margin(self):
self.compare()
......@@ -96,7 +96,7 @@ class TestRMaxStabWithLinearMargin(TestUnitaryAbstract):
@property
def python_output(self):
dataset = self.python_code()
return np.sum(dataset.maxima_gev)
return np.sum(dataset.maxima_gev())
def test_rmaxstab_with_linear_margin(self):
self.compare()
......
......@@ -37,7 +37,7 @@ class TestRMaxStab(TestUnitaryAbstract):
coordinates, max_stable_model = self.python_code()
m = MaxStableAnnualMaxima.from_sampling(nb_obs=40, max_stable_model=max_stable_model, coordinates=coordinates)
# TODO: understand why the array are not in the same order
return np.sum(m.maxima_frech)
return np.sum(m.maxima_frech())
def test_rmaxstab(self):
self.compare()
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment