Commit 2fd079ca authored by Le Roux Erwan's avatar Le Roux Erwan

[EXPERIMENT] improve fit_diagnosis

parent a4098711
Showing with 142 additions and 61 deletions
from experiment.split.split_curve import SplitCurve, LocFunction
import random
from experiment.fit_diagnosis.split_curve import SplitCurve, LocFunction
from extreme_estimator.estimator.full_estimator import FullEstimatorInASingleStepWithSmoothMargin
from extreme_estimator.extreme_models.margin_model.smooth_margin_model import ConstantMarginModel
from extreme_estimator.extreme_models.margin_model.smooth_margin_model import ConstantMarginModel, \
LinearAllParametersAllDimsMarginModel
from extreme_estimator.extreme_models.max_stable_model.max_stable_models import Smith
from extreme_estimator.gev_params import GevParams
from spatio_temporal_dataset.coordinates.unidimensional_coordinates.coordinates_1D import LinSpaceCoordinates
......@@ -9,6 +12,7 @@ from spatio_temporal_dataset.dataset.simulation_dataset import FullSimulatedData
from spatio_temporal_dataset.slicer.spatial_slicer import SpatialSlicer
from spatio_temporal_dataset.slicer.spatio_temporal_slicer import SpatioTemporalSlicer
random.seed(42)
def load_dataset():
nb_points = 50
......@@ -17,7 +21,6 @@ def load_dataset():
# MarginModel Linear with respect to the shape (from 0.01 to 0.02)
params_sample = {
# (GevParams.GEV_SHAPE, 0): 0.2,
(GevParams.GEV_LOC, 0): 10,
(GevParams.GEV_SHAPE, 0): 1.0,
(GevParams.GEV_SCALE, 0): 1.0,
......@@ -29,17 +32,18 @@ def load_dataset():
coordinates=coordinates,
max_stable_model=max_stable_model,
train_split_ratio=0.8,
slicer_class=SpatioTemporalSlicer)
slicer_class=SpatialSlicer)
def full_estimator(dataset):
max_stable_model = Smith()
margin_model_for_estimator = ConstantMarginModel(dataset.coordinates)
margin_model_for_estimator = LinearAllParametersAllDimsMarginModel(dataset.coordinates)
full_estimator = FullEstimatorInASingleStepWithSmoothMargin(dataset, margin_model_for_estimator, max_stable_model)
return full_estimator
if __name__ == '__main__':
dataset = load_dataset()
dataset.slicer.summary()
full_estimator = full_estimator(dataset)
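A hedged sketch of how the diagnosis itself might then be driven; the SplitCurve constructor signature is not visible in this diff, so the call below is an assumption.
# Hypothetical continuation of the block above (SplitCurve(dataset, estimator) is an assumed signature):
split_curve = SplitCurve(dataset, full_estimator)
split_curve.visualize()  # margin curves in one column, per-split error KDEs in the other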
......
......@@ -39,17 +39,23 @@ class SplitCurve(object):
# Fit the estimator and get the margin_function
self.estimator.fit()
# todo: potentially I will do the fit several times, and retrieve the mean error
# there is a big variability, so it would be really interesting to average over several fits
self.margin_function_fitted = estimator.margin_function_fitted
self.error_dict = error_dict_between_margin_functions(self.margin_function_sample, self.margin_function_fitted)
# todo: add a quantile curve in addition to the marginal curve
@property
def main_title(self):
return self.dataset.slicer.summary(show=False)
def visualize(self):
fig, axes = plt.subplots(3, 2, sharex='col', sharey='row')
fig, axes = plt.subplots(3, 2)
fig.subplots_adjust(hspace=0.4, wspace=0.4, )
for i, gev_param_name in enumerate(GevParams.GEV_PARAM_NAMES):
self.margin_graph(axes[i, 0], gev_param_name)
self.score_graph(axes[i, 1], gev_param_name)
fig.suptitle(self.main_title)
plt.show()
def margin_graph(self, ax, gev_param_name):
......@@ -63,9 +69,10 @@ class SplitCurve(object):
ax.set_title(title_str)
def score_graph(self, ax, gev_param_name):
for split in self.dataset.splits:
s = self.error_dict[gev_param_name]
data = [1.5] * 7 + [2.5] * 2 + [3.5] * 8 + [4.5] * 3 + [5.5] * 1 + [6.5] * 8
# todo: for the moment only the train/test splits are interesting (the spatio-temporal split isn't working yet)
sns.set_style('whitegrid')
sns.kdeplot(np.array(data), bw=0.5, ax=ax)
print()
s = self.error_dict[gev_param_name]
for split in self.dataset.splits:
ind = self.dataset.coordinates_index(split)
data = s.loc[ind].values
sns.kdeplot(data, bw=0.5, ax=ax, label=split.name).set(xlim=0)
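The todo about repeating the fit could look roughly like the sketch below; it assumes the estimator can be rebuilt from the dataset and that error_dict_between_margin_functions returns one pandas Series per GEV parameter, as the .loc indexing above suggests.
import pandas as pd

def averaged_error_dict(dataset, margin_function_sample, build_estimator, nb_fits=10):
    # Hypothetical helper: refit a fresh estimator several times and average the
    # per-coordinate error Series for each GEV parameter, to smooth out the
    # fit-to-fit variability mentioned in the todo above.
    error_dicts = []
    for _ in range(nb_fits):
        estimator = build_estimator(dataset)  # e.g. full_estimator(dataset), assumed
        estimator.fit()
        error_dicts.append(error_dict_between_margin_functions(
            margin_function_sample, estimator.margin_function_fitted))
    return {name: pd.concat([d[name] for d in error_dicts], axis=1).mean(axis=1)
            for name in GevParams.GEV_PARAM_NAMES}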
......@@ -22,12 +22,11 @@ class VisualizationMarginModel(unittest.TestCase):
@classmethod
def example_visualization_1D(cls):
coordinates = LinSpaceCoordinates.from_nb_points(nb_points=cls.nb_points)
# MarginModel Linear with respect to the shape (from 0.01 to 0.02)
margin_model = cls.margin_model(coordinates=coordinates, params_sample={GevParams.GEV_SHAPE: 0.02})
margin_model = cls.margin_model(coordinates=coordinates, params_sample={(GevParams.GEV_SHAPE, 1): 0.02})
if cls.DISPLAY:
margin_model.margin_function_sample.visualize()
if __name__ == '__main__':
VisualizationMarginModel.example_visualization_1D()
VisualizationMarginModel.example_visualization_2D()
# VisualizationMarginModel.example_visualization_2D()
......@@ -43,7 +43,7 @@ class SmoothMarginalsThenUnitaryMsp(AbstractFullEstimator):
# Compute the maxima_frech
maxima_gev_train = self.dataset.maxima_gev(split=self.train_split)
maxima_frech = AbstractMarginModel.gev2frech(maxima_gev=maxima_gev_train,
coordinates_values=self.dataset.coordinates_values,
coordinates_values=self.dataset.coordinates_values(self.train_split),
margin_function=self.margin_estimator.margin_function_fitted)
# Update maxima frech field through the dataset object
self.dataset.set_maxima_frech(maxima_frech, split=self.train_split)
......
......@@ -32,6 +32,6 @@ class SmoothMarginEstimator(AbstractMarginEstimator):
def _fit(self):
maxima_gev = self.dataset.maxima_gev(split=self.train_split)
corodinate_values = self.dataset.coordinates_values
coordinate_values = self.dataset.coordinates_values(self.train_split)
self._margin_function_fitted = self.margin_model.fitmargin_from_maxima_gev(maxima_gev=maxima_gev,
coordinates_values=corodinate_values)
coordinates_values=coordinate_values)
......@@ -10,6 +10,13 @@ def abs_error(s1, s2):
def error_dict_between_margin_functions(margin1: AbstractMarginFunction, margin2: AbstractMarginFunction):
"""
Return a Series indexed by the same index as the coordinates.
Each value corresponds to the error at that coordinate.
:param margin1:
:param margin2:
:return:
"""
assert margin1.coordinates == margin2.coordinates
margin1_gev_params, margin2_gev_params = margin1.gev_params_for_coordinates, margin2.gev_params_for_coordinates
gev_param_name_to_error_serie = {}
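A hedged usage note: each value of the returned dict is assumed to be a pandas Series of per-coordinate errors keyed by GEV parameter name (the margin function names below are taken from the SplitCurve attributes earlier in this diff), so a quick global summary could be:
error_dict = error_dict_between_margin_functions(margin_function_sample, margin_function_fitted)
for gev_param_name, error_serie in error_dict.items():
    print(gev_param_name, error_serie.mean(), error_serie.max())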
......
......@@ -14,16 +14,19 @@ from spatio_temporal_dataset.slicer.temporal_slicer import TemporalSlicer
class AbstractCoordinates(object):
# Columns
# Spatial columns
COORDINATE_X = 'coord_x'
COORDINATE_Y = 'coord_y'
COORDINATE_Z = 'coord_z'
COORDINATE_NAMES = [COORDINATE_X, COORDINATE_Y, COORDINATE_Z]
COORDINATE_SPLIT = 'coord_split'
COORDINATE_SPATIAL_SPLIT = 'coord_spatial_split'
# Temporal columns
COORDINATE_T = 'coord_t'
COORDINATE_TEMPORAL_SPLIT = 'coord_temporal_split'
def __init__(self, df_coord: pd.DataFrame, s_split: pd.Series = None):
self.df_coord = df_coord # type: pd.DataFrame
self.s_split = s_split # type: pd.Series
def __init__(self, df_coord: pd.DataFrame, s_spatial_split: pd.Series = None):
self.df_all_coordinates = df_coord # type: pd.DataFrame
self.s_spatial_split = s_spatial_split # type: pd.Series
# ClassMethod constructor
......@@ -33,18 +36,18 @@ class AbstractCoordinates(object):
assert cls.COORDINATE_X in df.columns
# Create a split based on the train_split_ratio
if train_split_ratio is not None:
assert cls.COORDINATE_SPLIT not in df.columns, "A split has already been defined"
assert cls.COORDINATE_SPATIAL_SPLIT not in df.columns, "A split has already been defined"
s_split = s_split_from_ratio(index=df.index, train_split_ratio=train_split_ratio)
df[cls.COORDINATE_SPLIT] = s_split
df[cls.COORDINATE_SPATIAL_SPLIT] = s_split
# Potentially, a split column can be specified directly in df
if cls.COORDINATE_SPLIT not in df.columns:
if cls.COORDINATE_SPATIAL_SPLIT not in df.columns:
df_coord = df
s_split = None
else:
df_coord = df.loc[:, cls.coordinates_columns(df)]
s_split = df[cls.COORDINATE_SPLIT]
df_coord = df.loc[:, cls.coordinates_spatial_columns(df)]
s_split = df[cls.COORDINATE_SPATIAL_SPLIT]
assert s_split.isin([TRAIN_SPLIT_STR, TEST_SPLIT_STR]).all()
return cls(df_coord=df_coord, s_split=s_split)
return cls(df_coord=df_coord, s_spatial_split=s_split)
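A minimal construction sketch under assumptions: from_df accepts the train_split_ratio keyword shown here, the class can be instantiated directly, and Split is the enum already used by df_coordinates further down.
import pandas as pd

# Hypothetical example of the ratio-based path above, followed by split-aware access.
df = pd.DataFrame({AbstractCoordinates.COORDINATE_X: [0.0, 1.0, 2.0, 3.0]})
coordinates = AbstractCoordinates.from_df(df, train_split_ratio=0.75)
train_coords = coordinates.df_coordinates(Split.train_spatial)
test_coords = coordinates.df_coordinates(Split.test_spatial)
assert len(train_coords) + len(test_coords) == len(coordinates)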
@classmethod
def from_csv(cls, csv_path: str = None):
......@@ -53,7 +56,7 @@ class AbstractCoordinates(object):
df = pd.read_csv(csv_path)
# The index corresponds to the first column
index_column_name = df.columns[0]
assert index_column_name not in cls.coordinates_columns(df)
assert index_column_name not in cls.coordinates_spatial_columns(df)
df.set_index(index_column_name, inplace=True)
return cls.from_df(df)
......@@ -70,7 +73,7 @@ class AbstractCoordinates(object):
return cls.from_df(df=df_sample, train_split_ratio=train_split_ratio)
@classmethod
def coordinates_columns(cls, df_coord: pd.DataFrame) -> List[str]:
def coordinates_spatial_columns(cls, df_coord: pd.DataFrame) -> List[str]:
coord_columns = [cls.COORDINATE_X]
for additional_coord in [cls.COORDINATE_Y, cls.COORDINATE_Z]:
if additional_coord in df_coord.columns:
......@@ -79,7 +82,7 @@ class AbstractCoordinates(object):
@property
def columns(self):
return self.coordinates_columns(df_coord=self.df_coord)
return self.coordinates_spatial_columns(df_coord=self.df_all_coordinates)
@property
def nb_columns(self):
......@@ -87,46 +90,57 @@ class AbstractCoordinates(object):
@property
def index(self):
return self.df_coord.index
# todo: this should be replaced whenever possible by coordinates_index
return self.df_all_coordinates.index
@property
def df_merged(self) -> pd.DataFrame:
# Merged DataFrame of df_all_coordinates and s_spatial_split
return self.df_coord if self.s_split is None else self.df_coord.join(self.s_split)
return self.df_all_coordinates if self.s_spatial_split is None else self.df_all_coordinates.join(self.s_spatial_split)
def df_coordinates(self, split: Split = Split.all) -> pd.DataFrame:
if self.train_ind is None:
return self.df_coord
if self.ind_train_spatial is None:
return self.df_all_coordinates
if split is Split.all:
return self.df_coord
return self.df_all_coordinates
if split in [Split.train_temporal, Split.test_temporal]:
return self.df_coord
return self.df_all_coordinates
elif split in [Split.train_spatial, Split.train_spatiotemporal, Split.test_spatiotemporal_temporal]:
return self.df_coord.loc[self.train_ind]
return self.df_all_coordinates.loc[self.ind_train_spatial]
elif split in [Split.test_spatial, Split.test_spatiotemporal, Split.test_spatiotemporal_spatial]:
return self.df_coord.loc[~self.train_ind]
return self.df_all_coordinates.loc[~self.ind_train_spatial]
else:
raise NotImplementedError('Unknown split: {}'.format(split))
def coordinates_values(self, split: Split = Split.all) -> np.ndarray:
return self.df_coordinates(split).values
def coordinate_index(self, split: Split = Split.all) -> pd.Index:
return self.df_coordinates(split).index
@property
def x_coordinates(self) -> np.ndarray:
return self.df_coord[self.COORDINATE_X].values.copy()
return self.df_all_coordinates[self.COORDINATE_X].values.copy()
@property
def y_coordinates(self) -> np.ndarray:
return self.df_coord[self.COORDINATE_Y].values.copy()
return self.df_all_coordinates[self.COORDINATE_Y].values.copy()
@property
def train_ind(self) -> pd.Series:
return train_ind_from_s_split(s_split=self.s_split)
def ind_train_spatial(self) -> pd.Series:
return train_ind_from_s_split(s_split=self.s_spatial_split)
# Visualization
def visualize(self):
nb_coordinates_columns = len(self.coordinates_columns(self.df_coord))
nb_coordinates_columns = len(self.coordinates_spatial_columns(self.df_all_coordinates))
if nb_coordinates_columns == 1:
self.visualization_1D()
elif nb_coordinates_columns == 2:
......@@ -135,21 +149,21 @@ class AbstractCoordinates(object):
self.visualization_3D()
def visualization_1D(self):
assert len(self.coordinates_columns(self.df_coord)) >= 1
assert len(self.coordinates_spatial_columns(self.df_all_coordinates)) >= 1
x = self.coordinates_values()[:]
y = np.zeros(len(x))
plt.scatter(x, y)
plt.show()
def visualization_2D(self):
assert len(self.coordinates_columns(self.df_coord)) >= 2
assert len(self.coordinates_spatial_columns(self.df_all_coordinates)) >= 2
coordinates_values = self.coordinates_values()
x, y = coordinates_values[:, 0], coordinates_values[:, 1]
plt.scatter(x, y)
plt.show()
def visualization_3D(self):
assert len(self.coordinates_columns(self.df_coord)) == 3
assert len(self.coordinates_spatial_columns(self.df_all_coordinates)) == 3
coordinates_values = self.coordinates_values()
x, y, z = coordinates_values[:, 0], coordinates_values[:, 1], coordinates_values[:, 2]
fig = plt.figure()
......@@ -160,10 +174,10 @@ class AbstractCoordinates(object):
# Magic Methods
def __len__(self):
return len(self.df_coord)
return len(self.df_all_coordinates)
def __mul__(self, other: float):
self.df_coord *= other
self.df_all_coordinates *= other
return self
def __rmul__(self, other):
......
......@@ -10,7 +10,7 @@ class AlpsStation2DCoordinates(AlpsStation3DCoordinates):
def from_csv(cls, csv_file='coord-lambert2'):
# Remove the Z coordinate from df_all_coordinates
spatial_coordinates = super().from_csv(csv_file) # type: AlpsStation3DCoordinates
spatial_coordinates.df_coord.drop(cls.COORDINATE_Z, axis=1, inplace=True)
spatial_coordinates.df_all_coordinates.drop(cls.COORDINATE_Z, axis=1, inplace=True)
return spatial_coordinates
......
......@@ -7,8 +7,8 @@ class TransformedCoordinates(AbstractCoordinates):
@classmethod
def from_coordinates(cls, coordinates: AbstractCoordinates,
transformation_function: AbstractTransformation):
df_coordinates_transformed = coordinates.df_coord.copy()
df_coordinates_transformed = coordinates.df_all_coordinates.copy()
df_coordinates_transformed = transformation_function.transform(df_coord=df_coordinates_transformed)
return cls(df_coord=df_coordinates_transformed, s_split=coordinates.s_split)
return cls(df_coord=df_coordinates_transformed, s_spatial_split=coordinates.s_spatial_split)
......@@ -21,7 +21,7 @@ class AbstractDataset(object):
assert isinstance(slicer_class, type)
self.observations = observations
self.coordinates = coordinates
self.slicer = slicer_class(coordinates_train_ind=self.coordinates.train_ind,
self.slicer = slicer_class(coordinates_train_ind=self.coordinates.ind_train_spatial,
observations_train_ind=self.observations.train_ind) # type: AbstractSlicer
assert isinstance(self.slicer, AbstractSlicer)
......@@ -61,10 +61,12 @@ class AbstractDataset(object):
# Coordinates wrapper
@property
def coordinates_values(self, split: Split = Split.all) -> np.ndarray:
return self.coordinates.coordinates_values(split=split)
def coordinates_index(self, split: Split= Split.all) -> pd.Index:
return self.coordinates.coordinate_index(split=split)
# Slicer wrapper
@property
......
......@@ -37,16 +37,19 @@ class AbstractSlicer(object):
def some_required_ind_are_not_defined(self):
pass
def summary(self):
print('Slicer summary: \n')
def summary(self, show=True):
msg = ''
for s, global_name in [(self.index_train_ind, "Spatial"), (self.column_train_ind, "Temporal")]:
print(global_name + ' split')
msg += global_name + ': '
if s is None:
print('Not handled by this slicer')
msg += 'Not handled by this slicer'
else:
for f, name in [(len, 'Total'), (sum, 'train')]:
print("{}: {}".format(name, f(s)))
print('\n')
msg += "{}: {} ".format(name, f(s))
msg += ' / '
if show:
print(msg)
return msg
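For the spatial-only setup in the experiment above (50 points, train ratio 0.8), the returned string would plausibly read as follows; the counts depend on the random split, and the temporal part is assumed to be unhandled by the SpatialSlicer.
# Hypothetical output of summary(show=False), reused as a figure title by SplitCurve.main_title:
# 'Spatial: Total: 50 train: 40  / Temporal: Not handled by this slicer / '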
def loc_split(self, df: pd.DataFrame, split: Split):
# split should belong to the list of split accepted by the slicer
......
import os.path as op
import pandas as pd
import numpy as np
......@@ -28,6 +29,17 @@ class AbstractSpatioTemporalObservations(object):
assert s_split.isin([TRAIN_SPLIT_STR, TEST_SPLIT_STR]).all()
self.s_split = s_split
@classmethod
def from_csv(cls, csv_path: str = None):
assert csv_path is not None
assert op.exists(csv_path)
df = pd.read_csv(csv_path)
# # Index correspond to the first column
# index_column_name = df.columns[0]
# assert index_column_name not in cls.coordinates_columns(df)
# df.set_index(index_column_name, inplace=True)
return cls.from_df(df)
@property
def _df_maxima(self) -> pd.DataFrame:
if self.df_maxima_frech is not None:
......
import os.path as op
import pandas as pd
from spatio_temporal_dataset.spatio_temporal_observations.abstract_spatio_temporal_observations import AbstractSpatioTemporalObservations
from utils import get_full_path
class AlpsPrecipitationObservations(AbstractSpatioTemporalObservations):
pass
\ No newline at end of file
RELATIVE_PATH = r'local/spatio_temporal_datasets/Gilles - precipitations'
FULL_PATH = get_full_path(relative_path=RELATIVE_PATH)
@classmethod
def from_csv(cls, csv_file='max_precip_3j'):
csv_path = op.join(cls.FULL_PATH, csv_file + '.csv')
return super().from_csv(csv_path)
@classmethod
def transform_gilles_txt_into_csv(cls):
filepath = op.join(cls.FULL_PATH, 'original data', 'max_precip_3j.txt')
station_to_coordinates = {}
# todo: be absolutely sure of the mapping; the txt file from Gilles does not contain the coordinates
# this seems rather dangerous, especially if he loaded
# todo: re-do the whole analysis and extraction myself for the snow data
with open(filepath, 'r') as f:
for l in f:
_, station_name, coordinates = l.split('"')
coordinates = coordinates.split()
print(len(coordinates))
assert len(coordinates) == 55
station_to_coordinates[station_name] = coordinates
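# Assumed input line format (inferred from the split on '"' and the assert above,
# not confirmed by the original data file):
#     <station id> "STATION NAME" v1 v2 ... v55
# l.split('"') then yields the leading field, the quoted station name, and the
# string of 55 whitespace-separated values.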
# df = pd.DataFrame.from_dict(data=station_to_coordinates, orient='index',
# columns=[cls.COORDINATE_X, cls.COORDINATE_Y, cls.COORDINATE_Z])
# print(df.head())
# filepath = op.join(cls.FULL_PATH, 'max_precip_3j.csv')
# assert not op.exists(filepath)
# df.to_csv(filepath)
\ No newline at end of file