diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/R/gev_fit/gev_fit.R b/extreme_estimator/R_fit/gev_fit/gev_fit.R similarity index 100% rename from R/gev_fit/gev_fit.R rename to extreme_estimator/R_fit/gev_fit/gev_fit.R diff --git a/R/gev_fit/gev_marginal.py b/extreme_estimator/R_fit/gev_fit/gev_marginal.py similarity index 68% rename from R/gev_fit/gev_marginal.py rename to extreme_estimator/R_fit/gev_fit/gev_marginal.py index 7758532f38ec330cbccc42da13ed7538d5f54677..424eecd084d9ee2b59cbe5753adef3a7127f7507 100644 --- a/R/gev_fit/gev_marginal.py +++ b/extreme_estimator/R_fit/gev_fit/gev_marginal.py @@ -9,16 +9,28 @@ def frechet_unitary_transformation(data, location, scale, shape): return (1 + shape * (data - location) / scale) ** (1 / shape) +class GevParameters(object): + + def __init__(self, location, scale, shape): + self.location = location + self.scale = scale + self.shape = shape + + +def frechet_unitary_transformation_from_gev_parameters(data, gev_parameters: GevParameters): + return frechet_unitary_transformation(data, gev_parameters.location) + + class GevMarginal(object): - def __init__(self, position, data, estimator=GevMleFit): + def __init__(self, coordinate, data, estimator=GevMleFit): """ Class to fit a GEV a list of data. Compute also the corresponding unitary data - :param position: Represents the spatio-temporal position of the marginals + :param coordinate: Represents the spatio-temporal spatial_coordinates of the marginals :param data: array of data corresponding to this position (and potentially its neighborhood) """ - self.position = position + self.coordinate = coordinate self.data = data self.gev_estimator = estimator(x_gev=data) self.gev_parameters_estimated = [self.location, self.scale, self.shape] diff --git a/R/gev_fit/gev_mle_fit.py b/extreme_estimator/R_fit/gev_fit/gev_mle_fit.py similarity index 100% rename from R/gev_fit/gev_mle_fit.py rename to extreme_estimator/R_fit/gev_fit/gev_mle_fit.py diff --git a/extreme_estimator/R_fit/max_stable_fit/max_stable_fit.R b/extreme_estimator/R_fit/max_stable_fit/max_stable_fit.R new file mode 100644 index 0000000000000000000000000000000000000000..b51112f7a53e36f9236df8eedd0e0057a7cb0513 --- /dev/null +++ b/extreme_estimator/R_fit/max_stable_fit/max_stable_fit.R @@ -0,0 +1,28 @@ +library(SpatialExtremes) + + +# Boolean for python call +call_main = !exists("python_wrapping") + +if (call_main) { + set.seed(42) + n.obs = 50 + n.site = 2 + coord <- matrix(rnorm(2*n.site, sd = sqrt(.2)), ncol = 2) + + print(coord) + # Generate the data + # data <- rmaxstab(n.obs, coord, "gauss", cov11 = 100, cov12 = 25, cov22 = 220) + data <- rmaxstab(n.obs, coord, "brown", range = 3, smooth = 0.5) + # Fit back the data + print(data) + # res = fitmaxstab(data, coord, "gauss", fit.marge=FALSE, ) + res = fitmaxstab(data, coord, "brown") + # res = fitmaxstab(data, coord, "gauss", start=list(0,0,0)) + print(res) + print(class(res)) + print(names(res)) + print(res['fitted.values']) + print(res['convergence']) + +} diff --git a/extreme_estimator/R_fit/max_stable_fit/max_stable_models.py b/extreme_estimator/R_fit/max_stable_fit/max_stable_models.py new file mode 100644 index 0000000000000000000000000000000000000000..3aaaa7dd0e66fb2b9cf47a8aae23922ea663d0b0 --- /dev/null +++ b/extreme_estimator/R_fit/max_stable_fit/max_stable_models.py @@ -0,0 +1,95 @@ +import rpy2 + +from rpy2.robjects import ListVector + +from extreme_estimator.R_fit.utils import get_loaded_r +import numpy as np + + +def optimize_fit(max_stable_model_list, params_start_fit_list): + pass + + +class MaxStableModel(object): + + def __init__(self, params_start_fit=None, params_sample=None): + self.cov_mod = None + self.default_params_start_fit = None + self.default_params_sample = None + self.user_params_start_fit = params_start_fit + self.user_params_sample = params_sample + self.r = get_loaded_r() + + def fitmaxstab(self, maxima: np.ndarray, coord: np.ndarray, ): + # todo: find how to specify a startign point, understand how is it set by default + # res = None + # tries = 5 + # nb_tries = 0 + # while res is None and nb_tries < tries: + # try: + # res = self.r.fitmaxstab(maxima, coord, **self.cov_mod_param) # type: ListVector + # except rpy2.rinterface.RRuntimeError: + # pass + # nb_tries += 1 + # print(nb_tries) + res = self.r.fitmaxstab(np.transpose(maxima), coord, **self.cov_mod_param) # type: ListVector + fitted_values = res.rx2('fitted.values') + fitted_values = {key: fitted_values.rx2(key)[0] for key in fitted_values.names} + return fitted_values + + def rmaxstab(self, nb_obs: int, coord: np.ndarray, ) -> np.ndarray: + """ + Return an numpy of maxima. With rows being the stations and columns being the years of maxima + """ + maxima = np.array(self.r.rmaxstab(nb_obs, coord, **self.cov_mod_param, **self.params_sample)) + return np.transpose(maxima) + + @property + def cov_mod_param(self): + return {'cov.mod': self.cov_mod} + + @property + def params_start_fit(self): + return self.merge_params(default_params=self.default_params_start_fit, input_params=self.user_params_start_fit) + + @property + def params_sample(self): + return self.merge_params(default_params=self.default_params_sample, input_params=self.user_params_sample) + + @staticmethod + def merge_params(default_params, input_params): + merged_params = default_params.copy() + if input_params is not None: + assert isinstance(default_params, dict) and isinstance(input_params, dict) + assert set(input_params.keys()).issubset(set(default_params.keys())) + merged_params.update(input_params) + return merged_params + + +class GaussianMSP(MaxStableModel): + + def __init__(self, params_start_fit=None, params_sample=None): + super().__init__(params_start_fit=params_start_fit, params_sample=params_sample) + self.cov_mod = 'gauss' + self.default_params_start_fit = {} + self.default_params_sample = { + 'cov11': 100, + 'cov12': 25, + 'cov22': 220, + } + + +class BrownResick(MaxStableModel): + + def __init__(self, params_start_fit=None, params_sample=None): + super().__init__(params_start_fit=params_start_fit, params_sample=params_sample) + self.cov_mod = 'brown' + self.default_params_start_fit = {} + self.default_params_sample = { + "range": 3, + "smooth": 0.5, + } + +class ExtremalT(MaxStableModel): + pass + diff --git a/extreme_estimator/R_fit/max_stable_fit/test.R b/extreme_estimator/R_fit/max_stable_fit/test.R new file mode 100644 index 0000000000000000000000000000000000000000..64f29ab89fef278397278e49b9be32038a2f4456 --- /dev/null +++ b/extreme_estimator/R_fit/max_stable_fit/test.R @@ -0,0 +1,110 @@ +library(SpatialExtremes) +##Define the spatial_coordinates of each location +n.site <- 30 +locations <- matrix(runif(2*n.site, 0, 10), ncol = 2) +colnames(locations) <- c("lon", "lat") + +##Simulate a max-stable process - with unit Frechet margins +data <- rmaxstab(40, locations, cov.mod = "whitmat", nugget = 0, range = 3, +smooth = 0.5) + +##Now define the spatial model for the GEV parameters +param.loc <- -10 + 2 * locations[,2] +param.scale <- 5 + 2 * locations[,1] + locations[,2]^2 +param.shape <- rep(0.2, n.site) + +##Transform the unit Frechet margins to GEV +for (i in 1:n.site) + data[,i] <- frech2gev(data[,i], param.loc[i], param.scale[i], +param.shape[i]) + +##Define a model for the GEV margins to be fitted +##shape ~ 1 stands for the GEV shape parameter is constant +##over the region +loc.form <- loc ~ lat +scale.form <- scale ~ lon + I(lat^2) +shape.form <- shape ~ 1 + +##Fit a max-stable process using the Schlather's model +fitmaxstab(data, locations, "whitmat", loc.form, scale.form, + shape.form) + +## Model without any spatial structure for the GEV parameters +## Be careful this could be *REALLY* time consuming +fitmaxstab(data, locations, "whitmat") + +## Fixing the smooth parameter of the Whittle-Matern family +## to 0.5 - e.g. considering exponential family. We suppose the data +## are unit Frechet here. +fitmaxstab(data, locations, "whitmat", smooth = 0.5, fit.marge = FALSE) + +## Fitting a penalized smoothing splines for the margins with the +## Smith's model +data <- rmaxstab(40, locations, cov.mod = "gauss", cov11 = 100, cov12 = + 25, cov22 = 220) + +## And transform it to ordinary GEV margins with a non-linear +## function +fun <- function(x) + 2 * sin(pi * x / 4) + 10 +fun2 <- function(x) + (fun(x) - 7 ) / 15 + +param.loc <- fun(locations[,2]) +param.scale <- fun(locations[,2]) +param.shape <- fun2(locations[,1]) + +##Transformation from unit Frechet to common GEV margins +for (i in 1:n.site) + data[,i] <- frech2gev(data[,i], param.loc[i], param.scale[i], +param.shape[i]) + +##Defining the knots, penalty, degree for the splines +n.knots <- 5 +knots <- quantile(locations[,2], prob = 1:n.knots/(n.knots+1)) +knots2 <- quantile(locations[,1], prob = 1:n.knots/(n.knots+1)) + +##Be careful the choice of the penalty (i.e. the smoothing parameter) +##may strongly affect the result Here we use p-splines for each GEV +##parameter - so it's really CPU demanding but one can use 1 p-spline +##and 2 linear models. +##A simple linear model will be clearly faster... +loc.form <- y ~ rb(lat, knots = knots, degree = 3, penalty = .5) +scale.form <- y ~ rb(lat, knots = knots, degree = 3, penalty = .5) +shape.form <- y ~ rb(lon, knots = knots2, degree = 3, penalty = .5) + +fitted <- fitmaxstab(data, locations, "gauss", loc.form, scale.form, shape.form, + control = list(ndeps = rep(1e-6, 24), trace = 10), + method = "BFGS") +fitted +op <- par(mfrow=c(1,3)) +plot(locations[,2], param.loc, col = 2, ylim = c(7, 14), + ylab = "location parameter", xlab = "latitude") +plot(fun, from = 0, to = 10, add = TRUE, col = 2) +points(locations[,2], predict(fitted)[,"loc"], col = "blue", pch = 5) +new.data <- cbind(lon = seq(0, 10, length = 100), lat = seq(0, 10, length = 100)) +lines(new.data[,1], predict(fitted, new.data)[,"loc"], col = "blue") +legend("topleft", c("true values", "predict. values", "true curve", "predict. curve"), + col = c("red", "blue", "red", "blue"), pch = c(1, 5, NA, NA), inset = 0.05, + lty = c(0, 0, 1, 1), ncol = 2) + +plot(locations[,2], param.scale, col = 2, ylim = c(7, 14), + ylab = "scale parameter", xlab = "latitude") +plot(fun, from = 0, to = 10, add = TRUE, col = 2) +points(locations[,2], predict(fitted)[,"scale"], col = "blue", pch = 5) +lines(new.data[,1], predict(fitted, new.data)[,"scale"], col = "blue") +legend("topleft", c("true values", "predict. values", "true curve", "predict. curve"), + col = c("red", "blue", "red", "blue"), pch = c(1, 5, NA, NA), inset = 0.05, + lty = c(0, 0, 1, 1), ncol = 2) + +plot(locations[,1], param.shape, col = 2, + ylab = "shape parameter", xlab = "longitude") +plot(fun2, from = 0, to = 10, add = TRUE, col = 2) +points(locations[,1], predict(fitted)[,"shape"], col = "blue", pch = 5) +lines(new.data[,1], predict(fitted, new.data)[,"shape"], col = "blue") +legend("topleft", c("true values", "predict. values", "true curve", "predict. curve"), + col = c("red", "blue", "red", "blue"), pch = c(1, 5, NA, NA), inset = 0.05, + lty = c(0, 0, 1, 1), ncol = 2) +par(op) + +## End(Not run) \ No newline at end of file diff --git a/extreme_estimator/R_fit/utils.py b/extreme_estimator/R_fit/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8e308400161397f53be82873aa5b9451e078231c --- /dev/null +++ b/extreme_estimator/R_fit/utils.py @@ -0,0 +1,19 @@ +import rpy2.robjects as ro +import pandas as pd +import numpy as np +import rpy2.robjects.numpy2ri as npr + + +def get_loaded_r(): + r = ro.r + ro.numpy2ri.activate() + r.library('SpatialExtremes') + return r + + +# def conversion_to_FloatVector(data): +# """Convert DataFrame or numpy array into FloatVector for r""" +# if isinstance(data, pd.DataFrame): +# data = data.values +# assert isinstance(data, np.ndarray) +# return npr.numpy2ri(data) diff --git a/extreme_estimator/estimator/abstract_estimator.py b/extreme_estimator/estimator/abstract_estimator.py new file mode 100644 index 0000000000000000000000000000000000000000..84e8f5ff6514a69521211e4d696d427ac7a74e4a --- /dev/null +++ b/extreme_estimator/estimator/abstract_estimator.py @@ -0,0 +1,4 @@ + + +class AbstractEstimator(object): + pass \ No newline at end of file diff --git a/extreme_estimator/estimator/msp_estimator.py b/extreme_estimator/estimator/msp_estimator.py new file mode 100644 index 0000000000000000000000000000000000000000..03c2f502a5afb33a7f4bf953e00c2a2f0ea8c84c --- /dev/null +++ b/extreme_estimator/estimator/msp_estimator.py @@ -0,0 +1,22 @@ +from extreme_estimator.estimator.abstract_estimator import AbstractEstimator +from extreme_estimator.R_fit.max_stable_fit.max_stable_models import MaxStableModel +from spatio_temporal_dataset.dataset.abstract_dataset import AbstractDataset +import numpy as np + + +class MaxStableEstimator(AbstractEstimator): + + def __init__(self, dataset: AbstractDataset, max_stable_model: MaxStableModel): + self.dataset = dataset + self.max_stable_model = max_stable_model + self.max_stable_params_fitted = None + + def fit(self): + self.max_stable_params_fitted = self.max_stable_model.fitmaxstab(maxima=self.dataset.maxima, coord=self.dataset.coord) + + def error(self, true_max_stable_params: dict): + absolute_errors = {param_name: np.abs(param_true_value - self.max_stable_params_fitted[param_name]) + for param_name, param_true_value in true_max_stable_params.items()} + mean_absolute_error = np.mean(np.array(list(absolute_errors.values()))) + # return {**absolute_errors, **{'mae': mean_absolute_error}} + return mean_absolute_error diff --git a/extreme_estimator/robustness_plot/abstract_robustness.py b/extreme_estimator/robustness_plot/abstract_robustness.py new file mode 100644 index 0000000000000000000000000000000000000000..7ab9deca568823f39210db41941f8df2d757cf25 --- /dev/null +++ b/extreme_estimator/robustness_plot/abstract_robustness.py @@ -0,0 +1,98 @@ +from typing import List + +from extreme_estimator.estimator.msp_estimator import MaxStableEstimator +from extreme_estimator.R_fit.max_stable_fit.max_stable_models import GaussianMSP, MaxStableModel +from itertools import product + +from spatio_temporal_dataset.dataset.simulation_dataset import SimulatedDataset +from spatio_temporal_dataset.spatial_coordinates.generated_coordinate import CircleCoordinates +import matplotlib.pyplot as plt + +plt.style.use('seaborn-white') + + +class DisplayItem(object): + + def __init__(self, argument_name, default_value, dislay_name=None): + self.argument_name = argument_name + self.default_value = default_value + self.dislay_name = dislay_name if dislay_name is not None else self.argument_name + + def values_from_kwargs(self, **kwargs): + return kwargs.get(self.argument_name, [self.default_value]) + + def value_from_kwargs(self, **kwargs): + return kwargs.get(self.argument_name, self.default_value) + + +MaxStableModelItem = DisplayItem('max_stable_model', GaussianMSP) +SpatialCoordinateClassItem = DisplayItem('spatial_coordinate_class', CircleCoordinates) +SpatialParamsItem = DisplayItem('spatial_params', {"r": 1}) +NbStationItem = DisplayItem('nb_station', None) +NbObservationItem = DisplayItem('nb_obs', 50) + + +class AbstractRobustnessPlot(object): + + def __init__(self, grid_row_item, grid_column_item, plot_row_item, plot_label_item): + self.grid_row_item = grid_row_item # type: DisplayItem + self.grid_column_item = grid_column_item # type: DisplayItem + self.plot_row_item = plot_row_item # type: DisplayItem + self.plot_label_item = plot_label_item # type: DisplayItem + + self.estimation_error = self.estimation_error_max_stable_unitary_frechet + + def robustness_grid_plot(self, **kwargs): + grid_row_values = self.grid_row_item.values_from_kwargs(**kwargs) + grid_column_values = self.grid_column_item.values_from_kwargs(**kwargs) + nb_grid_rows, nb_grid_columns = len(grid_row_values), len(grid_column_values) + fig = plt.figure() + fig.subplots_adjust(hspace=0.4, wspace=0.4) + for i, (grid_row_value, grid_column_value) in enumerate(product(grid_row_values, grid_column_values), 1): + print('Grid plot: {}={} {}={}'.format(self.grid_row_item.dislay_name, grid_row_value, + self.grid_column_item.dislay_name, grid_column_value)) + ax = fig.add_subplot(nb_grid_rows, nb_grid_columns, i) + kwargs_single_plot = kwargs.copy() + kwargs_single_plot.update({self.grid_row_item.argument_name: grid_row_value, + self.grid_column_item.argument_name: grid_column_value}) + self.robustness_single_plot(ax, **kwargs_single_plot) + plt.show() + + def robustness_single_plot(self, ax, **kwargs_single_plot): + plot_row_values = self.plot_row_item.values_from_kwargs(**kwargs_single_plot) + plot_label_values = self.plot_label_item.values_from_kwargs(**kwargs_single_plot) + colors = ['blue', 'red', 'green', 'black'] + assert isinstance(plot_label_values, list), plot_label_values + assert isinstance(plot_row_values, list), plot_row_values + for j, plot_label_value in enumerate(plot_label_values): + plot_row_value_to_error = {} + # todo: do some parallzlization here + for plot_row_value in plot_row_values: + kwargs_single_point = kwargs_single_plot.copy() + kwargs_single_point.update({self.plot_row_item.argument_name: plot_row_value, + self.plot_label_item.argument_name: plot_label_value}) + plot_row_value_to_error[plot_row_value] = self.estimation_error(**kwargs_single_point) + plot_column_values = [plot_row_value_to_error[plot_row_value] for plot_row_value in plot_row_values] + ax.plot(plot_row_values, plot_column_values, color=colors[j % len(colors)], label=str(j)) + ax.legend() + ax.set_xlabel(self.plot_row_item.dislay_name) + ax.set_ylabel('Absolute error') + ax.set_title('Title (display all the other parameters)') + + @staticmethod + def estimation_error_max_stable_unitary_frechet(**kwargs_single_points): + # Get the argument from kwargs + print(kwargs_single_points) + max_stable_model = MaxStableModelItem.value_from_kwargs(**kwargs_single_points) + spatial_coordinate_class = SpatialCoordinateClassItem.value_from_kwargs(**kwargs_single_points) + nb_station = NbStationItem.value_from_kwargs(**kwargs_single_points) + spatial_params = SpatialParamsItem.value_from_kwargs(**kwargs_single_points) + nb_obs = NbObservationItem.value_from_kwargs(**kwargs_single_points) + # Run the estimation + spatial_coordinate = spatial_coordinate_class.from_nb_points(nb_points=nb_station, **spatial_params) + dataset = SimulatedDataset.from_max_stable_sampling(nb_obs=nb_obs, max_stable_model=max_stable_model, + spatial_coordinates=spatial_coordinate) + estimator = MaxStableEstimator(dataset, max_stable_model) + estimator.fit() + errors = estimator.error(max_stable_model.params_sample) + return errors diff --git a/extreme_estimator/robustness_plot/spatial_robustness.py b/extreme_estimator/robustness_plot/spatial_robustness.py new file mode 100644 index 0000000000000000000000000000000000000000..aa5e35c51b9fbdac7d6fb9475ecd881a95004ac9 --- /dev/null +++ b/extreme_estimator/robustness_plot/spatial_robustness.py @@ -0,0 +1,15 @@ +from extreme_estimator.R_fit.max_stable_fit.max_stable_models import GaussianMSP, BrownResick +from extreme_estimator.robustness_plot.abstract_robustness import DisplayItem, AbstractRobustnessPlot, \ + SpatialCoordinateClassItem, NbObservationItem, NbStationItem, MaxStableModelItem, SpatialParamsItem +from spatio_temporal_dataset.spatial_coordinates.generated_coordinate import CircleCoordinates + +spatial_robustness = AbstractRobustnessPlot(grid_row_item=SpatialCoordinateClassItem, + grid_column_item=NbObservationItem, + plot_row_item=NbStationItem, + plot_label_item=MaxStableModelItem) + +# Put only the parameter that will vary +spatial_robustness.robustness_grid_plot(**{ + NbStationItem.argument_name: [10, 20, 30, 40, 50], + MaxStableModelItem.argument_name: [GaussianMSP(), BrownResick()][:] +}) diff --git a/spatio_temporal_dataset/__init__.py b/spatio_temporal_dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/spatio_temporal_dataset/dataset/abstract_dataset.py b/spatio_temporal_dataset/dataset/abstract_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..25d13e0e9653dfa2d460c17631ef10cc8c565828 --- /dev/null +++ b/spatio_temporal_dataset/dataset/abstract_dataset.py @@ -0,0 +1,44 @@ +import os +import os.path as op +import pandas as pd +from spatio_temporal_dataset.temporal_maxima.temporal_maxima import TemporalMaxima +from spatio_temporal_dataset.spatial_coordinates.abstract_coordinate import AbstractSpatialCoordinates + + +class AbstractDataset(object): + + def __init__(self, temporal_maxima: TemporalMaxima, spatial_coordinates: AbstractSpatialCoordinates): + assert (temporal_maxima.index == spatial_coordinates.index).all() + self.temporal_maxima = temporal_maxima + self.spatial_coordinates = spatial_coordinates + + @classmethod + def from_csv(cls, csv_path: str): + assert op.exists(csv_path) + df = pd.read_csv(csv_path) + temporal_maxima = TemporalMaxima.from_df(df) + spatial_coordinates = AbstractSpatialCoordinates.from_df(df) + return cls(temporal_maxima=temporal_maxima, spatial_coordinates=spatial_coordinates) + + def to_csv(self, csv_path: str): + dirname = op.dirname(csv_path) + if not op.exists(dirname): + os.makedirs(dirname) + self.df_dataset.to_csv(csv_path) + + @property + def df_dataset(self) -> pd.DataFrame: + # Merge dataframes with the maxima and with the coordinates + return self.temporal_maxima.df_maxima.join(self.spatial_coordinates.df_coord) + + @property + def coord(self): + return self.spatial_coordinates.coord + + @property + def maxima(self): + return self.temporal_maxima.maxima + + +class RealDataset(AbstractDataset): + pass diff --git a/spatio_temporal_dataset/dataset/simulation_dataset.py b/spatio_temporal_dataset/dataset/simulation_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..b7e0175232b7c5fed9b572ab0ec93c1ee72d56aa --- /dev/null +++ b/spatio_temporal_dataset/dataset/simulation_dataset.py @@ -0,0 +1,32 @@ +from extreme_estimator.R_fit.max_stable_fit.max_stable_models import MaxStableModel, GaussianMSP +import pandas as pd +from spatio_temporal_dataset.dataset.abstract_dataset import AbstractDataset +from spatio_temporal_dataset.temporal_maxima.temporal_maxima import TemporalMaxima +from spatio_temporal_dataset.spatial_coordinates.abstract_coordinate import AbstractSpatialCoordinates +from spatio_temporal_dataset.spatial_coordinates.generated_coordinate import CircleCoordinates + + +class SimulatedDataset(AbstractDataset): + + def __init__(self, temporal_maxima: TemporalMaxima, spatial_coordinates: AbstractSpatialCoordinates, + max_stable_model: MaxStableModel): + super().__init__(temporal_maxima, spatial_coordinates) + self.max_stable_model = max_stable_model + + @classmethod + def from_max_stable_sampling(cls, nb_obs: int, max_stable_model:MaxStableModel, spatial_coordinates: AbstractSpatialCoordinates): + maxima = max_stable_model.rmaxstab(nb_obs=nb_obs, coord=spatial_coordinates.coord) + df_maxima = pd.DataFrame(data=maxima, index=spatial_coordinates.index) + temporal_maxima = TemporalMaxima(df_maxima=df_maxima) + return cls(temporal_maxima=temporal_maxima, spatial_coordinates=spatial_coordinates, max_stable_model=max_stable_model) + + +if __name__ == '__main__': + coord = CircleCoordinates.from_nb_points(nb_points=5, max_radius=1) + max_stable_model = GaussianMSP() + dataset = SimulatedDataset.from_max_stable_sampling(nb_obs=50, max_stable_model=max_stable_model, spatial_coordinates=coord) + print(dataset.df_dataset) + + + + diff --git a/spatio_temporal_dataset/marginals/__init__.py b/spatio_temporal_dataset/marginals/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/spatio_temporal_dataset/marginals/abstract_marginals.py b/spatio_temporal_dataset/marginals/abstract_marginals.py new file mode 100644 index 0000000000000000000000000000000000000000..63a9b49d475e18c34ac041d99bc19d489422f3f0 --- /dev/null +++ b/spatio_temporal_dataset/marginals/abstract_marginals.py @@ -0,0 +1,48 @@ +from typing import List + +from R.gev_fit.gev_marginal import GevMarginal, frechet_unitary_transformation +from spatio_temporal_dataset.stations.station import Station + + +class AbstractMarginals(object): + + def __init__(self, stations: List[Station]): + self.stations = stations + self.gev_marginals = [] # type: List[GevMarginal] + + # Compute some temporal arguments + self.years_of_study = self.stations[0].year_of_study + self.temporal_window_size = 20 + + + self.smoothing_function = None + + def sample_marginals(self, smoothing_function): + pass + + @property + def gev_parameters(self): + return [gev_marginal.gev_parameters_estimated for gev_marginal in self.gev_marginals] + + +class FrechetUnitaryTransformationFunction(object): + pass + +class NoSmoothing(object): + + def gev_parameters(self, coordinate): + pass + +class ContinuousSmoothing(object): + + def frechet_unitary_transformation(self, data, coordinate): + gev_parameters = self.gev_parameters(coordinate) + transformed_data = frechet_unitary_transformation(data, gev_parameters) + + + def gev_parameters(self, coordinate): + return + return () + +if __name__ == '__main__': + pass \ No newline at end of file diff --git a/spatio_temporal_dataset/marginals/spatial_marginals.py b/spatio_temporal_dataset/marginals/spatial_marginals.py new file mode 100644 index 0000000000000000000000000000000000000000..ef411a53da08a8cc530e8ab9f0a653fc3b66e7dd --- /dev/null +++ b/spatio_temporal_dataset/marginals/spatial_marginals.py @@ -0,0 +1,13 @@ +from typing import List + +from spatio_temporal_dataset.marginals.abstract_marginals import AbstractMarginals +from R.gev_fit.gev_marginal import GevMarginal +from spatio_temporal_dataset.stations.station import Station + + +class SpatialMarginal(AbstractMarginals): + + def __init__(self, stations: List[Station]): + super().__init__(stations) + for station in self.stations: + self.gev_marginals.append(GevMarginal(coordinate=station, data=station.annual_maxima.values)) \ No newline at end of file diff --git a/spatio_temporal_dataset/marginals/spatio_temporal_marginals.py b/spatio_temporal_dataset/marginals/spatio_temporal_marginals.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/spatio_temporal_dataset/max_stable/abstract_max_stable.py b/spatio_temporal_dataset/max_stable/abstract_max_stable.py new file mode 100644 index 0000000000000000000000000000000000000000..a2fe3404415be5a662a59348957526422383e779 --- /dev/null +++ b/spatio_temporal_dataset/max_stable/abstract_max_stable.py @@ -0,0 +1,10 @@ + + +class AbstractMaxStableOptimization(object): + + def __init__(self, marginals): + pass + """ + We can conclude on the dimension of the max stable + """ + diff --git a/spatio_temporal_dataset/spatial_coordinates/abstract_coordinate.py b/spatio_temporal_dataset/spatial_coordinates/abstract_coordinate.py new file mode 100644 index 0000000000000000000000000000000000000000..3d25de71f2056f24f2ba3f69e703049cf70dc211 --- /dev/null +++ b/spatio_temporal_dataset/spatial_coordinates/abstract_coordinate.py @@ -0,0 +1,61 @@ +import os.path as op +import pandas as pd +import matplotlib.pyplot as plt + + +class AbstractSpatialCoordinates(object): + + # Columns + COORD_X = 'coord_x' + COORD_Y = 'coord_y' + COORD_SPLIT = 'coord_split' + # Constants + TRAIN_SPLIT_STR = 'train_split' + TEST_SPLIT_STR = 'test_split' + + def __init__(self, df_coord: pd.DataFrame, s_split: pd.Series = None): + self.s_split = s_split + self.df_coord = df_coord + + @classmethod + def from_df(cls, df): + assert cls.COORD_X in df.columns and cls.COORD_Y in df.columns + df_coord = df.loc[:, [cls.COORD_X, cls.COORD_Y]] + s_split = df[cls.COORD_SPLIT] if cls.COORD_SPLIT in df.columns else None + return cls(df_coord=df_coord, s_split=s_split) + + @classmethod + def from_csv(cls, csv_path): + assert op.exists(csv_path) + df = pd.read_csv(csv_path) + return cls.from_df(df) + + def df_coord_split(self, split_str): + assert self.s_split is not None + ind = self.s_split == split_str + return self.df_coord.loc[ind] + + @property + def df_coord_train(self): + return self.df_coord_split(self.TRAIN_SPLIT_STR) + + @property + def df_coord_test(self): + return self.df_coord_split(self.TEST_SPLIT_STR) + + @property + def nb_points(self): + return len(self.df_coord) + + @property + def coord(self): + return self.df_coord.values + + @property + def index(self): + return self.df_coord.index + + def visualization(self): + x, y = self.coord[:, 0], self.coord[:, 1] + plt.scatter(x, y) + plt.show() diff --git a/spatio_temporal_dataset/spatial_coordinates/generated_coordinate.py b/spatio_temporal_dataset/spatial_coordinates/generated_coordinate.py new file mode 100644 index 0000000000000000000000000000000000000000..b937b22e9d4f7944b27569bf8b8ee2bbf9d57153 --- /dev/null +++ b/spatio_temporal_dataset/spatial_coordinates/generated_coordinate.py @@ -0,0 +1,46 @@ +import math +import numpy as np +import pandas as pd + +from extreme_estimator.R_fit.utils import get_loaded_r +from spatio_temporal_dataset.spatial_coordinates.abstract_coordinate import AbstractSpatialCoordinates +import matplotlib.pyplot as plt + + +class SimulatedCoordinates(AbstractSpatialCoordinates): + """ + Common manipulation on generated coordinates + """ + + def __init__(self, df_coord, s_split=None): + super().__init__(df_coord, s_split) + + @classmethod + def from_nb_points(cls, nb_points, **kwargs): + pass + + +class CircleCoordinates(SimulatedCoordinates): + + @classmethod + def from_nb_points(cls, nb_points, **kwargs): + max_radius = kwargs.get('max_radius', 1.0) + # Sample uniformly inside the circle + r = get_loaded_r() + angles = np.array(r.runif(nb_points, max=2 * math.pi)) + radius = np.sqrt(np.array(r.runif(nb_points, max=max_radius))) + df = pd.DataFrame.from_dict({cls.COORD_X: radius * np.cos(angles), cls.COORD_Y: radius * np.sin(angles)}) + return cls.from_df(df) + + def visualization(self): + r = 1.0 + circle1 = plt.Circle((0, 0), r, color='r', fill=False) + plt.gcf().gca().set_xlim((-r, r)) + plt.gcf().gca().set_ylim((-r, r)) + plt.gcf().gca().add_artist(circle1) + super().visualization() + + +if __name__ == '__main__': + coord = CircleCoordinates.from_nb_points(nb_points=500, max_radius=1) + coord.visualization() diff --git a/spatio_temporal_dataset/spatio_temporal_data_handler.py b/spatio_temporal_dataset/spatio_temporal_data_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..b94df33b796673b636ec98dbe9833b99f07bff15 --- /dev/null +++ b/spatio_temporal_dataset/spatio_temporal_data_handler.py @@ -0,0 +1,59 @@ +from typing import List +import pandas as pd +from spatio_temporal_dataset.marginals.abstract_marginals import AbstractMarginals +from spatio_temporal_dataset.marginals.spatial_marginals import SpatialMarginal +from spatio_temporal_dataset.stations.station import Station, load_stations_from_dataframe +from spatio_temporal_dataset.stations.station_distance import EuclideanDistance2D, StationDistance +import pickle +import os.path as op +from itertools import combinations + + +class SpatioTemporalDataHandler(object): + + def __init__(self, marginals_class: type, stations: List[Station], station_distance: StationDistance): + self.stations = stations + + # Compute once the distances between stations + for station1, station2 in combinations(self.stations, 2): + distance = station_distance.compute_distance(station1, station2) + station1.distance[station2] = distance + station2.distance[station1] = distance + + # Compute the marginals + self.marginals = marginals_class(self.stations) # type: AbstractMarginals + + # Define the max stable + # self.max_stable = + + print(self.marginals.gev_parameters) + + @classmethod + def from_dataframe(cls, df): + return cls.from_spatial_dataframe(df) + + @classmethod + def from_spatial_dataframe(cls, df): + stations = load_stations_from_dataframe(df) + marginal_class = SpatialMarginal + station_distance = EuclideanDistance2D() + return cls(marginals_class=marginal_class, stations=stations, station_distance=station_distance) + + +def get_spatio_temporal_data_handler(pickle_path: str, load_pickle: bool = True, dump_pickle: bool = False, *args) \ + -> SpatioTemporalDataHandler: + # Either load or dump pickle of a SpatioTemporalDataHandler object + assert load_pickle or dump_pickle + if load_pickle: + assert op.exists(pickle_path) and not dump_pickle + spatio_temporal_experiment = pickle.load(pickle_path) + else: + assert not op.exists(pickle_path) + spatio_temporal_experiment = SpatioTemporalDataHandler(*args) + pickle.dump(spatio_temporal_experiment, file=pickle_path) + return spatio_temporal_experiment + + +if __name__ == '__main__': + df = pd.DataFrame(1, index=['station1', 'station2'], columns=['200' + str(i) for i in range(18)]) + xp = SpatioTemporalDataHandler.from_dataframe(df) diff --git a/spatio_temporal_dataset/stations/__init__.py b/spatio_temporal_dataset/stations/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/spatio_temporal_dataset/stations/station.py b/spatio_temporal_dataset/stations/station.py new file mode 100644 index 0000000000000000000000000000000000000000..9e3f6aa1123350edb5548916c35c7b8a695174ca --- /dev/null +++ b/spatio_temporal_dataset/stations/station.py @@ -0,0 +1,29 @@ +import pandas as pd +import numpy as np + + +class Station(object): + def __init__(self, name: str, annual_maxima: pd.Series, longitude=np.nan, latitude=np.nan, altitude=np.nan): + self.annual_maxima = annual_maxima + self.year_of_study = list(annual_maxima.index) + self.name = name + self.altitude = altitude + self.latitude = latitude + self.longitude = longitude + self.distance = {} + + +def load_stations_from_dataframe(df): + return [Station(name=i, annual_maxima=row) for i, row in df.iterrows()] + +def load_station_from_two_dataframe(df, location_df): + pass + + +if __name__ == '__main__': + df = pd.DataFrame(1, index=['station1', 'station2'], columns=['200' + str(i) for i in range(9)]) + stations = load_stations_from_dataframe(df) + station = stations[0] + print(station.name) + print(station.annual_maxima) + print(station.year_of_study) diff --git a/spatio_temporal_dataset/stations/station_distance.py b/spatio_temporal_dataset/stations/station_distance.py new file mode 100644 index 0000000000000000000000000000000000000000..d4a40eb5ef918d278e4eb5483b11a2fec93c2019 --- /dev/null +++ b/spatio_temporal_dataset/stations/station_distance.py @@ -0,0 +1,22 @@ +from spatio_temporal_dataset.stations.station import Station +import numpy as np + + +class StationDistance(object): + + @classmethod + def compute_distance(self, station1: Station, station2: Station) -> float: + return np.nan + + +def euclidean_distance(arr1: np.ndarray, arr2: np.ndarray) -> float: + return np.linalg.norm(arr1 - arr2) + + +class EuclideanDistance2D(StationDistance): + + @classmethod + def compute_distance(self, station1: Station, station2: Station) -> float: + print(station1.latitude) + stations_coordinates = [np.array([station.latitude, station.longitude]) for station in [station1, station2]] + return euclidean_distance(*stations_coordinates) diff --git a/spatio_temporal_dataset/temporal_maxima/__init__.py b/spatio_temporal_dataset/temporal_maxima/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/spatio_temporal_dataset/temporal_maxima/temporal_maxima.py b/spatio_temporal_dataset/temporal_maxima/temporal_maxima.py new file mode 100644 index 0000000000000000000000000000000000000000..a7c19cb339e1bfeb311e934531b9f561b8a89754 --- /dev/null +++ b/spatio_temporal_dataset/temporal_maxima/temporal_maxima.py @@ -0,0 +1,24 @@ +import pandas as pd + +class TemporalMaxima(object): + + def __init__(self, df_maxima: pd.DataFrame): + """ + Main attribute of the class is the DataFrame df_maxima + Index are stations index, Columns are the year of the maxima + """ + self.df_maxima = df_maxima + + @classmethod + def from_df(cls, df): + pass + + @property + def index(self): + return self.df_maxima.index + + @property + def maxima(self): + return self.df_maxima.values + + # todo: add the transformed_maxima and the not-trasnformed maxima diff --git a/test/test_pipeline.py b/test/test_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..252f829bda873a853631bf8a7681f6bed9516dec --- /dev/null +++ b/test/test_pipeline.py @@ -0,0 +1,60 @@ +import unittest +import pandas as pd + +from spatio_temporal_dataset.spatio_temporal_data_handler import SpatioTemporalDataHandler + + +class TestPipeline(unittest): + + def main_pipeline(self): + # Select a type of marginals (either spatial, spatio temporal, temporal) + # this will define the dimension of the climatic space of interest + pass + # Select the max stable + + # Define an optimization process + # The algo: In 1 time, in 2 times, ..., or more complex patterns + # This algo have at least main procedures (that might be repeated several times) + + # For each procedure, we shall define: + # - The loss + # - The optimization method for each part of the process + + def blanchet_smooth_pipeline(self): + pass + # Spatial marginal + + # NO MAX STABLE + + # Procedure: + # Optimization of a single likelihood process that sums up the likelihood of all the terms. + + def padoan_extreme_pipeline(self): + pass + # Spatial marginal + + # todo: question, when we are optimizing the full Pairwise loss, are we just optimization the relations ? + # or ideally do we need to add the term of order 1 + + def gaume(self): + # Combining the 2 + pass + + + + + def test_pipeline_spatial(self): + pass + # Sample from a + + # Fit the spatio temporal experiment margin + + # Fit the max stable process + + def test_dataframe_fit_unitary(self): + df = pd.DataFrame(1, index=['station1', 'station2'], columns=['200' + str(i) for i in range(18)]) + xp = SpatioTemporalDataHandler.from_dataframe(df) + +if __name__ == '__main__': + df = pd.DataFrame(1, index=['station1', 'station2'], columns=['200' + str(i) for i in range(18)]) + xp = SpatioTemporalDataHandler.from_dataframe(df) \ No newline at end of file