diff --git a/experiment/meteo_france_data/scm_models_data/abstract_study.py b/experiment/meteo_france_data/scm_models_data/abstract_study.py index 647e6549455dd4bd619af4187a00699238f321cd..65fe393b3c03cd24679782f7489221196f038197 100644 --- a/experiment/meteo_france_data/scm_models_data/abstract_study.py +++ b/experiment/meteo_france_data/scm_models_data/abstract_study.py @@ -16,7 +16,8 @@ from matplotlib.colors import Normalize from netCDF4 import Dataset from experiment.meteo_france_data.scm_models_data.abstract_variable import AbstractVariable -from experiment.meteo_france_data.scm_models_data.scm_constants import ALTITUDES, ZS_INT_23, ZS_INT_MASK, LONGITUDES, LATITUDES +from experiment.meteo_france_data.scm_models_data.scm_constants import ALTITUDES, ZS_INT_23, ZS_INT_MASK, LONGITUDES, \ + LATITUDES from experiment.meteo_france_data.visualization.utils import get_km_formatter from extreme_estimator.extreme_models.margin_model.margin_function.abstract_margin_function import \ AbstractMarginFunction @@ -48,84 +49,12 @@ class AbstractStudy(object): self.year_max = year_max self.multiprocessing = multiprocessing - def write_to_file(self, df: pd.DataFrame): - if not op.exists(self.result_full_path): - os.makedirs(self.result_full_path, exist_ok=True) - df.to_csv(op.join(self.result_full_path, 'merged_array_{}_altitude.csv'.format(self.altitude))) - - """ Data """ - - @property - def df_all_daily_time_series_concatenated(self) -> pd.DataFrame: - df_list = [pd.DataFrame(time_serie, columns=self.study_massif_names) for time_serie in - self.year_to_daily_time_serie_array.values()] - df_concatenated = pd.concat(df_list) - return df_concatenated + """ Annual maxima """ @property def observations_annual_maxima(self) -> AnnualMaxima: return AnnualMaxima(df_maxima_gev=pd.DataFrame(self.year_to_annual_maxima, index=self.study_massif_names)) - @property - def df_annual_total(self) -> pd.DataFrame: - return pd.DataFrame(self.year_to_annual_total, index=self.study_massif_names).transpose() - - def annual_aggregation_function(self, *args, **kwargs): - raise NotImplementedError() - - """ Load some attributes only once """ - - @property - def year_to_dataset_ordered_dict(self) -> OrderedDict: - print('This code is quite long... ' - 'You should consider year_to_variable which is way faster when multiprocessing=True') - # Map each year to the correspond netCDF4 Dataset - path_files, ordered_years = self.ordered_years_and_path_files() - datasets = [Dataset(path_file) for path_file in path_files] - return OrderedDict(zip(ordered_years, datasets)) - - def ordered_years_and_path_files(self): - nc_files = [(int(f.split('_')[-2][:4]), f) for f in os.listdir(self.study_full_path) if f.endswith('.nc')] - ordered_years, path_files = zip(*[(year, op.join(self.study_full_path, nc_file)) - for year, nc_file in sorted(nc_files, key=lambda t: t[0]) - if self.year_min <= year < self.year_max]) - return path_files, ordered_years - - @property - def ordered_years(self): - return self.ordered_years_and_path_files()[1] - - @cached_property - def year_to_variable_array(self) -> OrderedDict: - # Map each year to the variable array - path_files, ordered_years = self.ordered_years_and_path_files() - if self.multiprocessing: - with Pool(NB_CORES) as p: - variables = p.map(self.load_variables, path_files) - else: - variables = [self.load_variables(path_file) for path_file in path_files] - return OrderedDict(zip(ordered_years, variables)) - - def load_variables(self, path_file): - dataset = Dataset(path_file) - keyword = self.load_keyword() - if isinstance(keyword, str): - return np.array(dataset.variables[keyword]) - else: - return [np.array(dataset.variables[k]) for k in keyword] - - def load_keyword(self): - return self.variable_class.keyword() - - @property - def start_year_and_stop_year(self) -> Tuple[int, int]: - ordered_years = self.ordered_years - return ordered_years[0], ordered_years[-1] - - @cached_property - def year_to_daily_time_serie_array(self) -> OrderedDict: - return self._year_to_daily_time_serie_array - @cached_property def year_to_annual_maxima(self) -> OrderedDict: # Map each year to an array of size nb_massif @@ -134,6 +63,15 @@ class AbstractStudy(object): year_to_annual_maxima[year] = time_serie.max(axis=0) return year_to_annual_maxima + """ Annual total """ + + @property + def df_annual_total(self) -> pd.DataFrame: + return pd.DataFrame(self.year_to_annual_total, index=self.study_massif_names).transpose() + + def annual_aggregation_function(self, *args, **kwargs): + raise NotImplementedError() + @cached_property def year_to_annual_total(self) -> OrderedDict: # Map each year to an array of size nb_massif @@ -145,10 +83,15 @@ class AbstractStudy(object): def apply_annual_aggregation(self, time_serie): return self.annual_aggregation_function(time_serie, axis=0) - def instantiate_variable_object(self, variable_array) -> AbstractVariable: - return self.variable_class(variable_array) + """ Load daily observations """ - """ Private methods to be overwritten """ + @cached_property + def year_to_daily_time_serie_array(self) -> OrderedDict: + return self._year_to_daily_time_serie_array + + @property + def _year_to_max_daily_time_serie(self) -> OrderedDict: + return self._year_to_daily_time_serie_array @property def _year_to_daily_time_serie_array(self) -> OrderedDict: @@ -167,78 +110,117 @@ class AbstractStudy(object): year_to_daily_time_serie_array[year] = daily_time_serie return year_to_daily_time_serie_array - @property - def _year_to_max_daily_time_serie(self) -> OrderedDict: - return self._year_to_daily_time_serie_array + def instantiate_variable_object(self, variable_array) -> AbstractVariable: + return self.variable_class(variable_array) + + """ Load Variables and Datasets """ + + @cached_property + def year_to_variable_array(self) -> OrderedDict: + # Map each year to the variable array + path_files, ordered_years = self.ordered_years_and_path_files + if self.multiprocessing: + with Pool(NB_CORES) as p: + variables = p.map(self.load_variables, path_files) + else: + variables = [self.load_variables(path_file) for path_file in path_files] + return OrderedDict(zip(ordered_years, variables)) + + def load_variables(self, path_file): + dataset = Dataset(path_file) + keyword = self.load_keyword() + if isinstance(keyword, str): + return np.array(dataset.variables[keyword]) + else: + return [np.array(dataset.variables[k]) for k in keyword] - ########## + def load_keyword(self): + return self.variable_class.keyword() @property - def study_massif_names(self) -> List[str]: - return self.altitude_to_massif_names[self.altitude] + def year_to_dataset_ordered_dict(self) -> OrderedDict: + print('This code is quite long... ' + 'You should consider year_to_variable which is way faster when multiprocessing=True') + # Map each year to the correspond netCDF4 Dataset + path_files, ordered_years = self.ordered_years_and_path_files + datasets = [Dataset(path_file) for path_file in path_files] + return OrderedDict(zip(ordered_years, datasets)) @cached_property - def all_massif_names(self) -> List[str]: - """ - Pour l'identification des massifs, le numéro de la variable massif_num correspond à celui de l'attribut num_opp - """ - metadata_path = op.join(self.full_path, self.REANALYSIS_FOLDER, 'metadata') - dbf = Dbf5(op.join(metadata_path, 'massifs_alpes.dbf')) - df = dbf.to_dataframe().copy() # type: pd.DataFrame - dbf.f.close() - df.sort_values(by='num_opp', inplace=True) - all_massif_names = list(df['nom']) - # Correct a massif name - all_massif_names[all_massif_names.index('Beaufortin')] = 'Beaufortain' - return all_massif_names + def ordered_years_and_path_files(self): + nc_files = [(int(f.split('_')[-2][:4]), f) for f in os.listdir(self.study_full_path) if f.endswith('.nc')] + ordered_years, path_files = zip(*[(year, op.join(self.study_full_path, nc_file)) + for year, nc_file in sorted(nc_files, key=lambda t: t[0]) + if self.year_min <= year < self.year_max]) + return path_files, ordered_years + + """ Temporal properties """ @property - def original_safran_massif_id_to_massif_name(self) -> Dict[int, str]: - return {massif_id: massif_name for massif_id, massif_name in enumerate(self.all_massif_names)} + def ordered_years(self): + return self.ordered_years_and_path_files[1] - @cached_property - def massifs_coordinates_for_display(self) -> AbstractSpatialCoordinates: - # Coordinate object that represents the massif coordinates in Lambert extended - # extracted for a csv file, and used only for display purposes - df = self.load_df_centroid() - # Filter, keep massifs present at the altitude of interest - df = df.loc[self.study_massif_names] - # Build coordinate object from df_centroid - return AbstractSpatialCoordinates.from_df(df) + @property + def start_year_and_stop_year(self) -> Tuple[int, int]: + ordered_years = self.ordered_years + return ordered_years[0], ordered_years[-1] + + """ Spatial properties """ + + @property + def study_massif_names(self) -> List[str]: + return self.altitude_to_massif_names[self.altitude] @property def df_massifs_longitude_and_latitude(self) -> pd.DataFrame: # DataFrame object that represents the massif coordinates in degrees extracted from the SCM data + # Another way of getting the latitudes and longitudes could have been the following: # any_ordered_dict = list(self.year_to_dataset_ordered_dict.values())[0] # longitude = np.array(any_ordered_dict.variables['longitude']) # latitude = np.array(any_ordered_dict.variables['latitude']) longitude = np.array(LONGITUDES) latitude = np.array(LATITUDES) - index = self.altitude_to_massif_names[self.altitude] columns = [AbstractSpatialCoordinates.COORDINATE_X, AbstractSpatialCoordinates.COORDINATE_Y] data = dict(zip(columns, [longitude[self.altitude_mask], latitude[self.altitude_mask]])) - return pd.DataFrame(data=data, index=index, columns=columns) + return pd.DataFrame(data=data, index=self.study_massif_names, columns=columns) - def load_df_centroid(self) -> pd.DataFrame: - # Load df_centroid containing all the massif names - df_centroid = pd.read_csv(op.join(self.map_full_path, 'coordonnees_massifs_alpes.csv')) - df_centroid.set_index('NOM', inplace=True) - # Check that the names of massifs are the same - symmetric_difference = set(df_centroid.index).symmetric_difference(self.all_massif_names) - assert len(symmetric_difference) == 0, symmetric_difference - # Sort the column in the order of the SAFRAN dataset - df_centroid = df_centroid.reindex(self.all_massif_names, axis=0) - for coord_column in [AbstractCoordinates.COORDINATE_X, AbstractCoordinates.COORDINATE_Y]: - df_centroid.loc[:, coord_column] = df_centroid[coord_column].str.replace(',', '.').astype(float) - return df_centroid + @property + def missing_massif_name(self): + return set(self.all_massif_names) - set(self.altitude_to_massif_names[self.altitude]) + + @cached_property + def altitude_mask(self): + altitude_mask = ZS_INT_MASK == self.altitude + assert np.sum(altitude_mask) == len(self.altitude_to_massif_names[self.altitude]) + return altitude_mask + + """ Path properties """ @property - def coordinate_id_to_massif_name(self) -> Dict[int, str]: - df_centroid = self.load_df_centroid() - return dict(zip(df_centroid['id'], df_centroid.index)) + def title(self): + return "{}/at altitude {}m ({} mountain chains)".format(self.variable_name, self.altitude, + len(self.study_massif_names)) + + @property + def variable_name(self): + return self.variable_class.NAME + ' (in {})'.format(self.variable_unit) + + @property + def variable_unit(self): + return self.variable_class.UNIT """ Visualization methods """ + @cached_property + def massifs_coordinates_for_display(self) -> AbstractSpatialCoordinates: + # Coordinate object that represents the massif coordinates in Lambert extended + # extracted for a csv file, and used only for display purposes + df = self.load_df_centroid() + # Filter, keep massifs present at the altitude of interest + df = df.loc[self.study_massif_names] + # Build coordinate object from df_centroid + return AbstractSpatialCoordinates.from_df(df) + def visualize_study(self, ax=None, massif_name_to_value=None, show=True, fill=True, replace_blue_by_white=True, label=None, add_text=False, cmap=None, vmax=100, vmin=0): if massif_name_to_value is None: @@ -300,49 +282,68 @@ class AbstractStudy(object): if show: plt.show() + """ + CLASS ATTRIBUTES COMMON TO ALL OBJECTS + (written as object attributes/methods for simplicity) + """ + + """ Path properties """ + @property - def idx_to_coords_list(self): - df_massif = pd.read_csv(op.join(self.map_full_path, 'massifsalpes.csv')) - coord_tuples = [(row_massif['idx'], row_massif[AbstractCoordinates.COORDINATE_X], - row_massif[AbstractCoordinates.COORDINATE_Y]) - for _, row_massif in df_massif.iterrows()] - all_idxs = set([t[0] for t in coord_tuples]) - return {idx: [coords for idx_loop, *coords in coord_tuples if idx == idx_loop] for idx in all_idxs} + def relative_path(self) -> str: + return r'local/spatio_temporal_datasets' @property - def all_coords_list(self): - all_values = [] - for e in self.idx_to_coords_list.values(): - all_values.extend(e) - return list(zip(*all_values)) + def full_path(self) -> str: + return get_full_path(relative_path=self.relative_path) @property - def visualization_x_limits(self): - return min(self.all_coords_list[0]), max(self.all_coords_list[0]) + def map_full_path(self) -> str: + return op.join(self.full_path, 'map') @property - def visualization_y_limits(self): - return min(self.all_coords_list[1]), max(self.all_coords_list[1]) + def result_full_path(self) -> str: + return op.join(self.full_path, 'results') + + @property + def study_full_path(self) -> str: + assert self.model_name in ['Safran', 'Crocus'] + study_folder = 'meteo' if self.model_name is 'Safran' else 'pro' + return op.join(self.full_path, self.REANALYSIS_FOLDER, study_folder) + + """ Spatial properties """ + + @property + def original_safran_massif_id_to_massif_name(self) -> Dict[int, str]: + return {massif_id: massif_name for massif_id, massif_name in enumerate(self.all_massif_names)} @cached_property - def mask_french_alps(self): - resolution = AbstractMarginFunction.VISUALIZATION_RESOLUTION - mask_french_alps = np.zeros([resolution, resolution]) - for polygon in self.idx_to_coords_list.values(): - xy_values = list(zip(*polygon)) - normalized_polygon = [] - for values, (minlim, max_lim) in zip(xy_values, [self.visualization_x_limits, self.visualization_y_limits]): - values -= minlim - values *= resolution / (max_lim - minlim) - normalized_polygon.append(values) - normalized_polygon = list(zip(*normalized_polygon)) - img = Image.new('L', (resolution, resolution), 0) - ImageDraw.Draw(img).polygon(normalized_polygon, outline=1, fill=1) - mask_massif = np.array(img) - mask_french_alps += mask_massif - return ~np.array(mask_french_alps, dtype=bool) + def all_massif_names(self) -> List[str]: + """ + Pour l'identification des massifs, le numéro de la variable massif_num correspond à celui de l'attribut num_opp + """ + metadata_path = op.join(self.full_path, self.REANALYSIS_FOLDER, 'metadata') + dbf = Dbf5(op.join(metadata_path, 'massifs_alpes.dbf')) + df = dbf.to_dataframe().copy() # type: pd.DataFrame + dbf.f.close() + df.sort_values(by='num_opp', inplace=True) + all_massif_names = list(df['nom']) + # Correct a massif name + all_massif_names[all_massif_names.index('Beaufortin')] = 'Beaufortain' + return all_massif_names - """ Some properties """ + def load_df_centroid(self) -> pd.DataFrame: + # Load df_centroid containing all the massif names + df_centroid = pd.read_csv(op.join(self.map_full_path, 'coordonnees_massifs_alpes.csv')) + df_centroid.set_index('NOM', inplace=True) + # Check that the names of massifs are the same + symmetric_difference = set(df_centroid.index).symmetric_difference(self.all_massif_names) + assert len(symmetric_difference) == 0, symmetric_difference + # Sort the column in the order of the SAFRAN dataset + df_centroid = df_centroid.reindex(self.all_massif_names, axis=0) + for coord_column in [AbstractCoordinates.COORDINATE_X, AbstractCoordinates.COORDINATE_Y]: + df_centroid.loc[:, coord_column] = df_centroid[coord_column].str.replace(',', '.').astype(float) + return df_centroid @cached_property def massif_name_to_altitudes(self) -> Dict[str, List[int]]: @@ -364,49 +365,51 @@ class AbstractStudy(object): altitude_to_massif_names[altitude].append(massif_name) return altitude_to_massif_names - @property - def missing_massif_name(self): - return set(self.all_massif_names) - set(self.altitude_to_massif_names[self.altitude]) - - @cached_property - def altitude_mask(self): - altitude_mask = ZS_INT_MASK == self.altitude - assert np.sum(altitude_mask) == len(self.altitude_to_massif_names[self.altitude]) - return altitude_mask - - @property - def title(self): - return "{}/at altitude {}m ({} mountain chains)".format(self.variable_name, self.altitude, - len(self.study_massif_names)) - - @property - def variable_name(self): - return self.variable_class.NAME + ' (in {})'.format(self.variable_unit) + """ Visualization methods """ @property - def variable_unit(self): - return self.variable_class.UNIT - - """ Some path properties """ + def coordinate_id_to_massif_name(self) -> Dict[int, str]: + df_centroid = self.load_df_centroid() + return dict(zip(df_centroid['id'], df_centroid.index)) @property - def relative_path(self) -> str: - return r'local/spatio_temporal_datasets' + def idx_to_coords_list(self): + df_massif = pd.read_csv(op.join(self.map_full_path, 'massifsalpes.csv')) + coord_tuples = [(row_massif['idx'], row_massif[AbstractCoordinates.COORDINATE_X], + row_massif[AbstractCoordinates.COORDINATE_Y]) + for _, row_massif in df_massif.iterrows()] + all_idxs = set([t[0] for t in coord_tuples]) + return {idx: [coords for idx_loop, *coords in coord_tuples if idx == idx_loop] for idx in all_idxs} @property - def full_path(self) -> str: - return get_full_path(relative_path=self.relative_path) + def all_coords_list(self): + all_values = [] + for e in self.idx_to_coords_list.values(): + all_values.extend(e) + return list(zip(*all_values)) @property - def map_full_path(self) -> str: - return op.join(self.full_path, 'map') + def visualization_x_limits(self): + return min(self.all_coords_list[0]), max(self.all_coords_list[0]) @property - def result_full_path(self) -> str: - return op.join(self.full_path, 'results') + def visualization_y_limits(self): + return min(self.all_coords_list[1]), max(self.all_coords_list[1]) - @property - def study_full_path(self) -> str: - assert self.model_name in ['Safran', 'Crocus'] - study_folder = 'meteo' if self.model_name is 'Safran' else 'pro' - return op.join(self.full_path, self.REANALYSIS_FOLDER, study_folder) + @cached_property + def mask_french_alps(self): + resolution = AbstractMarginFunction.VISUALIZATION_RESOLUTION + mask_french_alps = np.zeros([resolution, resolution]) + for polygon in self.idx_to_coords_list.values(): + xy_values = list(zip(*polygon)) + normalized_polygon = [] + for values, (minlim, max_lim) in zip(xy_values, [self.visualization_x_limits, self.visualization_y_limits]): + values -= minlim + values *= resolution / (max_lim - minlim) + normalized_polygon.append(values) + normalized_polygon = list(zip(*normalized_polygon)) + img = Image.new('L', (resolution, resolution), 0) + ImageDraw.Draw(img).polygon(normalized_polygon, outline=1, fill=1) + mask_massif = np.array(img) + mask_french_alps += mask_massif + return ~np.array(mask_french_alps, dtype=bool) diff --git a/experiment/meteo_france_data/scm_models_data/safran/safran.py b/experiment/meteo_france_data/scm_models_data/safran/safran.py index 767e2446ff18b2fe80f994109ba6cdc0b3b2a977..cd718e2239e4f147935d89f6fff0d52f853e16ce 100644 --- a/experiment/meteo_france_data/scm_models_data/safran/safran.py +++ b/experiment/meteo_france_data/scm_models_data/safran/safran.py @@ -67,7 +67,7 @@ if __name__ == '__main__': # print(study.all_massif_names) # print(study.massif_name_to_altitudes) # print(study.year_to_daily_time_serie_array[1958].shape) - print(study.missing_massif_name) + # print(study.missing_massif_name) # print(len(d.variables['time'])) # print(study.year_to_annual_total) diff --git a/experiment/meteo_france_data/visualization/study_visualization/study_visualizer.py b/experiment/meteo_france_data/visualization/study_visualization/study_visualizer.py index dcc1cc3a804da2dcae9627e30fc329ebf628ac6d..a91ea95408ed81ed6b89e656b9fd6bbd4132c847 100644 --- a/experiment/meteo_france_data/visualization/study_visualization/study_visualizer.py +++ b/experiment/meteo_france_data/visualization/study_visualization/study_visualizer.py @@ -749,9 +749,16 @@ class StudyVisualizer(object): for massif_name in self.study.study_massif_names} return pd.DataFrame(massif_to_gev_mle, columns=self.study.study_massif_names) + @property + def df_all_daily_time_series_concatenated(self) -> pd.DataFrame: + df_list = [pd.DataFrame(time_serie, columns=self.study.study_massif_names) for time_serie in + self.study.year_to_daily_time_serie_array.values()] + df_concatenated = pd.concat(df_list) + return df_concatenated + def df_gpd_mle(self, threshold) -> pd.DataFrame: # Fit a margin fit on each massif - massif_to_gev_mle = {massif_name: GpdMleFit(self.study.df_all_daily_time_series_concatenated[massif_name], + massif_to_gev_mle = {massif_name: GpdMleFit(self.df_all_daily_time_series_concatenated[massif_name], threshold=threshold).gpd_params.summary_serie for massif_name in self.study.study_massif_names} return pd.DataFrame(massif_to_gev_mle, columns=self.study.study_massif_names) diff --git a/test/test_experiment/test_SCM_study.py b/test/test_experiment/test_SCM_study.py index 7e233cfd4ea5aff8bc746287849e4f68f51a67f5..206bdf6dd1e3d37eaf01f843476107f107231f85 100644 --- a/test/test_experiment/test_SCM_study.py +++ b/test/test_experiment/test_SCM_study.py @@ -32,7 +32,7 @@ class TestSCMAllStudy(unittest.TestCase): only_first_one=False, verbose=False, altitudes=sample(set(ALL_ALTITUDES), k=nb_sample), nb_days=nb_days): self.assertTrue('day' in study.variable_name) - first_path_file = study.ordered_years_and_path_files()[0][0] + first_path_file = study.ordered_years_and_path_files[0][0] variable_array = study.load_variables(path_file=first_path_file) variable_object = study.instantiate_variable_object(variable_array) self.assertEqual((365, 263), variable_object.daily_time_serie_array.shape,