format.R 17.00 KiB
# \\\
# Copyright 2021-2022 Louis Héraut*1
# *1   INRAE, France
#      louis.heraut@inrae.fr
# This file is part of ash R toolbox.
# ash R toolbox is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# ash R toolbox is distributed in the hope that it will be useful, but 
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with ash R toolbox.  If not, see <https://www.gnu.org/licenses/>.
# ///
# processing/format.R
# Manages all the format problem of data and info. Mainly problem of
# input and output of the 'StatsAnalysisTrend' package. It also allows
# to join different selections of station and to gets exact period of
# trend analysis.
# Usefull library
library(dplyr)
library(Hmisc)
## 1. BEFORE TREND ANALYSE ___________________________________________
### 1.1. Joining selection ___________________________________________
# Joins tibbles of different selection of station as a unique one
join_selection = function (df_data_AEAG, df_data_INRAE, df_meta_AEAG, df_meta_INRAE) {
    # If there is an INRAE and an Agence de l'eau Adour-Garonne selection
    if (!is.null(df_data_INRAE) & !is.null(df_data_AEAG)) {
        # Gets the station in common
        common = levels(factor(df_meta_INRAE[df_meta_INRAE$code %in% df_meta_AEAG$code,]$code))
        # Gets the Nv station to add
        INRAEadd = levels(factor(df_meta_INRAE[!(df_meta_INRAE$code %in% df_meta_AEAG$code),]$code))
        # Selects only the INRAE meta to add
        df_meta_INRAEadd = df_meta_INRAE[df_meta_INRAE$code %in% INRAEadd,]
        # Names the source of the selection
        df_meta_AEAG$source = 'AEAG'
        df_meta_INRAEadd$source = 'INRAE'
        # Joins INRAE data to AEAG data
        df_meta = full_join(df_meta_AEAG, df_meta_INRAEadd)
        # Selects only the INRAE data to add
        df_data_INRAEadd = df_data_INRAE[df_data_INRAE$code %in% INRAEadd,]
        # Joins INRAE meta to AEAG meta
        df_data = full_join(df_data_AEAG, df_data_INRAEadd)
    # If there is just an Agence de l'eau Adour-Garonne selection
    } else if (is.null(df_data_INRAE) & !is.null(df_data_AEAG)) {
        df_meta_AEAG$source = 'AEAG'
        df_meta = df_meta_AEAG
        df_data = df_data_AEAG
7172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
# If there is just an INRAE selection } else if (!is.null(df_data_INRAE) & is.null(df_data_AEAG)) { df_meta_INRAE$source = 'INRAE' df_meta = df_meta_INRAE df_data = df_data_INRAE # If there is no selection } else { stop('No data') } return (list(data=df_data, meta=df_meta)) } ### 1.2. Manages missing data ________________________________________ missing_data = function (df_data, df_meta, yearLac_day=3, yearStart='01-01', Code=NULL, df_mod=NULL) { if (is.null(Code)) { # Get all different stations code Code = levels(factor(df_meta$code)) nCode = length(Code) } else { nCode = length(Code) } for (code in Code) { # Extracts the data corresponding to the code df_data_code = df_data[df_data$code == code,] DateMD = substr(df_data_code$Date, 6, 10) idyearStart = which(DateMD == yearStart) if (DateMD[1] != yearStart) { idyearStart = c(1, idyearStart) } NidyearStart = length(idyearStart) for (i in 1:NidyearStart) { Start = df_data_code$Date[idyearStart[i]] if (i < NidyearStart) { End = df_data_code$Date[idyearStart[i+1] - 1] } else { End = df_data_code$Date[length(df_data_code$Date)] } OkYear = df_data_code$Date >= Start & df_data_code$Date <= End df_data_code_year = df_data_code[OkYear,] StartReal = as.Date(paste(substr(Start, 1, 4), yearStart, sep='-')) EndReal = as.Date(paste(as.numeric(substr(Start, 1, 4)) + 1, yearStart, sep='-')) nbDate = as.numeric(difftime(EndReal, StartReal, units="days")) nbNA = sum(as.numeric(is.na(df_data_code_year$Value))) nbNA = nbNA + abs(as.numeric(difftime(StartReal, Start, units="days"))) nbNA = nbNA + abs(as.numeric(difftime(EndReal, End+1, units="days"))) yearLacMiss_pct = nbNA/nbDate * 100 if (nbNA > yearLac_day) { df_data_code_year$Value = NA df_data_code[OkYear,] = df_data_code_year if (!is.null(df_mod)) { df_mod = add_mod(df_mod, code, type='Missing data management', fun_name='NA assignment',
141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
comment=paste('From ', Start, ' to ', End, sep='')) } } else if (nbNA <= yearLac_day & nbNA > 1) { DateJ = as.numeric(df_data_code_year$Date) Value = df_data_code_year$Value Value = approxExtrap(x=DateJ, y=Value, xout=DateJ, method="linear", na.rm=TRUE)$y df_data_code$Value[OkYear] = Value if (!is.null(df_mod)) { df_mod = add_mod(df_mod, code, type='Missing data management', fun_name='approxExtrap', comment=paste( 'Linear extrapolation of NA from ', Start, ' to ', End, sep='')) } } } df_data[df_data$code == code,] = df_data_code } if (!is.null(df_mod)) { res = list(data=df_data, mod=df_mod) return (res) } else { return (df_data) } } ### 1.3. Sampling of the data ________________________________________ sampling_data = function (df_data, df_meta, sampleSpan=c('05-01', '11-30'), Code=NULL, df_mod=NULL) { if (is.null(Code)) { # Get all different stations code Code = levels(factor(df_meta$code)) nCode = length(Code) } else { nCode = length(Code) } # 1972 is leap year reference is case of leap year comparison sampleStart = as.Date(paste('1972', sampleSpan[1], sep='-')) sampleEnd = as.Date(paste('1972', sampleSpan[2], sep='-')) for (code in Code) { # Extracts the data corresponding to the code df_data_code = df_data[df_data$code == code,] DateMD = substr(df_data_code$Date, 6, 10) Date = paste('1972', DateMD, sep='-') df_data_code$Value[Date < sampleStart | Date > sampleEnd] = NA if (!is.null(df_mod)) { df_mod = add_mod(df_mod, code, type='Seasonal sampling ', fun_name='NA assignment', comment=paste('Before ', sampleStart, ' and after ', sampleEnd, sep='')) } # Leap year verification
211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280
# print(df_data_code[df_data_code$Date > as.Date("1992-02-25"),]) df_data[df_data$code == code,] = df_data_code } if (!is.null(df_mod)) { res = list(data=df_data, mod=df_mod) return (res) } else { return (df_data) } } ## 2. DURING TREND ANALYSE ___________________________________________ ### 2.1. Preparation _________________________________________________ # Prepares the data in order to have a list of a data tibble with # date, group and flow column and a info tibble with the station code # and group column to fit the entry of the 'extract.Var' function in # the 'StatsAnalysisTrend' package prepare = function(df_data, colnamegroup=NULL) { # Forces the column name to group to be a vector colnamegroup = c(colnamegroup) # Converts it to index of the column to group colindgroup = which(colnames(df_data) == colnamegroup) # Groups the data by those indexes df_data = group_by_at(df_data, colindgroup) # Creates a new tibble of data with a group column data = tibble(Date=df_data$Date, group=group_indices(df_data), Value=df_data$Value) # Gets the different value of the group Gkey = group_keys(df_data) # Creates a new tibble of info of the group info = bind_cols(group=seq(1:nrow(Gkey)), Gkey) # Stores data and info tibble as a list that match the entry of # the 'extract.Var' function res = list(data=data, info=info) return (res) } ### 2.2. Re-preparation ______________________________________________ # Re-prepares the data in outing of the 'extract.Var' function in # the 'StatsAnalysisTrend' package in order to fit again to the # entry of the same function reprepare = function(df_XEx, df_Xlist, colnamegroup=NULL) { # Changes the column name of the results of the # 'extract.Var' function colnames(df_XEx) = c('Date', 'group', 'Value') # Converts Date column as character df_XEx$Date = as.character(df_XEx$Date) # Takes the first date as example exDate = df_XEx$Date[1] # Finds the number of dash in the date nbt = lengths(regmatches(exDate, gregexpr('-', exDate))) # If there is only one dash if (nbt == 1) { # Converts it to date from a year and a month df_XEx$Date = paste(df_XEx$Date, '01', sep='-') # If there is no dash } else if (nbt == 0) { # Converts it to date from only a year
281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
df_XEx$Date = paste(df_XEx$Date, '01', '01', sep='-') # If there is more than 2 dashes } else if (nbt != 2) { # This is not a classical date stop('erreur of date format') } # Recreates the outing of the 'extract.Var' function nicer df_XEx = bind_cols(Date=as.Date(df_XEx$Date, format="%Y-%m-%d"), df_XEx[-1], df_Xlist$info[df_XEx$group, 2:ncol(df_Xlist$info)]) # Prepares the nicer outing df_XlistEx = prepare(df_XEx, colnamegroup=colnamegroup) return (df_XlistEx) } ### 2.3. Prepare date ________________________________________________ prepare_date = function(df_XEx, df_Xlist, per.start="01-01") { df_dateStart = summarise(group_by(df_Xlist$data, group), Date=min(Date)) # filter(group_by(df_Xlist$data, group), Date == min(Date)) df_dateStart$Date_julian = NA df_dateStart$DateHydro_julian = NA date = as.Date(df_dateStart$Date) date_per.start = as.Date(paste(substr(date, 1, 4), '-', per.start, sep='')) date[date < date_per.start] = date_per.start[date < date_per.start] df_dateStart$Date = date origin = as.Date(paste(format(df_dateStart$Date, "%Y"), '-', per.start, sep='')) originHydro = as.Date(paste(format(df_dateStart$Date, "%Y"), '-01-01', sep='')) for (i in 1:nrow(df_dateStart)) { dateJultmp = julian(date[i], origin=origin[i]) df_dateStart$Date_julian[i] = dateJultmp dateJulHydrotmp = julian(date[i], origin=originHydro[i]) df_dateStart$DateHydro_julian[i] = dateJulHydrotmp } df_dateStart$Year = format(df_dateStart$Date, "%Y") for (group in df_dateStart$group) { Ok_dateStart = df_dateStart$group == group Shift = df_dateStart$Date_julian[Ok_dateStart] year = df_dateStart$Year[Ok_dateStart] OkXEx_code_year = df_XEx$group1 == group & df_XEx$datetime == year df_XEx$values[OkXEx_code_year] = df_XEx$values[OkXEx_code_year] + Shift OkXEx_code = df_XEx$group1 == group ShiftHydro = df_dateStart$DateHydro_julian[Ok_dateStart] df_XEx$values[OkXEx_code] = df_XEx$values[OkXEx_code] + ShiftHydro ## Add 365 when the point is too remote # XEx_code = df_XEx$values[OkXEx_code] # meanXEx_code = mean(XEx_code, na.rm=TRUE) # dXEx_code = meanXEx_code - XEx_code # stdXEx_code = sd(XEx_code, na.rm=TRUE) # OkOverStd = dXEx_code >= stdXEx_code*3 # OkOverStd[is.na(OkOverStd)] = FALSE
351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420
# XEx_code[OkOverStd] = XEx_code[OkOverStd] + 365 # df_XEx$values[OkXEx_code] = XEx_code # print(group) # print(df_XEx$datetime[df_XEx$group1 == group][dXEx_code >= stdXEx_code*3]) } df_XEx$datetime = as.double(df_XEx$datetime) return (df_XEx) } ## 3. AFTER TREND ANALYSE ____________________________________________ ### 3.1. Period of trend _____________________________________________ # Compute the start and the end of the period for a trend analysis # according to the accessible data get_period = function (per, df_Xtrend, df_XEx, df_Xlist) { # Converts results of trend to tibble df_Xtrend = tibble(df_Xtrend) # Fix the period start and end of the accessible period to a # default date df_Xtrend$period_start = as.Date("1970-01-01") df_Xtrend$period_end = as.Date("1970-01-01") # Changes the format of the date variable to date df_Xlisttmp = reprepare(df_XEx, df_Xlist, colnamegroup=c('code')) df_XExtmp = df_Xlisttmp$data # For all the different group for (g in df_Xlisttmp$info$group) { # Gets the analyse data associated to the group df_XExtmp_code = df_XExtmp[df_XExtmp$group == g,] # Gets the id in the trend result associated to the group id = which(df_Xtrend$group1 == g) # Computes index of the nearest accessible start and end date OkStart = df_XExtmp_code$Date >= as.Date(per[1]) OkEnd = df_XExtmp_code$Date <= as.Date(per[2]) # DateStart = df_XExtmp_code$Date[OkStart] # DateEnd = df_XExtmp_code$Date[OkEnd] DateStart = df_XExtmp_code$Date DateEnd = df_XExtmp_code$Date iStart = which.min(abs(DateStart - as.Date(per[1]))) iEnd = which.min(abs(DateEnd - as.Date(per[2]))) # print(nrow(df_XEx)) # print(as.Date(DateStart[iStart])) # print(head(df_XEx)) # print(tail(df_XEx)) # print(as.Date(DateEnd[iEnd])) # Stores the start and end of the trend analysis df_Xtrend$period_start[id] = as.Date(DateStart[iStart]) df_Xtrend$period_end[id] = as.Date(DateEnd[iEnd]) } return (df_Xtrend) } ### 3.2. Cleaning ____________________________________________________ # Cleans the trend results of the function 'Estimate.stats' in the # 'StatsAnalysisTrend' package. It adds the station code and the # intercept of the trend to the trend results. Also makes the data # more presentable. clean = function (df_Xtrend, df_XEx, df_Xlist) {
421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465
# Reprepares the list of data and info in order to be presentable df_Xlist = reprepare(df_XEx, df_Xlist, colnamegroup=c('code')) # Adds a column of station code df_Xlist$data$code = NA # For all the group for (g in df_Xlist$info$group) { # Adds the station code corresponding to each group info df_Xlist$data$code[which(df_Xlist$data$group == g)] = df_Xlist$info$code[df_Xlist$info$group == g] } # Adds the info to trend tibble df_Xtrend = bind_cols(df_Xtrend, df_Xlist$info[df_Xtrend$group1, 2:ncol(df_Xlist$info)]) # Renames the column of group of trend results colnames(df_Xtrend)[1] = 'group' # Adds the intercept value of trend df_Xtrend = get_intercept(df_Xtrend, df_Xlist, unit2day=365.25) # Changes the position of the intercept column df_Xtrend = relocate(df_Xtrend, intercept, .after=trend) # Creates a list of results to return res = list(trend=df_Xtrend, data=df_Xlist$data, info=df_Xlist$info) return (res) } ## 4. OTHER __________________________________________________________ add_mod = function (df_mod, Code, type, fun_name, comment, df_meta=NULL) { if (Code == 'all' & is.null(df_meta)) { Code = NA # erreur } else if (Code == 'all' & !is.null(df_meta)) { # Get all different stations code Code = levels(factor(df_meta$code)) } for (code in Code) { df_modtmp = tibble(code=code, type=type, fun_name=fun_name, comment=comment) df_mod = bind_rows(df_mod, df_modtmp) } return (df_mod) }