analyse.R

# \\\
# Copyright 2021-2022 Louis Héraut*1
#
# *1   INRAE, France
#      louis.heraut@inrae.fr
#
# This file is part of ash R toolbox.
#
# ash R toolbox is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# ash R toolbox is distributed in the hope that it will be useful, but 
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ash R toolbox.  If not, see <https://www.gnu.org/licenses/>.
# ///
#
#
# processing/analyse.R
#
# File that realise all the possible analysis of data.
# This file regroup mainly the functions use to compute the trend
# analysis of hydrologic variables thanks to the Mann-Kendall Test.
# Functions needed for break or gap analysis are also present.


# Usefull library
library(dplyr)
library(zoo)
library(StatsAnalysisTrend)
library(lubridate)
library(trend)

# Sourcing R file
source('processing/format.R', encoding='latin1')


## 1. TREND ANALYSIS
### 1.0. Intercept of trend
# Compute intercept values of linear trends with first order values
# of trends and the data on which analysis is performed.
get_intercept = function (df_Xtrend, df_Xlist, unit2day=365.25) {

    # Create a column in trend full of NA
    df_Xtrend$intercept = NA

    # For all different group
    for (g in df_Xlist$info$group) {
        # Get the data and trend value linked to this group
        df_data_code = df_Xlist$data[df_Xlist$data$group == g,]
        df_Xtrend_code = df_Xtrend[df_Xtrend$group == g,]

        # Get the time start and end of the different periods
        Start = df_Xtrend_code$period_start
        End = df_Xtrend_code$period_end
        # Extract only the unrepeated dates
        UStart = levels(factor(Start))
        UEnd = levels(factor(End))
        # Get the number of different periods of trend analysis
        nPeriod = max(length(UStart), length(UEnd))

        # For each of these perdiods
        for (i in 1:nPeriod) {
            # Get data and trend associated to the period
            df_data_code_per = 
                df_data_code[df_data_code$Date >= Start[i] 
                             & df_data_code$Date <= End[i],]
            df_Xtrend_code_per = 
                df_Xtrend_code[df_Xtrend_code$period_start == Start[i] 
                              & df_Xtrend_code$period_end == End[i],]

            # Get the group associated to this period
            id = which(df_Xtrend$group == g 
                       & df_Xtrend$period_start == Start[i] 
                       & df_Xtrend$period_end == End[i])

            # Compute mean of flow and time period
            mu_X = mean(df_data_code_per$Value, na.rm=TRUE)
            mu_t = as.numeric(mean(c(Start[i],
                                     End[i]),
                                   na.rm=TRUE)) / unit2day

            # Get the intercept of the trend
            b = mu_X - mu_t * df_Xtrend_code_per$trend
            # And store it
            df_Xtrend$intercept[id] = b
        } 
    }
    return (df_Xtrend)
}

### 1.1. QA
# Realise the trend analysis of the average annual flow (QA)
# hydrological variable
get_QAtrend = function (df_data, period, p_thresold) {

    # Make sure to convert the period to a list
    period = as.list(period)

    # Set the max interval period as the minimal possible
    Imax = 0
    # Blank tibble for data to return
    df_QAtrendB = tibble()

    # For all periods
    for (per in period) {
        # Prepare the data to fit the entry of extract.Var
        df_QAlist = prepare(df_data, colnamegroup=c('code'))

        # Compute the yearly mean over the data
        df_QAEx = extract.Var(data.station=df_QAlist,
                              funct=mean,
                              timestep='year',
                              period=per,
                              pos.datetime=1,
                              na.rm=TRUE)
        # Compute the trend analysis
        df_QAtrend = Estimate.stats(data.extract=df_QAEx,
                                      level=p_thresold)

        # Get the associated time interval
        I = interval(per[1], per[2])
        # If it is the largest interval
        if (I > Imax) {
            # Store it and the associated data and info
            Imax = I
            df_QAlistB = df_QAlist
            df_QAExB = df_QAEx
        }

        # Specify the period of analyse
        df_QAtrend = get_period(per, df_QAtrend, df_QAEx, df_QAlist)
        # Store the trend
        df_QAtrendB = bind_rows(df_QAtrendB, df_QAtrend)   
    } 
    # Clean results of trend analyse
    res_QAtrend = clean(df_QAtrendB, df_QAExB, df_QAlistB)
    return (res_QAtrend)
}

### 1.2. QMNA
# Realise the trend analysis of the monthly minimum flow in the
# year (QMNA) hydrological variable
get_QMNAtrend = function (df_data, period, p_thresold) {

    # Make sure to convert the period to a list
    period = as.list(period)
    
    # Set the max interval period as the minimal possible
    Imax = 0
    # Blank tibble for data to return
    df_QMNAtrendB = tibble()

    # For all periods
    for (per in period) {
        # Prepare the data to fit the entry of extract.Var
        df_QMNAlist = prepare(df_data, colnamegroup=c('code'))
        # Compute the montly mean over the data
        df_QMNAEx = extract.Var(data.station=df_QMNAlist,
                                funct=mean,
                                period=per,
                                timestep='year-month',
                                per.start="01",
                                pos.datetime=1,
                                na.rm=TRUE)
        # Rerepare the data to fit the entry of extract.Var
        df_QMNAlist = reprepare(df_QMNAEx,
                                df_QMNAlist,
                                colnamegroup=c('code'))
        # Compute the yearly min over the data
        df_QMNAEx = extract.Var(data.station=df_QMNAlist,
                                funct=min,
                                period=per,
                                timestep='year',
                                pos.datetime=1,
                                na.rm=TRUE)
        # Compute the trend analysis        
        df_QMNAtrend = Estimate.stats(data.extract=df_QMNAEx,
                                      level=p_thresold)

        # Get the associated time interval
        I = interval(per[1], per[2])
        # If it is the largest interval
        if (I > Imax) {
            # Store it and the associated data and info
            Imax = I
            df_QMNAlistB = df_QMNAlist
            df_QMNAExB = df_QMNAEx
        }
        
        # Specify the period of analyse
        df_QMNAtrend = get_period(per, df_QMNAtrend,
                                  df_QMNAEx,
                                  df_QMNAlist)
        # Store the trend
        df_QMNAtrendB = bind_rows(df_QMNAtrendB, df_QMNAtrend)
    }
    # Clean results of trend analyse
    res_QMNAtrend = clean(df_QMNAtrendB, df_QMNAExB, df_QMNAlistB)
    return (res_QMNAtrend)
}

### 1.3. VCN10
# Realises the trend analysis of the minimum 10 day average flow
# over the year (VCN10) hydrological variable
get_VCN10trend = function (df_data, df_meta, period, p_thresold) {

    # Get all different stations code
    Code = levels(factor(df_meta$code))
    # Blank tibble to store the data averaged
    df_data_roll = tibble()

    # For all the code
    for (code in Code) {
        # Get the data associated to the code
        df_data_code = df_data[df_data$code == code,]
        # Perform the roll mean of the flow over 10 days
        df_data_roll_code = tibble(Date=df_data_code$Date,
                                   Value=rollmean(df_data_code$Value, 
                                                  10,
                                                  fill=NA),
                                   code=code)
        # Store the results
        df_data_roll = bind_rows(df_data_roll, df_data_roll_code)
    }

    # Make sure to convert the period to a list
    period = as.list(period)
    # Set the max interval period as the minimal possible
    Imax = 0
    # Blank tibble for data to return
    df_VCN10trendB = tibble()

    # For all periods
    for (per in period) {
        # Prepare the data to fit the entry of extract.Var
        df_VCN10list = prepare(df_data_roll, colnamegroup=c('code'))
        # Compute the yearly min over the averaged data
        df_VCN10Ex = extract.Var(data.station=df_VCN10list,
                                 funct=min,
                                 period=per,
                                 timestep='year',
                                 pos.datetime=1,
                                 na.rm=TRUE)
        # Compute the trend analysis
        df_VCN10trend = Estimate.stats(data.extract=df_VCN10Ex,
                                      level=p_thresold)

        # Get the associated time interval
        I = interval(per[1], per[2])
        # If it is the largest interval       
        if (I > Imax) {
            # Store it and the associated data and info           
            Imax = I
            df_VCN10listB = df_VCN10list
            df_VCN10ExB = df_VCN10Ex
        }

        # Specify the period of analyse
        df_VCN10trend = get_period(per, df_VCN10trend, df_VCN10Ex,
                                   df_VCN10list)
        # Store the trend
        df_VCN10trendB = bind_rows(df_VCN10trendB, df_VCN10trend)
    }
    # Clean results of trend analyse
    res_VCN10trend = clean(df_VCN10trendB, df_VCN10ExB, df_VCN10listB)
    return (res_VCN10trend)
}

### 1.4. tINI date

which_underfirst = function (L, UpLim) {
    id = which(L <= UpLim)[1]
    return (id)
}

get_tINItrend = function (df_data, df_meta, period, p_thresold) {

    # Get all different stations code
    Code = levels(factor(df_meta$code))
    # Gets the number of station
    nCode = length(Code)
    
    # Blank tibble to store the data averaged
    df_data_roll = tibble() 

    # For all the code
    for (code in Code) {
        # Get the data associated to the code
        df_data_code = df_data[df_data$code == code,]
        # Perform the roll mean of the flow over 10 days
        df_data_roll_code = tibble(Date=df_data_code$Date,
                                   Value=rollmean(df_data_code$Value, 
                                                  10,
                                                  fill=NA),
                                   code=code)
        # Store the results
        df_data_roll = bind_rows(df_data_roll, df_data_roll_code)
    }

    # Make sure to convert the period to a list
    period = as.list(period)
    # Set the max interval period as the minimal possible
    Imax = 0
    # Blank tibble for data to return
    df_tINItrendB = tibble()

    # For all periods
    for (per in period) {

        df_tINIEx = tibble()
        df_tINIlist = list(data=tibble(), info=tibble())
        
        # For all the code
        for (k in 1:nCode) {
            # Gets the code
            code = Code[k]
            # print(code)
            
            per.start = df_meta$start_year[df_meta$code == code]
            per.start = paste(sprintf("%02d", per.start), '-01', sep='')
            
            # Get the data associated to the code
            df_data_roll_code = df_data_roll[df_data_roll$code == code,]

            # print('aa')
            
            # Get the data associated to the code
            df_data_code = df_data[df_data$code == code,]

            # print('bb')
            
            # Prepare the data to fit the entry of extract.Var
            df_QNAlist_code = prepare(df_data_code,
                                      colnamegroup=c('code'))

            # print('cc')

            
            # Compute the yearly mean over the data
            df_QNAEx_code = extract.Var(data.station=df_QNAlist_code,
                                        funct=min,
                                        timestep='year',
                                        period=per,
                                        per.start=per.start,
                                        pos.datetime=1,
                                        na.rm=TRUE)

            # print('dd')
            
            QNAmax = max(df_QNAEx_code$values, na.rm=TRUE)

            # print('ee')
            
            # Prepare the data to fit the entry of extract.Var
            df_tINIlist_code = prepare(df_data_roll_code,
                                       colnamegroup=c('code'))

            # print('ff')
            
            # Compute the yearly min over the averaged data
            df_tINIEx_code = extract.Var(data.station=df_tINIlist_code,
                                         funct=which_underfirst,
                                         period=per,
                                         per.start=per.start,
                                         timestep='year',
                                         pos.datetime=1,
                                         UpLim=QNAmax)

            # print('gg')
            # print(df_tINIEx_code)

            
            df_tINIEx_code$group1 = k
            df_tINIlist_code$data$group = k
            df_tINIlist_code$info$group = k

            
            # Converts index of the tINI to the julian date associated
            df_tINIEx_code = prepare_date(df_tINIEx_code,
                                          df_tINIlist_code,
                                          per.start=per.start)

            # print('hh')
            # print(df_tINIEx_code)
            
            # Store the results
            df_tINIEx = bind_rows(df_tINIEx, df_tINIEx_code)
            
            df_tINIlist$data = bind_rows(df_tINIlist$data,
                                         df_tINIlist_code$data)
            
            df_tINIlist$info = bind_rows(df_tINIlist$info,
                                         df_tINIlist_code$info)

            # print('ii')
            
        }

        # print('11')
        
        # Compute the trend analysis
        df_tINItrend = Estimate.stats(data.extract=df_tINIEx,
                                      level=p_thresold)

        # Get the associated time interval
        I = interval(per[1], per[2])
        # If it is the largest interval       
        if (I > Imax) {
            # Store it and the associated data and info           
            Imax = I
            df_tINIlistB = df_tINIlist
            df_tINIExB = df_tINIEx
        }

        # print(per.start)
        
        # Specify the period of analyse
        df_tINItrend = get_period(per, df_tINItrend, df_tINIEx,
                                   df_tINIlist)
        # Store the trend
        df_tINItrendB = bind_rows(df_tINItrendB, df_tINItrend)
    }
    # Clean results of trend analyse
    res_tINItrend = clean(df_tINItrendB, df_tINIExB, df_tINIlistB)
    return (res_tINItrend)
}

### 1.5. tMID date
# Realises the trend analysis of the date of the minimum 10 day
# average flow over the year (VCN10) hydrological variable
get_tMIDtrend = function (df_data, df_meta, period, p_thresold) {

    # Get all different stations code
    Code = levels(factor(df_meta$code))
    # Blank tibble to store the data averaged
    df_data_roll = tibble() 

    # For all the code
    for (code in Code) {
        # Get the data associated to the code
        df_data_code = df_data[df_data$code == code,]
        # Perform the roll mean of the flow over 10 days
        df_data_roll_code = tibble(Date=df_data_code$Date,
                                   Value=rollmean(df_data_code$Value, 
                                                  10,
                                                  fill=NA),
                                   code=code)
        # Store the results
        df_data_roll = bind_rows(df_data_roll, df_data_roll_code)
    }

    # Make sure to convert the period to a list
    period = as.list(period)
    # Set the max interval period as the minimal possible
    Imax = 0
    # Blank tibble for data to return
    df_tMIDtrendB = tibble()

    # For all periods
    for (per in period) {
        # Prepare the data to fit the entry of extract.Var
        df_tMIDlist = prepare(df_data_roll, colnamegroup=c('code'))
        # Compute the yearly min over the averaged data
        df_tMIDEx = extract.Var(data.station=df_tMIDlist,
                                 funct=which.min,
                                 period=per,
                                 timestep='year',
                                 pos.datetime=1)

        # Converts index of the tMID to the julian date associated
        df_tMIDEx = prepare_date(df_tMIDEx, df_tMIDlist)
        
        # Compute the trend analysis
        df_tMIDtrend = Estimate.stats(data.extract=df_tMIDEx,
                                      level=p_thresold)

        # Get the associated time interval
        I = interval(per[1], per[2])
        # If it is the largest interval       
        if (I > Imax) {
            # Store it and the associated data and info           
            Imax = I
            df_tMIDlistB = df_tMIDlist
            df_tMIDExB = df_tMIDEx
        }

        # Specify the period of analyse
        df_tMIDtrend = get_period(per, df_tMIDtrend, df_tMIDEx,
                                   df_tMIDlist)
        # Store the trend
        df_tMIDtrendB = bind_rows(df_tMIDtrendB, df_tMIDtrend)
    }
    # Clean results of trend analyse
    res_tMIDtrend = clean(df_tMIDtrendB, df_tMIDExB, df_tMIDlistB)
    return (res_tMIDtrend)
}


## 2. OTHER ANALYSES
### 2.1. Hydrograph
xref = matrix(
    c(0.033, 0.025, 0.033, 0.075, 0.167, 0.217, 0.142,
      0.092, 0.067, 0.058, 0.050, 0.042,
      0.050, 0.050, 0.058, 0.083, 0.150, 0.167, 0.117,
      0.083, 0.058, 0.058, 0.067, 0.058,
      0.058, 0.050, 0.100, 0.142, 0.158, 0.092, 0.067,
      0.050, 0.042, 0.058, 0.083, 0.100,
      0.112, 0.098, 0.101, 0.125, 0.122, 0.072, 0.036,
      0.024, 0.039, 0.067, 0.102, 0.102,
      0.139, 0.092, 0.082, 0.099, 0.087, 0.039, 0.015,
      0.012, 0.036, 0.108, 0.159, 0.131,
      0.136, 0.105, 0.091, 0.057, 0.042, 0.034, 0.036,
      0.052, 0.068, 0.111, 0.132, 0.136,
      0.156, 0.154, 0.117, 0.119, 0.086, 0.044, 0.025,
      0.015, 0.025, 0.044, 0.089, 0.127,
      0.204, 0.163, 0.118, 0.102, 0.060, 0.030, 0.018,
      0.012, 0.023, 0.041, 0.087, 0.143,
      0.157, 0.130, 0.119, 0.094, 0.062, 0.042, 0.028,
      0.021, 0.035, 0.062, 0.099, 0.150,
      0.128, 0.142, 0.122, 0.128, 0.105, 0.065, 0.035,
      0.024, 0.031, 0.044, 0.074, 0.101,
      0.133, 0.126, 0.111, 0.110, 0.081, 0.056, 0.038,
      0.027, 0.042, 0.063, 0.098, 0.117,
      0.099, 0.100, 0.101, 0.099, 0.088, 0.078, 0.072,
      0.064, 0.064, 0.069, 0.076, 0.089),
    ncol=12, byrow=TRUE)

colnames(xref) = seq(1, 12, 1)
row.names(xref) = c('GROUP11', 'GROUP10', 'GROUP9', 'GROUP8',
                   'GROUP7', 'GROUUK', 'GROUP6', 'GROUP5',
                   'GROUP4', 'GROUP3', 'GROUP2', 'GROUP1')    

# Computes the hydrograph of a station
get_hydrograph = function (df_data, period=NULL, df_meta=NULL) {

    # If there is a specified period
    if (!is.null(period)) {
        # Extracts only the data of this period
        df_data = df_data[df_data$Date >= as.Date(period[1])
                          & df_data$Date <= as.Date(period[2]),]
    }
    
    # If there is the metadata
    if (!is.null(df_meta)) {
        # New column in metadata for hydrological regime
        df_meta$regime_hydro = NA
        # New column in metadata for the start of the hydrological year
        df_meta$start_year = NA

        # Get all different stations code
        Code = levels(factor(df_meta$code))
        # Number of stations
        nCode = length(Code)
        
    # Otherwise it is just a list of flow from one station
    } else {
        # Only one code is present
        nCode = 1
    }

    # Blank tibble to store data
    df_QM = tibble()
    # For all accessible code
    for (k in 1:nCode) {
        # If there is the metadata
        if (!is.null(df_meta)) {
            # Gets the code
            code = Code[k]
            # Get the associated data
            df_data_code = df_data[df_data$code == code,]
        } else {
            # The data are the date for the current code
            df_data_code = df_data
        }
        
        # Gets a list of the month of the data as numeric
        monthData = as.numeric(format(df_data_code$Date, "%m"))
        # Blank list to stock month mean
        QM_code = c()
        # For all months
        for (i in 1:12) {
            # Gets all the flow data associated to the current month
            data = df_data_code$Value[monthData == i]
            # Averages the data
            QM_code[i] = mean(data, na.rm=TRUE)
        }

        regime = 0
        classRegime = ""
        distance = rep(0, length(xref[,1]))
        distancemin = 0
        for (j in 1:length(xref[,1])) {
            distance[j] = sum((QM_code / mean(QM_code) - xref[j,])^2)
        }
        regime = 1 + length(xref[,1]) - which.min(distance)
        distancemin = distance[which.min(distance)]
        
        if (regime < 7) {
            classRegime = "Pluvial"
        } else if (regime >= 7 & regime < 10) {
            classRegime = "Transition"
        } else if (regime >= 10) {
            classRegime = "Nival glaciaire"
        }
        
        # If there is the metadata
        if (!is.null(df_meta)) {
            # Creates a temporary tibble to store hydrograph results
            df_QMtmp = tibble(QM=QM_code, code=code)
            # Stores it
            df_QM = bind_rows(df_QM, df_QMtmp)
            # Stores result of the hydrological regime
            df_meta$regime_hydro[df_meta$code == code] = classRegime
            # Computes the month of the max QM
            maxMonth = which.max(QM_code)
            
            # Stores it as the start of the hydrological year
            df_meta$start_year[df_meta$code == code] = maxMonth
        # Otherwise
        } else {
            # No tibble needed
            df_QM = QM_code
            df_meta = classRegime
        }
    }
    # Returns the hydrograph and meta data
    return (list(QM=df_QM, meta=df_meta))
}
    
### 2.2. Break date
# Compute the break date of the flow data by station 
get_break = function (df_data, df_meta, p_thresold=0.05) {
    
    # Get all different stations code
    Code = levels(factor(df_meta$code))
    # Number of stations
    nCode = length(Code)

    # Blank date break list and associated station code vector
    date_break = list()
    Code_break = c()

    # For all accessible code
    for (code in Code) {
        # Get the associated data
        df_data_code = df_data[df_data$code == code,] 
        # Remove NA data
        df_data_codeNoNA = df_data_code[!is.na(df_data_code$Value),]

        # Perform the break analysis thanks to the Pettitt test
        res_break = pettitt.test(df_data_codeNoNA$Value)

        # Extract p value
        p_value = res_break$p
        # The length of the data analysed
        nbreak = res_break$nobs
        # Index of the break date
        ibreak = res_break$estimate

        # If the p value results is under the thresold
        if (p_value <= p_thresold) {
            # Get the mean of the index break if there is several
            ibreak = round(mean(ibreak), 0)
            # Store the date break with its associated code
            date_break = append(date_break, 
                                df_data_codeNoNA$Date[ibreak])
            Code_break = append(Code_break, code)
        }
        # step1 = mean(df_data_codeNoNA$Value[1:ibreak])
        # step2 = mean(df_data_codeNoNA$Value[(ibreak+1):nbreak])
    }
    # Create a tibble with the break analysis results
    df_break = tibble(code=Code_break, Date=as.Date(date_break))
    return (df_break)
}

### 2.3. Time gap
# Compute the time gap by station
get_lacune = function (df_data, df_meta) {
    
    # Get all different stations code
    Code = levels(factor(df_meta$code))
    
    # Create new vector to stock results for cumulative and mean
    # time gap by station
    tLac = c()
    meanLac = c()

    # Get rows where there is no NA
    NoNA = complete.cases(df_data)
    # Get data where there is no NA
    df_data_NoNA = df_data[NoNA,]

    # For every station
    for (code in Code) {   
        # Get only the data rows for the selected station
        df_data_code = df_data[df_data$code==code,]
        # Get date for the selected station
        Date = df_data_code$Date
        # Get time span for the selection station
        span = as.numeric(Date[length(Date)] - Date[1])
        
        # Get only the data rows with no NA for the selected station
        df_data_NoNA_code = df_data_NoNA[df_data_NoNA$code==code,]
        # Get date for the selected station
        Date_NoNA = df_data_NoNA_code$Date
        
        # Compute the time gap
        lac = as.numeric(diff(Date_NoNA) - 1)

        # Compute the cumulative gap
        lac_sum = sum(lac)
        # Store the cumulative gap rate
        tLac = c(tLac, lac_sum/span)

        # Compute the mean gap
        lac_mean = mean(lac[lac != 0])
        # Store the mean gap
        meanLac = c(meanLac, lac_mean) 
    }
    
    # Compute the cumulative gap rate in pourcent
    tLac100 = tLac * 100
    # Create tibble for lacune
    df_lac = tibble(code=Code, tLac100=tLac100, meanLac=meanLac)
    # Join a tibble
    df_meta = full_join(df_meta, df_lac)
    return (df_meta)
}