analyse.R

# \\\
# Copyright 2021-2022 Louis Héraut*1
#
# *1   INRAE, France
#      louis.heraut@inrae.fr
#
# This file is part of ash R toolbox.
#
# ash R toolbox is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# ash R toolbox is distributed in the hope that it will be useful, but 
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ash R toolbox.  If not, see <https://www.gnu.org/licenses/>.
# ///
#
#
# processing/analyse.R
#
# File that realise all the possible analysis of data.
# This file regroup mainly the functions use to compute the trend
# analysis of hydrologic variables thanks to the Mann-Kendall Test.
# Functions needed for break or gap analysis are also present.


# Usefull library
library(dplyr)
library(zoo)
library(StatsAnalysisTrend)
library(lubridate)
library(trend)

# Sourcing R file
source('processing/format.R', encoding='latin1')


## 1. TREND ANALYSIS
### 1.0. Intercept of trend
# Compute intercept values of linear trends with first order values
# of trends and the data on which analysis is performed.
get_intercept = function (df_Xtrend, df_Xlist, unit2day=365.25) {

    # Create a column in trend full of NA
    df_Xtrend$intercept = NA

    # For all different group
    for (g in df_Xlist$info$group) {
        # Get the data and trend value linked to this group
        df_data_code = df_Xlist$data[df_Xlist$data$group == g,]
        df_Xtrend_code = df_Xtrend[df_Xtrend$group == g,]

        # Get the time start and end of the different periods
        Start = df_Xtrend_code$period_start
        End = df_Xtrend_code$period_end
        # Extract only the unrepeated dates
        UStart = levels(factor(Start))
        UEnd = levels(factor(End))
        # Get the number of different periods of trend analysis
        nPeriod = max(length(UStart), length(UEnd))

        # For each of these perdiods
        for (i in 1:nPeriod) {
            # Get data and trend associated to the period
            df_data_code_per = 
                df_data_code[df_data_code$Date >= Start[i] 
                             & df_data_code$Date <= End[i],]
            df_Xtrend_code_per = 
                df_Xtrend_code[df_Xtrend_code$period_start == Start[i] 
                              & df_Xtrend_code$period_end == End[i],]

            # Get the group associated to this period
            id = which(df_Xtrend$group == g 
                       & df_Xtrend$period_start == Start[i] 
                       & df_Xtrend$period_end == End[i])

            # Compute mean of flow and time period
            mu_X = mean(df_data_code_per$Qm3s, na.rm=TRUE)
            mu_t = as.numeric(mean(c(Start[i],
                                     End[i]),
                                   na.rm=TRUE)) / unit2day

            # Get the intercept of the trend
            b = mu_X - mu_t * df_Xtrend_code_per$trend
            # And store it
            df_Xtrend$intercept[id] = b
        } 
    }
    return (df_Xtrend)
}

### 1.1. QA
# Realise the trend analysis of the average annual flow (QA)
# hydrological variable
get_QAtrend = function (df_data, period, p_thresold) {

    # Make sure to convert the period to a list
    period = as.list(period)

    # Set the max interval period as the minimal possible
    Imax = 0
    # Blank tibble for data to return
    df_QAtrendB = tibble()

    # For all periods
    for (per in period) {
        # Prepare the data to fit the entry of extract.Var
        df_QAlist = prepare(df_data, colnamegroup=c('code'))

        # Compute the yearly mean over the data
        df_QAEx = extract.Var(data.station=df_QAlist,
                              funct=mean,
                              timestep='year',
                              period=per,
                              pos.datetime=1,
                              na.rm=TRUE)
        # Compute the trend analysis
        df_QAtrend = Estimate.stats(data.extract=df_QAEx,
                                      level=p_thresold)

        # Get the associated time interval
        I = interval(per[1], per[2])
        # If it is the largest interval
        if (I > Imax) {
            # Store it and the associated data and info
            Imax = I
            df_QAlistB = df_QAlist
            df_QAExB = df_QAEx
        }

        # Specify the period of analyse
        df_QAtrend = get_period(per, df_QAtrend, df_QAEx, df_QAlist)
        # Store the trend
        df_QAtrendB = bind_rows(df_QAtrendB, df_QAtrend)   
    } 
    # Clean results of trend analyse
    res_QAtrend = clean(df_QAtrendB, df_QAExB, df_QAlistB)
    return (res_QAtrend)
}

### 1.2. QMNA
# Realise the trend analysis of the monthly minimum flow in the
# year (QMNA) hydrological variable
get_QMNAtrend = function (df_data, period, p_thresold) {

    # Make sure to convert the period to a list
    period = as.list(period)
    
    # Set the max interval period as the minimal possible
    Imax = 0
    # Blank tibble for data to return
    df_QMNAtrendB = tibble()

    # For all periods
    for (per in period) {
        # Prepare the data to fit the entry of extract.Var
        df_QMNAlist = prepare(df_data, colnamegroup=c('code'))
        # Compute the montly mean over the data
        df_QMNAEx = extract.Var(data.station=df_QMNAlist,
                                funct=mean,
                                period=per,
                                timestep='year-month',
                                per.start="01",
                                pos.datetime=1,
                                na.rm=TRUE)
        # Rerepare the data to fit the entry of extract.Var
        df_QMNAlist = reprepare(df_QMNAEx,
                                df_QMNAlist,
                                colnamegroup=c('code'))
        # Compute the yearly min over the data
        df_QMNAEx = extract.Var(data.station=df_QMNAlist,
                                funct=min,
                                period=per,
                                timestep='year',
                                pos.datetime=1,
                                na.rm=TRUE)
        # Compute the trend analysis        
        df_QMNAtrend = Estimate.stats(data.extract=df_QMNAEx,
                                      level=p_thresold)

        # Get the associated time interval
        I = interval(per[1], per[2])
        # If it is the largest interval
        if (I > Imax) {
            # Store it and the associated data and info
            Imax = I
            df_QMNAlistB = df_QMNAlist
            df_QMNAExB = df_QMNAEx
        }
        
        # Specify the period of analyse
        df_QMNAtrend = get_period(per, df_QMNAtrend,
                                  df_QMNAEx,
                                  df_QMNAlist)
        # Store the trend
        df_QMNAtrendB = bind_rows(df_QMNAtrendB, df_QMNAtrend)
    }
    # Clean results of trend analyse
    res_QMNAtrend = clean(df_QMNAtrendB, df_QMNAExB, df_QMNAlistB)
    return (res_QMNAtrend)
}

### 1.3. VCN10
# Realise the trend analysis of the minimum 10 day average flow over the year (VCN10) hydrological variable
get_VCN10trend = function (df_data, df_meta, period, p_thresold) {

    # Get all different stations code
    Code = levels(factor(df_meta$code))
    # Blank tibble to store the data averaged
    df_data_roll = tibble()

    # For all the code
    for (c in Code) {
        # Get the data associated to the code
        df_data_code = df_data[df_data$code == c,]
        # Perform the roll mean of the flow over 10 days
        df_data_code = tibble(Date=rollmean(df_data_code$Date,
                                            10,
                                            fill=NA),
                              Qm3s=rollmean(df_data_code$Qm3s, 
                                            10,
                                            fill=NA),
                              code=c)
        # Store the results
        df_data_roll = bind_rows(df_data_roll, df_data_code)
    }

    # Make sure to convert the period to a list
    period = as.list(period)
    # Set the max interval period as the minimal possible
    Imax = 0
    # Blank tibble for data to return
    df_VCN10trendB = tibble()

    # For all periods
    for (per in period) {
        # Prepare the data to fit the entry of extract.Var
        df_VCN10list = prepare(df_data_roll, colnamegroup=c('code'))
        # Compute the yearly min over the averaged data
        df_VCN10Ex = extract.Var(data.station=df_VCN10list,
                                 funct=min,
                                 period=per,
                                 timestep='year',
                                 pos.datetime=1,
                                 na.rm=TRUE)
        # Compute the trend analysis
        df_VCN10trend = Estimate.stats(data.extract=df_VCN10Ex,
                                      level=p_thresold)

        # Get the associated time interval
        I = interval(per[1], per[2])
        # If it is the largest interval       
        if (I > Imax) {
            # Store it and the associated data and info           
            Imax = I
            df_VCN10listB = df_VCN10list
            df_VCN10ExB = df_VCN10Ex
        }

        # Specify the period of analyse
        df_VCN10trend = get_period(per, df_VCN10trend, df_VCN10Ex,
                                   df_VCN10list)
        # Store the trend
        df_VCN10trendB = bind_rows(df_VCN10trendB, df_VCN10trend)
    }
    # Clean results of trend analyse
    res_VCN10trend = clean(df_VCN10trendB, df_VCN10ExB, df_VCN10listB)
    return (res_VCN10trend)
}


## 2. OTHER ANALYSES
### 2.1. Break date
# Compute the break date of the flow data by station 
get_break = function (df_data, df_meta, p_thresold=0.05) {
    
    # Get all different stations code
    Code = levels(factor(df_meta$code))
    # Number of stations
    nCode = length(Code)

    # Blank date break list and associated station code vector
    date_break = list()
    Code_break = c()

    # For all accessible code
    for (code in Code) {
        # Get the associated data
        df_data_code = df_data[df_data$code == code,] 
        # Remove NA data
        df_data_codeNoNA = df_data_code[!is.na(df_data_code$Qm3s),]

        # Perform the break analysis thanks to the Pettitt test
        res_break = pettitt.test(df_data_codeNoNA$Qm3s)

        # Extract p value
        p_value = res_break$p
        # The length of the data analysed
        nbreak = res_break$nobs
        # Index of the break date
        ibreak = res_break$estimate

        # If the p value results is under the thresold
        if (p_value <= p_thresold) {
            # Get the mean of the index break if there is several
            ibreak = round(mean(ibreak), 0)
            # Store the date break with its associated code
            date_break = append(date_break, 
                                df_data_codeNoNA$Date[ibreak])
            Code_break = append(Code_break, code)
        }
        # step1 = mean(df_data_codeNoNA$Qm3s[1:ibreak])
        # step2 = mean(df_data_codeNoNA$Qm3s[(ibreak+1):nbreak])
    }
    # Create a tibble with the break analysis results
    df_break = tibble(code=Code_break, Date=as.Date(date_break))
    return (df_break)
}

### 2.2. Time gap
# Compute the time gap by station
get_lacune = function (df_data, df_meta) {
    
    # Get all different stations code
    Code = levels(factor(df_meta$code))
    
    # Create new vector to stock results for cumulative and mean
    # time gap by station
    tLac = c()
    meanLac = c()

    # Get rows where there is no NA
    NoNA = complete.cases(df_data)
    # Get data where there is no NA
    df_data_NoNA = df_data[NoNA,]

    # For every station
    for (code in Code) {   
        # Get only the data rows for the selected station
        df_data_code = df_data[df_data$code==code,]
        # Get date for the selected station
        Date = df_data_code$Date
        # Get time span for the selection station
        span = as.numeric(Date[length(Date)] - Date[1])
        
        # Get only the data rows with no NA for the selected station
        df_data_NoNA_code = df_data_NoNA[df_data_NoNA$code==code,]
        # Get date for the selected station
        Date_NoNA = df_data_NoNA_code$Date
        
        # Compute the time gap
        lac = as.numeric(diff(Date_NoNA) - 1)

        # Compute the cumulative gap
        lac_sum = sum(lac)
        # Store the cumulative gap rate
        tLac = c(tLac, lac_sum/span)

        # Compute the mean gap
        lac_mean = mean(lac[lac != 0])
        # Store the mean gap
        meanLac = c(meanLac, lac_mean) 
    }
    
    # Compute the cumulative gap rate in pourcent
    tLac100 = tLac * 100
    # Create tibble for lacune
    df_lac = tibble(code=Code, tLac100=tLac100, meanLac=meanLac)
    # Join a tibble
    df_meta = full_join(df_meta, df_lac)
    return (df_meta)
}