Newer
Older
#
# *1 INRAE, France
# louis.heraut@inrae.fr
#
# This file is part of ash R toolbox.
#
# ash R toolbox is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# ash R toolbox is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ash R toolbox. If not, see <https://www.gnu.org/licenses/>.
# ///
#
#
# processing/analyse.R
#
# File that realise all the possible analysis of data.
# This file regroup mainly the functions use to compute the trend
# analysis of hydrologic variables thanks to the Mann-Kendall Test.
# Functions needed for break or gap analysis are also present.
# Usefull library
library(dplyr)
## 1. TREND ANALYSIS _________________________________________________
### 1.0. Intercept of trend __________________________________________
# Compute intercept values of linear trends with first order values
# of trends and the data on which analysis is performed.
get_intercept = function (df_Xtrend, df_Xlist, unit2day=365.25) {
df_data_code = df_Xlist$data[df_Xlist$data$group == g,]
df_Xtrend_code = df_Xtrend[df_Xtrend$group == g,]
Start = df_Xtrend_code$period_start
End = df_Xtrend_code$period_end
# Extract only the unrepeated dates
UStart = levels(factor(Start))
df_data_code_per =
df_data_code[df_data_code$Date >= Start[i]
& df_data_code$Date <= End[i],]
df_Xtrend_code_per =
df_Xtrend_code[df_Xtrend_code$period_start == Start[i]
& df_Xtrend_code$period_end == End[i],]
id = which(df_Xtrend$group == g
& df_Xtrend$period_start == Start[i]
& df_Xtrend$period_end == End[i])
mu_t = as.numeric(mean(c(Start[i],
End[i]),
na.rm=TRUE)) / unit2day
df_Xtrend$intercept[id] = b
}
}
return (df_Xtrend)
}
### 1.1. QA __________________________________________________________
# Realise the trend analysis of the average annual flow (QA)
# hydrological variable
get_QAtrend = function (df_data, df_meta, period, alpha, yearLac_day, df_mod=tibble()) {
res = missing_data(df_data, df_meta,
yearLac_day=yearLac_day,
df_mod=df_mod)
df_data = res$data
df_mod = res$mod
df_QAlist = prepare(df_data, colnamegroup=c('code'))
df_QAEx = extract.Var(data.station=df_QAlist,
funct=mean,
timestep='year',
period=per,
pos.datetime=1,
na.rm=TRUE)
Imax = I
df_QAlistB = df_QAlist
df_QAExB = df_QAEx
}
df_QAtrend = get_period(per, df_QAtrend, df_QAEx, df_QAlist)
# Store the trend
df_QAtrendB = bind_rows(df_QAtrendB, df_QAtrend)
res_QAtrend = clean(df_QAtrendB, df_QAExB, df_QAlistB)
res = list(data=df_data, mod=df_mod, analyse=res_QAtrend)
return (res)
### 1.2. QMNA ________________________________________________________
# Realise the trend analysis of the monthly minimum flow in the
# year (QMNA) hydrological variable
get_QMNAtrend = function (df_data, df_meta, period, alpha, sampleSpan, yearLac_day, df_mod=tibble()) {
res = missing_data(df_data, df_meta,
yearLac_day=yearLac_day,
df_mod=df_mod)
df_data = res$data
df_mod = res$mod
res = sampling_data(df_data, df_meta,
sampleSpan=sampleSpan,
df_mod=df_mod)
df_data = res$data
df_mod = res$mod
df_QMNAlist = prepare(df_data, colnamegroup=c('code'))
df_QMNAEx = extract.Var(data.station=df_QMNAlist,
funct=mean,
period=per,
df_QMNAlist = reprepare(df_QMNAEx,
df_QMNAlist,
colnamegroup=c('code'))
df_QMNAEx = extract.Var(data.station=df_QMNAlist,
funct=min,
period=per,
timestep='year',
pos.datetime=1,
na.rm=TRUE)
Imax = I
df_QMNAlistB = df_QMNAlist
df_QMNAExB = df_QMNAEx
}
df_QMNAtrendB = bind_rows(df_QMNAtrendB, df_QMNAtrend)
}
res_QMNAtrend = clean(df_QMNAtrendB, df_QMNAExB, df_QMNAlistB)
res = list(data=df_data, mod=df_mod, analyse=res_QMNAtrend)
return (res)
### 1.3. VCN10 _______________________________________________________
rollmean_code = function (df_data, Code, nroll=10, df_mod=NULL) {
df_data_roll_code = tibble(Date=df_data_code$Date,
Value=rollmean(df_data_code$Value,
10,
fill=NA),
code=code)
if (!is.null(df_mod)) {
df_mod = add_mod(df_mod, code,
type='Rolling average',
fun_name='rollmean',
comment='Rolling average of 10 day over all the data')
}
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
if (!is.null(df_mod)) {
res = list(data=df_data, mod=df_mod)
return (res)
} else {
return (df_data_roll)
}
}
# Realises the trend analysis of the minimum 10 day average flow
# over the year (VCN10) hydrological variable
get_VCN10trend = function (df_data, df_meta, period, alpha, sampleSpan, yearLac_day, df_mod=tibble()) {
# Get all different stations code
Code = levels(factor(df_meta$code))
# Computes the rolling average by 10 days over the data
res = rollmean_code(df_data, Code, 10, df_mod=df_mod)
df_data_roll = res$data
df_mod = res$mod
# Removes incomplete data from time series
res = missing_data(df_data_roll, df_meta,
yearLac_day=yearLac_day,
df_mod=df_mod)
df_data_roll = res$data
df_mod = res$mod
# Samples the data
res = sampling_data(df_data_roll, df_meta,
sampleSpan=sampleSpan,
df_mod=df_mod)
df_data_roll = res$data
df_mod = res$mod
df_VCN10list = prepare(df_data_roll, colnamegroup=c('code'))
df_VCN10Ex = extract.Var(data.station=df_VCN10list,
funct=min,
period=per,
timestep='year',
pos.datetime=1,
na.rm=TRUE)
# Get the associated time interval
I = interval(per[1], per[2])
# If it is the largest interval
if (I > Imax) {
# Store it and the associated data and info
Imax = I
df_VCN10listB = df_VCN10list
df_VCN10ExB = df_VCN10Ex
}
# Specify the period of analyse
df_VCN10trend = get_period(per, df_VCN10trend, df_VCN10Ex,
df_VCN10list)
# Store the trend
df_VCN10trendB = bind_rows(df_VCN10trendB, df_VCN10trend)
}
# Clean results of trend analyse
res_VCN10trend = clean(df_VCN10trendB, df_VCN10ExB, df_VCN10listB)
res = list(data=df_data_roll, mod=df_mod,
analyse=res_VCN10trend)
return (res)
### 1.4. tDEB date ___________________________________________________
which_underfirst = function (L, UpLim, select_longest=TRUE) {
if (select_longest) {
dID = diff(ID)
dID = c(10, dID)
IDjump = which(dID != 1)
Njump = length(IDjump)
Periods = vector(mode='list', length=Njump)
Nperiod = c()
for (i in 1:Njump) {
idStart = IDjump[i]
if (i < Njump) {
idEnd = IDjump[i+1] - 1
} else {
idEnd = length(ID)
}
period = ID[idStart:idEnd]
Periods[[i]] = period
Nperiod = c(Nperiod, length(period))
period_max = Periods[[which.max(Nperiod)]]
id = period_max[1]
} else {
id = ID[1]
get_tDEBtrend = function (df_data, df_meta, period, alpha, sampleSpan, yearLac_day, thresold_type='VCN10', select_longest=TRUE, df_mod=tibble()) {
# Get all different stations code
Code = levels(factor(df_meta$code))
# Gets the number of station
nCode = length(Code)
# Computes the rolling average by 10 days over the data
res = rollmean_code(df_data, Code, 10, df_mod=df_mod)
df_data_roll = res$data
df_mod = res$mod
# Removes incomplete data from time series
df_data = missing_data(df_data,
df_meta=df_meta,
yearLac_day=yearLac_day)
# Samples the data
df_data = sampling_data(df_data,
df_meta=df_meta,
sampleSpan=sampleSpan)
# Removes incomplete data from the averaged time series
res = missing_data(df_data_roll,
df_meta=df_meta,
yearLac_day=yearLac_day,
df_mod=df_mod)
df_data_roll = res$data
df_mod = res$mod
res = sampling_data(df_data_roll,
df_meta=df_meta,
sampleSpan=sampleSpan,
df_mod=df_mod)
df_data_roll = res$data
df_mod = res$mod
# Make sure to convert the period to a list
period = as.list(period)
# Set the max interval period as the minimal possible
Imax = 0
# Blank tibble for data to return
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
if (thresold_type == 'QNj') {
# Prepare the data to fit the entry of extract.Var
df_QTlist = prepare(df_data,
colnamegroup=c('code'))
} else if (thresold_type == 'VCN10') {
# Prepare the data to fit the entry of extract.Var
df_QTlist = prepare(df_data_roll,
colnamegroup=c('code'))
}
# Compute the yearly mean over the data
df_QTEx = extract.Var(data.station=df_QTlist,
funct=min,
timestep='year',
period=per,
pos.datetime=1,
na.rm=TRUE)
df_QT = summarise(group_by(df_QTEx, group1),
values=max(values, na.rm=TRUE))
# Renames the column of group of trend results
colnames(df_QT) = c('group', 'Thresold')
df_QT = full_join(df_QT, df_QTlist$info, by='group')
df_QT = df_QT[-1]
# print(df_QT)
df_tDEBEx = tibble()
df_tDEBlist = list(data=tibble(), info=tibble())
# For all the code
for (k in 1:nCode) {
# Gets the code
code = Code[k]
# Get the data associated to the code
df_data_code = df_data[df_data$code == code,]
# Get the averaged data associated to the code
df_data_roll_code = df_data_roll[df_data_roll$code == code,]
QT_code = df_QT$Thresold[df_QT$code == code]
# Compute the yearly min over the averaged data
UpLim=QT_code,
select_longest=select_longest)
df_tDEBEx_code$group1 = k
df_tDEBlist_code$data$group = k
df_tDEBlist_code$info$group = k
# Converts index of the tDEB to the julian date associated
df_tDEBEx_code = prepare_date(df_tDEBEx_code,
df_tDEBlist_code)
df_tDEBlist$data = bind_rows(df_tDEBlist$data,
df_tDEBlist_code$data)
df_tDEBlist$info = bind_rows(df_tDEBlist$info,
df_tDEBlist_code$info)
# Get the associated time interval
I = interval(per[1], per[2])
# If it is the largest interval
if (I > Imax) {
# Store it and the associated data and info
Imax = I
df_tDEBtrend = get_period(per, df_tDEBtrend, df_tDEBEx,
df_tDEBlist)
res_tDEBtrend = clean(df_tDEBtrendB, df_tDEBExB, df_tDEBlistB)
res = list(data=df_data_roll, mod=df_mod,
analyse=res_tDEBtrend)
return (res)
### 1.5. tCEN date ___________________________________________________
# Realises the trend analysis of the date of the minimum 10 day
# average flow over the year (VCN10) hydrological variable
get_tCENtrend = function (df_data, df_meta, period, alpha, sampleSpan, yearLac_day, df_mod=tibble()) {
# Get all different stations code
Code = levels(factor(df_meta$code))
# Blank tibble to store the data averaged
df_data_roll = tibble()
# Computes the rolling average by 10 days over the data
res = rollmean_code(df_data, Code, 10, df_mod=df_mod)
df_data_roll = res$data
df_mod = res$mod
res = missing_data(df_data_roll, df_meta,
yearLac_day=yearLac_day,
df_mod=df_mod)
df_data_roll = res$data
df_mod = res$mod
res = sampling_data(df_data_roll, df_meta,
sampleSpan=sampleSpan,
df_mod=df_mod)
df_data_roll = res$data
df_mod = res$mod
# Make sure to convert the period to a list
period = as.list(period)
# Set the max interval period as the minimal possible
Imax = 0
# Blank tibble for data to return
# For all periods
for (per in period) {
# Prepare the data to fit the entry of extract.Var
funct=which.min,
period=per,
timestep='year',
pos.datetime=1)
# Converts index of the tCEN to the julian date associated
df_tCENEx = prepare_date(df_tCENEx, df_tCENlist)
df_tCENtrend = get_period(per, df_tCENtrend, df_tCENEx,
df_tCENlist)
res = list(data=df_data_roll, mod=df_mod,
analyse=res_tCENtrend)
return (res)
## 2. OTHER ANALYSES _________________________________________________
### 2.1. Hydrograph __________________________________________________
0.133, 0.126, 0.111, 0.110, 0.081, 0.056, 0.038,
0.017, 0.008, 0.017, 0.042, 0.108, 0.183, 0.200,
0.175, 0.117, 0.067, 0.042, 0.025),
ncol=12, byrow=TRUE)
colnames(xref) = seq(1, 12, 1)
row.names(xref) = c('GROUP1', 'GROUP2', 'GROUP3', 'GROUP4',
'GROUP5', 'GROUP6', 'GROUP7', 'GROUP8',
'GROUP9', 'GROUP10', 'GROUP11', 'GROUP12')
# Computes the hydrograph of a station
get_hydrograph = function (df_data, period=NULL, df_meta=NULL) {
# If there is a specified period
if (!is.null(period)) {
# Extracts only the data of this period
df_data = df_data[df_data$Date >= as.Date(period[1])
& df_data$Date <= as.Date(period[2]),]
}
# If there is the metadata
if (!is.null(df_meta)) {
# New column in metadata for hydrological regime
df_meta$regime_hydro = NA
# New column in metadata for the start of the hydrological year
df_meta$start_year = NA
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
# Get all different stations code
Code = levels(factor(df_meta$code))
# Number of stations
nCode = length(Code)
# Otherwise it is just a list of flow from one station
} else {
# Only one code is present
nCode = 1
}
# Blank tibble to store data
df_QM = tibble()
# For all accessible code
for (k in 1:nCode) {
# If there is the metadata
if (!is.null(df_meta)) {
# Gets the code
code = Code[k]
# Get the associated data
df_data_code = df_data[df_data$code == code,]
} else {
# The data are the date for the current code
df_data_code = df_data
}
# Gets a list of the month of the data as numeric
monthData = as.numeric(format(df_data_code$Date, "%m"))
# Blank list to stock month mean
QM_code = c()
# For all months
for (i in 1:12) {
# Gets all the flow data associated to the current month
data = df_data_code$Value[monthData == i]
# Averages the data
QM_code[i] = mean(data, na.rm=TRUE)
}
regime = 0
classRegime = ""
distance = rep(0, length(xref[,1]))
distancemin = 0
for (j in 1:length(xref[,1])) {
distance[j] = sum((QM_code / mean(QM_code) - xref[j,])^2)
}
regime = 1 + length(xref[,1]) - which.min(distance)
distancemin = distance[which.min(distance)]
if (regime < 7) {
classRegime = "Pluvial"
} else if (regime >= 7 & regime < 10) {
classRegime = "Transition"
} else if (regime >= 10) {
# If there is the metadata
if (!is.null(df_meta)) {
# Creates a temporary tibble to store hydrograph results
df_QMtmp = tibble(QM=QM_code, code=code)
# Stores it
df_QM = bind_rows(df_QM, df_QMtmp)
# Stores result of the hydrological regime
df_meta$regime_hydro[df_meta$code == code] = classRegime
# Stores it as the start of the hydrological year
df_meta$start_year[df_meta$code == code] = maxMonth
# Otherwise
} else {
# No tibble needed
df_QM = QM_code
df_meta = classRegime
}
}
# Returns the hydrograph and meta data
return (list(QM=df_QM, meta=df_meta))
}
### 2.2. Break date __________________________________________________
# Get all different stations code
Code = levels(factor(df_meta$code))
# Number of stations
nCode = length(Code)
# Blank date break list and associated station code vector
date_break = list()
Code_break = c()
# For all accessible code
for (code in Code) {
# Get the associated data
df_data_code = df_data[df_data$code == code,]
# Remove NA data
df_data_codeNoNA = df_data_code[!is.na(df_data_code$Value),]
# Extract p value
p_value = res_break$p
# The length of the data analysed
nbreak = res_break$nobs
# Index of the break date
ibreak = res_break$estimate
# If the p value results is under the thresold
# Get the mean of the index break if there is several
ibreak = round(mean(ibreak), 0)
# Store the date break with its associated code
date_break = append(date_break,
df_data_codeNoNA$Date[ibreak])
Code_break = append(Code_break, code)
}
# step1 = mean(df_data_codeNoNA$Value[1:ibreak])
# step2 = mean(df_data_codeNoNA$Value[(ibreak+1):nbreak])
}
# Create a tibble with the break analysis results
df_break = tibble(code=Code_break, Date=as.Date(date_break))
return (df_break)
}
### 2.3. Time gap ____________________________________________________
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
# Compute the time gap by station
get_lacune = function (df_data, df_meta) {
# Get all different stations code
Code = levels(factor(df_meta$code))
# Create new vector to stock results for cumulative and mean
# time gap by station
tLac = c()
meanLac = c()
# Get rows where there is no NA
NoNA = complete.cases(df_data)
# Get data where there is no NA
df_data_NoNA = df_data[NoNA,]
# For every station
for (code in Code) {
# Get only the data rows for the selected station
df_data_code = df_data[df_data$code==code,]
# Get date for the selected station
Date = df_data_code$Date
# Get time span for the selection station
span = as.numeric(Date[length(Date)] - Date[1])
# Get only the data rows with no NA for the selected station
df_data_NoNA_code = df_data_NoNA[df_data_NoNA$code==code,]
# Get date for the selected station
Date_NoNA = df_data_NoNA_code$Date
# Compute the time gap
lac = as.numeric(diff(Date_NoNA) - 1)
# Compute the cumulative gap
lac_sum = sum(lac)
# Store the cumulative gap rate
tLac = c(tLac, lac_sum/span)
# Compute the mean gap
lac_mean = mean(lac[lac != 0])
# Store the mean gap
meanLac = c(meanLac, lac_mean)
}
# Compute the cumulative gap rate in pourcent
tLac100 = tLac * 100
# Create tibble for lacune
df_lac = tibble(code=Code, tLac100=tLac100, meanLac=meanLac)
# Join a tibble
df_meta = full_join(df_meta, df_lac)
return (df_meta)
}
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
### 2.4. Compute square root of data _________________________________
compute_sqrt = function (df_data) {
df_sqrt = tibble(Date=df_data$Date,
Value=sqrt(df_data$Value),
code=df_data$code)
return (df_sqrt)
}
### 2.5. Criticism of data ___________________________________________
add_critique = function (df_critique, Code, author, level, start_date, variable, type, comment='', end_date=NULL, df_meta=NULL, resdir=NULL) {
if (Code == 'all' & is.null(df_meta)) {
Code = NA # erreur
} else if (Code == 'all' & !is.null(df_meta)) {
# Get all different stations code
Code = levels(factor(df_meta$code))
}
if (is.null(end_date)) {
end_date = start_date
}
df_tmp = tibble(code=Code, author=author, level=level,
start_date=start_date, end_date=end_date,
variable=variable, type=type,
comment=comment)
df_critique = bind_rows(df_critique, df_tmp)
nc = nrow(df_critique)
print('Criticism registered')
print(df_critique[(nc-2):nc,])
if (!is.null(resdir)) {
write_critique(df_critique, resdir)
}
return (df_critique)
}
# df_critique = add_critique(df_critique, resdir=resdir, Code='', author='louis', level=, start_date=, end_date=NA, variable='', type='', comment='')