Newer
Older
#
# *1 INRAE, France
# louis.heraut@inrae.fr
#
# This file is part of ash R toolbox.
#
# ash R toolbox is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# ash R toolbox is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ash R toolbox. If not, see <https://www.gnu.org/licenses/>.
# ///
#
#
# processing/format.R
#
# Manages all the format problem of data and info. Mainly problem of
# input and output of the 'StatsAnalysisTrend' package. It also allows
# to join different selections of station and to gets exact period of
# trend analysis.
## 1. BEFORE TREND ANALYSE ___________________________________________
### 1.1. Joining selection ___________________________________________
# Joins tibbles of different selection of station as a unique one
join_selection = function (df_data_AEAG, df_data_INRAE, df_meta_AEAG, df_meta_INRAE) {
# If there is an INRAE and an Agence de l'eau Adour-Garonne selection
if (!is.null(df_data_INRAE) & !is.null(df_data_AEAG)) {
common = levels(factor(df_meta_INRAE[df_meta_INRAE$code %in% df_meta_AEAG$code,]$code))
INRAEadd = levels(factor(df_meta_INRAE[!(df_meta_INRAE$code %in% df_meta_AEAG$code),]$code))
# Selects only the INRAE meta to add
df_meta_INRAEadd = df_meta_INRAE[df_meta_INRAE$code %in% INRAEadd,]
df_meta_AEAG$source = 'AEAG'
df_meta_INRAEadd$source = 'INRAE'
# Joins INRAE data to AEAG data
df_meta = full_join(df_meta_AEAG, df_meta_INRAEadd)
# Selects only the INRAE data to add
df_data_INRAEadd = df_data_INRAE[df_data_INRAE$code %in% INRAEadd,]
# Joins INRAE meta to AEAG meta
df_data = full_join(df_data_AEAG, df_data_INRAEadd)
# If there is just an Agence de l'eau Adour-Garonne selection
} else if (is.null(df_data_INRAE) & !is.null(df_data_AEAG)) {
df_meta_AEAG$source = 'AEAG'
df_meta = df_meta_AEAG
df_data = df_data_AEAG
} else if (!is.null(df_data_INRAE) & is.null(df_data_AEAG)) {
df_meta_INRAE$source = 'INRAE'
df_meta = df_meta_INRAE
df_data = df_data_INRAE
# If there is no selection
} else {
stop('No data')
}
return (list(data=df_data, meta=df_meta))
}
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
### 1.2. Local correction of data ____________________________________
flag_data = function (df_data, df_meta, df_flag, Code=NULL, df_mod=NULL) {
if (is.null(Code)) {
# Get all different stations code
Code = levels(factor(df_meta$code))
nCode = length(Code)
} else {
nCode = length(Code)
}
for (code in Code) {
if (code %in% df_flag$code) {
df_flag_code = df_flag[df_flag$code == code,]
nbFlag = nrow(df_flag_code)
for (i in 1:nbFlag) {
newValue = df_flag_code$newValue[i]
date = df_flag_code$Date[i]
OKcor = df_data$code == code & df_data$Date == date
oldValue = df_data$Value[OKcor]
df_data$Value[OKcor] = newValue
if (!is.null(df_mod)) {
df_mod =
add_mod(df_mod, code,
type='Value correction',
fun_name='Manual new value assignment',
comment=paste('At ', date,
' the value ', oldValue,
' becomes ', newValue,
sep=''))
}
}
}
}
if (!is.null(df_mod)) {
res = list(data=df_data, mod=df_mod)
return (res)
} else {
return (df_data)
}
}
### 1.3. Manages missing data ________________________________________
missing_data = function (df_data, df_meta, dayLac_lim=3, yearNA_lim=10, yearStart='01-01', Code=NULL, df_mod=NULL) {
if (is.null(Code)) {
# Get all different stations code
Code = levels(factor(df_meta$code))
nCode = length(Code)
} else {
nCode = length(Code)
}
for (code in Code) {
# Extracts the data corresponding to the code
df_data_code = df_data[df_data$code == code,]
DateNA = df_data_code$Date[is.na(df_data_code$Value)]
dDateNA = diff(DateNA)
if (any(dDateNA != 1)) {
dDateNA = c(10, dDateNA)
idJump = which(dDateNA != 1)
NJump = length(idJump)
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
for (i in 1:NJump) {
idStart = idJump[i]
if (i < NJump) {
idEnd = idJump[i+1] - 1
} else {
idEnd = length(DateNA)
}
Start = DateNA[idStart]
End = DateNA[idEnd]
duration = (End - Start)/365.25
if (duration >= yearNA_lim) {
df_data_code$Value[df_data_code$Date <= End] = NA
if (!is.null(df_mod)) {
df_mod =
add_mod(df_mod, code,
type='Missing data management',
fun_name='NA assignment',
comment=paste('From the start of measurements',
' to ', End, sep=''))
}
}
}
}
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
DateMD = substr(df_data_code$Date, 6, 10)
idyearStart = which(DateMD == yearStart)
if (DateMD[1] != yearStart) {
idyearStart = c(1, idyearStart)
}
NidyearStart = length(idyearStart)
for (i in 1:NidyearStart) {
Start = df_data_code$Date[idyearStart[i]]
if (i < NidyearStart) {
End = df_data_code$Date[idyearStart[i+1] - 1]
} else {
End = df_data_code$Date[length(df_data_code$Date)]
}
OkYear = df_data_code$Date >= Start & df_data_code$Date <= End
df_data_code_year = df_data_code[OkYear,]
StartReal = as.Date(paste(substr(Start, 1, 4),
yearStart, sep='-'))
EndReal = as.Date(paste(as.numeric(substr(Start, 1, 4)) + 1,
yearStart, sep='-'))
nbDate = as.numeric(difftime(EndReal, StartReal,
units="days"))
nbNA = sum(as.numeric(is.na(df_data_code_year$Value)))
nbNA = nbNA + abs(as.numeric(difftime(StartReal, Start,
units="days")))
nbNA = nbNA + abs(as.numeric(difftime(EndReal, End+1,
units="days")))
yearLacMiss_pct = nbNA/nbDate * 100
df_data_code_year$Value = NA
df_data_code[OkYear,] = df_data_code_year
if (!is.null(df_mod)) {
df_mod = add_mod(df_mod, code,
type='Missing data management',
fun_name='NA assignment',
comment=paste('From ', Start,
' to ', End, sep=''))
}
} else if (nbNA <= dayLac_lim & nbNA > 1) {
DateJ = as.numeric(df_data_code_year$Date)
Value = df_data_code_year$Value
Value = approxExtrap(x=DateJ,
y=Value,
xout=DateJ,
method="linear",
na.rm=TRUE)$y
df_data_code$Value[OkYear] = Value
if (!is.null(df_mod)) {
df_mod = add_mod(df_mod, code,
type='Missing data management',
fun_name='approxExtrap',
comment=paste(
'Linear extrapolation of NA from ',
Start, ' to ', End, sep=''))
}
}
}
df_data[df_data$code == code,] = df_data_code
}
if (!is.null(df_mod)) {
res = list(data=df_data, mod=df_mod)
return (res)
} else {
return (df_data)
}
### 1.4. Sampling of the data ________________________________________
sampling_data = function (df_data, df_meta, sampleSpan=c('05-01', '11-30'), Code=NULL, df_mod=NULL) {
if (is.null(Code)) {
# Get all different stations code
Code = levels(factor(df_meta$code))
nCode = length(Code)
} else {
nCode = length(Code)
}
# 1972 is leap year reference is case of leap year comparison
sampleStart = as.Date(paste('1972', sampleSpan[1], sep='-'))
sampleEnd = as.Date(paste('1972', sampleSpan[2], sep='-'))
for (code in Code) {
# Extracts the data corresponding to the code
df_data_code = df_data[df_data$code == code,]
DateMD = substr(df_data_code$Date, 6, 10)
df_data_code$Value[Date < sampleStart | Date > sampleEnd] = NA
if (!is.null(df_mod)) {
df_mod = add_mod(df_mod, code,
type='Seasonal sampling ',
fun_name='NA assignment',
comment=paste('Before ', sampleStart,
' and after ', sampleEnd,
sep=''))
}
# Leap year verification
# print(df_data_code[df_data_code$Date > as.Date("1992-02-25"),])
df_data[df_data$code == code,] = df_data_code
}
if (!is.null(df_mod)) {
res = list(data=df_data, mod=df_mod)
return (res)
} else {
return (df_data)
}
## 2. DURING TREND ANALYSE ___________________________________________
### 2.1. Preparation _________________________________________________
# Prepares the data in order to have a list of a data tibble with
# date, group and flow column and a info tibble with the station code
# and group column to fit the entry of the 'extract.Var' function in
# the 'StatsAnalysisTrend' package
prepare = function(df_data, colnamegroup=NULL) {
colindgroup = which(colnames(df_data) == colnamegroup)
df_data = group_by_at(df_data, colindgroup)
data = tibble(Date=df_data$Date,
group=group_indices(df_data),
info = bind_cols(group=seq(1:nrow(Gkey)),
Gkey)
# Stores data and info tibble as a list that match the entry of
# the 'extract.Var' function
res = list(data=data, info=info)
return (res)
}
### 2.2. Re-preparation ______________________________________________
# Re-prepares the data in outing of the 'extract.Var' function in
# the 'StatsAnalysisTrend' package in order to fit again to the
# entry of the same function
reprepare = function(df_XEx, df_Xlist, colnamegroup=NULL) {
# Changes the column name of the results of the
# 'extract.Var' function
nbt = lengths(regmatches(exDate, gregexpr('-', exDate)))
# Converts it to date from a year and a month
df_XEx$Date = paste(df_XEx$Date, '01', sep='-')
# If there is no dash
# Converts it to date from only a year
df_XEx$Date = paste(df_XEx$Date, '01', '01', sep='-')
# If there is more than 2 dashes
df_XEx[-1],
df_Xlist$info[df_XEx$group,
2:ncol(df_Xlist$info)])
df_XlistEx = prepare(df_XEx, colnamegroup=colnamegroup)
return (df_XlistEx)
### 2.3. Prepare date ________________________________________________
prepare_date = function(df_XEx, df_Xlist, per.start="01-01") {
# filter(group_by(df_Xlist$data, group), Date == min(Date))
df_dateStart$Date_julian = NA
df_dateStart$DateHydro_julian = NA
date = as.Date(df_dateStart$Date)
date_per.start = as.Date(paste(substr(date, 1, 4),
'-', per.start, sep=''))
date[date < date_per.start] = date_per.start[date < date_per.start]
df_dateStart$Date = date
origin = as.Date(paste(format(df_dateStart$Date, "%Y"),
'-', per.start, sep=''))
# originHydro = as.Date(paste(format(df_dateStart$Date, "%Y"),
# '-01-01', sep=''))
# print(date[i])
# dateJulHydrotmp = julian(date[i], origin=originHydro[i])
# df_dateStart$DateHydro_julian[i] = dateJulHydrotmp
Ok_dateStart = df_dateStart$group == group
Shift = df_dateStart$Date_julian[Ok_dateStart]
year = df_dateStart$Year[Ok_dateStart]
OkXEx_code_year = df_XEx$group1 == group & df_XEx$datetime == year
df_XEx$values[OkXEx_code_year] =
df_XEx$values[OkXEx_code_year] + Shift
# ShiftHydro = df_dateStart$DateHydro_julian[Ok_dateStart]
# df_XEx$values[OkXEx_code] = df_XEx$values[OkXEx_code] + ShiftHydro
## Add 365 when the point is too remote
# XEx_code = df_XEx$values[OkXEx_code]
# meanXEx_code = mean(XEx_code, na.rm=TRUE)
# dXEx_code = meanXEx_code - XEx_code
# stdXEx_code = sd(XEx_code, na.rm=TRUE)
# OkOverStd = dXEx_code >= stdXEx_code*3
# OkOverStd[is.na(OkOverStd)] = FALSE
# XEx_code[OkOverStd] = XEx_code[OkOverStd] + 365
# df_XEx$values[OkXEx_code] = XEx_code
# print(group)
# print(df_XEx$datetime[df_XEx$group1 == group][dXEx_code >= stdXEx_code*3])
## 3. AFTER TREND ANALYSE ____________________________________________
### 3.1. Period of trend _____________________________________________
# Compute the start and the end of the period for a trend analysis
# according to the accessible data
get_period = function (per, df_Xtrend, df_XEx, df_Xlist) {
# Converts results of trend to tibble
df_Xtrend = tibble(df_Xtrend)
# Fix the period start and end of the accessible period to a
# default date
df_Xtrend$period_start = as.Date("1970-01-01")
df_Xtrend$period_end = as.Date("1970-01-01")
# Changes the format of the date variable to date
df_Xlisttmp = reprepare(df_XEx, df_Xlist, colnamegroup=c('code'))
df_XExtmp = df_Xlisttmp$data
# For all the different group
for (g in df_Xlisttmp$info$group) {
# Gets the analyse data associated to the group
df_XExtmp_code = df_XExtmp[df_XExtmp$group == g,]
# Gets the id in the trend result associated to the group
id = which(df_Xtrend$group1 == g)
# Computes index of the nearest accessible start and end date
OkStart = df_XExtmp_code$Date >= as.Date(per[1])
OkEnd = df_XExtmp_code$Date <= as.Date(per[2])
# DateStart = df_XExtmp_code$Date[OkStart]
# DateEnd = df_XExtmp_code$Date[OkEnd]
DateStart = df_XExtmp_code$Date
DateEnd = df_XExtmp_code$Date
iStart = which.min(abs(DateStart - as.Date(per[1])))
iEnd = which.min(abs(DateEnd - as.Date(per[2])))
# print(nrow(df_XEx))
# print(as.Date(DateStart[iStart]))
# print(head(df_XEx))
# print(tail(df_XEx))
# print(as.Date(DateEnd[iEnd]))
df_Xtrend$period_start[id] = as.Date(DateStart[iStart])
df_Xtrend$period_end[id] = as.Date(DateEnd[iEnd])
### 3.2. Cleaning ____________________________________________________
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
# Cleans the trend results of the function 'Estimate.stats' in the
# 'StatsAnalysisTrend' package. It adds the station code and the
# intercept of the trend to the trend results. Also makes the data
# more presentable.
clean = function (df_Xtrend, df_XEx, df_Xlist) {
# Reprepares the list of data and info in order to be presentable
df_Xlist = reprepare(df_XEx, df_Xlist, colnamegroup=c('code'))
# Adds a column of station code
df_Xlist$data$code = NA
# For all the group
for (g in df_Xlist$info$group) {
# Adds the station code corresponding to each group info
df_Xlist$data$code[which(df_Xlist$data$group == g)] = df_Xlist$info$code[df_Xlist$info$group == g]
}
# Adds the info to trend tibble
df_Xtrend = bind_cols(df_Xtrend,
df_Xlist$info[df_Xtrend$group1,
2:ncol(df_Xlist$info)])
# Renames the column of group of trend results
colnames(df_Xtrend)[1] = 'group'
# Adds the intercept value of trend
df_Xtrend = get_intercept(df_Xtrend, df_Xlist, unit2day=365.25)
# Changes the position of the intercept column
df_Xtrend = relocate(df_Xtrend, intercept, .after=trend)
# Creates a list of results to return
res = list(trend=df_Xtrend, data=df_Xlist$data, info=df_Xlist$info)
return (res)
}
## 4. OTHER __________________________________________________________
add_mod = function (df_mod, Code, type, fun_name, comment, df_meta=NULL) {
if (Code == 'all' & is.null(df_meta)) {
Code = NA # erreur
} else if (Code == 'all' & !is.null(df_meta)) {
# Get all different stations code
Code = levels(factor(df_meta$code))
}
for (code in Code) {
df_modtmp = tibble(code=code, type=type,
fun_name=fun_name,
comment=comment)
df_mod = bind_rows(df_mod, df_modtmp)
}
return (df_mod)
}