Newer
Older
#
# *1 INRAE, France
# louis.heraut@inrae.fr
#
# This file is part of ash R toolbox.
#
# ash R toolbox is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# ash R toolbox is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ash R toolbox. If not, see <https://www.gnu.org/licenses/>.
# ///
#
#
# processing/format.R
#
# Manages all the format problem of data and info. Mainly problem of
# input and output of the 'StatsAnalysisTrend' package. It also allows
# to join different selections of station and to gets exact period of
# trend analysis.
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
## 1. BEFORE TREND ANALYSE
### 1.1. Joining selection
# Joins tibbles of different selection of station as a unique one
join = function (df_data_AG, df_data_IN, df_meta_AG, df_meta_IN) {
# If there is an INRAE and an Agence de l'eau Adour-Garonne selection
if (!is.null(df_data_IN) & !is.null(df_data_AG)) {
# Gets the station in common
common = levels(factor(df_meta_IN[df_meta_IN$code %in% df_meta_AG$code,]$code))
# Gets the Nv station to add
INadd = levels(factor(df_meta_IN[!(df_meta_IN$code %in% df_meta_AG$code),]$code))
# Selects only the IN meta to add
df_meta_INadd = df_meta_IN[df_meta_IN$code %in% INadd,]
# Names the source of the selection
df_meta_AG$source = 'AG'
df_meta_INadd$source = 'IN'
# Joins IN data to AG data
df_meta = full_join(df_meta_AG, df_meta_INadd)
# Selects only the IN data to add
df_data_INadd = df_data_IN[df_data_IN$code %in% INadd,]
# Joins IN meta to AG meta
df_data = full_join(df_data_AG, df_data_INadd)
# If there is just an Agence de l'eau Adour-Garonne selection
} else if (is.null(df_data_IN) & !is.null(df_data_AG)) {
df_meta_AG$source = 'AG'
df_meta = df_meta_AG
df_data = df_data_AG
# If there is just an INRAE selection
} else if (!is.null(df_data_IN) & is.null(df_data_AG)) {
df_meta_IN$source = 'IN'
df_meta = df_meta_IN
df_data = df_data_IN
# If there is no selection
} else {
stop('No data')
}
return (list(data=df_data, meta=df_meta))
}
### 1.2. Manages missing data
missing_data = function (df_data, df_meta, yearLac_day=3, yearStart='01-01', Code=NULL) {
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
if (is.null(Code)) {
# Get all different stations code
Code = levels(factor(df_meta$code))
nCode = length(Code)
} else {
nCode = length(Code)
}
for (code in Code) {
# Extracts the data corresponding to the code
df_data_code = df_data[df_data$code == code,]
DateMD = substr(df_data_code$Date, 6, 10)
idyearStart = which(DateMD == yearStart)
if (DateMD[1] != yearStart) {
idyearStart = c(1, idyearStart)
}
NidyearStart = length(idyearStart)
for (i in 1:NidyearStart) {
Start = df_data_code$Date[idyearStart[i]]
if (i < NidyearStart) {
End = df_data_code$Date[idyearStart[i+1] - 1]
} else {
End = df_data_code$Date[length(df_data_code$Date)]
}
OkYear = df_data_code$Date >= Start & df_data_code$Date <= End
df_data_code_year = df_data_code[OkYear,]
StartReal = as.Date(paste(substr(Start, 1, 4),
yearStart, sep='-'))
EndReal = as.Date(paste(as.numeric(substr(Start, 1, 4)) + 1,
yearStart, sep='-'))
nbDate = as.numeric(difftime(EndReal, StartReal,
units="days"))
nbNA = sum(as.numeric(is.na(df_data_code_year$Value)))
nbNA = nbNA + abs(as.numeric(difftime(StartReal, Start,
units="days")))
nbNA = nbNA + abs(as.numeric(difftime(EndReal, End+1,
units="days")))
yearLacMiss_pct = nbNA/nbDate * 100
df_data_code_year$Value = NA
df_data_code[OkYear,] = df_data_code_year
} else if (nbNA <= yearLac_day & nbNA > 1) {
DateJ = as.numeric(df_data_code_year$Date)
Value = df_data_code_year$Value
Value = approxExtrap(x=DateJ,
y=Value,
xout=DateJ,
method="linear",
na.rm=TRUE)$y
df_data_code$Value[OkYear] = Value
}
}
df_data[df_data$code == code,] = df_data_code
}
return (df_data)
}
### 1.3. Sampling of the data
sampling_data = function (df_data, df_meta, sampleSpan=c('05-01', '11-30'), Code=NULL) {
if (is.null(Code)) {
# Get all different stations code
Code = levels(factor(df_meta$code))
nCode = length(Code)
} else {
nCode = length(Code)
}
# 1972 is leap year reference is case of leap year comparison
sampleStart = as.Date(paste('1972', sampleSpan[1], sep='-'))
sampleEnd = as.Date(paste('1972', sampleSpan[2], sep='-'))
for (code in Code) {
# Extracts the data corresponding to the code
df_data_code = df_data[df_data$code == code,]
DateMD = substr(df_data_code$Date, 6, 10)
df_data_code$Value[Date < sampleStart | Date > sampleEnd] = NA
# Leap year verification
# print(df_data_code[df_data_code$Date > as.Date("1992-02-25"),])
df_data[df_data$code == code,] = df_data_code
}
return (df_data)
}
## 2. DURING TREND ANALYSE
### 2.1. Preparation
# Prepares the data in order to have a list of a data tibble with
# date, group and flow column and a info tibble with the station code
# and group column to fit the entry of the 'extract.Var' function in
# the 'StatsAnalysisTrend' package
prepare = function(df_data, colnamegroup=NULL) {
colindgroup = which(colnames(df_data) == colnamegroup)
df_data = group_by_at(df_data, colindgroup)
data = tibble(Date=df_data$Date,
group=group_indices(df_data),
info = bind_cols(group=seq(1:nrow(Gkey)),
Gkey)
# Stores data and info tibble as a list that match the entry of
# the 'extract.Var' function
res = list(data=data, info=info)
return (res)
}
# Re-prepares the data in outing of the 'extract.Var' function in
# the 'StatsAnalysisTrend' package in order to fit again to the
# entry of the same function
reprepare = function(df_XEx, df_Xlist, colnamegroup=NULL) {
# Changes the column name of the results of the
# 'extract.Var' function
nbt = lengths(regmatches(exDate, gregexpr('-', exDate)))
# Converts it to date from a year and a month
df_XEx$Date = paste(df_XEx$Date, '01', sep='-')
# If there is no dash
# Converts it to date from only a year
df_XEx$Date = paste(df_XEx$Date, '01', '01', sep='-')
# If there is more than 2 dashes
df_XEx[-1],
df_Xlist$info[df_XEx$group,
2:ncol(df_Xlist$info)])
df_XlistEx = prepare(df_XEx, colnamegroup=colnamegroup)
return (df_XlistEx)
prepare_date = function(df_XEx, df_Xlist, per.start="01-01") {
# filter(group_by(df_Xlist$data, group), Date == min(Date))
df_dateStart$Date_julian = NA
df_dateStart$DateHydro_julian = NA
date = as.Date(df_dateStart$Date)
date_per.start = as.Date(paste(substr(date, 1, 4),
'-', per.start, sep=''))
date[date < date_per.start] = date_per.start[date < date_per.start]
df_dateStart$Date = date
origin = as.Date(paste(format(df_dateStart$Date, "%Y"),
'-', per.start, sep=''))
originHydro = as.Date(paste(format(df_dateStart$Date, "%Y"),
'-01-01', sep=''))
for (i in 1:nrow(df_dateStart)) {
df_dateStart$Date_julian[i] = dateJultmp
dateJulHydrotmp = julian(date[i], origin=originHydro[i])
df_dateStart$DateHydro_julian[i] = dateJulHydrotmp
for (group in df_dateStart$group) {
Ok_dateStart = df_dateStart$group == group
Shift = df_dateStart$Date_julian[Ok_dateStart]
year = df_dateStart$Year[Ok_dateStart]
OkXEx_code_year = df_XEx$group1 == group & df_XEx$datetime == year
df_XEx$values[OkXEx_code_year] =
df_XEx$values[OkXEx_code_year] + Shift
OkXEx_code = df_XEx$group1 == group
ShiftHydro = df_dateStart$DateHydro_julian[Ok_dateStart]
df_XEx$values[OkXEx_code] = df_XEx$values[OkXEx_code] + ShiftHydro
OkOverStd = dXEx_code >= stdXEx_code*3
OkOverStd[is.na(OkOverStd)] = FALSE
XEx_code[OkOverStd] = XEx_code[OkOverStd] + 365
# print(group)
# print(df_XEx$datetime[df_XEx$group1 == group][dXEx_code >= stdXEx_code*3])
## 3. AFTER TREND ANALYSE
### 3.1. Period of trend
# Compute the start and the end of the period for a trend analysis
# according to the accessible data
get_period = function (per, df_Xtrend, df_XEx, df_Xlist) {
# Converts results of trend to tibble
df_Xtrend = tibble(df_Xtrend)
# Fix the period start and end of the accessible period to a
# default date
df_Xtrend$period_start = as.Date("1970-01-01")
df_Xtrend$period_end = as.Date("1970-01-01")
# Changes the format of the date variable to date
df_Xlisttmp = reprepare(df_XEx, df_Xlist, colnamegroup=c('code'))
df_XExtmp = df_Xlisttmp$data
# For all the different group
for (g in df_Xlisttmp$info$group) {
# Gets the analyse data associated to the group
df_XExtmp_code = df_XExtmp[df_XExtmp$group == g,]
# Gets the id in the trend result associated to the group
id = which(df_Xtrend$group1 == g)
# Computes index of the nearest accessible start and end date
OkStart = df_XExtmp_code$Date >= as.Date(per[1])
OkEnd = df_XExtmp_code$Date <= as.Date(per[2])
# DateStart = df_XExtmp_code$Date[OkStart]
# DateEnd = df_XExtmp_code$Date[OkEnd]
DateStart = df_XExtmp_code$Date
DateEnd = df_XExtmp_code$Date
iStart = which.min(abs(DateStart - as.Date(per[1])))
iEnd = which.min(abs(DateEnd - as.Date(per[2])))
# print(nrow(df_XEx))
# print(as.Date(DateStart[iStart]))
# print(head(df_XEx))
# print(tail(df_XEx))
# print(as.Date(DateEnd[iEnd]))
df_Xtrend$period_start[id] = as.Date(DateStart[iStart])
df_Xtrend$period_end[id] = as.Date(DateEnd[iEnd])
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
### 3.2. Cleaning
# Cleans the trend results of the function 'Estimate.stats' in the
# 'StatsAnalysisTrend' package. It adds the station code and the
# intercept of the trend to the trend results. Also makes the data
# more presentable.
clean = function (df_Xtrend, df_XEx, df_Xlist) {
# Reprepares the list of data and info in order to be presentable
df_Xlist = reprepare(df_XEx, df_Xlist, colnamegroup=c('code'))
# Adds a column of station code
df_Xlist$data$code = NA
# For all the group
for (g in df_Xlist$info$group) {
# Adds the station code corresponding to each group info
df_Xlist$data$code[which(df_Xlist$data$group == g)] = df_Xlist$info$code[df_Xlist$info$group == g]
}
# Adds the info to trend tibble
df_Xtrend = bind_cols(df_Xtrend,
df_Xlist$info[df_Xtrend$group1,
2:ncol(df_Xlist$info)])
# Renames the column of group of trend results
colnames(df_Xtrend)[1] = 'group'
# Adds the intercept value of trend
df_Xtrend = get_intercept(df_Xtrend, df_Xlist, unit2day=365.25)
# Changes the position of the intercept column
df_Xtrend = relocate(df_Xtrend, intercept, .after=trend)
# Creates a list of results to return
res = list(trend=df_Xtrend, data=df_Xlist$data, info=df_Xlist$info)
return (res)
}