Newer
Older
#
# *1 INRAE, France
# louis.heraut@inrae.fr
#
# This file is part of ash R toolbox.
#
# ash R toolbox is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# ash R toolbox is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ash R toolbox. If not, see <https://www.gnu.org/licenses/>.
# ///
#
#
# processing/extract.R
#
# Regroups functions to generation of generical station selection
# according to pre-existing directory of data or specific other
# selection format as '.docx' file from Agence de l'eau Adour-Garonne.
# Also useful to extract the data and the metadata present in the
# Banque Hydro files from a selection of station. Manages also
# shapefiles loading.
### 1. GENERAL METADATA ON STATION ___________________________________
iStatut = c('0'='inconnu',
'1'='station avec signification hydrologique',
'2'='station sans signification hydrologique',
'3'="station d'essai")
'6'="suivi d'étiage",
'7'='bassin expérimental',
'1'='une échelle',
'2'='deux échelles, station mère',
'3'='deux échelles, station fille',
'4'='débits mesurés',
iDebit = c('0'='reconstitué',
'1'=paste("réel (prise en compte de l'eau rajoutée ",
"ou retirée du bassin selon aménagements)",
iQBE = c('0'='qualité basses eaux inconnue',
'1'='qualité basses eaux bonne',
'2'='qualité basses eaux douteuse')
iQME = c('0'='qualité moyennes eaux inconnue',
'1'='qualité moyennes eaux bonne',
'2'='qualité moyennes eaux douteuse')
iQHE = c('0'='qualité hautes eaux inconnue',
'1'='qualité hautes eaux bonne',
'2'='qualité hautes eaux douteuse')
'A'='Rhin',
'B'='Meuse',
'F'='Seine aval (Marne incluse)',
'J'='Bretagne',
'K'='Loire',
'L'='Loire',
'M'='Loire',
'O0'='Garonne',
'O1'='Garonne',
'O2'='Garonne',
'O3'='Tarn-Aveyron',
'O4'='Tarn-Aveyron',
'O5'='Tarn-Aveyron',
'O6'='Tarn-Aveyron',
'O7'='Lot',
'O8'='Lot',
'O9'='Lot',
'S'="Fleuves côtiers de l'Adour-Garonne",
'U'='Saône',
'V'='Rhône',
'W'='Isère',
'Y'='Fleuves côtiers du Rhône-Méditérannée et Corse',
'Z'='Îles',
'1'='Guadeloupe',
'2'='Martinique',
'5'='Guyane',
'6'='Guyane',
'7'='Guyane',
'8'='Guyane',
'9'='Guyane',
## 2. SELECTION ______________________________________________________
### 2.1. Creation of selection _______________________________________
# Create a txt file that resume all the station data files present
# in a filedir
create_selection = function (computer_data_path, filedir, outname, optname='_HYDRO_QJM') {
outfile = file.path(computer_data_path, outname)
# Path to find the directory of desired codes
dir_path = file.path(computer_data_path, filedir)
# Create a filelist of all the filename in the above directory
filelist_tmp = list.files(dir_path)
# Create a filelist to store all station codes
codelist = c()
# For all the filename in the file list
# Extracts the station code
code = gsub("[^[:alnum:] ].*$", '', f)
df_file = tibble(code=codelist,
filename=paste(codelist, optname, '.txt',
sep=''),
write.table(df_file, outfile, sep=";", col.names=TRUE, quote=FALSE)
# Returns that it is done with the path
print('Done')
print(paste('path : ', outfile, sep=''))
print('example of file : ')
print(head(df_file))
}
# Example
# create_selection(
# "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
# "France207",
### 2.2. Agence de l'eau Adour-Garonne selection _____________________
# Gets the selection of station from the 'Liste-station_RRSE.docx' file
get_selection_AEAG = function (computer_data_path, listdir, listname,
cnames=c('code','station', 'BV_km2',
'axe_principal_concerne',
'longueur_serie', 'commentaires',
'choix'),
c_num=c('BV_km2', 'longueur_serie')) {
list_path = file.path(computer_data_path, listdir, listname)
sample_data = read_docx(list_path)
content = docx_summary(sample_data)
table_cells = content %>% filter(content_type == "table cell")
table_data = table_cells %>% filter(!is_header) %>% select(row_id,
cell_id,
text)
# Splits data into individual columns
splits = split(table_data, table_data$cell_id)
splits = lapply(splits, function(x) x$text)
# Combines columns back together in wide format
df_selec = bind_cols(splits)
# Removes the first line
df_selec$c = as.numeric(sub(",", ".",
pull(df_selec, c)))
selec = (df_selec$choix == 'A garder' | df_selec$choix == 'Ajout')
filename=paste(df_selec$code, '_HYDRO_QJM.txt',
sep=''),
ok=selec)
# "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
# "",
# "Liste-station_RRSE.docx",
# cnames=c('code','station',
# 'BV_km2',
# 'axe_principal_concerne',
# 'longueur_serie',
# 'commentaires',
# 'choix'),
# c_num=c('BV_km2',
# 'longueur_serie'))
### 2.3. INRAE selection _____________________________________________
# Gets the selection of station from the selection txt file generated
# by the 'create_selection' function
get_selection_INRAE = function (computer_data_path, listdir, listname) {
list_path = file.path(computer_data_path, listdir, listname)
df_selec = read.table(list_path,
header=TRUE,
encoding='UTF-8',
df_selec = tibble(code=as.character(df_selec$code),
filename=as.character(df_selec$filename),
ok=df_selec$ok)
return (df_selec)
}
# Example
# "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
# "",
## 3. EXTRACTION _____________________________________________________
# Extraction of metadata of stations
extract_meta = function (computer_data_path, filedir, filename,
verbose=TRUE) {
# Convert the filename in vector
# If the filename is 'all' or regroup more than one filename
# Create a filelist to store all the filename
# Get all the filename in the data directory selected
filelist_tmp = list.files(file.path(computer_data_path,
# For all the filename in the directory selected
# If the filename extention is 'txt'
# Store the filename in the filelist
# The filelist correspond to the filename
}
# Create a blank data frame
# For all the file in the filelist
# Concatenate by raw data frames created by this function
# when filename correspond to only one filename
extract_meta(computer_data_path,
# Set the rownames by default (to avoid strange numbering)
# Get the filename from the vector
filename = filename[1]
print(paste("extraction of BH meta for file :", filename))
file_path = file.path(computer_data_path, filedir, filename)
if (file.exists(file_path) & substr(file_path, nchar(file_path),
nchar(file_path)) != '/') {
metatxt = c(readLines(file_path, n=41, encoding="UTF-8"))
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
tibble(
# Station code
code=trimws(substr(metatxt[11], 38,
nchar(metatxt[11]))),
# Station name
nom=trimws(substr(metatxt[12], 39,
nchar(metatxt[12]))),
# Territory
territoire=trimws(substr(metatxt[13], 39,
nchar(metatxt[13]))),
# Administrator
gestionnaire=trimws(substr(metatxt[7], 60,
nchar(metatxt[7]))),
# Lambert 93 localisation
L93X_m_IN=as.numeric(substr(metatxt[16], 65, 77)),
L93X_m_BH=as.numeric(substr(metatxt[16], 38, 50)),
L93Y_m_IN=as.numeric(substr(metatxt[16], 79, 90)),
L93Y_m_BH=as.numeric(substr(metatxt[16], 52, 63)),
# Surface
surface_km2_IN=as.numeric(substr(metatxt[19], 52, 63)),
surface_km2_BH=as.numeric(substr(metatxt[19], 38, 50)),
# Elevation
altitude_m_IN=as.numeric(substr(metatxt[20], 52, 63)),
altitude_m_BH=as.numeric(substr(metatxt[20], 38, 50)),
# Start and end of the data
debut=substr(metatxt[25], 38, 50),
fin=substr(metatxt[25], 52, 63),
# Different other info about the flow quality and type
statut=iStatut[trimws(substr(metatxt[26], 38, 50))],
finalite=iFinalite[trimws(substr(metatxt[26], 52, 56))],
type=iType[trimws(substr(metatxt[26], 58, 58))],
influence=iInfluence[trimws(substr(metatxt[26], 60, 60))],
debit=iDebit[trimws(substr(metatxt[26], 62, 62))],
QBE=iQBE[trimws(substr(metatxt[26], 72, 72))],
QME=iQME[trimws(substr(metatxt[26], 74, 74))],
QHE=iQHE[trimws(substr(metatxt[26], 76, 76))],
# The path to the data file of BH
file_path=file_path)
Ltmp = names(iRegHydro)[nchar(names(iRegHydro)) == 2]
Ltmp = substr(Ltmp, 1, 1)
infoSecteur = rle(sort(Ltmp))$values
oneL = substr(df_meta$code, 1, 1)
twoL = substr(df_meta$code, 1, 2)
RH = c()
for (i in 1:length(oneL)) {
if (oneL[i] %in% infoSecteur) {
RHtmp = iRegHydro[twoL[i]]
} else {
RHtmp = iRegHydro[oneL[i]]
}
RH = c(RH, RHtmp)
}
} else {
print(paste('filename', file_path, 'do not exist'))
return (NULL)
}
# "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
# "BanqueHydro_Export2021",
# c('H5920011_HYDRO_QJM.txt', 'K4470010_HYDRO_QJM.txt'))
### 3.2. Extraction of data __________________________________________
extract_data = function (computer_data_path, filedir, filename,
verbose=TRUE) {
# If the filename is 'all' or regroup more than one filename
# Create a filelist to store all the filename
# Get all the filename in the data directory selected
filelist_tmp = list.files(file.path(computer_data_path,
# For all the filename in the directory selected
# If the filename extention is 'txt'
# Store the filename in the filelist
# For all the file in the filelist
# Concatenate by raw data frames created by this function
# when filename correspond to only one filename
extract_data(computer_data_path,
# Set the rownames by default (to avoid strange numbering)
# Get the filename from the vector
print(paste("extraction of BH data for file :", filename))
file_path = file.path(computer_data_path, filedir, filename)
if (file.exists(file_path) & substr(file_path, nchar(file_path),
nchar(file_path)) != '/') {
# Extract the data as a data frame
df_data = read.table(file_path,
header=TRUE,
na.strings=c(' -99', ' -99.000'),
sep=';',
df_meta = extract_meta(computer_data_path, filedir, filename,
verbose=FALSE)
# Create a tibble with the date as Date class and the code
# of the station
df_data = tibble(Date=as.Date(as.character(df_data$Date),
format="%Y%m%d"),
Qmmj=df_data$Qmmj,
val_H=as.character(df_data$val_H),
val_I=as.character(df_data$val_I),
code=code)
return (df_data)
} else {
print(paste('filename', file_path, 'do not exist'))
return (NULL)
}
# "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
# '',
# c('H5920011_HYDRO_QJM.txt', 'K4470010_HYDRO_QJM.txt'))
## 4. SHAPEFILE MANAGEMENT ___________________________________________
# Generates a list of shapefiles to draw a hydrological map of
# the France
ini_shapefile = function (resources_path, fr_shpdir, fr_shpname, bs_shpdir, bs_shpname, sbs_shpdir, sbs_shpname, rv_shpdir, rv_shpname, is_river=TRUE) {
fr_shppath = file.path(resources_path, fr_shpdir, fr_shpname)
bs_shppath = file.path(resources_path, bs_shpdir, bs_shpname)
sbs_shppath = file.path(resources_path, sbs_shpdir, sbs_shpname)
rv_shppath = file.path(resources_path, rv_shpdir, rv_shpname)
# France
fr_spdf = readOGR(dsn=fr_shppath, verbose=FALSE)
proj4string(fr_spdf) = CRS("+proj=longlat +ellps=WGS84")
# Transformation in Lambert93
france = spTransform(fr_spdf, CRS("+init=epsg:2154"))
df_france = tibble(fortify(france))
bassin = readOGR(dsn=bs_shppath, verbose=FALSE)
df_bassin = tibble(fortify(bassin))
# Hydrological basin
subbassin = readOGR(dsn=sbs_shppath, verbose=FALSE)
df_subbassin = tibble(fortify(subbassin))
river = readOGR(dsn=rv_shppath, verbose=FALSE) ### trop long ###
river = river[which(river$Classe == 1),]
df_river = tibble(fortify(river))
} else {
df_river = NULL
}
return (list(france=df_france, bassin=df_bassin, subbassin=df_subbassin, river=df_river))