extract.R 19.75 KiB
# \\\
# Copyright 2021-2022 Louis Héraut*1
# *1   INRAE, France
#      louis.heraut@inrae.fr
# This file is part of ash R toolbox.
# ash R toolbox is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# ash R toolbox is distributed in the hope that it will be useful, but 
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with ash R toolbox.  If not, see <https://www.gnu.org/licenses/>.
# ///
# processing/extract.R
# Regroups functions to generation of generical station selection
# according to pre-existing directory of data or specific other
# selection format as '.docx' file from Agence de l'eau Adour-Garonne.
# Also useful to extract the data and the metadata present in the
# Banque Hydro files from a selection of station. Manages also
# shapefiles loading.
# Usefull library
library(tools)
library(dplyr)
library(officer)
### 1. GENERAL METADATA ON STATION ___________________________________
# Status of the station
iStatut = c('0'='inconnu', 
            '1'='station avec signification hydrologique', 
            '2'='station sans signification hydrologique', 
            '3'="station d'essai")
# Goal
iFinalite = c('0'='inconnue', 
              '1'="hydrométrie générale", 
              '2'='alerte de crue', 
              '3'="hydrométrie générale et alerte de crue",
              '4'="gestion d'ouvrage", 
              '5'='police des eaux', 
              '6'="suivi d'étiage", 
              '7'='bassin expérimental', 
              '8'='drainage')
# Type of measure
iType = c('0'='inconnu',
          '1'='une échelle',
          '2'='deux échelles, station mère',
          '3'='deux échelles, station fille',
          '4'='débits mesurés',
          '5'='virtuelle')
# Influence of the flow
iInfluence = c('0'='inconnue',
               '1'='nulle ou faible',
               '2'='en étiage seulement',
               '3'='forte en toute saison')
7172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
# Type of flow iDebit = c('0'='reconstitué', '1'=paste("réel (prise en compte de l'eau rajoutée ", "ou retirée du bassin selon aménagements)", sep=''), '2'='naturel') # Quality of low water flow iQBE = c('0'='qualité basses eaux inconnue', '1'='qualité basses eaux bonne', '2'='qualité basses eaux douteuse') # Quality of mean water flow iQME = c('0'='qualité moyennes eaux inconnue', '1'='qualité moyennes eaux bonne', '2'='qualité moyennes eaux douteuse') # Quality of high water flow iQHE = c('0'='qualité hautes eaux inconnue', '1'='qualité hautes eaux bonne', '2'='qualité hautes eaux douteuse') # Hydrological region iRegHydro = c('D'='Affluents du Rhin', 'E'="Fleuves côtiers de l'Artois-Picardie", 'A'='Rhin', 'B'='Meuse', 'F'='Seine aval (Marne incluse)', 'G'='Fleuves côtiers haut normands', 'H'='Seine amont', 'I'='Fleuves côtiers bas normands', 'J'='Bretagne', 'K'='Loire', 'L'='Loire', 'M'='Loire', 'N'='Fleuves côtiers au sud de la Loire', 'O0'='Garonne', 'O1'='Garonne', 'O2'='Garonne', 'O3'='Tarn-Aveyron', 'O4'='Tarn-Aveyron', 'O5'='Tarn-Aveyron', 'O6'='Tarn-Aveyron', 'O7'='Lot', 'O8'='Lot', 'O9'='Lot', 'P'='Dordogne', 'Q'='Adour', 'R'='Charente', 'S'="Fleuves côtiers de l'Adour-Garonne", 'U'='Saône', 'V'='Rhône', 'W'='Isère', 'X'='Durance', 'Y'='Fleuves côtiers du Rhône-Méditérannée et Corse', 'Z'='Îles', '1'='Guadeloupe', '2'='Martinique', '5'='Guyane', '6'='Guyane', '7'='Guyane', '8'='Guyane', '9'='Guyane', '4'='Réunion') ## 2. SELECTION ______________________________________________________ ### 2.1. Creation of selection _______________________________________ # Create a txt file that resume all the station data files present
141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
# in a filedir create_selection = function (computer_data_path, filedir, outname, optname='_HYDRO_QJM') { # Out file for store results outfile = file.path(computer_data_path, outname) # Path to find the directory of desired codes dir_path = file.path(computer_data_path, filedir) # Create a filelist of all the filename in the above directory filelist_tmp = list.files(dir_path) # Create a filelist to store all station codes codelist = c() # For all the filename in the file list for (f in filelist_tmp) { # If the filename is a 'txt' file if (file_ext(f) == 'txt') { # Extracts the station code code = gsub("[^[:alnum:] ].*$", '', f) # Then the station code is stored codelist = c(codelist, code) } } # Create a tibble to store the data to write df_file = tibble(code=codelist, filename=paste(codelist, optname, '.txt', sep=''), ok=TRUE) # Write the data in a txt file write.table(df_file, outfile, sep=";", col.names=TRUE, quote=FALSE) # Returns that it is done with the path print('Done') print(paste('path : ', outfile, sep='')) print('example of file : ') print(head(df_file)) } # Example # create_selection( # "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data", # "France207", # "INRAE_selection.txt") ### 2.2. Agence de l'eau Adour-Garonne selection _____________________ # Gets the selection of station from the 'Liste-station_RRSE.docx' file get_selection_AEAG = function (computer_data_path, listdir, listname, cnames=c('code','station', 'BV_km2', 'axe_principal_concerne', 'longueur_serie', 'commentaires', 'choix'), c_num=c('BV_km2', 'longueur_serie')) { # Gets the file path to the data list_path = file.path(computer_data_path, listdir, listname) # Reads and formats the docx file sample_data = read_docx(list_path) content = docx_summary(sample_data) table_cells = content %>% filter(content_type == "table cell") table_data = table_cells %>% filter(!is_header) %>% select(row_id, cell_id, text) # Splits data into individual columns splits = split(table_data, table_data$cell_id) splits = lapply(splits, function(x) x$text) # Combines columns back together in wide format df_selec = bind_cols(splits) # Removes the first line df_selec = df_selec[-1,] # Changes the columns name
211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280
names(df_selec) = cnames # For all the numerical column for (c in c_num) { # Convert them as numeric df_selec$c = as.numeric(sub(",", ".", pull(df_selec, c))) } # Perfoms the selection according to the column of choice selec = (df_selec$choix == 'A garder' | df_selec$choix == 'Ajout') # Stores it in the tibble of selection df_selec = bind_cols(df_selec, filename=paste(df_selec$code, '_HYDRO_QJM.txt', sep=''), ok=selec) return (df_selec) } # Example # df_selec_AEAG = get_selection_AEAG( # "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data", # "", # "Liste-station_RRSE.docx", # cnames=c('code','station', # 'BV_km2', # 'axe_principal_concerne', # 'longueur_serie', # 'commentaires', # 'choix'), # c_num=c('BV_km2', # 'longueur_serie')) ### 2.3. INRAE selection _____________________________________________ # Gets the selection of station from the selection txt file generated # by the 'create_selection' function get_selection_INRAE = function (computer_data_path, listdir, listname) { # Gets the file path to the data list_path = file.path(computer_data_path, listdir, listname) # Extracts the data as a data frame df_selec = read.table(list_path, header=TRUE, encoding='UTF-8', sep=';') # Stores it in the tibble of selection df_selec = tibble(code=as.character(df_selec$code), filename=as.character(df_selec$filename), ok=df_selec$ok) return (df_selec) } # Example # df_selec_INRAE = get_selection_INRAE( # "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data", # "", # "INRAE_selection.txt") ## 3. EXTRACTION _____________________________________________________ ### 3.1. Extraction of metadata # Extraction of metadata of stations extract_meta = function (computer_data_path, filedir, filename, verbose=TRUE) { # Convert the filename in vector filename = c(filename) # If the filename is 'all' or regroup more than one filename if (all(filename == 'all') | length(filename) > 1) { # If the filename is 'all'
281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
if (all(filename == 'all')) { # Create a filelist to store all the filename filelist = c() # Get all the filename in the data directory selected filelist_tmp = list.files(file.path(computer_data_path, filedir)) # For all the filename in the directory selected for (f in filelist_tmp) { # If the filename extention is 'txt' if (file_ext(f) == 'txt') { # Store the filename in the filelist filelist = c(filelist, f) } } # If the filename regroup more than one filename } else if (length(filename > 1)) { # The filelist correspond to the filename filelist = filename } # Create a blank data frame df_meta = data.frame() # For all the file in the filelist for (f in filelist) { # Concatenate by raw data frames created by this function # when filename correspond to only one filename df_meta = rbind(df_meta, extract_meta(computer_data_path, filedir, f)) } # Set the rownames by default (to avoid strange numbering) rownames(df_meta) = NULL return (df_meta) } # Get the filename from the vector filename = filename[1] # Print metadata if asked if (verbose) { print(paste("extraction of BH meta for file :", filename)) } # Get the file path to the data file_path = file.path(computer_data_path, filedir, filename) if (file.exists(file_path) & substr(file_path, nchar(file_path), nchar(file_path)) != '/') { # Extract all the header metatxt = c(readLines(file_path, n=41, encoding="UTF-8")) # Create a tibble with all the metadata needed # (IN for INRAE data and BH for BH data) df_meta = tibble( # Station code code=trimws(substr(metatxt[11], 38, nchar(metatxt[11]))), # Station name nom=trimws(substr(metatxt[12], 39, nchar(metatxt[12]))), # Territory territoire=trimws(substr(metatxt[13], 39, nchar(metatxt[13]))), # Administrator gestionnaire=trimws(substr(metatxt[7], 60, nchar(metatxt[7]))),
351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420
# Lambert 93 localisation L93X_m_IN=as.numeric(substr(metatxt[16], 65, 77)), L93X_m_BH=as.numeric(substr(metatxt[16], 38, 50)), L93Y_m_IN=as.numeric(substr(metatxt[16], 79, 90)), L93Y_m_BH=as.numeric(substr(metatxt[16], 52, 63)), # Surface surface_km2_IN=as.numeric(substr(metatxt[19], 52, 63)), surface_km2_BH=as.numeric(substr(metatxt[19], 38, 50)), # Elevation altitude_m_IN=as.numeric(substr(metatxt[20], 52, 63)), altitude_m_BH=as.numeric(substr(metatxt[20], 38, 50)), # Start and end of the data debut=substr(metatxt[25], 38, 50), fin=substr(metatxt[25], 52, 63), # Different other info about the flow quality and type statut=iStatut[trimws(substr(metatxt[26], 38, 50))], finalite=iFinalite[trimws(substr(metatxt[26], 52, 56))], type=iType[trimws(substr(metatxt[26], 58, 58))], influence=iInfluence[trimws(substr(metatxt[26], 60, 60))], debit=iDebit[trimws(substr(metatxt[26], 62, 62))], QBE=iQBE[trimws(substr(metatxt[26], 72, 72))], QME=iQME[trimws(substr(metatxt[26], 74, 74))], QHE=iQHE[trimws(substr(metatxt[26], 76, 76))], # The path to the data file of BH file_path=file_path) Ltmp = names(iRegHydro)[nchar(names(iRegHydro)) == 2] Ltmp = substr(Ltmp, 1, 1) infoSecteur = rle(sort(Ltmp))$values oneL = substr(df_meta$code, 1, 1) twoL = substr(df_meta$code, 1, 2) RH = c() for (i in 1:length(oneL)) { if (oneL[i] %in% infoSecteur) { RHtmp = iRegHydro[twoL[i]] } else { RHtmp = iRegHydro[oneL[i]] } RH = c(RH, RHtmp) } # Adding of the hydrological region df_meta$region_hydro = RH return (df_meta) } else { print(paste('filename', file_path, 'do not exist')) return (NULL) } } # Example # df_meta = extract_meta( # "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data", # "BanqueHydro_Export2021", # c('H5920011_HYDRO_QJM.txt', 'K4470010_HYDRO_QJM.txt')) ### 3.2. Extraction of data __________________________________________ # Extraction of data extract_data = function (computer_data_path, filedir, filename, verbose=TRUE) { # Convert the filename in vector filename = c(filename)
421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490
# If the filename is 'all' or regroup more than one filename if (all(filename == 'all') | length(filename) > 1) { # If the filename is 'all' if (all(filename == 'all')) { # Create a filelist to store all the filename filelist = c() # Get all the filename in the data directory selected filelist_tmp = list.files(file.path(computer_data_path, filedir)) # For all the filename in the directory selected for (f in filelist_tmp) { # If the filename extention is 'txt' if (file_ext(f) == 'txt') { # Store the filename in the filelist filelist = c(filelist, f) } } # If the filename regroup more than one filename } else if (length(filename > 1)) { # The filelist correspond to the filename filelist = filename } # Create a blank data frame df_data = data.frame() # For all the file in the filelist for (f in filelist) { # Concatenate by raw data frames created by this function # when filename correspond to only one filename df_data = rbind(df_data, extract_data(computer_data_path, filedir, f)) } # Set the rownames by default (to avoid strange numbering) rownames(df_data) = NULL return (df_data) } # Get the filename from the vector filename = filename[1] # Print metadata if asked if (verbose) { print(paste("extraction of BH data for file :", filename)) } # Get the file path to the data file_path = file.path(computer_data_path, filedir, filename) if (file.exists(file_path) & substr(file_path, nchar(file_path), nchar(file_path)) != '/') { # Extract the data as a data frame df_data = read.table(file_path, header=TRUE, na.strings=c(' -99', ' -99.000'), sep=';', skip=41) # Extract all the metadata for the station df_meta = extract_meta(computer_data_path, filedir, filename, verbose=FALSE) # Get the code of the station code = df_meta$code # Create a tibble with the date as Date class and the code # of the station df_data = tibble(Date=as.Date(as.character(df_data$Date), format="%Y%m%d"),
491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547
Value=df_data$Qls * 1E-3, Qmmj=df_data$Qmmj, val_H=as.character(df_data$val_H), val_I=as.character(df_data$val_I), code=code) return (df_data) } else { print(paste('filename', file_path, 'do not exist')) return (NULL) } } # Example # df_data = extract_data( # "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data", # '', # c('H5920011_HYDRO_QJM.txt', 'K4470010_HYDRO_QJM.txt')) ## 4. SHAPEFILE MANAGEMENT ___________________________________________ # Generates a list of shapefiles to draw a hydrological map of # the France ini_shapefile = function (resources_path, fr_shpdir, fr_shpname, bs_shpdir, bs_shpname, sbs_shpdir, sbs_shpname, rv_shpdir, rv_shpname, show_river=TRUE) { # Path for shapefile fr_shppath = file.path(resources_path, fr_shpdir, fr_shpname) bs_shppath = file.path(resources_path, bs_shpdir, bs_shpname) sbs_shppath = file.path(resources_path, sbs_shpdir, sbs_shpname) rv_shppath = file.path(resources_path, rv_shpdir, rv_shpname) # France fr_spdf = readOGR(dsn=fr_shppath, verbose=FALSE) proj4string(fr_spdf) = CRS("+proj=longlat +ellps=WGS84") # Transformation in Lambert93 france = spTransform(fr_spdf, CRS("+init=epsg:2154")) df_france = tibble(fortify(france)) # Hydrological basin bassin = readOGR(dsn=bs_shppath, verbose=FALSE) df_bassin = tibble(fortify(bassin)) # Hydrological basin subbassin = readOGR(dsn=sbs_shppath, verbose=FALSE) df_subbassin = tibble(fortify(subbassin)) # If the river shapefile needs to be load if (show_river) { # Hydrographic network river = readOGR(dsn=rv_shppath, verbose=FALSE) ### trop long ### river = river[which(river$Classe == 1),] df_river = tibble(fortify(river)) } else { df_river = NULL } return (list(france=df_france, bassin=df_bassin, subbassin=df_subbassin, river=df_river)) }