# Useful libraries
library(tools)
library(dplyr)
library(officer)


# General information on station ----
# Lookup tables translating the one-character codes read from the header of
# a BanqueHydro file into their (French) labels. They are indexed by the
# code as a character string, e.g. iStatut["1"].
iStatut <- c(
  "0" = "inconnu",
  "1" = "station avec signification hydrologique",
  "2" = "station sans signification hydrologique",
  "3" = "station d'essai"
)

iFinalite <- c(
  "0" = "inconnue",
  "1" = "hydrométrie générale",
  "2" = "alerte de crue",
  "3" = "hydrométrie générale et alerte de crue",
  "4" = "gestion d'ouvrage",
  "5" = "police des eaux",
  "6" = "suivi d'étiage",
  "7" = "bassin expérimental",
  "8" = "drainage"
)

iType <- c(
  "0" = "inconnu",
  "1" = "une échelle",
  "2" = "deux échelles, station mère",
  "3" = "deux échelles, station fille",
  "4" = "débits mesurés",
  "5" = "virtuelle"
)

iInfluence <- c(
  "0" = "inconnue",
  "1" = "nulle ou faible",
  "2" = "en étiage seulement",
  "3" = "forte en toute saison"
)

iDebit <- c(
  "0" = "reconstitué",
  "1" = "réel (prise en compte de l'eau rajoutée ou retirée du bassin selon aménagements)",
  "2" = "naturel"
)

iQBE <- c(
  "0" = "qualité basses eaux inconnue",
  "1" = "qualité basses eaux bonne",
  "2" = "qualité basses eaux douteuse"
)

iQME <- c(
  "0" = "qualité moyennes eaux inconnue",
  "1" = "qualité moyennes eaux bonne",
  "2" = "qualité moyennes eaux douteuse"
)

iQHE <- c(
  "0" = "qualité hautes eaux inconnue",
  "1" = "qualité hautes eaux bonne",
  "2" = "qualité hautes eaux douteuse"
)


# Selection of stations ----

# Get the selection of data from the 'Liste-station_RRSE' file.
#
# Reads the table cells of a .docx document, rebuilds the table in wide
# format (one column per cell_id), drops its first row and names the
# columns with `cnames`.
#
# Arguments:
#   computer_data_path, listdir, listname: joined with file.path() to locate
#     the .docx file.
#   cnames: names given to the columns of the table, in order. Must have one
#     entry per table column.
#   c_num: names (taken from `cnames`) of the columns to convert to numeric;
#     a decimal comma is replaced by a dot before conversion.
#
# Returns a tibble with the columns `cnames`, plus:
#   filename: the station code followed by '_HYDRO_QJM.txt'
#   ok: TRUE when the 'choix' column is 'A garder' or 'Ajout'
get_selection <- function(computer_data_path, listdir, listname,
                          cnames = c("code", "station",
                                     "BV_km2",
                                     "axe_principal_concerne",
                                     "longueur_serie",
                                     "commentaires",
                                     "choix"),
                          c_num = c("BV_km2",
                                    "longueur_serie")) {

  # Get the file path to the data
  list_path <- file.path(computer_data_path, listdir, listname)

  # Read the document and keep only the non-header table cells
  content <- docx_summary(read_docx(list_path))
  table_data <- content %>%
    filter(content_type == "table cell") %>%
    filter(!is_header) %>%
    select(row_id, cell_id, text)

  # Split data into individual columns
  splits <- split(table_data, table_data$cell_id)
  splits <- lapply(splits, function(x) x$text)

  # Combine columns back together in wide format
  df_selec <- bind_cols(splits)
  # The first row is discarded (presumably a title row that the document
  # does not flag as a header - confirm against the .docx file)
  df_selec <- df_selec[-1, ]

  if (ncol(df_selec) != length(cnames)) {
    stop("the table in ", list_path, " has ", ncol(df_selec),
         " columns but 'cnames' has ", length(cnames), " names",
         call. = FALSE)
  }
  names(df_selec) <- cnames

  # Convert the requested columns to numeric. `[[` is required here:
  # `df_selec$col` would create a column literally named 'col' instead of
  # modifying the column whose name is stored in `col`.
  for (col in c_num) {
    df_selec[[col]] <- as.numeric(sub(",", ".", df_selec[[col]]))
  }

  # %in% returns FALSE (not NA) for a missing choice
  selec <- df_selec$choix %in% c("A garder", "Ajout")

  bind_cols(df_selec,
            filename = paste0(df_selec$code, "_HYDRO_QJM.txt"),
            ok = selec)
}

# Example
# df_selec = get_selection(
#     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
#     "",
#     "Liste-station_RRSE.docx",
#     cnames=c('code','station',
#              'BV_km2',
#              'axe_principal_concerne',
#              'longueur_serie',
#              'commentaires',
#              'choix'),
#     c_num=c('BV_km2',
#             'longueur_serie'))


# Shared helpers ----

# Resolve the `filename` argument of extractBH_info() / extractBH_data()
# into the vector of files to process: every file with a 'txt' extension
# found in the data directory when `filename` is 'all', otherwise
# `filename` itself.
.resolve_BH_filelist <- function(computer_data_path, filedir, filename) {
  if (all(filename == "all")) {
    candidates <- list.files(file.path(computer_data_path, filedir))
    return(candidates[file_ext(candidates) == "txt"])
  }
  filename
}

# Apply `extract_one` (a function of a single filename returning a data
# frame or NULL) to every file of `filelist` and concatenate the results
# by row. An empty data frame is returned when nothing was extracted.
.bind_BH_files <- function(filelist, extract_one) {
  frames <- lapply(filelist, extract_one)
  # A single rbind() call avoids re-copying the accumulated data frame for
  # every file. Base rbind() is kept so that the result is built the same
  # way as a file-by-file concatenation; NULL entries (missing files) are
  # ignored.
  df_all <- do.call(rbind, c(list(data.frame()), unname(frames)))
  # Set the rownames by default (to avoid strange numbering)
  rownames(df_all) <- NULL
  df_all
}


# Extraction of information ----

# Extract the station metadata stored in the header of BanqueHydro files.
#
# Arguments:
#   computer_data_path, filedir: joined with file.path() to locate the data
#     directory.
#   filename: a single file name, a vector of file names, or 'all' to
#     process every 'txt' file of the data directory.
#   verbose: if TRUE, print the name of each file being processed.
#
# Returns a tibble with one row per file and the columns code, nom,
# territoire, L93X, L93Y, surface_km2, statut, finalite, type, influence,
# debit, QBE, QME, QHE and file_path. For a single file that does not
# exist, a warning is raised and NULL is returned.
extractBH_info <- function(computer_data_path, filedir, filename,
                           verbose = TRUE) {

  # Convert the filename in vector
  filename <- c(filename)

  # If the filename is 'all' or regroups more than one filename, process
  # each file individually and concatenate the results by row
  if (all(filename == "all") || length(filename) > 1) {
    filelist <- .resolve_BH_filelist(computer_data_path, filedir, filename)
    return(.bind_BH_files(filelist, function(f) {
      extractBH_info(computer_data_path, filedir, f, verbose = verbose)
    }))
  }

  # Get the filename from the vector
  filename <- filename[1]

  # Print information if asked
  if (verbose) {
    print(paste("extraction of info for file :", filename))
  }

  # Get the file path to the data
  file_path <- file.path(computer_data_path, filedir, filename)

  if (!file.exists(file_path)) {
    warning(paste("filename", file_path, "do not exist"), call. = FALSE)
    return(NULL)
  }

  # Extract all the header (the first 41 lines of the file)
  infotxt <- readLines(file_path, n = 41)

  # Text from column `start` to the end of header line `i`, trimmed
  line_tail <- function(i, start) {
    trimws(substr(infotxt[i], start, nchar(infotxt[i])))
  }
  # Trimmed fixed-width field of the header line 26, which holds the
  # one-character codes looked up in the tables defined above
  code_field <- function(start, stop) {
    trimws(substr(infotxt[26], start, stop))
  }

  # Create a tibble with all the information needed
  tibble(
    code = line_tail(11, 38),
    nom = line_tail(12, 39),
    territoire = line_tail(13, 39),
    L93X = as.numeric(substr(infotxt[16], 38, 50)),
    L93Y = as.numeric(substr(infotxt[16], 52, 63)),
    surface_km2 = as.numeric(substr(infotxt[19], 38, 50)),
    statut = iStatut[code_field(38, 50)],
    finalite = iFinalite[code_field(52, 56)],
    type = iType[code_field(58, 58)],
    influence = iInfluence[code_field(60, 60)],
    debit = iDebit[code_field(62, 62)],
    QBE = iQBE[code_field(72, 72)],
    QME = iQME[code_field(74, 74)],
    QHE = iQHE[code_field(76, 76)],
    file_path = file_path
  )
}

# Example
# df_info = extractBH_info(
#     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
#     '',
#     c('H5920011_HYDRO_QJM.txt', 'K4470010_HYDRO_QJM.txt'))


# Extraction of data ----

# Extract the time series stored in BanqueHydro files.
#
# Arguments:
#   computer_data_path, filedir: joined with file.path() to locate the data
#     directory.
#   filename: a single file name, a vector of file names, or 'all' to
#     process every 'txt' file of the data directory.
#   verbose: if TRUE, print the name of each file being processed.
#
# Returns a tibble with the columns Date (Date class), Qm3s (the 'Qls'
# column multiplied by 1E-3), every column of the file after the first
# two, and code (the station code read from the file header). For a single
# file that does not exist, a warning is raised and NULL is returned.
extractBH_data <- function(computer_data_path, filedir, filename,
                           verbose = TRUE) {

  # Convert the filename in vector
  filename <- c(filename)

  # If the filename is 'all' or regroups more than one filename, process
  # each file individually and concatenate the results by row
  if (all(filename == "all") || length(filename) > 1) {
    filelist <- .resolve_BH_filelist(computer_data_path, filedir, filename)
    return(.bind_BH_files(filelist, function(f) {
      extractBH_data(computer_data_path, filedir, f, verbose = verbose)
    }))
  }

  # Get the filename from the vector
  filename <- filename[1]

  # Print information if asked
  if (verbose) {
    print(paste("extraction of data for file :", filename))
  }

  # Get the file path to the data
  file_path <- file.path(computer_data_path, filedir, filename)

  if (!file.exists(file_path)) {
    warning(paste("filename", file_path, "do not exist"), call. = FALSE)
    return(NULL)
  }

  # Extract the data as a data frame, skipping the 41 header lines
  df_raw <- read.table(file_path,
                       header = TRUE,
                       na.strings = c("     -99", " -99.000"),
                       sep = ";",
                       skip = 41)

  # Extract all the information for the station to get its code
  df_info <- extractBH_info(computer_data_path, filedir, filename,
                            verbose = FALSE)
  code <- df_info$code

  # Create a tibble with the date as Date class and the code of the
  # station. The first two columns of the file are replaced by Date and
  # Qm3s; the remaining ones are kept as they are.
  tibble(
    Date = as.Date(as.character(df_raw$Date), format = "%Y%m%d"),
    Qm3s = df_raw$Qls * 1E-3,
    df_raw[-1:-2],
    code = code
  )
}

# Example
# df_data = extractBH_data(
#     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
#     '',
#     c('H5920011_HYDRO_QJM.txt', 'K4470010_HYDRO_QJM.txt'))