# Useful libraries
library(tools)
library(dplyr)
library(officer)


# General metadata on stations ----
#
# Each lookup table below is a named character vector: the names are the
# codes found in the data files and the values are the human-readable
# (French) labels.
# NOTE(review): the accented characters of these labels had been replaced
# by U+FFFD replacement characters; they are restored here to the French
# letters they stand for.

# Station status
iStatut <- c(
  "0" = "inconnu",
  "1" = "station avec signification hydrologique",
  "2" = "station sans signification hydrologique",
  "3" = "station d'essai"
)

# Purpose of the station
iFinalite <- c(
  "0" = "inconnue",
  "1" = "hydrométrie générale",
  "2" = "alerte de crue",
  "3" = "hydrométrie générale et alerte de crue",
  "4" = "gestion d'ouvrage",
  "5" = "police des eaux",
  "6" = "suivi d'étiage",
  "7" = "bassin expérimental",
  "8" = "drainage"
)

# Type of station
iType <- c(
  "0" = "inconnu",
  "1" = "une échelle",
  "2" = "deux échelles, station mère",
  "3" = "deux échelles, station fille",
  "4" = "débits mesurés",
  "5" = "virtuelle"
)

# Degree of influence on the flow
iInfluence <- c(
  "0" = "inconnue",
  "1" = "nulle ou faible",
  "2" = "en étiage seulement",
  "3" = "forte en toute saison"
)

# Nature of the discharge
iDebit <- c(
  "0" = "reconstitué",
  "1" = "réel (prise en compte de l'eau rajoutée ou retirée du bassin selon aménagements)",
  "2" = "naturel"
)

# Quality of low-water measurements
iQBE <- c(
  "0" = "qualité basses eaux inconnue",
  "1" = "qualité basses eaux bonne",
  "2" = "qualité basses eaux douteuse"
)

# Quality of mean-water measurements
iQME <- c(
  "0" = "qualité moyennes eaux inconnue",
  "1" = "qualité moyennes eaux bonne",
  "2" = "qualité moyennes eaux douteuse"
)

# Quality of high-water measurements
iQHE <- c(
  "0" = "qualité hautes eaux inconnue",
  "1" = "qualité hautes eaux bonne",
  "2" = "qualité hautes eaux douteuse"
)

# Hydrological region, keyed by the first character of the station code
iRegHydro <- c(
  "D" = "Affluents du Rhin",
  "E" = "Fleuves côtiers de l'Artois-Picardie",
  "A" = "Rhin",
  "B" = "Meuse",
  "F" = "Seine aval (Marne incluse)",
  "G" = "Fleuves côtiers haut normands",
  "H" = "Seine amont",
  "I" = "Fleuves côtiers bas normands",
  "J" = "Bretagne",
  "K" = "Loire",
  "L" = "Loire",
  "M" = "Loire",
  "N" = "Fleuves côtiers au sud de la Loire",
  "O" = "Garonne",
  "P" = "Dordogne",
  "Q" = "Adour",
  "R" = "Charente",
  "S" = "Fleuves côtiers de l'Adour-Garonne",
  "U" = "Saône",
  "V" = "Rhône",
  "W" = "Isère",
  "X" = "Durance",
  "Y" = "Fleuves côtiers du Rhône-Méditérannée et Corse",
  "Z" = "Îles",
  "1" = "Guadeloupe",
  "2" = "Martinique",
  "5" = "Guyane",
  "6" = "Guyane",
  "7" = "Guyane",
  "8" = "Guyane",
  "9" = "Guyane",
  "4" = "Réunion"
)


# Build a selection file from the '.txt' files found in a directory.
#
# Arguments:
#   computer_data_path  root directory of the data
#   filedir             sub-directory (relative to the root) to scan
#   outname             name of the selection file, written in the root
#
# One row is written per '.txt' file with the columns 'code' (file name
# without its '.txt' extension), 'filename' (code followed by
# '_HYDRO_QJM.txt') and 'ok' (always TRUE). The table is written with
# write.table() using ';' as separator. Returns NULL.
create_selection <- function(computer_data_path, filedir, outname) {

  outfile <- file.path(computer_data_path, outname)
  dir_path <- file.path(computer_data_path, filedir)

  # Keep only the files whose extension is 'txt'
  filelist_tmp <- list.files(dir_path)
  txt_files <- filelist_tmp[file_ext(filelist_tmp) == "txt"]

  # Strip the extension only: the pattern is anchored and the dot escaped
  # so that nothing else in the file name can be removed by accident
  codelist <- sub("\\.txt$", "", txt_files)

  df_file <- tibble(
    code = codelist,
    filename = paste0(codelist, "_HYDRO_QJM.txt"),
    ok = TRUE
  )

  write.table(df_file, outfile, sep = ";", col.names = TRUE, quote = FALSE)

  return(NULL)
}

# Example
# create_selection(
#     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
#     "France207",
#     "nival_selection.txt")


# Get the selection of data from the 'Liste-station_RRSE' docx file.
#
# Arguments:
#   computer_data_path  root directory of the data
#   listdir             sub-directory (relative to the root) of the list
#   listname            name of the docx file holding the station table
#   cnames              column names given to the extracted table
#   c_num               names of the columns to convert to numeric
#                       (a decimal comma is turned into a decimal point)
#
# Returns the table read from the document with two extra columns:
# 'filename' (code followed by '_HYDRO_QJM.txt') and 'ok', which is TRUE
# when 'choix' is 'A garder' or 'Ajout'.
get_selection_AG <- function(computer_data_path, listdir, listname,
                             cnames = c("code", "station", "BV_km2",
                                        "axe_principal_concerne",
                                        "longueur_serie", "commentaires",
                                        "choix"),
                             c_num = c("BV_km2", "longueur_serie")) {

  # Get the file path to the data
  list_path <- file.path(computer_data_path, listdir, listname)

  # Read the document and keep the non-header table cells
  sample_data <- read_docx(list_path)
  content <- docx_summary(sample_data)

  table_cells <- content %>%
    filter(content_type == "table cell")
  table_data <- table_cells %>%
    filter(!is_header) %>%
    select(row_id, cell_id, text)

  # Split data into individual columns (one per cell_id)
  splits <- split(table_data, table_data$cell_id)
  splits <- lapply(splits, function(x) x$text)

  # Combine columns back together in wide format and drop the first row
  df_selec <- bind_cols(splits)
  df_selec <- df_selec[-1, ]

  # Change the column names
  names(df_selec) <- cnames

  # Convert the requested columns to numeric. '[[' is required here:
  # 'df_selec$col' would target a column literally named 'col' instead of
  # the column whose name is stored in the loop variable.
  for (col in c_num) {
    df_selec[[col]] <- as.numeric(
      sub(",", ".", df_selec[[col]], fixed = TRUE)
    )
  }

  # A station is selected when it is flagged as kept or added
  selec <- (df_selec$choix == "A garder" | df_selec$choix == "Ajout")

  df_selec <- bind_cols(
    df_selec,
    filename = paste0(df_selec$code, "_HYDRO_QJM.txt"),
    ok = selec
  )

  return(df_selec)
}

# Example
# NOTE(review): unlike the other examples of this file, this call is not
# commented out, so it runs whenever the file is sourced.
df_selec_AG <- get_selection_AG(
  "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
  "",
  "Liste-station_RRSE.docx",
  cnames = c("code", "station", "BV_km2", "axe_principal_concerne",
             "longueur_serie", "commentaires", "choix"),
  c_num = c("BV_km2", "longueur_serie")
)
# Read a selection file (';'-separated text table with a header).
#
# Arguments:
#   computer_data_path  root directory of the data
#   listdir             sub-directory (relative to the root) of the list
#   listname            name of the selection file
#
# Returns a tibble with the columns 'code' and 'filename' (both as
# character) and 'ok' (as read from the file).
get_selection_NV <- function(computer_data_path, listdir, listname) {

  # Get the file path to the data
  list_path <- file.path(computer_data_path, listdir, listname)

  # Extract the data as a data frame
  raw_selec <- read.table(list_path, header = TRUE, encoding = "UTF-8",
                          sep = ";")

  df_selec <- tibble(
    code = as.character(raw_selec$code),
    filename = as.character(raw_selec$filename),
    ok = raw_selec$ok
  )

  return(df_selec)
}

# Example
# df_selec_NV = get_selection_NV(
#     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
#     "",
#     "nival_selection.txt")


# Resolve the list of files to process for a multi-file request.
# When every element of 'filename' is 'all', the '.txt' files of the data
# directory are returned; otherwise 'filename' itself is returned.
.list_bh_files <- function(computer_data_path, filedir, filename) {
  if (all(filename == "all")) {
    candidates <- list.files(file.path(computer_data_path, filedir))
    return(candidates[file_ext(candidates) == "txt"])
  }
  filename
}


# Extraction of metadata
#
# Arguments:
#   computer_data_path  root directory of the data
#   filedir             sub-directory (relative to the root) of the files
#   filename            one file name, a vector of file names, or 'all'
#                       to process every '.txt' file of the directory
#   verbose             if TRUE, print the name of each file processed
#
# For a single file, returns a one-row tibble built from fixed character
# positions of the first 41 lines of the file, or NULL (after printing a
# message) when the file does not exist. For several files, returns the
# row-wise concatenation of the single-file results.
extract_meta <- function(computer_data_path, filedir, filename,
                         verbose = TRUE) {

  # Convert the filename in vector
  filename <- c(filename)

  # If the filename is 'all' or regroups more than one filename
  if (all(filename == "all") || length(filename) > 1) {

    filelist <- .list_bh_files(computer_data_path, filedir, filename)

    # One single-file extraction per file; 'verbose' is forwarded so that
    # verbose = FALSE also silences the per-file messages
    pieces <- lapply(filelist, function(f) {
      extract_meta(computer_data_path, filedir, f, verbose = verbose)
    })

    # Concatenate by rows, starting from a blank data frame so that an
    # empty file list still gives an (empty) data frame
    df_meta <- do.call(rbind, c(list(data.frame()), pieces))

    # Set the rownames by default (to avoid strange numbering)
    rownames(df_meta) <- NULL
    return(df_meta)
  }

  # Get the filename from the vector
  filename <- filename[1]

  # Print metadata if asked
  if (verbose) {
    print(paste("extraction of BH meta for file :", filename))
  }

  # Get the file path to the data
  file_path <- file.path(computer_data_path, filedir, filename)

  # A path ending with '/' comes from an empty file name and is rejected
  if (!file.exists(file_path) || endsWith(file_path, "/")) {
    print(paste("filename", file_path, "do not exist"))
    return(NULL)
  }

  # Extract all the header
  metatxt <- c(readLines(file_path, n = 41, encoding = "UTF-8"))

  # Text found from column 'first' to the end of header line 'i'
  tail_of <- function(i, first) {
    trimws(substr(metatxt[i], first, nchar(metatxt[i])))
  }
  # Numeric value found between two columns of header line 'i'
  num_of <- function(i, first, last) {
    as.numeric(substr(metatxt[i], first, last))
  }
  # Code found between two columns of header line 26
  flag_of <- function(first, last) {
    trimws(substr(metatxt[26], first, last))
  }

  # Create a tibble with all the metadata needed
  df_meta <- tibble(
    code = tail_of(11, 38),
    nom = tail_of(12, 39),
    territoire = tail_of(13, 39),
    L93X = num_of(16, 38, 50),
    L93Y = num_of(16, 52, 63),
    surface_km2 = num_of(19, 38, 50),
    statut = iStatut[flag_of(38, 50)],
    finalite = iFinalite[flag_of(52, 56)],
    type = iType[flag_of(58, 58)],
    influence = iInfluence[flag_of(60, 60)],
    debit = iDebit[flag_of(62, 62)],
    QBE = iQBE[flag_of(72, 72)],
    QME = iQME[flag_of(74, 74)],
    QHE = iQHE[flag_of(76, 76)],
    file_path = file_path
  )

  # The hydrological region is given by the first character of the code
  df_meta$region_hydro <- iRegHydro[substr(df_meta$code, 1, 1)]

  return(df_meta)
}

# Example
# df_meta = extract_meta(
#     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
#     '',
#     c('H5920011_HYDRO_QJM.txt', 'K4470010_HYDRO_QJM.txt'))


# Extraction of data
#
# Arguments:
#   computer_data_path  root directory of the data
#   filedir             sub-directory (relative to the root) of the files
#   filename            one file name, a vector of file names, or 'all'
#                       to process every '.txt' file of the directory
#   verbose             if TRUE, print the name of each file processed
#
# For a single file, returns a tibble with the columns 'Date' (Date
# class), 'Qm3s' (the 'Qls' column multiplied by 1E-3), every column of
# the file after the second one, and 'code' (station code taken from the
# metadata), or NULL (after printing a message) when the file does not
# exist. For several files, returns the row-wise concatenation of the
# single-file results.
extract_data <- function(computer_data_path, filedir, filename,
                         verbose = TRUE) {

  # Convert the filename in vector
  filename <- c(filename)

  # If the filename is 'all' or regroups more than one filename
  if (all(filename == "all") || length(filename) > 1) {

    filelist <- .list_bh_files(computer_data_path, filedir, filename)

    # One single-file extraction per file; 'verbose' is forwarded so that
    # verbose = FALSE also silences the per-file messages
    pieces <- lapply(filelist, function(f) {
      extract_data(computer_data_path, filedir, f, verbose = verbose)
    })

    # Concatenate by rows, starting from a blank data frame so that an
    # empty file list still gives an (empty) data frame
    df_data <- do.call(rbind, c(list(data.frame()), pieces))

    # Set the rownames by default (to avoid strange numbering)
    rownames(df_data) <- NULL
    return(df_data)
  }

  # Get the filename from the vector
  filename <- filename[1]

  # Print metadata if asked
  if (verbose) {
    print(paste("extraction of BH data for file :", filename))
  }

  # Get the file path to the data
  file_path <- file.path(computer_data_path, filedir, filename)

  # A path ending with '/' comes from an empty file name and is rejected
  if (!file.exists(file_path) || endsWith(file_path, "/")) {
    print(paste("filename", file_path, "do not exist"))
    return(NULL)
  }

  # Extract the data as a data frame, skipping the 41 header lines
  raw_data <- read.table(file_path, header = TRUE,
                         na.strings = c(" -99", " -99.000"),
                         sep = ";", skip = 41)

  # Extract all the metadata for the station to get its code
  df_meta <- extract_meta(computer_data_path, filedir, filename,
                          verbose = FALSE)
  code <- df_meta$code

  # Create a tibble with the date as Date class and the code of the
  # station; the first two columns are replaced by 'Date' and 'Qm3s'
  df_data <- tibble(
    Date = as.Date(as.character(raw_data$Date), format = "%Y%m%d"),
    Qm3s = raw_data$Qls * 1E-3,
    raw_data[-(1:2)],
    code = code
  )

  return(df_data)
}

# Example
# df_data = extract_data(
#     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
#     '',
#     c('H5920011_HYDRO_QJM.txt', 'K4470010_HYDRO_QJM.txt'))