# extract.R (13.87 KiB)
# Useful libraries
library(tools)
library(dplyr)
library(officer)
# General metadata on station
#
# Lookup tables translating the single-character codes read from a station
# file header into human-readable (French) labels. Each table is a named
# character vector: the names are the codes, the values are the labels.
# The labels are written with their accented characters, which had been
# stripped from the previous version of these tables.

# Station status
iStatut <- c("0" = "inconnu",
             "1" = "station avec signification hydrologique",
             "2" = "station sans signification hydrologique",
             "3" = "station d'essai")

# Purpose of the station
iFinalite <- c("0" = "inconnue",
               "1" = "hydrométrie générale",
               "2" = "alerte de crue",
               "3" = "hydrométrie générale et alerte de crue",
               "4" = "gestion d'ouvrage",
               "5" = "police des eaux",
               "6" = "suivi d'étiage",
               "7" = "bassin expérimental",
               "8" = "drainage")

# Station type
iType <- c("0" = "inconnu",
           "1" = "une échelle",
           "2" = "deux échelles, station mère",
           "3" = "deux échelles, station fille",
           "4" = "débits mesurés",
           "5" = "virtuelle")

# Degree of influence on the flow regime
iInfluence <- c("0" = "inconnue",
                "1" = "nulle ou faible",
                "2" = "en étiage seulement",
                "3" = "forte en toute saison")

# Nature of the discharge series
iDebit <- c("0" = "reconstitué",
            "1" = "réel (prise en compte de l'eau rajoutée ou retirée du bassin selon aménagements)",
            "2" = "naturel")

# Data quality at low flows
iQBE <- c("0" = "qualité basses eaux inconnue",
          "1" = "qualité basses eaux bonne",
          "2" = "qualité basses eaux douteuse")

# Data quality at medium flows
iQME <- c("0" = "qualité moyennes eaux inconnue",
          "1" = "qualité moyennes eaux bonne",
          "2" = "qualité moyennes eaux douteuse")

# Data quality at high flows
iQHE <- c("0" = "qualité hautes eaux inconnue",
          "1" = "qualité hautes eaux bonne",
          "2" = "qualité hautes eaux douteuse")
# Hydrological region label, keyed by the first character of the station
# code (see the 'region_hydro' column built in extract_meta).
iRegHydro <- c("D" = "Affluents du Rhin",
               "E" = "Fleuves côtiers de l'Artois-Picardie",
               "A" = "Rhin",
               "B" = "Meuse",
               "F" = "Seine aval (Marne incluse)",
               "G" = "Fleuves côtiers haut normands",
               "H" = "Seine amont",
               "I" = "Fleuves côtiers bas normands",
               "J" = "Bretagne",
               "K" = "Loire",
               "L" = "Loire",
               "M" = "Loire",
               "N" = "Fleuves côtiers au sud de la Loire",
               "O" = "Garonne",
               "P" = "Dordogne",
               "Q" = "Adour",
               "R" = "Charente",
               "S" = "Fleuves côtiers de l'Adour-Garonne",
               "U" = "Saône",
               "V" = "Rhône",
               "W" = "Isère",
               "X" = "Durance",
               "Y" = "Fleuves côtiers du Rhône-Méditerranée et Corse",
               "Z" = "Îles",
               "1" = "Guadeloupe",
               "2" = "Martinique",
               "5" = "Guyane",
               "6" = "Guyane",
               "7" = "Guyane",
               "8" = "Guyane",
               "9" = "Guyane",
               "4" = "Réunion")


# Build a station selection file from the 'txt' files found in a directory.
#
# Arguments:
#   computer_data_path  root data directory
#   filedir             sub-directory of the root scanned for 'txt' files
#   outname             name of the ';'-separated file written in the root
#
# The station code is the file name without its extension. Each code is
# paired with the file name '<code>_HYDRO_QJM.txt' and flagged ok = TRUE.
# Returns NULL invisibly; the function is called for its side effect.
create_selection <- function(computer_data_path, filedir, outname) {
  # Out file to store results
  outfile <- file.path(computer_data_path, outname)
  # Directory holding one file per desired code
  dir_path <- file.path(computer_data_path, filedir)

  # Keep only the 'txt' files of that directory
  candidates <- list.files(dir_path)
  txt_files <- candidates[file_ext(candidates) == "txt"]

  # Strip only the trailing extension. The former gsub('.txt', '', f) used an
  # unescaped regex dot and removed every '<any char>txt' in the name.
  codelist <- file_path_sans_ext(txt_files)

  df_file <- tibble(code = codelist,
                    filename = paste0(codelist, "_HYDRO_QJM.txt"),
                    ok = TRUE)

  write.table(df_file, outfile, sep = ";", col.names = TRUE, quote = FALSE)
  invisible(NULL)
}

# Example
# create_selection(
#     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
#     "France207",
#     "nival_selection.txt")


# Get the selection of data from the 'Liste-station_RRSE' docx file.
#
# Arguments:
#   computer_data_path  root data directory
#   listdir             sub-directory of the root holding the docx file
#   listname            name of the docx file
#   cnames              column names given to the extracted table
#   c_num               names of the columns to convert to numeric
#                       (decimal comma replaced by a decimal point)
#
# Returns the table found in the document (non-header cells, first row
# dropped) with two extra columns: 'filename' ('<code>_HYDRO_QJM.txt') and
# 'ok' (TRUE when 'choix' is 'A garder' or 'Ajout').
get_selection_AG <- function(computer_data_path, listdir, listname,
                             cnames = c("code", "station", "BV_km2",
                                        "axe_principal_concerne",
                                        "longueur_serie", "commentaires",
                                        "choix"),
                             c_num = c("BV_km2", "longueur_serie")) {
  # Get the file path to the data
  list_path <- file.path(computer_data_path, listdir, listname)

  content <- docx_summary(read_docx(list_path))

  table_data <- content %>%
    filter(content_type == "table cell", !is_header) %>%
    select(row_id, cell_id, text)

  # Split the cell texts into one vector per table column
  columns <- lapply(split(table_data, table_data$cell_id),
                    function(x) x$text)

  # Combine columns back together in wide format
  df_selec <- bind_cols(columns)
  df_selec <- df_selec[-1, ]

  # Change the column names
  names(df_selec) <- cnames

  # Convert the requested columns in place. '[[' is required here: the former
  # 'df_selec$c' wrote to a column literally named "c" and left the target
  # columns as text.
  for (col in c_num) {
    df_selec[[col]] <- as.numeric(sub(",", ".", df_selec[[col]],
                                      fixed = TRUE))
  }

  selec <- (df_selec$choix == "A garder" | df_selec$choix == "Ajout")

  df_selec <- bind_cols(df_selec,
                        filename = paste0(df_selec$code, "_HYDRO_QJM.txt"),
                        ok = selec)
  df_selec
}

# Example
# NOTE(review): this call is live code, so it runs every time the file is
# sourced and depends on an absolute path on the author's machine. It is kept
# because other code may rely on 'df_selec_AG' being defined; confirm whether
# it should be commented out like the other examples.
df_selec_AG <- get_selection_AG(
  "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
  "",
  "Liste-station_RRSE.docx",
  cnames = c("code", "station", "BV_km2", "axe_principal_concerne",
             "longueur_serie", "commentaires", "choix"),
  c_num = c("BV_km2", "longueur_serie"))


# Read back a selection file such as the one written by create_selection.
#
# Arguments:
#   computer_data_path  root data directory
#   listdir             sub-directory of the root holding the file
#   listname            name of the ';'-separated selection file
#
# Returns a tibble with the columns 'code' and 'filename' (as character)
# and 'ok'.
get_selection_NV <- function(computer_data_path, listdir, listname) {
  # Get the file path to the data
  list_path <- file.path(computer_data_path, listdir, listname)

  # Extract the data as a data frame
  raw <- read.table(list_path,
                    header = TRUE,
                    encoding = "UTF-8",
                    sep = ";")

  tibble(code = as.character(raw$code),
         filename = as.character(raw$filename),
         ok = raw$ok)
}

# Example
# df_selec_NV = get_selection_NV(
#     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
#     "",
#     "nival_selection.txt")


# Private helper shared by extract_meta and extract_data: turn the 'filename'
# argument into the vector of file names to process. When every element is
# 'all', the 'txt' files of the data directory are returned; otherwise the
# given names are returned unchanged.
.resolve_filelist <- function(computer_data_path, filedir, filename) {
  if (all(filename == "all")) {
    candidates <- list.files(file.path(computer_data_path, filedir))
    return(candidates[file_ext(candidates) == "txt"])
  }
  filename
}


# Extraction of metadata
#
# Arguments:
#   computer_data_path  root data directory
#   filedir             sub-directory of the root holding the station files
#   filename            one file name, a vector of file names, or 'all' to
#                       process every 'txt' file of the directory
#   verbose             print a progress line for each file processed
#
# For a single file, the first 41 lines are read and fixed-width fields are
# sliced out of them into a one-row tibble; coded fields are translated with
# the lookup tables defined at the top of this file. Returns NULL (after
# printing a message) when the file does not exist.
# For several files, the per-file results are row-bound into one table.
extract_meta <- function(computer_data_path, filedir, filename,
                         verbose = TRUE) {
  # Convert the filename in vector
  filename <- c(filename)

  # 'all' or more than one file name: process each file and stack the rows
  if (all(filename == "all") || length(filename) > 1) {
    filelist <- .resolve_filelist(computer_data_path, filedir, filename)

    # 'verbose' is forwarded so that verbose = FALSE also silences the
    # per-file calls
    per_file <- lapply(filelist, function(f) {
      extract_meta(computer_data_path, filedir, f, verbose = verbose)
    })

    # Start from a blank data frame so that an empty file list still yields
    # a data frame; NULL results (missing files) are ignored by rbind
    df_meta <- do.call(rbind, c(list(data.frame()), per_file))

    # Set the rownames by default (to avoid strange numbering)
    rownames(df_meta) <- NULL
    return(df_meta)
  }

  # Get the filename from the vector
  filename <- filename[1]

  # Print metadata if asked
  if (verbose) {
    print(paste("extraction of BH meta for file :", filename))
  }

  # Get the file path to the data
  file_path <- file.path(computer_data_path, filedir, filename)

  # A path ending with '/' means no file name was given
  if (!(file.exists(file_path) && !endsWith(file_path, "/"))) {
    print(paste('filename', file_path, 'do not exist'))
    return(NULL)
  }

  # Extract all the header
  metatxt <- c(readLines(file_path, n = 41, encoding = "UTF-8"))

  # Characters 'start' to 'stop' of header line 'line'
  field <- function(line, start, stop) {
    substr(metatxt[line], start, stop)
  }
  # Header line 'line' from character 'start' to its end, trimmed
  tail_field <- function(line, start) {
    trimws(substr(metatxt[line], start, nchar(metatxt[line])))
  }

  # Create a tibble with all the metadata needed
  df_meta <- tibble(
    code = tail_field(11, 38),
    nom = tail_field(12, 39),
    territoire = tail_field(13, 39),
    gestionnaire = tail_field(7, 60),
    L93X_m_IN = as.numeric(field(16, 65, 77)),
    L93X_m_BH = as.numeric(field(16, 38, 50)),
    L93Y_m_IN = as.numeric(field(16, 79, 90)),
    L93Y_m_BH = as.numeric(field(16, 52, 63)),
    surface_km2_IN = as.numeric(field(19, 52, 63)),
    surface_km2_BH = as.numeric(field(19, 38, 50)),
    altitude_m_IN = as.numeric(field(20, 52, 63)),
    altitude_m_BH = as.numeric(field(20, 38, 50)),
    debut = field(25, 38, 50),
    fin = field(25, 52, 63),
    statut = iStatut[trimws(field(26, 38, 50))],
    finalite = iFinalite[trimws(field(26, 52, 56))],
    type = iType[trimws(field(26, 58, 58))],
    influence = iInfluence[trimws(field(26, 60, 60))],
    debit = iDebit[trimws(field(26, 62, 62))],
    QBE = iQBE[trimws(field(26, 72, 72))],
    QME = iQME[trimws(field(26, 74, 74))],
    QHE = iQHE[trimws(field(26, 76, 76))],
    file_path = file_path
  )

  # The first character of the station code gives the hydrological region
  df_meta$region_hydro <- iRegHydro[substr(df_meta$code, 1, 1)]
  df_meta
}

# Example
# df_meta = extract_meta(
#     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
#     "BanqueHydro_Export2021",
#     c('H5920011_HYDRO_QJM.txt', 'K4470010_HYDRO_QJM.txt'))


# Extraction of data
#
# Arguments:
#   computer_data_path  root data directory
#   filedir             sub-directory of the root holding the station files
#   filename            one file name, a vector of file names, or 'all' to
#                       process every 'txt' file of the directory
#   verbose             print a progress line for each file processed
#
# For a single file, the ';'-separated table found after the 41 header lines
# is read, with ' -99' and ' -99.000' treated as missing values. The result
# is a tibble with 'Date' (Date class), 'Qm3s' (the 'Qls' column multiplied
# by 1E-3), the remaining columns of the file and the station 'code'.
# Returns NULL (after printing a message) when the file does not exist.
# For several files, the per-file results are row-bound into one table.
extract_data <- function(computer_data_path, filedir, filename,
                         verbose = TRUE) {
  # Convert the filename in vector
  filename <- c(filename)

  # 'all' or more than one file name: process each file and stack the rows
  if (all(filename == "all") || length(filename) > 1) {
    filelist <- .resolve_filelist(computer_data_path, filedir, filename)

    # 'verbose' is forwarded so that verbose = FALSE also silences the
    # per-file calls
    per_file <- lapply(filelist, function(f) {
      extract_data(computer_data_path, filedir, f, verbose = verbose)
    })

    # Start from a blank data frame so that an empty file list still yields
    # a data frame; NULL results (missing files) are ignored by rbind
    df_data <- do.call(rbind, c(list(data.frame()), per_file))

    # Set the rownames by default (to avoid strange numbering)
    rownames(df_data) <- NULL
    return(df_data)
  }

  # Get the filename from the vector
  filename <- filename[1]

  # Print metadata if asked
  if (verbose) {
    print(paste("extraction of BH data for file :", filename))
  }

  # Get the file path to the data
  file_path <- file.path(computer_data_path, filedir, filename)

  # A path ending with '/' means no file name was given
  if (!(file.exists(file_path) && !endsWith(file_path, "/"))) {
    print(paste('filename', file_path, 'do not exist'))
    return(NULL)
  }

  # Extract the data as a data frame, skipping the header lines
  raw <- read.table(file_path,
                    header = TRUE,
                    na.strings = c(' -99', ' -99.000'),
                    sep = ";",
                    skip = 41)

  # Extract all the metadata for the station to get its code
  df_meta <- extract_meta(computer_data_path, filedir, filename,
                          verbose = FALSE)
  code <- df_meta$code

  # Create a tibble with the date as Date class and the code of the station;
  # raw[-1:-2] keeps every column except the first two
  tibble(Date = as.Date(as.character(raw$Date), format = "%Y%m%d"),
         Qm3s = raw$Qls * 1E-3,
         raw[-1:-2],
         code = code)
}

# Example
# df_data = extract_data(
#     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
#     '',
#     c('H5920011_HYDRO_QJM.txt', 'K4470010_HYDRO_QJM.txt'))