# extract.R (10.33 KiB)
# Useful libraries
library(tools)
library(dplyr)
library(officer)
# General information on station.
# Each table below is a named character vector: the names are the codes
# (as character digits) and the values are the matching French labels.

# Station status
iStatut <- c(
    "0" = "inconnu",
    "1" = "station avec signification hydrologique",
    "2" = "station sans signification hydrologique",
    "3" = "station d'essai"
)

# Purpose of the station
iFinalite <- c(
    "0" = "inconnue",
    "1" = "hydrométrie générale",
    "2" = "alerte de crue",
    "3" = "hydrométrie générale et alerte de crue",
    "4" = "gestion d'ouvrage",
    "5" = "police des eaux",
    "6" = "suivi d'étiage",
    "7" = "bassin expérimental",
    "8" = "drainage"
)

# Type of station
iType <- c(
    "0" = "inconnu",
    "1" = "une échelle",
    "2" = "deux échelles, station mère",
    "3" = "deux échelles, station fille",
    "4" = "débits mesurés",
    "5" = "virtuelle"
)

# Influence level
iInfluence <- c(
    "0" = "inconnue",
    "1" = "nulle ou faible",
    "2" = "en étiage seulement",
    "3" = "forte en toute saison"
)

# Kind of discharge
iDebit <- c(
    "0" = "reconstitué",
    "1" = "réel (prise en compte de l'eau rajoutée ou retirée du bassin selon aménagements)",
    "2" = "naturel"
)

# Quality of the low-water data
iQBE <- c(
    "0" = "qualité basses eaux inconnue",
    "1" = "qualité basses eaux bonne",
    "2" = "qualité basses eaux douteuse"
)

# Quality of the medium-water data
iQME <- c(
    "0" = "qualité moyennes eaux inconnue",
    "1" = "qualité moyennes eaux bonne",
    "2" = "qualité moyennes eaux douteuse"
)

# Quality of the high-water data
iQHE <- c(
    "0" = "qualité hautes eaux inconnue",
    "1" = "qualité hautes eaux bonne",
    "2" = "qualité hautes eaux douteuse"
)
# Get the selection of data from the 'Liste-station_RRSE' file and the
# BanqueHydro directory.
#
# Reads the station table stored in a Word (.docx) document, names its
# columns, converts the requested columns to numeric and keeps only the
# stations that have been selected.
#
# Arguments:
#   computer_data_path  root directory of the data
#   listdir             sub-directory (under the root) holding the list file
#   listname            name of the .docx list file
#   cnames              names given, in order, to the columns of the table
#   cisnum              names (taken from cnames) of the columns to convert
#                       to numeric; a decimal comma is accepted
#
# Returns the rows whose 'choix' is 'A garder' or 'Ajout', with an extra
# 'filename' column equal to '<code>_HYDRO_QJM.txt'.
get_selection <- function (computer_data_path, listdir, listname,
                           cnames=c('code','station', 'BV_km2', 'axe_principal_concerne', 'longueur_serie', 'commentaires', 'choix'),
                           cisnum=c('BV_km2', 'longueur_serie')) {

    # Get the file path to the data
    list_path <- file.path(computer_data_path, listdir, listname)

    # Former text-file reader, kept for reference:
    # df_list = read.table(list_path,
    #                      header=TRUE,
    #                      sep=';',
    #                      dec=',',
    #                      quote='',
    #                      skip=0,
    #                      nrows=3,
    #                      strip.white=TRUE,
    #                      comment.char="",
    #                      colClasses=c("character",
    #                                   "character",
    #                                   "numeric",
    #                                   "character",
    #                                   "numeric",
    #                                   "character",
    #                                   "character"))

    # Extract the content of the document as a data frame
    sample_data <- read_docx(list_path)
    content <- docx_summary(sample_data)

    # Keep the table cells that are not flagged as header
    table_data <- content %>%
        filter(content_type == "table cell") %>%
        filter(!is_header) %>%
        select(row_id, cell_id, text)

    # Split data into individual columns
    splits <- split(table_data, table_data$cell_id)
    splits <- lapply(splits, function(x) x$text)

    # Combine columns back together in wide format
    df_list <- bind_cols(splits)

    # Drop the first row (presumably the column titles, since the names
    # come from cnames -- TODO confirm against the document)
    df_list <- df_list[-1,]

    # Fail early with a clear message if the table does not have the
    # expected number of columns
    if (length(cnames) != ncol(df_list)) {
        stop("cnames has ", length(cnames), " names but the table has ",
             ncol(df_list), " columns", call.=FALSE)
    }
    names(df_list) <- cnames

    # Convert the numeric columns. The column is addressed with [[ ]]
    # because '$' takes its name literally: 'df_list$c' would write to a
    # column called "c" and leave the real columns as text.
    for (col in cisnum) {
        df_list[[col]] <- as.numeric(sub(",", ".", df_list[[col]], fixed=TRUE))
    }

    # Keep the selected stations; %in% treats a missing 'choix' as not
    # selected instead of producing an all-NA row
    is_selected <- df_list$choix %in% c('A garder', 'Ajout')
    df_selec <- df_list[is_selected,]

    # Add the name of the data file associated with each station
    df_selec <- bind_cols(df_selec,
                          filename=paste0(df_selec$code, '_HYDRO_QJM.txt'))
    return (df_selec)
}

# Example
# df_selec = get_selection(
#     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
#     "liste_station",
#     "Liste-station_RRSE.docx",
#     cnames=c('code','station',
#              'BV_km2',
#              'axe_principal_concerne',
#              'longueur_serie',
#              'commentaires',
#              'choix'),
#     cisnum=c('BV_km2',
#              'longueur_serie'))

# Extraction of information extract_info = function (computer_data_path, filedir, filename, verbose=TRUE) { # Convert the filename in vector filename = c(filename) # If the filename is 'all' or regroup more than one filename if (all(filename == 'all') | length(filename) > 1) { # If the filename is 'all' if (all(filename == 'all')) { # Create a filelist to store all the filename filelist = c() # Get all the filename in the data directory selected filelist_tmp = list.files(file.path(computer_data_path, filedir)) # For all the filename in the directory selected for (f in filelist_tmp) { # If the filename extention is 'txt' if (file_ext(f) == 'txt') {