diff --git a/processing/extract.R b/processing/extract.R
index 7da776274a2630065e214c97dd579782aa1b66de..78df646799563a45fa6adb46f34bdf0ec4d78681 100644
--- a/processing/extract.R
+++ b/processing/extract.R
@@ -1,6 +1,7 @@
 # Usefull library
 library(tools)
 library(dplyr)
+library(officer)
 
 
 # General information on station
@@ -48,8 +49,99 @@ iQHE = c('0'='qualité hautes eaux inconnue',
          '2'='qualité hautes eaux douteuse')
 
 
+# Get the selection of stations from the 'Liste-station_RRSE' .docx file
+#
+# Reads the .docx file located at
+# file.path(computer_data_path, listdir, listname), rebuilds the cells of
+# its table(s) into a wide data frame, names the columns with 'cnames',
+# converts the 'cisnum' columns to numeric (decimal comma replaced by a
+# decimal point) and keeps the rows whose 'choix' is 'A garder' or 'Ajout'.
+#
+# Arguments:
+#   computer_data_path : root directory of the data
+#   listdir            : sub-directory that contains the list file
+#   listname           : name of the .docx list file
+#   cnames             : column names given to the table, in order
+#                        (must contain 'code' and 'choix')
+#   cisnum             : names of the columns to convert to numeric
+#
+# Returns the selected rows with an additional 'filename' column built as
+# '<code>_HYDRO_QJM.txt'.
+get_selection = function (computer_data_path, listdir, listname,
+                          cnames=c('code','station', 'BV_km2', 'axe_principal_concerne', 'longueur_serie', 'commentaires', 'choix'),
+                          cisnum=c('BV_km2', 'longueur_serie')) {
+
+    # Get the file path to the data
+    list_path = file.path(computer_data_path, listdir, listname)
+
+    # Extract the data as a data frame
+    # df_list = read.table(list_path,
+    #                      header=TRUE,
+    #                      sep=';',
+    #                      dec=',',
+    #                      quote='',
+    #                      skip=0,
+    #                      nrows=3,
+    #                      strip.white=TRUE,
+    #                      comment.char="",
+    #                      colClasses=c("character",
+    #                                   "character",
+    #                                   "numeric",
+    #                                   "character",
+    #                                   "numeric",
+    #                                   "character",
+    #                                   "character"))
+
+    # Read the .docx document and keep the non-header cells of its tables
+    sample_data = read_docx(list_path)
+    content = docx_summary(sample_data)
+    table_cells <- content %>% filter(content_type == "table cell")
+    table_data <- table_cells %>% filter(!is_header) %>% select(row_id, cell_id, text)
+    # Split data into individual columns
+    splits <- split(table_data, table_data$cell_id)
+    splits <- lapply(splits, function(x) x$text)
+
+    # Combine columns back together in wide format
+    df_list <- bind_cols(splits)
+
+    # Drop the first row (NOTE(review): presumably a leftover title row - confirm)
+    df_list = df_list[-1,]
+
+    names(df_list) = cnames
+
+    # Convert the columns listed in 'cisnum' (decimal comma) to numeric.
+    # [[c]] is required here: df_list$c would target a column literally named 'c'
+    for (c in cisnum) {
+        df_list[[c]] = as.numeric(sub(",", ".",
+                                      df_list[[c]]))
+    }
+
+    # Keep only the selected stations (%in% never returns NA for a missing 'choix')
+    df_selec = df_list[df_list$choix %in% c('A garder', 'Ajout'),]
+
+    # Add the name of the data file associated with each station code
+    df_selec = bind_cols(df_selec,
+                         filename=paste(df_selec$code, '_HYDRO_QJM.txt', sep=''))
+    return (df_selec)
+}
+
+# Example
+# df_selec = get_selection(
+#     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
+#     "liste_station",
+#     "Liste-station_RRSE.docx",
+#     cnames=c('code','station',
+#              'BV_km2',
+#              'axe_principal_concerne',
+#              'longueur_serie',
+#              'commentaires',
+#              'choix'),
+#     cisnum=c('BV_km2',
+#              'longueur_serie'))
+
+
 # Extraction of information
-extract_info = function (data_path, filedir, filename, verbose=TRUE) {
+extract_info = function (computer_data_path, filedir, filename, verbose=TRUE) {
 
     # Convert the filename in vector
     filename = c(filename)
 
@@ -62,7 +154,7 @@ extract_info = function (data_path, filedir, filename, verbose=TRUE) {
         # Create a filelist to store all the filename
         filelist = c()
         # Get all the filename in the data directory selected
-        filelist_tmp = list.files(file.path(data_path,
+        filelist_tmp = list.files(file.path(computer_data_path,
                                             filedir))
 
         # For all the filename in the directory selected
@@ -88,7 +180,7 @@ extract_info = function (data_path, filedir, filename, verbose=TRUE) {
 
             # Concatenate by raw data frames created by this function when filename correspond to only one filename
             df_info = rbind(df_info,
-                            extract_info(data_path,
+                            extract_info(computer_data_path,
                                          filedir,
                                          f))
         }
@@ -107,7 +199,7 @@ extract_info = function (data_path, filedir, filename, verbose=TRUE) {
     }
 
     # Get the file path to the data
-    file_path = file.path(data_path, filedir, filename)
+    file_path = file.path(computer_data_path, filedir, filename)
 
     # Extract all the header
     infotxt = c(readLines(file_path, n=41))
@@ -142,7 +234,7 @@ extract_info = function (data_path, filedir, filename, verbose=TRUE) {
 
 
 # Extraction of data
-extract_data = function (data_path, filedir, filename, verbose=TRUE) {
+extract_data = function (computer_data_path, filedir, filename, verbose=TRUE) {
 
     # Convert the filename in vector
     filename = c(filename)
 
@@ -155,7 +247,7 @@ extract_data = function (data_path, filedir, filename, verbose=TRUE) {
         # Create a filelist to store all the filename
         filelist = c()
         # Get all the filename in the data directory selected
-        filelist_tmp = list.files(file.path(data_path,
+        filelist_tmp = list.files(file.path(computer_data_path,
                                             filedir))
 
         # For all the filename in the directory selected
@@ -180,7 +272,7 @@ extract_data = function (data_path, filedir, filename, verbose=TRUE) {
 
             # Concatenate by raw data frames created by this function when filename correspond to only one filename
             df_data = rbind(df_data,
-                            extract_data(data_path,
+                            extract_data(computer_data_path,
                                          filedir,
                                          f))
         }
@@ -199,8 +291,9 @@ extract_data = function (data_path, filedir, filename, verbose=TRUE) {
     }
 
     # Get the file path to the data
-    file_path = file.path(data_path, filedir, filename)
+    file_path = file.path(computer_data_path, filedir, filename)
 
+    # Extract the data as a data frame
     df_data = read.table(file_path,
                          header=TRUE,
                          na.strings=c(' -99', ' -99.000'),
@@ -208,7 +301,7 @@ extract_data = function (data_path, filedir, filename, verbose=TRUE) {
                          skip=41)[,1:2]
 
     # Extract all the information for the station
-    df_info = extract_info(data_path, filedir, filename, verbose=FALSE)
+    df_info = extract_info(computer_data_path, filedir, filename, verbose=FALSE)
     # Get the code of the station
     code = df_info$code
     # Create a tibble with the date as Date class and the code of the station
diff --git a/script.R b/script.R
index 4ccc4bcbbdfdf0dc157f29c4863541922909e010..6a357f1ead97ebd77487adfb28c73442b8c63eb3 100644
--- a/script.R
+++ b/script.R
@@ -2,18 +2,39 @@
 ### A MODIFIER ###
 
 # Path to the data
-computer_data_path = #"/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data"
-    "C:\\Users\\louis.heraut\\Documents\\CDD_stationnarite\\data"
+computer_data_path =
+    "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data"
+    # "C:\\Users\\louis.heraut\\Documents\\CDD_stationnarite\\data"
 
 # Work path
-computer_work_path = #"/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/ASH"
-    "C:\\Users\\louis.heraut\\Documents\\CDD_stationnarite\\ASH"
+computer_work_path =
+    "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/ASH"
+    # "C:\\Users\\louis.heraut\\Documents\\CDD_stationnarite\\ASH"
 
+# Manual selection
 # Path to data that will be analysed
-filedir = "test"
-
-filename = #c("H5920011_HYDRO_QJM.txt", "K4470010_HYDRO_QJM.txt")
-    "all"
+filedir =
+    FALSE
+    # "test"
+    # "BanqueHydro_Export2021"
+filename =
+    FALSE
+    # c("H5920011_HYDRO_QJM.txt", "K4470010_HYDRO_QJM.txt")
+    # "all"
+
+# Or list selection
+# Path to the list file of station that will be analysed
+listdir =
+    # FALSE
+    ""
+listname =
+    "Liste-station_RRSE.docx"
+    # FALSE
+BHdir =
+    "BanqueHydro_Export2021"
+    # FALSE
+
+# selecdir = "RRSE_selection"
 
 ##################
 
@@ -44,6 +65,24 @@ if (!(file.exists(figdir))) {
 print(paste('figdir :', figdir))
 
 
+# Get only the selected station from a list station file
+if (is.character(listdir) && is.character(listname) && is.character(BHdir)){
+    df_selec = get_selection(computer_data_path,
+                             listdir,
+                             listname,
+                             cnames=c('code',
+                                      'station',
+                                      'BV_km2',
+                                      'axe_principal_concerne',
+                                      'longueur_serie',
+                                      'commentaires',
+                                      'choix'),
+                             cisnum=c('BV_km2',
+                                      'longueur_serie'))
+    filedir = BHdir
+    filename = df_selec$filename
+}
+
 # Extract information about selected stations
 df_info = extract_info(computer_data_path, filedir, filename)
 