diff --git a/processing/extract.R b/processing/extract.R index 408398012569b37fa58400abfb82c88e9cf33545..9db968876b65ca9e0592956d61237370f8b39bd3 100644 --- a/processing/extract.R +++ b/processing/extract.R @@ -32,12 +32,14 @@ library(dplyr) library(officer) -# General metadata on station +### 1. GENERAL METADATA ON STATION +# Status of the station iStatut = c('0'='inconnu', '1'='station avec signification hydrologique', '2'='station sans signification hydrologique', '3'="station d'essai") +# Goal iFinalite = c('0'='inconnue', '1'="hydrométrie générale", '2'='alerte de crue', @@ -48,6 +50,7 @@ iFinalite = c('0'='inconnue', '7'='bassin expérimental', '8'='drainage') +# Type of measure iType = c('0'='inconnu', '1'='une échelle', '2'='deux échelles, station mère', @@ -55,28 +58,35 @@ iType = c('0'='inconnu', '4'='débits mesurés', '5'='virtuelle') +# Influence of the flow iInfluence = c('0'='inconnue', '1'='nulle ou faible', '2'='en étiage seulement', '3'='forte en toute saison') +# Type of flow iDebit = c('0'='reconstitué', - '1'="réel (prise en compte de l'eau rajoutée ou retirée du bassin selon aménagements)", + '1'=paste("réel (prise en compte de l'eau rajoutée ", + "ou retirée du bassin selon aménagements)", + sep=''), '2'='naturel') +# Quality of low water flow iQBE = c('0'='qualité basses eaux inconnue', '1'='qualité basses eaux bonne', '2'='qualité basses eaux douteuse') +# Quality of mean water flow iQME = c('0'='qualité moyennes eaux inconnue', '1'='qualité moyennes eaux bonne', '2'='qualité moyennes eaux douteuse') +# Quality of high water flow iQHE = c('0'='qualité hautes eaux inconnue', '1'='qualité hautes eaux bonne', '2'='qualité hautes eaux douteuse') - +# Hydrological region iRegHydro = c('D'='Affluents du Rhin', 'E'="Fleuves côtiers de l'Artois-Picardie", 'A'='Rhin', @@ -111,6 +121,8 @@ iRegHydro = c('D'='Affluents du Rhin', '4'='Réunion') +# Create a txt file that resume all the station data files present +# in a filedir create_selection = function (computer_data_path, filedir, outname) { # Out file for store results @@ -131,14 +143,13 @@ create_selection = function (computer_data_path, filedir, outname) { codelist = c(codelist, gsub('.txt', '', f)) } } - + # Create a tibble to store the data to write df_file = tibble(code=codelist, filename=paste(codelist, '_HYDRO_QJM.txt', sep=''), ok=TRUE) - + # Write the data in a txt file write.table(df_file, outfile, sep=";", col.names=TRUE, quote=FALSE) - return (NULL) } @@ -149,42 +160,49 @@ create_selection = function (computer_data_path, filedir, outname) { # "nival_selection.txt") -# Get the selection of data from the 'Liste-station_RRSE' file and the BanqueHydro directory +# Gets the selection of station from the 'Liste-station_RRSE.docx' file get_selection_AG = function (computer_data_path, listdir, listname, - cnames=c('code','station', 'BV_km2', 'axe_principal_concerne', 'longueur_serie', 'commentaires', 'choix'), + cnames=c('code','station', 'BV_km2', + 'axe_principal_concerne', + 'longueur_serie', 'commentaires', + 'choix'), c_num=c('BV_km2', 'longueur_serie')) { - # Get the file path to the data + # Gets the file path to the data list_path = file.path(computer_data_path, listdir, listname) - + + # Reads and formats the docx file sample_data = read_docx(list_path) content = docx_summary(sample_data) - table_cells <- content %>% filter(content_type == "table cell") - table_data <- table_cells %>% filter(!is_header) %>% select(row_id, cell_id, text) - # Split data into individual columns - splits <- split(table_data, table_data$cell_id) - splits <- lapply(splits, function(x) x$text) - - # Combine columns back together in wide format - df_selec <- bind_cols(splits) - + table_cells = content %>% filter(content_type == "table cell") + table_data = table_cells %>% filter(!is_header) %>% select(row_id, + cell_id, + text) + # Splits data into individual columns + splits = split(table_data, table_data$cell_id) + splits = lapply(splits, function(x) x$text) + # Combines columns back together in wide format + df_selec = bind_cols(splits) + + # Removes the first line df_selec = df_selec[-1,] - - # Change the columns name + # Changes the columns name names(df_selec) = cnames + # For all the numerical column for (c in c_num) { + # Convert them as numeric df_selec$c = as.numeric(sub(",", ".", pull(df_selec, c))) } - + + # Perfoms the selection according to the column of choice selec = (df_selec$choix == 'A garder' | df_selec$choix == 'Ajout') - + # Stores it in the tibble of selection df_selec = bind_cols(df_selec, - filename=paste(df_selec$code, '_HYDRO_QJM.txt', sep=''), - ok=selec - ) - + filename=paste(df_selec$code, '_HYDRO_QJM.txt', + sep=''), + ok=selec) return (df_selec) } @@ -203,21 +221,21 @@ get_selection_AG = function (computer_data_path, listdir, listname, # 'longueur_serie')) +# Gets the selection of station from the selection txt file generated +# by the 'create_selection' function get_selection_IN = function (computer_data_path, listdir, listname) { - # Get the file path to the data + # Gets the file path to the data list_path = file.path(computer_data_path, listdir, listname) - - # Extract the data as a data frame + # Extracts the data as a data frame df_selec = read.table(list_path, header=TRUE, encoding='UTF-8', - sep=';', - ) + sep=';') + # Stores it in the tibble of selection df_selec = tibble(code=as.character(df_selec$code), filename=as.character(df_selec$filename), ok=df_selec$ok) - return (df_selec) } # Example @@ -227,8 +245,9 @@ get_selection_IN = function (computer_data_path, listdir, listname) { # "nival_selection.txt") -# Extraction of metadata -extract_meta = function (computer_data_path, filedir, filename, verbose=TRUE) { +# Extraction of metadata of stations +extract_meta = function (computer_data_path, filedir, filename, + verbose=TRUE) { # Convert the filename in vector filename = c(filename) @@ -252,8 +271,7 @@ extract_meta = function (computer_data_path, filedir, filename, verbose=TRUE) { filelist = c(filelist, f) } } - - # If the filename regroup more than one filename + # If the filename regroup more than one filename } else if (length(filename > 1)) { # The filelist correspond to the filename filelist = filename @@ -264,14 +282,13 @@ extract_meta = function (computer_data_path, filedir, filename, verbose=TRUE) { # For all the file in the filelist for (f in filelist) { - - # Concatenate by raw data frames created by this function when filename correspond to only one filename + # Concatenate by raw data frames created by this function + # when filename correspond to only one filename df_meta = rbind(df_meta, extract_meta(computer_data_path, - filedir, - f)) + filedir, + f)) } - # Set the rownames by default (to avoid strange numbering) rownames(df_meta) = NULL return (df_meta) @@ -288,19 +305,22 @@ extract_meta = function (computer_data_path, filedir, filename, verbose=TRUE) { # Get the file path to the data file_path = file.path(computer_data_path, filedir, filename) - - if (file.exists(file_path) & substr(file_path, nchar(file_path), nchar(file_path)) != '/') { - + if (file.exists(file_path) & substr(file_path, nchar(file_path), + nchar(file_path)) != '/') { # Extract all the header metatxt = c(readLines(file_path, n=41, encoding="UTF-8")) # Create a tibble with all the metadata needed df_meta = - tibble(code=trimws(substr(metatxt[11], 38, nchar(metatxt[11]))), - nom=trimws(substr(metatxt[12], 39, nchar(metatxt[12]))), - territoire=trimws(substr(metatxt[13], 39, nchar(metatxt[13]))), - - gestionnaire=trimws(substr(metatxt[7], 60, nchar(metatxt[7]))), + tibble(code=trimws(substr(metatxt[11], 38, + nchar(metatxt[11]))), + nom=trimws(substr(metatxt[12], 39, + nchar(metatxt[12]))), + territoire=trimws(substr(metatxt[13], 39, + nchar(metatxt[13]))), + + gestionnaire=trimws(substr(metatxt[7], 60, + nchar(metatxt[7]))), L93X_m_IN=as.numeric(substr(metatxt[16], 65, 77)), L93X_m_BH=as.numeric(substr(metatxt[16], 38, 50)), @@ -318,9 +338,11 @@ extract_meta = function (computer_data_path, filedir, filename, verbose=TRUE) { fin=substr(metatxt[25], 52, 63), statut=iStatut[trimws(substr(metatxt[26], 38, 50))], - finalite=iFinalite[trimws(substr(metatxt[26], 52, 56))], + finalite=iFinalite[trimws(substr(metatxt[26], 52, + 56))], type=iType[trimws(substr(metatxt[26], 58, 58))], - influence=iInfluence[trimws(substr(metatxt[26], 60, 60))], + influence=iInfluence[trimws(substr(metatxt[26], 60, + 60))], debit=iDebit[trimws(substr(metatxt[26], 62, 62))], QBE=iQBE[trimws(substr(metatxt[26], 72, 72))], QME=iQME[trimws(substr(metatxt[26], 74, 74))], @@ -346,14 +368,14 @@ extract_meta = function (computer_data_path, filedir, filename, verbose=TRUE) { # Extraction of data -extract_data = function (computer_data_path, filedir, filename, verbose=TRUE) { +extract_data = function (computer_data_path, filedir, filename, + verbose=TRUE) { # Convert the filename in vector filename = c(filename) # If the filename is 'all' or regroup more than one filename if (all(filename == 'all') | length(filename) > 1) { - # If the filename is 'all' if (all(filename == 'all')) { # Create a filelist to store all the filename @@ -370,9 +392,9 @@ extract_data = function (computer_data_path, filedir, filename, verbose=TRUE) { filelist = c(filelist, f) } } - # If the filename regroup more than one filename + # If the filename regroup more than one filename } else if (length(filename > 1)) { - # The filelist correspond to the filename + # The filelist correspond to the filename filelist = filename } @@ -381,19 +403,18 @@ extract_data = function (computer_data_path, filedir, filename, verbose=TRUE) { # For all the file in the filelist for (f in filelist) { - - # Concatenate by raw data frames created by this function when filename correspond to only one filename + # Concatenate by raw data frames created by this function + # when filename correspond to only one filename df_data = rbind(df_data, extract_data(computer_data_path, filedir, f)) } - # Set the rownames by default (to avoid strange numbering) rownames(df_data) = NULL return (df_data) } - + # Get the filename from the vector filename = filename[1] @@ -405,8 +426,8 @@ extract_data = function (computer_data_path, filedir, filename, verbose=TRUE) { # Get the file path to the data file_path = file.path(computer_data_path, filedir, filename) - if (file.exists(file_path) & substr(file_path, nchar(file_path), nchar(file_path)) != '/') { - + if (file.exists(file_path) & substr(file_path, nchar(file_path), + nchar(file_path)) != '/') { # Extract the data as a data frame df_data = read.table(file_path, header=TRUE, @@ -415,16 +436,17 @@ extract_data = function (computer_data_path, filedir, filename, verbose=TRUE) { skip=41) # Extract all the metadata for the station - df_meta = extract_meta(computer_data_path, filedir, filename, verbose=FALSE) + df_meta = extract_meta(computer_data_path, filedir, filename, + verbose=FALSE) # Get the code of the station code = df_meta$code - # Create a tibble with the date as Date class and the code of the station + # Create a tibble with the date as Date class and the code + # of the station df_data = tibble(Date=as.Date(as.character(df_data$Date), format="%Y%m%d"), Qm3s=df_data$Qls * 1E-3, df_data[-1:-2], code=code) - return (df_data) } else { @@ -440,6 +462,8 @@ extract_data = function (computer_data_path, filedir, filename, verbose=TRUE) { # c('H5920011_HYDRO_QJM.txt', 'K4470010_HYDRO_QJM.txt')) +# Generates a list of shapefiles to draw a hydrological map of +# the France ini_shapefile = function (computer_data_path, fr_shpdir, fr_shpname, bs_shpdir, bs_shpname, rv_shpdir, rv_shpname, riv=TRUE) { # Path for shapefile @@ -454,18 +478,18 @@ ini_shapefile = function (computer_data_path, fr_shpdir, fr_shpname, bs_shpdir, france = spTransform(fr_spdf, CRS("+init=epsg:2154")) df_france = tibble(fortify(france)) - # Bassin hydrographique + # Hydrological basin bassin = readOGR(dsn=bs_shppath, verbose=FALSE) df_bassin = tibble(fortify(bassin)) + # If the river shapefile needs to be load if (riv) { - # Réseau hydrographique + # Hydrographic network river = readOGR(dsn=rv_shppath, verbose=FALSE) ### trop long ### river = river[which(river$Classe == 1),] df_river = tibble(fortify(river)) } else { df_river = NULL } - return (list(france=df_france, bassin=df_bassin, river=df_river)) }