From b6a9422d25b8811344b84fc384abb837cecc959e Mon Sep 17 00:00:00 2001
From: "louis.heraut" <louis.heraut@inrae.fr>
Date: Tue, 16 Nov 2021 10:39:33 +0100
Subject: [PATCH] Extract NV and BH

---
 processing/{extract.R => extractBH.R} |  49 ++++----
 processing/extractNV.R                | 156 ++++++++++++++++++++++++++
 script.R                              |  17 +--
 3 files changed, 191 insertions(+), 31 deletions(-)
 rename processing/{extract.R => extractBH.R} (88%)
 create mode 100644 processing/extractNV.R

diff --git a/processing/extract.R b/processing/extractBH.R
similarity index 88%
rename from processing/extract.R
rename to processing/extractBH.R
index a636b77..e2e45a6 100644
--- a/processing/extract.R
+++ b/processing/extractBH.R
@@ -52,7 +52,7 @@ iQHE = c('0'='qualité hautes eaux inconnue',
 # Get the selection of data from the 'Liste-station_RRSE' file and the BanqueHydro directory
 get_selection = function (computer_data_path, listdir, listname,
                           cnames=c('code','station', 'BV_km2', 'axe_principal_concerne', 'longueur_serie', 'commentaires', 'choix'), 
-                          cisnum=c('BV_km2', 'longueur_serie')) {
+                          c_num=c('BV_km2', 'longueur_serie')) {
     
     # Get the file path to the data
     list_path = file.path(computer_data_path, listdir, listname)
@@ -66,28 +66,31 @@ get_selection = function (computer_data_path, listdir, listname,
     splits <- lapply(splits, function(x) x$text)
     
     # Combine columns back together in wide format
-    df_list <- bind_cols(splits)
+    df_selec <- bind_cols(splits)
     
-    df_list = df_list[-1,]
+    df_selec = df_selec[-1,]
     
-    names(df_list) = cnames
+    names(df_selec) = cnames
 
-    for (c in cisnum) {
-        df_list$c = as.numeric(sub(",", ".",
-                                   pull(df_list, c)))
+    for (c in c_num) {
+        df_selec$c = as.numeric(sub(",", ".",
+                                    pull(df_selec, c)))
     }
     
-    df_selec = df_list[df_list$choix == 'A garder' | df_list$choix == 'Ajout',]
+    selec = (df_selec$choix == 'A garder' | df_selec$choix == 'Ajout')
     
     df_selec = bind_cols(df_selec, 
-                         filename=paste(df_selec$code, '_HYDRO_QJM.txt', sep=''))
+                         filename=paste(df_selec$code, '_HYDRO_QJM.txt', sep=''),
+                         ok=selec
+                         )
+    
     return (df_selec)
 }
 
 # Example
 # df_selec = get_selection(
 #     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
-#     "liste_station",
+#     "",
 #     "Liste-station_RRSE.docx",
 #     cnames=c('code','station', 
 #              'BV_km2',
@@ -95,12 +98,12 @@ get_selection = function (computer_data_path, listdir, listname,
 #              'longueur_serie',
 #              'commentaires',
 #              'choix'), 
-#     cisnum=c('BV_km2',
+#     c_num=c('BV_km2',
 #              'longueur_serie'))
 
 
 # Extraction of information
-extract_info = function (computer_data_path, filedir, filename, verbose=TRUE) {
+extractBH_info = function (computer_data_path, filedir, filename, verbose=TRUE) {
     
     # Convert the filename in vector
     filename = c(filename)
@@ -139,9 +142,9 @@ extract_info = function (computer_data_path, filedir, filename, verbose=TRUE) {
             
             # Concatenate by raw data frames created by this function when filename correspond to only one filename
             df_info = rbind(df_info,
-                            extract_info(computer_data_path, 
-                                         filedir, 
-                                         f))
+                            extractBH_info(computer_data_path, 
+                                           filedir, 
+                                           f))
         }
         
         # Set the rownames by default (to avoid strange numbering)
@@ -173,7 +176,7 @@ extract_info = function (computer_data_path, filedir, filename, verbose=TRUE) {
                    territoire=trimws(substr(infotxt[13], 39, nchar(infotxt[13]))),
                    L93X=as.numeric(substr(infotxt[16], 38, 50)),
                    L93Y=as.numeric(substr(infotxt[16], 52, 63)),
-                   surface=as.numeric(substr(infotxt[19], 38, 50)),
+                   surface_km2=as.numeric(substr(infotxt[19], 38, 50)),
                    statut=iStatut[trimws(substr(infotxt[26], 38, 50))],
                    finalite=iFinalite[trimws(substr(infotxt[26], 52, 56))],
                    type=iType[trimws(substr(infotxt[26], 58, 58))],
@@ -194,14 +197,14 @@ extract_info = function (computer_data_path, filedir, filename, verbose=TRUE) {
 }
 
 # Example
-# df_info = extract_info(
+# df_info = extractBH_info(
     # "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
     # '',
     # c('H5920011_HYDRO_QJM.txt', 'K4470010_HYDRO_QJM.txt'))
 
 
 # Extraction of data
-extract_data = function (computer_data_path, filedir, filename, verbose=TRUE) {
+extractBH_data = function (computer_data_path, filedir, filename, verbose=TRUE) {
     
     # Convert the filename in vector
     filename = c(filename)
@@ -239,9 +242,9 @@ extract_data = function (computer_data_path, filedir, filename, verbose=TRUE) {
 
             # Concatenate by raw data frames created by this function when filename correspond to only one filename
             df_data = rbind(df_data,
-                            extract_data(computer_data_path, 
-                                         filedir, 
-                                         f))
+                            extractBH_data(computer_data_path, 
+                                           filedir, 
+                                           f))
         }
 
         # Set the rownames by default (to avoid strange numbering)
@@ -270,7 +273,7 @@ extract_data = function (computer_data_path, filedir, filename, verbose=TRUE) {
                              skip=41)   
 
         # Extract all the information for the station
-        df_info = extract_info(computer_data_path, filedir, filename, verbose=FALSE)
+        df_info = extractBH_info(computer_data_path, filedir, filename, verbose=FALSE)
         # Get the code of the station
         code = df_info$code
         # Create a tibble with the date as Date class and the code of the station
@@ -289,7 +292,7 @@ extract_data = function (computer_data_path, filedir, filename, verbose=TRUE) {
 }
 
 # Example
-# df_data = extract_data(
+# df_data = extractBH_data(
 #     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
 #     '',
 #     c('H5920011_HYDRO_QJM.txt', 'K4470010_HYDRO_QJM.txt'))
diff --git a/processing/extractNV.R b/processing/extractNV.R
new file mode 100644
index 0000000..4bb8d7c
--- /dev/null
+++ b/processing/extractNV.R
@@ -0,0 +1,156 @@
+# Useful libraries
+library(tools)
+library(dplyr)
+
+
+# Extraction of information about the stations of the list file 'listname'
+# (in 'listdir'), restricted to the stations that have a '<code>.txt' data
+# file in 'filedir'; 'verbose' prints the name of the list file being read.
+# Returns a tibble with the columns code, nom, L93X, L93Y, surface_km2,
+# altitude_m and file_path.
+extractNVlist_info = function (computer_data_path, filedir, listdir, listname, verbose=TRUE) {
+    # Print information if asked
+    if (verbose) {
+        print(paste("extraction of info for file :", listname))
+    }
+
+    # Get the file path to the station list
+    list_path = file.path(computer_data_path, listdir, listname)
+
+    # Extract the data as a data frame
+    df_info = read.table(list_path, header=TRUE)
+
+    # Get all the filenames in the data directory selected
+    filelist = list.files(file.path(computer_data_path, filedir))
+    # Keep only the filenames with the 'txt' extension
+    filelist = filelist[file_ext(filelist) == 'txt']
+    # The station code is the filename without its extension (anchored
+    # pattern: in a bare '.txt' regex the '.' would match any character)
+    codelist = sub('[.]txt$', '', filelist)
+
+    # Stations of the list that have a data file in the directory
+    exist = df_info$CODE %in% codelist
+    # Data files that have no entry in the list
+    missing = codelist[!(codelist %in% df_info$CODE)]
+    # Report them only when there are some
+    if (length(missing) > 0) {
+        print(paste('station missing :', missing))
+    }
+
+    df_info = df_info[exist,]
+
+    # Create a tibble with all the information needed
+    # NOTE(review): the source columns are named X_L2E / Y_L2E but are
+    # stored as L93X / L93Y -- confirm the coordinate system
+    df_info =
+        tibble(code=as.character(df_info$CODE),
+               nom=as.character(df_info$NOM),
+               L93X=df_info$X_L2E,
+               L93Y=df_info$Y_L2E,
+               surface_km2=df_info$S_km2,
+               altitude_m=df_info$Alt,
+               file_path=paste(df_info$CODE, '.txt', sep=''))
+    return (df_info)
+}
+
+# Example
+# df_info = extractNVlist_info(
+#     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
+#     'France207',
+#     '',
+#     'liste_bv_principaux_global.txt')
+
+
+# Extraction of data from one or several station files.
+#   filename : one file name, a vector of file names, or 'all' to read
+#              every '.txt' file of 'filedir'
+#   verbose  : if TRUE, print the name of each file being read
+# Returns a tibble with the columns Date, Qm3s, QCode and code, or NULL
+# when a single requested file does not exist.
+extractNV_data = function (computer_data_path, filedir, filename, verbose=TRUE) {
+
+    # Convert the filename in vector
+    filename = c(filename)
+
+    # If the filename is 'all' or holds more than one filename
+    if (all(filename == 'all') || length(filename) > 1) {
+
+        # If the filename is 'all'
+        if (all(filename == 'all')) {
+            # Get all the filenames in the data directory selected
+            filelist = list.files(file.path(computer_data_path,
+                                            filedir))
+            # Keep only the filenames with the 'txt' extension
+            filelist = filelist[file_ext(filelist) == 'txt']
+        # Otherwise the filename holds more than one filename
+        } else {
+            # The filelist corresponds to the filename
+            filelist = filename
+        }
+
+        # Create a blank data frame
+        df_data = data.frame()
+
+        # For all the files in the filelist
+        for (f in filelist) {
+
+            # Concatenate by row the data frame of each single filename,
+            # forwarding 'verbose' so that verbose=FALSE silences every call
+            df_data = rbind(df_data,
+                            extractNV_data(computer_data_path,
+                                           filedir,
+                                           f,
+                                           verbose=verbose))
+        }
+
+        # Set the rownames by default (to avoid strange numbering)
+        rownames(df_data) = NULL
+        return (df_data)
+    }
+
+    # Get the filename from the vector
+    filename = filename[1]
+
+    # Print information if asked
+    if (verbose) {
+        print(paste("extraction of data for file :", filename))
+    }
+
+    # Get the file path to the data
+    file_path = file.path(computer_data_path, filedir, filename)
+
+    # Nothing can be extracted from a file that does not exist
+    if (!file.exists(file_path)) {
+        print(paste('filename', file_path, 'do not exist'))
+        return (NULL)
+    }
+
+    # Extract the data as a data frame ('-1' and '-99.000' are read as NA)
+    df_data = read.table(file_path,
+                         header=FALSE,
+                         skip=1,
+                         na.strings=c('-1', '-99.000'))
+
+    # Build the date from the three first columns (year, month, day)
+    date = paste(df_data[,1],
+                 df_data[,2],
+                 df_data[,3],
+                 sep='-')
+
+    # Create a tibble with the date as Date class and the code of the
+    # station, i.e. the filename without its '.txt' extension (anchored
+    # pattern: in a bare '.txt' regex the '.' would match any character)
+    df_data = tibble(Date=as.Date(as.character(date),
+                                  format="%Y-%m-%d"),
+                     Qm3s=df_data[,4],
+                     QCode=df_data[,5],
+                     code=sub('[.]txt$', '', filename))
+
+    return (df_data)
+}
+
+# Example
+# df_data = extractNV_data(
+    # "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
+    # 'France207',
+    # c('O0015310.txt', 'Q0214010.txt', 'P0115020.txt'))
diff --git a/script.R b/script.R
index 37994b7..ab8f9e0 100644
--- a/script.R
+++ b/script.R
@@ -46,7 +46,8 @@ BHdatadir =
 setwd(computer_work_path)
 
 # Sourcing R file
-source('processing/extract.R')
+source('processing/extractBH.R')
+source('processing/extractNV.R')
 source('processing/analyse.R')
 source('plotting/panel.R')
 
@@ -80,21 +81,21 @@ if (is.character(BHlistdir) & is.character(BHlistname) & is.character(BHdatadir)
                                       'longueur_serie',
                                       'commentaires',
                                       'choix'), 
-                             cisnum=c('BV_km2',
+                             c_num=c('BV_km2',
                                       'longueur_serie'))
     BHfiledir = BHdatadir
-    BHfilename = df_selec$filename
+    BHfilename = df_selec[df_selec$ok,]$filename
 }
 
 # Extract information about selected stations
-df_info = extract_info(computer_data_path, BHfiledir, BHfilename)
+df_info_BH = extractBH_info(computer_data_path, BHfiledir, BHfilename)
 
 # Extract data about selected stations
-df_data = extract_data(computer_data_path, BHfiledir, BHfilename)
+df_data_BH = extractBH_data(computer_data_path, BHfiledir, BHfilename)
 
 # Plot time panel of debit by stations
-panel(df_data, df_info, figdir, BHfiledir)
-# panel(df_data, df_info, figdir, BHfiledir, is_sqrt=TRUE)
+panel(df_data_BH, df_info_BH, figdir, BHfiledir)
+# panel(df_data_BH, df_info_BH, figdir, BHfiledir, is_sqrt=TRUE)
 
 # Compute gap parameters for stations
-df_lac = get_lacune(df_data, df_info)
+df_lac = get_lacune(df_data_BH, df_info_BH)
-- 
GitLab