extract.R

# \\\
# Copyright 2021-2022 Louis Héraut*1
#
# *1   INRAE, France
#      louis.heraut@inrae.fr
#
# This file is part of ash R toolbox.
#
# ash R toolbox is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# ash R toolbox is distributed in the hope that it will be useful, but 
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ash R toolbox.  If not, see <https://www.gnu.org/licenses/>.
# ///
#
#
# processing/extract.R
#
# Regroups functions to generation of generical station selection
# according to pre-existing directory of data or specific other
# selection format as '.docx' file from Agence de l'eau Adour-Garonne.
# Also useful to extract the data and the metadata present in the
# Banque Hydro files from a selection of station. Manages also
# shapefiles loading.


# Usefull library
library(tools)
library(dplyr)
library(officer)


### 1. GENERAL METADATA ON STATION ___________________________________
# Status of the station
iStatut = c('0'='inconnu', 
            '1'='station avec signification hydrologique', 
            '2'='station sans signification hydrologique', 
            '3'="station d'essai")

# Goal
iFinalite = c('0'='inconnue', 
              '1'="hydrométrie générale", 
              '2'='alerte de crue', 
              '3'="hydrométrie générale et alerte de crue",
              '4'="gestion d'ouvrage", 
              '5'='police des eaux', 
              '6'="suivi d'étiage", 
              '7'='bassin expérimental', 
              '8'='drainage')

# Type of measure
iType = c('0'='inconnu',
          '1'='une échelle',
          '2'='deux échelles, station mère',
          '3'='deux échelles, station fille',
          '4'='débits mesurés',
          '5'='virtuelle')

# Influence of the flow
iInfluence = c('0'='inconnue',
               '1'='nulle ou faible',
               '2'='en étiage seulement',
               '3'='forte en toute saison')

# Type of flow
iDebit = c('0'='reconstitué',
           '1'=paste("réel (prise en compte de l'eau rajoutée ",
                     "ou retirée du bassin selon aménagements)",
                     sep=''),
           '2'='naturel')

# Quality of low water flow
iQBE = c('0'='qualité basses eaux inconnue',
         '1'='qualité basses eaux bonne',
         '2'='qualité basses eaux douteuse')

# Quality of mean water flow
iQME = c('0'='qualité moyennes eaux inconnue',
         '1'='qualité moyennes eaux bonne',
         '2'='qualité moyennes eaux douteuse')

# Quality of high water flow
iQHE = c('0'='qualité hautes eaux inconnue',
         '1'='qualité hautes eaux bonne',
         '2'='qualité hautes eaux douteuse')

# Hydrological region
iRegHydro = c('D'='Affluents du Rhin',
              'E'="Fleuves côtiers de l'Artois-Picardie",
              'A'='Rhin',
              'B'='Meuse',
              'F'='Seine aval (Marne incluse)',
              'G'='Fleuves côtiers haut normands',
              'H'='Seine amont',
              'I'='Fleuves côtiers bas normands',
              'J'='Bretagne',
              'K'='Loire',
              'L'='Loire',
              'M'='Loire',
              'N'='Fleuves côtiers au sud de la Loire',
              'O0'='Garonne',
              'O1'='Garonne',
              'O2'='Garonne',
              'O3'='Tarn-Aveyron',
              'O4'='Tarn-Aveyron',
              'O5'='Tarn-Aveyron',
              'O6'='Tarn-Aveyron',
              'O7'='Lot',
              'O8'='Lot',
              'O9'='Lot',
              'P'='Dordogne',
              'Q'='Adour',
              'R'='Charente',
              'S'="Fleuves côtiers de l'Adour-Garonne",
              'U'='Saône',
              'V'='Rhône',
              'W'='Isère',
              'X'='Durance',
              'Y'='Fleuves côtiers du Rhône-Méditérannée et Corse',
              'Z'='Îles',
              '1'='Guadeloupe',
              '2'='Martinique',
              '5'='Guyane',
              '6'='Guyane',
              '7'='Guyane',
              '8'='Guyane',
              '9'='Guyane',
              '4'='Réunion')


## 2. SELECTION ______________________________________________________
### 2.1. Creation of selection _______________________________________
# Create a txt file that resume all the station data files present
# in a filedir
create_selection = function (computer_data_path, filedir, outname, optname='_HYDRO_QJM') {

    # Out file for store results
    outfile = file.path(computer_data_path, outname)
    
    # Path to find the directory of desired codes 
    dir_path = file.path(computer_data_path, filedir)
    # Create a filelist of all the filename in the above directory
    filelist_tmp = list.files(dir_path)
    # Create a filelist to store all station codes
    codelist = c()

    # For all the filename in the file list
    for (f in filelist_tmp) {
        # If the filename is a 'txt' file
            if (file_ext(f) == 'txt') {
                # Extracts the station code
                code = gsub("[^[:alnum:] ].*$", '', f)
                # Then the station code is stored
                codelist = c(codelist, code)
            }
        }  
    # Create a tibble to store the data to write
    df_file = tibble(code=codelist,
                     filename=paste(codelist, optname, '.txt',
                                    sep=''),
                     ok=TRUE)
    # Write the data in a txt file
    write.table(df_file, outfile, sep=";", col.names=TRUE, quote=FALSE)
    # Returns that it is done with the path
    print('Done')
    print(paste('path : ', outfile, sep=''))
    print('example of file : ')
    print(head(df_file))
}
# Example
# create_selection(
#     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
#     "France207",
#     "INRAE_selection.txt")

### 2.2. Agence de l'eau Adour-Garonne selection _____________________
# Gets the selection of station from the 'Liste-station_RRSE.docx' file
get_selection_AEAG = function (computer_data_path, listdir, listname,
                             cnames=c('code','station', 'BV_km2',
                                      'axe_principal_concerne',
                                      'longueur_serie', 'commentaires',
                                      'choix'), 
                             c_num=c('BV_km2', 'longueur_serie')) {
    
    # Gets the file path to the data
    list_path = file.path(computer_data_path, listdir, listname)

    # Reads and formats the docx file
    sample_data = read_docx(list_path)
    content = docx_summary(sample_data)
    table_cells = content %>% filter(content_type == "table cell")
    table_data = table_cells %>% filter(!is_header) %>% select(row_id,
                                                               cell_id,
                                                               text)
    # Splits data into individual columns
    splits = split(table_data, table_data$cell_id)
    splits = lapply(splits, function(x) x$text)
    # Combines columns back together in wide format
    df_selec = bind_cols(splits)

    # Removes the first line
    df_selec = df_selec[-1,]
    # Changes the columns name
    names(df_selec) = cnames

    # For all the numerical column
    for (c in c_num) {
        # Convert them as numeric
        df_selec$c = as.numeric(sub(",", ".",
                                    pull(df_selec, c)))
    }

    # Perfoms the selection according to the column of choice
    selec = (df_selec$choix == 'A garder' | df_selec$choix == 'Ajout')
    # Stores it in the tibble of selection
    df_selec = bind_cols(df_selec, 
                         filename=paste(df_selec$code, '_HYDRO_QJM.txt',
                                        sep=''),
                         ok=selec)
    return (df_selec)
}
# Example
# df_selec_AEAG = get_selection_AEAG(
    # "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
    # "",
    # "Liste-station_RRSE.docx",
    # cnames=c('code','station', 
             # 'BV_km2',
             # 'axe_principal_concerne',
             # 'longueur_serie',
             # 'commentaires',
             # 'choix'), 
    # c_num=c('BV_km2',
            # 'longueur_serie'))

### 2.3. INRAE selection _____________________________________________
# Gets the selection of station from the selection txt file generated
# by the 'create_selection' function
get_selection_INRAE = function (computer_data_path, listdir, listname) {
    
    # Gets the file path to the data
    list_path = file.path(computer_data_path, listdir, listname)
    # Extracts the data as a data frame
    df_selec = read.table(list_path,
                          header=TRUE,
                          encoding='UTF-8',
                          sep=';')
    # Stores it in the tibble of selection
    df_selec = tibble(code=as.character(df_selec$code),
                      filename=as.character(df_selec$filename),
                      ok=df_selec$ok)
    return (df_selec)
}
# Example
# df_selec_INRAE = get_selection_INRAE(
    # "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
    # "",
    # "INRAE_selection.txt")


## 3. EXTRACTION _____________________________________________________
### 3.1. Extraction of metadata
# Extraction of metadata of stations
extract_meta = function (computer_data_path, filedir, filename,
                         verbose=TRUE) {
    
    # Convert the filename in vector
    filename = c(filename)
    
    # If the filename is 'all' or regroup more than one filename
    if (all(filename == 'all') | length(filename) > 1) {

        # If the filename is 'all'
        if (all(filename == 'all')) {
            # Create a filelist to store all the filename
            filelist = c()
            # Get all the filename in the data directory selected
            filelist_tmp = list.files(file.path(computer_data_path,
                                                filedir))
            
            # For all the filename in the directory selected
            for (f in filelist_tmp) {
                # If the filename extention is 'txt'
                if (file_ext(f) == 'txt') {
                    # Store the filename in the filelist
                    filelist = c(filelist, f) 
                }
            }
        # If the filename regroup more than one filename
        } else if (length(filename > 1)) {
            # The filelist correspond to the filename
            filelist = filename
        }
        
        # Create a blank data frame
        df_meta = data.frame()
        
        # For all the file in the filelist
        for (f in filelist) {
            # Concatenate by raw data frames created by this function
            # when filename correspond to only one filename
            df_meta = rbind(df_meta,
                            extract_meta(computer_data_path, 
                                         filedir, 
                                         f))
        }
        # Set the rownames by default (to avoid strange numbering)
        rownames(df_meta) = NULL
        return (df_meta)
    }

    # Get the filename from the vector
    filename = filename[1]
    
    # Print metadata if asked
    if (verbose) {
        print(paste("extraction of BH meta for file :", filename))
    }

    # Get the file path to the data
    file_path = file.path(computer_data_path, filedir, filename)

    if (file.exists(file_path) & substr(file_path, nchar(file_path),
                                        nchar(file_path)) != '/') {
        # Extract all the header
        metatxt = c(readLines(file_path, n=41, encoding="UTF-8"))
        
        # Create a tibble with all the metadata needed
        # (IN for INRAE data and BH for BH data)
        df_meta =
            tibble(
                # Station code
                code=trimws(substr(metatxt[11], 38,
                                   nchar(metatxt[11]))),
                # Station name
                nom=trimws(substr(metatxt[12], 39,
                                  nchar(metatxt[12]))),
                # Territory
                territoire=trimws(substr(metatxt[13], 39,
                                         nchar(metatxt[13]))),
                # Administrator
                gestionnaire=trimws(substr(metatxt[7], 60,
                                           nchar(metatxt[7]))),
                # Lambert 93 localisation
                L93X_m_IN=as.numeric(substr(metatxt[16], 65, 77)),
                L93X_m_BH=as.numeric(substr(metatxt[16], 38, 50)),
                L93Y_m_IN=as.numeric(substr(metatxt[16], 79, 90)),
                L93Y_m_BH=as.numeric(substr(metatxt[16], 52, 63)),

                # Surface
                surface_km2_IN=as.numeric(substr(metatxt[19], 52, 63)),
                surface_km2_BH=as.numeric(substr(metatxt[19], 38, 50)),

                # Elevation
                altitude_m_IN=as.numeric(substr(metatxt[20], 52, 63)),
                altitude_m_BH=as.numeric(substr(metatxt[20], 38, 50)),

                # Start and end of the data
                debut=substr(metatxt[25], 38, 50),
                fin=substr(metatxt[25], 52, 63),

                # Different other info about the flow quality and type
                statut=iStatut[trimws(substr(metatxt[26], 38, 50))],
                finalite=iFinalite[trimws(substr(metatxt[26], 52, 56))],
                type=iType[trimws(substr(metatxt[26], 58, 58))],
                influence=iInfluence[trimws(substr(metatxt[26], 60, 60))],
                debit=iDebit[trimws(substr(metatxt[26], 62, 62))],
                QBE=iQBE[trimws(substr(metatxt[26], 72, 72))],
                QME=iQME[trimws(substr(metatxt[26], 74, 74))],
                QHE=iQHE[trimws(substr(metatxt[26], 76, 76))],

                # The path to the data file of BH
                file_path=file_path)

        Ltmp = names(iRegHydro)[nchar(names(iRegHydro)) == 2]
        Ltmp = substr(Ltmp, 1, 1)
        infoSecteur = rle(sort(Ltmp))$values
        
        oneL = substr(df_meta$code, 1, 1)
        twoL = substr(df_meta$code, 1, 2)
        RH = c()
        for (i in 1:length(oneL)) {
            if (oneL[i] %in% infoSecteur) {
                RHtmp = iRegHydro[twoL[i]]
            } else {
                RHtmp = iRegHydro[oneL[i]]
            }
            RH = c(RH, RHtmp)
        }
        
        # Adding of the hydrological region
        df_meta$region_hydro = RH
        return (df_meta)

    } else {
        print(paste('filename', file_path, 'do not exist'))
        return (NULL)
    }
}
# Example
# df_meta = extract_meta(
#     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
#     "BanqueHydro_Export2021",
#     c('H5920011_HYDRO_QJM.txt', 'K4470010_HYDRO_QJM.txt'))

### 3.2. Extraction of data __________________________________________
# Extraction of data
extract_data = function (computer_data_path, filedir, filename,
                         verbose=TRUE) {
    
    # Convert the filename in vector
    filename = c(filename)

    # If the filename is 'all' or regroup more than one filename
    if (all(filename == 'all') | length(filename) > 1) {
        # If the filename is 'all'
        if (all(filename == 'all')) {
            # Create a filelist to store all the filename
            filelist = c()
             # Get all the filename in the data directory selected
            filelist_tmp = list.files(file.path(computer_data_path,
                                                filedir))

            # For all the filename in the directory selected
            for (f in filelist_tmp) {
                # If the filename extention is 'txt'
                if (file_ext(f) == 'txt') {
                    # Store the filename in the filelist
                    filelist = c(filelist, f) 
                }
            }
        # If the filename regroup more than one filename
        } else if (length(filename > 1)) {
            # The filelist correspond to the filename
            filelist = filename
        } 

        # Create a blank data frame
        df_data = data.frame()

        # For all the file in the filelist
        for (f in filelist) {
            # Concatenate by raw data frames created by this function
            # when filename correspond to only one filename
            df_data = rbind(df_data,
                            extract_data(computer_data_path, 
                                           filedir, 
                                           f))
        }
        # Set the rownames by default (to avoid strange numbering)
        rownames(df_data) = NULL
        return (df_data)
    }
    
    # Get the filename from the vector
    filename = filename[1]
    
    # Print metadata if asked
    if (verbose) {
        print(paste("extraction of BH data for file :", filename))
    }

    # Get the file path to the data
    file_path = file.path(computer_data_path, filedir, filename)
    
    if (file.exists(file_path) & substr(file_path, nchar(file_path),
                                        nchar(file_path)) != '/') {
        # Extract the data as a data frame
        df_data = read.table(file_path,
                             header=TRUE,
                             na.strings=c('     -99', ' -99.000'),
                             sep=';',
                             skip=41)   

        # Extract all the metadata for the station
        df_meta = extract_meta(computer_data_path, filedir, filename,
                               verbose=FALSE)
        # Get the code of the station
        code = df_meta$code
        # Create a tibble with the date as Date class and the code
        # of the station
        df_data = tibble(Date=as.Date(as.character(df_data$Date),
                                      format="%Y%m%d"),
                         Value=df_data$Qls * 1E-3,
                         Qmmj=df_data$Qmmj,
                         val_H=as.character(df_data$val_H),
                         val_I=as.character(df_data$val_I),
                         code=code)
        return (df_data)

    } else {
        print(paste('filename', file_path, 'do not exist'))
        return (NULL)
    }
}
# Example
# df_data = extract_data(
#     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
#     '',
#     c('H5920011_HYDRO_QJM.txt', 'K4470010_HYDRO_QJM.txt'))


## 4. SHAPEFILE MANAGEMENT ___________________________________________
# Generates a list of shapefiles to draw a hydrological map of
# the France
ini_shapefile = function (resources_path, fr_shpdir, fr_shpname, bs_shpdir, bs_shpname, sbs_shpdir, sbs_shpname, rv_shpdir, rv_shpname, is_river=TRUE) {

    # Path for shapefile
    fr_shppath = file.path(resources_path, fr_shpdir, fr_shpname)
    bs_shppath = file.path(resources_path, bs_shpdir, bs_shpname)
    sbs_shppath = file.path(resources_path, sbs_shpdir, sbs_shpname)
    rv_shppath = file.path(resources_path, rv_shpdir, rv_shpname)

    # France
    fr_spdf = readOGR(dsn=fr_shppath, verbose=FALSE)    
    proj4string(fr_spdf) = CRS("+proj=longlat +ellps=WGS84")
    # Transformation in Lambert93
    france = spTransform(fr_spdf, CRS("+init=epsg:2154"))
    df_france = tibble(fortify(france))

    # Hydrological basin
    bassin = readOGR(dsn=bs_shppath, verbose=FALSE)
    df_bassin = tibble(fortify(bassin))

    # Hydrological basin
    subbassin = readOGR(dsn=sbs_shppath, verbose=FALSE)
    df_subbassin = tibble(fortify(subbassin))

    # If the river shapefile needs to be load
    if (is_river) {
        # Hydrographic network
        river = readOGR(dsn=rv_shppath, verbose=FALSE) ### trop long ###
        river = river[which(river$Classe == 1),]
        df_river = tibble(fortify(river))
    } else {
        df_river = NULL   
    }
    return (list(france=df_france, bassin=df_bassin, subbassin=df_subbassin, river=df_river))
}