# extract.R

# Useful libraries
library(tools)
library(dplyr)
library(officer)


# General metadata on station
#
# Lookup tables used by extract_meta() to decode the flags read on line 26
# of a station file header. Each table is a named character vector: the
# name is the code (as a string) and the value is the French label, so a
# code is decoded by indexing the table with it, e.g. iStatut['1'].
# An unknown code yields NA.
#
# The labels are written with their accented characters (UTF-8); they had
# been stripped from the strings (e.g. 'hydromtrie gnrale').

# Status of the station
iStatut = c('0'='inconnu', 
            '1'='station avec signification hydrologique', 
            '2'='station sans signification hydrologique', 
            '3'="station d'essai")

# Purpose of the station
iFinalite = c('0'='inconnue', 
              '1'="hydrométrie générale", 
              '2'='alerte de crue', 
              '3'="hydrométrie générale et alerte de crue",
              '4'="gestion d'ouvrage", 
              '5'='police des eaux', 
              '6'="suivi d'étiage", 
              '7'='bassin expérimental', 
              '8'='drainage')

# Type of station
iType = c('0'='inconnu',
          '1'='une échelle',
          '2'='deux échelles, station mère',
          '3'='deux échelles, station fille',
          '4'='débits mesurés',
          '5'='virtuelle')

# Influence on the flow regime
iInfluence = c('0'='inconnue',
               '1'='nulle ou faible',
               '2'='en étiage seulement',
               '3'='forte en toute saison')

# Nature of the discharge
iDebit = c('0'='reconstitué',
           '1'="réel (prise en compte de l'eau rajoutée ou retirée du bassin selon aménagements)",
           '2'='naturel')

# Quality of low flows
iQBE = c('0'='qualité basses eaux inconnue',
         '1'='qualité basses eaux bonne',
         '2'='qualité basses eaux douteuse')

# Quality of medium flows
iQME = c('0'='qualité moyennes eaux inconnue',
         '1'='qualité moyennes eaux bonne',
         '2'='qualité moyennes eaux douteuse')

# Quality of high flows
iQHE = c('0'='qualité hautes eaux inconnue',
         '1'='qualité hautes eaux bonne',
         '2'='qualité hautes eaux douteuse')


# Hydrological regions
#
# Named character vector mapping the first character of a station code to
# the French name of its hydrological region (see extract_meta(), which
# indexes this table with substr(code, 1, 1)). Several codes may map to
# the same region (e.g. 'K', 'L' and 'M' are all 'Loire').
#
# The labels are written with their accented characters (UTF-8); they had
# been stripped from the strings (e.g. 'Rhne', 'Sane', 'les').
iRegHydro = c('D'='Affluents du Rhin',
              'E'="Fleuves côtiers de l'Artois-Picardie",
              'A'='Rhin',
              'B'='Meuse',
              'F'='Seine aval (Marne incluse)',
              'G'='Fleuves côtiers haut normands',
              'H'='Seine amont',
              'I'='Fleuves côtiers bas normands',
              'J'='Bretagne',
              'K'='Loire',
              'L'='Loire',
              'M'='Loire',
              'N'='Fleuves côtiers au sud de la Loire',
              'O'='Garonne',
              'P'='Dordogne',
              'Q'='Adour',
              'R'='Charente',
              'S'="Fleuves côtiers de l'Adour-Garonne",
              'U'='Saône',
              'V'='Rhône',
              'W'='Isère',
              'X'='Durance',
              'Y'='Fleuves côtiers du Rhône-Méditerranée et Corse',
              'Z'='Îles',
              '1'='Guadeloupe',
              '2'='Martinique',
              '5'='Guyane',
              '6'='Guyane',
              '7'='Guyane',
              '8'='Guyane',
              '9'='Guyane',
              '4'='Réunion')


# Create a selection file listing every '.txt' file of a directory
#
# Arguments:
#   computer_data_path  root directory of the data. The selection file is
#                       written directly in this directory.
#   filedir             sub-directory of 'computer_data_path' that is
#                       scanned for files.
#   outname             name of the selection file to write.
#
# Each file of the directory whose extension is 'txt' gives one row with:
#   code      the file name without its extension
#   filename  the code followed by '_HYDRO_QJM.txt'
#   ok        always TRUE
# The table is written with ';' as separator, without quotes and with the
# default row names of write.table. An empty directory gives a file that
# only holds the header line.
#
# Returns NULL.
create_selection = function (computer_data_path, filedir, outname) {

    outfile = file.path(computer_data_path, outname)
    dir_path = file.path(computer_data_path, filedir)

    # Keep only the files with the 'txt' extension
    filelist = list.files(dir_path)
    filelist = filelist[tools::file_ext(filelist) == 'txt']

    # Remove the extension only. A pattern such as gsub('.txt', '', f) is
    # a regular expression where '.' matches any character and which is
    # not anchored at the end, so it also removes parts of the code itself
    # (e.g. 'Xtxt99.txt' would give '99').
    codelist = tools::file_path_sans_ext(filelist)

    # sprintf returns a zero-length vector for a zero-length input, where
    # paste would return the single string '_HYDRO_QJM.txt' and create a
    # bogus row for an empty directory.
    df_file = data.frame(code=codelist,
                         filename=sprintf('%s_HYDRO_QJM.txt', codelist),
                         ok=rep(TRUE, length(codelist)),
                         stringsAsFactors=FALSE)

    write.table(df_file, outfile, sep=";", col.names=TRUE, quote=FALSE)

    return (NULL)
}

# Example
# create_selection(
#     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
#     "France207",
#     "nival_selection.txt")


# Get the selection of data from the 'Liste-station_RRSE' file and the BanqueHydro directory
#
# Arguments:
#   computer_data_path  root directory of the data.
#   listdir             sub-directory of 'computer_data_path' holding the
#                       list file ('' if it is directly in the root).
#   listname            name of the '.docx' file holding the station table.
#   cnames              names given to the columns of the table, in order.
#                       Must hold one name per column and include 'code'
#                       and 'choix', which are used below.
#   c_num               names (among 'cnames') of the columns to convert
#                       to numeric. A decimal comma is accepted.
#
# Returns a tibble holding the columns of the table plus:
#   filename  the station code followed by '_HYDRO_QJM.txt'
#   ok        TRUE when 'choix' is 'A garder' or 'Ajout', FALSE otherwise
#             (including when 'choix' is missing)
get_selection_AG = function (computer_data_path, listdir, listname,
                             cnames=c('code','station', 'BV_km2', 'axe_principal_concerne', 'longueur_serie', 'commentaires', 'choix'), 
                             c_num=c('BV_km2', 'longueur_serie')) {
    
    # Get the file path to the data
    list_path = file.path(computer_data_path, listdir, listname)
    
    # Read the document and keep the non-header cells of its tables
    sample_data = read_docx(list_path)
    content = docx_summary(sample_data)
    table_cells = content %>% filter(content_type == "table cell")
    table_data = table_cells %>% filter(!is_header) %>% select(row_id, cell_id, text)

    # Split data into individual columns
    splits = split(table_data, table_data$cell_id)
    splits = lapply(splits, function(x) x$text)
    
    # Combine columns back together in wide format
    df_selec = bind_cols(splits)
    
    # Drop the first row (presumably the row of column titles, which is
    # not flagged as a header in the document - to be confirmed)
    df_selec = df_selec[-1,]
    
    # Change the columns name
    names(df_selec) = cnames

    # Convert the requested columns to numeric. The column is addressed
    # with [[ ]] because 'df_selec$col' would create a column literally
    # named after the loop variable instead of modifying the column whose
    # name is stored in it.
    for (col in c_num) {
        df_selec[[col]] = as.numeric(sub(",", ".", df_selec[[col]],
                                         fixed=TRUE))
    }
    
    # %in% never returns NA, so a missing 'choix' gives ok = FALSE
    selec = df_selec$choix %in% c('A garder', 'Ajout')
    
    df_selec = bind_cols(df_selec, 
                         filename=paste(df_selec$code, '_HYDRO_QJM.txt', sep=''),
                         ok=selec
                         )
    
    return (df_selec)
}

# Example
# NOTE: unlike the other examples of this file, this call is not commented
# out: it is run each time the file is sourced and defines 'df_selec_AG'.
df_selec_AG = get_selection_AG(
    computer_data_path="/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
    listdir="",
    listname="Liste-station_RRSE.docx",
    cnames=c('code', 'station', 'BV_km2', 'axe_principal_concerne',
             'longueur_serie', 'commentaires', 'choix'),
    c_num=c('BV_km2', 'longueur_serie'))


# Read a selection file written with ';' as separator
#
# Arguments:
#   computer_data_path  root directory of the data.
#   listdir             sub-directory of 'computer_data_path' holding the
#                       selection file ('' if it is directly in the root).
#   listname            name of the selection file. It must have a header
#                       line and the columns 'code', 'filename' and 'ok'.
#
# Returns a tibble with the columns 'code' and 'filename' as character and
# the column 'ok' as read from the file.
get_selection_NV = function (computer_data_path, listdir, listname) {

    # Read the whole file as a data frame
    raw_selec = read.table(file.path(computer_data_path, listdir, listname),
                           sep=';',
                           header=TRUE,
                           encoding='UTF-8')

    # Keep the three columns of interest, forcing the text ones to character
    tibble(code=as.character(raw_selec$code),
           filename=as.character(raw_selec$filename),
           ok=raw_selec$ok)
}
# Example
# df_selec_NV = get_selection_NV(
    # "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
    # "",
    # "nival_selection.txt")


# Extraction of metadata
#
# Arguments:
#   computer_data_path  root directory of the data.
#   filedir             sub-directory of 'computer_data_path' holding the
#                       station files ('' if they are in the root).
#   filename            one of:
#                         - the name of a single station file,
#                         - a vector of several file names,
#                         - 'all' to use every '.txt' file of the directory.
#   verbose             if TRUE, print the name of each file processed.
#
# Returns, for a single file, a one-row tibble built from the first 41
# lines of the file (fixed-width header) with the columns code, nom,
# territoire, L93X, L93Y, surface_km2, statut, finalite, type, influence,
# debit, QBE, QME, QHE, file_path and region_hydro. If the file does not
# exist a message is printed and NULL is returned.
# For several files (or 'all') the rows of every file found are bound
# together; files that do not exist are skipped. An empty 'filename'
# gives an empty data frame.
extract_meta = function (computer_data_path, filedir, filename, verbose=TRUE) {
    
    # Convert the filename in vector
    filename = c(filename)

    # An empty selection gives an empty result. Without this guard,
    # all(filename == 'all') is TRUE for a zero-length vector and every
    # file of the directory would be extracted.
    if (length(filename) == 0) {
        return (data.frame())
    }
    
    # If the filename is 'all' or regroup more than one filename
    if (all(filename == 'all') || length(filename) > 1) {

        if (all(filename == 'all')) {
            # Get all the filename in the data directory selected and
            # keep those with the 'txt' extension
            filelist_tmp = list.files(file.path(computer_data_path,
                                                filedir))
            filelist = filelist_tmp[file_ext(filelist_tmp) == 'txt']
        } else {
            # The filelist correspond to the filename
            filelist = filename
        }
        
        # Extract each file on its own, forwarding 'verbose' so that
        # verbose=FALSE also silences the per-file messages
        meta_list = lapply(filelist, function (f) {
            extract_meta(computer_data_path, filedir, f, verbose=verbose)
        })

        # Bind all the rows in a single call instead of growing the data
        # frame at each iteration. The leading empty data frame keeps the
        # result a data frame when no file is found; NULL results
        # (missing files) are dropped by rbind.
        df_meta = do.call(rbind, c(list(data.frame()), meta_list))
        
        # Set the rownames by default (to avoid strange numbering)
        rownames(df_meta) = NULL
        return (df_meta)
    }

    # Get the filename from the vector
    filename = filename[1]
    
    # Print metadata if asked
    if (verbose) {
        print(paste("extraction of BH meta for file :", filename))
    }

    # Get the file path to the data
    file_path = file.path(computer_data_path, filedir, filename)

    # file.exists is also TRUE for a directory: a path ending with '/'
    # (empty filename) is therefore rejected
    last_char = substr(file_path, nchar(file_path), nchar(file_path))
    if (!(file.exists(file_path) && last_char != '/')) {
        print(paste('filename', file_path, 'do not exist'))
        return (NULL)
    }
        
    # Extract all the header
    metatxt = readLines(file_path, n=41, encoding="UTF-8")

    # Line 26 holds the coded flags decoded with the lookup tables
    flags = metatxt[26]
        
    # Create a tibble with all the metadata needed, read at fixed
    # positions of the header lines
    df_meta =
        tibble(code=trimws(substr(metatxt[11], 38, nchar(metatxt[11]))),
               nom=trimws(substr(metatxt[12], 39, nchar(metatxt[12]))),
               territoire=trimws(substr(metatxt[13], 39, nchar(metatxt[13]))),
               L93X=as.numeric(substr(metatxt[16], 38, 50)),
               L93Y=as.numeric(substr(metatxt[16], 52, 63)),
               surface_km2=as.numeric(substr(metatxt[19], 38, 50)),
               statut=iStatut[trimws(substr(flags, 38, 50))],
               finalite=iFinalite[trimws(substr(flags, 52, 56))],
               type=iType[trimws(substr(flags, 58, 58))],
               influence=iInfluence[trimws(substr(flags, 60, 60))],
               debit=iDebit[trimws(substr(flags, 62, 62))],
               QBE=iQBE[trimws(substr(flags, 72, 72))],
               QME=iQME[trimws(substr(flags, 74, 74))],
               QHE=iQHE[trimws(substr(flags, 76, 76))],
               file_path=file_path)

    # The hydrological region is given by the first character of the code
    df_meta$region_hydro = iRegHydro[substr(df_meta$code, 1, 1)]

    return (df_meta)
}

# Example
# df_meta = extract_meta(
    # "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
    # '',
    # c('H5920011_HYDRO_QJM.txt', 'K4470010_HYDRO_QJM.txt'))


# Extraction of data
#
# Arguments:
#   computer_data_path  root directory of the data.
#   filedir             sub-directory of 'computer_data_path' holding the
#                       station files ('' if they are in the root).
#   filename            one of:
#                         - the name of a single station file,
#                         - a vector of several file names,
#                         - 'all' to use every '.txt' file of the directory.
#   verbose             if TRUE, print the name of each file processed.
#
# Returns, for a single file, a tibble with:
#   Date   the 'Date' column of the file parsed as Date ('%Y%m%d')
#   Qm3s   the 'Qls' column of the file multiplied by 1E-3
#   ...    every column of the file after the first two
#   code   the station code read by extract_meta()
# If the file does not exist a message is printed and NULL is returned.
# For several files (or 'all') the rows of every file found are bound
# together; files that do not exist are skipped. An empty 'filename'
# gives an empty data frame.
extract_data = function (computer_data_path, filedir, filename, verbose=TRUE) {
    
    # Convert the filename in vector
    filename = c(filename)

    # An empty selection gives an empty result. Without this guard,
    # all(filename == 'all') is TRUE for a zero-length vector and every
    # file of the directory would be extracted.
    if (length(filename) == 0) {
        return (data.frame())
    }

    # If the filename is 'all' or regroup more than one filename
    if (all(filename == 'all') || length(filename) > 1) {

        if (all(filename == 'all')) {
            # Get all the filename in the data directory selected and
            # keep those with the 'txt' extension
            filelist_tmp = list.files(file.path(computer_data_path,
                                                filedir))
            filelist = filelist_tmp[file_ext(filelist_tmp) == 'txt']
        } else {
            # The filelist correspond to the filename
            filelist = filename
        }

        # Extract each file on its own, forwarding 'verbose' so that
        # verbose=FALSE also silences the per-file messages
        data_list = lapply(filelist, function (f) {
            extract_data(computer_data_path, filedir, f, verbose=verbose)
        })

        # Bind all the rows in a single call instead of growing the data
        # frame at each iteration, which copies the accumulated rows every
        # time. The leading empty data frame keeps the result a data frame
        # when no file is found; NULL results (missing files) are dropped
        # by rbind.
        df_data = do.call(rbind, c(list(data.frame()), data_list))

        # Set the rownames by default (to avoid strange numbering)
        rownames(df_data) = NULL
        return (df_data)
    }

    # Get the filename from the vector
    filename = filename[1]
    
    # Print metadata if asked
    if (verbose) {
        print(paste("extraction of BH data for file :", filename))
    }

    # Get the file path to the data
    file_path = file.path(computer_data_path, filedir, filename)

    # file.exists is also TRUE for a directory: a path ending with '/'
    # (empty filename) is therefore rejected
    last_char = substr(file_path, nchar(file_path), nchar(file_path))
    if (!(file.exists(file_path) && last_char != '/')) {
        print(paste('filename', file_path, 'do not exist'))
        return (NULL)
    }

    # Extract the data as a data frame. The first 41 lines are the header
    # read by extract_meta(); the two na.strings are the codes used for
    # missing values.
    df_raw = read.table(file_path,
                        header=TRUE,
                        na.strings=c('     -99', ' -99.000'),
                        sep=';',
                        skip=41)

    # Extract all the metadata for the station
    df_meta = extract_meta(computer_data_path, filedir, filename, verbose=FALSE)
    # Get the code of the station
    code = df_meta$code

    # Create a tibble with the date as Date class and the code of the
    # station. Qls is scaled by 1E-3 to give Qm3s (presumably L/s to
    # m3/s, as the names suggest). The first two columns of the file
    # (replaced by Date and Qm3s) are dropped, the others are kept.
    df_data = tibble(Date=as.Date(as.character(df_raw$Date),
                                  format="%Y%m%d"),
                     Qm3s=df_raw$Qls * 1E-3,
                     df_raw[-1:-2],
                     code=code)

    return (df_data)
}

# Example
# df_data = extract_data(
#     "/home/louis/Documents/bouleau/INRAE/CDD_stationnarite/data",
#     '',
#     c('H5920011_HYDRO_QJM.txt', 'K4470010_HYDRO_QJM.txt'))