diff --git a/.gitignore b/.gitignore index 641b0dea298e87efa418fd5aaf3c497dea521903..3bfcaa2eef898011350385496105bec1d90c1c77 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ src/*.o src/*.so src/*.dll +src/*.mod src/symbols.rds inst/doc /doc/ diff --git a/NAMESPACE b/NAMESPACE index 7207a4fee597811b6f5f7ba0b930ea1fb549d992..c0829f4fcf2859939d1864058226a648f5bd24de 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -35,6 +35,7 @@ export(mixr) export(quick_transfr) export(rapriori) export(rsimilarity) +export(rsimilarity_model) export(uh) export(velocity) export(weightr) diff --git a/R/hdist.R b/R/hdist.R index e45b033cf9a9f4e17592da051528fdba785151a5..2d12e3dae85c3fc9210e1c554bb87b62cc1be3bf 100644 --- a/R/hdist.R +++ b/R/hdist.R @@ -202,8 +202,8 @@ call_gdist <- function(pts1, pts2, proj, intersect, rescale, diag, parallel, cor } }else{ # Compute the Ghosh distance matrix between all points - .Fortran("gdist", coord1=st_coordinates(pts1), coord2=st_coordinates(pts2), n1=length(pts1), n2=length(pts2), - proj=proj, rescale=rescale, diag=diag, nthreads=as.integer(cores), mdist=0)$mdist + .Call("c_gdist", coord1=st_coordinates(pts1), coord2=st_coordinates(pts2), + proj=proj, rescale=rescale, diag=diag, nthreads=as.integer(cores)) } } diff --git a/R/mixr.R b/R/mixr.R index 25da1e81094318afbcc126bbe4a831cf94182e44..6b72635437c5060b2f42f176bfb94a8b3eea0eb0 100644 --- a/R/mixr.R +++ b/R/mixr.R @@ -5,7 +5,7 @@ #' @name mixr #' @param obs "transfR" object of the gauged catchments #' @param sim "transfR" object of the ungauged catchments -#' @param mdist the distance matrix between gauged and ungauged catchments, as computed by +#' @param mdist the geographical distance matrix between gauged and ungauged catchments, as computed by #' the function \link{hdist}, used for donor selection #' @param distance the method to use for computing distance matrix if \code{mdist} is not provided. #' Possible values are "ghosh", "rghosh", "points", "centroids", "combined" as available @@ -17,15 +17,18 @@ #' @param weightC weight given to the distance between centroids if \code{distance} is "combined" #' (see the function \link{hdist}) #' @param similarity a hydrological similarity matrix between gauged and ungauged catchments -#' used for donor weighting (\code{1/mdist^power} is used if not provided). Several predictors +#' used for donor weighting (\code{1/mdist^power[1]} is used if not provided). Several predictors #' of hydrological similarity can be provided as a list of squared matrices in order to build #' a multivariate model for predicting similarity between a gauged and an ungauged catchment (see -#' \code{predictors} of the \link{rsimilarity} functions). +#' \code{predictors} of the \link{rsimilarity_model} function). +#' @param FUN either a function or a character string specifying the name of a predifined function to quantify the similarity +#' of two net rainfall time series. See \link{rsimilarity} for more details #' @param parallel logical indicating if the computation should be parallelised #' @param cores the number of cores to use for parallel execution if \code{parallel} is TRUE. #' If not specified, the number of cores is set to the value of \code{parallel::detectCores()} -#' @param power exponent applied in the inverse distance weighting strategy as defined by the function -#' \link{weightr} +#' @param power exponents. The first vector value is used in the inverse distance weighting strategy +#' (as defined by the function \link{weightr}) and the second vector value is used during computation +#' of Rn similarities (as defined by the function \link{rsimilarity_model}) #' @param ndonors maximum number of catchments to be used to simulate discharge of an ungauged #' catchment as defined by the function \link{weightr} #' @param donors list of vectors indicating the catchment IDs from which donors are selected for each ungauged catchments. @@ -33,7 +36,7 @@ #' @param maxdist maximum distance between a gauged and an ungauged catchment to allow the net rainfall #' to be transferred. This threshold is applied on the \code{mdist} distance matrix. If no units is provided, #' \code{maxdist} is assumed to be in [m]. -#' @param min_ndonors minimum number of gauged catchments to start using \link{rsimilarity} to build a similarity model +#' @param min_ndonors minimum number of gauged catchments to start using \link{rsimilarity_model} to build a similarity model #' of Rn time series for the weighting strategy. So this argument is only used when \code{similarity} is a list of matrices. #' It cannot be less than 3. If less than \code{min_ndonors} donors are found inverse distance weighting #' is applied using \code{mdist} @@ -46,7 +49,7 @@ #' in the \code{sim} object called "RnDonor", "Idonor" and "Wdonor" describing the net rainfall, the id and #' the weight of the donor catchments respectively #' @param verbose boolean indicating if information messages should be written to the console -#' @param ... other arguments to be passed to \link{rsimilarity} if \code{similarity} is a list of matrices +#' @param ... other arguments to be passed to \link{rsimilarity_model} if \code{similarity} is a list of matrices #' @return The \code{sim} object incremented by the new computed attributes. #' @details This function is a wrapper function for \link{hdist} and \link{weightr} to directly estimate #' the net rainfall on a set of ungauged catchments (\code{sim}) from a set of gauged catchments (\code{obs}). @@ -56,8 +59,8 @@ #' \deqn{R_n^i =\Sigma_{j=1}^{ndonors} R_n^j \cdot \lambda_j} #' where \eqn{\lambda_j} are defined by an inverse distance weighting function (see \link{weightr}). #' If \code{similarity} is a list of predictors of hydrological similarity, then \eqn{\lambda_j} -#' are provided by \link{rsimilarity}. -#' @seealso \link{hdist}, \link{weightr}, \link{rsimilarity} +#' are provided by \link{rsimilarity_model}. +#' @seealso \link{hdist}, \link{weightr}, \link{rsimilarity_model}, \link{rsimilarity} #' @examples #' \donttest{data(Oudon) #' object <- as_transfr(st = Oudon$obs, hl = Oudon$hl) @@ -72,9 +75,10 @@ #' object <- convolution(object, save_donor = FALSE) #' plot(object, i = 1, attribute = c("Qobs", "Qsim"))} #' @importFrom stats var +#' @importFrom units drop_units set_units #' @export -mixr <- function(obs, sim, mdist, distance = "rghosh", gres = 5, weightO = 0.8, weightC = 0.2, similarity, - parallel = FALSE, cores = NULL, power = 1, ndonors = 5, donors = NULL, maxdist = 50e3, +mixr <- function(obs, sim, mdist, distance = "rghosh", gres = 5, weightO = 0.8, weightC = 0.2, similarity, FUN="invRMSE", + parallel = FALSE, cores = NULL, power = c(1, 0.5), ndonors = 5, donors = NULL, maxdist = 50e3, min_ndonors = 3, flexible_donor = TRUE, cv = FALSE, save_donor = FALSE, verbose = TRUE, ...){ if(cv){ if(missing(sim)){ @@ -101,8 +105,15 @@ mixr <- function(obs, sim, mdist, distance = "rghosh", gres = 5, weightO = 0.8, if(length(donors)!= dim(sim$st)[2]) stop(paste0("The length of 'donors' list is ",length(donors)," but the number of ungauged catchments is ",dim(sim$st)[2],".")) if(any(sapply(donors,FUN = function(x) !all(x %in% 1:dim(obs$st)[2])))) stop("The value of 'donors' can not be matched to a gauged catchment.") } + + #--- Parallelisation + if(parallel & (missing(cores)|is.null(cores))){ + cores <- parallel::detectCores() + }else{ + cores <- 1 + } - #--- Distance matrix beetween sim and obs + #--- Distance matrix between sim and obs if(!missing(mdist)){ if(!(dim(mdist)[1]==dim(obs$st)[2] & dim(mdist)[2]==dim(sim$st)[2])) stop(paste0("The dimension of 'mdist' is (",dim(mdist)[1],",",dim(mdist)[2],") but (",dim(sim$st)[2],",",dim(obs$st)[2], ") was expected.")) }else{ @@ -110,7 +121,7 @@ mixr <- function(obs, sim, mdist, distance = "rghosh", gres = 5, weightO = 0.8, mdist <- hdist(x=obs, y=sim, method=distance, gres=gres, weightO=weightO, weightC=weightC, parallel=parallel, cores=cores, verbose=verbose) } - if(missing(similarity)){similarity <- units::drop_units(1/mdist^power)}else{ + if(missing(similarity)){similarity <- units::drop_units(1/mdist^power[1])}else{ if(is.matrix(similarity) & any(dim(mdist)!=dim(similarity))) stop("The dimensions of 'similarity' should be similar to 'mdist'.") if(is.list(similarity) & any(sapply(lapply(similarity,FUN=dim),FUN=function(x) any(dim(x)!=dim(mdist))))) stop("The dimensions of each 'similarity' matrices should be similar to 'mdist'.") } @@ -151,9 +162,13 @@ mixr <- function(obs, sim, mdist, distance = "rghosh", gres = 5, weightO = 0.8, if(is.na(no_variance)) no_variance <- TRUE nsimilarities <- max(apply(RnInv, MARGIN=1, FUN=function(x) sum(!is.na(x)))) # How many similarity metrics can be computed if(length(idonors)>=min_ndonors & length(idonors)>=3 & !no_variance & nsimilarities>=min_ndonors){ #Need to have at least 3 catchments to have several Rn similarities - similarity_model <- rsimilarity(Rn=RnInv, + similarity_model <- rsimilarity_model(Rn=RnInv, + FUN=FUN, + power=power[2], predictors=predictors, newpredictors=newpredictors, + parallel=parallel, + cores=cores, ...) similarities <- similarity_model$similarity # Save description of the similarity model @@ -170,13 +185,13 @@ mixr <- function(obs, sim, mdist, distance = "rghosh", gres = 5, weightO = 0.8, " km. Considering the gauging period, only ", nsimilarities," Rn similarities can be calculated.", "\nInverse Distance Weighting using 'mdist' will be used by default.")) } - similarities <- units::drop_units(1/distances[idonors]^power) + similarities <- units::drop_units(1/distances[idonors]^power[1]) } }else{ if(cv){similarities <- similarity[-i,i][idonors]}else{similarities <- similarity[idonors,i]} } if(verbose & is.list(similarity)) cat("### Weighting Rn of each donor catchments ###\n") - weights <- weightr(Rn=RnInv, distances=distances[idonors], similarities=similarities, ndonors=ndonors, power=power, flexible_donor=flexible_donor) + weights <- weightr(Rn=RnInv, distances=distances[idonors], similarities=similarities, ndonors=ndonors, power=power[1], flexible_donor=flexible_donor) sim$st$RnSim[,i] <- apply(RnInv*weights,MARGIN = 1,FUN = function(x){if(all(is.na(x))){NA}else{sum(x, na.rm = T)}}) } if(save_donor){ diff --git a/R/rsimilarity.R b/R/rsimilarity.R index 5553a479ee16716f672fe28a1744700593879ad8..f7eff7de14abeed0bc25236d8ce5176b93e086e2 100644 --- a/R/rsimilarity.R +++ b/R/rsimilarity.R @@ -1,28 +1,16 @@ -#' Prediction of hydrological similarity to drive transfer of hydrograph to ungauged catchments +#' Similarity of net rainfall time series #' -#' Quantify the similarity of net rainfall and model this hydrological similarity from catchment -#' similarity predictors +#' Calculate the similarity of net rainfall time series #' @name rsimilarity #' @param Rn net rainfall matrix of gauged catchments (rows for time index, and columns for catchment index) -#' @param predictors a list of at least two squared matrices that could be used to predict similarity of net rainfall. -#' The matrices should be squared to describe the similarity of each pair of gauged catchments. High values should -#' indicate high probability of similarity in net rainfall -#' @param newpredictors a matrix or a data.frame of predictors, to be used as new input to the model being built, -#' and from which we want to predict net rainfall similarity. Typically, a similarity between an ungauged catchment -#' and several gauged catchments that we want to weight as donors. Each column should correspond to each \code{predictors}, -#' and each row should correspond to each pair of catchments analysed. -#' @param FUN a function that will be used to quantify the similarity of two net rainfall time series. High values should -#' indicate high probability of similarity in net rainfall. Default is the inverse of root mean square error \code{invRMSE}, -#' but \code{KGE} (Kling–Gupta efficiency) can also be used -#' @param power exponent applied on \code{Rn} time series when computing \code{Rn} similarity (generally between -1 and 1, -#' a lower value emphasising the similarity of low flows). Default value is 0.5 -#' @param symmetrize a function to combine \code{FUN(Rn[,i],Rn[,j])} and \code{FUN(Rn[,j],Rn[,i])} into one similarity value -#' (typically if \code{FUN=KGE}, \code{KGE(Rn[,i],Rn[,j])!=KGE(Rn[,j],Rn[,i])}). Default is mean -#' @param model the method to estimate similarity of \code{Rn} from \code{descriptors}. This must be one of "lm", "glmnet" -#' @param args_glmnet list of arguments to be passed to \code{glmnet::cv.glmnet()} -#' @param verbose boolean indicating if warning messages should be written to the console -#' @param seed integer value to be used by \code{set.seed()} for reproducible results. Used only if \code{model="glmnet"} -#' @seealso \link{mixr} +#' @param FUN either a function or a character string specifying the name of a predifined function to quantify the similarity of two net rainfall time series. +#' Higher values should indicate a higher probability of similarity. Predefined functions include: "KGE" (Kling–Gupta efficiency), "invRMSE" (inverse of root mean square error), +#' and "RMSE" (root mean square error, should not be directly used as a similarity metric). The default is "invRMSE" +#' @param parallel logical indicating whether the computation should be parallelised +#' @param cores the number of cores to use for parallel execution if \code{parallel} is TRUE. +#' If not specified, the number of cores is set to the value of \code{parallel::detectCores()} +#' @return A square matrix of the similarity metric between each pair of catchments +#' @seealso \link{rsimilarity_model} #' @examples #' \donttest{data(Oudon) #' obs <- as_transfr(st = Oudon$obs, hl = Oudon$hl) @@ -31,153 +19,35 @@ #' obs <- lagtime(obs) #' obs <- rapriori(obs) #' obs <- inversion(obs, parallel = TRUE, cores=2) -#' mdist1 <- hdist(x = obs, y = obs, method = "rghosh", parallel = c(FALSE,TRUE), cores=2) -#' mdist2 <- mdist1^2 -#' rghosh1 <- seq(1000, 5000, by=100) -#' rghosh2 <- rghosh1^2 -#' res <- rsimilarity(Rn = obs$st$RnInv, -#' predictors = list(rghosh1=1/mdist1, rghosh2=1/mdist2), -#' newpredictors = data.frame(rghosh1=1/rghosh1, rghosh2=1/rghosh2), -#' seed=1234) -#' plot(rghosh1, res$similarity, ylab = "Predicted Rn similarity") -#' plot(rghosh2, res$similarity, ylab = "Predicted Rn similarity") -#' # rsimilarity() is automatically called by mixr() if mdist is a list -#' obs <- mixr(obs = obs, mdist = mdist1, -#' similarity = list(rghosh1=1/mdist1, rghosh2=1/mdist2), -#' parallel = TRUE, cores=2, cv = TRUE, save_donor = TRUE) -#' obs$similarity_models -#' obs <- convolution(obs) -#' plot(obs, i = 1, attribute = c("Qobs", "Qsim"))} +#' msim <- rsimilarity(Rn = obs$st$RnInv, FUN="KGE", parallel = TRUE, cores=2)} #' @importFrom units drop_units -#' @importFrom glmnet glmnet cv.glmnet -#' @importFrom stats lm predict complete.cases cor as.formula #' @export -rsimilarity <- function(Rn, predictors, newpredictors, FUN=invRMSE, power=0.5, symmetrize=mean, model="glmnet", - args_glmnet=list(s="lambda.min", lower.limits=0), verbose = TRUE, seed=NULL){ - - # Checks inputs +rsimilarity <- function(Rn, FUN="invRMSE", parallel=FALSE, cores=NULL){ + + # Checks if(inherits(Rn, "units")) Rn <- units::drop_units(Rn) - if(!inherits(predictors, "list")) predictors <- list(predictors) - if(any(!sapply(predictors,FUN=is.matrix))) stop("predictors should be a list with 2 or more matrices") - if(is.null(ncol(newpredictors))) stop("newpredictors should be a matrix or a data.frame with 2 or more columns") - if(length(predictors)<=1) stop("predictors should be a list with 2 or more matrices") - if(ncol(newpredictors)<=1) stop("newpredictors should be a matrix or a data.frame with 2 or more columns") - if(any(ncol(Rn)!=sapply(predictors,FUN=ncol)) | any(ncol(Rn)!=sapply(predictors,FUN=nrow))){ - stop(paste("All predictors elements should be a square matrix of",ncol(Rn),"x",ncol(Rn),"to be consistent with Rn"))} - if(length(predictors) != ncol(newpredictors)) stop("length(predictors) and ncol(newpredictors) should be the same") - - # Provide names of distances if not provided - if(is.null(names(predictors))) names(predictors) <- paste0("predictor",1:length(predictors)) - if(is.null(names(newpredictors))) names(newpredictors) <- paste0("predictor",1:length(newpredictors)) - if(any(names(predictors) != colnames(newpredictors))) stop("predictors and newpredictors do not have the same names") - - # Compute similarity matrix index of Rn - if(verbose) cat("\n### Computing similarity matrix of Rn ###\n") - n <- ncol(Rn) - mat_similarity <- matrix(NA, n, n) - mat_similarity <- sapply(1:n, function(i) { - sapply(1:n, function(j) FUN(Rn[, i]^power, Rn[, j]^power)) - }) - - # Convert matrix to vector for Rn similarity - if(verbose) cat("### Preparing data for the similarity model ###\n") - tmat_similarity <- t(mat_similarity) - dat_similarity <- data.frame(ij=mat_similarity[upper.tri(mat_similarity, diag = FALSE)],ji=tmat_similarity[upper.tri(tmat_similarity, diag = FALSE)]) - vec_similarity <- apply(dat_similarity, MARGIN=1, FUN=symmetrize) - - # Convert each matrix of predictors into vectors, to prepare data for linear model - model_data <- matrix(nrow=length(vec_similarity), ncol=length(predictors), dimnames=list(NULL,names(predictors))) - for(i in 1:length(predictors)){ - mat_pred <- predictors[[i]] - tmat_pred <- t(mat_pred) - dat_pred <- data.frame(ij=mat_pred[upper.tri(mat_pred, diag = FALSE)],ji=tmat_pred[upper.tri(tmat_pred, diag = FALSE)]) - model_data[,i] <- apply(dat_pred, MARGIN=1, FUN=symmetrize) - } - - # Remove NA or Inf value - to_remove <- apply(model_data, MARGIN=1, FUN=function(x) any(is.infinite(x) | is.na(x))) - to_remove <- to_remove | is.infinite(vec_similarity) | is.na(vec_similarity) - model_data <- model_data[!to_remove,] - vec_similarity <- vec_similarity[!to_remove] - - if(verbose & nrow(model_data) < 10 * ncol(model_data)) warning(paste0("Regression is done over a limited number of similarities (", - nrow(model_data)," observations for ",ncol(model_data)," predictors)")) - - if(verbose) cat("### Building the similarity model ###\n") - if(model == "glmnet"){ - if (!is.null(seed)) set.seed(seed) - # Linear model to predict Rn similarity from predictors using a GLM with lasso or elasticnet regularization - valid_args <- c(names(formals(glmnet::glmnet)),names(formals(glmnet::cv.glmnet))) - glm_similarity <- do.call(glmnet::cv.glmnet, args=c(list(x=model_data, y=vec_similarity), args_glmnet[names(args_glmnet) %in% valid_args])) - if(is.character(args_glmnet[["s"]])){s <- glm_similarity[[args_glmnet[["s"]]]]}else{s <- args_glmnet[["s"]]} - similarity <- as.vector(stats::predict(glm_similarity, newx=as.matrix(newpredictors), s=s)) - coef <- as.matrix(coef(glm_similarity, s=s))[-1,] - # Infinite similarity of predictors leads to infinite predicted similarity (instead of NA) - inf_similarity <- apply(newpredictors, MARGIN=1, FUN = function(x) any(is.infinite(x))) - if(any(inf_similarity)) similarity[inf_similarity] <- Inf - # Compute R2 manually - y <- vec_similarity - y_pred <- stats::predict(glm_similarity, newx = model_data, s = s) - y_mean <- mean(y) - ss_total <- sum((y - y_mean)^2) - ss_residual <- sum((y - y_pred)^2) - r2 <- 1 - (ss_residual / ss_total) - # Indicators of the distribution of the weights - n50 <- which.min(cumsum(sort(similarity/sum(similarity), decreasing=TRUE)) <= 0.50) - n75 <- which.min(cumsum(sort(similarity/sum(similarity), decreasing=TRUE)) <= 0.75) - n90 <- which.min(cumsum(sort(similarity/sum(similarity), decreasing=TRUE)) <= 0.90) - # Gather results - results <- list(similarity=similarity, coef=coef, r2=r2, ncatch=as.integer(ncol(Rn)), ndist=length(vec_similarity), n50=n50, n75=n75, n90=n90) + if(!(inherits(Rn, "matrix") | inherits(Rn, "data.frame"))) stop("Rn should have at least two columns") + if(ncol(Rn) < 2) stop("Rn should have at least two columns") + + # Parallelisation + if(parallel & (missing(cores)|is.null(cores))){ + cores <- parallel::detectCores() + }else{ + cores <- 1 } - - if(model == "lm"){ - # Linear model to predict Rn similarity from predictors using lm - lm_data <- cbind(Rn_similarity=vec_similarity, model_data) - formula <- stats::as.formula(paste(colnames(lm_data)[1],"~", paste(colnames(lm_data)[-1], collapse = " + "))) - lm_similarity <- stats::lm(formula, data=as.data.frame(lm_data)) - similarity <- as.vector(stats::predict(lm_similarity, newdata=as.data.frame(newpredictors))) - coef <- as.matrix(coef(lm_similarity))[-1,] - r2 <- summary(lm_similarity)$r.squared - pvalue <- summary(lm_similarity)$coefficients[-1, "Pr(>|t|)"] - # Infinite similarity of predictors leads to infinite predicted similarity (instead of NA) - inf_similarity <- apply(newpredictors, MARGIN=1, FUN = function(x) any(is.infinite(x))) - if(any(inf_similarity)) similarity[inf_similarity] <- Inf - # Indicators of the distribution of the weights - n50 <- which.min(cumsum(sort(similarity/sum(similarity), decreasing=TRUE)) <= 0.50) - n75 <- which.min(cumsum(sort(similarity/sum(similarity), decreasing=TRUE)) <= 0.75) - n90 <- which.min(cumsum(sort(similarity/sum(similarity), decreasing=TRUE)) <= 0.90) - # Gather results - results <- list(similarity=similarity, coef=coef, r2=r2, ncatch=ncol(Rn), ndist=length(vec_similarity), n50=n50, n75=n75, n90=n90, pvalue=pvalue) + + # Initialise msim + n <- ncol(Rn) + msim <- matrix(NA, n, n) + + # Compute similarity matrix + if(inherits(FUN, "function")){ + msim <- sapply(1:n, function(i) {sapply(1:n, function(j) FUN(Rn[, j], Rn[, i]))}) + }else{ + valid_FUN <- c("KGE", "RMSE", "invRMSE") + FUN <- match.arg(FUN, choices = valid_FUN) + crit <- match(FUN, valid_FUN) + msim <- .Call("c_similarity", Rn=Rn, crit=crit, nthreads=cores) } - - return(results) -} - -KGE <- function(obs, sim) { - # Remove missing values - valid <- !is.na(obs) & !is.na(sim) - if (!any(valid)) return(NA) - obs <- obs[valid] - sim <- sim[valid] - - # KGE components - r <- stats::cor(obs, sim, method = c("pearson")) - beta <- mean(sim) / mean(obs) - gamma <- (sd(sim) / mean(sim)) / (sd(obs) / mean(obs)) - - # Final KGE - kge <- 1 - sqrt((r - 1)^2 + (beta - 1)^2 + (gamma - 1)^2) - return(kge) -} - -invRMSE <- function(obs, sim) { - # Remove missing values - valid <- !is.na(obs) & !is.na(sim) - if (!any(valid)) return(NA) - obs <- obs[valid] - sim <- sim[valid] - - # RMSE - rmse <- sqrt(mean((sim - obs)^2)) - return(1 / rmse) + return(msim) } diff --git a/R/rsimilarity_model.R b/R/rsimilarity_model.R new file mode 100644 index 0000000000000000000000000000000000000000..e9a6fde8f20fd0900cfe4fc032c6d9f70cadcc8c --- /dev/null +++ b/R/rsimilarity_model.R @@ -0,0 +1,174 @@ +#' Prediction of hydrological similarity to drive transfer of hydrograph to ungauged catchments +#' +#' Quantify the similarity of net rainfall and model this hydrological similarity from catchment +#' similarity predictors +#' @name rsimilarity_model +#' @param Rn net rainfall matrix of gauged catchments (rows for time index, and columns for catchment index) +#' @param msim similarity matrix of net rainfall time series (as produced by \link{rsimilarity}) +#' @param predictors a list of at least two squared matrices that could be used to predict similarity of net rainfall. +#' The matrices should be squared to describe the similarity of each pair of gauged catchments. High values should +#' indicate high probability of similarity in net rainfall +#' @param newpredictors a matrix or a data.frame of predictors, to be used as new input to the model being built, +#' and from which we want to predict net rainfall similarity. Typically, a similarity between an ungauged catchment +#' and several gauged catchments that we want to weight as donors. Each column should correspond to each \code{predictors}, +#' and each row should correspond to each pair of catchments analysed. +#' @param FUN either a function or a character string specifying the name of a predifined function to quantify the similarity +#' of two net rainfall time series. See \link{rsimilarity} for more details +#' @param power exponent applied on \code{Rn} time series when computing \code{Rn} similarity (generally between -1 and 1, +#' a lower value emphasising the similarity of low flows). Default value is 0.5 +#' @param symmetrize a function to combine \code{FUN(Rn[,i],Rn[,j])} and \code{FUN(Rn[,j],Rn[,i])} into one similarity value +#' (typically if \code{FUN=KGE}, \code{KGE(Rn[,i],Rn[,j])!=KGE(Rn[,j],Rn[,i])}). Default is mean +#' @param model the method to estimate similarity of \code{Rn} from \code{descriptors}. This must be one of "lm", "glmnet" +#' @param args_glmnet list of arguments to be passed to \code{glmnet::cv.glmnet()} +#' @param parallel logical indicating whether the computation of Rn similarities should be parallelised +#' @param cores the number of cores to use for parallel execution if \code{parallel} is TRUE. +#' If not specified, the number of cores is set to the value of \code{parallel::detectCores()} +#' @param verbose boolean indicating if warning messages should be written to the console +#' @param seed integer value to be used by \code{set.seed()} for reproducible results. Used only if \code{model="glmnet"} +#' @seealso \link{rsimilarity}, \link{mixr} +#' @examples +#' \donttest{data(Oudon) +#' obs <- as_transfr(st = Oudon$obs, hl = Oudon$hl) +#' obs <- velocity(obs, method = "loire2016") +#' obs <- uh(obs) +#' obs <- lagtime(obs) +#' obs <- rapriori(obs) +#' obs <- inversion(obs, parallel = TRUE, cores=2) +#' mdist1 <- hdist(x = obs, y = obs, method = "rghosh", parallel = c(FALSE,TRUE), cores=2) +#' mdist2 <- mdist1^2 +#' rghosh1 <- seq(1000, 5000, by=100) +#' rghosh2 <- rghosh1^2 +#' res <- rsimilarity_model(Rn = obs$st$RnInv, +#' predictors = list(rghosh1=1/mdist1, rghosh2=1/mdist2), +#' newpredictors = data.frame(rghosh1=1/rghosh1, rghosh2=1/rghosh2), +#' seed=1234) +#' plot(rghosh1, res$similarity, ylab = "Predicted Rn similarity") +#' plot(rghosh2, res$similarity, ylab = "Predicted Rn similarity") +#' # rsimilarity_model() is automatically called by mixr() if mdist is a list +#' obs <- mixr(obs = obs, mdist = mdist1, +#' similarity = list(rghosh1=1/mdist1, rghosh2=1/mdist2), +#' parallel = TRUE, cores=2, cv = TRUE, save_donor = TRUE) +#' obs$similarity_models +#' obs <- convolution(obs) +#' plot(obs, i = 1, attribute = c("Qobs", "Qsim"))} +#' @importFrom units drop_units +#' @importFrom glmnet glmnet cv.glmnet +#' @importFrom stats lm predict complete.cases cor as.formula +#' @export +rsimilarity_model <- function(Rn, msim, predictors, newpredictors, FUN="invRMSE", power=0.5, symmetrize=mean, model="glmnet", + args_glmnet=list(s="lambda.min", lower.limits=0), parallel = FALSE, cores = NULL, verbose = TRUE, seed=NULL){ + + # Checks inputs: predictors and newpredictors + if(!inherits(predictors, "list")) predictors <- list(predictors) + if(any(!sapply(predictors,FUN=is.matrix))) stop("predictors should be a list with 2 or more matrices") + if(is.null(ncol(newpredictors))) stop("newpredictors should be a matrix or a data.frame with 2 or more columns") + if(length(predictors)<=1) stop("predictors should be a list with 2 or more matrices") + if(ncol(newpredictors)<=1) stop("newpredictors should be a matrix or a data.frame with 2 or more columns") + if(length(predictors) != ncol(newpredictors)) stop("length(predictors) and ncol(newpredictors) should be the same") + + # Provide names of distances if not provided + if(is.null(names(predictors))) names(predictors) <- paste0("predictor",1:length(predictors)) + if(is.null(names(newpredictors))) names(newpredictors) <- paste0("predictor",1:length(newpredictors)) + if(any(names(predictors) != colnames(newpredictors))) stop("predictors and newpredictors do not have the same names") + + # Compute similarity matrix index of Rn + if(missing(msim)){ + # Checks inputs: Rn + if(missing(Rn)){ + stop("Need to provide either Rn or msim") + }else{ + if(inherits(Rn, "units")) Rn <- units::drop_units(Rn) + if(any(ncol(Rn)!=sapply(predictors,FUN=ncol)) | any(ncol(Rn)!=sapply(predictors,FUN=nrow))){ + stop(paste("All predictors elements should be a square matrix of",ncol(Rn),"x",ncol(Rn),"to be consistent with Rn"))} + } + # Parallelisation + if(parallel & (missing(cores)|is.null(cores))){ + cores <- parallel::detectCores() + }else{ + cores <- 1 + } + if(verbose) cat("\n### Computing similarity matrix of Rn ###\n") + msim <- rsimilarity(Rn=Rn^power, FUN=FUN, parallel= parallel, cores=cores) + }else{ + # Checks inputs: msim + if(!inherits(msim, "matrix")) stop("msim should be a matrix") + if(nrow(msim) != ncol(msim)) stop("msim should be a square matrix") + if(any(nrow(msim) != nrow(predictors))) stop("msim should be a square matrix") + if(any(ncol(msim)!=sapply(predictors,FUN=ncol)) | any(ncol(msim)!=sapply(predictors,FUN=nrow))){ + stop(paste("msim should be a square matrix with dimensions similar to predictors"))} + } + + + # Convert matrix to vector for Rn similarity + if(verbose) cat("### Preparing data for the similarity model ###\n") + tmsim <- t(msim) + dat_similarity <- data.frame(ij=msim[upper.tri(msim, diag = FALSE)],ji=tmsim[upper.tri(tmsim, diag = FALSE)]) + vec_similarity <- apply(dat_similarity, MARGIN=1, FUN=symmetrize) + + # Convert each matrix of predictors into vectors, to prepare data for linear model + model_data <- matrix(nrow=length(vec_similarity), ncol=length(predictors), dimnames=list(NULL,names(predictors))) + for(i in 1:length(predictors)){ + mat_pred <- predictors[[i]] + tmat_pred <- t(mat_pred) + dat_pred <- data.frame(ij=mat_pred[upper.tri(mat_pred, diag = FALSE)],ji=tmat_pred[upper.tri(tmat_pred, diag = FALSE)]) + model_data[,i] <- apply(dat_pred, MARGIN=1, FUN=symmetrize) + } + + # Remove NA or Inf value + to_remove <- apply(model_data, MARGIN=1, FUN=function(x) any(is.infinite(x) | is.na(x))) + to_remove <- to_remove | is.infinite(vec_similarity) | is.na(vec_similarity) + model_data <- model_data[!to_remove,] + vec_similarity <- vec_similarity[!to_remove] + + if(verbose & nrow(model_data) < 10 * ncol(model_data)) warning(paste0("Regression is done over a limited number of similarities (", + nrow(model_data)," observations for ",ncol(model_data)," predictors)")) + + if(verbose) cat("### Building the similarity model ###\n") + if(model == "glmnet"){ + if (!is.null(seed)) set.seed(seed) + # Linear model to predict Rn similarity from predictors using a GLM with lasso or elasticnet regularization + valid_args <- c(names(formals(glmnet::glmnet)),names(formals(glmnet::cv.glmnet))) + glm_similarity <- do.call(glmnet::cv.glmnet, args=c(list(x=model_data, y=vec_similarity), args_glmnet[names(args_glmnet) %in% valid_args])) + if(is.character(args_glmnet[["s"]])){s <- glm_similarity[[args_glmnet[["s"]]]]}else{s <- args_glmnet[["s"]]} + similarity <- as.vector(stats::predict(glm_similarity, newx=as.matrix(newpredictors), s=s)) + coef <- as.matrix(coef(glm_similarity, s=s))[-1,] + # Infinite similarity of predictors leads to infinite predicted similarity (instead of NA) + inf_similarity <- apply(newpredictors, MARGIN=1, FUN = function(x) any(is.infinite(x))) + if(any(inf_similarity)) similarity[inf_similarity] <- Inf + # Compute R2 manually + y <- vec_similarity + y_pred <- stats::predict(glm_similarity, newx = model_data, s = s) + y_mean <- mean(y) + ss_total <- sum((y - y_mean)^2) + ss_residual <- sum((y - y_pred)^2) + r2 <- 1 - (ss_residual / ss_total) + # Indicators of the distribution of the weights + n50 <- which.min(cumsum(sort(similarity/sum(similarity), decreasing=TRUE)) <= 0.50) + n75 <- which.min(cumsum(sort(similarity/sum(similarity), decreasing=TRUE)) <= 0.75) + n90 <- which.min(cumsum(sort(similarity/sum(similarity), decreasing=TRUE)) <= 0.90) + # Gather results + results <- list(similarity=similarity, coef=coef, r2=r2, ncatch=as.integer(ncol(msim)), ndist=length(vec_similarity), n50=n50, n75=n75, n90=n90) + } + + if(model == "lm"){ + # Linear model to predict Rn similarity from predictors using lm + lm_data <- cbind(Rn_similarity=vec_similarity, model_data) + formula <- stats::as.formula(paste(colnames(lm_data)[1],"~", paste(colnames(lm_data)[-1], collapse = " + "))) + lm_similarity <- stats::lm(formula, data=as.data.frame(lm_data)) + similarity <- as.vector(stats::predict(lm_similarity, newdata=as.data.frame(newpredictors))) + coef <- as.matrix(coef(lm_similarity))[-1,] + r2 <- summary(lm_similarity)$r.squared + pvalue <- summary(lm_similarity)$coefficients[-1, "Pr(>|t|)"] + # Infinite similarity of predictors leads to infinite predicted similarity (instead of NA) + inf_similarity <- apply(newpredictors, MARGIN=1, FUN = function(x) any(is.infinite(x))) + if(any(inf_similarity)) similarity[inf_similarity] <- Inf + # Indicators of the distribution of the weights + n50 <- which.min(cumsum(sort(similarity/sum(similarity), decreasing=TRUE)) <= 0.50) + n75 <- which.min(cumsum(sort(similarity/sum(similarity), decreasing=TRUE)) <= 0.75) + n90 <- which.min(cumsum(sort(similarity/sum(similarity), decreasing=TRUE)) <= 0.90) + # Gather results + results <- list(similarity=similarity, coef=coef, r2=r2, ncatch=as.integer(ncol(msim)), ndist=length(vec_similarity), n50=n50, n75=n75, n90=n90, pvalue=pvalue) + } + + return(results) +} diff --git a/man/mixr.Rd b/man/mixr.Rd index 04b51ebecd8a839e1bb20aa9dc080567682bfc4d..edd8756c8ef8f92959ceaad1bf703ec362047cf5 100644 --- a/man/mixr.Rd +++ b/man/mixr.Rd @@ -13,9 +13,10 @@ mixr( weightO = 0.8, weightC = 0.2, similarity, + FUN = "invRMSE", parallel = FALSE, cores = NULL, - power = 1, + power = c(1, 0.5), ndonors = 5, donors = NULL, maxdist = 50000, @@ -32,7 +33,7 @@ mixr( \item{sim}{"transfR" object of the ungauged catchments} -\item{mdist}{the distance matrix between gauged and ungauged catchments, as computed by +\item{mdist}{the geographical distance matrix between gauged and ungauged catchments, as computed by the function \link{hdist}, used for donor selection} \item{distance}{the method to use for computing distance matrix if \code{mdist} is not provided. @@ -49,18 +50,22 @@ distance (see the function \link{hdist})} (see the function \link{hdist})} \item{similarity}{a hydrological similarity matrix between gauged and ungauged catchments -used for donor weighting (\code{1/mdist^power} is used if not provided). Several predictors +used for donor weighting (\code{1/mdist^power[1]} is used if not provided). Several predictors of hydrological similarity can be provided as a list of squared matrices in order to build a multivariate model for predicting similarity between a gauged and an ungauged catchment (see -\code{predictors} of the \link{rsimilarity} functions).} +\code{predictors} of the \link{rsimilarity_model} function).} + +\item{FUN}{either a function or a character string specifying the name of a predifined function to quantify the similarity +of two net rainfall time series. See \link{rsimilarity} for more details} \item{parallel}{logical indicating if the computation should be parallelised} \item{cores}{the number of cores to use for parallel execution if \code{parallel} is TRUE. If not specified, the number of cores is set to the value of \code{parallel::detectCores()}} -\item{power}{exponent applied in the inverse distance weighting strategy as defined by the function -\link{weightr}} +\item{power}{exponents. The first vector value is used in the inverse distance weighting strategy +(as defined by the function \link{weightr}) and the second vector value is used during computation +of Rn similarities (as defined by the function \link{rsimilarity_model})} \item{ndonors}{maximum number of catchments to be used to simulate discharge of an ungauged catchment as defined by the function \link{weightr}} @@ -72,7 +77,7 @@ If empty, the \code{ndonors} closest catchments are used} to be transferred. This threshold is applied on the \code{mdist} distance matrix. If no units is provided, \code{maxdist} is assumed to be in [m].} -\item{min_ndonors}{minimum number of gauged catchments to start using \link{rsimilarity} to build a similarity model +\item{min_ndonors}{minimum number of gauged catchments to start using \link{rsimilarity_model} to build a similarity model of Rn time series for the weighting strategy. So this argument is only used when \code{similarity} is a list of matrices. It cannot be less than 3. If less than \code{min_ndonors} donors are found inverse distance weighting is applied using \code{mdist}} @@ -90,7 +95,7 @@ the weight of the donor catchments respectively} \item{verbose}{boolean indicating if information messages should be written to the console} -\item{...}{other arguments to be passed to \link{rsimilarity} if \code{similarity} is a list of matrices} +\item{...}{other arguments to be passed to \link{rsimilarity_model} if \code{similarity} is a list of matrices} } \value{ The \code{sim} object incremented by the new computed attributes. @@ -108,7 +113,7 @@ of \code{ndonors} gauged catchments \eqn{j}: \deqn{R_n^i =\Sigma_{j=1}^{ndonors} R_n^j \cdot \lambda_j} where \eqn{\lambda_j} are defined by an inverse distance weighting function (see \link{weightr}). If \code{similarity} is a list of predictors of hydrological similarity, then \eqn{\lambda_j} -are provided by \link{rsimilarity}. +are provided by \link{rsimilarity_model}. } \examples{ \donttest{data(Oudon) @@ -125,5 +130,5 @@ object <- convolution(object, save_donor = FALSE) plot(object, i = 1, attribute = c("Qobs", "Qsim"))} } \seealso{ -\link{hdist}, \link{weightr}, \link{rsimilarity} +\link{hdist}, \link{weightr}, \link{rsimilarity_model}, \link{rsimilarity} } diff --git a/man/rsimilarity.Rd b/man/rsimilarity.Rd index 4062b7416dfdcdd086967b90d7a35ecc5ada45c3..4c571802b45ef1aba9410ee215463f1a73d86125 100644 --- a/man/rsimilarity.Rd +++ b/man/rsimilarity.Rd @@ -2,54 +2,27 @@ % Please edit documentation in R/rsimilarity.R \name{rsimilarity} \alias{rsimilarity} -\title{Prediction of hydrological similarity to drive transfer of hydrograph to ungauged catchments} +\title{Similarity of net rainfall time series} \usage{ -rsimilarity( - Rn, - predictors, - newpredictors, - FUN = invRMSE, - power = 0.5, - symmetrize = mean, - model = "glmnet", - args_glmnet = list(alpha = 0.5, s = "lambda.min", lower.limits = 0), - verbose = TRUE, - seed = NULL -) +rsimilarity(Rn, FUN = "invRMSE", parallel = FALSE, cores = NULL) } \arguments{ \item{Rn}{net rainfall matrix of gauged catchments (rows for time index, and columns for catchment index)} -\item{predictors}{a list of at least two squared matrices that could be used to predict similarity of net rainfall. -The matrices should be squared to describe the similarity of each pair of gauged catchments. High values should -indicate high probability of similarity in net rainfall} +\item{FUN}{either a function or a character string specifying the name of a predifined function to quantify the similarity of two net rainfall time series. +Higher values should indicate a higher probability of similarity. Predefined functions include: "KGE" (Kling–Gupta efficiency), "invRMSE" (inverse of root mean square error), +and "RMSE" (root mean square error, should not be directly used as a similarity metric). The default is "invRMSE"} -\item{newpredictors}{a matrix or a data.frame of predictors, to be used as new input to the model being built, -and from which we want to predict net rainfall similarity. Typically, a similarity between an ungauged catchment -and several gauged catchments that we want to weight as donors. Each column should correspond to each \code{predictors}, -and each row should correspond to each pair of catchments analysed.} +\item{parallel}{logical indicating whether the computation should be parallelised} -\item{FUN}{a function that will be used to quantify the similarity of two net rainfall time series. High values should -indicate high probability of similarity in net rainfall. Default is the inverse of root mean square error \code{invRMSE}, -but \code{KGE} (Kling–Gupta efficiency) can also be used} - -\item{power}{exponent applied on \code{Rn} time series when computing \code{Rn} similarity (generally between -1 and 1, -a lower value emphasising the similarity of low flows). Default value is 0.5} - -\item{symmetrize}{a function to combine \code{FUN(Rn[,i],Rn[,j])} and \code{FUN(Rn[,j],Rn[,i])} into one similarity value -(typically if \code{FUN=KGE}, \code{KGE(Rn[,i],Rn[,j])!=KGE(Rn[,j],Rn[,i])}). Default is mean} - -\item{model}{the method to estimate similarity of \code{Rn} from \code{descriptors}. This must be one of "lm", "glmnet"} - -\item{args_glmnet}{list of arguments to be passed to \code{glmnet::cv.glmnet()}} - -\item{verbose}{boolean indicating if warning messages should be written to the console} - -\item{seed}{integer value to be used by \code{set.seed()} for reproducible results. Used only if \code{model="glmnet"}} +\item{cores}{the number of cores to use for parallel execution if \code{parallel} is TRUE. +If not specified, the number of cores is set to the value of \code{parallel::detectCores()}} +} +\value{ +A square matrix of the similarity metric between each pair of catchments } \description{ -Quantify the similarity of net rainfall and model this hydrological similarity from catchment -similarity predictors +Calculate the similarity of net rainfall time series } \examples{ \donttest{data(Oudon) @@ -59,24 +32,8 @@ obs <- uh(obs) obs <- lagtime(obs) obs <- rapriori(obs) obs <- inversion(obs, parallel = TRUE, cores=2) -mdist1 <- hdist(x = obs, y = obs, method = "rghosh", parallel = c(FALSE,TRUE), cores=2) -mdist2 <- mdist1^2 -rghosh1 <- seq(1000, 5000, by=100) -rghosh2 <- rghosh1^2 -res <- rsimilarity(Rn = obs$st$RnInv, - predictors = list(rghosh1=1/mdist1, rghosh2=1/mdist2), - newpredictors = data.frame(rghosh1=1/rghosh1, rghosh2=1/rghosh2), - seed=1234) -plot(rghosh1, res$similarity, ylab = "Predicted Rn similarity") -plot(rghosh2, res$similarity, ylab = "Predicted Rn similarity") -# rsimilarity() is automatically called by mixr() if mdist is a list -obs <- mixr(obs = obs, mdist = mdist1, - similarity = list(rghosh1=1/mdist1, rghosh2=1/mdist2), - parallel = TRUE, cores=2, cv = TRUE, save_donor = TRUE) -obs$similarity_models -obs <- convolution(obs) -plot(obs, i = 1, attribute = c("Qobs", "Qsim"))} +msim <- rsimilarity(Rn = obs$st$RnInv, FUN="KGE", parallel = TRUE, cores=2)} } \seealso{ -\link{mixr} +\link{rsimilarity_model} } diff --git a/man/rsimilarity_model.Rd b/man/rsimilarity_model.Rd new file mode 100644 index 0000000000000000000000000000000000000000..2ec831e458ea77bb250006f5b19c3184646aa822 --- /dev/null +++ b/man/rsimilarity_model.Rd @@ -0,0 +1,91 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rsimilarity_model.R +\name{rsimilarity_model} +\alias{rsimilarity_model} +\title{Prediction of hydrological similarity to drive transfer of hydrograph to ungauged catchments} +\usage{ +rsimilarity_model( + Rn, + msim, + predictors, + newpredictors, + FUN = "invRMSE", + power = 0.5, + symmetrize = mean, + model = "glmnet", + args_glmnet = list(s = "lambda.min", lower.limits = 0), + parallel = FALSE, + cores = NULL, + verbose = TRUE, + seed = NULL +) +} +\arguments{ +\item{Rn}{net rainfall matrix of gauged catchments (rows for time index, and columns for catchment index)} + +\item{msim}{similarity matrix of net rainfall time series (as produced by \link{rsimilarity})} + +\item{predictors}{a list of at least two squared matrices that could be used to predict similarity of net rainfall. +The matrices should be squared to describe the similarity of each pair of gauged catchments. High values should +indicate high probability of similarity in net rainfall} + +\item{newpredictors}{a matrix or a data.frame of predictors, to be used as new input to the model being built, +and from which we want to predict net rainfall similarity. Typically, a similarity between an ungauged catchment +and several gauged catchments that we want to weight as donors. Each column should correspond to each \code{predictors}, +and each row should correspond to each pair of catchments analysed.} + +\item{FUN}{either a function or a character string specifying the name of a predifined function to quantify the similarity +of two net rainfall time series. See \link{rsimilarity} for more details} + +\item{power}{exponent applied on \code{Rn} time series when computing \code{Rn} similarity (generally between -1 and 1, +a lower value emphasising the similarity of low flows). Default value is 0.5} + +\item{symmetrize}{a function to combine \code{FUN(Rn[,i],Rn[,j])} and \code{FUN(Rn[,j],Rn[,i])} into one similarity value +(typically if \code{FUN=KGE}, \code{KGE(Rn[,i],Rn[,j])!=KGE(Rn[,j],Rn[,i])}). Default is mean} + +\item{model}{the method to estimate similarity of \code{Rn} from \code{descriptors}. This must be one of "lm", "glmnet"} + +\item{args_glmnet}{list of arguments to be passed to \code{glmnet::cv.glmnet()}} + +\item{parallel}{logical indicating whether the computation of Rn similarities should be parallelised} + +\item{cores}{the number of cores to use for parallel execution if \code{parallel} is TRUE. +If not specified, the number of cores is set to the value of \code{parallel::detectCores()}} + +\item{verbose}{boolean indicating if warning messages should be written to the console} + +\item{seed}{integer value to be used by \code{set.seed()} for reproducible results. Used only if \code{model="glmnet"}} +} +\description{ +Quantify the similarity of net rainfall and model this hydrological similarity from catchment +similarity predictors +} +\examples{ +\donttest{data(Oudon) +obs <- as_transfr(st = Oudon$obs, hl = Oudon$hl) +obs <- velocity(obs, method = "loire2016") +obs <- uh(obs) +obs <- lagtime(obs) +obs <- rapriori(obs) +obs <- inversion(obs, parallel = TRUE, cores=2) +mdist1 <- hdist(x = obs, y = obs, method = "rghosh", parallel = c(FALSE,TRUE), cores=2) +mdist2 <- mdist1^2 +rghosh1 <- seq(1000, 5000, by=100) +rghosh2 <- rghosh1^2 +res <- rsimilarity_model(Rn = obs$st$RnInv, + predictors = list(rghosh1=1/mdist1, rghosh2=1/mdist2), + newpredictors = data.frame(rghosh1=1/rghosh1, rghosh2=1/rghosh2), + seed=1234) +plot(rghosh1, res$similarity, ylab = "Predicted Rn similarity") +plot(rghosh2, res$similarity, ylab = "Predicted Rn similarity") +# rsimilarity_model() is automatically called by mixr() if mdist is a list +obs <- mixr(obs = obs, mdist = mdist1, + similarity = list(rghosh1=1/mdist1, rghosh2=1/mdist2), + parallel = TRUE, cores=2, cv = TRUE, save_donor = TRUE) +obs$similarity_models +obs <- convolution(obs) +plot(obs, i = 1, attribute = c("Qobs", "Qsim"))} +} +\seealso{ +\link{rsimilarity}, \link{mixr} +} diff --git a/src/gdist.f90 b/src/gdist.f90 index 023b0c81a4642618fba6535fb3de96b3a15cb45d..ede1bd06516a16dc1e26e6c693f54038f7099bb6 100644 --- a/src/gdist.f90 +++ b/src/gdist.f90 @@ -1,108 +1,95 @@ -!For compiling the Fortran file : -!R CMD SHLIB gdist.f90 - -recursive subroutine gdist(coord1,coord2,n1,n2,proj,rescale,diag,nthreads,mdist) +recursive subroutine gdist(coord1, coord2, n1, n2, proj, rescale, diag, nthreads, mdist) bind(C, name="gdist") + use iso_c_binding, only: c_int, c_double use omp_lib - implicit none + integer(c_int), value :: n1, n2 + integer(c_int), value :: proj, rescale, diag + integer(c_int), value :: nthreads + real(c_double), intent(in) :: coord1(n1,2) + real(c_double), intent(in) :: coord2(n2,2) + real(c_double), intent(out) :: mdist - integer, intent(in) :: n1, n2 - double precision, intent(in) :: coord1(n1,2) - double precision, intent(in) :: coord2(n2,2) - logical, intent(in) :: proj, rescale, diag - integer, intent(inout) :: nthreads - double precision, intent(out) :: mdist - + ! Local variables + integer(c_int) :: i, j, ndist + real(c_double) :: diffx, diffy, dist, mdist1, mdist2, sdist + real(c_double) :: a, c_val + real(c_double), parameter :: radius = 6372.8_c_double - double precision :: dist, diffx, diffy, a, c - double precision :: mdist1, mdist2, sdist - real, parameter :: radius = 6372.8 - integer :: i,j,ndist - -! if(size(coord1,1).LT.size(coord2,1)) then -! call gdist(coord2,coord1,n2,n1,proj,rescale,diag,mdist) -! return -! end if - - if(diag) then - ndist=sum((/(i, i=1,n1, 1)/)) + ! Determine number of distances to compute + if (diag /= 0) then + ndist = sum((/(i, i=1,n1, 1)/)) else - ndist=n1*n2 + ndist = n1 * n2 end if -! if(ndist.GE.1e9) WRITE(*,'(A)') 'More than 1e9 distances to compute. Precision of the results not reliable.' - mdist = 0. - if(proj) then !euclidian distance - if(diag) then - !$omp parallel do num_threads(nthreads) private(i, j, diffx, diffy, dist, sdist) reduction(+:mdist) - do i=1,n1 - sdist = 0. - do j=i,n2 - diffx = coord1(i,1)-coord2(j,1) - diffy = coord1(i,2)-coord2(j,2) - dist = sqrt(diffx*diffx+diffy*diffy) - sdist = sdist+dist -! mdist = mdist+dist/ndist - end do - mdist = mdist+sdist/ndist - end do - !$omp end parallel do - else - !$omp parallel do num_threads(nthreads) private(i, j, diffx, diffy, dist, sdist) reduction(+:mdist) - do i=1,n1 - sdist = 0. - do j=1,n2 - diffx = coord1(i,1)-coord2(j,1) - diffy = coord1(i,2)-coord2(j,2) - dist = sqrt(diffx*diffx+diffy*diffy) - sdist = sdist+dist -! mdist = mdist+dist/ndist - end do - mdist = mdist+sdist/ndist - end do - !$omp end parallel do - end if - else !great-circle distance - if(diag) then - !$omp parallel do num_threads(nthreads) private(i, j, diffx, diffy, dist, sdist) reduction(+:mdist) - do i=1,n1 - sdist = 0. - do j=i,n2 - diffx = coord1(i,1)-coord2(j,1) - diffy = coord1(i,2)-coord2(j,2) - a = (sin(diffx/2))**2 + cos(coord1(i,1))*cos(coord2(j,1))*(sin(diffy/2))**2 - c = 2*asin(sqrt(a)) - dist = radius*c - sdist = sdist+dist - !mdist = mdist+dist/ndist - end do - mdist = mdist+sdist/ndist - end do - !$omp end parallel do - else - !$omp parallel do num_threads(nthreads) private(i, j, diffx, diffy, dist, sdist) reduction(+:mdist) - do i=1,n1 - sdist = 0. - do j=1,n2 - diffx = coord1(i,1)-coord2(j,1) - diffy = coord1(i,2)-coord2(j,2) - a = (sin(diffx/2))**2 + cos(coord1(i,1))*cos(coord2(j,1))*(sin(diffy/2))**2 - c = 2*asin(sqrt(a)) - dist = radius*c - sdist = sdist+dist -! mdist = mdist+dist/ndist - end do - mdist = mdist+sdist/ndist - end do - !$omp end parallel do - end if + mdist = 0.0_c_double + + if (proj /= 0) then ! Euclidean distance + if (diag /= 0) then + !$omp parallel do num_threads(nthreads) private(i, j, diffx, diffy, dist, sdist) reduction(+:mdist) + do i = 1, n1 + sdist = 0.0_c_double + do j = i, n2 + diffx = coord1(i,1)-coord2(j,1) + diffy = coord1(i,2)-coord2(j,2) + dist = sqrt(diffx*diffx + diffy*diffy) + sdist = sdist + dist + end do + mdist = mdist + sdist / real(ndist, c_double) + end do + !$omp end parallel do + else + !$omp parallel do num_threads(nthreads) private(i, j, diffx, diffy, dist, sdist) reduction(+:mdist) + do i = 1, n1 + sdist = 0.0_c_double + do j = 1, n2 + diffx = coord1(i,1)-coord2(j,1) + diffy = coord1(i,2)-coord2(j,2) + dist = sqrt(diffx*diffx + diffy*diffy) + sdist = sdist + dist + end do + mdist = mdist + sdist / real(ndist, c_double) + end do + !$omp end parallel do + end if + else ! Great-circle distance + if (diag /= 0) then + !$omp parallel do num_threads(nthreads) private(i, j, diffx, diffy, dist, sdist, a, c_val) reduction(+:mdist) + do i = 1, n1 + sdist = 0.0_c_double + do j = i, n2 + diffx = coord1(i,1)-coord2(j,1) + diffy = coord1(i,2)-coord2(j,2) + a = (sin(diffx/2.0_c_double))**2 + cos(coord1(i,1))*cos(coord2(j,1))*(sin(diffy/2.0_c_double))**2 + c_val = 2.0_c_double * asin(sqrt(a)) + dist = radius * c_val + sdist = sdist + dist + end do + mdist = mdist + sdist / real(ndist, c_double) + end do + !$omp end parallel do + else + !$omp parallel do num_threads(nthreads) private(i, j, diffx, diffy, dist, sdist, a, c_val) reduction(+:mdist) + do i = 1, n1 + sdist = 0.0_c_double + do j = 1, n2 + diffx = coord1(i,1)-coord2(j,1) + diffy = coord1(i,2)-coord2(j,2) + a = (sin(diffx/2.0_c_double))**2 + cos(coord1(i,1))*cos(coord2(j,1))*(sin(diffy/2.0_c_double))**2 + c_val = 2.0_c_double * asin(sqrt(a)) + dist = radius * c_val + sdist = sdist + dist + end do + mdist = mdist + sdist / real(ndist, c_double) + end do + !$omp end parallel do + end if end if - if(rescale) then - call gdist(coord1,coord1,n1,n1,proj,.FALSE.,.TRUE.,nthreads,mdist1) - call gdist(coord2,coord2,n2,n2,proj,.FALSE.,.TRUE.,nthreads,mdist2) -! write(*,'(2I6,3F20.1)') n1, n2, mdist, mdist1, mdist2 - mdist = mdist - 0.5*(mdist1+mdist2) + if (rescale /= 0) then + call gdist(coord1, coord1, n1, n1, proj, 0, 1, nthreads, mdist1) + call gdist(coord2, coord2, n2, n2, proj, 0, 1, nthreads, mdist2) + mdist = mdist - 0.5_c_double * (mdist1 + mdist2) end if -end subroutine + end subroutine gdist \ No newline at end of file diff --git a/src/similarity_module.f90 b/src/similarity_module.f90 new file mode 100644 index 0000000000000000000000000000000000000000..340af523d4ff961e89efc19d77bbaa6303092659 --- /dev/null +++ b/src/similarity_module.f90 @@ -0,0 +1,189 @@ +module similarity_module + use iso_c_binding, only: c_int, c_double + use, intrinsic :: ieee_arithmetic + implicit none + + abstract interface + function objective_fun(sim, obs, n) result(val) + use iso_c_binding, only: c_int, c_double + implicit none + integer(c_int), intent(in) :: n + real(c_double), intent(in) :: sim(n), obs(n) + real(c_double) :: val + end function objective_fun + end interface + + procedure(objective_fun), pointer :: FUN => null() + +contains + + subroutine set_objective_function(crit) + implicit none + integer(c_int), intent(in) :: crit + + select case (crit) + case(1) + FUN => kge + case(2) + FUN => rmse + case(3) + FUN => invrmse + case default + FUN => invrmse + end select + + end subroutine set_objective_function + + subroutine similarity(Rn, nt, nb, crit, nthreads, sim_matrix) bind(C, name="similarity") + use, intrinsic :: ieee_arithmetic + use omp_lib + implicit none + integer(c_int), value :: nt, nb, crit, nthreads + real(c_double), intent(in) :: Rn(nt,nb) + real(c_double), intent(out) :: sim_matrix(nb,nb) + + integer(c_int) :: i, j + real(c_double) :: val, nan_val + nan_val = ieee_value(0.0_c_double, ieee_quiet_nan) + + call set_objective_function(crit) + if (.not. associated(FUN)) then + sim_matrix = nan_val + return + end if + + select case (crit) + case(2, 3) ! if symetric, like RMSE + !$omp parallel do num_threads(nthreads) default(shared) private(i, j) schedule(static) + do i = 1, nb + do j = i, nb + sim_matrix(i,j) = FUN(Rn(:,i), Rn(:,j), nt) + sim_matrix(j,i) = sim_matrix(i,j) + end do + end do + !$omp end parallel do + case default ! if not symetric, like KGE + !$omp parallel do num_threads(nthreads) default(shared) private(i, j) schedule(static) + do i = 1, nb + do j = 1, nb + sim_matrix(i,j) = FUN(Rn(:,i), Rn(:,j), nt) + end do + end do + !$omp end parallel do + end select + + end subroutine similarity + + function kge(sim, obs, n) result(kge_val) + use, intrinsic :: ieee_arithmetic + implicit none + integer, intent(in) :: n + real(c_double), dimension(n), intent(in) :: sim, obs + real(c_double) :: kge_val + integer :: i, count + real(c_double) :: sum_obs, sum_sim, sum_obs2, sum_sim2, sum_obs_sim + real(c_double) :: mean_obs, mean_sim, std_obs, std_sim, r, beta, gamma, diff + real(c_double) :: nan_val, eps + eps = tiny(1.0_c_double) + + count = 0 + sum_obs = 0.0_c_double + sum_sim = 0.0_c_double + sum_obs2 = 0.0_c_double + sum_sim2 = 0.0_c_double + sum_obs_sim = 0.0_c_double + nan_val = ieee_value(0.0_c_double, ieee_quiet_nan) + + ! Loop to filter NaN values and accumulate sums + do i = 1, n + if (.not. ieee_is_nan(obs(i)) .and. .not. ieee_is_nan(sim(i))) then + count = count + 1 + sum_obs = sum_obs + obs(i) + sum_sim = sum_sim + sim(i) + sum_obs2 = sum_obs2 + obs(i) * obs(i) + sum_sim2 = sum_sim2 + sim(i) * sim(i) + sum_obs_sim = sum_obs_sim + obs(i) * sim(i) + end if + end do + + ! Return NA if the common period is not long enough + if (count <= 1) then + kge_val = nan_val + return + end if + + ! Compute means + mean_obs = sum_obs / count + mean_sim = sum_sim / count + + ! Check to avoid division by zero + if (abs(mean_obs) < eps) then + kge_val = nan_val + return + end if + + ! Compute standard deviations (using Welford's correction) + std_obs = sqrt((sum_obs2 - sum_obs * mean_obs) / (count - 1)) + std_sim = sqrt((sum_sim2 - sum_sim * mean_sim) / (count - 1)) + + ! Check standard deviations + if (std_obs < eps .or. std_sim < eps) then + kge_val = nan_val + return + end if + + ! Compute correlation + r = ((sum_obs_sim - count * mean_obs * mean_sim) / (count - 1)) / (std_obs * std_sim) + + ! Compute KGE components + beta = mean_sim / mean_obs + gamma = (std_sim / mean_sim) / (std_obs / mean_obs) + + ! Final KGE computation + diff = (r - 1.0_c_double)**2 + (beta - 1.0_c_double)**2 + (gamma - 1.0_c_double)**2 + kge_val = 1.0_c_double - sqrt(diff) + end function kge + + function rmse(sim, obs, n) result(res) + integer, intent(in) :: n + real(c_double), dimension(n), intent(in) :: sim, obs + real(c_double) :: res + integer :: i, count + real(c_double) :: sum_sq, diff + real(c_double) :: nan_val + nan_val = ieee_value(0.0_c_double, ieee_quiet_nan) + count = 0 + sum_sq = 0.0_c_double + do i = 1, n + if (.not. ieee_is_nan(obs(i)) .and. .not. ieee_is_nan(sim(i))) then + count = count + 1 + diff = sim(i) - obs(i) + sum_sq = sum_sq + diff**2 + end if + end do + if (count == 0) then + res = nan_val + return + end if + res = sqrt(sum_sq / count) + + end function rmse + + function invrmse(sim, obs, n) result(res) + integer, intent(in) :: n + real(c_double), dimension(n), intent(in) :: sim, obs + real(c_double) :: rmse_val, res + real(c_double) :: nan_val + nan_val = ieee_value(0.0_c_double, ieee_quiet_nan) + + rmse_val = rmse(obs, sim, n) + if (rmse_val == 0.0_c_double) then + res = nan_val + else + res = 1.0_c_double / rmse_val + end if + + end function invrmse + +end module similarity_module + diff --git a/src/transfR.c b/src/transfR.c index a43d947f423f5de19e89e9d65bff7ed076302b5c..cba56b4cb2b51ec134b82015514addecf7be49c2 100644 --- a/src/transfR.c +++ b/src/transfR.c @@ -1,21 +1,40 @@ -#include <R_ext/RS.h> -#include <stdlib.h> // for NULL +#include <R.h> +#include <Rinternals.h> #include <R_ext/Rdynload.h> -/* FIXME: - Check these declarations against the C/Fortran source code. -*/ +extern void gdist(double *coord1, double *coord2, int n1, int n2, + int proj, int rescale, int diag, int nthreads, double *mdist); +SEXP c_gdist(SEXP coord1, SEXP coord2, SEXP proj, SEXP rescale, SEXP diag, SEXP nthreads) { + SEXP dim1 = PROTECT(getAttrib(coord1, R_DimSymbol)); + SEXP dim2 = PROTECT(getAttrib(coord2, R_DimSymbol)); + const int n1 = INTEGER(dim1)[0]; + const int n2 = INTEGER(dim2)[0]; + SEXP mdist = PROTECT(allocVector(REALSXP, 1)); + gdist(REAL(coord1), REAL(coord2), n1, n2, asInteger(proj), asInteger(rescale), asInteger(diag), asInteger(nthreads), REAL(mdist)); + UNPROTECT(3); + return mdist; +} -/* .Fortran calls */ -extern void F77_NAME(gdist)(void *, void *, void *, void *, void *, void *, void *, void *, void *); +extern void similarity(double *Rn, int nt, int nb, int crit, int nthreads, double *sim_matrix); +SEXP c_similarity(SEXP Rn, SEXP crit, SEXP nthreads) { + SEXP dim = PROTECT(getAttrib(Rn, R_DimSymbol)); + const int nt = INTEGER(dim)[0]; + const int nb = INTEGER(dim)[1]; + SEXP sim_matrix = PROTECT(allocMatrix(REALSXP, nb, nb)); + similarity(REAL(Rn), nt, nb, asInteger(crit), asInteger(nthreads), REAL(sim_matrix)); + UNPROTECT(2); + return sim_matrix; +} -static const R_FortranMethodDef FortranEntries[] = { - {"gdist", (DL_FUNC) &F77_NAME(gdist), 9}, +/* Registration of native routines for .Call */ +static const R_CallMethodDef CallEntries[] = { + {"c_gdist", (DL_FUNC) &c_gdist, 6}, + {"c_similarity", (DL_FUNC) &c_similarity, 3}, {NULL, NULL, 0} }; void R_init_transfR(DllInfo *dll) { - R_registerRoutines(dll, NULL, NULL, FortranEntries, NULL); + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); -} +} \ No newline at end of file