FUN.TRY.R 13.03 KiB
############################################ FUNCTION TO EXTRACT DECTED OUTLIER AND FORMAT TRY DATA Georges Kunstler
############################################ 14/06/2013
library(MASS, quietly=TRUE)
library(doParallel, quietly=TRUE)
library(mvoutlier, quietly=TRUE)
######################################################## Build a function that extract the variables
##'Description of the function to extract data from original TRY data
##'
##' based on the data structure of extraction from TRY data base
##' @title fun.extract.try
##' @param ObservationID.t  list of data identifier that we want to extract
##' @param data try data object
##' @param Non.Trait.Data list of names of non traits data that we want to extract
##' @param Trait.Data list of names of traits data that we want to extract
##' @return data.frame with one line per observation id with clumns with ObservationID Species Nontrait data for Traits: OrigValue OrigUnit StdValue 
##' @author Kunstler 
fun.extract.try <- function(ObservationID.t, data, Non.Trait.Data, Trait.Data) {
    data.temp <- data[data$ObservationID == ObservationID.t, ]
    ## Non trait data
    Vec.Non.Trait.Data <- rep(NA, length(Non.Trait.Data))
    names(Vec.Non.Trait.Data) <- Non.Trait.Data
    for (i in 1:length(Non.Trait.Data)) {
        if (sum(data.temp$DataName == Non.Trait.Data[i]) == 1) {
            Vec.Non.Trait.Data[i] <- data.temp[data.temp$DataName == Non.Trait.Data[i], 
                "OrigValueStr"]
        if (sum(data.temp$DataName == Non.Trait.Data[i]) > 1) {
            ## if(sum(data.temp$DataName==Non.Trait.Data[i] &
            ## grepl('Mean',data.temp$ValueKindName, fixed=TRUE))!=1){ print('error in
            ## ValueKindName')}
            Vec.Non.Trait.Data[i] <- data.temp[data.temp$DataName == Non.Trait.Data[i], 
                "OrigValueStr"][1]
    ## Trait data
    Vec.Trait.Data.OrigValue <- Vec.Trait.Data.OrigUnit <- Vec.Trait.Data.StdValue <- rep(NA, 
        length(Trait.Data))
    names(Vec.Trait.Data.OrigValue) <- paste("OrigValue", Trait.Data)
    names(Vec.Trait.Data.OrigUnit) <- paste("OrigUnitName", Trait.Data)
    names(Vec.Trait.Data.StdValue) <- paste("StdValue", Trait.Data)
    for (i in 1:length(Trait.Data)) {
        if (sum(grepl(Trait.Data[i], data.temp$TraitName, fixed = TRUE)) == 1) {
            Vec.Trait.Data.OrigValue[i] <- data.temp[grepl(Trait.Data[i], data.temp$TraitName, 
                fixed = TRUE), "OrigValue"]
            Vec.Trait.Data.OrigUnit[i] <- data.temp[grepl(Trait.Data[i], data.temp$TraitName, 
                fixed = TRUE), "OrigUnitStr"]
            Vec.Trait.Data.StdValue[i] <- data.temp[grepl(Trait.Data[i], data.temp$TraitName, 
                fixed = TRUE), "StdValue"]
        if (sum(grepl(Trait.Data[i], data.temp$TraitName, fixed = TRUE)) > 1) {
            if (sum((data.temp$ValueKindName %in% c("Best estimate", "Mean", "Site specific mean") & 
                !is.na(data.temp$ValueKindName))) == 1) {
                Vec.Trait.Data.OrigValue[i] <- mean(data.temp[grepl(Trait.Data[i], 
                  data.temp$TraitName, fixed = TRUE) & (data.temp$ValueKindName %in% 
                  c("Best estimate", "Mean", "Site specific mean") & !is.na(data.temp$ValueKindName)), 
                  "OrigValue"])
                Vec.Trait.Data.OrigUnit[i] <- (data.temp[grepl(Trait.Data[i], data.temp$TraitName, 
                  fixed = TRUE) & (data.temp$ValueKindName %in% c("Best estimate", 
                  "Mean", "Site specific mean") & !is.na(data.temp$ValueKindName)), 
                  "OrigUnitStr"])[1]
                Vec.Trait.Data.StdValue[i] <- mean(data.temp[grepl(Trait.Data[i], 
                  data.temp$TraitName, fixed = TRUE) & (data.temp$ValueKindName %in% 
                  c("Best estimate", "Mean", "Site specific mean") & !is.na(data.temp$ValueKindName)), 
7172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
"StdValue"]) } if (sum(data.temp$ValueKindName %in% c("Best estimate", "Mean", "Site specific mean")) < 1) { Vec.Trait.Data.OrigValue[i] <- mean(data.temp[grepl(Trait.Data[i], data.temp$TraitName, fixed = TRUE), "OrigValue"], na.rm = T) Vec.Trait.Data.OrigUnit[i] <- (data.temp[grepl(Trait.Data[i], data.temp$TraitName, fixed = TRUE), "OrigUnitStr"])[1] Vec.Trait.Data.StdValue[i] <- mean(data.temp[grepl(Trait.Data[i], data.temp$TraitName, fixed = TRUE), "StdValue"], na.rm = T) } } } ### EXPERIMENTAL DATA TYPE TF.exp.data <- sum(grepl("Growth & measurement conditions - experimental tre", data.temp$NonTraitCategories, fixed = TRUE)) > 0 names(TF.exp.data) <- "TF.exp.data" res.temp <- data.frame(ObservationID = ObservationID.t, AccSpeciesName = unique(data.temp$AccSpeciesName), t(Vec.Non.Trait.Data), TF.exp.data, t(Vec.Trait.Data.OrigValue), t(Vec.Trait.Data.OrigUnit), t(Vec.Trait.Data.StdValue)) return(res.temp) } ## outlier detection based on Kattage et al 2011 ##' Detection of univar outlier based on method of Kattge et al. 2011 ##' ##' ##' @title ##' @param x.na ##' @param log ##' @return TRUE FALSE vector to identify outlier TRUE : outlier ##' @author Kunstler fun.out.TF2 <- function(x.na, log = TRUE) { x <- x.na[!is.na(x.na)] x.num <- (1:length(x.na))[!is.na(x.na)] TF.vec <- rep(FALSE, length(x.na)) if (log) { fit.dist <- fitdistr(log10(na.omit(x)), "normal") high.bound <- fit.dist$estimate["mean"] + 2 * (fit.dist$estimate["sd"] + fit.dist$sd["sd"]) low.bound <- fit.dist$estimate["mean"] - 2 * (fit.dist$estimate["sd"] + fit.dist$sd["sd"]) TF.vec[x.num[log10(x) > high.bound | log10(x) < low.bound]] <- TRUE } else { fit.dist <- fitdistr((na.omit(x)), "normal") high.bound <- fit.dist$estimate["mean"] + 2 * (fit.dist$estimate["sd"] + fit.dist$sd["sd"]) low.bound <- fit.dist$estimate["mean"] - 2 * (fit.dist$estimate["sd"] + fit.dist$sd["sd"]) TF.vec[x.num[(x) > high.bound | (x) < low.bound]] <- TRUE } return((TF.vec)) } ######################## FUNCTION TO COMPUTE QUANTILE FOR HEIGHT f.quantile <- function(x, ind, probs) { quantile(x[ind], probs = probs, na.rm = TRUE) } f.quantile.boot2 <- function(x, R, probs = 0.99) { require(boot, quietly=TRUE) if (length(na.exclude(x)) > 0) { quant.boot <- boot(x, f.quantile, R = R, probs = probs) return(c(mean = mean(quant.boot$t), sd = sd(quant.boot$t), nobs = length(na.exclude(x)))) } else {
141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
return(c(mean = NA, sd = NA, nobs = NA)) } } ##################### FUNcCTION TO COMPUTE MEAN SD AND NOBS WITH OR WITHOUT OUTLIER fun.mean.sd.nobs.out <- function(x, i) { if (length(x) > 50) { ## if more than 50 obs remove outlier outlier <- fun.out.TF2(x.na = x, log = TRUE) if (i == "StdValue.Plant.height.vegetative") { res.temp <- f.quantile.boot2(log10(x[!outlier]), R = 1000, probs = 0.99) } else { res.temp <- c(mean(log10(x[!outlier])), sd(log10(x[!outlier])), length(x[!outlier])) } } else { if (i == "StdValue.Plant.height.vegetative") { res.temp <- f.quantile.boot2(log10(x), R = 1000, probs = 0.99) } else { res.temp <- c(mean(log10(x)), sd(log10(x)), length(x)) } } return(res.temp) } ################################### extract mean sd per species or genus added species synonyme fun.species.traits <- function(species.code, species.table, col.sp = "sp", col.sp.syno = "Latin_name_syn", traits, data) { vec.mean <- vec.sd <- vec.nobs <- rep(NA, length(traits)) vec.exp <- vec.genus <- rep(FALSE, length(traits)) names(vec.mean) <- names(vec.sd) <- names(vec.exp) <- names(vec.genus) <- names(vec.nobs) <- traits species.syno <- species.table[species.table[[col.sp]] == species.code, col.sp.syno] # browser() for (i in traits) { if (sum((data$AccSpeciesName %in% species.syno) & !is.na(data[[i]])) > 0) { ## if data for this species or syno if data with out experiments if (sum((data$AccSpeciesName %in% species.syno) & (!is.na(data[[i]])) & (!data[["TF.exp.data"]])) > 0) { x <- data[[i]][data$AccSpeciesName %in% species.syno & (!is.na(data[[i]])) & (!data[["TF.exp.data"]])] res.temp <- fun.mean.sd.nobs.out(x, i) vec.mean[[i]] <- res.temp[1] vec.sd[[i]] <- res.temp[2] vec.nobs[[i]] <- res.temp[3] } else { ### include experimental data x <- data[[i]][data$AccSpeciesName %in% species.syno & (!is.na(data[[i]]))] res.temp <- fun.mean.sd.nobs.out(x, i) vec.mean[[i]] <- res.temp[1] vec.sd[[i]] <- res.temp[2] vec.nobs[[i]] <- res.temp[3] vec.exp[[i]] <- TRUE } } else { ### compute data at genus level if no data for the species genus <- sub(" .*", "", species.syno) if (sum(grepl(genus, data$AccSpeciesName) & (!is.na(data[[i]]))) > 0) { x <- data[[i]][grepl(genus, data$AccSpeciesName, fixed = TRUE) & (!is.na(data[[i]]))] res.temp <- fun.mean.sd.nobs.out(x, i) vec.mean[[i]] <- res.temp[1] vec.sd[[i]] <- res.temp[2] vec.nobs[[i]] <- res.temp[3] vec.genus[[i]] <- TRUE } } } return(list(mean = vec.mean, sd = vec.sd, exp = vec.exp, genus = vec.genus, nobs = vec.nobs))
211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280
} ####################### FUNCTIONS TO Manipulate species names fun.get.genus <- function(x) gsub(paste(" ", gsub("^([a-zA-Z]* )", "", x), sep = ""), "", x, fixed = TRUE) trim.trailing <- function(x) sub("\\s+$", "", x) ####################################### FUN TO EXTRACT FOR A GIVEN DATA BASE fun.turn.list.in.DF <- function(sp, res.list) { data.mean <- t(sapply(sp, FUN = function(i, res.list) res.list[[i]]$mean, res.list = res.list)) data.sd <- t(sapply(sp, FUN = function(i, res.list) res.list[[i]]$sd, res.list = res.list)) data.exp <- t(sapply(sp, FUN = function(i, res.list) res.list[[i]]$exp, res.list = res.list)) data.genus <- t(sapply(sp, FUN = function(i, res.list) res.list[[i]]$genus, res.list = res.list)) data.nobs <- t(sapply(sp, FUN = function(i, res.list) res.list[[i]]$nobs, res.list = res.list)) ## create data.frame withh all observation extract.species.try <- data.frame(data.mean, data.sd, data.exp, data.genus, data.nobs) names(extract.species.try) <- c(paste(c("Leaf.N", "Seed.mass", "SLA", "Wood.Density"), "mean", sep = "."), paste(c("Leaf.N", "Seed.mass", "SLA", "Wood.Density"), "sd", sep = "."), paste(c("Leaf.N", "Seed.mass", "SLA", "Wood.Density"), "exp", sep = "."), paste(c("Leaf.N", "Seed.mass", "SLA", "Wood.Density"), "genus", sep = "."), paste(c("Leaf.N", "Seed.mass", "SLA", "Wood.Density"), "nobs", sep = ".")) return(extract.species.try) } fun.extract.format.sp.traits.TRY <- function(sp, sp.syno.table, data) { ## check syno data if not create a table with column syno repating the species ### test data sp and sp.syno.table match if (sum(!(sp %in% sp.syno.table[["sp"]])) > 0) stop("not same species name in sp and sp.syno.table") if (sum((sp.syno.table[["Latin_name_syn"]] %in% data[["AccSpeciesName"]])) == 0) stop("not a single similar species name in sp and TRY") ## extract traits <- c("StdValue.Leaf.nitrogen..N..content.per.dry.mass", "StdValue.Seed.mass", "StdValue.Leaf.specific.area..SLA.", "StdValue.Stem.specific.density..SSD.") res.list <- lapply(sp, FUN = fun.species.traits, species.table = sp.syno.table, traits = traits, data = data) names(res.list) <- sp ##### TRANSFORM LIST IN A TABLE extract.species.try <- fun.turn.list.in.DF(sp, res.list) ############### add mean sd of species or genus if we want to use that sd.vec.sp <- readRDS(file = "./data/process/sd.vec.sp.rds") sd.vec.genus <- readRDS(file = "./data/process/sd.vec.genus.rds") sd.names <- paste(c("Leaf.N", "Seed.mass", "SLA", "Wood.Density"), "sd", sep = ".") genus.names <- paste(c("Leaf.N", "Seed.mass", "SLA", "Wood.Density"), "genus", sep = ".") ### add columns extract.species.try.2 <- data.frame(extract.species.try, extract.species.try[, sd.names]) ## update value sd.names.1 <- paste(sd.names, 1, sep = ".") for (i in 1:length(sd.names.1)) { extract.species.try.2[[sd.names.1[i]]][!extract.species.try.2[[genus.names[i]]]] <- sd.vec.sp[i] extract.species.try.2[[sd.names.1[i]]][extract.species.try.2[[genus.names[i]]]] <- sd.vec.genus[i] } data.frame.TRY <- data.frame(sp = sp, Latin_name = sp.syno.table[["Latin_name_syn"]], extract.species.try.2) if (sum(!data.frame.TRY[["sp"]] == sp) > 0) stop("Wrong order of species code") return(data.frame.TRY) }
281