trait.fun.R 10.88 KiB
################# FUNCTION TO EXTRACT DECTED OUTLIER AND FORMAT TRY DATA Georges Kunstler
############################################ 14/06/2013
### just testing this out! ##############
### install all unstallled packages
source("R/packages.R")
check_packages(c("MASS", "doParallel","mvoutlier","plyr"))
## outlier detection based on Kattage et al 2011
##' Detection of univar outlier based on method of Kattge et al. 2011
##'
##' 
##' @title 
##' @param x.na 
##' @param log 
##' @return TRUE FALSE vector to identify outlier TRUE : outlier
##' @author Kunstler
fun.out.TF2 <- function(x.na, log = TRUE) {
    x <- x.na[!is.na(x.na)]
    x.num <- (1:length(x.na))[!is.na(x.na)]
    TF.vec <- rep(FALSE, length(x.na))
    if (log) {
        fit.dist <- fitdistr(log10(na.omit(x)), "normal")
        high.bound <- fit.dist$estimate["mean"] + 2 * (fit.dist$estimate["sd"] + 
            fit.dist$sd["sd"])
        low.bound <- fit.dist$estimate["mean"] - 2 * (fit.dist$estimate["sd"] + fit.dist$sd["sd"])
        TF.vec[x.num[log10(x) > high.bound | log10(x) < low.bound]] <- TRUE
    } else {
        fit.dist <- fitdistr((na.omit(x)), "normal")
        high.bound <- fit.dist$estimate["mean"] + 2 * (fit.dist$estimate["sd"] + 
            fit.dist$sd["sd"])
        low.bound <- fit.dist$estimate["mean"] - 2 * (fit.dist$estimate["sd"] + fit.dist$sd["sd"])
        TF.vec[x.num[(x) > high.bound | (x) < low.bound]] <- TRUE
    return((TF.vec))
######################## FUNCTION TO COMPUTE QUANTILE FOR HEIGHT
f.quantile <- function(x, ind, probs) {
    quantile(x[ind], probs = probs, na.rm = TRUE)
f.quantile.boot2 <- function(x, R, probs = 0.99) {
    require(boot, quietly=TRUE)
    if (length(na.exclude(x)) > 0) {
        quant.boot <- boot(x, f.quantile, R = R, probs = probs)
        return(c(mean = mean(quant.boot$t), sd = sd(quant.boot$t), nobs = length(na.exclude(x))))
    } else {
        return(c(mean = NA, sd = NA, nobs = NA))
##################### FUNcCTION TO COMPUTE MEAN SD AND NOBS WITH OR WITHOUT OUTLIER
fun.mean.sd.nobs.out <- function(x, i) {
    if (length(x) > 50) {
        ## if more than 50 obs remove outlier
        outlier <- fun.out.TF2(x.na = x, log = TRUE)
        if (i == "StdValue.Plant.height.vegetative") {
            res.temp <- f.quantile.boot2(log10(x[!outlier]), R = 1000, probs = 0.99)
        } else {
            res.temp <- c(mean(log10(x[!outlier])), sd(log10(x[!outlier])), length(x[!outlier]))
    } else {
        if (i == "StdValue.Plant.height.vegetative") {
            res.temp <- f.quantile.boot2(log10(x), R = 1000, probs = 0.99)
        } else {
            res.temp <- c(mean(log10(x)), sd(log10(x)), length(x))
    return(res.temp)
7172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
} ################################### extract mean sd per species or genus added species synonyme fun.species.traits <- function(species.code, species.table, col.sp = "sp", col.sp.syno = "Latin_name_syn", traits, data) { vec.mean <- vec.sd <- vec.nobs <- rep(NA, length(traits)) vec.exp <- vec.genus <- rep(FALSE, length(traits)) names(vec.mean) <- names(vec.sd) <- names(vec.exp) <- names(vec.genus) <- names(vec.nobs) <- traits species.syno <- species.table[species.table[[col.sp]] == species.code, col.sp.syno] # browser() for (i in traits) { if (sum((data$Latin_name%in% species.syno) & !is.na(data[[i]])) > 0) { ## if data for this species or syno if data with out experiments if (sum((data$Latin_name %in% species.syno) & (!is.na(data[[i]])) & (!data[["TF.exp.data"]])) > 0) { x <- data[[i]][data$Latin_name %in% species.syno & (!is.na(data[[i]])) & (!data[["TF.exp.data"]])] res.temp <- fun.mean.sd.nobs.out(x, i) vec.mean[[i]] <- res.temp[1] vec.sd[[i]] <- res.temp[2] vec.nobs[[i]] <- res.temp[3] } else { ### include experimental data x <- data[[i]][data$Latin_name %in% species.syno & (!is.na(data[[i]]))] res.temp <- fun.mean.sd.nobs.out(x, i) vec.mean[[i]] <- res.temp[1] vec.sd[[i]] <- res.temp[2] vec.nobs[[i]] <- res.temp[3] vec.exp[[i]] <- TRUE } } else { ### compute data at genus level if no data for the species genus <- unique(sub(" .*", "", species.syno)) if (sum(grepl(genus, data$Latin_name) & (!is.na(data[[i]]))) > 0) { x <- data[[i]][grepl(genus, data$Latin_name, fixed = TRUE) & (!is.na(data[[i]]))] res.temp <- fun.mean.sd.nobs.out(x, i) vec.mean[[i]] <- res.temp[1] vec.sd[[i]] <- res.temp[2] vec.nobs[[i]] <- res.temp[3] vec.genus[[i]] <- TRUE } } } return(list(mean = vec.mean, sd = vec.sd, exp = vec.exp, genus = vec.genus, nobs = vec.nobs)) } ####################### FUNCTIONS TO Manipulate species names fun.get.genus <- function(x) gsub(paste(" ", gsub("^([a-zA-Z]* )", "", x), sep = ""), "", x, fixed = TRUE) trim.trailing <- function(x) sub("\\s+$", "", x) ################################################################################# ####################################### FUN TO EXTRACT FOR A GIVEN DATA BASE ### function top turn teh result of lapply from a list to a data frame with good structure fun.turn.list.in.DF <- function(sp, res.list) { data.mean <- t(sapply(sp, FUN = function(i, res.list) res.list[[i]]$mean, res.list = res.list)) data.sd <- t(sapply(sp, FUN = function(i, res.list) res.list[[i]]$sd, res.list = res.list)) data.exp <- t(sapply(sp, FUN = function(i, res.list) res.list[[i]]$exp, res.list = res.list)) data.genus <- t(sapply(sp, FUN = function(i, res.list) res.list[[i]]$genus, res.list = res.list)) data.nobs <- t(sapply(sp, FUN = function(i, res.list) res.list[[i]]$nobs, res.list = res.list)) ## create data.frame withh all observation extract.species.try <- data.frame(data.mean, data.sd, data.exp, data.genus, data.nobs, stringsAsFactors =FALSE) names(extract.species.try) <- c(paste(c("Leaf.N", "Seed.mass", "SLA", "Wood.density"), "mean", sep = "."),
141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
paste(c("Leaf.N", "Seed.mass", "SLA", "Wood.density"), "sd", sep = "."), paste(c("Leaf.N", "Seed.mass", "SLA", "Wood.density"), "exp", sep = "."), paste(c("Leaf.N", "Seed.mass", "SLA", "Wood.density"), "genus", sep = "."), paste(c("Leaf.N", "Seed.mass", "SLA", "Wood.density"), "nobs", sep = ".")) return(extract.species.try) } ########################## ##### FUNCTION TO EXTRACT TRY DATA FOR A SPECIES NEED TO DOCUMENT fun.extract.format.sp.traits.TRY <- function(sp, sp.syno.table, data) { ### test data sp and sp.syno.table match if (sum(!(sp %in% sp.syno.table[["sp"]])) > 0) stop("not same species name in sp and sp.syno.table") if (sum((sp.syno.table[["Latin_name_syn"]] %in% data[["Latin_name"]])) == 0) stop("not a single similar species name in sp and TRY") sp <- as.character(sp) sp.syno.table$sp <- as.character(sp.syno.table$sp) ## traits to extract traits <- c("Leaf.N", "Seed.mass", "SLA", "Wood.density") # lapply to extract res.list <- lapply(sp, FUN = fun.species.traits, species.table = sp.syno.table, traits = traits, data = data) names(res.list) <- sp.syno.table[["Latin_name_syn"]] ##### TRANSFORM LIST INTO A TABLE extract.species.try <- fun.turn.list.in.DF(sp.syno.table[["Latin_name_syn"]] , res.list) ##### TEST OF GOOD EXTRACTION OF TRAITS test.num <- sample((1:length(sp))[!is.na(extract.species.try[["SLA.mean"]])],1) if( extract.species.try[test.num,"SLA.mean"] != fun.species.traits(sp[test.num], species.table = sp.syno.table, traits = traits, data = data)$mean[grep("SLA",traits)]) stop('traits value not good for the species in extraction from TRY') data.frame.TRY <- data.frame(sp = sp, Latin_name = sp.syno.table[["Latin_name_syn"]], extract.species.try, stringsAsFactors =FALSE) if (sum(!data.frame.TRY[["sp"]] == sp) > 0) stop("Wrong order of species code") return(data.frame.TRY) } ############################## ############################## ### NO TRY TRAITS ### function to return mean and sd of traits per species or at genus level in a single line data.frame fun.spe.traits.notry <- function(Latin_name,data.tot,traits.mean,traits.sd,name.match.traits="Latin_name",SD.TF){ mean.vec <- c() sd.vec <- c() genus.vec <- c() for(i in 1:length(traits.mean)){ if(sum(!is.na(data.tot[[traits.mean[i]]]))>0){ data <- data.tot[!is.na(data.tot[[traits.mean[i]]]),] if(Latin_name %in% data[[name.match.traits]] ){ mean.vec[i] <-mean( data[data[[name.match.traits]] %in% Latin_name,traits.mean[i]]) genus.vec[i] <- FALSE if(SD.TF){sd.vec[i] <- mean(data[data[[name.match.traits]] %in% Latin_name,traits.sd[i]],na.rm=TRUE) }else{sd.vec[i] <- NA} }else{## do genus mean genus <- sub(" .*", "", Latin_name) genus.species <- sub(" .*", "", data[[name.match.traits]]) genus.vec[i] <- TRUE mean.vec[i] <- mean(data[genus.species %in% genus,traits.mean[i]],na.rm=TRUE) if(SD.TF){sd.vec[i] <- mean(data[genus.species %in% genus ,traits.sd[i]],na.rm=TRUE) }else{sd.vec[i] <- NA} } }else{ mean.vec[i] <- NA sd.vec[i] <- NA genus.vec[i] <- TRUE
211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257
} } names(mean.vec) <- traits.mean names(sd.vec) <- traits.sd names(genus.vec) <- sub("sd","genus",traits.sd) extract.species.traits <- data.frame(Latin_name,t(mean.vec),t(sd.vec),t(genus.vec)) return(extract.species.traits) } ########### ### FUNCTION TO EXTRACT ALL SPECIES fun.extract.format.sp.traits.NOT.TRY <- function(sp, Latin_name, data,name.match.traits="Latin_name") { require(plyr) ### test data sp and sp.syno.table match if (sum((Latin_name %in% data[[name.match.traits]])) == 0) stop("not a single similar species name in sp and Traits data") ## traits to extract traits <- c("Leaf.N", "Seed.mass", "SLA", "Wood.density","Max.height") ### NEED TO ADD TEST IF SD available in the data if(sum(grepl("sd",names(data)))>0) SD.TF <- TRUE traits.mean <- paste(traits,"mean",sep=".") traits.genus <- paste(traits,"genus",sep=".") if (SD.TF) traits.sd <- paste(traits,"sd",sep=".") ## extract data extract.species.traits <- rbind.fill(lapply(Latin_name,FUN=fun.spe.traits.notry ,data,traits.mean,traits.sd,name.match.traits,SD.TF)) data.frame.TRAITS <- data.frame(sp = sp, Latin_name=Latin_name , extract.species.traits, stringsAsFactors =FALSE) if (sum(!data.frame.TRAITS[["sp"]] == sp) > 0) stop("Wrong order of species code") return(data.frame.TRAITS) }