FUN.TRY.R 12.37 KiB
############################################
############################################
## FUNCTION TO EXTRACT DECTED OUTLIER AND FORMAT TRY DATA
## Georges Kunstler 14/06/2013
library(MASS)
library(doParallel)
library(mvoutlier)
########################################################
########################################################
########################################################
########################################################
###Build a function that  extract the variables
##'Description of the function to extract data from original TRY data
##'
##' based on the data structure of extraction from TRY data base
##' @title fun.extract.try
##' @param ObservationID.t  list of data identifier that we want to extract
##' @param data try data object
##' @param Non.Trait.Data list of names of non traits data that we want to extract
##' @param Trait.Data list of names of traits data that we want to extract
##' @return data.frame with one line per observation id with clumns with ObservationID Species Nontrait data for Traits: OrigValue OrigUnit StdValue 
##' @author Kunstler 
fun.extract.try <- function(ObservationID.t,data,Non.Trait.Data,Trait.Data){
data.temp  <- data[data$ObservationID==ObservationID.t,]
## Non trait data
Vec.Non.Trait.Data <- rep(NA,length(Non.Trait.Data))
names(Vec.Non.Trait.Data) <- Non.Trait.Data
for (i in 1:length(Non.Trait.Data)){
   if( sum(data.temp$DataName==Non.Trait.Data[i])==1){
Vec.Non.Trait.Data[i] <- data.temp[data.temp$DataName==Non.Trait.Data[i],"OrigValueStr"]
   if(sum(data.temp$DataName==Non.Trait.Data[i])>1){
       ## if(sum(data.temp$DataName==Non.Trait.Data[i] & grepl("Mean",data.temp$ValueKindName,
       ##      fixed=TRUE))!=1){ print("error in ValueKindName")}
       Vec.Non.Trait.Data[i] <- data.temp[data.temp$DataName==Non.Trait.Data[i] ,
                                    "OrigValueStr"][1]
## Trait data
Vec.Trait.Data.OrigValue <-Vec.Trait.Data.OrigUnit <- Vec.Trait.Data.StdValue <-
    rep(NA,length(Trait.Data))
names(Vec.Trait.Data.OrigValue) <- paste("OrigValue",Trait.Data)
names(Vec.Trait.Data.OrigUnit) <- paste("OrigUnitName",Trait.Data)
names(Vec.Trait.Data.StdValue) <- paste("StdValue",Trait.Data)
for (i in 1:length(Trait.Data)){
   if(sum(grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE))==1){
Vec.Trait.Data.OrigValue[i] <- data.temp[grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE),"OrigValue"]
Vec.Trait.Data.OrigUnit[i] <- data.temp[grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE),"OrigUnitStr"]
Vec.Trait.Data.StdValue[i] <- data.temp[grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE),"StdValue"]
   if( sum(grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE))>1){
      if(sum((data.temp$ValueKindName %in% c("Best estimate","Mean","Site specific mean") & !is.na(data.temp$ValueKindName)))==1){
       Vec.Trait.Data.OrigValue[i] <- mean(data.temp[grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE)&
           (data.temp$ValueKindName %in% c("Best estimate","Mean","Site specific mean") & !is.na(data.temp$ValueKindName)) ,"OrigValue"])
       Vec.Trait.Data.OrigUnit[i] <- (data.temp[grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE) &
           (data.temp$ValueKindName %in% c("Best estimate","Mean","Site specific mean") & !is.na(data.temp$ValueKindName)),"OrigUnitStr"])[1]
      Vec.Trait.Data.StdValue[i] <- mean(data.temp[grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE) &
           (data.temp$ValueKindName %in% c("Best estimate","Mean","Site specific mean") & !is.na(data.temp$ValueKindName)),"StdValue"])
      if(sum(data.temp$ValueKindName %in% c("Best estimate","Mean","Site specific mean")  )<1){
       Vec.Trait.Data.OrigValue[i] <- mean(data.temp[grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE),"OrigValue"],na.rm=T)
       Vec.Trait.Data.OrigUnit[i] <- (data.temp[grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE) ,"OrigUnitStr"])[1]
      Vec.Trait.Data.StdValue[i] <- mean(data.temp[grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE) ,"StdValue"],na.rm=T)
7172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
} } } ### EXPERIMENTAL DATA TYPE TF.exp.data <- sum(grepl("Growth & measurement conditions - experimental tre",data.temp$NonTraitCategories, fixed=TRUE) )>0 names(TF.exp.data) <- 'TF.exp.data' res.temp <- data.frame("ObservationID"=ObservationID.t,"AccSpeciesName"=unique(data.temp$AccSpeciesName) ,t(Vec.Non.Trait.Data),TF.exp.data, t(Vec.Trait.Data.OrigValue),t(Vec.Trait.Data.OrigUnit),t(Vec.Trait.Data.StdValue)) return(res.temp) } ## outlier detection based on Kattage et al 2011 ##' Detection of univar outlier based on method of Kattge et al. 2011 ##' ##' ##' @title ##' @param x.na ##' @param log ##' @return TRUE FALSE vector to identify outlier TRUE : outlier ##' @author Kunstler fun.out.TF2 <- function(x.na,log=TRUE){ x <- x.na[!is.na(x.na)] x.num <- (1:length(x.na))[!is.na(x.na)] TF.vec <- rep(FALSE,length(x.na)) if(log){ fit.dist <- fitdistr(log10(na.omit(x) ),'normal') high.bound <- fit.dist$estimate["mean"]+2*(fit.dist$estimate["sd"]+fit.dist$sd["sd"]) low.bound <- fit.dist$estimate["mean"]-2*(fit.dist$estimate["sd"]+fit.dist$sd["sd"]) TF.vec[x.num[log10(x)>high.bound | log10(x)<low.bound]] <- TRUE }else{ fit.dist <- fitdistr((na.omit(x) ),'normal') high.bound <- fit.dist$estimate["mean"]+2*(fit.dist$estimate["sd"]+fit.dist$sd["sd"]) low.bound <- fit.dist$estimate["mean"]-2*(fit.dist$estimate["sd"]+fit.dist$sd["sd"]) TF.vec[x.num[(x)>high.bound | (x)<low.bound]] <- TRUE } return((TF.vec)) } ######################## ######################## ### FUNCTION TO COMPUTE QUANTILE FOR HEIGHT f.quantile <- function (x,ind,probs){quantile(x[ind],probs=probs,na.rm=TRUE)} f.quantile.boot2 <- function(x,R,probs=0.99){ require(boot) if(length(na.exclude(x))>0){ quant.boot <- boot(x,f.quantile,R=R,probs=probs) return(c(mean=mean(quant.boot$t),sd=sd(quant.boot$t), nobs=length(na.exclude(x)))) }else{ return(c(mean=NA,sd=NA,nobs=NA)) } } ##################### ##################### ##### FUNcCTION TO COMPUTE MEAN SD AND NOBS WITH OR WITHOUT OUTLIER fun.mean.sd.nobs.out <- function(x,i){ if(length(x)>50){## if more than 50 obs remove outlier outlier <- fun.out.TF2(x.na=x,log=TRUE) if(i=="StdValue.Plant.height.vegetative"){ res.temp <- f.quantile.boot2(log10(x[!outlier]),R=1000,probs=0.99) }else{
141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
res.temp <- c(mean(log10(x[!outlier])), sd(log10(x[!outlier])), length(x[!outlier]))} }else{ if(i=="StdValue.Plant.height.vegetative"){ res.temp <- f.quantile.boot2(log10(x),R=1000,probs=0.99) }else{ res.temp <- c(mean(log10(x)), sd(log10(x)), length(x))} } return(res.temp) } ################################### ################################### ####### extract mean sd per species or genus ####### added species synonyme fun.species.traits <- function(species.code,species.table,col.sp="sp",col.sp.syno="Latin_name_syn",traits,data){ vec.mean <- vec.sd <- vec.nobs <- rep(NA,length(traits)) vec.exp <- vec.genus <- rep(FALSE,length(traits)) names(vec.mean) <- names(vec.sd) <- names(vec.exp) <- names(vec.genus) <- names(vec.nobs)<- traits species.syno <- species.table[species.table[[col.sp]]==species.code,col.sp.syno] #browser() for(i in traits){ if(sum((data$AccSpeciesName %in% species.syno) & !is.na(data[[i]]))>0){ ## if data for this species or syno if(sum((data$AccSpeciesName %in% species.syno) & (!is.na(data[[i]])) & (!data[["TF.exp.data"]]))>0){## if data with out experiments x <- data[[i]][data$AccSpeciesName %in% species.syno & (!is.na(data[[i]])) & (!data[["TF.exp.data"]])] res.temp <- fun.mean.sd.nobs.out(x,i) vec.mean[[i]] <- res.temp[1] vec.sd[[i]] <- res.temp[2] vec.nobs[[i]] <- res.temp[3] }else{### include experimental data x <- data[[i]][data$AccSpeciesName %in% species.syno & (!is.na(data[[i]])) ] res.temp <- fun.mean.sd.nobs.out(x,i) vec.mean[[i]] <- res.temp[1] vec.sd[[i]] <- res.temp[2] vec.nobs[[i]] <- res.temp[3] vec.exp[[i]] <- TRUE } }else{### compute data at genus level if no data for the species genus <- sub(" .*","",species.syno) if(sum(grepl(genus,data$AccSpeciesName) & (!is.na(data[[i]])))>0){ x <- data[[i]][grepl(genus,data$AccSpeciesName,fixed=TRUE ) & (!is.na(data[[i]])) ] res.temp <- fun.mean.sd.nobs.out(x,i) vec.mean[[i]] <- res.temp[1] vec.sd[[i]] <- res.temp[2] vec.nobs[[i]] <- res.temp[3] vec.genus[[i]] <- TRUE } } } return(list(mean=vec.mean,sd=vec.sd,exp=vec.exp,genus=vec.genus,nobs=vec.nobs)) } ####################### ### FUNCTIONS TO Manipulate species names fun.get.genus <- function(x) gsub(paste(" ",gsub("^([a-zA-Z]* )","",x),sep=""),"",x,fixed=TRUE) trim.trailing <- function (x) sub("\\s+$", "", x) ####################################### ####################################### ##### FUN TO EXTRACT FOR A GIVEN DATA BASE
211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278
fun.turn.list.in.DF <- function(sp,res.list){ data.mean <- t(sapply(sp,FUN=function(i,res.list) res.list[[i]]$mean ,res.list=res.list)) data.sd <- t(sapply(sp,FUN=function(i,res.list) res.list[[i]]$sd,res.list=res.list)) data.exp <- t(sapply(sp,FUN=function(i,res.list) res.list[[i]]$exp ,res.list=res.list)) data.genus <- t(sapply(sp,FUN=function(i,res.list) res.list[[i]]$genus ,res.list=res.list)) data.nobs <- t(sapply(sp,FUN=function(i,res.list) res.list[[i]]$nobs ,res.list=res.list)) ## create data.frame withh all observation extract.species.try <- data.frame(data.mean,data.sd,data.exp,data.genus,data.nobs) names(extract.species.try) <- c(paste(c("Leaf.N","Seed.mass","SLA","Wood.Density","Height") ,"mean",sep=".") ,paste(c("Leaf.N","Seed.mass","SLA","Wood.Density","Height") ,"sd",sep=".") ,paste(c("Leaf.N","Seed.mass","SLA","Wood.Density","Height") ,"exp",sep=".") ,paste(c("Leaf.N","Seed.mass","SLA","Wood.Density","Height") ,"genus",sep=".") ,paste(c("Leaf.N","Seed.mass","SLA","Wood.Density","Height") ,"nobs",sep=".")) return(extract.species.try) } fun.extract.format.sp.traits.TRY <- function(sp,sp.syno.table,data){ ## check syno data if not create a table with column syno repating the species ### test data sp and sp.syno.table match if(sum(!(sp %in% sp.syno.table[["sp"]] ))>0) stop('not same species name in sp and sp.syno.table') if(sum((sp.syno.table[["Latin_name_syn"]] %in% data[["AccSpeciesName"]] ))==0) stop('not a single similar species name in sp and TRY') ## extract traits <- c("StdValue.Leaf.nitrogen..N..content.per.dry.mass", "StdValue.Seed.mass", "StdValue.Leaf.specific.area..SLA.", "StdValue.Stem.specific.density..SSD.", "StdValue.Plant.height.vegetative") res.list <- lapply(sp,FUN=fun.species.traits,species.table=sp.syno.table,traits=traits,data=data) names(res.list) <- sp ##### TRANSFORM LIST IN A TABLE extract.species.try <- fun.turn.list.in.DF(sp,res.list) ############### add mean sd of species or genus if we want to use that sd.vec.sp <- readRDS(file="./data/process/sd.vec.sp.rds") sd.vec.genus <- readRDS(file="./data/process/sd.vec.genus.rds") sd.names <- paste(c("Leaf.N","Seed.mass","SLA","Wood.Density","Height") ,"sd",sep=".") genus.names <- paste(c("Leaf.N","Seed.mass","SLA","Wood.Density","Height") ,"genus",sep=".") ### add columns extract.species.try.2 <- data.frame(extract.species.try, extract.species.try[,sd.names]) ## update value sd.names.1 <- paste(sd.names,1,sep=".") for (i in 1:length(sd.names.1)){ extract.species.try.2[[sd.names.1[i]]][!extract.species.try.2[[genus.names[i]]]] <- sd.vec.sp[i] extract.species.try.2[[sd.names.1[i]]][extract.species.try.2[[genus.names[i]]]] <- sd.vec.genus[i] } data.frame.TRY <- data.frame(sp=sp,Latin_name=sp.syno.table[["Latin_name_syn"]],extract.species.try.2) return(data.frame.TRY) }