############################################ ############################################ ## FUNCTION TO EXTRACT DECTED OUTLIER AND FORMAT TRY DATA ## Georges Kunstler 14/06/2013 ######################################################## ######################################################## ######################################################## ######################################################## ###Build a function that extract the variables ##'Description of the function to extract data from original TRY data ##' ##' based on the data structure of extraction from TRY data base ##' @title fun.extract.try ##' @param ObservationID.t list of data identifier that we want to extract ##' @param data try data object ##' @param Non.Trait.Data list of names of non traits data that we want to extract ##' @param Trait.Data list of names of traits data that we want to extract ##' @return data.frame with one line per observation id with clumns with ObservationID Species Nontrait data for Traits: OrigValue OrigUnit StdValue ##' @author Kunstler fun.extract.try <- function(ObservationID.t,data,Non.Trait.Data,Trait.Data){ data.temp <- data[data$ObservationID==ObservationID.t,] ## Non trait data Vec.Non.Trait.Data <- rep(NA,length(Non.Trait.Data)) names(Vec.Non.Trait.Data) <- Non.Trait.Data for (i in 1:length(Non.Trait.Data)){ if( sum(data.temp$DataName==Non.Trait.Data[i])==1){ Vec.Non.Trait.Data[i] <- data.temp[data.temp$DataName==Non.Trait.Data[i],"OrigValueStr"] } if(sum(data.temp$DataName==Non.Trait.Data[i])>1){ ## if(sum(data.temp$DataName==Non.Trait.Data[i] & grepl("Mean",data.temp$ValueKindName, ## fixed=TRUE))!=1){ print("error in ValueKindName")} Vec.Non.Trait.Data[i] <- data.temp[data.temp$DataName==Non.Trait.Data[i] , "OrigValueStr"][1] } } ## Trait data Vec.Trait.Data.OrigValue <-Vec.Trait.Data.OrigUnit <- Vec.Trait.Data.StdValue <- rep(NA,length(Trait.Data)) names(Vec.Trait.Data.OrigValue) <- paste("OrigValue",Trait.Data) names(Vec.Trait.Data.OrigUnit) <- paste("OrigUnitName",Trait.Data) names(Vec.Trait.Data.StdValue) <- paste("StdValue",Trait.Data) for (i in 1:length(Trait.Data)){ if(sum(grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE))==1){ Vec.Trait.Data.OrigValue[i] <- data.temp[grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE),"OrigValue"] Vec.Trait.Data.OrigUnit[i] <- data.temp[grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE),"OrigUnitStr"] Vec.Trait.Data.StdValue[i] <- data.temp[grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE),"StdValue"] } if( sum(grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE))>1){ if(sum((data.temp$ValueKindName %in% c("Best estimate","Mean","Site specific mean") & !is.na(data.temp$ValueKindName)))==1){ Vec.Trait.Data.OrigValue[i] <- mean(data.temp[grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE)& (data.temp$ValueKindName %in% c("Best estimate","Mean","Site specific mean") & !is.na(data.temp$ValueKindName)) ,"OrigValue"]) Vec.Trait.Data.OrigUnit[i] <- (data.temp[grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE) & (data.temp$ValueKindName %in% c("Best estimate","Mean","Site specific mean") & !is.na(data.temp$ValueKindName)),"OrigUnitStr"])[1] Vec.Trait.Data.StdValue[i] <- mean(data.temp[grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE) & (data.temp$ValueKindName %in% c("Best estimate","Mean","Site specific mean") & !is.na(data.temp$ValueKindName)),"StdValue"]) } if(sum(data.temp$ValueKindName %in% c("Best estimate","Mean","Site specific mean") )<1){ Vec.Trait.Data.OrigValue[i] <- mean(data.temp[grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE),"OrigValue"],na.rm=T) Vec.Trait.Data.OrigUnit[i] <- (data.temp[grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE) ,"OrigUnitStr"])[1] Vec.Trait.Data.StdValue[i] <- mean(data.temp[grepl(Trait.Data[i],data.temp$TraitName, fixed=TRUE) ,"StdValue"],na.rm=T) } } } ### EXPERIMENTAL DATA TYPE TF.exp.data <- sum(grepl("Growth & measurement conditions - experimental tre",data.temp$NonTraitCategories, fixed=TRUE) )>0 names(TF.exp.data) <- 'TF.exp.data' res.temp <- data.frame("ObservationID"=ObservationID.t,"AccSpeciesName"=unique(data.temp$AccSpeciesName) ,t(Vec.Non.Trait.Data),TF.exp.data, t(Vec.Trait.Data.OrigValue),t(Vec.Trait.Data.OrigUnit),t(Vec.Trait.Data.StdValue)) return(res.temp) } ## outlier detection based on Kattage et al 2011 ##' Detection of univar outlier based on method of Kattge et al. 2011 ##' ##' ##' @title ##' @param x.na ##' @param log ##' @return TRUE FALSE vector to identify outlier TRUE : outlier ##' @author Kunstler fun.out.TF2 <- function(x.na,log=TRUE){ x <- x.na[!is.na(x.na)] x.num <- (1:length(x.na))[!is.na(x.na)] TF.vec <- rep(FALSE,length(x.na)) if(log){ fit.dist <- fitdistr(log10(na.omit(x) ),'normal') high.bound <- fit.dist$estimate["mean"]+2*(fit.dist$estimate["sd"]+fit.dist$sd["sd"]) low.bound <- fit.dist$estimate["mean"]-2*(fit.dist$estimate["sd"]+fit.dist$sd["sd"]) TF.vec[x.num[log10(x)>high.bound | log10(x)<low.bound]] <- TRUE }else{ fit.dist <- fitdistr((na.omit(x) ),'normal') high.bound <- fit.dist$estimate["mean"]+2*(fit.dist$estimate["sd"]+fit.dist$sd["sd"]) low.bound <- fit.dist$estimate["mean"]-2*(fit.dist$estimate["sd"]+fit.dist$sd["sd"]) TF.vec[x.num[(x)>high.bound | (x)<low.bound]] <- TRUE } return((TF.vec)) } ################################### ################################### ####### extract mean sd per species or genus ####### add species synonyme fun.species.traits <- function(species,species.table,col.sp="Latin_name",col.sp.syno="Latin_name_syn",traits,data){ vec.mean <- vec.sd <- vec.nobs <- rep(NA,length(traits)) vec.exp <- vec.genus <- rep(FALSE,length(traits)) names(vec.mean) <- names(vec.sd) <- names(vec.exp) <- names(vec.genus) <- names(vec.nobs)<- traits species.syno <- species.table[species.table[[col.sp]]==species,col.sp.syno] #browser() for(i in traits){ if(sum((data$AccSpeciesName %in% species.syno) & !is.na(data[[i]]))>0){ ## if data for this species or syno if(sum((data$AccSpeciesName %in% species.syno) & (!is.na(data[[i]])) & (!data[["TF.exp.data"]]))>0){## if data with out experiments x <- data[[i]][data$AccSpeciesName %in% species.syno & (!is.na(data[[i]])) & (!data[["TF.exp.data"]])] if(length(x)>50){## if more than 50 obs remove outlier outlier <- fun.out.TF2(x.na=x,log=TRUE) vec.mean[[i]] <- mean(log10(x[!outlier])) vec.sd[[i]] <- sd(log10(x[!outlier])) vec.nobs[[i]] <- length(x[!outlier]) }else{ vec.mean[[i]] <- mean(log10(x)) vec.sd[[i]] <- sd(log10(x)) vec.nobs[[i]] <- length(x)} }else{### include experimental data x <- data[[i]][data$AccSpeciesName %in% species.syno & (!is.na(data[[i]])) ] if(length(x)>50){ outlier <- fun.out.TF2(x.na=x,log=TRUE) vec.mean[[i]] <- mean(log10(x[!outlier])) vec.sd[[i]] <- sd(log10(x[!outlier])) vec.exp[[i]] <- TRUE vec.nobs[[i]] <- length(x[!outlier]) }else{ vec.mean[[i]] <- mean(log10(x)) vec.sd[[i]] <- sd(log10(x)) vec.exp[[i]] <- TRUE vec.nobs[[i]] <- length(x)} } }else{### compte data at genus level if no data for the species genus <- sub(" .*","",species) if(sum(grepl(genus,data$AccSpeciesName) & (!is.na(data[[i]])))>0){ x <- data[[i]][grepl(genus,data$AccSpeciesName,fixed=TRUE ) & (!is.na(data[[i]])) ] if(length(x)>50){ outlier <- fun.out.TF2(x.na=x,log=TRUE) vec.mean[[i]] <- mean(log10(x[!outlier])) vec.sd[[i]] <- sd(log10(x[!outlier])) vec.genus[[i]] <- TRUE vec.nobs[[i]] <- length(x[!outlier]) }else{ vec.mean[[i]] <- mean(log10(x)) vec.sd[[i]] <- sd(log10(x)) vec.genus[[i]] <- TRUE vec.nobs[[i]] <- length(x)} } } } return(list(mean=vec.mean,sd=vec.sd,exp=vec.exp,genus=vec.genus,nobs=vec.nobs)) } ####################### ### FUNCTIONS TO Manipulate species names fun.get.genus <- function(x) gsub(paste(" ",gsub("^([a-zA-Z]* )","",x),sep=""),"",x,fixed=TRUE) trim.trailing <- function (x) sub("\\s+$", "", x)