Commit d3189a85 authored by Georges Kunstler's avatar Georges Kunstler
Browse files

new try

No related merge requests found
Showing with 42 additions and 270 deletions
+42 -270
......@@ -80,29 +80,8 @@ return(res.temp)
##### function to flag outliers
# univar outliers
library(extremevalues)
##' Function to identify univariate outlier from package extrem value
##'
##' Not used in this version
##' @title
##' @param x.na
##' @param distribution
##' @param method
##' @return
##' @author Kunstler
fun.out.TF <- function(x.na,distribution="lognormal",method){
require(extremevalues)
x <- x.na[!is.na(x.na)]
x.num <- (1:length(x.na))[!is.na(x.na)]
TF.vec <- rep(FALSE,length(x.na))
outliers.temp <- getOutliers(x,distribution="lognormal",method=method)
TF.vec[x.num[c(outliers.temp$iLeft,outliers.temp$iRight)]] <- TRUE
return(list(TF.vec,outliers.temp))
}
## second outlier detection based on Kattage et al 2011
## outlier detection based on Kattage et al 2011
##' Detection of univar outlier based on method of Kattge et al. 2011
##'
##'
......@@ -129,141 +108,7 @@ fun.out.TF2 <- function(x.na,log=TRUE){
return((TF.vec))
}
fun.univar.outlier <- function(data,traits,method="I"){
matrix.TF.outliers <- as.data.frame(matrix(NA,nrow=length(data[,1]),ncol=length(traits)))
names(matrix.TF.outliers) <- traits
pdf("./figs/univar.outliers.pdf")
for(i in traits){
x.na <- data[,i]
x <- data[!is.na(data[,i]),i]
if(sum(x<0.0000000001)<1){
list.out <- fun.out.TF(x.na,distribution="lognormal",method=method)
matrix.TF.outliers[,i] <- list.out[[1]]
outliers.temp <- list.out[[2]]
######
hist.obj <- hist(log10(x),main=i,breaks=40,xlim=range(log10(x),log10(outliers.temp$limit)))
hist(log10(x)[!data[!is.na(data[,i]),"TF.exp.data"]], breaks=hist.obj$breaks,add=TRUE,col="grey")
abline(v=log10(outliers.temp$limit[1]),col="red")
abline(v=log10(outliers.temp$limit[2]),col="red")
}else{print(paste(i,"negative"))
list.out <- fun.out.TF(x.na,distribution="lognormal",method=method)
matrix.TF.outliers[,i] <- list.out[[1]]
outliers.temp <- list.out[[2]]
TF.vec <- fun.out.TF(x,distribution="normal",method=method)
matrix.TF.outliers[,i] <- TF.vec
}
}
dev.off()
return(matrix.TF.outliers)
}
##################
## Multi variates outlier
fun.multi.measure <- function(X,Y,data){
return(sum(!is.na(data[,X]) & !is.na(data[,Y])))
}
fun.multi.measure.mat <- function(traits,data){
mat.multi <- matrix(NA,length(traits),length(traits))
for (i in 1:(length(traits)-1)){
for (j in (i+1):length(traits)){
mat.multi[i,j] <- fun.multi.measure(traits[i],traits[j],data=data)
}
}
rownames(mat.multi) <- traits
colnames(mat.multi) <- traits
return(mat.multi)
}
#### function detect outylier with two methods
fun.mv.out <- function(data.t,outbound,quant.treshold=0.995){
require(mvoutlier)
outliers.temp <- pcout(data.t,makeplot=FALSE,outbound)
Mahalanobi.dist <- covMcd(data.t)$mah
Mah.vec.01 <- rep(1,length(Mahalanobi.dist))
Mah.vec.01[Mahalanobi.dist>quantile(Mahalanobi.dist,probs=quant.treshold)] <- 0
return(data.frame(Mah.out=Mah.vec.01,pcout.01=outliers.temp$wfinal01))
}
####
fun.multivar.outlier <- function(data,traits,lim.obs=50,outbound=0.25){
## compute number of multi traits measurments
mat.multi <- fun.multi.measure.mat(traits,data)
## loop over traits combination and identify combination with enough observation
## mat to save outliers
mat.mv.outliers <- array(NA,dim=c(nrow(data),length(traits),length(traits)),dimnames=list(data$ObservationID,traits,traits))
pdf( "./figs/mv.outliers.pdf")
for (i in 1:(length(traits)-1)){
for (j in (i+1):length(traits)){
if(mat.multi[i,j]>lim.obs){
data.t <- log10(data[apply(is.na(data[,c(traits[i],traits[j]) ]),MARGIN=1,FUN=sum)<1,c(traits[i],traits[j])])
out.data <- fun.mv.out(data.t,outbound,quant.treshold=0.995)
mat.mv.outliers[apply(is.na(data[,c(traits[i],traits[j]) ]),MARGIN=1,FUN=sum)<1,i,j] <- out.data$Mah.out ##out.data$pcout.01
plot(data.t[,1],data.t[,2],col=c("grey","black")[out.data$pcout.01+1],xlab=traits[i],ylab=traits[j],
main=paste("outliers",i,j,"pdf",sep="."))
points(data.t[out.data$Mah.out==0,1],data.t[out.data$Mah.out==0,2],pch=3,col="red")
}
}
}
dev.off()
return(mat.mv.outliers)
}
############################################################
####################
## function for plot
fun.log.scatter.with.marg.hist <- function(x,y,name.x,name.y,lab.x,lab.y,outlier.x,outlier.y,array.outlier){
if(sum(apply(is.na(cbind(x,y)),MARGIN=1,FUN=sum)<1)>1){
data.t <- ((cbind(x,y)[apply(is.na(cbind(x,y)),MARGIN=1,FUN=sum)<1,]))
def.par <- par(no.readonly = TRUE) # save default, for resetting...
xhist <- hist(log10(x), breaks=20,plot=FALSE)
yhist <- hist(log10(y), breaks=20,plot=FALSE)
top <- max(c(xhist$counts, yhist$counts))
nf <- layout(matrix(c(2,0,1,3),2,2,byrow=TRUE), c(3,1), c(1,3), TRUE)
#layout.show(nf)
## colors for outlier
col.vec <- rep(1,length(x[apply(is.na(cbind(x,y)),MARGIN=1,FUN=sum)<1]))
col.vec[apply((cbind(outlier.x,outlier.y)[apply(is.na(cbind(x,y)),MARGIN=1,FUN=sum)<1,]),MARGIN=1,FUN=sum)>0] <- 2
## col for mv outlier
col.mvout.vec <- array.outlier[apply(is.na(cbind(x,y)),MARGIN=1,FUN=sum)<1,name.x,name.y]
par(mar=c(5,5,1,1))
plot(data.t[,1],data.t[,2],log="xy"
,xlab=lab.x,ylab=lab.y,xlim=range(x,na.rm=TRUE),
ylim=range(y,na.rm=TRUE),col=c("black","red")[col.vec],pch=c(16,2)[col.vec],cex=c(0.5,1)[col.vec])
if ( sum(!is.na(col.mvout.vec))>1) {
points(data.t[col.mvout.vec==0 & !is.na(col.mvout.vec),1],data.t[col.mvout.vec==0 & !is.na(col.mvout.vec),2],pch=3,col="blue")
}
par(mar=c(0,5,1,1))
barplot(xhist$counts, axes=FALSE, ylim=c(0, top), space=0)
high.out <- hist(log10(x[outlier.x])[x[outlier.x]>mean(x[outlier.x],na.rm=T)],breaks=xhist$breaks,plot=FALSE)
barplot(high.out$count,col="red",add=TRUE, space=0)
low.out <- hist(log10(x[outlier.x])[x[outlier.x]<mean(x[outlier.x],na.rm=T)],breaks=xhist$breaks,plot=FALSE)
barplot(low.out$count,col="red",add=TRUE, space=0)
par(mar=c(5,0,1,1))
barplot(yhist$counts,axes=FALSE, xlim=c(0, top), space=0, horiz=TRUE)
high.out <- hist(log10(y[outlier.y])[y[outlier.y]>mean(y[outlier.y],na.rm=T)],breaks=yhist$breaks,plot=FALSE)
barplot(high.out$count,col="red",add=TRUE, space=0, horiz=TRUE)
low.out <- hist(log10(y[outlier.y])[y[outlier.y]<mean(y[outlier.y],na.rm=T)],breaks=yhist$breaks,plot=FALSE)
barplot(low.out$count,col="red",add=TRUE, space=0, horiz=TRUE)
par(def.par)}else{print("less than 2 observation in common in x and y")}
}
###################################
......@@ -332,30 +177,3 @@ return(list(mean=vec.mean,sd=vec.sd,exp=vec.exp,genus=vec.genus,nobs=vec.nobs))
fun.get.genus <- function(x) gsub(paste(" ",gsub("^([a-zA-Z]* )","",x),sep=""),"",x,fixed=TRUE)
trim.trailing <- function (x) sub("\\s+$", "", x)
#####FUN to COMPUTE MEAN SD FOR GENUS OR SPECIES
fun.sd.sp.or.genus <- function(traits,species,genus=FALSE){
if(genus){
genus <- sapply(species,fun.get.genus)
data.t <- cbind(tapply((traits),
INDEX = genus,FUN=sd,na.rm=TRUE),
tapply(!is.na(traits),
INDEX = genus,FUN=sum,na.rm=TRUE))
colnames(data.t) <- c("sd","nobs")
return(data.t)
}else{
data.t <- cbind(tapply((traits),
INDEX = species,FUN=sd,na.rm=TRUE),
tapply(!is.na(traits),
INDEX = species,FUN=sum,na.rm=TRUE))
colnames(data.t) <- c("sd","nobs")
return(data.t)
}
}
## function to compute mean sd over species or genus with more than Nobs.lim observation
fun.sd.mean.sp.or.genus <- function(traits,species,genus=FALSE,Nobs.lim)
{
data.temp <- fun.sd.sp.or.genus((traits),species,genus=genus)
return(mean(data.temp[data.temp[,"nobs"]>Nobs.lim & !is.na(data.temp[,"sd"]),"sd"]))
}
......@@ -23,7 +23,7 @@ TRY.DATA <- rbind(TRY.DATA,TRY.DATA2)
rm(TRY.DATA2)
##################################
### ERROR FOUND IN THE DATA BASE
#1
########################
### problem with the seed mass of this obs seed mass = 0 DELETE
TRY.DATA <- TRY.DATA[!(TRY.DATA$ObservationID==1034196 & TRY.DATA$DataName=="Seed dry mass"),]
......@@ -118,18 +118,6 @@ test.tnrs <- read.delim("./output/tnrs_results(2).txt",sep="\t", na.strings="",s
## tnrs.api<-'http://tnrs.iplantc.org/tnrsm-svc'
## #Tree topology from Ackerly, D. 2009. Conservatism and diversification of plant functional traits: Evolutionary rates versus phylogenetic signal. PNAS 106:19699--19706.
## lobelioids.string<-'((((((Lobelia_kauaensis,Lobelia_villosa),Lobelia_gloria-montis),(Trematolobelia_kauaiensis,Trematolobelia_macrostachys)),((Lobelia_hypoleuca,Lobelia_yuccoides),Lobelia_niihauensis)),((Brighamia_insignis,Brighamia_rockii),(Delissea_rhytidosperma,Delissea_subcordata))),((((Cyanea_pilosa,Cyanea_acuminata),Cyanea_hirtella),(Cyanea_coriacea,Cyanea_leptostegia)),(((Clermontia_kakeana,Clermontia_parviflora),Clermontia_arborescens),Clermontia_fauriei)));'
## #Transform the newick sting into an ape phylo object
## tree<-read.tree(text=lobelioids.string)
## #Obtain the taxa names
## old.names<-tree$tip.label
## #Change the underscore characters into blank spaces
## old.names<-gsub('_',' ',old.names)
## old.names <- species.tab2$Latin_name_syn
## #Transporms the vector into a string
## old.names<-paste(old.names,collapse=',')
......@@ -151,14 +139,6 @@ test.tnrs <- read.delim("./output/tnrs_results(2).txt",sep="\t", na.strings="",s
## #If TNRS did not return any accepted name (no match, or name is already accepted), the submitted name is retained
## names[names[,2]=="",2]<-names[names[,2]=="",1]
## ### SAME ERROR FOR FAGUS SYLVATICA TEH WEB SITE GIVE A GOOD RESULTS BUT NOT THE CALL FROM R ?
old.names <- unique(species.tab2$Latin_name_syn)
old.names<-gsub('_',' ',old.names)
write.csv(as.matrix(old.name),row,names=FALSE)
### change format try species names
TRY.DATA.FORMATED$AccSpeciesName <- as.character(TRY.DATA.FORMATED$AccSpeciesName)
......@@ -193,10 +173,6 @@ names(data.ifn.species.try.noout) <- c(paste(c("Leaf.N","Seed.mass","SLA","Wood
,paste(c("Leaf.N","Seed.mass","SLA","Wood.Density"
,"Vessel.area","Leaf.Lifespan"),"nobs",sep="."))
#### check species with genus mean
## data.ifn.species.try.noout[data.ifn.species.try.noout$SLA.genus,]
## data.ifn.species.try.noout[data.ifn.species.try.noout$SLA.genus |data.ifn.species.try.noout$Wood.Density.genus |data.ifn.species.try.noout$Seed.mass.genus ,]
saveRDS(data.ifn.species.try.noout ,file="./data/process/data.ifn.species.try.noout.rds")
......@@ -204,7 +180,7 @@ saveRDS(data.ifn.species.try.noout ,file="./data/process/data.ifn.species.try.no
data.ifn.species.try.noout <- readRDS("./data/process/data.ifn.species.try.noout.rds")
#####################################################################
#### assume that the SD is equal mean species
#### assume that the SD is equal mean SD species
########
## The table 5 in Kattge et al. 2011 GCB provides estimation of mean species sd
......@@ -212,92 +188,70 @@ data.ifn.species.try.noout <- readRDS("./data/process/data.ifn.species.try.noout
### Nmass species sd log 0.08
### Seed Mass sd log 0.13
# SEE
sd.log.SLA <- 0.09 ### based on Kattge et al. 2011
sd.log.Nmass <- 0.08 ### based on Kattge et al. 2011
sd.log.Seed.Mass <- 0.13 ### based on Kattge et al. 2011
sd.log.LL <- 0.03 ### based on Kattge et al. 2011
## # SEE
## sd.log.SLA <- 0.09 ### based on Kattge et al. 2011
## sd.log.Nmass <- 0.08 ### based on Kattge et al. 2011
## sd.log.Seed.Mass <- 0.13 ### based on Kattge et al. 2011
## sd.log.LL <- 0.03 ### based on Kattge et al. 2011
######################
### Computed sd over the data we have under the assumption sd constant over species
lm.obj <- (lm(log10(TRY.DATA.FORMATED$StdValue.Stem.specific.density..SSD.)~ TRY.DATA.FORMATED$AccSpeciesName))
sd.log.WD <-sd(residuals(lm.obj))
sd.vec.sp <- rep(NA,6)
lm.obj <- (lm(log10(TRY.DATA.FORMATED$StdValue.Leaf.specific.area..SLA.)~ TRY.DATA.FORMATED$AccSpeciesName))
sd.log.SLA <-sd(residuals(lm.obj))
lm.obj <- (lm(log10(TRY.DATA.FORMATED$StdValue.Leaf.lifespan)~ TRY.DATA.FORMATED$AccSpeciesName))
sd.log.LL <-sd(residuals(lm.obj))
lm.obj <- (lm(log10(TRY.DATA.FORMATED$StdValue.Seed.mass)~ TRY.DATA.FORMATED$AccSpeciesName))
sd.log.Seed.Mass <-sd(residuals(lm.obj))
for(i in 1:length(key.main.traits2)){
eval(parse(text=paste("lm.obj <-lm(log10(TRY.DATA.FORMATED$",key.main.traits2[i],")~TRY.DATA.FORMATED$AccSpeciesName)")))
sd.vec.sp[i] <- sd(residuals(lm.obj))
}
lm.obj <- (lm(log10(TRY.DATA.FORMATED$StdValue.Leaf.nitrogen..N..content.per.dry.mass)~ TRY.DATA.FORMATED$AccSpeciesName))
sd.log.Nmass <-sd(residuals(lm.obj))
#######################333
### Computed sd over teh data we have under teh assumption sd constant over genus
lm.obj <- lm(log10(TRY.DATA.FORMATED$StdValue.Stem.specific.density..SSD.)~ sapply(TRY.DATA.FORMATED$AccSpeciesName,fun.get.genus))
sd.log.WD.genus <-sd(residuals(lm.obj))
lm.obj <- lm(log10(TRY.DATA.FORMATED$StdValue.Leaf.specific.area..SLA.)~ sapply(TRY.DATA.FORMATED$AccSpeciesName,fun.get.genus))
sd.log.SLA.genus <-sd(residuals(lm.obj))
lm.obj <- lm(log10(TRY.DATA.FORMATED$StdValue.Leaf.lifespan)~ sapply(TRY.DATA.FORMATED$AccSpeciesName,fun.get.genus))
sd.log.LL.genus <-sd(residuals(lm.obj))
lm.obj <- lm(log10(TRY.DATA.FORMATED$StdValue.Seed.mass)~ sapply(TRY.DATA.FORMATED$AccSpeciesName,fun.get.genus))
sd.log.Seed.Mass.genus <-sd(residuals(lm.obj))
lm.obj <- lm(log10(TRY.DATA.FORMATED$StdValue.Leaf.nitrogen..N..content.per.dry.mass)~ sapply(TRY.DATA.FORMATED$AccSpeciesName,fun.get.genus))
sd.log.Nmass.genus <-sd(residuals(lm.obj))
sd.vec.genus <- rep(NA,6)
for(i in 1:length(key.main.traits2)){
eval(parse(text=paste("lm.obj <-lm(log10(TRY.DATA.FORMATED$",key.main.traits2[i],")~sapply(TRY.DATA.FORMATED$AccSpeciesName,fun.get.genus))")))
sd.vec.genus[i] <- sd(residuals(lm.obj))
}
#############
### change value of sd if less than 10 obs assume sd mean
nobs.names <- paste(c("Leaf.N","Seed.mass","SLA","Wood.Density","Leaf.Lifespan")
nobs.names <- paste(c("Leaf.N","Seed.mass","SLA","Wood.Density",
"Vessel.area","Leaf.Lifespan")
,"nobs",sep=".")
sd.names <- paste(c("Leaf.N","Seed.mass","SLA","Wood.Density","Leaf.Lifespan")
sd.names <- paste(c("Leaf.N","Seed.mass","SLA","Wood.Density","Vessel.area","Leaf.Lifespan")
,"sd",sep=".")
genus.names <- paste(c("Leaf.N","Seed.mass","SLA","Wood.Density","Leaf.Lifespan")
,"genus",sep=".")
genus.names <- paste(c("Leaf.N","Seed.mass","SLA","Wood.Density",
"Vessel.area","Leaf.Lifespan")
,"genus",sep=".")
sd.vec.sp <- c(sd.log.Nmass,sd.log.Seed.Mass,sd.log.SLA,sd.log.WD,sd.log.LL)
names(sd.vec.sp) <- c("sdlog10.sp.Nmass","sdlog10.sp.Seed.Mass","sdlog10.sp.SLA","sdlog10.sp.WD","sdlog10.sp.LL")
names(sd.vec.sp) <- c("sdlog10.sp.Nmass","sdlog10.sp.Seed.Mass","sdlog10.sp.SLA",
"sdlog10.sp.WD","sdlog10.sp.Vessel","sdlog10.sp.LL")
sd.vec.genus <- c(sd.log.Nmass.genus,sd.log.Seed.Mass.genus,sd.log.SLA.genus,sd.log.WD.genus,sd.log.LL.genus)
names(sd.vec.genus) <- c("sdlog10.gs.Nmass","sdlog10.gs.Seed.Mass","sdlog10.gs.SLA","sdlog10.gs.WD","sdlog10.gs.LL")
names(sd.vec.genus) <- c("sdlog10.gs.Nmass","sdlog10.gs.Seed.Mass","sdlog10.gs.SLA",
"sdlog10.gs.WD","sdlog10.gs.Vessel","sdlog10.gs.LL")
## save mean species and genus sd
saveRDS(sd.vec.sp,file="./data/process/sd.vec.sp.rds")
saveRDS(sd.vec.genus,file="./data/process/sd.vec.genus.rds")
### function to select obs with less than Nthresh obs
fun.select.sd.with.to.few.obs.sp <- function(data,sd.names,nobs.names,genus.names,Nthreshold=10){
(data[[nobs.names[i]]]<Nthreshold & !is.na(data[[nobs.names[i]]])
& !data[[genus.names[i]]]) }
sd.vec.sp <- readRDS(file="./data/process/sd.vec.sp.rds")
sd.vec.genus <- readRDS(file="./data/process/sd.vec.genus.rds")
fun.select.sd.with.to.few.obs.genus <- function(data,sd.names,nobs.names,genus.names,Nthreshold=10){
(data[[nobs.names[i]]]<Nthreshold & !is.na(data[[nobs.names[i]]])
& data[[genus.names[i]]]) }
####
data.TRY.sd.update <- data.ifn.species.try.noout
for (i in 1:length(nobs.names)){
print( sd.names[i])
print("species")
print(fun.select.sd.with.to.few.obs.sp(data=data.TRY.sd.update,sd.names,nobs.names,genus.names,Nthreshold=Nobs.lim))
data.TRY.sd.update[[sd.names[i]]][fun.select.sd.with.to.few.obs.sp(data=data.TRY.sd.update,sd.names,nobs.names,genus.names,Nthreshold=Nobs.lim)] <-
sd.vec.sp[i]
print("genus")
print(fun.select.sd.with.to.few.obs.genus(data=data.TRY.sd.update,sd.names,nobs.names,genus.names,Nthreshold=Nobs.lim))
data.TRY.sd.update[[sd.names[i]]][fun.select.sd.with.to.few.obs.genus(data=data.TRY.sd.update,sd.names,nobs.names,genus.names,Nthreshold=Nobs.lim)] <-
sd.vec.genus[i]
#### add column with the mean sd species or genus
data.TRY.sd.update <- data.frame(data.ifn.species.try.noout,
data.ifn.species.try.noout[,sd.names])
sd.names.1 <- paste(sd.names,1,sep=".")
for (i in 1:length(sd.names.1)){
data.TRY.sd.update[[sd.names.1[i]]][!data.TRY.sd.update[[genus.names[i]]]] <- sd.vec.sp[i]
data.TRY.sd.update[[sd.names.1[i]]][data.TRY.sd.update[[genus.names[i]]]] <- sd.vec.genus[i]
}
head(data.TRY.sd.update,10)
saveRDS(data.TRY.sd.update,file="./data/process/data.TRY.sd.update.rds")
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment