########################################################
########################################################
###### READ TRY AND FORMAT DATA CHECK ERROR


################
#### use AccSpeciesName because not author name

source("./R/FUN.TRY.R")
library(MASS)
library(doParallel)
library(mvoutlier)

## read TRY data
TRY.DATA <- read.table("./data/raw/DataTRY/TRY_Proposal_177_DataRelease_2013_04_01.txt",
                       sep = "\t",header=TRUE,na.strings="", stringsAsFactors=FALSE)

TRY.DATA2 <- read.table("./data/raw/DataTRY/TRY_Proposal_177_DataRelease_2013_07_23.txt",
                       sep = "\t",header=TRUE,na.strings="", stringsAsFactors=FALSE)

### combine both data set
TRY.DATA <- rbind(TRY.DATA,TRY.DATA2)
rm(TRY.DATA2)
##################################
### ERROR FOUND IN THE DATA BASE
#1
########################
### problem with the seed mass of this obs seed mass = 0 DELETE
TRY.DATA <- TRY.DATA[!(TRY.DATA$ObservationID==1034196 & TRY.DATA$DataName=="Seed dry mass"),]
#### IS "Quercuscrispla sp" an error standing for Quercus crispula synonym of Quercus mongolica subsp. crispula (Blume) Menitsky ? ask Jens
## TRY.DATA[TRY.DATA$AccSpeciesName=="Quercuscrispla sp" ,]


########################
########################
### first create a table with one row per Observation.id and column for each traits and variable

Non.Trait.Data <- c("Latitude", "Longitude", "Reference", "Date of harvest / measurement",
"Altitude", "Mean annual temperature (MAT)","Mean sum of annual precipitation (PPT)",
  "Plant developmental status / plant age","Maximum height reference",
  "Source in Glopnet",  "Number of replicates", "Sun vers. shade leaf qualifier" )

Trait.Data <- sort(names(((table(TRY.DATA$TraitName)))))

##########################
#### REFORMAT DATA from TRY
registerDoParallel(cores=5) ## affect automaticaly half of the core detected to the foreach here I decide to affect 4 cores
getDoParWorkers() ## here 8 core so 4 core if want to use more registerDoParallel(cores=6)

 TRY.DATA.FORMATED <- foreach(ObservationID.t=unique(TRY.DATA$ObservationID), .combine=rbind) %dopar%
	{
            fun.extract.try(ObservationID.t,data=TRY.DATA,Non.Trait.Data,Trait.Data)
	}


## head(TRY.DATA.FORMATED) 
## dim(TRY.DATA.FORMATED) 

saveRDS(TRY.DATA.FORMATED,file="./data/process/TRY.DATA.FORMATED.rds")

########################
########## READ RDS
TRY.DATA.FORMATED <- readRDS("./data/process/TRY.DATA.FORMATED.rds")
## TRY.DATA.FORMATED[TRY.DATA.FORMATED$ObservationID==1034196,"StdValue.Seed.mass"] <- NA
## head(TRY.DATA.FORMATED)




####################
####################
## COMPUTE MEAN AND SD FOR SPECIES from FRENCH NFI
key.main.traits2 <-   c("StdValue.Leaf.nitrogen..N..content.per.dry.mass",
"StdValue.Seed.mass",
"StdValue.Leaf.specific.area..SLA.",
"StdValue.Stem.specific.density..SSD.",
"StdValue.Stem.conduit.area..vessel.and.tracheid.",
"StdValue.Leaf.lifespan")



###############################
##############################
## READ CSV TABLE WITH LATIN NAME and CODE FOR FRENCH NFI DATA

### NEED TO UPDATE WITH ALL SPECIES LATER
species.tab <- read.csv("./data/species.list/species.csv",sep="\t")
species.tab2 <- species.tab[!is.na(species.tab$Latin_name),]
rm(species.tab)
gc()

### species IFN reformat names
species.IFN <- unique(gsub("_", " ", species.tab2$Latin_name))
## clean species names and synonyme names
species.tab2$Latin_name <- (gsub("_", " ", species.tab2$Latin_name))
species.tab2$Latin_name_syn<- (gsub("_", " ", species.tab2$Latin_name_syn))
## remove trailing white space
species.tab2$Latin_name_syn<- trim.trailing(species.tab2$Latin_name_syn)

### export name to check in http://tnrs.iplantcollaborative.org/quick_start.html
old.names <-  unique(species.tab2$Latin_name_syn)
write.csv(as.matrix(old.names),file="./data/process/old.names.csv",row.names=FALSE)

## read data from TNRS iPLANT
test.tnrs <- read.delim("./output/tnrs_results(2).txt",sep="\t", na.strings="",stringsAsFactors=FALSE,header=TRUE)
## need to do the same for TRY to have same match

## ACCORDING TO WILL THE BEST SOURCE IS http://www.theplantlist.org/

###############################################
#### CALLING THE WEBSITE FROM R DOESN'T PROVIDES THE SAME RESULTS
### see http://ask.iplantcollaborative.org/question/839/species-in-tnrs-api-vs-web-app/
## # Example R script which calls the TNRS in the context of adding names to a phylogeny
## ## FROM Boyle et al. 2013 BMC Bioinformatics 
## library(ape)
## library(rjson)	
## library(RCurl) 

## tnrs.api<-'http://tnrs.iplantc.org/tnrsm-svc'

## old.names <-  species.tab2$Latin_name_syn
## #Transporms the vector into a string
## old.names<-paste(old.names,collapse=',')

## #The string needs to be URL-encoded
## old.names<-curlEscape(old.names)

## #Send a request to the TNRS service
## url<-paste(tnrs.api,'/matchNames?retrieve=best&names=',old.names,sep='')
## tnrs.json<-getURL(url) 

## #The response needs to be converted from JSON
## tnrs.results<-fromJSON(tnrs.json)

## #The corrected names are extracted from the response
## names<-sapply(tnrs.results[[1]], function(x) c(x$nameSubmitted,x$acceptedName))
## names<-as.data.frame(t(names),stringsAsFactors=FALSE)

## #If TNRS did not return any accepted name (no match, or name is already accepted), the submitted name is retained
## names[names[,2]=="",2]<-names[names[,2]=="",1] 

### change format try species names
TRY.DATA.FORMATED$AccSpeciesName <- as.character(TRY.DATA.FORMATED$AccSpeciesName)

#### extract mean and sd per species without experimental data and detection of outlier when enough data or if not enough data compute mean of genus
### The detection of outlier is based on teh method in Kattge et al. 2011 only for univariate outlier. I have try other univariate and multivariate method of detection of outlier but didn't work well
res.list <- lapply(species.IFN,FUN=fun.species.traits,species.table=species.tab2,traits=key.main.traits2,data=TRY.DATA.FORMATED)

names(res.list) <- species.IFN

##### TRANSFORM LIST IN A TABLE

data.mean <- t(sapply(species.IFN,FUN=function(i,res.list) res.list[[i]]$mean
            ,res.list=res.list))
data.sd <- t(sapply(species.IFN,FUN=function(i,res.list) res.list[[i]]$sd,res.list=res.list))
data.exp <- t(sapply(species.IFN,FUN=function(i,res.list) res.list[[i]]$exp
            ,res.list=res.list))
data.genus <- t(sapply(species.IFN,FUN=function(i,res.list) res.list[[i]]$genus
            ,res.list=res.list))
data.nobs <- t(sapply(species.IFN,FUN=function(i,res.list) res.list[[i]]$nobs
            ,res.list=res.list))

#### data frame with all
data.ifn.species.try.noout <- data.frame(data.mean,data.sd,data.exp,data.genus,data.nobs)
names(data.ifn.species.try.noout) <-  c(paste(c("Leaf.N","Seed.mass","SLA","Wood.Density"
                                  ,"Vessel.area","Leaf.Lifespan"),"mean",sep=".")
                                  ,paste(c("Leaf.N","Seed.mass","SLA","Wood.Density"
                                  ,"Vessel.area","Leaf.Lifespan"),"sd",sep=".")
                                  ,paste(c("Leaf.N","Seed.mass","SLA","Wood.Density"
                                  ,"Vessel.area","Leaf.Lifespan"),"exp",sep=".")
                                  ,paste(c("Leaf.N","Seed.mass","SLA","Wood.Density"
                                  ,"Vessel.area","Leaf.Lifespan"),"genus",sep=".")
                                  ,paste(c("Leaf.N","Seed.mass","SLA","Wood.Density"
                                  ,"Vessel.area","Leaf.Lifespan"),"nobs",sep="."))


saveRDS(data.ifn.species.try.noout ,file="./data/process/data.ifn.species.try.noout.rds")

####
data.ifn.species.try.noout <- readRDS("./data/process/data.ifn.species.try.noout.rds")

#####################################################################
#### assume that the SD is equal mean SD species 

########
## The table 5 in Kattge et al. 2011 GCB provides estimation of mean species sd
### SLA species sd log 0.09
### Nmass species sd log 0.08
### Seed  Mass sd log 0.13

## #  SEE
## sd.log.SLA <- 0.09 ### based on Kattge et al. 2011
## sd.log.Nmass <- 0.08 ### based on Kattge et al. 2011
## sd.log.Seed.Mass <-  0.13  ### based on Kattge et al. 2011
## sd.log.LL <-  0.03 ### based on Kattge et al. 2011


######################
### Computed sd over the data we have under the assumption sd constant over species
sd.vec.sp <-  rep(NA,6)

for(i in 1:length(key.main.traits2)){
eval(parse(text=paste("lm.obj <-lm(log10(TRY.DATA.FORMATED$",key.main.traits2[i],")~TRY.DATA.FORMATED$AccSpeciesName)")))
sd.vec.sp[i] <- sd(residuals(lm.obj))
}


#######################333
### Computed sd over teh data we have under teh assumption sd constant over genus
sd.vec.genus <-  rep(NA,6)
for(i in 1:length(key.main.traits2)){
eval(parse(text=paste("lm.obj <-lm(log10(TRY.DATA.FORMATED$",key.main.traits2[i],")~sapply(TRY.DATA.FORMATED$AccSpeciesName,fun.get.genus))")))
sd.vec.genus[i] <- sd(residuals(lm.obj))
}


#############
### change value of sd if less than 10 obs assume sd mean
nobs.names <- paste(c("Leaf.N","Seed.mass","SLA","Wood.Density",
                      "Vessel.area","Leaf.Lifespan")
                    ,"nobs",sep=".")
sd.names <- paste(c("Leaf.N","Seed.mass","SLA","Wood.Density","Vessel.area","Leaf.Lifespan")
                    ,"sd",sep=".")
genus.names <- paste(c("Leaf.N","Seed.mass","SLA","Wood.Density",
                       "Vessel.area","Leaf.Lifespan")
                     ,"genus",sep=".")

names(sd.vec.sp) <- c("sdlog10.sp.Nmass","sdlog10.sp.Seed.Mass","sdlog10.sp.SLA",
                      "sdlog10.sp.WD","sdlog10.sp.Vessel","sdlog10.sp.LL")

names(sd.vec.genus) <- c("sdlog10.gs.Nmass","sdlog10.gs.Seed.Mass","sdlog10.gs.SLA",
                         "sdlog10.gs.WD","sdlog10.gs.Vessel","sdlog10.gs.LL")

## save mean species and genus sd
saveRDS(sd.vec.sp,file="./data/process/sd.vec.sp.rds")
saveRDS(sd.vec.genus,file="./data/process/sd.vec.genus.rds")

### function to select obs with less than Nthresh obs
sd.vec.sp <- readRDS(file="./data/process/sd.vec.sp.rds")
sd.vec.genus <- readRDS(file="./data/process/sd.vec.genus.rds")



#### add column with the mean sd species or genus
data.TRY.sd.update <- data.frame(data.ifn.species.try.noout,
                                 data.ifn.species.try.noout[,sd.names]) 

sd.names.1 <- paste(sd.names,1,sep=".")

for (i in 1:length(sd.names.1)){
data.TRY.sd.update[[sd.names.1[i]]][!data.TRY.sd.update[[genus.names[i]]]] <- sd.vec.sp[i]
data.TRY.sd.update[[sd.names.1[i]]][data.TRY.sd.update[[genus.names[i]]]] <- sd.vec.genus[i]
}
head(data.TRY.sd.update,10)

saveRDS(data.TRY.sd.update,file="./data/process/data.TRY.sd.update.rds")

###
# plot sd to show mark
pdf("./figs/sd.traits.pdf")
r <- barplot(sd.vec.sp ,names.arg=c("Leaf.N","SM","SLA","WD","LL"),las=2,ylim=c(0,0.9),ylab="sd log10")
points(r[,1],sd.vec.genus,col="red",pch=16,cex=2)
for (i in 1:length(nobs.names)){
## sd.obs <-   data.TRY.sd.update[[sd.names[i]]][!data.TRY.sd.update[[genus.names[i]]]]
## points(rep(r[i,1],length(sd.obs)),sd.obs)
## sd.obs <-   data.TRY.sd.update[[sd.names[i]]][data.TRY.sd.update[[genus.names[i]]]]
## points(rep(r[i,1],length(sd.obs)),sd.obs,col="red",pch=4)
print(sd.obs)
}
dev.off()