format.function.R


###################################################
###################################################
###################################################
##### FUNCTION TO FORMAT DATA FOR THE WORKSHOP ANALYSIS


############################
## FUNCTION remove trailing white space
trim.trailing <- function (x) sub("\\s+$", "", x)


#############################################
## FUN to clean species name
fun.clean.species.tab <- function(species.tab){
species.tab2 <- species.tab[!is.na(species.tab$Latin_name),]
### species IFN reformat names
## clean species names and synonyme names
species.tab2$Latin_name <- (gsub("_", " ", species.tab2$Latin_name))
species.tab2$Latin_name_syn<- (gsub("_", " ", species.tab2$Latin_name_syn))
## remove trailing white space
species.tab2$Latin_name_syn<- trim.trailing(species.tab2$Latin_name_syn)
species.clean <- species.tab2[!duplicated(species.tab2$Latin_name),
                              c("code","Latin_name","Exotic_Native_cultivated")]
return(species.clean)}


#####################################################
### compute quantile 99% and sd with a bootstrap
f.quantile <- function (x,ind,probs){
    require(boot)
    quantile(x[ind],probs=probs,na.rm=TRUE)}

f.quantile.boot <-  function(i,x,fac,R,probs=0.99){
    require(boot)
    if(length(na.exclude(x[fac==i]))>0){
quant.boot <- boot(x[fac==i],f.quantile,R=R,probs=probs)
return(as.matrix(c(mean=mean(quant.boot$t),sd=sd(quant.boot$t),nobs=length(na.exclude(x[fac==i]))),ncol=3,nrow=1))
}else{
return(as.matrix(c(mean=NA,sd=NA,nobs=NA),ncol=3,nrow=1))
}
}

#######################
### function to compute number of dead per plot 
function.perc.dead <-  function(dead){
 sum(dead)/length(dead)}

function.perc.dead2 <- function(dead) { out <- sum(dead,na.rm=T)/length(dead[!is.na(dead)]); if(!is.finite(out)) out <- NA; return(out) }


######
######
## FUNCTION TO PLOT MAP OF TREE
##' .. Function to plot map of tree with circle function of their dbh..
##'
##' .. content for \details{} ..
##' @title 
##' @param plot.select the plot for which draw the map
##' @param x 
##' @param y 
##' @param plot vectore of plot id for each tree
##' @param D diameter in cm
##' @param inches controling the circle size
##' @param ... 
##' @return 
##' @author Kunstler
fun.circles.plot <- function(plot.select,x,y,plot,D,inches,...){
x.t <- x[plot==plot.select]
y.t <- y[plot==plot.select]
D.t <- D[plot==plot.select]
D.t[is.na(D.t)] <-  0
symbols(x.t,y.t,circles=D.t ,main=plot.select,inches=inches,...)
}


#########################
## 
##' .. Compute the basal area of competitor in a plot..
##'
##' .. content for \details{} ..
##' @title 
##' @param diam diameters of each individuals in cm
##' @param weights weight to compute the basal area in cm^2/m^2
##' @return basal area in cm^2/m^2 
##' @author Kunstler
BA.fun <- function(diam,weights){
((diam/2)^2)*pi*weights
}


#### 
##' .. function to compute the basal area of neighborood tree in plots ..
##'
##' .. content for \details{} ..
##' @title 
##' @param obs.id tree obs identifier
##' @param diam diam of tree in cm
##' @param sp species name or code
##' @param id.plot identifier of the plot
##' @param weights weights to compute basal area in cm^2/m^2 SO THE 1/AREA of teh plot (or subplot) with area in m^2
##' @param weights.full.plot weights for the whole plot to compute basal area in cm^2/m^2 or if NA use weights of the individuals (for simple plots)
##' @return data frame with obs.id and one column per species with basal area of the species (without the target tree)
##' @author Kunstler
BA.SP.FUN <-  function(obs.id,diam,sp,id.plot,weights,weight.full.plot){
print(length(obs.id))
require(data.table)
id.plot <- as.character(id.plot)
obs.id <-  as.character(obs.id)
## check equal length
if(!(length(obs.id)==length(diam) & length(obs.id)==length(sp) & length(obs.id)==length(id.plot) & length(obs.id)==length(weights)))
    stop("length of obs.id diam,sp id.plot & weights need to be the same")

## check sp is not numeric
if(is.numeric(sp)) stop("sp can not be numeric need to be charatcer do paste('sp',sp,sep='.')")

# compute BA tot per species per plot
BASP <- tapply(BA.fun(diam,weights),INDEX=list(id.plot,sp),FUN=sum,na.rm=T)
print(dim(BASP))
DATA.BASP <-  data.table(id.plot= rownames(BASP),BASP)
setnames( DATA.BASP,old=1:ncol(DATA.BASP), c("id.plot",colnames(BASP)))
setkeyv(DATA.BASP,c("id.plot"))
sp.name <- colnames(BASP)
rm(BASP)
print("first table created")
#### MERGE with indivudal tree
## use library(data.table)
if(!is.na(weight.full.plot)){
     data.indiv <- data.table(obs.id=obs.id,sp=sp,
                              id.plot=id.plot,diam=diam,
                              BA.indiv=BA.fun(diam,rep(weight.full.plot,length(diam))))
     setkeyv(data.indiv,"id.plot")
     print("second table created")
     data.merge <-  merge(data.indiv,DATA.BASP)
     print("merge done")
     # substract target BA
     for (i in (sp.name)){
     eval(parse(text=paste("data.merge[sp==\'",i,"\',",i,":=",i,"-BA.indiv]",sep="")))
     }

}else{
     data.indiv <- data.table(obs.id=obs.id,sp=sp,id.plot=id.plot,
                         diam=diam,weights=weights,
                         BA.indiv=BA.fun(diam,weights))
     setkeyv(data.indiv,"id.plot")
     print("second table created")
     data.merge <-  merge(data.indiv,DATA.BASP)
     print("merge done")
     for (i in (sp.name)){
     eval(parse(text=paste("data.merge[sp==\'",i,"\',",i,":=",i,"-BA.indiv]",sep="")))
     }
}
print("replacment done")
data.merge[,BA.indiv:=NULL]
print("first column removed")
#### delete column not used
data.merge[,sp:=NULL]
data.merge[,diam:=NULL]
data.merge[,id.plot:=NULL]
data.merge[,weights:=NULL]
print("columns removed")
return( (data.merge))
}

######################
#### apply BA.SP.FUN per census
# function to run on a subset to only one census 
BA.SP.FUN.l<-  function(census.id,census,obs.id,diam,sp,id.plot,weights,weight.full.plot){
return(BA.SP.FUN(obs.id=obs.id[census==census.id],diam=diam[census==census.id],sp=sp[census==census.id],id.plot=id.plot[census==census.id],weights=weights[census==census.id],weight.full.plot=weight.full.plot))
}
## function to apply over all census and merge back together
BA.SP.FUN.census<-  function(census,obs.id,diam,sp,id.plot,weights,weight.full.plot){
require(data.table)
unique.census <- unique(census)
res.list <- lapply(unique.census,FUN=BA.SP.FUN.l,census,obs.id,diam,sp,id.plot,weights,weight.full.plot)
res.mat <- rbind.fill(res.list )
res.mat <- data.table(res.mat[match(obs.id,res.mat[["obs.id"]]),])
return(res.mat)
}


#### 
##' .. function compute competition index with X Y coordinates based on a neighborhood of radius R ..
##'
##' .. content for \details{} ..
##' @title 
##' @param obs.id id of the observation (if one tree as multiple growth measurement one obs.id per measurment
##' @param xy.table table with x.y of teh individual
##' @param diam diam in cm
##' @param sp species
##' @param Rlim radius of neighborhood search
##' @param parallel run in paralle or not ?
##' @param rpuDist run with GPU distance computation
##' @return a data frame with nrow = length of obs.id and ncol =unique(sp)
##' @author Kunstler
BA.SP.FUN.XY <-  function(obs.id,xy.table,diam,sp,Rlim,parallel=FALSE,rpuDist=FALSE){
print(paste("n observation = ",nrow(xy.table)))
print("start computing distance matrix")
if(rpuDist){
  require(rpud)
  dist.mat <- rpuDist(xy.table,upper=TRUE,diag=TRUE)
}else{
  dist.mat <- as.matrix(dist(xy.table,upper=TRUE,diag=TRUE))
}
print('distance matrix computed')
dist.mat[dist.mat <Rlim] <- 1
dist.mat[dist.mat >Rlim] <- 0
diag(dist.mat) <- 0
print('distance matrix set to 0 1')
BA <- BA.fun(diam,weights=1/(pi*Rlim^2))
print(c(length(BA),length(diam)))
BA.mat <- matrix(rep(BA,length(BA)),nrow=length(BA),byrow=TRUE)
print('starting tapply over species')
fun.sum.sp <- function(x,sp) tapply(x,INDEX=sp,FUN=sum,na.rm=TRUE)
  if(parallel){
    ## parallel version
    require(doParallel)
    registerDoParallel(cores=4)
    mat <- dist.mat*BA.mat
    res.temp <- foreach(i=1:nrow(mat), .combine=rbind) %dopar% {
       fun.sum.sp(mat[i,],sp)
       }
    res.temp <-  data.frame(obs.id=obs.id,res.temp)
    return((res.temp))
  }else{
    res.temp <-  data.frame(obs.id=obs.id,t(apply(dist.mat*BA.mat,MARGIN=1,FUN=fun.sum.sp ,sp)))
    return(res.temp)
  }
}


#### 
##' .. function to compute competition index with X Y coordinates based on a neighborhood of radius R WITHOUT THE DISTANCE MATRIX..
##'
##' .. content for \details{} ..
##' @title 
##' @param obs.id id of the observation (if one tree as multiple growth measurement one obs.id per measurment
##' @param xy.table table with x.y of teh individual
##' @param diam diam in cm
##' @param sp species
##' @param Rlim radius of neighborhood search
##' @param parallel run in paralle or not ?
##' @param rpuDist run with GPU distance computation
##' @return a data frame with nrow = length of obs.id and ncol =unique(sp)
##' @author Kunstler
BA.SP.FUN.XY2 <-  function(obs.id,xy.table,diam,sp,Rlim,parallel){
rownames(xy.table) <- obs.id
print(paste("n observation = ",nrow(xy.table)))
print("start computing distance matrix")
## create local function
fun.compute.local <-  function(obs.id.t,obs.id,xy.table,diam,sp,Rlim){
dist.t <- as.vector(sqrt((outer(xy.table[obs.id==obs.id.t,"x"],xy.table[,"x"],FUN="-"))^2 +
    (outer(xy.table[obs.id==obs.id.t,"y"],xy.table[,"y"],FUN="-"))^2))
dist.t[dist.t>Rlim] <- 0
dist.t[dist.t<=Rlim] <- 1
res.BA.t <- tapply(BA.fun(diam,weights=dist.t/(pi*Rlim^2)),INDEX=sp,FUN=sum,na.rm=TRUE)
res.BA.t[is.na(res.BA.t)] <- 0
return(data.frame(obs.id=obs.id.t,t(res.BA.t)))
}
#lapply function
if(parallel){
  library(doParallel)
  print("start mclapply")
  list.BA.SP.data <- mclapply(obs.id,FUN=fun.compute.local,obs.id,xy.table,
                              diam,sp,Rlim,mc.cores=4)
  print("done")
  data.BA.SP <-  rbind.fill(list.BA.SP.data)
 }else{
  list.BA.SP.data <- lapply(obs.id,FUN=fun.compute.local,obs.id,xy.table,
                            diam,sp,Rlim)
   data.BA.SP <-  rbind.fill(list.BA.SP.data)
 }
return(data.BA.SP)
}


## function for lapply per census
BA.SP.FUN.XY.l <-  function(census.id,census,obs.id,xy.table,diam,sp,Rlim,parallel,rpuDist){
print(dim(xy.table))
if(length(obs.id[census==census.id]) <10000){
data.temp<-BA.SP.FUN.XY(obs.id=obs.id[census==census.id],
                         xy.table=xy.table[census==census.id,],
                         diam=diam[census==census.id],sp=sp[census==census.id],
                         Rlim=Rlim,parallel,rpuDist)
  }else{
data.temp<-BA.SP.FUN.XY2(obs.id=obs.id[census==census.id],
                         xy.table=xy.table[census==census.id,],
                         diam=diam[census==census.id],sp=sp[census==census.id],
                         Rlim=Rlim,parallel)
     }
return(data.temp)
}

### wrapping function to run BA.SP.FUN.XY per census
BA.SP.FUN.XY.census <-  function(census,obs.id,xy.table,diam,sp,Rlim,parallel=FALSE,rpuDist=FALSE){
unique.census <- unique(census)
print(paste("Vector of census to lapply over" ,paste(unique.census,collapse=","),sep=" "))
res.list <- lapply(unique.census,FUN=BA.SP.FUN.XY.l,census=census,obs.id=obs.id,xy.table=xy.table,diam=diam,sp=sp,Rlim=Rlim,parallel,rpuDist)
res.mat <- rbind.fill(res.list )
res.DF <- res.mat[match(obs.id,res.mat[["obs.id"]]),]
return(res.DF)
}

#####################################
#####################################
### FUNCTION TO COMPUTE BA.SP.XY PER PLOT AND MERGE TOGETHER
#### function to be apply per site
fun.compute.BA.SP.XY.per.plot <- function(i,data.tree,Rlim,xy.name=c('x','y'),parallel=FALSE,rpuDist=FALSE){
data.tree.s <- subset(data.tree,subset=data.tree[["plot"]] ==i)
print(paste("size of the data for plot",i,"is",nrow(data.tree.s)))
BA.SP.temp <- BA.SP.FUN.XY.census(census=data.tree[["census"]],
                                  obs.id=data.tree.s[['obs.id']],
                                  xy.table=data.tree.s[,xy.name],
                                  diam=data.tree.s[['D']],
                                  sp=(data.tree.s[['sp']]),
                                  Rlim=Rlim,
                                  parallel,
                                  rpuDist)
## replace NA per zero
print('replacing NA per zero')
BA.SP.temp[is.na(BA.SP.temp)] <- 0
print('done')
### TEST GOOD ORDER
if(sum(! (BA.SP.temp[["obs.id"]])==data.tree.s[['obs.id']]) >0) stop('rows not in the good order')
if(sum(!colnames(BA.SP.temp[,-1])==as.character((levels(data.tree.s[['sp']]))))>0) stop('colnames does mot match species name')

### compute sum per row
BATOT <- apply(BA.SP.temp[,-1],MARGIN=1,FUN=sum)
data.res <- data.frame(BA.SP.temp,BATOT=BATOT, stringsAsFactors =FALSE)
return(data.res)
}


##########################
### function to replace missing value per zero in competition matrix
function.replace.NA.negative <- function(data.BA.SP){
    for (i in 2:ncol(data.BA.SP)){
eval(parse(text=paste("data.BA.SP[is.na(",names(data.BA.SP)[i],"),",names(data.BA.SP)[i],":=0]",sep="")))
eval(parse(text=paste("data.BA.SP[",names(data.BA.SP)[i],"<0,",names(data.BA.SP)[i],":=0]",sep="")))
}
print('NA and negative replaced')
return(data.BA.SP)
}    

##############################################################
##function to generate data in good format per ecoregion
fun.data.per.ecoregion <- function(ecoregion,data.tot,plot.name,weight.full.plot,name.country,data.TRY=NA,species.lookup=NA){
if(is.null(data.tot[['weights']])) stop("Please create a weights vector, even if it is completely NA")
require(data.table)
data.tot <-  data.table(data.tot)
data <- data.tot[ecocode==ecoregion,]
print(dim(data))
rm(data.tot)
data.BA.SP <- BA.SP.FUN.census(census=data[['census']],
                               obs.id=as.vector(data[['obs.id']]),
			       diam=as.vector(data[['D']]),
		    	       sp=as.vector(data[['sp']]),
			       id.plot=as.vector(data[[plot.name]]),
			       weights=data[['weights']],
			       weight.full.plot=weight.full.plot)

print('competition index computed')
## change NA and <0 data for 0
data.BA.SP <- function.replace.NA.negative(data.BA.SP)
### CHECK IF sp and sp name for column are the same
if(sum(!(names(data.BA.SP)[-1] %in% unique(data[["sp"]]))) >0) stop("competition index sp name not the same as in data")
#### compute BA tot for all competitors
## data.BA.SP[,BATOT:=sum(.SD),by=obs.id] ## slower than apply why?? 
BATOT.s <-  apply(data.frame(data.BA.SP)[,-1],MARGIN=1,FUN=sum)
data.BA.SP[,BATOT:=BATOT.s]
print('BATOT COMPUTED')
### create data frame
DT.temp <- data.table(obs.id=data[["obs.id"]],ecocode=data[["ecocode"]])
setkeyv(DT.temp,"obs.id")
setkeyv(data.BA.SP,"obs.id")
print('starting last merge')
data.BA.sp <- merge(DT.temp,data.BA.SP)
## reorder data
data <-  data.table(data)
setkeyv(data,"obs.id")
## test if same order
if(sum(!data.BA.sp[["obs.id"]] == data[["obs.id"]]) >0) stop("competition index not in the same order than data")
#####
## ADD TRY DATA OR TRAITS IF NEEDED
if(is.data.frame(data.TRY)){
   sp.extract <- species.lookup[species.lookup[["sp"]] %in% unique(data[["sp"]]),]
   data.traits <- fun.extract.format.sp.traits.TRY(sp=sp.extract[["sp"]],sp.syno.table=sp.extract,data.TRY)
   ### TODO ADD OPTION TO INCLUE OTHER DATA on MAX HEIGHT
   ## save everything as a list
   print(dim(data.traits))
   list.temp <- list(data.tree=data,data.BA.SP=data.BA.sp,data.traits=data.traits)
   save(list.temp,file=paste("./data/process/list",name.country,ecoregion,"Rdata",sep="."))
}else{
   list.temp <- list(data.tree=data,data.BA.SP=data.BA.sp,data.traits=NA)
   saveRDS(list.temp,file=paste("./data/process/list",name.country,ecoregion,"Rdata",sep="."))
}
}    


#######
#### NEED TO WRITE FUNCTION TO FORMAT DATA FOR BIG TROPICAL PLOT

fun.data.per.bigplot <- function(data,name.site,data.TRAITS=NA,Rlim=15,xy.name,parallel=FALSE,rpuDist=FALSE){
require(data.table)

## compute competition matrix
list.BA.SP.data <- lapply(unique(data[["plot"]]),FUN=fun.compute.BA.SP.XY.per.plot,data=data,Rlim=Rlim,xy.name=xy.name,parallel=TRUE,rpuDist=FALSE)
print('competition index computed in list')
data.BA.SP <- rbind.fill(list.BA.SP.data)
print('competition index computed')
### test
if(sum(!(names(data.BA.SP)[-1] %in% c(levels(data[["sp"]]),"BATOT"))) >0) stop("competition index sp name not the same as in data")

### TODO REMOVE TREE IN BUFFER ZONE
obs.id.no.buffer <- subset(data,subset=((data[["x"]]-15)>0 & (data[["x"]]+15)<max(data[["x"]])) &
                   ((data[["y"]]-15)>0 & (data[["y"]]+15)<max(data[["y"]])),
           select="obs.id")
data <- subset(data,subset=data[["obs.id"]] %in%  as.character(obs.id.no.buffer[,1]))
data.BA.SP <- subset(data.BA.SP,subset=data.BA.SP[["obs.id"]] %in%  as.character(obs.id.no.buffer[,1]))
data.BA.SP$obs.id <-  as.character(data.BA.SP$obs.id)
## reorder data
data <-  data.table(data)
setkeyv(data,"obs.id")
data.BA.SP <-  data.table(data.BA.SP)
setkeyv(data.BA.SP,"obs.id")
## test if same order
if(sum(!as.character(data.BA.SP[["obs.id"]]) == data[["obs.id"]]) >0) stop("competition index not in the same order than data")
#####
## ADD TRY DATA OR TRAITS IF NEEDED
if(is.data.frame(data.TRAITS)){
    data.traits <- fun.extract.format.sp.traits.NOT.TRY(sp=unique(data[["sp"]]),
                   Latin_name=unique(data[["sp.name"]]),
                   data=data.TRAITS,name.match.traits="Latin_name") 
   list.temp <- list(data.tree=data,data.BA.SP=data.BA.SP,data.traits=data.traits)
   save(list.temp,file=paste("./data/process/list",name.site,"Rdata",sep="."))
  }else{
   list.temp <- list(data.tree=data,data.BA.SP=data.BA.sp,data.traits=NA)
   saveRDS(list.temp,file=paste("./data/process/list",name.site,"Rdata",sep="."))
  }
}