merge.data.US.R 4.67 KiB
#!/usr/bin/env Rscript
### MERGE us DATA Edited by FH
library(reshape, quietly=TRUE)
source("./R/format.function.R")
source("./R/FUN.TRY.R")
### read species names
species.clean <- read.csv("./data/species.list/REF_SPECIES.CSV", stringsAsFactors = FALSE)
## select column to keep
species.clean <- subset(species.clean, select = c("SPCD", "GENUS", "SPECIES", "VARIETY", 
    "SUBSPECIES", "SPECIES_SYMBOL"))
species.clean$Latin_name <- paste(species.clean[["GENUS"]], species.clean[["SPECIES"]], 
    sep = " ")
species.clean$Latin_name_syn <- paste(species.clean[["GENUS"]], species.clean[["SPECIES"]], 
    sep = " ")
names(species.clean)[1] <- "sp"
species.clean[["sp"]] <- paste("sp", species.clean[["sp"]], sep = ".")
###################################### MASSAGE TRAIT DATA HEIGHT DATA FOR TREE MISSING BRING US DATA FOR HEIGHT OVER
###################################### WHEN WE ANALYZE THAT DATASET LATER ON
##################################### FORMAT INDIVIDUAL TREE DATA
## change unit and names of variables to be the same in all data for the tree
data.us$G <- 10 * (data.us$FinalDbh - data.us$InitDbh)/data.us$Interval  ## diameter growth in mm per year
data.us$G[which(data.us$InitDbh == 0 | data.us$FinalDbh == -999)] <- NA
data.us$year <- data.us$Interval  ## number of year between measuremen
data.us$D <- data.us[["InitDbh"]]
data.us$D[data.us$D == 0] <- NA
## diameter in cm
data.us$dead <- as.numeric(data.us$FinalDbh > 0)  ## dummy variable for dead tree 0 alive 1 dead
data.us$sp <- as.character(data.us[["Species"]])  ## species code
data.us$plot <- as.character(data.us[["PlotID"]])  ## plot code
data.us$subplot <- paste(as.character(data.us[["PlotID"]]), as.character(data.us[["SubplotNumber"]]), 
    sep = ".")  ## plot code
data.us$htot <- rep(NA, length(data.us[["Species"]]))  ## height of tree in m - MISSING
data.us$tree.id <- as.character(data.us$TreeID)
## tree unique id
data.us$sp.name <- NA
## census is missing use as only one census and check with mark
data.us$census <- rep(1,nrow(data.us))
### add plot weights for computation of competition index (in 1/m^2)
data.us$weights <- 1/(10000 * data.us[["PlotSize"]])
###################### ECOREGION merge greco to have no ecoregion with low number of observation
greco <- read.csv(file = "./data/raw/DataUS/EcoregionCodes.csv", header = T)
colnames(greco)[1] <- "Ecocode"
table(data.us$Ecocode)
data.us <- merge(data.us, greco[, -4], by = "Ecocode")
data.us$DIVISION <- factor(data.us$DIVISION)
## Some ecoregions still have small # of individuals, so create a variable which
## does division if # ind < 10000 else it reads Domain
data.us$eco_codemerged <- as.character(data.us$DIVISION)
tab.small.div <- table(data.us$eco_codemerged)
sel.small.div <- which(table(data.us$eco_codemerged) < 10000)
for (i in 1:length(sel.small.div)) {
    find.ind <- which(data.us$eco_codemerged == names(tab.small.div)[sel.small.div[i]])
    print(length(find.ind))
    data.us$eco_codemerged[find.ind] <- as.character(data.us$DOMAIN)[find.ind]
###### PERCENT DEAD variable percent dead/cannot do with since dead variable is
###### missing compute numer of dead per plot to remove plot with disturbance
perc.dead <- tapply(data.us[["dead"]], INDEX = data.us[["plot"]], FUN = function.perc.dead)
# ## VARIABLE TO SELECT PLOT WITH NOT BIG DISTURBANCE KEEP OFTHER VARIABLES IF
# AVAILABLE (disturbance record)
7172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
data.us <- merge(data.us, data.frame(plot = names(perc.dead), perc.dead = perc.dead, stringsAsFactors = FALSE), by = "plot", sort = FALSE) ##### PLOT SELECTION FOR THE ANALYSIS ## remove everything from memory not need before computation rm(greco, perc.dead, tab.small.div, sel.small.div) #### create good data format to be run per ecoregion data.us$ecocode <- unlist(lapply(lapply(strsplit(data.us[["eco_codemerged"]], " "), FUN = substr, 1, 2), FUN = paste, collapse = ".")) data.us[["sp"]] <- paste("sp", data.us[["sp"]], sep = ".") ## variables to keep vec.abio.var.names <- c("MAT", "MAP") vec.basic.var <- c("tree.id", "sp", "plot", "subplot", "ecocode", "D", "G", "dead", "year", "htot", "Lon", "Lat", "perc.dead", "weights","census") data.tree <- subset(data.us, select = c(vec.basic.var, vec.abio.var.names)) rm(data.us) ## creat row unique id data.tree$obs.id <- as.character(1:nrow(data.tree)) gc() ### read TRY data TRY.DATA.FORMATED <- readRDS("./data/process/TRY.DATA.FORMATED.rds") #### GENERATE ONE OBJECT PER ECOREGION # vector of ecoregion name ecoregion.unique <- unique(data.tree[["ecocode"]]) #### lapply function system.time(lapply(ecoregion.unique, FUN = fun.data.per.ecoregion, data.tot = data.tree, plot.name = "subplot", weight.full.plot = NA, name.country = "US", data.TRY = TRY.DATA.FORMATED, species.lookup = species.clean))