split US scripts into functions

92dabaed · Daniel Falster · d2ff437f · 92dabaed · 92dabaed
Commit 92dabaed authored 11 years ago by Daniel Falster
Expand all Hide whitespace changes
Inline Side-by-side

Showing

with 1073 additions and 347 deletions
+1073 -347
--- a/R/format.function.R
+++ b/R/format.function.R
--- a/merge.data.US.R
+++ b/merge.data.US.R
+#!/usr/bin/env Rscript
+
 ### MERGE us DATA Edited by FH
-rm(list = ls())
+library(reshape, quietly=TRUE)
 source("./R/format.function.R")
-library(reshape)
 source("./R/FUN.TRY.R")

-######################### READ DATA read individuals tree data
-data.us <- read.csv("./data/raw/DataUS/FIA51_trees_w_supp.csv", header = TRUE, stringsAsFactors = FALSE)
-
 ### read species names
-species.clean <- read.csv("./data/species.list/REF_SPECIES.CSV", stringsAsFactors = FALSE)
-## select column to keep
-species.clean <- subset(species.clean, select = c("SPCD", "GENUS", "SPECIES", "VARIETY", 
-    "SUBSPECIES", "SPECIES_SYMBOL"))
-species.clean$Latin_name <- paste(species.clean[["GENUS"]], species.clean[["SPECIES"]], 
-    sep = " ")
-species.clean$Latin_name_syn <- paste(species.clean[["GENUS"]], species.clean[["SPECIES"]], 
-    sep = " ")
-
-names(species.clean)[1] <- "sp"
-species.clean[["sp"]] <- paste("sp", species.clean[["sp"]], sep = ".")
-
-###################################### MASSAGE TRAIT DATA HEIGHT DATA FOR TREE MISSING BRING US DATA FOR HEIGHT OVER
-###################################### WHEN WE ANALYZE THAT DATASET LATER ON
-
-##################################### FORMAT INDIVIDUAL TREE DATA
-
-## change unit and names of variables to be the same in all data for the tree
-data.us$G <- 10 * (data.us$FinalDbh - data.us$InitDbh)/data.us$Interval  ## diameter growth in mm per year
-data.us$G[which(data.us$InitDbh == 0 | data.us$FinalDbh == -999)] <- NA
-data.us$year <- data.us$Interval  ## number of year between measuremen
-data.us$D <- data.us[["InitDbh"]]
-data.us$D[data.us$D == 0] <- NA
-## diameter in cm
-data.us$dead <- as.numeric(data.us$FinalDbh > 0)  ## dummy variable for dead tree 0 alive 1 dead
-data.us$sp <- as.character(data.us[["Species"]])  ## species code
-data.us$plot <- as.character(data.us[["PlotID"]])  ## plot code
-data.us$subplot <- paste(as.character(data.us[["PlotID"]]), as.character(data.us[["SubplotNumber"]]), 
-    sep = ".")  ## plot code
-data.us$htot <- rep(NA, length(data.us[["Species"]]))  ## height of tree in m - MISSING
-data.us$tree.id <- as.character(data.us$TreeID)
-## tree unique id
-data.us$sp.name <- NA
-
-### add plot weights for computation of competition index (in 1/m^2)
-data.us$weights <- 1/(10000 * data.us[["PlotSize"]])
-
-###################### ECOREGION merge greco to have no ecoregion with low number of observation
+load_species_list <- function(filename = "./data/raw/DataUS/REF_SPECIES.CSV"){
+    data <- read.csv(filename, stringsAsFactors = FALSE)
+    ## select column to keep
+    data <- subset(data, select = c("SPCD", "GENUS", "SPECIES", "VARIETY", 
+        "SUBSPECIES", "SPECIES_SYMBOL"))
+    data$Latin_name <- paste(data[["GENUS"]], data[["SPECIES"]], sep = " ")
+    data$Latin_name_syn <- paste(data[["GENUS"]], data[["SPECIES"]], sep = " ")
+
+    names(data)[1] <- "sp"
+    data[["sp"]] <- paste("sp", data[["sp"]], sep = ".")    
+    data
+}
+
+load_inventory_data <- function(filename = "./data/raw/DataUS/FIA51_trees_w_supp.csv"){
+    ## READ DATA read individuals tree data
+    data <- read.csv(filename, header = TRUE, stringsAsFactors = FALSE)
+
+    ####### MASSAGE TRAIT DATA HEIGHT DATA FOR TREE MISSING BRING US DATA FOR HEIGHT OVER
+    ####### WHEN WE ANALYZE THAT DATASET LATER ON
+
+    ###### FORMAT INDIVIDUAL TREE DATA
+
+    ## change unit and names of variables to be the same in all data for the tree
+    data$G <- 10 * (data$FinalDbh - data$InitDbh)/data$Interval  ## diameter growth in mm per year
+    data$G[which(data$InitDbh == 0 | data$FinalDbh == -999)] <- NA
+    data$year <- data$Interval  ## number of year between measuremen
+    data$D <- data[["InitDbh"]] ## diameter in cm
+    data$D[data$D == 0] <- NA
+    data$dead <- as.numeric(data$FinalDbh > 0)  ## dummy variable for dead tree 0 alive 1 dead
+    data$sp <- as.character(data[["Species"]])  ## species code
+    data$plot <- as.character(data[["PlotID"]])  ## plot code
+    data$subplot <- paste(as.character(data[["PlotID"]]), as.character(data[["SubplotNumber"]]), 
+        sep = ".")  ## plot code
+    data$htot <- rep(NA, length(data[["Species"]]))  ## height of tree in m - MISSING
+    data$tree.id <- as.character(data$TreeID) ## unique id
+    data$sp.name <- NA
+
+    ### add plot weights for computation of competition index (in 1/m^2)
+    data$weights <- 1/(10000 * data.us[["PlotSize"]])
+    data
+}
+
+data.us <- load_inventory_data()
+
+###### ECOREGION merge greco to have no ecoregion with low number of observation
 greco <- read.csv(file = "./data/raw/DataUS/EcoregionCodes.csv", header = T)
 colnames(greco)[1] <- "Ecocode"

 table(data.us$Ecocode)
 data.us <- merge(data.us, greco[, -4], by = "Ecocode")
 data.us$DIVISION <- factor(data.us$DIVISION)
+
 ## Some ecoregions still have small # of individuals, so create a variable which
 ## does division if # ind < 10000 else it reads Domain
 data.us$eco_codemerged <- as.character(data.us$DIVISION)
@@ -63,8 +69,8 @@ for (i in 1:length(sel.small.div)) {
    data.us$eco_codemerged[find.ind] <- as.character(data.us$DOMAIN)[find.ind]
 }

-###################### PERCENT DEAD variable percent dead/cannot do with since dead variable is
-###################### missing compute numer of dead per plot to remove plot with disturbance
+###### PERCENT DEAD variable percent dead/cannot do with since dead variable is
+###### missing compute numer of dead per plot to remove plot with disturbance
 perc.dead <- tapply(data.us[["dead"]], INDEX = data.us[["plot"]], FUN = function.perc.dead)
 # ## VARIABLE TO SELECT PLOT WITH NOT BIG DISTURBANCE KEEP OFTHER VARIABLES IF
 # AVAILABLE (disturbance record)
@@ -72,8 +78,7 @@ data.us <- merge(data.us, data.frame(plot = names(perc.dead), perc.dead = perc.d
    by = "plot", sort = FALSE)


-########################################################### PLOT SELECTION FOR THE ANALYSIS
-
+##### PLOT SELECTION FOR THE ANALYSIS

 ## remove everything from memory not need before computation
 rm(greco, perc.dead, tab.small.div, sel.small.div)
@@ -89,6 +94,7 @@ vec.basic.var <- c("tree.id", "sp", "plot", "subplot", "ecocode", "D", "G", "dea
    "year", "htot", "Lon", "Lat", "perc.dead", "weights")
 data.tree <- subset(data.us, select = c(vec.basic.var, vec.abio.var.names))
 rm(data.us)
+
 ## creat row unique id
 data.tree$obs.id <- as.character(1:nrow(data.tree))
 gc()
@@ -96,13 +102,17 @@ gc()
 ### read TRY data
 TRY.DATA.FORMATED <- readRDS("./data/process/TRY.DATA.FORMATED.rds")

-#################### GENERATE ONE OBJECT PER ECOREGION
+#### GENERATE ONE OBJECT PER ECOREGION

 # vector of ecoregion name
 ecoregion.unique <- unique(data.tree[["ecocode"]])

-
-#### lapply function
-system.time(lapply(ecoregion.unique, FUN = fun.data.per.ecoregion, data.tot = data.tree, 
-    plot.name = "subplot", weight.full.plot = NA, name.country = "US", data.TRY = TRY.DATA.FORMATED, 
-    species.lookup = species.clean)) 
+#### split data by ecoregion and save to file
+system.time(lapply(ecoregion.unique, FUN = fun.data.per.ecoregion, 
+    data.tot = data.tree, 
+    plot.name = "subplot", 
+    weight.full.plot = NA, 
+    name.country = "US", 
+    data.TRY = TRY.DATA.FORMATED, 
+    species.lookup = load_species_list(), 
+    output.dir = "output"))