Here are some packages that will be used in this lecture:

randomForest [CRAN]: the vanilla random forest package
ranger [CRAN]: a fast implementation of random forest
randomForestSRC [CRAN]: adds survival analysis capability and several additional features
grf [CRAN]: extended from the ranger package with additional statistical tools
RLT: use the new GitHub version (4.2.5), a fast implementation with additional features; the old CRAN version (3.2.6) will be updated later

A more comprehensive list can be found here.
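If you want to try the CRAN packages directly, they can be installed in the usual way:

# install the CRAN versions of the packages used in this lecture
install.packages(c("randomForest", "ranger", "randomForestSRC", "grf"))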
RLT Package

In Windows, RLT can be installed using install_github. The compilation of this package requires GCC, so you may also need to install Rtools from here.
# you may need to install the devtools package first
library(devtools)
# install the RLT package from GitHub
# Rtools is required in windows
install_github("teazrq/RLT")
Installing RLT on macOS can be slightly trickier. It mainly consists of two steps:

Step 1: Install the GNU Fortran compiler. Follow this guide to install gfortran-12.2-universal.pkg.
If you do not already have Xcode (most systems come with it installed), you can use this line to install the command line tools:
sudo xcode-select --install
Step 2: Create the Makevars file to point to the compiler.

Create the Makevars file using

mkdir ~/.R
touch ~/.R/Makevars
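If you prefer to stay within R instead of the Terminal, the equivalent (a minimal sketch using base R) is

# create the ~/.R folder and an empty Makevars file from within R
dir.create("~/.R", showWarnings = FALSE)
file.create("~/.R/Makevars")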
Check your gfortran folder. By default, the compiler is installed to the /opt/gfortran folder. Add the following lines to your Makevars file to point to the correct folder of the compiler. You may use open -a TextEdit ~/.R/Makevars to open a text editor and type in these lines.
FC = /opt/gfortran/bin/gfortran
F77 = /opt/gfortran/bin/gfortran
FLIBS = -L/opt/gfortran/lib
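As a quick sanity check, you can verify from R that the compiler is where the Makevars file expects it (assuming the default installer location):

# check that gfortran exists at the default install location
file.exists("/opt/gfortran/bin/gfortran")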
After completing these steps, you should be able to install the RLT package directly using install_github("teazrq/RLT") in RStudio, the same way as on a Windows machine. The compilation may take a few minutes.
The previous steps would not activate OpenMP parallel computing, as explained here. To enable OpenMP while compiling the RLT package, follow these steps:
First, download and install the pre-compiled OpenMP library from mac.r-project.org. For example, to install the 14.0.6 version (check which one is compatible with your Xcode version), use
curl -O https://mac.r-project.org/openmp/openmp-14.0.6-darwin20-Release.tar.gz
sudo tar fvxz openmp-14.0.6-darwin20-Release.tar.gz -C /
You should then see the following output:
usr/local/lib/libomp.dylib
usr/local/include/ompt.h
usr/local/include/omp.h
usr/local/include/omp-tools.h
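You can again verify from R that the OpenMP runtime and headers landed in /usr/local:

# both entries should return TRUE
file.exists(c("/usr/local/lib/libomp.dylib", "/usr/local/include/omp.h"))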
Makevars

The procedure is the same as explained previously. You should add these two lines to your Makevars file:
CPPFLAGS += -Xclang -fopenmp
LDFLAGS += -lomp
After these two steps, you can use install_github("teazrq/RLT", force = TRUE) to re-install the RLT package.
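Before re-installing, it may help to print the final Makevars file to confirm that both the gfortran paths and the OpenMP flags are present:

# display ~/.R/Makevars; it should contain the FC, F77, FLIBS,
# CPPFLAGS and LDFLAGS entries set in the previous steps
cat(readLines("~/.R/Makevars"), sep = "\n")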
The following code tests the performance of several packages on a regression problem.
library(randomForest)
library(randomForestSRC)
library(ranger)
library(RLT)
We generate a dataset with 1800 observations (800 for training and 1000 for testing) and 30 variables: 15 continuous and 15 categorical, each with three categories.
library(parallel)
# Set seed for reproducibility
set.seed(1)
# Define data size
trainn <- 800
testn <- 1000
n <- trainn + testn
p <- 30
# Generate continuous variables (X1) and categorical variables (X2)
X1 <- matrix(rnorm(n * p / 2), n, p / 2)
X2 <- matrix(as.integer(runif(n * p / 2) * 3), n, p / 2)
# Combine continuous and categorical variables into a data frame (X)
X <- data.frame(X1, X2)
# Convert the second half of the columns in X to factors
X[, (p / 2 + 1):p] <- lapply(X[, (p / 2 + 1):p], as.factor)
# Generate outcomes (y)
y <- 1 + rowSums(X[, 2:6]) + 2 * (X[, p / 2 + 1] %in% c(1, 3)) + rnorm(n)
# Set tuning parameters
ntrees <- 1000
ncores <- detectCores() - 1
nmin <- 30
mtry <- p / 2
samplereplace <- TRUE
sampleprob <- 0.80
rule <- "best"
nsplit <- ifelse(rule == "best", 0, 3)
importance <- TRUE
# Split data into training and testing sets
trainX <- X[1:trainn, ]
trainY <- y[1:trainn]
testX <- X[(trainn + 1):(trainn + testn), ]
testY <- y[(trainn + 1):(trainn + testn)]
# recording results
metric = data.frame(matrix(NA, 4, 6))
rownames(metric) = c("RLT", "randomForestSRC", "randomForest", "ranger")
colnames(metric) = c("fit.time", "pred.time", "oob.error", "pred.error",
"obj.size", "tree.size")
# using RLT package
start_time <- Sys.time()
RLTfit <- RLT(trainX, trainY, model = "regression",
ntrees = ntrees, mtry = mtry, nmin = nmin,
resample.prob = sampleprob, split.gen = rule,
resample.replace = samplereplace,
nsplit = nsplit, importance = importance,
param.control = list("alpha" = 0),
ncores = ncores, verbose = TRUE)
## Regression Random Forest ...
## ---------- Parameters Summary ----------
## (N, P) = (800, 30)
## # of trees = 1000
## (mtry, nmin) = (15, 30)
## splitting rule = Best
## sampling = 0.8 w/ replace
## (Obs, Var) weights = (No, No)
## importance = permute
## reinforcement = No
## ----------------------------------------
metric[1, 1] = difftime(Sys.time(), start_time, units = "secs")
start_time <- Sys.time()
RLTPred <- predict(RLTfit, testX, ncores = ncores)
metric[1, 2] = difftime(Sys.time(), start_time, units = "secs")
metric[1, 3] = mean((RLTfit$Prediction - trainY)^2)
metric[1, 4] = mean((RLTPred$Prediction - testY)^2)
metric[1, 5] = object.size(RLTfit)
metric[1, 6] = mean(unlist(lapply(RLTfit$FittedForest$SplitVar, length)))
# use randomForestSRC
options(rf.cores = ncores)
start_time <- Sys.time()
rsffit <- rfsrc(y ~ ., data = data.frame(trainX, "y"= trainY),
ntree = ntrees, nodesize = nmin/2, mtry = mtry,
samptype = ifelse(samplereplace, "swr", "swor"),
nsplit = nsplit, sampsize = trainn*sampleprob,
importance = ifelse(importance, "permute", "none"))
metric[2, 1] = difftime(Sys.time(), start_time, units = "secs")
start_time <- Sys.time()
rsfpred = predict(rsffit, data.frame(testX))
metric[2, 2] = difftime(Sys.time(), start_time, units = "secs")
metric[2, 3] = mean((rsffit$predicted.oob - trainY)^2)
metric[2, 4] = mean((rsfpred$predicted - testY)^2)
metric[2, 5] = object.size(rsffit)
metric[2, 6] = rsffit$forest$totalNodeCount / rsffit$ntree
# use randomForest
start_time <- Sys.time()
rf.fit <- randomForest(trainX, trainY, ntree = ntrees,
mtry = mtry, nodesize = nmin,
replace = samplereplace,
sampsize = trainn*sampleprob,
importance = importance)
metric[3, 1] = difftime(Sys.time(), start_time, units = "secs")
start_time <- Sys.time()
rf.pred <- predict(rf.fit, testX)
metric[3, 2] = difftime(Sys.time(), start_time, units = "secs")
metric[3, 3] = mean((rf.fit$predicted - trainY)^2)
metric[3, 4] = mean((rf.pred - testY)^2)
metric[3, 5] = object.size(rf.fit)
metric[3, 6] = mean(colSums(rf.fit$forest$nodestatus != 0))
# use ranger
start_time <- Sys.time()
rangerfit <- ranger(trainY ~ ., data = data.frame(trainX),
num.trees = ntrees, min.node.size = nmin,
mtry = mtry, num.threads = ncores,
replace = samplereplace,
sample.fraction = sampleprob,
importance = "permutation",
respect.unordered.factors = "partition")
metric[4, 1] = difftime(Sys.time(), start_time, units = "secs")
start_time <- Sys.time()
rangerpred = predict(rangerfit, data.frame(testX))
metric[4, 2] = difftime(Sys.time(), start_time, units = "secs")
metric[4, 3] = mean((rangerfit$predictions - trainY)^2)
metric[4, 4] = mean((rangerpred$predictions - testY)^2)
metric[4, 5] = object.size(rangerfit)
metric[4, 6] = mean(unlist(lapply(rangerfit$forest$split.varIDs, length)))
# performance summary
metric
The variable importance comparison is shown below:
par(mfrow=c(2,2))
par(mar = c(1, 2, 2, 2))
barplot(as.vector(RLTfit$VarImp), main = "RLT", cex.main = 3)
barplot(as.vector(rsffit$importance), main = "rsf", cex.main = 3)
barplot(rf.fit$importance[, 1], main = "rf", cex.main = 3)
barplot(as.vector(rangerfit$variable.importance), main = "ranger", cex.main = 3)
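Since only columns 2 through 6 and the first categorical variable (column p/2 + 1) carry signal in this simulation, the importance scores should concentrate on them. As a quick numeric check, here is a sketch using the randomForest fit, whose first importance column holds the permutation (%IncMSE) measure:

# top 10 variables by permutation importance from randomForest;
# the signal variables should dominate this list
head(sort(rf.fit$importance[, 1], decreasing = TRUE), 10)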