
Install and Load Package

Install and load the GitHub version of the RLT package. Do not use the CRAN version.

  # install.packages("devtools")
  # devtools::install_github("teazrq/RLT")
  library(RLT)
## RLT and Random Forests v4.2.6
## pre-release at github.com/teazrq/RLT

Load other packages used in this guide.
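For illustration only (an assumption, not necessarily the packages this guide loads), the base parallel package can report the number of available cores, which is relevant to the ncores arguments used below.

  # parallel ships with base R; detectCores() reports the available cores
  library(parallel)
  detectCores()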

Single Variable Embedded Splitting

When reinforcement is enabled, an embedded random forest model and its corresponding variable importance measure are used to search for the best splitting rule. The embedded model runs with a default set of parameters, but each of them can still be tuned individually through the param.control argument.
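Before the full example, here is a minimal sketch of how these embedded parameters are passed. The data are throwaway simulated values, and the embed.* settings are illustrative rather than the package defaults.

  # Minimal sketch: embedded-model parameters go through param.control
  # (values below are illustrative, not the package defaults)
  xs <- matrix(rnorm(200 * 5), 200, 5)
  ys <- rnorm(200)
  sketch <- RLT(xs, ys, ntrees = 20, reinforcement = TRUE,
                param.control = list("embed.ntrees" = 30,
                                     "embed.mtry" = 1/2,
                                     "embed.nmin" = 10))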

  # Set seed for reproducibility
  set.seed(1)

  # Define data size
  trainn <- 800
  testn <- 1000
  n <- trainn + testn
  p <- 30

  # Generate continuous variables (X1) and categorical variables (X2)
  X1 <- matrix(rnorm(n * p / 2), n, p / 2)
  X2 <- matrix(as.integer(runif(n * p / 2) * 10), n, p / 2)

  # Combine continuous and categorical variables into a data frame (X)
  X <- data.frame(X1, X2)

  # Convert the second half of the columns in X to factors
  X[, (p / 2 + 1):p] <- lapply(X[, (p / 2 + 1):p], as.factor)

  # Generate outcomes (y); expit() is the inverse-logit (logistic) function
  expit <- function(x) exp(x) / (1 + exp(x))
  y <- as.factor(rbinom(n, 1, prob = expit(1 + X[, 2] + X[, 5] +
                                           3 * (X[, p] %in% c(1, 3, 5, 7)))) + 2)
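
  # Quick sanity check: class balance of the simulated outcome
  table(y)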
  
  # Number of cores to use for prediction
  ncores <- 10

  # Split data into training and testing sets
  trainX <- X[1:trainn, ]
  trainY <- y[1:trainn]
  testX <- X[(trainn + 1):(trainn + testn), ]
  testY <- y[(trainn + 1):(trainn + testn)]
  # Time the model fit
  start_time <- Sys.time()
  
  RLTfit <- RLT(trainX, trainY,
                ntrees = 500, ncores = 10, nmin = 10,
                split.gen = "random", nsplit = 2,
                resample.prob = 0.8, resample.replace = FALSE,
                reinforcement = TRUE, importance = "distribute",
                param.control = list("embed.ntrees" = 50,
                                     "embed.mtry" = 2/3,
                                     "embed.nmin" = 5,
                                     "alpha" = 0.1),
                verbose = TRUE)
## Classification Random Forest ... 
## ---------- Parameters Summary ----------
##               (N, P) = (800, 30)
##           # of trees = 500
##         (mtry, nmin) = (10, 10)
##       split generate = Random, 2
##             sampling = 0.8 w/o replace
##   (Obs, Var) weights = (No, No)
##                alpha = 0.1
##           importance = distribute
##        reinforcement = Yes
## ----------------------------------------
##  embed.ntrees            = 50
##  embed.mtry              = 66.7%
##  embed.nmin              = 5
##  embed.split.gen         = Random, 1
##  embed.resample.replace  = TRUE
##  embed.resample.prob     = 0.9
##  embed.mute              = 0
##  embed.protect           = 14
##  embed.threshold         = 0.25
## ----------------------------------------
## Do not have 10 cores, use maximum 4 cores.
  
  difftime(Sys.time(), start_time, units = "secs")
## Time difference of 173.4728 secs
  
  # Predict on the test set
  RLTPred <- predict(RLTfit, testX, ncores = ncores)

  # Misclassification rates on the training and test sets
  mean(RLTfit$Prediction != trainY)
## [1] 0.17875
  mean(RLTPred$Prediction != testY)
## [1] 0.169
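
  # A confusion table breaks the test error down by class (base R)
  table(Predicted = RLTPred$Prediction, Observed = testY)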
  
  # Plot variable importance
  barplot(as.vector(RLTfit$VarImp), main = "RLT")
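
To make the plot easier to read, the scores can be labeled with the training-data column names. This assumes RLTfit$VarImp is ordered to match the columns of trainX, which holds for the fit above.

  # Attach variable names and list the ten largest importance scores
  vi <- as.vector(RLTfit$VarImp)
  names(vi) <- colnames(trainX)
  head(sort(vi, decreasing = TRUE), 10)
  barplot(vi, main = "RLT", las = 2)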