Random Seed and Reproducibility - Tutorial (RLT)
Ruoqing Zhu
Last Updated: May 18, 2026
Source: vignettes/articles/feature-seed.Rmd
feature-seed.Rmd
Overview
This page explains how to control randomness in your analyses with
base R’s set.seed().
Where to set the seed
- At the very beginning of your analysis (recommended). This covers both
synthetic data generation and the internal resampling randomness in
RLT().
- Alternatively, immediately before model fitting, if your data are
fixed and only the modeling randomness matters.
Prerequisites — See Get Started.
Demonstration — same seed, same results (regression)
We run the same pipeline twice with the same seed and compare outputs.
# ---------- Run A (seed = 1) ----------
# Attach the package before seeding so that every statement after set.seed()
# belongs to the seeded pipeline.
library(RLT)

set.seed(1)

# Small dataset (~100 obs): 80 training rows, 20 test rows, 10 predictors.
trainn <- 80
testn <- 20
n <- trainn + testn
p <- 10

# The order of the random draws below (rnorm, runif, rnorm) determines the
# simulated data for a given seed, so keep it unchanged.
X1 <- matrix(rnorm(n * (p / 2)), nrow = n, ncol = p / 2)
X2 <- matrix(as.integer(runif(n * (p / 2)) * 3), nrow = n, ncol = p / 2) # integers 0,1,2
X_numeric <- data.frame(X1, X2)

# Response: 1, plus the row sums of columns 2-6, plus 2 whenever column
# p/2 + 1 equals 1 or 2, plus standard normal noise.
y <- 1 + rowSums(X_numeric[, 2:6]) +
  2 * (X_numeric[, p / 2 + 1] %in% c(1, 2)) + rnorm(n)

# Convert the last p/2 (integer-valued) columns to factors.
X <- X_numeric
X[, (p / 2 + 1):p] <- lapply(X[, (p / 2 + 1):p], as.factor)

# First trainn rows are the training set; the remaining testn rows are the
# test set.
trainX <- X[seq_len(trainn), ]
trainY <- y[seq_len(trainn)]
testX <- X[(trainn + 1):n, ]
testY <- y[(trainn + 1):n]

# Fit
# These settings are defined once here and reused by the later runs.
ntrees <- 200
ncores <- 1
nmin <- 5
mtry <- p / 2
samplereplace <- TRUE
sampleprob <- 0.80
rule <- "best"
# `rule` is a single string, so use scalar if/else rather than the
# vectorized ifelse(): 0 when the rule is "best", otherwise 3.
nsplit <- if (rule == "best") 0 else 3
importance <- TRUE

fit_A <- RLT(
  trainX, trainY,
  model = "regression",
  ntrees = ntrees, mtry = mtry, nmin = nmin,
  resample.prob = sampleprob, split.gen = rule,
  resample.replace = samplereplace,
  nsplit = nsplit, importance = importance,
  param.control = list(alpha = 0),
  ncores = ncores, verbose = FALSE
)

# Test-set predictions and mean squared errors.
# NOTE(review): fit_A$Prediction is compared against trainY here; presumably
# these are the fitted object's own training-sample predictions -- confirm
# in the RLT documentation.
pred_A <- predict(fit_A, testX, ncores = ncores)
mse_train_A <- mean((fit_A$Prediction - trainY)^2)
mse_test_A <- mean((pred_A$Prediction - testY)^2)
# ---------- Run B (same seed = 1) ----------
# Reset the generator to the same state as Run A, then repeat every step.
set.seed(1)

# Recreate the same data and pipeline
trainn <- 80
testn <- 20
n <- trainn + testn
p <- 10

# Same draw order as Run A: rnorm, runif, then rnorm for the noise term.
X1 <- matrix(rnorm(n * (p / 2)), nrow = n, ncol = p / 2)
X2 <- matrix(as.integer(runif(n * (p / 2)) * 3), nrow = n, ncol = p / 2)
X_numeric <- data.frame(X1, X2)

y <- 1 + rowSums(X_numeric[, 2:6]) +
  2 * (X_numeric[, p / 2 + 1] %in% c(1, 2)) + rnorm(n)

# Last p/2 columns become factors.
X <- X_numeric
X[, (p / 2 + 1):p] <- lapply(X[, (p / 2 + 1):p], as.factor)

# Train/test split by row position.
trainX <- X[seq_len(trainn), ]
trainY <- y[seq_len(trainn)]
testX <- X[(trainn + 1):n, ]
testY <- y[(trainn + 1):n]

# Fit with the tuning values (ntrees, mtry, nmin, ...) set up for Run A.
fit_B <- RLT(
  trainX, trainY,
  model = "regression",
  ntrees = ntrees, mtry = mtry, nmin = nmin,
  resample.prob = sampleprob, split.gen = rule,
  resample.replace = samplereplace,
  nsplit = nsplit, importance = importance,
  param.control = list(alpha = 0),
  ncores = ncores, verbose = FALSE
)

# Test-set predictions and mean squared errors for Run B.
pred_B <- predict(fit_B, testX, ncores = ncores)
mse_train_B <- mean((fit_B$Prediction - trainY)^2)
mse_test_B <- mean((pred_B$Prediction - testY)^2)
# ---------- Summary for same-seed runs ----------
# Print the four MSEs rounded to 6 decimals, followed by a flag that is TRUE
# when the test predictions of Run A and Run B agree under all.equal().
c(
  lapply(
    list(
      A_Train_MSE = mse_train_A,
      A_Test_MSE = mse_test_A,
      B_Train_MSE = mse_train_B,
      B_Test_MSE = mse_test_B
    ),
    round,
    digits = 6
  ),
  list(
    SameSeed_Predictions_Identical =
      isTRUE(all.equal(pred_A$Prediction, pred_B$Prediction))
  )
)
## $A_Train_MSE
## [1] 3.456843
##
## $A_Test_MSE
## [1] 4.086383
##
## $B_Train_MSE
## [1] 3.456843
##
## $B_Test_MSE
## [1] 4.086383
##
## $SameSeed_Predictions_Identical
## [1] TRUE
Demonstration — different seed, potentially different results
Now we change the seed and rerun the same pipeline once.
# ---------- Run C (seed = 2) ----------
# A different seed: the simulated data and the fit are both regenerated.
set.seed(2)

trainn <- 80
testn <- 20
n <- trainn + testn
p <- 10

# Same draw order as the earlier runs: rnorm, runif, then rnorm for the noise.
X1 <- matrix(rnorm(n * (p / 2)), nrow = n, ncol = p / 2)
X2 <- matrix(as.integer(runif(n * (p / 2)) * 3), nrow = n, ncol = p / 2)
X_numeric <- data.frame(X1, X2)

y <- 1 + rowSums(X_numeric[, 2:6]) +
  2 * (X_numeric[, p / 2 + 1] %in% c(1, 2)) + rnorm(n)

# Last p/2 columns become factors.
X <- X_numeric
X[, (p / 2 + 1):p] <- lapply(X[, (p / 2 + 1):p], as.factor)

# Train/test split by row position.
trainX <- X[seq_len(trainn), ]
trainY <- y[seq_len(trainn)]
testX <- X[(trainn + 1):n, ]
testY <- y[(trainn + 1):n]

# Fit with the tuning values (ntrees, mtry, nmin, ...) set up for Run A.
fit_C <- RLT(
  trainX, trainY,
  model = "regression",
  ntrees = ntrees, mtry = mtry, nmin = nmin,
  resample.prob = sampleprob, split.gen = rule,
  resample.replace = samplereplace,
  nsplit = nsplit, importance = importance,
  param.control = list(alpha = 0),
  ncores = ncores, verbose = FALSE
)

# Test-set predictions and mean squared errors for Run C.
pred_C <- predict(fit_C, testX, ncores = ncores)
mse_train_C <- mean((fit_C$Prediction - trainY)^2)
mse_test_C <- mean((pred_C$Prediction - testY)^2)
# Print Run C's MSEs rounded to 6 decimals, followed by a flag that is TRUE
# only if Run C's test predictions agree with Run A's under all.equal().
c(
  lapply(
    list(C_Train_MSE = mse_train_C, C_Test_MSE = mse_test_C),
    round,
    digits = 6
  ),
  list(
    DiffSeed_Predictions_EqualTo_RunA =
      isTRUE(all.equal(pred_C$Prediction, pred_A$Prediction))
  )
)
## $C_Train_MSE
## [1] 3.323691
##
## $C_Test_MSE
## [1] 2.306646
##
## $DiffSeed_Predictions_EqualTo_RunA
## [1] FALSE
Tips
- Choose any integer you like for the seed; the specific value doesn’t matter—consistency does.
- Keep one set.seed() near the top of your script to make the whole workflow reproducible.
- The same pattern works for classification and survival: place set.seed() before data simulation (if any) and before RLT().