A simple example for regression

One of the most popular boosting packages in R is gbm. In recent years, xgboost has become extremely popular on Kaggle; the main difference between the two is computational performance. For this course, gbm is sufficient for demonstrating the method.
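
For a rough sense of what the equivalent fit looks like in xgboost, here is a minimal sketch (my addition, not part of the gbm demonstration; the argument names assume a reasonably recent xgboost release):

  # a sketch: boosted regression trees with xgboost on simulated data
  library(xgboost)
  set.seed(1)
  x = runif(500)
  y = 2*sin(3*pi*x) + rnorm(500)
  dtrain = xgb.DMatrix(data = matrix(x, ncol = 1), label = y)
  xgb.fit = xgb.train(params = list(objective = "reg:squarederror",
                                    eta = 0.1, max_depth = 2),
                      data = dtrain, nrounds = 300)
  head(predict(xgb.fit, dtrain))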

  library(gbm)

We consider a regression problem with just one variable. The gbm package uses trees as weak learners, and you can further control the complexity of these trees through its tuning parameters.
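
The main arguments controlling tree complexity are interaction.depth (the maximum depth of variable interactions; 1 gives stumps, i.e., an additive model) and n.minobsinnode (the minimum number of observations in a terminal node); both appear later in these notes. A minimal sketch with illustrative values (my addition):

  # a sketch: arguments that control the complexity of the weak learners
  dat = data.frame(x = runif(100), y = rnorm(100))
  fit = gbm(y ~ x, data = dat, distribution = "gaussian",
            n.trees = 100,          # number of boosting iterations
            shrinkage = 0.1,        # learning rate
            interaction.depth = 2,  # maximum depth of variable interactions
            n.minobsinnode = 10,    # minimum observations in a terminal node
            bag.fraction = 0.8)     # subsampling fraction for each tree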

  # a simple regression problem
  
  p = 1
  x = seq(0, 1, 0.001)
  fx <- function(x) 2*sin(3*pi*x)
  y = fx(x) + rnorm(length(x))

  plot(x, y, pch = 19, ylab = "y", col = "gray")
  lines(x, fx(x), lwd = 2) # plot the true regression line

  # fit regression boosting
  # I use a very large shrinkage value for demonstration purposes
  # in practice you should use 0.1 or even smaller values for stability
  gbm.fit = gbm(y~x, data = data.frame(x, y), distribution = "gaussian",
                n.trees=300, shrinkage=0.5, bag.fraction=0.8)

  # somehow, cross-validation for a one-dimensional problem throws an error
  # gbm(y ~ ., data = data.frame(x, y), cv.folds = 3) # this produces an error
  # plot the fitted regression function at several iterations
  par(mfrow=c(2,3))
  size=c(1,5,10,50,100,300)
  
  for(i in 1:6)
  {
    par(mar=c(2,2,3,1))
    plot(x, y, pch = 19, ylab = "y", col = "gray")
    lines(x, fx(x), lwd = 2)
    
    Fx = predict(gbm.fit, n.trees=size[i]) # this returns the fitted regression function at the training points
    lines(x, Fx, lwd = 3, col = "red")
    title(paste("# of Iterations = ", size[i]))
  }
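
Since cross-validation errors out for this one-dimensional fit, one alternative is the out-of-bag estimate of the best number of trees (a sketch, my addition; it works here because bag.fraction < 1, and the gbm documentation notes that the OOB estimate tends to under-estimate the optimal number of iterations):

  # a sketch: pick the number of trees with the out-of-bag estimate
  best.iter = gbm.perf(gbm.fit, method = "OOB", plot.it = FALSE)
  best.iter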

AdaBoost for binary classification

  n = 1000; set.seed(1)
  x = cbind(seq(0, 1, length.out = n), runif(n))
  py = (sin(4*pi*x[, 1]) + 1)/2
  y = rbinom(n, 1, py)
  
  plot(x[, 1], y + ifelse(y == 1, runif(n, -0.15, 0), runif(n, 0, 0.15)), 
       pch = 19, col = ifelse(y==1, "red", "blue"), xlab = "x", ylab = "P(Y=1 | X=x)")
  points(x[, 1], py, type = "l", lwd = 3)

  # fit AdaBoost with subsampling (bag.fraction); again I am using a large shrinkage factor
  
  gbm.fit = gbm(y~., data.frame(x, y), distribution="adaboost", n.minobsinnode = 2, 
                n.trees=200, shrinkage = 1, bag.fraction=0.8, cv.folds = 10)
  
  # get the best number of trees from cross-validation (or oob if no cv is used)
  gbm.perf(gbm.fit)

## [1] 22
  # plot the fitted probability implied by the decision function Fx (not the class labels sign(Fx))
  par(mfrow=c(2,3))
  size=c(1, 5, 10, 22, 50, 100)

  for(i in 1:6)
  {
    par(mar=c(2,2,3,1))
    plot(x[, 1], py, type = "l", lwd = 3, ylab = "P(Y=1 | X=x)", col = "gray")
    points(x[, 1], y + ifelse(y == 1, runif(n, -0.15, 0), runif(n, 0, 0.15)), 
           pch = 19, col = ifelse(y==1, "red", "blue"))
    Fx = predict(gbm.fit, n.trees=size[i]) # this returns the fitted function Fx, not the class label
    lines(x[, 1], 1/(1+exp(-2*Fx)), lwd = 3)
    title(paste("# of Iterations = ", size[i]))
  }
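
Note that predict() returns the real-valued function F(x); to obtain class labels you threshold the implied probability yourself. A short sketch (my addition, reusing the cross-validated choice of 22 trees from this run):

  # a sketch: convert F(x) into class labels and compute the training error
  Fx = predict(gbm.fit, data.frame(x), n.trees = 22)
  phat = 1/(1 + exp(-2*Fx))       # implied P(Y = 1 | X = x)
  yhat = ifelse(phat > 0.5, 1, 0)
  mean(yhat != y)                 # training misclassification rate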

  # You can peek into a single tree
  pretty.gbm.tree(gbm.fit, i.tree = 1)
##   SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction Weight  Prediction
## 0        0    0.24374374        1         2           3       129.2909    800 -0.03992647
## 1       -1    0.68000925       -1        -1          -1         0.0000    188  0.68000925
## 2       -1   -0.26108319       -1        -1          -1         0.0000    612 -0.26108319
## 3       -1   -0.03992647       -1        -1          -1         0.0000    800 -0.03992647
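
To read this table (following the gbm documentation): SplitVar is the 0-based index of the splitting variable, with -1 marking a terminal node; SplitCodePred is the split point for an internal node and the fitted value for a terminal node; LeftNode, RightNode and MissingNode give the row indices of the child nodes; Weight is the (weighted) number of observations in the node; and Prediction is the value the node would predict if it were terminal.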

Tuning parameter: shrinkage

  # you may use a very small shrinkage factor to improve stability
  # however, this means you will need to use more trees

  gbm.fit = gbm(y~., data.frame(x, y), distribution="adaboost", n.minobsinnode = 2, 
                n.trees=1000, shrinkage = 0.1, bag.fraction=0.8, cv.folds = 10)  
  gbm.perf(gbm.fit)

## [1] 487
  par(mfrow=c(2,3))
  size=c(1,10,50,100,250,500) # 1000 trees is clearly over-fitting
  
  for(i in 1:6)
  {
    par(mar=c(2,2,3,1))
    plot(x[, 1], py, type = "l", lwd = 3, ylab = "P(Y=1 | X=x)", col = "gray")
    points(x[, 1], y + ifelse(y == 1, runif(n, -0.15, 0), runif(n, 0, 0.15)), 
           pch = 19, col = ifelse(y==1, "red", "blue"))
    Fx = predict(gbm.fit, n.trees=size[i]) # this returns the fitted function Fx, not the class label
    lines(x[, 1], 1/(1+exp(-2*Fx)), lwd = 3)
    title(paste("# of Iterations = ", size[i]))
  }
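
To see where over-fitting sets in, you can also plot the stored error curves; a sketch (my addition), assuming the fitted object keeps the train.error and cv.error components documented for gbm:

  # a sketch: training and cross-validation error versus iteration
  par(mfrow = c(1, 1))
  plot(gbm.fit$train.error, type = "l", xlab = "iteration", ylab = "error")
  lines(gbm.fit$cv.error, col = "red")
  legend("topright", c("training error", "cv error"),
         col = c("black", "red"), lty = 1)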

Tune the weak learner

  # our previous example with a circle
  set.seed(1)
  n = 500
  x1 = runif(n, -1, 1)
  x2 = runif(n, -1, 1)
  y = rbinom(n, size = 1, prob = ifelse(x1^2 + x2^2 < 0.6, 0.8, 0.2))
  xgrid = expand.grid(x1 = seq(-1, 1, 0.01), x2 = seq(-1, 1, 0.01))
  # if we only use one variable in each tree
  gbm.fit = gbm(y~., data = data.frame(x1, x2, y), distribution="adaboost", 
                n.trees=1000, shrinkage=0.01, bag.fraction=0.8, 
                interaction.depth = 1, cv.folds=10)

  usetree = gbm.perf(gbm.fit, method="cv")

  Fx = predict(gbm.fit, xgrid, n.trees=usetree)

  pred = matrix(1/(1+exp(-2*Fx)) > 0.5, 201, 201)
  par(mar=rep(2,4))
  contour(seq(-1, 1, 0.01), seq(-1, 1, 0.01), pred, levels=0.5, labels="",axes=FALSE)
  points(x1, x2, col = ifelse(y == 1, "deepskyblue", "darkorange"), 
         pch = 19, yaxt="n", xaxt = "n")
  points(xgrid, pch=".", cex=1.2, col=ifelse(pred, "deepskyblue", "darkorange"))
  box()

  pretty.gbm.tree(gbm.fit, i.tree = 1)
##   SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction Weight    Prediction
## 0        0  0.7279700371        1         2           3       24.10443    400 -0.0003848737
## 1       -1  0.0006567649       -1        -1          -1        0.00000    347  0.0006567649
## 2       -1 -0.0072046589       -1        -1          -1        0.00000     53 -0.0072046589
## 3       -1 -0.0003848737       -1        -1          -1        0.00000    400 -0.0003848737

We could also allow interactions within each tree. This leads to deeper trees, and fewer iterations are needed to achieve similar performance.

  gbm.fit = gbm(y~., data = data.frame(x1, x2, y), distribution="adaboost", 
                n.trees=1000, shrinkage=0.01, bag.fraction=0.8, 
                interaction.depth = 2, cv.folds=10)
  
  usetree = gbm.perf(gbm.fit, method="cv") 

  Fx = predict(gbm.fit, xgrid, n.trees=usetree)
  
  pred = matrix(1/(1+exp(-2*Fx)) > 0.5, 201, 201)
  par(mar=rep(2,4))
  contour(seq(-1, 1, 0.01), seq(-1, 1, 0.01), pred, levels=0.5, labels="",axes=FALSE)
  points(x1, x2, col = ifelse(y == 1, "deepskyblue", "darkorange"), 
         pch = 19, yaxt="n", xaxt = "n")
  points(xgrid, pch=".", cex=1.2, col=ifelse(pred, "deepskyblue", "darkorange"))
  box()

  pretty.gbm.tree(gbm.fit, i.tree = 1)
##   SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction Weight   Prediction
## 0        1  -0.658632189        1         2           6       30.69190    400  0.000309508
## 1       -1  -0.005887393       -1        -1          -1        0.00000     71 -0.005887393
## 2        0   0.694127902        3         4           5       31.73123    329  0.001646833
## 3       -1   0.003043478       -1        -1          -1        0.00000    277  0.003043478
## 4       -1  -0.005792988       -1        -1          -1        0.00000     52 -0.005792988
## 5       -1   0.001646833       -1        -1          -1        0.00000    329  0.001646833
## 6       -1   0.000309508       -1        -1          -1        0.00000    400  0.000309508

Variable importance

You can estimate variable importance in a similar fashion to a random forest.

  gbm.fit = gbm(y~., data = data.frame(x1, x2, matrix(runif(n*10), n, 10), y),
                distribution="adaboost", n.trees=1000, shrinkage=0.01, 
                bag.fraction=0.8, interaction.depth = 2, cv.folds=10)

  summary(gbm.fit, method = permutation.test.gbm)

##    var    rel.inf
## 1   x1 38.1405889
## 2   x2 37.5402623
## 3   X2  5.5556527
## 4  X10  4.0639256
## 5   X7  2.8284456
## 6   X1  2.4353977
## 7   X8  2.2193764
## 8   X9  1.8153845
## 9   X6  1.6683819
## 10  X4  1.4292909
## 11  X5  1.3463841
## 12  X3  0.9569094
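
For comparison, a sketch (my addition) using the package's default split-improvement measure rather than the permutation-based one:

  # a sketch: the default relative influence measure (split improvement)
  summary(gbm.fit, method = relative.influence)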