Abstract
This is the supplementary R file for AdaBoost and Gradient Boosting in the lecture note “Boosting”.
One of the most popular packages is gbm. In recent years, xgboost has become extremely popular on Kaggle. The main difference between the two is computational performance. In this course, gbm is sufficient for demonstrating the method.
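For reference, here is a minimal sketch of the analogous xgboost call. This is not part of the original note: it assumes the xgboost package (classic interface) is installed, and the data and parameter values are purely illustrative; unlike gbm, xgboost takes a numeric matrix rather than a formula.
# hedged xgboost sketch (requires the xgboost package; not used elsewhere in this note)
# library(xgboost)
# X = matrix(runif(1000), ncol = 1)                       # toy design matrix
# yy = 2*sin(3*pi*X[, 1]) + rnorm(1000)                   # toy response
# bst = xgboost(data = X, label = yy, nrounds = 300, eta = 0.1, max_depth = 2,
#               objective = "reg:squarederror", verbose = 0)
# head(predict(bst, X))                                   # fitted values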
library(gbm)
We consider a regression problem with just one variable. The gbm package uses trees as weak learners. You can further control the complexity of the trees through tuning parameters.
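Before generating data, here is a minimal self-contained sketch, not part of the original note, of the main gbm() arguments that control tree complexity and the boosting procedure; the parameter values are illustrative only.
# hedged illustration of the tuning parameters (toy data, illustrative values)
xdemo = runif(200)
ydemo = 2*sin(3*pi*xdemo) + rnorm(200)
demo.fit = gbm(ydemo ~ xdemo, data = data.frame(xdemo, ydemo), distribution = "gaussian",
               n.trees = 100,           # number of boosting iterations
               shrinkage = 0.1,         # learning rate applied to each new tree
               interaction.depth = 2,   # maximum depth of each tree (1 gives stumps)
               n.minobsinnode = 10)     # minimum observations in a terminal node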
# a simple regression problem
p = 1
x = seq(0, 1, 0.001)
fx <- function(x) 2*sin(3*pi*x)
y = fx(x) + rnorm(length(x))
plot(x, y, pch = 19, ylab = "y", col = "gray")
lines(x, fx(x), lwd = 2) # plot the true regression line
# fit regression boosting
# I use a very large shrinkage value for demonstration purposes
# in practice you should use 0.1 or even smaller values for stability
gbm.fit = gbm(y~x, data = data.frame(x, y), distribution = "gaussian",
n.trees=300, shrinkage=0.5, bag.fraction=0.8)
# somehow, cross-validation for a one-dimensional problem creates an error
# gbm(y ~ ., data = data.frame(x, y), cv.folds = 3) # this produces an error
# plot the fitted regression function at several iterations
par(mfrow=c(2,3))
size=c(1,5,10,50,100,300)
for(i in 1:6)
{
par(mar=c(2,2,3,1))
plot(x, y, pch = 19, ylab = "y", col = "gray")
lines(x, fx(x), lwd = 2)
Fx = predict(gbm.fit, n.trees=size[i]) # this returns the fitted regression function values
lines(x, Fx, lwd = 3, col = "red")
title(paste("# of Iterations = ", size[i]))
}
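As a quick numerical follow-up (a small added sketch, not in the original note), we can compare the training error of the final fit with its error against the true function fx.
# numerical check of the final fit (added sketch)
mean((y - predict(gbm.fit, n.trees = 300))^2)      # training mean squared error
mean((fx(x) - predict(gbm.fit, n.trees = 300))^2)  # squared error against the true f(x)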
n = 1000; set.seed(1)
x = cbind(seq(0, 1, length.out = n), runif(n))
py = (sin(4*pi*x[, 1]) + 1)/2
y = rbinom(n, 1, py)
plot(x[, 1], y + ifelse(y == 1, runif(n, -0.15, 0), runif(n, 0, 0.15)),
pch = 19, col = ifelse(y==1, "red", "blue"), xlab = "x", ylab = "P(Y=1 | X=x)")
points(x[, 1], py, type = "l", lwd = 3)
# fit AdaBoost with subsampling (bag.fraction), again using a large shrinkage factor
gbm.fit = gbm(y~., data.frame(x, y), distribution="adaboost", n.minobsinnode = 2,
n.trees=200, shrinkage = 1, bag.fraction=0.8, cv.folds = 10)
# get the best number of trees from cross-validation (or oob if no cv is used)
gbm.perf(gbm.fit)
## [1] 22
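A short side note, not in the original note: gbm.perf() also lets you choose the error estimate explicitly through its method argument, using "cv" when cross-validation was run and "OOB" otherwise.
# choose the error estimate explicitly (added sketch; standard gbm.perf arguments)
best.cv = gbm.perf(gbm.fit, method = "cv", plot.it = FALSE)    # cross-validation estimate
best.oob = gbm.perf(gbm.fit, method = "OOB", plot.it = FALSE)  # out-of-bag estimate (often conservative)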
# plot the fitted probability 1/(1+exp(-2*Fx)), derived from the decision function Fx (not sign(Fx))
par(mfrow=c(2,3))
size=c(1, 5, 10, 22, 50, 100)
for(i in 1:6)
{
par(mar=c(2,2,3,1))
plot(x[, 1], py, type = "l", lwd = 3, ylab = "P(Y=1 | X=x)", col = "gray")
points(x[, 1], y + ifelse(y == 1, runif(n, -0.15, 0), runif(n, 0, 0.15)),
pch = 19, col = ifelse(y==1, "red", "blue"))
Fx = predict(gbm.fit, n.trees=size[i]) # this returns the decision function F(x), not the class label
lines(x[, 1], 1/(1+exp(-2*Fx)), lwd = 3)
title(paste("# of Iterations = ", size[i]))
}
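To get actual class predictions (a small added sketch, not in the original note), convert the decision function with the same probability formula used in the plots above and threshold at 0.5, which is equivalent to taking sign(Fx).
# convert the decision function to probabilities and class labels (added sketch)
best.trees = gbm.perf(gbm.fit, plot.it = FALSE)
Fx.hat = predict(gbm.fit, n.trees = best.trees)   # decision function F(x) on the training data
p.hat = 1/(1 + exp(-2*Fx.hat))                    # estimated P(Y = 1 | X = x)
y.hat = as.numeric(p.hat > 0.5)                   # equivalent to thresholding sign(Fx.hat)
mean(y.hat == y)                                  # training accuracy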
# You can peek into a single tree
pretty.gbm.tree(gbm.fit, i.tree = 1)
## SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction Weight Prediction
## 0 0 0.24374374 1 2 3 129.2909 800 -0.03992647
## 1 -1 0.68000925 -1 -1 -1 0.0000 188 0.68000925
## 2 -1 -0.26108319 -1 -1 -1 0.0000 612 -0.26108319
## 3 -1 -0.03992647 -1 -1 -1 0.0000 800 -0.03992647
Now let's consider the effect of the shrinkage parameter.
# you may use a very small shrinkage factor to improve stability
# however, this means you will need to use more trees
gbm.fit = gbm(y~., data.frame(x, y), distribution="adaboost", n.minobsinnode = 2,
n.trees=1000, shrinkage = 0.1, bag.fraction=0.8, cv.folds = 10)
gbm.perf(gbm.fit)
## [1] 487
par(mfrow=c(2,3))
size=c(1,10,50,100,250,500) # 1000 trees is clearly over-fitting
for(i in 1:6)
{
par(mar=c(2,2,3,1))
plot(x[, 1], py, type = "l", lwd = 3, ylab = "P(Y=1 | X=x)", col = "gray")
points(x[, 1], y + ifelse(y == 1, runif(n, -0.15, 0), runif(n, 0, 0.15)),
pch = 19, col = ifelse(y==1, "red", "blue"))
Fx = predict(gbm.fit, n.trees=size[i]) # this returns the decision function F(x), not the class label
lines(x[, 1], 1/(1+exp(-2*Fx)), lwd = 3)
title(paste("# of Iterations = ", size[i]))
}
# our previous example with a circle
set.seed(1)
n = 500
x1 = runif(n, -1, 1)
x2 = runif(n, -1, 1)
y = rbinom(n, size = 1, prob = ifelse(x1^2 + x2^2 < 0.6, 0.8, 0.2))
xgrid = expand.grid(x1 = seq(-1, 1, 0.01), x2 = seq(-1, 1, 0.01))
# if we only use one variable in each tree
gbm.fit = gbm(y~., data = data.frame(x1, x2, y), distribution="adaboost",
n.trees=1000, shrinkage=0.01, bag.fraction=0.8,
interaction.depth = 1, cv.folds=10)
usetree = gbm.perf(gbm.fit, method="cv")
Fx = predict(gbm.fit, xgrid, n.trees=usetree)
pred = matrix(1/(1+exp(-2*Fx)) > 0.5, 201, 201)
par(mar=rep(2,4))
contour(seq(-1, 1, 0.01), seq(-1, 1, 0.01), pred, levels=0.5, labels="",axes=FALSE)
points(x1, x2, col = ifelse(y == 1, "deepskyblue", "darkorange"),
pch = 19, yaxt="n", xaxt = "n")
points(xgrid, pch=".", cex=1.2, col=ifelse(pred, "deepskyblue", "darkorange"))
box()
pretty.gbm.tree(gbm.fit, i.tree = 1)
## SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction Weight Prediction
## 0 0 0.7279700371 1 2 3 24.10443 400 -0.0003848737
## 1 -1 0.0006567649 -1 -1 -1 0.00000 347 0.0006567649
## 2 -1 -0.0072046589 -1 -1 -1 0.00000 53 -0.0072046589
## 3 -1 -0.0003848737 -1 -1 -1 0.00000 400 -0.0003848737
We could also allow interactions within each tree. This leads to deeper trees, and it takes fewer iterations to achieve similar performance.
gbm.fit = gbm(y~., data = data.frame(x1, x2, y), distribution="adaboost",
n.trees=1000, shrinkage=0.01, bag.fraction=0.8,
interaction.depth = 2, cv.folds=10)
usetree = gbm.perf(gbm.fit, method="cv")
Fx = predict(gbm.fit, xgrid, n.trees=usetree)
pred = matrix(1/(1+exp(-2*Fx)) > 0.5, 201, 201)
par(mar=rep(2,4))
contour(seq(-1, 1, 0.01), seq(-1, 1, 0.01), pred, levels=0.5, labels="",axes=FALSE)
points(x1, x2, col = ifelse(y == 1, "deepskyblue", "darkorange"),
pch = 19, yaxt="n", xaxt = "n")
points(xgrid, pch=".", cex=1.2, col=ifelse(pred, "deepskyblue", "darkorange"))
box()
pretty.gbm.tree(gbm.fit, i.tree = 1)
## SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction Weight Prediction
## 0 1 -0.658632189 1 2 6 30.69190 400 0.000309508
## 1 -1 -0.005887393 -1 -1 -1 0.00000 71 -0.005887393
## 2 0 0.694127902 3 4 5 31.73123 329 0.001646833
## 3 -1 0.003043478 -1 -1 -1 0.00000 277 0.003043478
## 4 -1 -0.005792988 -1 -1 -1 0.00000 52 -0.005792988
## 5 -1 0.001646833 -1 -1 -1 0.00000 329 0.001646833
## 6 -1 0.000309508 -1 -1 -1 0.00000 400 0.000309508
You can estimate variable importance in a similar fashion to a random forest.
gbm.fit = gbm(y~., data = data.frame(x1, x2, matrix(runif(n*10), n, 10), y),
distribution="adaboost", n.trees=1000, shrinkage=0.01,
bag.fraction=0.8, interaction.depth = 2, cv.folds=10)
summary(gbm.fit, method = permutation.test.gbm)
## var rel.inf
## 1 x1 38.1405889
## 2 x2 37.5402623
## 3 X2 5.5556527
## 4 X10 4.0639256
## 5 X7 2.8284456
## 6 X1 2.4353977
## 7 X8 2.2193764
## 8 X9 1.8153845
## 9 X6 1.6683819
## 10 X4 1.4292909
## 11 X5 1.3463841
## 12 X3 0.9569094
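As a brief follow-up, not part of the original note: the default importance measure in summary.gbm is relative.influence, the empirical reduction in the loss attributable to each variable, which can be compared against the permutation-based measure above.
# default relative influence measure, for comparison with the permutation test above (added sketch)
summary(gbm.fit, method = relative.influence, plotit = FALSE)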