Abstract
This is the supplementary R file for AdaBoost and Gradient
Boosting in the lecture note “Boosting”.
One of the most popular packages is gbm. In recent years,
xgboost has become extremely popular on Kaggle; the main
difference between the two is computational performance. In
this course, gbm is sufficient for demonstrating the method.
library(gbm)
We consider a regression problem with just one variable. The
gbm package uses trees as weak learners. You can further
control the complexity of these trees through tuning parameters.
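The main complexity controls are interaction.depth (the depth of each tree) and n.minobsinnode (the minimum number of observations in a terminal node). Here is a minimal sketch on made-up data, just to show where these arguments go (the data here are hypothetical, not part of the examples below).
# hypothetical data, only to illustrate the tree-complexity arguments
toy = data.frame(x = runif(100), y = rnorm(100))
toy.fit = gbm(y ~ x, data = toy, distribution = "gaussian",
              n.trees = 50, shrinkage = 0.1,
              interaction.depth = 1, # depth of each tree (1 gives stumps)
              n.minobsinnode = 10)   # minimum observations in a terminal node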
# a simple regression problem
p = 1
x = seq(0, 1, 0.001)
fx <- function(x) 2*sin(3*pi*x)
y = fx(x) + rnorm(length(x))
plot(x, y, pch = 19, ylab = "y", col = "gray")
lines(x, fx(x), lwd = 2) # plot the true regression line
# fit regression boosting
# I use a very large shrinkage value for demonstration purposes
# in practice you should use 0.1 or even smaller values for stability
gbm.fit = gbm(y~x, data = data.frame(x, y), distribution = "gaussian",
n.trees=300, shrinkage=0.5, bag.fraction=0.8)
# for some reason, cross-validation produces an error when there is only one predictor
# gbm(y ~ ., data = data.frame(x, y), cv.folds = 3) # this produces an error
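# one possible workaround (my assumption, not verified in this note): add a
# dummy predictor so the model has more than one column, e.g.
# gbm(y ~ ., data = data.frame(x, dummy = rnorm(length(x)), y), cv.folds = 3)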
# plot the fitted regression function at several iterations
par(mfrow=c(2,3))
size=c(1,5,10,50,100,300)
for(i in 1:6)
{
par(mar=c(2,2,3,1))
plot(x, y, pch = 19, ylab = "y", col = "gray")
lines(x, fx(x), lwd = 2)
Fx = predict(gbm.fit, n.trees=size[i]) # returns the fitted regression function at the training points
lines(x, Fx, lwd = 3, col = "red")
title(paste("# of Iterations = ", size[i]))
}
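For comparison, here is a minimal xgboost sketch of the same regression fit. This is only an illustration assuming the classic xgboost R interface (xgboost() with eta playing the role of shrinkage); older versions name the objective "reg:linear" rather than "reg:squarederror".
library(xgboost)
xgb.fit = xgboost(data = matrix(x, ncol = 1), label = y, nrounds = 300,
                  eta = 0.5, max_depth = 1,
                  objective = "reg:squarederror", verbose = 0)
plot(x, y, pch = 19, ylab = "y", col = "gray")
lines(x, fx(x), lwd = 2)                                              # true regression line
lines(x, predict(xgb.fit, matrix(x, ncol = 1)), lwd = 3, col = "red") # xgboost fit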
n = 1000; set.seed(1)
x = cbind(seq(0, 1, length.out = n), runif(n))
py = (sin(4*pi*x[, 1]) + 1)/2
y = rbinom(n, 1, py)
plot(x[, 1], y + ifelse(y == 1, runif(n, -0.15, 0), runif(n, 0, 0.15)),
pch = 19, col = ifelse(y==1, "red", "blue"), xlab = "x", ylab = "P(Y=1 | X=x)")
points(x[, 1], py, type = "l", lwd = 3)
# fit AdaBoost with bootstrapping (bag.fraction < 1); again using a large shrinkage factor
gbm.fit = gbm(y~., data.frame(x, y), distribution="adaboost", n.minobsinnode = 2,
n.trees=200, shrinkage = 1, bag.fraction=0.8, cv.folds = 10)
# get the best number of trees from cross-validation (or oob if no cv is used)
gbm.perf(gbm.fit)
## [1] 22
# plot the estimated probability 1/(1+exp(-2*Fx)) implied by the decision function Fx (not the class label sign(Fx))
par(mfrow=c(2,3))
size=c(1, 5, 10, 22, 50, 100)
for(i in 1:6)
{
par(mar=c(2,2,3,1))
plot(x[, 1], py, type = "l", lwd = 3, ylab = "P(Y=1 | X=x)", col = "gray")
points(x[, 1], y + ifelse(y == 1, runif(n, -0.15, 0), runif(n, 0, 0.15)),
pch = 19, col = ifelse(y==1, "red", "blue"))
Fx = predict(gbm.fit, n.trees=size[i]) # returns the decision function Fx, not the class label
lines(x[, 1], 1/(1+exp(-2*Fx)), lwd = 3)
title(paste("# of Iterations = ", size[i]))
}
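As a quick sanity check (an illustration, not part of the original note): the predicted class label is sign(Fx), so the training misclassification rate at the cross-validated number of trees is
best.iter = gbm.perf(gbm.fit, method = "cv", plot.it = FALSE)
Fx = predict(gbm.fit, n.trees = best.iter)
yhat = as.numeric(Fx > 0) # sign(Fx) > 0 corresponds to class 1
mean(yhat != y)           # training misclassification rate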
# You can peek into a single tree
# (SplitVar = -1 marks a terminal node; SplitCodePred is the split value
# for internal nodes and the fitted value for terminal nodes)
pretty.gbm.tree(gbm.fit, i.tree = 1)
## SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction Weight Prediction
## 0 0 0.24374374 1 2 3 129.2909 800 -0.03992647
## 1 -1 0.68000925 -1 -1 -1 0.0000 188 0.68000925
## 2 -1 -0.26108319 -1 -1 -1 0.0000 612 -0.26108319
## 3 -1 -0.03992647 -1 -1 -1 0.0000 800 -0.03992647
# shrinkage: you may use a very small shrinkage factor to improve stability
# however, this means you will need to use more trees
gbm.fit = gbm(y~., data.frame(x, y), distribution="adaboost", n.minobsinnode = 2,
n.trees=1000, shrinkage = 0.1, bag.fraction=0.8, cv.folds = 10)
gbm.perf(gbm.fit)
## [1] 487
par(mfrow=c(2,3))
size=c(1,10,50,100,250,500) # 1000 trees is clearly over-fitting
for(i in 1:6)
{
par(mar=c(2,2,3,1))
plot(x[, 1], py, type = "l", lwd = 3, ylab = "P(Y=1 | X=x)", col = "gray")
points(x[, 1], y + ifelse(y == 1, runif(n, -0.15, 0), runif(n, 0, 0.15)),
pch = 19, col = ifelse(y==1, "red", "blue"))
Fx = predict(gbm.fit, n.trees=size[i]) # returns the decision function Fx, not the class label
lines(x[, 1], 1/(1+exp(-2*Fx)), lwd = 3)
title(paste("# of Iterations = ", size[i]))
}
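To see why using all 1000 trees over-fits, one can inspect the cross-validation error curve stored in the fitted object (an optional illustration; cv.error is available because cv.folds > 1):
par(mfrow=c(1,1))
plot(gbm.fit$cv.error, type = "l", xlab = "iteration", ylab = "CV error")
abline(v = gbm.perf(gbm.fit, method = "cv", plot.it = FALSE), lty = 2) # best iteration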
# our previous example with a circle
set.seed(1)
n = 500
x1 = runif(n, -1, 1)
x2 = runif(n, -1, 1)
y = rbinom(n, size = 1, prob = ifelse(x1^2 + x2^2 < 0.6, 0.8, 0.2))
xgrid = expand.grid(x1 = seq(-1, 1, 0.01), x2 = seq(-1, 1, 0.01))
# interaction.depth = 1: each tree is a stump, so it uses only one variable
gbm.fit = gbm(y~., data = data.frame(x1, x2, y), distribution="adaboost",
n.trees=1000, shrinkage=0.01, bag.fraction=0.8,
interaction.depth = 1, cv.folds=10)
usetree = gbm.perf(gbm.fit, method="cv")
Fx = predict(gbm.fit, xgrid, n.trees=usetree)
pred = matrix(1/(1+exp(-2*Fx)) > 0.5, 201, 201)
par(mar=rep(2,4))
contour(seq(-1, 1, 0.01), seq(-1, 1, 0.01), pred, levels=0.5, labels="",axes=FALSE)
points(x1, x2, col = ifelse(y == 1, "deepskyblue", "darkorange"),
pch = 19, yaxt="n", xaxt = "n")
points(xgrid, pch=".", cex=1.2, col=ifelse(pred, "deepskyblue", "darkorange"))
box()
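Since the true probability is 0.8 inside the circle and 0.2 outside, the Bayes rule predicts class 1 exactly when x1^2 + x2^2 < 0.6. As an illustration, we can check how often the fitted classifier agrees with the Bayes rule on the grid:
bayes = with(xgrid, x1^2 + x2^2 < 0.6) # Bayes rule: class 1 inside the circle
mean(as.vector(pred) == bayes)         # agreement with the Bayes rule on the grid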
pretty.gbm.tree(gbm.fit, i.tree = 1)
## SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction Weight Prediction
## 0 0 0.7279700371 1 2 3 24.10443 400 -0.0003848737
## 1 -1 0.0006567649 -1 -1 -1 0.00000 347 0.0006567649
## 2 -1 -0.0072046589 -1 -1 -1 0.00000 53 -0.0072046589
## 3 -1 -0.0003848737 -1 -1 -1 0.00000 400 -0.0003848737
We could also allow interactions in each tree. This leads to deeper trees, and fewer iterations are needed to achieve similar performance.
gbm.fit = gbm(y~., data = data.frame(x1, x2, y), distribution="adaboost",
n.trees=1000, shrinkage=0.01, bag.fraction=0.8,
interaction.depth = 2, cv.folds=10)
usetree = gbm.perf(gbm.fit, method="cv")
Fx = predict(gbm.fit, xgrid, n.trees=usetree)
pred = matrix(1/(1+exp(-2*Fx)) > 0.5, 201, 201)
par(mar=rep(2,4))
contour(seq(-1, 1, 0.01), seq(-1, 1, 0.01), pred, levels=0.5, labels="",axes=FALSE)
points(x1, x2, col = ifelse(y == 1, "deepskyblue", "darkorange"),
pch = 19, yaxt="n", xaxt = "n")
points(xgrid, pch=".", cex=1.2, col=ifelse(pred, "deepskyblue", "darkorange"))
box()
pretty.gbm.tree(gbm.fit, i.tree = 1)
## SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction Weight Prediction
## 0 1 -0.658632189 1 2 6 30.69190 400 0.000309508
## 1 -1 -0.005887393 -1 -1 -1 0.00000 71 -0.005887393
## 2 0 0.694127902 3 4 5 31.73123 329 0.001646833
## 3 -1 0.003043478 -1 -1 -1 0.00000 277 0.003043478
## 4 -1 -0.005792988 -1 -1 -1 0.00000 52 -0.005792988
## 5 -1 0.001646833 -1 -1 -1 0.00000 329 0.001646833
## 6 -1 0.000309508 -1 -1 -1 0.00000 400 0.000309508
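With depth-2 trees the fit can capture an interaction between x1 and x2. One optional diagnostic (an illustration, not in the original note) is Friedman's H-statistic, computed by interact.gbm:
interact.gbm(gbm.fit, data = data.frame(x1, x2, y),
             i.var = c("x1", "x2"), n.trees = usetree) # H-statistic for (x1, x2)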
You can estimate variable importance in a similar fashion to a random forest.
gbm.fit = gbm(y~., data = data.frame(x1, x2, matrix(runif(n*10), n, 10), y),
distribution="adaboost", n.trees=1000, shrinkage=0.01,
bag.fraction=0.8, interaction.depth = 2, cv.folds=10)
summary(gbm.fit, method = permutation.test.gbm)
## var rel.inf
## 1 x1 38.1405889
## 2 x2 37.5402623
## 3 X2 5.5556527
## 4 X10 4.0639256
## 5 X7 2.8284456
## 6 X1 2.4353977
## 7 X8 2.2193764
## 8 X9 1.8153845
## 9 X6 1.6683819
## 10 X4 1.4292909
## 11 X5 1.3463841
## 12 X3 0.9569094
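Beyond variable importance, the gbm plot method produces partial dependence plots; a minimal sketch (an illustration):
best.iter = gbm.perf(gbm.fit, method = "cv", plot.it = FALSE)
plot(gbm.fit, i.var = "x1", n.trees = best.iter) # partial dependence of Fx on x1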