Abstract
This is the supplementary R file for support vector machines in the lecture note “SVM”.
In linearly separable SVM, we find a hyperplane \[\{ x: \beta_0 + x^\text{T} \boldsymbol \beta = 0 \}\] such that all observations with \(y_i = 1\) are on one side, and observations with \(y_i = -1\) are on the other side. We generate the data as follows.
set.seed(1)
n <- 6
p <- 2
# Generate positive and negative examples
xneg <- matrix(rnorm(n*p,mean=0,sd=1),n,p)
xpos <- matrix(rnorm(n*p,mean=3,sd=1),n,p)
x <- rbind(xpos,xneg)
y <- c(rep(1, n), rep(-1, n))   # class labels: +1 and -1
# plot
plot(x,col=ifelse(y>0,"blue","red"), pch = 19, cex = 1.2, lwd = 2,
xlab = "X1", ylab = "X2", cex.lab = 1.5)
legend("bottomright", c("Positive", "Negative"),col=c("blue", "red"),
pch=c(19, 19), text.col=c("blue", "red"), cex = 1.5)
We use the e1071 package to fit the SVM. There is a cost parameter \(C\), with default value 1. This parameter has a significant impact on non-separable problems; for our separable case, we set it to a very large value, meaning that the cost of a misclassified observation is very large. We also need to specify the linear kernel.
library(e1071)
svm.fit <- svm(y ~ ., data = data.frame(x, y), type='C-classification',
kernel='linear', scale=FALSE, cost = 10000)
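As a quick sanity check (added here, not part of the original note), we can count the support vectors and confirm that the training data are classified perfectly, which must be the case for a separable problem with a very large cost.
# number of support vectors and training predictions
length(svm.fit$index)
table(predict(svm.fit, data.frame(x)), y)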
Recall that only a small number of points decide the margin. These points are the ones with \(\alpha_i > 0\) in our dual form. This information is provided by three components of the fitted object:
- coefs provides \(y_i \alpha_i\) for the support vectors
- SV contains the \(x_i\) values corresponding to the support vectors
- rho is the negative of \(\beta_0\)
b <- t(svm.fit$coefs) %*% svm.fit$SV
b0 <- -svm.fit$rho
# an alternative way to compute b0, as in the lecture note
b0 <- -(max(x[y == -1, ] %*% t(b)) + min(x[y == 1, ] %*% t(b)))/2
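Since the problem is separable and the cost is very large, the two computations of \(\beta_0\) should agree up to numerical error; printing both is a quick check (added here as a sketch).
# compare the rho-based and margin-midpoint versions of b0
c(-svm.fit$rho, b0)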
# plot the data
plot(x,col=ifelse(y>0,"blue","red"), pch = 19, cex = 1.2, lwd = 2,
xlab = "X1", ylab = "X2", cex.lab = 1.5)
legend("bottomleft", c("Positive","Negative"),col=c("blue","red"),
pch=c(19, 19),text.col=c("blue","red"), cex = 1.5)
abline(a= -b0/b[1,2], b=-b[1,1]/b[1,2], col="black", lty=1, lwd = 2)
# mark the support vectors
points(x[svm.fit$index, ], col="black", cex=3)
# the two margin lines
abline(a= (-b0-1)/b[1,2], b=-b[1,1]/b[1,2], col="black", lty=3, lwd = 2)
abline(a= (-b0+1)/b[1,2], b=-b[1,1]/b[1,2], col="black", lty=3, lwd = 2)
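The distance from the separating hyperplane to either margin line is \(1/\lVert \boldsymbol\beta \rVert\), so the total margin width is \(2/\lVert \boldsymbol\beta \rVert\). We can compute it from the fitted coefficients (a small check added here, not in the original note).
# margin width 2 / ||beta||
2 / sqrt(sum(b^2))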
Next, we generate a case where the two classes overlap, so some observations will be on the wrong side of the boundary. We use the default \(C = 1\).
set.seed(70)
n <- 10 # number of data points for each class
p <- 2 # dimension
# Generate the positive and negative examples
xneg <- matrix(rnorm(n*p,mean=0,sd=1),n,p)
xpos <- matrix(rnorm(n*p,mean=1.5,sd=1),n,p)
x <- rbind(xpos,xneg)
y <- c(rep(1, n), rep(-1, n))   # class labels: +1 and -1
# Visualize the data
plot(x,col=ifelse(y>0,"blue","red"), pch = 19, cex = 1.2, lwd = 2,
xlab = "X1", ylab = "X2", cex.lab = 1.5)
legend("topright", c("Positive","Negative"),col=c("blue","red"),
pch=c(19, 19),text.col=c("blue","red"), cex = 1.5)
svm.fit <- svm(y ~ ., data = data.frame(x, y), type='C-classification',
kernel='linear',scale=FALSE, cost = 1)
b <- t(svm.fit$coefs) %*% svm.fit$SV
b0 <- -svm.fit$rho
points(x[svm.fit$index, ], col="black", cex=3)
abline(a= -b0/b[1,2], b=-b[1,1]/b[1,2], col="black", lty=1, lwd = 2)
abline(a= (-b0-1)/b[1,2], b=-b[1,1]/b[1,2], col="black", lty=3, lwd = 2)
abline(a= (-b0+1)/b[1,2], b=-b[1,1]/b[1,2], col="black", lty=3, lwd = 2)
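With a finite \(C\), observations are allowed to violate the margin; the violations are the points with functional margin \(y_i(x_i^\text{T}\boldsymbol\beta + \beta_0) < 1\). The following sketch (added here; it assumes the sign convention above, with the positive class on the positive side of the hyperplane) counts the violations.
# functional margins y_i * (x_i^T beta + b0)
fmargin <- y * (x %*% t(b) + b0)
sum(fmargin < 1)   # points inside the margin or misclassified
sum(fmargin < 0)   # misclassified points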
If we instead use a smaller \(C\):
# Visualize the data
plot(x,col=ifelse(y>0,"blue","red"), pch = 19, cex = 1.2, lwd = 2,
xlab = "X1", ylab = "X2", cex.lab = 1.5)
legend("topright", c("Positive","Negative"),col=c("blue","red"),
pch=c(19, 19),text.col=c("blue","red"), cex = 1.5)
# fit SVM with C = 0.1
svm.fit <- svm(y ~ ., data = data.frame(x, y), type='C-classification',
kernel='linear',scale=FALSE, cost = 0.1)
b <- t(svm.fit$coefs) %*% svm.fit$SV
b0 <- -svm.fit$rho
points(x[svm.fit$index, ], col="black", cex=3)
abline(a= -b0/b[1,2], b=-b[1,1]/b[1,2], col="black", lty=1, lwd = 2)
abline(a= (-b0-1)/b[1,2], b=-b[1,1]/b[1,2], col="black", lty=3, lwd = 2)
abline(a= (-b0+1)/b[1,2], b=-b[1,1]/b[1,2], col="black", lty=3, lwd = 2)
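A smaller \(C\) puts less weight on margin violations, so we expect a smaller \(\lVert\boldsymbol\beta\rVert\) (hence a wider margin) and typically more support vectors. We can check both quantities for this fit and compare with the \(C = 1\) fit above (added here for illustration).
# margin width and number of support vectors with C = 0.1
2 / sqrt(sum(b^2))
length(svm.fit$index)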
We use the mixture.example dataset in the ElemStatLearn package. In addition, we use a different package, kernlab.
rm(list = ls())
library(ElemStatLearn)
data(mixture.example)
attach(mixture.example)
names(mixture.example)
## [1] "x" "y" "xnew" "prob" "marginal" "px1" "px2" "means"
### train the SVM using a different package
library(kernlab)
cost = 10
svm.fit <- ksvm(x,y,type="C-svc",kernel='vanilladot',C=cost,scaled=c())
## Setting default kernel parameters
#Bayes decision boundary:
prob1<-mixture.example$prob
prob.bayes <- matrix(prob1, length(px1), length(px2))
contour(px1, px2, prob.bayes, levels=0.5, lty=2, labels="", xlab="x1",ylab="x2",
main="SVM with linear kernal")
points(x, col=ifelse(y==1, "darkorange", "deepskyblue"), pch = 19)
# Extract the indices of the support vectors on the margin:
sv.alpha<-alpha(svm.fit)[[1]][which(alpha(svm.fit)[[1]]<cost)]
sv.index<-alphaindex(svm.fit)[[1]][which(alpha(svm.fit)[[1]]<cost)]
sv.matrix<-x[sv.index,]
points(sv.matrix, pch=16, col=ifelse(y[sv.index] == 1, "darkorange", "deepskyblue"), cex=1.5)
# Plot the hyperplane and the margins:
w <- t(cbind(coef(svm.fit)[[1]])) %*% xmatrix(svm.fit)[[1]]
b <- - b(svm.fit)
abline(a= -b/w[1,2], b=-w[1,1]/w[1,2], col="black", lty=1, lwd = 2)
abline(a= (-b-1)/w[1,2], b=-w[1,1]/w[1,2], col="black", lty=3, lwd = 2)
abline(a= (-b+1)/w[1,2], b=-w[1,1]/w[1,2], col="black", lty=3, lwd = 2)
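As an added check (not in the original code), kernlab reports the training error of the fitted model through its error() accessor.
# training error of the linear-kernel SVM
error(svm.fit)
Finally, we fit an SVM with a radial kernel using the e1071 package again, and overlay its decision boundary (level 0 of the decision values) with the Bayes decision boundary.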
dat = data.frame(y = factor(y), x)
fit = svm(factor(y) ~ ., data = dat, scale = FALSE, kernel = "radial", cost = 5)
xgrid = expand.grid(X1 = px1, X2 = px2)
func = predict(fit, xgrid, decision.values = TRUE)
func = attributes(func)$decision
ygrid = predict(fit, xgrid)
plot(xgrid, col = ifelse(ygrid == 1, "bisque", "cadetblue1"), pch = 20, cex = 0.2, main="SVM with radial kernel")
points(x, col=ifelse(y==1, "darkorange", "deepskyblue"))
contour(px1, px2, matrix(func, 69, 99), level = 0, add = TRUE)
contour(px1, px2, matrix(prob, 69, 99), level = 0.5, add = TRUE, col = "blue", lwd = 2)
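To quantify how close the SVM is to the Bayes rule, we can compare the predicted classes on the grid with the Bayes rule, and also compute the training error. This is a quick sketch added here; it assumes prob is ordered the same way as expand.grid(px1, px2), which is the ordering the contour calls above already rely on.
# agreement between the SVM prediction and the Bayes rule on the grid
mean((ygrid == "1") == (prob > 0.5))
# training error of the radial-kernel SVM
mean(predict(fit, dat) != dat$y)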