Please remove this section when submitting your homework.
Students are encouraged to work together on homework and/or utilize advanced AI tools. However, sharing, copying, or providing any part of a homework solution or code to others is an infraction of the University’s rules on Academic Integrity. Any violation will be punished as severely as possible. Final submissions must be uploaded to Gradescope. No email or hard copy will be accepted. For late submission policy and grading rubrics, please refer to the course website.
HWx_yourNetID.pdf
. For example,
HW01_rqzhu.pdf
. Please note that this must be a
.pdf
file. .html
format
cannot be accepted. Make all of your R
code chunks visible for grading..Rmd
file
as a template, be sure to remove this instruction
section.R
is \(\geq
4.0.0\). This will ensure your random seed generation is the same
as everyone else. Please note that updating the R
version
may require you to reinstall all of your packages.Load the MNIST dataset, the same way as HW5.
# readin the data
mnist <- read.csv("https://pjreddie.com/media/files/mnist_train.csv", nrows = 2000)
colnames(mnist) = c("Digit", paste("Pixel", seq(1:784), sep = ""))
save(mnist, file = "mnist_first2000.RData")
# you can load the data with the following code
# load("mnist_first2000.RData")
dim(mnist)
## [1] 2000 785
e1071
package. Set the cost parameter \(C =
1\). # your code here
train <- mnist[1:1000,]
train <- train[train$Digit == 1 | train$Digit == 2,]
test <- mnist[1001:2000,]
test <- test[test$Digit == 1 | test$Digit == 2,]
library(e1071)
# this code will take a long time to run and have warnings.
# svmfit = svm(Digit ~ ., data = train, kernel = "linear", cost = 1)
# mean(predict(svmfit, train) == train$Digit)
# mean(predict(svmfit, test) == test$Digit)
# perform marginal screening
var = apply(train[, -1], 2, var)
varuse = order(var, decreasing = TRUE)[1:300]
train = train[, c(1, varuse + 1)]
test = test[, c(1, varuse + 1)]
# redo svm
svmfit = svm(as.factor(Digit) ~ ., data = train, kernel = "linear", cost = 1)
mean( predict(svmfit, train) == train$Digit )
## [1] 1
mean( predict(svmfit, test) == test$Digit )
## [1] 0.9901478
# extract coefficients
# create a coefficient vector of length 784
w = t(svmfit$SV) %*% svmfit$coefs
w[order(abs(w), decreasing = TRUE)[1:10],]
## Pixel152 Pixel261 Pixel232 Pixel435 Pixel406 Pixel151
## -0.11247789 -0.10729820 -0.08626845 0.07970712 0.07244728 -0.06964410
## Pixel378 Pixel517 Pixel351 Pixel231
## 0.06951482 -0.06348145 0.06307705 -0.06269874
# we can also plot them on a 28 by 28 grid
#coef.svm = rep(0, 784)
#coef.svm[varuse] = w
# create a color palette based on the values of the coefficients
#library(RColorBrewer)
#palette = brewer.pal(9, "RdBu")
#col.svm = palette[as.numeric(cut(coef.svm, 9))]
#col.svm[coef.svm == 0] = "white"
# plot the coefficients on a 28 by 28 grid
#par(mfrow = c(1, 1))
#image(1:28, 1:28, matrix(coef.svm, 28, 28)[, 28:1], col = col.svm, xlab = "", ylab = "")
#title("Coefficients of Pixels")
# perform PCA
pca = prcomp(train[, -1], center = TRUE, scale = TRUE)
# plot the first two principle components
par(mfrow = c(1, 1))
plot(pca$x[, 1], pca$x[, 2], col = as.factor(train$Digit), pch = 19, xlab = "PC1", ylab = "PC2")
legend("topright", legend = c("1", "2"), col = c("black", "red"), pch = 19)
title("First Two Principle Components")
# plot the first principle component against the linear separation rule
par(mfrow = c(1, 1))
plot(pca$x[, 1], data.matrix(train[, -1]) %*% w,
pch = 19, xlab = "PC1", ylab = "SVM Rule", col = as.factor(train$Digit))
# perform logistic regression with ridge penalty
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-4
x = data.matrix(train[, -1])
y = as.numeric(train$Digit) - 1
fit = cv.glmnet(x, y, family = "binomial", alpha = 0.5)
# plot the linear link function of the logistic regression model against the linear separation rule
par(mfrow = c(1, 1))
plot(predict(fit, x, s = "lambda.min"), data.matrix(train[, -1]) %*% w,
pch = 19, xlab = "Logistic Regression Rule", ylab = "SVM Rule", col = as.factor(train$Digit))
[25 pts] Our current SVM is only applicable to binary classification problems. In this question, we will extend it to multi-class classification problems. A simple idea is called one-vs-one (OVO) classification. For example, if we have 3 classes, we can fit 3 SVMs, each of which is trained on two classes. For a new observation, we throw that into each of the 3 SVMs and obtain three predictions. We then use the majority vote to determine its class. Carry out this approach using digits 1, 6, and 7 in our MNIST data. You still need to select the top pixels with the highest variance to avoid unnecessary warnings. But in this question, use only 100 pixels. For all models, keep the cost parameter \(C = 1\).
# your code here
train <- mnist[1:1001,]
train <- train[train$Digit == 1 | train$Digit == 6 | train$Digit == 7,]
test <- mnist[1001:2000,]
test <- test[test$Digit == 1 | test$Digit == 6 | test$Digit == 7,]
# perform marginal screening
var = apply(train[, -1], 2, var)
varuse = order(var, decreasing = TRUE)[1:100]
train = train[, c(1, varuse + 1)]
test = test[, c(1, varuse + 1)]
library(e1071)
# fit three models
svmfit16 = svm(as.factor(Digit) ~ ., data = train[train$Digit %in% c(1, 6), ], kernel = "linear", cost = 1)
svmfit17 = svm(as.factor(Digit) ~ ., data = train[train$Digit %in% c(1, 7), ], kernel = "linear", cost = 1)
svmfit67 = svm(as.factor(Digit) ~ ., data = train[train$Digit %in% c(6, 7), ], kernel = "linear", cost = 1)
# predict
pred16 = predict(svmfit16, test)
pred17 = predict(svmfit17, test)
pred67 = predict(svmfit67, test)
# majority vote
pred = rep(0, length(pred16))
for (i in 1:length(pred16)) {
pred[i] = names(which.max(table(c(pred16[i], pred17[i], pred67[i]))))
}
mean(pred == test$Digit)
## [1] 0.9589905
[25 pts] Load the spam
dataset from the
kernlab
package. In this is a classification example with
consists of 4,601 instances and 57 features. The response variable is
whether an email is spam or not. Use a nonlinear SVM with the Radial
Basis Function (RBF) kernel. Evaluate the performance of the trained
model. Complete the following tasks
kernlab
package and
split the dataset into training (70%) and testing (30%) sets. Keep a
seed.caret
package for
this task. You may need to experiment a few different values of \(C\) and \(\sigma\) to get a good model, but do not
use more than 9 different combinations overall since this can be very
slow.ROCR
package for this task. For the scaler predictor, use
the fitted values \(x^T \beta +
\beta_0\) from the SVM model. library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
data(spam)
dim(spam)
## [1] 4601 58
# split the data
set.seed(432)
trainid = sample(1:nrow(spam), 0.7 * nrow(spam))
traindata = spam[trainid, ]
testdata = spam[-trainid, ]
# Define the parameter grid to search
paramGrid <- expand.grid(.sigma = c(0.01, 0.1, 1), .C = c(1, 10, 100))
# Define the control object for train()
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 3)
# Train the model using train()
svmFit <- train(type ~ .,
data = traindata,
method = "svmRadial",
trControl = fitControl,
preProcess = c("center", "scale"),
tuneGrid = paramGrid)
# View the tuning results
print(svmFit)
## Support Vector Machines with Radial Basis Function Kernel
##
## 3220 samples
## 57 predictor
## 2 classes: 'nonspam', 'spam'
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 2577, 2576, 2576, 2576, 2575, 2577, ...
## Resampling results across tuning parameters:
##
## sigma C Accuracy Kappa
## 0.01 1 0.9279509 0.8474379
## 0.01 10 0.9352993 0.8634445
## 0.01 100 0.9303289 0.8531790
## 0.10 1 0.9038270 0.7936360
## 0.10 10 0.9039312 0.7947059
## 0.10 100 0.8983426 0.7833707
## 1.00 1 0.7733929 0.4730858
## 1.00 10 0.7773263 0.4851722
## 1.00 100 0.7753598 0.4812194
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.01 and C = 10.
# fit the final model using the kernlab package
finalModel <- ksvm(type ~ ., data = traindata, kernel = "rbfdot", kpar = list(sigma = 0.1), C = 1)
# predict on the test data
pred <- predict(finalModel, testdata, type="decision")
# accuracy
mean( (pred > 0) == (testdata$type == "spam") )
## [1] 0.9181752
# ROC curve
library(ROCR)
rocfit <- prediction(pred, testdata$type)
perf <- performance(rocfit, "tpr", "fpr")
plot(perf, col = "red", main = "ROC Curve")
abline(a = 0, b = 1, lty = 2, col = "gray")