Instructions

Please remove this section when submitting your homework.

Students are encouraged to work together on homework and/or utilize advanced AI tools. However, there are two basic rules:

Final submissions must be uploaded to Gradescope; no email or hard-copy submissions will be accepted. Please refer to the course website for the late submission policy and grading rubrics.

Question 1 [40 pts]: Linear SVM and support vectors

We will use the Social Network Ads data, available on Kaggle [link]. The .csv file is also available on our course website. The goal is to classify the outcome Purchased, using only the two continuous covariates EstimatedSalary and Age. Scale and center both covariates before proceeding with the following steps. For this question, you should use the e1071 package. Complete the following tasks:

Answer:

  # read in the data
  SNAds <- read.csv("Social_Network_Ads.csv")

  # scale and center the covariates
  SNAds$EstimatedSalary <- scale(SNAds$EstimatedSalary, center = TRUE, scale = TRUE)
  SNAds$Age <- scale(SNAds$Age, center = TRUE, scale = TRUE)

  # get training and testing data 
  set.seed(1)
  train <- rbind(SNAds[sample(which(SNAds$Purchased==1), 100), ],
                 SNAds[sample(which(SNAds$Purchased==0), 100), ])
  test <- SNAds[-as.numeric(rownames(train)), ]
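
A quick sanity check (a sketch): the two class-wise samples are drawn from disjoint index sets, so the training classes should be balanced and no training row should reappear in the test set.

  # sanity check: balanced classes in training, no train/test overlap
  table(train$Purchased)
  length(intersect(rownames(train), rownames(test)))  # should be 0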
  
  # plot the training data
  plot(cbind(train$EstimatedSalary,train$Age),
       col=ifelse(train$Purchased>0, "deepskyblue", "darkorange"), 
       pch = 19, cex = 1.2, lwd = 2, 
       xlab = "EstimatedSalary", ylab = "Age", cex.lab = 1.5)
  legend("bottomright", c("Purchased", "Not Purchased"),
         col=c("deepskyblue", "darkorange"), cex = 0.8,
         pch=c(19, 19), text.col=c("deepskyblue", "darkorange"))
  
  # fit a linear SVM with cost = 1  
  library("e1071")
  svm.fit <- svm(Purchased ~ EstimatedSalary+Age, data = train, 
                 type='C-classification', 
                 kernel='linear', scale=FALSE, cost = 1)
  
  # calculate the betas and intercept of the decision line
  b <- t(svm.fit$coefs) %*% svm.fit$SV
  b0 <- -svm.fit$rho
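
As a sanity check (a sketch): b above is w = t(coefs) %*% SV and b0 = -rho, so the manual decision values w'x + b0 should match e1071's decision values, up to a global sign that depends on the internal label ordering.

  # manual decision values vs. e1071's (may differ by a global sign)
  dv <- attributes(predict(svm.fit, train, decision.values = TRUE))$decision.values
  manual <- cbind(train$EstimatedSalary, train$Age) %*% t(b) + b0
  all.equal(as.numeric(manual), as.numeric(dv))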
  
  # add the decision line to the plot
  abline(a= -b0/b[1,2], b=-b[1,1]/b[1,2], col="black", lty=1, lwd = 2)
  
  # add the two margin lines to the plot
  abline(a= (-b0-1)/b[1,2], b=-b[1,1]/b[1,2], col="black", lty=3, lwd = 2)
  abline(a= (-b0+1)/b[1,2], b=-b[1,1]/b[1,2], col="black", lty=3, lwd = 2)
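
For reference: the two dotted lines are w'x + b0 = ±1, so the width of the margin between them is 2/||w||, which can be read off from b.

  # margin width between the two dotted margin lines: 2 / ||w||
  2 / sqrt(sum(b^2))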
  
  # mark the support vectors
  points(cbind(train$EstimatedSalary,train$Age)[svm.fit$index, ], 
         col="black", cex=2)

  # training data (in-sample) classification accuracy, computed directly
  # from the decision function w'x + b0 (predict() is not used here)
  train_pred <- train$EstimatedSalary * b[1,1] + train$Age * b[1,2] + b0
  train_class <- ifelse(train_pred > 0, 1, 0)
  train_accuracy <- sum(train_class == train$Purchased) / nrow(train)
  train_accuracy
## [1] 0.845
  # confusion table of the results
  table(train_class, train$Purchased)
##            
## train_class  0  1
##           0 83 14
##           1 17 86
  # predict the outcome for the testing data
  test_pred <- test$EstimatedSalary * b[1,1] + test$Age * b[1,2] + b0
  test_class <- ifelse(test_pred > 0, 1, 0)
  test_accuracy <- sum(test_class == test$Purchased) / nrow(test)
  test_accuracy
## [1] 0.835
  # confusion table of the results
  table(test_class, test$Purchased)
##           
## test_class   0   1
##          0 129   5
##          1  28  38
  # calculate the AUC and plot the ROC curve for the testing data
  library("ROCR")
  pred <- prediction(test_pred, test$Purchased)
  perf <- performance(pred, "tpr", "fpr")
  plot(perf, col="blue", lwd=2, main="ROC Curve for Testing Data")
  abline(a=0, b=1, lty=2, col="red")

  performance(pred, "auc")@y.values[[1]]
## [1] 0.9291957
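
As a cross-check (a sketch): the AUC equals the probability that a randomly chosen positive case scores above a randomly chosen negative one, with ties counted as 1/2, so it can be recomputed from test_pred directly.

  # rank-based AUC cross-check; should agree with the ROCR value above
  pos <- test_pred[test$Purchased == 1]
  neg <- test_pred[test$Purchased == 0]
  mean(outer(pos, neg, ">")) + 0.5 * mean(outer(pos, neg, "=="))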

Question 2 [30 pts]: Nonlinear SVM

In this question, we will use the same training and testing data as in the previous question. For this question, you may use the predict() function to make predictions. Complete the following tasks.

Answer:

  # fit a SVM with cubic kernel with cost = 1 and coef0 = 1
  svm.fit.cubic <- svm(Purchased ~ EstimatedSalary+Age, data = train, 
                       type='C-classification', 
                       kernel='polynomial', degree=3, coef0=1,
                       scale=FALSE, cost = 1)
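
For reference, the kernel fitted above is K(u, v) = (gamma * u'v + coef0)^degree, and e1071's default gamma is 1/(data dimension), i.e. 1/2 with two predictors. A minimal sketch of the kernel function itself:

  # cubic kernel as parameterized by e1071 (default gamma = 1/ncol(x), 0.5 here)
  poly_kernel <- function(u, v, gamma = 0.5, coef0 = 1, degree = 3) {
    (gamma * sum(u * v) + coef0)^degree
  }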
  
  # predict the outcome for the training data
  train_pred_cubic <- predict(svm.fit.cubic, train)
  
  # training data (in-sample) classification accuracy
  train_accuracy_cubic <- sum(train_pred_cubic == train$Purchased) / nrow(train)
  train_accuracy_cubic
## [1] 0.92
  # confusion table of the results
  table(train_pred_cubic, train$Purchased)
##                 
## train_pred_cubic  0  1
##                0 92  8
##                1  8 92
  # predict the outcome for the testing data
  test_pred_cubic <- predict(svm.fit.cubic, test)
  
  # testing data classification accuracy
  test_accuracy_cubic <- sum(test_pred_cubic == test$Purchased) / nrow(test)
  test_accuracy_cubic
## [1] 0.905
  # confusion table of the results
  table(test_pred_cubic, test$Purchased)
##                
## test_pred_cubic   0   1
##               0 138   0
##               1  19  43
  # calculate the AUC and plot the ROC curve for the testing data
  pred_cubic <- prediction(
    attributes(predict(svm.fit.cubic, test, decision.values=TRUE))$decision.values,
    test$Purchased)
  perf_cubic <- performance(pred_cubic, "tpr", "fpr")
  plot(perf_cubic, col="blue", lwd=2, main="ROC Curve for Testing Data (Cubic Kernel)")
  abline(a=0, b=1, lty=2, col="red")

  performance(pred_cubic, "auc")@y.values[[1]]
## [1] 0.9579322
  # generate a grid of values for EstimatedSalary and Age 
  x1.range <- seq(min(SNAds$EstimatedSalary), max(SNAds$EstimatedSalary), length=100)
  x2.range <- seq(min(SNAds$Age), max(SNAds$Age), length=100)
  grid <- expand.grid(EstimatedSalary = x1.range, Age = x2.range)
  grid$Purchased <- predict(svm.fit.cubic, newdata=grid)
  
  # plot the decision boundary
  plot(cbind(train$EstimatedSalary,train$Age),
       col=ifelse(train$Purchased>0, "deepskyblue", "darkorange"), 
       pch = 19, cex = 1.2, lwd = 2, 
       xlab = "EstimatedSalary", ylab = "Age", cex.lab = 1.5)
  points(grid$EstimatedSalary, grid$Age,
         col=ifelse(grid$Purchased==1, "lightblue", "navajowhite"),
         pch=15, cex=0.5)
  legend("bottomright", c("Purchased", "Not Purchased"),
         col=c("deepskyblue", "darkorange"), cex = 0.8,
         pch=c(19, 19), text.col = c("deepskyblue", "darkorange"))
  title("Decision Boundary for Cubic Kernel SVM")

Question 3 [30 pts]: SVM for handwritten digit data

Take digits 4 and 9 from zip.train and zip.test in the ElemStatLearn package. For this question, you should use the kernlab package, in combination with the caret package, to tune the parameters. Make sure that you specify the method = argument so that the correct package/function is used to fit the model. You may consider reading the details in this documentation. Complete the following tasks.

Answer:
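
Note: ElemStatLearn has been archived on CRAN, so a plain install.packages("ElemStatLearn") may fail. One workaround is installing the archived source; the version and URL below are assumptions, so verify them against the CRAN archive listing first.

    # ElemStatLearn is archived on CRAN; the version/URL below are
    # assumptions, check the current archive listing before running
    # install.packages(
    #   "https://cran.r-project.org/src/contrib/Archive/ElemStatLearn/ElemStatLearn_2015.6.26.2.tar.gz",
    #   repos = NULL, type = "source")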

    # Construct the training and testing data (with digits 4 and 9).
    library("ElemStatLearn")
    train <- zip.train[zip.train[, 1] %in% c(4, 9), ]
    test <- zip.test[zip.test[, 1] %in% c(4, 9), ]
    library("caret")
## Loading required package: ggplot2
## Loading required package: lattice
    # train a linear SVM using the caret and kernlab packages
    library("kernlab")
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
    set.seed(1)
    svm.linear <- train(y ~ .,
                        data = data.frame("x" = train[,-1], "y" = as.factor(train[,1])),
                        method = "svmLinear",
                        preProcess = c("center", "scale"),
                        tuneGrid = expand.grid(C = c(0.01, 0.1, 0.5, 1)),
                        trControl = trainControl(method = "cv", number = 10))
    svm.linear
## Support Vector Machines with Linear Kernel 
## 
## 1296 samples
##  256 predictor
##    2 classes: '4', '9' 
## 
## Pre-processing: centered (256), scaled (256) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1167, 1167, 1166, 1165, 1167, 1166, ... 
## Resampling results across tuning parameters:
## 
##   C     Accuracy   Kappa    
##   0.01  0.9899523  0.9799046
##   0.10  0.9845616  0.9691195
##   0.50  0.9853309  0.9706558
##   1.00  0.9853309  0.9706558
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.01.
    # the best C is 0.01; refit with kernlab's ksvm() directly
    svm.linear.best <- ksvm(x=train[,-1], y=as.factor(train[,1]),
                            type="C-svc", kernel='vanilladot', C=0.01)
##  Setting default kernel parameters
    # predict the testing data using this model and
    # obtain the confusion table and testing data accuracy
    pred <- predict(svm.linear.best, test[,-1])
    # confusion table
    table(pred, as.factor(test[,1]))
##     
## pred   4   9
##    4 192   5
##    9   8 172
    # testing data accuracy
    sum(diag(table(pred, as.factor(test[,1])))) / nrow(test)
## [1] 0.9655172
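
As a cross-check (a sketch; it assumes the auto-generated predictor names x.1, ..., x.256 from the data.frame() call line up between training and test), the caret object can also predict directly, re-applying the stored center/scale preprocessing:

    # equivalent prediction through caret's predict.train
    pred.caret <- predict(svm.linear, newdata = data.frame("x" = test[,-1]))
    mean(pred.caret == as.factor(test[,1]))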
    # train a radial kernel SVM using the caret and kernlab packages
    svm.radial <- train(y ~ .,
                        data = data.frame("x" = train[,-1], "y" = as.factor(train[,1])),
                        method = "svmRadial",
                        preProcess = c("center", "scale"),
                        tuneGrid = expand.grid(C = c(0.1, 0.5, 1, 5),
                                               sigma = c(0.01, 0.1, 1)),
                        trControl = trainControl(method = "repeatedcv",
                                                 number = 10, repeats = 3))
    svm.radial
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 1296 samples
##  256 predictor
##    2 classes: '4', '9' 
## 
## Pre-processing: centered (256), scaled (256) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 1166, 1167, 1166, 1166, 1165, 1167, ... 
## Resampling results across tuning parameters:
## 
##   C    sigma  Accuracy   Kappa      
##   0.1  0.01   0.9071647  0.814148156
##   0.1  0.10   0.5030889  0.000000000
##   0.1  1.00   0.5030889  0.000000000
##   0.5  0.01   0.9534699  0.906891026
##   0.5  0.10   0.5059253  0.005768462
##   0.5  1.00   0.5030889  0.000000000
##   1.0  0.01   0.9660660  0.932102426
##   1.0  0.10   0.6975090  0.392711286
##   1.0  1.00   0.5030889  0.000000000
##   5.0  0.01   0.9678688  0.935711021
##   5.0  0.10   0.7201310  0.438261851
##   5.0  1.00   0.5030889  0.000000000
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.01 and C = 5.
    svm.radial.best <- ksvm(x=train[,-1], y=as.factor(train[,1]), 
                            kernel=rbfdot(sigma=0.01), C=5)
    # predict the testing data using this model and
    # obtain the confusion table and testing data accuracy
    pred <- predict(svm.radial.best, test[,-1])
    # confusion table
    table(pred, test[,1])
##     
## pred   4   9
##    4 195   5
##    9   5 172
    # testing data accuracy
    sum(diag(table(pred, test[,1]))) / nrow(test)
## [1] 0.9734748