Abstract
This is the supplementary R file for \(k\) Nearest Neighbor in the lecture note “KNN”.
\(K\) nearest neighbor (KNN) is a simple nonparametric method. Suppose we collect a set of observations \(\{x_i, y_i\}_{i=1}^n\). The prediction at a new target point \(x_0\) is
\[\widehat y = \frac{1}{k} \sum_{x_i \in N_k(x_0)} y_i,\] where \(N_k(x_0)\) denotes the set of \(k\) training samples that are closest to \(x_0\). By default, closeness is defined using a distance measure, such as the Euclidean distance. Here is a demonstration of the fitted regression function using the kknn package.
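Before calling the package, it may help to compute the formula directly. Below is a minimal from-scratch sketch (the function name knn_predict and the target x0 are introduced here purely for illustration): find the \(k\) closest training points to \(x_0\) and average their responses.
# a from-scratch illustration of the KNN prediction formula (not the kknn implementation)
knn_predict <- function(x0, x, y, k) {
  # Euclidean distance reduces to an absolute difference in one dimension
  d <- abs(x - x0)
  # indices of the k nearest training points
  nn <- order(d)[1:k]
  # average their responses
  mean(y[nn])
}
# e.g., knn_predict(pi, x, y, k = 3) once x and y are generated below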
library(kknn)
set.seed(1)
# generate training data with 2*sin(x) and random Gaussian errors
x <- runif(15, 0, 2*pi)
y <- 2*sin(x) + rnorm(length(x))
# generate testing data points where we evaluate the prediction function
test.x = seq(0, 1, 0.001)*2*pi
# "1-nearest neighbor" regression
k = 1
knn.fit = kknn(y ~ x, train = data.frame(x = x, y = y),
test = data.frame(x = test.x),
k = k, kernel = "rectangular")
test.pred = knn.fit$fitted.values
# plot the data
par(mar=rep(2,4))
plot(x, y, xlim = c(0, 2*pi), pch = "o", cex = 2,
xlab = "", ylab = "", cex.lab = 1.5)
title(main=paste(k, "-Nearest Neighbor Regression", sep = ""), cex.main = 1.5)
# plot the true regression line
lines(test.x, 2*sin(test.x), col = "deepskyblue", lwd = 3)
box()
# plot the fitted line
lines(test.x, test.pred, type = "s", col = "darkorange", lwd = 3)
\(K\) is a crucial tuning parameter for \(K\)NN. Let’s observe its effect.
# generate more data
set.seed(1)
n = 200
x <- runif(n, 0, 2*pi)
y <- 2*sin(x) + rnorm(length(x))
test.y = 2*sin(test.x) + rnorm(length(test.x))
# 1-nearest neighbor
k = 1
knn.fit = kknn(y ~ x, train = data.frame(x = x, y = y),
test = data.frame(x = test.x),
k = k, kernel = "rectangular")
test.pred = knn.fit$fitted.values
par(mar=rep(2,4))
plot(x, y, xlim = c(0, 2*pi), pch = 19, cex = 1,
axes=FALSE, ylim = c(-4.25, 4.25))
title(main=paste(k, "-Nearest Neighbor Regression", sep = ""))
lines(test.x, 2*sin(test.x), col = "deepskyblue", lwd = 3)
lines(test.x, test.pred, type = "s", col = "darkorange", lwd = 3)
box()
We can then evaluate the prediction error of this model on the testing data:
# prediction error
mean((test.pred - test.y)^2)
## [1] 2.097488
If we consider different values of \(k\), we can observe the trade-off between bias and variance.
par(mfrow=c(2,3))
for (k in c(1, 5, 10, 33, 66, 100))
{
knn.fit = kknn(y ~ x, train = data.frame(x = x, y = y),
test = data.frame(x = test.x),
k = k, kernel = "rectangular")
test.pred = knn.fit$fitted.values
par(mar=rep(2,4))
plot(x, y, xlim = c(0, 2*pi), pch = 19, cex = 0.7,
axes=FALSE, ylim = c(-4.25, 4.25))
title(main=paste("K =", k))
lines(test.x, 2*sin(test.x), col = "deepskyblue", lwd = 3)
lines(test.x, test.pred, type = "s", col = "darkorange", lwd = 3)
box()
}
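The effect of \(k\) on stability can also be checked numerically. The following simulation sketch is not part of the original note (the names x0, nsim, and fit.x0 are introduced here): it repeatedly regenerates training data from the same \(2\sin(x)\) model and records the KNN fit at a single target point, so that the squared bias and variance at that point can be estimated.
# estimate the bias and variance of the KNN fit at a single target point x0
# (simulation sketch; assumes the same 2*sin(x) + N(0, 1) model as above)
x0 = pi/2                # the true regression value is 2*sin(pi/2) = 2
nsim = 200
k = 10                   # try k = 1 or k = 100 to see the trade-off
fit.x0 = rep(NA, nsim)
for (i in 1:nsim)
{
  xs <- runif(200, 0, 2*pi)
  ys <- 2*sin(xs) + rnorm(length(xs))
  fit <- kknn(y ~ x, train = data.frame(x = xs, y = ys),
              test = data.frame(x = x0), k = k, kernel = "rectangular")
  fit.x0[i] = fit$fitted.values
}
# squared bias and variance at x0; the irreducible error is Var(epsilon) = 1
(mean(fit.x0) - 2*sin(x0))^2
var(fit.x0)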
As \(k\) increases, the model becomes more stable (smaller variance), but the bias increases; as \(k\) decreases, the bias shrinks, but the model becomes less stable. Formally, the prediction error at a given target point \(x_0\) can be broken into three parts: the irreducible error, the squared bias, and the variance.
\[\begin{aligned} E\Big[ \big( Y - \widehat f(x_0) \big)^2 \Big] &= E \Big[ \big( Y - f(x_0) + f(x_0) - E[\widehat f(x_0)] + E[\widehat f(x_0)] - \widehat f(x_0) \big)^2 \Big] \\ &= E \Big[ \big( Y - f(x_0) \big)^2 \Big] + E \Big[ \big(f(x_0) - E[\widehat f(x_0)] \big)^2 \Big] + E\Big[ \big(E[\widehat f(x_0)] - \widehat f(x_0) \big)^2 \Big] + \text{Cross Terms}\\ &= \underbrace{E\Big[ \big( Y - f(x_0)\big)^2 \Big]}_{\text{Irreducible Error}} + \underbrace{\Big(f(x_0) - E[\widehat f(x_0)]\Big)^2}_{\text{Bias}^2} + \underbrace{E\Big[ \big(\widehat f(x_0) - E[\widehat f(x_0)] \big)^2 \Big]}_{\text{Variance}} \end{aligned}\]
The cross terms vanish in expectation because the noise \(Y - f(x_0)\) is independent of the training data and \(\widehat f(x_0) - E[\widehat f(x_0)]\) has mean zero. Tuning the parameter \(k\) can be a tricky task since we cannot know the bias in practice. Cross-validation can be used to directly estimate the prediction error. A 10-fold cross-validation works as follows.
# 10 fold cross validation
nfold = 10
infold = sample(rep(1:nfold, length.out=length(x)))
mydata = data.frame(x = x, y = y)
# we may consider a set of possible k values
allk = c(1:5, seq(10, 100, 10))
# save prediction errors of each fold
errorMatrix = matrix(NA, length(allk), nfold)
# loop across all possible k and all folds
for (l in 1:nfold)
{
for (k in 1:length(allk))
{
knn.fit = kknn(y ~ x, train = mydata[infold != l, ], test = mydata[infold == l, ], k = allk[k])
errorMatrix[k, l] = mean((knn.fit$fitted.values - mydata$y[infold == l])^2)
}
}
# plot the results for each fold
plot(rep(allk, nfold), as.vector(errorMatrix), pch = 19, cex = 0.5, xlab = "k", ylab = "prediction error")
points(allk, apply(errorMatrix, 1, mean), col = "red", pch = 19, type = "l", lwd = 3)
# which k is the best? average across all folds
allk[which.min(apply(errorMatrix, 1, mean))]
## [1] 30
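As an aside, the kknn package also provides a leave-one-out cross-validation helper, train.kknn(). The sketch below follows my recollection of its interface (the kmax argument and the best.parameters component), so treat the exact names as assumptions rather than a verified reference.
# leave-one-out cross-validation over k = 1, ..., 100 with unweighted neighbors
loocv.fit = train.kknn(y ~ x, data = mydata, kmax = 100, kernel = "rectangular")
# best k according to the leave-one-out error
loocv.fit$best.parameters$k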
For hard classification, KNN is not much different from the regression version, except that majority voting among the \(k\) neighbors replaces averaging. For 1NN, the prediction regions form a Voronoi tessellation of the training points.
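To make the voting explicit, here is a minimal from-scratch sketch (the function name knn_classify is introduced here for illustration; the class package's knn() used below does this internally): find the \(k\) nearest neighbors of a target point and return their most frequent label.
# majority-vote prediction for a single target point x0 (illustration only)
knn_classify <- function(x0, x, g, k) {
  # Euclidean distances from x0 to every row of the training matrix x
  d <- sqrt(rowSums(sweep(x, 2, x0)^2))
  # class labels of the k nearest neighbors
  votes <- g[order(d)[1:k]]
  # most frequent label (ties broken by the first maximum)
  names(which.max(table(votes)))
}
# e.g., knn_classify(c(0.7, 0.7), x, g, k = 1) once x and g are generated below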
# knn for classification:
library(class)
set.seed(1)
# generate 20 random observations, with random class 1/0
x <- matrix(runif(40), 20, 2)
g <- rbinom(20, 1, 0.5)
# generate a grid for plot
xgd1 = xgd2 = seq(0, 1, 0.01)
gd = expand.grid(xgd1, xgd2)
# fit a 1-nearest neighbor model and get the fitted class
knn1 <- knn(x, gd, g, k=1)
knn1.class <- matrix(knn1, length(xgd1), length(xgd2))
# plot the data
par(mar=rep(0.2,4))
plot(x, col=ifelse(g==1, "darkorange", "deepskyblue"), pch = 19, cex = 3, axes = FALSE, xlim= c(0, 1), ylim = c(0, 1))
symbols(0.7, 0.7, circles = 2, add = TRUE)
points(0.7, 0.7, pch = 19)
box()
# Voronoi tessellation plot (1NN)
library(deldir)
par(mar=rep(0.2,4))
z <- deldir(x = data.frame(x = x[,1], y = x[,2], z=as.factor(g)), rw = c(0, 1, 0, 1))
w <- tile.list(z)
plot(w, fillcol=ifelse(g==1, "bisque", "cadetblue1"))
points(x, col=ifelse(g==1, "darkorange", "deepskyblue"), pch = 19, cex = 3)
We use the artificial mixture data from the ElemStatLearn package.
library(ElemStatLearn)
x <- mixture.example$x
y <- mixture.example$y
xnew <- mixture.example$xnew
par(mar=rep(2,4))
plot(x, col=ifelse(y==1, "darkorange", "deepskyblue"), axes = FALSE)
box()
The decision boundary is highly nonlinear. We can use the contour() function to visualize it.
# knn classification
k = 15
knn.fit <- knn(x, xnew, y, k=k)
px1 <- mixture.example$px1
px2 <- mixture.example$px2
pred <- matrix(knn.fit == "1", length(px1), length(px2))
contour(px1, px2, pred, levels=0.5, labels="",axes=FALSE)
box()
title(paste(k, "-Nearest Neighbor", sep= ""))
points(x, col=ifelse(y==1, "darkorange", "deepskyblue"))
mesh <- expand.grid(px1, px2)
points(mesh, pch=".", cex=1.2, col=ifelse(pred, "darkorange", "deepskyblue"))
As a comparison, we fit a linear regression to the 0/1 response and obtain a naive linear decision boundary by thresholding the fitted values at 0.5.
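To see where the abline() call in the code below comes from, write the fitted linear function as \(\widehat\beta_0 + \widehat\beta_1 x_1 + \widehat\beta_2 x_2\) and threshold it at 0.5. The decision boundary is then
\[\widehat\beta_0 + \widehat\beta_1 x_1 + \widehat\beta_2 x_2 = 0.5 \iff x_2 = \frac{0.5 - \widehat\beta_0}{\widehat\beta_2} - \frac{\widehat\beta_1}{\widehat\beta_2}\, x_1,\]
which gives the intercept a and slope b passed to abline().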
# using linear regression to fit the data (not logistic)
lm.fit = glm(y~x)
lm.pred = matrix(as.matrix(cbind(1, xnew)) %*% as.matrix(lm.fit$coef) > 0.5, length(px1), length(px2))
par(mar=rep(2,4))
plot(mesh, pch=".", cex=1.2, col=ifelse(lm.pred, "darkorange", "deepskyblue"), axes=FALSE)
abline(a = (0.5 - lm.fit$coef[1])/lm.fit$coef[3], b = -lm.fit$coef[2]/lm.fit$coef[3], lwd = 2)
points(x, col=ifelse(y==1, "darkorange", "deepskyblue"))
title("Linear Regression of 0/1 Response")
box()
Nearest neighbor methods usually do not perform very well on high-dimensional data (there is a theoretical explanation for this; can you think about why?). However, for the handwritten digit data below, \(K\)NN works just fine. The data likely have a lower-dimensional representation, so that distances evaluated in the high-dimensional space are essentially as informative as distances in that low-dimensional representation.
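To get a feel for the high-dimensional difficulty, the following sketch (not part of the original note; p.seq and nn.dist are names introduced here) simulates \(n = 100\) points uniformly in \([0,1]^p\) and records the average distance to each point's nearest neighbor as the dimension \(p\) grows; the nearest neighbor quickly stops being "near".
# curse of dimensionality: the average nearest-neighbor distance grows with p
set.seed(1)
n = 100
p.seq = c(1, 2, 5, 10, 50, 100)
nn.dist = rep(NA, length(p.seq))
for (j in 1:length(p.seq))
{
  X = matrix(runif(n * p.seq[j]), n, p.seq[j])
  D = as.matrix(dist(X))   # pairwise Euclidean distances
  diag(D) = Inf            # exclude each point itself
  nn.dist[j] = mean(apply(D, 1, min))
}
round(nn.dist, 2)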
# Handwritten Digit Recognition Data
library(ElemStatLearn)
# the first column is the true digit
dim(zip.train)
## [1] 7291 257
dim(zip.test)
## [1] 2007 257
# look at one sample
image(zip2image(zip.train, 1), col=gray(256:0/256), zlim=c(0,1), xlab="", ylab="", axes = FALSE)
## [1] "digit 6 taken"
We use 3NN to predict all samples in the testing data. The model is fairly accurate.
# fit 3nn model and calculate the error
knn.fit <- knn(zip.train[, 2:257], zip.test[, 2:257], zip.train[, 1], k=3)
# overall prediction error
mean(knn.fit != zip.test[,1])
## [1] 0.05480817
# the confusion matrix
table(knn.fit, zip.test[,1])
##
## knn.fit 0 1 2 3 4 5 6 7 8 9
## 0 355 0 7 1 0 2 3 0 4 2
## 1 0 257 0 0 2 0 0 1 0 0
## 2 1 0 182 2 0 2 1 1 0 0
## 3 0 0 1 154 0 5 0 1 4 0
## 4 0 4 2 0 182 0 2 4 0 3
## 5 0 0 0 7 2 146 1 0 2 0
## 6 0 2 1 0 2 0 163 0 0 0
## 7 2 1 2 1 3 0 0 138 1 4
## 8 0 0 3 0 1 1 0 1 152 0
## 9 1 0 0 1 8 4 0 1 3 168
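As a quick follow-up (not part of the original output), the per-digit error rates can be read off the confusion matrix:
# per-digit error rate: 1 - (correct predictions) / (test cases per true digit)
conf = table(knn.fit, zip.test[, 1])
round(1 - diag(conf) / colSums(conf), 3)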