Abstract
This is the supplementary R file for classification models in the lecture note “Class”.
We use the South African heart disease data (SAheart) as a demonstration. The goal is to estimate the probability of chd, the indicator of coronary heart disease.
library(ElemStatLearn)
data(SAheart)
heart = SAheart
heart$famhist = as.numeric(heart$famhist)-1
n = nrow(heart)
p = ncol(heart)
heart.full = glm(chd~., data=heart, family=binomial)
round(summary(heart.full)$coef, dig=3)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.151 1.308 -4.701 0.000
## sbp 0.007 0.006 1.135 0.256
## tobacco 0.079 0.027 2.984 0.003
## ldl 0.174 0.060 2.915 0.004
## adiposity 0.019 0.029 0.635 0.526
## famhist 0.925 0.228 4.061 0.000
## typea 0.040 0.012 3.214 0.001
## obesity -0.063 0.044 -1.422 0.155
## alcohol 0.000 0.004 0.027 0.978
## age 0.045 0.012 3.728 0.000
# fitted value
yhat = (heart.full$fitted.values>0.5)
table(yhat, SAheart$chd)
##
## yhat 0 1
## FALSE 256 77
## TRUE 46 83
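From this table we can also read off the in-sample misclassification rate with the 0.5 cutoff; a one-line check:
# training misclassification rate, (77 + 46) / 462, roughly 0.266 based on the table above
mean(yhat != SAheart$chd)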
Based on what we learned in class, we can solve this problem ourselves using numerical optimization. Here we will demonstrate an approach that uses general solvers. First, write the objective function of the logistic regression, for any value of \(\boldsymbol \beta\), \(\mathbf{X}\) and \(\mathbf{y}\).
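For reference, writing \(\mathbf{x}_i\) for the \(i\)-th row of \(\mathbf{X}\) (with the intercept included), the log-likelihood we want to maximize is
\[
\ell(\boldsymbol \beta) = \sum_{i=1}^n \left[ y_i \mathbf{x}_i^\text{T} \boldsymbol \beta - \log\!\left(1 + \exp(\mathbf{x}_i^\text{T} \boldsymbol \beta)\right) \right],
\]
which is exactly what the function below computes.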
# the log-likelihood function of logistic regression
my.loglik <- function(b, x, y)
{
  bm = as.matrix(b)
  xb = x %*% bm
  # this returns the log-likelihood
  return(sum(y*xb) - sum(log(1 + exp(xb))))
}
# the gradient of the log-likelihood
my.gradient <- function(b, x, y)
{
  bm = as.matrix(b)
  expxb = exp(x %*% bm)
  return(t(x) %*% (y - expxb/(1+expxb)))
}
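The gradient coded above is the usual score function of this log-likelihood,
\[
\nabla \ell(\boldsymbol \beta) = \mathbf{X}^\text{T} (\mathbf{y} - \mathbf{p}), \qquad p_i = \frac{\exp(\mathbf{x}_i^\text{T} \boldsymbol \beta)}{1 + \exp(\mathbf{x}_i^\text{T} \boldsymbol \beta)}.
\]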
Let’s check the result of this function for some arbitrary \(\boldsymbol \beta\) value.
# prepare the data matrix; I am adding a column of 1s for the intercept
x = as.matrix(cbind("intercept" = 1, heart[, 1:9]))
y = as.matrix(heart[,10])
# check my function
b = rep(0, ncol(x))
my.loglik(b, x, y) # a scalar value
## [1] -320.234
# check the log-likelihood at the fitted glm() coefficients
my.loglik(heart.full$coefficients, x, y)
## [1] -236.07
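As a quick sanity check, this value should agree with the log-likelihood that glm() itself reports; logLik() is the standard extractor (output not shown here):
# log-likelihood of the fitted glm(), should also be about -236.07
logLik(heart.full)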
Then we optimize this objective function.
# Use a general solver to get the optimal value
# Note that we are doing maximization instead of minimization; hence, we need to specify "fnscale" = -1
optim(b, fn = my.loglik, gr = my.gradient,
method = "BFGS", x = x, y = y, control = list("fnscale" = -1))
## $par
## [1] -6.150733305 0.006504017 0.079376464 0.173923988 0.018586578 0.925372019 0.039595096 -0.062909867 0.000121675 0.045225500
##
## $value
## [1] -236.07
##
## $counts
## function gradient
## 74 16
##
## $convergence
## [1] 0
##
## $message
## NULL
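To make the comparison explicit, we can also store the solver output and look at the largest difference from the glm() coefficients. The object name fit.optim below is only for illustration and is not used elsewhere in this file.
# store the optim() result and compare with the glm() coefficients
fit.optim = optim(b, fn = my.loglik, gr = my.gradient,
                  method = "BFGS", x = x, y = y, control = list("fnscale" = -1))
max(abs(fit.optim$par - heart.full$coefficients)) # should be very close to 0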
This matches our glm() solution. Now, if we do not have a general solver, we can consider using the Newton-Raphson method. You need to write a function to calculate the Hessian matrix and carry out the optimization updates yourself. Figure this out in your homework; the solution will be released later on.
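As a reminder of the general form (the Hessian itself is part of the homework), a Newton-Raphson step updates the current estimate by
\[
\boldsymbol \beta^{(t+1)} = \boldsymbol \beta^{(t)} - \left[ \mathbf{H}\big(\boldsymbol \beta^{(t)}\big) \right]^{-1} \nabla \ell\big(\boldsymbol \beta^{(t)}\big),
\]
and the iterations stop once the change falls below the tolerance tol or maxitr is reached.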
# After defining some functions ...
# my Newton-Raphson method
# set up an initial value
# this is sometimes crucial...
b = rep(0, ncol(x))
mybeta = my.logistic(b, x, y, tol = 1e-10, maxitr = 20,
gr = my.gradient, hess = my.hessian, verbose = TRUE)
## at iteration 1, current beta is
## -4.032 0.005 0.066 0.133 0.009 0.694 0.024 -0.045 -0.001 0.027
## at iteration 2, current beta is
## -5.684 0.006 0.077 0.167 0.017 0.884 0.037 -0.061 0 0.041
## at iteration 3, current beta is
## -6.127 0.007 0.079 0.174 0.019 0.924 0.039 -0.063 0 0.045
## at iteration 4, current beta is
## -6.151 0.007 0.079 0.174 0.019 0.925 0.04 -0.063 0 0.045
## at iteration 5, current beta is
## -6.151 0.007 0.079 0.174 0.019 0.925 0.04 -0.063 0 0.045
## at iteration 6, current beta is
## -6.151 0.007 0.079 0.174 0.019 0.925 0.04 -0.063 0 0.045
## at iteration 7, current beta is
## -6.151 0.007 0.079 0.174 0.019 0.925 0.04 -0.063 0 0.045
# the parameter value
mybeta
## [,1]
## intercept -6.1507208650
## sbp 0.0065040171
## tobacco 0.0793764457
## ldl 0.1739238981
## adiposity 0.0185865682
## famhist 0.9253704194
## typea 0.0395950250
## obesity -0.0629098693
## alcohol 0.0001216624
## age 0.0452253496
# get the standard error estimates
mysd = sqrt(diag(solve(-my.hessian(mybeta, x, y))))
With this solution, I can then get the standard errors and the p-values. You can check them against the glm() function solution.
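Here the standard errors come from the inverse of the negative Hessian (the observed information) evaluated at \(\widehat{\boldsymbol \beta}\), and the z-values and p-values follow the usual Wald construction:
\[
\widehat{\text{Var}}\big(\widehat{\boldsymbol \beta}\big) = \left[ -\mathbf{H}(\widehat{\boldsymbol \beta}) \right]^{-1}, \qquad z_j = \frac{\widehat{\beta}_j}{\widehat{\text{sd}}(\widehat{\beta}_j)}, \qquad p_j = 2\big(1 - \Phi(|z_j|)\big).
\]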
# my summary matrix
round(data.frame("beta" = mybeta, "sd" = mysd, "z" = mybeta/mysd,
"pvalue" = 2*(1-pnorm(abs(mybeta/mysd)))), dig=5)
## beta sd z pvalue
## intercept -6.15072 1.30826 -4.70145 0.00000
## sbp 0.00650 0.00573 1.13500 0.25637
## tobacco 0.07938 0.02660 2.98376 0.00285
## ldl 0.17392 0.05966 2.91517 0.00355
## adiposity 0.01859 0.02929 0.63458 0.52570
## famhist 0.92537 0.22789 4.06053 0.00005
## typea 0.03960 0.01232 3.21382 0.00131
## obesity -0.06291 0.04425 -1.42176 0.15509
## alcohol 0.00012 0.00448 0.02714 0.97835
## age 0.04523 0.01213 3.72846 0.00019
# check against the glm() fit
round(summary(heart.full)$coef, dig=5)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.15072 1.30826 -4.70145 0.00000
## sbp 0.00650 0.00573 1.13500 0.25637
## tobacco 0.07938 0.02660 2.98376 0.00285
## ldl 0.17392 0.05966 2.91517 0.00355
## adiposity 0.01859 0.02929 0.63458 0.52570
## famhist 0.92537 0.22789 4.06053 0.00005
## typea 0.03960 0.01232 3.21382 0.00131
## obesity -0.06291 0.04425 -1.42176 0.15509
## alcohol 0.00012 0.00448 0.02714 0.97835
## age 0.04523 0.01213 3.72846 0.00019
Comparing densities (after removing some constants) in LDA. We create two density functions that share the same covariance matrix; the resulting decision boundary is linear.
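Dropping the normalizing constant \((2\pi)^{-p/2}|\boldsymbol \Sigma|^{-1/2}\), which is the same for both classes when they share a covariance matrix, the quantity plotted for class \(k\) is
\[
\pi_k \exp\!\left( -\tfrac{1}{2} (\mathbf{x} - \boldsymbol \mu_k)^\text{T} \boldsymbol \Sigma^{-1} (\mathbf{x} - \boldsymbol \mu_k) \right), \qquad k = 1, 2,
\]
which is what d1 and d2 compute below, with \(\pi_1 = 0.4\) and \(\pi_2 = 0.6\) as the priors.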
library(plotly)
library(mvtnorm)
# generate two densities
x1 = seq(-2.5, 2.5, 0.1)
x2 = seq(-2.5, 2.5, 0.1)
data = expand.grid(x1, x2)
Sigma = matrix(c(1, 0.5, 0.5, 1), 2, 2)
sigma_inv = solve(Sigma)
# generate two class means
mu1 = c(0.5, -1)
mu2 = c(-0.5, 0.5)
# define prior
p1 = 0.4
p2 = 0.6
# the density function after removing some constants
d1 = apply(data, 1, function(x) exp(-0.5 * t(x - mu1) %*% sigma_inv %*% (x - mu1))*p1 )
d2 = apply(data, 1, function(x) exp(-0.5 * t(x - mu2) %*% sigma_inv %*% (x - mu2))*p2 )
# plot the densities
plot_ly(x = x1, y = x2) %>%
add_surface(z = matrix(d1, length(x1), length(x2)), colorscale = list(c(0,"rgb(255,112,183)"),c(1,"rgb(128,0,64)"))) %>%
add_surface(z = matrix(d2, length(x1), length(x2))) %>%
layout(scene = list(xaxis = list(title = "X1"),
yaxis = list(title = "X2"),
zaxis = list(title = "Normal Densities (up to constants)")))
Comparing densities (after removing some constants) in QDA. We create two density functions that use different covariance matrices; the resulting decision boundary is nonlinear.
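With class-specific covariance matrices the determinant term no longer cancels, so the plotted quantity for class \(k\) keeps it:
\[
\pi_k \, |\boldsymbol \Sigma_k|^{-1/2} \exp\!\left( -\tfrac{1}{2} (\mathbf{x} - \boldsymbol \mu_k)^\text{T} \boldsymbol \Sigma_k^{-1} (\mathbf{x} - \boldsymbol \mu_k) \right), \qquad k = 1, 2.
\]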
Sigma2 = matrix(c(1, -0.5, -0.5, 1), 2, 2)
sigma2_inv = solve(Sigma2)
# the density function after removing some constants
d1 = apply(data, 1, function(x) 1/sqrt(det(Sigma))*exp(-0.5 * t(x - mu1) %*% sigma_inv %*% (x - mu1))*p1 )
d2 = apply(data, 1, function(x) 1/sqrt(det(Sigma2))*exp(-0.5 * t(x - mu2) %*% sigma2_inv %*% (x - mu2))*p2 )
# plot the densities
plot_ly(x = x1, y = x2) %>%
add_surface(z = matrix(d1, length(x1), length(x2)), colorscale = list(c(0,"rgb(107,184,214)"),c(1,"rgb(0,90,124)"))) %>%
add_surface(z = matrix(d2, length(x1), length(x2))) %>%
layout(scene = list(xaxis = list(title = "X1"),
yaxis = list(title = "X2"),
zaxis = list(title = "Normal Densities (up to constants)")))
We first sample 100 observations per digit from both the training and the testing sets of the handwritten digit (zip code) data.
library(ElemStatLearn)
# a helper function to sample rows for each digit
findRows <- function(zip, n) {
  # Find n (random) rows with zip representing 0,1,2,...,9
  res <- vector(length=10, mode="list")
  names(res) <- 0:9
  ind <- zip[,1]
  for (j in 0:9) {
    res[[j+1]] <- sample( which(ind==j), n )
  }
  return(res)
}
set.seed(1)
# find 100 samples for each digit for both the training and testing data
train.id <- findRows(zip.train, 100)
train.id = unlist(train.id)
test.id <- findRows(zip.test, 100)
test.id = unlist(test.id)
X = zip.train[train.id, -1]
Y = zip.train[train.id, 1]
dim(X)
## [1] 1000 256
Xtest = zip.test[test.id, -1]
Ytest = zip.test[test.id, 1]
dim(Xtest)
## [1] 1000 256
We can then fit LDA and QDA and predict.
# fit LDA
library(MASS)
dig.lda=lda(X,Y)
Ytest.pred=predict(dig.lda, Xtest)$class
table(Ytest, Ytest.pred)
## Ytest.pred
## Ytest 0 1 2 3 4 5 6 7 8 9
## 0 92 0 2 2 0 0 1 0 3 0
## 1 0 94 0 0 4 0 2 0 0 0
## 2 2 2 66 7 5 2 4 2 10 0
## 3 2 0 3 75 2 8 0 3 6 1
## 4 0 4 2 1 76 1 3 2 2 9
## 5 2 0 3 10 0 79 0 0 3 3
## 6 0 0 4 1 3 4 86 0 1 1
## 7 0 0 0 2 5 0 0 87 0 6
## 8 2 0 4 5 6 7 1 0 72 3
## 9 0 0 0 1 4 0 0 5 0 90
mean(Ytest != Ytest.pred)
## [1] 0.183
However, QDA does not work in this case because there are too many parameters to estimate: each class requires its own 256 × 256 covariance matrix, but we only sampled 100 observations per class, so the class-wise covariance estimates are singular.
dig.qda = qda(X, Y) # error message
## Error in qda.default(x, grouping, ...): some group is too small for 'qda'
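A quick parameter count makes the problem concrete; the snippet below is only illustrative arithmetic, not part of the original analysis.
# each class-specific covariance matrix has p*(p+1)/2 free parameters
ncol(X)                       # p = 256 pixel features
ncol(X) * (ncol(X) + 1) / 2   # 32896 parameters per class covariance
table(Y)                      # but only 100 training observations per digit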