Logistic Regression

We use the South African heart disease data (SAheart) as a demonstration. The goal is to estimate the probability of chd, the indicator of coronary heart disease.

    library(ElemStatLearn)
    data(SAheart)
    
    heart = SAheart
    heart$famhist = as.numeric(heart$famhist)-1
    n = nrow(heart)
    p = ncol(heart)
    
    heart.full = glm(chd~., data=heart, family=binomial)
    round(summary(heart.full)$coef, dig=3)
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   -6.151      1.308  -4.701    0.000
## sbp            0.007      0.006   1.135    0.256
## tobacco        0.079      0.027   2.984    0.003
## ldl            0.174      0.060   2.915    0.004
## adiposity      0.019      0.029   0.635    0.526
## famhist        0.925      0.228   4.061    0.000
## typea          0.040      0.012   3.214    0.001
## obesity       -0.063      0.044  -1.422    0.155
## alcohol        0.000      0.004   0.027    0.978
## age            0.045      0.012   3.728    0.000
    # fitted probabilities, converted to class labels with a 0.5 cutoff
    yhat = (heart.full$fitted.values > 0.5)
    table(yhat, SAheart$chd)
##        
## yhat      0   1
##   FALSE 256  77
##   TRUE   46  83
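
Based on this table, the in-sample misclassification rate of the 0.5 cutoff is (46 + 77)/462 ≈ 0.266. As a quick check, we can compute it directly from the fitted values:

    # training misclassification rate with the 0.5 cutoff
    mean(yhat != heart$chd)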

Based on what we learned in class, we can solve this problem ourselves using numerical optimization. Here we demonstrate an approach that uses a general-purpose solver. First, we write the objective function of logistic regression, the log-likelihood, as a function of \(\boldsymbol \beta\), \(\mathbf{X}\), and \(\mathbf{y}\).
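
As a reminder, the log-likelihood we will maximize is

\[\ell(\boldsymbol \beta) = \sum_{i=1}^n \left\{ y_i \, \mathbf{x}_i^\top \boldsymbol \beta - \log\left(1 + e^{\mathbf{x}_i^\top \boldsymbol \beta}\right) \right\},\]

which is exactly what the function below computes.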

    # the log-likelihood function of logistic regression 
    my.loglik <- function(b, x, y)
    {
        bm = as.matrix(b)
        xb =  x %*% bm
        # this returns the log-likelihood (we will maximize it)
        return(sum(y*xb) - sum(log(1 + exp(xb))))
    }

    # the gradient of the log-likelihood
    my.gradient <- function(b, x, y)
    {
        bm = as.matrix(b) 
        expxb =  exp(x %*% bm)
        return(t(x) %*% (y - expxb/(1+expxb)))
    }
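
The gradient has the familiar form

\[\nabla \ell(\boldsymbol \beta) = \mathbf{X}^\top (\mathbf{y} - \mathbf{p}), \qquad p_i = \frac{e^{\mathbf{x}_i^\top \boldsymbol \beta}}{1 + e^{\mathbf{x}_i^\top \boldsymbol \beta}},\]

which is what my.gradient() returns.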

Let’s check the result of this function for some arbitrary \(\boldsymbol \beta\) value.

    # prepare the data matrix; add a column of 1s for the intercept
    
    x = as.matrix(cbind("intercept" = 1, heart[, 1:9]))
    y = as.matrix(heart[,10])
    
    # check my function
    b = rep(0, ncol(x))
    my.loglik(b, x, y) # returns a scalar
## [1] -320.234
    # check the log-likelihood at the glm() estimates
    my.loglik(heart.full$coefficients, x, y)
## [1] -236.07

Then we optimize this objective function using optim().

    # Use a general solver to get the optimal value
    # Note that we are doing maximization instead of minimization, so we need to specify "fnscale" = -1
    optim(b, fn = my.loglik, gr = my.gradient, 
          method = "BFGS", x = x, y = y, control = list("fnscale" = -1))
## $par
##  [1] -6.150733305  0.006504017  0.079376464  0.173923988  0.018586578  0.925372019  0.039595096 -0.062909867  0.000121675  0.045225500
## 
## $value
## [1] -236.07
## 
## $counts
## function gradient 
##       74       16 
## 
## $convergence
## [1] 0
## 
## $message
## NULL

This matches our glm() solution. Now, if we do not have a general solver, we can implement the Newton-Raphson method ourselves. You need to write a function that calculates the Hessian matrix and then iterate the Newton update until convergence. Figure this out in your homework; the solution will be released later.
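
For reference, each Newton-Raphson iteration updates the current estimate using the gradient and the Hessian (the matrix of second derivatives) of the log-likelihood:

\[\boldsymbol \beta^{(t+1)} = \boldsymbol \beta^{(t)} - \left[\mathbf{H}\big(\boldsymbol \beta^{(t)}\big)\right]^{-1} \nabla \ell\big(\boldsymbol \beta^{(t)}\big).\]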

    # After defining some functions ... 
    # my Newton-Raphson method
    # set up an initial value
    # this is sometimes crucial...
    
    b = rep(0, ncol(x))
    
    mybeta = my.logistic(b, x, y, tol = 1e-10, maxitr = 20, 
                         gr = my.gradient, hess = my.hessian, verbose = TRUE)
## at iteration 1, current beta is 
## -4.032 0.005 0.066 0.133 0.009 0.694 0.024 -0.045 -0.001 0.027
## at iteration 2, current beta is 
## -5.684 0.006 0.077 0.167 0.017 0.884 0.037 -0.061 0 0.041
## at iteration 3, current beta is 
## -6.127 0.007 0.079 0.174 0.019 0.924 0.039 -0.063 0 0.045
## at iteration 4, current beta is 
## -6.151 0.007 0.079 0.174 0.019 0.925 0.04 -0.063 0 0.045
## at iteration 5, current beta is 
## -6.151 0.007 0.079 0.174 0.019 0.925 0.04 -0.063 0 0.045
## at iteration 6, current beta is 
## -6.151 0.007 0.079 0.174 0.019 0.925 0.04 -0.063 0 0.045
## at iteration 7, current beta is 
## -6.151 0.007 0.079 0.174 0.019 0.925 0.04 -0.063 0 0.045
    # the parameter value
    mybeta
##                    [,1]
## intercept -6.1507208650
## sbp        0.0065040171
## tobacco    0.0793764457
## ldl        0.1739238981
## adiposity  0.0185865682
## famhist    0.9253704194
## typea      0.0395950250
## obesity   -0.0629098693
## alcohol    0.0001216624
## age        0.0452253496
    # get the standard error estimates (square roots of the diagonal of the inverse negative Hessian)
    mysd = sqrt(diag(solve(-my.hessian(mybeta, x, y))))

With this solution, I can then get the standard errors and the p-values, and check them against the glm() solution.
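
The standard errors come from the inverse of the observed information matrix, i.e., the negative Hessian evaluated at \(\widehat{\boldsymbol \beta}\), and the Wald test uses a normal approximation:

\[\widehat{\text{se}}(\widehat{\beta}_j) = \sqrt{\Big[\big(-\mathbf{H}(\widehat{\boldsymbol \beta})\big)^{-1}\Big]_{jj}}, \qquad z_j = \frac{\widehat{\beta}_j}{\widehat{\text{se}}(\widehat{\beta}_j)}, \qquad \text{p-value}_j = 2\big\{1 - \Phi(|z_j|)\big\}.\]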

    # my summary matrix
    round(data.frame("beta" = mybeta, "sd" = mysd, "z" = mybeta/mysd, 
          "pvalue" = 2*(1-pnorm(abs(mybeta/mysd)))), dig=5)
##               beta      sd        z  pvalue
## intercept -6.15072 1.30826 -4.70145 0.00000
## sbp        0.00650 0.00573  1.13500 0.25637
## tobacco    0.07938 0.02660  2.98376 0.00285
## ldl        0.17392 0.05966  2.91517 0.00355
## adiposity  0.01859 0.02929  0.63458 0.52570
## famhist    0.92537 0.22789  4.06053 0.00005
## typea      0.03960 0.01232  3.21382 0.00131
## obesity   -0.06291 0.04425 -1.42176 0.15509
## alcohol    0.00012 0.00448  0.02714 0.97835
## age        0.04523 0.01213  3.72846 0.00019
    # compare with the glm() fit 
    round(summary(heart.full)$coef, dig=5)
##             Estimate Std. Error  z value Pr(>|z|)
## (Intercept) -6.15072    1.30826 -4.70145  0.00000
## sbp          0.00650    0.00573  1.13500  0.25637
## tobacco      0.07938    0.02660  2.98376  0.00285
## ldl          0.17392    0.05966  2.91517  0.00355
## adiposity    0.01859    0.02929  0.63458  0.52570
## famhist      0.92537    0.22789  4.06053  0.00005
## typea        0.03960    0.01232  3.21382  0.00131
## obesity     -0.06291    0.04425 -1.42176  0.15509
## alcohol      0.00012    0.00448  0.02714  0.97835
## age          0.04523    0.01213  3.72846  0.00019

Discriminant Analysis

We compare the two class densities (after removing some constants) in LDA. The two density functions below share the same covariance matrix, so the decision boundary is linear.
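
Concretely, with common covariance matrix \(\boldsymbol \Sigma\), class means \(\boldsymbol \mu_k\), and prior probabilities \(\pi_k\), we compare

\[\pi_k \exp\left\{-\frac{1}{2}(\mathbf{x} - \boldsymbol \mu_k)^\top \boldsymbol \Sigma^{-1} (\mathbf{x} - \boldsymbol \mu_k)\right\}, \qquad k = 1, 2,\]

where the normalizing constant shared by both classes has been dropped. In the log of the ratio of these two quantities, the quadratic term \(\mathbf{x}^\top \boldsymbol \Sigma^{-1} \mathbf{x}\) cancels, so the decision boundary is linear in \(\mathbf{x}\).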

    # plotly is used for the interactive 3D surface plots
    # install.packages("plotly") # run once if the package is not installed
    library(plotly)
    library(mvtnorm)

    # generate two densities 
    x1 = seq(-2.5, 2.5, 0.1)
    x2 = seq(-2.5, 2.5, 0.1)
    data = expand.grid(x1, x2)
    
    Sigma = matrix(c(1, 0.5, 0.5, 1), 2, 2)
    sigma_inv = solve(Sigma)
    
    # generate two class means
    mu1 = c(0.5, -1)
    mu2 = c(-0.5, 0.5)
    
    # define prior 
    
    p1 = 0.4 
    p2 = 0.6
    
    # the density function after removing some constants
    d1 = apply(data, 1, function(x) exp(-0.5 * t(x - mu1) %*% sigma_inv %*% (x - mu1))*p1 )
    d2 = apply(data, 1, function(x) exp(-0.5 * t(x - mu2) %*% sigma_inv %*% (x - mu2))*p2 )
    
    # plot the densities
    plot_ly(x = x1, y = x2) %>% 
            add_surface(z = matrix(d1, length(x1), length(x2)), colorscale = list(c(0,"rgb(255,112,183)"),c(1,"rgb(128,0,64)"))) %>% 
            add_surface(z = matrix(d2, length(x1), length(x2))) %>% 
            layout(scene = list(xaxis = list(title = "X1"), 
                                yaxis = list(title = "X2"),
                                zaxis = list(title = "Log Normal Densities")))

We now compare the densities (after removing some constants) in QDA. The two density functions below use different covariance matrices, so the decision boundary is nonlinear (quadratic).
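
With class-specific covariance matrices \(\boldsymbol \Sigma_k\), the determinant term no longer cancels, and we compare

\[\pi_k \, |\boldsymbol \Sigma_k|^{-1/2} \exp\left\{-\frac{1}{2}(\mathbf{x} - \boldsymbol \mu_k)^\top \boldsymbol \Sigma_k^{-1} (\mathbf{x} - \boldsymbol \mu_k)\right\}, \qquad k = 1, 2.\]

The quadratic terms now involve different matrices \(\boldsymbol \Sigma_k^{-1}\) and do not cancel in the log-ratio, so the boundary is a quadratic function of \(\mathbf{x}\).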

    Sigma2 = matrix(c(1, -0.5, -0.5, 1), 2, 2)
    sigma2_inv = solve(Sigma2)
    
    # the density function after removing some constants
    d1 = apply(data, 1, function(x) 1/sqrt(det(Sigma))*exp(-0.5 * t(x - mu1) %*% sigma_inv %*% (x - mu1))*p1 )
    d2 = apply(data, 1, function(x) 1/sqrt(det(Sigma2))*exp(-0.5 * t(x - mu2) %*% sigma2_inv %*% (x - mu2))*p2 )
    
    # plot the densities
    plot_ly(x = x1, y = x2) %>% 
            add_surface(z = matrix(d1, length(x1), length(x2)), colorscale = list(c(0,"rgb(107,184,214)"),c(1,"rgb(0,90,124)"))) %>% 
            add_surface(z = matrix(d2, length(x1), length(x2))) %>% 
            layout(scene = list(xaxis = list(title = "X1"), 
                                yaxis = list(title = "X2"),
                                zaxis = list(title = "Log Normal Densities")))  

Example: the Handwritten Digit Data

We first sample 100 observations per digit from both the training and testing sets, giving 1000 observations in each.

    library(ElemStatLearn)
    # a helper function that samples n rows for each digit
    findRows <- function(zip, n) {
        # find n (random) rows with zip representing 0, 1, 2, ..., 9
        res <- vector(length = 10, mode = "list")
        names(res) <- 0:9
        ind <- zip[, 1]
        for (j in 0:9) {
            res[[j + 1]] <- sample(which(ind == j), n)
        }
        return(res) 
    }
    
    set.seed(1)
    
    # find 100 samples for each digit for both the training and testing data
    train.id <- findRows(zip.train, 100)
    train.id = unlist(train.id)
    
    test.id <- findRows(zip.test, 100)
    test.id = unlist(test.id)
    
    X = zip.train[train.id, -1]
    Y = zip.train[train.id, 1]
    dim(X)
## [1] 1000  256
    Xtest = zip.test[test.id, -1]
    Ytest = zip.test[test.id, 1]
    dim(Xtest)
## [1] 1000  256

We can then fit LDA and QDA, and predict on the testing data.

    # fit LDA
    library(MASS)
    
    dig.lda=lda(X,Y)
    Ytest.pred=predict(dig.lda, Xtest)$class
    table(Ytest, Ytest.pred)
##      Ytest.pred
## Ytest  0  1  2  3  4  5  6  7  8  9
##     0 92  0  2  2  0  0  1  0  3  0
##     1  0 94  0  0  4  0  2  0  0  0
##     2  2  2 66  7  5  2  4  2 10  0
##     3  2  0  3 75  2  8  0  3  6  1
##     4  0  4  2  1 76  1  3  2  2  9
##     5  2  0  3 10  0 79  0  0  3  3
##     6  0  0  4  1  3  4 86  0  1  1
##     7  0  0  0  2  5  0  0 87  0  6
##     8  2  0  4  5  6  7  1  0 72  3
##     9  0  0  0  1  4  0  0  5  0 90
    mean(Ytest != Ytest.pred)
## [1] 0.183

However, QDA does not work in this case because there are too many parameters: each class needs its own \(256 \times 256\) covariance matrix, which cannot be reliably estimated from only 100 observations per class.

    dig.qda = qda(X, Y) # error message
## Error in qda.default(x, grouping, ...): some group is too small for 'qda'
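
To see the scale of the problem, here is a quick count, using the X defined above, of the free parameters in a single class-specific covariance matrix:

    # each class covariance matrix has p(p+1)/2 free parameters
    p = ncol(X)      # 256 pixels
    p * (p + 1) / 2  # 32896 parameters per class, versus only 100 observations per digit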