##--------------------------------------------------------------## ## Script for Lecture 8: ## ## R Programming 2: Beyond the Basics ## ## John Fox ## ## Introduction to the R Statistical Computing Environment ## ## ICPSR ## ## 2021 ## ##--------------------------------------------------------------## # Non-trivial programming example: # implementation of Newton-Rapson algorithm for logistic regression # first attempt, bugged! lreg <- function(X, y, max.iter=10, tol=1E-6, verbose=FALSE) { X <- cbind(1, X) b <- b.last <- rep(0, ncol(X)) it <- 1 while (it <= max.iter){ if (verbose) cat("\niteration = ", it, ": ", b) p <- 1/(1 + exp(-X %*% b)) V <- diag(p * (1 - p)) var.b <- solve(t(X) %*% V %*% X) b <- b + var.b %*% t(X) %*% (y - p) if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) break b.last <- b it <- it + 1 } if (it > max.iter) warning("maximum iterations exceeded") if (verbose) cat("\n") list(coef=as.vector(b), vcov=var.b, iterations=it) } # prepare data to test library("carData") # for data head(Mroz, 10) Mroz$lfp <- with(Mroz, ifelse(lfp == "yes", 1, 0)) Mroz$wc <- with(Mroz, ifelse(wc == "yes", 1, 0)) Mroz$hc <- with(Mroz, ifelse(hc == "yes", 1, 0)) head(Mroz, 10) xtabs(~lfp, data=Mroz) mod.mroz <- with(Mroz, # fails! lreg(cbind(k5, k618, age, wc, hc, lwg, inc), lfp)) # debugging (a digression) # call stack to error: traceback traceback() # debug(): the step-through debugger debug(lreg) mod.mroz <- with(Mroz, lreg(cbind(k5, k618, age, wc, hc, lwg, inc), lfp)) undebug(lreg) # also see ?browser, ?debugonce # logistic regression by Newton-Raphson (corrected): lreg <- function(X, y, max.iter=10, tol=1E-6, verbose=FALSE){ # X is the model matrix # y is the response vector of 0s and 1s # max.iter is the maximum number of iterations # tol is a convergence criterion # verbose: show iteration history? X <- cbind(1, X) # add constant b <- b.last <- rep(0, ncol(X)) # initialize coefficients it <- 1 # initialize iteration counter while (it <= max.iter){ if (verbose) cat("\niteration = ", it, ": ", b) p <- as.vector(1/(1 + exp(-X %*% b))) V <- diag(p * (1 - p)) var.b <- solve(t(X) %*% V %*% X) b <- b + var.b %*% t(X) %*% (y - p) # update coefficients if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) break b.last <- b # update previous coefficients it <- it + 1 # increment counter } if (verbose) cat("\n") # newline if (it > max.iter) warning("maximum iterations exceeded") list(coefficients=as.vector(b), vcov=var.b, iterations=it) } mod.mroz <- with(Mroz, lreg(cbind(k5, k618, age, wc, hc, lwg, inc), lfp, verbose=TRUE)) coef(mod.mroz) sqrt(diag(mod.mroz$vcov)) mod.mroz$iterations summary(glm(lfp ~ ., data=Mroz, family=binomial)) # check # optimizing code (another digression) # a larger (but not large) problem set.seed(12345) # for reproducibility X <- matrix(rnorm(10000*10), 10000, 10) y <- rbinom(10000, 1, prob=1/(1 + exp(-cbind(1, X) %*% rep(1, 11)))) head(X) head(y) table(y) # measuring time: system.time (also see microbenchmark package) system.time(mod.1 <- lreg(X, y)) system.time(mod.glm <- glm(y ~ X, family=binomial)) rbind(coef(mod.1), coef(mod.glm)) all.equal(coef(mod.1), coef(mod.glm)) # profiling code for time and memory usage (tmp <- tempfile()) # create temporary file Rprof(tmp, memory.profiling=TRUE) # turn on profiling mod.1 <- lreg(X, y) Rprof() # turn off profiling summaryRprof(tmp, memory="both") # summarize results unlink(tmp) # delete temporary file (note not mnemonic!) # making the program more efficient # (could use sparse matrix arithmetic via the Matrix package) diagprod <- function(d, X){ if (!is.matrix(X) || !nrow(X) == length(d)) stop("X and d not conformable") d*X } # explanation: XX <- matrix(1, 5, 3) # don't want to overwrite X XX (V <- diag(1:5)) V %*% XX (1:5) * XX # same thing! diagprod(1:5, XX) # same thing lreg2 <- function(X, y, max.iter=10, tol=1E-6) { X <- cbind(1, X) b <- b.last <- rep(0, ncol(X)) it <- 1 while (it <= max.iter){ p <- as.vector(1/(1 + exp(-X %*% b))) var.b <- solve(crossprod(X, diagprod(p*(1 - p), X))) b <- b + var.b %*% crossprod(X, y - p) if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) break b.last <- b it <- it + 1 } if (it > max.iter) warning("maximum iterations exceeded") list(coefficients=as.vector(b), vcov=var.b, iterations=it) } system.time(mod.2 <- lreg2(X, y)) all.equal(coef(mod.2), coef(mod.1)) # check tmp <- tempfile() # create temporary file Rprof(tmp, memory.profiling=TRUE, interval=0.002) # profiling with smaller sampling interval mod.2 <- lreg2(X, y) Rprof() # turn off profiling summaryRprof(tmp, memory="both") # summarize results unlink(tmp) # delete temporary file # The S3 object system # S3 classes mod.prestige <- lm(prestige ~ income + education + women, data=Prestige) attributes(mod.prestige) class(mod.prestige) v <- 1:10 v attributes(v) class(v) class(v) <- "character" attributes(v) v class(v) # S3 generic functions and methods print # the print generic stats:::print.lm # print method for "lm" objects (from the stats package namespace) print(mod.prestige) mod.prestige # print() is invoked implicitly stats:::print.lm(mod.prestige) # equivalent, but bad form print.lm(mod.prestige) # doesn't work, because method not "exported" methods("print") # print methods methods(class="lm") # methods for objects of class "lm" # S3 "inheritance" mod.mroz <- glm(lfp ~ ., family=binomial, data=Mroz) class(mod.mroz) dfbeta # generic function methods("dfbeta") # only an "lm" method head(dfbeta(mod.prestige)) # for an "lm" object head(dfbeta(mod.mroz)) # for a "glm" object, inheriting "lm" method