##------------------------------------------------------------##
##        Script for Sessions 4-6: R Programming               ##
##                        John Fox                             ##
##  Introduction to the R Statistical Computing Environment    ##
##                   ICPSR Summer Program                      ##
##                          2010                               ##
##------------------------------------------------------------##

# Preliminary programming example

library(car)  # for data

mod.duncan <- lm(prestige ~ income + education, data=Duncan)
cook <- sqrt(cooks.distance(mod.duncan))
plot(hatvalues(mod.duncan), rstudent(mod.duncan), cex=(10/max(cook))*cook)
abline(h=c(-2, 0, 2), lty=2)
abline(v=c(2, 3)*length(coef(mod.duncan))/length(rstudent(mod.duncan)), lty=1)
identify(hatvalues(mod.duncan), rstudent(mod.duncan), rownames(Duncan))

inflPlot <- function(model, scale=10, col=c(1, 2), identify=TRUE,
        labels=names(rstud), ...) {
    # Plot hatvalues, Studentized residuals, and Cook's distances
    #   for a linear or generalized linear model
    # Arguments:
    #   model: an lm or glm model object
    #   scale: a scaling factor for the circles representing Cook's D
    #   col: colors for non-noteworthy and noteworthy points
    #   identify: label noteworthy points (TRUE or FALSE)
    #   labels: for identified points
    hatval <- hatvalues(model)
    rstud <- rstudent(model)
    cook <- sqrt(cooks.distance(model))
    scale <- scale/max(cook, na.rm=TRUE)
    p <- length(coef(model))
    n <- length(rstud)
    cutoff <- sqrt(4/(n - p))
    plot(hatval, rstud, xlab="Hat-Values", ylab="Studentized Residuals",
        cex=scale*cook, col=ifelse(cook > cutoff, col[2], col[1]), ...)
    abline(v=c(2, 3)*p/n, lty="dashed")
    bonf <- qt(.025/n, df=n - p - 1, lower.tail=FALSE)
    abline(h=c(-bonf, -2, 0, 2, bonf), lty="dashed")
    if (identify){
        noteworthy <- cook > cutoff | abs(rstud) > bonf | hatval > 2*p/n
        pos <- ifelse(hatval - mean(range(hatval, na.rm=TRUE)) <= 0, 4, 2)
        text(hatval[noteworthy], rstud[noteworthy], labels[noteworthy],
            pos=pos[noteworthy])
        return(which(noteworthy))
    }
    else return(invisible(NULL))
}

inflPlot(mod.duncan)

inflPlot(mod.duncan, ylim=c(-3, 4), las=1, col=gray(c(0.5, 0)))

# Basic matrix operations

(A <- matrix(c(1, 2, -4, 3, 5, 0), nrow=2, ncol=3))
(B <- matrix(1:6, 2, 3))
(C <- matrix(c(2, -2, 0, 1, -1, 1, 4, 4, -4), 3, 3, byrow=TRUE))

A + B  # addition
A - B  # subtraction
2*A    # product of a scalar and a matrix
-A     # negation

A %*% C  # matrix product
A %*% B  # not conformable for matrix multiplication

# vectors are treated as row or column vectors as needed
a <- rep(1, 3)
b <- c(1, 5, 3)
C %*% a
a %*% C
a %*% b  # inner product
outer(a, b)  # outer product

t(B)  # matrix transpose

solve(C)  # matrix inverse
solve(C) %*% C  # check
library(MASS)  # for fractions()
fractions(solve(C))

solve(C, b)  # solve the matrix equation Cx = b
solve(C) %*% b  # equivalent

# illustration: naive computation of LS regression
X <- cbind(1, as.matrix(Prestige[, 1:3]))  # attach the constant
y <- Prestige[ , 4]
head(X)  # first 6 rows
head(y)
head(Prestige[ , 4, drop=FALSE])  # as a column

solve(t(X) %*% X) %*% t(X) %*% y  # LS coefficients
lm(prestige ~ education + income + women, data=Prestige)  # check?

# eigenvalues and eigenvectors
R <- with(Prestige, cor(cbind(education, income, women)))
R  # correlation matrix
eigen(R)  # principal-components analysis
det(R)  # determinant

diag(R)  # extract diagonal
diag(R) <- NA  # set diagonal
R
diag(1:3)  # make a diagonal matrix
diag(3)  # order-3 identity matrix

# Control structures

# conditionals

# if ... else

abs1 <- function(x) if (x < 0) -x else x
abs1(-5)
abs1(5)
abs1(-3:3)  # wrong! the first element, -3, controls the result
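# Added note: in recent versions of R a condition of length greater than one in
# if() is an error rather than a warning, so abs1(-3:3) may now fail outright.
# A guarded version (illustrative only, not part of the original script):
abs1a <- function(x) {
    if (length(x) != 1) stop("x must be a single number")  # guard added for illustration
    if (x < 0) -x else x
}
abs1a(-5)
# abs1a(-3:3)  # now an informative error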
# ifelse (vectorized)

abs2 <- function(x) ifelse(x < 0, -x, x)
abs2(-3:3)

# cascading conditionals

sign1 <- function(x){
    if (x < 0) -1
    else if (x > 0) 1
    else 0
}
sign1(-5)

sign2 <- function(x){
    ifelse(x < 0, -1,
        ifelse(x > 0, 1, 0))
}
sign2(c(-5, 0, 10))

# switch

convert2meters <- function(x, units=c("inches", "feet", "yards", "miles")) {
    units <- match.arg(units)
    switch(units,
        inches = x * 0.0254,
        feet = x * 0.3048,
        yards = x * 0.9144,
        miles = x * 1609.344)
}
convert2meters(10, "inches")
convert2meters(10, "in")  # abbreviation
convert2meters(3, "feet")
convert2meters(100, "yards")
convert2meters(5, "miles")
convert2meters(7, "fathoms")  # error!

# iteration

# for loops

prod(1:5)  # ok for small numbers
gamma(5 + 1)  # better
factorial(5)  # best

factorial  # the factorial function itself

fact1 <- function(x){
    if (x <= 1) return(1)
    f <- 1  # initialize
    for (i in 1:x) f <- f * i  # accumulate product
    f  # return result
}
fact1(5)
fact1(-5)  # wrong!

fact2 <- function(x) {  # with input checks
    if ((!is.numeric(x)) || (x != floor(x)) || (x < 0) || (length(x) > 1))
        stop("x must be a non-negative integer")
    f <- 1  # initialize
    for (i in 1:x) f <- f * i  # accumulate product
    f  # return result
}
fact2(5)
fact2(-5)  # error!

# while loops

fact3 <- function(x){
    if ((!is.numeric(x)) || (x != floor(x)) || (x < 0) || (length(x) > 1))
        stop("x must be a non-negative integer")
    i <- f <- 1  # initialize
    while (i <= x) {
        f <- f * i  # accumulate product
        i <- i + 1  # increment counter
    }
    f  # return result
}
fact3(5)

# repeat loops

fact4 <- function(x) {
    if ((!is.numeric(x)) || (x != floor(x)) || (x < 0) || (length(x) > 1))
        stop("x must be a non-negative integer")
    i <- f <- 1  # initialize
    repeat {
        f <- f * i  # accumulate product
        i <- i + 1  # increment counter
        if (i > x) break  # termination test
    }
    f  # return result
}
fact4(5)

# recursion

fact5 <- function(x){
    if (x <= 1) 1  # termination condition
    else x * fact5(x - 1)  # recursive call
}
fact5(5)

fact6 <- fact5
remove(fact5)
fact6(5)  # error!
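# Added note: fact6(5) fails because the body of fact6 still calls fact5 by
# name, and fact5 no longer exists; Recall(), used next, refers instead to the
# currently executing function.
body(fact6)  # the recursive call still mentions fact5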
fact7 <- function(x) {
    if (x <= 1) 1
    else x * Recall(x - 1)  # recursive call
}
fact7(5)

fact8 <- fact7
remove(fact7)
fact8(5)  # still works with fact7 removed

# Avoiding loops

# the "apply" family

# apply (arrays)

head(DavisThin, 10)  # first 10 rows
dim(DavisThin)

DavisThin$thin.drive <- apply(DavisThin, 1, sum)  # row sums
head(DavisThin$thin.drive, 10)

apply(DavisThin, 2, mean)  # column means
colMeans(DavisThin)  # more efficient (rowMeans, colSums, rowSums too)

DavisThin$thin.drive <- NULL  # remove thin.drive
DavisThin[1, 2] <- DavisThin[2, 4] <- DavisThin[10, 3] <- NA  # some missing data
head(DavisThin, 10)

head(apply(DavisThin, 1, sum), 10)
head(apply(DavisThin, 1, function(x) 7*mean(x, na.rm=TRUE)), 10)

DavisThin[1, 2:5] <- NA  # create some more missing data
head(DavisThin, 10)

make.scale <- function(items) {
    if (sum(is.na(items)) >= 4) NA
    else 7*mean(items, na.rm=TRUE)
}
head(apply(DavisThin, 1, make.scale), 10)

# lapply, sapply (lists)

thin.list <- as.list(DavisThin)
str(thin.list)  # structure of the result
lapply(thin.list, mean, na.rm=TRUE)  # returns a list (na.rm passed to mean)
lapply(thin.list, function(x) mean(x, na.rm=TRUE))  # equivalent
sapply(thin.list, mean, na.rm=TRUE)  # simplifies the result

# mapply (multiple arguments)

integrate(dnorm, lower=-1.96, upper=1.96)  # integrate the normal density
(low <- c(-Inf, -3:3))
(high <- c(-3:3, Inf))
(P <- mapply(function(lo, hi) integrate(dnorm, lo, hi)$value,
    lo=low, hi=high))
sum(P)
pnorm(high) - pnorm(low)  # check

# Vectorize

Integrate <- Vectorize(
    function(fn, lower, upper) integrate(fn, lower, upper)$value,
    vectorize.args=c("lower", "upper")
)
Integrate(dnorm, lower=low, upper=high)

# tapply (tables of statistics)

some(Moore)  # randomly sample 10 observations
with(Moore, tapply(conformity,
    list(Status=partner.status, Authoritarianism=fcategory), mean))

Moore$fcategory <- factor(Moore$fcategory,
    levels=c("low", "medium", "high"))  # reorder levels
Moore$partner.status <- factor(Moore$partner.status, levels=c("low", "high"))
with(Moore, tapply(conformity,
    list(Status=partner.status, Authoritarianism=fcategory), mean))

# to loop or not to loop?

time1 <- function(n){  # inefficient!
    a <- NULL
    for(i in 1:n) a <- c(a, i^2)
    a
}
system.time(time1(30000))

time2 <- function(n){  # better
    a <- numeric(n)  # initialize to final length
    for(i in 1:n) a[i] <- i^2
    a
}
system.time(time2(30000))

time3 <- function(n){  # best
    a <- (1:n)^2  # vectorize
    a
}
system.time(time3(30000))

time4 <- function(n){  # (slightly) inefficient!
    a <- numeric(n)
    for(i in 1:n) a[i] <- 2 * pi * sin(i)
    a
}
system.time(time4(100000))

time5 <- function(n){  # better
    a <- numeric(n)
    for(i in 1:n) a[i] <- sin(i)
    2 * pi * a  # move the multiplication outside the loop
}
system.time(time5(100000))

time6 <- function(n){  # best
    2 * pi * sin(1:n)  # fully vectorized
}
system.time(time6(100000))

# don't vectorize for its own sake

matrices <- vector(mode="list", length=10000)  # allocate
for (i in seq_along(matrices)) matrices[[i]] <- matrix(rnorm(10000), 100, 100)

# simple
system.time({
    S1 <- matrix(0, 100, 100)  # initialize
    for (M in matrices) S1 <- S1 + M  # accumulate
})

# clever
system.time(S2 <- apply(array(unlist(matrices),
    dim=c(100, 100, 10000)), 1:2, sum))

# a smaller problem

system.time({
    S1 <- matrix(0, 100, 100)  # initialize
    for (M in matrices[1:1000]) S1 <- S1 + M  # accumulate
})

# clever
system.time(S2 <- apply(array(unlist(matrices[1:1000]),
    dim=c(100, 100, 1000)), 1:2, sum))

all(S1 == S2)  # are the answers EXACTLY equal (bad idea!)?
max(abs(S1 - S2))  # how different are they?
all.equal(S1, S2)  # equal within tolerance
?all.equal
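# Added aside (sketch): Reduce() expresses the same accumulation without an
# explicit loop, but it does essentially the same element-by-element work.
system.time(S.red <- Reduce("+", matrices[1:1000]))
all.equal(S1, S.red)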
# Non-trivial programming examples

# estimation of logistic regression by Newton-Raphson

lreg1 <- function(X, y, max.iter=10, tol=1E-6, verbose=FALSE){
    # X is the model matrix
    # y is the response vector of 0s and 1s
    # max.iter is the maximum number of iterations
    # tol is a convergence criterion
    # verbose: show iteration history?
    X <- cbind(1, X)  # add constant
    b <- b.last <- rep(0, ncol(X))  # initialize coefficients
    it <- 1  # initialize iteration counter
    while (it <= max.iter){
        if (verbose) cat("\niteration = ", it, ": ", b)
        p <- as.vector(1/(1 + exp(-X %*% b)))
        V <- diag(p * (1 - p))
        var.b <- solve(t(X) %*% V %*% X)
        b <- b + var.b %*% t(X) %*% (y - p)  # update coefficients
        if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) break
        b.last <- b  # update previous coefficients
        it <- it + 1  # increment counter
    }
    if (verbose) cat("\n")  # newline
    if (it > max.iter) warning("maximum iterations exceeded")
    list(coefficients=as.vector(b), var=var.b, iterations=it)
}

head(Mroz)  # first 6 observations
Mroz$lfp <- with(Mroz, ifelse(lfp == "yes", 1, 0))
Mroz$wc <- with(Mroz, ifelse(wc == "yes", 1, 0))
Mroz$hc <- with(Mroz, ifelse(hc == "yes", 1, 0))

mod.mroz.1 <- with(Mroz, lreg1(cbind(k5, k618, age, wc, hc, lwg, inc), lfp))
mod.mroz.1$coefficients
sqrt(diag(mod.mroz.1$var))  # std. errors

mod.mroz.glm <- glm(lfp ~ ., family=binomial, data=Mroz)  # check
coef(mod.mroz.glm) - mod.mroz.1$coefficients
sqrt(diag(vcov(mod.mroz.glm))) - sqrt(diag(mod.mroz.1$var))

# estimation of logistic regression by optimization

lreg2 <- function(X, y, method="BFGS"){
    X <- cbind(1, X)
    negLogL <- function(b, X, y){
        p <- as.vector(1/(1 + exp(-X %*% b)))
        - sum(y*log(p) + (1 - y)*log(1 - p))
    }
    grad <- function(b, X, y){
        p <- as.vector(1/(1 + exp(-X %*% b)))
        - apply(((y - p)*X), 2, sum)
    }
    result <- optim(rep(0, ncol(X)), negLogL, gr=grad, hessian=TRUE,
        method=method, X=X, y=y)
    list(coefficients=result$par, var=solve(result$hessian),
        deviance=2*result$value, converged=result$convergence == 0)
}

mod.mroz.2 <- with(Mroz, lreg2(cbind(k5, k618, age, wc, hc, lwg, inc), lfp))
mod.mroz.2$coefficients
sqrt(diag(mod.mroz.2$var))
mod.mroz.2$converged

coef(mod.mroz.glm) - mod.mroz.2$coefficients  # check
sqrt(diag(vcov(mod.mroz.glm))) - sqrt(diag(mod.mroz.2$var))
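# Added check (sketch): the deviance returned by lreg2 should essentially match
# the residual deviance that glm() reports for the same model.
mod.mroz.2$deviance - deviance(mod.mroz.glm)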
# translating numbers into words

makeDigits <- function(x) strsplit(as.character(x), "")[[1]]
makeDigits(123456)
makeDigits(-123456)  # whoops!
makeDigits(1000000000)  # whoops!

options("scipen")
?options
options(scipen=100)
makeDigits(1000000000)

makeNumber <- function(x) as.numeric(paste(x, collapse=""))
makeNumber(c("1", "2", "3", "4", "5"))

ones <- c("zero", "one", "two", "three", "four",
    "five", "six", "seven", "eight", "nine")
teens <- c("ten", "eleven", "twelve", "thirteen", "fourteen",
    "fifteen", "sixteen", "seventeen", "eighteen", "nineteen")
names(ones) <- names(teens) <- 0:9
tens <- c("twenty", "thirty", "forty", "fifty",
    "sixty", "seventy", "eighty", "ninety")
names(tens) <- 2:9
suffixes <- c("thousand,", "million,", "billion,", "trillion,")

ones["5"]
teens["3"]
tens["7"]

trim <- function(text){
    gsub("(^ *)|(( *|-|, zero|-zero)$)", "", text)
}
trim("twenty-one")
trim("twenty-zero")

number2words <- function(x){
    negative <- x < 0
    x <- abs(x)
    digits <- makeDigits(x)
    nDigits <- length(digits)
    result <- if (nDigits == 1) as.vector(ones[digits])
        else if (nDigits == 2)
            if (x <= 19) as.vector(teens[digits[2]])
            else trim(paste(tens[digits[1]], "-", ones[digits[2]], sep=""))
        else if (nDigits == 3) {
            tail <- makeNumber(digits[2:3])
            if (tail == 0) paste(ones[digits[1]], "hundred")
            else trim(paste(ones[digits[1]], "hundred", number2words(tail)))
        }
        else {
            nSuffix <- ((nDigits + 2) %/% 3) - 1
            if (nSuffix > length(suffixes) || nDigits > 15)
                stop(paste(x, "is too large!"))
            pick <- 1:(nDigits - 3*nSuffix)
            trim(paste(number2words(makeNumber(digits[pick])),
                suffixes[nSuffix], number2words(makeNumber(digits[-pick]))))
        }
    if (negative) paste("minus", result) else result
}

number2words(123456789)
number2words(-123456789)
number2words(-123456000)

debug(number2words)
number2words(123456789)

remove(makeDigits, makeNumber, number2words, trim, ones, teens, tens, suffixes)
options(scipen=0)  # back to the default

numbers2words <- function(x, billion=c("US", "UK"),
        and=if (billion == "US") "" else "and") {
    billion <- match.arg(billion)
    trim <- function(text){
        gsub("(^ *)|(( *|-|, zero|-zero)$)", "", text)
    }
    makeNumber <- function(x) as.numeric(paste(x, collapse=""))
    makeDigits <- function(x) strsplit(as.character(x), "")[[1]]
    helper <- function(x) {
        negative <- x < 0
        x <- abs(x)
        digits <- makeDigits(x)
        nDigits <- length(digits)
        result <- if (nDigits == 1) as.vector(ones[digits])
            else if (nDigits == 2)
                if (x <= 19) as.vector(teens[digits[2]])
                else trim(paste(tens[digits[1]], "-", ones[digits[2]], sep=""))
            else if (nDigits == 3) {
                tail <- makeNumber(digits[2:3])
                if (tail == 0) paste(ones[digits[1]], "hundred")
                else trim(paste(ones[digits[1]],
                    trim(paste("hundred", and)), helper(tail)))
            }
            else {
                nSuffix <- ((nDigits + 2) %/% 3) - 1
                if (nSuffix > length(suffixes) || nDigits > 15)
                    stop(paste(x, "is too large!"))
                pick <- 1:(nDigits - 3*nSuffix)
                trim(paste(helper(makeNumber(digits[pick])),
                    suffixes[nSuffix], helper(makeNumber(digits[-pick]))))
            }
        if (billion == "UK"){
            words <- strsplit(result, " ")[[1]]
            if (length(grep("million,", words)) > 1)
                result <- sub(" million, ", ", ", result)
        }
        if (negative) paste("minus", result) else result
    }
    opts <- options(scipen=100)
    on.exit(options(opts))
    ones <- c("zero", "one", "two", "three", "four",
        "five", "six", "seven", "eight", "nine")
    teens <- c("ten", "eleven", "twelve", "thirteen", "fourteen",
        "fifteen", "sixteen", "seventeen", "eighteen", "nineteen")
    names(ones) <- names(teens) <- 0:9
    tens <- c("twenty", "thirty", "forty", "fifty",
        "sixty", "seventy", "eighty", "ninety")
    names(tens) <- 2:9
    suffixes <- if (billion == "US")
            c("thousand,", "million,", "billion,", "trillion,")
        else c("thousand,", "million,", "thousand million,", "billion,")
    x <- round(x)
    if (length(x) > 1) sapply(x, helper) else helper(x)
}
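# Added note: on.exit(options(opts)) above restores the caller's options even if
# numbers2words() exits early with an error -- a useful idiom whenever a function
# changes global state temporarily. A minimal illustration (hypothetical helper):
with.scipen <- function() {
    opts <- options(scipen=100)  # change an option...
    on.exit(options(opts))       # ...and schedule its restoration on exit
    getOption("scipen")
}
with.scipen()         # 100 inside the call
getOption("scipen")   # unchanged afterwards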
numbers2words(c(1234567890123, -0123, 1000))
numbers2words(c(1234567890123, -0123, 1000), billion="UK")

# Simulation example:
#   efficiency of LS and robust regression when errors are normal and non-normal

X <- as.matrix(Prestige[, c("education", "income", "women")])
mod <- lm(prestige ~ education + income + women, data=Prestige)
beta <- coef(mod)  # taken as "true" parameters
sigma <- summary(mod)$sigma  # taken as true scale of errors
Ey <- fitted(mod)  # taken as expectation of response

# initialize:
robust.t <- robust.normal <- LS.t <- LS.normal <- matrix(0, 1000, 4)

(seed <- sample(1e6, 1))
set.seed(seed)  # for reproducibility

system.time(
    for (i in 1:1000){
        normal.errors <- sigma*rnorm(102)  # sample errors
        t.errors <- sigma*rt(102, 2)
        y.normal <- Ey + normal.errors  # construct observed response
        y.t <- Ey + t.errors
        LS.normal[i,] <- lsfit(X, y.normal)$coef - beta  # LS sampling error
        LS.t[i,] <- lsfit(X, y.t)$coef - beta
        robust.normal[i,] <- coef(rlm(y.normal ~ X, method="MM", maxit=100)) - beta
        robust.t[i,] <- coef(rlm(y.t ~ X, method="MM", maxit=100)) - beta
    }
)

summary(lm(prestige ~ education + income + women, data=Prestige))

# estimated relative bias (should be close to 0):
colMeans(LS.normal)/beta
colMeans(robust.normal)/beta
colMeans(LS.t)/beta
colMeans(robust.t)/beta

# estimated RMSE:
sqrt(colMeans(LS.normal^2))/abs(beta)
sqrt(colMeans(robust.normal^2))/abs(beta)
sqrt(colMeans(LS.t^2))/abs(beta)
sqrt(colMeans(robust.t^2))/abs(beta)

# Debugging

library(car)  # for data
Mroz$lfp <- with(Mroz, ifelse(lfp == "yes", 1, 0))
Mroz$wc <- with(Mroz, ifelse(wc == "yes", 1, 0))
Mroz$hc <- with(Mroz, ifelse(hc == "yes", 1, 0))

# bugged!
lreg3 <- function(X, y, max.iter=10, tol=1E-6, verbose=FALSE) {
    X <- cbind(1, X)
    b <- b.last <- rep(0, ncol(X))
    it <- 1
    while (it <= max.iter){
        if (verbose) cat("\niteration = ", it, ": ", b)
        p <- 1/(1 + exp(-X %*% b))
        V <- diag(p * (1 - p))
        var.b <- solve(t(X) %*% V %*% X)
        b <- b + var.b %*% t(X) %*% (y - p)
        if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) break
        b.last <- b
        it <- it + 1
    }
    if (it > max.iter) warning("maximum iterations exceeded")
    if (verbose) cat("\n")
    list(coefficients=as.vector(b), var=var.b, iterations=it)
}

mod.mroz.3 <- with(Mroz, lreg3(cbind(k5, k618, age, wc, hc, lwg, inc), lfp))

# call stack to error: traceback

traceback()

# interrupt execution, examine environment of function: browser

?browser

# still bugged!
lreg3 <- function(X, y, max.iter=10, tol=1E-6, verbose=FALSE) {
    X <- cbind(1, X)
    b <- b.last <- rep(0, ncol(X))
    it <- 1
    while (it <= max.iter){
        if (verbose) cat("\niteration = ", it, ": ", b)
        p <- 1/(1 + exp(-X %*% b))
        V <- diag(p * (1 - p))
        browser()
        var.b <- solve(t(X) %*% V %*% X)
        b <- b + var.b %*% t(X) %*% (y - p)
        if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) break
        b.last <- b
        it <- it + 1
    }
    if (it > max.iter) warning("maximum iterations exceeded")
    if (verbose) cat("\n")
    list(coefficients=as.vector(b), var=var.b, iterations=it)
}

mod.mroz.3 <- with(Mroz, lreg3(cbind(k5, k618, age, wc, hc, lwg, inc), lfp))

# imagined sequence of commands in browser mode:
# t(X) %*% V %*% X
# str(X)
# head(X)
# str(V)
# V
# str(p)
# head(p)
# p <- as.vector(p)
# str(diag(p * (1 - p)))
# V <- diag(p * (1 - p))
# V[1:10, 1:10]
#
# Q

# step-through debugger: debug

?debug
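# Added note: debugonce() flags a function for a single debugging run, so no
# undebug() call is needed afterwards; see ?debugonce.
# debugonce(lreg3)  # would step through only the next call to lreg3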
# bugged! (the original bugged function again)
lreg3 <- function(X, y, max.iter=10, tol=1E-6, verbose=FALSE) {
    X <- cbind(1, X)
    b <- b.last <- rep(0, ncol(X))
    it <- 1
    while (it <= max.iter){
        if (verbose) cat("\niteration = ", it, ": ", b)
        p <- 1/(1 + exp(-X %*% b))
        V <- diag(p * (1 - p))
        var.b <- solve(t(X) %*% V %*% X)
        b <- b + var.b %*% t(X) %*% (y - p)
        if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) break
        b.last <- b
        it <- it + 1
    }
    if (it > max.iter) warning("maximum iterations exceeded")
    if (verbose) cat("\n")
    list(coefficients=as.vector(b), var=var.b, iterations=it)
}

debug(lreg3)  # set debug flag
mod.mroz.3 <- with(Mroz, lreg3(cbind(k5, k618, age, wc, hc, lwg, inc), lfp))

# postmortem debugger: debugger

undebug(lreg3)
?debugger
options(error=dump.frames)
mod.mroz.3 <- with(Mroz, lreg3(cbind(k5, k618, age, wc, hc, lwg, inc), lfp))
debugger()

# Measuring time and memory allocation

options(error=NULL)  # restore default

lreg1  # by Newton-Raphson
lreg2  # by optimization

# a larger (but not large!) problem
set.seed(12345)  # for reproducibility
X <- matrix(rnorm(5000*10), 5000, 10)
y <- rbinom(5000, 1, prob=1/(1 + exp(-cbind(1, X) %*% rep(1, 11))))

# measuring time: system.time

system.time(mod.1 <- lreg1(X, y))
mod.1$coef
system.time(mod.2 <- lreg2(X, y))
mod.2$coef
system.time(mod.glm <- glm(y ~ X, family=binomial))
coef(mod.glm)

# profiling time and memory use: Rprof

(tmp <- tempfile())  # create temporary file
Rprof(tmp, memory.profiling=TRUE)  # turn on profiling
mod.1 <- lreg1(X, y)
Rprof()  # turn off profiling
summaryRprof(tmp, memory="both")  # summarize results
unlink(tmp)  # delete temporary file

tmp <- tempfile()  # create temporary file
Rprof(tmp, memory.profiling=TRUE)  # turn on profiling
mod.2 <- lreg2(X, y)
Rprof()  # turn off profiling
summaryRprof(tmp, memory="both")  # summarize results
unlink(tmp)  # delete temporary file

# new version of the Newton-Raphson function
lreg4 <- function(X, y, max.iter=10, tol=1E-6) {
    X <- cbind(1, X)
    b <- b.last <- rep(0, ncol(X))
    it <- 1
    while (it <= max.iter){
        p <- as.vector(1/(1 + exp(-X %*% b)))
        var.b <- solve(crossprod(X, p * (1 - p) * X))
        b <- b + var.b %*% crossprod(X, y - p)
        if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) break
        b.last <- b
        it <- it + 1
    }
    if (it > max.iter) warning("maximum iterations exceeded")
    list(coefficients=as.vector(b), var=var.b, iterations=it)
}

# explanation:
(XX <- matrix(1, 5, 3))  # don't want to overwrite X
(V <- diag(1:5))
V %*% XX
1:5 * XX  # same thing!
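# Added check (sketch): the crossprod() form used in lreg4 gives the same matrix
# as the explicit diagonal-matrix computation, without ever forming V.
all.equal(t(XX) %*% V %*% XX, crossprod(XX, 1:5 * XX))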
system.time(mod.4 <- lreg4(X, y))
mod.4$coef  # check

tmp <- tempfile()  # create temporary file
Rprof(tmp, memory.profiling=TRUE, interval=0.002)  # profiling with a smaller sampling interval
mod.4 <- lreg4(X, y)
Rprof()  # turn off profiling
summaryRprof(tmp, memory="both")  # summarize results
unlink(tmp)  # delete temporary file

# a much larger problem
set.seed(12345)  # for reproducibility
X <- matrix(rnorm(100000*10), 100000, 10)
y <- rbinom(100000, 1, prob=1/(1 + exp(-cbind(1, X) %*% rep(1, 11))))

system.time(mod.1 <- lreg1(X, y))
system.time(mod.4 <- lreg4(X, y))
system.time(mod.2 <- lreg2(X, y))
system.time(mod.glm <- glm(y ~ X, family=binomial))
max(abs(mod.4$coef - coef(mod.glm)))

# The S3 object system

# S3 classes

library(car)  # for data
mod.prestige <- lm(prestige ~ income + education + women, data=Prestige)
attributes(mod.prestige)
class(mod.prestige)

v <- 1:10
v
attributes(v)
class(v)

class(v) <- "character"
attributes(v)
v
class(v)

# S3 generic functions and methods

print  # the print generic
print.lm  # print method for "lm" objects

mod.prestige
print(mod.prestige)  # equivalent
print.lm(mod.prestige)  # equivalent, but bad form

methods("print")  # print methods
methods(class="lm")  # methods for objects of class "lm"

# S3 "inheritance"

mod.mroz <- glm(lfp ~ ., family=binomial, data=Mroz)
class(mod.mroz)

# Example: a logistic-regression function

# generic function
lreg <- function(X, ...){
    UseMethod("lreg")
}

# default method
lreg.default <- function(X, y, predictors=colnames(X), max.iter=10,
        tol=1E-6, constant=TRUE, ...) {
    if (!is.numeric(X) || !is.matrix(X))
        stop("X must be a numeric matrix")
    if (!is.numeric(y) || !all(y == 0 | y == 1))
        stop("y must contain only 0s and 1s")
    if (nrow(X) != length(y))
        stop("X and y contain different numbers of observations")
    if (constant) {
        X <- cbind(1, X)
        colnames(X)[1] <- "Constant"
    }
    b <- b.last <- rep(0, ncol(X))
    it <- 1
    while (it <= max.iter){
        p <- as.vector(1/(1 + exp(-X %*% b)))
        var.b <- solve(crossprod(X, p * (1 - p) * X))
        b <- b + var.b %*% crossprod(X, y - p)
        if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) break
        b.last <- b
        it <- it + 1
    }
    if (it > max.iter) warning("maximum iterations exceeded")
    dev <- -2*sum(y*log(p) + (1 - y)*log(1 - p))
    result <- list(coefficients=as.vector(b), var=var.b, deviance=dev,
        converged=it <= max.iter, predictors=predictors, iterations=it)
    class(result) <- "lreg"
    result
}

mod.mroz.3 <- with(Mroz, lreg(cbind(k5, k618, age, wc, hc, lwg, inc), lfp))
class(mod.mroz.3)
mod.mroz.3  # whoops!

# print and summary methods for class "lreg"

print.lreg <- function(x, ...) {
    coef <- x$coefficients
    names(coef) <- x$predictors
    print(coef)
    if (!x$converged) cat("\n *** lreg did not converge ***\n")
    invisible(x)
}

summary.lreg <- function(object, ...) {
    b <- object$coefficients
    se <- sqrt(diag(object$var))
    z <- b/se
    table <- cbind(b, se, z, 2*(1 - pnorm(abs(z))))
    colnames(table) <- c("Estimate", "Std.Err", "z value", "Pr(>|z|)")
    rownames(table) <- object$predictors
    result <- list(coef=table, deviance=object$deviance,
        converged=object$converged)
    class(result) <- "summary.lreg"
    result
}

print.summary.lreg <- function(x, ...) {
    printCoefmat(x$coef, signif.stars=FALSE)
    cat("\nDeviance =", x$deviance, "\n")
    if (!x$converged) cat("\n Note: *** lreg did not converge ***\n")
    invisible(x)
}

mod.mroz.3
summary(mod.mroz.3)
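# Added sketch: two more small S3 methods for class "lreg", so that coef() and
# vcov() work as they do for standard model objects. These are illustrative
# additions, not part of the original script.
coef.lreg <- function(object, ...) {
    b <- object$coefficients
    names(b) <- object$predictors
    b
}
vcov.lreg <- function(object, ...) object$var

coef(mod.mroz.3)
sqrt(diag(vcov(mod.mroz.3)))  # std. errors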
# Writing a statistical modeling function

# a formula method for lreg
lreg.formula <- function(formula, data, subset, na.action,
        model=TRUE, contrasts=NULL, ...) {
    call <- match.call()  # returns the function call
    mf <- match.call(expand.dots=FALSE)  # the function call w/o ...
    args <- match(c("formula", "data", "subset", "na.action"),
        names(mf), 0)  # which arguments are present?
    mf <- mf[c(1, args)]
    mf$drop.unused.levels <- TRUE
    mf[[1]] <- as.name("model.frame")
    mf <- eval.parent(mf)  # create a model frame
    terms <- attr(mf, "terms")  # terms object for the model
    y <- model.response(mf)  # response variable
    values <- sort(unique(y))
    if (length(values) != 2) stop("the response variable is not binary")
    y <- as.numeric(y == values[2])
    X <- model.matrix(terms, mf, contrasts)  # model matrix
    mod <- lreg.default(X, y, predictors=colnames(X), constant=FALSE, ...)
    mod$na.action <- attr(mf, "na.action")
    mod$contrasts <- attr(X, "contrasts")
    mod$xlevels <- .getXlevels(terms, mf)
    mod$call <- call
    mod$terms <- terms
    if (model) mod$model <- mf
    mod
}

remove(Mroz)  # delete the modified Mroz data set
mod.lreg <- lreg(lfp ~ ., data=Mroz)  # Mroz will be found in car
mod.lreg
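# Added check (sketch): the formula-interface fit should agree closely with glm().
summary(mod.lreg)
mod.check <- glm(lfp ~ ., family=binomial, data=Mroz)
max(abs(coef(mod.check) - mod.lreg$coefficients))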