##--------------------------------------------------------------## ## Script for Lecture 7: ## ## R Programming, Part I: Basics ## ## John Fox ## ## Introduction to the R Statistical Computing Environment ## ## ICPSR ## ## 2021 ## ##--------------------------------------------------------------## # Preliminary programming example: # creating lagged variables in a regression (x <- 1:10) c(NA, x) c(NA, x)[-11] # simple version: Lag <- function(x, lag=1){ length.x <- length(x) c(rep(NA, lag), x[-((length.x - lag + 1):length.x)]) } Lag(1:10) Lag(1:10, 2) Lag(letters) set.seed(123) # for reproducibility (fac <- factor(sample(c("a", "b", "c"), 20, replace=TRUE))) Lag(fac, 2) # incorrect! Lag(1:10, -1) # fails! # enhanced version (handles factors, "leads", does error checking): Lag <- function(x, lag=1){ # lag a variable, padding with NAs # x: variable to lag # lag: number of observations to lag (negative = lead) if (!(is.vector(x) || is.factor(x))) stop ("x must be a vector or a factor") if (!(length(lag) == 1 && floor(lag) == lag)) stop ("lag must be an integer") length.x <- length(x) if (abs(lag) > length.x) stop("|lag| > length of x") result <- if (lag == 0) x else if (lag > 0) c(rep(NA, lag), x[1:(length.x - lag)]) else { lag <- abs(lag) c(x[(lag + 1):length.x], rep(NA, lag)) } if (is.factor(x)) result <- factor(levels(x)[result]) result } Lag(1:10) Lag(1:10, 2) Lag(letters) Lag(fac, 2) # works Lag(1:10, -1) # "lead" Lag(1:10, 1.5) # informative error # an application library("carData") # for data ?Bfox Bfox$debt.1 <- Lag(Bfox$debt) Bfox$debt.2 <- Lag(Bfox$debt, 2) Bfox$debt.3 <- Lag(Bfox$debt, 3) Bfox lm(partic ~ debt, data=Bfox) lm(partic ~ Lag(debt), data=Bfox) lm(partic ~ Lag(debt, 2), data=Bfox) lm(partic ~ debt + Lag(debt) + Lag(debt, 2), data=Bfox) # Basic matrix operations (A <- matrix(c(1, 2, -4, 3, 5, 0), nrow=2, ncol=3)) (B <- matrix(1:6, 2, 3)) (C <- matrix(c(2, -2, 0, 1, -1, 1, 4 ,4, -4), 3, 3, byrow=TRUE)) A + B # addition A - B # subtraction 2*A # product of a scalar and a matrix -A # negation A %*% C # matrix product A %*% B # error: not conformable for matrix multiplication! # vectors are treated as row or column vectors as needed a <- rep(1, 3) b <- c(1, 5, 3) C %*% a a %*% C a %*% b # inner product outer(a, b) # outer product a %o% b # equivalent outer(a, b, `==`) # generalized outer product t(B) # matrix transpose solve(C) # matrix inverse solve(C) %*% C # check library(MASS) # for fractions() fractions(solve(C)) fractions(solve(C, b)) # solve matrix equation Cx = b fractions(solve(C) %*% b) # equivalent # illustration: naive computation of LS regression X <- cbind(1, as.matrix(Prestige[,1:3])) # attach the constant y <- Prestige[, 4] head(X) # first 6 rows head(y) head(Prestige[, 4, drop=FALSE]) # as a one-column matrix solve(t(X) %*% X) %*% t(X) %*% y # LS coefficients lm(prestige ~ education + income + women, data=Prestige) # check? as.vector(solve(t(X) %*% X) %*% t(X) %*% y) # coerce to vector, loses names # eigenvalues and eigenvectors R <- with(Prestige, cor(cbind(education, income, women))) R # correlation matrix eigen(R) # principal-components analysis (eigenvalues and vectors) det(R) # determinant diag(R) # extract diagonal diag(R) <- NA # set diagonal R mean(abs(R), na.rm=TRUE) # mean of absolute off-diagonal elements diag(1:3) # make diagonal matrix diag(3) # order-3 identity matrix # some other matrix-algebra functions: qr(), svd(), # chol() (Choleski factorization) # Control structures # conditionals # if ... else abs1 <- function(x) if (x < 0) -x else x abs1(-5) abs1(5) abs1(-3:3) # wrong! the first element, -3, controls the result (-3:3) < 0 # ifelse (vectorized) abs2 <- function(x) ifelse(x < 0, -x, x) abs2(-3:3) # cascading conditionals sign1 <- function(x){ if (x < 0) -1 else if (x > 0) 1 else 0 } sign1(-5) sign2 <- function(x){ ifelse (x < 0, -1, ifelse(x > 0, 1, 0)) } sign2(c(-5, 0, 10)) # switch (on your own time) convert2meters <- function(x, units=c("inches", "feet", "yards", "miles")) { units <- match.arg(units) switch(units, inches = x * 0.0254, feet = x * 0.3048, yards = x * 0.9144, miles = x * 1609.344) } convert2meters(10, "inches") convert2meters(3, "feet") convert2meters(100, "yards") convert2meters(5, "miles") convert2meters(7, "fathoms") # iteration # for loops prod(1:5) # factorial, ok for small numbers gamma(5 + 1) # better factorial(5) # best factorial fact1 <- function(x){ if (x <= 1) return(1) f <- 1 # initialize for (i in 1:x){ f <- f * i # accumulate product } f # return result } fact1(5) fact1(-5) # wrong! fact2 <- function(x) { # with input checks if ((!is.numeric(x)) || (x != floor(x)) || (x < 0) || (length(x) > 1)) stop("x must be a non-negative integer") f <- 1 # initialize for (i in 1:x) f <- f * i # accumulate product f # return result } fact2(5) fact2(-5) fact2(1.5) # while loops fact3 <- function(x){ if ((!is.numeric(x)) || (x != floor(x)) || (x < 0) || (length(x) > 1)) stop("x must be a non-negative integer") i <- f <- 1 # initialize while (i <= x) { f <- f * i # accumulate product i <- i + 1 # increment counter } f # return result } fact3(5) # repeat loops (on your own time) fact4 <- function(x) { if ((!is.numeric(x)) || (x != floor(x)) || (x < 0) || (length(x) > 1)) stop("x must be a non-negative integer") i <- 1 # initialize f <- 1 repeat { f <- f * i # accumulate product i <- i + 1 # increment counter if (i > x) break # termination test } f # return result } fact4(5) # recursion fact5 <- function(x) { if (x <= 1) { # termination condition 1 } else { # recursive call x * fact5(x - 1) } } fact5(5) trace(fact5) fact5(5) untrace(fact5) # Avoiding loops (as time permits) # the "apply" family # apply (arrays) head(DavisThin, 10) # first 10 rows dim(DavisThin) ?DavisThin DavisThin$thin.drive <- apply(DavisThin, 1, sum) # row sums head(DavisThin$thin.drive, 10) apply(DavisThin, 2, mean) # column means colMeans(DavisThin) # more efficient (rowMeans, colSums, rowSums too) # lapply, sapply (lists) thin.list <- as.list(DavisThin) str(thin.list) # structure of the result lapply(thin.list, mean) # returns list (na.rm passed to mean) sapply(thin.list, mean) # simplifies result sapply(thin.list, function(x) sum(x)/length(x)) # equivalent, with anonymous function # tapply (tables of statistics) View(Moore) # randomly sample 10 observations with(Moore, tapply(conformity, list(Status=partner.status, Authoritarianism=fcategory), mean)) Moore$fcategory <- factor(Moore$fcategory, # reorder levels levels=c("low", "medium", "high")) Moore$partner.status <- factor(Moore$partner.status, levels=c("low", "high")) with(Moore, tapply(conformity, list(Status=partner.status, Authoritarianism=fcategory), mean)) library("car") # for Tapply() Tapply(conformity ~ partner.status*fcategory, mean, data=Moore) # others (e.g., mapply): apropos("apply") # to loop or not to loop? (and how to loop when one loops) time1 <- function(n){ # inefficient! a <- NULL for(i in 1:n) a <- c(a, i^2) a } system.time(time1(1e5)) time2 <- function(n){ # better a <- numeric(n) # initialize to final length for(i in 1:n) a[i] <- i^2 a } system.time(time2(1e6)) # 10 times larger time3 <- function(n){ # best a <- (1:n)^2 # vectorize a } system.time(time3(1e6)) # the rest as time permits time4 <- function(n){ # (slightly) inefficient! a <- numeric(n) for(i in 1:n) a[i] <- 2 * pi * sin(i) a } system.time(time4(1e6)) time5 <- function(n){ # slightly better a <- numeric(n) for(i in 1:n) a[i] <- sin(i) 2 * pi * a # move outside loop } system.time(time5(1e6)) time6 <- function(n){ # best 2 * pi * sin(1:n) # fully vectorized } system.time(time6(1e6)) # don't vectorize for its own sake matrices <- vector(mode="list", length=10000) # allocate for (i in seq_along(matrices)) # takes awhile matrices[[i]] <- matrix(rnorm(10000), 100, 100) # simple, using loop system.time({ S1 <- matrix(0, 100, 100) # initialize for (M in matrices) S1 <- S1 + M # accumulate }) # "clever," avoiding loop ;) system.time(S2 <- apply(array(unlist(matrices), dim = c(100, 100, 10000)), 1:2, sum)) all(S1 == S2) # are answers EXACTLY equal (bad idea!)? max(abs(S1 - S2)) # how different? all.equal(S1, S2) # equal within tolerance ?all.equal