##------------------------------------------------------------## ## Script for Session 1: Getting Started With R ## ## John Fox ## ## Introduction to the R Statistical Computing Environment ## ## ICPSR Summer Program ## ## 2010 ## ##------------------------------------------------------------## # Basics # arithmetic, interacting with the interpreter # basic arithmetic operations 2 + 3 # addition 2 - 3 # subtraction 2*3 # multiplication 2/3 # division 2^3 # exponentiation # precedence of operators 4^2 - 3*2 1 - 6 + 4 2^-3 (4^2) - (3*2) # use parentheses to group, clarify 4 + 3^2 (4 + 3)^2 -2--3 -2 - -3 # use spaces to clarify # functions, arguments to functions, obtaining help log(100) log(100, base=10) log10(100) # equivalent log(100, b=10) # argument abbreviation help(log) ?log args(log) log(100, 10) # arguments by position example("log") apropos("log") help.search("log") RSiteSearch("loglinear", "functions") `+`(2, 3) # even operators are functions # vectors c(1, 2, 3, 4) # combine 1:4 # integer-sequence operator 4:1 -1:2 # note precedence seq(1, 4) seq(2, 8, by=2) # specify interval seq(0, 1, by=0.1) # non-integer sequence seq(0, 1, length=11) # specify number of elements # vectorized arithmetic c(1, 2, 3, 4)/2 c(1, 2, 3, 4)/c(4, 3, 2, 1) log(c(0.1, 1, 10, 100), 10) c(1, 2, 3, 4) + c(4, 3) # no warning c(1, 2, 3, 4) + c(4, 3, 2) # produces warning # variables x <- c(1, 2, 3, 4) # assignment x # print x/2 (y <- sqrt(x)) # assign and print (x <- rnorm(100)) summary(x) # a "generic" function # character and logical data (words <- c("To", "be", "or", "not", "to", "be")) paste(words, collapse=" ") (vals <- c(TRUE, TRUE, FALSE, TRUE)) !vals sum(vals) # coercion to numeric sum(!vals) # not operator c("A", FALSE, 3.0) # coercion to character # basic indexing x[12] # 12th element words[2] # second element vals[3] # third element x[6:15] # elements 6 through 15 x[-(11:100)] # omit elements 11 through 100 (note parentheses) #comparison and logical operators 1 == 2 # note == to test equality 1 != 2 1 <= 2 1 < 1:3 3:1 > 1:3 3:1 >= 1:3 TRUE & c(TRUE, FALSE) # and c(TRUE, FALSE, FALSE) | c(TRUE, TRUE, FALSE) # or ! c(T, F) # abbreviations of TRUE and FALSE, best avoided! (z <- x[1:10]) z < -0.5 z > 0.5 z < -0.5 | z > 0.5 # < and > of higher precedence than | abs(z) > 0.5 # absolute value z[abs(z) > 0.5] # indexing by a logical vector # user-defined functions mean(x) sum(x)/length(x) myMean <- function(x) sum(x)/length(x) myMean # can be printed like any object myMean(x) y # from sqrt(c(1, 2, 3, 4)) myMean(y) myMean(1:100) x # global x undisturbed myVar <- function(x) sum((x - myMean(x))^2)/(length(x) - 1) myVar(1:100) var(1:100) # check # cleaning up objects() remove(x, y, z, vals, words) objects() # using traceback() letters myVar(letters) traceback() # Duncan example # creating a data frame from data stored in a file Duncan <- read.table(file.choose(), header=TRUE) Duncan summary(Duncan) # generic summary function # attaching a data frame (best avoided) prestige # error! attach(Duncan) prestige # the search path search() # distributions and bivariate relationships hist(prestige) pairs(cbind(prestige, income, education), panel=function(x, y){ points(x, y) abline(lm(y ~ x), lty="dashed") lines(lowess(x, y)) }, diag.panel=function(x){ par(new=TRUE) hist(x, main="", axes=FALSE) } ) scatmat <- function(...) { # user-defined function pairs(cbind(...), panel=function(x, y){ points(x, y) abline(lm(y ~ x), lty=2) lines(lowess(x, y)) }, diag.panel=function(x){ par(new=TRUE) hist(x, main="", axes=FALSE) } ) } scatmat(prestige, income, education) plot(income, education) identify(income, education, row.names(Duncan)) row.names(Duncan)[c(6, 16, 27)] # fitting a regression (duncan.model <- lm(prestige ~ income + education)) summary(duncan.model) # again, summary generic # regression diagnostics library(car) search() hist(rstudent(duncan.model)) qqPlot(duncan.model) plot(hatvalues(duncan.model)) abline(h = 2*3/45) # twice the average hat-value identify(hatvalues(duncan.model), labels=row.names(Duncan)) plot(cooks.distance(duncan.model)) abline(h = 4/(45 - 3)) # rough cutoff for Cook's D identify(cooks.distance(duncan.model), labels=row.names(Duncan)) avPlots(duncan.model, id.method="identify", labels=rownames(Duncan)) crPlots(duncan.model, span=0.7) spreadLevelPlot(duncan.model) ncvTest(duncan.model) ncvTest(duncan.model, var.formula= ~ income + education) # refitting the model (remove <- whichNames(c("minister", "conductor"), Duncan)) summary(update(duncan.model, subset=-remove)) detach("Duncan") # Generic functions, methods, and the class-based S3 object system summary(Duncan$type) summary(Duncan$prestige) summary(Duncan) summary(duncan.model) class(Duncan$type) class(Duncan$prestige) class(Duncan) class(duncan.model) summary # the generic summary function args(summary.lm) # the lm summary method summary(duncan.model) summary.lm(duncan.model) # equivalent, but bad form methods(coef) coef(duncan.model) # invokes default method ?Mroz (mod.mroz <- glm(lfp ~ ., family=binomial, data=Mroz)) # logistic regression class(mod.mroz) args(summary.glm) summary(mod.mroz) summary.glm(mod.mroz) methods(variable.names) variable.names(duncan.model) variable.names(mod.mroz) # invokes the lm method