##------------------------------------------------------------## ## Script for Session 2: Statistical Models in R ## ## John Fox ## ## Introduction to the R Statistical Computing Environment ## ## ICPSR Summer Program ## ## 2010 ## ##------------------------------------------------------------## # Linear models in R # multiple regression (Prestige data) library(car) names(Prestige) some(Prestige) # 10 randomly sampling observations prestige.mod <- lm(prestige ~ education + log2(income) + women, data=Prestige) summary(prestige.mod) confint(prestige.mod) # dummy regression Prestige$type # a factor class(Prestige$type) str(Prestige$type) # structure Prestige.2 <- na.omit(Prestige) # filter out missing data nrow(Prestige) nrow(Prestige.2) levels(Prestige.2$type) Prestige.2$type <- with(Prestige.2, factor(type, levels=c("bc", "wc", "prof"))) # reorder levels Prestige.2$type # generating contrasts from factors getOption("contrasts") contrasts(Prestige.2$type) model.matrix(~ type, data=Prestige.2) contrasts(Prestige.2$type) <- contr.treatment(levels(Prestige.2$type), base=2) # changing baseline category contrasts(Prestige.2$type) contrasts(Prestige.2$type) <- "contr.helmert" # Helmert contrasts contrasts(Prestige.2$type) contrasts(Prestige.2$type) <- "contr.sum" # "deviation" contrasts contrasts(Prestige.2$type) contrasts(Prestige.2$type) <- NULL # back to default Prestige.2$type.ord <- ordered(Prestige.2$type, levels=c("bc", "wc", "prof")) # ordered factor Prestige.2$type.ord round(contrasts(Prestige.2$type.ord), 3) # orthogonal polynomial contrasts prestige.mod.1 <- lm(prestige ~ log2(income) + education + type, data=Prestige.2) summary(prestige.mod.1) anova(prestige.mod.1) # sequential ("type-I") tests prestige.mod.0 <- lm(prestige ~ income + education, data=Prestige.2) # note: NA's filtered! summary(prestige.mod.0) prestige.mod.0 <- update(prestige.mod.1, . ~ . - type) # equivalent anova(prestige.mod.0, prestige.mod.1) # incremental F-test Anova(prestige.mod.1) # "type-II" tests prestige.mod.3 <- update(prestige.mod.1, . ~ . + log2(income):type + education:type) # adding interactions summary(prestige.mod.3) Anova(prestige.mod.3) lm(prestige ~ log2(income*type) + education*type, data=Prestige.2) # equivalent specifications lm(prestige ~ (log2(income) + education)*type, data=Prestige.2) # effect displays library(effects) plot(allEffects(prestige.mod.3), ask=FALSE) # Anova Models some(Moore) Moore$fcategory <- factor(Moore$fcategory, levels=c("low", "medium", "high")) Moore$partner.status <- relevel(Moore$partner.status, ref="low") xtabs(~ fcategory + partner.status, data=Moore) with(Moore, tapply(conformity, list(Authoritarianism=fcategory, "Partner's Status"=partner.status), mean)) with(Moore, tapply(conformity, list(Authoritarianism=fcategory, "Partner's Status"=partner.status), sd)) # graph of means: with(Moore, { interaction.plot(fcategory, partner.status, conformity, type="b", pch=c(1, 16), cex=2, ylim=range(conformity)) points(jitter(as.numeric(fcategory), factor=0.5), conformity, pch=ifelse(partner.status == "low", "L", "H")) identify(fcategory, conformity) }) # ANOVA tables contr <- options(contrasts=c("contr.sum", "contr.poly")) # contr.sum = deviation contrasts moore.mod <- lm(conformity ~ fcategory*partner.status, data=Moore) summary(moore.mod) Anova(moore.mod) # type II sums of squares Anova(moore.mod, type="III") # type III sums of squares options(contr) # restore defaults # more on lm args(lm) some(Davis) lm(weight ~ repwt, data=Davis, subset=sex == "F") # observation selection (women only) lm(weight ~ repwt, data=Davis, subset=1:100) lm(prestige ~ income + education, data=Duncan, subset=-c(6, 16)) lm(conformity ~ partner.status*fcategory, # specifying contrasts contrasts=list(partner.status=contr.sum, fcategory=contr.poly), data=Moore) lm(100*conformity/40 ~ partner.status*fcategory, data=Moore) # data argument; note computation of y lm(prestige~I(income + education), data=Duncan) # "protecting" expresssion on RHS of the model # Generalized linear models # binary logit model some(Mroz) mroz.mod <- glm(lfp ~ k5 + k618 + age + wc + hc + lwg + inc, data=Mroz, family=binomial) summary(mroz.mod) round(exp(cbind(Estimate=coef(mroz.mod), confint(mroz.mod))), 2) # odds ratios mroz.mod.2 <- update(mroz.mod, . ~ . - k5 - k618) anova(mroz.mod.2, mroz.mod, test="Chisq") # likelihood-ratio test Anova(mroz.mod) # analysis-of-deviance table plot(allEffects(mroz.mod), ask=FALSE) # Poisson regression some(Ornstein) nrow(Ornstein) (tab <- xtabs(~interlocks, data=Ornstein)) x <- as.numeric(names(tab)) # the names are the distinct values of interlocks plot(x, tab, type="h", xlab="Number of Interlocks", ylab="Frequency") points(x, tab, pch=16) mod.ornstein <- glm(interlocks ~ log2(assets) + nation + sector, family=poisson, data=Ornstein) summary(mod.ornstein) Anova(mod.ornstein) # quasi-Poisson model, allowing for overdispersion mod.ornstein.q <- update(mod.ornstein, family=quasipoisson) summary(mod.ornstein.q) plot(allEffects(mod.ornstein.q, default.levels=50), ask=FALSE) # repeated-measures ANOVA and MANOVA some(OBrienKaiser) ?OBrienKaiser contrasts(OBrienKaiser$treatment) contrasts(OBrienKaiser$gender) # defining the within-subjects design phase <- factor(rep(c("pretest", "posttest", "followup"), c(5, 5, 5)), levels=c("pretest", "posttest", "followup")) hour <- ordered(rep(1:5, 3)) idata <- data.frame(phase, hour) idata # fitting the multivariate linear model mod.ok <- lm(cbind(pre.1, pre.2, pre.3, pre.4, pre.5, post.1, post.2, post.3, post.4, post.5, fup.1, fup.2, fup.3, fup.4, fup.5) ~ treatment*gender, data=OBrienKaiser) mod.ok # multivariate and univariate tests (av.ok <- Anova(mod.ok, idata=idata, idesign=~phase*hour)) summary(av.ok) # graphing the means # reshape the data from "wide" to "long" OBrien.long <- reshape(OBrienKaiser, varying=c("pre.1", "pre.2", "pre.3", "pre.4", "pre.5", "post.1", "post.2", "post.3", "post.4", "post.5", "fup.1", "fup.2", "fup.3", "fup.4", "fup.5"), v.names="score", timevar="phase.hour", direction="long") OBrien.long$phase <- ordered(c("pre", "post", "fup")[1 + ((OBrien.long$phase.hour - 1) %/% 5)], levels=c("pre", "post", "fup")) OBrien.long$hour <- ordered(1 + ((OBrien.long$phase.hour - 1) %% 5)) dim(OBrien.long) head(OBrien.long, 25) # first 25 rows # compute means Means <- as.data.frame(ftable(with(OBrien.long, tapply(score, list(treatment=treatment, gender=gender, phase=phase, hour=hour), mean)))) names(Means)[5] <- "score" dim(Means) head(Means, 25) # graph of means library(lattice) xyplot(score ~ hour | phase + treatment, groups=gender, type="b", strip=function(...) strip.default(strip.names=c(TRUE, TRUE), ...), ylab="Mean Score", data=Means, auto.key=list(title="Gender", cex.title=1))