##-----------------------------------------------------------------## ## Script for Day 1b: ## ## Linear and Generalized Linear Models in R ## ## John Fox ## ## The R Statistical Computing Environment: The Basics and Beyond ## ## ICPSR Summer Program, Berkeley ## ## 2016 ## ##-----------------------------------------------------------------## # Linear models in R # multiple regression (Prestige data) library(car) names(Prestige) # from car package some(Prestige) # 10 randomly sampling observations View(Prestige) ?Prestige #help on data set prestige.mod <- lm(prestige ~ education + log2(income) + women, data=Prestige) # data argument avoids attach summary(prestige.mod) confint(prestige.mod) Anova(prestige.mod) # dummy regression Prestige$type # a factor class(Prestige$type) str(Prestige$type) # structure Prestige.2 <- na.omit(Prestige) # filter out missing data nrow(Prestige) nrow(Prestige.2) levels(Prestige.2$type) Prestige.2$type <- with(Prestige.2, factor(type, levels=c("bc", "wc", "prof"))) # reorder levels # with() avoids attach() Prestige.2$type # generating contrasts from factors getOption("contrasts") contrasts(Prestige.2$type) model.matrix(~ type, data=Prestige.2) contrasts(Prestige.2$type) <- contr.treatment(levels(Prestige.2$type), base=2) # changing baseline category contrasts(Prestige.2$type) contrasts(Prestige.2$type) <- "contr.helmert" # Helmert contrasts contrasts(Prestige.2$type) contrasts(Prestige.2$type) <- "contr.sum" # "deviation" contrasts contrasts(Prestige.2$type) contrasts(Prestige.2$type) <- NULL # back to default Prestige.2$type.ord <- ordered(Prestige.2$type, levels=c("bc", "wc", "prof")) # ordered factor Prestige.2$type.ord round(contrasts(Prestige.2$type.ord), 3) # orthogonal polynomial contrasts crossprod(contrasts(Prestige.2$type.ord)) # floating-point arithmetic # is not exact! round(crossprod(contrasts(Prestige.2$type.ord)), 10) zapsmall(crossprod(contrasts(Prestige.2$type.ord)), 10) prestige.mod.1 <- lm(prestige ~ log2(income) + education + type, data=Prestige.2) summary(prestige.mod.1) anova(prestige.mod.1) # sequential ("type-I") tests prestige.mod.0 <- lm(prestige ~ log2(income) + education, data=Prestige.2) # note: NA's filtered! summary(prestige.mod.0) prestige.mod.0 <- update(prestige.mod.1, . ~ . - type) # equivalent anova(prestige.mod.0, prestige.mod.1) # incremental F-test Anova(prestige.mod.1) # "type-II" tests (partial, obeying marginality) prestige.mod.3 <- update(prestige.mod.1, . ~ . + log2(income):type + education:type) # adding interactions summary(prestige.mod.3) Anova(prestige.mod.3) # equivalent specifications: lm(prestige ~ log2(income)*type + education*type, data=Prestige.2) lm(prestige ~ (log2(income) + education)*type, data=Prestige.2) # effect displays library(effects) windows() # for demo only plot(allEffects(prestige.mod.3)) plot(allEffects(prestige.mod.3, partial.residuals=TRUE), span=3/4) # Anova Models (time permitting) some(Moore) ?Moore Moore$fcategory <- factor(Moore$fcategory, levels=c("low", "medium", "high")) Moore$partner.status <- relevel(Moore$partner.status, ref="low") xtabs(~ fcategory + partner.status, data=Moore) with(Moore, tapply(conformity, list(Authoritarianism=fcategory, "Partner's Status"=partner.status), mean)) with(Moore, tapply(conformity, list(Authoritarianism=fcategory, "Partner's Status"=partner.status), sd)) # graph of means: with(Moore, { interaction.plot(fcategory, partner.status, conformity, type="b", pch=c(1, 16), cex=2, ylim=range(conformity)) points(jitter(as.numeric(fcategory), factor=0.5), conformity, pch=ifelse(partner.status == "low", "L", "H")) identify(fcategory, conformity) }) # ANOVA tables contr <- options(contrasts=c("contr.sum", "contr.poly")) # contr.sum = deviation contrasts contrasts(Moore$fcategory) moore.mod <- lm(conformity ~ fcategory*partner.status, data=Moore) summary(moore.mod) Anova(moore.mod) # type II sums of squares Anova(moore.mod, type="III") # type III sums of squares # (partial, violating marginality) contr # previously saved options(contr) # restore defaults # more on lm args(lm) some(Davis) ?Davis lm(weight ~ repwt, data=Davis, subset=sex == "F") # observation selection # (women only) lm(weight ~ repwt, data=Davis, subset=1:100) # first 100 cases lm(prestige ~ income + education, data=Duncan, subset=-c(6, 16)) # omit cases 6 & 16 lm(conformity ~ partner.status*fcategory, # specifying contrasts contrasts=list(partner.status=contr.sum, fcategory=contr.poly), data=Moore) lm(100*conformity/40 ~ partner.status*fcategory, data=Moore) # data argument; # note computation of y lm(prestige~I(income + education), data=Duncan) # "protecting" expresssion # on RHS of the model # Generalized linear models # binary logit model some(Mroz) ?Mroz mroz.mod <- glm(lfp ~ ., # regress lfp on all others data=Mroz, family=binomial) summary(mroz.mod) round(exp(cbind(Estimate=coef(mroz.mod), confint(mroz.mod))), 2) # odds ratios mroz.mod.2 <- update(mroz.mod, . ~ . - k5 - k618) anova(mroz.mod.2, mroz.mod, test="Chisq") # likelihood-ratio test Anova(mroz.mod) # analysis-of-deviance table plot(allEffects(mroz.mod)) plot(Effect("lwg", mroz.mod, partial.residuals=TRUE)) # Poisson regression (time permitting) some(Ornstein) nrow(Ornstein) ?Ornstein (tab <- xtabs(~interlocks, data=Ornstein)) x <- as.numeric(names(tab)) # the names are the distinct values of interlocks plot(x, as.vector(tab), type="h", xlab="Number of Interlocks", ylab="Frequency") points(x, tab, pch=16) mod.ornstein <- glm(interlocks ~ log2(assets) + nation + sector, family=poisson, data=Ornstein) summary(mod.ornstein) Anova(mod.ornstein) # quasi-Poisson model, allowing for overdispersion mod.ornstein.q <- update(mod.ornstein, family=quasipoisson) summary(mod.ornstein.q) plot(allEffects(mod.ornstein.q, default.levels=50)) plot(Effect("assets", mod.ornstein.q, partial.residuals=TRUE))