##------------------------------------------------------------##
##       Script for Session 2: Statistical Models in R        ##
##                       John Fox                             ##
##  Introduction to the R Statistical Computing Environment   ##
##                ICPSR Summer Program                        ##
##                         2010                               ##
##------------------------------------------------------------##

# Linear models in R
    
    # multiple regression (Prestige data)
    
library(car)
names(Prestige)
some(Prestige)  # 10 randomly sampling observations

prestige.mod <- lm(prestige ~ education + log2(income) + women,
    data=Prestige)
summary(prestige.mod)

confint(prestige.mod)

    # dummy regression
    
Prestige$type # a factor
class(Prestige$type) 
str(Prestige$type) # structure

Prestige.2 <- na.omit(Prestige) # filter out missing data
nrow(Prestige)
nrow(Prestige.2)
levels(Prestige.2$type)
Prestige.2$type <- with(Prestige.2, factor(type, levels=c("bc", "wc", "prof"))) # reorder levels
Prestige.2$type

        # generating contrasts from factors
        
getOption("contrasts")
contrasts(Prestige.2$type)
model.matrix(~ type, data=Prestige.2)

contrasts(Prestige.2$type) <- contr.treatment(levels(Prestige.2$type), base=2)  # changing baseline category
contrasts(Prestige.2$type)

contrasts(Prestige.2$type) <- "contr.helmert"  # Helmert contrasts
contrasts(Prestige.2$type)

contrasts(Prestige.2$type) <- "contr.sum"  # "deviation" contrasts
contrasts(Prestige.2$type)

contrasts(Prestige.2$type) <- NULL  # back to default

Prestige.2$type.ord <- ordered(Prestige.2$type, levels=c("bc", "wc", "prof")) # ordered factor
Prestige.2$type.ord
round(contrasts(Prestige.2$type.ord), 3)   # orthogonal polynomial contrasts

prestige.mod.1 <- lm(prestige ~ log2(income) + education + type, data=Prestige.2)
summary(prestige.mod.1)

anova(prestige.mod.1)   # sequential ("type-I") tests

prestige.mod.0 <- lm(prestige ~ income + education, data=Prestige.2) # note: NA's filtered!
summary(prestige.mod.0)

prestige.mod.0 <- update(prestige.mod.1, . ~ . - type) # equivalent
anova(prestige.mod.0, prestige.mod.1)   # incremental F-test

Anova(prestige.mod.1)   # "type-II" tests

prestige.mod.3 <- update(prestige.mod.1, 
    . ~ . + log2(income):type + education:type)     # adding interactions
summary(prestige.mod.3)
Anova(prestige.mod.3)

lm(prestige ~ log2(income*type) + education*type, data=Prestige.2) # equivalent specifications
lm(prestige ~ (log2(income) + education)*type, data=Prestige.2)

        # effect displays
    
library(effects)
plot(allEffects(prestige.mod.3), ask=FALSE)

    # Anova Models

some(Moore)

Moore$fcategory <- factor(Moore$fcategory, levels=c("low", "medium", "high"))
Moore$partner.status <- relevel(Moore$partner.status, ref="low")

xtabs(~ fcategory + partner.status, data=Moore)

with(Moore, tapply(conformity,
    list(Authoritarianism=fcategory, "Partner's Status"=partner.status),
    mean))
with(Moore, tapply(conformity,
    list(Authoritarianism=fcategory, "Partner's Status"=partner.status),
    sd))

            # graph of means:
            
with(Moore, {
    interaction.plot(fcategory, partner.status, conformity, type="b",
        pch=c(1, 16), cex=2, ylim=range(conformity))
    points(jitter(as.numeric(fcategory), factor=0.5), conformity,
        pch=ifelse(partner.status == "low", "L", "H"))
    identify(fcategory, conformity)
})

            # ANOVA tables

contr <- options(contrasts=c("contr.sum", "contr.poly"))   # contr.sum = deviation contrasts
moore.mod <- lm(conformity ~ fcategory*partner.status, data=Moore)
summary(moore.mod)

Anova(moore.mod)    # type II sums of squares
Anova(moore.mod, type="III")    # type III sums of squares

options(contr) # restore defaults


    # more on lm

args(lm)

some(Davis)
lm(weight ~ repwt, data=Davis, subset=sex == "F")  # observation selection (women only)
lm(weight ~ repwt, data=Davis, subset=1:100)

lm(prestige ~ income + education, data=Duncan, subset=-c(6, 16))

lm(conformity ~ partner.status*fcategory,  # specifying contrasts
    contrasts=list(partner.status=contr.sum, fcategory=contr.poly),
    data=Moore)

lm(100*conformity/40 ~ partner.status*fcategory, data=Moore)  # data argument; note computation of y

lm(prestige~I(income + education), data=Duncan)  # "protecting" expresssion on RHS of the model


# Generalized linear models
    
    # binary logit model
    
some(Mroz)

mroz.mod <- glm(lfp ~ k5 + k618 + age + wc + hc + lwg + inc,
   data=Mroz, family=binomial)
summary(mroz.mod)

round(exp(cbind(Estimate=coef(mroz.mod), confint(mroz.mod))), 2) # odds ratios
    
mroz.mod.2 <- update(mroz.mod, . ~ . - k5 - k618)
anova(mroz.mod.2, mroz.mod, test="Chisq") # likelihood-ratio test

Anova(mroz.mod)  # analysis-of-deviance table

plot(allEffects(mroz.mod), ask=FALSE)

    # Poisson regression

some(Ornstein)
nrow(Ornstein)
(tab <- xtabs(~interlocks, data=Ornstein))

x <- as.numeric(names(tab)) # the names are the distinct values of interlocks
plot(x, tab, type="h", xlab="Number of Interlocks", ylab="Frequency")
points(x, tab, pch=16)

mod.ornstein <- glm(interlocks ~ log2(assets) + nation + sector,
   family=poisson, data=Ornstein)
summary(mod.ornstein)
Anova(mod.ornstein)

        # quasi-Poisson model, allowing for overdispersion
        
mod.ornstein.q <- update(mod.ornstein, family=quasipoisson)
summary(mod.ornstein.q)

plot(allEffects(mod.ornstein.q, default.levels=50), ask=FALSE)

# repeated-measures ANOVA and MANOVA 

some(OBrienKaiser)
?OBrienKaiser
contrasts(OBrienKaiser$treatment)
contrasts(OBrienKaiser$gender)

    # defining the within-subjects design
    
phase <- factor(rep(c("pretest", "posttest", "followup"), c(5, 5, 5)),
    levels=c("pretest", "posttest", "followup"))
hour <- ordered(rep(1:5, 3))
idata <- data.frame(phase, hour)
idata

    # fitting the multivariate linear model
    
mod.ok <- lm(cbind(pre.1, pre.2, pre.3, pre.4, pre.5, 
                     post.1, post.2, post.3, post.4, post.5, 
                     fup.1, fup.2, fup.3, fup.4, fup.5) ~  treatment*gender, 
                data=OBrienKaiser)
mod.ok
    
    # multivariate and univariate tests

(av.ok <- Anova(mod.ok, idata=idata, idesign=~phase*hour)) 

summary(av.ok)

    # graphing the means
    
        # reshape the data from "wide" to "long"
    
OBrien.long <- reshape(OBrienKaiser, 
    varying=c("pre.1", "pre.2", "pre.3", "pre.4", "pre.5", 
        "post.1", "post.2", "post.3", "post.4", "post.5", 
        "fup.1", "fup.2", "fup.3",  "fup.4", "fup.5"),
    v.names="score",
    timevar="phase.hour", direction="long")
OBrien.long$phase <- ordered(c("pre", "post", "fup")[1 + ((OBrien.long$phase.hour - 1) %/% 5)], 
    levels=c("pre", "post", "fup"))
OBrien.long$hour <- ordered(1 + ((OBrien.long$phase.hour - 1) %% 5))
dim(OBrien.long)
head(OBrien.long, 25) # first 25 rows

        # compute means

Means <- as.data.frame(ftable(with(OBrien.long, 
    tapply(score, list(treatment=treatment, gender=gender, phase=phase, hour=hour), mean))))
names(Means)[5] <- "score"
dim(Means)
head(Means, 25)

        # graph of means
        
library(lattice)
xyplot(score ~ hour | phase + treatment, groups=gender, type="b", 
    strip=function(...) strip.default(strip.names=c(TRUE, TRUE), ...), 
    ylab="Mean Score", data=Means, auto.key=list(title="Gender", cex.title=1))