##-----------------------------------------------------------------##
##                    Script for Day 1b:                           ##
##          Linear and Generalized Linear Models in R              ##
##                        John Fox                                 ##
## The R Statistical Computing Environment: The Basics and Beyond  ##
##              ICPSR Summer Program, Berkeley                     ##
##                          2016                                   ##
##-----------------------------------------------------------------##

# Linear models in R

# multiple regression (Prestige data)

library(car)
names(Prestige)  # from car package
some(Prestige)  # 10 randomly sampling observations
View(Prestige)
?Prestige  #help on data set

prestige.mod <- lm(prestige ~ education + log2(income) + women,
    data=Prestige)  # data argument avoids attach
summary(prestige.mod)

confint(prestige.mod)
Anova(prestige.mod)

# dummy regression

Prestige$type # a factor
class(Prestige$type) 
str(Prestige$type) # structure

Prestige.2 <- na.omit(Prestige) # filter out missing data
nrow(Prestige)
nrow(Prestige.2)
levels(Prestige.2$type)
Prestige.2$type <- with(Prestige.2, factor(type, 
    levels=c("bc", "wc", "prof"))) # reorder levels
    # with() avoids attach()
Prestige.2$type

# generating contrasts from factors

getOption("contrasts")
contrasts(Prestige.2$type)
model.matrix(~ type, data=Prestige.2)

contrasts(Prestige.2$type) <- contr.treatment(levels(Prestige.2$type), base=2)
                                                # changing baseline category
contrasts(Prestige.2$type)

contrasts(Prestige.2$type) <- "contr.helmert"  # Helmert contrasts
contrasts(Prestige.2$type)

contrasts(Prestige.2$type) <- "contr.sum"  # "deviation" contrasts
contrasts(Prestige.2$type)

contrasts(Prestige.2$type) <- NULL  # back to default

Prestige.2$type.ord <- ordered(Prestige.2$type, 
    levels=c("bc", "wc", "prof")) # ordered factor
Prestige.2$type.ord
round(contrasts(Prestige.2$type.ord), 3)   # orthogonal polynomial contrasts
crossprod(contrasts(Prestige.2$type.ord))  # floating-point arithmetic 
#    is not exact!
round(crossprod(contrasts(Prestige.2$type.ord)), 10)
zapsmall(crossprod(contrasts(Prestige.2$type.ord)), 10)

prestige.mod.1 <- lm(prestige ~ log2(income) + education + type, 
    data=Prestige.2)
summary(prestige.mod.1)

anova(prestige.mod.1)   # sequential ("type-I") tests

prestige.mod.0 <- lm(prestige ~ log2(income) + education, 
    data=Prestige.2) # note: NA's filtered!
summary(prestige.mod.0)

prestige.mod.0 <- update(prestige.mod.1, . ~ . - type) # equivalent
anova(prestige.mod.0, prestige.mod.1)   # incremental F-test

Anova(prestige.mod.1)   # "type-II" tests (partial, obeying marginality)

prestige.mod.3 <- update(prestige.mod.1, 
    . ~ . + log2(income):type + education:type)     # adding interactions
summary(prestige.mod.3)
Anova(prestige.mod.3)

# equivalent specifications:
lm(prestige ~ log2(income)*type + education*type, data=Prestige.2) 
lm(prestige ~ (log2(income) + education)*type, data=Prestige.2)

# effect displays

library(effects)
windows()  # for demo only
plot(allEffects(prestige.mod.3))
plot(allEffects(prestige.mod.3, partial.residuals=TRUE), span=3/4)

# Anova Models (time permitting)

some(Moore)
?Moore

Moore$fcategory <- factor(Moore$fcategory, 
    levels=c("low", "medium", "high"))
Moore$partner.status <- relevel(Moore$partner.status, ref="low")

xtabs(~ fcategory + partner.status, data=Moore)

with(Moore, tapply(conformity,
    list(Authoritarianism=fcategory, "Partner's Status"=partner.status),
    mean))
with(Moore, tapply(conformity,
    list(Authoritarianism=fcategory, "Partner's Status"=partner.status),
    sd))

# graph of means:

with(Moore, {
    interaction.plot(fcategory, partner.status, conformity, type="b",
        pch=c(1, 16), cex=2, ylim=range(conformity))
    points(jitter(as.numeric(fcategory), factor=0.5), conformity,
        pch=ifelse(partner.status == "low", "L", "H"))
    identify(fcategory, conformity)
})

# ANOVA tables

contr <- options(contrasts=c("contr.sum", "contr.poly"))   
# contr.sum = deviation contrasts
contrasts(Moore$fcategory)
moore.mod <- lm(conformity ~ fcategory*partner.status, data=Moore)
summary(moore.mod)

Anova(moore.mod)    # type II sums of squares
Anova(moore.mod, type="III")    # type III sums of squares 
    #  (partial, violating marginality)

contr  # previously saved
options(contr) # restore defaults


# more on lm

args(lm)

some(Davis)
?Davis
lm(weight ~ repwt, data=Davis, subset=sex == "F")  # observation selection 
                                #   (women only)
lm(weight ~ repwt, data=Davis, subset=1:100)  # first 100 cases

lm(prestige ~ income + education, data=Duncan, subset=-c(6, 16))  
                                            # omit cases 6 & 16

lm(conformity ~ partner.status*fcategory,  # specifying contrasts
    contrasts=list(partner.status=contr.sum, fcategory=contr.poly),
    data=Moore)

lm(100*conformity/40 ~ partner.status*fcategory, data=Moore)  # data argument; 
    #  note computation of y

lm(prestige~I(income + education), data=Duncan)  # "protecting" expresssion 
                                                 #     on RHS of the model


# Generalized linear models

# binary logit model

some(Mroz)
?Mroz

mroz.mod <- glm(lfp ~ .,   # regress lfp on all others
    data=Mroz, family=binomial)
summary(mroz.mod)

round(exp(cbind(Estimate=coef(mroz.mod), confint(mroz.mod))), 2) # odds ratios

mroz.mod.2 <- update(mroz.mod, . ~ . - k5 - k618)
anova(mroz.mod.2, mroz.mod, test="Chisq") # likelihood-ratio test

Anova(mroz.mod)  # analysis-of-deviance table

plot(allEffects(mroz.mod))
plot(Effect("lwg", mroz.mod, partial.residuals=TRUE))

# Poisson regression (time permitting)

some(Ornstein)
nrow(Ornstein)
?Ornstein
(tab <- xtabs(~interlocks, data=Ornstein))

x <- as.numeric(names(tab)) # the names are the distinct values of interlocks
plot(x, as.vector(tab), type="h", 
    xlab="Number of Interlocks", ylab="Frequency")
points(x, tab, pch=16)

mod.ornstein <- glm(interlocks ~ log2(assets) + nation + sector,
    family=poisson, data=Ornstein)
summary(mod.ornstein)
Anova(mod.ornstein)

# quasi-Poisson model, allowing for overdispersion

mod.ornstein.q <- update(mod.ornstein, family=quasipoisson)
summary(mod.ornstein.q)

plot(allEffects(mod.ornstein.q, default.levels=50))
plot(Effect("assets", mod.ornstein.q, partial.residuals=TRUE))