##------------------------------------------------------------------------##
## Script for Chapter 5, An R and S-PLUS Companion to Applied Regression  ##
##    John Fox                                                            ##
##    Sage Publications, 2002                                             ##
##------------------------------------------------------------------------##

# preliminaries

options(width=65)
options(digits=5)

# logit, probit, and c-log-log links

eta<-seq(-4, 4, length=100)
mu.logit<-1/(1+exp(-eta))
mu.probit<-pnorm(eta, 0, pi/sqrt(3))
mu.cloglog<-1-exp(-exp(eta))

plot(c(eta,eta,eta), c(mu.logit,mu.logit,mu.cloglog), xlab=expression(eta), 
    ylab=expression(mu == L^{-1}*{(eta)}), lwd=2, type="n", las=1)
    
## for S-PLUS:
## plot(c(eta,eta,eta), c(mu.logit,mu.logit,mu.cloglog), 'eta', 
##     ylab='mu', lwd=2, type="n", las=1)

lines(spline(eta, mu.logit), lwd=3)
lines(spline(eta, mu.probit), lwd=3, lty=3)
lines(spline(eta, mu.cloglog), lwd=3, lty=4)
legend(locator(1), lty=c(1,3,4), lwd=3, 
    legend=c("logit","probit","complementary log-log")) 


#logistic regression

library(car)

data(Mroz)
Mroz[sort(sample(753,10)),]

attach(Mroz)
mod.mroz <- glm(lfp~k5+k618+age+wc+hc+lwg+inc, fam=binomial)
summary(mod.mroz)

anova(update(mod.mroz, . ~ . - k5), mod.mroz, test="Chisq")

Anova(mod.mroz)

# with binomial response

closeness <- factor(rep(c('one.sided', 'close'), c(3,3)), 
    levels=c('one.sided', 'close'))   
preference <- ordered(rep(c('weak', 'medium', 'strong'), 2), 
    levels=c('weak', 'medium', 'strong'))
voted <- c(91, 121, 64, 214, 284, 201)
did.not.vote <- c(39, 49, 24, 87, 76, 25)

logit.turnout <- log(voted/did.not.vote)
data.frame(closeness, preference, voted, did.not.vote, 
    logit=logit.turnout)

par(mar=c(5.1, 4.1, 4.1, 4.1))
plot(rep(1:3, 2), logit.turnout, type='n', axes=F,
    xlab='Intensity of Preference', 
    ylab='Logit(Voted/Did Not Vote)')
axis(1, at=1:3, labels=c('Weak', 'Medium', 'Strong'))
axis(2)
prob.axis(side='right', at=seq(.7, .85, by=.05),
    axis.title='Proportion(Voted)')
box()
points(1:3, logit.turnout[1:3], pch=1, type='b', 
    lty=1, lwd=3, cex=2)
points(1:3, logit.turnout[4:6], pch=16, type='b', 
    lty=2, lwd=3, cex=2)
text(locator(2), c('Close', 'One-Sided'))

options(contrasts=c('contr.sum', 'contr.poly'))
mod.campbell <- glm(cbind(voted, did.not.vote) ~ closeness*preference, 
    family=binomial)
summary(mod.campbell)

Anova(mod.campbell)

# Polytomous Data

data(Womenlf)
Womenlf[sort(sample(1:263, 10)),]

attach(Womenlf)
working <- recode(partic, " 'not.work' = 'no'; else = 'yes' ")
fulltime <- recode (partic, 
    " 'fulltime' = 'yes'; 'parttime' = 'no'; 'not.work' = NA")
working
fulltime

options(contrasts=c('contr.treatment', 'contr.poly'))
mod.working <- glm(working ~ hincome + children + region, family=binomial)
summary(mod.working)
mod.fulltime <- glm(fulltime ~ hincome + children + region, family=binomial)
summary(mod.fulltime)
Anova(mod.working)
Anova(mod.fulltime)

mod.working.1 <- update(mod.working, . ~ . - region)
mod.fulltime.1 <- update(mod.fulltime, . ~ . - region)

predictors <- expand.grid(hincome=1:45, children=c('absent', 'present'))
predictors
p.work <- predict(mod.working.1, predictors, type='response')
p.fulltime <- predict(mod.fulltime.1, predictors, type='response')
p.full <- p.work * p.fulltime
p.part <- p.work * (1 - p.fulltime)
p.not <- 1 - p.work

par(mfrow=c(1,2))
plot(c(1,45), c(0,1), 
    type='n', xlab="Husband's Income", ylab='Fitted Probability',
    main="Children Absent")
lines(1:45, p.not[1:45], lty=1, lwd=3)
lines(1:45, p.part[1:45], lty=2, lwd=3)
lines(1:45, p.full[1:45], lty=3, lwd=3)
legend(locator(1), lty=1:3, lwd=3, 
    legend=c('not working', 'part-time', 'full-time'))  

plot(c(1,45), c(0,1), 
    type='n', xlab="Husband's Income", ylab='Fitted Probability',
    main="Children Present")
lines(1:45, p.not[46:90], lty=1, lwd=3)
lines(1:45, p.part[46:90], lty=2, lwd=3)
lines(1:45, p.full[46:90], lty=3, lwd=3)

# multinomial logistic regression

participation <- ordered(partic, 
    levels=c('not.work', 'parttime', 'fulltime'))
library(nnet)
library(MASS)
mod.multinom <- multinom(participation ~ hincome + children)
summary(mod.multinom, cor=F, Wald=T)

p.fit <- predict(mod.multinom, predictors, type='probs')
data.frame(predictors, p.fit)[1:5,]

plot(c(1,45), c(0,1), 
    type='n', xlab="Husband's Income", ylab='Fitted Probability',
    main="Children Absent")
lines(1:45, p.fit[1:45, 'not.work'], lty=1, lwd=3)
lines(1:45, p.fit[1:45, 'parttime'], lty=2, lwd=3)
lines(1:45, p.fit[1:45, 'fulltime'], lty=3, lwd=3)
legend(locator(1), lty=1:3, lwd=3, 
    legend=c('not working', 'part-time', 'full-time'))  

plot(c(1,45), c(0,1), 
    type='n', xlab="Husband's Income", ylab='Fitted Probability',
    main="Children Present")
lines(1:45, p.fit[46:90, 'not.work'], lty=1, lwd=3)
lines(1:45, p.fit[46:90, 'parttime'], lty=2, lwd=3)
lines(1:45, p.fit[46:90, 'fulltime'], lty=3, lwd=3)

# proportional odds model


mod.polr <- polr(participation ~ hincome + children)
summary(mod.polr)

p.fit <- predict(mod.polr, predictors, type='probs')

plot(c(1,45), c(0,1), 
    type='n', xlab="Husband's Income", ylab='Fitted Probability',
    main="Children Absent")
lines(1:45, p.fit[1:45, 'not.work'], lty=1, lwd=3)
lines(1:45, p.fit[1:45, 'parttime'], lty=2, lwd=3)
lines(1:45, p.fit[1:45, 'fulltime'], lty=3, lwd=3)
legend(locator(1), lty=1:3, lwd=3, 
    legend=c('not working', 'part-time', 'full-time'))  

plot(c(1,45), c(0,1), 
    type='n', xlab="Husband's Income", ylab='Fitted Probability',
    main="Children Present")
lines(1:45, p.fit[46:90, 'not.work'], lty=1, lwd=3)
lines(1:45, p.fit[46:90, 'parttime'], lty=2, lwd=3)
lines(1:45, p.fit[46:90, 'fulltime'], lty=3, lwd=3)

1 - pchisq(441.66 - 422.88, df = 6 - 4)

# artificial plot

x <- 1:100
logit.1 <- 5 -.2*x
logit.2 <- 7 -.2*x
logit.3 <- 13 -.2*x

p.1 <- 1/(1 + exp(logit.1))
p.2 <- 1/(1 + exp(logit.2))
p.3 <- 1/(1 + exp(logit.3))

plot(c(1, 100), c(0,1), 
    type='n', xlab=expression(eta), ylab='Probability',
    axes=F)
## S-PLUS:
## plot(c(1, 100), c(0,1), 
##     type='n', xlab='eta', ylab='Probability',
##     axes=F)
axis(2)
box()
lines(1:100, p.1, lty=1, lwd=3)
lines(1:100, p.2, lty=2, lwd=3)
lines(1:100, p.3, lty=3, lwd=3)

legend(locator(1), lty=1:3, lwd=3, 
    legend=c('Pr(y > 1)', 'Pr(y > 2)', 'Pr(y > 3)'))
    
# Poisson models

# Poisson regression

data(Ornstein)
Ornstein[sort(sample(248,10)),]

attach(Ornstein)
tab <- table(interlocks)
tab
x <- sort(unique(interlocks))
plot(x, tab, type='h', xlab='Number of Interlocks',
    ylab='Frequency')
points(x, tab, pch=16)


  
mod.ornstein <- glm(interlocks ~ assets + nation + sector, family=poisson)
summary(mod.ornstein)
Anova(mod.ornstein)


# loglinear model

counts <- c(91, 39, 121, 49, 64, 24, 214, 87, 284, 76, 201, 25)
Campbell <- expand.grid(turnout=c('voted','did.not.vote'),
    preference=c('weak', 'medium', 'strong'),
    closeness=c('one.sided', 'close'))
cbind(Campbell, counts)

options(contrasts=c('contr.sum', 'contr.poly'))

attach(Campbell)
mod.loglin <- glm(counts ~ closeness*preference*turnout, family=poisson)
summary(mod.loglin)

Anova(mod.loglin)
detach(Campbell)

## create Campbell table as array

campbell <- array(counts, c(2,3,2))
dimnames <- list(c('voted', 'did.not.vote'), c('weak', 'medium', 'strong'), c('one.sided', 'close'))
names(dimnames)<-c('turnout','preference','closeness')
dimnames(campbell)<-dimnames
class(campbell) <- "table"
campbell

Campbell <- as.data.frame(campbell)
Campbell

## S-PLUS
## ravel.table <- function(table){
##     cbind(expand.grid(dimnames(table)), 
##         Freq=as.vector(table))
##     }
## Campbell <- ravel.table(campbell)
## Campbell    

mod.loglin <- glm(Freq ~ closeness*preference*turnout, 
    family=poisson, data=Campbell)


# gamma distributions

x <- seq(0.1, 10, by=.02)
d.5 <- dgamma(x, shape=.5)
d1 <- dgamma(x, shape=1)
d2 <- dgamma(x, shape=2)
d5 <- dgamma(x, shape=5)
plot(rep(x,4), c(d.5, d1, d2, d5), type='n',
 xlab="y", ylab="p(y)")
lines(x, d.5, lty=1, lwd=3)
lines(x, d1, lty=1, lwd=3)
lines(x, d2, lty=1, lwd=3)
lines(x, d5, lty=1, lwd=3)

## The following text command does not work in S-PLUS:

text (locator(4), pos=4, labels=c(expression(psi == 0.5),
    expression(psi == 1),expression(psi == 2),expression(psi == 5)))


# overdispersed Poisson

mod.ornstein.q <- update(mod.ornstein, family=quasipoisson)
summary(mod.ornstein.q)
Anova(mod.ornstein.q, test="F")

# S-PLUS (and R) alternative:

mod.ornstein.q <- update(mod.ornstein, family=quasi(link=log, var='mu'))
summary(mod.ornstein.q)
Anova(mod.ornstein.q, test="F")

# arguments to glm

args(glm)