# Regression Diagnostics, 2nd Edition
# John Fox
# last modified: 2019-06-18

# R script for Ch. 3

# Fig. 3.1: univariate displays

CIA <- read.table("CIA.txt", header=TRUE)
  # assumes that CIA.txt is in the current directory, adjust as necessary

library(car)
library(RcmdrMisc)

par(mfrow=c(2, 2))
Hist(CIA$infant, xlab="Infant Mortality Rate per 1000", ylab="Frequency", 
     col="gray", main="(a)")
Boxplot(~infant, data=CIA, main="(b)")
densityPlot(CIA$infant, from=0, normalize=TRUE, 
            xlab="Infant Mortality Rate per 1000", main="(c)")
qqPlot(~infant, data=CIA, ylab="Infant Mortality Rate per 1000", 
       xlab="Normal Quantiles", main="(d)",
       id=list(method=c(TRUE, rep(FALSE, 132), TRUE)), col.lines="black")

# Fig. 3.2: QQ plots

par <- par(mar=c(5.1, 5.1, 4.1, 2.1))

set.seed(123) # for reproducibility
x.norm <- rnorm(50, 100, 15)
par(fig=c(0, .5, .5, 1)) # top-left panel
qqPlot(x.norm, id=FALSE, col.lines="black", xlab="Quantiles of N(0, 1)", 
       ylab="Sample Drawn from N(100, 15)", main="(a)")

set.seed(321)
x.t <- rt(50, df=2)
par(fig=c(.5, 1, .5, 1)) # top-right panel
par(new=TRUE)
qqPlot(x.t, id=FALSE, col.lines="black", xlab="Quantiles of N(0, 1)", 
       ylab=expression("Sample Drawn from"~~t[2]), main="(b)")

set.seed(666)
x.chisq <- rchisq(50, df=2)
par(fig=c(.25, .75, 0, .5)) # bottom panel
par(new=TRUE)
qqPlot(x.chisq, id=FALSE, col.lines="black", xlab="Quantiles of N(0, 1)", 
       ylab=expression("Sample Drawn from"~~chi[2]^2), main="(c)")

par(par) # restore graphical parameters
par(mfrow=c(1, 1))

# Fig. 3.3: Box-Cox family

n <- 500
x <- seq(0.1, 3, length=n)
x1 <- bcPower(x, 1)
x0.5 <- bcPower(x, 0.5)
x0 <- bcPower(x, 0)
xm0.5 <- bcPower(x, -0.5)
xm1 <- bcPower(x, -1)
x2 <- bcPower(x, 2)
x3 <- bcPower(x, 3)
xlim <- range(c(x1, x0.5, x0, xm0.5, xm1, x2, x3))

plot(range(x)+ c(-0.6, 0.5), c(-5, 10), type="n", xlab="", ylab="", las=1)
usr <- par("usr")
text(usr[2], usr[3] - 1, label="x", xpd=TRUE)
text(usr[1] - 0.2, usr[4] + 0.75, label=expression(t[BC](x, lambda)), xpd=TRUE)
lines(x, x1, lwd=2)
text(x[n]+0.0625, x1[n], labels=expression(lambda == 1), adj=c(0, 0.2))
lines(x, x2, lwd=2)
text(x[n]+0.0625, x2[n], labels=expression(lambda == 2), adj=c(0, 0.2))
lines(x, x3, lwd=2)
text(x[n]+0.0625, x3[n], labels=expression(lambda == 3), adj=c(0, 0.2))
lines(x, x0.5, lwd=2)
text(x[1]-0.025, x0.5[1], labels=expression(lambda == 0.5), adj=c(1, 0.3))
lines(x, x0, lwd=2)
text(x[1]-0.025, x0[1], labels=expression(lambda == 0), adj=c(1, 0.3))
lines(x, xm0.5, lwd=2)
text(x[1]-0.025, xm0.5[1], labels=expression(lambda == -0.5), adj=c(1, 0.3))
lines(x=c(1, 1), y=c(usr[3], 0), lty=2)
lines(x=c(usr[1], 1), y=c(0, 0), lty=2)

# Fig. 3.4: symmetry boxplots

symbox(~infant, data=CIA, xlab=expression("Powers,"~lambda), ylab="", 
       powers = c(-1, -0.5, 0, 0.33, 0.5, 1))
mtext(2, 1, text=expression(t[BC]("Infant Mortality",~lambda)))

# Fig. 3.5: log-transformed infant mortality

densityPlot(~log(infant), data=CIA, adjust=0.75, xlab="log(Infant Mortality)")
basicPowerAxis(0, side="above", at=c(1, 5, 10, 20, 50, 100), 
               axis.title="Infant Mortality Rate per 1000")

# Estimation of transformation of infant mortality

S(pt <- powerTransform(infant ~ 1, data=CIA))
pt$lambda # estimated lambda
sqrt(pt$invHess) # SE

# Fig. 3.6: scatterplot matrix for CIA data

scatterplotMatrix(~infant + gdp + gini + health, data=CIA, 
                  var.labels=c("Infant Mortality", "GDP per Capita", 
                               "Gini Coefficient", "Health Spending"),
                  smooth=list(smoother=loessLine, var=FALSE, lwd.smooth=3), 
                  col="black")

# Fig. 3.7: scatterplot matrix for transformed CIA data

scatterplotMatrix(~log(infant) + basicPower(gdp, 0.2) + log(gini) + 
                    log(health), data=CIA,
                  var.labels=c(expression(log("Infant Mortality")), 
                               expression("GDP per Capita"^0.2), 
                               expression(log("Gini Coefficient")), 
                               expression(log("Health Spending"))),
                  smooth=list(smoother=loessLine, var=FALSE, lwd.smooth=3), 
                  col="black")

# Table 3.2: estimates of transformation parameters

S(pt4 <- powerTransform(cbind(infant, gdp, gini, health) ~ 1, data=CIA))

# Fig. 3.8: patterns of nonlinearity

par <- par(mar=c(2, 3, 3, 2))

par(fig=c(0, .5, .5, 1)) # top-left panel
x <- seq(0, 1, length=200)
Ey <- rev(1 - x^2)
y <- Ey + 0.1*rnorm(200)
plot(x, y, axes=FALSE, frame=TRUE, main="(a) monotone, simple", 
     cex.main=1, xlab="", ylab="", col="darkgray", cex=0.75)
lines(x, Ey, lwd=2)
mtext("x", side=1, adj=1)
mtext("y ", side=2, at=max(y), las=1)

par(fig=c(.5, 1, .5, 1)) # top-right panel
par(new=TRUE)
x <- seq(0.02, 0.99, length=200)
Ey <- log(x/(1 - x))
y <- Ey + 0.5*rnorm(200)
plot (x, y, axes=FALSE, frame=TRUE, main="(b) monotone, not simple", 
      cex.main=1, xlab="", ylab="", col="darkgray", cex=0.75)
lines(x, Ey, lwd=2)
mtext("x", side=1, adj=1)
mtext("y ", side=2, at=max(y), las=1)

par(fig=c(.25, .75, 0, .5)) # bottom panel
par(new=TRUE)
x <- seq(0.2, 1, length=200)
Ey <- (x - 0.5)^2
y <- Ey + 0.04*rnorm(200)
plot(x, y, axes=FALSE, frame=TRUE, main="(c) non-monotone, simple", 
     cex.main=1, xlab="", ylab="", col="darkgray", cex=0.75)
lines(x, Ey, lwd=2)
mtext("x", side=1, adj=1)
mtext("y ", side=2, at=max(y), las=1)

par(par)

# Fig. 3.9: bulging rule 

library(plotrix)
library(sfsmisc)
library(MASS)

par <- par(mar=c(3, 5, 3, 5), mfrow=c(1,1))
eqscplot(c(-1, 1), c(-1, 1), axes=FALSE, ann=FALSE, type="n")
points(0, 0, cex=2, pch=16)

p.arrows(0, 0, 0.9, 0, fill="black")
p.arrows(0, 0, -0.9, 0, fill="black")
p.arrows(0, 0, 0, 0.9, fill="black")
p.arrows(0, 0, 0, -0.9, fill="black")

draw.arc(0, 0, radius=0.8, lwd=2, deg1=10, deg2=80, n=500)
draw.arc(0, 0, radius=0.8, lwd=2, deg1=100, deg2=170, n=500)
draw.arc(0, 0, radius=0.8, lwd=2, deg1=190, deg2=260, n=500)
draw.arc(0, 0, radius=0.8, lwd=2, deg1=280, deg2=350, n=500)

text(0.925, 0.075, labels="x up:", xpd=TRUE, adj=0)
text(0.925, -0.075, labels=expression(paste(x^2, ",", x^3, ",", ldots)), 
     xpd=TRUE, adj=0)
text(-0.925, 0.075, labels="x down:", xpd=TRUE, adj=1)
text(-0.925, -0.075, labels=expression(paste(sqrt(x), ",", log(x), ",", 
                                             ldots)), xpd=TRUE, adj=1)
text(0, 0.945, labels="y up:", xpd=TRUE, adj=0.5)
text(0, 1.125, labels=expression(paste(y^2, ",", y^3, ",", ldots)),
     xpd=TRUE, adj=0.5)
text(0, -0.945, labels="y down:", xpd=TRUE, adj=0.5)
text(0, -1.125, labels=expression(paste(sqrt(y), ",", log(y), ",", ldots)), 
     xpd=TRUE, adj=0.5)

z <- sqrt(0.64/2)
p.arrows(z, z, z + 0.15, z + 0.15, fill="black")
p.arrows(z, -z, z + 0.15, -z - 0.15, fill="black")
p.arrows(-z, z, -z - 0.15, z + 0.15, fill="black")
p.arrows(-z, -z, -z - 0.15, -z - 0.15, fill="black")

par(par)

# Fig. 3.10: scatterplot of infant mortality vs GDP

scatterplot(infant ~ gdp, data=CIA, smooth=list(smoother=loessLine, var=FALSE, 
                                                lwd.smooth=3), col="black",
            regLine=list(lwd=3),
            xlab="GDP per Capita ($1000s)", 
            ylab="Infant Mortality Rate (per 1000)")

# Fig. 3.11: scatterplot of log(infant mortality) vs log(GDP)

par <- par(oma=c(0, 0, 0, 2))
scatterplot(log(infant) ~ log(gdp), data=CIA, 
            smooth=list(smoother=loessLine, var=FALSE, lwd.smooth=3), 
            col="black",
            regLine=list(lwd=3),
            xlab="log(GDP per Capita)", ylab="log(Infant Mortality)", 
            reset.par=FALSE)
basicPowerAxis(0, side="right", at=c(1, 5, 10, 20, 50, 100), 
               axis.title="Infant Mortality Rate per 1000")
basicPowerAxis(0, side="above", at=c(1, 2, 5, 10, 20, 50), 
               axis.title="GDP per Capita ($1000s)")
par(par)

# regression of log(infant mortality) on log(GDP)

S(lm(log(infant) ~ log(gdp), data=CIA))

# Fig. 3.12: how loess works

par <- par(mfrow=c(2, 2), las=1)   # 2 x 2 array of graphs

D <- CIA[, c("gdp", "infant")]
gdp <- D$gdp
infant <- D$infant
ord <- order(gdp)   # sort data by gdp
gdp <- gdp[ord]
infant <- infant[ord]

x0 <- gdp[96]           # focal x = x_(96) (Latvia)
dist <- abs(gdp - x0)   # distance from focal x
h <- sort(dist)[89]     # bandwidth for span of 2/3 (where n = 134)
pick <- dist <= h       # observations within window

plot(gdp, infant, xlab="GDP per Capita ($1000s)", 
     ylab="Infant Mortality Rate per 1000",
     type="n", main="(a) Observations Within the Window\nspan = 2/3")
points(gdp[pick], infant[pick], col="black")
points(gdp[!pick], infant[!pick], col="gray")
points(gdp[96], infant[96], pch=16, cex=1.5, col="black") # focal point
abline(v=x0, col="black")    # at focal x
abline(v=c(x0 - h, x0 + h), lty=2, col="black")  # window
text(x0, par("usr")[4] + 5, expression(x[(96)]), xpd=TRUE, col="black")

plot(range(gdp), c(0,1), xlab="GDP per Capita ($1000s)",
     ylab="Tricube Kernel Weight",
     type="n", main="(b) Tricube Weights")
abline(v=x0, col="black")
abline(v=c(x0 - h, x0 + h), lty=2, col="black")

tricube <- function(x, x0, h) {
    z <- abs(x - x0)/h
    ifelse(z < 1, (1 - z^3)^3, 0)
}
tc <- function(x) tricube(x, x0, h) # to use with curve
curve(tc, min(gdp), max(gdp), n=1000, lwd=2, add=TRUE, col="black")
points(gdp[pick], tricube(gdp, x0, h)[pick], pch=16, col="black")
abline(h=c(0, 1), col="gray")

plot(gdp, infant, xlab="GDP per Capita ($1000s)", 
     ylab="Infant Mortality Rate per 1000",
     type="n", main="(c) Local Quadratic Regression")
points(gdp[pick], infant[pick], col="black")
points(gdp[!pick], infant[!pick], col="gray")
abline(v=x0, col="black")
abline(v=c(x0 - h, x0 + h), lty=2, col="black")
mod <- lm(infant ~ poly(gdp, 2), weights=tricube(gdp, x0, h))
new <- data.frame(gdp=seq(x0 - h, x0 + h, length=51))
fit <- predict(mod, newdata=new)
  # local regression line
lines(seq(x0 - h, x0 + h, length=51), fit, lwd=2, col="black") 
points(x0, fit[26], pch=16, cex=2, col="black")
text(x0, par("usr")[4] + 5, expression(x[(96)]), xpd=TRUE, col="black")
text(x0 + 1, fit[26] + 2.5, expression(hat(y)[(96)]), adj=c(0, 0), col="black")

plot(gdp, infant, xlab="GDP per Capita  ($1000s)", 
     ylab="Infant Mortality Rate (per 1000)",
     main="(d) Complete Local-Quadratic Estimate", col=gray(0.25))
yhat <- numeric(length(gdp))
for (i in 1:length(gdp)){   # kernel estimate at each x
    x0 <- gdp[i]
    dist <- abs(gdp - x0)
    h <- sort(dist)[89]
    mod <- update(mod, weights=tricube(gdp, x0, h))
    yhat[i] <- predict(mod, newdata=data.frame(gdp=x0))
}
lines(gdp, yhat, lwd=2, col="black")

par(par)
par(mfrow=c(1, 1))

# Fig. 3.13: Boxplots of GDP vs region

Boxplot(gdp ~ region, data=CIA, id=list(location="lr"), 
        ylab="GDP per Capita ($1000s)", xlab="Region of the World")

# Fig. 3.14: spread-level plot for GDP by region

  # reorder levels of region
CIA$region <- factor(CIA$region, levels=c("Europe", "America", 
                                          "Oceania", "Asia", "Africa"))

spreadLevelPlot(gdp ~ region, data=CIA, main="", 
                ylab="Inter-Quartile Range", col.lines="black")

# Fig. 3.15: boxplots for transformed GDP by region

par <- par(mar=c(5.1, 4.1, 3.1, 5.5), mfrow=c(1, 2))

Boxplot(gdp^(1/3) ~ region, data=CIA, id=list(location="lr"), 
        ylab=expression(GDP^{1/3}), xlab="Region of the World", main="(a)")
basicPowerAxis(1/3, side="right", at=c(1, 2, 5, 10, 20, 50, 100), 
               axis.title="GDP per Capita ($1000s)")

Boxplot(log(gdp) ~ region, data=CIA, id=list(location="lr"), 
        ylab="log(GDP)", xlab="Region of the World", main="(b)")
basicPowerAxis(0, side="right", at=c(1, 2, 5, 10, 20, 50, 100), 
               axis.title="GDP per Capita ($1000s)")

par(par)