2016-06-16

Basic Least-Squares Function

> ls <- function(X, y, intercept=TRUE){
+     if (intercept) X <- cbind(1, X)
+     n <- nrow(X)
+     p <- ncol(X)
+     V <- solve(t(X) %*% X)
+     b <- as.vector(V %*% t(X) %*% y)
+     yhat <- as.vector(X %*% b)
+     e <- y - yhat
+     s2 <- sum(e^2)/(n - p)
+     V <- s2*V
+     list(coef=b, vcov=V, residuals=e, fitted=yhat)
+ }

Testing the function with the Longley data:

> mod.lm <- lm(Employed ~., data=longley)
> coef(mod.lm)
  (Intercept)  GNP.deflator           GNP    Unemployed  Armed.Forces 
-3.482259e+03  1.506187e-02 -3.581918e-02 -2.020230e-02 -1.033227e-02 
   Population          Year 
-5.110411e-02  1.829151e+00 
> X <- data.matrix(longley[, 1:6])
> y <- longley[, "Employed"]
> mod.ls <- ls(X, y)
> mod.ls$coef
[1] -3.482259e+03  1.506187e-02 -3.581918e-02 -2.020230e-02 -1.033227e-02
[6] -5.110411e-02  1.829151e+00
> mod.ls$coef/coef(mod.lm) # should be close to 1
 (Intercept) GNP.deflator          GNP   Unemployed Armed.Forces 
   1.0000000    0.9999998    1.0000000    1.0000000    1.0000000 
  Population         Year 
   1.0000000    1.0000000 
> sqrt(diag(mod.ls$vcov))/sqrt(diag(vcov(mod.lm))) # ditto
             GNP.deflator          GNP   Unemployed Armed.Forces 
           1            1            1            1            1 
  Population         Year 
           1            1 

My ls() function produces results similar to those produced by lm().

Making the Least-Squares Function Object-Oriented

Defining the ls() generic function with "default" and "formula" methods:

> ls <- function(X, ...){
+     UseMethod("ls")
+ }
> 
> ls.default <- function(X, y, intercept=TRUE, ...){
+     if (intercept) X <- cbind(1, X)
+     n <- nrow(X)
+     p <- ncol(X)
+     V <- solve(t(X) %*% X)
+     b <- as.vector(V %*% t(X) %*% y)
+     names(b) <- colnames(X)
+     yhat <- as.vector(X %*% b)
+     e <- y - yhat
+     s2 <- sum(e^2)/(n - p)
+     V <- s2*V
+     result <- list(coef=b, vcov=V, residuals=e, fitted=yhat)
+     class(result) <- "ls"
+     result
+ }
> 
> ls.formula <- function(formula, data, subset, na.action, model = TRUE,
+     contrasts = NULL, ...) {
+     call <- match.call()  # returns the function call
+     mf <- match.call(expand.dots = FALSE)  # the function call w/o ...
+     args <- match(c("formula", "data", "subset", "na.action"),
+         names(mf), 0)  # which arguments are present?
+     mf <- mf[c(1, args)]
+     mf$drop.unused.levels <- TRUE
+     mf[[1]] <- as.name("model.frame")
+     mf <- eval.parent(mf)  # create a model frame
+     terms <- attr(mf, "terms")  # terms object for the model
+     y <- model.response(mf)  # response variable
+     X <- model.matrix(terms, mf, contrasts)  # model matrix
+     mod <- ls.default(X, y, intercept=FALSE)
+     mod$na.action <- attr(mf, "na.action")
+     mod$contrasts <- attr(X, "contrasts")
+     mod$xlevels <- .getXlevels(terms, mf)
+     mod$call <- call
+     mod$terms <- terms
+     if (model)  mod$model <- mf
+     mod
+ }

Defining "ls" methods for some standard generic functions:

> coef.ls <- function(object, ...) object$coef
> 
> vcov.ls <- function(object, ...) object$vcov
> 
> print.ls <- function(x, ...) {
+     print(coef(x))
+     invisible(x)
+ }
> 
> summary.ls <- function(object, ...) {
+     b <- coef(object)
+     se <- sqrt(diag(vcov(object)))
+     e <- residuals(object)
+     df <- length(e) - length(b)
+     s2 <- sum(e^2)/df
+     t <- b/se
+     table <- cbind(b, se, t, 2*(1 - pt(abs(t), df=df)))
+     colnames(table) <- c("Estimate", "Std.Err", "t value", "Pr(>|t|)")
+     rownames(table) <- names(b)
+     result <- list(coef=table, s2=s2, df=df)
+     class(result) <- "summary.ls"
+     result
+ }
> 
> print.summary.ls <- function(x, ...) {
+     printCoefmat(x$coef)
+     cat("\nresidual std. dev. =", sqrt(x$s2), "  residual df =", x$df,"\n")
+     invisible(x)
+ }

Testing:

> mod.ls <- ls(Employed ~ ., data=longley)
> mod.ls # print method
  (Intercept)  GNP.deflator           GNP    Unemployed  Armed.Forces 
-3.482259e+03  1.506187e-02 -3.581918e-02 -2.020230e-02 -1.033227e-02 
   Population          Year 
-5.110411e-02  1.829151e+00 
> summary(mod.ls)
                Estimate     Std.Err t value  Pr(>|t|)    
(Intercept)  -3.4823e+03  8.9042e+02 -3.9108 0.0035604 ** 
GNP.deflator  1.5062e-02  8.4915e-02  0.1774 0.8631409    
GNP          -3.5819e-02  3.3491e-02 -1.0695 0.3126811    
Unemployed   -2.0202e-02  4.8840e-03 -4.1364 0.0025351 ** 
Armed.Forces -1.0332e-02  2.1427e-03 -4.8220 0.0009444 ***
Population   -5.1104e-02  2.2607e-01 -0.2261 0.8262118    
Year          1.8292e+00  4.5548e-01  4.0159 0.0030368 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

residual std. dev. = 0.3048541   residual df = 9 
> summary(mod.lm) # checkin against lm()

Call:
lm(formula = Employed ~ ., data = longley)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.41011 -0.15767 -0.02816  0.10155  0.45539 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -3.482e+03  8.904e+02  -3.911 0.003560 ** 
GNP.deflator  1.506e-02  8.492e-02   0.177 0.863141    
GNP          -3.582e-02  3.349e-02  -1.070 0.312681    
Unemployed   -2.020e-02  4.884e-03  -4.136 0.002535 ** 
Armed.Forces -1.033e-02  2.143e-03  -4.822 0.000944 ***
Population   -5.110e-02  2.261e-01  -0.226 0.826212    
Year          1.829e+00  4.555e-01   4.016 0.003037 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.3049 on 9 degrees of freedom
Multiple R-squared:  0.9955,    Adjusted R-squared:  0.9925 
F-statistic: 330.3 on 6 and 9 DF,  p-value: 4.984e-10