> ls <- function(X, y, intercept=TRUE){
+ if (intercept) X <- cbind(1, X)
+ n <- nrow(X)
+ p <- ncol(X)
+ V <- solve(t(X) %*% X)
+ b <- as.vector(V %*% t(X) %*% y)
+ yhat <- as.vector(X %*% b)
+ e <- y - yhat
+ s2 <- sum(e^2)/(n - p)
+ V <- s2*V
+ list(coef=b, vcov=V, residuals=e, fitted=yhat)
+ }
Testing the function with the Longley data:
> mod.lm <- lm(Employed ~., data=longley)
> coef(mod.lm)
(Intercept) GNP.deflator GNP Unemployed Armed.Forces
-3.482259e+03 1.506187e-02 -3.581918e-02 -2.020230e-02 -1.033227e-02
Population Year
-5.110411e-02 1.829151e+00
> X <- data.matrix(longley[, 1:6])
> y <- longley[, "Employed"]
> mod.ls <- ls(X, y)
> mod.ls$coef
[1] -3.482259e+03 1.506187e-02 -3.581918e-02 -2.020230e-02 -1.033227e-02
[6] -5.110411e-02 1.829151e+00
> mod.ls$coef/coef(mod.lm) # should be close to 1
(Intercept) GNP.deflator GNP Unemployed Armed.Forces
1.0000000 0.9999998 1.0000000 1.0000000 1.0000000
Population Year
1.0000000 1.0000000
> sqrt(diag(mod.ls$vcov))/sqrt(diag(vcov(mod.lm))) # ditto
GNP.deflator GNP Unemployed Armed.Forces
1 1 1 1 1
Population Year
1 1
My ls()
function produces results similar to those produced by lm()
.
Defining the ls()
generic function with "default"
and "formula"
methods:
> ls <- function(X, ...){
+ UseMethod("ls")
+ }
>
> ls.default <- function(X, y, intercept=TRUE, ...){
+ if (intercept) X <- cbind(1, X)
+ n <- nrow(X)
+ p <- ncol(X)
+ V <- solve(t(X) %*% X)
+ b <- as.vector(V %*% t(X) %*% y)
+ names(b) <- colnames(X)
+ yhat <- as.vector(X %*% b)
+ e <- y - yhat
+ s2 <- sum(e^2)/(n - p)
+ V <- s2*V
+ result <- list(coef=b, vcov=V, residuals=e, fitted=yhat)
+ class(result) <- "ls"
+ result
+ }
>
> ls.formula <- function(formula, data, subset, na.action, model = TRUE,
+ contrasts = NULL, ...) {
+ call <- match.call() # returns the function call
+ mf <- match.call(expand.dots = FALSE) # the function call w/o ...
+ args <- match(c("formula", "data", "subset", "na.action"),
+ names(mf), 0) # which arguments are present?
+ mf <- mf[c(1, args)]
+ mf$drop.unused.levels <- TRUE
+ mf[[1]] <- as.name("model.frame")
+ mf <- eval.parent(mf) # create a model frame
+ terms <- attr(mf, "terms") # terms object for the model
+ y <- model.response(mf) # response variable
+ X <- model.matrix(terms, mf, contrasts) # model matrix
+ mod <- ls.default(X, y, intercept=FALSE)
+ mod$na.action <- attr(mf, "na.action")
+ mod$contrasts <- attr(X, "contrasts")
+ mod$xlevels <- .getXlevels(terms, mf)
+ mod$call <- call
+ mod$terms <- terms
+ if (model) mod$model <- mf
+ mod
+ }
Defining "ls"
methods for some standard generic functions:
> coef.ls <- function(object, ...) object$coef
>
> vcov.ls <- function(object, ...) object$vcov
>
> print.ls <- function(x, ...) {
+ print(coef(x))
+ invisible(x)
+ }
>
> summary.ls <- function(object, ...) {
+ b <- coef(object)
+ se <- sqrt(diag(vcov(object)))
+ e <- residuals(object)
+ df <- length(e) - length(b)
+ s2 <- sum(e^2)/df
+ t <- b/se
+ table <- cbind(b, se, t, 2*(1 - pt(abs(t), df=df)))
+ colnames(table) <- c("Estimate", "Std.Err", "t value", "Pr(>|t|)")
+ rownames(table) <- names(b)
+ result <- list(coef=table, s2=s2, df=df)
+ class(result) <- "summary.ls"
+ result
+ }
>
> print.summary.ls <- function(x, ...) {
+ printCoefmat(x$coef)
+ cat("\nresidual std. dev. =", sqrt(x$s2), " residual df =", x$df,"\n")
+ invisible(x)
+ }
Testing:
> mod.ls <- ls(Employed ~ ., data=longley)
> mod.ls # print method
(Intercept) GNP.deflator GNP Unemployed Armed.Forces
-3.482259e+03 1.506187e-02 -3.581918e-02 -2.020230e-02 -1.033227e-02
Population Year
-5.110411e-02 1.829151e+00
> summary(mod.ls)
Estimate Std.Err t value Pr(>|t|)
(Intercept) -3.4823e+03 8.9042e+02 -3.9108 0.0035604 **
GNP.deflator 1.5062e-02 8.4915e-02 0.1774 0.8631409
GNP -3.5819e-02 3.3491e-02 -1.0695 0.3126811
Unemployed -2.0202e-02 4.8840e-03 -4.1364 0.0025351 **
Armed.Forces -1.0332e-02 2.1427e-03 -4.8220 0.0009444 ***
Population -5.1104e-02 2.2607e-01 -0.2261 0.8262118
Year 1.8292e+00 4.5548e-01 4.0159 0.0030368 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
residual std. dev. = 0.3048541 residual df = 9
> summary(mod.lm) # checkin against lm()
Call:
lm(formula = Employed ~ ., data = longley)
Residuals:
Min 1Q Median 3Q Max
-0.41011 -0.15767 -0.02816 0.10155 0.45539
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.482e+03 8.904e+02 -3.911 0.003560 **
GNP.deflator 1.506e-02 8.492e-02 0.177 0.863141
GNP -3.582e-02 3.349e-02 -1.070 0.312681
Unemployed -2.020e-02 4.884e-03 -4.136 0.002535 **
Armed.Forces -1.033e-02 2.143e-03 -4.822 0.000944 ***
Population -5.110e-02 2.261e-01 -0.226 0.826212
Year 1.829e+00 4.555e-01 4.016 0.003037 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.3049 on 9 degrees of freedom
Multiple R-squared: 0.9955, Adjusted R-squared: 0.9925
F-statistic: 330.3 on 6 and 9 DF, p-value: 4.984e-10