1#  File src/library/stats/R/lm.influence.R
2#  Part of the R package, https://www.R-project.org
3#
4#  Copyright (C) 1995-2020 The R Core Team
5#
6#  This program is free software; you can redistribute it and/or modify
7#  it under the terms of the GNU General Public License as published by
8#  the Free Software Foundation; either version 3 of the License, or
9#  (at your option) any later version.
10#
11#  This program is distributed in the hope that it will be useful,
12#  but WITHOUT ANY WARRANTY; without even the implied warranty of
13#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14#  GNU General Public License for more details.
15#
16#  A copy of the GNU General Public License is available at
17#  https://www.R-project.org/Licenses/
18
19### "lm"  *and*	 "glm"	 leave-one-out influence measures
20
21## this is mainly for back-compatibility (from "lsfit" time) -- use hatvalues()!
22hat <- function(x, intercept = TRUE)
23{
24    if(is.qr(x)) n <- nrow(x$qr)
25    else {
26	if(intercept) x <- cbind(1, x)
27	n <- nrow(x)
28	x <- qr(x)
29    }
30    rowSums(qr.qy(x, diag(1, nrow = n, ncol = x$rank))^2)
31}
32
33## see PR#7961, https://stat.ethz.ch/pipermail/r-devel/2011-January/059642.html
34weighted.residuals <- function(obj, drop0 = TRUE)
35{
36    w <- weights(obj)
37    r <- residuals(obj, type="deviance")
38    if(drop0 && !is.null(w)) {
39        if(is.matrix(r)) r[w != 0, , drop = FALSE] # e.g. mlm fit
40        else r[w != 0]
41    } else r
42}
43
44lm.influence <- function (model, do.coef = TRUE)
45{
46    wt.res <- weighted.residuals(model)
47    e <- na.omit(wt.res)
48    is.mlm <- is.matrix(e) # n x q  matrix in the multivariate lm case
49    if (model$rank == 0) {
50        n <- length(wt.res) # drops 0 wt, may drop NAs
51        sigma <- sqrt(deviance(model)/df.residual(model))
52        res <- list(hat = rep(0, n), coefficients = matrix(0, n, 0),
53                    sigma = rep(sigma, n))
54    } else {
55        ## if we have a point with hat = 1, the corresponding e should be
56        ## exactly zero.  Protect against returning Inf by forcing this
57        e[abs(e) < 100 * .Machine$double.eps * median(abs(e))] <- 0
58        mqr <- qr.lm(model)
59        n <- as.integer(nrow(mqr$qr))
60        if (is.na(n)) stop("invalid model QR matrix")
61        ## in na.exclude case, omit NAs; also drop 0-weight cases
62        if(NROW(e) != n)
63            stop("non-NA residual length does not match cases used in fitting")
64        do.coef <- as.logical(do.coef)
65        tol <- 10 * .Machine$double.eps
66        res <- .Call(C_influence, mqr, e, tol)
67        if (do.coef){
68            ok <- seq_len(mqr$rank) # need this for rank-deficient cases
69            Q <- qr.Q(mqr)[ , ok, drop=FALSE]
70            R <- qr.R(mqr)[ok, ok, drop=FALSE]
71            hat <- res$hat
72            invRQtt <- t(backsolve(R,t(Q)))
73            k <- NCOL(Q)
74            q <- NCOL(e)
75            ## NB: The following relies on recycling: diag(v) %*% A == A * v
76            ## so we need a for loop for the mlm case
77            if (is.mlm){
78                cf <- array(0, c(n,k,q))
79                for ( j in seq_len(q) )
80                    cf[,,j] <- invRQtt * ifelse(hat == 1, 0, e[,j]/(1-hat))
81            } else
82                cf <- invRQtt * ifelse(hat == 1, 0, e/(1-hat))
83
84
85            res$coefficients <- cf
86        }
87
88
89        drop1d <- function(a) { # more cautious variant of drop(.)
90            d <- dim(a)
91            if(length(d) == 3L && d[[3L]] == 1L)
92                dim(a) <- d[-3L]
93            a
94        }
95        if(is.null(model$na.action)) {
96            if(!is.mlm) { ## drop the 'q=1' array extent (from C)
97                res$sigma <- drop(res$sigma)
98                if(do.coef)
99                    res$coefficients <- drop1d(res$coefficients)
100            }
101        } else {
102            hat <- naresid(model$na.action, res$hat)
103            hat[is.na(hat)] <- 0       # omitted cases have 0 leverage
104            res$hat <- hat
105            if(do.coef) {
106                coefficients <- naresid(model$na.action, res$coefficients)
107                coefficients[is.na(coefficients)] <- 0 # omitted cases have 0 change
108                res$coefficients <- if(is.mlm) coefficients else drop1d(coefficients)
109            }
110            sigma <- naresid(model$na.action, res$sigma)
111            sigma[is.na(sigma)] <- sqrt(deviance(model)/df.residual(model))
112            res$sigma <- if(is.mlm) sigma else drop(sigma)
113        }
114    }
115    res$wt.res <- naresid(model$na.action, e)
116    res$hat[res$hat > 1 - 10*.Machine$double.eps] <- 1 # force 1
117    names(res$hat) <- names(res$sigma) <- names(res$wt.res)
118    if(do.coef) {
119	cf <- coef(model)
120	if(is.mlm) { # coef is 3d array
121	    dnr <- dimnames(res$wt.res)
122	    dimnames(res$coefficients) <- list(
123		dnr[[1L]],
124		rownames(cf)[!apply(cf, 1L, anyNA)],
125		dnr[[2L]])
126	} else {
127	    dimnames(res$coefficients) <- list(names(res$wt.res),
128					       names(cf)[!is.na(cf)])
129	}
130    }
131    res[c("hat", "coefficients", "sigma", "wt.res")] # ensure order, for backward compatibility and regression tests
132}
133
134## The following is adapted from John Fox's  "car" :
135influence <- function(model, ...) UseMethod("influence")
136influence.lm  <- function(model, do.coef = TRUE, ...)
137    lm.influence(model, do.coef = do.coef, ...)
138influence.glm <- function(model, do.coef = TRUE, ...) {
139    res <- lm.influence(model, do.coef = do.coef, ...)
140    pRes <- na.omit(residuals(model, type = "pearson"))[model$prior.weights != 0]
141    pRes <- naresid(model$na.action, pRes)
142    names(res)[names(res) == "wt.res"] <- "dev.res"
143    c(res, list(pear.res = pRes))
144}
145
146hatvalues <- function(model, ...) UseMethod("hatvalues")
147hatvalues.lm <- function(model, infl = lm.influence(model, do.coef=FALSE), ...) infl$hat
148
149rstandard <- function(model, ...) UseMethod("rstandard")
150rstandard.lm <- function(model, infl = lm.influence(model, do.coef=FALSE),
151                         sd = sqrt(deviance(model)/df.residual(model)),
152                         type = c("sd.1", "predictive"), ...)
153{
154    type <- match.arg(type)
155    res <- infl$wt.res / switch(type,
156				"sd.1" = c(outer(sqrt(1 - infl$hat), sd)),
157				"predictive" = 1 - infl$hat)
158    res[is.infinite(res)] <- NaN
159    res
160}
161
162### New version from Brett Presnell, March 2011
163### Slightly modified (dispersion bit) by pd
164rstandard.glm <-
165 function(model,
166          infl=influence(model, do.coef=FALSE),
167          type=c("deviance","pearson"), ...)
168{
169 type <- match.arg(type)
170 res <- switch(type, pearson = infl$pear.res, infl$dev.res)
171 res <- res/sqrt(summary(model)$dispersion * (1 - infl$hat))
172 res[is.infinite(res)] <- NaN
173 res
174}
175
176
177rstudent <- function(model, ...) UseMethod("rstudent")
178rstudent.lm <- function(model, infl = lm.influence(model, do.coef=FALSE),
179			res = infl$wt.res, ...)
180{
181    res <- res / (infl$sigma * sqrt(1 - infl$hat))
182    res[is.infinite(res)] <- NaN
183    res
184}
185
186rstudent.glm <- function(model, infl = influence(model, do.coef=FALSE), ...)
187{
188    r <- infl$dev.res
189    r <- sign(r) * sqrt(r^2 + (infl$hat * infl$pear.res^2)/(1 - infl$hat))
190    r[is.infinite(r)] <- NaN
191    if (any(family(model)$family == c("binomial", "poisson")))
192	r else r/infl$sigma
193}
194
195### FIXME for glm (see above) ?!?
196dffits <- function(model, infl = lm.influence(model, do.coef=FALSE),
197		   res = weighted.residuals(model))
198{
199    res <- res * sqrt(infl$hat)/(infl$sigma*(1-infl$hat))
200    res[is.infinite(res)] <- NaN
201    res
202}
203
204
205dfbeta <- function(model, ...) UseMethod("dfbeta")
206
207dfbeta.lm <- function(model, infl = lm.influence(model, do.coef=TRUE), ...)
208{
209    ## for lm & glm
210    b <- infl$coefficients
211    mlm <- is.matrix(wr <- infl$wt.res)
212    ## is this needed -- check!?
213    if(!mlm) dimnames(b) <- list(names(wr), variable.names(model))
214    b
215}
216
217dfbetas <- function(model, ...) UseMethod("dfbetas")
218
219dfbetas.lm <- function (model, infl = lm.influence(model, do.coef=TRUE), ...)
220{
221    ## for lm & glm
222    qrm <- qr(model)
223    xxi <- chol2inv(qrm$qr, qrm$rank)
224    db <- dfbeta(model, infl)
225    if(length(dim(db)) == 3L) db <- aperm(db, c(1L,3:2))
226    db / outer(infl$sigma, sqrt(diag(xxi)))
227}
228
229covratio <- function(model, infl = lm.influence(model, do.coef=FALSE),
230		     res = weighted.residuals(model))
231{
232    n <- nrow(qr.lm(model)$qr)
233    p <- model$rank
234    omh <- 1-infl$hat
235    e.star <- res/(infl$sigma*sqrt(omh))
236    e.star[is.infinite(e.star)] <- NaN
237    1/(omh*(((n - p - 1)+e.star^2)/(n - p))^p)
238}
239
240cooks.distance <- function(model, ...) UseMethod("cooks.distance")
241
242## Used in plot.lm(); allow passing of known parts; `infl' used only via `hat'
243cooks.distance.lm <-
244function(model, infl = lm.influence(model, do.coef=FALSE),
245	 res = weighted.residuals(model),
246	 sd = sqrt(deviance(model)/df.residual(model)),
247	 hat = infl$hat, ...)
248{
249    p <- model$rank
250    res <- ((res/c(outer(1 - hat, sd)))^2 * hat)/p
251    res[is.infinite(res)] <- NaN
252    res
253}
254
255cooks.distance.glm <-
256function(model, infl = influence(model, do.coef=FALSE),
257	 res = infl$pear.res,
258	 dispersion = summary(model)$dispersion, hat = infl$hat, ...)
259{
260    p <- model$rank
261    res <- (res/(1-hat))^2 * hat/(dispersion* p)
262    res[is.infinite(res)] <- NaN
263    res
264}
265
266influence.measures <- function(model, infl = influence(model))
267{
268    is.influential <- function(infmat, n)# n == sum(h > 0)  [!]
269    {
270	## Argument is result of using influence.measures
271        d <- dim(infmat)
272        k <- d[[length(d)]] - 4L
273        if(n <= k)
274            stop("too few cases i with h_ii > 0), n < k")
275        absmat <- abs(infmat)
276	r <-
277	    if(is.matrix(infmat)) { # usual non-mlm case
278		## a matrix  of logicals structured like the argument
279		cbind(absmat[, 1L:k] > 1, # |dfbetas| > 1
280		      absmat[, k + 1] > 3 * sqrt(k/(n - k)), # |dffit| > ..
281		      abs(1 - infmat[, k + 2]) > (3*k)/(n - k),# |1-cov.r| >..
282		      pf(infmat[, k + 3], k, n - k) > 0.5,# "P[cook.d..]" > .5
283		      infmat[, k + 4] > (3 * k)/n) # hat > 3k/n
284	    } else { # is.mlm: a 3d-array, not a matrix
285		## a 3d-array of logicals structured like the argument
286		c(absmat[,, 1L:k] > 1, # |dfbetas| > 1
287		  absmat[,, k + 1] > 3 * sqrt(k/(n - k)), # |dffit| > ..
288		  abs(1 - infmat[,, k + 2]) > (3*k)/(n - k),# |1-cov.r| >..
289		  pf(infmat[,, k + 3], k, n - k) > 0.5,# "P[cook.d..]" > .5
290		  infmat[,, k + 4] > (3 * k)/n) # hat > 3k/n
291	    }
292	attributes(r) <- attributes(infmat) # dim, dimnames, ..
293        r
294    }
295
296    p <- model$rank
297    e <- weighted.residuals(model)
298    s <- sqrt(sum(e^2, na.rm=TRUE)/df.residual(model))
299    mqr <- qr.lm(model)
300    xxi <- chol2inv(mqr$qr, mqr$rank)
301    si <- infl$sigma
302    h <- infl$hat
303    is.mlm <- is.matrix(e)
304    cf <- if(is.mlm) aperm(infl$coefficients, c(1L,3:2)) else infl$coefficients
305    dfbetas <- cf / outer(infl$sigma, sqrt(diag(xxi)))
306    vn <- variable.names(model); vn[vn == "(Intercept)"] <- "1_"
307    dimnames(dfbetas)[[length(dim(dfbetas))]] <- paste0("dfb.", abbreviate(vn))
308    ## Compatible to dffits():
309    dffits <- e*sqrt(h)/(si*(1-h))
310    if(any(ii <- is.infinite(dffits))) dffits[ii] <- NaN
311    cov.ratio <- (si/s)^(2 * p)/(1 - h)
312    cooks.d <-
313        if(inherits(model, "glm"))
314            (infl$pear.res/(1-h))^2 * h/(summary(model)$dispersion * p)
315        else # lm (incl "mlm")
316            ((e/(s * (1 - h)))^2 * h)/p
317    infmat <-
318	if(is.mlm) { #  a 3d-array, not a matrix
319	    dns <- dimnames(dfbetas)
320	    dns[[3]] <- c(dns[[3]], "dffit", "cov.r", "cook.d", "hat")
321	    a <- array(dfbetas, dim = dim(dfbetas) + c(0,0, 3+1),
322		       dimnames = dns)
323	    a[,, "dffit"] <- dffits
324	    a[,, "cov.r"] <- cov.ratio
325	    a[,,"cook.d"] <- cooks.d
326	    a[,, "hat"  ] <- h
327	    a
328	} else {
329	    cbind(dfbetas, dffit = dffits, cov.r = cov.ratio,
330		  cook.d = cooks.d, hat = h)
331	}
332    infmat[is.infinite(infmat)] <- NaN
333    is.inf <- is.influential(infmat, sum(h > 0))
334    ans <- list(infmat = infmat, is.inf = is.inf, call = model$call)
335    class(ans) <- "infl"
336    ans
337}
338
339print.infl <- function(x, digits = max(3L, getOption("digits") - 4L), ...)
340{
341    ## `x' : as the result of  influence.measures(.)
342    cat("Influence measures of\n\t", deparse(x$call),":\n\n")
343    is.star <- apply(x$is.inf, 1L, any, na.rm = TRUE)
344    print(data.frame(x$infmat,
345		     inf = ifelse(is.star, "*", " ")),
346	  digits = digits, ...)
347    invisible(x)
348}
349
350summary.infl <-
351    function(object, digits = max(2L, getOption("digits") - 5L), ...)
352{
353    ## object must be as the result of	influence.measures(.)
354    is.inf <- object$is.inf
355    isMLM <- length(dim(is.inf)) == 3L
356    ## will have NaN values from any hat=1 rows.
357    is.inf[is.na(is.inf)] <- FALSE
358    is.star <- apply(is.inf, 1L, any)
359    cat("Potentially influential observations of\n\t",
360	deparse(object$call),":\n")
361    if(any(is.star)) {
362	if(isMLM) {
363	    is.inf <- is.inf       [is.star,,]
364	    imat <- object $ infmat[is.star,, , drop = FALSE]
365	} else { # regular "lm"
366	    is.inf <- is.inf       [is.star, ]
367	    imat <- object $ infmat[is.star, , drop = FALSE]
368	}
369	rownam <- dimnames(object $ infmat)[[1L]] %||% format(seq(is.star))
370	dimnames(imat)[[1L]] <- rownam[is.star]
371	chmat <- format(round(imat, digits = digits))
372	cat("\n")
373	print(array(paste0(chmat, c("", "_*")[1L + is.inf]),
374		    dimnames = dimnames(imat), dim = dim(imat)),
375	      quote = FALSE)
376	invisible(imat)
377    } else {
378	cat("NONE\n")
379	numeric()
380    }
381}
382