1# File src/library/stats/R/lm.influence.R 2# Part of the R package, https://www.R-project.org 3# 4# Copyright (C) 1995-2020 The R Core Team 5# 6# This program is free software; you can redistribute it and/or modify 7# it under the terms of the GNU General Public License as published by 8# the Free Software Foundation; either version 3 of the License, or 9# (at your option) any later version. 10# 11# This program is distributed in the hope that it will be useful, 12# but WITHOUT ANY WARRANTY; without even the implied warranty of 13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14# GNU General Public License for more details. 15# 16# A copy of the GNU General Public License is available at 17# https://www.R-project.org/Licenses/ 18 19### "lm" *and* "glm" leave-one-out influence measures 20 21## this is mainly for back-compatibility (from "lsfit" time) -- use hatvalues()! 22hat <- function(x, intercept = TRUE) 23{ 24 if(is.qr(x)) n <- nrow(x$qr) 25 else { 26 if(intercept) x <- cbind(1, x) 27 n <- nrow(x) 28 x <- qr(x) 29 } 30 rowSums(qr.qy(x, diag(1, nrow = n, ncol = x$rank))^2) 31} 32 33## see PR#7961, https://stat.ethz.ch/pipermail/r-devel/2011-January/059642.html 34weighted.residuals <- function(obj, drop0 = TRUE) 35{ 36 w <- weights(obj) 37 r <- residuals(obj, type="deviance") 38 if(drop0 && !is.null(w)) { 39 if(is.matrix(r)) r[w != 0, , drop = FALSE] # e.g. mlm fit 40 else r[w != 0] 41 } else r 42} 43 44lm.influence <- function (model, do.coef = TRUE) 45{ 46 wt.res <- weighted.residuals(model) 47 e <- na.omit(wt.res) 48 is.mlm <- is.matrix(e) # n x q matrix in the multivariate lm case 49 if (model$rank == 0) { 50 n <- length(wt.res) # drops 0 wt, may drop NAs 51 sigma <- sqrt(deviance(model)/df.residual(model)) 52 res <- list(hat = rep(0, n), coefficients = matrix(0, n, 0), 53 sigma = rep(sigma, n)) 54 } else { 55 ## if we have a point with hat = 1, the corresponding e should be 56 ## exactly zero. Protect against returning Inf by forcing this 57 e[abs(e) < 100 * .Machine$double.eps * median(abs(e))] <- 0 58 mqr <- qr.lm(model) 59 n <- as.integer(nrow(mqr$qr)) 60 if (is.na(n)) stop("invalid model QR matrix") 61 ## in na.exclude case, omit NAs; also drop 0-weight cases 62 if(NROW(e) != n) 63 stop("non-NA residual length does not match cases used in fitting") 64 do.coef <- as.logical(do.coef) 65 tol <- 10 * .Machine$double.eps 66 res <- .Call(C_influence, mqr, e, tol) 67 if (do.coef){ 68 ok <- seq_len(mqr$rank) # need this for rank-deficient cases 69 Q <- qr.Q(mqr)[ , ok, drop=FALSE] 70 R <- qr.R(mqr)[ok, ok, drop=FALSE] 71 hat <- res$hat 72 invRQtt <- t(backsolve(R,t(Q))) 73 k <- NCOL(Q) 74 q <- NCOL(e) 75 ## NB: The following relies on recycling: diag(v) %*% A == A * v 76 ## so we need a for loop for the mlm case 77 if (is.mlm){ 78 cf <- array(0, c(n,k,q)) 79 for ( j in seq_len(q) ) 80 cf[,,j] <- invRQtt * ifelse(hat == 1, 0, e[,j]/(1-hat)) 81 } else 82 cf <- invRQtt * ifelse(hat == 1, 0, e/(1-hat)) 83 84 85 res$coefficients <- cf 86 } 87 88 89 drop1d <- function(a) { # more cautious variant of drop(.) 90 d <- dim(a) 91 if(length(d) == 3L && d[[3L]] == 1L) 92 dim(a) <- d[-3L] 93 a 94 } 95 if(is.null(model$na.action)) { 96 if(!is.mlm) { ## drop the 'q=1' array extent (from C) 97 res$sigma <- drop(res$sigma) 98 if(do.coef) 99 res$coefficients <- drop1d(res$coefficients) 100 } 101 } else { 102 hat <- naresid(model$na.action, res$hat) 103 hat[is.na(hat)] <- 0 # omitted cases have 0 leverage 104 res$hat <- hat 105 if(do.coef) { 106 coefficients <- naresid(model$na.action, res$coefficients) 107 coefficients[is.na(coefficients)] <- 0 # omitted cases have 0 change 108 res$coefficients <- if(is.mlm) coefficients else drop1d(coefficients) 109 } 110 sigma <- naresid(model$na.action, res$sigma) 111 sigma[is.na(sigma)] <- sqrt(deviance(model)/df.residual(model)) 112 res$sigma <- if(is.mlm) sigma else drop(sigma) 113 } 114 } 115 res$wt.res <- naresid(model$na.action, e) 116 res$hat[res$hat > 1 - 10*.Machine$double.eps] <- 1 # force 1 117 names(res$hat) <- names(res$sigma) <- names(res$wt.res) 118 if(do.coef) { 119 cf <- coef(model) 120 if(is.mlm) { # coef is 3d array 121 dnr <- dimnames(res$wt.res) 122 dimnames(res$coefficients) <- list( 123 dnr[[1L]], 124 rownames(cf)[!apply(cf, 1L, anyNA)], 125 dnr[[2L]]) 126 } else { 127 dimnames(res$coefficients) <- list(names(res$wt.res), 128 names(cf)[!is.na(cf)]) 129 } 130 } 131 res[c("hat", "coefficients", "sigma", "wt.res")] # ensure order, for backward compatibility and regression tests 132} 133 134## The following is adapted from John Fox's "car" : 135influence <- function(model, ...) UseMethod("influence") 136influence.lm <- function(model, do.coef = TRUE, ...) 137 lm.influence(model, do.coef = do.coef, ...) 138influence.glm <- function(model, do.coef = TRUE, ...) { 139 res <- lm.influence(model, do.coef = do.coef, ...) 140 pRes <- na.omit(residuals(model, type = "pearson"))[model$prior.weights != 0] 141 pRes <- naresid(model$na.action, pRes) 142 names(res)[names(res) == "wt.res"] <- "dev.res" 143 c(res, list(pear.res = pRes)) 144} 145 146hatvalues <- function(model, ...) UseMethod("hatvalues") 147hatvalues.lm <- function(model, infl = lm.influence(model, do.coef=FALSE), ...) infl$hat 148 149rstandard <- function(model, ...) UseMethod("rstandard") 150rstandard.lm <- function(model, infl = lm.influence(model, do.coef=FALSE), 151 sd = sqrt(deviance(model)/df.residual(model)), 152 type = c("sd.1", "predictive"), ...) 153{ 154 type <- match.arg(type) 155 res <- infl$wt.res / switch(type, 156 "sd.1" = c(outer(sqrt(1 - infl$hat), sd)), 157 "predictive" = 1 - infl$hat) 158 res[is.infinite(res)] <- NaN 159 res 160} 161 162### New version from Brett Presnell, March 2011 163### Slightly modified (dispersion bit) by pd 164rstandard.glm <- 165 function(model, 166 infl=influence(model, do.coef=FALSE), 167 type=c("deviance","pearson"), ...) 168{ 169 type <- match.arg(type) 170 res <- switch(type, pearson = infl$pear.res, infl$dev.res) 171 res <- res/sqrt(summary(model)$dispersion * (1 - infl$hat)) 172 res[is.infinite(res)] <- NaN 173 res 174} 175 176 177rstudent <- function(model, ...) UseMethod("rstudent") 178rstudent.lm <- function(model, infl = lm.influence(model, do.coef=FALSE), 179 res = infl$wt.res, ...) 180{ 181 res <- res / (infl$sigma * sqrt(1 - infl$hat)) 182 res[is.infinite(res)] <- NaN 183 res 184} 185 186rstudent.glm <- function(model, infl = influence(model, do.coef=FALSE), ...) 187{ 188 r <- infl$dev.res 189 r <- sign(r) * sqrt(r^2 + (infl$hat * infl$pear.res^2)/(1 - infl$hat)) 190 r[is.infinite(r)] <- NaN 191 if (any(family(model)$family == c("binomial", "poisson"))) 192 r else r/infl$sigma 193} 194 195### FIXME for glm (see above) ?!? 196dffits <- function(model, infl = lm.influence(model, do.coef=FALSE), 197 res = weighted.residuals(model)) 198{ 199 res <- res * sqrt(infl$hat)/(infl$sigma*(1-infl$hat)) 200 res[is.infinite(res)] <- NaN 201 res 202} 203 204 205dfbeta <- function(model, ...) UseMethod("dfbeta") 206 207dfbeta.lm <- function(model, infl = lm.influence(model, do.coef=TRUE), ...) 208{ 209 ## for lm & glm 210 b <- infl$coefficients 211 mlm <- is.matrix(wr <- infl$wt.res) 212 ## is this needed -- check!? 213 if(!mlm) dimnames(b) <- list(names(wr), variable.names(model)) 214 b 215} 216 217dfbetas <- function(model, ...) UseMethod("dfbetas") 218 219dfbetas.lm <- function (model, infl = lm.influence(model, do.coef=TRUE), ...) 220{ 221 ## for lm & glm 222 qrm <- qr(model) 223 xxi <- chol2inv(qrm$qr, qrm$rank) 224 db <- dfbeta(model, infl) 225 if(length(dim(db)) == 3L) db <- aperm(db, c(1L,3:2)) 226 db / outer(infl$sigma, sqrt(diag(xxi))) 227} 228 229covratio <- function(model, infl = lm.influence(model, do.coef=FALSE), 230 res = weighted.residuals(model)) 231{ 232 n <- nrow(qr.lm(model)$qr) 233 p <- model$rank 234 omh <- 1-infl$hat 235 e.star <- res/(infl$sigma*sqrt(omh)) 236 e.star[is.infinite(e.star)] <- NaN 237 1/(omh*(((n - p - 1)+e.star^2)/(n - p))^p) 238} 239 240cooks.distance <- function(model, ...) UseMethod("cooks.distance") 241 242## Used in plot.lm(); allow passing of known parts; `infl' used only via `hat' 243cooks.distance.lm <- 244function(model, infl = lm.influence(model, do.coef=FALSE), 245 res = weighted.residuals(model), 246 sd = sqrt(deviance(model)/df.residual(model)), 247 hat = infl$hat, ...) 248{ 249 p <- model$rank 250 res <- ((res/c(outer(1 - hat, sd)))^2 * hat)/p 251 res[is.infinite(res)] <- NaN 252 res 253} 254 255cooks.distance.glm <- 256function(model, infl = influence(model, do.coef=FALSE), 257 res = infl$pear.res, 258 dispersion = summary(model)$dispersion, hat = infl$hat, ...) 259{ 260 p <- model$rank 261 res <- (res/(1-hat))^2 * hat/(dispersion* p) 262 res[is.infinite(res)] <- NaN 263 res 264} 265 266influence.measures <- function(model, infl = influence(model)) 267{ 268 is.influential <- function(infmat, n)# n == sum(h > 0) [!] 269 { 270 ## Argument is result of using influence.measures 271 d <- dim(infmat) 272 k <- d[[length(d)]] - 4L 273 if(n <= k) 274 stop("too few cases i with h_ii > 0), n < k") 275 absmat <- abs(infmat) 276 r <- 277 if(is.matrix(infmat)) { # usual non-mlm case 278 ## a matrix of logicals structured like the argument 279 cbind(absmat[, 1L:k] > 1, # |dfbetas| > 1 280 absmat[, k + 1] > 3 * sqrt(k/(n - k)), # |dffit| > .. 281 abs(1 - infmat[, k + 2]) > (3*k)/(n - k),# |1-cov.r| >.. 282 pf(infmat[, k + 3], k, n - k) > 0.5,# "P[cook.d..]" > .5 283 infmat[, k + 4] > (3 * k)/n) # hat > 3k/n 284 } else { # is.mlm: a 3d-array, not a matrix 285 ## a 3d-array of logicals structured like the argument 286 c(absmat[,, 1L:k] > 1, # |dfbetas| > 1 287 absmat[,, k + 1] > 3 * sqrt(k/(n - k)), # |dffit| > .. 288 abs(1 - infmat[,, k + 2]) > (3*k)/(n - k),# |1-cov.r| >.. 289 pf(infmat[,, k + 3], k, n - k) > 0.5,# "P[cook.d..]" > .5 290 infmat[,, k + 4] > (3 * k)/n) # hat > 3k/n 291 } 292 attributes(r) <- attributes(infmat) # dim, dimnames, .. 293 r 294 } 295 296 p <- model$rank 297 e <- weighted.residuals(model) 298 s <- sqrt(sum(e^2, na.rm=TRUE)/df.residual(model)) 299 mqr <- qr.lm(model) 300 xxi <- chol2inv(mqr$qr, mqr$rank) 301 si <- infl$sigma 302 h <- infl$hat 303 is.mlm <- is.matrix(e) 304 cf <- if(is.mlm) aperm(infl$coefficients, c(1L,3:2)) else infl$coefficients 305 dfbetas <- cf / outer(infl$sigma, sqrt(diag(xxi))) 306 vn <- variable.names(model); vn[vn == "(Intercept)"] <- "1_" 307 dimnames(dfbetas)[[length(dim(dfbetas))]] <- paste0("dfb.", abbreviate(vn)) 308 ## Compatible to dffits(): 309 dffits <- e*sqrt(h)/(si*(1-h)) 310 if(any(ii <- is.infinite(dffits))) dffits[ii] <- NaN 311 cov.ratio <- (si/s)^(2 * p)/(1 - h) 312 cooks.d <- 313 if(inherits(model, "glm")) 314 (infl$pear.res/(1-h))^2 * h/(summary(model)$dispersion * p) 315 else # lm (incl "mlm") 316 ((e/(s * (1 - h)))^2 * h)/p 317 infmat <- 318 if(is.mlm) { # a 3d-array, not a matrix 319 dns <- dimnames(dfbetas) 320 dns[[3]] <- c(dns[[3]], "dffit", "cov.r", "cook.d", "hat") 321 a <- array(dfbetas, dim = dim(dfbetas) + c(0,0, 3+1), 322 dimnames = dns) 323 a[,, "dffit"] <- dffits 324 a[,, "cov.r"] <- cov.ratio 325 a[,,"cook.d"] <- cooks.d 326 a[,, "hat" ] <- h 327 a 328 } else { 329 cbind(dfbetas, dffit = dffits, cov.r = cov.ratio, 330 cook.d = cooks.d, hat = h) 331 } 332 infmat[is.infinite(infmat)] <- NaN 333 is.inf <- is.influential(infmat, sum(h > 0)) 334 ans <- list(infmat = infmat, is.inf = is.inf, call = model$call) 335 class(ans) <- "infl" 336 ans 337} 338 339print.infl <- function(x, digits = max(3L, getOption("digits") - 4L), ...) 340{ 341 ## `x' : as the result of influence.measures(.) 342 cat("Influence measures of\n\t", deparse(x$call),":\n\n") 343 is.star <- apply(x$is.inf, 1L, any, na.rm = TRUE) 344 print(data.frame(x$infmat, 345 inf = ifelse(is.star, "*", " ")), 346 digits = digits, ...) 347 invisible(x) 348} 349 350summary.infl <- 351 function(object, digits = max(2L, getOption("digits") - 5L), ...) 352{ 353 ## object must be as the result of influence.measures(.) 354 is.inf <- object$is.inf 355 isMLM <- length(dim(is.inf)) == 3L 356 ## will have NaN values from any hat=1 rows. 357 is.inf[is.na(is.inf)] <- FALSE 358 is.star <- apply(is.inf, 1L, any) 359 cat("Potentially influential observations of\n\t", 360 deparse(object$call),":\n") 361 if(any(is.star)) { 362 if(isMLM) { 363 is.inf <- is.inf [is.star,,] 364 imat <- object $ infmat[is.star,, , drop = FALSE] 365 } else { # regular "lm" 366 is.inf <- is.inf [is.star, ] 367 imat <- object $ infmat[is.star, , drop = FALSE] 368 } 369 rownam <- dimnames(object $ infmat)[[1L]] %||% format(seq(is.star)) 370 dimnames(imat)[[1L]] <- rownam[is.star] 371 chmat <- format(round(imat, digits = digits)) 372 cat("\n") 373 print(array(paste0(chmat, c("", "_*")[1L + is.inf]), 374 dimnames = dimnames(imat), dim = dim(imat)), 375 quote = FALSE) 376 invisible(imat) 377 } else { 378 cat("NONE\n") 379 numeric() 380 } 381} 382