1### -*- mode: R ; delete-old-versions: never -*- 2 3## This program is free software; you can redistribute it and/or modify 4## it under the terms of the GNU General Public License as published by 5## the Free Software Foundation; either version 2 of the License, or 6## (at your option) any later version. 7## 8## This program is distributed in the hope that it will be useful, 9## but WITHOUT ANY WARRANTY; without even the implied warranty of 10## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11## GNU General Public License for more details. 12## 13## You should have received a copy of the GNU General Public License 14## along with this program; if not, a copy is available at 15## http://www.r-project.org/Licenses/ 16 17### From package 'Biobase' (has only rowMedians + rowQ) / 'matrixStats' 18### MM: all type checking now in C 19## --- TODO: implement hasNA=NA ==> do check maybe differently than = TRUE 20## --> ../src/rowMedians.c + ../src/rowMedians_TYPE-template.h 21colMedians <- function(x, na.rm=FALSE, hasNA=TRUE, keep.names=TRUE) 22 .Call(R_rowMedians, x, na.rm, hasNA, FALSE, keep.names) 23 24rowMedians <- function(x, na.rm=FALSE, hasNA=TRUE, keep.names=TRUE) 25 .Call(R_rowMedians, x, na.rm, hasNA, TRUE, keep.names) 26 27 28 29 30### Maria Anna di Palma, without consistency factor 15.11.2014 31### Fixes by Valentin Todorov 32### Martin Maechler: added mad() consistency factor, 27.11.2014 33### new name, class; more compatible to 'covMcd' 34 35covComed <- function (X, n.iter = 2, reweight = FALSE, 36 tolSolve = control$ tolSolve,# had 1e-10 hardwired {now 1e-14 default} 37 trace = control$ trace, 38 wgtFUN = control$ wgtFUN, 39 control = rrcov.control()) 40{ 41 ## ATTENTION ## 42 ## Med(abs(X))^2=Med(X*X) only if the number of rows is odd 43 d <- dim(X <- as.matrix(X)) 44 n <- d[1] 45 p <- d[2] 46 47 if(is.character(wgtFUN)) 48 wgtFUN <- .wgtFUN.covComed[[wgtFUN]](p=p, n=n, control) 49 if(!is.function(wgtFUN)) 50 stop("'wgtFUN' must be a function or a string specifying such a function") 51 52 madX <- apply(X, 2, mad) 53 I.mad <- 1/madX 54 rho <- I.mad * COM(X) * rep(I.mad, each = p) 55 ## better than 56 ## D <- diag(1/madX) 57 ## rho <- D %*% COM(X) %*% t(D) 58 59 U <- svd(rho, p, nv = 0L)$u 60 ## DD <- diag(madX) 61 ## Q <- DD %*% U 62 ## invQ <- solve(Q) ## == t(U) %*% D -- since U is orthogonal! 63 t.inv.Q <- I.mad * U # = t(solve(Q)) = t(t(U) * D) == t(D) U = D U 64 Z <- X %*% t.inv.Q ## much faster than for (i in 1:n) Z[i,] <- invQ %*% X[i,] 65 out <- comedian(rho, Z, X) 66 67 ## Mahalanobis distance 68 for(it in seq_len(n.iter))# allow n.iter = 0 69 out <- comedian(out$S., out$Z, X) 70 71 mm <- colMedians(out$Z) 72 mx <- drop(out$Q %*% mm) 73 ## MM: These are "raw" distances compared to covMcd() 74 mah <- mahalanobis(X, mx, out$S., tol = tolSolve) 75 76 ## compute weights 77 weights <- wgtFUN(mah) 78 covW <- cov.wt(X, wt=weights)[c("cov", "center", "n.obs")] 79 covW$weights <- 80 if(reweight) { ## above 'mah' = 'raw.mah' .. ==> allow another reweighting as in covMcd() 81 covW$raw.weights <- weights 82 covW$mah <- mahalanobis(X, covW$center, covW$cov, tol = tolSolve) 83 wgtFUN(mah) 84 } else # no re-weighting 85 weights 86 structure(class = "comed", 87 c(list(Z = out$Z, raw.cov = out$S., raw.center = mx, raw.mah = mah, 88 wgtFUN=wgtFUN), 89 covW)) 90} 91 92 93##' Martin Maechler's simple proposal for an *adaptive* cutoff 94##' i.e., one which does *not* reject outliers in good samples asymptotically 95.COM.adaptWgt.c <- function(n,p, eps = 0.2 / n^0.3) { 96 ## default eps ==> 1-eps(n=100) ~= 0.95; 1-eps(n=10) ~= 0.90 97 ## using upper tail: 98 1.4826 * qchisq(eps, p, lower.tail=FALSE) / qchisq(0.5, p) 99} 100 101## Default wgtFUN() constructors for covComed(): 102.wgtFUN.covComed <- 103 list("01.original" = function(p, ...) { 104 cMah <- .COM.adaptWgt.c(p=p, eps = 0.05)# 1 - eps = 0.95 105 function(d) as.numeric(d < median(d)*cMah) }, 106 "01.flex" = function(p, n, control) { ## 'beta' instead of 0.95 107 stopifnot(is.1num(beta <- control$beta), 0 <= beta, beta <= 1) 108 cMah <- 1.4826 * qchisq(beta, p) / qchisq(0.5, p) 109 function(d) as.numeric(d < median(d)*cMah) }, 110 "01.adaptive" = function(p, n, ...) { ## 'beta_n' instead of 0.975 111 cMah <- .COM.adaptWgt.c(n,p) 112 function(d) as.numeric(d < cMah) }, 113 "sm1.flex" = function(p, n, control) { ## 'beta' / smooth weight 114 stopifnot(is.1num(beta <- control$beta), 0 <= beta, beta <= 1) 115 cMah <- 1.4826 * qchisq(beta, p) / qchisq(0.5, p) 116 function(d) smoothWgt(d / median(d), c=cMah, h = 1) }, 117 "sm1.adaptive" = function(p, n, ...) { 118 cMah <- .COM.adaptWgt.c(n=n, p=p) 119 function(d) smoothWgt(d / median(d), c = cMah, h = 1) }, 120 "sm2.adaptive" = function(p, n, ...) { 121 cMah <- .COM.adaptWgt.c(n=n, p=p) 122 function(d) smoothWgt(d / median(d), c = cMah, h = 2) } 123 ) 124 125 126comedian <- function (rho, Z, X) 127{ 128 p <- ncol(X) 129 U <- svd(rho, nv = 0L)$u 130 madX <- apply(X, 2, mad) 131 I.mad <- 1/madX 132 ## D <- diag(madX) 133 ## Q <- D %*% U 134 Q <- madX * U 135 ## invQ <- solve(Q) 136 t.inv.Q <- I.mad * U # = t(solve(Q)) = t(t(U) * D) == t(D) U = D U 137 Z <- X %*% t.inv.Q ## for (i in 1:n) Z[i,] <- invQ %*% X[i,] 138 madZ <- apply(Z, 2, mad) 139 list(Q=Q, Z=Z, S. = tcrossprod(Q * rep(madZ, each=p))) ## better than 140 ## S. = Q %*% diag(madZ)^2 %*% t(Q) 141} 142 143COM <- function(X) 144{ 145 ## Comedian *with* consistency factor. Falk(1997) was without it. 146 147 stopifnot(is.1num(p <- ncol(X)), p >= 1) 148 med <- colMedians(X) 149 Y <- sweep(X, 2L, med, `-`) 150 COM <- matrix(0., p,p) 151 madY <- numeric(p) 152 for(i in 1:p) { 153 madY[i] <- madYi <- mad(Yi <- Y[,i]) 154 for(j in seq_len(i-1)) { # j <= i ==> madY[j] "exists" 155 COM[j,i] <- COM[i,j] <- median(Yi * Y[,j]) / (madYi * madY[j]) 156 ## COM[i,j] <- median((Y[,i])*(Y[,j])) 157 ## COM[i,j] <- (1.4826^2)*median((Y[,i])*(Y[,j])) 158 } 159 ## j == i : 160 COM[i,i] <- median(Yi^2) / (madYi^2) 161 } 162 ## return [ 1.4826 = formals(mad)$constant = consistency factor of mad()] 163 1.4826^2 * COM 164} 165