1#### http://www.econ.kuleuven.be/public/NDBAE06/programs/roblog/ : 2#### 3#### August 06, 2010 2:14 PM 9121 BYlogreg.r.txt == BYlogreg.R (*this* original) 4#### May 04, 2005 9:24 AM 6702 BYlogreg.txt == BYlogreg.R.~2005~ 5#### May 04, 2005 9:25 AM 6720 WBYlogreg.txt == WBYlogreg.R 6#### 7#### Sep. 27, 2017: available at 8#### 9#### NB: Splus original Version of this file: BYlogreg.ssc in the 10#### -- in FunctionsRob/ (from FunctionsRob.zip) from Wiley's book supplements 11#### http://www.wiley.com/legacy/wileychi/robust_statistics/robust.html 12#### see my ../misc/MMY-book/Wiley-supplements/FunctionsRob/BYlogreg.ssc 13 14 15## Computation of the estimator of Bianco and Yohai (1996) in logistic regression 16## ------------- 17## Christophe Croux, Gentiane Haesbroeck 18## (thanks to Kristel Joossens and Valentin Todorov for improving the code) - 19## ==> Now "contains" both the *weighted* and regular, unweighted BY-estimator 20## 21## This program computes the estimator of Bianco and Yohai in 22## logistic regression. By default, an intercept term is included 23## and p parameters are estimated. 24## 25## For more details we refer to 26## Croux, C., and Haesbroeck, G. (2003), 27## ``Implementing the Bianco and Yohai estimator for Logistic Regression'', 28## Computational Statistics and Data Analysis, 44, 273-295 29## 30 31## Changes by Martin Maechler, ---> ../man/BYlogreg.Rd 32## ------------------ 33 34BYlogreg <- function(x0, y, initwml=TRUE, # w.x=NULL, 35 addIntercept=TRUE, 36 const=0.5, kmax = 1000, maxhalf = 10, 37 sigma.min = 1e-4, trace.lev=0) 38{ 39 if(!is.numeric(y)) 40 y <- as.numeric(y) 41 ## if(!is.null(w.x)) 42 ## warning("x weights 'w.x' are not yet made use of") 43 if(!is.null(dim(y))) { 44 if(ncol(y) != 1) 45 stop("y is not onedimensional") 46 y <- as.vector(y) 47 } 48 n <- length(y) 49 50 if(is.data.frame(x0)) { 51 x0 <- data.matrix(x0) 52 } else if (!is.matrix(x0)) { 53 x0 <- matrix(x0, length(x0), 1, 54 dimnames = list(names(x0), deparse(substitute(x0)))) 55 } 56 if(nrow(x0) != n) 57 stop("Number of observations in x and y not equal") 58 59 na.x <- !is.finite(rowSums(x0)) 60 na.y <- !is.finite(y) 61 ok <- !(na.x | na.y) 62 if(!all(ok)) { 63 x0 <- x0[ok, , drop = FALSE] 64 y <- y [ok] # y[ok, , drop = FALSE] 65 } 66 67 if(addIntercept) { 68 x <- cbind("Intercept" = 1, x0) 69 } else { # x0 := x without the "intercept column" 70 x <- x0 71 all1 <- apply(x == 1, 2, all) 72 if(any(all1)) 73 x0 <- x[,!all1, drop = FALSE] 74 else message("no intercept in the model") 75 } 76 77 dx <- dim(x) 78 n <- dx[1] 79 if(n == 0) 80 stop("All observations have missing values!") 81 p <- dx[2] # == ncol(x) 82 83 family <- binomial() 84 ## Computation of the initial value of the optimization process 85 gstart <- 86 if(initwml) { 87###_ FIXME: Should allow many more schemes: 88###_ 1) using MVE with much less singular cases 89###_ 2) Instead of {0,1}-weighting with cutoff, w/ weights --> 0 *continuously* 90### --> glm() with "prior" weights instead of 'subset' 91 92 ## hp <- floor(n*(1-0.25))+1 93 ## mcdx <- cov.mcd(x0, quantile.used =hp,method="mcd") 94 ## rdx=sqrt(mahalanobis(x0,center=mcdx$center,cov=mcdx$cov)) 95 ## mcdx <- CovMcd(x0, alpha=0.75) 96 ## rdx <- sqrt(getDistance(mcdx)) 97 mcd <- covMcd(x0, alpha=0.75) 98 ## ----- FIXME: argument! 99 D <- sqrt( mahalanobis(mcd$X, mcd$center, mcd$cov) ) 100 vc <- sqrt(qchisq(0.975, p-1)) 101 ## ----- FIXME: 'vc' should be argument! 102 wrd <- D <= vc 103### FIXME_2: use weights and "weights.on.x' as in Mqle ( ./glmrobMqle.R ) 104 ## glm(y~x0, family=binomial, subset = wrd)$coef 105 glm.fit(x[wrd,,drop=FALSE], y[wrd], family=family)$coef 106 } else { 107 glm.fit(x, y, family=family)$coef 108 } 109 110 sigma1 <- 1/sqrt(sum(gstart^2)) 111 xistart <- gstart*sigma1 112 stscores <- x %*% xistart 113 114 ## Initial value for the objective function 115 oldobj <- mean(phiBY3(stscores/sigma1, y, const)) 116 117 converged <- FALSE 118 kstep <- 1L 119 while(kstep < kmax && !converged) 120 { 121 unisig <- function(sigma) mean(phiBY3(stscores/sigma, y, const)) 122 ## ------ 123 optimsig <- nlminb(sigma1, unisig, lower=0)# "FIXME" arguments to nlminb() 124 ## ====== 125 if(trace.lev) cat(sprintf("k=%2d, s1=%12.8g: => new s1= %12.8g", 126 kstep, sigma1, optimsig$par))# MM: jhalf =!?= 1 here ?? 127 sigma1 <- optimsig$par 128 129 if(sigma1 < sigma.min) { 130 if(trace.lev) cat("\n") 131 warning(gettextf("Implosion: sigma1=%g became too small", sigma1)) 132 kstep <- kmax #-> *no* convergence 133 } else { 134 ## gamma1 <- xistart/sigma1 135 scores <- stscores/sigma1 136 newobj <- mean(phiBY3(scores, y,const)) 137 oldobj <- newobj 138 grad.BY <- colMeans((derphiBY3(scores,y,const) %*% matrix(1,ncol=p))*x) 139 h <- -grad.BY + (grad.BY %*% xistart) *xistart 140 finalstep <- h/sqrt(sum(h^2)) 141 142 if(trace.lev) { 143 if(trace.lev >= 2) cat(sprintf(", obj=%12.9g: ", oldobj)) 144 cat("\n") 145 } 146 147 ## FIXME repeat { ... } {{next 4 lines are also inside while(..) below}} 148 xi1 <- xistart+finalstep 149 xi1 <- xi1/sum(xi1^2) 150 scores1 <- (x %*% xi1)/sigma1 151 newobj <- mean(phiBY3(scores1,y,const)) 152 153 ## If 'newobj' is not better, try taking a smaller step size: 154 hstep <- 1. 155 jhalf <- 1L 156 while(jhalf <= maxhalf & newobj > oldobj) 157 { 158 hstep <- hstep/2 159 xi1 <- xistart+finalstep*hstep 160 xi1 <- xi1/sqrt(sum(xi1^2)) 161 scores1 <- x %*% xi1/sigma1 162 newobj <- mean(phiBY3(scores1,y,const)) 163 if(trace.lev >= 2) 164 cat(sprintf(" jh=%2d, hstep=%13.8g => new obj=%13.9g\n", 165 jhalf, hstep, newobj)) 166 jhalf <- jhalf+1L 167 } 168 169 converged <- 170 not.improved <- (jhalf > maxhalf && newobj > oldobj) 171 if(not.improved) { 172 ## newobj is "worse" and step halving did not improve 173 message("Convergence Achieved") 174 } else { 175 jhalf <- 1L 176 xistart <- xi1 177 oldobj <- newobj 178 stscores <- x %*% xi1 179 kstep <- kstep+1L 180 } 181 } 182 } ## while( kstep ) 183 184 if(kstep == kmax) { 185 warning(gettextf("No convergence in %d steps.", kstep), domain=NA) 186 list(convergence=FALSE, objective=0, coefficients= rep(NA,p)) 187 } else { 188 gammaest <- xistart/sigma1 189 V <- vcovBY3(x, y, const, estim=gammaest, addIntercept=FALSE) 190 list(convergence=TRUE, objective=oldobj, coefficients=gammaest, 191 cov = V, sterror = sqrt(diag(V)), 192 iter = kstep) 193 } 194} 195 196 197### -- FIXME: nlminb() allows many tweaks !! 198### -- ----- but we use nlminb() for ONE-dim. minimization over { sigma >= 0 } - really?? 199## MM: my version would rather use optimize() over over log(sigma) 200glmrobBY.control <- 201 function(maxit = 1000, const = 0.5, maxhalf = 10) 202 ## FIXME: sigma.min 203 ## MM: 'acc' seems a misnomer to me, but it's inherited from MASS::rlm 204 ## TODO acc = 1e-04, test.acc = "coef", tcc = 1.345) 205{ 206 ## if (!is.numeric(acc) || acc <= 0) 207 ## stop("value of acc must be > 0") 208 ## if (test.acc != "coef") 209 ## stop("Only 'test.acc = \"coef\"' is currently implemented") 210 ## if (!(any(test.vec == c("coef", "resid")))) 211 ## stop("invalid argument for test.acc") 212 if(!is.numeric(maxit) || maxit <= 0) 213 stop("maximum number of \"kstep\" iterations must be > 0") 214 if(!is.numeric(maxhalf) || maxhalf <= 0) 215 stop("maximal number of *inner* step halvings must be > 0") 216 ## if (!is.numeric(tcc) || tcc <= 0) 217 ## stop("value of the tuning constant c (tcc) must be > 0") 218 if(!is.numeric(const) || const <= 0) 219 stop("value of the tuning constant c ('const') must be > 0") 220 221 list(## acc = acc, consttest.acc = test.acc, 222 const=const, 223 maxhalf=maxhalf, 224 maxit=maxit #, tcc = tcc 225 ) 226} 227 228 229##' @param intercept logical, if true, X[,] has an intercept column which should 230##' not be used for rob.wts 231glmrobBY <- function(X, y, 232 weights = NULL, start = NULL, offset = NULL, 233 method = c("WBY","BY"), weights.on.x = "none", 234 control = glmrobBY.control(...), intercept = TRUE, 235 trace.lev = 0, ...) 236{ 237### THIS is *NOT* exported 238 239 method <- match.arg(method) 240 if(!is.null(weights) || any(weights != 1)) ## FIXME (?) 241 stop("non-trivial prior 'weights' are not yet implemented for \"BY\"") 242 if(!is.null(start)) 243 stop(" 'start' cannot yet be passed to glmrobBY()") 244 if(!is.null(offset)) 245 stop(" 'offset' is not yet implemented for \"BY\"") 246 const <- if(is.null(cc <- control$const )) 0.5 else cc 247 kmax <- if(is.null(cc <- control$maxit )) 1e3 else cc 248 maxhalf <- if(is.null(cc <- control$maxhalf)) 10 else cc 249 if(!identical(weights.on.x, "none")) 250 stop(gettextf("'weights.on.x = \"%s\"' is not implemented", 251 format(weights.on.x)), domain=NA) 252 ## w.x <- robXweights(weights.on.x, X=X, intercept=intercept) 253 ## 254 ## MM: all(?) the BY3() functions below would need to work with weights... 255 256 r <- BYlogreg(x0=X, y=y, initwml = (method == "WBY"), ## w.x=w.x, 257 addIntercept = !intercept, ## add intercept if there is none 258 const=const, kmax=kmax, maxhalf=maxhalf, 259 ## FIXME sigma.min (is currently x-scale dependent !????) 260 trace.lev=trace.lev) 261 ## FIXME: make result more "compatible" with other glmrob() methods 262 r 263} 264 265 266 267### Functions needed for the computation of estimator of Bianco and Yohai ---------------------- 268 269## From their paper: 270 271## A last remark is worth mentioning: when huge outliers occur in 272## the logistic regression setting, often numerical imprecision occurs in the computation 273## of the deviances given by 274## d(s;y_i)= -y_i log F(s) - (1-y_i) log{1-F(s)} . 275## 276## Instead of directly computing this expression, it can be seen that a 277## numerically more stable and accurate formula is given by 278## log(1 + exp(-abs(s))) + abs(s)* ((y-0.5)*s < 0) 279## in which the second term equals abs(s) if the observation is misclassified, 0 otherwise. 280dev1 <- function(s,y) log(1+exp(-abs(s))) + abs(s)*((y-0.5)*s<0) 281dev2 <- function(s,y) log1p(exp(-abs(s))) + abs(s)*((y-0.5)*s<0) 282dev3 <- function(s,y) -( y * plogis(s, log.p=TRUE) + 283 (1-y)*plogis(s, lower.tail=FALSE, log.p=TRUE)) 284## MM[FIXME]: first tests indicate that dev3() is clearly more accurate than 285## their dev1() !! 286## MM{FIXME2}: In code below have (or "had") three cases of same formula, but 287## with 's>0' instead of 's<0' : This is == dev?(-s, y) !! 288 289## for now, 100% back-compatibility: 290devBY <- dev1 291rm(dev1, dev2, dev3) 292 293## MM: This is from my vignette, but *not* used 294log1pexp <- function(x) { 295 if(has.na <- any(ina <- is.na(x))) { 296 y <- x 297 x <- x[ok <- !ina] 298 } 299 t1 <- x <= 18 300 t2 <- !t1 & (tt <- x <= 33.3) 301 r <- x 302 r[ t1] <- log1p(exp(x[t1])) 303 r[ t2] <- { x2 <- x[t2]; x2 + exp(-x2) } 304 r[!tt] <- x[!tt] 305 if(has.na) { y[ok] <- r ; y } else r 306} 307 308 309phiBY3 <- function(s,y,c3) 310{ 311 s <- as.double(s) 312 ## MM FIXME log(1 + exp(-.)) ... but read the note above !! --- 313 dev. <- devBY(s,y) 314 ## FIXME: GBY3Fs() computes the 'dev' above *again*, and 315 ## GBY3Fsm() does with 's>0' instead of 's<0' 316 rhoBY3(dev.,c3) + GBY3Fs(s,c3) + GBY3Fsm(s,c3) 317} 318 319rhoBY3 <- function(t,c3) 320{ 321 ec3 <- exp(-sqrt(c3)) 322 t*ec3* (t <= c3) + 323 (ec3*(2+(2*sqrt(c3))+c3) - 2*exp(-sqrt(t))*(1+sqrt(t)))* (t > c3) 324} 325 326psiBY3 <- function(t,c3) 327{ 328 exp(-sqrt(c3)) *(t <= c3) + 329 exp(-sqrt( t)) *(t > c3) 330} 331## MM: This is shorter (but possibly slower when most t are <= c3 : 332## psiBY3 <- function(t,c3) exp(-sqrt(pmax(t, c3))) 333 334##' d/dt psi(t, c3) 335derpsiBY3 <- function(t, c3) 336{ 337 r <- t 338 r[in. <- (t <= c3)] <- 0 339 if(any(out <- !in.)) { 340 t <- t[out] 341 st <- sqrt(t) 342 r[out] <- -exp(-st)/(2*st) 343 } 344 r 345} 346 347## MM: FIXME this is not used above 348sigmaBY3 <- function(sigma,s,y,c3) 349{ 350 mean(phiBY3(s/sigma,y,c3)) 351} 352 353derphiBY3 <- function(s,y,c3) 354{ 355 Fs <- exp(-devBY(s,1)) 356 ds <- Fs*(1-Fs) ## MM FIXME: use expm1() 357 dev. <- devBY(s,y) 358 Gprim1 <- devBY(s,1) 359 Gprim2 <- devBY(-s,1) 360 361 -psiBY3(dev.,c3)*(y-Fs) + ds*(psiBY3(Gprim1,c3) - psiBY3(Gprim2,c3)) 362} 363 364der2phiBY3 <- function(s, y, c3) 365{ 366 s <- as.double(s) 367 Fs <- exp(-devBY(s,1)) 368 ds <- Fs*(1-Fs) ## MM FIXME: use expm1() 369 dev. <- devBY(s,y) 370 Gprim1 <- devBY(s,1) 371 Gprim2 <- devBY(-s,1) 372 der2 <- derpsiBY3(dev.,c3)*(Fs-y)^2 + ds*psiBY3(dev.,c3) 373 der2 <- der2+ ds*(1-2*Fs)*(psiBY3(Gprim1,c3) - psiBY3(Gprim2,c3)) 374 der2 - ds*(derpsiBY3(Gprim1,c3)*(1-Fs) + 375 derpsiBY3(Gprim2,c3)* Fs ) 376} 377 378 379GBY3Fs <- function(s,c3) 380{ 381 e.f <- exp(0.25)*sqrt(pi) 382 ## MM FIXME: Fs = exp(..) and below use log(Fs) !! 383 Fs <- exp(-devBY(s,1)) 384 resGinf <- e.f*(pnorm(sqrt(2)*(0.5+sqrt(-log(Fs))))-1) 385 ## MM FIXME: use expm1(): 386 resGinf <- (resGinf+(Fs*exp(-sqrt(-log(Fs)))))*as.numeric(s <= -log(exp(c3)-1)) 387 resGsup <- ((Fs*exp(-sqrt(c3)))+(e.f*(pnorm(sqrt(2)*(0.5+sqrt(c3)))-1))) * 388 as.numeric(s > -log(exp(c3)-1)) 389 resGinf + resGsup 390} 391 392 393GBY3Fsm <- function(s,c3) 394{ 395 e.f <- exp(0.25)*sqrt(pi) 396 ## MM FIXME: Fsm = exp(..) and below use log(Fsm) !! 397 Fsm <- exp(-devBY(-s,1)) 398 resGinf <- e.f*(pnorm(sqrt(2)*(0.5+sqrt(-log(Fsm))))-1) 399 ## MM FIXME: use expm1(): 400 resGinf <- (resGinf+(Fsm*exp(-sqrt(-log(Fsm))))) * as.numeric(s >= log(exp(c3)-1)) 401 resGsup <- ((Fsm*exp(-sqrt(c3)))+(e.f*(pnorm(sqrt(2)*(0.5+sqrt(c3)))-1))) * 402 as.numeric(s < log(exp(c3)-1)) 403 resGinf + resGsup 404} 405 406## Compute the standard erros of the estimates - 407## this is done by estimating the asymptotic variance of the normal 408## limiting distribution of the BY estimator - as derived in Bianco 409## and Yohai (1996) 410## 411sterby3 <- function(x0, y, const, estim, addIntercept) 412{ 413 sqrt(diag(vcovBY3(x0, y, const=const, estim=estim, addIntercept=addIntercept))) 414} 415 416vcovBY3 <- function(z, y, const, estim, addIntercept) 417{ 418 stopifnot(length(dim(z)) == 2) 419 if(addIntercept) z <- cbind(1, z) 420 d <- dim(z) 421 n <- d[1] 422 p <- d[2] 423 argum <- z %*% estim 424 matM <- IFsqr <- matrix(0, p, p) 425 for(i in 1:n) 426 { 427 myscalar <- as.numeric(der2phiBY3(argum[i],y[i], c3=const)) 428 zzt <- tcrossprod(z[i,]) 429 matM <- matM + myscalar * zzt 430 IFsqr <- IFsqr + derphiBY3(argum[i],y[i], c3=const)^2 * zzt 431 } 432 433 matM <- matM/n 434 matMinv <- solve(matM) 435 IFsqr <- IFsqr/n 436 ## Now, asymp.cov = matMinv %*% IFsqr %*% t(matMinv) 437 438 ## provide vcov(): the full matrix 439 (matMinv %*% IFsqr %*% t(matMinv))/n 440} 441 442 443