1#### http://www.econ.kuleuven.be/public/NDBAE06/programs/roblog/ :
2####
3#### August 06, 2010  2:14 PM 9121 BYlogreg.r.txt == BYlogreg.R (*this* original)
4####    May 04, 2005  9:24 AM 6702 BYlogreg.txt   == BYlogreg.R.~2005~
5####    May 04, 2005  9:25 AM 6720 WBYlogreg.txt  == WBYlogreg.R
6####
7#### Sep. 27, 2017: available at
8####
#### NB: S-PLUS original version of this file:  BYlogreg.ssc,
#### --  in FunctionsRob/ (from FunctionsRob.zip), from Wiley's book supplements
11#### http://www.wiley.com/legacy/wileychi/robust_statistics/robust.html
12#### see my ../misc/MMY-book/Wiley-supplements/FunctionsRob/BYlogreg.ssc
13
14
15##  Computation of the estimator of Bianco and Yohai (1996) in logistic regression
16##  -------------
17##  Christophe Croux, Gentiane Haesbroeck
##  (thanks to Kristel Joossens and Valentin Todorov for improving the code)
##  ==> Now "contains" both the *weighted* and the regular, unweighted BY-estimator
20##
21##  This program computes the estimator of Bianco and Yohai in
22##  logistic regression. By default, an intercept term is included
23##  and p parameters are estimated.
24##
25##  For more details we refer to
26##     Croux, C., and Haesbroeck, G. (2003),
27##     ``Implementing the Bianco and Yohai estimator for Logistic Regression'',
28##     Computational Statistics and Data Analysis, 44, 273-295
29##
30
31## Changes by Martin Maechler, ---> ../man/BYlogreg.Rd
32##                                  ------------------
33
34BYlogreg <- function(x0, y, initwml=TRUE, # w.x=NULL,
35                     addIntercept=TRUE,
36                     const=0.5, kmax = 1000, maxhalf = 10,
37                     sigma.min = 1e-4, trace.lev=0)
38{
39    if(!is.numeric(y))
40        y <- as.numeric(y)
41    ## if(!is.null(w.x))
42    ##     warning("x weights  'w.x'  are not yet made use of")
43    if(!is.null(dim(y))) {
44        if(ncol(y) != 1)
            stop("y is not one-dimensional")
46        y <- as.vector(y)
47    }
48    n <- length(y)
49
50    if(is.data.frame(x0)) {
51        x0 <- data.matrix(x0)
52    } else if (!is.matrix(x0)) {
53        x0 <- matrix(x0, length(x0), 1,
54                     dimnames = list(names(x0), deparse(substitute(x0))))
55    }
56    if(nrow(x0) != n)
57        stop("Number of observations in x and y not equal")
58
59    na.x <- !is.finite(rowSums(x0))
60    na.y <- !is.finite(y)
61    ok <- !(na.x | na.y)
62    if(!all(ok)) {
63        x0 <- x0[ok, , drop = FALSE]
64        y  <- y [ok] # y[ok, , drop = FALSE]
65    }
66
67    if(addIntercept) {
68        x <- cbind("Intercept" = 1, x0)
69    } else { # x0 := x without the  "intercept column"
70        x <- x0
71        all1 <- apply(x == 1, 2, all)
72        if(any(all1))
73            x0 <- x[,!all1, drop = FALSE]
74        else message("no intercept in the model")
75    }
76
77    dx <- dim(x)
78    n <- dx[1]
79    if(n == 0)
80	stop("All observations have missing values!")
81    p <- dx[2] # == ncol(x)
82
83    family <- binomial()
84    ## Computation of the initial value of the optimization process
85    gstart <-
86        if(initwml) {
87###_ FIXME: Should allow many more schemes:
88###_ 1) using MVE with much less singular cases
89###_ 2) Instead of {0,1}-weighting with cutoff, w/ weights --> 0 *continuously*
90###     --> glm() with "prior" weights instead of 'subset'
91
92            ## hp <- floor(n*(1-0.25))+1
93            ##        mcdx <- cov.mcd(x0, quantile.used =hp,method="mcd")
94            ##        rdx=sqrt(mahalanobis(x0,center=mcdx$center,cov=mcdx$cov))
95            ## mcdx <- CovMcd(x0, alpha=0.75)
96            ## rdx  <- sqrt(getDistance(mcdx))
97            mcd <- covMcd(x0, alpha=0.75)
98            ##                      -----  FIXME: argument!
99            D <- sqrt( mahalanobis(mcd$X, mcd$center, mcd$cov) )
100            vc  <- sqrt(qchisq(0.975, p-1))
101            ##                 -----       FIXME: 'vc' should be argument!
102            wrd <- D <= vc
103### FIXME_2: use  weights and "weights.on.x'  as in Mqle ( ./glmrobMqle.R )
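### A sketch of scheme 2) above (*not* used): replace the hard {0,1}-cutoff
### 'wrd' by weights that decrease continuously with the robust distance D
### and pass them as prior weights; the weight function here is just one
### plausible choice (an assumption), not part of the original code:
###     w.cont <- pmin(1, (vc/D)^2)
###     glm.fit(x, y, weights = w.cont, family = family)$coef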
104	    ## glm(y~x0, family=binomial, subset = wrd)$coef
105	    glm.fit(x[wrd,,drop=FALSE], y[wrd], family=family)$coef
106	} else {
107	    glm.fit(x, y, family=family)$coef
108        }
109
110    sigma1 <- 1/sqrt(sum(gstart^2))
111    xistart <- gstart*sigma1
112    stscores <- x %*% xistart
113
114    ## Initial value for the objective function
115    oldobj <- mean(phiBY3(stscores/sigma1, y, const))
116
117    converged <- FALSE
118    kstep <- 1L
119    while(kstep < kmax && !converged)
120    {
121        unisig <- function(sigma) mean(phiBY3(stscores/sigma, y, const))
122        ##                             ------
123        optimsig <- nlminb(sigma1, unisig, lower=0)# "FIXME" arguments to nlminb()
124        ##          ======
125        if(trace.lev) cat(sprintf("k=%2d, s1=%12.8g: => new s1= %12.8g",
126                                  kstep, sigma1, optimsig$par))# MM: jhalf =!?= 1 here ??
127        sigma1 <- optimsig$par
128
129        if(sigma1 < sigma.min) {
130            if(trace.lev) cat("\n")
131            warning(gettextf("Implosion: sigma1=%g became too small", sigma1))
132            kstep <- kmax #-> *no* convergence
133        } else {
134            ## gamma1 <- xistart/sigma1
135            scores <- stscores/sigma1
136            newobj <- mean(phiBY3(scores, y,const))
137            oldobj <- newobj
138            grad.BY <- colMeans((derphiBY3(scores,y,const) %*% matrix(1,ncol=p))*x)
139            h <- -grad.BY + (grad.BY %*% xistart) *xistart
140            finalstep <- h/sqrt(sum(h^2))
141
142            if(trace.lev) {
143                if(trace.lev >= 2) cat(sprintf(", obj=%12.9g: ", oldobj))
144                cat("\n")
145            }
146
147            ## FIXME repeat { ... }   {{next 4 lines are also inside while(..) below}}
148            xi1 <- xistart+finalstep
            xi1 <- xi1/sqrt(sum(xi1^2)) # normalize to unit length (as below)
150            scores1 <- (x %*% xi1)/sigma1
151            newobj <- mean(phiBY3(scores1,y,const))
152
153            ## If 'newobj' is not better, try taking a smaller step size:
154            hstep <- 1.
155            jhalf <- 1L
            while(jhalf <= maxhalf && newobj > oldobj)
157            {
158                hstep <- hstep/2
159                xi1 <- xistart+finalstep*hstep
160                xi1 <- xi1/sqrt(sum(xi1^2))
161                scores1 <- x %*% xi1/sigma1
162                newobj <- mean(phiBY3(scores1,y,const))
163                if(trace.lev >= 2)
164                    cat(sprintf("  jh=%2d, hstep=%13.8g => new obj=%13.9g\n",
165                                jhalf, hstep, newobj))
166                jhalf <- jhalf+1L
167            }
168
169            converged <-
170                not.improved <- (jhalf > maxhalf && newobj > oldobj)
171            if(not.improved) {
172                ## newobj is "worse" and step halving did not improve
173                message("Convergence Achieved")
174            } else {
175                jhalf <- 1L
176                xistart <- xi1
177                oldobj <- newobj
178                stscores <- x %*% xi1
179                kstep <- kstep+1L
180            }
181        }
182    } ## while( kstep )
183
184    if(kstep == kmax) {
185        warning(gettextf("No convergence in %d steps.", kstep), domain=NA)
186        list(convergence=FALSE, objective=0, coefficients= rep(NA,p))
187    } else {
188        gammaest <- xistart/sigma1
189        V <- vcovBY3(x, y, const, estim=gammaest, addIntercept=FALSE)
190        list(convergence=TRUE, objective=oldobj, coefficients=gammaest,
191             cov = V, sterror = sqrt(diag(V)),
192             iter = kstep)
193    }
194}
195
196
197### -- FIXME: nlminb() allows many tweaks !!
198### -- -----  but we use nlminb() for ONE-dim. minimization over { sigma >= 0 } - really??
##  MM: my version would rather use  optimize() over  log(sigma)
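##  A sketch of that alternative (referring to the 'kstep' loop inside
##  BYlogreg() above; *not* used, and the search interval is an ad hoc assumption):
##    op <- optimize(function(ls) mean(phiBY3(stscores/exp(ls), y, const)),
##                   interval = log(sigma1) + c(-10, 10))
##    sigma1 <- exp(op$minimum)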
200glmrobBY.control <-
201    function(maxit = 1000, const = 0.5, maxhalf = 10)
202    ## FIXME: sigma.min
203    ## MM: 'acc' seems a misnomer to me, but it's inherited from  MASS::rlm
204    ## TODO acc = 1e-04, test.acc = "coef", tcc = 1.345)
205{
206    ## if (!is.numeric(acc) || acc <= 0)
207    ##     stop("value of acc must be > 0")
208    ## if (test.acc != "coef")
209    ##     stop("Only 'test.acc = \"coef\"' is currently implemented")
210    ## if (!(any(test.vec == c("coef", "resid"))))
211    ##	  stop("invalid argument for test.acc")
212    if(!is.numeric(maxit) || maxit <= 0)
213	stop("maximum number of \"kstep\" iterations must be > 0")
214    if(!is.numeric(maxhalf) || maxhalf <= 0)
215	stop("maximal number of *inner* step halvings must be > 0")
216    ## if (!is.numeric(tcc) || tcc <= 0)
217    ##     stop("value of the tuning constant c (tcc) must be > 0")
218    if(!is.numeric(const) || const <= 0)
219	stop("value of the tuning constant c ('const') must be > 0")
220
   list(## acc = acc, test.acc = test.acc,
222         const=const,
223         maxhalf=maxhalf,
224         maxit=maxit #, tcc = tcc
225         )
226}
227
228
##' @param intercept logical; if TRUE, X[,] has an intercept column which should
##'                  not be used for rob.wts
231glmrobBY <- function(X, y,
232                     weights = NULL, start = NULL, offset = NULL,
233                     method = c("WBY","BY"), weights.on.x = "none",
234                     control = glmrobBY.control(...), intercept = TRUE,
235                     trace.lev = 0, ...)
236{
237### THIS is *NOT* exported
238
239    method <- match.arg(method)
    if(!is.null(weights) && any(weights != 1)) ## i.e., only stop for non-trivial weights
241        stop("non-trivial prior 'weights' are not yet implemented for \"BY\"")
242    if(!is.null(start))
243        stop(" 'start' cannot yet be passed to glmrobBY()")
244    if(!is.null(offset))
245        stop(" 'offset' is not yet implemented for \"BY\"")
246    const   <- if(is.null(cc <- control$const  )) 0.5 else cc
247    kmax    <- if(is.null(cc <- control$maxit  )) 1e3 else cc
248    maxhalf <- if(is.null(cc <- control$maxhalf))  10 else cc
249    if(!identical(weights.on.x, "none"))
250        stop(gettextf("'weights.on.x = \"%s\"' is not implemented",
251                      format(weights.on.x)), domain=NA)
252    ## w.x <- robXweights(weights.on.x, X=X, intercept=intercept)
253    ##
254    ## MM: all(?) the  BY3() functions below would need to work with weights...
255
256    r <- BYlogreg(x0=X, y=y, initwml = (method == "WBY"), ## w.x=w.x,
257		  addIntercept = !intercept, ## add intercept if there is none
258		  const=const, kmax=kmax, maxhalf=maxhalf,
259		  ## FIXME sigma.min  (is currently x-scale dependent !????)
260		  trace.lev=trace.lev)
261    ## FIXME: make result more "compatible" with other glmrob() methods
262    r
263}
264
265
266
### Functions needed for the computation of the estimator of Bianco and Yohai ------------------
268
269## From their paper:
270
271## A last remark is worth mentioning: when huge outliers occur in
272## the logistic regression setting, often numerical imprecision occurs in the computation
273## of the deviances given by
274##    d(s;y_i)= -y_i log F(s) - (1-y_i) log{1-F(s)} .
275##
276## Instead of directly computing this expression, it can be seen that a
277## numerically more stable and accurate formula is given by
278##    log(1 + exp(-abs(s))) + abs(s)* ((y-0.5)*s < 0)
279## in which the second term equals abs(s) if the observation is misclassified, 0 otherwise.
280dev1 <- function(s,y) log(1+exp(-abs(s))) + abs(s)*((y-0.5)*s<0)
281dev2 <- function(s,y) log1p(exp(-abs(s))) + abs(s)*((y-0.5)*s<0)
282dev3 <- function(s,y) -( y  * plogis(s, log.p=TRUE) +
283                        (1-y)*plogis(s, lower.tail=FALSE, log.p=TRUE))
284## MM[FIXME]: first tests indicate that  dev3() is clearly more accurate than
285##            their dev1() !!
## MM[FIXME2]: The code below has (or "had") three cases of the same formula, but
##             with 's>0' instead of 's<0' : this is == dev?(-s, y) !!
288
289## for now,  100% back-compatibility:
290devBY <- dev1
291rm(dev1, dev2, dev3)
292
293## MM: This is from my vignette, but  *not* used
294log1pexp <- function(x) {
295    if(has.na <- any(ina <- is.na(x))) {
296	y <- x
297	x <- x[ok <- !ina]
298    }
299    t1 <- x <= 18
300    t2 <- !t1 & (tt <- x <= 33.3)
301    r <- x
302    r[ t1] <- log1p(exp(x[t1]))
303    r[ t2] <- { x2 <- x[t2]; x2 + exp(-x2) }
304    r[!tt] <- x[!tt]
305    if(has.na) { y[ok] <- r ; y } else r
306}
307
308
309phiBY3 <- function(s,y,c3)
310{
311  s <- as.double(s)
312  ## MM FIXME  log(1 + exp(-.)) ... but read the note above !! ---
313  dev. <- devBY(s,y)
314  ## FIXME: GBY3Fs()  computes the 'dev' above *again*, and
315  ##        GBY3Fsm() does with 's>0' instead of 's<0'
316  rhoBY3(dev.,c3) + GBY3Fs(s,c3) + GBY3Fsm(s,c3)
317}
318
319rhoBY3 <- function(t,c3)
320{
321    ec3 <- exp(-sqrt(c3))
322    t*ec3* (t <= c3) +
323        (ec3*(2+(2*sqrt(c3))+c3) - 2*exp(-sqrt(t))*(1+sqrt(t)))* (t > c3)
324}
325
326psiBY3 <- function(t,c3)
327{
328    exp(-sqrt(c3)) *(t <= c3) +
329    exp(-sqrt( t)) *(t >  c3)
330}
## MM: This is shorter (but possibly slower when most t are <= c3):
332## psiBY3 <- function(t,c3) exp(-sqrt(pmax(t, c3)))
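## quick check (a sketch) that the two forms agree:
##   t <- c(0.2, 0.5, 2); c3 <- 0.5
##   all.equal(psiBY3(t, c3), exp(-sqrt(pmax(t, c3))))  # TRUE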
333
334##'   d/dt psi(t, c3)
335derpsiBY3 <- function(t, c3)
336{
337    r <- t
338    r[in. <- (t <= c3)] <- 0
339    if(any(out <- !in.)) {
340        t <- t[out]
341        st <- sqrt(t)
342        r[out] <- -exp(-st)/(2*st)
343    }
344    r
345}
346
347## MM: FIXME  this is not used above
348sigmaBY3 <- function(sigma,s,y,c3)
349{
350    mean(phiBY3(s/sigma,y,c3))
351}
352
353derphiBY3 <- function(s,y,c3)
354{
355    Fs <- exp(-devBY(s,1))
356    ds <- Fs*(1-Fs) ## MM FIXME: use expm1()
357    dev. <- devBY(s,y)
358    Gprim1 <- devBY(s,1)
359    Gprim2 <- devBY(-s,1)
360
361    -psiBY3(dev.,c3)*(y-Fs) + ds*(psiBY3(Gprim1,c3) - psiBY3(Gprim2,c3))
362}
363
364der2phiBY3 <- function(s, y, c3)
365{
366    s <- as.double(s)
367    Fs <- exp(-devBY(s,1))
368    ds <- Fs*(1-Fs) ## MM FIXME: use expm1()
369    dev. <- devBY(s,y)
370    Gprim1 <- devBY(s,1)
371    Gprim2 <- devBY(-s,1)
372    der2 <- derpsiBY3(dev.,c3)*(Fs-y)^2  + ds*psiBY3(dev.,c3)
373    der2 <- der2+ ds*(1-2*Fs)*(psiBY3(Gprim1,c3) - psiBY3(Gprim2,c3))
374    der2 - ds*(derpsiBY3(Gprim1,c3)*(1-Fs) +
375               derpsiBY3(Gprim2,c3)*  Fs )
376}
377
378
379GBY3Fs <- function(s,c3)
380{
381    e.f <- exp(0.25)*sqrt(pi)
382    ## MM FIXME: Fs = exp(..) and below use  log(Fs) !!
383    Fs <- exp(-devBY(s,1))
384    resGinf <- e.f*(pnorm(sqrt(2)*(0.5+sqrt(-log(Fs))))-1)
385     ## MM FIXME: use expm1():
386    resGinf <- (resGinf+(Fs*exp(-sqrt(-log(Fs)))))*as.numeric(s <= -log(exp(c3)-1))
387    resGsup <- ((Fs*exp(-sqrt(c3)))+(e.f*(pnorm(sqrt(2)*(0.5+sqrt(c3)))-1))) *
388        as.numeric(s > -log(exp(c3)-1))
389    resGinf + resGsup
390}
391
392
393GBY3Fsm <- function(s,c3)
394{
395    e.f <- exp(0.25)*sqrt(pi)
396    ## MM FIXME: Fsm = exp(..) and below use  log(Fsm) !!
397    Fsm <- exp(-devBY(-s,1))
398    resGinf <- e.f*(pnorm(sqrt(2)*(0.5+sqrt(-log(Fsm))))-1)
399     ## MM FIXME: use expm1():
400    resGinf <- (resGinf+(Fsm*exp(-sqrt(-log(Fsm))))) * as.numeric(s >= log(exp(c3)-1))
401    resGsup <- ((Fsm*exp(-sqrt(c3)))+(e.f*(pnorm(sqrt(2)*(0.5+sqrt(c3)))-1))) *
402        as.numeric(s < log(exp(c3)-1))
403    resGinf + resGsup
404}
405
## Compute the standard errors of the estimates -
407##  this is done by estimating the asymptotic variance of the normal
408##  limiting distribution of the BY estimator - as derived in Bianco
409##  and Yohai (1996)
410##
411sterby3 <- function(x0, y, const, estim, addIntercept)
412{
413    sqrt(diag(vcovBY3(x0, y, const=const, estim=estim, addIntercept=addIntercept)))
414}
415
416vcovBY3 <- function(z, y, const, estim, addIntercept)
417{
418    stopifnot(length(dim(z)) == 2)
419    if(addIntercept) z <- cbind(1, z)
420    d <- dim(z)
421    n <- d[1]
422    p <- d[2]
423    argum <- z %*% estim
424    matM <- IFsqr <- matrix(0, p, p)
425    for(i in 1:n)
426    {
427        myscalar <- as.numeric(der2phiBY3(argum[i],y[i], c3=const))
428        zzt <- tcrossprod(z[i,])
429        matM <- matM + myscalar * zzt
430        IFsqr <- IFsqr + derphiBY3(argum[i],y[i], c3=const)^2 * zzt
431    }
432
433    matM    <- matM/n
434    matMinv <- solve(matM)
435    IFsqr <- IFsqr/n
436    ## Now,  asymp.cov  =  matMinv %*% IFsqr %*% t(matMinv)
437
438    ## provide  vcov(): the full matrix
439    (matMinv %*% IFsqr %*% t(matMinv))/n
440}
441
442
443