1#  File src/library/stats/R/wilcox.test.R
2#  Part of the R package, https://www.R-project.org
3#
4#  Copyright (C) 1995-2019 The R Core Team
5#
6#  This program is free software; you can redistribute it and/or modify
7#  it under the terms of the GNU General Public License as published by
8#  the Free Software Foundation; either version 2 of the License, or
9#  (at your option) any later version.
10#
11#  This program is distributed in the hope that it will be useful,
12#  but WITHOUT ANY WARRANTY; without even the implied warranty of
13#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14#  GNU General Public License for more details.
15#
16#  A copy of the GNU General Public License is available at
17#  https://www.R-project.org/Licenses/
18
19wilcox.test <- function(x, ...) UseMethod("wilcox.test")
20
21wilcox.test.default <-
22function(x, y = NULL, alternative = c("two.sided", "less", "greater"),
23         mu = 0, paired = FALSE, exact = NULL, correct = TRUE,
24         conf.int = FALSE, conf.level = 0.95, tol.root = 1e-4, digits.rank = Inf, ...)
25{
26    alternative <- match.arg(alternative)
27    if(!missing(mu) && ((length(mu) > 1L) || !is.finite(mu)))
28        stop("'mu' must be a single number")
29    if(conf.int) {
30        if(!((length(conf.level) == 1L)
31             && is.finite(conf.level)
32             && (conf.level > 0)
33             && (conf.level < 1)))
34            stop("'conf.level' must be a single number between 0 and 1")
35    }
36
37    if(!is.numeric(x)) stop("'x' must be numeric")
38    if(!is.null(y)) {
39        if(!is.numeric(y)) stop("'y' must be numeric")
40        DNAME <- paste(deparse1(substitute(x)), "and",
41                       deparse1(substitute(y)))
42        if(paired) {
43            if(length(x) != length(y))
44                stop("'x' and 'y' must have the same length")
45            OK <- complete.cases(x, y)
46            x <- x[OK] - y[OK]
47            y <- NULL
48        }
49        else {
50            y <- y[!is.na(y)]
51        }
52    } else {
53        DNAME <- deparse1(substitute(x))
54        if(paired)
55            stop("'y' is missing for paired test")
56    }
57    x <- x[!is.na(x)]
58
59    if(length(x) < 1L)
60        stop("not enough (non-missing) 'x' observations")
61    CORRECTION <- 0
62    if(is.null(y)) {
63        METHOD <- "Wilcoxon signed rank test"
64        x <- x - mu
65        ZEROES <- any(x == 0)
66        if(ZEROES)
67            x <- x[x != 0]
68        n <- as.double(length(x))
69        if(is.null(exact))
70            exact <- (n < 50)
71        r <- rank(abs(if(is.finite(digits.rank)) signif(x, digits.rank) else x))
72        STATISTIC <- setNames(sum(r[x > 0]), "V")
73        TIES <- length(r) != length(unique(r))
74
75        if(exact && !TIES && !ZEROES) {
76	    METHOD <- sub("test", "exact test", METHOD, fixed=TRUE)
77            PVAL <-
78                switch(alternative,
79                       "two.sided" = {
80                           p <- if(STATISTIC > (n * (n + 1) / 4))
81                                psignrank(STATISTIC - 1, n, lower.tail = FALSE)
82                           else psignrank(STATISTIC, n)
83                           min(2 * p, 1)
84                       },
85                       "greater" = psignrank(STATISTIC - 1, n, lower.tail = FALSE),
86                       "less" = psignrank(STATISTIC, n))
87            if(conf.int) {
88                ## Exact confidence interval for the median in the
89                ## one-sample case.  When used with paired values this
90                ## gives a confidence interval for mean(x) - mean(y).
91                x <- x + mu             # we want a conf.int for the median
92                alpha <- 1 - conf.level
93                diffs <- outer(x, x, "+")
94                diffs <- sort(diffs[!lower.tri(diffs)]) / 2
95                cint <-
96                    switch(alternative,
97                           "two.sided" = {
98                               qu <- qsignrank(alpha / 2, n)
99                               if(qu == 0) qu <- 1
100                               ql <- n*(n+1)/2 - qu
101                               achieved.alpha <- 2*psignrank(trunc(qu)-1,n)
102                               c(diffs[qu], diffs[ql+1])
103                           },
104                           "greater" = {
105                               qu <- qsignrank(alpha, n)
106                               if(qu == 0) qu <- 1
107                               achieved.alpha <- psignrank(trunc(qu)-1,n)
108                               c(diffs[qu], +Inf)
109                           },
110                           "less" = {
111                               qu <- qsignrank(alpha, n)
112                               if(qu == 0) qu <- 1
113                               ql <- n*(n+1)/2 - qu
114                               achieved.alpha <- psignrank(trunc(qu)-1,n)
115                               c(-Inf, diffs[ql+1])
116                           })
117                if (achieved.alpha - alpha > alpha/2){
118                    warning("requested conf.level not achievable")
119                    conf.level <- 1 - signif(achieved.alpha, 2)
120                }
121                attr(cint, "conf.level") <- conf.level
122		ESTIMATE <- c("(pseudo)median" = median(diffs))
123            }
124        } else { ## not exact, maybe ties or zeroes
125            NTIES <- table(r)
126            z <- STATISTIC - n * (n + 1)/4
127            SIGMA <- sqrt(n * (n + 1) * (2 * n + 1) / 24
128                          - sum(NTIES^3 - NTIES) / 48)
129            if(correct) {
130                CORRECTION <-
131                    switch(alternative,
132                           "two.sided" = sign(z) * 0.5,
133                           "greater" = 0.5,
134                           "less" = -0.5)
135                METHOD <- paste(METHOD, "with continuity correction")
136            }
137	    z <- (z - CORRECTION) / SIGMA
138	    PVAL <- switch(alternative,
139			   "less" = pnorm(z),
140			   "greater" = pnorm(z, lower.tail=FALSE),
141			   "two.sided" = 2 * min(pnorm(z),
142						 pnorm(z, lower.tail=FALSE)))
143            if(conf.int) {
144                ## Asymptotic confidence interval for the median in the
145                ## one-sample case.  When used with paired values this
146                ## gives a confidence interval for mean(x) - mean(y).
147                ## Algorithm not published, thus better documented here.
148                x <- x + mu
149                alpha <- 1 - conf.level
150		if(n > 0) {
151		    ## These are sample based limits for the median
152		    ## [They don't work if alpha is too high]
153		    mumin <- min(x)
154		    mumax <- max(x)
155		    ## wdiff(d, zq) returns the absolute difference between
156		    ## the asymptotic Wilcoxon statistic of x - mu - d and
157		    ## the quantile zq.
158                    W <- function(d) { ## also fn(x, correct, alternative)
159			xd <- x - d
160			xd <- xd[xd != 0]
161			nx <- length(xd)
162                        dr <- rank(abs(if(is.finite(digits.rank)) signif(xd, digits.rank) else xd))
163			zd <- sum(dr[xd > 0]) - nx * (nx + 1)/4
164			NTIES.CI <- table(dr)
165			SIGMA.CI <- sqrt(nx * (nx + 1) * (2 * nx + 1) / 24
166					 - sum(NTIES.CI^3 - NTIES.CI) / 48)
167			if (SIGMA.CI == 0)
168			    warning(
169			"cannot compute confidence interval when all observations are zero or tied",
170				    call.=FALSE)
171			CORRECTION.CI <-
172			    if(correct) {
173				switch(alternative,
174				       "two.sided" = sign(zd) * 0.5,
175				       "greater" = 0.5,
176				       "less" = -0.5)
177			    } else 0
178			(zd - CORRECTION.CI) / SIGMA.CI
179		    }
180		    Wmumin <- W(mumin)
181		    Wmumax <- if(!is.finite(Wmumin)) NA else W(mumax) # if(): warn only once
182		}
183		if(n == 0 || !is.finite(Wmumax)) { # incl. "all zero / ties" warning above
184		    cint <- structure(c(if(alternative == "less"   ) -Inf else NaN,
185					if(alternative == "greater") +Inf else NaN),
186				      conf.level = 0)
187		    ESTIMATE <- if(n > 0) c(midrange = (mumin+mumax)/2) else NaN
188		} else { # (Wmumin, Wmumax) are finite
189                    wdiff <- function(d, zq) W(d) - zq
190                    ## Here we optimize the function wdiff in d over the set
191                    ## c(mumin, mumax).
192                    ## This returns a value from c(mumin, mumax) for which
193                    ## the asymptotic Wilcoxon statistic is equal to the
194                    ## quantile zq.  This means that the statistic is not
195                    ## within the critical region, and that implies that d
196                    ## is a confidence limit for the median.
197                    ##
198                    ## As in the exact case, interchange quantiles.
199                    root <- function(zq) {
200                        uniroot(wdiff, lower = mumin, upper = mumax,
201                                f.lower = Wmumin - zq, f.upper = Wmumax - zq,
202                                tol = tol.root, zq = zq)$root
203                    }
204
205		    cint <- switch(alternative, "two.sided" = {
206			repeat { ## FIXME: no need to loop for finding boundary alpha !!
207			    mindiff <- Wmumin - qnorm(alpha/2, lower.tail = FALSE)
208			    maxdiff <- Wmumax - qnorm(alpha/2)
209			    if(mindiff < 0 || maxdiff > 0)  alpha <- alpha*2  else break
210			}
211			if (alpha >= 1 || 1 - conf.level < alpha*0.75) {
212			    conf.level <- 1 - pmin(1, alpha)
213			    warning("requested conf.level not achievable")
214			}
215			if(alpha < 1) {
216			    l <- root(zq = qnorm(alpha/2, lower.tail = FALSE))
217			    u <- root(zq = qnorm(alpha/2))
218			    c(l, u)
219			} else { ## alpha >= 1
220			    rep(median(x), 2)
221			}
222		    }, "greater" = {
223			repeat { ## FIXME: no need to loop for finding boundary alpha !!
224			    mindiff <- Wmumin - qnorm(alpha, lower.tail = FALSE)
225			    if(mindiff < 0)  alpha <- alpha*2  else break
226			}
227			if (alpha >= 1 || 1 - conf.level < alpha*0.75) {
228			    conf.level <- 1 - pmin(1, alpha)
229			    warning("requested conf.level not achievable")
230			}
231			l <- if(alpha < 1)
232				 root(zq = qnorm(alpha, lower.tail = FALSE))
233			     else   ## alpha >= 1
234				 median(x)
235			c(l, +Inf)
236
237		    }, "less" = {
238			repeat { ## FIXME: no need to loop for finding boundary alpha !!
239			    maxdiff <- Wmumax - qnorm(alpha/2)
240			    if(maxdiff > 0)  alpha <- alpha * 2  else break
241			}
242			if (alpha >= 1 || 1 - conf.level < alpha*0.75) {
243			    conf.level <- 1 - pmin(1, alpha)
244			    warning("requested conf.level not achievable")
245			}
246			u <- if(alpha < 1)
247				 root(zq = qnorm(alpha))
248			     else
249				 median(x)
250			c(-Inf, u)
251		    })
252		    attr(cint, "conf.level") <- conf.level
253		    correct <- FALSE # for W(): no continuity correction for estimate
254		    ESTIMATE <- c("(pseudo)median" =
255				  uniroot(W, lower = mumin, upper = mumax,
256					  tol = tol.root)$root)
257                } # regular (Wmumin, Wmumax)
258            } # end{conf.int}
259            if(exact && TIES) {
260                warning("cannot compute exact p-value with ties")
261                if(conf.int)
262                    warning("cannot compute exact confidence interval with ties")
263            }
264            if(exact && ZEROES) {
265                warning("cannot compute exact p-value with zeroes")
266                if(conf.int)
267                    warning("cannot compute exact confidence interval with zeroes")
268            }
269	}
270    }
271    else { ##-------------------------- 2-sample case ---------------------------
272        if(length(y) < 1L)
273            stop("not enough 'y' observations")
274        METHOD <- "Wilcoxon rank sum test"
275        r <- c(x - mu, y)
276        r <- rank(if(is.finite(digits.rank)) signif(r, digits.rank) else r)
277        n.x <- as.double(length(x))
278        n.y <- as.double(length(y))
279        if(is.null(exact))
280            exact <- (n.x < 50) && (n.y < 50)
281        STATISTIC <- c("W" = sum(r[seq_along(x)]) - n.x * (n.x + 1) / 2)
282        TIES <- (length(r) != length(unique(r)))
283        if(exact && !TIES) {
284	    METHOD <- sub("test", "exact test", METHOD, fixed=TRUE)
285            PVAL <-
286                switch(alternative,
287                       "two.sided" = {
288                           p <- if(STATISTIC > (n.x * n.y / 2))
289                               pwilcox(STATISTIC - 1, n.x, n.y, lower.tail = FALSE)
290                           else
291                               pwilcox(STATISTIC, n.x, n.y)
292                           min(2 * p, 1)
293                       },
294                       "greater" = {
295                           pwilcox(STATISTIC - 1, n.x, n.y, lower.tail = FALSE)
296                       },
297                       "less" = pwilcox(STATISTIC, n.x, n.y))
298            if(conf.int) {
299                ## Exact confidence interval for the location parameter
300                ## mean(x) - mean(y) in the two-sample case (cf. the
301                ## one-sample case).
302                alpha <- 1 - conf.level
303                diffs <- sort(outer(x, y, "-"))
304                cint <-
305                    switch(alternative,
306                           "two.sided" = {
307                               qu <- qwilcox(alpha/2, n.x, n.y)
308                               if(qu == 0) qu <- 1
309                               ql <- n.x*n.y - qu
310                               achieved.alpha <- 2*pwilcox(trunc(qu)-1,n.x,n.y)
311                               c(diffs[qu], diffs[ql + 1])
312                           },
313                           "greater" = {
314                               qu <- qwilcox(alpha, n.x, n.y)
315                               if(qu == 0) qu <- 1
316                               achieved.alpha <- pwilcox(trunc(qu)-1,n.x,n.y)
317                               c(diffs[qu], +Inf)
318                           },
319                           "less" = {
320                               qu <- qwilcox(alpha, n.x, n.y)
321                               if(qu == 0) qu <- 1
322                               ql <- n.x*n.y - qu
323                               achieved.alpha <- pwilcox(trunc(qu)-1,n.x,n.y)
324                               c(-Inf, diffs[ql + 1])
325                           })
326                if (achieved.alpha-alpha > alpha/2) {
327                    warning("Requested conf.level not achievable")
328                    conf.level <- 1 - achieved.alpha
329                }
330                attr(cint, "conf.level") <- conf.level
331                ESTIMATE <- c("difference in location" = median(diffs))
332            }
333        }
334        else { ## not exact, maybe ties or zeroes
335            NTIES <- table(r)
336            z <- STATISTIC - n.x * n.y / 2
337            SIGMA <- sqrt((n.x * n.y / 12) *
338                          ((n.x + n.y + 1)
339                           - sum(NTIES^3 - NTIES)
340                           / ((n.x + n.y) * (n.x + n.y - 1))))
341            if(correct) {
342                CORRECTION <- switch(alternative,
343                                     "two.sided" = sign(z) * 0.5,
344                                     "greater" = 0.5,
345                                     "less" = -0.5)
346                METHOD <- paste(METHOD, "with continuity correction")
347            }
348	    z <- (z - CORRECTION) / SIGMA
349	    PVAL <- switch(alternative,
350			   "less" = pnorm(z),
351			   "greater" = pnorm(z, lower.tail=FALSE),
352			   "two.sided" = 2 * min(pnorm(z),
353						 pnorm(z, lower.tail=FALSE)))
354            if(conf.int) {
355                ## Asymptotic confidence interval for the location
356                ## parameter mean(x) - mean(y) in the two-sample case
357                ## (cf. one-sample case).
358                ##
359                ## Algorithm not published, for a documentation see the
360                ## one-sample case.
361                alpha <- 1 - conf.level
362                mumin <- min(x) - max(y)
363                mumax <- max(x) - min(y)
364                W <- function(d) { ## also fn (x, y, n.x, n.y, correct, alternative)
365                    dr <- c(x - d, y)
366                    dr <- rank(if(is.finite(digits.rank)) signif(dr, digits.rank) else dr)
367                    NTIES.CI <- table(dr)
368                    dz <- sum(dr[seq_along(x)]) - n.x * (n.x + 1) / 2 - n.x * n.y / 2
369		    CORRECTION.CI <-
370			if(correct) {
371                            switch(alternative,
372                                   "two.sided" = sign(dz) * 0.5,
373                                   "greater" = 0.5,
374                                   "less" = -0.5)
375			} else 0
376                    SIGMA.CI <- sqrt((n.x * n.y / 12) *
377                                     ((n.x + n.y + 1)
378                                      - sum(NTIES.CI^3 - NTIES.CI)
379                                      / ((n.x + n.y) * (n.x + n.y - 1))))
380                    if (SIGMA.CI == 0)
381			warning(
382			"cannot compute confidence interval when all observations are tied",
383                                call.=FALSE)
384                    (dz - CORRECTION.CI) / SIGMA.CI
385                }
386                wdiff <- function(d, zq) W(d) - zq
387                Wmumin <- W(mumin)
388                Wmumax <- W(mumax)
389                root <- function(zq) {
390                    ## in extreme cases we need to return endpoints,
391                    ## e.g.  wilcox.test(1, 2:60, conf.int=TRUE)
392                    f.lower <- Wmumin - zq
393                    if(f.lower <= 0) return(mumin)
394                    f.upper <- Wmumax - zq
395                    if(f.upper >= 0) return(mumax)
396                    uniroot(wdiff, lower=mumin, upper=mumax,
397                            f.lower = f.lower, f.upper = f.upper,
398                            tol = tol.root, zq = zq)$root
399                }
400                cint <- switch(alternative,
401                               "two.sided" = {
402                                   l <- root(zq = qnorm(alpha/2, lower.tail = FALSE))
403                                   u <- root(zq = qnorm(alpha/2))
404                                   c(l, u)
405                               },
406                               "greater" = {
407                                   l <- root(zq = qnorm(alpha, lower.tail = FALSE))
408                                   c(l, +Inf)
409                               },
410                               "less" = {
411                                   u <- root(zq = qnorm(alpha))
412                                   c(-Inf, u)
413                               })
414                attr(cint, "conf.level") <- conf.level
415		correct <- FALSE # for W(): no continuity correction for estimate
416		ESTIMATE <- c("difference in location" =
417			      uniroot(W, lower=mumin, upper=mumax,
418				      tol = tol.root)$root)
419            } ## {conf.int}
420
421            if(exact && TIES) {
422                warning("cannot compute exact p-value with ties")
423                if(conf.int)
424                    warning("cannot compute exact confidence intervals with ties")
425            }
426        }
427    }
428
429    names(mu) <- if(paired || !is.null(y)) "location shift" else "location"
430    RVAL <- list(statistic = STATISTIC,
431                 parameter = NULL,
432                 p.value = as.numeric(PVAL),
433                 null.value = mu,
434                 alternative = alternative,
435                 method = METHOD,
436                 data.name = DNAME)
437    if(conf.int)
438        RVAL <- c(RVAL,
439                  list(conf.int = cint,
440                       estimate = ESTIMATE))
441    class(RVAL) <- "htest"
442    RVAL
443}
444
445wilcox.test.formula <-
446function(formula, data, subset, na.action, ...)
447{
448    if(missing(formula) || (length(formula) != 3L))
449        stop("'formula' missing or incorrect")
450    oneSampleOrPaired <- FALSE
451    if (length(attr(terms(formula[-2L]), "term.labels")) != 1L)
452        if (formula[[3]] == 1L)
453            oneSampleOrPaired <- TRUE
454        else
455            stop("'formula' missing or incorrect")
456    m <- match.call(expand.dots = FALSE)
457    if (is.matrix(eval(m$data, parent.frame())))
458        m$data <- as.data.frame(data)
459    ## need stats:: for non-standard evaluation
460    m[[1L]] <- quote(stats::model.frame)
461    m$... <- NULL
462    mf <- eval(m, parent.frame())
463    DNAME <- paste(names(mf), collapse = " by ") # works in all cases
464    names(mf) <- NULL
465    response <- attr(attr(mf, "terms"), "response")
466    if (! oneSampleOrPaired) {
467        g <- factor(mf[[-response]])
468        if(nlevels(g) != 2L)
469            stop("grouping factor must have exactly 2 levels")
470        DATA <- setNames(split(mf[[response]], g), c("x", "y"))
471        y <- do.call("wilcox.test", c(DATA, list(...)))
472    }
473    else { # 1-sample and paired tests
474        respVar <- mf[[response]]
475        if (inherits(respVar, "Pair")){
476            DATA <- list(x = respVar[,1], y = respVar[,2], paired=TRUE)
477            y <- do.call("wilcox.test", c(DATA, list(...)))
478        }
479        else {
480            DATA <- list(x = respVar)
481            y <- do.call("wilcox.test", c(DATA, list(...)))
482        }
483    }
484    y$data.name <- DNAME
485    y
486}
487