1# File src/library/stats/R/wilcox.test.R 2# Part of the R package, https://www.R-project.org 3# 4# Copyright (C) 1995-2019 The R Core Team 5# 6# This program is free software; you can redistribute it and/or modify 7# it under the terms of the GNU General Public License as published by 8# the Free Software Foundation; either version 2 of the License, or 9# (at your option) any later version. 10# 11# This program is distributed in the hope that it will be useful, 12# but WITHOUT ANY WARRANTY; without even the implied warranty of 13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14# GNU General Public License for more details. 15# 16# A copy of the GNU General Public License is available at 17# https://www.R-project.org/Licenses/ 18 19wilcox.test <- function(x, ...) UseMethod("wilcox.test") 20 21wilcox.test.default <- 22function(x, y = NULL, alternative = c("two.sided", "less", "greater"), 23 mu = 0, paired = FALSE, exact = NULL, correct = TRUE, 24 conf.int = FALSE, conf.level = 0.95, tol.root = 1e-4, digits.rank = Inf, ...) 25{ 26 alternative <- match.arg(alternative) 27 if(!missing(mu) && ((length(mu) > 1L) || !is.finite(mu))) 28 stop("'mu' must be a single number") 29 if(conf.int) { 30 if(!((length(conf.level) == 1L) 31 && is.finite(conf.level) 32 && (conf.level > 0) 33 && (conf.level < 1))) 34 stop("'conf.level' must be a single number between 0 and 1") 35 } 36 37 if(!is.numeric(x)) stop("'x' must be numeric") 38 if(!is.null(y)) { 39 if(!is.numeric(y)) stop("'y' must be numeric") 40 DNAME <- paste(deparse1(substitute(x)), "and", 41 deparse1(substitute(y))) 42 if(paired) { 43 if(length(x) != length(y)) 44 stop("'x' and 'y' must have the same length") 45 OK <- complete.cases(x, y) 46 x <- x[OK] - y[OK] 47 y <- NULL 48 } 49 else { 50 y <- y[!is.na(y)] 51 } 52 } else { 53 DNAME <- deparse1(substitute(x)) 54 if(paired) 55 stop("'y' is missing for paired test") 56 } 57 x <- x[!is.na(x)] 58 59 if(length(x) < 1L) 60 stop("not enough (non-missing) 'x' observations") 61 CORRECTION <- 0 62 if(is.null(y)) { 63 METHOD <- "Wilcoxon signed rank test" 64 x <- x - mu 65 ZEROES <- any(x == 0) 66 if(ZEROES) 67 x <- x[x != 0] 68 n <- as.double(length(x)) 69 if(is.null(exact)) 70 exact <- (n < 50) 71 r <- rank(abs(if(is.finite(digits.rank)) signif(x, digits.rank) else x)) 72 STATISTIC <- setNames(sum(r[x > 0]), "V") 73 TIES <- length(r) != length(unique(r)) 74 75 if(exact && !TIES && !ZEROES) { 76 METHOD <- sub("test", "exact test", METHOD, fixed=TRUE) 77 PVAL <- 78 switch(alternative, 79 "two.sided" = { 80 p <- if(STATISTIC > (n * (n + 1) / 4)) 81 psignrank(STATISTIC - 1, n, lower.tail = FALSE) 82 else psignrank(STATISTIC, n) 83 min(2 * p, 1) 84 }, 85 "greater" = psignrank(STATISTIC - 1, n, lower.tail = FALSE), 86 "less" = psignrank(STATISTIC, n)) 87 if(conf.int) { 88 ## Exact confidence interval for the median in the 89 ## one-sample case. When used with paired values this 90 ## gives a confidence interval for mean(x) - mean(y). 91 x <- x + mu # we want a conf.int for the median 92 alpha <- 1 - conf.level 93 diffs <- outer(x, x, "+") 94 diffs <- sort(diffs[!lower.tri(diffs)]) / 2 95 cint <- 96 switch(alternative, 97 "two.sided" = { 98 qu <- qsignrank(alpha / 2, n) 99 if(qu == 0) qu <- 1 100 ql <- n*(n+1)/2 - qu 101 achieved.alpha <- 2*psignrank(trunc(qu)-1,n) 102 c(diffs[qu], diffs[ql+1]) 103 }, 104 "greater" = { 105 qu <- qsignrank(alpha, n) 106 if(qu == 0) qu <- 1 107 achieved.alpha <- psignrank(trunc(qu)-1,n) 108 c(diffs[qu], +Inf) 109 }, 110 "less" = { 111 qu <- qsignrank(alpha, n) 112 if(qu == 0) qu <- 1 113 ql <- n*(n+1)/2 - qu 114 achieved.alpha <- psignrank(trunc(qu)-1,n) 115 c(-Inf, diffs[ql+1]) 116 }) 117 if (achieved.alpha - alpha > alpha/2){ 118 warning("requested conf.level not achievable") 119 conf.level <- 1 - signif(achieved.alpha, 2) 120 } 121 attr(cint, "conf.level") <- conf.level 122 ESTIMATE <- c("(pseudo)median" = median(diffs)) 123 } 124 } else { ## not exact, maybe ties or zeroes 125 NTIES <- table(r) 126 z <- STATISTIC - n * (n + 1)/4 127 SIGMA <- sqrt(n * (n + 1) * (2 * n + 1) / 24 128 - sum(NTIES^3 - NTIES) / 48) 129 if(correct) { 130 CORRECTION <- 131 switch(alternative, 132 "two.sided" = sign(z) * 0.5, 133 "greater" = 0.5, 134 "less" = -0.5) 135 METHOD <- paste(METHOD, "with continuity correction") 136 } 137 z <- (z - CORRECTION) / SIGMA 138 PVAL <- switch(alternative, 139 "less" = pnorm(z), 140 "greater" = pnorm(z, lower.tail=FALSE), 141 "two.sided" = 2 * min(pnorm(z), 142 pnorm(z, lower.tail=FALSE))) 143 if(conf.int) { 144 ## Asymptotic confidence interval for the median in the 145 ## one-sample case. When used with paired values this 146 ## gives a confidence interval for mean(x) - mean(y). 147 ## Algorithm not published, thus better documented here. 148 x <- x + mu 149 alpha <- 1 - conf.level 150 if(n > 0) { 151 ## These are sample based limits for the median 152 ## [They don't work if alpha is too high] 153 mumin <- min(x) 154 mumax <- max(x) 155 ## wdiff(d, zq) returns the absolute difference between 156 ## the asymptotic Wilcoxon statistic of x - mu - d and 157 ## the quantile zq. 158 W <- function(d) { ## also fn(x, correct, alternative) 159 xd <- x - d 160 xd <- xd[xd != 0] 161 nx <- length(xd) 162 dr <- rank(abs(if(is.finite(digits.rank)) signif(xd, digits.rank) else xd)) 163 zd <- sum(dr[xd > 0]) - nx * (nx + 1)/4 164 NTIES.CI <- table(dr) 165 SIGMA.CI <- sqrt(nx * (nx + 1) * (2 * nx + 1) / 24 166 - sum(NTIES.CI^3 - NTIES.CI) / 48) 167 if (SIGMA.CI == 0) 168 warning( 169 "cannot compute confidence interval when all observations are zero or tied", 170 call.=FALSE) 171 CORRECTION.CI <- 172 if(correct) { 173 switch(alternative, 174 "two.sided" = sign(zd) * 0.5, 175 "greater" = 0.5, 176 "less" = -0.5) 177 } else 0 178 (zd - CORRECTION.CI) / SIGMA.CI 179 } 180 Wmumin <- W(mumin) 181 Wmumax <- if(!is.finite(Wmumin)) NA else W(mumax) # if(): warn only once 182 } 183 if(n == 0 || !is.finite(Wmumax)) { # incl. "all zero / ties" warning above 184 cint <- structure(c(if(alternative == "less" ) -Inf else NaN, 185 if(alternative == "greater") +Inf else NaN), 186 conf.level = 0) 187 ESTIMATE <- if(n > 0) c(midrange = (mumin+mumax)/2) else NaN 188 } else { # (Wmumin, Wmumax) are finite 189 wdiff <- function(d, zq) W(d) - zq 190 ## Here we optimize the function wdiff in d over the set 191 ## c(mumin, mumax). 192 ## This returns a value from c(mumin, mumax) for which 193 ## the asymptotic Wilcoxon statistic is equal to the 194 ## quantile zq. This means that the statistic is not 195 ## within the critical region, and that implies that d 196 ## is a confidence limit for the median. 197 ## 198 ## As in the exact case, interchange quantiles. 199 root <- function(zq) { 200 uniroot(wdiff, lower = mumin, upper = mumax, 201 f.lower = Wmumin - zq, f.upper = Wmumax - zq, 202 tol = tol.root, zq = zq)$root 203 } 204 205 cint <- switch(alternative, "two.sided" = { 206 repeat { ## FIXME: no need to loop for finding boundary alpha !! 207 mindiff <- Wmumin - qnorm(alpha/2, lower.tail = FALSE) 208 maxdiff <- Wmumax - qnorm(alpha/2) 209 if(mindiff < 0 || maxdiff > 0) alpha <- alpha*2 else break 210 } 211 if (alpha >= 1 || 1 - conf.level < alpha*0.75) { 212 conf.level <- 1 - pmin(1, alpha) 213 warning("requested conf.level not achievable") 214 } 215 if(alpha < 1) { 216 l <- root(zq = qnorm(alpha/2, lower.tail = FALSE)) 217 u <- root(zq = qnorm(alpha/2)) 218 c(l, u) 219 } else { ## alpha >= 1 220 rep(median(x), 2) 221 } 222 }, "greater" = { 223 repeat { ## FIXME: no need to loop for finding boundary alpha !! 224 mindiff <- Wmumin - qnorm(alpha, lower.tail = FALSE) 225 if(mindiff < 0) alpha <- alpha*2 else break 226 } 227 if (alpha >= 1 || 1 - conf.level < alpha*0.75) { 228 conf.level <- 1 - pmin(1, alpha) 229 warning("requested conf.level not achievable") 230 } 231 l <- if(alpha < 1) 232 root(zq = qnorm(alpha, lower.tail = FALSE)) 233 else ## alpha >= 1 234 median(x) 235 c(l, +Inf) 236 237 }, "less" = { 238 repeat { ## FIXME: no need to loop for finding boundary alpha !! 239 maxdiff <- Wmumax - qnorm(alpha/2) 240 if(maxdiff > 0) alpha <- alpha * 2 else break 241 } 242 if (alpha >= 1 || 1 - conf.level < alpha*0.75) { 243 conf.level <- 1 - pmin(1, alpha) 244 warning("requested conf.level not achievable") 245 } 246 u <- if(alpha < 1) 247 root(zq = qnorm(alpha)) 248 else 249 median(x) 250 c(-Inf, u) 251 }) 252 attr(cint, "conf.level") <- conf.level 253 correct <- FALSE # for W(): no continuity correction for estimate 254 ESTIMATE <- c("(pseudo)median" = 255 uniroot(W, lower = mumin, upper = mumax, 256 tol = tol.root)$root) 257 } # regular (Wmumin, Wmumax) 258 } # end{conf.int} 259 if(exact && TIES) { 260 warning("cannot compute exact p-value with ties") 261 if(conf.int) 262 warning("cannot compute exact confidence interval with ties") 263 } 264 if(exact && ZEROES) { 265 warning("cannot compute exact p-value with zeroes") 266 if(conf.int) 267 warning("cannot compute exact confidence interval with zeroes") 268 } 269 } 270 } 271 else { ##-------------------------- 2-sample case --------------------------- 272 if(length(y) < 1L) 273 stop("not enough 'y' observations") 274 METHOD <- "Wilcoxon rank sum test" 275 r <- c(x - mu, y) 276 r <- rank(if(is.finite(digits.rank)) signif(r, digits.rank) else r) 277 n.x <- as.double(length(x)) 278 n.y <- as.double(length(y)) 279 if(is.null(exact)) 280 exact <- (n.x < 50) && (n.y < 50) 281 STATISTIC <- c("W" = sum(r[seq_along(x)]) - n.x * (n.x + 1) / 2) 282 TIES <- (length(r) != length(unique(r))) 283 if(exact && !TIES) { 284 METHOD <- sub("test", "exact test", METHOD, fixed=TRUE) 285 PVAL <- 286 switch(alternative, 287 "two.sided" = { 288 p <- if(STATISTIC > (n.x * n.y / 2)) 289 pwilcox(STATISTIC - 1, n.x, n.y, lower.tail = FALSE) 290 else 291 pwilcox(STATISTIC, n.x, n.y) 292 min(2 * p, 1) 293 }, 294 "greater" = { 295 pwilcox(STATISTIC - 1, n.x, n.y, lower.tail = FALSE) 296 }, 297 "less" = pwilcox(STATISTIC, n.x, n.y)) 298 if(conf.int) { 299 ## Exact confidence interval for the location parameter 300 ## mean(x) - mean(y) in the two-sample case (cf. the 301 ## one-sample case). 302 alpha <- 1 - conf.level 303 diffs <- sort(outer(x, y, "-")) 304 cint <- 305 switch(alternative, 306 "two.sided" = { 307 qu <- qwilcox(alpha/2, n.x, n.y) 308 if(qu == 0) qu <- 1 309 ql <- n.x*n.y - qu 310 achieved.alpha <- 2*pwilcox(trunc(qu)-1,n.x,n.y) 311 c(diffs[qu], diffs[ql + 1]) 312 }, 313 "greater" = { 314 qu <- qwilcox(alpha, n.x, n.y) 315 if(qu == 0) qu <- 1 316 achieved.alpha <- pwilcox(trunc(qu)-1,n.x,n.y) 317 c(diffs[qu], +Inf) 318 }, 319 "less" = { 320 qu <- qwilcox(alpha, n.x, n.y) 321 if(qu == 0) qu <- 1 322 ql <- n.x*n.y - qu 323 achieved.alpha <- pwilcox(trunc(qu)-1,n.x,n.y) 324 c(-Inf, diffs[ql + 1]) 325 }) 326 if (achieved.alpha-alpha > alpha/2) { 327 warning("Requested conf.level not achievable") 328 conf.level <- 1 - achieved.alpha 329 } 330 attr(cint, "conf.level") <- conf.level 331 ESTIMATE <- c("difference in location" = median(diffs)) 332 } 333 } 334 else { ## not exact, maybe ties or zeroes 335 NTIES <- table(r) 336 z <- STATISTIC - n.x * n.y / 2 337 SIGMA <- sqrt((n.x * n.y / 12) * 338 ((n.x + n.y + 1) 339 - sum(NTIES^3 - NTIES) 340 / ((n.x + n.y) * (n.x + n.y - 1)))) 341 if(correct) { 342 CORRECTION <- switch(alternative, 343 "two.sided" = sign(z) * 0.5, 344 "greater" = 0.5, 345 "less" = -0.5) 346 METHOD <- paste(METHOD, "with continuity correction") 347 } 348 z <- (z - CORRECTION) / SIGMA 349 PVAL <- switch(alternative, 350 "less" = pnorm(z), 351 "greater" = pnorm(z, lower.tail=FALSE), 352 "two.sided" = 2 * min(pnorm(z), 353 pnorm(z, lower.tail=FALSE))) 354 if(conf.int) { 355 ## Asymptotic confidence interval for the location 356 ## parameter mean(x) - mean(y) in the two-sample case 357 ## (cf. one-sample case). 358 ## 359 ## Algorithm not published, for a documentation see the 360 ## one-sample case. 361 alpha <- 1 - conf.level 362 mumin <- min(x) - max(y) 363 mumax <- max(x) - min(y) 364 W <- function(d) { ## also fn (x, y, n.x, n.y, correct, alternative) 365 dr <- c(x - d, y) 366 dr <- rank(if(is.finite(digits.rank)) signif(dr, digits.rank) else dr) 367 NTIES.CI <- table(dr) 368 dz <- sum(dr[seq_along(x)]) - n.x * (n.x + 1) / 2 - n.x * n.y / 2 369 CORRECTION.CI <- 370 if(correct) { 371 switch(alternative, 372 "two.sided" = sign(dz) * 0.5, 373 "greater" = 0.5, 374 "less" = -0.5) 375 } else 0 376 SIGMA.CI <- sqrt((n.x * n.y / 12) * 377 ((n.x + n.y + 1) 378 - sum(NTIES.CI^3 - NTIES.CI) 379 / ((n.x + n.y) * (n.x + n.y - 1)))) 380 if (SIGMA.CI == 0) 381 warning( 382 "cannot compute confidence interval when all observations are tied", 383 call.=FALSE) 384 (dz - CORRECTION.CI) / SIGMA.CI 385 } 386 wdiff <- function(d, zq) W(d) - zq 387 Wmumin <- W(mumin) 388 Wmumax <- W(mumax) 389 root <- function(zq) { 390 ## in extreme cases we need to return endpoints, 391 ## e.g. wilcox.test(1, 2:60, conf.int=TRUE) 392 f.lower <- Wmumin - zq 393 if(f.lower <= 0) return(mumin) 394 f.upper <- Wmumax - zq 395 if(f.upper >= 0) return(mumax) 396 uniroot(wdiff, lower=mumin, upper=mumax, 397 f.lower = f.lower, f.upper = f.upper, 398 tol = tol.root, zq = zq)$root 399 } 400 cint <- switch(alternative, 401 "two.sided" = { 402 l <- root(zq = qnorm(alpha/2, lower.tail = FALSE)) 403 u <- root(zq = qnorm(alpha/2)) 404 c(l, u) 405 }, 406 "greater" = { 407 l <- root(zq = qnorm(alpha, lower.tail = FALSE)) 408 c(l, +Inf) 409 }, 410 "less" = { 411 u <- root(zq = qnorm(alpha)) 412 c(-Inf, u) 413 }) 414 attr(cint, "conf.level") <- conf.level 415 correct <- FALSE # for W(): no continuity correction for estimate 416 ESTIMATE <- c("difference in location" = 417 uniroot(W, lower=mumin, upper=mumax, 418 tol = tol.root)$root) 419 } ## {conf.int} 420 421 if(exact && TIES) { 422 warning("cannot compute exact p-value with ties") 423 if(conf.int) 424 warning("cannot compute exact confidence intervals with ties") 425 } 426 } 427 } 428 429 names(mu) <- if(paired || !is.null(y)) "location shift" else "location" 430 RVAL <- list(statistic = STATISTIC, 431 parameter = NULL, 432 p.value = as.numeric(PVAL), 433 null.value = mu, 434 alternative = alternative, 435 method = METHOD, 436 data.name = DNAME) 437 if(conf.int) 438 RVAL <- c(RVAL, 439 list(conf.int = cint, 440 estimate = ESTIMATE)) 441 class(RVAL) <- "htest" 442 RVAL 443} 444 445wilcox.test.formula <- 446function(formula, data, subset, na.action, ...) 447{ 448 if(missing(formula) || (length(formula) != 3L)) 449 stop("'formula' missing or incorrect") 450 oneSampleOrPaired <- FALSE 451 if (length(attr(terms(formula[-2L]), "term.labels")) != 1L) 452 if (formula[[3]] == 1L) 453 oneSampleOrPaired <- TRUE 454 else 455 stop("'formula' missing or incorrect") 456 m <- match.call(expand.dots = FALSE) 457 if (is.matrix(eval(m$data, parent.frame()))) 458 m$data <- as.data.frame(data) 459 ## need stats:: for non-standard evaluation 460 m[[1L]] <- quote(stats::model.frame) 461 m$... <- NULL 462 mf <- eval(m, parent.frame()) 463 DNAME <- paste(names(mf), collapse = " by ") # works in all cases 464 names(mf) <- NULL 465 response <- attr(attr(mf, "terms"), "response") 466 if (! oneSampleOrPaired) { 467 g <- factor(mf[[-response]]) 468 if(nlevels(g) != 2L) 469 stop("grouping factor must have exactly 2 levels") 470 DATA <- setNames(split(mf[[response]], g), c("x", "y")) 471 y <- do.call("wilcox.test", c(DATA, list(...))) 472 } 473 else { # 1-sample and paired tests 474 respVar <- mf[[response]] 475 if (inherits(respVar, "Pair")){ 476 DATA <- list(x = respVar[,1], y = respVar[,2], paired=TRUE) 477 y <- do.call("wilcox.test", c(DATA, list(...))) 478 } 479 else { 480 DATA <- list(x = respVar) 481 y <- do.call("wilcox.test", c(DATA, list(...))) 482 } 483 } 484 y$data.name <- DNAME 485 y 486} 487