stats/R/ansari.test.R

#  File src/library/stats/R/ansari.test.R
#  Part of the R package, https://www.R-project.org
#
#  Copyright (C) 1995-2020 The R Core Team
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  A copy of the GNU General Public License is available at
#  https://www.R-project.org/Licenses/

ansari.test <- function(x, ...) UseMethod("ansari.test")

ansari.test.default <-
function(x, y, alternative = c("two.sided", "less", "greater"),
         exact = NULL, conf.int = FALSE, conf.level = 0.95, ...)
{
    alternative <- match.arg(alternative)
    if(conf.int) {
        if(!((length(conf.level) == 1L)
             && is.finite(conf.level)
             && (conf.level > 0)
             && (conf.level < 1)))
            stop("'conf.level' must be a single number between 0 and 1")
    }
    DNAME <- paste(deparse1(substitute(x)), "and", deparse1(substitute(y)))

    x <- x[complete.cases(x)]
    y <- y[complete.cases(y)]
    m <- as.integer(length(x))
    if(is.na(m) || m < 1L)
        stop("not enough 'x' observations")
    n <- as.integer(length(y))
    if(is.na(n) || n < 1L)
        stop("not enough 'y' observations")
    N <- m + n

    r <- rank(c(x, y))
    STATISTIC <- sum(pmin(r, N - r + 1)[seq_along(x)])
    TIES <- (length(r) != length(unique(r)))

    if(is.null(exact))
        exact <- ((m < 50L) && (n < 50L))

    if(exact && !TIES) {
        pansari <- function(q, m, n) .Call(C_pAnsari, q, m, n)
        PVAL <-
            switch(alternative,
                   two.sided = {
                       if (STATISTIC > ((m + 1)^2 %/% 4
                                        + ((m * n) %/% 2) / 2))
                           p <- 1 - pansari(STATISTIC - 1, m, n)
                       else
                           p <- pansari(STATISTIC, m, n)
                       min(2 * p, 1)
                   },
                   less = 1 - pansari(STATISTIC - 1, m, n),
                   greater = pansari(STATISTIC, m, n))
        if (conf.int) {
            qansari <- function(p, m, n) .Call(C_qAnsari, p, m, n)
            alpha <- 1 - conf.level
            x <- sort(x)
            y <- sort(y)
            ab <- function(sig) {
                rab <- rank(c(x/sig, y))
                sum(pmin(rab, N - rab + 1)[seq_along(x)])
            }
            ratio <- outer(x, y, "/")
            aratio <- ratio[ratio >= 0]
            sigma <- sort(aratio)

            cci <- function(alpha) {
              u <- absigma - qansari(alpha/2,  m, n)
              l <- absigma - qansari(1 - alpha/2, m, n)
              if(length(u[u >= 0]) == 0L || length(l[l > 0]) == 0L) {
                  warning("samples differ in location: cannot compute confidence set, returning NA")
                  return(c(NA, NA))
              }
              ## Check if the statistic exceeds both quantiles first.
              ## uci <- NULL
              ## lci <- NULL
              ## if (is.null(uci)) {
                  u[u < 0] <- NA
                  uci <- min(sigma[which(u == min(u, na.rm = TRUE))])
              ## }
              ## if (is.null(lci)) {
                  l[l <= 0] <- NA
                  lci <- max(sigma[which(l == min(l, na.rm = TRUE))])
              ## }
              ## The process of the statistics does not need to be
              ## monotone in sigma: check this and interchange quantiles.
              if (uci > lci) {
                  l <- absigma - qansari(alpha/2,  m, n)
                  u <- absigma - qansari(1 - alpha/2, m, n)
                  u[u < 0] <- NA
                  uci <- min(sigma[which(u == min(u, na.rm = TRUE))])
                  l[l <= 0] <- NA
                  lci <- max(sigma[which(l == min(l, na.rm = TRUE))])
               }
               c(uci, lci)
            }

            cint <- if(length(sigma) < 1L) {
                warning("cannot compute confidence set, returning NA")
                c(NA, NA)
            }
            else {
                ## Compute statistics directly: computing the steps is
                ## not faster.
                absigma <-
                    sapply(sigma + c(diff(sigma)/2,
                                     sigma[length(sigma)]*1.01), ab)
                switch(alternative, two.sided = cci(alpha),
                       greater = c(cci(alpha*2)[1L], Inf),
                       less    = c(0, cci(alpha*2)[2L]))
            }
            attr(cint, "conf.level") <- conf.level
            u <- absigma - qansari(0.5, m, n)
            sgr <- if(length(sgr <- sigma[u <= 0]) == 0L) NA else max(sgr)
            sle <- if(length(sle <- sigma[u  > 0]) == 0L) NA else min(sle)
            ESTIMATE <- mean(c(sle, sgr))
        }
    }
    else {
        EVEN <- ((N %% 2L) == 0L)
        normalize <- function(s, r, TIES, m=length(x), n=length(y)) {
            z <- if(EVEN)
                s - m * (N + 2)/4
            else
                s - m * (N + 1)^2/(4 * N)
            if (!TIES) {
                SIGMA <- if(EVEN)
                    sqrt((m * n * (N + 2) * (N - 2))/(48 * (N - 1)))
                else
                    sqrt((m * n * (N + 1) * (3 + N^2))/(48 * N^2))
            }
            else {
                r <- rle(sort(pmin(r, N - r + 1)))
                SIGMA <- if(EVEN)
                    sqrt(m * n
                         * (16 * sum(r$lengths * r$values^2) - N * (N + 2)^2)
                         / (16 * N * (N - 1)))
                else
                    sqrt(m * n
                         * (16 * N * sum(r$lengths * r$values^2) - (N + 1)^4)
                         / (16 * N^2 * (N - 1)))
            }
            z / SIGMA
        }
        p <- pnorm(normalize(STATISTIC, r, TIES))
        PVAL <- switch(alternative,
                       two.sided = 2 * min(p, 1 - p),
                       less = 1 - p,
                       greater = p)

        if(conf.int && !exact) {
            alpha <- 1 - conf.level
            ab2 <- function(sig, zq) {
                r <- rank(c(x / sig, y))
                s <- sum(pmin(r, N - r + 1)[seq_along(x)])
                TIES <- (length(r) != length(unique(r)))
                normalize(s, r, TIES, length(x), length(y)) - zq
            }
            ## Use uniroot here.
            ## Compute the range of sigma first.
            srangepos <- NULL
            srangeneg <- NULL
            if (length(x[x > 0]) && length(y[y > 0]))
                srangepos <-
                    c(min(x[x>0], na.rm=TRUE)/max(y[y>0], na.rm=TRUE),
                      max(x[x>0], na.rm=TRUE)/min(y[y>0], na.rm=TRUE))
            if (length(x[x <= 0]) && length(y[y < 0]))
                srangeneg <-
                    c(min(x[x<=0], na.rm=TRUE)/max(y[y<0], na.rm=TRUE),
                      max(x[x<=0], na.rm=TRUE)/min(y[y<0], na.rm=TRUE))
            if (any(is.infinite(c(srangepos, srangeneg)))) {
                warning("cannot compute asymptotic confidence set or estimator")
                conf.int <- FALSE
            } else {
                ccia <- function(alpha) {
                    ## Check if the statistic exceeds both quantiles
                    ## first.
                    statu <- ab2(srange[1L], zq=qnorm(alpha/2))
                    statl <- ab2(srange[2L], zq=qnorm(alpha/2, lower.tail=FALSE))
                    if (statu > 0 || statl < 0) {
                        warning("samples differ in location: cannot compute confidence set, returning NA")
                        return(c(NA, NA))
                    }
                    u <- uniroot(ab2, srange, tol=1e-4,
                                 zq=qnorm(alpha/2))$root
                    l <- uniroot(ab2, srange, tol=1e-4,
                                 zq=qnorm(alpha/2, lower.tail=FALSE))$root
                    ## The process of the statistics does not need to be
                    ## monotone: sort is ok here.
                    sort(c(u, l))
                }
                srange <- range(c(srangepos, srangeneg), na.rm=FALSE)
                cint <- switch(alternative,
                               two.sided = ccia(alpha),
                               greater = c(ccia(alpha*2)[1L], Inf),
                               less = c(0, ccia(alpha*2)[2L]) )
                attr(cint, "conf.level") <- conf.level
                ## Check if the statistic exceeds both quantiles first.
                statu <- ab2(srange[1L], zq=0)
                statl <- ab2(srange[2L], zq=0)
                if (statu > 0 || statl < 0) {
                    ESTIMATE <- NA
                    warning("cannot compute estimate, returning NA")
                } else
                    ESTIMATE <- uniroot(ab2, srange, tol=1e-4, zq=0)$root
            }
        }
        if(exact && TIES) {
            warning("cannot compute exact p-value with ties")
            if(conf.int)
                warning("cannot compute exact confidence intervals with ties")
        }
    }

    names(STATISTIC) <- "AB"
    RVAL <- list(statistic = STATISTIC,
                 p.value = PVAL,
                 null.value = c("ratio of scales" = 1),
                 alternative = alternative,
                 method = "Ansari-Bradley test",
                 data.name = DNAME)
    if(conf.int)
        RVAL <- c(RVAL,
                  list(conf.int = cint,
                       estimate = c("ratio of scales" = ESTIMATE)))
    class(RVAL) <- "htest"
    return(RVAL)
}

ansari.test.formula <-
function(formula, data, subset, na.action, ...)
{
    if(missing(formula)
       || (length(formula) != 3L)
       || (length(attr(terms(formula[-2L]), "term.labels")) != 1L))
        stop("'formula' missing or incorrect")
    m <- match.call(expand.dots = FALSE)
    if(is.matrix(eval(m$data, parent.frame())))
        m$data <- as.data.frame(data)
    ## need stats:: for non-standard evaluation
    m[[1L]] <- quote(stats::model.frame)
    m$... <- NULL
    mf <- eval(m, parent.frame())
    DNAME <- paste(names(mf), collapse = " by ")
    names(mf) <- NULL
    response <- attr(attr(mf, "terms"), "response")
    g <- factor(mf[[-response]])
    if(nlevels(g) != 2L)
        stop("grouping factor must have exactly 2 levels")
    DATA <- setNames(split(mf[[response]], g), c("x", "y"))
    y <- do.call("ansari.test", c(DATA, list(...)))
    y$data.name <- DNAME
    y
}