sandbox/stats/multicomp.py

'''

from pystatsmodels mailinglist 20100524

Notes:
 - unfinished, unverified, but most parts seem to work in MonteCarlo
 - one example taken from lecture notes looks ok
 - needs cases with non-monotonic inequality for test to see difference between
   one-step, step-up and step-down procedures
 - FDR does not look really better then Bonferoni in the MC examples that I tried
update:
 - now tested against R, stats and multtest,
   I have all of their methods for p-value correction
 - getting Hommel was impossible until I found reference for pvalue correction
 - now, since I have p-values correction, some of the original tests (rej/norej)
   implementation is not really needed anymore. I think I keep it for reference.
   Test procedure for Hommel in development session log
 - I have not updated other functions and classes in here.
   - multtest has some good helper function according to docs
 - still need to update references, the real papers
 - fdr with estimated true hypothesis still missing
 - multiple comparison procedures incomplete or missing
 - I will get multiple comparison for now only for independent case, which might
   be conservative in correlated case (?).


some References:

Gibbons, Jean Dickinson and Chakraborti Subhabrata, 2003, Nonparametric Statistical
Inference, Fourth Edition, Marcel Dekker
    p.363: 10.4 THE KRUSKAL-WALLIS ONE-WAY ANOVA TEST AND MULTIPLE COMPARISONS
    p.367: multiple comparison for kruskal formula used in multicomp.kruskal

Sheskin, David J., 2004, Handbook of Parametric and Nonparametric Statistical
Procedures, 3rd ed., Chapman&Hall/CRC
    Test 21: The Single-Factor Between-Subjects Analysis of Variance
    Test 22: The Kruskal-Wallis One-Way Analysis of Variance by Ranks Test

Zwillinger, Daniel and Stephen Kokoska, 2000, CRC standard probability and
statistics tables and formulae, Chapman&Hall/CRC
    14.9 WILCOXON RANKSUM (MANN WHITNEY) TEST


S. Paul Wright, Adjusted P-Values for Simultaneous Inference, Biometrics
    Vol. 48, No. 4 (Dec., 1992), pp. 1005-1013, International Biometric Society
    Stable URL: http://www.jstor.org/stable/2532694
 (p-value correction for Hommel in appendix)

for multicomparison

new book "multiple comparison in R"
Hsu is a good reference but I do not have it.


Author: Josef Pktd and example from H Raja and rewrite from Vincent Davis


TODO
----
* name of function multipletests, rename to something like pvalue_correction?


'''
from statsmodels.compat.python import lzip, lrange

import copy
import math

import numpy as np
from numpy.testing import assert_almost_equal, assert_equal
from scipy import stats, interpolate

from statsmodels.iolib.table import SimpleTable
#temporary circular import
from statsmodels.stats.multitest import multipletests, _ecdf as ecdf, fdrcorrection as fdrcorrection0, fdrcorrection_twostage
from statsmodels.graphics import utils
from statsmodels.tools.sm_exceptions import ValueWarning

qcrit = '''
  2     3     4     5     6     7     8     9     10
5   3.64 5.70   4.60 6.98   5.22 7.80   5.67 8.42   6.03 8.91   6.33 9.32   6.58 9.67   6.80 9.97   6.99 10.24
6   3.46 5.24   4.34 6.33   4.90 7.03   5.30 7.56   5.63 7.97   5.90 8.32   6.12 8.61   6.32 8.87   6.49 9.10
7   3.34 4.95   4.16 5.92   4.68 6.54   5.06 7.01   5.36 7.37   5.61 7.68   5.82 7.94   6.00 8.17   6.16 8.37
8   3.26 4.75   4.04 5.64   4.53 6.20   4.89 6.62   5.17 6.96   5.40 7.24       5.60 7.47   5.77 7.68   5.92 7.86
9   3.20 4.60   3.95 5.43   4.41 5.96   4.76 6.35   5.02 6.66   5.24 6.91       5.43 7.13   5.59 7.33   5.74 7.49
10  3.15 4.48   3.88 5.27   4.33 5.77   4.65 6.14   4.91 6.43   5.12 6.67       5.30 6.87   5.46 7.05   5.60 7.21
11  3.11 4.39   3.82 5.15   4.26 5.62   4.57 5.97   4.82 6.25   5.03 6.48 5.20 6.67   5.35 6.84   5.49 6.99
12  3.08 4.32   3.77 5.05   4.20 5.50   4.51 5.84   4.75 6.10   4.95 6.32 5.12 6.51   5.27 6.67   5.39 6.81
13  3.06 4.26   3.73 4.96   4.15 5.40   4.45 5.73   4.69 5.98   4.88 6.19 5.05 6.37   5.19 6.53   5.32 6.67
14  3.03 4.21   3.70 4.89   4.11 5.32   4.41 5.63   4.64 5.88   4.83 6.08 4.99 6.26   5.13 6.41   5.25 6.54
15  3.01 4.17   3.67 4.84   4.08 5.25   4.37 5.56   4.59 5.80   4.78 5.99 4.94 6.16   5.08 6.31   5.20 6.44
16  3.00 4.13   3.65 4.79   4.05 5.19   4.33 5.49   4.56 5.72   4.74 5.92 4.90 6.08   5.03 6.22   5.15 6.35
17  2.98 4.10   3.63 4.74   4.02 5.14   4.30 5.43   4.52 5.66   4.70 5.85 4.86 6.01   4.99 6.15   5.11 6.27
18  2.97 4.07   3.61 4.70   4.00 5.09   4.28 5.38   4.49 5.60   4.67 5.79 4.82 5.94   4.96 6.08   5.07 6.20
19  2.96 4.05   3.59 4.67   3.98 5.05   4.25 5.33   4.47 5.55   4.65 5.73 4.79 5.89   4.92 6.02   5.04 6.14
20  2.95 4.02   3.58 4.64   3.96 5.02   4.23 5.29   4.45 5.51   4.62 5.69 4.77 5.84   4.90 5.97   5.01 6.09
24  2.92 3.96   3.53 4.55   3.90 4.91   4.17 5.17   4.37 5.37   4.54 5.54 4.68 5.69   4.81 5.81   4.92 5.92
30  2.89 3.89   3.49 4.45   3.85 4.80   4.10 5.05   4.30 5.24   4.46 5.40 4.60 5.54   4.72 5.65   4.82 5.76
40  2.86 3.82   3.44 4.37   3.79 4.70   4.04 4.93   4.23 5.11   4.39 5.26 4.52 5.39   4.63 5.50   4.73 5.60
60  2.83 3.76   3.40 4.28   3.74 4.59   3.98 4.82   4.16 4.99   4.31 5.13 4.44 5.25   4.55 5.36   4.65 5.45
120   2.80 3.70   3.36 4.20   3.68 4.50   3.92 4.71   4.10 4.87   4.24 5.01 4.36 5.12   4.47 5.21   4.56 5.30
infinity  2.77 3.64   3.31 4.12   3.63 4.40   3.86 4.60   4.03 4.76   4.17 4.88   4.29 4.99   4.39 5.08   4.47 5.16
'''

res = [line.split() for line in qcrit.replace('infinity','9999').split('\n')]
c=np.array(res[2:-1]).astype(float)
#c[c==9999] = np.inf
ccols = np.arange(2,11)
crows = c[:,0]
cv005 = c[:, 1::2]
cv001 = c[:, 2::2]


def get_tukeyQcrit(k, df, alpha=0.05):
    '''
    return critical values for Tukey's HSD (Q)

    Parameters
    ----------
    k : int in {2, ..., 10}
        number of tests
    df : int
        degrees of freedom of error term
    alpha : {0.05, 0.01}
        type 1 error, 1-confidence level


    not enough error checking for limitations
    '''
    if alpha == 0.05:
        intp = interpolate.interp1d(crows, cv005[:,k-2])
    elif alpha == 0.01:
        intp = interpolate.interp1d(crows, cv001[:,k-2])
    else:
        raise ValueError('only implemented for alpha equal to 0.01 and 0.05')
    return intp(df)

def get_tukeyQcrit2(k, df, alpha=0.05):
    '''
    return critical values for Tukey's HSD (Q)

    Parameters
    ----------
    k : int in {2, ..., 10}
        number of tests
    df : int
        degrees of freedom of error term
    alpha : {0.05, 0.01}
        type 1 error, 1-confidence level


    not enough error checking for limitations
    '''
    from statsmodels.stats.libqsturng import qsturng
    return qsturng(1-alpha, k, df)


def get_tukey_pvalue(k, df, q):
    '''
    return adjusted p-values for Tukey's HSD

    Parameters
    ----------
    k : int in {2, ..., 10}
        number of tests
    df : int
        degrees of freedom of error term
    q : scalar, array_like; q >= 0
        quantile value of Studentized Range

    '''

    from statsmodels.stats.libqsturng import psturng
    return psturng(q, k, df)


def Tukeythreegene(first, second, third):
    # Performing the Tukey HSD post-hoc test for three genes
    # qwb = xlrd.open_workbook('F:/Lab/bioinformatics/qcrittable.xls')
    # #opening the workbook containing the q crit table
    # qwb.sheet_names()
    # qcrittable = qwb.sheet_by_name(u'Sheet1')

    # means of the three arrays
    firstmean = np.mean(first)
    secondmean = np.mean(second)
    thirdmean = np.mean(third)

    # standard deviations of the threearrays
    firststd = np.std(first)
    secondstd = np.std(second)
    thirdstd = np.std(third)

    # standard deviation squared of the three arrays
    firsts2 = math.pow(firststd, 2)
    seconds2 = math.pow(secondstd, 2)
    thirds2 = math.pow(thirdstd, 2)

    # numerator for mean square error
    mserrornum = firsts2 * 2 + seconds2 * 2 + thirds2 * 2
    # denominator for mean square error
    mserrorden = (len(first) + len(second) + len(third)) - 3
    mserror = mserrornum / mserrorden  # mean square error

    standarderror = math.sqrt(mserror / len(first))
    # standard error, which is square root of mserror and
    # the number of samples in a group

    # various degrees of freedom
    dftotal = len(first) + len(second) + len(third) - 1
    dfgroups = 2
    dferror = dftotal - dfgroups  # noqa: F841

    qcrit = 0.5  # fix arbitrary#qcrittable.cell(dftotal, 3).value
    qcrit = get_tukeyQcrit(3, dftotal, alpha=0.05)
    # getting the q critical value, for degrees of freedom total and 3 groups

    qtest3to1 = (math.fabs(thirdmean - firstmean)) / standarderror
    # calculating q test statistic values
    qtest3to2 = (math.fabs(thirdmean - secondmean)) / standarderror
    qtest2to1 = (math.fabs(secondmean - firstmean)) / standarderror

    conclusion = []

    # print(qcrit
    print(qtest3to1)
    print(qtest3to2)
    print(qtest2to1)

    # testing all q test statistic values to q critical values
    if qtest3to1 > qcrit:
        conclusion.append('3to1null')
    else:
        conclusion.append('3to1alt')
    if qtest3to2 > qcrit:
        conclusion.append('3to2null')
    else:
        conclusion.append('3to2alt')
    if qtest2to1 > qcrit:
        conclusion.append('2to1null')
    else:
        conclusion.append('2to1alt')

    return conclusion


#rewrite by Vincent
def Tukeythreegene2(genes): #Performing the Tukey HSD post-hoc test for three genes
    """gend is a list, ie [first, second, third]"""
#   qwb = xlrd.open_workbook('F:/Lab/bioinformatics/qcrittable.xls')
    #opening the workbook containing the q crit table
#   qwb.sheet_names()
#   qcrittable = qwb.sheet_by_name(u'Sheet1')

    means = []
    stds = []
    for gene in genes:
        means.append(np.mean(gene))
        std.append(np.std(gene))  # noqa:F821  See GH#5756

    #firstmean = np.mean(first) #means of the three arrays
    #secondmean = np.mean(second)
    #thirdmean = np.mean(third)

    #firststd = np.std(first) #standard deviations of the three arrays
    #secondstd = np.std(second)
    #thirdstd = np.std(third)

    stds2 = []
    for std in stds:
        stds2.append(math.pow(std,2))


    #firsts2 = math.pow(firststd,2) #standard deviation squared of the three arrays
    #seconds2 = math.pow(secondstd,2)
    #thirds2 = math.pow(thirdstd,2)

    #mserrornum = firsts2*2+seconds2*2+thirds2*2 #numerator for mean square error
    mserrornum = sum(stds2)*2
    mserrorden = (len(genes[0])+len(genes[1])+len(genes[2]))-3 #denominator for mean square error
    mserror = mserrornum/mserrorden #mean square error


def catstack(args):
    x = np.hstack(args)
    labels = np.hstack([k*np.ones(len(arr)) for k,arr in enumerate(args)])
    return x, labels


def maxzero(x):
    '''find all up zero crossings and return the index of the highest

    Not used anymore


    >>> np.random.seed(12345)
    >>> x = np.random.randn(8)
    >>> x
    array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057,
            1.39340583,  0.09290788,  0.28174615])
    >>> maxzero(x)
    (4, array([1, 4]))


    no up-zero-crossing at end

    >>> np.random.seed(0)
    >>> x = np.random.randn(8)
    >>> x
    array([ 1.76405235,  0.40015721,  0.97873798,  2.2408932 ,  1.86755799,
           -0.97727788,  0.95008842, -0.15135721])
    >>> maxzero(x)
    (None, array([6]))
    '''
    x = np.asarray(x)
    cond1 = x[:-1] < 0
    cond2 = x[1:] > 0
    #allzeros = np.nonzero(np.sign(x[:-1])*np.sign(x[1:]) <= 0)[0] + 1
    allzeros = np.nonzero((cond1 & cond2) | (x[1:]==0))[0] + 1
    if x[-1] >=0:
        maxz = max(allzeros)
    else:
        maxz = None
    return maxz, allzeros

def maxzerodown(x):
    '''find all up zero crossings and return the index of the highest

    Not used anymore

    >>> np.random.seed(12345)
    >>> x = np.random.randn(8)
    >>> x
    array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057,
            1.39340583,  0.09290788,  0.28174615])
    >>> maxzero(x)
    (4, array([1, 4]))


    no up-zero-crossing at end

    >>> np.random.seed(0)
    >>> x = np.random.randn(8)
    >>> x
    array([ 1.76405235,  0.40015721,  0.97873798,  2.2408932 ,  1.86755799,
           -0.97727788,  0.95008842, -0.15135721])
    >>> maxzero(x)
    (None, array([6]))
'''
    x = np.asarray(x)
    cond1 = x[:-1] > 0
    cond2 = x[1:] < 0
    #allzeros = np.nonzero(np.sign(x[:-1])*np.sign(x[1:]) <= 0)[0] + 1
    allzeros = np.nonzero((cond1 & cond2) | (x[1:]==0))[0] + 1
    if x[-1] <=0:
        maxz = max(allzeros)
    else:
        maxz = None
    return maxz, allzeros


def rejectionline(n, alpha=0.5):
    '''reference line for rejection in multiple tests

    Not used anymore

    from: section 3.2, page 60
    '''
    t = np.arange(n)/float(n)
    frej = t/( t * (1-alpha) + alpha)
    return frej


#I do not remember what I changed or why 2 versions,
#this follows german diss ???  with rline
#this might be useful if the null hypothesis is not "all effects are zero"
#rename to _bak and working again on fdrcorrection0
def fdrcorrection_bak(pvals, alpha=0.05, method='indep'):
    '''Reject False discovery rate correction for pvalues

    Old version, to be deleted


    missing: methods that estimate fraction of true hypotheses

    '''
    pvals = np.asarray(pvals)


    pvals_sortind = np.argsort(pvals)
    pvals_sorted = pvals[pvals_sortind]
    pecdf = ecdf(pvals_sorted)
    if method in ['i', 'indep', 'p', 'poscorr']:
        rline = pvals_sorted / alpha
    elif method in ['n', 'negcorr']:
        cm = np.sum(1./np.arange(1, len(pvals)))
        rline = pvals_sorted / alpha * cm
    elif method in ['g', 'onegcorr']:  #what's this ? german diss
        rline = pvals_sorted / (pvals_sorted*(1-alpha) + alpha)
    elif method in ['oth', 'o2negcorr']: # other invalid, cut-paste
        cm = np.sum(np.arange(len(pvals)))
        rline = pvals_sorted / alpha /cm
    else:
        raise ValueError('method not available')

    reject = pecdf >= rline
    if reject.any():
        rejectmax = max(np.nonzero(reject)[0])
    else:
        rejectmax = 0
    reject[:rejectmax] = True
    return reject[pvals_sortind.argsort()]

def mcfdr(nrepl=100, nobs=50, ntests=10, ntrue=6, mu=0.5, alpha=0.05, rho=0.):
    '''MonteCarlo to test fdrcorrection
    '''
    nfalse = ntests - ntrue
    locs = np.array([0.]*ntrue + [mu]*(ntests - ntrue))
    results = []
    for i in range(nrepl):
        #rvs = locs + stats.norm.rvs(size=(nobs, ntests))
        rvs = locs + randmvn(rho, size=(nobs, ntests))
        tt, tpval = stats.ttest_1samp(rvs, 0)
        res = fdrcorrection_bak(np.abs(tpval), alpha=alpha, method='i')
        res0 = fdrcorrection0(np.abs(tpval), alpha=alpha)
        #res and res0 give the same results
        results.append([np.sum(res[:ntrue]), np.sum(res[ntrue:])] +
                       [np.sum(res0[:ntrue]), np.sum(res0[ntrue:])] +
                       res.tolist() +
                       np.sort(tpval).tolist() +
                       [np.sum(tpval[:ntrue]<alpha),
                        np.sum(tpval[ntrue:]<alpha)] +
                       [np.sum(tpval[:ntrue]<alpha/ntests),
                        np.sum(tpval[ntrue:]<alpha/ntests)])
    return np.array(results)

def randmvn(rho, size=(1, 2), standardize=False):
    '''create random draws from equi-correlated multivariate normal distribution

    Parameters
    ----------
    rho : float
        correlation coefficient
    size : tuple of int
        size is interpreted (nobs, nvars) where each row

    Returns
    -------
    rvs : ndarray
        nobs by nvars where each row is a independent random draw of nvars-
        dimensional correlated rvs

    '''
    nobs, nvars = size
    if 0 < rho and rho < 1:
        rvs = np.random.randn(nobs, nvars+1)
        rvs2 = rvs[:,:-1] * np.sqrt((1-rho)) + rvs[:,-1:] * np.sqrt(rho)
    elif rho ==0:
        rvs2 = np.random.randn(nobs, nvars)
    elif rho < 0:
        if rho < -1./(nvars-1):
            raise ValueError('rho has to be larger than -1./(nvars-1)')
        elif rho == -1./(nvars-1):
            rho = -1./(nvars-1+1e-10)  #barely positive definite
        #use Cholesky
        A = rho*np.ones((nvars,nvars))+(1-rho)*np.eye(nvars)
        rvs2 = np.dot(np.random.randn(nobs, nvars), np.linalg.cholesky(A).T)
    if standardize:
        rvs2 = stats.zscore(rvs2)
    return rvs2

#============================
#
# Part 2: Multiple comparisons and independent samples tests
#
#============================

def tiecorrect(xranks):
    '''

    should be equivalent of scipy.stats.tiecorrect

    '''
    #casting to int rounds down, but not relevant for this case
    rankbincount = np.bincount(np.asarray(xranks,dtype=int))
    nties = rankbincount[rankbincount > 1]
    ntot = float(len(xranks))
    tiecorrection = 1 - (nties**3 - nties).sum()/(ntot**3 - ntot)
    return tiecorrection


class GroupsStats(object):
    '''
    statistics by groups (another version)

    groupstats as a class with lazy evaluation (not yet - decorators are still
    missing)

    written this time as equivalent of scipy.stats.rankdata
    gs = GroupsStats(X, useranks=True)
    assert_almost_equal(gs.groupmeanfilter, stats.rankdata(X[:,0]), 15)

    TODO: incomplete doc strings

    '''

    def __init__(self, x, useranks=False, uni=None, intlab=None):
        '''descriptive statistics by groups

        Parameters
        ----------
        x : ndarray, 2d
            first column data, second column group labels
        useranks : bool
            if true, then use ranks as data corresponding to the
            scipy.stats.rankdata definition (start at 1, ties get mean)
        uni, intlab : arrays (optional)
            to avoid call to unique, these can be given as inputs


        '''
        self.x = np.asarray(x)
        if intlab is None:
            uni, intlab = np.unique(x[:,1], return_inverse=True)
        elif uni is None:
            uni = np.unique(x[:,1])

        self.useranks = useranks


        self.uni = uni
        self.intlab = intlab
        self.groupnobs = groupnobs = np.bincount(intlab)

        #temporary until separated and made all lazy
        self.runbasic(useranks=useranks)


    def runbasic_old(self, useranks=False):
        """runbasic_old"""
        #check: refactoring screwed up case useranks=True

        #groupxsum = np.bincount(intlab, weights=X[:,0])
        #groupxmean = groupxsum * 1.0 / groupnobs
        x = self.x
        if useranks:
            self.xx = x[:,1].argsort().argsort() + 1  #rankraw
        else:
            self.xx = x[:,0]
        self.groupsum = groupranksum = np.bincount(self.intlab, weights=self.xx)
        #print('groupranksum', groupranksum, groupranksum.shape, self.groupnobs.shape
        # start at 1 for stats.rankdata :
        self.groupmean = grouprankmean = groupranksum * 1.0 / self.groupnobs # + 1
        self.groupmeanfilter = grouprankmean[self.intlab]
        #return grouprankmean[intlab]

    def runbasic(self, useranks=False):
        """runbasic"""
        #check: refactoring screwed up case useranks=True

        #groupxsum = np.bincount(intlab, weights=X[:,0])
        #groupxmean = groupxsum * 1.0 / groupnobs
        x = self.x
        if useranks:
            xuni, xintlab = np.unique(x[:,0], return_inverse=True)
            ranksraw = x[:,0].argsort().argsort() + 1  #rankraw
            self.xx = GroupsStats(np.column_stack([ranksraw, xintlab]),
                                  useranks=False).groupmeanfilter
        else:
            self.xx = x[:,0]
        self.groupsum = groupranksum = np.bincount(self.intlab, weights=self.xx)
        #print('groupranksum', groupranksum, groupranksum.shape, self.groupnobs.shape
        # start at 1 for stats.rankdata :
        self.groupmean = grouprankmean = groupranksum * 1.0 / self.groupnobs # + 1
        self.groupmeanfilter = grouprankmean[self.intlab]
        #return grouprankmean[intlab]

    def groupdemean(self):
        """groupdemean"""
        return self.xx - self.groupmeanfilter

    def groupsswithin(self):
        """groupsswithin"""
        xtmp = self.groupdemean()
        return np.bincount(self.intlab, weights=xtmp**2)

    def groupvarwithin(self):
        """groupvarwithin"""
        return self.groupsswithin()/(self.groupnobs-1) #.sum()

class TukeyHSDResults(object):
    """Results from Tukey HSD test, with additional plot methods

    Can also compute and plot additional post-hoc evaluations using this
    results class.

    Attributes
    ----------
    reject : array of boolean, True if we reject Null for group pair
    meandiffs : pairwise mean differences
    confint : confidence interval for pairwise mean differences
    std_pairs : standard deviation of pairwise mean differences
    q_crit : critical value of studentized range statistic at given alpha
    halfwidths : half widths of simultaneous confidence interval
    pvalues : adjusted p-values from the HSD test

    Notes
    -----
    halfwidths is only available after call to `plot_simultaneous`.

    Other attributes contain information about the data from the
    MultiComparison instance: data, df_total, groups, groupsunique, variance.
    """
    def __init__(self, mc_object, results_table, q_crit, reject=None,
                 meandiffs=None, std_pairs=None, confint=None, df_total=None,
                 reject2=None, variance=None, pvalues=None):

        self._multicomp = mc_object
        self._results_table = results_table
        self.q_crit = q_crit
        self.reject = reject
        self.meandiffs = meandiffs
        self.std_pairs = std_pairs
        self.confint = confint
        self.df_total = df_total
        self.reject2 = reject2
        self.variance = variance
        self.pvalues = pvalues
        # Taken out of _multicomp for ease of access for unknowledgeable users
        self.data = self._multicomp.data
        self.groups = self._multicomp.groups
        self.groupsunique = self._multicomp.groupsunique

    def __str__(self):
        return str(self._results_table)

    def summary(self):
        '''Summary table that can be printed
        '''
        return self._results_table


    def _simultaneous_ci(self):
        """Compute simultaneous confidence intervals for comparison of means.
        """
        self.halfwidths = simultaneous_ci(self.q_crit, self.variance,
                            self._multicomp.groupstats.groupnobs,
                            self._multicomp.pairindices)

    def plot_simultaneous(self, comparison_name=None, ax=None, figsize=(10,6),
                          xlabel=None, ylabel=None):
        """Plot a universal confidence interval of each group mean

        Visualize significant differences in a plot with one confidence
        interval per group instead of all pairwise confidence intervals.

        Parameters
        ----------
        comparison_name : str, optional
            if provided, plot_intervals will color code all groups that are
            significantly different from the comparison_name red, and will
            color code insignificant groups gray. Otherwise, all intervals will
            just be plotted in black.
        ax : matplotlib axis, optional
            An axis handle on which to attach the plot.
        figsize : tuple, optional
            tuple for the size of the figure generated
        xlabel : str, optional
            Name to be displayed on x axis
        ylabel : str, optional
            Name to be displayed on y axis

        Returns
        -------
        Figure
            handle to figure object containing interval plots

        Notes
        -----
        Multiple comparison tests are nice, but lack a good way to be
        visualized. If you have, say, 6 groups, showing a graph of the means
        between each group will require 15 confidence intervals.
        Instead, we can visualize inter-group differences with a single
        interval for each group mean. Hochberg et al. [1] first proposed this
        idea and used Tukey's Q critical value to compute the interval widths.
        Unlike plotting the differences in the means and their respective
        confidence intervals, any two pairs can be compared for significance
        by looking for overlap.

        References
        ----------
        .. [*] Hochberg, Y., and A. C. Tamhane. Multiple Comparison Procedures.
               Hoboken, NJ: John Wiley & Sons, 1987.

        Examples
        --------
        >>> from statsmodels.examples.try_tukey_hsd import cylinders, cyl_labels
        >>> from statsmodels.stats.multicomp import MultiComparison
        >>> cardata = MultiComparison(cylinders, cyl_labels)
        >>> results = cardata.tukeyhsd()
        >>> results.plot_simultaneous()
        <matplotlib.figure.Figure at 0x...>

        This example shows an example plot comparing significant differences
        in group means. Significant differences at the alpha=0.05 level can be
        identified by intervals that do not overlap (i.e. USA vs Japan,
        USA vs Germany).

        >>> results.plot_simultaneous(comparison_name="USA")
        <matplotlib.figure.Figure at 0x...>

        Optionally provide one of the group names to color code the plot to
        highlight group means different from comparison_name.
        """
        fig, ax1 = utils.create_mpl_ax(ax)
        if figsize is not None:
            fig.set_size_inches(figsize)
        if getattr(self, 'halfwidths', None) is None:
            self._simultaneous_ci()
        means = self._multicomp.groupstats.groupmean


        sigidx = []
        nsigidx = []
        minrange = [means[i] - self.halfwidths[i] for i in range(len(means))]
        maxrange = [means[i] + self.halfwidths[i] for i in range(len(means))]

        if comparison_name is None:
            ax1.errorbar(means, lrange(len(means)), xerr=self.halfwidths,
                         marker='o', linestyle='None', color='k', ecolor='k')
        else:
            if comparison_name not in self.groupsunique:
                raise ValueError('comparison_name not found in group names.')
            midx = np.where(self.groupsunique==comparison_name)[0][0]
            for i in range(len(means)):
                if self.groupsunique[i] == comparison_name:
                    continue
                if (min(maxrange[i], maxrange[midx]) -
                                         max(minrange[i], minrange[midx]) < 0):
                    sigidx.append(i)
                else:
                    nsigidx.append(i)
            #Plot the main comparison
            ax1.errorbar(means[midx], midx, xerr=self.halfwidths[midx],
                         marker='o', linestyle='None', color='b', ecolor='b')
            ax1.plot([minrange[midx]]*2, [-1, self._multicomp.ngroups],
                     linestyle='--', color='0.7')
            ax1.plot([maxrange[midx]]*2, [-1, self._multicomp.ngroups],
                     linestyle='--', color='0.7')
            #Plot those that are significantly different
            if len(sigidx) > 0:
                ax1.errorbar(means[sigidx], sigidx,
                             xerr=self.halfwidths[sigidx], marker='o',
                             linestyle='None', color='r', ecolor='r')
            #Plot those that are not significantly different
            if len(nsigidx) > 0:
                ax1.errorbar(means[nsigidx], nsigidx,
                             xerr=self.halfwidths[nsigidx], marker='o',
                             linestyle='None', color='0.5', ecolor='0.5')

        ax1.set_title('Multiple Comparisons Between All Pairs (Tukey)')
        r = np.max(maxrange) - np.min(minrange)
        ax1.set_ylim([-1, self._multicomp.ngroups])
        ax1.set_xlim([np.min(minrange) - r / 10., np.max(maxrange) + r / 10.])
        ylbls = [""] + self.groupsunique.astype(str).tolist() + [""]
        ax1.set_yticks(np.arange(-1, len(means) + 1))
        ax1.set_yticklabels(ylbls)
        ax1.set_xlabel(xlabel if xlabel is not None else '')
        ax1.set_ylabel(ylabel if ylabel is not None else '')
        return fig


class MultiComparison(object):
    '''Tests for multiple comparisons

    Parameters
    ----------
    data : ndarray
        independent data samples
    groups : ndarray
        group labels corresponding to each data point
    group_order : list[str], optional
        the desired order for the group mean results to be reported in. If
        not specified, results are reported in increasing order.
        If group_order does not contain all labels that are in groups, then
        only those observations are kept that have a label in group_order.

    '''

    def __init__(self, data, groups, group_order=None):

        if len(data) != len(groups):
            raise ValueError('data has %d elements and groups has %d' % (len(data), len(groups)))
        self.data = np.asarray(data)
        self.groups = groups = np.asarray(groups)

        # Allow for user-provided sorting of groups
        if group_order is None:
            self.groupsunique, self.groupintlab = np.unique(groups,
                                                            return_inverse=True)
        else:
            #check if group_order has any names not in groups
            for grp in group_order:
                if grp not in groups:
                    raise ValueError(
                            "group_order value '%s' not found in groups" % grp)
            self.groupsunique = np.array(group_order)
            self.groupintlab = np.empty(len(data), int)
            self.groupintlab.fill(-999)  # instead of a nan
            count = 0
            for name in self.groupsunique:
                idx = np.where(self.groups == name)[0]
                count += len(idx)
                self.groupintlab[idx] = np.where(self.groupsunique == name)[0]
            if count != self.data.shape[0]:
                #raise ValueError('group_order does not contain all groups')
                # warn and keep only observations with label in group_order
                import warnings
                warnings.warn('group_order does not contain all groups:' +
                              ' dropping observations', ValueWarning)

                mask_keep = self.groupintlab != -999
                self.groupintlab = self.groupintlab[mask_keep]
                self.data = self.data[mask_keep]
                self.groups = self.groups[mask_keep]

        if len(self.groupsunique) < 2:
            raise ValueError('2 or more groups required for multiple comparisons')

        self.datali = [self.data[self.groups == k] for k in self.groupsunique]
        self.pairindices = np.triu_indices(len(self.groupsunique), 1)  #tuple
        self.nobs = self.data.shape[0]
        self.ngroups = len(self.groupsunique)


    def getranks(self):
        '''convert data to rankdata and attach


        This creates rankdata as it is used for non-parametric tests, where
        in the case of ties the average rank is assigned.


        '''
        #bug: the next should use self.groupintlab instead of self.groups
        #update: looks fixed
        #self.ranks = GroupsStats(np.column_stack([self.data, self.groups]),
        self.ranks = GroupsStats(np.column_stack([self.data, self.groupintlab]),
                                 useranks=True)
        self.rankdata = self.ranks.groupmeanfilter

    def kruskal(self, pairs=None, multimethod='T'):
        '''
        pairwise comparison for kruskal-wallis test

        This is just a reimplementation of scipy.stats.kruskal and does
        not yet use a multiple comparison correction.

        '''
        self.getranks()
        tot = self.nobs
        meanranks = self.ranks.groupmean
        groupnobs = self.ranks.groupnobs


        # simultaneous/separate treatment of multiple tests
        f=(tot * (tot + 1.) / 12.) / stats.tiecorrect(self.rankdata) #(xranks)
        print('MultiComparison.kruskal')
        for i,j in zip(*self.pairindices):
            #pdiff = np.abs(mrs[i] - mrs[j])
            pdiff = np.abs(meanranks[i] - meanranks[j])
            se = np.sqrt(f * np.sum(1. / groupnobs[[i,j]] )) #np.array([8,8]))) #Fixme groupnobs[[i,j]] ))
            Q = pdiff / se

            # TODO : print(statments, fix
            print(i,j, pdiff, se, pdiff / se, pdiff / se > 2.6310)
            print(stats.norm.sf(Q) * 2)
            return stats.norm.sf(Q) * 2


    def allpairtest(self, testfunc, alpha=0.05, method='bonf', pvalidx=1):
        '''run a pairwise test on all pairs with multiple test correction

        The statistical test given in testfunc is calculated for all pairs
        and the p-values are adjusted by methods in multipletests. The p-value
        correction is generic and based only on the p-values, and does not
        take any special structure of the hypotheses into account.

        Parameters
        ----------
        testfunc : function
            A test function for two (independent) samples. It is assumed that
            the return value on position pvalidx is the p-value.
        alpha : float
            familywise error rate
        method : str
            This specifies the method for the p-value correction. Any method
            of multipletests is possible.
        pvalidx : int (default: 1)
            position of the p-value in the return of testfunc

        Returns
        -------
        sumtab : SimpleTable instance
            summary table for printing

        errors:  TODO: check if this is still wrong, I think it's fixed.
        results from multipletests are in different order
        pval_corrected can be larger than 1 ???
        '''
        res = []
        for i,j in zip(*self.pairindices):
            res.append(testfunc(self.datali[i], self.datali[j]))
        res = np.array(res)
        reject, pvals_corrected, alphacSidak, alphacBonf = \
                multipletests(res[:, pvalidx], alpha=alpha, method=method)
        #print(np.column_stack([res[:,0],res[:,1], reject, pvals_corrected])

        i1, i2 = self.pairindices
        if pvals_corrected is None:
            resarr = np.array(lzip(self.groupsunique[i1], self.groupsunique[i2],
                                  np.round(res[:,0],4),
                                  np.round(res[:,1],4),
                                  reject),
                       dtype=[('group1', object),
                              ('group2', object),
                              ('stat',float),
                              ('pval',float),
                              ('reject', np.bool8)])
        else:
            resarr = np.array(lzip(self.groupsunique[i1], self.groupsunique[i2],
                                  np.round(res[:,0],4),
                                  np.round(res[:,1],4),
                                  np.round(pvals_corrected,4),
                                  reject),
                       dtype=[('group1', object),
                              ('group2', object),
                              ('stat',float),
                              ('pval',float),
                              ('pval_corr',float),
                              ('reject', np.bool8)])
        results_table = SimpleTable(resarr, headers=resarr.dtype.names)
        results_table.title = (
            'Test Multiple Comparison %s \n%s%4.2f method=%s'
            % (testfunc.__name__, 'FWER=', alpha, method) +
            '\nalphacSidak=%4.2f, alphacBonf=%5.3f'
            % (alphacSidak, alphacBonf))

        return results_table, (res, reject, pvals_corrected,
                               alphacSidak, alphacBonf), resarr

    def tukeyhsd(self, alpha=0.05):
        """
        Tukey's range test to compare means of all pairs of groups

        Parameters
        ----------
        alpha : float, optional
            Value of FWER at which to calculate HSD.

        Returns
        -------
        results : TukeyHSDResults instance
            A results class containing relevant data and some post-hoc
            calculations
        """
        self.groupstats = GroupsStats(
            np.column_stack([self.data, self.groupintlab]),
            useranks=False)

        gmeans = self.groupstats.groupmean
        gnobs = self.groupstats.groupnobs
        # var_ = self.groupstats.groupvarwithin()
        # #possibly an error in varcorrection in this case
        var_ = np.var(self.groupstats.groupdemean(), ddof=len(gmeans))
        # res contains: 0:(idx1, idx2), 1:reject, 2:meandiffs, 3: std_pairs,
        # 4:confint, 5:q_crit, 6:df_total, 7:reject2, 8: pvals
        res = tukeyhsd(gmeans, gnobs, var_, df=None, alpha=alpha, q_crit=None)

        resarr = np.array(lzip(self.groupsunique[res[0][0]],
                               self.groupsunique[res[0][1]],
                               np.round(res[2], 4),
                               np.round(res[8], 4),
                               np.round(res[4][:, 0], 4),
                               np.round(res[4][:, 1], 4),
                               res[1]),
                          dtype=[('group1', object),
                                 ('group2', object),
                                 ('meandiff', float),
                                 ('p-adj', float),
                                 ('lower', float),
                                 ('upper', float),
                                 ('reject', np.bool8)])
        results_table = SimpleTable(resarr, headers=resarr.dtype.names)
        results_table.title = 'Multiple Comparison of Means - Tukey HSD, ' + \
                              'FWER=%4.2f' % alpha

        return TukeyHSDResults(self, results_table, res[5], res[1], res[2],
                               res[3], res[4], res[6], res[7], var_, res[8])


def rankdata(x):
    '''rankdata, equivalent to scipy.stats.rankdata

    just a different implementation, I have not yet compared speed

    '''
    uni, intlab = np.unique(x[:,0], return_inverse=True)
    groupnobs = np.bincount(intlab)
    groupxsum = np.bincount(intlab, weights=X[:,0])
    groupxmean = groupxsum * 1.0 / groupnobs

    rankraw = x[:,0].argsort().argsort()
    groupranksum = np.bincount(intlab, weights=rankraw)
    # start at 1 for stats.rankdata :
    grouprankmean = groupranksum * 1.0 / groupnobs + 1
    return grouprankmean[intlab]


#new

def compare_ordered(vals, alpha):
    '''simple ordered sequential comparison of means

    vals : array_like
        means or rankmeans for independent groups

    incomplete, no return, not used yet
    '''
    vals = np.asarray(vals)
    alphaf = alpha  # Notation ?
    sortind = np.argsort(vals)
    pvals = vals[sortind]
    sortrevind = sortind.argsort()
    ntests = len(vals)
    #alphacSidak = 1 - np.power((1. - alphaf), 1./ntests)
    #alphacBonf = alphaf / float(ntests)
    v1, v2 = np.triu_indices(ntests, 1)
    #v1,v2 have wrong sequence
    for i in range(4):
        for j in range(4,i, -1):
            print(i,j)


def varcorrection_unbalanced(nobs_all, srange=False):
    '''correction factor for variance with unequal sample sizes

    this is just a harmonic mean

    Parameters
    ----------
    nobs_all : array_like
        The number of observations for each sample
    srange : bool
        if true, then the correction is divided by the number of samples
        for the variance of the studentized range statistic

    Returns
    -------
    correction : float
        Correction factor for variance.


    Notes
    -----

    variance correction factor is

    1/k * sum_i 1/n_i

    where k is the number of samples and summation is over i=0,...,k-1.
    If all n_i are the same, then the correction factor is 1.

    This needs to be multiplied by the joint variance estimate, means square
    error, MSE. To obtain the correction factor for the standard deviation,
    square root needs to be taken.

    '''
    nobs_all = np.asarray(nobs_all)
    if not srange:
        return (1./nobs_all).sum()
    else:
        return (1./nobs_all).sum()/len(nobs_all)

def varcorrection_pairs_unbalanced(nobs_all, srange=False):
    '''correction factor for variance with unequal sample sizes for all pairs

    this is just a harmonic mean

    Parameters
    ----------
    nobs_all : array_like
        The number of observations for each sample
    srange : bool
        if true, then the correction is divided by 2 for the variance of
        the studentized range statistic

    Returns
    -------
    correction : ndarray
        Correction factor for variance.


    Notes
    -----

    variance correction factor is

    1/k * sum_i 1/n_i

    where k is the number of samples and summation is over i=0,...,k-1.
    If all n_i are the same, then the correction factor is 1.

    This needs to be multiplies by the joint variance estimate, means square
    error, MSE. To obtain the correction factor for the standard deviation,
    square root needs to be taken.

    For the studentized range statistic, the resulting factor has to be
    divided by 2.

    '''
    #TODO: test and replace with broadcasting
    n1, n2 = np.meshgrid(nobs_all, nobs_all)
    if not srange:
        return (1./n1 + 1./n2)
    else:
        return (1./n1 + 1./n2) / 2.

def varcorrection_unequal(var_all, nobs_all, df_all):
    '''return joint variance from samples with unequal variances and unequal
    sample sizes

    something is wrong

    Parameters
    ----------
    var_all : array_like
        The variance for each sample
    nobs_all : array_like
        The number of observations for each sample
    df_all : array_like
        degrees of freedom for each sample

    Returns
    -------
    varjoint : float
        joint variance.
    dfjoint : float
        joint Satterthwait's degrees of freedom


    Notes
    -----
    (copy, paste not correct)
    variance is

    1/k * sum_i 1/n_i

    where k is the number of samples and summation is over i=0,...,k-1.
    If all n_i are the same, then the correction factor is 1/n.

    This needs to be multiplies by the joint variance estimate, means square
    error, MSE. To obtain the correction factor for the standard deviation,
    square root needs to be taken.

    This is for variance of mean difference not of studentized range.
    '''

    var_all = np.asarray(var_all)
    var_over_n = var_all *1./ nobs_all  #avoid integer division
    varjoint = var_over_n.sum()

    dfjoint = varjoint**2 / (var_over_n**2 * df_all).sum()

    return varjoint, dfjoint

def varcorrection_pairs_unequal(var_all, nobs_all, df_all):
    '''return joint variance from samples with unequal variances and unequal
    sample sizes for all pairs

    something is wrong

    Parameters
    ----------
    var_all : array_like
        The variance for each sample
    nobs_all : array_like
        The number of observations for each sample
    df_all : array_like
        degrees of freedom for each sample

    Returns
    -------
    varjoint : ndarray
        joint variance.
    dfjoint : ndarray
        joint Satterthwait's degrees of freedom


    Notes
    -----

    (copy, paste not correct)
    variance is

    1/k * sum_i 1/n_i

    where k is the number of samples and summation is over i=0,...,k-1.
    If all n_i are the same, then the correction factor is 1.

    This needs to be multiplies by the joint variance estimate, means square
    error, MSE. To obtain the correction factor for the standard deviation,
    square root needs to be taken.

    TODO: something looks wrong with dfjoint, is formula from SPSS
    '''
    #TODO: test and replace with broadcasting
    v1, v2 = np.meshgrid(var_all, var_all)
    n1, n2 = np.meshgrid(nobs_all, nobs_all)
    df1, df2 = np.meshgrid(df_all, df_all)

    varjoint = v1/n1 + v2/n2

    dfjoint = varjoint**2 / (df1 * (v1/n1)**2 + df2 * (v2/n2)**2)

    return varjoint, dfjoint

def tukeyhsd(mean_all, nobs_all, var_all, df=None, alpha=0.05, q_crit=None):
    '''simultaneous Tukey HSD


    check: instead of sorting, I use absolute value of pairwise differences
    in means. That's irrelevant for the test, but maybe reporting actual
    differences would be better.
    CHANGED: meandiffs are with sign, studentized range uses abs

    q_crit added for testing

    TODO: error in variance calculation when nobs_all is scalar, missing 1/n

    '''
    mean_all = np.asarray(mean_all)
    #check if or when other ones need to be arrays

    n_means = len(mean_all)

    if df is None:
        df = nobs_all - 1

    if np.size(df) == 1:   # assumes balanced samples with df = n - 1, n_i = n
        df_total = n_means * df
        df = np.ones(n_means) * df
    else:
        df_total = np.sum(df)

    if (np.size(nobs_all) == 1) and (np.size(var_all) == 1):
        #balanced sample sizes and homogenous variance
        var_pairs = 1. * var_all / nobs_all * np.ones((n_means, n_means))

    elif np.size(var_all) == 1:
        #unequal sample sizes and homogenous variance
        var_pairs = var_all * varcorrection_pairs_unbalanced(nobs_all,
                                                             srange=True)
    elif np.size(var_all) > 1:
        var_pairs, df_sum = varcorrection_pairs_unequal(nobs_all, var_all, df)
        var_pairs /= 2.
        #check division by two for studentized range

    else:
        raise ValueError('not supposed to be here')

    #meandiffs_ = mean_all[:,None] - mean_all
    meandiffs_ = mean_all - mean_all[:,None]  #reverse sign, check with R example
    std_pairs_ = np.sqrt(var_pairs)

    #select all pairs from upper triangle of matrix
    idx1, idx2 = np.triu_indices(n_means, 1)
    meandiffs = meandiffs_[idx1, idx2]
    std_pairs = std_pairs_[idx1, idx2]

    st_range = np.abs(meandiffs) / std_pairs #studentized range statistic

    df_total_ = max(df_total, 5)  #TODO: smallest df in table
    if q_crit is None:
        q_crit = get_tukeyQcrit2(n_means, df_total, alpha=alpha)

    pvalues = get_tukey_pvalue(n_means, df_total, st_range)
    # we need pvalues to be atleast_1d for iteration. see #6132
    pvalues = np.atleast_1d(pvalues)

    reject = st_range > q_crit
    crit_int = std_pairs * q_crit
    reject2 = np.abs(meandiffs) > crit_int

    confint = np.column_stack((meandiffs - crit_int, meandiffs + crit_int))

    return ((idx1, idx2), reject, meandiffs, std_pairs, confint, q_crit,
            df_total, reject2, pvalues)


def simultaneous_ci(q_crit, var, groupnobs, pairindices=None):
    """Compute simultaneous confidence intervals for comparison of means.

    q_crit value is generated from tukey hsd test. Variance is considered
    across all groups. Returned halfwidths can be thought of as uncertainty
    intervals around each group mean. They allow for simultaneous
    comparison of pairwise significance among any pairs (by checking for
    overlap)

    Parameters
    ----------
    q_crit : float
        The Q critical value studentized range statistic from Tukey's HSD
    var : float
        The group variance
    groupnobs : array_like object
        Number of observations contained in each group.
    pairindices : tuple of lists, optional
        Indices corresponding to the upper triangle of matrix. Computed
        here if not supplied

    Returns
    -------
    halfwidths : ndarray
        Half the width of each confidence interval for each group given in
        groupnobs

    See Also
    --------
    MultiComparison : statistics class providing significance tests
    tukeyhsd : among other things, computes q_crit value

    References
    ----------
    .. [*] Hochberg, Y., and A. C. Tamhane. Multiple Comparison Procedures.
           Hoboken, NJ: John Wiley & Sons, 1987.)
    """
    # Set initial variables
    ng = len(groupnobs)
    if pairindices is None:
        pairindices = np.triu_indices(ng, 1)

    # Compute dij for all pairwise comparisons ala hochberg p. 95
    gvar = var / groupnobs

    d12 = np.sqrt(gvar[pairindices[0]] + gvar[pairindices[1]])

    # Create the full d matrix given all known dij vals
    d = np.zeros((ng, ng))
    d[pairindices] = d12
    d = d + d.conj().T

    # Compute the two global sums from hochberg eq 3.32
    sum1 = np.sum(d12)
    sum2 = np.sum(d, axis=0)

    if (ng > 2):
        w = ((ng-1.) * sum2 - sum1) / ((ng - 1.) * (ng - 2.))
    else:
        w = sum1 * np.ones((2, 1)) / 2.

    return (q_crit / np.sqrt(2))*w

def distance_st_range(mean_all, nobs_all, var_all, df=None, triu=False):
    '''pairwise distance matrix, outsourced from tukeyhsd


    CHANGED: meandiffs are with sign, studentized range uses abs

    q_crit added for testing

    TODO: error in variance calculation when nobs_all is scalar, missing 1/n

    '''
    mean_all = np.asarray(mean_all)
    #check if or when other ones need to be arrays

    n_means = len(mean_all)

    if df is None:
        df = nobs_all - 1

    if np.size(df) == 1:   # assumes balanced samples with df = n - 1, n_i = n
        df_total = n_means * df
    else:
        df_total = np.sum(df)

    if (np.size(nobs_all) == 1) and (np.size(var_all) == 1):
        #balanced sample sizes and homogenous variance
        var_pairs = 1. * var_all / nobs_all * np.ones((n_means, n_means))

    elif np.size(var_all) == 1:
        #unequal sample sizes and homogenous variance
        var_pairs = var_all * varcorrection_pairs_unbalanced(nobs_all,
                                                             srange=True)
    elif np.size(var_all) > 1:
        var_pairs, df_sum = varcorrection_pairs_unequal(nobs_all, var_all, df)
        var_pairs /= 2.
        #check division by two for studentized range

    else:
        raise ValueError('not supposed to be here')

    #meandiffs_ = mean_all[:,None] - mean_all
    meandiffs = mean_all - mean_all[:,None]  #reverse sign, check with R example
    std_pairs = np.sqrt(var_pairs)

    idx1, idx2 = np.triu_indices(n_means, 1)
    if triu:
        #select all pairs from upper triangle of matrix
        meandiffs = meandiffs_[idx1, idx2]  # noqa: F821  See GH#5756
        std_pairs = std_pairs_[idx1, idx2]  # noqa: F821  See GH#5756

    st_range = np.abs(meandiffs) / std_pairs #studentized range statistic

    return st_range, meandiffs, std_pairs, (idx1,idx2)  #return square arrays


def contrast_allpairs(nm):
    '''contrast or restriction matrix for all pairs of nm variables

    Parameters
    ----------
    nm : int

    Returns
    -------
    contr : ndarray, 2d, (nm*(nm-1)/2, nm)
       contrast matrix for all pairwise comparisons

    '''
    contr = []
    for i in range(nm):
        for j in range(i+1, nm):
            contr_row = np.zeros(nm)
            contr_row[i] = 1
            contr_row[j] = -1
            contr.append(contr_row)
    return np.array(contr)

def contrast_all_one(nm):
    '''contrast or restriction matrix for all against first comparison

    Parameters
    ----------
    nm : int

    Returns
    -------
    contr : ndarray, 2d, (nm-1, nm)
       contrast matrix for all against first comparisons

    '''
    contr = np.column_stack((np.ones(nm-1), -np.eye(nm-1)))
    return contr

def contrast_diff_mean(nm):
    '''contrast or restriction matrix for all against mean comparison

    Parameters
    ----------
    nm : int

    Returns
    -------
    contr : ndarray, 2d, (nm-1, nm)
       contrast matrix for all against mean comparisons

    '''
    return np.eye(nm) - np.ones((nm,nm))/nm

def tukey_pvalues(std_range, nm, df):
    #corrected but very slow with warnings about integration
    #nm = len(std_range)
    contr = contrast_allpairs(nm)
    corr = np.dot(contr, contr.T)/2.
    tstat = std_range / np.sqrt(2) * np.ones(corr.shape[0]) #need len of all pairs
    return multicontrast_pvalues(tstat, corr, df=df)


def multicontrast_pvalues(tstat, tcorr, df=None, dist='t', alternative='two-sided'):
    '''pvalues for simultaneous tests

    '''
    from statsmodels.sandbox.distributions.multivariate import mvstdtprob
    if (df is None) and (dist == 't'):
        raise ValueError('df has to be specified for the t-distribution')
    tstat = np.asarray(tstat)
    ntests = len(tstat)
    cc = np.abs(tstat)
    pval_global = 1 - mvstdtprob(-cc,cc, tcorr, df)
    pvals = []
    for ti in cc:
        limits = ti*np.ones(ntests)
        pvals.append(1 - mvstdtprob(-cc,cc, tcorr, df))

    return pval_global, np.asarray(pvals)


class StepDown(object):
    '''a class for step down methods

    This is currently for simple tree subset descend, similar to homogeneous_subsets,
    but checks all leave-one-out subsets instead of assuming an ordered set.
    Comment in SAS manual:
    SAS only uses interval subsets of the sorted list, which is sufficient for range
    tests (maybe also equal variance and balanced sample sizes are required).
    For F-test based critical distances, the restriction to intervals is not sufficient.

    This version uses a single critical value of the studentized range distribution
    for all comparisons, and is therefore a step-down version of Tukey HSD.
    The class is written so it can be subclassed, where the get_distance_matrix and
    get_crit are overwritten to obtain other step-down procedures such as REGW.

    iter_subsets can be overwritten, to get a recursion as in the many to one comparison
    with a control such as in Dunnet's test.


    A one-sided right tail test is not covered because the direction of the inequality
    is hard coded in check_set.  Also Peritz's check of partitions is not possible, but
    I have not seen it mentioned in any more recent references.
    I have only partially read the step-down procedure for closed tests by Westfall.

    One change to make it more flexible, is to separate out the decision on a subset,
    also because the F-based tests, FREGW in SPSS, take information from all elements of
    a set and not just pairwise comparisons. I have not looked at the details of
    the F-based tests such as Sheffe yet. It looks like running an F-test on equality
    of means in each subset. This would also outsource how pairwise conditions are
    combined, any larger or max. This would also imply that the distance matrix cannot
    be calculated in advance for tests like the F-based ones.


    '''

    def __init__(self, vals, nobs_all, var_all, df=None):
        self.vals = vals
        self.n_vals = len(vals)
        self.nobs_all = nobs_all
        self.var_all = var_all
        self.df = df
        # the following has been moved to run
        #self.cache_result = {}
        #self.crit = self.getcrit(0.5)   #decide where to set alpha, moved to run
        #self.accepted = []  #store accepted sets, not unique

    def get_crit(self, alpha):
        """
        get_tukeyQcrit

        currently tukey Q, add others
        """
        q_crit = get_tukeyQcrit(self.n_vals, self.df, alpha=alpha)
        return q_crit * np.ones(self.n_vals)


    def get_distance_matrix(self):
        '''studentized range statistic'''
        #make into property, decorate
        dres = distance_st_range(self.vals, self.nobs_all, self.var_all, df=self.df)
        self.distance_matrix = dres[0]

    def iter_subsets(self, indices):
        """Iterate substeps"""
        for ii in range(len(indices)):
            idxsub = copy.copy(indices)
            idxsub.pop(ii)
            yield idxsub


    def check_set(self, indices):
        '''check whether pairwise distances of indices satisfy condition

        '''
        indtup = tuple(indices)
        if indtup in self.cache_result:
            return self.cache_result[indtup]
        else:
            set_distance_matrix = self.distance_matrix[np.asarray(indices)[:,None], indices]
            n_elements = len(indices)
            if np.any(set_distance_matrix > self.crit[n_elements-1]):
                res = True
            else:
                res = False
            self.cache_result[indtup] = res
            return res

    def stepdown(self, indices):
        """stepdown"""
        print(indices)
        if self.check_set(indices): # larger than critical distance
            if (len(indices) > 2):  # step down into subsets if more than 2 elements
                for subs in self.iter_subsets(indices):
                    self.stepdown(subs)
            else:
                self.rejected.append(tuple(indices))
        else:
            self.accepted.append(tuple(indices))
            return indices

    def run(self, alpha):
        '''main function to run the test,

        could be done in __call__ instead
        this could have all the initialization code

        '''
        self.cache_result = {}
        self.crit = self.get_crit(alpha)   #decide where to set alpha, moved to run
        self.accepted = []  #store accepted sets, not unique
        self.rejected = []
        self.get_distance_matrix()
        self.stepdown(lrange(self.n_vals))

        return list(set(self.accepted)), list(set(sd.rejected))


def homogeneous_subsets(vals, dcrit):
    '''recursively check all pairs of vals for minimum distance

    step down method as in Newman-Keuls and Ryan procedures. This is not a
    closed procedure since not all partitions are checked.

    Parameters
    ----------
    vals : array_like
        values that are pairwise compared
    dcrit : array_like or float
        critical distance for rejecting, either float, or 2-dimensional array
        with distances on the upper triangle.

    Returns
    -------
    rejs : list of pairs
        list of pair-indices with (strictly) larger than critical difference
    nrejs : list of pairs
        list of pair-indices with smaller than critical difference
    lli : list of tuples
        list of subsets with smaller than critical difference
    res : tree
        result of all comparisons (for checking)


    this follows description in SPSS notes on Post-Hoc Tests

    Because of the recursive structure, some comparisons are made several
    times, but only unique pairs or sets are returned.

    Examples
    --------
    >>> m = [0, 2, 2.5, 3, 6, 8, 9, 9.5,10 ]
    >>> rej, nrej, ssli, res = homogeneous_subsets(m, 2)
    >>> set_partition(ssli)
    ([(5, 6, 7, 8), (1, 2, 3), (4,)], [0])
    >>> [np.array(m)[list(pp)] for pp in set_partition(ssli)[0]]
    [array([  8. ,   9. ,   9.5,  10. ]), array([ 2. ,  2.5,  3. ]), array([ 6.])]


    '''

    nvals = len(vals)
    indices_ = lrange(nvals)
    rejected = []
    subsetsli = []
    if np.size(dcrit) == 1:
        dcrit = dcrit*np.ones((nvals, nvals))  #example numbers for experimenting

    def subsets(vals, indices_):
        '''recursive function for constructing homogeneous subset

        registers rejected and subsetli in outer scope
        '''
        i, j = (indices_[0], indices_[-1])
        if vals[-1] - vals[0] > dcrit[i,j]:
            rejected.append((indices_[0], indices_[-1]))
            return [subsets(vals[:-1], indices_[:-1]),
                    subsets(vals[1:], indices_[1:]),
                    (indices_[0], indices_[-1])]
        else:
            subsetsli.append(tuple(indices_))
            return indices_
    res = subsets(vals, indices_)

    all_pairs = [(i,j) for i in range(nvals) for j in range(nvals-1,i,-1)]
    rejs = set(rejected)
    not_rejected = list(set(all_pairs) - rejs)

    return list(rejs), not_rejected, list(set(subsetsli)), res

def set_partition(ssli):
    '''extract a partition from a list of tuples

    this should be correctly called select largest disjoint sets.
    Begun and Gabriel 1981 do not seem to be bothered by sets of accepted
    hypothesis with joint elements,
    e.g. maximal_accepted_sets = { {1,2,3}, {2,3,4} }

    This creates a set partition from a list of sets given as tuples.
    It tries to find the partition with the largest sets. That is, sets are
    included after being sorted by length.

    If the list does not include the singletons, then it will be only a
    partial partition. Missing items are singletons (I think).

    Examples
    --------
    >>> li
    [(5, 6, 7, 8), (1, 2, 3), (4, 5), (0, 1)]
    >>> set_partition(li)
    ([(5, 6, 7, 8), (1, 2, 3)], [0, 4])

    '''
    part = []
    for s in sorted(list(set(ssli)), key=len)[::-1]:
        #print(s,
        s_ = set(s).copy()
        if not any(set(s_).intersection(set(t)) for t in part):
            #print('inside:', s
            part.append(s)
        #else: print(part

    missing = list(set(i for ll in ssli for i in ll)
                   - set(i for ll in part for i in ll))
    return part, missing


def set_remove_subs(ssli):
    '''remove sets that are subsets of another set from a list of tuples

    Parameters
    ----------
    ssli : list of tuples
        each tuple is considered as a set

    Returns
    -------
    part : list of tuples
        new list with subset tuples removed, it is sorted by set-length of tuples. The
        list contains original tuples, duplicate elements are not removed.

    Examples
    --------
    >>> set_remove_subs([(0, 1), (1, 2), (1, 2, 3), (0,)])
    [(1, 2, 3), (0, 1)]
    >>> set_remove_subs([(0, 1), (1, 2), (1,1, 1, 2, 3), (0,)])
    [(1, 1, 1, 2, 3), (0, 1)]

    '''
    #TODO: maybe convert all tuples to sets immediately, but I do not need the extra efficiency
    part = []
    for s in sorted(list(set(ssli)), key=lambda x: len(set(x)))[::-1]:
        #print(s,
        #s_ = set(s).copy()
        if not any(set(s).issubset(set(t)) for t in part):
            #print('inside:', s
            part.append(s)
        #else: print(part

##    missing = list(set(i for ll in ssli for i in ll)
##                   - set(i for ll in part for i in ll))
    return part


if __name__ == '__main__':

    examples = ['tukey', 'tukeycrit', 'fdr', 'fdrmc', 'bonf', 'randmvn',
                'multicompdev', 'None']#[-1]

    if 'tukey' in examples:
        #Example Tukey
        x = np.array([[0,0,1]]).T + np.random.randn(3, 20)
        print(Tukeythreegene(*x))

    # Example FDR
    # ------------
    if ('fdr' in examples) or ('bonf' in examples):
        from .ex_multicomp import example_fdr_bonferroni
        example_fdr_bonferroni()

    if 'fdrmc' in examples:
        mcres = mcfdr(nobs=100, nrepl=1000, ntests=30, ntrue=30, mu=0.1, alpha=0.05, rho=0.3)
        mcmeans = np.array(mcres).mean(0)
        print(mcmeans)
        print(mcmeans[0]/6., 1-mcmeans[1]/4.)
        print(mcmeans[:4], mcmeans[-4:])


    if 'randmvn' in examples:
        rvsmvn = randmvn(0.8, (5000,5))
        print(np.corrcoef(rvsmvn, rowvar=0))
        print(rvsmvn.var(0))


    if 'tukeycrit' in examples:
        print(get_tukeyQcrit(8, 8, alpha=0.05), 5.60)
        print(get_tukeyQcrit(8, 8, alpha=0.01), 7.47)


    if 'multicompdev' in examples:
        #development of kruskal-wallis multiple-comparison
        #example from matlab file exchange

        X = np.array([[7.68, 1], [7.69, 1], [7.70, 1], [7.70, 1], [7.72, 1],
                      [7.73, 1], [7.73, 1], [7.76, 1], [7.71, 2], [7.73, 2],
                      [7.74, 2], [7.74, 2], [7.78, 2], [7.78, 2], [7.80, 2],
                      [7.81, 2], [7.74, 3], [7.75, 3], [7.77, 3], [7.78, 3],
                      [7.80, 3], [7.81, 3], [7.84, 3], [7.71, 4], [7.71, 4],
                      [7.74, 4], [7.79, 4], [7.81, 4], [7.85, 4], [7.87, 4],
                      [7.91, 4]])
        xli = [X[X[:,1]==k,0] for k in range(1,5)]
        xranks = stats.rankdata(X[:,0])
        xranksli = [xranks[X[:,1]==k] for k in range(1,5)]
        xnobs = np.array([len(xval) for xval in xli])
        meanranks = [item.mean() for item in xranksli]
        sumranks = [item.sum() for item in xranksli]
        # equivalent function
        #from scipy import special
        #-np.sqrt(2.)*special.erfcinv(2-0.5) == stats.norm.isf(0.25)
        stats.norm.sf(0.67448975019608171)
        stats.norm.isf(0.25)

        mrs = np.sort(meanranks)
        v1, v2 = np.triu_indices(4,1)
        print('\nsorted rank differences')
        print(mrs[v2] - mrs[v1])
        diffidx = np.argsort(mrs[v2] - mrs[v1])[::-1]
        mrs[v2[diffidx]] - mrs[v1[diffidx]]

        print('\nkruskal for all pairs')
        for i,j in zip(v2[diffidx], v1[diffidx]):
            print(i,j, stats.kruskal(xli[i], xli[j]))
            mwu, mwupval = stats.mannwhitneyu(xli[i], xli[j], use_continuity=False)
            print(mwu, mwupval*2, mwupval*2<0.05/6., mwupval*2<0.1/6.)


        uni, intlab = np.unique(X[:,0], return_inverse=True)
        groupnobs = np.bincount(intlab)
        groupxsum = np.bincount(intlab, weights=X[:,0])
        groupxmean = groupxsum * 1.0 / groupnobs

        rankraw = X[:,0].argsort().argsort()
        groupranksum = np.bincount(intlab, weights=rankraw)
        # start at 1 for stats.rankdata :
        grouprankmean = groupranksum * 1.0 / groupnobs + 1
        assert_almost_equal(grouprankmean[intlab], stats.rankdata(X[:,0]), 15)
        gs = GroupsStats(X, useranks=True)
        print('\ngroupmeanfilter and grouprankmeans')
        print(gs.groupmeanfilter)
        print(grouprankmean[intlab])
        #the following has changed
        #assert_almost_equal(gs.groupmeanfilter, stats.rankdata(X[:,0]), 15)

        xuni, xintlab = np.unique(X[:,0], return_inverse=True)
        gs2 = GroupsStats(np.column_stack([X[:,0], xintlab]), useranks=True)
        #assert_almost_equal(gs2.groupmeanfilter, stats.rankdata(X[:,0]), 15)

        rankbincount = np.bincount(xranks.astype(int))
        nties = rankbincount[rankbincount > 1]
        ntot = float(len(xranks))
        tiecorrection = 1 - (nties**3 - nties).sum()/(ntot**3 - ntot)
        assert_almost_equal(tiecorrection, stats.tiecorrect(xranks),15)
        print('\ntiecorrection for data and ranks')
        print(tiecorrection)
        print(tiecorrect(xranks))

        tot = X.shape[0]
        t=500 #168
        f=(tot*(tot+1.)/12.)-(t/(6.*(tot-1.)))
        f=(tot*(tot+1.)/12.)/stats.tiecorrect(xranks)
        print('\npairs of mean rank differences')
        for i,j in zip(v2[diffidx], v1[diffidx]):
            #pdiff = np.abs(mrs[i] - mrs[j])
            pdiff = np.abs(meanranks[i] - meanranks[j])
            se = np.sqrt(f * np.sum(1./xnobs[[i,j]] )) #np.array([8,8]))) #Fixme groupnobs[[i,j]] ))
            print(i,j, pdiff, se, pdiff/se, pdiff/se>2.6310)

        multicomp = MultiComparison(*X.T)
        multicomp.kruskal()
        gsr = GroupsStats(X, useranks=True)

        print('\nexamples for kruskal multicomparison')
        for i in range(10):
            x1, x2 = (np.random.randn(30,2) + np.array([0, 0.5])).T
            skw = stats.kruskal(x1, x2)
            mc2=MultiComparison(np.r_[x1, x2], np.r_[np.zeros(len(x1)), np.ones(len(x2))])
            newskw = mc2.kruskal()
            print(skw, np.sqrt(skw[0]), skw[1]-newskw, (newskw/skw[1]-1)*100)

        tablett, restt, arrtt = multicomp.allpairtest(stats.ttest_ind)
        tablemw, resmw, arrmw = multicomp.allpairtest(stats.mannwhitneyu)
        print('')
        print(tablett)
        print('')
        print(tablemw)
        tablemwhs, resmw, arrmw = multicomp.allpairtest(stats.mannwhitneyu, method='hs')
        print('')
        print(tablemwhs)

    if 'last' in examples:
        xli = (np.random.randn(60,4) + np.array([0, 0, 0.5, 0.5])).T
        #Xrvs = np.array(catstack(xli))
        xrvs, xrvsgr = catstack(xli)
        multicompr = MultiComparison(xrvs, xrvsgr)
        tablett, restt, arrtt = multicompr.allpairtest(stats.ttest_ind)
        print(tablett)


        xli=[[8,10,9,10,9],[7,8,5,8,5],[4,8,7,5,7]]
        x, labels = catstack(xli)
        gs4 = GroupsStats(np.column_stack([x, labels]))
        print(gs4.groupvarwithin())


    #test_tukeyhsd() #moved to test_multi.py

    gmeans = np.array([ 7.71375,  7.76125,  7.78428571,  7.79875])
    gnobs = np.array([8, 8, 7, 8])
    sd = StepDown(gmeans, gnobs, 0.001, [27])

    #example from BKY
    pvals = [0.0001, 0.0004, 0.0019, 0.0095, 0.0201, 0.0278, 0.0298, 0.0344, 0.0459,
             0.3240, 0.4262, 0.5719, 0.6528, 0.7590, 1.000 ]

    #same number of rejection as in BKY paper:
    #single step-up:4, two-stage:8, iterated two-step:9
    #also alpha_star is the same as theirs for TST
    print(fdrcorrection0(pvals, alpha=0.05, method='indep'))
    print(fdrcorrection_twostage(pvals, alpha=0.05, iter=False))
    res_tst = fdrcorrection_twostage(pvals, alpha=0.05, iter=False)
    assert_almost_equal([0.047619, 0.0649], res_tst[-1][:2],3) #alpha_star for stage 2
    assert_equal(8, res_tst[0].sum())
    print(fdrcorrection_twostage(pvals, alpha=0.05, iter=True))
    print('fdr_gbs', multipletests(pvals, alpha=0.05, method='fdr_gbs'))
    #multicontrast_pvalues(tstat, tcorr, df)
    tukey_pvalues(3.649, 3, 16)