'''Multiple Testing and P-Value Correction


Author: Josef Perktold
License: BSD-3

'''


import numpy as np

from statsmodels.stats._knockoff import RegressionFDR

__all__ = ['fdrcorrection', 'fdrcorrection_twostage', 'local_fdr',
           'multipletests', 'NullDistribution', 'RegressionFDR']

# ==============================================
#
# Part 1: Multiple Tests and P-Value Correction
#
# ==============================================


def _ecdf(x):
    '''no frills empirical cdf used in fdrcorrection
    '''
    nobs = len(x)
    return np.arange(1, nobs + 1) / float(nobs)


multitest_methods_names = {'b': 'Bonferroni',
                           's': 'Sidak',
                           'h': 'Holm',
                           'hs': 'Holm-Sidak',
                           'sh': 'Simes-Hochberg',
                           'ho': 'Hommel',
                           'fdr_bh': 'FDR Benjamini-Hochberg',
                           'fdr_by': 'FDR Benjamini-Yekutieli',
                           'fdr_tsbh': 'FDR 2-stage Benjamini-Hochberg',
                           'fdr_tsbky': 'FDR 2-stage Benjamini-Krieger-Yekutieli',
                           'fdr_gbs': 'FDR adaptive Gavrilov-Benjamini-Sarkar'
                           }

_alias_list = [['b', 'bonf', 'bonferroni'],
               ['s', 'sidak'],
               ['h', 'holm'],
               ['hs', 'holm-sidak'],
               ['sh', 'simes-hochberg'],
               ['ho', 'hommel'],
               ['fdr_bh', 'fdr_i', 'fdr_p', 'fdri', 'fdrp'],
               ['fdr_by', 'fdr_n', 'fdr_c', 'fdrn', 'fdrcorr'],
               ['fdr_tsbh', 'fdr_2sbh'],
               ['fdr_tsbky', 'fdr_2sbky', 'fdr_twostage'],
               ['fdr_gbs']
               ]


multitest_alias = {}
for m in _alias_list:
    multitest_alias[m[0]] = m[0]
    for a in m[1:]:
        multitest_alias[a] = m[0]


def multipletests(pvals, alpha=0.05, method='hs', is_sorted=False,
                  returnsorted=False):
    """
    Test results and p-value correction for multiple tests

    Parameters
    ----------
    pvals : array_like, 1-d
        uncorrected p-values.  Must be 1-dimensional.
    alpha : float
        FWER, family-wise error rate, e.g. 0.1
    method : str
        Method used for testing and adjustment of pvalues. Can be either the
        full name or initial letters.  Available methods are:

        - `bonferroni` : one-step correction
        - `sidak` : one-step correction
        - `holm-sidak` : step-down method using Sidak adjustments
        - `holm` : step-down method using Bonferroni adjustments
        - `simes-hochberg` : step-up method (independent)
        - `hommel` : closed method based on Simes tests (non-negative)
        - `fdr_bh` : Benjamini/Hochberg (non-negative correlation)
        - `fdr_by` : Benjamini/Yekutieli (general or negative correlation)
        - `fdr_tsbh` : two stage fdr correction (non-negative correlation)
        - `fdr_tsbky` : two stage fdr correction (non-negative correlation)
        - `fdr_gbs` : adaptive step-down fdr correction (Gavrilov,
          Benjamini, Sarkar)

    is_sorted : bool
        If False (default), the p_values will be sorted, but the corrected
        pvalues are in the original order. If True, then it is assumed that
        the pvalues are already sorted in ascending order.
    returnsorted : bool
        not tested, return sorted p-values instead of original sequence

    Returns
    -------
    reject : ndarray, boolean
        true for hypotheses that can be rejected for given alpha
    pvals_corrected : ndarray
        p-values corrected for multiple tests
    alphacSidak : float
        corrected alpha for Sidak method
    alphacBonf : float
        corrected alpha for Bonferroni method

    Notes
    -----
    There may be API changes for this function in the future.

    Except for 'fdr_twostage', the p-value correction is independent of the
    alpha specified as argument. In these cases the corrected p-values
    can also be compared with a different alpha. In the case of 'fdr_twostage',
    the corrected p-values are specific to the given alpha, see
    ``fdrcorrection_twostage``.

    The 'fdr_gbs' procedure has not been verified against another package;
    the p-values are derived from scratch and are not given in the
    reference.  In Monte Carlo experiments the method worked correctly and
    maintained the false discovery rate.

    All procedures that are included control FWER or FDR in the independent
    case, and most are robust in the positively correlated case.

    `fdr_gbs`: high power, fdr control for independent case and only small
    violation in positively correlated case.

    **Timing**:

    Most of the time with large arrays is spent in `argsort`.  When we want
    to calculate the p-values for several methods, then it is more efficient
    to presort the pvalues, and put the results back into the original order
    outside of the function.

    Method='hommel' is very slow for large arrays, since it requires the
    evaluation of n partitions, where n is the number of p-values.
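
    Examples
    --------
    As a small, hand-checkable illustration, the Bonferroni correction
    multiplies each p-value by the number of tests and rejects at
    ``alpha / ntests``:

    >>> import numpy as np
    >>> from statsmodels.stats.multitest import multipletests
    >>> pvals = [0.01, 0.02, 0.03, 0.05]
    >>> reject, pvals_corr, _, _ = multipletests(pvals, alpha=0.05,
    ...                                          method='bonferroni')
    >>> reject.tolist()
    [True, False, False, False]
    >>> np.round(pvals_corr, 2).tolist()
    [0.04, 0.08, 0.12, 0.2]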
    """
    import gc
    pvals = np.asarray(pvals)
    alphaf = alpha  # local alias used below

    if not is_sorted:
        sortind = np.argsort(pvals)
        pvals = np.take(pvals, sortind)

    ntests = len(pvals)
    alphacSidak = 1 - np.power((1. - alphaf), 1. / ntests)
    alphacBonf = alphaf / float(ntests)
    if method.lower() in ['b', 'bonf', 'bonferroni']:
        reject = pvals <= alphacBonf
        pvals_corrected = pvals * float(ntests)

    elif method.lower() in ['s', 'sidak']:
        reject = pvals <= alphacSidak
        pvals_corrected = -np.expm1(ntests * np.log1p(-pvals))

    elif method.lower() in ['hs', 'holm-sidak']:
        alphacSidak_all = 1 - np.power((1. - alphaf),
                                       1. / np.arange(ntests, 0, -1))
        notreject = pvals > alphacSidak_all
        del alphacSidak_all

        nr_index = np.nonzero(notreject)[0]
        if nr_index.size == 0:
            # nonreject is empty, all rejected
            notrejectmin = len(pvals)
        else:
            notrejectmin = np.min(nr_index)
        notreject[notrejectmin:] = True
        reject = ~notreject
        del notreject

        # This is equivalent to
        #     1 - np.power((1. - pvals), np.arange(ntests, 0, -1))
        # but avoids the loss of floating point precision for small pvals.
        pvals_corrected_raw = -np.expm1(np.arange(ntests, 0, -1) *
                                        np.log1p(-pvals))
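        # Adjusted p-values of a step-down procedure must be monotonically
        # nondecreasing in the ordered p-values; the running maximum
        # enforces this.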
        pvals_corrected = np.maximum.accumulate(pvals_corrected_raw)
        del pvals_corrected_raw

    elif method.lower() in ['h', 'holm']:
        notreject = pvals > alphaf / np.arange(ntests, 0, -1)
        nr_index = np.nonzero(notreject)[0]
        if nr_index.size == 0:
            # nonreject is empty, all rejected
            notrejectmin = len(pvals)
        else:
            notrejectmin = np.min(nr_index)
        notreject[notrejectmin:] = True
        reject = ~notreject
        pvals_corrected_raw = pvals * np.arange(ntests, 0, -1)
        pvals_corrected = np.maximum.accumulate(pvals_corrected_raw)
        del pvals_corrected_raw
        gc.collect()

    elif method.lower() in ['sh', 'simes-hochberg']:
        alphash = alphaf / np.arange(ntests, 0, -1)
        reject = pvals <= alphash
        rejind = np.nonzero(reject)
        if rejind[0].size > 0:
            rejectmax = np.max(np.nonzero(reject))
            reject[:rejectmax] = True
        pvals_corrected_raw = np.arange(ntests, 0, -1) * pvals
        pvals_corrected = np.minimum.accumulate(
            pvals_corrected_raw[::-1])[::-1]
        del pvals_corrected_raw

    elif method.lower() in ['ho', 'hommel']:
        # we need a copy because we overwrite it in a loop
        a = pvals.copy()
        for m in range(ntests, 1, -1):
            cim = np.min(m * pvals[-m:] / np.arange(1, m + 1.))
            a[-m:] = np.maximum(a[-m:], cim)
            a[:-m] = np.maximum(a[:-m], np.minimum(m * pvals[:-m], cim))
        pvals_corrected = a
        reject = a <= alphaf

    elif method.lower() in ['fdr_bh', 'fdr_i', 'fdr_p', 'fdri', 'fdrp']:
        # delegate, call with sorted pvals
        reject, pvals_corrected = fdrcorrection(pvals, alpha=alpha,
                                                method='indep',
                                                is_sorted=True)
    elif method.lower() in ['fdr_by', 'fdr_n', 'fdr_c', 'fdrn', 'fdrcorr']:
        # delegate, call with sorted pvals
        reject, pvals_corrected = fdrcorrection(pvals, alpha=alpha,
                                                method='n',
                                                is_sorted=True)
    elif method.lower() in ['fdr_tsbky', 'fdr_2sbky', 'fdr_twostage']:
        # delegate, call with sorted pvals
        reject, pvals_corrected = fdrcorrection_twostage(pvals, alpha=alpha,
                                                         method='bky',
                                                         is_sorted=True)[:2]
    elif method.lower() in ['fdr_tsbh', 'fdr_2sbh']:
        # delegate, call with sorted pvals
        reject, pvals_corrected = fdrcorrection_twostage(pvals, alpha=alpha,
                                                         method='bh',
                                                         is_sorted=True)[:2]

    elif method.lower() in ['fdr_gbs']:
        # adaptive step-down procedure of Gavrilov, Benjamini, Sarkar,
        # Annals of Statistics 2009
        ii = np.arange(1, ntests + 1)
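        # The step-down critical values of the procedure are
        #     crit_i = alpha * ii / (ntests + 1 - ii * (1 - alpha)).
        # Solving p_(i) <= crit_i for alpha gives the smallest level at which
        # the i-th ordered p-value is rejected; these are the raw adjusted
        # p-values (derived from the code, not given in the reference).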
        q = (ntests + 1. - ii) / ii * pvals / (1. - pvals)
        pvals_corrected_raw = np.maximum.accumulate(q)  # monotonicity

        pvals_corrected = np.minimum.accumulate(
            pvals_corrected_raw[::-1])[::-1]
        del pvals_corrected_raw
        reject = pvals_corrected <= alpha

    else:
        raise ValueError('method not recognized')

    pvals_corrected[pvals_corrected > 1] = 1
    if is_sorted or returnsorted:
        return reject, pvals_corrected, alphacSidak, alphacBonf
    else:
        pvals_corrected_ = np.empty_like(pvals_corrected)
        pvals_corrected_[sortind] = pvals_corrected
        del pvals_corrected
        reject_ = np.empty_like(reject)
        reject_[sortind] = reject
        return reject_, pvals_corrected_, alphacSidak, alphacBonf


def fdrcorrection(pvals, alpha=0.05, method='indep', is_sorted=False):
    '''
    pvalue correction for false discovery rate.

    This covers Benjamini/Hochberg for independent or positively correlated
    and Benjamini/Yekutieli for general or negatively correlated tests.

    Parameters
    ----------
    pvals : array_like, 1d
        Set of p-values of the individual tests.
    alpha : float, optional
        Family-wise error rate.  Defaults to ``0.05``.
    method : {'i', 'indep', 'p', 'poscorr', 'n', 'negcorr'}, optional
        Which method to use for FDR correction.
        ``{'i', 'indep', 'p', 'poscorr'}`` all refer to ``fdr_bh``
        (Benjamini/Hochberg for independent or positively
        correlated tests).  ``{'n', 'negcorr'}`` both refer to ``fdr_by``
        (Benjamini/Yekutieli for general or negatively correlated tests).
        Defaults to ``'indep'``.
    is_sorted : bool, optional
        If False (default), the p_values will be sorted, but the corrected
        pvalues are in the original order. If True, then it is assumed that
        the pvalues are already sorted in ascending order.

    Returns
    -------
    rejected : ndarray, bool
        True if a hypothesis is rejected, False if not
    pvalue-corrected : ndarray
        pvalues adjusted for multiple hypothesis testing to limit FDR

    Notes
    -----
    If there is prior information on the fraction of true null hypotheses,
    then alpha should be set to ``alpha * m/m_0`` where m is the number of
    tests, given by the p-values, and m_0 is an estimate of the number of
    true null hypotheses.  (See Benjamini, Krieger and Yekutieli.)

    The two-stage method of Benjamini, Krieger and Yekutieli that estimates
    the number of true null hypotheses is available as
    ``fdrcorrection_twostage``.

    Both methods exposed via this function (Benjamini/Hochberg,
    Benjamini/Yekutieli) are also available in the function
    ``multipletests``, as ``method="fdr_bh"`` and ``method="fdr_by"``,
    respectively.

    See also
    --------
    multipletests
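
    Examples
    --------
    A short sketch of the Benjamini/Hochberg correction: the adjusted
    p-values are the ordered p-values divided by ``i/m`` and then made
    monotone.

    >>> import numpy as np
    >>> from statsmodels.stats.multitest import fdrcorrection
    >>> pvals = [0.01, 0.02, 0.03, 0.05]
    >>> rejected, pvals_corr = fdrcorrection(pvals, alpha=0.05,
    ...                                      method='indep')
    >>> rejected.tolist()
    [True, True, True, True]
    >>> np.round(pvals_corr, 3).tolist()
    [0.04, 0.04, 0.04, 0.05]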

    '''
    pvals = np.asarray(pvals)
    assert pvals.ndim == 1, "pvals must be 1-dimensional, that is of shape (n,)"

    if not is_sorted:
        pvals_sortind = np.argsort(pvals)
        pvals_sorted = np.take(pvals, pvals_sortind)
    else:
        pvals_sorted = pvals  # alias

    if method in ['i', 'indep', 'p', 'poscorr']:
        ecdffactor = _ecdf(pvals_sorted)
    elif method in ['n', 'negcorr']:
        # harmonic series correction of Benjamini/Yekutieli
        cm = np.sum(1. / np.arange(1, len(pvals_sorted) + 1))
        ecdffactor = _ecdf(pvals_sorted) / cm
    else:
        raise ValueError('only indep and negcorr implemented')
    reject = pvals_sorted <= ecdffactor * alpha
    if reject.any():
        rejectmax = max(np.nonzero(reject)[0])
        reject[:rejectmax] = True

    pvals_corrected_raw = pvals_sorted / ecdffactor
    pvals_corrected = np.minimum.accumulate(
        pvals_corrected_raw[::-1])[::-1]
    del pvals_corrected_raw
    pvals_corrected[pvals_corrected > 1] = 1
    if not is_sorted:
        pvals_corrected_ = np.empty_like(pvals_corrected)
        pvals_corrected_[pvals_sortind] = pvals_corrected
        del pvals_corrected
        reject_ = np.empty_like(reject)
        reject_[pvals_sortind] = reject
        return reject_, pvals_corrected_
    else:
        return reject, pvals_corrected


def fdrcorrection_twostage(pvals, alpha=0.05, method='bky', iter=False,
                           is_sorted=False):
    '''(iterated) two stage linear step-up procedure with estimation of the
    number of true hypotheses

    Benjamini, Krieger and Yekutieli, procedure in Definition 6

    Parameters
    ----------
    pvals : array_like
        set of p-values of the individual tests.
    alpha : float
        error rate
    method : {'bky', 'bh'}
        see Notes for details

        * 'bky' - implements the procedure in Definition 6 of Benjamini,
          Krieger and Yekutieli 2006
        * 'bh' - the two stage method of Benjamini and Hochberg

    iter : bool
        If True, iterate the procedure until the number of rejections
        converges; if False (default), run only the two stages.

    Returns
    -------
    rejected : ndarray, bool
        True if a hypothesis is rejected, False if not
    pvalue-corrected : ndarray
        pvalues adjusted for multiple hypotheses testing to limit FDR
    m0 : int
        ntests - rej, estimated number of true (null) hypotheses
    alpha_stages : list of floats
        A list of alphas that have been used at each stage

    Notes
    -----
    The returned corrected p-values are specific to the given alpha, they
    cannot be used for a different alpha.

    The returned corrected p-values are from the last stage of the fdr_bh
    linear step-up procedure (``fdrcorrection`` with ``method='indep'``)
    corrected for the estimated fraction of true hypotheses.
    This means that the rejection decision can be obtained with
    ``pval_corrected <= alpha``, where ``alpha`` is the original significance
    level.
    (Note: This has changed from earlier versions (<0.5.0) of statsmodels.)

    BKY described several other multi-stage methods, which would be easy to
    implement.  However, in their simulation the simple two-stage method
    (with iter=False) was the most robust to the presence of positive
    correlation.

    TODO: What should be returned?
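
    Examples
    --------
    A small sketch: with the 'bky' method the first stage is run at level
    ``alpha / (1 + alpha)`` and the second stage at a level inflated by the
    estimated fraction of true null hypotheses.

    >>> from statsmodels.stats.multitest import fdrcorrection_twostage
    >>> pvals = [0.01, 0.02, 0.03, 0.05]
    >>> rejected, pvals_corr, m0, stages = fdrcorrection_twostage(
    ...     pvals, alpha=0.05, method='bky')
    >>> int(rejected.sum()), int(m0)
    (4, 0)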

    '''
    pvals = np.asarray(pvals)

    if not is_sorted:
        pvals_sortind = np.argsort(pvals)
        pvals = np.take(pvals, pvals_sortind)

    ntests = len(pvals)
    if method == 'bky':
        fact = (1. + alpha)
        alpha_prime = alpha / fact
    elif method == 'bh':
        fact = 1.
        alpha_prime = alpha
    else:
        raise ValueError("only 'bky' and 'bh' are available as method")

    alpha_stages = [alpha_prime]
    rej, pvalscorr = fdrcorrection(pvals, alpha=alpha_prime, method='indep',
                                   is_sorted=True)
    r1 = rej.sum()
    if (r1 == 0) or (r1 == ntests):
        # nothing or everything is rejected, no second stage is needed
        return rej, pvalscorr * fact, ntests - r1, alpha_stages
    ri_old = r1

    while True:
        # number of hypotheses not rejected so far, an estimate of the
        # number of true null hypotheses
        ntests0 = 1.0 * ntests - ri_old
        alpha_star = alpha_prime * ntests / ntests0
        alpha_stages.append(alpha_star)
        rej, pvalscorr = fdrcorrection(pvals, alpha=alpha_star,
                                       method='indep', is_sorted=True)
        ri = rej.sum()
        if (not iter) or ri == ri_old:
            break
        elif ri < ri_old:
            # prevent cycles and endless loops
            raise RuntimeError("number of rejections decreased between "
                               "stages, aborting to prevent a cycle")
        ri_old = ri

    # adjust pvalscorr by the estimated fraction of true null hypotheses;
    # the decision is then pvalscorr <= alpha
    pvalscorr *= ntests0 * 1.0 / ntests
    if method == 'bky':
        pvalscorr *= (1. + alpha)

    if not is_sorted:
        pvalscorr_ = np.empty_like(pvalscorr)
        pvalscorr_[pvals_sortind] = pvalscorr
        del pvalscorr
        reject = np.empty_like(rej)
        reject[pvals_sortind] = rej
        return reject, pvalscorr_, ntests - ri, alpha_stages
    else:
        return rej, pvalscorr, ntests - ri, alpha_stages


def local_fdr(zscores, null_proportion=1.0, null_pdf=None, deg=7,
              nbins=30, alpha=0):
    """
    Calculate local FDR values for a list of Z-scores.

    Parameters
    ----------
    zscores : array_like
        A vector of Z-scores
    null_proportion : float
        The assumed proportion of true null hypotheses
    null_pdf : function mapping reals to positive reals
        The density of null Z-scores; if None, use standard normal
    deg : int
        The maximum exponent in the polynomial expansion of the
        density of non-null Z-scores
    nbins : int
        The number of bins for estimating the marginal density
        of Z-scores.
    alpha : float
        Use Poisson ridge regression with parameter alpha to estimate
        the density of non-null Z-scores.

    Returns
    -------
    fdr : array_like
        A vector of FDR values

    References
    ----------
    B Efron (2008).  Microarrays, Empirical Bayes, and the Two-Groups
    Model.  Statistical Science 23:1, 1-22.

    Examples
    --------
    Basic use (the null Z-scores are taken to be standard normal):

    >>> from statsmodels.stats.multitest import local_fdr
    >>> import numpy as np
    >>> zscores = np.random.randn(30)
    >>> fdr = local_fdr(zscores)

    Use a Gaussian null distribution estimated from the data:

    >>> from statsmodels.stats.multitest import NullDistribution
    >>> null = NullDistribution(zscores)
    >>> fdr = local_fdr(zscores, null_pdf=null.pdf)
    """

    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod.generalized_linear_model import families
    from statsmodels.regression.linear_model import OLS

    # Bins for Poisson modeling of the marginal Z-score density
    minz = min(zscores)
    maxz = max(zscores)
    bins = np.linspace(minz, maxz, nbins)

    # Bin counts
    zhist = np.histogram(zscores, bins)[0]

    # Bin centers
    zbins = (bins[:-1] + bins[1:]) / 2

    # The design matrix at bin centers
    dmat = np.vander(zbins, deg + 1)

    # Rescale the design matrix
    sd = dmat.std(0)
    ii = sd > 1e-8
    dmat[:, ii] /= sd[ii]

    # OLS on the log counts provides starting values for the Poisson fit
    start = OLS(np.log(1 + zhist), dmat).fit().params

    # Poisson regression
    if alpha > 0:
        md = GLM(zhist, dmat, family=families.Poisson()).fit_regularized(
            L1_wt=0, alpha=alpha, start_params=start)
    else:
        md = GLM(zhist, dmat, family=families.Poisson()).fit(
            start_params=start)

    # The design matrix for all Z-scores
    dmat_full = np.vander(zscores, deg + 1)
    dmat_full[:, ii] /= sd[ii]

    # The height of the estimated marginal density of Z-scores,
    # evaluated at every observed Z-score.
    fz = md.predict(dmat_full) / (len(zscores) * (bins[1] - bins[0]))

    # The null density.
    if null_pdf is None:
        f0 = np.exp(-0.5 * zscores**2) / np.sqrt(2 * np.pi)
    else:
        f0 = null_pdf(zscores)

    # The local FDR values
    fdr = null_proportion * f0 / fz

    fdr = np.clip(fdr, 0, 1)

    return fdr


class NullDistribution(object):
    """
    Estimate a Gaussian distribution for the null Z-scores.

    The observed Z-scores consist of both null and non-null values.
    The fitted distribution of null Z-scores is Gaussian, but may have
    a non-zero mean and/or non-unit scale.

    Parameters
    ----------
    zscores : array_like
        The observed Z-scores.
    null_lb : float
        Z-scores between `null_lb` and `null_ub` are all considered to be
        true null hypotheses.
    null_ub : float
        See `null_lb`.
    estimate_mean : bool
        If True, estimate the mean of the distribution.  If False, the
        mean is fixed at zero.
    estimate_scale : bool
        If True, estimate the scale of the distribution.  If False, the
        scale parameter is fixed at 1.
    estimate_null_proportion : bool
        If True, estimate the proportion of true null hypotheses (i.e.
        the proportion of Z-scores with expected value zero).  If False,
        this parameter is fixed at 1.

    Attributes
    ----------
    mean : float
        The estimated mean of the empirical null distribution
    sd : float
        The estimated standard deviation of the empirical null distribution
    null_proportion : float
        The estimated proportion of true null hypotheses among all hypotheses

    References
    ----------
    B Efron (2008).  Microarrays, Empirical Bayes, and the Two-Groups
    Model.  Statistical Science 23:1, 1-22.

    Notes
    -----
    See also:

    http://nipy.org/nipy/labs/enn.html#nipy.algorithms.statistics.empirical_pvalue.NormalEmpiricalNull.fdr
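
    Examples
    --------
    A minimal sketch; the fitted null density can be passed to ``local_fdr``
    through its ``null_pdf`` argument (the estimates depend on the random
    draw, so no output is shown):

    >>> import numpy as np
    >>> from statsmodels.stats.multitest import NullDistribution, local_fdr
    >>> zscores = np.random.randn(500)
    >>> null = NullDistribution(zscores)
    >>> fdr = local_fdr(zscores, null_pdf=null.pdf)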
    """

    def __init__(self, zscores, null_lb=-1, null_ub=1, estimate_mean=True,
                 estimate_scale=True, estimate_null_proportion=False):

        # Extract the null Z-scores
        ii = np.flatnonzero((zscores >= null_lb) & (zscores <= null_ub))
        if len(ii) == 0:
            raise RuntimeError("No Z-scores fall between null_lb and null_ub")
        zscores0 = zscores[ii]

        # Number of Z-scores, and null Z-scores
        n_zs, n_zs0 = len(zscores), len(zscores0)

        # Unpack and transform the parameters to the natural scale, hold
        # parameters fixed as specified.
        def xform(params):

            mean = 0.
            sd = 1.
            prob = 1.

            ii = 0
            if estimate_mean:
                mean = params[ii]
                ii += 1
            if estimate_scale:
                sd = np.exp(params[ii])
                ii += 1
            if estimate_null_proportion:
                prob = 1 / (1 + np.exp(-params[ii]))

            return mean, sd, prob

        from scipy.stats.distributions import norm

        def fun(params):
            """
            Negative log-likelihood of z-scores.

            The single argument is a parameter vector packing up to three
            elements:

            mean : location parameter
            logscale : log of the scale parameter
            logitprop : logit of the proportion of true nulls

            The implementation follows section 4 from Efron 2008.
            """

            d, s, p = xform(params)

            # Mass within the central region
            central_mass = (norm.cdf((null_ub - d) / s) -
                            norm.cdf((null_lb - d) / s))

            # Probability that a Z-score is null and is in the central region
            cp = p * central_mass

            # Binomial term
            rval = n_zs0 * np.log(cp) + (n_zs - n_zs0) * np.log(1 - cp)

            # Truncated Gaussian term for null Z-scores
            zv = (zscores0 - d) / s
            rval += np.sum(-zv**2 / 2) - n_zs0 * np.log(s)
            rval -= n_zs0 * np.log(central_mass)

            return -rval

        # Estimate the parameters
        from scipy.optimize import minimize
        # starting values are mean = 0, scale = 1, p0 ~ 1
        mz = minimize(fun, np.r_[0., 0, 3], method="Nelder-Mead")
        mean, sd, prob = xform(mz['x'])

        self.mean = mean
        self.sd = sd
        self.null_proportion = prob

    # The fitted null density function
    def pdf(self, zscores):
        """
        Evaluates the fitted empirical null Z-score density.

        Parameters
        ----------
        zscores : scalar or array_like
            The point or points at which the density is to be
            evaluated.

        Returns
        -------
        The empirical null Z-score density evaluated at the given
        points.
        """

        zval = (zscores - self.mean) / self.sd
        return np.exp(-0.5 * zval**2 - np.log(self.sd)
                      - 0.5 * np.log(2 * np.pi))