1""" 2Empirical likelihood inference on descriptive statistics 3 4This module conducts hypothesis tests and constructs confidence 5intervals for the mean, variance, skewness, kurtosis and correlation. 6 7If matplotlib is installed, this module can also generate multivariate 8confidence region plots as well as mean-variance contour plots. 9 10See _OptFuncts docstring for technical details and optimization variable 11definitions. 12 13General References: 14------------------ 15Owen, A. (2001). "Empirical Likelihood." Chapman and Hall 16 17""" 18import numpy as np 19from scipy import optimize 20from scipy.stats import chi2, skew, kurtosis 21from statsmodels.base.optimizer import _fit_newton 22import itertools 23from statsmodels.graphics import utils 24 25 26def DescStat(endog): 27 """ 28 Returns an instance to conduct inference on descriptive statistics 29 via empirical likelihood. See DescStatUV and DescStatMV for more 30 information. 31 32 Parameters 33 ---------- 34 endog : ndarray 35 Array of data 36 37 Returns : DescStat instance 38 If k=1, the function returns a univariate instance, DescStatUV. 39 If k>1, the function returns a multivariate instance, DescStatMV. 40 """ 41 if endog.ndim == 1: 42 endog = endog.reshape(len(endog), 1) 43 if endog.shape[1] == 1: 44 return DescStatUV(endog) 45 if endog.shape[1] > 1: 46 return DescStatMV(endog) 47 48 49class _OptFuncts(object): 50 """ 51 A class that holds functions that are optimized/solved. 52 53 The general setup of the class is simple. Any method that starts with 54 _opt_ creates a vector of estimating equations named est_vect such that 55 np.dot(p, (est_vect))=0 where p is the weight on each 56 observation as a 1 x n array and est_vect is n x k. Then _modif_Newton is 57 called to determine the optimal p by solving for the Lagrange multiplier 58 (eta) in the profile likelihood maximization problem. In the presence 59 of nuisance parameters, _opt_ functions are optimized over to profile 60 out the nuisance parameters. 61 62 Any method starting with _ci_limits calculates the log likelihood 63 ratio for a specific value of a parameter and then subtracts a 64 pre-specified critical value. This is solved so that llr - crit = 0. 65 """ 66 67 def __init__(self, endog): 68 pass 69 70 def _log_star(self, eta, est_vect, weights, nobs): 71 """ 72 Transforms the log of observation probabilities in terms of the 73 Lagrange multiplier to the log 'star' of the probabilities. 74 75 Parameters 76 ---------- 77 eta : float 78 Lagrange multiplier 79 80 est_vect : ndarray (n,k) 81 Estimating equations vector 82 83 wts : nx1 array 84 Observation weights 85 86 Returns 87 ------ 88 data_star : ndarray 89 The weighted logstar of the estimting equations 90 91 Notes 92 ----- 93 This function is only a placeholder for the _fit_Newton. 94 The function value is not used in optimization and the optimal value 95 is disregarded when computing the log likelihood ratio. 96 """ 97 data_star = np.log(weights) + (np.sum(weights) +\ 98 np.dot(est_vect, eta)) 99 idx = data_star < 1. / nobs 100 not_idx = ~idx 101 nx = nobs * data_star[idx] 102 data_star[idx] = np.log(1. / nobs) - 1.5 + nx * (2. - nx / 2) 103 data_star[not_idx] = np.log(data_star[not_idx]) 104 return data_star 105 106 def _hess(self, eta, est_vect, weights, nobs): 107 """ 108 Calculates the hessian of a weighted empirical likelihood 109 problem. 110 111 Parameters 112 ---------- 113 eta : ndarray, (1,m) 114 Lagrange multiplier in the profile likelihood maximization 115 116 est_vect : ndarray (n,k) 117 Estimating equations vector 118 119 weights : 1darray 120 Observation weights 121 122 Returns 123 ------- 124 hess : m x m array 125 Weighted hessian used in _wtd_modif_newton 126 """ 127 #eta = np.squeeze(eta) 128 data_star_doub_prime = np.sum(weights) + np.dot(est_vect, eta) 129 idx = data_star_doub_prime < 1. / nobs 130 not_idx = ~idx 131 data_star_doub_prime[idx] = - nobs ** 2 132 data_star_doub_prime[not_idx] = - (data_star_doub_prime[not_idx]) ** -2 133 wtd_dsdp = weights * data_star_doub_prime 134 return np.dot(est_vect.T, wtd_dsdp[:, None] * est_vect) 135 136 def _grad(self, eta, est_vect, weights, nobs): 137 """ 138 Calculates the gradient of a weighted empirical likelihood 139 problem 140 141 Parameters 142 ---------- 143 eta : ndarray, (1,m) 144 Lagrange multiplier in the profile likelihood maximization 145 146 est_vect : ndarray, (n,k) 147 Estimating equations vector 148 149 weights : 1darray 150 Observation weights 151 152 Returns 153 ------- 154 gradient : ndarray (m,1) 155 The gradient used in _wtd_modif_newton 156 """ 157 #eta = np.squeeze(eta) 158 data_star_prime = np.sum(weights) + np.dot(est_vect, eta) 159 idx = data_star_prime < 1. / nobs 160 not_idx = ~idx 161 data_star_prime[idx] = nobs * (2 - nobs * data_star_prime[idx]) 162 data_star_prime[not_idx] = 1. / data_star_prime[not_idx] 163 return np.dot(weights * data_star_prime, est_vect) 164 165 def _modif_newton(self, eta, est_vect, weights): 166 """ 167 Modified Newton's method for maximizing the log 'star' equation. This 168 function calls _fit_newton to find the optimal values of eta. 169 170 Parameters 171 ---------- 172 eta : ndarray, (1,m) 173 Lagrange multiplier in the profile likelihood maximization 174 175 est_vect : ndarray, (n,k) 176 Estimating equations vector 177 178 weights : 1darray 179 Observation weights 180 181 Returns 182 ------- 183 params : 1xm array 184 Lagrange multiplier that maximizes the log-likelihood 185 """ 186 nobs = len(est_vect) 187 f = lambda x0: - np.sum(self._log_star(x0, est_vect, weights, nobs)) 188 grad = lambda x0: - self._grad(x0, est_vect, weights, nobs) 189 hess = lambda x0: - self._hess(x0, est_vect, weights, nobs) 190 kwds = {'tol': 1e-8} 191 eta = eta.squeeze() 192 res = _fit_newton(f, grad, eta, (), kwds, hess=hess, maxiter=50, \ 193 disp=0) 194 return res[0] 195 196 def _find_eta(self, eta): 197 """ 198 Finding the root of sum(xi-h0)/(1+eta(xi-mu)) solves for 199 eta when computing ELR for univariate mean. 200 201 Parameters 202 ---------- 203 eta : float 204 Lagrange multiplier in the empirical likelihood maximization 205 206 Returns 207 ------- 208 llr : float 209 n times the log likelihood value for a given value of eta 210 """ 211 return np.sum((self.endog - self.mu0) / \ 212 (1. + eta * (self.endog - self.mu0))) 213 214 def _ci_limits_mu(self, mu): 215 """ 216 Calculates the difference between the log likelihood of mu_test and a 217 specified critical value. 218 219 Parameters 220 ---------- 221 mu : float 222 Hypothesized value of the mean. 223 224 Returns 225 ------- 226 diff : float 227 The difference between the log likelihood value of mu0 and 228 a specified value. 229 """ 230 return self.test_mean(mu)[0] - self.r0 231 232 def _find_gamma(self, gamma): 233 """ 234 Finds gamma that satisfies 235 sum(log(n * w(gamma))) - log(r0) = 0 236 237 Used for confidence intervals for the mean 238 239 Parameters 240 ---------- 241 gamma : float 242 Lagrange multiplier when computing confidence interval 243 244 Returns 245 ------- 246 diff : float 247 The difference between the log-liklihood when the Lagrange 248 multiplier is gamma and a pre-specified value 249 """ 250 denom = np.sum((self.endog - gamma) ** -1) 251 new_weights = (self.endog - gamma) ** -1 / denom 252 return -2 * np.sum(np.log(self.nobs * new_weights)) - \ 253 self.r0 254 255 def _opt_var(self, nuisance_mu, pval=False): 256 """ 257 This is the function to be optimized over a nuisance mean parameter 258 to determine the likelihood ratio for the variance 259 260 Parameters 261 ---------- 262 nuisance_mu : float 263 Value of a nuisance mean parameter 264 265 Returns 266 ------- 267 llr : float 268 Log likelihood of a pre-specified variance holding the nuisance 269 parameter constant 270 """ 271 endog = self.endog 272 nobs = self.nobs 273 sig_data = ((endog - nuisance_mu) ** 2 \ 274 - self.sig2_0) 275 mu_data = (endog - nuisance_mu) 276 est_vect = np.column_stack((mu_data, sig_data)) 277 eta_star = self._modif_newton(np.array([1. / nobs, 278 1. / nobs]), est_vect, 279 np.ones(nobs) * (1. / nobs)) 280 281 denom = 1 + np.dot(eta_star, est_vect.T) 282 self.new_weights = 1. / nobs * 1. / denom 283 llr = np.sum(np.log(nobs * self.new_weights)) 284 if pval: # Used for contour plotting 285 return chi2.sf(-2 * llr, 1) 286 return -2 * llr 287 288 def _ci_limits_var(self, var): 289 """ 290 Used to determine the confidence intervals for the variance. 291 It calls test_var and when called by an optimizer, 292 finds the value of sig2_0 that is chi2.ppf(significance-level) 293 294 Parameters 295 ---------- 296 var_test : float 297 Hypothesized value of the variance 298 299 Returns 300 ------- 301 diff : float 302 The difference between the log likelihood ratio at var_test and a 303 pre-specified value. 304 """ 305 return self.test_var(var)[0] - self.r0 306 307 def _opt_skew(self, nuis_params): 308 """ 309 Called by test_skew. This function is optimized over 310 nuisance parameters mu and sigma 311 312 Parameters 313 ---------- 314 nuis_params : 1darray 315 An array with a nuisance mean and variance parameter 316 317 Returns 318 ------- 319 llr : float 320 The log likelihood ratio of a pre-specified skewness holding 321 the nuisance parameters constant. 322 """ 323 endog = self.endog 324 nobs = self.nobs 325 mu_data = endog - nuis_params[0] 326 sig_data = ((endog - nuis_params[0]) ** 2) - nuis_params[1] 327 skew_data = ((((endog - nuis_params[0]) ** 3) / 328 (nuis_params[1] ** 1.5))) - self.skew0 329 est_vect = np.column_stack((mu_data, sig_data, skew_data)) 330 eta_star = self._modif_newton(np.array([1. / nobs, 331 1. / nobs, 332 1. / nobs]), est_vect, 333 np.ones(nobs) * (1. / nobs)) 334 denom = 1. + np.dot(eta_star, est_vect.T) 335 self.new_weights = 1. / nobs * 1. / denom 336 llr = np.sum(np.log(nobs * self.new_weights)) 337 return -2 * llr 338 339 def _opt_kurt(self, nuis_params): 340 """ 341 Called by test_kurt. This function is optimized over 342 nuisance parameters mu and sigma 343 344 Parameters 345 ---------- 346 nuis_params : 1darray 347 An array with a nuisance mean and variance parameter 348 349 Returns 350 ------- 351 llr : float 352 The log likelihood ratio of a pre-speified kurtosis holding the 353 nuisance parameters constant 354 """ 355 endog = self.endog 356 nobs = self.nobs 357 mu_data = endog - nuis_params[0] 358 sig_data = ((endog - nuis_params[0]) ** 2) - nuis_params[1] 359 kurt_data = (((((endog - nuis_params[0]) ** 4) / \ 360 (nuis_params[1] ** 2))) - 3) - self.kurt0 361 est_vect = np.column_stack((mu_data, sig_data, kurt_data)) 362 eta_star = self._modif_newton(np.array([1. / nobs, 363 1. / nobs, 364 1. / nobs]), est_vect, 365 np.ones(nobs) * (1. / nobs)) 366 denom = 1 + np.dot(eta_star, est_vect.T) 367 self.new_weights = 1. / nobs * 1. / denom 368 llr = np.sum(np.log(nobs * self.new_weights)) 369 return -2 * llr 370 371 def _opt_skew_kurt(self, nuis_params): 372 """ 373 Called by test_joint_skew_kurt. This function is optimized over 374 nuisance parameters mu and sigma 375 376 Parameters 377 ---------- 378 nuis_params : 1darray 379 An array with a nuisance mean and variance parameter 380 381 Returns 382 ------ 383 llr : float 384 The log likelihood ratio of a pre-speified skewness and 385 kurtosis holding the nuisance parameters constant. 386 """ 387 endog = self.endog 388 nobs = self.nobs 389 mu_data = endog - nuis_params[0] 390 sig_data = ((endog - nuis_params[0]) ** 2) - nuis_params[1] 391 skew_data = ((((endog - nuis_params[0]) ** 3) / \ 392 (nuis_params[1] ** 1.5))) - self.skew0 393 kurt_data = (((((endog - nuis_params[0]) ** 4) / \ 394 (nuis_params[1] ** 2))) - 3) - self.kurt0 395 est_vect = np.column_stack((mu_data, sig_data, skew_data, kurt_data)) 396 eta_star = self._modif_newton(np.array([1. / nobs, 397 1. / nobs, 398 1. / nobs, 399 1. / nobs]), est_vect, 400 np.ones(nobs) * (1. / nobs)) 401 denom = 1. + np.dot(eta_star, est_vect.T) 402 self.new_weights = 1. / nobs * 1. / denom 403 llr = np.sum(np.log(nobs * self.new_weights)) 404 return -2 * llr 405 406 def _ci_limits_skew(self, skew): 407 """ 408 Parameters 409 ---------- 410 skew0 : float 411 Hypothesized value of skewness 412 413 Returns 414 ------- 415 diff : float 416 The difference between the log likelihood ratio at skew and a 417 pre-specified value. 418 """ 419 return self.test_skew(skew)[0] - self.r0 420 421 def _ci_limits_kurt(self, kurt): 422 """ 423 Parameters 424 ---------- 425 skew0 : float 426 Hypothesized value of kurtosis 427 428 Returns 429 ------- 430 diff : float 431 The difference between the log likelihood ratio at kurt and a 432 pre-specified value. 433 """ 434 return self.test_kurt(kurt)[0] - self.r0 435 436 def _opt_correl(self, nuis_params, corr0, endog, nobs, x0, weights0): 437 """ 438 Parameters 439 ---------- 440 nuis_params : 1darray 441 Array containing two nuisance means and two nuisance variances 442 443 Returns 444 ------- 445 llr : float 446 The log-likelihood of the correlation coefficient holding nuisance 447 parameters constant 448 """ 449 mu1_data, mu2_data = (endog - nuis_params[::2]).T 450 sig1_data = mu1_data ** 2 - nuis_params[1] 451 sig2_data = mu2_data ** 2 - nuis_params[3] 452 correl_data = ((mu1_data * mu2_data) - corr0 * 453 (nuis_params[1] * nuis_params[3]) ** .5) 454 est_vect = np.column_stack((mu1_data, sig1_data, 455 mu2_data, sig2_data, correl_data)) 456 eta_star = self._modif_newton(x0, est_vect, weights0) 457 denom = 1. + np.dot(est_vect, eta_star) 458 self.new_weights = 1. / nobs * 1. / denom 459 llr = np.sum(np.log(nobs * self.new_weights)) 460 return -2 * llr 461 462 def _ci_limits_corr(self, corr): 463 return self.test_corr(corr)[0] - self.r0 464 465 466class DescStatUV(_OptFuncts): 467 """ 468 A class to compute confidence intervals and hypothesis tests involving 469 mean, variance, kurtosis and skewness of a univariate random variable. 470 471 Parameters 472 ---------- 473 endog : 1darray 474 Data to be analyzed 475 476 Attributes 477 ---------- 478 endog : 1darray 479 Data to be analyzed 480 481 nobs : float 482 Number of observations 483 """ 484 485 def __init__(self, endog): 486 self.endog = np.squeeze(endog) 487 self.nobs = endog.shape[0] 488 489 def test_mean(self, mu0, return_weights=False): 490 """ 491 Returns - 2 x log-likelihood ratio, p-value and weights 492 for a hypothesis test of the mean. 493 494 Parameters 495 ---------- 496 mu0 : float 497 Mean value to be tested 498 499 return_weights : bool 500 If return_weights is True the function returns 501 the weights of the observations under the null hypothesis. 502 Default is False 503 504 Returns 505 ------- 506 test_results : tuple 507 The log-likelihood ratio and p-value of mu0 508 """ 509 self.mu0 = mu0 510 endog = self.endog 511 nobs = self.nobs 512 eta_min = (1. - (1. / nobs)) / (self.mu0 - max(endog)) 513 eta_max = (1. - (1. / nobs)) / (self.mu0 - min(endog)) 514 eta_star = optimize.brentq(self._find_eta, eta_min, eta_max) 515 new_weights = (1. / nobs) * 1. / (1. + eta_star * (endog - self.mu0)) 516 llr = -2 * np.sum(np.log(nobs * new_weights)) 517 if return_weights: 518 return llr, chi2.sf(llr, 1), new_weights 519 else: 520 return llr, chi2.sf(llr, 1) 521 522 def ci_mean(self, sig=.05, method='gamma', epsilon=10 ** -8, 523 gamma_low=-10 ** 10, gamma_high=10 ** 10): 524 """ 525 Returns the confidence interval for the mean. 526 527 Parameters 528 ---------- 529 sig : float 530 significance level. Default is .05 531 532 method : str 533 Root finding method, Can be 'nested-brent' or 534 'gamma'. Default is 'gamma' 535 536 'gamma' Tries to solve for the gamma parameter in the 537 Lagrange (see Owen pg 22) and then determine the weights. 538 539 'nested brent' uses brents method to find the confidence 540 intervals but must maximize the likelihood ratio on every 541 iteration. 542 543 gamma is generally much faster. If the optimizations does not 544 converge, try expanding the gamma_high and gamma_low 545 variable. 546 547 gamma_low : float 548 Lower bound for gamma when finding lower limit. 549 If function returns f(a) and f(b) must have different signs, 550 consider lowering gamma_low. 551 552 gamma_high : float 553 Upper bound for gamma when finding upper limit. 554 If function returns f(a) and f(b) must have different signs, 555 consider raising gamma_high. 556 557 epsilon : float 558 When using 'nested-brent', amount to decrease (increase) 559 from the maximum (minimum) of the data when 560 starting the search. This is to protect against the 561 likelihood ratio being zero at the maximum (minimum) 562 value of the data. If data is very small in absolute value 563 (<10 ``**`` -6) consider shrinking epsilon 564 565 When using 'gamma', amount to decrease (increase) the 566 minimum (maximum) by to start the search for gamma. 567 If function returns f(a) and f(b) must have different signs, 568 consider lowering epsilon. 569 570 Returns 571 ------- 572 Interval : tuple 573 Confidence interval for the mean 574 """ 575 endog = self.endog 576 sig = 1 - sig 577 if method == 'nested-brent': 578 self.r0 = chi2.ppf(sig, 1) 579 middle = np.mean(endog) 580 epsilon_u = (max(endog) - np.mean(endog)) * epsilon 581 epsilon_l = (np.mean(endog) - min(endog)) * epsilon 582 ulim = optimize.brentq(self._ci_limits_mu, middle, 583 max(endog) - epsilon_u) 584 llim = optimize.brentq(self._ci_limits_mu, middle, 585 min(endog) + epsilon_l) 586 return llim, ulim 587 588 if method == 'gamma': 589 self.r0 = chi2.ppf(sig, 1) 590 gamma_star_l = optimize.brentq(self._find_gamma, gamma_low, 591 min(endog) - epsilon) 592 gamma_star_u = optimize.brentq(self._find_gamma, \ 593 max(endog) + epsilon, gamma_high) 594 weights_low = ((endog - gamma_star_l) ** -1) / \ 595 np.sum((endog - gamma_star_l) ** -1) 596 weights_high = ((endog - gamma_star_u) ** -1) / \ 597 np.sum((endog - gamma_star_u) ** -1) 598 mu_low = np.sum(weights_low * endog) 599 mu_high = np.sum(weights_high * endog) 600 return mu_low, mu_high 601 602 def test_var(self, sig2_0, return_weights=False): 603 """ 604 Returns -2 x log-likelihood ratio and the p-value for the 605 hypothesized variance 606 607 Parameters 608 ---------- 609 sig2_0 : float 610 Hypothesized variance to be tested 611 612 return_weights : bool 613 If True, returns the weights that maximize the 614 likelihood of observing sig2_0. Default is False 615 616 Returns 617 ------- 618 test_results : tuple 619 The log-likelihood ratio and the p_value of sig2_0 620 621 Examples 622 -------- 623 >>> import numpy as np 624 >>> import statsmodels.api as sm 625 >>> random_numbers = np.random.standard_normal(1000)*100 626 >>> el_analysis = sm.emplike.DescStat(random_numbers) 627 >>> hyp_test = el_analysis.test_var(9500) 628 """ 629 self.sig2_0 = sig2_0 630 mu_max = max(self.endog) 631 mu_min = min(self.endog) 632 llr = optimize.fminbound(self._opt_var, mu_min, mu_max, \ 633 full_output=1)[1] 634 p_val = chi2.sf(llr, 1) 635 if return_weights: 636 return llr, p_val, self.new_weights.T 637 else: 638 return llr, p_val 639 640 def ci_var(self, lower_bound=None, upper_bound=None, sig=.05): 641 """ 642 Returns the confidence interval for the variance. 643 644 Parameters 645 ---------- 646 lower_bound : float 647 The minimum value the lower confidence interval can 648 take. The p-value from test_var(lower_bound) must be lower 649 than 1 - significance level. Default is .99 confidence 650 limit assuming normality 651 652 upper_bound : float 653 The maximum value the upper confidence interval 654 can take. The p-value from test_var(upper_bound) must be lower 655 than 1 - significance level. Default is .99 confidence 656 limit assuming normality 657 658 sig : float 659 The significance level. Default is .05 660 661 Returns 662 ------- 663 Interval : tuple 664 Confidence interval for the variance 665 666 Examples 667 -------- 668 >>> import numpy as np 669 >>> import statsmodels.api as sm 670 >>> random_numbers = np.random.standard_normal(100) 671 >>> el_analysis = sm.emplike.DescStat(random_numbers) 672 >>> el_analysis.ci_var() 673 (0.7539322567470305, 1.229998852496268) 674 >>> el_analysis.ci_var(.5, 2) 675 (0.7539322567469926, 1.2299988524962664) 676 677 Notes 678 ----- 679 If the function returns the error f(a) and f(b) must have 680 different signs, consider lowering lower_bound and raising 681 upper_bound. 682 """ 683 endog = self.endog 684 if upper_bound is None: 685 upper_bound = ((self.nobs - 1) * endog.var()) / \ 686 (chi2.ppf(.0001, self.nobs - 1)) 687 if lower_bound is None: 688 lower_bound = ((self.nobs - 1) * endog.var()) / \ 689 (chi2.ppf(.9999, self.nobs - 1)) 690 self.r0 = chi2.ppf(1 - sig, 1) 691 llim = optimize.brentq(self._ci_limits_var, lower_bound, endog.var()) 692 ulim = optimize.brentq(self._ci_limits_var, endog.var(), upper_bound) 693 return llim, ulim 694 695 def plot_contour(self, mu_low, mu_high, var_low, var_high, mu_step, 696 var_step, 697 levs=[.2, .1, .05, .01, .001]): 698 """ 699 Returns a plot of the confidence region for a univariate 700 mean and variance. 701 702 Parameters 703 ---------- 704 mu_low : float 705 Lowest value of the mean to plot 706 707 mu_high : float 708 Highest value of the mean to plot 709 710 var_low : float 711 Lowest value of the variance to plot 712 713 var_high : float 714 Highest value of the variance to plot 715 716 mu_step : float 717 Increments to evaluate the mean 718 719 var_step : float 720 Increments to evaluate the mean 721 722 levs : list 723 Which values of significance the contour lines will be drawn. 724 Default is [.2, .1, .05, .01, .001] 725 726 Returns 727 ------- 728 Figure 729 The contour plot 730 """ 731 fig, ax = utils.create_mpl_ax() 732 ax.set_ylabel('Variance') 733 ax.set_xlabel('Mean') 734 mu_vect = list(np.arange(mu_low, mu_high, mu_step)) 735 var_vect = list(np.arange(var_low, var_high, var_step)) 736 z = [] 737 for sig0 in var_vect: 738 self.sig2_0 = sig0 739 for mu0 in mu_vect: 740 z.append(self._opt_var(mu0, pval=True)) 741 z = np.asarray(z).reshape(len(var_vect), len(mu_vect)) 742 ax.contour(mu_vect, var_vect, z, levels=levs) 743 return fig 744 745 def test_skew(self, skew0, return_weights=False): 746 """ 747 Returns -2 x log-likelihood and p-value for the hypothesized 748 skewness. 749 750 Parameters 751 ---------- 752 skew0 : float 753 Skewness value to be tested 754 755 return_weights : bool 756 If True, function also returns the weights that 757 maximize the likelihood ratio. Default is False. 758 759 Returns 760 ------- 761 test_results : tuple 762 The log-likelihood ratio and p_value of skew0 763 """ 764 self.skew0 = skew0 765 start_nuisance = np.array([self.endog.mean(), 766 self.endog.var()]) 767 768 llr = optimize.fmin_powell(self._opt_skew, start_nuisance, 769 full_output=1, disp=0)[1] 770 p_val = chi2.sf(llr, 1) 771 if return_weights: 772 return llr, p_val, self.new_weights.T 773 return llr, p_val 774 775 def test_kurt(self, kurt0, return_weights=False): 776 """ 777 Returns -2 x log-likelihood and the p-value for the hypothesized 778 kurtosis. 779 780 Parameters 781 ---------- 782 kurt0 : float 783 Kurtosis value to be tested 784 785 return_weights : bool 786 If True, function also returns the weights that 787 maximize the likelihood ratio. Default is False. 788 789 Returns 790 ------- 791 test_results : tuple 792 The log-likelihood ratio and p-value of kurt0 793 """ 794 self.kurt0 = kurt0 795 start_nuisance = np.array([self.endog.mean(), 796 self.endog.var()]) 797 798 llr = optimize.fmin_powell(self._opt_kurt, start_nuisance, 799 full_output=1, disp=0)[1] 800 p_val = chi2.sf(llr, 1) 801 if return_weights: 802 return llr, p_val, self.new_weights.T 803 return llr, p_val 804 805 def test_joint_skew_kurt(self, skew0, kurt0, return_weights=False): 806 """ 807 Returns - 2 x log-likelihood and the p-value for the joint 808 hypothesis test for skewness and kurtosis 809 810 Parameters 811 ---------- 812 skew0 : float 813 Skewness value to be tested 814 kurt0 : float 815 Kurtosis value to be tested 816 817 return_weights : bool 818 If True, function also returns the weights that 819 maximize the likelihood ratio. Default is False. 820 821 Returns 822 ------- 823 test_results : tuple 824 The log-likelihood ratio and p-value of the joint hypothesis test. 825 """ 826 self.skew0 = skew0 827 self.kurt0 = kurt0 828 start_nuisance = np.array([self.endog.mean(), 829 self.endog.var()]) 830 831 llr = optimize.fmin_powell(self._opt_skew_kurt, start_nuisance, 832 full_output=1, disp=0)[1] 833 p_val = chi2.sf(llr, 2) 834 if return_weights: 835 return llr, p_val, self.new_weights.T 836 return llr, p_val 837 838 def ci_skew(self, sig=.05, upper_bound=None, lower_bound=None): 839 """ 840 Returns the confidence interval for skewness. 841 842 Parameters 843 ---------- 844 sig : float 845 The significance level. Default is .05 846 847 upper_bound : float 848 Maximum value of skewness the upper limit can be. 849 Default is .99 confidence limit assuming normality. 850 851 lower_bound : float 852 Minimum value of skewness the lower limit can be. 853 Default is .99 confidence level assuming normality. 854 855 Returns 856 ------- 857 Interval : tuple 858 Confidence interval for the skewness 859 860 Notes 861 ----- 862 If function returns f(a) and f(b) must have different signs, consider 863 expanding lower and upper bounds 864 """ 865 nobs = self.nobs 866 endog = self.endog 867 if upper_bound is None: 868 upper_bound = skew(endog) + \ 869 2.5 * ((6. * nobs * (nobs - 1.)) / \ 870 ((nobs - 2.) * (nobs + 1.) * \ 871 (nobs + 3.))) ** .5 872 if lower_bound is None: 873 lower_bound = skew(endog) - \ 874 2.5 * ((6. * nobs * (nobs - 1.)) / \ 875 ((nobs - 2.) * (nobs + 1.) * \ 876 (nobs + 3.))) ** .5 877 self.r0 = chi2.ppf(1 - sig, 1) 878 llim = optimize.brentq(self._ci_limits_skew, lower_bound, skew(endog)) 879 ulim = optimize.brentq(self._ci_limits_skew, skew(endog), upper_bound) 880 return llim, ulim 881 882 def ci_kurt(self, sig=.05, upper_bound=None, lower_bound=None): 883 """ 884 Returns the confidence interval for kurtosis. 885 886 Parameters 887 ---------- 888 889 sig : float 890 The significance level. Default is .05 891 892 upper_bound : float 893 Maximum value of kurtosis the upper limit can be. 894 Default is .99 confidence limit assuming normality. 895 896 lower_bound : float 897 Minimum value of kurtosis the lower limit can be. 898 Default is .99 confidence limit assuming normality. 899 900 Returns 901 ------- 902 Interval : tuple 903 Lower and upper confidence limit 904 905 Notes 906 ----- 907 For small n, upper_bound and lower_bound may have to be 908 provided by the user. Consider using test_kurt to find 909 values close to the desired significance level. 910 911 If function returns f(a) and f(b) must have different signs, consider 912 expanding the bounds. 913 """ 914 endog = self.endog 915 nobs = self.nobs 916 if upper_bound is None: 917 upper_bound = kurtosis(endog) + \ 918 (2.5 * (2. * ((6. * nobs * (nobs - 1.)) / \ 919 ((nobs - 2.) * (nobs + 1.) * \ 920 (nobs + 3.))) ** .5) * \ 921 (((nobs ** 2.) - 1.) / ((nobs - 3.) *\ 922 (nobs + 5.))) ** .5) 923 if lower_bound is None: 924 lower_bound = kurtosis(endog) - \ 925 (2.5 * (2. * ((6. * nobs * (nobs - 1.)) / \ 926 ((nobs - 2.) * (nobs + 1.) * \ 927 (nobs + 3.))) ** .5) * \ 928 (((nobs ** 2.) - 1.) / ((nobs - 3.) *\ 929 (nobs + 5.))) ** .5) 930 self.r0 = chi2.ppf(1 - sig, 1) 931 llim = optimize.brentq(self._ci_limits_kurt, lower_bound, \ 932 kurtosis(endog)) 933 ulim = optimize.brentq(self._ci_limits_kurt, kurtosis(endog), \ 934 upper_bound) 935 return llim, ulim 936 937 938class DescStatMV(_OptFuncts): 939 """ 940 A class for conducting inference on multivariate means and correlation. 941 942 Parameters 943 ---------- 944 endog : ndarray 945 Data to be analyzed 946 947 Attributes 948 ---------- 949 endog : ndarray 950 Data to be analyzed 951 952 nobs : float 953 Number of observations 954 """ 955 956 def __init__(self, endog): 957 self.endog = endog 958 self.nobs = endog.shape[0] 959 960 def mv_test_mean(self, mu_array, return_weights=False): 961 """ 962 Returns -2 x log likelihood and the p-value 963 for a multivariate hypothesis test of the mean 964 965 Parameters 966 ---------- 967 mu_array : 1d array 968 Hypothesized values for the mean. Must have same number of 969 elements as columns in endog 970 971 return_weights : bool 972 If True, returns the weights that maximize the 973 likelihood of mu_array. Default is False. 974 975 Returns 976 ------- 977 test_results : tuple 978 The log-likelihood ratio and p-value for mu_array 979 """ 980 endog = self.endog 981 nobs = self.nobs 982 if len(mu_array) != endog.shape[1]: 983 raise ValueError('mu_array must have the same number of ' 984 'elements as the columns of the data.') 985 mu_array = mu_array.reshape(1, endog.shape[1]) 986 means = np.ones((endog.shape[0], endog.shape[1])) 987 means = mu_array * means 988 est_vect = endog - means 989 start_vals = 1. / nobs * np.ones(endog.shape[1]) 990 eta_star = self._modif_newton(start_vals, est_vect, 991 np.ones(nobs) * (1. / nobs)) 992 denom = 1 + np.dot(eta_star, est_vect.T) 993 self.new_weights = 1 / nobs * 1 / denom 994 llr = -2 * np.sum(np.log(nobs * self.new_weights)) 995 p_val = chi2.sf(llr, mu_array.shape[1]) 996 if return_weights: 997 return llr, p_val, self.new_weights.T 998 else: 999 return llr, p_val 1000 1001 def mv_mean_contour(self, mu1_low, mu1_upp, mu2_low, mu2_upp, step1, step2, 1002 levs=(.001, .01, .05, .1, .2), var1_name=None, 1003 var2_name=None, plot_dta=False): 1004 """ 1005 Creates a confidence region plot for the mean of bivariate data 1006 1007 Parameters 1008 ---------- 1009 m1_low : float 1010 Minimum value of the mean for variable 1 1011 m1_upp : float 1012 Maximum value of the mean for variable 1 1013 mu2_low : float 1014 Minimum value of the mean for variable 2 1015 mu2_upp : float 1016 Maximum value of the mean for variable 2 1017 step1 : float 1018 Increment of evaluations for variable 1 1019 step2 : float 1020 Increment of evaluations for variable 2 1021 levs : list 1022 Levels to be drawn on the contour plot. 1023 Default = (.001, .01, .05, .1, .2) 1024 plot_dta : bool 1025 If True, makes a scatter plot of the data on 1026 top of the contour plot. Defaultis False. 1027 var1_name : str 1028 Name of variable 1 to be plotted on the x-axis 1029 var2_name : str 1030 Name of variable 2 to be plotted on the y-axis 1031 1032 Notes 1033 ----- 1034 The smaller the step size, the more accurate the intervals 1035 will be 1036 1037 If the function returns optimization failed, consider narrowing 1038 the boundaries of the plot 1039 1040 Examples 1041 -------- 1042 >>> import statsmodels.api as sm 1043 >>> two_rvs = np.random.standard_normal((20,2)) 1044 >>> el_analysis = sm.emplike.DescStat(two_rvs) 1045 >>> contourp = el_analysis.mv_mean_contour(-2, 2, -2, 2, .1, .1) 1046 >>> contourp.show() 1047 """ 1048 if self.endog.shape[1] != 2: 1049 raise ValueError('Data must contain exactly two variables') 1050 fig, ax = utils.create_mpl_ax() 1051 if var2_name is None: 1052 ax.set_ylabel('Variable 2') 1053 else: 1054 ax.set_ylabel(var2_name) 1055 if var1_name is None: 1056 ax.set_xlabel('Variable 1') 1057 else: 1058 ax.set_xlabel(var1_name) 1059 x = np.arange(mu1_low, mu1_upp, step1) 1060 y = np.arange(mu2_low, mu2_upp, step2) 1061 pairs = itertools.product(x, y) 1062 z = [] 1063 for i in pairs: 1064 z.append(self.mv_test_mean(np.asarray(i))[0]) 1065 X, Y = np.meshgrid(x, y) 1066 z = np.asarray(z) 1067 z = z.reshape(X.shape[1], Y.shape[0]) 1068 ax.contour(x, y, z.T, levels=levs) 1069 if plot_dta: 1070 ax.plot(self.endog[:, 0], self.endog[:, 1], 'bo') 1071 return fig 1072 1073 def test_corr(self, corr0, return_weights=0): 1074 """ 1075 Returns -2 x log-likelihood ratio and p-value for the 1076 correlation coefficient between 2 variables 1077 1078 Parameters 1079 ---------- 1080 corr0 : float 1081 Hypothesized value to be tested 1082 1083 return_weights : bool 1084 If true, returns the weights that maximize 1085 the log-likelihood at the hypothesized value 1086 """ 1087 nobs = self.nobs 1088 endog = self.endog 1089 if endog.shape[1] != 2: 1090 raise NotImplementedError('Correlation matrix not yet implemented') 1091 nuis0 = np.array([endog[:, 0].mean(), 1092 endog[:, 0].var(), 1093 endog[:, 1].mean(), 1094 endog[:, 1].var()]) 1095 1096 x0 = np.zeros(5) 1097 weights0 = np.array([1. / nobs] * int(nobs)) 1098 args = (corr0, endog, nobs, x0, weights0) 1099 llr = optimize.fmin(self._opt_correl, nuis0, args=args, 1100 full_output=1, disp=0)[1] 1101 p_val = chi2.sf(llr, 1) 1102 if return_weights: 1103 return llr, p_val, self.new_weights.T 1104 return llr, p_val 1105 1106 def ci_corr(self, sig=.05, upper_bound=None, lower_bound=None): 1107 """ 1108 Returns the confidence intervals for the correlation coefficient 1109 1110 Parameters 1111 ---------- 1112 sig : float 1113 The significance level. Default is .05 1114 1115 upper_bound : float 1116 Maximum value the upper confidence limit can be. 1117 Default is 99% confidence limit assuming normality. 1118 1119 lower_bound : float 1120 Minimum value the lower confidence limit can be. 1121 Default is 99% confidence limit assuming normality. 1122 1123 Returns 1124 ------- 1125 interval : tuple 1126 Confidence interval for the correlation 1127 """ 1128 endog = self.endog 1129 nobs = self.nobs 1130 self.r0 = chi2.ppf(1 - sig, 1) 1131 point_est = np.corrcoef(endog[:, 0], endog[:, 1])[0, 1] 1132 if upper_bound is None: 1133 upper_bound = min(.999, point_est + \ 1134 2.5 * ((1. - point_est ** 2.) / \ 1135 (nobs - 2.)) ** .5) 1136 1137 if lower_bound is None: 1138 lower_bound = max(- .999, point_est - \ 1139 2.5 * (np.sqrt((1. - point_est ** 2.) / \ 1140 (nobs - 2.)))) 1141 1142 llim = optimize.brenth(self._ci_limits_corr, lower_bound, point_est) 1143 ulim = optimize.brenth(self._ci_limits_corr, point_est, upper_bound) 1144 return llim, ulim 1145