1""" 2Multivariate Conditional and Unconditional Kernel Density Estimation 3with Mixed Data Types 4 5References 6---------- 7[1] Racine, J., Li, Q. Nonparametric econometrics: theory and practice. 8 Princeton University Press. (2007) 9[2] Racine, Jeff. "Nonparametric Econometrics: A Primer," Foundation 10 and Trends in Econometrics: Vol 3: No 1, pp1-88. (2008) 11 http://dx.doi.org/10.1561/0800000009 12[3] Racine, J., Li, Q. "Nonparametric Estimation of Distributions 13 with Categorical and Continuous Data." Working Paper. (2000) 14[4] Racine, J. Li, Q. "Kernel Estimation of Multivariate Conditional 15 Distributions Annals of Economics and Finance 5, 211-235 (2004) 16[5] Liu, R., Yang, L. "Kernel estimation of multivariate 17 cumulative distribution function." 18 Journal of Nonparametric Statistics (2008) 19[6] Li, R., Ju, G. "Nonparametric Estimation of Multivariate CDF 20 with Categorical and Continuous Data." Working Paper 21[7] Li, Q., Racine, J. "Cross-validated local linear nonparametric 22 regression" Statistica Sinica 14(2004), pp. 485-512 23[8] Racine, J.: "Consistent Significance Testing for Nonparametric 24 Regression" Journal of Business & Economics Statistics 25[9] Racine, J., Hart, J., Li, Q., "Testing the Significance of 26 Categorical Predictor Variables in Nonparametric Regression 27 Models", 2006, Econometric Reviews 25, 523-544 28 29""" 30 31# TODO: make default behavior efficient=True above a certain n_obs 32import copy 33 34import numpy as np 35from scipy import optimize 36from scipy.stats.mstats import mquantiles 37 38from ._kernel_base import GenericKDE, EstimatorSettings, gpke, \ 39 LeaveOneOut, _get_type_pos, _adjust_shape, _compute_min_std_IQR, kernel_func 40 41 42__all__ = ['KernelReg', 'KernelCensoredReg'] 43 44 45class KernelReg(GenericKDE): 46 """ 47 Nonparametric kernel regression class. 48 49 Calculates the conditional mean ``E[y|X]`` where ``y = g(X) + e``. 50 Note that the "local constant" type of regression provided here is also 51 known as Nadaraya-Watson kernel regression; "local linear" is an extension 52 of that which suffers less from bias issues at the edge of the support. Note 53 that specifying a custom kernel works only with "local linear" kernel 54 regression. For example, a custom ``tricube`` kernel yields LOESS regression. 55 56 Parameters 57 ---------- 58 endog : array_like 59 This is the dependent variable. 60 exog : array_like 61 The training data for the independent variable(s) 62 Each element in the list is a separate variable 63 var_type : str 64 The type of the variables, one character per variable: 65 66 - c: continuous 67 - u: unordered (discrete) 68 - o: ordered (discrete) 69 70 reg_type : {'lc', 'll'}, optional 71 Type of regression estimator. 'lc' means local constant and 72 'll' local Linear estimator. Default is 'll' 73 bw : str or array_like, optional 74 Either a user-specified bandwidth or the method for bandwidth 75 selection. If a string, valid values are 'cv_ls' (least-squares 76 cross-validation) and 'aic' (AIC Hurvich bandwidth estimation). 77 Default is 'cv_ls'. User specified bandwidth must have as many 78 entries as the number of variables. 79 ckertype : str, optional 80 The kernel used for the continuous variables. 81 okertype : str, optional 82 The kernel used for the ordered discrete variables. 83 ukertype : str, optional 84 The kernel used for the unordered discrete variables. 85 defaults : EstimatorSettings instance, optional 86 The default values for the efficient bandwidth estimation. 
    """
    def __init__(self, endog, exog, var_type, reg_type='ll', bw='cv_ls',
                 ckertype='gaussian', okertype='wangryzin',
                 ukertype='aitchisonaitken', defaults=None):
        self.var_type = var_type
        self.data_type = var_type
        self.reg_type = reg_type
        self.ckertype = ckertype
        self.okertype = okertype
        self.ukertype = ukertype
        if not (self.ckertype in kernel_func and self.ukertype in kernel_func
                and self.okertype in kernel_func):
            raise ValueError('user specified kernel must be a supported '
                             'kernel from statsmodels.nonparametric.kernels.')

        self.k_vars = len(self.var_type)
        self.endog = _adjust_shape(endog, 1)
        self.exog = _adjust_shape(exog, self.k_vars)
        self.data = np.column_stack((self.endog, self.exog))
        self.nobs = np.shape(self.exog)[0]
        self.est = dict(lc=self._est_loc_constant, ll=self._est_loc_linear)
        defaults = EstimatorSettings() if defaults is None else defaults
        self._set_defaults(defaults)
        if not isinstance(bw, str):
            bw = np.asarray(bw)
            if len(bw) != self.k_vars:
                raise ValueError('bw must have the same dimension as the '
                                 'number of variables.')
        if not self.efficient:
            self.bw = self._compute_reg_bw(bw)
        else:
            self.bw = self._compute_efficient(bw)

    def _compute_reg_bw(self, bw):
        if not isinstance(bw, str):
            self._bw_method = "user-specified"
            return np.asarray(bw)
        else:
            # The user specified a bandwidth selection method, e.g. 'cv_ls'
            self._bw_method = bw
            # Workaround to avoid instance methods in __dict__
            if bw == 'cv_ls':
                res = self.cv_loo
            else:  # bw == 'aic'
                res = self.aic_hurvich
            # Rule-of-thumb starting values for the optimizer
            X = np.std(self.exog, axis=0)
            h0 = 1.06 * X * \
                 self.nobs ** (- 1. / (4 + np.size(self.exog, axis=1)))

            func = self.est[self.reg_type]
            bw_estimated = optimize.fmin(res, x0=h0, args=(func, ),
                                         maxiter=1e3, maxfun=1e3, disp=0)
            return bw_estimated

    def _est_loc_linear(self, bw, endog, exog, data_predict):
        """
        Local linear estimator of g(x) in the regression ``y = g(x) + e``.

        Parameters
        ----------
        bw : array_like
            Vector of bandwidth value(s).
        endog : 1D array_like
            The dependent variable.
        exog : 1D or 2D array_like
            The independent variable(s).
        data_predict : 1D array_like of length K, where K is the number
            of variables.  The point at which the density is estimated.

        Returns
        -------
        mean : ndarray
            The value of the conditional mean at `data_predict`.
        mfx : ndarray
            The marginal effects at `data_predict`.

        Notes
        -----
        See p. 81 in [1] and p. 38 in [2] for the formulas.
        Unlike other methods, this one requires that `data_predict` be 1D.
        """
        nobs, k_vars = exog.shape
        ker = gpke(bw, data=exog, data_predict=data_predict,
                   var_type=self.var_type,
                   ckertype=self.ckertype,
                   ukertype=self.ukertype,
                   okertype=self.okertype,
                   tosum=False) / float(nobs)
        # Create the matrix on p.492 in [7], after the multiplication w/ K_h,ij
        # See also p. 38 in [2]
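        # The local linear estimator solves the weighted least squares
        # problem min_{a,b} sum_i K_h(X_i, x) * (Y_i - a - (X_i - x)'b)^2.
        # Its normal equations can be written M [a, b']' = V, where
        #   M = [[sum_i K_i,           sum_i K_i (X_i - x)'         ],
        #        [sum_i K_i (X_i - x), sum_i K_i (X_i - x)(X_i - x)']]
        #   V = [sum_i K_i Y_i,  sum_i K_i (X_i - x) Y_i]'
        # The intercept a-hat is the conditional mean and the slope b-hat
        # gives the marginal effects; both are recovered below via pinv(M) V.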
        # Convert ker to a 2-D array to make the matrix operations below work
        ker = ker[:, np.newaxis]

        M12 = exog - data_predict
        M22 = np.dot(M12.T, M12 * ker)
        M12 = (M12 * ker).sum(axis=0)
        M = np.empty((k_vars + 1, k_vars + 1))
        M[0, 0] = ker.sum()
        M[0, 1:] = M12
        M[1:, 0] = M12
        M[1:, 1:] = M22

        ker_endog = ker * endog
        V = np.empty((k_vars + 1, 1))
        V[0, 0] = ker_endog.sum()
        V[1:, 0] = ((exog - data_predict) * ker_endog).sum(axis=0)

        mean_mfx = np.dot(np.linalg.pinv(M), V)
        mean = mean_mfx[0]
        mfx = mean_mfx[1:, :]
        return mean, mfx

    def _est_loc_constant(self, bw, endog, exog, data_predict):
        """
        Local constant (Nadaraya-Watson) estimator of g(x) in the
        regression ``y = g(x) + e``.

        Parameters
        ----------
        bw : array_like
            Array of bandwidth value(s).
        endog : 1D array_like
            The dependent variable.
        exog : 1D or 2D array_like
            The independent variable(s).
        data_predict : 1D or 2D array_like
            The point(s) at which the density is estimated.

        Returns
        -------
        G : ndarray
            The value of the conditional mean at `data_predict`.
        B_x : ndarray
            The marginal effects.
        """
        ker_x = gpke(bw, data=exog, data_predict=data_predict,
                     var_type=self.var_type,
                     ckertype=self.ckertype,
                     ukertype=self.ukertype,
                     okertype=self.okertype,
                     tosum=False)
        ker_x = np.reshape(ker_x, np.shape(endog))
        G_numer = (ker_x * endog).sum(axis=0)
        G_denom = ker_x.sum(axis=0)
        G = G_numer / G_denom
        nobs = exog.shape[0]
        f_x = G_denom / float(nobs)
        # Kernel derivative, used for the marginal effects
        ker_xc = gpke(bw, data=exog, data_predict=data_predict,
                      var_type=self.var_type,
                      ckertype='d_gaussian',
                      tosum=False)

        ker_xc = ker_xc[:, np.newaxis]
        d_mx = -(endog * ker_xc).sum(axis=0) / float(nobs)
        d_fx = -ker_xc.sum(axis=0) / float(nobs)
        # Marginal effect by the quotient rule: d/dx [m(x) / f(x)]
        B_x = (G_numer * d_fx - G_denom * d_mx) / (G_denom**2)
        return G, B_x

    def aic_hurvich(self, bw, func=None):
        r"""
        Computes the corrected AIC (Hurvich) criterion for bandwidth
        selection.

        Parameters
        ----------
        bw : str or array_like
            See the ``bw`` parameter of `KernelReg` for details.
        func : None
            Unused here; present only so that the signature matches the
            one the bandwidth optimizer expects (see `cv_loo`).

        Returns
        -------
        aic : ndarray
            The value of the corrected AIC criterion, as a one-element
            array.
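
        Notes
        -----
        The criterion minimized is

        .. math:: \text{AIC}_c = \ln(\hat{\sigma}^2) +
            \frac{1 + \text{tr}(H)/n}{1 - (\text{tr}(H) + 2)/n}

        where :math:`\hat{\sigma}^2 = n^{-1}\sum_{i=1}^{n}
        (Y_i - \hat{g}(X_i))^2` and :math:`H` is the matrix of normalized
        kernel weights computed below.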
274 """ 275 H = np.empty((self.nobs, self.nobs)) 276 for j in range(self.nobs): 277 H[:, j] = gpke(bw, data=self.exog, data_predict=self.exog[j,:], 278 ckertype=self.ckertype, ukertype=self.ukertype, 279 okertype=self.okertype, var_type=self.var_type, 280 tosum=False) 281 282 denom = H.sum(axis=1) 283 H = H / denom 284 gx = KernelReg(endog=self.endog, exog=self.exog, var_type=self.var_type, 285 reg_type=self.reg_type, bw=bw, 286 defaults=EstimatorSettings(efficient=False)).fit()[0] 287 gx = np.reshape(gx, (self.nobs, 1)) 288 sigma = ((self.endog - gx)**2).sum(axis=0) / float(self.nobs) 289 290 frac = (1 + np.trace(H) / float(self.nobs)) / \ 291 (1 - (np.trace(H) + 2) / float(self.nobs)) 292 #siga = np.dot(self.endog.T, (I - H).T) 293 #sigb = np.dot((I - H), self.endog) 294 #sigma = np.dot(siga, sigb) / float(self.nobs) 295 aic = np.log(sigma) + frac 296 return aic 297 298 def cv_loo(self, bw, func): 299 r""" 300 The cross-validation function with leave-one-out estimator. 301 302 Parameters 303 ---------- 304 bw : array_like 305 Vector of bandwidth values. 306 func : callable function 307 Returns the estimator of g(x). Can be either ``_est_loc_constant`` 308 (local constant) or ``_est_loc_linear`` (local_linear). 309 310 Returns 311 ------- 312 L : float 313 The value of the CV function. 314 315 Notes 316 ----- 317 Calculates the cross-validation least-squares function. This function 318 is minimized by compute_bw to calculate the optimal value of `bw`. 319 320 For details see p.35 in [2] 321 322 .. math:: CV(h)=n^{-1}\sum_{i=1}^{n}(Y_{i}-g_{-i}(X_{i}))^{2} 323 324 where :math:`g_{-i}(X_{i})` is the leave-one-out estimator of g(X) 325 and :math:`h` is the vector of bandwidths 326 """ 327 LOO_X = LeaveOneOut(self.exog) 328 LOO_Y = LeaveOneOut(self.endog).__iter__() 329 L = 0 330 for ii, X_not_i in enumerate(LOO_X): 331 Y = next(LOO_Y) 332 G = func(bw, endog=Y, exog=-X_not_i, 333 data_predict=-self.exog[ii, :])[0] 334 L += (self.endog[ii] - G) ** 2 335 336 # Note: There might be a way to vectorize this. See p.72 in [1] 337 return L / self.nobs 338 339 def r_squared(self): 340 r""" 341 Returns the R-Squared for the nonparametric regression. 342 343 Notes 344 ----- 345 For more details see p.45 in [2] 346 The R-Squared is calculated by: 347 348 .. math:: R^{2}=\frac{\left[\sum_{i=1}^{n} 349 (Y_{i}-\bar{y})(\hat{Y_{i}}-\bar{y}\right]^{2}}{\sum_{i=1}^{n} 350 (Y_{i}-\bar{y})^{2}\sum_{i=1}^{n}(\hat{Y_{i}}-\bar{y})^{2}}, 351 352 where :math:`\hat{Y_{i}}` is the mean calculated in `fit` at the exog 353 points. 354 """ 355 Y = np.squeeze(self.endog) 356 Yhat = self.fit()[0] 357 Y_bar = np.mean(Yhat) 358 R2_numer = (((Y - Y_bar) * (Yhat - Y_bar)).sum())**2 359 R2_denom = ((Y - Y_bar)**2).sum(axis=0) * \ 360 ((Yhat - Y_bar)**2).sum(axis=0) 361 return R2_numer / R2_denom 362 363 def fit(self, data_predict=None): 364 """ 365 Returns the mean and marginal effects at the `data_predict` points. 366 367 Parameters 368 ---------- 369 data_predict : array_like, optional 370 Points at which to return the mean and marginal effects. If not 371 given, ``data_predict == exog``. 372 373 Returns 374 ------- 375 mean : ndarray 376 The regression result for the mean (i.e. the actual curve). 377 mfx : ndarray 378 The marginal effects, i.e. the partial derivatives of the mean. 
379 """ 380 func = self.est[self.reg_type] 381 if data_predict is None: 382 data_predict = self.exog 383 else: 384 data_predict = _adjust_shape(data_predict, self.k_vars) 385 386 N_data_predict = np.shape(data_predict)[0] 387 mean = np.empty((N_data_predict,)) 388 mfx = np.empty((N_data_predict, self.k_vars)) 389 for i in range(N_data_predict): 390 mean_mfx = func(self.bw, self.endog, self.exog, 391 data_predict=data_predict[i, :]) 392 mean[i] = mean_mfx[0] 393 mfx_c = np.squeeze(mean_mfx[1]) 394 mfx[i, :] = mfx_c 395 396 return mean, mfx 397 398 def sig_test(self, var_pos, nboot=50, nested_res=25, pivot=False): 399 """ 400 Significance test for the variables in the regression. 401 402 Parameters 403 ---------- 404 var_pos : sequence 405 The position of the variable in exog to be tested. 406 407 Returns 408 ------- 409 sig : str 410 The level of significance: 411 412 - `*` : at 90% confidence level 413 - `**` : at 95% confidence level 414 - `***` : at 99* confidence level 415 - "Not Significant" : if not significant 416 """ 417 var_pos = np.asarray(var_pos) 418 ix_cont, ix_ord, ix_unord = _get_type_pos(self.var_type) 419 if np.any(ix_cont[var_pos]): 420 if np.any(ix_ord[var_pos]) or np.any(ix_unord[var_pos]): 421 raise ValueError("Discrete variable in hypothesis. Must be continuous") 422 423 Sig = TestRegCoefC(self, var_pos, nboot, nested_res, pivot) 424 else: 425 Sig = TestRegCoefD(self, var_pos, nboot) 426 427 return Sig.sig 428 429 def __repr__(self): 430 """Provide something sane to print.""" 431 rpr = "KernelReg instance\n" 432 rpr += "Number of variables: k_vars = " + str(self.k_vars) + "\n" 433 rpr += "Number of samples: N = " + str(self.nobs) + "\n" 434 rpr += "Variable types: " + self.var_type + "\n" 435 rpr += "BW selection method: " + self._bw_method + "\n" 436 rpr += "Estimator type: " + self.reg_type + "\n" 437 return rpr 438 439 def _get_class_vars_type(self): 440 """Helper method to be able to pass needed vars to _compute_subset.""" 441 class_type = 'KernelReg' 442 class_vars = (self.var_type, self.k_vars, self.reg_type) 443 return class_type, class_vars 444 445 def _compute_dispersion(self, data): 446 """ 447 Computes the measure of dispersion. 448 449 The minimum of the standard deviation and interquartile range / 1.349 450 451 References 452 ---------- 453 See the user guide for the np package in R. 454 In the notes on bwscaling option in npreg, npudens, npcdens there is 455 a discussion on the measure of dispersion 456 """ 457 data = data[:, 1:] 458 return _compute_min_std_IQR(data) 459 460 461class KernelCensoredReg(KernelReg): 462 """ 463 Nonparametric censored regression. 464 465 Calculates the conditional mean ``E[y|X]`` where ``y = g(X) + e``, 466 where y is left-censored. Left censored variable Y is defined as 467 ``Y = min {Y', L}`` where ``L`` is the value at which ``Y`` is censored 468 and ``Y'`` is the true value of the variable. 469 470 Parameters 471 ---------- 472 endog : list with one element which is array_like 473 This is the dependent variable. 474 exog : list 475 The training data for the independent variable(s) 476 Each element in the list is a separate variable 477 dep_type : str 478 The type of the dependent variable(s) 479 c: Continuous 480 u: Unordered (Discrete) 481 o: Ordered (Discrete) 482 reg_type : str 483 Type of regression estimator 484 lc: Local Constant Estimator 485 ll: Local Linear Estimator 486 bw : array_like 487 Either a user-specified bandwidth or 488 the method for bandwidth selection. 
    """
    def __init__(self, endog, exog, var_type, reg_type, bw='cv_ls',
                 ckertype='gaussian',
                 ukertype='aitchison_aitken_reg',
                 okertype='wangryzin_reg',
                 censor_val=0, defaults=None):
        self.var_type = var_type
        self.data_type = var_type
        self.reg_type = reg_type
        self.ckertype = ckertype
        self.okertype = okertype
        self.ukertype = ukertype
        if not (self.ckertype in kernel_func and self.ukertype in kernel_func
                and self.okertype in kernel_func):
            raise ValueError('user specified kernel must be a supported '
                             'kernel from statsmodels.nonparametric.kernels.')

        self.k_vars = len(self.var_type)
        self.endog = _adjust_shape(endog, 1)
        self.exog = _adjust_shape(exog, self.k_vars)
        self.data = np.column_stack((self.endog, self.exog))
        self.nobs = np.shape(self.exog)[0]
        self.est = dict(lc=self._est_loc_constant, ll=self._est_loc_linear)
        defaults = EstimatorSettings() if defaults is None else defaults
        self._set_defaults(defaults)
        self.censor_val = censor_val
        if self.censor_val is not None:
            self.censored(censor_val)
        else:
            self.W_in = np.ones((self.nobs, 1))

        if not self.efficient:
            self.bw = self._compute_reg_bw(bw)
        else:
            self.bw = self._compute_efficient(bw)

    def censored(self, censor_val):
        # See pp. 341-344 in [1]
        # d indicates the uncensored observations
        self.d = (self.endog != censor_val) * 1.
        # Sort the data by endog and remember the ordering
        ix = np.argsort(np.squeeze(self.endog))
        self.sortix = ix
        self.sortix_rev = np.zeros(ix.shape, int)
        self.sortix_rev[ix] = np.arange(len(ix))
        self.endog = np.squeeze(self.endog[ix])
        self.endog = _adjust_shape(self.endog, 1)
        self.exog = np.squeeze(self.exog[ix])
        self.d = np.squeeze(self.d[ix])
        # Kaplan-Meier-type weights for the sorted observations
        self.W_in = np.empty((self.nobs, 1))
        for i in range(1, self.nobs + 1):
            P = 1
            for j in range(1, i):
                P *= ((self.nobs - j) /
                      (float(self.nobs) - j + 1)) ** self.d[j - 1]
            self.W_in[i - 1, 0] = P * self.d[i - 1] / (float(self.nobs) - i + 1)

    def __repr__(self):
        """Provide something sane to print."""
        rpr = "KernelCensoredReg instance\n"
        rpr += "Number of variables: k_vars = " + str(self.k_vars) + "\n"
        rpr += "Number of samples:   nobs = " + str(self.nobs) + "\n"
        rpr += "Variable types:      " + self.var_type + "\n"
        rpr += "BW selection method: " + self._bw_method + "\n"
        rpr += "Estimator type:      " + self.reg_type + "\n"
        return rpr

    def _est_loc_linear(self, bw, endog, exog, data_predict, W):
        """
        Local linear estimator of g(x) in the regression ``y = g(x) + e``.

        Parameters
        ----------
        bw : array_like
            Vector of bandwidth value(s).
        endog : 1D array_like
            The dependent variable.
        exog : 1D or 2D array_like
            The independent variable(s).
        data_predict : 1D array_like of length K, where K is the number
            of variables.
            The point at which the density is estimated.
        W : 2D array_like
            The weights for the (sorted) observations, as computed in
            `censored`.

        Returns
        -------
        mean : ndarray
            The value of the conditional mean at `data_predict`.
        mfx : ndarray
            The marginal effects at `data_predict`.

        Notes
        -----
        See p. 81 in [1] and p. 38 in [2] for the formulas.
        Unlike other methods, this one requires that `data_predict` be 1D.
        """
        nobs, k_vars = exog.shape
        ker = gpke(bw, data=exog, data_predict=data_predict,
                   var_type=self.var_type,
                   ckertype=self.ckertype,
                   ukertype=self.ukertype,
                   okertype=self.okertype, tosum=False)
        # Create the matrix on p.492 in [7], after the multiplication w/ K_h,ij
        # See also p. 38 in [2]

        # Convert ker to a 2-D array and apply the censoring weights
        ker = W * ker[:, np.newaxis]

        M12 = exog - data_predict
        M22 = np.dot(M12.T, M12 * ker)
        M12 = (M12 * ker).sum(axis=0)
        M = np.empty((k_vars + 1, k_vars + 1))
        M[0, 0] = ker.sum()
        M[0, 1:] = M12
        M[1:, 0] = M12
        M[1:, 1:] = M22

        ker_endog = ker * endog
        V = np.empty((k_vars + 1, 1))
        V[0, 0] = ker_endog.sum()
        V[1:, 0] = ((exog - data_predict) * ker_endog).sum(axis=0)

        mean_mfx = np.dot(np.linalg.pinv(M), V)
        mean = mean_mfx[0]
        mfx = mean_mfx[1:, :]
        return mean, mfx

    def cv_loo(self, bw, func):
        r"""
        The cross-validation function with leave-one-out estimator.

        Parameters
        ----------
        bw : array_like
            Vector of bandwidth values.
        func : callable function
            Returns the estimator of g(x).  Can be either
            ``_est_loc_constant`` (local constant) or ``_est_loc_linear``
            (local linear).

        Returns
        -------
        L : float
            The value of the CV function.

        Notes
        -----
        Calculates the cross-validation least-squares function.  This
        function is minimized by the bandwidth selection routine
        (`_compute_reg_bw`) to calculate the optimal value of `bw`.

        For details see p. 35 in [2]:

        .. math:: CV(h)=n^{-1}\sum_{i=1}^{n}(Y_{i}-g_{-i}(X_{i}))^{2}

        where :math:`g_{-i}(X_{i})` is the leave-one-out estimator of g(X)
        and :math:`h` is the vector of bandwidths.
        """
        LOO_X = LeaveOneOut(self.exog)
        LOO_Y = LeaveOneOut(self.endog).__iter__()
        LOO_W = LeaveOneOut(self.W_in).__iter__()
        L = 0
        for ii, X_not_i in enumerate(LOO_X):
            Y = next(LOO_Y)
            w = next(LOO_W)
            G = func(bw, endog=Y, exog=-X_not_i,
                     data_predict=-self.exog[ii, :], W=w)[0]
            L += (self.endog[ii] - G) ** 2

        # Note: There might be a way to vectorize this. See p.72 in [1]
        return L / self.nobs

    def fit(self, data_predict=None):
        """
        Returns the mean and marginal effects at the `data_predict` points.
        """
        func = self.est[self.reg_type]
        if data_predict is None:
            data_predict = self.exog
        else:
            data_predict = _adjust_shape(data_predict, self.k_vars)

        N_data_predict = np.shape(data_predict)[0]
        mean = np.empty((N_data_predict,))
        mfx = np.empty((N_data_predict, self.k_vars))
        for i in range(N_data_predict):
            mean_mfx = func(self.bw, self.endog, self.exog,
                            data_predict=data_predict[i, :],
                            W=self.W_in)
            mean[i] = mean_mfx[0]
            mfx_c = np.squeeze(mean_mfx[1])
            mfx[i, :] = mfx_c

        return mean, mfx


class TestRegCoefC(object):
    r"""
    Significance test for continuous variables in a nonparametric regression.

    The null hypothesis is ``dE(Y|X)/dX_not_i = 0``, the alternative
    hypothesis is ``dE(Y|X)/dX_not_i != 0``.
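
    The test statistic is the sample average of the squared estimated
    marginal effects of the tested variable(s) (see `_compute_lambda`):

    .. math:: \lambda = n^{-1}\sum_{i=1}^{n}\left[
        \widehat{\partial g(X_i)/\partial x_{test}}\right]^{2},

    optionally divided by its bootstrap standard error when `pivot` is
    True.  Its distribution under the null is approximated by
    bootstrapping from the restricted model (see `_compute_sig`).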

    Parameters
    ----------
    model : KernelReg instance
        This is the nonparametric regression model whose elements
        are tested for significance.
    test_vars : tuple, list of integers, array_like
        Index of position of the continuous variables to be tested
        for significance.  E.g. (1, 3, 5) jointly tests variables at
        positions 1, 3 and 5 for significance.
    nboot : int
        Number of bootstrap samples used to determine the distribution
        of the test statistic in a finite sample.  Default is 400.
    nested_res : int
        Number of nested resamples used to calculate lambda.
        Only used when `pivot` is True.
    pivot : bool
        If True, the test statistic is pivoted by dividing by its
        standard error.  This significantly increases computational time,
        but pivot statistics have more desirable properties
        (see the references).

    Attributes
    ----------
    sig : str
        The significance level of the variable(s) tested:

        - "Not Significant": Not significant at the 90% confidence level;
          fails to reject the null
        - "*": Significant at the 90% confidence level
        - "**": Significant at the 95% confidence level
        - "***": Significant at the 99% confidence level

    Notes
    -----
    This class allows testing of joint hypotheses as long as all variables
    are continuous.

    References
    ----------
    See [8] and chapter 12 in [1].
    """
    def __init__(self, model, test_vars, nboot=400, nested_res=400,
                 pivot=False):
        self.nboot = nboot
        self.nres = nested_res
        self.test_vars = test_vars
        self.model = model
        self.bw = model.bw
        self.var_type = model.var_type
        self.k_vars = len(self.var_type)
        self.endog = model.endog
        self.exog = model.exog
        self.gx = model.est[model.reg_type]
        self.pivot = pivot
        self.run()

    def run(self):
        self.test_stat = self._compute_test_stat(self.endog, self.exog)
        self.sig = self._compute_sig()

    def _compute_test_stat(self, Y, X):
        """
        Computes the test statistic.  See p. 371 in [8].
        """
        lam = self._compute_lambda(Y, X)
        t = lam
        if self.pivot:
            se_lam = self._compute_se_lambda(Y, X)
            t = lam / float(se_lam)

        return t

    def _compute_lambda(self, Y, X):
        """Computes only lambda -- the main part of the test statistic."""
        n = np.shape(X)[0]
        Y = _adjust_shape(Y, 1)
        X = _adjust_shape(X, self.k_vars)
        b = KernelReg(Y, X, self.var_type, self.model.reg_type, self.bw,
                      defaults=EstimatorSettings(efficient=False)).fit()[1]

        b = b[:, self.test_vars]
        b = np.reshape(b, (n, len(self.test_vars)))
        # Pivoting (division by the SE) is handled in _compute_test_stat
        lam = (b ** 2).sum() / float(n)
        return lam

    def _compute_se_lambda(self, Y, X):
        """
        Calculates the SE of lambda by nested resampling.
        Used to pivot the statistic.  Bootstrapping works better with
        pivotal statistics, but pivoting slows down computation
        significantly.
804 """ 805 n = np.shape(Y)[0] 806 lam = np.empty(shape=(self.nres,)) 807 for i in range(self.nres): 808 ind = np.random.randint(0, n, size=(n, 1)) 809 Y1 = Y[ind, 0] 810 X1 = X[ind, :] 811 lam[i] = self._compute_lambda(Y1, X1) 812 813 se_lambda = np.std(lam) 814 return se_lambda 815 816 def _compute_sig(self): 817 """ 818 Computes the significance value for the variable(s) tested. 819 820 The empirical distribution of the test statistic is obtained through 821 bootstrapping the sample. The null hypothesis is rejected if the test 822 statistic is larger than the 90, 95, 99 percentiles. 823 """ 824 t_dist = np.empty(shape=(self.nboot, )) 825 Y = self.endog 826 X = copy.deepcopy(self.exog) 827 n = np.shape(Y)[0] 828 829 X[:, self.test_vars] = np.mean(X[:, self.test_vars], axis=0) 830 # Calculate the restricted mean. See p. 372 in [8] 831 M = KernelReg(Y, X, self.var_type, self.model.reg_type, self.bw, 832 defaults=EstimatorSettings(efficient=False)).fit()[0] 833 M = np.reshape(M, (n, 1)) 834 e = Y - M 835 e = e - np.mean(e) # recenter residuals 836 for i in range(self.nboot): 837 ind = np.random.randint(0, n, size=(n, 1)) 838 e_boot = e[ind, 0] 839 Y_boot = M + e_boot 840 t_dist[i] = self._compute_test_stat(Y_boot, self.exog) 841 842 self.t_dist = t_dist 843 sig = "Not Significant" 844 if self.test_stat > mquantiles(t_dist, 0.9): 845 sig = "*" 846 if self.test_stat > mquantiles(t_dist, 0.95): 847 sig = "**" 848 if self.test_stat > mquantiles(t_dist, 0.99): 849 sig = "***" 850 851 return sig 852 853 854class TestRegCoefD(TestRegCoefC): 855 """ 856 Significance test for the categorical variables in a nonparametric 857 regression. 858 859 Parameters 860 ---------- 861 model : Instance of KernelReg class 862 This is the nonparametric regression model whose elements 863 are tested for significance. 864 test_vars : tuple, list of one element 865 index of position of the discrete variable to be tested 866 for significance. E.g. (3) tests variable at 867 position 3 for significance. 868 nboot : int 869 Number of bootstrap samples used to determine the distribution 870 of the test statistic in a finite sample. Default is 400 871 872 Attributes 873 ---------- 874 sig : str 875 The significance level of the variable(s) tested 876 "Not Significant": Not significant at the 90% confidence level 877 Fails to reject the null 878 "*": Significant at the 90% confidence level 879 "**": Significant at the 95% confidence level 880 "***": Significant at the 99% confidence level 881 882 Notes 883 ----- 884 This class currently does not allow joint hypothesis. 885 Only one variable can be tested at a time 886 887 References 888 ---------- 889 See [9] and chapter 12 in [1]. 
890 """ 891 892 def _compute_test_stat(self, Y, X): 893 """Computes the test statistic""" 894 895 dom_x = np.sort(np.unique(self.exog[:, self.test_vars])) 896 897 n = np.shape(X)[0] 898 model = KernelReg(Y, X, self.var_type, self.model.reg_type, self.bw, 899 defaults = EstimatorSettings(efficient=False)) 900 X1 = copy.deepcopy(X) 901 X1[:, self.test_vars] = 0 902 903 m0 = model.fit(data_predict=X1)[0] 904 m0 = np.reshape(m0, (n, 1)) 905 zvec = np.zeros((n, 1)) # noqa:E741 906 for i in dom_x[1:] : 907 X1[:, self.test_vars] = i 908 m1 = model.fit(data_predict=X1)[0] 909 m1 = np.reshape(m1, (n, 1)) 910 zvec += (m1 - m0) ** 2 # noqa:E741 911 912 avg = zvec.sum(axis=0) / float(n) 913 return avg 914 915 def _compute_sig(self): 916 """Calculates the significance level of the variable tested""" 917 918 m = self._est_cond_mean() 919 Y = self.endog 920 X = self.exog 921 n = np.shape(X)[0] 922 u = Y - m 923 u = u - np.mean(u) # center 924 fct1 = (1 - 5**0.5) / 2. 925 fct2 = (1 + 5**0.5) / 2. 926 u1 = fct1 * u 927 u2 = fct2 * u 928 r = fct2 / (5 ** 0.5) 929 I_dist = np.empty((self.nboot,1)) 930 for j in range(self.nboot): 931 u_boot = copy.deepcopy(u2) 932 933 prob = np.random.uniform(0,1, size = (n,1)) 934 ind = prob < r 935 u_boot[ind] = u1[ind] 936 Y_boot = m + u_boot 937 I_dist[j] = self._compute_test_stat(Y_boot, X) 938 939 sig = "Not Significant" 940 if self.test_stat > mquantiles(I_dist, 0.9): 941 sig = "*" 942 if self.test_stat > mquantiles(I_dist, 0.95): 943 sig = "**" 944 if self.test_stat > mquantiles(I_dist, 0.99): 945 sig = "***" 946 947 return sig 948 949 def _est_cond_mean(self): 950 """ 951 Calculates the expected conditional mean 952 m(X, Z=l) for all possible l 953 """ 954 self.dom_x = np.sort(np.unique(self.exog[:, self.test_vars])) 955 X = copy.deepcopy(self.exog) 956 m=0 957 for i in self.dom_x: 958 X[:, self.test_vars] = i 959 m += self.model.fit(data_predict = X)[0] 960 961 m = m / float(len(self.dom_x)) 962 m = np.reshape(m, (np.shape(self.exog)[0], 1)) 963 return m 964