"""
Multivariate Nonparametric Kernel Regression with Mixed Data Types

References
----------
[1] Racine, J., Li, Q. "Nonparametric Econometrics: Theory and Practice."
    Princeton University Press. (2007)
[2] Racine, J. "Nonparametric Econometrics: A Primer," Foundations
    and Trends in Econometrics, Vol. 3, No. 1, pp. 1-88. (2008)
    http://dx.doi.org/10.1561/0800000009
[3] Racine, J., Li, Q. "Nonparametric Estimation of Distributions
    with Categorical and Continuous Data." Working Paper. (2000)
[4] Racine, J., Li, Q. "Kernel Estimation of Multivariate Conditional
    Distributions." Annals of Economics and Finance 5, 211-235 (2004)
[5] Liu, R., Yang, L. "Kernel Estimation of Multivariate Cumulative
    Distribution Function." Journal of Nonparametric Statistics (2008)
[6] Li, R., Ju, G. "Nonparametric Estimation of Multivariate CDF
    with Categorical and Continuous Data." Working Paper.
[7] Li, Q., Racine, J. "Cross-Validated Local Linear Nonparametric
    Regression." Statistica Sinica 14 (2004), pp. 485-512
[8] Racine, J. "Consistent Significance Testing for Nonparametric
    Regression." Journal of Business & Economic Statistics (1997)
[9] Racine, J., Hart, J., Li, Q. "Testing the Significance of
    Categorical Predictor Variables in Nonparametric Regression
    Models." Econometric Reviews 25, 523-544 (2006)

"""

# TODO: make default behavior efficient=True above a certain n_obs
import copy

import numpy as np
from scipy import optimize
from scipy.stats.mstats import mquantiles

from ._kernel_base import GenericKDE, EstimatorSettings, gpke, \
    LeaveOneOut, _get_type_pos, _adjust_shape, _compute_min_std_IQR, kernel_func


__all__ = ['KernelReg', 'KernelCensoredReg']


class KernelReg(GenericKDE):
    """
    Nonparametric kernel regression class.

    Calculates the conditional mean ``E[y|X]`` where ``y = g(X) + e``.
    Note that the "local constant" type of regression provided here is also
    known as Nadaraya-Watson kernel regression; "local linear" is an extension
    of that which suffers less from bias issues at the edge of the support. Note
    that specifying a custom kernel works only with "local linear" kernel
    regression. For example, a custom ``tricube`` kernel yields LOESS regression.

    Parameters
    ----------
    endog : array_like
        This is the dependent variable.
    exog : array_like
        The training data for the independent variable(s).
        Each element in the list is a separate variable.
    var_type : str
        The type of the variables, one character per variable:

            - c: continuous
            - u: unordered (discrete)
            - o: ordered (discrete)

    reg_type : {'lc', 'll'}, optional
        Type of regression estimator. 'lc' means local constant and
        'll' local linear estimator.  Default is 'll'.
    bw : str or array_like, optional
        Either a user-specified bandwidth or the method for bandwidth
        selection. If a string, valid values are 'cv_ls' (least-squares
        cross-validation) and 'aic' (AIC Hurvich bandwidth estimation).
        Default is 'cv_ls'. A user-specified bandwidth must have as many
        entries as the number of variables.
    ckertype : str, optional
        The kernel used for the continuous variables.
    okertype : str, optional
        The kernel used for the ordered discrete variables.
    ukertype : str, optional
        The kernel used for the unordered discrete variables.
    defaults : EstimatorSettings instance, optional
        The default values for the efficient bandwidth estimation.

    Attributes
    ----------
    bw : array_like
        The bandwidth parameters.
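
    Examples
    --------
    A minimal usage sketch on simulated data (assuming this module is
    importable as ``statsmodels.nonparametric.kernel_regression``); the
    bandwidth is chosen by least-squares cross-validation by default:

    >>> import numpy as np
    >>> from statsmodels.nonparametric.kernel_regression import KernelReg
    >>> np.random.seed(12345)
    >>> x = np.random.uniform(-2, 2, size=200)
    >>> y = np.sin(x) + np.random.normal(scale=0.2, size=200)
    >>> model = KernelReg(endog=y, exog=x, var_type='c', reg_type='ll')
    >>> mean, mfx = model.fit()   # conditional mean and marginal effects
    >>> mean.shape, mfx.shape
    ((200,), (200, 1))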
    """
    def __init__(self, endog, exog, var_type, reg_type='ll', bw='cv_ls',
                 ckertype='gaussian', okertype='wangryzin',
                 ukertype='aitchisonaitken', defaults=None):
        self.var_type = var_type
        self.data_type = var_type
        self.reg_type = reg_type
        self.ckertype = ckertype
        self.okertype = okertype
        self.ukertype = ukertype
        if not (self.ckertype in kernel_func and self.ukertype in kernel_func
                and self.okertype in kernel_func):
            raise ValueError('user specified kernel must be a supported '
                             'kernel from statsmodels.nonparametric.kernels.')

        self.k_vars = len(self.var_type)
        self.endog = _adjust_shape(endog, 1)
        self.exog = _adjust_shape(exog, self.k_vars)
        self.data = np.column_stack((self.endog, self.exog))
        self.nobs = np.shape(self.exog)[0]
        self.est = dict(lc=self._est_loc_constant, ll=self._est_loc_linear)
        defaults = EstimatorSettings() if defaults is None else defaults
        self._set_defaults(defaults)
        if not isinstance(bw, str):
            bw = np.asarray(bw)
            if len(bw) != self.k_vars:
                raise ValueError('bw must have the same dimension as the '
                                 'number of variables.')
        if not self.efficient:
            self.bw = self._compute_reg_bw(bw)
        else:
            self.bw = self._compute_efficient(bw)

    def _compute_reg_bw(self, bw):
        if not isinstance(bw, str):
            self._bw_method = "user-specified"
            return np.asarray(bw)
        else:
            # The user specified a bandwidth selection method e.g. 'cv_ls'
            self._bw_method = bw
            # Workaround to avoid instance methods in __dict__
            if bw == 'cv_ls':
                res = self.cv_loo
            else:  # bw == 'aic'
                res = self.aic_hurvich
            X = np.std(self.exog, axis=0)
            h0 = 1.06 * X * \
                 self.nobs ** (- 1. / (4 + np.size(self.exog, axis=1)))

            func = self.est[self.reg_type]
            bw_estimated = optimize.fmin(res, x0=h0, args=(func, ),
                                         maxiter=1e3, maxfun=1e3, disp=0)
            return bw_estimated

    def _est_loc_linear(self, bw, endog, exog, data_predict):
        r"""
        Local linear estimator of g(x) in the regression ``y = g(x) + e``.

        Parameters
        ----------
        bw : array_like
            Vector of bandwidth value(s).
        endog : 1D array_like
            The dependent variable.
        exog : 1D or 2D array_like
            The independent variable(s).
        data_predict : 1D array_like of length K
            The point at which the mean is estimated, where K is the
            number of variables.

        Returns
        -------
        mean : ndarray
            The value of the conditional mean at `data_predict`.
        mfx : ndarray
            The marginal effects (partial derivatives of the mean) at
            `data_predict`.

        Notes
        -----
        See p. 81 in [1] and p. 38 in [2] for the formulas.
        Unlike other methods, this one requires that `data_predict` be 1D.
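
        In matrix notation, the code below assembles the locally weighted
        least-squares system of [7] (p. 492),

        .. math::
            M = \sum_{i=1}^{n} K_{h}(X_{i}, x)
                \begin{pmatrix} 1 \\ X_{i}-x \end{pmatrix}
                \begin{pmatrix} 1 & (X_{i}-x)' \end{pmatrix}, \qquad
            V = \sum_{i=1}^{n} K_{h}(X_{i}, x)
                \begin{pmatrix} 1 \\ X_{i}-x \end{pmatrix} Y_{i},

        and returns :math:`(\hat{g}(x), \hat{\beta}(x)')' = M^{-1}V`; the
        :math:`1/n` factor applied to the kernel weights cancels in the
        product.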
        """
        nobs, k_vars = exog.shape
        ker = gpke(bw, data=exog, data_predict=data_predict,
                   var_type=self.var_type,
                   ckertype=self.ckertype,
                   ukertype=self.ukertype,
                   okertype=self.okertype,
                   tosum=False) / float(nobs)
        # Create the matrix on p.492 in [7], after the multiplication w/ K_h,ij
        # See also p. 38 in [2]
        #ix_cont = np.arange(self.k_vars)  # Use all vars instead of continuous only
        # Note: because ix_cont was defined here such that it selected all
        # columns, I removed the indexing with it from exog/data_predict.

        # Convert ker to a 2-D array to make matrix operations below work
        ker = ker[:, np.newaxis]

        M12 = exog - data_predict
        M22 = np.dot(M12.T, M12 * ker)
        M12 = (M12 * ker).sum(axis=0)
        M = np.empty((k_vars + 1, k_vars + 1))
        M[0, 0] = ker.sum()
        M[0, 1:] = M12
        M[1:, 0] = M12
        M[1:, 1:] = M22

        ker_endog = ker * endog
        V = np.empty((k_vars + 1, 1))
        V[0, 0] = ker_endog.sum()
        V[1:, 0] = ((exog - data_predict) * ker_endog).sum(axis=0)

        mean_mfx = np.dot(np.linalg.pinv(M), V)
        mean = mean_mfx[0]
        mfx = mean_mfx[1:, :]
        return mean, mfx

    def _est_loc_constant(self, bw, endog, exog, data_predict):
        r"""
        Local constant (Nadaraya-Watson) estimator of g(x) in the
        regression ``y = g(x) + e``.

        Parameters
        ----------
        bw : array_like
            Array of bandwidth value(s).
        endog : 1D array_like
            The dependent variable.
        exog : 1D or 2D array_like
            The independent variable(s).
        data_predict : 1D or 2D array_like
            The point(s) at which the mean is estimated.

        Returns
        -------
        G : ndarray
            The value of the conditional mean at `data_predict`.
        B_x : ndarray
            The marginal effects.
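
        Notes
        -----
        The conditional mean is the Nadaraya-Watson ratio formed below from
        ``G_numer`` and ``G_denom``,

        .. math:: \hat{g}(x) = \frac{\sum_{i=1}^{n} K_{h}(X_{i}, x)\, Y_{i}}
            {\sum_{i=1}^{n} K_{h}(X_{i}, x)},

        and the marginal effects ``B_x`` are obtained from the kernel
        derivatives (``ker_xc``).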
        """
        ker_x = gpke(bw, data=exog, data_predict=data_predict,
                     var_type=self.var_type,
                     ckertype=self.ckertype,
                     ukertype=self.ukertype,
                     okertype=self.okertype,
                     tosum=False)
        ker_x = np.reshape(ker_x, np.shape(endog))
        G_numer = (ker_x * endog).sum(axis=0)
        G_denom = ker_x.sum(axis=0)
        G = G_numer / G_denom
        nobs = exog.shape[0]
        f_x = G_denom / float(nobs)
        # Weights based on the derivative of the Gaussian kernel, used for
        # the marginal effects below.
        ker_xc = gpke(bw, data=exog, data_predict=data_predict,
                      var_type=self.var_type,
                      ckertype='d_gaussian',
                      #okertype='wangryzin_reg',
                      tosum=False)

        ker_xc = ker_xc[:, np.newaxis]
        d_mx = -(endog * ker_xc).sum(axis=0) / float(nobs) #* np.prod(bw[:, ix_cont]))
        d_fx = -ker_xc.sum(axis=0) / float(nobs) #* np.prod(bw[:, ix_cont]))
        #B_x = d_mx / f_x - G * d_fx / f_x
        B_x = (G_numer * d_fx - G_denom * d_mx) / (G_denom**2)
        #B_x = (f_x * d_mx - m_x * d_fx) / (f_x ** 2)
        return G, B_x

    def aic_hurvich(self, bw, func=None):
        r"""
        Computes the AIC Hurvich criterion for the estimation of the bandwidth.

        Parameters
        ----------
        bw : str or array_like
            See the ``bw`` parameter of `KernelReg` for details.
        func : None
            Unused here; needed in the signature because the optimizer passes
            the same arguments as to `cv_loo`.

        Returns
        -------
        aic : ndarray
            The value of the AIC Hurvich criterion.

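        Notes
        -----
        A sketch of the criterion as computed below (cf. the corrected AIC of
        Hurvich, Simonoff and Tsai), with :math:`H` the kernel smoother
        ("hat") matrix and :math:`\hat{\sigma}^{2}` the mean squared residual:

        .. math:: \text{AIC}_{c} = \ln\left(\hat{\sigma}^{2}\right) +
            \frac{1 + \operatorname{tr}(H)/n}{1 - (\operatorname{tr}(H)+2)/n}
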
        References
        ----------
        See ch. 2 in [1] and p. 35 in [2].
        """
        H = np.empty((self.nobs, self.nobs))
        for j in range(self.nobs):
            H[:, j] = gpke(bw, data=self.exog, data_predict=self.exog[j, :],
                           ckertype=self.ckertype, ukertype=self.ukertype,
                           okertype=self.okertype, var_type=self.var_type,
                           tosum=False)

        denom = H.sum(axis=1)
        H = H / denom
        gx = KernelReg(endog=self.endog, exog=self.exog, var_type=self.var_type,
                       reg_type=self.reg_type, bw=bw,
                       defaults=EstimatorSettings(efficient=False)).fit()[0]
        gx = np.reshape(gx, (self.nobs, 1))
        sigma = ((self.endog - gx)**2).sum(axis=0) / float(self.nobs)

        frac = (1 + np.trace(H) / float(self.nobs)) / \
               (1 - (np.trace(H) + 2) / float(self.nobs))
        #siga = np.dot(self.endog.T, (I - H).T)
        #sigb = np.dot((I - H), self.endog)
        #sigma = np.dot(siga, sigb) / float(self.nobs)
        aic = np.log(sigma) + frac
        return aic

    def cv_loo(self, bw, func):
        r"""
        The cross-validation function with leave-one-out estimator.

        Parameters
        ----------
        bw : array_like
            Vector of bandwidth values.
        func : callable
            Returns the estimator of g(x).  Can be either ``_est_loc_constant``
            (local constant) or ``_est_loc_linear`` (local linear).

        Returns
        -------
        L : float
            The value of the CV function.

        Notes
        -----
        Calculates the cross-validation least-squares function. This function
        is minimized by `_compute_reg_bw` to calculate the optimal value of
        `bw`.

        For details see p. 35 in [2].

        .. math:: CV(h)=n^{-1}\sum_{i=1}^{n}(Y_{i}-g_{-i}(X_{i}))^{2}

        where :math:`g_{-i}(X_{i})` is the leave-one-out estimator of g(X)
        and :math:`h` is the vector of bandwidths.
        """
        LOO_X = LeaveOneOut(self.exog)
        LOO_Y = LeaveOneOut(self.endog).__iter__()
        L = 0
        for ii, X_not_i in enumerate(LOO_X):
            Y = next(LOO_Y)
            # Both arguments are negated below; the kernel weights depend only
            # on the difference (or on equality for the discrete kernels), so
            # the sign flip leaves the default kernels unchanged.
            G = func(bw, endog=Y, exog=-X_not_i,
                     data_predict=-self.exog[ii, :])[0]
            L += (self.endog[ii] - G) ** 2

        # Note: There might be a way to vectorize this. See p.72 in [1]
        return L / self.nobs

    def r_squared(self):
        r"""
        Returns the R-Squared for the nonparametric regression.

        Notes
        -----
        For more details see p. 45 in [2].
        The R-Squared is calculated by:

        .. math:: R^{2}=\frac{\left[\sum_{i=1}^{n}
            (Y_{i}-\bar{y})(\hat{Y_{i}}-\bar{y})\right]^{2}}{\sum_{i=1}^{n}
            (Y_{i}-\bar{y})^{2}\sum_{i=1}^{n}(\hat{Y_{i}}-\bar{y})^{2}},

        where :math:`\hat{Y_{i}}` is the mean calculated in `fit` at the exog
        points.
        """
        Y = np.squeeze(self.endog)
        Yhat = self.fit()[0]
        Y_bar = np.mean(Yhat)
        R2_numer = (((Y - Y_bar) * (Yhat - Y_bar)).sum())**2
        R2_denom = ((Y - Y_bar)**2).sum(axis=0) * \
                   ((Yhat - Y_bar)**2).sum(axis=0)
        return R2_numer / R2_denom

    def fit(self, data_predict=None):
        """
        Returns the mean and marginal effects at the `data_predict` points.

        Parameters
        ----------
        data_predict : array_like, optional
            Points at which to return the mean and marginal effects.  If not
            given, ``data_predict == exog``.

        Returns
        -------
        mean : ndarray
            The regression result for the mean (i.e. the actual curve).
        mfx : ndarray
            The marginal effects, i.e. the partial derivatives of the mean.
        """
        func = self.est[self.reg_type]
        if data_predict is None:
            data_predict = self.exog
        else:
            data_predict = _adjust_shape(data_predict, self.k_vars)

        N_data_predict = np.shape(data_predict)[0]
        mean = np.empty((N_data_predict,))
        mfx = np.empty((N_data_predict, self.k_vars))
        for i in range(N_data_predict):
            mean_mfx = func(self.bw, self.endog, self.exog,
                            data_predict=data_predict[i, :])
            mean[i] = mean_mfx[0]
            mfx_c = np.squeeze(mean_mfx[1])
            mfx[i, :] = mfx_c

        return mean, mfx

    def sig_test(self, var_pos, nboot=50, nested_res=25, pivot=False):
        """
        Significance test for the variables in the regression.

        Parameters
        ----------
        var_pos : sequence
            The position(s) of the variable(s) in exog to be tested.
        nboot : int, optional
            Number of bootstrap samples used to determine the distribution
            of the test statistic.  Default is 50.
        nested_res : int, optional
            Number of nested resamples used to calculate lambda.  Only used
            if `pivot` is True.  Default is 25.
        pivot : bool, optional
            Whether to pivot (studentize) the test statistic.  Only relevant
            for continuous variables.  Default is False.

        Returns
        -------
        sig : str
            The level of significance:

                - `*` : at 90% confidence level
                - `**` : at 95% confidence level
                - `***` : at 99% confidence level
                - "Not Significant" : if not significant
        """
        var_pos = np.asarray(var_pos)
        ix_cont, ix_ord, ix_unord = _get_type_pos(self.var_type)
        if np.any(ix_cont[var_pos]):
            if np.any(ix_ord[var_pos]) or np.any(ix_unord[var_pos]):
                raise ValueError("Discrete variable in hypothesis. Must be "
                                 "continuous")

            Sig = TestRegCoefC(self, var_pos, nboot, nested_res, pivot)
        else:
            Sig = TestRegCoefD(self, var_pos, nboot)

        return Sig.sig

    def __repr__(self):
        """Provide something sane to print."""
        rpr = "KernelReg instance\n"
        rpr += "Number of variables: k_vars = " + str(self.k_vars) + "\n"
        rpr += "Number of samples:   N = " + str(self.nobs) + "\n"
        rpr += "Variable types:      " + self.var_type + "\n"
        rpr += "BW selection method: " + self._bw_method + "\n"
        rpr += "Estimator type: " + self.reg_type + "\n"
        return rpr

    def _get_class_vars_type(self):
        """Helper method to be able to pass needed vars to _compute_subset."""
        class_type = 'KernelReg'
        class_vars = (self.var_type, self.k_vars, self.reg_type)
        return class_type, class_vars

    def _compute_dispersion(self, data):
        """
        Computes the measure of dispersion.

        The minimum of the standard deviation and the interquartile
        range / 1.349.

        References
        ----------
        See the user guide for the np package in R.
        The notes on the bwscaling option in npreg, npudens and npcdens
        contain a discussion of the measure of dispersion.
        """
        data = data[:, 1:]
        return _compute_min_std_IQR(data)


class KernelCensoredReg(KernelReg):
    """
    Nonparametric censored regression.

    Calculates the conditional mean ``E[y|X]`` where ``y = g(X) + e``,
    where y is left-censored.  The left-censored variable Y is defined as
    ``Y = min {Y', L}`` where ``L`` is the value at which ``Y`` is censored
    and ``Y'`` is the true value of the variable.

    Parameters
    ----------
    endog : array_like
        This is the dependent variable.
    exog : array_like
        The training data for the independent variable(s).
        Each element in the list is a separate variable.
    var_type : str
        The type of the variables, one character per variable:

            - c: continuous
            - u: unordered (discrete)
            - o: ordered (discrete)

    reg_type : {'lc', 'll'}
        Type of regression estimator. 'lc' means local constant and
        'll' local linear estimator.
    bw : str or array_like
        Either a user-specified bandwidth or the method for bandwidth
        selection, 'cv_ls' (least-squares cross-validation) or
        'aic' (AIC Hurvich bandwidth estimation).
    ckertype : str, optional
        The kernel used for the continuous variables.
    okertype : str, optional
        The kernel used for the ordered discrete variables.
    ukertype : str, optional
        The kernel used for the unordered discrete variables.
    censor_val : float
        Value at which the dependent variable is censored.
    defaults : EstimatorSettings instance, optional
        The default values for the efficient bandwidth estimation.

    Attributes
    ----------
    bw : array_like
        The bandwidth parameters.
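
    Examples
    --------
    A rough usage sketch on simulated, left-censored data (import path
    assumed as for `KernelReg`; observations equal to ``censor_val`` are
    treated as censored):

    >>> import numpy as np
    >>> from statsmodels.nonparametric.kernel_regression import (
    ...     KernelCensoredReg)
    >>> np.random.seed(12345)
    >>> x = np.random.uniform(-1, 1, size=(100, 2))
    >>> y_latent = x[:, 0] - x[:, 1] + np.random.normal(scale=0.3, size=100)
    >>> y = np.clip(y_latent, 0, None)   # censor at zero
    >>> model = KernelCensoredReg(endog=y, exog=x, var_type='cc',
    ...                           reg_type='ll', censor_val=0)
    >>> mean, mfx = model.fit()
    >>> mean.shape, mfx.shape
    ((100,), (100, 2))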
    """
    def __init__(self, endog, exog, var_type, reg_type, bw='cv_ls',
                 ckertype='gaussian',
                 ukertype='aitchison_aitken_reg',
                 okertype='wangryzin_reg',
                 censor_val=0, defaults=None):
        self.var_type = var_type
        self.data_type = var_type
        self.reg_type = reg_type
        self.ckertype = ckertype
        self.okertype = okertype
        self.ukertype = ukertype
        if not (self.ckertype in kernel_func and self.ukertype in kernel_func
                and self.okertype in kernel_func):
            raise ValueError('user specified kernel must be a supported '
                             'kernel from statsmodels.nonparametric.kernels.')

        self.k_vars = len(self.var_type)
        self.endog = _adjust_shape(endog, 1)
        self.exog = _adjust_shape(exog, self.k_vars)
        self.data = np.column_stack((self.endog, self.exog))
        self.nobs = np.shape(self.exog)[0]
        self.est = dict(lc=self._est_loc_constant, ll=self._est_loc_linear)
        defaults = EstimatorSettings() if defaults is None else defaults
        self._set_defaults(defaults)
        self.censor_val = censor_val
        if self.censor_val is not None:
            self.censored(censor_val)
        else:
            self.W_in = np.ones((self.nobs, 1))

        if not self.efficient:
            self.bw = self._compute_reg_bw(bw)
        else:
            self.bw = self._compute_efficient(bw)

    def censored(self, censor_val):
        # see pp. 341-344 in [1]
        # d marks uncensored observations (1) vs. censored ones (0)
        self.d = (self.endog != censor_val) * 1.
        # sort the data by the dependent variable and keep the permutation
        ix = np.argsort(np.squeeze(self.endog))
        self.sortix = ix
        self.sortix_rev = np.zeros(ix.shape, int)
        self.sortix_rev[ix] = np.arange(len(ix))
        self.endog = np.squeeze(self.endog[ix])
        self.endog = _adjust_shape(self.endog, 1)
        self.exog = np.squeeze(self.exog[ix])
        self.d = np.squeeze(self.d[ix])
        # Kaplan-Meier-type weights for the (sorted) observations
        self.W_in = np.empty((self.nobs, 1))
        for i in range(1, self.nobs + 1):
            P = 1
            for j in range(1, i):
                P *= ((self.nobs - j) / (float(self.nobs) - j + 1))**self.d[j-1]
            self.W_in[i-1, 0] = P * self.d[i-1] / (float(self.nobs) - i + 1)

    def __repr__(self):
        """Provide something sane to print."""
        rpr = "KernelCensoredReg instance\n"
        rpr += "Number of variables: k_vars = " + str(self.k_vars) + "\n"
        rpr += "Number of samples:   nobs = " + str(self.nobs) + "\n"
        rpr += "Variable types:      " + self.var_type + "\n"
        rpr += "BW selection method: " + self._bw_method + "\n"
        rpr += "Estimator type: " + self.reg_type + "\n"
        return rpr

    def _est_loc_linear(self, bw, endog, exog, data_predict, W):
        """
        Local linear estimator of g(x) in the regression ``y = g(x) + e``.

        Parameters
        ----------
        bw : array_like
            Vector of bandwidth value(s).
        endog : 1D array_like
            The dependent variable.
        exog : 1D or 2D array_like
            The independent variable(s).
        data_predict : 1D array_like of length K
            The point at which the mean is estimated, where K is the
            number of variables.
        W : 2D array_like
            The censoring weights computed in `censored` (``W_in``).

        Returns
        -------
        mean : ndarray
            The value of the conditional mean at `data_predict`.
        mfx : ndarray
            The marginal effects at `data_predict`.

        Notes
        -----
        See p. 81 in [1] and p. 38 in [2] for the formulas.
        Unlike other methods, this one requires that `data_predict` be 1D.
        """
        nobs, k_vars = exog.shape
        ker = gpke(bw, data=exog, data_predict=data_predict,
                   var_type=self.var_type,
                   ckertype=self.ckertype,
                   ukertype=self.ukertype,
                   okertype=self.okertype, tosum=False)
        # Create the matrix on p.492 in [7], after the multiplication w/ K_h,ij
        # See also p. 38 in [2]

        # Convert ker to a 2-D array to make matrix operations below work
        ker = W * ker[:, np.newaxis]

        M12 = exog - data_predict
        M22 = np.dot(M12.T, M12 * ker)
        M12 = (M12 * ker).sum(axis=0)
        M = np.empty((k_vars + 1, k_vars + 1))
        M[0, 0] = ker.sum()
        M[0, 1:] = M12
        M[1:, 0] = M12
        M[1:, 1:] = M22

        ker_endog = ker * endog
        V = np.empty((k_vars + 1, 1))
        V[0, 0] = ker_endog.sum()
        V[1:, 0] = ((exog - data_predict) * ker_endog).sum(axis=0)

        mean_mfx = np.dot(np.linalg.pinv(M), V)
        mean = mean_mfx[0]
        mfx = mean_mfx[1:, :]
        return mean, mfx


    def cv_loo(self, bw, func):
        r"""
        The cross-validation function with leave-one-out estimator.

        Parameters
        ----------
        bw : array_like
            Vector of bandwidth values.
        func : callable
            Returns the estimator of g(x).
            Can be either ``_est_loc_constant`` (local constant) or
            ``_est_loc_linear`` (local linear).

        Returns
        -------
        L : float
            The value of the CV function.

        Notes
        -----
        Calculates the cross-validation least-squares function. This
        function is minimized by `_compute_reg_bw` to calculate the
        optimal value of `bw`.

        For details see p. 35 in [2].

        .. math:: CV(h)=n^{-1}\sum_{i=1}^{n}(Y_{i}-g_{-i}(X_{i}))^{2}

        where :math:`g_{-i}(X_{i})` is the leave-one-out estimator of g(X)
        and :math:`h` is the vector of bandwidths.
        """
        LOO_X = LeaveOneOut(self.exog)
        LOO_Y = LeaveOneOut(self.endog).__iter__()
        LOO_W = LeaveOneOut(self.W_in).__iter__()
        L = 0
        for ii, X_not_i in enumerate(LOO_X):
            Y = next(LOO_Y)
            w = next(LOO_W)
            # As in KernelReg.cv_loo, both arguments are negated; this leaves
            # the weights from the symmetric default kernels unchanged.
            G = func(bw, endog=Y, exog=-X_not_i,
                     data_predict=-self.exog[ii, :], W=w)[0]
            L += (self.endog[ii] - G) ** 2

        # Note: There might be a way to vectorize this. See p.72 in [1]
        return L / self.nobs

    def fit(self, data_predict=None):
        """
        Returns the mean and marginal effects at the `data_predict` points.
        """
        func = self.est[self.reg_type]
        if data_predict is None:
            data_predict = self.exog
        else:
            data_predict = _adjust_shape(data_predict, self.k_vars)

        N_data_predict = np.shape(data_predict)[0]
        mean = np.empty((N_data_predict,))
        mfx = np.empty((N_data_predict, self.k_vars))
        for i in range(N_data_predict):
            mean_mfx = func(self.bw, self.endog, self.exog,
                            data_predict=data_predict[i, :],
                            W=self.W_in)
            mean[i] = mean_mfx[0]
            mfx_c = np.squeeze(mean_mfx[1])
            mfx[i, :] = mfx_c

        return mean, mfx


class TestRegCoefC(object):
    r"""
    Significance test for continuous variables in a nonparametric regression.

    The null hypothesis is that the marginal effect of the tested variable(s)
    is zero, ``dE(Y|X)/dX_test = 0``; the alternative hypothesis is
    ``dE(Y|X)/dX_test != 0``.

    Parameters
    ----------
    model : KernelReg instance
        This is the nonparametric regression model whose elements
        are tested for significance.
    test_vars : tuple, list of integers, array_like
        Index of position of the continuous variables to be tested
        for significance. E.g. (1, 3, 5) jointly tests variables at
        positions 1, 3 and 5 for significance.
    nboot : int
        Number of bootstrap samples used to determine the distribution
        of the test statistic in a finite sample. Default is 400.
    nested_res : int
        Number of nested resamples used to calculate lambda.
        Only used if the pivot option is enabled.
    pivot : bool
        If True, the test statistic is pivoted by dividing by its standard
        error.  This significantly increases computational time, but pivotal
        statistics have more desirable properties (see references).

    Attributes
    ----------
    sig : str
        The significance level of the variable(s) tested:

            - "Not Significant": Not significant at the 90% confidence level;
              fails to reject the null
            - "*": Significant at the 90% confidence level
            - "**": Significant at the 95% confidence level
            - "***": Significant at the 99% confidence level

    Notes
    -----
    This class allows testing of joint hypotheses as long as all variables
    are continuous.

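    The unpivoted statistic, as implemented in ``_compute_lambda``, is the
    average squared marginal effect of the tested variable(s),

    .. math:: \lambda = n^{-1}\sum_{i=1}^{n}\sum_{k}
        \left[\frac{\partial\hat{g}(X_{i})}{\partial x_{k}}\right]^{2},

    where the inner sum runs over the tested variables; its bootstrap
    distribution under the restricted (null) model is used to assess
    significance.
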
    References
    ----------
    Racine, J.: "Consistent Significance Testing for Nonparametric Regression"
    Journal of Business & Economic Statistics.

    Chapter 12 in [1].
    """
    # Significance of continuous vars in nonparametric regression
    # Racine: Consistent Significance Testing for Nonparametric Regression
    # Journal of Business & Economic Statistics
    def __init__(self, model, test_vars, nboot=400, nested_res=400,
                 pivot=False):
        self.nboot = nboot
        self.nres = nested_res
        self.test_vars = test_vars
        self.model = model
        self.bw = model.bw
        self.var_type = model.var_type
        self.k_vars = len(self.var_type)
        self.endog = model.endog
        self.exog = model.exog
        self.gx = model.est[model.reg_type]
        self.pivot = pivot
        self.run()

    def run(self):
        self.test_stat = self._compute_test_stat(self.endog, self.exog)
        self.sig = self._compute_sig()

    def _compute_test_stat(self, Y, X):
        """
        Computes the test statistic.  See p. 371 in [8].
        """
        lam = self._compute_lambda(Y, X)
        t = lam
        if self.pivot:
            se_lam = self._compute_se_lambda(Y, X)
            t = lam / float(se_lam)

        return t

    def _compute_lambda(self, Y, X):
        """Computes only lambda -- the main part of the test statistic"""
        n = np.shape(X)[0]
        Y = _adjust_shape(Y, 1)
        X = _adjust_shape(X, self.k_vars)
        b = KernelReg(Y, X, self.var_type, self.model.reg_type, self.bw,
                      defaults=EstimatorSettings(efficient=False)).fit()[1]

        b = b[:, self.test_vars]
        b = np.reshape(b, (n, len(self.test_vars)))
        #fct = np.std(b)  # Pivot the statistic by dividing by SE
        fct = 1.  # Do not pivot here; pivoting is handled in _compute_test_stat
        lam = ((b / fct) ** 2).sum() / float(n)
        return lam

    def _compute_se_lambda(self, Y, X):
        """
        Calculates the SE of lambda by nested resampling.

        Used to pivot the statistic.  Bootstrapping works better with
        pivotal statistics but slows down computation significantly.
        """
        n = np.shape(Y)[0]
        lam = np.empty(shape=(self.nres,))
        for i in range(self.nres):
            ind = np.random.randint(0, n, size=(n, 1))
            Y1 = Y[ind, 0]
            X1 = X[ind, :]
            lam[i] = self._compute_lambda(Y1, X1)

        se_lambda = np.std(lam)
        return se_lambda

    def _compute_sig(self):
        """
        Computes the significance value for the variable(s) tested.

        The empirical distribution of the test statistic is obtained through
        bootstrapping the sample.  The null hypothesis is rejected if the test
        statistic is larger than the 90th, 95th or 99th percentile.
        """
        t_dist = np.empty(shape=(self.nboot, ))
        Y = self.endog
        X = copy.deepcopy(self.exog)
        n = np.shape(Y)[0]

        X[:, self.test_vars] = np.mean(X[:, self.test_vars], axis=0)
        # Calculate the restricted mean. See p. 372 in [8]
        M = KernelReg(Y, X, self.var_type, self.model.reg_type, self.bw,
                      defaults=EstimatorSettings(efficient=False)).fit()[0]
        M = np.reshape(M, (n, 1))
        e = Y - M
        e = e - np.mean(e)  # recenter residuals
        for i in range(self.nboot):
            ind = np.random.randint(0, n, size=(n, 1))
            e_boot = e[ind, 0]
            Y_boot = M + e_boot
            t_dist[i] = self._compute_test_stat(Y_boot, self.exog)

        self.t_dist = t_dist
        sig = "Not Significant"
        if self.test_stat > mquantiles(t_dist, 0.9):
            sig = "*"
        if self.test_stat > mquantiles(t_dist, 0.95):
            sig = "**"
        if self.test_stat > mquantiles(t_dist, 0.99):
            sig = "***"

        return sig


class TestRegCoefD(TestRegCoefC):
    r"""
    Significance test for the categorical variables in a nonparametric
    regression.

    Parameters
    ----------
    model : KernelReg instance
        This is the nonparametric regression model whose elements
        are tested for significance.
    test_vars : tuple, list of one element
        Index of position of the discrete variable to be tested
        for significance. E.g. (3,) tests the variable at
        position 3 for significance.
    nboot : int
        Number of bootstrap samples used to determine the distribution
        of the test statistic in a finite sample. Default is 400.

    Attributes
    ----------
    sig : str
        The significance level of the variable(s) tested:

            - "Not Significant": Not significant at the 90% confidence level;
              fails to reject the null
            - "*": Significant at the 90% confidence level
            - "**": Significant at the 95% confidence level
            - "***": Significant at the 99% confidence level

    Notes
    -----
    This class currently does not allow joint hypotheses.
    Only one variable can be tested at a time.

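    A sketch of the statistic as implemented in ``_compute_test_stat``: with
    the tested variable set to the base value 0 and then to each remaining
    observed support point :math:`d_{l}`, the fitted conditional means are
    compared via

    .. math:: I_{n} = n^{-1}\sum_{i=1}^{n}\sum_{l}
        \left[\hat{m}(X_{i}, z=d_{l}) - \hat{m}(X_{i}, z=0)\right]^{2}.

    The distribution of :math:`I_{n}` under the null is approximated with a
    two-point wild bootstrap over the centered residuals.
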
    References
    ----------
    See [9] and chapter 12 in [1].
    """

    def _compute_test_stat(self, Y, X):
        """Computes the test statistic."""

        dom_x = np.sort(np.unique(self.exog[:, self.test_vars]))

        n = np.shape(X)[0]
        model = KernelReg(Y, X, self.var_type, self.model.reg_type, self.bw,
                          defaults=EstimatorSettings(efficient=False))
        X1 = copy.deepcopy(X)
        X1[:, self.test_vars] = 0

        m0 = model.fit(data_predict=X1)[0]
        m0 = np.reshape(m0, (n, 1))
        zvec = np.zeros((n, 1))
        for i in dom_x[1:]:
            X1[:, self.test_vars] = i
            m1 = model.fit(data_predict=X1)[0]
            m1 = np.reshape(m1, (n, 1))
            zvec += (m1 - m0) ** 2

        avg = zvec.sum(axis=0) / float(n)
        return avg

    def _compute_sig(self):
        """Calculates the significance level of the variable tested"""

        m = self._est_cond_mean()
        Y = self.endog
        X = self.exog
        n = np.shape(X)[0]
        u = Y - m
        u = u - np.mean(u)  # center
        # Two-point (golden-ratio) wild bootstrap weights, cf. Mammen (1993)
        fct1 = (1 - 5**0.5) / 2.
        fct2 = (1 + 5**0.5) / 2.
        u1 = fct1 * u
        u2 = fct2 * u
        r = fct2 / (5 ** 0.5)
        I_dist = np.empty((self.nboot, 1))
        for j in range(self.nboot):
            u_boot = copy.deepcopy(u2)

            prob = np.random.uniform(0, 1, size=(n, 1))
            ind = prob < r
            u_boot[ind] = u1[ind]
            Y_boot = m + u_boot
            I_dist[j] = self._compute_test_stat(Y_boot, X)

        sig = "Not Significant"
        if self.test_stat > mquantiles(I_dist, 0.9):
            sig = "*"
        if self.test_stat > mquantiles(I_dist, 0.95):
            sig = "**"
        if self.test_stat > mquantiles(I_dist, 0.99):
            sig = "***"

        return sig

    def _est_cond_mean(self):
        """
        Calculates the expected conditional mean
        m(X, Z=l) for all possible l
        """
        self.dom_x = np.sort(np.unique(self.exog[:, self.test_vars]))
        X = copy.deepcopy(self.exog)
        m = 0
        for i in self.dom_x:
            X[:, self.test_vars] = i
            m += self.model.fit(data_predict=X)[0]

        m = m / float(len(self.dom_x))
        m = np.reshape(m, (np.shape(self.exog)[0], 1))
        return m