1"""
2Ordinary Least Squares regression with regimes.
3"""
4
5__author__ = "Luc Anselin luc.anselin@asu.edu, Pedro V. Amaral pedro.amaral@asu.edu, Daniel Arribas-Bel darribas@asu.edu"
6
7from . import regimes as REGI
8from . import user_output as USER
9from .ols import BaseOLS
10from .utils import set_warn, spbroadcast, RegressionProps_basic, RegressionPropsY, spdot
11from .robust import hac_multi
12from . import summary_output as SUMMARY
13import numpy as np
14import multiprocessing as mp
15from platform import system
16import scipy.sparse as SP
17import copy as COPY
18
19
20class OLS_Regimes(BaseOLS, REGI.Regimes_Frame, RegressionPropsY):
21
22    """
23    Ordinary least squares with results and diagnostics.
24
25    Parameters
26    ----------
27    y            : array
28                   nx1 array for dependent variable
29    x            : array
30                   Two dimensional array with n rows and one column for each
31                   independent (exogenous) variable, excluding the constant
32    regimes      : list
33                   List of n values with the mapping of each
34                   observation to a regime. Assumed to be aligned with 'x'.
35    w            : pysal W object
36                   Spatial weights object (required if running spatial
37                   diagnostics)
38    robust       : string
39                   If 'white', then a White consistent estimator of the
40                   variance-covariance matrix is given.  If 'hac', then a
41                   HAC consistent estimator of the variance-covariance
42                   matrix is given. Default set to None.
43    gwk          : pysal W object
44                   Kernel spatial weights needed for HAC estimation. Note:
45                   matrix must have ones along the main diagonal.
46    sig2n_k      : boolean
47                   If True, then use n-k to estimate sigma^2. If False, use n.
48    nonspat_diag : boolean
49                   If True, then compute non-spatial diagnostics on
50                   the regression.
51    spat_diag    : boolean
52                   If True, then compute Lagrange multiplier tests (requires
53                   w). Note: see moran for further tests.
54    moran        : boolean
55                   If True, compute Moran's I on the residuals. Note:
56                   requires spat_diag=True.
57    white_test   : boolean
58                   If True, compute White's specification robust test.
59                   (requires nonspat_diag=True)
60    vm           : boolean
61                   If True, include variance-covariance matrix in summary
62                   results
63    constant_regi: string, optional
64                   Switcher controlling the constant term setup. It may take
65                   the following values:
66
67                   *  'one': a vector of ones is appended to x and held constant across regimes
68
69                   * 'many': a vector of ones is appended to x and considered different per regime (default)
70    cols2regi    : list, 'all'
71                   Argument indicating whether each
72                   column of x should be considered as different per regime
73                   or held constant across regimes (False).
74                   If a list, k booleans indicating for each variable the
75                   option (True if one per regime, False to be held constant).
76                   If 'all' (default), all the variables vary by regime.
77    regime_err_sep  : boolean
78                   If True, a separate regression is run for each regime.
79    cores        : boolean
80                   Specifies if multiprocessing is to be used
81                   Default: no multiprocessing, cores = False
82                   Note: Multiprocessing may not work on all platforms.
83    name_y       : string
84                   Name of dependent variable for use in output
85    name_x       : list of strings
86                   Names of independent variables for use in output
87    name_w       : string
88                   Name of weights matrix for use in output
89    name_gwk     : string
90                   Name of kernel weights matrix for use in output
91    name_ds      : string
92                   Name of dataset for use in output
93    name_regimes : string
94                   Name of regime variable for use in the output
95
96
97    Attributes
98    ----------
99    summary      : string
100                   Summary of regression results and diagnostics (note: use in
101                   conjunction with the print command)
102    betas        : array
103                   kx1 array of estimated coefficients
104    u            : array
105                   nx1 array of residuals
106    predy        : array
107                   nx1 array of predicted y values
108    n            : integer
109                   Number of observations
110    k            : integer
111                   Number of variables for which coefficients are estimated
112                   (including the constant)
113                   Only available in dictionary 'multi' when multiple regressions
114                   (see 'multi' below for details)
115    y            : array
116                   nx1 array for dependent variable
117    x            : array
118                   Two dimensional array with n rows and one column for each
119                   independent (exogenous) variable, including the constant
120                   Only available in dictionary 'multi' when multiple regressions
121                   (see 'multi' below for details)
122    robust       : string
123                   Adjustment for robust standard errors
124                   Only available in dictionary 'multi' when multiple regressions
125                   (see 'multi' below for details)
126    mean_y       : float
127                   Mean of dependent variable
128    std_y        : float
129                   Standard deviation of dependent variable
130    vm           : array
131                   Variance covariance matrix (kxk)
132    r2           : float
133                   R squared
134                   Only available in dictionary 'multi' when multiple regressions
135                   (see 'multi' below for details)
136    ar2          : float
137                   Adjusted R squared
138                   Only available in dictionary 'multi' when multiple regressions
139                   (see 'multi' below for details)
140    utu          : float
141                   Sum of squared residuals
142    sig2         : float
143                   Sigma squared used in computations
144                   Only available in dictionary 'multi' when multiple regressions
145                   (see 'multi' below for details)
146    sig2ML       : float
147                   Sigma squared (maximum likelihood)
148                   Only available in dictionary 'multi' when multiple regressions
149                   (see 'multi' below for details)
150    f_stat       : tuple
151                   Statistic (float), p-value (float)
152                   Only available in dictionary 'multi' when multiple regressions
153                   (see 'multi' below for details)
154    logll        : float
155                   Log likelihood
156                   Only available in dictionary 'multi' when multiple regressions
157                   (see 'multi' below for details)
158    aic          : float
159                   Akaike information criterion
160                   Only available in dictionary 'multi' when multiple regressions
161                   (see 'multi' below for details)
162    schwarz      : float
163                   Schwarz information criterion
164                   Only available in dictionary 'multi' when multiple regressions
165                   (see 'multi' below for details)
166    std_err      : array
167                   1xk array of standard errors of the betas
168                   Only available in dictionary 'multi' when multiple regressions
169                   (see 'multi' below for details)
170    t_stat       : list of tuples
171                   t statistic; each tuple contains the pair (statistic,
172                   p-value), where each is a float
173                   Only available in dictionary 'multi' when multiple regressions
174                   (see 'multi' below for details)
175    mulColli     : float
176                   Multicollinearity condition number
177                   Only available in dictionary 'multi' when multiple regressions
178                   (see 'multi' below for details)
179    jarque_bera  : dictionary
180                   'jb': Jarque-Bera statistic (float); 'pvalue': p-value
181                   (float); 'df': degrees of freedom (int)
182                   Only available in dictionary 'multi' when multiple regressions
183                   (see 'multi' below for details)
184    breusch_pagan : dictionary
185                    'bp': Breusch-Pagan statistic (float); 'pvalue': p-value
186                    (float); 'df': degrees of freedom (int)
187                    Only available in dictionary 'multi' when multiple regressions
188                    (see 'multi' below for details)
189    koenker_bassett: dictionary
190                     'kb': Koenker-Bassett statistic (float); 'pvalue': p-value (float);
191                     'df': degrees of freedom (int). Only available in dictionary
192                     'multi' when multiple regressions (see 'multi' below for details).
193    white         : dictionary
194                    'wh': White statistic (float); 'pvalue': p-value (float);
195                    'df': degrees of freedom (int).
196                    Only available in dictionary 'multi' when multiple regressions
197                    (see 'multi' below for details)
198    lm_error      : tuple
199                    Lagrange multiplier test for spatial error model; tuple
200                    contains the pair (statistic, p-value), where each is a
201                    float. Only available in dictionary 'multi' when multiple
202                    regressions (see 'multi' below for details)
203    lm_lag        : tuple
204                    Lagrange multiplier test for spatial lag model; tuple
205                    contains the pair (statistic, p-value), where each is a
206                    float. Only available in dictionary 'multi' when multiple
207                    regressions (see 'multi' below for details)
208    rlm_error     : tuple
209                    Robust lagrange multiplier test for spatial error model;
210                    tuple contains the pair (statistic, p-value), where each
211                    is a float. Only available in dictionary 'multi' when multiple
212                    regressions (see 'multi' below for details)
213    rlm_lag       : tuple
214                    Robust lagrange multiplier test for spatial lag model;
215                    tuple contains the pair (statistic, p-value), where each
216                    is a float. Only available in dictionary 'multi' when
217                    multiple regressions (see 'multi' below for details)
218    lm_sarma      : tuple
219                    Lagrange multiplier test for spatial SARMA model; tuple
220                    contains the pair (statistic, p-value), where each is a
221                    float. Only available in dictionary 'multi' when multiple
222                    regressions (see 'multi' below for details)
223    moran_res     : tuple
224                    Moran's I for the residuals; tuple containing the triple
225                    (Moran's I, standardized Moran's I, p-value)
226    name_y        : string
227                    Name of dependent variable for use in output
228    name_x        : list of strings
229                    Names of independent variables for use in output
230    name_w        : string
231                    Name of weights matrix for use in output
232    name_gwk      : string
233                    Name of kernel weights matrix for use in output
234    name_ds       : string
235                    Name of dataset for use in output
236    name_regimes  : string
237                    Name of regime variable for use in the output
238    title         : string
239                    Name of the regression method used.
240                    Only available in dictionary 'multi' when multiple regressions
241                    (see 'multi' below for details)
242    sig2n         : float
243                    Sigma squared (computed with n in the denominator)
244    sig2n_k       : float
245                    Sigma squared (computed with n-k in the denominator)
246    xtx           : float
247                    :math:`X'X`. Only available in dictionary 'multi' when multiple
248                    regressions (see 'multi' below for details)
249    xtxi          : float
250                    :math:`(X'X)^{-1}`. Only available in dictionary 'multi' when multiple
251                    regressions (see 'multi' below for details)
252    regimes       : list
253                    List of n values with the mapping of each observation to
254                    a regime. Assumed to be aligned with 'x'.
255    constant_regi : string
256                    Ignored if regimes=False. Constant option for regimes.
257                    Switcher controlling the constant term setup. It may take
258                    the following values:
259
260                    *  'one': a vector of ones is appended to x and held constant across regimes.
261
262                    * 'many': a vector of ones is appended to x and considered different per regime.
263    cols2regi    : list
264                   Ignored if regimes=False. Argument indicating whether each
265                   column of x should be considered as different per regime
266                   or held constant across regimes (False).
267                   If a list, k booleans indicating for each variable the
268                   option (True if one per regime, False to be held constant).
269                   If 'all', all the variables vary by regime.
270    regime_err_sep: boolean
271                    If True, a separate regression is run for each regime.
272    kr           : int
273                   Number of variables/columns to be "regimized" or subject
274                   to change by regime. These will result in one parameter
275                   estimate by regime for each variable (i.e. nr parameters per
276                   variable)
277    kf           : int
278                   Number of variables/columns to be considered fixed or
279                   global across regimes and hence only obtain one parameter
280                   estimate.
281    nr           : int
282                   Number of different regimes in the 'regimes' list.
283    multi        : dictionary
284                   Only available when multiple regressions are estimated,
285                   i.e. when regime_err_sep=True and no variable is fixed
286                   across regimes.
287                   Contains all attributes of each individual regression.
288
289    Examples
290    --------
291    >>> import numpy as np
292    >>> import libpysal
293    >>> from spreg import OLS_Regimes
294
295    Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open().
296    This is the DBF associated with the NAT shapefile.  Note that
297    libpysal.io.open() also reads data in CSV format; since the actual class
298    requires data to be passed in as numpy arrays, the user can read their
299    data in using any method.
300
301    >>> db = libpysal.io.open(libpysal.examples.get_path("NAT.dbf"),'r')
302
303    Extract the HR90 column (homicide rates in 1990) from the DBF file and make it
304    the dependent variable for the regression. Note that PySAL requires this to be
305    an numpy array of shape (n, 1) as opposed to the also common shape of (n, )
306    that other packages accept.
307
308    >>> y_var = 'HR90'
309    >>> y = db.by_col(y_var)
310    >>> y = np.array(y).reshape(len(y), 1)
311
312    Extract UE90 (unemployment rate) and PS90 (population structure) vectors from
313    the DBF to be used as independent variables in the regression. Other variables
314    can be inserted by adding their names to x_var, such as x_var = ['Var1','Var2','...]
315    Note that PySAL requires this to be an nxj numpy array, where j is the
316    number of independent variables (not including a constant). By default
317    this model adds a vector of ones to the independent variables passed in.
318
319    >>> x_var = ['PS90','UE90']
320    >>> x = np.array([db.by_col(name) for name in x_var]).T
321
322    The different regimes in this data are given according to the North and
323    South dummy (SOUTH).
324
325    >>> r_var = 'SOUTH'
326    >>> regimes = db.by_col(r_var)
327
328    We can now run the regression and then have a summary of the output
329    by typing: olsr.summary
330    Alternatively, we can just check the betas and standard errors of the
331    parameters:
332
333    >>> olsr = OLS_Regimes(y, x, regimes, nonspat_diag=False, name_y=y_var, name_x=['PS90','UE90'], name_regimes=r_var, name_ds='NAT')
334    >>> olsr.betas
335    array([[0.39642899],
336           [0.65583299],
337           [0.48703937],
338           [5.59835   ],
339           [1.16210453],
340           [0.53163886]])
341    >>> np.sqrt(olsr.vm.diagonal())
342    array([0.24816345, 0.09662678, 0.03628629, 0.46894564, 0.21667395,
343           0.05945651])
344    >>> olsr.cols2regi
345    'all'
346    """
347
348    def __init__(self, y, x, regimes,
349                 w=None, robust=None, gwk=None, sig2n_k=True,
350                 nonspat_diag=True, spat_diag=False, moran=False, white_test=False,
351                 vm=False, constant_regi='many', cols2regi='all',
352                 regime_err_sep=True, cores=False,
353                 name_y=None, name_x=None, name_regimes=None,
354                 name_w=None, name_gwk=None, name_ds=None):
355
356        n = USER.check_arrays(y, x)
357        y = USER.check_y(y, n)
358        USER.check_weights(w, y)
359        USER.check_robust(robust, gwk)
360        USER.check_spat_diag(spat_diag, w)
361        x_constant,name_x,warn = USER.check_constant(x,name_x,just_rem=True)
362        set_warn(self,warn)
363        name_x = USER.set_name_x(name_x, x_constant, constant=True)
364        self.name_x_r = USER.set_name_x(name_x, x_constant)
365        self.constant_regi = constant_regi
366        self.cols2regi = cols2regi
367        self.name_w = USER.set_name_w(name_w, w)
368        self.name_gwk = USER.set_name_w(name_gwk, gwk)
369        self.name_ds = USER.set_name_ds(name_ds)
370        self.name_y = USER.set_name_y(name_y)
371        self.name_regimes = USER.set_name_ds(name_regimes)
372        self.n = n
373        cols2regi = REGI.check_cols2regi(
374            constant_regi, cols2regi, x_constant, add_cons=False)
375        self.regimes_set = REGI._get_regimes_set(regimes)
376        self.regimes = regimes
377        USER.check_regimes(self.regimes_set, self.n, x_constant.shape[1])
378        if regime_err_sep == True and robust == 'hac':
379            set_warn(
380                self, "Error by regimes is incompatible with HAC estimation. Hence, error by regimes has been disabled for this model.")
381            regime_err_sep = False
382        self.regime_err_sep = regime_err_sep
383        if regime_err_sep == True and set(cols2regi) == set([True]) and constant_regi == 'many':
384            self.y = y
385            regi_ids = dict(
386                (r, list(np.where(np.array(regimes) == r)[0])) for r in self.regimes_set)
387            self._ols_regimes_multi(x_constant, w, regi_ids, cores,
388                                    gwk, sig2n_k, robust, nonspat_diag, spat_diag, vm, name_x, moran, white_test)
389        else:
390            x, self.name_x = REGI.Regimes_Frame.__init__(self, x_constant,
391                                                         regimes, constant_regi, cols2regi, name_x)
392            BaseOLS.__init__(
393                self, y=y, x=x, robust=robust, gwk=gwk, sig2n_k=sig2n_k)
394            if regime_err_sep == True and robust == None:
395                y2, x2 = REGI._get_weighted_var(
396                    regimes, self.regimes_set, sig2n_k, self.u, y, x)
397                ols2 = BaseOLS(y=y2, x=x2, sig2n_k=sig2n_k)
398                RegressionProps_basic(self, betas=ols2.betas, vm=ols2.vm)
399                self.title = "ORDINARY LEAST SQUARES - REGIMES (Group-wise heteroskedasticity)"
400                nonspat_diag = None
401                set_warn(
402                    self, "Residuals treated as homoskedastic for the purpose of diagnostics.")
403            else:
404                self.title = "ORDINARY LEAST SQUARES - REGIMES"
405            self.robust = USER.set_robust(robust)
406            self.chow = REGI.Chow(self)
407            SUMMARY.OLS(reg=self, vm=vm, w=w, nonspat_diag=nonspat_diag,
408                        spat_diag=spat_diag, moran=moran, white_test=white_test, regimes=True)
409
410    def _ols_regimes_multi(self, x, w, regi_ids, cores,
411                           gwk, sig2n_k, robust, nonspat_diag, spat_diag, vm, name_x, moran, white_test):
412        results_p = {}
413        """
414        for r in self.regimes_set:
415            if system() == 'Windows':
416                is_win = True
417                results_p[r] = _work(*(self.y,x,w,regi_ids,r,robust,sig2n_k,self.name_ds,self.name_y,name_x,self.name_w,self.name_regimes))
418            else:
419                pool = mp.Pool(cores)
420                results_p[r] = pool.apply_async(_work,args=(self.y,x,w,regi_ids,r,robust,sig2n_k,self.name_ds,self.name_y,name_x,self.name_w,self.name_regimes))
421                is_win = False
422        """
423        x_constant,name_x = REGI.check_const_regi(self,x,name_x,regi_ids)
424        self.name_x_r = name_x
425        for r in self.regimes_set:
426            if cores:
427                pool = mp.Pool(None)
428                results_p[r] = pool.apply_async(_work, args=(
429                    self.y, x_constant, w, regi_ids, r, robust, sig2n_k, self.name_ds, self.name_y, name_x, self.name_w, self.name_regimes))
430            else:
431                results_p[r] = _work(*(self.y, x_constant, w, regi_ids, r, robust, sig2n_k,
432                                       self.name_ds, self.name_y, name_x, self.name_w, self.name_regimes))
433        self.kryd = 0
434        self.kr = x_constant.shape[1]
435        self.kf = 0
436        self.nr = len(self.regimes_set)
437        self.vm = np.zeros((self.nr * self.kr, self.nr * self.kr), float)
438        self.betas = np.zeros((self.nr * self.kr, 1), float)
439        self.u = np.zeros((self.n, 1), float)
440        self.predy = np.zeros((self.n, 1), float)
441        """
442        if not is_win:
443            pool.close()
444            pool.join()
445        """
446        if cores:
447            pool.close()
448            pool.join()
449
450        results = {}
451        self.name_y, self.name_x = [], []
452        counter = 0
453        for r in self.regimes_set:
454            """
455            if is_win:
456                results[r] = results_p[r]
457            else:
458                results[r] = results_p[r].get()
459            """
460            if not cores:
461                results[r] = results_p[r]
462            else:
463                results[r] = results_p[r].get()
464
465            self.vm[(counter * self.kr):((counter + 1) * self.kr),
466                    (counter * self.kr):((counter + 1) * self.kr)] = results[r].vm
467            self.betas[
468                (counter * self.kr):((counter + 1) * self.kr), ] = results[r].betas
469            self.u[regi_ids[r], ] = results[r].u
470            self.predy[regi_ids[r], ] = results[r].predy
471            self.name_y += results[r].name_y
472            self.name_x += results[r].name_x
473            counter += 1
474        self.multi = results
475        self.hac_var = x_constant[:,1:]
476        if robust == 'hac':
477            hac_multi(self, gwk)
478        self.chow = REGI.Chow(self)
479        if spat_diag:
480            self._get_spat_diag_props(x_constant, sig2n_k)
481        SUMMARY.OLS_multi(reg=self, multireg=self.multi, vm=vm, nonspat_diag=nonspat_diag,
482                          spat_diag=spat_diag, moran=moran, white_test=white_test, regimes=True, w=w)
483
484    def _get_spat_diag_props(self, x, sig2n_k):
485        self.k = self.kr
486        self._cache = {}
487        self.x = REGI.regimeX_setup(
488            x, self.regimes, [True] * x.shape[1], self.regimes_set)
489        self.xtx = spdot(self.x.T, self.x)
490        self.xtxi = np.linalg.inv(self.xtx)
491
492
493def _work(y, x, w, regi_ids, r, robust, sig2n_k, name_ds, name_y, name_x, name_w, name_regimes):
494    y_r = y[regi_ids[r]]
495    x_r = x[regi_ids[r]]
496    #x_constant,name_x,warn = USER.check_constant(x_r, name_x)
497    #name_x = USER.set_name_x(name_x, x_constant)
498    if robust == 'hac':
499        robust = None
500    model = BaseOLS(y_r, x_r, robust=robust, sig2n_k=sig2n_k)
501    model.title = "ORDINARY LEAST SQUARES ESTIMATION - REGIME %s" % r
502    model.robust = USER.set_robust(robust)
503    model.name_ds = name_ds
504    model.name_y = '%s_%s' % (str(r), name_y)
505    model.name_x = ['%s_%s' % (str(r), i) for i in name_x]
506    model.name_w = name_w
507    model.name_regimes = name_regimes
508    if w:
509        w_r, warn = REGI.w_regime(w, regi_ids[r], r, transform=True)
510        set_warn(model, warn)
511        model.w = w_r
512    return model
513
514
515def _test():
516    import doctest
517    start_suppress = np.get_printoptions()['suppress']
518    np.set_printoptions(suppress=True)
519    doctest.testmod()
520    np.set_printoptions(suppress=start_suppress)
521
522if __name__ == '__main__':
523    _test()
524    import numpy as np
525    import libpysal
526    db = libpysal.io.open(libpysal.examples.get_path("NAT.dbf"),'r')
527    y_var = 'CRIME'
528    y = np.array([db.by_col(y_var)]).reshape(49, 1)
529    x_var = ['INC', 'HOVAL']
530    x = np.array([db.by_col(name) for name in x_var]).T
531    r_var = 'NSA'
532    regimes = db.by_col(r_var)
533    w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
534    w.transform = 'r'
535    olsr = OLS_Regimes(y, x, regimes, w=w, constant_regi='many', nonspat_diag=False, spat_diag=False, name_y=y_var, name_x=['INC', 'HOVAL'],
536                       name_ds='columbus', name_regimes=r_var, name_w='columbus.gal', regime_err_sep=True, cols2regi=[True, True], sig2n_k=True, robust='white')
537    print(olsr.summary)
538