1import numpy as np
2from . import regimes as REGI
3from . import user_output as USER
4import multiprocessing as mp
5import scipy.sparse as SP
6from .utils import sphstack, set_warn, RegressionProps_basic, spdot, sphstack
7from .twosls import BaseTSLS
8from .robust import hac_multi
9from . import summary_output as SUMMARY
10from platform import system
11
12"""
13Two-stage Least Squares estimation with regimes.
14"""
15
16__author__ = "Luc Anselin luc.anselin@asu.edu, Pedro V. Amaral pedro.amaral@asu.edu, David C. Folch david.folch@asu.edu"
17
18
19class TSLS_Regimes(BaseTSLS, REGI.Regimes_Frame):
20
21    """
22    Two stage least squares (2SLS) with regimes.
23
24    Parameters
25    ----------
26    y            : array
27                   nx1 array for dependent variable
28    x            : array
29                   Two dimensional array with n rows and one column for each
30                   independent (exogenous) variable, excluding the constant
31    yend         : array
32                   Two dimensional array with n rows and one column for each
33                   endogenous variable
34    q            : array
35                   Two dimensional array with n rows and one column for each
36                   external exogenous variable to use as instruments (note:
37                   this should not contain any variables from x)
38    regimes      : list
39                   List of n values with the mapping of each
40                   observation to a regime. Assumed to be aligned with 'x'.
41    constant_regi: string
42                   Switcher controlling the constant term setup. It may take
43                   the following values:
44
45                   * 'one': a vector of ones is appended to x and held constant across regimes.
46
47                   * 'many': a vector of ones is appended to x and considered different per regime (default).
48    cols2regi    : list, 'all'
49                   Argument indicating whether each
50                   column of x should be considered as different per regime
51                   or held constant across regimes (False).
52                   If a list, k booleans indicating for each variable the
53                   option (True if one per regime, False to be held constant).
54                   If 'all' (default), all the variables vary by regime.
55    regime_err_sep: boolean
56                    If True, a separate regression is run for each regime.
57    robust       : string
58                   If 'white', then a White consistent estimator of the
59                   variance-covariance matrix is given.
60                   If 'hac', then a HAC consistent estimator of the
61                   variance-covariance matrix is given.
62                   If 'ogmm', then Optimal GMM is used to estimate
63                   betas and the variance-covariance matrix.
64                   Default set to None.
65    gwk          : pysal W object
66                   Kernel spatial weights needed for HAC estimation. Note:
67                   matrix must have ones along the main diagonal.
68    sig2n_k      : boolean
69                   If True, then use n-k to estimate sigma^2. If False, use n.
70    vm           : boolean
71                   If True, include variance-covariance matrix in summary
72    cores        : boolean
73                   Specifies if multiprocessing is to be used
74                   Default: no multiprocessing, cores = False
75                   Note: Multiprocessing may not work on all platforms.
76    name_y       : string
77                   Name of dependent variable for use in output
78    name_x       : list of strings
79                   Names of independent variables for use in output
80    name_yend    : list of strings
81                   Names of endogenous variables for use in output
82    name_q       : list of strings
83                   Names of instruments for use in output
84    name_regimes : string
85                   Name of regimes variable for use in output
86    name_w       : string
87                   Name of weights matrix for use in output
88    name_gwk     : string
89                   Name of kernel weights matrix for use in output
90    name_ds      : string
91                   Name of dataset for use in output
92
93    Attributes
94    ----------
95    betas        : array
96                   kx1 array of estimated coefficients
97    u            : array
98                   nx1 array of residuals
99    predy        : array
100                   nx1 array of predicted y values
101    n            : integer
102                   Number of observations
103    y            : array
104                   nx1 array for dependent variable
105    x            : array
106                   Two dimensional array with n rows and one column for each
107                   independent (exogenous) variable, including the constant
108                   Only available in dictionary 'multi' when multiple regressions
109                   (see 'multi' below for details)
110    yend         : array
111                   Two dimensional array with n rows and one column for each
112                   endogenous variable
113                   Only available in dictionary 'multi' when multiple regressions
114                   (see 'multi' below for details)
115    q            : array
116                   Two dimensional array with n rows and one column for each
117                   external exogenous variable used as instruments
118                   Only available in dictionary 'multi' when multiple regressions
119                   (see 'multi' below for details)
120    vm           : array
121                   Variance covariance matrix (kxk)
122    regimes      : list
123                   List of n values with the mapping of each
124                   observation to a regime. Assumed to be aligned with 'x'.
125    constant_regi: [False, 'one', 'many']
126                   Ignored if regimes=False. Constant option for regimes.
127                   Switcher controlling the constant term setup. It may take
128                   the following values:
129
130                   * 'one': a vector of ones is appended to x and held constant across regimes.
131
132                   * 'many': a vector of ones is appended to x and considered different per regime (default).
133    cols2regi    : list, 'all'
134                   Ignored if regimes=False. Argument indicating whether each
135                   column of x should be considered as different per regime
136                   or held constant across regimes (False).
137                   If a list, k booleans indicating for each variable the
138                   option (True if one per regime, False to be held constant).
139                   If 'all', all the variables vary by regime.
140    regime_err_sep: boolean
141                    If True, a separate regression is run for each regime.
142    kr           : int
143                   Number of variables/columns to be "regimized" or subject
144                   to change by regime. These will result in one parameter
145                   estimate by regime for each variable (i.e. nr parameters per
146                   variable)
147    kf           : int
148                   Number of variables/columns to be considered fixed or
149                   global across regimes and hence only obtain one parameter
150                   estimate
151    nr           : int
152                   Number of different regimes in the 'regimes' list
153    name_y       : string
154                   Name of dependent variable for use in output
155    name_x       : list of strings
156                   Names of independent variables for use in output
157    name_yend    : list of strings
158                   Names of endogenous variables for use in output
159    name_q       : list of strings
160                   Names of instruments for use in output
161    name_regimes : string
162                   Name of regimes variable for use in output
163    name_w       : string
164                   Name of weights matrix for use in output
165    name_gwk     : string
166                   Name of kernel weights matrix for use in output
167    name_ds      : string
168                   Name of dataset for use in output
169    multi        : dictionary
170                   Only available when multiple regressions are estimated,
171                   i.e. when regime_err_sep=True and no variable is fixed
172                   across regimes.
173                   Contains all attributes of each individual regression
174
175    Examples
176    --------
177
178    We first need to import the needed modules, namely numpy to convert the
179    data we read into arrays that ``spreg`` understands and ``pysal`` to
180    perform all the analysis.
181
182    >>> import numpy as np
183    >>> import libpysal
184    >>> from libpysal.examples import load_example
185    >>> from libpysal.weights import Rook
186
187    Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open().
188    This is the DBF associated with the NAT shapefile.  Note that
189    libpysal.io.open() also reads data in CSV format; since the actual class
190    requires data to be passed in as numpy arrays, the user can read their
191    data in using any method.
192
193    >>> nat = load_example('Natregimes')
194    >>> db = libpysal.io.open(nat.get_path('natregimes.dbf'), 'r')
195
196    Extract the HR90 column (homicide rates in 1990) from the DBF file and make it the
197    dependent variable for the regression. Note that PySAL requires this to be
198    an numpy array of shape (n, 1) as opposed to the also common shape of (n, )
199    that other packages accept.
200
201    >>> y_var = 'HR90'
202    >>> y = np.array([db.by_col(y_var)]).reshape(3085,1)
203
204    Extract UE90 (unemployment rate) and PS90 (population structure) vectors from
205    the DBF to be used as independent variables in the regression. Other variables
206    can be inserted by adding their names to x_var, such as x_var = ['Var1','Var2','...]
207    Note that PySAL requires this to be an nxj numpy array, where j is the
208    number of independent variables (not including a constant). By default
209    this model adds a vector of ones to the independent variables passed in.
210
211    >>> x_var = ['PS90','UE90']
212    >>> x = np.array([db.by_col(name) for name in x_var]).T
213
214    In this case we consider RD90 (resource deprivation) as an endogenous regressor.
215    We tell the model that this is so by passing it in a different parameter
216    from the exogenous variables (x).
217
218    >>> yd_var = ['RD90']
219    >>> yd = np.array([db.by_col(name) for name in yd_var]).T
220
221    Because we have endogenous variables, to obtain a correct estimate of the
222    model, we need to instrument for RD90. We use FP89 (families below poverty)
223    for this and hence put it in the instruments parameter, 'q'.
224
225    >>> q_var = ['FP89']
226    >>> q = np.array([db.by_col(name) for name in q_var]).T
227
228    The different regimes in this data are given according to the North and
229    South dummy (SOUTH).
230
231    >>> r_var = 'SOUTH'
232    >>> regimes = db.by_col(r_var)
233
234    Since we want to perform tests for spatial dependence, we need to specify
235    the spatial weights matrix that includes the spatial configuration of the
236    observations into the error component of the model. To do that, we can open
237    an already existing gal file or create a new one. In this case, we will
238    create one from ``NAT.shp``.
239
240    >>> w = Rook.from_shapefile(nat.get_path("natregimes.shp"))
241
242    Unless there is a good reason not to do it, the weights have to be
243    row-standardized so every row of the matrix sums to one. Among other
244    things, this allows to interpret the spatial lag of a variable as the
245    average value of the neighboring observations. In PySAL, this can be
246    easily performed in the following way:
247
248    >>> w.transform = 'r'
249
250    We can now run the regression and then have a summary of the output
251    by typing: model.summary
252    Alternatively, we can just check the betas and standard errors of the
253    parameters:
254
255    >>> from spreg import TSLS_Regimes
256    >>> tslsr = TSLS_Regimes(y, x, yd, q, regimes, w=w, constant_regi='many', spat_diag=False, name_y=y_var, name_x=x_var, name_yend=yd_var, name_q=q_var, name_regimes=r_var, name_ds='NAT', name_w='NAT.shp')
257
258    >>> tslsr.betas
259    array([[ 3.66973562],
260           [ 1.06950466],
261           [ 0.14680946],
262           [ 2.45864196],
263           [ 9.55873243],
264           [ 1.94666348],
265           [-0.30810214],
266           [ 3.68718119]])
267
268    >>> np.sqrt(tslsr.vm.diagonal())
269    array([0.38389901, 0.09963973, 0.04672091, 0.22725012, 0.49181223,
270           0.19630774, 0.07784587, 0.25529011])
271
272    """
273
274    def __init__(self, y, x, yend, q, regimes,
275                 w=None, robust=None, gwk=None, sig2n_k=True,
276                 spat_diag=False, vm=False, constant_regi='many',
277                 cols2regi='all', regime_err_sep=True, name_y=None, name_x=None,
278                 cores=False, name_yend=None, name_q=None, name_regimes=None,
279                 name_w=None, name_gwk=None, name_ds=None, summ=True):
280
281        n = USER.check_arrays(y, x)
282        y = USER.check_y(y, n)
283        USER.check_weights(w, y)
284        USER.check_robust(robust, gwk)
285        USER.check_spat_diag(spat_diag, w)
286        x_constant,name_x,warn = USER.check_constant(x,name_x,just_rem=True)
287        set_warn(self,warn)
288        name_x = USER.set_name_x(name_x, x_constant, constant=True)
289        self.constant_regi = constant_regi
290        self.cols2regi = cols2regi
291        self.name_ds = USER.set_name_ds(name_ds)
292        self.name_regimes = USER.set_name_ds(name_regimes)
293        self.name_w = USER.set_name_w(name_w, w)
294        self.name_gwk = USER.set_name_w(name_gwk, gwk)
295        self.name_y = USER.set_name_y(name_y)
296        name_yend = USER.set_name_yend(name_yend, yend)
297        name_q = USER.set_name_q(name_q, q)
298        self.name_x_r = USER.set_name_x(name_x, x_constant) + name_yend
299        self.n = n
300        cols2regi = REGI.check_cols2regi(
301            constant_regi, cols2regi, x_constant, yend=yend, add_cons=False)
302        self.regimes_set = REGI._get_regimes_set(regimes)
303        self.regimes = regimes
304        USER.check_regimes(self.regimes_set, self.n, x_constant.shape[1])
305        if regime_err_sep == True and robust == 'hac':
306            set_warn(
307                self, "Error by regimes is incompatible with HAC estimation for 2SLS models. Hence, the error by regimes has been disabled for this model.")
308            regime_err_sep = False
309        self.regime_err_sep = regime_err_sep
310        if regime_err_sep == True and set(cols2regi) == set([True]) and constant_regi == 'many':
311            self.y = y
312            regi_ids = dict(
313                (r, list(np.where(np.array(regimes) == r)[0])) for r in self.regimes_set)
314            self._tsls_regimes_multi(x_constant, yend, q, w, regi_ids, cores,
315                                     gwk, sig2n_k, robust, spat_diag, vm, name_x, name_yend, name_q)
316        else:
317            q, self.name_q = REGI.Regimes_Frame.__init__(self, q,
318                                                         regimes, constant_regi=None, cols2regi='all', names=name_q)
319            x, self.name_x = REGI.Regimes_Frame.__init__(self, x_constant,
320                                                         regimes, constant_regi, cols2regi=cols2regi, names=name_x)
321            yend, self.name_yend = REGI.Regimes_Frame.__init__(self, yend,
322                                                               regimes, constant_regi=None,
323                                                               cols2regi=cols2regi, yend=True, names=name_yend)
324            if regime_err_sep == True and robust == None:
325                robust = 'white'
326            BaseTSLS.__init__(self, y=y, x=x, yend=yend, q=q,
327                              robust=robust, gwk=gwk, sig2n_k=sig2n_k)
328            self.title = "TWO STAGE LEAST SQUARES - REGIMES"
329            if robust == 'ogmm':
330                _optimal_weight(self, sig2n_k)
331            self.name_z = self.name_x + self.name_yend
332            self.name_h = USER.set_name_h(self.name_x, self.name_q)
333            self.chow = REGI.Chow(self)
334            self.robust = USER.set_robust(robust)
335            if summ:
336                SUMMARY.TSLS(
337                    reg=self, vm=vm, w=w, spat_diag=spat_diag, regimes=True)
338
339    def _tsls_regimes_multi(self, x, yend, q, w, regi_ids, cores,
340                            gwk, sig2n_k, robust, spat_diag, vm, name_x, name_yend, name_q):
341        results_p = {}
342        """
343        for r in self.regimes_set:
344            if system() != 'Windows':
345                is_win = True
346                results_p[r] = _work(*(self.y,x,w,regi_ids,r,yend,q,robust,sig2n_k,self.name_ds,self.name_y,name_x,name_yend,name_q,self.name_w,self.name_regimes))
347            else:
348                pool = mp.Pool(cores)
349                results_p[r] = pool.apply_async(_work,args=(self.y,x,w,regi_ids,r,yend,q,robust,sig2n_k,self.name_ds,self.name_y,name_x,name_yend,name_q,self.name_w,self.name_regimes))
350                is_win = False
351        """
352        x_constant,name_x = REGI.check_const_regi(self,x,name_x,regi_ids)
353        self.name_x_r = name_x
354        for r in self.regimes_set:
355            if cores:
356                pool = mp.Pool(None)
357                results_p[r] = pool.apply_async(_work, args=(
358                    self.y, x_constant, w, regi_ids, r, yend, q, robust, sig2n_k, self.name_ds, self.name_y, name_x, name_yend, name_q, self.name_w, self.name_regimes))
359            else:
360                results_p[r] = _work(*(self.y, x_constant, w, regi_ids, r, yend, q, robust, sig2n_k,
361                                       self.name_ds, self.name_y, name_x, name_yend, name_q, self.name_w, self.name_regimes))
362
363        self.kryd = 0
364        self.kr = x_constant.shape[1] + yend.shape[1]
365        self.kf = 0
366        self.nr = len(self.regimes_set)
367        self.vm = np.zeros((self.nr * self.kr, self.nr * self.kr), float)
368        self.betas = np.zeros((self.nr * self.kr, 1), float)
369        self.u = np.zeros((self.n, 1), float)
370        self.predy = np.zeros((self.n, 1), float)
371        """
372        if not is_win:
373            pool.close()
374            pool.join()
375        """
376        if cores:
377            pool.close()
378            pool.join()
379
380        results = {}
381        self.name_y, self.name_x, self.name_yend, self.name_q, self.name_z, self.name_h = [
382        ], [], [], [], [], []
383        counter = 0
384        for r in self.regimes_set:
385            """
386            if is_win:
387                results[r] = results_p[r]
388            else:
389                results[r] = results_p[r].get()
390            """
391            if not cores:
392                results[r] = results_p[r]
393            else:
394                results[r] = results_p[r].get()
395
396            self.vm[(counter * self.kr):((counter + 1) * self.kr),
397                    (counter * self.kr):((counter + 1) * self.kr)] = results[r].vm
398            self.betas[
399                (counter * self.kr):((counter + 1) * self.kr), ] = results[r].betas
400            self.u[regi_ids[r], ] = results[r].u
401            self.predy[regi_ids[r], ] = results[r].predy
402            self.name_y += results[r].name_y
403            self.name_x += results[r].name_x
404            self.name_yend += results[r].name_yend
405            self.name_q += results[r].name_q
406            self.name_z += results[r].name_z
407            self.name_h += results[r].name_h
408            counter += 1
409        self.multi = results
410        self.hac_var = sphstack(x_constant[:,1:], q)
411        if robust == 'hac':
412            hac_multi(self, gwk)
413        if robust == 'ogmm':
414            set_warn(
415                self, "Residuals treated as homoskedastic for the purpose of diagnostics.")
416        self.chow = REGI.Chow(self)
417        if spat_diag:
418            self._get_spat_diag_props(results, regi_ids, x_constant, yend, q)
419        SUMMARY.TSLS_multi(
420            reg=self, multireg=self.multi, vm=vm, spat_diag=spat_diag, regimes=True, w=w)
421
422    def _get_spat_diag_props(self, results, regi_ids, x, yend, q):
423        self._cache = {}
424        x = REGI.regimeX_setup(
425            x, self.regimes, [True] * x.shape[1], self.regimes_set)
426        self.z = sphstack(x, REGI.regimeX_setup(
427            yend, self.regimes, [True] * yend.shape[1], self.regimes_set))
428        self.h = sphstack(
429            x, REGI.regimeX_setup(q, self.regimes, [True] * q.shape[1], self.regimes_set))
430        hthi = np.linalg.inv(spdot(self.h.T, self.h))
431        zth = spdot(self.z.T, self.h)
432        self.varb = np.linalg.inv(spdot(spdot(zth, hthi), zth.T))
433
434
435def _work(y, x, w, regi_ids, r, yend, q, robust, sig2n_k, name_ds, name_y, name_x, name_yend, name_q, name_w, name_regimes):
436    y_r = y[regi_ids[r]]
437    x_r = x[regi_ids[r]]
438    yend_r = yend[regi_ids[r]]
439    q_r = q[regi_ids[r]]
440    if robust == 'hac' or robust == 'ogmm':
441        robust2 = None
442    else:
443        robust2 = robust
444    model = BaseTSLS(
445        y_r, x_r, yend_r, q_r, robust=robust2, sig2n_k=sig2n_k)
446    model.title = "TWO STAGE LEAST SQUARES ESTIMATION - REGIME %s" % r
447    if robust == 'ogmm':
448        _optimal_weight(model, sig2n_k, warn=False)
449    model.robust = USER.set_robust(robust)
450    model.name_ds = name_ds
451    model.name_y = '%s_%s' % (str(r), name_y)
452    model.name_x = ['%s_%s' % (str(r), i) for i in name_x]
453    model.name_yend = ['%s_%s' % (str(r), i) for i in name_yend]
454    model.name_z = model.name_x + model.name_yend
455    model.name_q = ['%s_%s' % (str(r), i) for i in name_q]
456    model.name_h = model.name_x + model.name_q
457    model.name_w = name_w
458    model.name_regimes = name_regimes
459    if w:
460        w_r, warn = REGI.w_regime(w, regi_ids[r], r, transform=True)
461        set_warn(model, warn)
462        model.w = w_r
463    return model
464
465
466def _optimal_weight(reg, sig2n_k, warn=True):
467    try:
468        Hu = reg.h.toarray() * reg.u ** 2
469    except:
470        Hu = reg.h * reg.u ** 2
471    if sig2n_k:
472        S = spdot(reg.h.T, Hu, array_out=True) / (reg.n - reg.k)
473    else:
474        S = spdot(reg.h.T, Hu, array_out=True) / reg.n
475    Si = np.linalg.inv(S)
476    ZtH = spdot(reg.z.T, reg.h)
477    ZtHSi = spdot(ZtH, Si)
478    fac2 = np.linalg.inv(spdot(ZtHSi, ZtH.T, array_out=True))
479    fac3 = spdot(ZtHSi, spdot(reg.h.T, reg.y), array_out=True)
480    betas = np.dot(fac2, fac3)
481    if sig2n_k:
482        vm = fac2 * (reg.n - reg.k)
483    else:
484        vm = fac2 * reg.n
485    RegressionProps_basic(reg, betas=betas, vm=vm, sig2=False)
486    reg.title += " (Optimal-Weighted GMM)"
487    if warn:
488        set_warn(
489            reg, "Residuals treated as homoskedastic for the purpose of diagnostics.")
490    return
491
492
493def _test():
494    import doctest
495    start_suppress = np.get_printoptions()['suppress']
496    np.set_printoptions(suppress=True)
497    doctest.testmod()
498    np.set_printoptions(suppress=start_suppress)
499
500
501if __name__ == '__main__':
502    _test()
503    import numpy as np
504    import libpysal
505    from libpysal.examples import load_example
506
507    nat = load_example('Natregimes')
508    db = libpysal.io.open(nat.get_path('natregimes.dbf'), 'r')
509    y_var = 'HR60'
510    y = np.array([db.by_col(y_var)]).T
511    x_var = ['PS60', 'DV60', 'RD60']
512    x = np.array([db.by_col(name) for name in x_var]).T
513    yd_var = ['UE60']
514    yd = np.array([db.by_col(name) for name in yd_var]).T
515    q_var = ['FP59', 'MA60']
516    q = np.array([db.by_col(name) for name in q_var]).T
517    r_var = 'SOUTH'
518    regimes = db.by_col(r_var)
519    tslsr = TSLS_Regimes(y, x, yd, q, regimes, constant_regi='many', spat_diag=False, name_y=y_var, name_x=x_var,
520                         name_yend=yd_var, name_q=q_var, name_regimes=r_var, cols2regi=[
521                             False, True, True, True],
522                         sig2n_k=False)
523    print(tslsr.summary)
524