1import numpy as np 2from . import regimes as REGI 3from . import user_output as USER 4import multiprocessing as mp 5import scipy.sparse as SP 6from .utils import sphstack, set_warn, RegressionProps_basic, spdot, sphstack 7from .twosls import BaseTSLS 8from .robust import hac_multi 9from . import summary_output as SUMMARY 10from platform import system 11 12""" 13Two-stage Least Squares estimation with regimes. 14""" 15 16__author__ = "Luc Anselin luc.anselin@asu.edu, Pedro V. Amaral pedro.amaral@asu.edu, David C. Folch david.folch@asu.edu" 17 18 19class TSLS_Regimes(BaseTSLS, REGI.Regimes_Frame): 20 21 """ 22 Two stage least squares (2SLS) with regimes. 23 24 Parameters 25 ---------- 26 y : array 27 nx1 array for dependent variable 28 x : array 29 Two dimensional array with n rows and one column for each 30 independent (exogenous) variable, excluding the constant 31 yend : array 32 Two dimensional array with n rows and one column for each 33 endogenous variable 34 q : array 35 Two dimensional array with n rows and one column for each 36 external exogenous variable to use as instruments (note: 37 this should not contain any variables from x) 38 regimes : list 39 List of n values with the mapping of each 40 observation to a regime. Assumed to be aligned with 'x'. 41 constant_regi: string 42 Switcher controlling the constant term setup. It may take 43 the following values: 44 45 * 'one': a vector of ones is appended to x and held constant across regimes. 46 47 * 'many': a vector of ones is appended to x and considered different per regime (default). 48 cols2regi : list, 'all' 49 Argument indicating whether each 50 column of x should be considered as different per regime 51 or held constant across regimes (False). 52 If a list, k booleans indicating for each variable the 53 option (True if one per regime, False to be held constant). 54 If 'all' (default), all the variables vary by regime. 55 regime_err_sep: boolean 56 If True, a separate regression is run for each regime. 57 robust : string 58 If 'white', then a White consistent estimator of the 59 variance-covariance matrix is given. 60 If 'hac', then a HAC consistent estimator of the 61 variance-covariance matrix is given. 62 If 'ogmm', then Optimal GMM is used to estimate 63 betas and the variance-covariance matrix. 64 Default set to None. 65 gwk : pysal W object 66 Kernel spatial weights needed for HAC estimation. Note: 67 matrix must have ones along the main diagonal. 68 sig2n_k : boolean 69 If True, then use n-k to estimate sigma^2. If False, use n. 70 vm : boolean 71 If True, include variance-covariance matrix in summary 72 cores : boolean 73 Specifies if multiprocessing is to be used 74 Default: no multiprocessing, cores = False 75 Note: Multiprocessing may not work on all platforms. 76 name_y : string 77 Name of dependent variable for use in output 78 name_x : list of strings 79 Names of independent variables for use in output 80 name_yend : list of strings 81 Names of endogenous variables for use in output 82 name_q : list of strings 83 Names of instruments for use in output 84 name_regimes : string 85 Name of regimes variable for use in output 86 name_w : string 87 Name of weights matrix for use in output 88 name_gwk : string 89 Name of kernel weights matrix for use in output 90 name_ds : string 91 Name of dataset for use in output 92 93 Attributes 94 ---------- 95 betas : array 96 kx1 array of estimated coefficients 97 u : array 98 nx1 array of residuals 99 predy : array 100 nx1 array of predicted y values 101 n : integer 102 Number of observations 103 y : array 104 nx1 array for dependent variable 105 x : array 106 Two dimensional array with n rows and one column for each 107 independent (exogenous) variable, including the constant 108 Only available in dictionary 'multi' when multiple regressions 109 (see 'multi' below for details) 110 yend : array 111 Two dimensional array with n rows and one column for each 112 endogenous variable 113 Only available in dictionary 'multi' when multiple regressions 114 (see 'multi' below for details) 115 q : array 116 Two dimensional array with n rows and one column for each 117 external exogenous variable used as instruments 118 Only available in dictionary 'multi' when multiple regressions 119 (see 'multi' below for details) 120 vm : array 121 Variance covariance matrix (kxk) 122 regimes : list 123 List of n values with the mapping of each 124 observation to a regime. Assumed to be aligned with 'x'. 125 constant_regi: [False, 'one', 'many'] 126 Ignored if regimes=False. Constant option for regimes. 127 Switcher controlling the constant term setup. It may take 128 the following values: 129 130 * 'one': a vector of ones is appended to x and held constant across regimes. 131 132 * 'many': a vector of ones is appended to x and considered different per regime (default). 133 cols2regi : list, 'all' 134 Ignored if regimes=False. Argument indicating whether each 135 column of x should be considered as different per regime 136 or held constant across regimes (False). 137 If a list, k booleans indicating for each variable the 138 option (True if one per regime, False to be held constant). 139 If 'all', all the variables vary by regime. 140 regime_err_sep: boolean 141 If True, a separate regression is run for each regime. 142 kr : int 143 Number of variables/columns to be "regimized" or subject 144 to change by regime. These will result in one parameter 145 estimate by regime for each variable (i.e. nr parameters per 146 variable) 147 kf : int 148 Number of variables/columns to be considered fixed or 149 global across regimes and hence only obtain one parameter 150 estimate 151 nr : int 152 Number of different regimes in the 'regimes' list 153 name_y : string 154 Name of dependent variable for use in output 155 name_x : list of strings 156 Names of independent variables for use in output 157 name_yend : list of strings 158 Names of endogenous variables for use in output 159 name_q : list of strings 160 Names of instruments for use in output 161 name_regimes : string 162 Name of regimes variable for use in output 163 name_w : string 164 Name of weights matrix for use in output 165 name_gwk : string 166 Name of kernel weights matrix for use in output 167 name_ds : string 168 Name of dataset for use in output 169 multi : dictionary 170 Only available when multiple regressions are estimated, 171 i.e. when regime_err_sep=True and no variable is fixed 172 across regimes. 173 Contains all attributes of each individual regression 174 175 Examples 176 -------- 177 178 We first need to import the needed modules, namely numpy to convert the 179 data we read into arrays that ``spreg`` understands and ``pysal`` to 180 perform all the analysis. 181 182 >>> import numpy as np 183 >>> import libpysal 184 >>> from libpysal.examples import load_example 185 >>> from libpysal.weights import Rook 186 187 Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open(). 188 This is the DBF associated with the NAT shapefile. Note that 189 libpysal.io.open() also reads data in CSV format; since the actual class 190 requires data to be passed in as numpy arrays, the user can read their 191 data in using any method. 192 193 >>> nat = load_example('Natregimes') 194 >>> db = libpysal.io.open(nat.get_path('natregimes.dbf'), 'r') 195 196 Extract the HR90 column (homicide rates in 1990) from the DBF file and make it the 197 dependent variable for the regression. Note that PySAL requires this to be 198 an numpy array of shape (n, 1) as opposed to the also common shape of (n, ) 199 that other packages accept. 200 201 >>> y_var = 'HR90' 202 >>> y = np.array([db.by_col(y_var)]).reshape(3085,1) 203 204 Extract UE90 (unemployment rate) and PS90 (population structure) vectors from 205 the DBF to be used as independent variables in the regression. Other variables 206 can be inserted by adding their names to x_var, such as x_var = ['Var1','Var2','...] 207 Note that PySAL requires this to be an nxj numpy array, where j is the 208 number of independent variables (not including a constant). By default 209 this model adds a vector of ones to the independent variables passed in. 210 211 >>> x_var = ['PS90','UE90'] 212 >>> x = np.array([db.by_col(name) for name in x_var]).T 213 214 In this case we consider RD90 (resource deprivation) as an endogenous regressor. 215 We tell the model that this is so by passing it in a different parameter 216 from the exogenous variables (x). 217 218 >>> yd_var = ['RD90'] 219 >>> yd = np.array([db.by_col(name) for name in yd_var]).T 220 221 Because we have endogenous variables, to obtain a correct estimate of the 222 model, we need to instrument for RD90. We use FP89 (families below poverty) 223 for this and hence put it in the instruments parameter, 'q'. 224 225 >>> q_var = ['FP89'] 226 >>> q = np.array([db.by_col(name) for name in q_var]).T 227 228 The different regimes in this data are given according to the North and 229 South dummy (SOUTH). 230 231 >>> r_var = 'SOUTH' 232 >>> regimes = db.by_col(r_var) 233 234 Since we want to perform tests for spatial dependence, we need to specify 235 the spatial weights matrix that includes the spatial configuration of the 236 observations into the error component of the model. To do that, we can open 237 an already existing gal file or create a new one. In this case, we will 238 create one from ``NAT.shp``. 239 240 >>> w = Rook.from_shapefile(nat.get_path("natregimes.shp")) 241 242 Unless there is a good reason not to do it, the weights have to be 243 row-standardized so every row of the matrix sums to one. Among other 244 things, this allows to interpret the spatial lag of a variable as the 245 average value of the neighboring observations. In PySAL, this can be 246 easily performed in the following way: 247 248 >>> w.transform = 'r' 249 250 We can now run the regression and then have a summary of the output 251 by typing: model.summary 252 Alternatively, we can just check the betas and standard errors of the 253 parameters: 254 255 >>> from spreg import TSLS_Regimes 256 >>> tslsr = TSLS_Regimes(y, x, yd, q, regimes, w=w, constant_regi='many', spat_diag=False, name_y=y_var, name_x=x_var, name_yend=yd_var, name_q=q_var, name_regimes=r_var, name_ds='NAT', name_w='NAT.shp') 257 258 >>> tslsr.betas 259 array([[ 3.66973562], 260 [ 1.06950466], 261 [ 0.14680946], 262 [ 2.45864196], 263 [ 9.55873243], 264 [ 1.94666348], 265 [-0.30810214], 266 [ 3.68718119]]) 267 268 >>> np.sqrt(tslsr.vm.diagonal()) 269 array([0.38389901, 0.09963973, 0.04672091, 0.22725012, 0.49181223, 270 0.19630774, 0.07784587, 0.25529011]) 271 272 """ 273 274 def __init__(self, y, x, yend, q, regimes, 275 w=None, robust=None, gwk=None, sig2n_k=True, 276 spat_diag=False, vm=False, constant_regi='many', 277 cols2regi='all', regime_err_sep=True, name_y=None, name_x=None, 278 cores=False, name_yend=None, name_q=None, name_regimes=None, 279 name_w=None, name_gwk=None, name_ds=None, summ=True): 280 281 n = USER.check_arrays(y, x) 282 y = USER.check_y(y, n) 283 USER.check_weights(w, y) 284 USER.check_robust(robust, gwk) 285 USER.check_spat_diag(spat_diag, w) 286 x_constant,name_x,warn = USER.check_constant(x,name_x,just_rem=True) 287 set_warn(self,warn) 288 name_x = USER.set_name_x(name_x, x_constant, constant=True) 289 self.constant_regi = constant_regi 290 self.cols2regi = cols2regi 291 self.name_ds = USER.set_name_ds(name_ds) 292 self.name_regimes = USER.set_name_ds(name_regimes) 293 self.name_w = USER.set_name_w(name_w, w) 294 self.name_gwk = USER.set_name_w(name_gwk, gwk) 295 self.name_y = USER.set_name_y(name_y) 296 name_yend = USER.set_name_yend(name_yend, yend) 297 name_q = USER.set_name_q(name_q, q) 298 self.name_x_r = USER.set_name_x(name_x, x_constant) + name_yend 299 self.n = n 300 cols2regi = REGI.check_cols2regi( 301 constant_regi, cols2regi, x_constant, yend=yend, add_cons=False) 302 self.regimes_set = REGI._get_regimes_set(regimes) 303 self.regimes = regimes 304 USER.check_regimes(self.regimes_set, self.n, x_constant.shape[1]) 305 if regime_err_sep == True and robust == 'hac': 306 set_warn( 307 self, "Error by regimes is incompatible with HAC estimation for 2SLS models. Hence, the error by regimes has been disabled for this model.") 308 regime_err_sep = False 309 self.regime_err_sep = regime_err_sep 310 if regime_err_sep == True and set(cols2regi) == set([True]) and constant_regi == 'many': 311 self.y = y 312 regi_ids = dict( 313 (r, list(np.where(np.array(regimes) == r)[0])) for r in self.regimes_set) 314 self._tsls_regimes_multi(x_constant, yend, q, w, regi_ids, cores, 315 gwk, sig2n_k, robust, spat_diag, vm, name_x, name_yend, name_q) 316 else: 317 q, self.name_q = REGI.Regimes_Frame.__init__(self, q, 318 regimes, constant_regi=None, cols2regi='all', names=name_q) 319 x, self.name_x = REGI.Regimes_Frame.__init__(self, x_constant, 320 regimes, constant_regi, cols2regi=cols2regi, names=name_x) 321 yend, self.name_yend = REGI.Regimes_Frame.__init__(self, yend, 322 regimes, constant_regi=None, 323 cols2regi=cols2regi, yend=True, names=name_yend) 324 if regime_err_sep == True and robust == None: 325 robust = 'white' 326 BaseTSLS.__init__(self, y=y, x=x, yend=yend, q=q, 327 robust=robust, gwk=gwk, sig2n_k=sig2n_k) 328 self.title = "TWO STAGE LEAST SQUARES - REGIMES" 329 if robust == 'ogmm': 330 _optimal_weight(self, sig2n_k) 331 self.name_z = self.name_x + self.name_yend 332 self.name_h = USER.set_name_h(self.name_x, self.name_q) 333 self.chow = REGI.Chow(self) 334 self.robust = USER.set_robust(robust) 335 if summ: 336 SUMMARY.TSLS( 337 reg=self, vm=vm, w=w, spat_diag=spat_diag, regimes=True) 338 339 def _tsls_regimes_multi(self, x, yend, q, w, regi_ids, cores, 340 gwk, sig2n_k, robust, spat_diag, vm, name_x, name_yend, name_q): 341 results_p = {} 342 """ 343 for r in self.regimes_set: 344 if system() != 'Windows': 345 is_win = True 346 results_p[r] = _work(*(self.y,x,w,regi_ids,r,yend,q,robust,sig2n_k,self.name_ds,self.name_y,name_x,name_yend,name_q,self.name_w,self.name_regimes)) 347 else: 348 pool = mp.Pool(cores) 349 results_p[r] = pool.apply_async(_work,args=(self.y,x,w,regi_ids,r,yend,q,robust,sig2n_k,self.name_ds,self.name_y,name_x,name_yend,name_q,self.name_w,self.name_regimes)) 350 is_win = False 351 """ 352 x_constant,name_x = REGI.check_const_regi(self,x,name_x,regi_ids) 353 self.name_x_r = name_x 354 for r in self.regimes_set: 355 if cores: 356 pool = mp.Pool(None) 357 results_p[r] = pool.apply_async(_work, args=( 358 self.y, x_constant, w, regi_ids, r, yend, q, robust, sig2n_k, self.name_ds, self.name_y, name_x, name_yend, name_q, self.name_w, self.name_regimes)) 359 else: 360 results_p[r] = _work(*(self.y, x_constant, w, regi_ids, r, yend, q, robust, sig2n_k, 361 self.name_ds, self.name_y, name_x, name_yend, name_q, self.name_w, self.name_regimes)) 362 363 self.kryd = 0 364 self.kr = x_constant.shape[1] + yend.shape[1] 365 self.kf = 0 366 self.nr = len(self.regimes_set) 367 self.vm = np.zeros((self.nr * self.kr, self.nr * self.kr), float) 368 self.betas = np.zeros((self.nr * self.kr, 1), float) 369 self.u = np.zeros((self.n, 1), float) 370 self.predy = np.zeros((self.n, 1), float) 371 """ 372 if not is_win: 373 pool.close() 374 pool.join() 375 """ 376 if cores: 377 pool.close() 378 pool.join() 379 380 results = {} 381 self.name_y, self.name_x, self.name_yend, self.name_q, self.name_z, self.name_h = [ 382 ], [], [], [], [], [] 383 counter = 0 384 for r in self.regimes_set: 385 """ 386 if is_win: 387 results[r] = results_p[r] 388 else: 389 results[r] = results_p[r].get() 390 """ 391 if not cores: 392 results[r] = results_p[r] 393 else: 394 results[r] = results_p[r].get() 395 396 self.vm[(counter * self.kr):((counter + 1) * self.kr), 397 (counter * self.kr):((counter + 1) * self.kr)] = results[r].vm 398 self.betas[ 399 (counter * self.kr):((counter + 1) * self.kr), ] = results[r].betas 400 self.u[regi_ids[r], ] = results[r].u 401 self.predy[regi_ids[r], ] = results[r].predy 402 self.name_y += results[r].name_y 403 self.name_x += results[r].name_x 404 self.name_yend += results[r].name_yend 405 self.name_q += results[r].name_q 406 self.name_z += results[r].name_z 407 self.name_h += results[r].name_h 408 counter += 1 409 self.multi = results 410 self.hac_var = sphstack(x_constant[:,1:], q) 411 if robust == 'hac': 412 hac_multi(self, gwk) 413 if robust == 'ogmm': 414 set_warn( 415 self, "Residuals treated as homoskedastic for the purpose of diagnostics.") 416 self.chow = REGI.Chow(self) 417 if spat_diag: 418 self._get_spat_diag_props(results, regi_ids, x_constant, yend, q) 419 SUMMARY.TSLS_multi( 420 reg=self, multireg=self.multi, vm=vm, spat_diag=spat_diag, regimes=True, w=w) 421 422 def _get_spat_diag_props(self, results, regi_ids, x, yend, q): 423 self._cache = {} 424 x = REGI.regimeX_setup( 425 x, self.regimes, [True] * x.shape[1], self.regimes_set) 426 self.z = sphstack(x, REGI.regimeX_setup( 427 yend, self.regimes, [True] * yend.shape[1], self.regimes_set)) 428 self.h = sphstack( 429 x, REGI.regimeX_setup(q, self.regimes, [True] * q.shape[1], self.regimes_set)) 430 hthi = np.linalg.inv(spdot(self.h.T, self.h)) 431 zth = spdot(self.z.T, self.h) 432 self.varb = np.linalg.inv(spdot(spdot(zth, hthi), zth.T)) 433 434 435def _work(y, x, w, regi_ids, r, yend, q, robust, sig2n_k, name_ds, name_y, name_x, name_yend, name_q, name_w, name_regimes): 436 y_r = y[regi_ids[r]] 437 x_r = x[regi_ids[r]] 438 yend_r = yend[regi_ids[r]] 439 q_r = q[regi_ids[r]] 440 if robust == 'hac' or robust == 'ogmm': 441 robust2 = None 442 else: 443 robust2 = robust 444 model = BaseTSLS( 445 y_r, x_r, yend_r, q_r, robust=robust2, sig2n_k=sig2n_k) 446 model.title = "TWO STAGE LEAST SQUARES ESTIMATION - REGIME %s" % r 447 if robust == 'ogmm': 448 _optimal_weight(model, sig2n_k, warn=False) 449 model.robust = USER.set_robust(robust) 450 model.name_ds = name_ds 451 model.name_y = '%s_%s' % (str(r), name_y) 452 model.name_x = ['%s_%s' % (str(r), i) for i in name_x] 453 model.name_yend = ['%s_%s' % (str(r), i) for i in name_yend] 454 model.name_z = model.name_x + model.name_yend 455 model.name_q = ['%s_%s' % (str(r), i) for i in name_q] 456 model.name_h = model.name_x + model.name_q 457 model.name_w = name_w 458 model.name_regimes = name_regimes 459 if w: 460 w_r, warn = REGI.w_regime(w, regi_ids[r], r, transform=True) 461 set_warn(model, warn) 462 model.w = w_r 463 return model 464 465 466def _optimal_weight(reg, sig2n_k, warn=True): 467 try: 468 Hu = reg.h.toarray() * reg.u ** 2 469 except: 470 Hu = reg.h * reg.u ** 2 471 if sig2n_k: 472 S = spdot(reg.h.T, Hu, array_out=True) / (reg.n - reg.k) 473 else: 474 S = spdot(reg.h.T, Hu, array_out=True) / reg.n 475 Si = np.linalg.inv(S) 476 ZtH = spdot(reg.z.T, reg.h) 477 ZtHSi = spdot(ZtH, Si) 478 fac2 = np.linalg.inv(spdot(ZtHSi, ZtH.T, array_out=True)) 479 fac3 = spdot(ZtHSi, spdot(reg.h.T, reg.y), array_out=True) 480 betas = np.dot(fac2, fac3) 481 if sig2n_k: 482 vm = fac2 * (reg.n - reg.k) 483 else: 484 vm = fac2 * reg.n 485 RegressionProps_basic(reg, betas=betas, vm=vm, sig2=False) 486 reg.title += " (Optimal-Weighted GMM)" 487 if warn: 488 set_warn( 489 reg, "Residuals treated as homoskedastic for the purpose of diagnostics.") 490 return 491 492 493def _test(): 494 import doctest 495 start_suppress = np.get_printoptions()['suppress'] 496 np.set_printoptions(suppress=True) 497 doctest.testmod() 498 np.set_printoptions(suppress=start_suppress) 499 500 501if __name__ == '__main__': 502 _test() 503 import numpy as np 504 import libpysal 505 from libpysal.examples import load_example 506 507 nat = load_example('Natregimes') 508 db = libpysal.io.open(nat.get_path('natregimes.dbf'), 'r') 509 y_var = 'HR60' 510 y = np.array([db.by_col(y_var)]).T 511 x_var = ['PS60', 'DV60', 'RD60'] 512 x = np.array([db.by_col(name) for name in x_var]).T 513 yd_var = ['UE60'] 514 yd = np.array([db.by_col(name) for name in yd_var]).T 515 q_var = ['FP59', 'MA60'] 516 q = np.array([db.by_col(name) for name in q_var]).T 517 r_var = 'SOUTH' 518 regimes = db.by_col(r_var) 519 tslsr = TSLS_Regimes(y, x, yd, q, regimes, constant_regi='many', spat_diag=False, name_y=y_var, name_x=x_var, 520 name_yend=yd_var, name_q=q_var, name_regimes=r_var, cols2regi=[ 521 False, True, True, True], 522 sig2n_k=False) 523 print(tslsr.summary) 524