1""" 2Ordinary Least Squares regression with regimes. 3""" 4 5__author__ = "Luc Anselin luc.anselin@asu.edu, Pedro V. Amaral pedro.amaral@asu.edu, Daniel Arribas-Bel darribas@asu.edu" 6 7from . import regimes as REGI 8from . import user_output as USER 9from .ols import BaseOLS 10from .utils import set_warn, spbroadcast, RegressionProps_basic, RegressionPropsY, spdot 11from .robust import hac_multi 12from . import summary_output as SUMMARY 13import numpy as np 14import multiprocessing as mp 15from platform import system 16import scipy.sparse as SP 17import copy as COPY 18 19 20class OLS_Regimes(BaseOLS, REGI.Regimes_Frame, RegressionPropsY): 21 22 """ 23 Ordinary least squares with results and diagnostics. 24 25 Parameters 26 ---------- 27 y : array 28 nx1 array for dependent variable 29 x : array 30 Two dimensional array with n rows and one column for each 31 independent (exogenous) variable, excluding the constant 32 regimes : list 33 List of n values with the mapping of each 34 observation to a regime. Assumed to be aligned with 'x'. 35 w : pysal W object 36 Spatial weights object (required if running spatial 37 diagnostics) 38 robust : string 39 If 'white', then a White consistent estimator of the 40 variance-covariance matrix is given. If 'hac', then a 41 HAC consistent estimator of the variance-covariance 42 matrix is given. Default set to None. 43 gwk : pysal W object 44 Kernel spatial weights needed for HAC estimation. Note: 45 matrix must have ones along the main diagonal. 46 sig2n_k : boolean 47 If True, then use n-k to estimate sigma^2. If False, use n. 48 nonspat_diag : boolean 49 If True, then compute non-spatial diagnostics on 50 the regression. 51 spat_diag : boolean 52 If True, then compute Lagrange multiplier tests (requires 53 w). Note: see moran for further tests. 54 moran : boolean 55 If True, compute Moran's I on the residuals. Note: 56 requires spat_diag=True. 57 white_test : boolean 58 If True, compute White's specification robust test. 59 (requires nonspat_diag=True) 60 vm : boolean 61 If True, include variance-covariance matrix in summary 62 results 63 constant_regi: string, optional 64 Switcher controlling the constant term setup. It may take 65 the following values: 66 67 * 'one': a vector of ones is appended to x and held constant across regimes 68 69 * 'many': a vector of ones is appended to x and considered different per regime (default) 70 cols2regi : list, 'all' 71 Argument indicating whether each 72 column of x should be considered as different per regime 73 or held constant across regimes (False). 74 If a list, k booleans indicating for each variable the 75 option (True if one per regime, False to be held constant). 76 If 'all' (default), all the variables vary by regime. 77 regime_err_sep : boolean 78 If True, a separate regression is run for each regime. 79 cores : boolean 80 Specifies if multiprocessing is to be used 81 Default: no multiprocessing, cores = False 82 Note: Multiprocessing may not work on all platforms. 83 name_y : string 84 Name of dependent variable for use in output 85 name_x : list of strings 86 Names of independent variables for use in output 87 name_w : string 88 Name of weights matrix for use in output 89 name_gwk : string 90 Name of kernel weights matrix for use in output 91 name_ds : string 92 Name of dataset for use in output 93 name_regimes : string 94 Name of regime variable for use in the output 95 96 97 Attributes 98 ---------- 99 summary : string 100 Summary of regression results and diagnostics (note: use in 101 conjunction with the print command) 102 betas : array 103 kx1 array of estimated coefficients 104 u : array 105 nx1 array of residuals 106 predy : array 107 nx1 array of predicted y values 108 n : integer 109 Number of observations 110 k : integer 111 Number of variables for which coefficients are estimated 112 (including the constant) 113 Only available in dictionary 'multi' when multiple regressions 114 (see 'multi' below for details) 115 y : array 116 nx1 array for dependent variable 117 x : array 118 Two dimensional array with n rows and one column for each 119 independent (exogenous) variable, including the constant 120 Only available in dictionary 'multi' when multiple regressions 121 (see 'multi' below for details) 122 robust : string 123 Adjustment for robust standard errors 124 Only available in dictionary 'multi' when multiple regressions 125 (see 'multi' below for details) 126 mean_y : float 127 Mean of dependent variable 128 std_y : float 129 Standard deviation of dependent variable 130 vm : array 131 Variance covariance matrix (kxk) 132 r2 : float 133 R squared 134 Only available in dictionary 'multi' when multiple regressions 135 (see 'multi' below for details) 136 ar2 : float 137 Adjusted R squared 138 Only available in dictionary 'multi' when multiple regressions 139 (see 'multi' below for details) 140 utu : float 141 Sum of squared residuals 142 sig2 : float 143 Sigma squared used in computations 144 Only available in dictionary 'multi' when multiple regressions 145 (see 'multi' below for details) 146 sig2ML : float 147 Sigma squared (maximum likelihood) 148 Only available in dictionary 'multi' when multiple regressions 149 (see 'multi' below for details) 150 f_stat : tuple 151 Statistic (float), p-value (float) 152 Only available in dictionary 'multi' when multiple regressions 153 (see 'multi' below for details) 154 logll : float 155 Log likelihood 156 Only available in dictionary 'multi' when multiple regressions 157 (see 'multi' below for details) 158 aic : float 159 Akaike information criterion 160 Only available in dictionary 'multi' when multiple regressions 161 (see 'multi' below for details) 162 schwarz : float 163 Schwarz information criterion 164 Only available in dictionary 'multi' when multiple regressions 165 (see 'multi' below for details) 166 std_err : array 167 1xk array of standard errors of the betas 168 Only available in dictionary 'multi' when multiple regressions 169 (see 'multi' below for details) 170 t_stat : list of tuples 171 t statistic; each tuple contains the pair (statistic, 172 p-value), where each is a float 173 Only available in dictionary 'multi' when multiple regressions 174 (see 'multi' below for details) 175 mulColli : float 176 Multicollinearity condition number 177 Only available in dictionary 'multi' when multiple regressions 178 (see 'multi' below for details) 179 jarque_bera : dictionary 180 'jb': Jarque-Bera statistic (float); 'pvalue': p-value 181 (float); 'df': degrees of freedom (int) 182 Only available in dictionary 'multi' when multiple regressions 183 (see 'multi' below for details) 184 breusch_pagan : dictionary 185 'bp': Breusch-Pagan statistic (float); 'pvalue': p-value 186 (float); 'df': degrees of freedom (int) 187 Only available in dictionary 'multi' when multiple regressions 188 (see 'multi' below for details) 189 koenker_bassett: dictionary 190 'kb': Koenker-Bassett statistic (float); 'pvalue': p-value (float); 191 'df': degrees of freedom (int). Only available in dictionary 192 'multi' when multiple regressions (see 'multi' below for details). 193 white : dictionary 194 'wh': White statistic (float); 'pvalue': p-value (float); 195 'df': degrees of freedom (int). 196 Only available in dictionary 'multi' when multiple regressions 197 (see 'multi' below for details) 198 lm_error : tuple 199 Lagrange multiplier test for spatial error model; tuple 200 contains the pair (statistic, p-value), where each is a 201 float. Only available in dictionary 'multi' when multiple 202 regressions (see 'multi' below for details) 203 lm_lag : tuple 204 Lagrange multiplier test for spatial lag model; tuple 205 contains the pair (statistic, p-value), where each is a 206 float. Only available in dictionary 'multi' when multiple 207 regressions (see 'multi' below for details) 208 rlm_error : tuple 209 Robust lagrange multiplier test for spatial error model; 210 tuple contains the pair (statistic, p-value), where each 211 is a float. Only available in dictionary 'multi' when multiple 212 regressions (see 'multi' below for details) 213 rlm_lag : tuple 214 Robust lagrange multiplier test for spatial lag model; 215 tuple contains the pair (statistic, p-value), where each 216 is a float. Only available in dictionary 'multi' when 217 multiple regressions (see 'multi' below for details) 218 lm_sarma : tuple 219 Lagrange multiplier test for spatial SARMA model; tuple 220 contains the pair (statistic, p-value), where each is a 221 float. Only available in dictionary 'multi' when multiple 222 regressions (see 'multi' below for details) 223 moran_res : tuple 224 Moran's I for the residuals; tuple containing the triple 225 (Moran's I, standardized Moran's I, p-value) 226 name_y : string 227 Name of dependent variable for use in output 228 name_x : list of strings 229 Names of independent variables for use in output 230 name_w : string 231 Name of weights matrix for use in output 232 name_gwk : string 233 Name of kernel weights matrix for use in output 234 name_ds : string 235 Name of dataset for use in output 236 name_regimes : string 237 Name of regime variable for use in the output 238 title : string 239 Name of the regression method used. 240 Only available in dictionary 'multi' when multiple regressions 241 (see 'multi' below for details) 242 sig2n : float 243 Sigma squared (computed with n in the denominator) 244 sig2n_k : float 245 Sigma squared (computed with n-k in the denominator) 246 xtx : float 247 :math:`X'X`. Only available in dictionary 'multi' when multiple 248 regressions (see 'multi' below for details) 249 xtxi : float 250 :math:`(X'X)^{-1}`. Only available in dictionary 'multi' when multiple 251 regressions (see 'multi' below for details) 252 regimes : list 253 List of n values with the mapping of each observation to 254 a regime. Assumed to be aligned with 'x'. 255 constant_regi : string 256 Ignored if regimes=False. Constant option for regimes. 257 Switcher controlling the constant term setup. It may take 258 the following values: 259 260 * 'one': a vector of ones is appended to x and held constant across regimes. 261 262 * 'many': a vector of ones is appended to x and considered different per regime. 263 cols2regi : list 264 Ignored if regimes=False. Argument indicating whether each 265 column of x should be considered as different per regime 266 or held constant across regimes (False). 267 If a list, k booleans indicating for each variable the 268 option (True if one per regime, False to be held constant). 269 If 'all', all the variables vary by regime. 270 regime_err_sep: boolean 271 If True, a separate regression is run for each regime. 272 kr : int 273 Number of variables/columns to be "regimized" or subject 274 to change by regime. These will result in one parameter 275 estimate by regime for each variable (i.e. nr parameters per 276 variable) 277 kf : int 278 Number of variables/columns to be considered fixed or 279 global across regimes and hence only obtain one parameter 280 estimate. 281 nr : int 282 Number of different regimes in the 'regimes' list. 283 multi : dictionary 284 Only available when multiple regressions are estimated, 285 i.e. when regime_err_sep=True and no variable is fixed 286 across regimes. 287 Contains all attributes of each individual regression. 288 289 Examples 290 -------- 291 >>> import numpy as np 292 >>> import libpysal 293 >>> from spreg import OLS_Regimes 294 295 Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open(). 296 This is the DBF associated with the NAT shapefile. Note that 297 libpysal.io.open() also reads data in CSV format; since the actual class 298 requires data to be passed in as numpy arrays, the user can read their 299 data in using any method. 300 301 >>> db = libpysal.io.open(libpysal.examples.get_path("NAT.dbf"),'r') 302 303 Extract the HR90 column (homicide rates in 1990) from the DBF file and make it 304 the dependent variable for the regression. Note that PySAL requires this to be 305 an numpy array of shape (n, 1) as opposed to the also common shape of (n, ) 306 that other packages accept. 307 308 >>> y_var = 'HR90' 309 >>> y = db.by_col(y_var) 310 >>> y = np.array(y).reshape(len(y), 1) 311 312 Extract UE90 (unemployment rate) and PS90 (population structure) vectors from 313 the DBF to be used as independent variables in the regression. Other variables 314 can be inserted by adding their names to x_var, such as x_var = ['Var1','Var2','...] 315 Note that PySAL requires this to be an nxj numpy array, where j is the 316 number of independent variables (not including a constant). By default 317 this model adds a vector of ones to the independent variables passed in. 318 319 >>> x_var = ['PS90','UE90'] 320 >>> x = np.array([db.by_col(name) for name in x_var]).T 321 322 The different regimes in this data are given according to the North and 323 South dummy (SOUTH). 324 325 >>> r_var = 'SOUTH' 326 >>> regimes = db.by_col(r_var) 327 328 We can now run the regression and then have a summary of the output 329 by typing: olsr.summary 330 Alternatively, we can just check the betas and standard errors of the 331 parameters: 332 333 >>> olsr = OLS_Regimes(y, x, regimes, nonspat_diag=False, name_y=y_var, name_x=['PS90','UE90'], name_regimes=r_var, name_ds='NAT') 334 >>> olsr.betas 335 array([[0.39642899], 336 [0.65583299], 337 [0.48703937], 338 [5.59835 ], 339 [1.16210453], 340 [0.53163886]]) 341 >>> np.sqrt(olsr.vm.diagonal()) 342 array([0.24816345, 0.09662678, 0.03628629, 0.46894564, 0.21667395, 343 0.05945651]) 344 >>> olsr.cols2regi 345 'all' 346 """ 347 348 def __init__(self, y, x, regimes, 349 w=None, robust=None, gwk=None, sig2n_k=True, 350 nonspat_diag=True, spat_diag=False, moran=False, white_test=False, 351 vm=False, constant_regi='many', cols2regi='all', 352 regime_err_sep=True, cores=False, 353 name_y=None, name_x=None, name_regimes=None, 354 name_w=None, name_gwk=None, name_ds=None): 355 356 n = USER.check_arrays(y, x) 357 y = USER.check_y(y, n) 358 USER.check_weights(w, y) 359 USER.check_robust(robust, gwk) 360 USER.check_spat_diag(spat_diag, w) 361 x_constant,name_x,warn = USER.check_constant(x,name_x,just_rem=True) 362 set_warn(self,warn) 363 name_x = USER.set_name_x(name_x, x_constant, constant=True) 364 self.name_x_r = USER.set_name_x(name_x, x_constant) 365 self.constant_regi = constant_regi 366 self.cols2regi = cols2regi 367 self.name_w = USER.set_name_w(name_w, w) 368 self.name_gwk = USER.set_name_w(name_gwk, gwk) 369 self.name_ds = USER.set_name_ds(name_ds) 370 self.name_y = USER.set_name_y(name_y) 371 self.name_regimes = USER.set_name_ds(name_regimes) 372 self.n = n 373 cols2regi = REGI.check_cols2regi( 374 constant_regi, cols2regi, x_constant, add_cons=False) 375 self.regimes_set = REGI._get_regimes_set(regimes) 376 self.regimes = regimes 377 USER.check_regimes(self.regimes_set, self.n, x_constant.shape[1]) 378 if regime_err_sep == True and robust == 'hac': 379 set_warn( 380 self, "Error by regimes is incompatible with HAC estimation. Hence, error by regimes has been disabled for this model.") 381 regime_err_sep = False 382 self.regime_err_sep = regime_err_sep 383 if regime_err_sep == True and set(cols2regi) == set([True]) and constant_regi == 'many': 384 self.y = y 385 regi_ids = dict( 386 (r, list(np.where(np.array(regimes) == r)[0])) for r in self.regimes_set) 387 self._ols_regimes_multi(x_constant, w, regi_ids, cores, 388 gwk, sig2n_k, robust, nonspat_diag, spat_diag, vm, name_x, moran, white_test) 389 else: 390 x, self.name_x = REGI.Regimes_Frame.__init__(self, x_constant, 391 regimes, constant_regi, cols2regi, name_x) 392 BaseOLS.__init__( 393 self, y=y, x=x, robust=robust, gwk=gwk, sig2n_k=sig2n_k) 394 if regime_err_sep == True and robust == None: 395 y2, x2 = REGI._get_weighted_var( 396 regimes, self.regimes_set, sig2n_k, self.u, y, x) 397 ols2 = BaseOLS(y=y2, x=x2, sig2n_k=sig2n_k) 398 RegressionProps_basic(self, betas=ols2.betas, vm=ols2.vm) 399 self.title = "ORDINARY LEAST SQUARES - REGIMES (Group-wise heteroskedasticity)" 400 nonspat_diag = None 401 set_warn( 402 self, "Residuals treated as homoskedastic for the purpose of diagnostics.") 403 else: 404 self.title = "ORDINARY LEAST SQUARES - REGIMES" 405 self.robust = USER.set_robust(robust) 406 self.chow = REGI.Chow(self) 407 SUMMARY.OLS(reg=self, vm=vm, w=w, nonspat_diag=nonspat_diag, 408 spat_diag=spat_diag, moran=moran, white_test=white_test, regimes=True) 409 410 def _ols_regimes_multi(self, x, w, regi_ids, cores, 411 gwk, sig2n_k, robust, nonspat_diag, spat_diag, vm, name_x, moran, white_test): 412 results_p = {} 413 """ 414 for r in self.regimes_set: 415 if system() == 'Windows': 416 is_win = True 417 results_p[r] = _work(*(self.y,x,w,regi_ids,r,robust,sig2n_k,self.name_ds,self.name_y,name_x,self.name_w,self.name_regimes)) 418 else: 419 pool = mp.Pool(cores) 420 results_p[r] = pool.apply_async(_work,args=(self.y,x,w,regi_ids,r,robust,sig2n_k,self.name_ds,self.name_y,name_x,self.name_w,self.name_regimes)) 421 is_win = False 422 """ 423 x_constant,name_x = REGI.check_const_regi(self,x,name_x,regi_ids) 424 self.name_x_r = name_x 425 for r in self.regimes_set: 426 if cores: 427 pool = mp.Pool(None) 428 results_p[r] = pool.apply_async(_work, args=( 429 self.y, x_constant, w, regi_ids, r, robust, sig2n_k, self.name_ds, self.name_y, name_x, self.name_w, self.name_regimes)) 430 else: 431 results_p[r] = _work(*(self.y, x_constant, w, regi_ids, r, robust, sig2n_k, 432 self.name_ds, self.name_y, name_x, self.name_w, self.name_regimes)) 433 self.kryd = 0 434 self.kr = x_constant.shape[1] 435 self.kf = 0 436 self.nr = len(self.regimes_set) 437 self.vm = np.zeros((self.nr * self.kr, self.nr * self.kr), float) 438 self.betas = np.zeros((self.nr * self.kr, 1), float) 439 self.u = np.zeros((self.n, 1), float) 440 self.predy = np.zeros((self.n, 1), float) 441 """ 442 if not is_win: 443 pool.close() 444 pool.join() 445 """ 446 if cores: 447 pool.close() 448 pool.join() 449 450 results = {} 451 self.name_y, self.name_x = [], [] 452 counter = 0 453 for r in self.regimes_set: 454 """ 455 if is_win: 456 results[r] = results_p[r] 457 else: 458 results[r] = results_p[r].get() 459 """ 460 if not cores: 461 results[r] = results_p[r] 462 else: 463 results[r] = results_p[r].get() 464 465 self.vm[(counter * self.kr):((counter + 1) * self.kr), 466 (counter * self.kr):((counter + 1) * self.kr)] = results[r].vm 467 self.betas[ 468 (counter * self.kr):((counter + 1) * self.kr), ] = results[r].betas 469 self.u[regi_ids[r], ] = results[r].u 470 self.predy[regi_ids[r], ] = results[r].predy 471 self.name_y += results[r].name_y 472 self.name_x += results[r].name_x 473 counter += 1 474 self.multi = results 475 self.hac_var = x_constant[:,1:] 476 if robust == 'hac': 477 hac_multi(self, gwk) 478 self.chow = REGI.Chow(self) 479 if spat_diag: 480 self._get_spat_diag_props(x_constant, sig2n_k) 481 SUMMARY.OLS_multi(reg=self, multireg=self.multi, vm=vm, nonspat_diag=nonspat_diag, 482 spat_diag=spat_diag, moran=moran, white_test=white_test, regimes=True, w=w) 483 484 def _get_spat_diag_props(self, x, sig2n_k): 485 self.k = self.kr 486 self._cache = {} 487 self.x = REGI.regimeX_setup( 488 x, self.regimes, [True] * x.shape[1], self.regimes_set) 489 self.xtx = spdot(self.x.T, self.x) 490 self.xtxi = np.linalg.inv(self.xtx) 491 492 493def _work(y, x, w, regi_ids, r, robust, sig2n_k, name_ds, name_y, name_x, name_w, name_regimes): 494 y_r = y[regi_ids[r]] 495 x_r = x[regi_ids[r]] 496 #x_constant,name_x,warn = USER.check_constant(x_r, name_x) 497 #name_x = USER.set_name_x(name_x, x_constant) 498 if robust == 'hac': 499 robust = None 500 model = BaseOLS(y_r, x_r, robust=robust, sig2n_k=sig2n_k) 501 model.title = "ORDINARY LEAST SQUARES ESTIMATION - REGIME %s" % r 502 model.robust = USER.set_robust(robust) 503 model.name_ds = name_ds 504 model.name_y = '%s_%s' % (str(r), name_y) 505 model.name_x = ['%s_%s' % (str(r), i) for i in name_x] 506 model.name_w = name_w 507 model.name_regimes = name_regimes 508 if w: 509 w_r, warn = REGI.w_regime(w, regi_ids[r], r, transform=True) 510 set_warn(model, warn) 511 model.w = w_r 512 return model 513 514 515def _test(): 516 import doctest 517 start_suppress = np.get_printoptions()['suppress'] 518 np.set_printoptions(suppress=True) 519 doctest.testmod() 520 np.set_printoptions(suppress=start_suppress) 521 522if __name__ == '__main__': 523 _test() 524 import numpy as np 525 import libpysal 526 db = libpysal.io.open(libpysal.examples.get_path("NAT.dbf"),'r') 527 y_var = 'CRIME' 528 y = np.array([db.by_col(y_var)]).reshape(49, 1) 529 x_var = ['INC', 'HOVAL'] 530 x = np.array([db.by_col(name) for name in x_var]).T 531 r_var = 'NSA' 532 regimes = db.by_col(r_var) 533 w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp")) 534 w.transform = 'r' 535 olsr = OLS_Regimes(y, x, regimes, w=w, constant_regi='many', nonspat_diag=False, spat_diag=False, name_y=y_var, name_x=['INC', 'HOVAL'], 536 name_ds='columbus', name_regimes=r_var, name_w='columbus.gal', regime_err_sep=True, cols2regi=[True, True], sig2n_k=True, robust='white') 537 print(olsr.summary) 538