1"""Probit regression class and diagnostics.""" 2 3__author__ = "Luc Anselin luc.anselin@asu.edu, Pedro V. Amaral pedro.amaral@asu.edu" 4 5import numpy as np 6import numpy.linalg as la 7import scipy.optimize as op 8from scipy.stats import norm, chi2 9chisqprob = chi2.sf 10import scipy.sparse as SP 11from . import user_output as USER 12from . import summary_output as SUMMARY 13from .utils import spdot, spbroadcast, set_warn 14 15__all__ = ["Probit"] 16 17 18class BaseProbit(object): 19 20 """ 21 Probit class to do all the computations 22 23 Parameters 24 ---------- 25 26 x : array 27 nxk array of independent variables (assumed to be aligned with y) 28 y : array 29 nx1 array of dependent binary variable 30 w : W 31 PySAL weights instance or spatial weights sparse matrix 32 aligned with y 33 optim : string 34 Optimization method. 35 Default: 'newton' (Newton-Raphson). 36 Alternatives: 'ncg' (Newton-CG), 'bfgs' (BFGS algorithm) 37 scalem : string 38 Method to calculate the scale of the marginal effects. 39 Default: 'phimean' (Mean of individual marginal effects) 40 Alternative: 'xmean' (Marginal effects at variables mean) 41 maxiter : int 42 Maximum number of iterations until optimizer stops 43 44 Attributes 45 ---------- 46 47 x : array 48 Two dimensional array with n rows and one column for each 49 independent (exogenous) variable, including the constant 50 y : array 51 nx1 array of dependent variable 52 betas : array 53 kx1 array with estimated coefficients 54 predy : array 55 nx1 array of predicted y values 56 n : int 57 Number of observations 58 k : int 59 Number of variables 60 vm : array 61 Variance-covariance matrix (kxk) 62 z_stat : list of tuples 63 z statistic; each tuple contains the pair (statistic, 64 p-value), where each is a float 65 xmean : array 66 Mean of the independent variables (kx1) 67 predpc : float 68 Percent of y correctly predicted 69 logl : float 70 Log-Likelihhod of the estimation 71 scalem : string 72 Method to calculate the scale of the marginal effects. 73 scale : float 74 Scale of the marginal effects. 75 slopes : array 76 Marginal effects of the independent variables (k-1x1) 77 Note: Disregards the presence of dummies. 78 slopes_vm : array 79 Variance-covariance matrix of the slopes (k-1xk-1) 80 LR : tuple 81 Likelihood Ratio test of all coefficients = 0 82 (test statistics, p-value) 83 Pinkse_error: float 84 Lagrange Multiplier test against spatial error correlation. 85 Implemented as presented in :cite:`Pinkse2004`. 86 KP_error : float 87 Moran's I type test against spatial error correlation. 88 Implemented as presented in :cite:`Kelejian2001`. 89 PS_error : float 90 Lagrange Multiplier test against spatial error correlation. 91 Implemented as presented in :cite:`Pinkse1998`. 92 warning : boolean 93 if True Maximum number of iterations exceeded or gradient 94 and/or function calls not changing. 95 96 Examples 97 -------- 98 >>> import numpy as np 99 >>> import libpysal 100 >>> import spreg 101 >>> np.set_printoptions(suppress=True) #prevent scientific format 102 >>> dbf = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r') 103 >>> y = np.array([dbf.by_col('CRIME')]).T 104 >>> x = np.array([dbf.by_col('INC'), dbf.by_col('HOVAL')]).T 105 >>> x = np.hstack((np.ones(y.shape),x)) 106 >>> w = libpysal.io.open(libpysal.examples.get_path("columbus.gal"), 'r').read() 107 >>> w.transform='r' 108 >>> model = spreg.probit.BaseProbit((y>40).astype(float), x, w=w) 109 >>> print(np.around(model.betas, decimals=6)) 110 [[ 3.353811] 111 [-0.199653] 112 [-0.029514]] 113 114 >>> print(np.around(model.vm, decimals=6)) 115 [[ 0.852814 -0.043627 -0.008052] 116 [-0.043627 0.004114 -0.000193] 117 [-0.008052 -0.000193 0.00031 ]] 118 119 >>> tests = np.array([['Pinkse_error','KP_error','PS_error']]) 120 >>> stats = np.array([[model.Pinkse_error[0],model.KP_error[0],model.PS_error[0]]]) 121 >>> pvalue = np.array([[model.Pinkse_error[1],model.KP_error[1],model.PS_error[1]]]) 122 >>> print(np.hstack((tests.T,np.around(np.hstack((stats.T,pvalue.T)),6)))) 123 [['Pinkse_error' '3.131719' '0.076783'] 124 ['KP_error' '1.721312' '0.085194'] 125 ['PS_error' '2.558166' '0.109726']] 126 """ 127 128 def __init__(self, y, x, w=None, optim='newton', scalem='phimean', maxiter=100): 129 self.y = y 130 self.x = x 131 self.n, self.k = x.shape 132 self.optim = optim 133 self.scalem = scalem 134 self.w = w 135 self.maxiter = maxiter 136 par_est, self.warning = self.par_est() 137 self.betas = np.reshape(par_est[0], (self.k, 1)) 138 self.logl = -float(par_est[1]) 139 140 @property 141 def vm(self): 142 try: 143 return self._cache['vm'] 144 except AttributeError: 145 self._cache = {} 146 H = self.hessian(self.betas) 147 self._cache['vm'] = -la.inv(H) 148 except KeyError: 149 H = self.hessian(self.betas) 150 self._cache['vm'] = -la.inv(H) 151 return self._cache['vm'] 152 153 @vm.setter 154 def vm(self, val): 155 try: 156 self._cache['vm'] = val 157 except AttributeError: 158 self._cache = {} 159 self._cache['vm'] = val 160 161 @property #could this get packaged into a separate function or something? It feels weird to duplicate this. 162 def z_stat(self): 163 try: 164 return self._cache['z_stat'] 165 except AttributeError: 166 self._cache = {} 167 variance = self.vm.diagonal() 168 zStat = self.betas.reshape(len(self.betas),) / np.sqrt(variance) 169 rs = {} 170 for i in range(len(self.betas)): 171 rs[i] = (zStat[i], norm.sf(abs(zStat[i])) * 2) 172 self._cache['z_stat'] = rs.values() 173 except KeyError: 174 variance = self.vm.diagonal() 175 zStat = self.betas.reshape(len(self.betas),) / np.sqrt(variance) 176 rs = {} 177 for i in range(len(self.betas)): 178 rs[i] = (zStat[i], norm.sf(abs(zStat[i])) * 2) 179 self._cache['z_stat'] = rs.values() 180 return self._cache['z_stat'] 181 182 @z_stat.setter 183 def z_stat(self, val): 184 try: 185 self._cache['z_stat'] = val 186 except AttributeError: 187 self._cache = {} 188 self._cache['z_stat'] = val 189 190 @property 191 def slopes_std_err(self): 192 try: 193 return self._cache['slopes_std_err'] 194 except AttributeError: 195 self._cache = {} 196 self._cache['slopes_std_err'] = np.sqrt(self.slopes_vm.diagonal()) 197 except KeyError: 198 self._cache['slopes_std_err'] = np.sqrt(self.slopes_vm.diagonal()) 199 return self._cache['slopes_std_err'] 200 201 @slopes_std_err.setter 202 def slopes_std_err(self, val): 203 try: 204 self._cache['slopes_std_err'] = val 205 except AttributeError: 206 self._cache = {} 207 self._cache['slopes_std_err'] = val 208 209 @property 210 def slopes_z_stat(self): 211 try: 212 return self._cache['slopes_z_stat'] 213 except AttributeError: 214 self._cache = {} 215 return self.slopes_z_stat 216 except KeyError: 217 zStat = self.slopes.reshape( 218 len(self.slopes),) / self.slopes_std_err 219 rs = {} 220 for i in range(len(self.slopes)): 221 rs[i] = (zStat[i], norm.sf(abs(zStat[i])) * 2) 222 self._cache['slopes_z_stat'] = list(rs.values()) 223 return self._cache['slopes_z_stat'] 224 225 @slopes_z_stat.setter 226 def slopes_z_stat(self, val): 227 try: 228 self._cache['slopes_z_stat'] = val 229 except AttributeError: 230 self._cache = {} 231 self._cache['slopes_z_stat'] = val 232 233 @property 234 def xmean(self): 235 try: 236 return self._cache['xmean'] 237 except AttributeError: 238 self._cache = {} 239 try: #why is this try-accept? can x be a list?? 240 self._cache['xmean'] = np.reshape(sum(self.x) / self.n, (self.k, 1)) 241 except: 242 self._cache['xmean'] = np.reshape(sum(self.x).toarray() / self.n, (self.k, 1)) 243 except KeyError: 244 try: 245 self._cache['xmean'] = np.reshape(sum(self.x) / self.n, (self.k, 1)) 246 except: 247 self._cache['xmean'] = np.reshape(sum(self.x).toarray() / self.n, (self.k, 1)) 248 return self._cache['xmean'] 249 250 @xmean.setter 251 def xmean(self, val): 252 try: 253 self._cache['xmean'] = val 254 except AttributeError: 255 self._cache = {} 256 self._cache['xmean'] = val 257 258 @property 259 def xb(self): 260 try: 261 return self._cache['xb'] 262 except AttributeError: 263 self._cache = {} 264 self._cache['xb'] = spdot(self.x, self.betas) 265 except KeyError: 266 self._cache['xb'] = spdot(self.x, self.betas) 267 return self._cache['xb'] 268 269 @xb.setter 270 def xb(self, val): 271 try: 272 self._cache['xb'] = val 273 except AttributeError: 274 self._cache = {} 275 self._cache['xb'] = val 276 277 @property 278 def predy(self): 279 try: 280 return self._cache['predy'] 281 except AttributeError: 282 self._cache = {} 283 self._cache['predy'] = norm.cdf(self.xb) 284 except KeyError: 285 self._cache['predy'] = norm.cdf(self.xb) 286 return self._cache['predy'] 287 288 @predy.setter 289 def predy(self, val): 290 try: 291 self._cache['predy'] = val 292 except AttributeError: 293 self._cache = {} 294 self._cache['predy'] = val 295 296 @property 297 def predpc(self): 298 try: 299 return self._cache['predpc'] 300 except AttributeError: 301 self._cache = {} 302 predpc = abs(self.y - self.predy) 303 for i in range(len(predpc)): 304 if predpc[i] > 0.5: 305 predpc[i] = 0 306 else: 307 predpc[i] = 1 308 self._cache['predpc'] = float(100.0 * np.sum(predpc) / self.n) 309 except KeyError: 310 predpc = abs(self.y - self.predy) 311 for i in range(len(predpc)): 312 if predpc[i] > 0.5: 313 predpc[i] = 0 314 else: 315 predpc[i] = 1 316 self._cache['predpc'] = float(100.0 * np.sum(predpc) / self.n) 317 return self._cache['predpc'] 318 319 @predpc.setter 320 def predpc(self, val): 321 try: 322 self._cache['predpc'] = val 323 except AttributeError: 324 self._cache = {} 325 self._cache['predpc'] = val 326 327 @property 328 def phiy(self): 329 try: 330 return self._cache['phiy'] 331 except AttributeError: 332 self._cache = {} 333 self._cache['phiy'] = norm.pdf(self.xb) 334 except KeyError: 335 self._cache['phiy'] = norm.pdf(self.xb) 336 return self._cache['phiy'] 337 338 @phiy.setter 339 def phiy(self, val): 340 try: 341 self._cache['phiy'] = val 342 except AttributeError: 343 self._cache = {} 344 self._cache['phiy'] = val 345 346 @property 347 def scale(self): 348 try: 349 return self._cache['scale'] 350 except AttributeError: 351 self._cache = {} 352 if self.scalem == 'phimean': 353 self._cache['scale'] = float(1.0 * np.sum(self.phiy) / self.n) 354 elif self.scalem == 'xmean': 355 self._cache['scale'] = float(norm.pdf(np.dot(self.xmean.T, self.betas))) 356 except KeyError: 357 if self.scalem == 'phimean': 358 self._cache['scale'] = float(1.0 * np.sum(self.phiy) / self.n) 359 if self.scalem == 'xmean': 360 self._cache['scale'] = float(norm.pdf(np.dot(self.xmean.T, self.betas))) 361 return self._cache['scale'] 362 363 @scale.setter 364 def scale(self, val): 365 try: 366 self._cache['scale'] = val 367 except AttributeError: 368 self._cache = {} 369 self._cache['scale'] = val 370 371 @property 372 def slopes(self): 373 try: 374 return self._cache['slopes'] 375 except AttributeError: 376 self._cache = {} 377 self._cache['slopes'] = self.betas[1:] * self.scale 378 except KeyError: 379 self._cache['slopes'] = self.betas[1:] * self.scale 380 return self._cache['slopes'] 381 382 @slopes.setter 383 def slopes(self, val): 384 try: 385 self._cache['slopes'] = val 386 except AttributeError: 387 self._cache = {} 388 self._cache['slopes'] = val 389 390 @property 391 def slopes_vm(self): 392 try: 393 return self._cache['slopes_vm'] 394 except AttributeError: 395 self._cache = {} 396 x = self.xmean 397 b = self.betas 398 dfdb = np.eye(self.k) - spdot(b.T, x) * spdot(b, x.T) 399 slopes_vm = (self.scale ** 2) * \ 400 np.dot(np.dot(dfdb, self.vm), dfdb.T) 401 self._cache['slopes_vm'] = slopes_vm[1:, 1:] 402 except KeyError: 403 x = self.xmean 404 b = self.betas 405 dfdb = np.eye(self.k) - spdot(b.T, x) * spdot(b, x.T) 406 slopes_vm = (self.scale ** 2) * \ 407 np.dot(np.dot(dfdb, self.vm), dfdb.T) 408 self._cache['slopes_vm'] = slopes_vm[1:, 1:] 409 return self._cache['slopes_vm'] 410 411 @slopes_vm.setter 412 def slopes_vm(self, val): 413 try: 414 self._cache['slopes_vm'] = val 415 except AttributeError: 416 self._cache = {} 417 self._cache['slopes_vm'] = val 418 419 @property 420 def LR(self): 421 try: 422 return self._cache['LR'] 423 except AttributeError: 424 self._cache = {} 425 P = 1.0 * np.sum(self.y) / self.n 426 LR = float( 427 -2 * (self.n * (P * np.log(P) + (1 - P) * np.log(1 - P)) - self.logl)) 428 self._cache['LR'] = (LR, chisqprob(LR, self.k)) 429 except KeyError: 430 P = 1.0 * np.sum(self.y) / self.n 431 LR = float( 432 -2 * (self.n * (P * np.log(P) + (1 - P) * np.log(1 - P)) - self.logl)) 433 self._cache['LR'] = (LR, chisqprob(LR, self.k)) 434 return self._cache['LR'] 435 436 @LR.setter 437 def LR(self, val): 438 try: 439 self._cache['LR'] = val 440 except AttributeError: 441 self._cache = {} 442 self._cache['LR'] = val 443 444 @property 445 def u_naive(self): 446 try: 447 return self._cache['u_naive'] 448 except AttributeError: 449 self._cache = {} 450 self._cache['u_naive'] = self.y - self.predy 451 except KeyError: 452 u_naive = self.y - self.predy 453 self._cache['u_naive'] = u_naive 454 return self._cache['u_naive'] 455 456 @u_naive.setter 457 def u_naive(self, val): 458 try: 459 self._cache['u_naive'] = val 460 except AttributeError: 461 self._cache = {} 462 self._cache['u_naive'] = val 463 464 @property 465 def u_gen(self): 466 try: 467 return self._cache['u_gen'] 468 except AttributeError: 469 self._cache = {} 470 Phi_prod = self.predy * (1 - self.predy) 471 u_gen = self.phiy * (self.u_naive / Phi_prod) 472 self._cache['u_gen'] = u_gen 473 except KeyError: 474 Phi_prod = self.predy * (1 - self.predy) 475 u_gen = self.phiy * (self.u_naive / Phi_prod) 476 self._cache['u_gen'] = u_gen 477 return self._cache['u_gen'] 478 479 @u_gen.setter 480 def u_gen(self, val): 481 try: 482 self._cache['u_gen'] = val 483 except AttributeError: 484 self._cache = {} 485 self._cache['u_gen'] = val 486 487 @property 488 def Pinkse_error(self): 489 try: 490 return self._cache['Pinkse_error'] 491 except AttributeError: 492 self._cache = {} 493 self._cache['Pinkse_error'], self._cache[ 494 'KP_error'], self._cache['PS_error'] = sp_tests(self) 495 except KeyError: 496 self._cache['Pinkse_error'], self._cache[ 497 'KP_error'], self._cache['PS_error'] = sp_tests(self) 498 return self._cache['Pinkse_error'] 499 500 @Pinkse_error.setter 501 def Pinkse_error(self, val): 502 try: 503 self._cache['Pinkse_error'] = val 504 except AttributeError: 505 self._cache = {} 506 self._cache['Pinkse_error'] = val 507 508 @property 509 def KP_error(self): 510 try: 511 return self._cache['KP_error'] 512 except AttributeError: 513 self._cache = {} 514 self._cache['Pinkse_error'], self._cache[ 515 'KP_error'], self._cache['PS_error'] = sp_tests(self) 516 except KeyError: 517 self._cache['Pinkse_error'], self._cache[ 518 'KP_error'], self._cache['PS_error'] = sp_tests(self) 519 return self._cache['KP_error'] 520 521 @KP_error.setter 522 def KP_error(self, val): 523 try: 524 self._cache['KP_error'] = val 525 except AttributeError: 526 self._cache = {} 527 self._cache['KP_error'] = val 528 529 @property 530 def PS_error(self): 531 try: 532 return self._cache['PS_error'] 533 except AttributeError: 534 self._cache = {} 535 self._cache['Pinkse_error'], self._cache[ 536 'KP_error'], self._cache['PS_error'] = sp_tests(self) 537 except KeyError: 538 self._cache['Pinkse_error'], self._cache[ 539 'KP_error'], self._cache['PS_error'] = sp_tests(self) 540 return self._cache['PS_error'] 541 542 @PS_error.setter 543 def PS_error(self, val): 544 try: 545 self._cache['PS_error'] = val 546 except AttributeError: 547 self._cache = {} 548 self._cache['PS_error'] = val 549 550 def par_est(self): 551 start = np.dot(la.inv(spdot(self.x.T, self.x)), 552 spdot(self.x.T, self.y)) 553 flogl = lambda par: -self.ll(par) 554 if self.optim == 'newton': 555 fgrad = lambda par: self.gradient(par) 556 fhess = lambda par: self.hessian(par) 557 par_hat = newton(flogl, start, fgrad, fhess, self.maxiter) 558 warn = par_hat[2] 559 else: 560 fgrad = lambda par: -self.gradient(par) 561 if self.optim == 'bfgs': 562 par_hat = op.fmin_bfgs( 563 flogl, start, fgrad, full_output=1, disp=0) 564 warn = par_hat[6] 565 if self.optim == 'ncg': 566 fhess = lambda par: -self.hessian(par) 567 par_hat = op.fmin_ncg( 568 flogl, start, fgrad, fhess=fhess, full_output=1, disp=0) 569 warn = par_hat[5] 570 if warn > 0: 571 warn = True 572 else: 573 warn = False 574 return par_hat, warn 575 576 def ll(self, par): 577 beta = np.reshape(np.array(par), (self.k, 1)) 578 q = 2 * self.y - 1 579 qxb = q * spdot(self.x, beta) 580 ll = sum(np.log(norm.cdf(qxb))) 581 return ll 582 583 def gradient(self, par): 584 beta = np.reshape(np.array(par), (self.k, 1)) 585 q = 2 * self.y - 1 586 qxb = q * spdot(self.x, beta) 587 lamb = q * norm.pdf(qxb) / norm.cdf(qxb) 588 gradient = spdot(lamb.T, self.x)[0] 589 return gradient 590 591 def hessian(self, par): 592 beta = np.reshape(np.array(par), (self.k, 1)) 593 q = 2 * self.y - 1 594 xb = spdot(self.x, beta) 595 qxb = q * xb 596 lamb = q * norm.pdf(qxb) / norm.cdf(qxb) 597 hessian = spdot(self.x.T, spbroadcast(self.x,-lamb * (lamb + xb))) 598 return hessian 599 600 601class Probit(BaseProbit): 602 603 """ 604 Classic non-spatial Probit and spatial diagnostics. The class includes a 605 printout that formats all the results and tests in a nice format. 606 607 The diagnostics for spatial dependence currently implemented are: 608 609 * Pinkse Error :cite:`Pinkse2004` 610 611 * Kelejian and Prucha Moran's I :cite:`Kelejian2001` 612 613 * Pinkse & Slade Error :cite:`Pinkse1998` 614 615 Parameters 616 ---------- 617 618 x : array 619 nxk array of independent variables (assumed to be aligned with y) 620 y : array 621 nx1 array of dependent binary variable 622 w : W 623 PySAL weights instance aligned with y 624 optim : string 625 Optimization method. 626 Default: 'newton' (Newton-Raphson). 627 Alternatives: 'ncg' (Newton-CG), 'bfgs' (BFGS algorithm) 628 scalem : string 629 Method to calculate the scale of the marginal effects. 630 Default: 'phimean' (Mean of individual marginal effects) 631 Alternative: 'xmean' (Marginal effects at variables mean) 632 maxiter : int 633 Maximum number of iterations until optimizer stops 634 name_y : string 635 Name of dependent variable for use in output 636 name_x : list of strings 637 Names of independent variables for use in output 638 name_w : string 639 Name of weights matrix for use in output 640 name_ds : string 641 Name of dataset for use in output 642 643 Attributes 644 ---------- 645 646 x : array 647 Two dimensional array with n rows and one column for each 648 independent (exogenous) variable, including the constant 649 y : array 650 nx1 array of dependent variable 651 betas : array 652 kx1 array with estimated coefficients 653 predy : array 654 nx1 array of predicted y values 655 n : int 656 Number of observations 657 k : int 658 Number of variables 659 vm : array 660 Variance-covariance matrix (kxk) 661 z_stat : list of tuples 662 z statistic; each tuple contains the pair (statistic, 663 p-value), where each is a float 664 xmean : array 665 Mean of the independent variables (kx1) 666 predpc : float 667 Percent of y correctly predicted 668 logl : float 669 Log-Likelihhod of the estimation 670 scalem : string 671 Method to calculate the scale of the marginal effects. 672 scale : float 673 Scale of the marginal effects. 674 slopes : array 675 Marginal effects of the independent variables (k-1x1) 676 slopes_vm : array 677 Variance-covariance matrix of the slopes (k-1xk-1) 678 LR : tuple 679 Likelihood Ratio test of all coefficients = 0 680 (test statistics, p-value) 681 Pinkse_error: float 682 Lagrange Multiplier test against spatial error correlation. 683 Implemented as presented in :cite:`Pinkse2004` 684 KP_error : float 685 Moran's I type test against spatial error correlation. 686 Implemented as presented in :cite:`Kelejian2001` 687 PS_error : float 688 Lagrange Multiplier test against spatial error correlation. 689 Implemented as presented in :cite:`Pinkse1998` 690 warning : boolean 691 if True Maximum number of iterations exceeded or gradient 692 and/or function calls not changing. 693 name_y : string 694 Name of dependent variable for use in output 695 name_x : list of strings 696 Names of independent variables for use in output 697 name_w : string 698 Name of weights matrix for use in output 699 name_ds : string 700 Name of dataset for use in output 701 title : string 702 Name of the regression method used 703 704 Examples 705 -------- 706 707 We first need to import the needed modules, namely numpy to convert the 708 data we read into arrays that ``spreg`` understands and ``libpysal`` to 709 perform all the analysis. 710 711 >>> import numpy as np 712 >>> import libpysal 713 >>> np.set_printoptions(suppress=True) #prevent scientific format 714 715 Open data on Columbus neighborhood crime (49 areas) using libpysal.io.open(). 716 This is the DBF associated with the Columbus shapefile. Note that 717 libpysal.io.open() also reads data in CSV format; since the actual class 718 requires data to be passed in as numpy arrays, the user can read their 719 data in using any method. 720 721 >>> dbf = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r') 722 723 Extract the CRIME column (crime) from the DBF file and make it the 724 dependent variable for the regression. Note that libpysal requires this to be 725 an numpy array of shape (n, 1) as opposed to the also common shape of (n, ) 726 that other packages accept. Since we want to run a probit model and for this 727 example we use the Columbus data, we also need to transform the continuous 728 CRIME variable into a binary variable. As in :cite:`McMillen1992`, we define 729 y = 1 if CRIME > 40. 730 731 >>> y = np.array([dbf.by_col('CRIME')]).T 732 >>> y = (y>40).astype(float) 733 734 Extract HOVAL (home values) and INC (income) vectors from the DBF to be used as 735 independent variables in the regression. Note that libpysal requires this to 736 be an nxj numpy array, where j is the number of independent variables (not 737 including a constant). By default this class adds a vector of ones to the 738 independent variables passed in. 739 740 >>> names_to_extract = ['INC', 'HOVAL'] 741 >>> x = np.array([dbf.by_col(name) for name in names_to_extract]).T 742 743 Since we want to the test the probit model for spatial dependence, we need to 744 specify the spatial weights matrix that includes the spatial configuration of 745 the observations into the error component of the model. To do that, we can open 746 an already existing gal file or create a new one. In this case, we will use 747 ``columbus.gal``, which contains contiguity relationships between the 748 observations in the Columbus dataset we are using throughout this example. 749 Note that, in order to read the file, not only to open it, we need to 750 append '.read()' at the end of the command. 751 752 >>> w = libpysal.io.open(libpysal.examples.get_path("columbus.gal"), 'r').read() 753 754 Unless there is a good reason not to do it, the weights have to be 755 row-standardized so every row of the matrix sums to one. In libpysal, this 756 can be easily performed in the following way: 757 758 >>> w.transform='r' 759 760 We are all set with the preliminaries, we are good to run the model. In this 761 case, we will need the variables and the weights matrix. If we want to 762 have the names of the variables printed in the output summary, we will 763 have to pass them in as well, although this is optional. 764 765 >>> from spreg import Probit 766 >>> model = Probit(y, x, w=w, name_y='crime', name_x=['income','home value'], name_ds='columbus', name_w='columbus.gal') 767 768 Once we have run the model, we can explore a little bit the output. The 769 regression object we have created has many attributes so take your time to 770 discover them. 771 772 >>> np.around(model.betas, decimals=6) 773 array([[ 3.353811], 774 [-0.199653], 775 [-0.029514]]) 776 777 >>> np.around(model.vm, decimals=6) 778 array([[ 0.852814, -0.043627, -0.008052], 779 [-0.043627, 0.004114, -0.000193], 780 [-0.008052, -0.000193, 0.00031 ]]) 781 782 Since we have provided a spatial weigths matrix, the diagnostics for 783 spatial dependence have also been computed. We can access them and their 784 p-values individually: 785 786 >>> tests = np.array([['Pinkse_error','KP_error','PS_error']]) 787 >>> stats = np.array([[model.Pinkse_error[0],model.KP_error[0],model.PS_error[0]]]) 788 >>> pvalue = np.array([[model.Pinkse_error[1],model.KP_error[1],model.PS_error[1]]]) 789 >>> print(np.hstack((tests.T,np.around(np.hstack((stats.T,pvalue.T)),6)))) 790 [['Pinkse_error' '3.131719' '0.076783'] 791 ['KP_error' '1.721312' '0.085194'] 792 ['PS_error' '2.558166' '0.109726']] 793 794 Or we can easily obtain a full summary of all the results nicely formatted and 795 ready to be printed simply by typing 'print model.summary' 796 797 """ 798 799 def __init__( 800 self, y, x, w=None, optim='newton', scalem='phimean', maxiter=100, 801 vm=False, name_y=None, name_x=None, name_w=None, name_ds=None, 802 spat_diag=False): 803 804 n = USER.check_arrays(y, x) 805 y = USER.check_y(y, n) 806 if w != None: 807 USER.check_weights(w, y) 808 spat_diag = True 809 ws = w.sparse 810 else: 811 ws = None 812 x_constant,name_x,warn = USER.check_constant(x,name_x) 813 set_warn(self, warn) 814 815 BaseProbit.__init__(self, y=y, x=x_constant, w=ws, 816 optim=optim, scalem=scalem, maxiter=maxiter) 817 self.title = "CLASSIC PROBIT ESTIMATOR" 818 self.name_ds = USER.set_name_ds(name_ds) 819 self.name_y = USER.set_name_y(name_y) 820 self.name_x = USER.set_name_x(name_x, x) 821 self.name_w = USER.set_name_w(name_w, w) 822 SUMMARY.Probit(reg=self, w=w, vm=vm, spat_diag=spat_diag) 823 824 825def newton(flogl, start, fgrad, fhess, maxiter): 826 """ 827 Calculates the Newton-Raphson method 828 829 Parameters 830 ---------- 831 832 flogl : lambda 833 Function to calculate the log-likelihood 834 start : array 835 kx1 array of starting values 836 fgrad : lambda 837 Function to calculate the gradient 838 fhess : lambda 839 Function to calculate the hessian 840 maxiter : int 841 Maximum number of iterations until optimizer stops 842 """ 843 warn = 0 844 iteration = 0 845 par_hat0 = start 846 m = 1 847 while (iteration < maxiter and m >= 1e-04): 848 H = -la.inv(fhess(par_hat0)) 849 g = fgrad(par_hat0).reshape(start.shape) 850 Hg = np.dot(H, g) 851 par_hat0 = par_hat0 + Hg 852 iteration += 1 853 m = np.dot(g.T, Hg) 854 if iteration == maxiter: 855 warn = 1 856 logl = flogl(par_hat0) 857 return (par_hat0, logl, warn) 858 859 860def sp_tests(reg): 861 """ 862 Calculates tests for spatial dependence in Probit models 863 864 Parameters 865 ---------- 866 reg : regression object 867 output instance from a probit model 868 """ 869 if reg.w != None: 870 try: 871 w = reg.w.sparse 872 except: 873 w = reg.w 874 Phi = reg.predy 875 phi = reg.phiy 876 # Pinkse_error: 877 Phi_prod = Phi * (1 - Phi) 878 u_naive = reg.u_naive 879 u_gen = reg.u_gen 880 sig2 = np.sum((phi * phi) / Phi_prod) / reg.n 881 LM_err_num = np.dot(u_gen.T, (w * u_gen)) ** 2 882 trWW = np.sum((w * w).diagonal()) 883 trWWWWp = trWW + np.sum((w * w.T).diagonal()) 884 LM_err = float(1.0 * LM_err_num / (sig2 ** 2 * trWWWWp)) 885 LM_err = np.array([LM_err, chisqprob(LM_err, 1)]) 886 # KP_error: 887 moran = moran_KP(reg.w, u_naive, Phi_prod) 888 # Pinkse-Slade_error: 889 u_std = u_naive / np.sqrt(Phi_prod) 890 ps_num = np.dot(u_std.T, (w * u_std)) ** 2 891 trWpW = np.sum((w.T * w).diagonal()) 892 ps = float(ps_num / (trWW + trWpW)) 893 # chi-square instead of bootstrap. 894 ps = np.array([ps, chisqprob(ps, 1)]) 895 else: 896 raise Exception("W matrix must be provided to calculate spatial tests.") 897 return LM_err, moran, ps 898 899 900def moran_KP(w, u, sig2i): 901 """ 902 Calculates Moran-flavoured tests 903 904 Parameters 905 ---------- 906 907 w : W 908 PySAL weights instance aligned with y 909 u : array 910 nx1 array of naive residuals 911 sig2i : array 912 nx1 array of individual variance 913 """ 914 try: 915 w = w.sparse 916 except: 917 pass 918 moran_num = np.dot(u.T, (w * u)) 919 E = SP.lil_matrix(w.get_shape()) 920 E.setdiag(sig2i.flat) 921 E = E.asformat('csr') 922 WE = w * E 923 moran_den = np.sqrt(np.sum((WE * WE + (w.T * E) * WE).diagonal())) 924 moran = float(1.0 * moran_num / moran_den) 925 moran = np.array([moran, norm.sf(abs(moran)) * 2.]) 926 return moran 927 928 929def _test(): 930 import doctest 931 start_suppress = np.get_printoptions()['suppress'] 932 np.set_printoptions(suppress=True) 933 doctest.testmod() 934 np.set_printoptions(suppress=start_suppress) 935 936if __name__ == '__main__': 937 _test() 938 import numpy as np 939 import libpysal 940 dbf = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'), 'r') 941 y = np.array([dbf.by_col('CRIME')]).T 942 var_x = ['INC', 'HOVAL'] 943 x = np.array([dbf.by_col(name) for name in var_x]).T 944 w = libpysal.io.open(libpysal.examples.get_path("columbus.gal"), 'r').read() 945 w.transform = 'r' 946 probit1 = Probit( 947 (y > 40).astype(float), x, w=w, name_x=var_x, name_y="CRIME", 948 name_ds="Columbus", name_w="columbus.dbf") 949 print(probit1.summary) 950