1# Main GWR classes 2 3__author__ = "Taylor Oshan Tayoshan@gmail.com" 4 5import copy 6import numpy as np 7import numpy.linalg as la 8from scipy.stats import t 9from scipy.special import factorial 10from itertools import combinations as combo 11from spglm.family import Gaussian, Binomial, Poisson 12from spglm.glm import GLM, GLMResults 13from spglm.iwls import iwls, _compute_betas_gwr 14from spglm.utils import cache_readonly 15from .diagnostics import get_AIC, get_AICc, get_BIC, corr 16from .kernels import * 17from .summary import * 18import multiprocessing as mp 19 20 21class GWR(GLM): 22 """ 23 Geographically weighted regression. Can currently estimate Gaussian, 24 Poisson, and logistic models(built on a GLM framework). GWR object prepares 25 model input. Fit method performs estimation and returns a GWRResults object. 26 27 Parameters 28 ---------- 29 coords : array-like 30 n*2, collection of n sets of (x,y) coordinates of 31 observatons; also used as calibration locations is 32 'points' is set to None 33 34 y : array 35 n*1, dependent variable 36 37 X : array 38 n*k, independent variable, exlcuding the constant 39 40 bw : scalar 41 bandwidth value consisting of either a distance or N 42 nearest neighbors; user specified or obtained using 43 Sel_BW 44 45 family : family object 46 underlying probability model; provides 47 distribution-specific calculations 48 49 offset : array 50 n*1, the offset variable at the ith location. For Poisson model 51 this term is often the size of the population at risk or 52 the expected size of the outcome in spatial epidemiology 53 Default is None where Ni becomes 1.0 for all locations; 54 only for Poisson models 55 56 sigma2_v1 : boolean 57 specify form of corrected denominator of sigma squared to use for 58 model diagnostics; Acceptable options are: 59 60 'True': n-tr(S) (defualt) 61 'False': n-2(tr(S)+tr(S'S)) 62 63 kernel : string 64 type of kernel function used to weight observations; 65 available options: 66 'gaussian' 67 'bisquare' 68 'exponential' 69 70 fixed : boolean 71 True for distance based kernel function and False for 72 adaptive (nearest neighbor) kernel function (default) 73 74 constant : boolean 75 True to include intercept (default) in model and False to exclude 76 intercept. 77 78 spherical : boolean 79 True for shperical coordinates (long-lat), 80 False for projected coordinates (defalut). 81 hat_matrix : boolean 82 True to store full n by n hat matrix, 83 False to not store full hat matrix to minimize memory footprint (defalut). 84 85 Attributes 86 ---------- 87 coords : array-like 88 n*2, collection of n sets of (x,y) coordinates used for 89 calibration locations 90 91 y : array 92 n*1, dependent variable 93 94 X : array 95 n*k, independent variable, exlcuding the constant 96 97 bw : scalar 98 bandwidth value consisting of either a distance or N 99 nearest neighbors; user specified or obtained using 100 Sel_BW 101 102 family : family object 103 underlying probability model; provides 104 distribution-specific calculations 105 106 offset : array 107 n*1, the offset variable at the ith location. For Poisson model 108 this term is often the size of the population at risk or 109 the expected size of the outcome in spatial epidemiology 110 Default is None where Ni becomes 1.0 for all locations 111 112 sigma2_v1 : boolean 113 specify form of corrected denominator of sigma squared to use for 114 model diagnostics; Acceptable options are: 115 116 'True': n-tr(S) (defualt) 117 'False': n-2(tr(S)+tr(S'S)) 118 119 kernel : string 120 type of kernel function used to weight observations; 121 available options: 122 'gaussian' 123 'bisquare' 124 'exponential' 125 126 fixed : boolean 127 True for distance based kernel function and False for 128 adaptive (nearest neighbor) kernel function (default) 129 130 constant : boolean 131 True to include intercept (default) in model and False to exclude 132 intercept 133 134 spherical : boolean 135 True for shperical coordinates (long-lat), 136 False for projected coordinates (defalut). 137 138 hat_matrix : boolean 139 True to store full n by n hat matrix, 140 False to not store full hat matrix to minimize memory footprint (defalut). 141 142 n : integer 143 number of observations 144 145 k : integer 146 number of independent variables 147 148 mean_y : float 149 mean of y 150 151 std_y : float 152 standard deviation of y 153 154 fit_params : dict 155 parameters passed into fit method to define estimation 156 routine 157 158 points : array-like 159 n*2, collection of n sets of (x,y) coordinates used for 160 calibration locations instead of all observations; 161 defaults to None unles specified in predict method 162 163 P : array 164 n*k, independent variables used to make prediction; 165 exlcuding the constant; default to None unless specified 166 in predict method 167 168 exog_scale : scalar 169 estimated scale using sampled locations; defualt is None 170 unless specified in predict method 171 172 exog_resid : array-like 173 estimated residuals using sampled locations; defualt is None 174 unless specified in predict method 175 176 Examples 177 -------- 178 #basic model calibration 179 180 >>> import libpysal as ps 181 >>> from mgwr.gwr import GWR 182 >>> data = ps.io.open(ps.examples.get_path('GData_utm.csv')) 183 >>> coords = list(zip(data.by_col('X'), data.by_col('Y'))) 184 >>> y = np.array(data.by_col('PctBach')).reshape((-1,1)) 185 >>> rural = np.array(data.by_col('PctRural')).reshape((-1,1)) 186 >>> pov = np.array(data.by_col('PctPov')).reshape((-1,1)) 187 >>> african_amer = np.array(data.by_col('PctBlack')).reshape((-1,1)) 188 >>> X = np.hstack([rural, pov, african_amer]) 189 >>> model = GWR(coords, y, X, bw=90.000, fixed=False, kernel='bisquare') 190 >>> results = model.fit() 191 >>> print(results.params.shape) 192 (159, 4) 193 194 #predict at unsampled locations 195 196 >>> index = np.arange(len(y)) 197 >>> test = index[-10:] 198 >>> X_test = X[test] 199 >>> coords_test = np.array(coords)[test] 200 >>> model = GWR(coords, y, X, bw=94, fixed=False, kernel='bisquare') 201 >>> results = model.predict(coords_test, X_test) 202 >>> print(results.params.shape) 203 (10, 4) 204 205 """ 206 207 def __init__(self, coords, y, X, bw, family=Gaussian(), offset=None, 208 sigma2_v1=True, kernel='bisquare', fixed=False, constant=True, 209 spherical=False, hat_matrix=False): 210 """ 211 Initialize class 212 """ 213 GLM.__init__(self, y, X, family, constant=constant) 214 self.constant = constant 215 self.sigma2_v1 = sigma2_v1 216 self.coords = np.array(coords) 217 self.bw = bw 218 self.kernel = kernel 219 self.fixed = fixed 220 if offset is None: 221 self.offset = np.ones((self.n, 1)) 222 else: 223 self.offset = offset * 1.0 224 self.fit_params = {} 225 226 self.points = None 227 self.exog_scale = None 228 self.exog_resid = None 229 self.P = None 230 self.spherical = spherical 231 self.hat_matrix = hat_matrix 232 233 def _build_wi(self, i, bw): 234 235 try: 236 wi = Kernel(i, self.coords, bw, fixed=self.fixed, 237 function=self.kernel, points=self.points, 238 spherical=self.spherical).kernel 239 except BaseException: 240 raise # TypeError('Unsupported kernel function ', kernel) 241 242 return wi 243 244 def _local_fit(self, i): 245 """ 246 Local fitting at location i. 247 """ 248 wi = self._build_wi(i, self.bw).reshape(-1, 1) #local spatial weights 249 250 if isinstance(self.family, Gaussian): 251 betas, inv_xtx_xt = _compute_betas_gwr(self.y, self.X, wi) 252 predy = np.dot(self.X[i], betas)[0] 253 resid = self.y[i] - predy 254 influ = np.dot(self.X[i], inv_xtx_xt[:, i]) 255 w = 1 256 257 elif isinstance(self.family, (Poisson, Binomial)): 258 rslt = iwls(self.y, self.X, self.family, self.offset, None, 259 self.fit_params['ini_params'], self.fit_params['tol'], 260 self.fit_params['max_iter'], wi=wi) 261 inv_xtx_xt = rslt[5] 262 w = rslt[3][i][0] 263 influ = np.dot(self.X[i], inv_xtx_xt[:, i]) * w 264 predy = rslt[1][i] 265 resid = self.y[i] - predy 266 betas = rslt[0] 267 268 if self.fit_params['lite']: 269 return influ, resid, predy, betas.reshape(-1) 270 else: 271 Si = np.dot(self.X[i], inv_xtx_xt).reshape(-1) 272 tr_STS_i = np.sum(Si * Si * w * w) 273 CCT = np.diag(np.dot(inv_xtx_xt, inv_xtx_xt.T)).reshape(-1) 274 if not self.hat_matrix: 275 Si = None 276 return influ, resid, predy, betas.reshape(-1), w, Si, tr_STS_i, CCT 277 278 def fit(self, ini_params=None, tol=1.0e-5, max_iter=20, solve='iwls', 279 lite=False, pool=None): 280 """ 281 Method that fits a model with a particular estimation routine. 282 283 Parameters 284 ---------- 285 286 ini_betas : array, optional 287 k*1, initial coefficient values, including constant. 288 Default is None, which calculates initial values during 289 estimation. 290 tol: float, optional 291 Tolerence for estimation convergence. 292 Default is 1.0e-5. 293 max_iter : integer, optional 294 Maximum number of iterations if convergence not 295 achieved. Default is 20. 296 solve : string, optional 297 Technique to solve MLE equations. 298 Default is 'iwls', meaning iteratively ( 299 re)weighted least squares. 300 lite : bool, optional 301 Whether to estimate a lightweight GWR that 302 computes the minimum diagnostics needed for 303 bandwidth selection (could speed up 304 bandwidth selection for GWR) or to estimate 305 a full GWR. Default is False. 306 pool : A multiprocessing Pool object to enable parallel fitting; default is None. 307 308 Returns 309 ------- 310 : 311 If lite=False, return a GWRResult 312 instance; otherwise, return a GWRResultLite 313 instance. 314 315 """ 316 self.fit_params['ini_params'] = ini_params 317 self.fit_params['tol'] = tol 318 self.fit_params['max_iter'] = max_iter 319 self.fit_params['solve'] = solve 320 self.fit_params['lite'] = lite 321 322 if solve.lower() == 'iwls': 323 324 if self.points is None: 325 m = self.y.shape[0] 326 else: 327 m = self.points.shape[0] 328 329 if pool: 330 rslt = pool.map(self._local_fit, 331 range(m)) #parallel using mp.Pool 332 else: 333 rslt = map(self._local_fit, range(m)) #sequential 334 335 rslt_list = list(zip(*rslt)) 336 influ = np.array(rslt_list[0]).reshape(-1, 1) 337 resid = np.array(rslt_list[1]).reshape(-1, 1) 338 params = np.array(rslt_list[3]) 339 340 if lite: 341 return GWRResultsLite(self, resid, influ, params) 342 else: 343 predy = np.array(rslt_list[2]).reshape(-1, 1) 344 w = np.array(rslt_list[-4]).reshape(-1, 1) 345 if self.hat_matrix: 346 S = np.array(rslt_list[-3]) 347 else: 348 S = None 349 tr_STS = np.sum(np.array(rslt_list[-2])) 350 CCT = np.array(rslt_list[-1]) 351 return GWRResults(self, params, predy, S, CCT, influ, tr_STS, 352 w) 353 354 355 def predict(self, points, P, exog_scale=None, exog_resid=None, 356 fit_params={}): 357 """ 358 Method that predicts values of the dependent variable at un-sampled 359 locations 360 361 Parameters 362 ---------- 363 points : array-like 364 n*2, collection of n sets of (x,y) coordinates used for 365 calibration prediction locations 366 P : array 367 n*k, independent variables used to make prediction; 368 exlcuding the constant 369 exog_scale : scalar 370 estimated scale using sampled locations; defualt is None 371 which estimates a model using points from "coords" 372 exog_resid : array-like 373 estimated residuals using sampled locations; defualt is None 374 which estimates a model using points from "coords"; if 375 given it must be n*1 where n is the length of coords 376 fit_params : dict 377 key-value pairs of parameters that will be passed into fit 378 method to define estimation routine; see fit method for more details 379 380 """ 381 if (exog_scale is None) & (exog_resid is None): 382 train_gwr = self.fit(**fit_params) 383 self.exog_scale = train_gwr.scale 384 self.exog_resid = train_gwr.resid_response 385 elif (exog_scale is not None) & (exog_resid is not None): 386 self.exog_scale = exog_scale 387 self.exog_resid = exog_resid 388 else: 389 raise InputError('exog_scale and exog_resid must both either be' 390 'None or specified') 391 self.points = points 392 if self.constant: 393 P = np.hstack([np.ones((len(P), 1)), P]) 394 self.P = P 395 else: 396 self.P = P 397 gwr = self.fit(**fit_params) 398 399 return gwr 400 401 @cache_readonly 402 def df_model(self): 403 return None 404 405 @cache_readonly 406 def df_resid(self): 407 return None 408 409 410class GWRResults(GLMResults): 411 """ 412 Basic class including common properties for all GWR regression models 413 414 Parameters 415 ---------- 416 model : GWR object 417 pointer to GWR object with estimation parameters 418 419 params : array 420 n*k, estimated coefficients 421 422 predy : array 423 n*1, predicted y values 424 425 S : array 426 n*n, hat matrix 427 428 CCT : array 429 n*k, scaled variance-covariance matrix 430 431 w : array 432 n*1, final weight used for iteratively re-weighted least 433 sqaures; default is None 434 435 Attributes 436 ---------- 437 model : GWR Object 438 points to GWR object for which parameters have been 439 estimated 440 441 params : array 442 n*k, parameter estimates 443 444 predy : array 445 n*1, predicted value of y 446 447 y : array 448 n*1, dependent variable 449 450 X : array 451 n*k, independent variable, including constant 452 453 family : family object 454 underlying probability model; provides 455 distribution-specific calculations 456 457 n : integer 458 number of observations 459 460 k : integer 461 number of independent variables 462 463 df_model : integer 464 model degrees of freedom 465 466 df_resid : integer 467 residual degrees of freedom 468 469 offset : array 470 n*1, the offset variable at the ith location. 471 For Poisson model this term is often the size of 472 the population at risk or the expected size of 473 the outcome in spatial epidemiology; Default is 474 None where Ni becomes 1.0 for all locations 475 476 scale : float 477 sigma squared used for subsequent computations 478 479 w : array 480 n*1, final weights from iteratively re-weighted least 481 sqaures routine 482 483 resid_response : array 484 n*1, residuals of the repsonse 485 486 resid_ss : scalar 487 residual sum of sqaures 488 489 W : array 490 n*n; spatial weights for each observation from each 491 calibration point 492 493 S : array 494 n*n, hat matrix 495 496 CCT : array 497 n*k, scaled variance-covariance matrix 498 499 ENP : scalar 500 effective number of paramters, which depends on 501 sigma2 502 503 tr_S : float 504 trace of S (hat) matrix 505 506 tr_STS : float 507 trace of STS matrix 508 509 y_bar : array 510 n*1, weighted mean value of y 511 512 TSS : array 513 n*1, geographically weighted total sum of squares 514 515 RSS : array 516 n*1, geographically weighted residual sum of squares 517 518 R2 : float 519 R-squared for the entire model (1- RSS/TSS) 520 521 adj_R2 : float 522 adjusted R-squared for the entire model 523 524 aic : float 525 Akaike information criterion 526 527 aicc : float 528 corrected Akaike information criterion to account 529 to account for model complexity (smaller 530 bandwidths) 531 532 bic : float 533 Bayesian information criterio 534 535 localR2 : array 536 n*1, local R square 537 538 sigma2 : float 539 sigma squared (residual variance) that has been 540 corrected to account for the ENP 541 542 std_res : array 543 n*1, standardised residuals 544 545 bse : array 546 n*k, standard errors of parameters (betas) 547 548 influ : array 549 n*1, leading diagonal of S matrix 550 551 CooksD : array 552 n*1, Cook's D 553 554 tvalues : array 555 n*k, local t-statistics 556 557 adj_alpha : array 558 3*1, corrected alpha values to account for multiple 559 hypothesis testing for the 90%, 95%, and 99% confidence 560 levels; tvalues with an absolute value larger than the 561 corrected alpha are considered statistically 562 significant. 563 564 deviance : array 565 n*1, local model deviance for each calibration point 566 567 resid_deviance : array 568 n*1, local sum of residual deviance for each 569 calibration point 570 571 llf : scalar 572 log-likelihood of the full model; see 573 pysal.contrib.glm.family for damily-sepcific 574 log-likelihoods 575 576 pDev : float 577 local percent of deviation accounted for; analogous to 578 r-squared for GLM's 579 580 D2 : float 581 percent deviance explained for GLM, equivaleng to R2 for 582 Gaussian. 583 584 adj_D2 : float 585 adjusted percent deviance explained, equivaleng to adjusted 586 R2 for Gaussian. 587 588 mu : array 589 n*, flat one dimensional array of predicted mean 590 response value from estimator 591 592 fit_params : dict 593 parameters passed into fit method to define estimation 594 routine 595 596 predictions : array 597 p*1, predicted values generated by calling the GWR 598 predict method to predict dependent variable at 599 unsampled points () 600 """ 601 602 def __init__(self, model, params, predy, S, CCT, influ, tr_STS=None, 603 w=None): 604 GLMResults.__init__(self, model, params, predy, w) 605 self.offset = model.offset 606 if w is not None: 607 self.w = w 608 self.predy = predy 609 self.S = S 610 self.tr_STS = tr_STS 611 self.influ = influ 612 self.CCT = self.cov_params(CCT, model.exog_scale) 613 self._cache = {} 614 615 @cache_readonly 616 def W(self): 617 W = np.array( 618 [self.model._build_wi(i, self.model.bw) for i in range(self.n)]) 619 return W 620 621 @cache_readonly 622 def resid_ss(self): 623 if self.model.points is not None: 624 raise NotImplementedError('Not available for GWR prediction') 625 else: 626 u = self.resid_response.flatten() 627 return np.dot(u, u.T) 628 629 @cache_readonly 630 def scale(self, scale=None): 631 if isinstance(self.family, Gaussian): 632 scale = self.sigma2 633 else: 634 scale = 1.0 635 return scale 636 637 def cov_params(self, cov, exog_scale=None): 638 """ 639 Returns scaled covariance parameters 640 641 Parameters 642 ---------- 643 cov : array 644 estimated covariance parameters 645 646 Returns 647 ------- 648 Scaled covariance parameters 649 650 """ 651 if exog_scale is not None: 652 return cov * exog_scale 653 else: 654 return cov * self.scale 655 656 @cache_readonly 657 def tr_S(self): 658 """ 659 trace of S (hat) matrix 660 """ 661 return np.sum(self.influ) 662 663 @cache_readonly 664 def ENP(self): 665 """ 666 effective number of parameters 667 668 Defaults to tr(s) as defined in :cite:`yu:2019` 669 670 but can alternatively be based on 2tr(s) - tr(STS) 671 672 and the form depends on the specification of sigma2 673 """ 674 if self.model.sigma2_v1: 675 return self.tr_S 676 else: 677 return 2 * self.tr_S - self.tr_STS 678 679 @cache_readonly 680 def y_bar(self): 681 """ 682 weighted mean of y 683 """ 684 if self.model.points is not None: 685 n = len(self.model.points) 686 else: 687 n = self.n 688 off = self.offset.reshape((-1, 1)) 689 arr_ybar = np.zeros(shape=(self.n, 1)) 690 for i in range(n): 691 w_i = np.reshape(self.model._build_wi(i, self.model.bw), (-1, 1)) 692 sum_yw = np.sum(self.y.reshape((-1, 1)) * w_i) 693 arr_ybar[i] = 1.0 * sum_yw / np.sum(w_i * off) 694 return arr_ybar 695 696 @cache_readonly 697 def TSS(self): 698 """ 699 geographically weighted total sum of squares 700 701 Methods: p215, (9.9) 702 Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). 703 Geographically weighted regression: the analysis of spatially varying 704 relationships. 705 706 """ 707 if self.model.points is not None: 708 n = len(self.model.points) 709 else: 710 n = self.n 711 TSS = np.zeros(shape=(n, 1)) 712 for i in range(n): 713 TSS[i] = np.sum( 714 np.reshape(self.model._build_wi(i, self.model.bw), 715 (-1, 1)) * (self.y.reshape( 716 (-1, 1)) - self.y_bar[i])**2) 717 return TSS 718 719 @cache_readonly 720 def RSS(self): 721 """ 722 geographically weighted residual sum of squares 723 724 Methods: p215, (9.10) 725 Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). 726 Geographically weighted regression: the analysis of spatially varying 727 relationships. 728 """ 729 if self.model.points is not None: 730 n = len(self.model.points) 731 resid = self.model.exog_resid.reshape((-1, 1)) 732 else: 733 n = self.n 734 resid = self.resid_response.reshape((-1, 1)) 735 RSS = np.zeros(shape=(n, 1)) 736 for i in range(n): 737 RSS[i] = np.sum( 738 np.reshape(self.model._build_wi(i, self.model.bw), 739 (-1, 1)) * resid**2) 740 return RSS 741 742 @cache_readonly 743 def localR2(self): 744 """ 745 local R square 746 747 Methods: p215, (9.8) 748 Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). 749 Geographically weighted regression: the analysis of spatially varying 750 relationships. 751 """ 752 if isinstance(self.family, Gaussian): 753 return (self.TSS - self.RSS) / self.TSS 754 else: 755 raise NotImplementedError('Only applicable to Gaussian') 756 757 @cache_readonly 758 def sigma2(self): 759 """ 760 residual variance 761 762 if sigma2_v1 is True: only use n-tr(S) in denominator 763 764 Methods: p214, (9.6) :cite:`fotheringham_geographically_2002` 765 Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). 766 Geographically weighted regression: the analysis of spatially varying 767 relationships. 768 769 and as defined in :cite:`yu:2019` 770 771 if sigma2_v1 is False (v1v2): use n-2(tr(S)+tr(S'S)) in denominator 772 773 Methods: p55 (2.16)-(2.18) :cite:`fotheringham_geographically_2002` 774 Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). 775 Geographically weighted regression: the analysis of spatially varying 776 relationships. 777 778 """ 779 if self.model.sigma2_v1: 780 return (self.resid_ss / (self.n - self.tr_S)) 781 else: 782 # could be changed to SWSTW - nothing to test against 783 return self.resid_ss / (self.n - 2.0 * self.tr_S + self.tr_STS) 784 785 @cache_readonly 786 def std_res(self): 787 """ 788 standardized residuals 789 790 Methods: p215, (9.7) :cite:`fotheringham_geographically_2002` 791 Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). 792 Geographically weighted regression: the analysis of spatially varying 793 relationships. 794 """ 795 return self.resid_response.reshape( 796 (-1, 1)) / (np.sqrt(self.scale * (1.0 - self.influ))) 797 798 @cache_readonly 799 def bse(self): 800 """ 801 standard errors of Betas 802 803 Methods: p215, (2.15) and (2.21) :cite:`fotheringham_geographically_2002` 804 Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). 805 Geographically weighted regression: the analysis of spatially varying 806 relationships. 807 """ 808 return np.sqrt(self.CCT) 809 810 @cache_readonly 811 def cooksD(self): 812 """ 813 Influence: leading diagonal of S Matrix 814 815 Methods: p216, (9.11) :cite:`fotheringham_geographically_2002` 816 Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). 817 Geographically weighted regression: the analysis of spatially varying 818 relationships. 819 Note: in (9.11), p should be tr(S), that is, the effective number of parameters 820 """ 821 return self.std_res**2 * self.influ / (self.tr_S * (1.0 - self.influ)) 822 823 @cache_readonly 824 def deviance(self): 825 off = self.offset.reshape((-1, 1)).T 826 y = self.y 827 ybar = self.y_bar 828 if isinstance(self.family, Gaussian): 829 raise NotImplementedError( 830 'deviance not currently used for Gaussian') 831 elif isinstance(self.family, Poisson): 832 dev = np.sum( 833 2.0 * self.W * (y * np.log(y / (ybar * off)) - 834 (y - ybar * off)), axis=1) 835 elif isinstance(self.family, Binomial): 836 dev = self.family.deviance(self.y, self.y_bar, self.W, axis=1) 837 return dev.reshape((-1, 1)) 838 839 @cache_readonly 840 def resid_deviance(self): 841 if isinstance(self.family, Gaussian): 842 raise NotImplementedError( 843 'deviance not currently used for Gaussian') 844 else: 845 off = self.offset.reshape((-1, 1)).T 846 y = self.y 847 ybar = self.y_bar 848 global_dev_res = ((self.family.resid_dev(self.y, self.mu))**2) 849 dev_res = np.repeat(global_dev_res.flatten(), self.n) 850 dev_res = dev_res.reshape((self.n, self.n)) 851 dev_res = np.sum(dev_res * self.W.T, axis=0) 852 return dev_res.reshape((-1, 1)) 853 854 @cache_readonly 855 def pDev(self): 856 """ 857 Local percentage of deviance accounted for. Described in the GWR4 858 manual. Equivalent to 1 - (deviance/null deviance) 859 """ 860 if isinstance(self.family, Gaussian): 861 raise NotImplementedError('Not implemented for Gaussian') 862 else: 863 return 1.0 - (self.resid_deviance / self.deviance) 864 865 @cache_readonly 866 def adj_alpha(self): 867 """ 868 Corrected alpha (critical) values to account for multiple testing during hypothesis 869 testing. Includes corrected value for 90% (.1), 95% (.05), and 99% 870 (.01) confidence levels. Correction comes from: 871 872 :cite:`Silva:2016` : da Silva, A. R., & Fotheringham, A. S. (2015). The Multiple Testing Issue in 873 Geographically Weighted Regression. Geographical Analysis. 874 875 """ 876 alpha = np.array([.1, .05, .001]) 877 pe = self.ENP 878 p = self.k 879 return (alpha * p) / pe 880 881 def critical_tval(self, alpha=None): 882 """ 883 Utility function to derive the critical t-value based on given alpha 884 that are needed for hypothesis testing 885 886 Parameters 887 ---------- 888 alpha : scalar 889 critical value to determine which tvalues are 890 associated with statistically significant parameter 891 estimates. Default to None in which case the adjusted 892 alpha value at the 95 percent CI is automatically 893 used. 894 895 Returns 896 ------- 897 critical : scalar 898 critical t-val based on alpha 899 """ 900 n = self.n 901 if alpha is not None: 902 alpha = np.abs(alpha) / 2.0 903 critical = t.ppf(1 - alpha, n - 1) 904 else: 905 alpha = np.abs(self.adj_alpha[1]) / 2.0 906 critical = t.ppf(1 - alpha, n - 1) 907 return critical 908 909 def filter_tvals(self, critical_t=None, alpha=None): 910 """ 911 Utility function to set tvalues with an absolute value smaller than the 912 absolute value of the alpha (critical) value to 0. If critical_t 913 is supplied than it is used directly to filter. If alpha is provided 914 than the critical t value will be derived and used to filter. If neither 915 are critical_t nor alpha are provided, an adjusted alpha at the 95 916 percent CI will automatically be used to define the critical t-value and 917 used to filter. If both critical_t and alpha are supplied then the alpha 918 value will be ignored. 919 920 Parameters 921 ---------- 922 critical_t : scalar 923 critical t-value to determine whether parameters are 924 statistically significant 925 926 alpha : scalar 927 alpha value to determine which tvalues are 928 associated with statistically significant parameter 929 estimates 930 931 Returns 932 ------- 933 filtered : array 934 n*k; new set of n tvalues for each of k variables 935 where absolute tvalues less than the absolute value of 936 alpha have been set to 0. 937 """ 938 n = self.n 939 if critical_t is not None: 940 critical = critical_t 941 else: 942 critical = self.critical_tval(alpha=alpha) 943 944 subset = (self.tvalues < critical) & (self.tvalues > -1.0 * critical) 945 tvalues = self.tvalues.copy() 946 tvalues[subset] = 0 947 return tvalues 948 949 @cache_readonly 950 def df_model(self): 951 return self.n - self.tr_S 952 953 @cache_readonly 954 def df_resid(self): 955 return self.n - 2.0 * self.tr_S + self.tr_STS 956 957 @cache_readonly 958 def normalized_cov_params(self): 959 return None 960 961 @cache_readonly 962 def resid_pearson(self): 963 return None 964 965 @cache_readonly 966 def resid_working(self): 967 return None 968 969 @cache_readonly 970 def resid_anscombe(self): 971 return None 972 973 @cache_readonly 974 def pearson_chi2(self): 975 return None 976 977 @cache_readonly 978 def llnull(self): 979 return None 980 981 @cache_readonly 982 def null_deviance(self): 983 return self.family.deviance(self.y, self.null) 984 985 @cache_readonly 986 def global_deviance(self): 987 deviance = np.sum(self.family.resid_dev(self.y, self.mu)**2) 988 return deviance 989 990 @cache_readonly 991 def D2(self): 992 """ 993 Percentage of deviance explanied. Equivalent to 1 - (deviance/null deviance) 994 """ 995 D2 = 1.0 - (self.global_deviance / self.null_deviance) 996 return D2 997 998 @cache_readonly 999 def R2(self): 1000 """ 1001 Global r-squared value for a Gaussian model. 1002 """ 1003 if isinstance(self.family, Gaussian): 1004 return self.D2 1005 else: 1006 raise NotImplementedError('R2 only for Gaussian') 1007 1008 @cache_readonly 1009 def adj_D2(self): 1010 """ 1011 Adjusted percentage of deviance explanied. 1012 """ 1013 adj_D2 = 1 - (1 - self.D2) * (self.n - 1) / (self.n - self.ENP - 1) 1014 return adj_D2 1015 1016 @cache_readonly 1017 def adj_R2(self): 1018 """ 1019 Adjusted global r-squared for a Gaussian model. 1020 """ 1021 if isinstance(self.family, Gaussian): 1022 return self.adj_D2 1023 else: 1024 raise NotImplementedError('adjusted R2 only for Gaussian') 1025 1026 @cache_readonly 1027 def aic(self): 1028 return get_AIC(self) 1029 1030 @cache_readonly 1031 def aicc(self): 1032 return get_AICc(self) 1033 1034 @cache_readonly 1035 def bic(self): 1036 return get_BIC(self) 1037 1038 @cache_readonly 1039 def pseudoR2(self): 1040 return None 1041 1042 @cache_readonly 1043 def adj_pseudoR2(self): 1044 return None 1045 1046 @cache_readonly 1047 def pvalues(self): 1048 return None 1049 1050 @cache_readonly 1051 def conf_int(self): 1052 return None 1053 1054 @cache_readonly 1055 def use_t(self): 1056 return None 1057 1058 def get_bws_intervals(self, selector, level=0.95): 1059 """ 1060 Computes bandwidths confidence interval (CI) for GWR. 1061 The CI is based on Akaike weights and the bandwidth search algorithm used. 1062 Details are in Li et al. (2020) Annals of AAG 1063 1064 Returns a tuple with lower and upper bound of the bw CI. 1065 e.g. (100, 300) 1066 """ 1067 1068 try: 1069 import pandas as pd 1070 except ImportError: 1071 return 1072 1073 #Get AICcs and associated bw from the last iteration of back-fitting and make a DataFrame 1074 aiccs = pd.DataFrame(list(zip(*selector.sel_hist))[1],columns=["aicc"]) 1075 aiccs['bw'] = list(zip(*selector.sel_hist))[0] 1076 #Sort DataFrame by the AICc values 1077 aiccs = aiccs.sort_values(by=['aicc']) 1078 #Calculate delta AICc 1079 d_aic_ak = aiccs.aicc - aiccs.aicc.min() 1080 #Calculate AICc weights 1081 w_aic_ak = np.exp(-0.5*d_aic_ak) / np.sum(np.exp(-0.5*d_aic_ak)) 1082 aiccs['w_aic_ak'] = w_aic_ak/np.sum(w_aic_ak) 1083 #Calculate cum. AICc weights 1084 aiccs['cum_w_ak'] = aiccs.w_aic_ak.cumsum() 1085 #Find index where the cum weights above p-val 1086 index = len(aiccs[aiccs.cum_w_ak < level]) + 1 1087 #Get bw boundaries 1088 interval = (aiccs.iloc[:index,:].bw.min(),aiccs.iloc[:index,:].bw.max()) 1089 return interval 1090 1091 1092 def local_collinearity(self): 1093 """ 1094 Computes several indicators of multicollinearity within a geographically 1095 weighted design matrix, including: 1096 1097 local correlation coefficients (n, ((p**2) + p) / 2) 1098 local variance inflation factors (VIF) (n, p-1) 1099 local condition number (n, 1) 1100 local variance-decomposition proportions (n, p) 1101 1102 Returns four arrays with the order and dimensions listed above where n 1103 is the number of locations used as calibrations points and p is the 1104 number of explanatory variables. Local correlation coefficient and local 1105 VIF are not calculated for constant term. 1106 1107 """ 1108 x = self.X 1109 w = self.W 1110 nvar = x.shape[1] 1111 nrow = len(w) 1112 if self.model.constant: 1113 ncor = (((nvar - 1)**2 + (nvar - 1)) / 2) - (nvar - 1) 1114 jk = list(combo(range(1, nvar), 2)) 1115 else: 1116 ncor = (((nvar)**2 + (nvar)) / 2) - nvar 1117 jk = list(combo(range(nvar), 2)) 1118 corr_mat = np.ndarray((nrow, int(ncor))) 1119 if self.model.constant: 1120 vifs_mat = np.ndarray((nrow, nvar - 1)) 1121 else: 1122 vifs_mat = np.ndarray((nrow, nvar)) 1123 vdp_idx = np.ndarray((nrow, nvar)) 1124 vdp_pi = np.ndarray((nrow, nvar, nvar)) 1125 1126 for i in range(nrow): 1127 wi = self.model._build_wi(i, self.model.bw) 1128 sw = np.sum(wi) 1129 wi = wi / sw 1130 tag = 0 1131 1132 for j, k in jk: 1133 corr_mat[i, tag] = corr(np.cov(x[:, j], x[:, k], 1134 aweights=wi))[0][1] 1135 tag = tag + 1 1136 1137 if self.model.constant: 1138 corr_mati = corr(np.cov(x[:, 1:].T, aweights=wi)) 1139 vifs_mat[i, ] = np.diag( 1140 np.linalg.solve(corr_mati, np.identity((nvar - 1)))) 1141 1142 else: 1143 corr_mati = corr(np.cov(x.T, aweights=wi)) 1144 vifs_mat[i, ] = np.diag( 1145 np.linalg.solve(corr_mati, np.identity((nvar)))) 1146 1147 xw = x * wi.reshape((nrow, 1)) 1148 sxw = np.sqrt(np.sum(xw**2, axis=0)) 1149 sxw = np.transpose(xw.T / sxw.reshape((nvar, 1))) 1150 svdx = np.linalg.svd(sxw) 1151 vdp_idx[i, ] = svdx[1][0] / svdx[1] 1152 phi = np.dot(svdx[2].T, np.diag(1 / svdx[1])) 1153 phi = np.transpose(phi**2) 1154 pi_ij = phi / np.sum(phi, axis=0) 1155 vdp_pi[i, :, :] = pi_ij 1156 1157 local_CN = vdp_idx[:, nvar - 1].reshape((-1, 1)) 1158 VDP = vdp_pi[:, nvar - 1, :] 1159 1160 return corr_mat, vifs_mat, local_CN, VDP 1161 1162 def spatial_variability(self, selector, n_iters=1000, seed=None): 1163 """ 1164 Method to compute a Monte Carlo test of spatial variability for each 1165 estimated coefficient surface. 1166 1167 WARNING: This test is very computationally demanding! 1168 1169 Parameters 1170 ---------- 1171 selector : sel_bw object 1172 should be the sel_bw object used to select a bandwidth 1173 for the gwr model that produced the surfaces that are 1174 being tested for spatial variation 1175 1176 n_iters : int 1177 the number of Monte Carlo iterations to include for 1178 the tests of spatial variability. 1179 1180 seed : int 1181 optional parameter to select a custom seed to ensure 1182 stochastic results are replicable. Default is none 1183 which automatically sets the seed to 5536 1184 1185 Returns 1186 ------- 1187 1188 p values : list 1189 a list of psuedo p-values that correspond to the model 1190 parameter surfaces. Allows us to assess the 1191 probability of obtaining the observed spatial 1192 variation of a given surface by random chance. 1193 1194 1195 """ 1196 temp_sel = copy.deepcopy(selector) 1197 temp_gwr = copy.deepcopy(self.model) 1198 1199 if seed is None: 1200 np.random.seed(5536) 1201 else: 1202 np.random.seed(seed) 1203 1204 fit_params = temp_gwr.fit_params 1205 search_params = temp_sel.search_params 1206 kernel = temp_gwr.kernel 1207 fixed = temp_gwr.fixed 1208 1209 if self.model.constant: 1210 X = self.X[:, 1:] 1211 else: 1212 X = self.X 1213 1214 init_sd = np.std(self.params, axis=0) 1215 SDs = [] 1216 1217 for x in range(n_iters): 1218 temp_coords = np.random.permutation(self.model.coords) 1219 temp_sel.coords = temp_coords 1220 temp_bw = temp_sel.search(**search_params) 1221 temp_gwr.bw = temp_bw 1222 temp_gwr.coords = temp_coords 1223 temp_params = temp_gwr.fit(**fit_params).params 1224 temp_sd = np.std(temp_params, axis=0) 1225 SDs.append(temp_sd) 1226 1227 p_vals = (np.sum(np.array(SDs) > init_sd, axis=0) / float(n_iters)) 1228 return p_vals 1229 1230 @cache_readonly 1231 def predictions(self): 1232 P = self.model.P 1233 if P is None: 1234 raise TypeError('predictions only avaialble if predict' 1235 'method is previously called on GWR model') 1236 else: 1237 predictions = np.sum(P * self.params, axis=1).reshape((-1, 1)) 1238 return predictions 1239 1240 def summary(self): 1241 """ 1242 Print out GWR summary 1243 """ 1244 summary = summaryModel(self) + summaryGLM(self) + summaryGWR(self) 1245 print(summary) 1246 return 1247 1248 1249class GWRResultsLite(object): 1250 """ 1251 Lightweight GWR that computes the minimum diagnostics needed for bandwidth 1252 selection. 1253 1254 See FastGWR,Li et al., 2019, IJGIS. 1255 1256 Parameters 1257 ---------- 1258 model : GWR object 1259 pointer to GWR object with estimation parameters 1260 1261 resid : array 1262 n*1, residuals of the repsonse 1263 1264 influ : array 1265 n*1, leading diagonal of S matrix 1266 1267 Attributes 1268 ---------- 1269 tr_S : float 1270 trace of S (hat) matrix 1271 1272 llf : scalar 1273 log-likelihood of the full model; see 1274 pysal.contrib.glm.family for damily-sepcific 1275 log-likelihoods 1276 1277 mu : array 1278 n*, flat one dimensional array of predicted mean 1279 response value from estimator 1280 1281 resid_ss : scalar 1282 residual sum of sqaures 1283 1284 """ 1285 1286 def __init__(self, model, resid, influ, params): 1287 self.y = model.y 1288 self.family = model.family 1289 self.n = model.n 1290 self.influ = influ 1291 self.resid_response = resid 1292 self.model = model 1293 self.params = params 1294 1295 @cache_readonly 1296 def tr_S(self): 1297 return np.sum(self.influ) 1298 1299 @cache_readonly 1300 def llf(self): 1301 return self.family.loglike(self.y, self.mu) 1302 1303 @cache_readonly 1304 def mu(self): 1305 return self.y - self.resid_response 1306 1307 @cache_readonly 1308 def predy(self): 1309 return self.y - self.resid_response 1310 1311 @cache_readonly 1312 def resid_ss(self): 1313 u = self.resid_response.flatten() 1314 return np.dot(u, u.T) 1315 1316 1317class MGWR(GWR): 1318 """ 1319 Multiscale GWR estimation and inference. 1320 See :cite:`Fotheringham:2017` :cite:`yu:2019`. 1321 1322 Parameters 1323 ---------- 1324 coords : array-like 1325 n*2, collection of n sets of (x,y) coordinates of 1326 observatons; also used as calibration locations is 1327 'points' is set to None 1328 1329 y : array 1330 n*1, dependent variable 1331 1332 X : array 1333 n*k, independent variable, exlcuding the constant 1334 1335 selector : sel_bw object 1336 valid sel_bw object that has successfully called 1337 the "search" method. This parameter passes on 1338 information from GAM model estimation including optimal 1339 bandwidths. 1340 1341 family : family object 1342 underlying probability model; provides 1343 distribution-specific calculations 1344 1345 sigma2_v1 : boolean 1346 specify form of corrected denominator of sigma squared to use for 1347 model diagnostics; Acceptable options are: 1348 1349 'True': n-tr(S) (defualt) 1350 'False': n-2(tr(S)+tr(S'S)) 1351 1352 kernel : string 1353 type of kernel function used to weight observations; 1354 available options: 1355 'gaussian' 1356 'bisquare' 1357 'exponential' 1358 1359 fixed : boolean 1360 True for distance based kernel function and False for 1361 adaptive (nearest neighbor) kernel function (default) 1362 1363 constant : boolean 1364 True to include intercept (default) in model and False to exclude 1365 intercept. 1366 1367 spherical : boolean 1368 True for spherical coordinates (long-lat), 1369 False for projected coordinates (defalut). 1370 hat_matrix : boolean 1371 True for computing and storing covariate-specific 1372 hat matrices R (n,n,k) and model hat matrix S (n,n). 1373 False (default) for computing MGWR inference on the fly. 1374 1375 Attributes 1376 ---------- 1377 coords : array-like 1378 n*2, collection of n sets of (x,y) coordinates of 1379 observatons; also used as calibration locations is 1380 'points' is set to None 1381 1382 y : array 1383 n*1, dependent variable 1384 1385 X : array 1386 n*k, independent variable, exlcuding the constant 1387 1388 selector : sel_bw object 1389 valid sel_bw object that has successfully called 1390 the "search" method. This parameter passes on 1391 information from GAM model estimation including optimal 1392 bandwidths. 1393 1394 bw : array-like 1395 collection of bandwidth values consisting of either a distance or N 1396 nearest neighbors; user specified or obtained using 1397 Sel_BW with fb=True. Order of values should the same as 1398 the order of columns associated with X 1399 1400 family : family object 1401 underlying probability model; provides 1402 distribution-specific calculations 1403 1404 sigma2_v1 : boolean 1405 specify form of corrected denominator of sigma squared to use for 1406 model diagnostics; Acceptable options are: 1407 1408 'True': n-tr(S) (defualt) 1409 'False': n-2(tr(S)+tr(S'S)) 1410 1411 kernel : string 1412 type of kernel function used to weight observations; 1413 available options: 1414 'gaussian' 1415 'bisquare' 1416 'exponential' 1417 1418 fixed : boolean 1419 True for distance based kernel function and False for 1420 adaptive (nearest neighbor) kernel function (default) 1421 1422 constant : boolean 1423 True to include intercept (default) in model and False to exclude 1424 intercept. 1425 1426 spherical : boolean 1427 True for shperical coordinates (long-lat), 1428 False for projected coordinates (defalut). 1429 1430 n : integer 1431 number of observations 1432 1433 k : integer 1434 number of independent variables 1435 1436 mean_y : float 1437 mean of y 1438 1439 std_y : float 1440 standard deviation of y 1441 1442 fit_params : dict 1443 parameters passed into fit method to define estimation 1444 routine 1445 1446 W : array-like 1447 list of n*n arrays, spatial weights matrices for weighting all 1448 observations from each calibration point: one for each 1449 covariate (k) 1450 1451 Examples 1452 -------- 1453 1454 #basic model calibration 1455 1456 >>> import libpysal as ps 1457 >>> from mgwr.gwr import MGWR 1458 >>> from mgwr.sel_bw import Sel_BW 1459 >>> data = ps.io.open(ps.examples.get_path('GData_utm.csv')) 1460 >>> coords = list(zip(data.by_col('X'), data.by_col('Y'))) 1461 >>> y = np.array(data.by_col('PctBach')).reshape((-1,1)) 1462 >>> rural = np.array(data.by_col('PctRural')).reshape((-1,1)) 1463 >>> fb = np.array(data.by_col('PctFB')).reshape((-1,1)) 1464 >>> african_amer = np.array(data.by_col('PctBlack')).reshape((-1,1)) 1465 >>> X = np.hstack([fb, african_amer, rural]) 1466 >>> X = (X - X.mean(axis=0)) / X.std(axis=0) 1467 >>> y = (y - y.mean(axis=0)) / y.std(axis=0) 1468 >>> selector = Sel_BW(coords, y, X, multi=True) 1469 >>> selector.search(multi_bw_min=[2]) 1470 [92.0, 101.0, 136.0, 158.0] 1471 >>> model = MGWR(coords, y, X, selector, fixed=False, kernel='bisquare', sigma2_v1=True) 1472 >>> results = model.fit() 1473 >>> print(results.params.shape) 1474 (159, 4) 1475 1476 """ 1477 1478 def __init__(self, coords, y, X, selector, sigma2_v1=True, 1479 kernel='bisquare', fixed=False, constant=True, 1480 spherical=False, hat_matrix=False): 1481 """ 1482 Initialize class 1483 """ 1484 self.selector = selector 1485 self.bws = self.selector.bw[0] #final set of bandwidth 1486 self.bws_history = selector.bw[1] #bws history in backfitting 1487 self.bw_init = self.selector.bw_init #initialization bandiwdth 1488 self.family = Gaussian( 1489 ) # manually set since we only support Gassian MGWR for now 1490 GWR.__init__(self, coords, y, X, self.bw_init, family=self.family, 1491 sigma2_v1=sigma2_v1, kernel=kernel, fixed=fixed, 1492 constant=constant, spherical=spherical, 1493 hat_matrix=hat_matrix) 1494 self.selector = selector 1495 self.sigma2_v1 = sigma2_v1 1496 self.points = None 1497 self.P = None 1498 self.offset = None 1499 self.exog_resid = None 1500 self.exog_scale = None 1501 self_fit_params = None 1502 1503 def _chunk_compute_R(self, chunk_id=0): 1504 """ 1505 Compute MGWR inference by chunks to reduce memory footprint. 1506 """ 1507 n = self.n 1508 k = self.k 1509 n_chunks = self.n_chunks 1510 chunk_size = int(np.ceil(float(n / n_chunks))) 1511 ENP_j = np.zeros(self.k) 1512 CCT = np.zeros((self.n, self.k)) 1513 1514 chunk_index = np.arange(n)[chunk_id * chunk_size:(chunk_id + 1) * 1515 chunk_size] 1516 init_pR = np.zeros((n, len(chunk_index))) 1517 init_pR[chunk_index, :] = np.eye(len(chunk_index)) 1518 pR = np.zeros((n, len(chunk_index), 1519 k)) #partial R: n by chunk_size by k 1520 1521 for i in range(n): 1522 wi = self._build_wi(i, self.bw_init).reshape(-1, 1) 1523 xT = (self.X * wi).T 1524 P = np.linalg.solve(xT.dot(self.X), xT).dot(init_pR).T 1525 pR[i, :, :] = P * self.X[i] 1526 1527 err = init_pR - np.sum(pR, axis=2) #n by chunk_size 1528 1529 for iter_i in range(self.bws_history.shape[0]): 1530 for j in range(k): 1531 pRj_old = pR[:, :, j] + err 1532 Xj = self.X[:, j] 1533 n_chunks_Aj = n_chunks 1534 chunk_size_Aj = int(np.ceil(float(n / n_chunks_Aj))) 1535 for chunk_Aj in range(n_chunks_Aj): 1536 chunk_index_Aj = np.arange(n)[chunk_Aj * chunk_size_Aj:( 1537 chunk_Aj + 1) * chunk_size_Aj] 1538 pAj = np.empty((len(chunk_index_Aj), n)) 1539 for i in range(len(chunk_index_Aj)): 1540 index = chunk_index_Aj[i] 1541 wi = self._build_wi(index, self.bws_history[iter_i, j]) 1542 xw = Xj * wi 1543 pAj[i, :] = Xj[index] / np.sum(xw * Xj) * xw 1544 pR[chunk_index_Aj, :, j] = pAj.dot(pRj_old) 1545 err = pRj_old - pR[:, :, j] 1546 1547 for j in range(k): 1548 CCT[:, j] += ((pR[:, :, j] / self.X[:, j].reshape(-1, 1))**2).sum( 1549 axis=1) 1550 for i in range(len(chunk_index)): 1551 ENP_j += pR[chunk_index[i], i, :] 1552 1553 if self.hat_matrix: 1554 return ENP_j, CCT, pR 1555 return ENP_j, CCT 1556 1557 def fit(self, n_chunks=1, pool=None): 1558 """ 1559 Compute MGWR inference by chunk to reduce memory footprint. 1560 See Li and Fotheringham, 2020, IJGIS. 1561 1562 Parameters 1563 ---------- 1564 1565 n_chunks : integer, optional 1566 A number of chunks parameter to reduce memory usage. 1567 e.g. n_chunks=2 should reduce overall memory usage by 2. 1568 pool : A multiprocessing Pool object to enable parallel fitting; default is None. 1569 1570 Returns 1571 ------- 1572 : MGWRResults 1573 """ 1574 params = self.selector.params 1575 predy = np.sum(self.X * params, axis=1).reshape(-1, 1) 1576 1577 try: 1578 from tqdm.autonotebook import tqdm #progress bar 1579 except ImportError: 1580 1581 def tqdm(x, total=0, 1582 desc=''): #otherwise, just passthrough the range 1583 return x 1584 1585 if pool: 1586 self.n_chunks = pool._processes * n_chunks 1587 rslt = tqdm( 1588 pool.imap(self._chunk_compute_R, range(self.n_chunks)), 1589 total=self.n_chunks, desc='Inference') 1590 else: 1591 self.n_chunks = n_chunks 1592 rslt = map(self._chunk_compute_R, 1593 tqdm(range(self.n_chunks), desc='Inference')) 1594 1595 rslt_list = list(zip(*rslt)) 1596 ENP_j = np.sum(np.array(rslt_list[0]), axis=0) 1597 CCT = np.sum(np.array(rslt_list[1]), axis=0) 1598 1599 w = np.ones(self.n) 1600 if self.hat_matrix: 1601 R = np.hstack(rslt_list[2]) 1602 else: 1603 R = None 1604 return MGWRResults(self, params, predy, CCT, ENP_j, w, R) 1605 1606 1607 def exact_fit(self): 1608 """ 1609 A closed-form solution to MGWR estimates and inference, 1610 the backfitting in self.fit() will converge to this solution. 1611 1612 Note: this would require large memory when n > 5,000. 1613 See Li and Fotheringham, 2020, IJGIS, pg.4. 1614 1615 Returns 1616 ------- 1617 : MGWRResults 1618 """ 1619 1620 P = [] 1621 Q = [] 1622 I = np.eye(self.n) 1623 for j1 in range(self.k): 1624 Aj = GWR(self.coords,self.y,self.X[:,j1].reshape(-1,1),bw=self.bws[j1],hat_matrix=True,constant=False).fit().S 1625 Pj = [] 1626 for j2 in range(self.k): 1627 if j1 == j2: 1628 Pj.append(I) 1629 else: 1630 Pj.append(Aj) 1631 P.append(Pj) 1632 Q.append([Aj]) 1633 1634 P = np.block(P) 1635 Q = np.block(Q) 1636 R = np.linalg.solve(P, Q) 1637 f = R.dot(self.y) 1638 1639 params = f/self.X.T.reshape(-1,1) 1640 params = params.reshape(-1,self.n).T 1641 1642 R = np.stack(np.split(R,self.k),axis=2) 1643 ENP_j = np.trace(R, axis1=0, axis2=1) 1644 predy = np.sum(self.X * params, axis=1).reshape(-1, 1) 1645 w = np.ones(self.n) 1646 1647 CCT = np.zeros((self.n,self.k)) 1648 for j in range(self.k): 1649 CCT[:, j] = ((R[:, :, j] / self.X[:, j].reshape(-1, 1))**2).sum(axis=1) 1650 1651 return MGWRResults(self, params, predy, CCT, ENP_j, w, R) 1652 1653 1654 def predict(self): 1655 ''' 1656 Not implemented. 1657 ''' 1658 raise NotImplementedError('N/A') 1659 1660 1661class MGWRResults(GWRResults): 1662 """ 1663 Class including common properties for a MGWR model. 1664 1665 Parameters 1666 ---------- 1667 model : MGWR object 1668 pointer to MGWR object with estimation parameters 1669 1670 params : array 1671 n*k, estimated coefficients 1672 1673 predy : array 1674 n*1, predicted y values 1675 1676 S : array 1677 n*n, model hat matrix (if MGWR(hat_matrix=True)) 1678 1679 R : array 1680 n*n*k, covariate-specific hat matrices (if MGWR(hat_matrix=True)) 1681 1682 CCT : array 1683 n*k, scaled variance-covariance matrix 1684 1685 w : array 1686 n*1, final weight used for iteratively re-weighted least 1687 sqaures; default is None 1688 1689 Attributes 1690 ---------- 1691 model : GWR Object 1692 points to GWR object for which parameters have been 1693 estimated 1694 1695 params : array 1696 n*k, parameter estimates 1697 1698 predy : array 1699 n*1, predicted value of y 1700 1701 y : array 1702 n*1, dependent variable 1703 1704 X : array 1705 n*k, independent variable, including constant 1706 1707 family : family object 1708 underlying probability model; provides 1709 distribution-specific calculations 1710 1711 n : integer 1712 number of observations 1713 1714 k : integer 1715 number of independent variables 1716 1717 df_model : integer 1718 model degrees of freedom 1719 1720 df_resid : integer 1721 residual degrees of freedom 1722 1723 scale : float 1724 sigma squared used for subsequent computations 1725 1726 w : array 1727 n*1, final weights from iteratively re-weighted least 1728 sqaures routine 1729 1730 resid_response : array 1731 n*1, residuals of the repsonse 1732 1733 resid_ss : scalar 1734 residual sum of sqaures 1735 1736 W : array-like 1737 list of n*n arrays, spatial weights matrices for weighting all 1738 observations from each calibration point: one for each 1739 covariate (k) 1740 1741 S : array 1742 n*n, model hat matrix (if MGWR(hat_matrix=True)) 1743 1744 R : array 1745 n*n*k, covariate-specific hat matrices (if MGWR(hat_matrix=True)) 1746 1747 CCT : array 1748 n*k, scaled variance-covariance matrix 1749 1750 ENP : scalar 1751 effective number of paramters, which depends on 1752 sigma2, for the entire model 1753 1754 ENP_j : array-like 1755 effective number of paramters, which depends on 1756 sigma2, for each covariate in the model 1757 1758 adj_alpha : array 1759 3*1, corrected alpha values to account for multiple 1760 hypothesis testing for the 90%, 95%, and 99% confidence 1761 levels; tvalues with an absolute value larger than the 1762 corrected alpha are considered statistically 1763 significant. 1764 1765 adj_alpha_j : array 1766 k*3, corrected alpha values to account for multiple 1767 hypothesis testing for the 90%, 95%, and 99% confidence 1768 levels; tvalues with an absolute value larger than the 1769 corrected alpha are considered statistically 1770 significant. A set of alpha calues is computed for 1771 each covariate in the model. 1772 1773 tr_S : float 1774 trace of S (hat) matrix 1775 1776 tr_STS : float 1777 trace of STS matrix 1778 1779 R2 : float 1780 R-squared for the entire model (1- RSS/TSS) 1781 1782 adj_R2 : float 1783 adjusted R-squared for the entire model 1784 1785 aic : float 1786 Akaike information criterion 1787 1788 aicc : float 1789 corrected Akaike information criterion to account 1790 to account for model complexity (smaller 1791 bandwidths) 1792 1793 bic : float 1794 Bayesian information criterio 1795 1796 sigma2 : float 1797 sigma squared (residual variance) that has been 1798 corrected to account for the ENP 1799 1800 std_res : array 1801 n*1, standardised residuals 1802 1803 bse : array 1804 n*k, standard errors of parameters (betas) 1805 1806 influ : array 1807 n*1, leading diagonal of S matrix 1808 1809 CooksD : array 1810 n*1, Cook's D 1811 1812 tvalues : array 1813 n*k, local t-statistics 1814 1815 llf : scalar 1816 log-likelihood of the full model; see 1817 pysal.contrib.glm.family for damily-sepcific 1818 log-likelihoods 1819 1820 mu : array 1821 n*, flat one dimensional array of predicted mean 1822 response value from estimator 1823 1824 """ 1825 1826 def __init__(self, model, params, predy, CCT, ENP_j, w, R): 1827 """ 1828 Initialize class 1829 """ 1830 self.ENP_j = ENP_j 1831 self.R = R 1832 GWRResults.__init__(self, model, params, predy, None, CCT, None, w) 1833 if model.hat_matrix: 1834 self.S = np.sum(self.R, axis=2) 1835 self.predy = predy 1836 1837 @cache_readonly 1838 def tr_S(self): 1839 return np.sum(self.ENP_j) 1840 1841 @cache_readonly 1842 def W(self): 1843 Ws = [] 1844 for bw_j in self.model.bws: 1845 W = np.array( 1846 [self.model._build_wi(i, bw_j) for i in range(self.n)]) 1847 Ws.append(W) 1848 return Ws 1849 1850 @cache_readonly 1851 def adj_alpha_j(self): 1852 """ 1853 Corrected alpha (critical) values to account for multiple testing during hypothesis 1854 testing. Includes corrected value for 90% (.1), 95% (.05), and 99% 1855 (.01) confidence levels. Correction comes from: 1856 1857 :cite:`Silva:2016` : da Silva, A. R., & Fotheringham, A. S. (2015). The Multiple Testing Issue in 1858 Geographically Weighted Regression. Geographical Analysis. 1859 1860 """ 1861 alpha = np.array([.1, .05, .001]) 1862 pe = np.array(self.ENP_j).reshape((-1, 1)) 1863 p = 1. 1864 return (alpha * p) / pe 1865 1866 def critical_tval(self, alpha=None): 1867 """ 1868 Utility function to derive the critial t-value based on given alpha 1869 that are needed for hypothesis testing 1870 1871 Parameters 1872 ---------- 1873 alpha : scalar 1874 critical value to determine which tvalues are 1875 associated with statistically significant parameter 1876 estimates. Default to None in which case the adjusted 1877 alpha value at the 95 percent CI is automatically 1878 used. 1879 1880 Returns 1881 ------- 1882 critical : scalar 1883 critical t-val based on alpha 1884 """ 1885 n = self.n 1886 if alpha is not None: 1887 alpha = np.abs(alpha) / 2.0 1888 critical = t.ppf(1 - alpha, n - 1) 1889 else: 1890 alpha = np.abs(self.adj_alpha_j[:, 1]) / 2.0 1891 critical = t.ppf(1 - alpha, n - 1) 1892 return critical 1893 1894 def filter_tvals(self, critical_t=None, alpha=None): 1895 """ 1896 Utility function to set tvalues with an absolute value smaller than the 1897 absolute value of the alpha (critical) value to 0. If critical_t 1898 is supplied than it is used directly to filter. If alpha is provided 1899 than the critical t value will be derived and used to filter. If neither 1900 are critical_t nor alpha are provided, an adjusted alpha at the 95 1901 percent CI will automatically be used to define the critical t-value and 1902 used to filter. If both critical_t and alpha are supplied then the alpha 1903 value will be ignored. 1904 1905 Parameters 1906 ---------- 1907 critical : scalar 1908 critical t-value to determine whether parameters are 1909 statistically significant 1910 1911 alpha : scalar 1912 alpha value to determine which tvalues are 1913 associated with statistically significant parameter 1914 estimates 1915 1916 Returns 1917 ------- 1918 filtered : array 1919 n*k; new set of n tvalues for each of k variables 1920 where absolute tvalues less than the absolute value of 1921 alpha have been set to 0. 1922 """ 1923 n = self.n 1924 if critical_t is not None: 1925 critical = np.array(critical_t) 1926 elif alpha is not None and critical_t is None: 1927 critical = self.critical_tval(alpha=alpha) 1928 elif alpha is None and critical_t is None: 1929 critical = self.critical_tval() 1930 1931 subset = (self.tvalues < critical) & (self.tvalues > -1.0 * critical) 1932 tvalues = self.tvalues.copy() 1933 tvalues[subset] = 0 1934 return tvalues 1935 1936 @cache_readonly 1937 def RSS(self): 1938 raise NotImplementedError( 1939 'Not yet implemented for multiple bandwidths') 1940 1941 @cache_readonly 1942 def TSS(self): 1943 raise NotImplementedError( 1944 'Not yet implemented for multiple bandwidths') 1945 1946 @cache_readonly 1947 def localR2(self): 1948 raise NotImplementedError( 1949 'Not yet implemented for multiple bandwidths') 1950 1951 @cache_readonly 1952 def y_bar(self): 1953 raise NotImplementedError( 1954 'Not yet implemented for multiple bandwidths') 1955 1956 @cache_readonly 1957 def predictions(self): 1958 raise NotImplementedError('Not yet implemented for MGWR') 1959 1960 #Function for getting BWs intervals 1961 def get_bws_intervals(self, selector, level=0.95): 1962 """ 1963 Computes bandwidths confidence intervals (CIs) for MGWR. 1964 The CIs are based on Akaike weights and the bandwidth search algorithm used. 1965 Details are in Li et al. (2020) Annals of AAG 1966 1967 Returns a list of confidence intervals. e.g. [(40, 60), (100, 180), (150, 300)] 1968 1969 """ 1970 intervals = [] 1971 try: 1972 import pandas as pd 1973 except ImportError: 1974 return 1975 1976 for j in range(self.k): 1977 #Get AICcs and associated bw from the last iteration of back-fitting and make a DataFrame 1978 aiccs = pd.DataFrame(list(zip(*selector.sel_hist[-self.k+j]))[1],columns=["aicc"]) 1979 aiccs['bw'] = list(zip(*selector.sel_hist[-self.k+j]))[0] 1980 #Sort DataFrame by the AICc values 1981 aiccs = aiccs.sort_values(by=['aicc']) 1982 #Calculate delta AICc 1983 d_aic_ak = aiccs.aicc - aiccs.aicc.min() 1984 #Calculate AICc weights 1985 w_aic_ak = np.exp(-0.5*d_aic_ak) / np.sum(np.exp(-0.5*d_aic_ak)) 1986 aiccs['w_aic_ak'] = w_aic_ak/np.sum(w_aic_ak) 1987 #Calculate cum. AICc weights 1988 aiccs['cum_w_ak'] = aiccs.w_aic_ak.cumsum() 1989 #Find index where the cum weights above p-val 1990 index = len(aiccs[aiccs.cum_w_ak < level]) + 1 1991 #Get bw boundaries 1992 interval = (aiccs.iloc[:index,:].bw.min(),aiccs.iloc[:index,:].bw.max()) 1993 intervals += [interval] 1994 return intervals 1995 1996 1997 def local_collinearity(self): 1998 """ 1999 Computes several indicators of multicollinearity within a geographically 2000 weighted design matrix, including: 2001 2002 local condition number (n, 1) 2003 local variance-decomposition proportions (n, p) 2004 2005 Returns four arrays with the order and dimensions listed above where n 2006 is the number of locations used as calibrations points and p is the 2007 nubmer of explanatory variables 2008 2009 """ 2010 x = self.X 2011 w = self.W 2012 nvar = x.shape[1] 2013 nrow = self.n 2014 vdp_idx = np.ndarray((nrow, nvar)) 2015 vdp_pi = np.ndarray((nrow, nvar, nvar)) 2016 2017 for i in range(nrow): 2018 xw = np.zeros((x.shape)) 2019 for j in range(nvar): 2020 wi = w[j][i] 2021 sw = np.sum(wi) 2022 wi = wi / sw 2023 xw[:, j] = x[:, j] * wi 2024 2025 sxw = np.sqrt(np.sum(xw**2, axis=0)) 2026 sxw = np.transpose(xw.T / sxw.reshape((nvar, 1))) 2027 svdx = np.linalg.svd(sxw) 2028 vdp_idx[i, ] = svdx[1][0] / svdx[1] 2029 2030 phi = np.dot(svdx[2].T, np.diag(1 / svdx[1])) 2031 phi = np.transpose(phi**2) 2032 pi_ij = phi / np.sum(phi, axis=0) 2033 vdp_pi[i, :, :] = pi_ij 2034 2035 local_CN = vdp_idx[:, nvar - 1].reshape((-1, 1)) 2036 VDP = vdp_pi[:, nvar - 1, :] 2037 2038 return local_CN, VDP 2039 2040 def spatial_variability(self, selector, n_iters=1000, seed=None): 2041 """ 2042 Method to compute a Monte Carlo test of spatial variability for each 2043 estimated coefficient surface. 2044 2045 WARNING: This test is very computationally demanding! 2046 2047 Parameters 2048 ---------- 2049 selector : sel_bw object 2050 should be the sel_bw object used to select a bandwidth 2051 for the gwr model that produced the surfaces that are 2052 being tested for spatial variation 2053 2054 n_iters : int 2055 the number of Monte Carlo iterations to include for 2056 the tests of spatial variability. 2057 2058 seed : int 2059 optional parameter to select a custom seed to ensure 2060 stochastic results are replicable. Default is none 2061 which automatically sets the seed to 5536 2062 2063 Returns 2064 ------- 2065 2066 p values : list 2067 a list of psuedo p-values that correspond to the model 2068 parameter surfaces. Allows us to assess the 2069 probability of obtaining the observed spatial 2070 variation of a given surface by random chance. 2071 2072 2073 """ 2074 temp_sel = copy.deepcopy(selector) 2075 2076 if seed is None: 2077 np.random.seed(5536) 2078 else: 2079 np.random.seed(seed) 2080 2081 search_params = temp_sel.search_params 2082 2083 if self.model.constant: 2084 X = self.X[:, 1:] 2085 else: 2086 X = self.X 2087 2088 init_sd = np.std(self.params, axis=0) 2089 SDs = [] 2090 2091 for x in range(n_iters): 2092 temp_coords = np.random.permutation(self.model.coords) 2093 temp_sel.coords = temp_coords 2094 temp_sel.search(**search_params) 2095 temp_params = temp_sel.params 2096 temp_sd = np.std(temp_params, axis=0) 2097 SDs.append(temp_sd) 2098 2099 p_vals = (np.sum(np.array(SDs) > init_sd, axis=0) / float(n_iters)) 2100 return p_vals 2101 2102 def summary(self): 2103 """ 2104 Print out MGWR summary 2105 """ 2106 summary = summaryModel(self) + summaryGLM(self) + summaryMGWR(self) 2107 print(summary) 2108 return 2109