1'''Partial Regression plot and residual plots to find misspecification 2 3 4Author: Josef Perktold 5License: BSD-3 6Created: 2011-01-23 7 8update 92011-06-05 : start to convert example to usable functions 102011-10-27 : docstrings 11 12''' 13from statsmodels.compat.pandas import Appender 14from statsmodels.compat.python import lrange, lzip 15 16import numpy as np 17import pandas as pd 18from patsy import dmatrix 19 20from statsmodels.genmod.generalized_estimating_equations import GEE 21from statsmodels.genmod.generalized_linear_model import GLM 22from statsmodels.graphics import utils 23from statsmodels.nonparametric.smoothers_lowess import lowess 24from statsmodels.regression.linear_model import GLS, OLS, WLS 25from statsmodels.sandbox.regression.predstd import wls_prediction_std 26from statsmodels.tools.tools import maybe_unwrap_results 27 28from ._regressionplots_doc import ( 29 _plot_added_variable_doc, 30 _plot_ceres_residuals_doc, 31 _plot_influence_doc, 32 _plot_leverage_resid2_doc, 33 _plot_partial_residuals_doc, 34) 35 36__all__ = ['plot_fit', 'plot_regress_exog', 'plot_partregress', 'plot_ccpr', 37 'plot_regress_exog', 'plot_partregress_grid', 'plot_ccpr_grid', 38 'add_lowess', 'abline_plot', 'influence_plot', 39 'plot_leverage_resid2', 'added_variable_resids', 40 'partial_resids', 'ceres_resids', 'plot_added_variable', 41 'plot_partial_residuals', 'plot_ceres_residuals'] 42 43#TODO: consider moving to influence module 44def _high_leverage(results): 45 #TODO: replace 1 with k_constant 46 return 2. * (results.df_model + 1)/results.nobs 47 48 49def add_lowess(ax, lines_idx=0, frac=.2, **lowess_kwargs): 50 """ 51 Add Lowess line to a plot. 52 53 Parameters 54 ---------- 55 ax : AxesSubplot 56 The Axes to which to add the plot 57 lines_idx : int 58 This is the line on the existing plot to which you want to add 59 a smoothed lowess line. 60 frac : float 61 The fraction of the points to use when doing the lowess fit. 
62 lowess_kwargs 63 Additional keyword arguments are passes to lowess. 64 65 Returns 66 ------- 67 Figure 68 The figure that holds the instance. 69 """ 70 y0 = ax.get_lines()[lines_idx]._y 71 x0 = ax.get_lines()[lines_idx]._x 72 lres = lowess(y0, x0, frac=frac, **lowess_kwargs) 73 ax.plot(lres[:, 0], lres[:, 1], 'r', lw=1.5) 74 return ax.figure 75 76 77def plot_fit(results, exog_idx, y_true=None, ax=None, vlines=True, **kwargs): 78 """ 79 Plot fit against one regressor. 80 81 This creates one graph with the scatterplot of observed values 82 compared to fitted values. 83 84 Parameters 85 ---------- 86 results : Results 87 A result instance with resid, model.endog and model.exog as 88 attributes. 89 exog_idx : {int, str} 90 Name or index of regressor in exog matrix. 91 y_true : array_like. optional 92 If this is not None, then the array is added to the plot. 93 ax : AxesSubplot, optional 94 If given, this subplot is used to plot in instead of a new figure being 95 created. 96 vlines : bool, optional 97 If this not True, then the uncertainty of the fit is not 98 plotted. 99 **kwargs 100 The keyword arguments are passed to the plot command for the fitted 101 values points. 102 103 Returns 104 ------- 105 Figure 106 If `ax` is None, the created figure. Otherwise the figure to which 107 `ax` is connected. 
108 109 Examples 110 -------- 111 Load the Statewide Crime data set and perform linear regression with 112 `poverty` and `hs_grad` as variables and `murder` as the response 113 114 >>> import statsmodels.api as sm 115 >>> import matplotlib.pyplot as plt 116 117 >>> data = sm.datasets.statecrime.load_pandas().data 118 >>> murder = data['murder'] 119 >>> X = data[['poverty', 'hs_grad']] 120 121 >>> X["constant"] = 1 122 >>> y = murder 123 >>> model = sm.OLS(y, X) 124 >>> results = model.fit() 125 126 Create a plot just for the variable 'Poverty': 127 128 >>> fig, ax = plt.subplots() 129 >>> fig = sm.graphics.plot_fit(results, 0, ax=ax) 130 >>> ax.set_ylabel("Murder Rate") 131 >>> ax.set_xlabel("Poverty Level") 132 >>> ax.set_title("Linear Regression") 133 134 >>> plt.show() 135 136 .. plot:: plots/graphics_plot_fit_ex.py 137 """ 138 139 fig, ax = utils.create_mpl_ax(ax) 140 141 exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) 142 results = maybe_unwrap_results(results) 143 144 #maybe add option for wendog, wexog 145 y = results.model.endog 146 x1 = results.model.exog[:, exog_idx] 147 x1_argsort = np.argsort(x1) 148 y = y[x1_argsort] 149 x1 = x1[x1_argsort] 150 151 ax.plot(x1, y, 'bo', label=results.model.endog_names) 152 if y_true is not None: 153 ax.plot(x1, y_true[x1_argsort], 'b-', label='True values') 154 title = 'Fitted values versus %s' % exog_name 155 156 ax.plot(x1, results.fittedvalues[x1_argsort], 'D', color='r', 157 label='fitted', **kwargs) 158 if vlines is True: 159 _, iv_l, iv_u = wls_prediction_std(results) 160 ax.vlines(x1, iv_l[x1_argsort], iv_u[x1_argsort], linewidth=1, 161 color='k', alpha=.7) 162 #ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1, 163 # color='k') 164 ax.set_title(title) 165 ax.set_xlabel(exog_name) 166 ax.set_ylabel(results.model.endog_names) 167 ax.legend(loc='best', numpoints=1) 168 169 return fig 170 171 172def plot_regress_exog(results, exog_idx, fig=None): 173 """Plot regression results 
against one regressor. 174 175 This plots four graphs in a 2 by 2 figure: 'endog versus exog', 176 'residuals versus exog', 'fitted versus exog' and 177 'fitted plus residual versus exog' 178 179 Parameters 180 ---------- 181 results : result instance 182 A result instance with resid, model.endog and model.exog as attributes. 183 exog_idx : int or str 184 Name or index of regressor in exog matrix. 185 fig : Figure, optional 186 If given, this figure is simply returned. Otherwise a new figure is 187 created. 188 189 Returns 190 ------- 191 Figure 192 The value of `fig` if provided. Otherwise a new instance. 193 194 Examples 195 -------- 196 Load the Statewide Crime data set and build a model with regressors 197 including the rate of high school graduation (hs_grad), population in urban 198 areas (urban), households below poverty line (poverty), and single person 199 households (single). Outcome variable is the murder rate (murder). 200 201 Build a 2 by 2 figure based on poverty showing fitted versus actual murder 202 rate, residuals versus the poverty rate, partial regression plot of poverty, 203 and CCPR plot for poverty rate. 204 205 >>> import statsmodels.api as sm 206 >>> import matplotlib.pyplot as plot 207 >>> import statsmodels.formula.api as smf 208 209 >>> fig = plt.figure(figsize=(8, 6)) 210 >>> crime_data = sm.datasets.statecrime.load_pandas() 211 >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single', 212 ... data=crime_data.data).fit() 213 >>> sm.graphics.plot_regress_exog(results, 'poverty', fig=fig) 214 >>> plt.show() 215 216 .. 
plot:: plots/graphics_regression_regress_exog.py 217 """ 218 219 fig = utils.create_mpl_fig(fig) 220 221 exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) 222 results = maybe_unwrap_results(results) 223 224 #maybe add option for wendog, wexog 225 y_name = results.model.endog_names 226 x1 = results.model.exog[:, exog_idx] 227 prstd, iv_l, iv_u = wls_prediction_std(results) 228 229 ax = fig.add_subplot(2, 2, 1) 230 ax.plot(x1, results.model.endog, 'o', color='b', alpha=0.9, label=y_name) 231 ax.plot(x1, results.fittedvalues, 'D', color='r', label='fitted', 232 alpha=.5) 233 ax.vlines(x1, iv_l, iv_u, linewidth=1, color='k', alpha=.7) 234 ax.set_title('Y and Fitted vs. X', fontsize='large') 235 ax.set_xlabel(exog_name) 236 ax.set_ylabel(y_name) 237 ax.legend(loc='best') 238 239 ax = fig.add_subplot(2, 2, 2) 240 ax.plot(x1, results.resid, 'o') 241 ax.axhline(y=0, color='black') 242 ax.set_title('Residuals versus %s' % exog_name, fontsize='large') 243 ax.set_xlabel(exog_name) 244 ax.set_ylabel("resid") 245 246 ax = fig.add_subplot(2, 2, 3) 247 exog_noti = np.ones(results.model.exog.shape[1], bool) 248 exog_noti[exog_idx] = False 249 exog_others = results.model.exog[:, exog_noti] 250 from pandas import Series 251 fig = plot_partregress(results.model.data.orig_endog, 252 Series(x1, name=exog_name, 253 index=results.model.data.row_labels), 254 exog_others, obs_labels=False, ax=ax) 255 ax.set_title('Partial regression plot', fontsize='large') 256 #ax.set_ylabel("Fitted values") 257 #ax.set_xlabel(exog_name) 258 259 ax = fig.add_subplot(2, 2, 4) 260 fig = plot_ccpr(results, exog_idx, ax=ax) 261 ax.set_title('CCPR Plot', fontsize='large') 262 #ax.set_xlabel(exog_name) 263 #ax.set_ylabel("Fitted values + resids") 264 265 fig.suptitle('Regression Plots for %s' % exog_name, fontsize="large") 266 267 fig.tight_layout() 268 269 fig.subplots_adjust(top=.90) 270 return fig 271 272 273def _partial_regression(endog, exog_i, exog_others): 274 """Partial regression. 
275 276 regress endog on exog_i conditional on exog_others 277 278 uses OLS 279 280 Parameters 281 ---------- 282 endog : array_like 283 exog : array_like 284 exog_others : array_like 285 286 Returns 287 ------- 288 res1c : OLS results instance 289 290 (res1a, res1b) : tuple of OLS results instances 291 results from regression of endog on exog_others and of exog_i on 292 exog_others 293 """ 294 #FIXME: This function does not appear to be used. 295 res1a = OLS(endog, exog_others).fit() 296 res1b = OLS(exog_i, exog_others).fit() 297 res1c = OLS(res1a.resid, res1b.resid).fit() 298 299 return res1c, (res1a, res1b) 300 301 302def plot_partregress(endog, exog_i, exog_others, data=None, 303 title_kwargs={}, obs_labels=True, label_kwargs={}, 304 ax=None, ret_coords=False, eval_env=1, **kwargs): 305 """Plot partial regression for a single regressor. 306 307 Parameters 308 ---------- 309 endog : {ndarray, str} 310 The endogenous or response variable. If string is given, you can use a 311 arbitrary translations as with a formula. 312 exog_i : {ndarray, str} 313 The exogenous, explanatory variable. If string is given, you can use a 314 arbitrary translations as with a formula. 315 exog_others : {ndarray, list[str]} 316 Any other exogenous, explanatory variables. If a list of strings is 317 given, each item is a term in formula. You can use a arbitrary 318 translations as with a formula. The effect of these variables will be 319 removed by OLS regression. 320 data : {DataFrame, dict} 321 Some kind of data structure with names if the other variables are 322 given as strings. 323 title_kwargs : dict 324 Keyword arguments to pass on for the title. The key to control the 325 fonts is fontdict. 326 obs_labels : {bool, array_like} 327 Whether or not to annotate the plot points with their observation 328 labels. If obs_labels is a boolean, the point labels will try to do 329 the right thing. First it will try to use the index of data, then 330 fall back to the index of exog_i. 
Alternatively, you may give an 331 array-like object corresponding to the observation numbers. 332 label_kwargs : dict 333 Keyword arguments that control annotate for the observation labels. 334 ax : AxesSubplot, optional 335 If given, this subplot is used to plot in instead of a new figure being 336 created. 337 ret_coords : bool 338 If True will return the coordinates of the points in the plot. You 339 can use this to add your own annotations. 340 eval_env : int 341 Patsy eval environment if user functions and formulas are used in 342 defining endog or exog. 343 **kwargs 344 The keyword arguments passed to plot for the points. 345 346 Returns 347 ------- 348 fig : Figure 349 If `ax` is None, the created figure. Otherwise the figure to which 350 `ax` is connected. 351 coords : list, optional 352 If ret_coords is True, return a tuple of arrays (x_coords, y_coords). 353 354 See Also 355 -------- 356 plot_partregress_grid : Plot partial regression for a set of regressors. 357 358 Notes 359 ----- 360 The slope of the fitted line is the that of `exog_i` in the full 361 multiple regression. The individual points can be used to assess the 362 influence of points on the estimated coefficient. 363 364 Examples 365 -------- 366 Load the Statewide Crime data set and plot partial regression of the rate 367 of high school graduation (hs_grad) on the murder rate(murder). 368 369 The effects of the percent of the population living in urban areas (urban), 370 below the poverty line (poverty) , and in a single person household (single) 371 are removed by OLS regression. 372 373 >>> import statsmodels.api as sm 374 >>> import matplotlib.pyplot as plt 375 376 >>> crime_data = sm.datasets.statecrime.load_pandas() 377 >>> sm.graphics.plot_partregress(endog='murder', exog_i='hs_grad', 378 ... exog_others=['urban', 'poverty', 'single'], 379 ... data=crime_data.data, obs_labels=False) 380 >>> plt.show() 381 382 .. 
plot:: plots/graphics_regression_partregress.py 383 384 More detailed examples can be found in the Regression Plots notebook 385 on the examples page. 386 """ 387 #NOTE: there is no interaction between possible missing data and 388 #obs_labels yet, so this will need to be tweaked a bit for this case 389 fig, ax = utils.create_mpl_ax(ax) 390 print("eval_env:", eval_env) 391 # strings, use patsy to transform to data 392 if isinstance(endog, str): 393 endog = dmatrix(endog + "-1", data, eval_env=eval_env) 394 395 if isinstance(exog_others, str): 396 RHS = dmatrix(exog_others, data, eval_env=eval_env) 397 elif isinstance(exog_others, list): 398 RHS = "+".join(exog_others) 399 RHS = dmatrix(RHS, data, eval_env=eval_env) 400 else: 401 RHS = exog_others 402 RHS_isemtpy = False 403 if isinstance(RHS, np.ndarray) and RHS.size==0: 404 RHS_isemtpy = True 405 elif isinstance(RHS, pd.DataFrame) and RHS.empty: 406 RHS_isemtpy = True 407 if isinstance(exog_i, str): 408 exog_i = dmatrix(exog_i + "-1", data, eval_env=eval_env) 409 410 # all arrays or pandas-like 411 412 if RHS_isemtpy: 413 endog = np.asarray(endog) 414 exog_i = np.asarray(exog_i) 415 ax.plot(endog, exog_i, 'o', **kwargs) 416 fitted_line = OLS(endog, exog_i).fit() 417 x_axis_endog_name = 'x' if isinstance(exog_i, np.ndarray) else exog_i.name 418 y_axis_endog_name = 'y' if isinstance(endog, np.ndarray) else endog.design_info.column_names[0] 419 else: 420 res_yaxis = OLS(endog, RHS).fit() 421 res_xaxis = OLS(exog_i, RHS).fit() 422 xaxis_resid = res_xaxis.resid 423 yaxis_resid = res_yaxis.resid 424 x_axis_endog_name = res_xaxis.model.endog_names 425 y_axis_endog_name = res_yaxis.model.endog_names 426 ax.plot(xaxis_resid, yaxis_resid, 'o', **kwargs) 427 fitted_line = OLS(yaxis_resid, xaxis_resid).fit() 428 429 fig = abline_plot(0, fitted_line.params[0], color='k', ax=ax) 430 431 if x_axis_endog_name == 'y': # for no names regression will just get a y 432 x_axis_endog_name = 'x' # this is misleading, so use x 433 
ax.set_xlabel("e(%s | X)" % x_axis_endog_name) 434 ax.set_ylabel("e(%s | X)" % y_axis_endog_name) 435 ax.set_title('Partial Regression Plot', **title_kwargs) 436 437 # NOTE: if we want to get super fancy, we could annotate if a point is 438 # clicked using this widget 439 # http://stackoverflow.com/questions/4652439/ 440 # is-there-a-matplotlib-equivalent-of-matlabs-datacursormode/ 441 # 4674445#4674445 442 if obs_labels is True: 443 if data is not None: 444 obs_labels = data.index 445 elif hasattr(exog_i, "index"): 446 obs_labels = exog_i.index 447 else: 448 obs_labels = res_xaxis.model.data.row_labels 449 #NOTE: row_labels can be None. 450 #Maybe we should fix this to never be the case. 451 if obs_labels is None: 452 obs_labels = lrange(len(exog_i)) 453 454 if obs_labels is not False: # could be array_like 455 if len(obs_labels) != len(exog_i): 456 raise ValueError("obs_labels does not match length of exog_i") 457 label_kwargs.update(dict(ha="center", va="bottom")) 458 ax = utils.annotate_axes(lrange(len(obs_labels)), obs_labels, 459 lzip(res_xaxis.resid, res_yaxis.resid), 460 [(0, 5)] * len(obs_labels), "x-large", ax=ax, 461 **label_kwargs) 462 463 if ret_coords: 464 return fig, (res_xaxis.resid, res_yaxis.resid) 465 else: 466 return fig 467 468 469def plot_partregress_grid(results, exog_idx=None, grid=None, fig=None): 470 """ 471 Plot partial regression for a set of regressors. 472 473 Parameters 474 ---------- 475 results : Results instance 476 A regression model results instance. 477 exog_idx : {None, list[int], list[str]} 478 The indices or column names of the exog used in the plot, default is 479 all. 480 grid : {None, tuple[int]} 481 If grid is given, then it is used for the arrangement of the subplots. 482 The format of grid is (nrows, ncols). If grid is None, then ncol is 483 one, if there are only 2 subplots, and the number of columns is two 484 otherwise. 485 fig : Figure, optional 486 If given, this figure is simply returned. 
Otherwise a new figure is 487 created. 488 489 Returns 490 ------- 491 Figure 492 If `fig` is None, the created figure. Otherwise `fig` itself. 493 494 See Also 495 -------- 496 plot_partregress : Plot partial regression for a single regressor. 497 plot_ccpr : Plot CCPR against one regressor 498 499 Notes 500 ----- 501 A subplot is created for each explanatory variable given by exog_idx. 502 The partial regression plot shows the relationship between the response 503 and the given explanatory variable after removing the effect of all other 504 explanatory variables in exog. 505 506 References 507 ---------- 508 See http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/partregr.htm 509 510 Examples 511 -------- 512 Using the state crime dataset separately plot the effect of the each 513 variable on the on the outcome, murder rate while accounting for the effect 514 of all other variables in the model visualized with a grid of partial 515 regression plots. 516 517 >>> from statsmodels.graphics.regressionplots import plot_partregress_grid 518 >>> import statsmodels.api as sm 519 >>> import matplotlib.pyplot as plt 520 >>> import statsmodels.formula.api as smf 521 522 >>> fig = plt.figure(figsize=(8, 6)) 523 >>> crime_data = sm.datasets.statecrime.load_pandas() 524 >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single', 525 ... data=crime_data.data).fit() 526 >>> plot_partregress_grid(results, fig=fig) 527 >>> plt.show() 528 529 .. 
plot:: plots/graphics_regression_partregress_grid.py 530 """ 531 import pandas 532 fig = utils.create_mpl_fig(fig) 533 534 exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) 535 536 # TODO: maybe add option for using wendog, wexog instead 537 y = pandas.Series(results.model.endog, name=results.model.endog_names) 538 exog = results.model.exog 539 540 k_vars = exog.shape[1] 541 # this function does not make sense if k_vars=1 542 543 nrows = (len(exog_idx) + 1) // 2 544 ncols = 1 if nrows == len(exog_idx) else 2 545 if grid is not None: 546 nrows, ncols = grid 547 if ncols > 1: 548 title_kwargs = {"fontdict": {"fontsize": 'small'}} 549 550 # for indexing purposes 551 other_names = np.array(results.model.exog_names) 552 for i, idx in enumerate(exog_idx): 553 others = lrange(k_vars) 554 others.pop(idx) 555 exog_others = pandas.DataFrame(exog[:, others], 556 columns=other_names[others]) 557 ax = fig.add_subplot(nrows, ncols, i + 1) 558 plot_partregress(y, pandas.Series(exog[:, idx], 559 name=other_names[idx]), 560 exog_others, ax=ax, title_kwargs=title_kwargs, 561 obs_labels=False) 562 ax.set_title("") 563 564 fig.suptitle("Partial Regression Plot", fontsize="large") 565 fig.tight_layout() 566 fig.subplots_adjust(top=.95) 567 568 return fig 569 570 571def plot_ccpr(results, exog_idx, ax=None): 572 """ 573 Plot CCPR against one regressor. 574 575 Generates a component and component-plus-residual (CCPR) plot. 576 577 Parameters 578 ---------- 579 results : result instance 580 A regression results instance. 581 exog_idx : {int, str} 582 Exogenous, explanatory variable. If string is given, it should 583 be the variable name that you want to use, and you can use arbitrary 584 translations as with a formula. 585 ax : AxesSubplot, optional 586 If given, it is used to plot in instead of a new figure being 587 created. 588 589 Returns 590 ------- 591 Figure 592 If `ax` is None, the created figure. Otherwise the figure to which 593 `ax` is connected. 
594 595 See Also 596 -------- 597 plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot grid. 598 599 Notes 600 ----- 601 The CCPR plot provides a way to judge the effect of one regressor on the 602 response variable by taking into account the effects of the other 603 independent variables. The partial residuals plot is defined as 604 Residuals + B_i*X_i versus X_i. The component adds the B_i*X_i versus 605 X_i to show where the fitted line would lie. Care should be taken if X_i 606 is highly correlated with any of the other independent variables. If this 607 is the case, the variance evident in the plot will be an underestimate of 608 the true variance. 609 610 References 611 ---------- 612 http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm 613 614 Examples 615 -------- 616 Using the state crime dataset plot the effect of the rate of single 617 households ('single') on the murder rate while accounting for high school 618 graduation rate ('hs_grad'), percentage of people in an urban area, and rate 619 of poverty ('poverty'). 620 621 >>> import statsmodels.api as sm 622 >>> import matplotlib.pyplot as plot 623 >>> import statsmodels.formula.api as smf 624 625 >>> crime_data = sm.datasets.statecrime.load_pandas() 626 >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single', 627 ... data=crime_data.data).fit() 628 >>> sm.graphics.plot_ccpr(results, 'single') 629 >>> plt.show() 630 631 .. 
plot:: plots/graphics_regression_ccpr.py 632 """ 633 fig, ax = utils.create_mpl_ax(ax) 634 635 exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) 636 results = maybe_unwrap_results(results) 637 638 x1 = results.model.exog[:, exog_idx] 639 #namestr = ' for %s' % self.name if self.name else '' 640 x1beta = x1*results.params[exog_idx] 641 ax.plot(x1, x1beta + results.resid, 'o') 642 from statsmodels.tools.tools import add_constant 643 mod = OLS(x1beta, add_constant(x1)).fit() 644 params = mod.params 645 fig = abline_plot(*params, **dict(ax=ax)) 646 #ax.plot(x1, x1beta, '-') 647 ax.set_title('Component and component plus residual plot') 648 ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx)) 649 ax.set_xlabel("%s" % exog_name) 650 651 return fig 652 653 654def plot_ccpr_grid(results, exog_idx=None, grid=None, fig=None): 655 """ 656 Generate CCPR plots against a set of regressors, plot in a grid. 657 658 Generates a grid of component and component-plus-residual (CCPR) plots. 659 660 Parameters 661 ---------- 662 results : result instance 663 A results instance with exog and params. 664 exog_idx : None or list of int 665 The indices or column names of the exog used in the plot. 666 grid : None or tuple of int (nrows, ncols) 667 If grid is given, then it is used for the arrangement of the subplots. 668 If grid is None, then ncol is one, if there are only 2 subplots, and 669 the number of columns is two otherwise. 670 fig : Figure, optional 671 If given, this figure is simply returned. Otherwise a new figure is 672 created. 673 674 Returns 675 ------- 676 Figure 677 If `ax` is None, the created figure. Otherwise the figure to which 678 `ax` is connected. 679 680 See Also 681 -------- 682 plot_ccpr : Creates CCPR plot for a single regressor. 
683 684 Notes 685 ----- 686 Partial residual plots are formed as:: 687 688 Res + Betahat(i)*Xi versus Xi 689 690 and CCPR adds:: 691 692 Betahat(i)*Xi versus Xi 693 694 References 695 ---------- 696 See http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm 697 698 Examples 699 -------- 700 Using the state crime dataset separately plot the effect of the each 701 variable on the on the outcome, murder rate while accounting for the effect 702 of all other variables in the model. 703 704 >>> import statsmodels.api as sm 705 >>> import matplotlib.pyplot as plt 706 >>> import statsmodels.formula.api as smf 707 708 >>> fig = plt.figure(figsize=(8, 8)) 709 >>> crime_data = sm.datasets.statecrime.load_pandas() 710 >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single', 711 ... data=crime_data.data).fit() 712 >>> sm.graphics.plot_ccpr_grid(results, fig=fig) 713 >>> plt.show() 714 715 .. plot:: plots/graphics_regression_ccpr_grid.py 716 """ 717 fig = utils.create_mpl_fig(fig) 718 719 exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) 720 721 if grid is not None: 722 nrows, ncols = grid 723 else: 724 if len(exog_idx) > 2: 725 nrows = int(np.ceil(len(exog_idx)/2.)) 726 ncols = 2 727 else: 728 nrows = len(exog_idx) 729 ncols = 1 730 731 seen_constant = 0 732 for i, idx in enumerate(exog_idx): 733 if results.model.exog[:, idx].var() == 0: 734 seen_constant = 1 735 continue 736 737 ax = fig.add_subplot(nrows, ncols, i+1-seen_constant) 738 fig = plot_ccpr(results, exog_idx=idx, ax=ax) 739 ax.set_title("") 740 741 fig.suptitle("Component-Component Plus Residual Plot", fontsize="large") 742 743 fig.tight_layout() 744 745 fig.subplots_adjust(top=.95) 746 return fig 747 748 749def abline_plot(intercept=None, slope=None, horiz=None, vert=None, 750 model_results=None, ax=None, **kwargs): 751 """ 752 Plot a line given an intercept and slope. 753 754 Parameters 755 ---------- 756 intercept : float 757 The intercept of the line. 
758 slope : float 759 The slope of the line. 760 horiz : float or array_like 761 Data for horizontal lines on the y-axis. 762 vert : array_like 763 Data for verterical lines on the x-axis. 764 model_results : statsmodels results instance 765 Any object that has a two-value `params` attribute. Assumed that it 766 is (intercept, slope). 767 ax : axes, optional 768 Matplotlib axes instance. 769 **kwargs 770 Options passed to matplotlib.pyplot.plt. 771 772 Returns 773 ------- 774 Figure 775 The figure given by `ax.figure` or a new instance. 776 777 Examples 778 -------- 779 >>> import numpy as np 780 >>> import statsmodels.api as sm 781 782 >>> np.random.seed(12345) 783 >>> X = sm.add_constant(np.random.normal(0, 20, size=30)) 784 >>> y = np.dot(X, [25, 3.5]) + np.random.normal(0, 30, size=30) 785 >>> mod = sm.OLS(y,X).fit() 786 >>> fig = sm.graphics.abline_plot(model_results=mod) 787 >>> ax = fig.axes[0] 788 >>> ax.scatter(X[:,1], y) 789 >>> ax.margins(.1) 790 >>> import matplotlib.pyplot as plt 791 >>> plt.show() 792 793 .. 
plot:: plots/graphics_regression_abline.py 794 """ 795 if ax is not None: # get axis limits first thing, do not change these 796 x = ax.get_xlim() 797 else: 798 x = None 799 800 fig, ax = utils.create_mpl_ax(ax) 801 802 if model_results: 803 intercept, slope = model_results.params 804 if x is None: 805 x = [model_results.model.exog[:, 1].min(), 806 model_results.model.exog[:, 1].max()] 807 else: 808 if not (intercept is not None and slope is not None): 809 raise ValueError("specify slope and intercepty or model_results") 810 if x is None: 811 x = ax.get_xlim() 812 813 data_y = [x[0]*slope+intercept, x[1]*slope+intercept] 814 ax.set_xlim(x) 815 #ax.set_ylim(y) 816 817 from matplotlib.lines import Line2D 818 819 class ABLine2D(Line2D): 820 def __init__(self, *args, **kwargs): 821 super(ABLine2D, self).__init__(*args, **kwargs) 822 self.id_xlim_callback = None 823 self.id_ylim_callback = None 824 825 def remove(self): 826 ax = self.axes 827 if self.id_xlim_callback: 828 ax.callbacks.disconnect(self.id_xlim_callback) 829 if self.id_ylim_callback: 830 ax.callbacks.disconnect(self.id_ylim_callback) 831 super(ABLine2D, self).remove() 832 833 def update_datalim(self, ax): 834 ax.set_autoscale_on(False) 835 children = ax.get_children() 836 ablines = [child for child in children if child is self] 837 abline = ablines[0] 838 x = ax.get_xlim() 839 y = [x[0] * slope + intercept, x[1] * slope + intercept] 840 abline.set_data(x, y) 841 ax.figure.canvas.draw() 842 843 # TODO: how to intercept something like a margins call and adjust? 
844 line = ABLine2D(x, data_y, **kwargs) 845 ax.add_line(line) 846 line.id_xlim_callback = ax.callbacks.connect('xlim_changed', line.update_datalim) 847 line.id_ylim_callback = ax.callbacks.connect('ylim_changed', line.update_datalim) 848 849 if horiz: 850 ax.hline(horiz) 851 if vert: 852 ax.vline(vert) 853 return fig 854 855 856@Appender(_plot_influence_doc.format(**{ 857 'extra_params_doc': "results: object\n" 858 " Results for a fitted regression model.\n" 859 " influence: instance\n" 860 " The instance of Influence for model."})) 861def _influence_plot(results, influence, external=True, alpha=.05, 862 criterion="cooks", size=48, plot_alpha=.75, ax=None, 863 **kwargs): 864 infl = influence 865 fig, ax = utils.create_mpl_ax(ax) 866 867 if criterion.lower().startswith('coo'): 868 psize = infl.cooks_distance[0] 869 elif criterion.lower().startswith('dff'): 870 psize = np.abs(infl.dffits[0]) 871 else: 872 raise ValueError("Criterion %s not understood" % criterion) 873 874 # scale the variables 875 #TODO: what is the correct scaling and the assumption here? 
876 #we want plots to be comparable across different plots 877 #so we would need to use the expected distribution of criterion probably 878 old_range = np.ptp(psize) 879 new_range = size**2 - 8**2 880 881 psize = (psize - psize.min()) * new_range/old_range + 8**2 882 883 leverage = infl.hat_matrix_diag 884 if external: 885 resids = infl.resid_studentized_external 886 else: 887 resids = infl.resid_studentized 888 889 from scipy import stats 890 891 cutoff = stats.t.ppf(1.-alpha/2, results.df_resid) 892 large_resid = np.abs(resids) > cutoff 893 large_leverage = leverage > _high_leverage(results) 894 large_points = np.logical_or(large_resid, large_leverage) 895 896 ax.scatter(leverage, resids, s=psize, alpha=plot_alpha) 897 898 # add point labels 899 labels = results.model.data.row_labels 900 if labels is None: 901 labels = lrange(len(resids)) 902 ax = utils.annotate_axes(np.where(large_points)[0], labels, 903 lzip(leverage, resids), 904 lzip(-(psize/2)**.5, (psize/2)**.5), "x-large", 905 ax) 906 907 # TODO: make configurable or let people do it ex-post? 
908 font = {"fontsize": 16, "color": "black"} 909 ax.set_ylabel("Studentized Residuals", **font) 910 ax.set_xlabel("H Leverage", **font) 911 ax.set_title("Influence Plot", **font) 912 return fig 913 914 915@Appender(_plot_influence_doc.format(**{ 916 'extra_params_doc': "results : Results\n" 917 " Results for a fitted regression model."})) 918def influence_plot(results, external=True, alpha=.05, criterion="cooks", 919 size=48, plot_alpha=.75, ax=None, **kwargs): 920 921 infl = results.get_influence() 922 res = _influence_plot(results, infl, external=external, alpha=alpha, 923 criterion=criterion, size=size, 924 plot_alpha=plot_alpha, ax=ax, **kwargs) 925 return res 926 927 928@Appender(_plot_leverage_resid2_doc.format({ 929 'extra_params_doc': "results: object\n" 930 " Results for a fitted regression model\n" 931 "influence: instance\n" 932 " instance of Influence for model"})) 933def _plot_leverage_resid2(results, influence, alpha=.05, ax=None, 934 **kwargs): 935 936 from scipy.stats import norm, zscore 937 fig, ax = utils.create_mpl_ax(ax) 938 939 infl = influence 940 leverage = infl.hat_matrix_diag 941 resid = zscore(infl.resid) 942 ax.plot(resid**2, leverage, 'o', **kwargs) 943 ax.set_xlabel("Normalized residuals**2") 944 ax.set_ylabel("Leverage") 945 ax.set_title("Leverage vs. Normalized residuals squared") 946 947 large_leverage = leverage > _high_leverage(results) 948 #norm or t here if standardized? 
    # Two-sided normal cutoff for the z-scored residuals; points beyond it or
    # beyond the leverage rule of thumb are annotated.
    cutoff = norm.ppf(1.-alpha/2)
    large_resid = np.abs(resid) > cutoff
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(int(results.nobs))
    index = np.where(np.logical_or(large_leverage, large_resid))[0]
    ax = utils.annotate_axes(index, labels, lzip(resid**2, leverage),
                             [(0, 5)]*int(results.nobs), "large",
                             ax=ax, ha="center", va="bottom")
    ax.margins(.075, .075)
    return fig


@Appender(_plot_leverage_resid2_doc.format({
    'extra_params_doc': "results : object\n"
                        "    Results for a fitted regression model"}))
def plot_leverage_resid2(results, alpha=.05, ax=None, **kwargs):

    # Thin public wrapper: compute influence measures and delegate.
    infl = results.get_influence()
    return _plot_leverage_resid2(results, infl, alpha=alpha, ax=ax, **kwargs)



@Appender(_plot_added_variable_doc % {
    'extra_params_doc': "results : object\n"
                        "    Results for a fitted regression model"})
def plot_added_variable(results, focus_exog, resid_type=None,
                        use_glm_weights=True, fit_kwargs=None, ax=None):

    model = results.model

    fig, ax = utils.create_mpl_ax(ax)

    # Residualize endog and the focus regressor on the remaining exog.
    endog_resid, focus_exog_resid =\
        added_variable_resids(results, focus_exog,
                              resid_type=resid_type,
                              use_glm_weights=use_glm_weights,
                              fit_kwargs=fit_kwargs)

    ax.plot(focus_exog_resid, endog_resid, 'o', alpha=0.6)

    ax.set_title('Added variable plot', fontsize='large')

    # focus_exog may be a column name or an integer index.
    if isinstance(focus_exog, str):
        xname = focus_exog
    else:
        xname = model.exog_names[focus_exog]
    ax.set_xlabel(xname, size=15)
    ax.set_ylabel(model.endog_names + " residuals", size=15)

    return fig


@Appender(_plot_partial_residuals_doc % {
    'extra_params_doc': "results : object\n"
                        "    Results for a fitted regression model"})
def plot_partial_residuals(results, focus_exog, ax=None):
    # Docstring attached below

    model = results.model

    # Resolve the focus regressor to (name, column index).
    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    pr = partial_resids(results, focus_exog)
    focus_exog_vals = results.model.exog[:, focus_col]

    fig, ax = utils.create_mpl_ax(ax)
    ax.plot(focus_exog_vals, pr, 'o', alpha=0.6)

    ax.set_title('Partial residuals plot', fontsize='large')

    if isinstance(focus_exog, str):
        xname = focus_exog
    else:
        xname = model.exog_names[focus_exog]
    ax.set_xlabel(xname, size=15)
    ax.set_ylabel("Component plus residual", size=15)

    return fig


@Appender(_plot_ceres_residuals_doc % {
    'extra_params_doc': "results : Results\n"
                        "    Results instance of a fitted regression "
                        "model."})
def plot_ceres_residuals(results, focus_exog, frac=0.66, cond_means=None,
                         ax=None):

    model = results.model

    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    # CERES residuals computed by the module-level helper below.
    presid = ceres_resids(results, focus_exog, frac=frac,
                          cond_means=cond_means)

    focus_exog_vals = model.exog[:, focus_col]

    fig, ax = utils.create_mpl_ax(ax)
    ax.plot(focus_exog_vals, presid, 'o', alpha=0.6)

    ax.set_title('CERES residuals plot', fontsize='large')

    ax.set_xlabel(focus_exog, size=15)
    ax.set_ylabel("Component plus residual", size=15)

    return fig


def ceres_resids(results, focus_exog, frac=0.66, cond_means=None):
    """
    Calculate the CERES residuals (Conditional Expectation Partial
    Residuals) for a fitted model.

    Parameters
    ----------
    results : model results instance
        The fitted model for which the CERES residuals are calculated.
    focus_exog : int
        The column of results.model.exog used as the 'focus variable'.
    frac : float, optional
        Lowess smoothing parameter for estimating the conditional
        means. Not used if `cond_means` is provided.
1071 cond_means : array_like, optional 1072 If provided, the columns of this array are the conditional 1073 means E[exog | focus exog], where exog ranges over some 1074 or all of the columns of exog other than focus exog. If 1075 this is an empty nx0 array, the conditional means are 1076 treated as being zero. If None, the conditional means are 1077 estimated. 1078 1079 Returns 1080 ------- 1081 An array containing the CERES residuals. 1082 1083 Notes 1084 ----- 1085 If `cond_means` is not provided, it is obtained by smoothing each 1086 column of exog (except the focus column) against the focus column. 1087 1088 Currently only supports GLM, GEE, and OLS models. 1089 """ 1090 1091 model = results.model 1092 1093 if not isinstance(model, (GLM, GEE, OLS)): 1094 raise ValueError("ceres residuals not available for %s" % 1095 model.__class__.__name__) 1096 1097 focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model) 1098 1099 # Indices of non-focus columns 1100 ix_nf = range(len(results.params)) 1101 ix_nf = list(ix_nf) 1102 ix_nf.pop(focus_col) 1103 nnf = len(ix_nf) 1104 1105 # Estimate the conditional means if not provided. 1106 if cond_means is None: 1107 1108 # Below we calculate E[x | focus] where x is each column other 1109 # than the focus column. We do not want the intercept when we do 1110 # this so we remove it here. 1111 pexog = model.exog[:, ix_nf] 1112 pexog -= pexog.mean(0) 1113 u, s, vt = np.linalg.svd(pexog, 0) 1114 ii = np.flatnonzero(s > 1e-6) 1115 pexog = u[:, ii] 1116 1117 fcol = model.exog[:, focus_col] 1118 cond_means = np.empty((len(fcol), pexog.shape[1])) 1119 for j in range(pexog.shape[1]): 1120 1121 # Get the fitted values for column i given the other 1122 # columns (skip the intercept). 
1123 y0 = pexog[:, j] 1124 1125 cf = lowess(y0, fcol, frac=frac, return_sorted=False) 1126 1127 cond_means[:, j] = cf 1128 1129 new_exog = np.concatenate((model.exog[:, ix_nf], cond_means), axis=1) 1130 1131 # Refit the model using the adjusted exog values 1132 klass = model.__class__ 1133 init_kwargs = model._get_init_kwds() 1134 new_model = klass(model.endog, new_exog, **init_kwargs) 1135 new_result = new_model.fit() 1136 1137 # The partial residual, with respect to l(x2) (notation of Cook 1998) 1138 presid = model.endog - new_result.fittedvalues 1139 if isinstance(model, (GLM, GEE)): 1140 presid *= model.family.link.deriv(new_result.fittedvalues) 1141 if new_exog.shape[1] > nnf: 1142 presid += np.dot(new_exog[:, nnf:], new_result.params[nnf:]) 1143 1144 return presid 1145 1146def partial_resids(results, focus_exog): 1147 """ 1148 Returns partial residuals for a fitted model with respect to a 1149 'focus predictor'. 1150 1151 Parameters 1152 ---------- 1153 results : results instance 1154 A fitted regression model. 1155 focus col : int 1156 The column index of model.exog with respect to which the 1157 partial residuals are calculated. 1158 1159 Returns 1160 ------- 1161 An array of partial residuals. 1162 1163 References 1164 ---------- 1165 RD Cook and R Croos-Dabrera (1998). Partial residual plots in 1166 generalized linear models. Journal of the American Statistical 1167 Association, 93:442. 1168 """ 1169 1170 # TODO: could be a method of results 1171 # TODO: see Cook et al (1998) for a more general definition 1172 1173 # The calculation follows equation (8) from Cook's paper. 1174 model = results.model 1175 resid = model.endog - results.predict() 1176 1177 if isinstance(model, (GLM, GEE)): 1178 resid *= model.family.link.deriv(results.fittedvalues) 1179 elif isinstance(model, (OLS, GLS, WLS)): 1180 pass # No need to do anything 1181 else: 1182 raise ValueError("Partial residuals for '%s' not implemented." 
1183 % type(model)) 1184 1185 if type(focus_exog) is str: 1186 focus_col = model.exog_names.index(focus_exog) 1187 else: 1188 focus_col = focus_exog 1189 1190 focus_val = results.params[focus_col] * model.exog[:, focus_col] 1191 1192 return focus_val + resid 1193 1194def added_variable_resids(results, focus_exog, resid_type=None, 1195 use_glm_weights=True, fit_kwargs=None): 1196 """ 1197 Residualize the endog variable and a 'focus' exog variable in a 1198 regression model with respect to the other exog variables. 1199 1200 Parameters 1201 ---------- 1202 results : regression results instance 1203 A fitted model including the focus exog and all other 1204 predictors of interest. 1205 focus_exog : {int, str} 1206 The column of results.model.exog or a variable name that is 1207 to be residualized against the other predictors. 1208 resid_type : str 1209 The type of residuals to use for the dependent variable. If 1210 None, uses `resid_deviance` for GLM/GEE and `resid` otherwise. 1211 use_glm_weights : bool 1212 Only used if the model is a GLM or GEE. If True, the 1213 residuals for the focus predictor are computed using WLS, with 1214 the weights obtained from the IRLS calculations for fitting 1215 the GLM. If False, unweighted regression is used. 1216 fit_kwargs : dict, optional 1217 Keyword arguments to be passed to fit when refitting the 1218 model. 1219 1220 Returns 1221 ------- 1222 endog_resid : array_like 1223 The residuals for the original exog 1224 focus_exog_resid : array_like 1225 The residuals for the focus predictor 1226 1227 Notes 1228 ----- 1229 The 'focus variable' residuals are always obtained using linear 1230 regression. 1231 1232 Currently only GLM, GEE, and OLS models are supported. 
1233 """ 1234 1235 model = results.model 1236 if not isinstance(model, (GEE, GLM, OLS)): 1237 raise ValueError("model type %s not supported for added variable residuals" % 1238 model.__class__.__name__) 1239 1240 exog = model.exog 1241 endog = model.endog 1242 1243 focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model) 1244 1245 focus_exog_vals = exog[:, focus_col] 1246 1247 # Default residuals 1248 if resid_type is None: 1249 if isinstance(model, (GEE, GLM)): 1250 resid_type = "resid_deviance" 1251 else: 1252 resid_type = "resid" 1253 1254 ii = range(exog.shape[1]) 1255 ii = list(ii) 1256 ii.pop(focus_col) 1257 reduced_exog = exog[:, ii] 1258 start_params = results.params[ii] 1259 1260 klass = model.__class__ 1261 1262 kwargs = model._get_init_kwds() 1263 new_model = klass(endog, reduced_exog, **kwargs) 1264 args = {"start_params": start_params} 1265 if fit_kwargs is not None: 1266 args.update(fit_kwargs) 1267 new_result = new_model.fit(**args) 1268 if not new_result.converged: 1269 raise ValueError("fit did not converge when calculating added variable residuals") 1270 1271 try: 1272 endog_resid = getattr(new_result, resid_type) 1273 except AttributeError: 1274 raise ValueError("'%s' residual type not available" % resid_type) 1275 1276 import statsmodels.regression.linear_model as lm 1277 1278 if isinstance(model, (GLM, GEE)) and use_glm_weights: 1279 weights = model.family.weights(results.fittedvalues) 1280 if hasattr(model, "data_weights"): 1281 weights = weights * model.data_weights 1282 lm_results = lm.WLS(focus_exog_vals, reduced_exog, weights).fit() 1283 else: 1284 lm_results = lm.OLS(focus_exog_vals, reduced_exog).fit() 1285 focus_exog_resid = lm_results.resid 1286 1287 return endog_resid, focus_exog_resid 1288