1'''Partial Regression plot and residual plots to find misspecification
2
3
4Author: Josef Perktold
5License: BSD-3
6Created: 2011-01-23
7
8update
92011-06-05 : start to convert example to usable functions
102011-10-27 : docstrings
11
12'''
13from statsmodels.compat.pandas import Appender
14from statsmodels.compat.python import lrange, lzip
15
16import numpy as np
17import pandas as pd
18from patsy import dmatrix
19
20from statsmodels.genmod.generalized_estimating_equations import GEE
21from statsmodels.genmod.generalized_linear_model import GLM
22from statsmodels.graphics import utils
23from statsmodels.nonparametric.smoothers_lowess import lowess
24from statsmodels.regression.linear_model import GLS, OLS, WLS
25from statsmodels.sandbox.regression.predstd import wls_prediction_std
26from statsmodels.tools.tools import maybe_unwrap_results
27
28from ._regressionplots_doc import (
29    _plot_added_variable_doc,
30    _plot_ceres_residuals_doc,
31    _plot_influence_doc,
32    _plot_leverage_resid2_doc,
33    _plot_partial_residuals_doc,
34)
35
# Public API of this module.
# NOTE: 'plot_regress_exog' was previously listed twice; the duplicate
# entry has been removed.
__all__ = ['plot_fit', 'plot_regress_exog', 'plot_partregress', 'plot_ccpr',
           'plot_partregress_grid', 'plot_ccpr_grid',
           'add_lowess', 'abline_plot', 'influence_plot',
           'plot_leverage_resid2', 'added_variable_resids',
           'partial_resids', 'ceres_resids', 'plot_added_variable',
           'plot_partial_residuals', 'plot_ceres_residuals']
42
43#TODO: consider moving to influence module
def _high_leverage(results):
    """Return the rule-of-thumb cutoff 2*(k+1)/n for high leverage points."""
    # TODO: replace 1 with k_constant
    n_params = results.df_model + 1
    return 2.0 * n_params / results.nobs
47
48
def add_lowess(ax, lines_idx=0, frac=.2, **lowess_kwargs):
    """
    Add Lowess line to a plot.

    Parameters
    ----------
    ax : AxesSubplot
        The Axes to which to add the plot.
    lines_idx : int
        This is the line on the existing plot to which you want to add
        a smoothed lowess line.
    frac : float
        The fraction of the points to use when doing the lowess fit.
    lowess_kwargs
        Additional keyword arguments are passed to lowess.

    Returns
    -------
    Figure
        The figure that holds the instance.
    """
    # Use the public Line2D accessors instead of the private ``_x``/``_y``
    # attributes, which are a matplotlib implementation detail and may
    # change between versions.
    line = ax.get_lines()[lines_idx]
    y0 = line.get_ydata()
    x0 = line.get_xdata()
    # lowess returns sorted (x, smoothed y) pairs as a 2-column array.
    lres = lowess(y0, x0, frac=frac, **lowess_kwargs)
    ax.plot(lres[:, 0], lres[:, 1], 'r', lw=1.5)
    return ax.figure
75
76
def plot_fit(results, exog_idx, y_true=None, ax=None, vlines=True, **kwargs):
    """
    Plot fit against one regressor.

    This creates one graph with the scatterplot of observed values
    compared to fitted values.

    Parameters
    ----------
    results : Results
        A result instance with resid, model.endog and model.exog as
        attributes.
    exog_idx : {int, str}
        Name or index of regressor in exog matrix.
    y_true : array_like, optional
        If this is not None, then the array is added to the plot.
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    vlines : bool, optional
        If this not True, then the uncertainty of the fit is not
        plotted.
    **kwargs
        The keyword arguments are passed to the plot command for the fitted
        values points.

    Returns
    -------
    Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    Examples
    --------
    Load the Statewide Crime data set and perform linear regression with
    `poverty` and `hs_grad` as variables and `murder` as the response

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt

    >>> data = sm.datasets.statecrime.load_pandas().data
    >>> murder = data['murder']
    >>> X = data[['poverty', 'hs_grad']]

    >>> X["constant"] = 1
    >>> y = murder
    >>> model = sm.OLS(y, X)
    >>> results = model.fit()

    Create a plot just for the variable 'Poverty':

    >>> fig, ax = plt.subplots()
    >>> fig = sm.graphics.plot_fit(results, 0, ax=ax)
    >>> ax.set_ylabel("Murder Rate")
    >>> ax.set_xlabel("Poverty Level")
    >>> ax.set_title("Linear Regression")

    >>> plt.show()

    .. plot:: plots/graphics_plot_fit_ex.py
    """
    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    # maybe add option for wendog, wexog
    # Sort the observations by the chosen regressor so lines/markers are
    # drawn left-to-right.
    endog = results.model.endog
    regressor = results.model.exog[:, exog_idx]
    order = np.argsort(regressor)
    endog = endog[order]
    regressor = regressor[order]

    ax.plot(regressor, endog, 'bo', label=results.model.endog_names)
    if y_true is not None:
        ax.plot(regressor, y_true[order], 'b-', label='True values')

    ax.plot(regressor, results.fittedvalues[order], 'D', color='r',
            label='fitted', **kwargs)
    if vlines is True:
        # Prediction standard errors drawn as vertical uncertainty bars.
        _, iv_l, iv_u = wls_prediction_std(results)
        ax.vlines(regressor, iv_l[order], iv_u[order], linewidth=1,
                  color='k', alpha=.7)

    ax.set_title('Fitted values versus %s' % exog_name)
    ax.set_xlabel(exog_name)
    ax.set_ylabel(results.model.endog_names)
    ax.legend(loc='best', numpoints=1)

    return fig
170
171
def plot_regress_exog(results, exog_idx, fig=None):
    """Plot regression results against one regressor.

    This plots four graphs in a 2 by 2 figure: 'endog versus exog',
    'residuals versus exog', 'fitted versus exog' and
    'fitted plus residual versus exog'

    Parameters
    ----------
    results : result instance
        A result instance with resid, model.endog and model.exog as attributes.
    exog_idx : int or str
        Name or index of regressor in exog matrix.
    fig : Figure, optional
        If given, this figure is simply returned.  Otherwise a new figure is
        created.

    Returns
    -------
    Figure
        The value of `fig` if provided. Otherwise a new instance.

    Examples
    --------
    Load the Statewide Crime data set and build a model with regressors
    including the rate of high school graduation (hs_grad), population in urban
    areas (urban), households below poverty line (poverty), and single person
    households (single).  Outcome variable is the murder rate (murder).

    Build a 2 by 2 figure based on poverty showing fitted versus actual murder
    rate, residuals versus the poverty rate, partial regression plot of poverty,
    and CCPR plot for poverty rate.

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.formula.api as smf

    >>> fig = plt.figure(figsize=(8, 6))
    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> sm.graphics.plot_regress_exog(results, 'poverty', fig=fig)
    >>> plt.show()

    .. plot:: plots/graphics_regression_regress_exog.py
    """

    fig = utils.create_mpl_fig(fig)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    # maybe add option for wendog, wexog
    y_name = results.model.endog_names
    x1 = results.model.exog[:, exog_idx]
    prstd, iv_l, iv_u = wls_prediction_std(results)

    # 1) observed and fitted values with prediction uncertainty bars
    ax = fig.add_subplot(2, 2, 1)
    ax.plot(x1, results.model.endog, 'o', color='b', alpha=0.9, label=y_name)
    ax.plot(x1, results.fittedvalues, 'D', color='r', label='fitted',
            alpha=.5)
    ax.vlines(x1, iv_l, iv_u, linewidth=1, color='k', alpha=.7)
    ax.set_title('Y and Fitted vs. X', fontsize='large')
    ax.set_xlabel(exog_name)
    ax.set_ylabel(y_name)
    ax.legend(loc='best')

    # 2) residuals against the regressor
    ax = fig.add_subplot(2, 2, 2)
    ax.plot(x1, results.resid, 'o')
    ax.axhline(y=0, color='black')
    ax.set_title('Residuals versus %s' % exog_name, fontsize='large')
    ax.set_xlabel(exog_name)
    ax.set_ylabel("resid")

    # 3) partial regression plot: effect of exog_idx after removing the
    #    other regressors
    ax = fig.add_subplot(2, 2, 3)
    exog_noti = np.ones(results.model.exog.shape[1], bool)
    exog_noti[exog_idx] = False
    exog_others = results.model.exog[:, exog_noti]
    # use the module-level pandas import instead of a redundant local one
    fig = plot_partregress(results.model.data.orig_endog,
                           pd.Series(x1, name=exog_name,
                                     index=results.model.data.row_labels),
                           exog_others, obs_labels=False, ax=ax)
    ax.set_title('Partial regression plot', fontsize='large')

    # 4) component-plus-residual (CCPR) plot
    ax = fig.add_subplot(2, 2, 4)
    fig = plot_ccpr(results, exog_idx, ax=ax)
    ax.set_title('CCPR Plot', fontsize='large')

    fig.suptitle('Regression Plots for %s' % exog_name, fontsize="large")

    fig.tight_layout()

    fig.subplots_adjust(top=.90)
    return fig
271
272
def _partial_regression(endog, exog_i, exog_others):
    """Partial regression of endog on exog_i, controlling for exog_others.

    Both endog and exog_i are first residualized against exog_others using
    OLS; the partial regression is the OLS fit of those two residual series.

    Parameters
    ----------
    endog : array_like
    exog_i : array_like
    exog_others : array_like

    Returns
    -------
    res1c : OLS results instance
        Regression of the endog residuals on the exog_i residuals.

    (res1a, res1b) : tuple of OLS results instances
         results from regression of endog on exog_others and of exog_i on
         exog_others
    """
    # FIXME: This function does not appear to be used.
    aux_endog = OLS(endog, exog_others).fit()
    aux_exog = OLS(exog_i, exog_others).fit()
    partial_fit = OLS(aux_endog.resid, aux_exog.resid).fit()
    return partial_fit, (aux_endog, aux_exog)
300
301
def plot_partregress(endog, exog_i, exog_others, data=None,
                     title_kwargs={}, obs_labels=True, label_kwargs={},
                     ax=None, ret_coords=False, eval_env=1, **kwargs):
    """Plot partial regression for a single regressor.

    Parameters
    ----------
    endog : {ndarray, str}
       The endogenous or response variable. If string is given, you can use a
       arbitrary translations as with a formula.
    exog_i : {ndarray, str}
        The exogenous, explanatory variable. If string is given, you can use a
        arbitrary translations as with a formula.
    exog_others : {ndarray, list[str]}
        Any other exogenous, explanatory variables. If a list of strings is
        given, each item is a term in formula. You can use a arbitrary
        translations as with a formula. The effect of these variables will be
        removed by OLS regression.
    data : {DataFrame, dict}
        Some kind of data structure with names if the other variables are
        given as strings.
    title_kwargs : dict
        Keyword arguments to pass on for the title. The key to control the
        fonts is fontdict.
    obs_labels : {bool, array_like}
        Whether or not to annotate the plot points with their observation
        labels. If obs_labels is a boolean, the point labels will try to do
        the right thing. First it will try to use the index of data, then
        fall back to the index of exog_i. Alternatively, you may give an
        array-like object corresponding to the observation numbers.
    label_kwargs : dict
        Keyword arguments that control annotate for the observation labels.
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    ret_coords : bool
        If True will return the coordinates of the points in the plot. You
        can use this to add your own annotations.
    eval_env : int
        Patsy eval environment if user functions and formulas are used in
        defining endog or exog.
    **kwargs
        The keyword arguments passed to plot for the points.

    Returns
    -------
    fig : Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    coords : list, optional
        If ret_coords is True, return a tuple of arrays (x_coords, y_coords).

    See Also
    --------
    plot_partregress_grid : Plot partial regression for a set of regressors.

    Notes
    -----
    The slope of the fitted line is that of `exog_i` in the full
    multiple regression. The individual points can be used to assess the
    influence of points on the estimated coefficient.

    Examples
    --------
    Load the Statewide Crime data set and plot partial regression of the rate
    of high school graduation (hs_grad) on the murder rate(murder).

    The effects of the percent of the population living in urban areas (urban),
    below the poverty line (poverty) , and in a single person household (single)
    are removed by OLS regression.

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt

    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> sm.graphics.plot_partregress(endog='murder', exog_i='hs_grad',
    ...                              exog_others=['urban', 'poverty', 'single'],
    ...                              data=crime_data.data, obs_labels=False)
    >>> plt.show()

    .. plot:: plots/graphics_regression_partregress.py

    More detailed examples can be found in the Regression Plots notebook
    on the examples page.
    """
    # NOTE: there is no interaction between possible missing data and
    # obs_labels yet, so this will need to be tweaked a bit for this case
    fig, ax = utils.create_mpl_ax(ax)

    # strings, use patsy to transform to data
    if isinstance(endog, str):
        endog = dmatrix(endog + "-1", data, eval_env=eval_env)

    if isinstance(exog_others, str):
        RHS = dmatrix(exog_others, data, eval_env=eval_env)
    elif isinstance(exog_others, list):
        RHS = "+".join(exog_others)
        RHS = dmatrix(RHS, data, eval_env=eval_env)
    else:
        RHS = exog_others

    # An empty RHS means there is nothing to partial out; in that case
    # exog_i is fit against endog directly below.
    RHS_is_empty = False
    if isinstance(RHS, np.ndarray) and RHS.size == 0:
        RHS_is_empty = True
    elif isinstance(RHS, pd.DataFrame) and RHS.empty:
        RHS_is_empty = True

    if isinstance(exog_i, str):
        exog_i = dmatrix(exog_i + "-1", data, eval_env=eval_env)

    # all arrays or pandas-like
    if RHS_is_empty:
        endog = np.asarray(endog)
        exog_i = np.asarray(exog_i)
        ax.plot(endog, exog_i, 'o', **kwargs)
        fitted_line = OLS(endog, exog_i).fit()
        x_axis_endog_name = 'x' if isinstance(exog_i, np.ndarray) else exog_i.name
        y_axis_endog_name = 'y' if isinstance(endog, np.ndarray) else endog.design_info.column_names[0]
    else:
        # residualize both variables against the other regressors, then
        # regress residuals on residuals to get the partial slope
        res_yaxis = OLS(endog, RHS).fit()
        res_xaxis = OLS(exog_i, RHS).fit()
        xaxis_resid = res_xaxis.resid
        yaxis_resid = res_yaxis.resid
        x_axis_endog_name = res_xaxis.model.endog_names
        y_axis_endog_name = res_yaxis.model.endog_names
        ax.plot(xaxis_resid, yaxis_resid, 'o', **kwargs)
        fitted_line = OLS(yaxis_resid, xaxis_resid).fit()

    fig = abline_plot(0, fitted_line.params[0], color='k', ax=ax)

    if x_axis_endog_name == 'y':  # for no names regression will just get a y
        x_axis_endog_name = 'x'  # this is misleading, so use x
    ax.set_xlabel("e(%s | X)" % x_axis_endog_name)
    ax.set_ylabel("e(%s | X)" % y_axis_endog_name)
    ax.set_title('Partial Regression Plot', **title_kwargs)

    # NOTE: if we want to get super fancy, we could annotate if a point is
    # clicked using this widget
    # http://stackoverflow.com/questions/4652439/
    # is-there-a-matplotlib-equivalent-of-matlabs-datacursormode/
    # 4674445#4674445
    if obs_labels is True:
        if data is not None:
            obs_labels = data.index
        elif hasattr(exog_i, "index"):
            obs_labels = exog_i.index
        else:
            obs_labels = res_xaxis.model.data.row_labels
        # NOTE: row_labels can be None.
        # Maybe we should fix this to never be the case.
        if obs_labels is None:
            obs_labels = lrange(len(exog_i))

    if obs_labels is not False:  # could be array_like
        if len(obs_labels) != len(exog_i):
            raise ValueError("obs_labels does not match length of exog_i")
        # Build a fresh dict so neither the caller's dict nor the shared
        # mutable default argument is mutated.
        label_kwargs = dict(label_kwargs, ha="center", va="bottom")
        ax = utils.annotate_axes(lrange(len(obs_labels)), obs_labels,
                                 lzip(res_xaxis.resid, res_yaxis.resid),
                                 [(0, 5)] * len(obs_labels), "x-large", ax=ax,
                                 **label_kwargs)

    if ret_coords:
        return fig, (res_xaxis.resid, res_yaxis.resid)
    else:
        return fig
467
468
def plot_partregress_grid(results, exog_idx=None, grid=None, fig=None):
    """
    Plot partial regression for a set of regressors.

    Parameters
    ----------
    results : Results instance
        A regression model results instance.
    exog_idx : {None, list[int], list[str]}
        The indices  or column names of the exog used in the plot, default is
        all.
    grid : {None, tuple[int]}
        If grid is given, then it is used for the arrangement of the subplots.
        The format of grid is  (nrows, ncols). If grid is None, then ncol is
        one, if there are only 2 subplots, and the number of columns is two
        otherwise.
    fig : Figure, optional
        If given, this figure is simply returned.  Otherwise a new figure is
        created.

    Returns
    -------
    Figure
        If `fig` is None, the created figure.  Otherwise `fig` itself.

    See Also
    --------
    plot_partregress : Plot partial regression for a single regressor.
    plot_ccpr : Plot CCPR against one regressor

    Notes
    -----
    A subplot is created for each explanatory variable given by exog_idx.
    The partial regression plot shows the relationship between the response
    and the given explanatory variable after removing the effect of all other
    explanatory variables in exog.

    References
    ----------
    See http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/partregr.htm

    Examples
    --------
    Using the state crime dataset separately plot the effect of the each
    variable on the on the outcome, murder rate while accounting for the effect
    of all other variables in the model visualized with a grid of partial
    regression plots.

    >>> from statsmodels.graphics.regressionplots import plot_partregress_grid
    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.formula.api as smf

    >>> fig = plt.figure(figsize=(8, 6))
    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> plot_partregress_grid(results, fig=fig)
    >>> plt.show()

    .. plot:: plots/graphics_regression_partregress_grid.py
    """
    fig = utils.create_mpl_fig(fig)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)

    # TODO: maybe add option for using wendog, wexog instead
    # use the module-level pandas import instead of a redundant local one
    y = pd.Series(results.model.endog, name=results.model.endog_names)
    exog = results.model.exog

    k_vars = exog.shape[1]
    # this function does not make sense if k_vars=1

    nrows = (len(exog_idx) + 1) // 2
    ncols = 1 if nrows == len(exog_idx) else 2
    if grid is not None:
        nrows, ncols = grid
    # Always bind title_kwargs: previously it was only assigned when
    # ncols > 1, which raised NameError for single-column layouts.
    title_kwargs = {}
    if ncols > 1:
        title_kwargs = {"fontdict": {"fontsize": 'small'}}

    # for indexing purposes
    other_names = np.array(results.model.exog_names)
    for i, idx in enumerate(exog_idx):
        others = lrange(k_vars)
        others.pop(idx)
        exog_others = pd.DataFrame(exog[:, others],
                                   columns=other_names[others])
        ax = fig.add_subplot(nrows, ncols, i + 1)
        plot_partregress(y, pd.Series(exog[:, idx],
                                      name=other_names[idx]),
                         exog_others, ax=ax, title_kwargs=title_kwargs,
                         obs_labels=False)
        ax.set_title("")

    fig.suptitle("Partial Regression Plot", fontsize="large")
    fig.tight_layout()
    fig.subplots_adjust(top=.95)

    return fig
569
570
def plot_ccpr(results, exog_idx, ax=None):
    """
    Plot CCPR against one regressor.

    Generates a component and component-plus-residual (CCPR) plot.

    Parameters
    ----------
    results : result instance
        A regression results instance.
    exog_idx : {int, str}
        Exogenous, explanatory variable. If string is given, it should
        be the variable name that you want to use, and you can use arbitrary
        translations as with a formula.
    ax : AxesSubplot, optional
        If given, it is used to plot in instead of a new figure being
        created.

    Returns
    -------
    Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot grid.

    Notes
    -----
    The CCPR plot provides a way to judge the effect of one regressor on the
    response variable by taking into account the effects of the other
    independent variables. The partial residuals plot is defined as
    Residuals + B_i*X_i versus X_i. The component adds the B_i*X_i versus
    X_i to show where the fitted line would lie. Care should be taken if X_i
    is highly correlated with any of the other independent variables. If this
    is the case, the variance evident in the plot will be an underestimate of
    the true variance.

    References
    ----------
    http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm

    Examples
    --------
    Using the state crime dataset plot the effect of the rate of single
    households ('single') on the murder rate while accounting for high school
    graduation rate ('hs_grad'), percentage of people in an urban area, and rate
    of poverty ('poverty').

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.formula.api as smf

    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> sm.graphics.plot_ccpr(results, 'single')
    >>> plt.show()

    .. plot:: plots/graphics_regression_ccpr.py
    """
    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    x1 = results.model.exog[:, exog_idx]
    # component: contribution of this regressor, B_i * X_i
    x1beta = x1 * results.params[exog_idx]
    # partial residuals: residuals plus the component
    ax.plot(x1, x1beta + results.resid, 'o')
    from statsmodels.tools.tools import add_constant
    # fit the component on x1 (with constant) to draw the reference line
    mod = OLS(x1beta, add_constant(x1)).fit()
    params = mod.params
    fig = abline_plot(*params, ax=ax)
    ax.set_title('Component and component plus residual plot')
    ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx))
    ax.set_xlabel("%s" % exog_name)

    return fig
652
653
def plot_ccpr_grid(results, exog_idx=None, grid=None, fig=None):
    """
    Generate CCPR plots against a set of regressors, plot in a grid.

    Generates a grid of component and component-plus-residual (CCPR) plots.

    Parameters
    ----------
    results : result instance
        A results instance with exog and params.
    exog_idx : None or list of int
        The indices or column names of the exog used in the plot.
    grid : None or tuple of int (nrows, ncols)
        If grid is given, then it is used for the arrangement of the subplots.
        If grid is None, then ncol is one, if there are only 2 subplots, and
        the number of columns is two otherwise.
    fig : Figure, optional
        If given, this figure is simply returned.  Otherwise a new figure is
        created.

    Returns
    -------
    Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    plot_ccpr : Creates CCPR plot for a single regressor.

    Notes
    -----
    Partial residual plots are formed as::

        Res + Betahat(i)*Xi versus Xi

    and CCPR adds::

        Betahat(i)*Xi versus Xi

    References
    ----------
    See http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm

    Examples
    --------
    Using the state crime dataset separately plot the effect of the each
    variable on the on the outcome, murder rate while accounting for the effect
    of all other variables in the model.

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.formula.api as smf

    >>> fig = plt.figure(figsize=(8, 8))
    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> sm.graphics.plot_ccpr_grid(results, fig=fig)
    >>> plt.show()

    .. plot:: plots/graphics_regression_ccpr_grid.py
    """
    fig = utils.create_mpl_fig(fig)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)

    # Layout: honor an explicit grid; otherwise two columns once there are
    # more than two variables, a single column otherwise.
    if grid is not None:
        nrows, ncols = grid
    elif len(exog_idx) > 2:
        nrows = int(np.ceil(len(exog_idx) / 2.))
        ncols = 2
    else:
        nrows = len(exog_idx)
        ncols = 1

    # A constant column carries no CCPR information; skip it and shift the
    # remaining subplot positions so the grid stays contiguous.
    n_skipped = 0
    for plot_no, idx in enumerate(exog_idx):
        if results.model.exog[:, idx].var() == 0:
            n_skipped = 1
            continue

        ax = fig.add_subplot(nrows, ncols, plot_no + 1 - n_skipped)
        fig = plot_ccpr(results, exog_idx=idx, ax=ax)
        ax.set_title("")

    fig.suptitle("Component-Component Plus Residual Plot", fontsize="large")

    fig.tight_layout()

    fig.subplots_adjust(top=.95)
    return fig
747
748
def abline_plot(intercept=None, slope=None, horiz=None, vert=None,
                model_results=None, ax=None, **kwargs):
    """
    Plot a line given an intercept and slope.

    Parameters
    ----------
    intercept : float
        The intercept of the line.
    slope : float
        The slope of the line.
    horiz : float or array_like
        Data for horizontal lines on the y-axis.
    vert : array_like
        Data for verterical lines on the x-axis.
    model_results : statsmodels results instance
        Any object that has a two-value `params` attribute. Assumed that it
        is (intercept, slope).
    ax : axes, optional
        Matplotlib axes instance.
    **kwargs
        Options passed to matplotlib.pyplot.plt.

    Returns
    -------
    Figure
        The figure given by `ax.figure` or a new instance.

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm

    >>> np.random.seed(12345)
    >>> X = sm.add_constant(np.random.normal(0, 20, size=30))
    >>> y = np.dot(X, [25, 3.5]) + np.random.normal(0, 30, size=30)
    >>> mod = sm.OLS(y,X).fit()
    >>> fig = sm.graphics.abline_plot(model_results=mod)
    >>> ax = fig.axes[0]
    >>> ax.scatter(X[:,1], y)
    >>> ax.margins(.1)
    >>> import matplotlib.pyplot as plt
    >>> plt.show()

    .. plot:: plots/graphics_regression_abline.py
    """
    if ax is not None:  # get axis limits first thing, do not change these
        x = ax.get_xlim()
    else:
        x = None

    fig, ax = utils.create_mpl_ax(ax)

    if model_results:
        intercept, slope = model_results.params
        if x is None:
            x = [model_results.model.exog[:, 1].min(),
                 model_results.model.exog[:, 1].max()]
    else:
        if not (intercept is not None and slope is not None):
            raise ValueError("specify slope and intercept or model_results")
        if x is None:
            x = ax.get_xlim()

    data_y = [x[0]*slope+intercept, x[1]*slope+intercept]
    ax.set_xlim(x)

    from matplotlib.lines import Line2D

    class ABLine2D(Line2D):
        # Line2D subclass that re-spans itself whenever the axes limits
        # change, so the line always crosses the whole plot.

        def __init__(self, *args, **kwargs):
            super(ABLine2D, self).__init__(*args, **kwargs)
            self.id_xlim_callback = None
            self.id_ylim_callback = None

        def remove(self):
            # disconnect limit callbacks before removal to avoid dangling
            # references to this artist
            ax = self.axes
            if self.id_xlim_callback:
                ax.callbacks.disconnect(self.id_xlim_callback)
            if self.id_ylim_callback:
                ax.callbacks.disconnect(self.id_ylim_callback)
            super(ABLine2D, self).remove()

        def update_datalim(self, ax):
            ax.set_autoscale_on(False)
            children = ax.get_children()
            ablines = [child for child in children if child is self]
            abline = ablines[0]
            x = ax.get_xlim()
            y = [x[0] * slope + intercept, x[1] * slope + intercept]
            abline.set_data(x, y)
            ax.figure.canvas.draw()

    # TODO: how to intercept something like a margins call and adjust?
    line = ABLine2D(x, data_y, **kwargs)
    ax.add_line(line)
    line.id_xlim_callback = ax.callbacks.connect('xlim_changed', line.update_datalim)
    line.id_ylim_callback = ax.callbacks.connect('ylim_changed', line.update_datalim)

    # Axes has no `hline`/`vline` methods (the original calls crashed);
    # use axhline/axvline per value, and test against None so that 0 is a
    # valid line position.
    if horiz is not None:
        for h in np.atleast_1d(horiz):
            ax.axhline(h)
    if vert is not None:
        for v in np.atleast_1d(vert):
            ax.axvline(v)
    return fig
854
855
@Appender(_plot_influence_doc.format(**{
    'extra_params_doc': "results: object\n"
                        "        Results for a fitted regression model.\n"
                        "    influence: instance\n"
                        "        The instance of Influence for model."}))
def _influence_plot(results, influence, external=True, alpha=.05,
                    criterion="cooks", size=48, plot_alpha=.75, ax=None,
                    **kwargs):
    # Shared implementation behind ``influence_plot``: scatter of
    # studentized residuals against leverage, with marker area scaled by
    # the chosen influence criterion and extreme points labeled.
    infl = influence
    fig, ax = utils.create_mpl_ax(ax)

    # Prefix matching lets callers pass e.g. "cooks" or "dffits".
    if criterion.lower().startswith('coo'):
        psize = infl.cooks_distance[0]
    elif criterion.lower().startswith('dff'):
        psize = np.abs(infl.dffits[0])
    else:
        raise ValueError("Criterion %s not understood" % criterion)

    # Rescale the criterion values into marker areas in [8**2, size**2].
    # TODO: what is the correct scaling and the assumption here?
    # We want plots to be comparable across different plots, so we would
    # need to use the expected distribution of the criterion probably.
    old_range = np.ptp(psize)
    new_range = size**2 - 8**2
    if old_range == 0:
        # BUG FIX: all criterion values identical previously produced a
        # 0/0 division (NaN marker sizes); use the minimum size instead.
        psize = np.full(np.asarray(psize).shape, 8.**2)
    else:
        psize = (psize - psize.min()) * new_range/old_range + 8**2

    leverage = infl.hat_matrix_diag
    if external:
        resids = infl.resid_studentized_external
    else:
        resids = infl.resid_studentized

    from scipy import stats

    # Two-sided t cutoff for flagging large studentized residuals.
    cutoff = stats.t.ppf(1.-alpha/2, results.df_resid)
    large_resid = np.abs(resids) > cutoff
    large_leverage = leverage > _high_leverage(results)
    large_points = np.logical_or(large_resid, large_leverage)

    # BUG FIX: forward **kwargs to the scatter call; previously they were
    # accepted by the signature but silently ignored.
    ax.scatter(leverage, resids, s=psize, alpha=plot_alpha, **kwargs)

    # Label the influential points, offset by roughly the marker radius.
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(len(resids))
    ax = utils.annotate_axes(np.where(large_points)[0], labels,
                             lzip(leverage, resids),
                             lzip(-(psize/2)**.5, (psize/2)**.5), "x-large",
                             ax)

    # TODO: make configurable or let people do it ex-post?
    font = {"fontsize": 16, "color": "black"}
    ax.set_ylabel("Studentized Residuals", **font)
    ax.set_xlabel("H Leverage", **font)
    ax.set_title("Influence Plot", **font)
    return fig
913
914
@Appender(_plot_influence_doc.format(**{
    'extra_params_doc': "results : Results\n"
                        "        Results for a fitted regression model."}))
def influence_plot(results, external=True, alpha=.05, criterion="cooks",
                   size=48, plot_alpha=.75, ax=None, **kwargs):
    # Public entry point: derive the influence measures from the fitted
    # results and delegate all drawing to ``_influence_plot``.
    return _influence_plot(results, results.get_influence(),
                           external=external, alpha=alpha,
                           criterion=criterion, size=size,
                           plot_alpha=plot_alpha, ax=ax, **kwargs)
926
927
# BUG FIX: the doc template was formatted with a positional dict
# (``.format({...})``); a positional argument cannot fill the named
# ``extra_params_doc`` field.  Pass keyword arguments, consistent with the
# ``_plot_influence_doc.format(**{...})`` usage above.
@Appender(_plot_leverage_resid2_doc.format(**{
    'extra_params_doc': "results: object\n"
                        "    Results for a fitted regression model\n"
                        "influence: instance\n"
                        "    instance of Influence for model"}))
def _plot_leverage_resid2(results, influence, alpha=.05, ax=None,
                          **kwargs):
    # Shared implementation behind ``plot_leverage_resid2``: leverage
    # against normalized squared residuals, with extreme points labeled.
    from scipy.stats import norm, zscore
    fig, ax = utils.create_mpl_ax(ax)

    infl = influence
    leverage = infl.hat_matrix_diag
    # z-score the residuals so the normal cutoff below is meaningful.
    resid = zscore(infl.resid)
    ax.plot(resid**2, leverage, 'o', **kwargs)
    ax.set_xlabel("Normalized residuals**2")
    ax.set_ylabel("Leverage")
    ax.set_title("Leverage vs. Normalized residuals squared")

    large_leverage = leverage > _high_leverage(results)
    # norm or t here if standardized?
    cutoff = norm.ppf(1.-alpha/2)
    large_resid = np.abs(resid) > cutoff
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(int(results.nobs))
    index = np.where(np.logical_or(large_leverage, large_resid))[0]
    ax = utils.annotate_axes(index, labels, lzip(resid**2, leverage),
                             [(0, 5)]*int(results.nobs), "large",
                             ax=ax, ha="center", va="bottom")
    ax.margins(.075, .075)
    return fig
960
961
# BUG FIX: pass the doc-template parameters as keyword arguments (``**``);
# a positional dict cannot fill the named ``extra_params_doc`` field.
@Appender(_plot_leverage_resid2_doc.format(**{
    'extra_params_doc': "results : object\n"
                        "    Results for a fitted regression model"}))
def plot_leverage_resid2(results, alpha=.05, ax=None, **kwargs):
    # Public entry point: compute influence measures and delegate the
    # drawing to ``_plot_leverage_resid2``.
    infl = results.get_influence()
    return _plot_leverage_resid2(results, infl, alpha=alpha, ax=ax, **kwargs)
969
970
971
@Appender(_plot_added_variable_doc % {
    'extra_params_doc': "results : object\n"
                        "    Results for a fitted regression model"})
def plot_added_variable(results, focus_exog, resid_type=None,
                        use_glm_weights=True, fit_kwargs=None, ax=None):
    # Residualize endog and the focus predictor against the remaining
    # predictors, then plot one set of residuals against the other.
    fig, ax = utils.create_mpl_ax(ax)

    endog_resid, focus_exog_resid = added_variable_resids(
        results, focus_exog, resid_type=resid_type,
        use_glm_weights=use_glm_weights, fit_kwargs=fit_kwargs)

    ax.plot(focus_exog_resid, endog_resid, 'o', alpha=0.6)
    ax.set_title('Added variable plot', fontsize='large')

    # Label the x axis with the focus variable's name, whether it was
    # given by name or by column index.
    model = results.model
    if isinstance(focus_exog, str):
        xname = focus_exog
    else:
        xname = model.exog_names[focus_exog]
    ax.set_xlabel(xname, size=15)
    ax.set_ylabel(model.endog_names + " residuals", size=15)

    return fig
1000
1001
@Appender(_plot_partial_residuals_doc % {
    'extra_params_doc': "results : object\n"
                        "    Results for a fitted regression model"})
def plot_partial_residuals(results, focus_exog, ax=None):
    # Plot component-plus-residual values against the focus predictor.
    model = results.model
    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    partial = partial_resids(results, focus_exog)
    xvals = results.model.exog[:, focus_col]

    fig, ax = utils.create_mpl_ax(ax)
    ax.plot(xvals, partial, 'o', alpha=0.6)
    ax.set_title('Partial residuals plot', fontsize='large')

    # ``maybe_name_or_idx`` may hand back either a name or an index.
    if isinstance(focus_exog, str):
        xname = focus_exog
    else:
        xname = model.exog_names[focus_exog]
    ax.set_xlabel(xname, size=15)
    ax.set_ylabel("Component plus residual", size=15)

    return fig
1028
1029
@Appender(_plot_ceres_residuals_doc % {
    'extra_params_doc': "results : Results\n"
                        "        Results instance of a fitted regression "
                        "model."})
def plot_ceres_residuals(results, focus_exog, frac=0.66, cond_means=None,
                         ax=None):
    # Plot CERES residuals against the focus predictor.
    model = results.model
    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    resid_vals = ceres_resids(results, focus_exog, frac=frac,
                              cond_means=cond_means)
    xvals = model.exog[:, focus_col]

    fig, ax = utils.create_mpl_ax(ax)
    ax.plot(xvals, resid_vals, 'o', alpha=0.6)
    ax.set_title('CERES residuals plot', fontsize='large')
    # After ``maybe_name_or_idx`` the focus variable is used directly as
    # the x-axis label.
    ax.set_xlabel(focus_exog, size=15)
    ax.set_ylabel("Component plus residual", size=15)

    return fig
1055
1056
def ceres_resids(results, focus_exog, frac=0.66, cond_means=None):
    """
    Calculate the CERES residuals (Conditional Expectation Partial
    Residuals) for a fitted model.

    Parameters
    ----------
    results : model results instance
        The fitted model for which the CERES residuals are calculated.
    focus_exog : int
        The column of results.model.exog used as the 'focus variable'.
    frac : float, optional
        Lowess smoothing parameter for estimating the conditional
        means.  Not used if `cond_means` is provided.
    cond_means : array_like, optional
        If provided, the columns of this array are the conditional
        means E[exog | focus exog], where exog ranges over some
        or all of the columns of exog other than focus exog.  If
        this is an empty nx0 array, the conditional means are
        treated as being zero.  If None, the conditional means are
        estimated.

    Returns
    -------
    An array containing the CERES residuals.

    Notes
    -----
    If `cond_means` is not provided, it is obtained by smoothing each
    column of exog (except the focus column) against the focus column.

    Currently only supports GLM, GEE, and OLS models.
    """

    model = results.model

    # CERES residuals are only defined here for these model classes.
    if not isinstance(model, (GLM, GEE, OLS)):
        raise ValueError("ceres residuals not available for %s" %
                         model.__class__.__name__)

    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    # Indices of non-focus columns
    ix_nf = range(len(results.params))
    ix_nf = list(ix_nf)
    ix_nf.pop(focus_col)
    nnf = len(ix_nf)

    # Estimate the conditional means if not provided.
    if cond_means is None:

        # Below we calculate E[x | focus] where x is each column other
        # than the focus column.  We do not want the intercept when we do
        # this so we remove it here.
        # (Fancy indexing returns a copy, so the in-place centering below
        # does not modify model.exog.)
        pexog = model.exog[:, ix_nf]
        pexog -= pexog.mean(0)

        # Use the SVD to drop numerically-degenerate directions (e.g. the
        # centered constant column collapses to ~0 singular value) before
        # smoothing each remaining component.
        u, s, vt = np.linalg.svd(pexog, 0)
        ii = np.flatnonzero(s > 1e-6)
        pexog = u[:, ii]

        fcol = model.exog[:, focus_col]
        cond_means = np.empty((len(fcol), pexog.shape[1]))
        for j in range(pexog.shape[1]):

            # Get the fitted values for column i given the other
            # columns (skip the intercept).
            y0 = pexog[:, j]

            # Lowess smooth of component j against the focus column
            # estimates E[x_j | focus].
            cf = lowess(y0, fcol, frac=frac, return_sorted=False)

            cond_means[:, j] = cf

    # Augment the non-focus predictors with the conditional-mean columns.
    new_exog = np.concatenate((model.exog[:, ix_nf], cond_means), axis=1)

    # Refit the model using the adjusted exog values
    klass = model.__class__
    init_kwargs = model._get_init_kwds()
    new_model = klass(model.endog, new_exog, **init_kwargs)
    new_result = new_model.fit()

    # The partial residual, with respect to l(x2) (notation of Cook 1998)
    presid = model.endog - new_result.fittedvalues
    if isinstance(model, (GLM, GEE)):
        # Map the response-scale residual through the link derivative to
        # obtain working-scale residuals.
        presid *= model.family.link.deriv(new_result.fittedvalues)
    if new_exog.shape[1] > nnf:
        # Add back the fitted contribution of the conditional-mean columns.
        presid += np.dot(new_exog[:, nnf:], new_result.params[nnf:])

    return presid
1145
def partial_resids(results, focus_exog):
    """
    Returns partial residuals for a fitted model with respect to a
    'focus predictor'.

    Parameters
    ----------
    results : results instance
        A fitted regression model.
    focus_exog : int or str
        The column index of model.exog, or the name of a column, with
        respect to which the partial residuals are calculated.

    Returns
    -------
    An array of partial residuals.

    References
    ----------
    RD Cook and R Croos-Dabrera (1998).  Partial residual plots in
    generalized linear models.  Journal of the American Statistical
    Association, 93:442.
    """

    # TODO: could be a method of results
    # TODO: see Cook et al (1998) for a more general definition

    # The calculation follows equation (8) from Cook's paper.
    model = results.model
    resid = model.endog - results.predict()

    if isinstance(model, (GLM, GEE)):
        # Working residuals: scale by the link derivative at the fit.
        resid *= model.family.link.deriv(results.fittedvalues)
    elif isinstance(model, (OLS, GLS, WLS)):
        pass  # No need to do anything
    else:
        raise ValueError("Partial residuals for '%s' not implemented."
                         % type(model))

    # IDIOM FIX: use isinstance rather than ``type(...) is str`` so str
    # subclasses are also accepted as variable names.
    if isinstance(focus_exog, str):
        focus_col = model.exog_names.index(focus_exog)
    else:
        focus_col = focus_exog

    # Component-plus-residual: beta_j * x_j + working residual.
    focus_val = results.params[focus_col] * model.exog[:, focus_col]

    return focus_val + resid
1193
def added_variable_resids(results, focus_exog, resid_type=None,
                          use_glm_weights=True, fit_kwargs=None):
    """
    Residualize the endog variable and a 'focus' exog variable in a
    regression model with respect to the other exog variables.

    Parameters
    ----------
    results : regression results instance
        A fitted model including the focus exog and all other
        predictors of interest.
    focus_exog : {int, str}
        The column of results.model.exog or a variable name that is
        to be residualized against the other predictors.
    resid_type : str
        The type of residuals to use for the dependent variable.  If
        None, uses `resid_deviance` for GLM/GEE and `resid` otherwise.
    use_glm_weights : bool
        Only used if the model is a GLM or GEE.  If True, the
        residuals for the focus predictor are computed using WLS, with
        the weights obtained from the IRLS calculations for fitting
        the GLM.  If False, unweighted regression is used.
    fit_kwargs : dict, optional
        Keyword arguments to be passed to fit when refitting the
        model.

    Returns
    -------
    endog_resid : array_like
        The residuals for the original exog
    focus_exog_resid : array_like
        The residuals for the focus predictor

    Raises
    ------
    ValueError
        If the model type is unsupported, the refit does not converge,
        or `resid_type` is not an attribute of the refit results.

    Notes
    -----
    The 'focus variable' residuals are always obtained using linear
    regression.

    Currently only GLM, GEE, and OLS models are supported.
    """

    model = results.model
    if not isinstance(model, (GEE, GLM, OLS)):
        raise ValueError("model type %s not supported for added variable residuals" %
                         model.__class__.__name__)

    exog = model.exog
    endog = model.endog

    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    focus_exog_vals = exog[:, focus_col]

    # Default residuals
    if resid_type is None:
        if isinstance(model, (GEE, GLM)):
            resid_type = "resid_deviance"
        else:
            resid_type = "resid"

    # Drop the focus column and refit on the remaining predictors,
    # warm-starting iterative fits from the original parameter estimates.
    ii = list(range(exog.shape[1]))
    ii.pop(focus_col)
    reduced_exog = exog[:, ii]
    start_params = results.params[ii]

    klass = model.__class__

    kwargs = model._get_init_kwds()
    new_model = klass(endog, reduced_exog, **kwargs)
    args = {"start_params": start_params}
    if fit_kwargs is not None:
        args.update(fit_kwargs)
    new_result = new_model.fit(**args)

    # BUG FIX: linear-regression results do not expose a ``converged``
    # attribute, so the OLS path previously raised AttributeError here.
    # A missing attribute is treated as converged (non-iterative fit).
    if not getattr(new_result, "converged", True):
        raise ValueError("fit did not converge when calculating added variable residuals")

    try:
        endog_resid = getattr(new_result, resid_type)
    except AttributeError:
        raise ValueError("'%s' residual type not available" % resid_type)

    import statsmodels.regression.linear_model as lm

    # Residualize the focus predictor against the others, optionally
    # weighting by the IRLS weights from the original GLM/GEE fit.
    if isinstance(model, (GLM, GEE)) and use_glm_weights:
        weights = model.family.weights(results.fittedvalues)
        if hasattr(model, "data_weights"):
            weights = weights * model.data_weights
        lm_results = lm.WLS(focus_exog_vals, reduced_exog, weights).fit()
    else:
        lm_results = lm.OLS(focus_exog_vals, reduced_exog).fit()
    focus_exog_resid = lm_results.resid

    return endog_resid, focus_exog_resid
1288