# -*- coding: utf-8 -*-
"""
Created on Fri Sep 15 12:53:45 2017

Author: Josef Perktold
"""

import numpy as np
from scipy import stats

from statsmodels.discrete.discrete_model import Poisson
from statsmodels.regression.linear_model import OLS


def _combine_bins(edge_index, x):
    """group columns into bins using sum

    This is mainly a helper function for combining probabilities into cells.
    It is similar to `np.add.reduceat(x, edge_index, axis=-1)` except for the
    treatment of the last index and last cell.

    Parameters
    ----------
    edge_index : array_like
        This defines the (zero-based) indices for the columns that are to be
        combined. Each index in `edge_index` except the last is the starting
        index for a bin. The largest index in a bin is the next edge_index - 1.
    x : 1d or 2d array
        array for which columns are combined. If x is 1-dimensional, then it
        will be treated as a 2-d row vector.

    Returns
    -------
    x_new : ndarray
        array with the columns of each bin summed
    k_li : ndarray
        number of original columns combined into each bin
    Examples
    --------
    >>> _combine_bins([0, 1, 5], np.arange(4))
    (array([0, 6]), array([1, 4]))

    This aggregates into two bins that sum over 1 and 4 columns, respectively:

    >>> np.arange(4)[:1].sum()
    0
    >>> np.arange(4)[1:5].sum()
    6

    If the rightmost index is smaller than len(x), then the remaining
    columns are not included.

    >>> _combine_bins([0, 1, 3], np.arange(4))
    (array([0, 3]), array([1, 2]))
    """
    x = np.asarray(x)
    if x.ndim == 1:
        is_1d = True
        x = x[None, :]
    else:
        is_1d = False
    xli = []
    kli = []
    for bin_idx in range(len(edge_index) - 1):
        i, j = edge_index[bin_idx : bin_idx + 2]
        xli.append(x[:, i:j].sum(1))
        kli.append(j - i)

    x_new = np.column_stack(xli)
    if is_1d:
        x_new = x_new.squeeze()
    return x_new, np.asarray(kli)


def plot_probs(freq, probs_predicted, label='predicted', upp_xlim=None,
               fig=None):
    """diagnostic plots for comparing two lists of discrete probabilities

    Parameters
    ----------
    freq, probs_predicted : ndarray
        Two arrays of probabilities for the same events. This can be any
        pair of probabilities, but the defaults are designed for comparing
        observed frequencies and predicted probabilities.
    label : str or list or tuple
        If label is a string, then it is used as the label for
        probs_predicted and "freq" is used as the label for the other
        probabilities.
        If label is a list or tuple of two strings, then the first is used
        as the label for freq and the second for probs_predicted.
    upp_xlim : None or int
        If it is not None, then the xlim of the first two plots is set to
        (0, upp_xlim), otherwise the matplotlib default is used.
    fig : None or matplotlib figure instance
        If fig is provided, then the axes are added to it as a (3, 1) grid
        of subplots, otherwise a new matplotlib figure instance is created.

    Returns
    -------
    Figure
        The figure contains three subplots with probabilities, cumulative
        probabilities and a PP-plot.
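
    Examples
    --------
    A minimal usage sketch with made-up numbers; ``freq2`` and ``probs2``
    are hypothetical arrays of probabilities for the counts 0 to 3.

    >>> import numpy as np
    >>> freq2 = np.array([0.3, 0.4, 0.2, 0.1])
    >>> probs2 = np.array([0.25, 0.45, 0.2, 0.1])
    >>> fig = plot_probs(freq2, probs2, label='predicted')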
    """

    if isinstance(label, (list, tuple)):
        label0, label1 = label
    else:
        label0, label1 = 'freq', label

    if fig is None:
        import matplotlib.pyplot as plt
        fig = plt.figure(figsize=(8, 12))
    ax1 = fig.add_subplot(311)
    ax1.plot(freq, '-o', label=label0)
    ax1.plot(probs_predicted, '-d', label=label1)
    if upp_xlim is not None:
        ax1.set_xlim(0, upp_xlim)
    ax1.legend()
    ax1.set_title('probabilities')

    ax2 = fig.add_subplot(312)
    ax2.plot(np.cumsum(freq), '-o', label=label0)
    ax2.plot(np.cumsum(probs_predicted), '-d', label=label1)
    if upp_xlim is not None:
        ax2.set_xlim(0, upp_xlim)
    ax2.legend()
    ax2.set_title('cumulative probabilities')

    ax3 = fig.add_subplot(313)
    ax3.plot(np.cumsum(probs_predicted), np.cumsum(freq), 'o')
    # diagonal reference line for the PP-plot
    ax3.plot(np.arange(len(freq)) / len(freq), np.arange(len(freq)) / len(freq))
    ax3.set_title('PP-plot')
    ax3.set_xlabel(label1)
    ax3.set_ylabel(label0)
    return fig


def test_chisquare_prob(results, probs, bin_edges=None, method=None):
    """
    chisquare test for predicted probabilities using the OPG version of the
    conditional moment test

    Parameters
    ----------
    results : results instance
        Instance of a count regression results
    probs : ndarray
        Array of predicted probabilities with observations
        in rows and event counts in columns
    bin_edges : None or array
        intervals to combine several counts into cells,
        see `_combine_bins`

    Returns
    -------
    (api not stable, to be replaced by a test-results class)
    statistic : float
        chisquare statistic for the test
    p-value : float
        p-value of the test
    df : int
        degrees of freedom for the chisquare distribution
    extras : tuple
        currently returns a tuple with some intermediate results
        (diff1, res_aux)

    Notes
    -----

    Status : experimental, no verified unit tests, needs to be generalized
    currently only the OPG version with auxiliary regression is implemented

    Assumes counts are np.arange(probs.shape[1]), i.e. consecutive
    integers starting at zero.

    The auxiliary regression drops the last column of the binned probs to
    avoid singularity because the probabilities sum to 1.
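
    Examples
    --------
    A minimal usage sketch, assuming ``res`` is a fitted Poisson results
    instance; ``k_max`` and the bin edges are hypothetical choices.

    >>> import numpy as np
    >>> from scipy import stats
    >>> k_max = 10
    >>> counts = np.arange(k_max + 1)
    >>> # predicted probability of each count for every observation
    >>> probs = stats.poisson.pmf(counts, res.predict()[:, None])
    >>> chi2_stat, pvalue, df, extras = test_chisquare_prob(
    ...     res, probs, bin_edges=[0, 2, 4, 6, 8, k_max + 1])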
    """
    res = results
    score_obs = results.model.score_obs(results.params)
    d_ind = (res.model.endog[:, None] == np.arange(probs.shape[1])).astype(int)
    if bin_edges is not None:
        d_ind_bins, k_bins = _combine_bins(bin_edges, d_ind)
        probs_bins, k_bins = _combine_bins(bin_edges, probs)
        # number of cells after binning, needed for the rank check below
        k_bins = d_ind_bins.shape[1]
    else:
        d_ind_bins, k_bins = d_ind, d_ind.shape[1]
        probs_bins = probs
    diff1 = d_ind_bins - probs_bins
    #diff2 = (1 - d_ind.sum(1)) - (1 - probs_bins.sum(1))
    x_aux = np.column_stack((score_obs, diff1[:, :-1]))  #, diff2))
    nobs = x_aux.shape[0]
    res_aux = OLS(np.ones(nobs), x_aux).fit()

    chi2_stat = nobs * (1 - res_aux.ssr / res_aux.uncentered_tss)
    df = res_aux.model.rank - score_obs.shape[1]
    if df < k_bins - 1:
        # not a problem in general, but it can be for the OPG version
        import warnings
        warnings.warn('auxiliary model is rank deficient')
    extras = (diff1, res_aux)
    return chi2_stat, stats.chi2.sf(chi2_stat, df), df, extras


def test_poisson_zeroinflation(results_poisson, exog_infl=None):
    """score test for zero inflation or deflation in Poisson

    This implements the Jansakul and Hinde 2009 score test
    for excess zeros against a zero modified Poisson
    alternative. They use a linear link function for the
    inflation model to allow for zero deflation.

    Parameters
    ----------
    results_poisson : results instance
        The test is only valid if the results instance is a Poisson
        model.
    exog_infl : ndarray
        Explanatory variables for the zero inflated or zero modified
        alternative. If exog_infl is None, then the inflation
        probability is assumed to be constant.

    Returns
    -------
    score test results based on the chisquare distribution, currently a
    tuple (statistic, pvalue, df, df2)

    Notes
    -----
    This is a score test based on the null hypothesis that
    the true model is Poisson. It will also reject for
    other deviations from a Poisson model if those affect
    the zero probabilities, e.g. in the direction of
    excess dispersion as in the Negative Binomial
    or Generalized Poisson model.
    Therefore, rejection in this test does not imply that
    zero-inflated Poisson is the appropriate model.

    Status: experimental, no verified unit tests

    TODO: If the zero modification probability is assumed
    to be constant under the alternative, then we only have
    a scalar test score and we can use one-sided tests to
    distinguish zero inflation and deflation from the
    two-sided deviations. (The general one-sided case is
    difficult.)

    References
    ----------
    Jansakul and Hinde 2009
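
    Examples
    --------
    A minimal usage sketch with simulated data; all names below are
    hypothetical and only illustrate the call signature.

    >>> import numpy as np
    >>> from statsmodels.discrete.discrete_model import Poisson
    >>> rng = np.random.RandomState(987125643)
    >>> x = np.column_stack((np.ones(500), rng.uniform(size=500)))
    >>> y = rng.poisson(np.exp(x.dot([0.5, 0.5])))
    >>> res_poi = Poisson(y, x).fit(disp=False)
    >>> statistic, pvalue, df, df2 = test_poisson_zeroinflation(res_poi)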
    """
    if not isinstance(results_poisson.model, Poisson):
        # GLM Poisson would also be valid, but has not been tried
        import warnings
        warnings.warn('Test is only valid if model is Poisson')

    nobs = results_poisson.model.endog.shape[0]

    if exog_infl is None:
        exog_infl = np.ones((nobs, 1))

    endog = results_poisson.model.endog
    exog = results_poisson.model.exog

    mu = results_poisson.predict()
    prob_zero = np.exp(-mu)

    cov_poi = results_poisson.cov_params()
    cross_derivative = (exog_infl.T * (-mu)).dot(exog).T
    cov_infl = (exog_infl.T * ((1 - prob_zero) / prob_zero)).dot(exog_infl)
    # observation-level score of the inflation parameters evaluated at the
    # Poisson estimates under the null
    score_obs_infl = exog_infl * (((endog == 0) - prob_zero) / prob_zero)[:, None]
    #score_obs_infl = exog_infl * ((endog == 0) * (1 - prob_zero) / prob_zero - (endog > 0))[:, None]  # same
    score_infl = score_obs_infl.sum(0)
    # variance of the inflation score, adjusted for estimation of the
    # Poisson parameters
    cov_score_infl = cov_infl - cross_derivative.T.dot(cov_poi).dot(cross_derivative)
    cov_score_infl_inv = np.linalg.pinv(cov_score_infl)
    statistic = score_infl.dot(cov_score_infl_inv).dot(score_infl)
    df2 = np.linalg.matrix_rank(cov_score_infl)  # more general, maybe not needed
    df = exog_infl.shape[1]
    pvalue = stats.chi2.sf(statistic, df)
    return statistic, pvalue, df, df2


def test_poisson_zeroinflation_brock(results_poisson):
    """score test for zero modification in Poisson, special case

    This assumes that the Poisson model has a constant and that
    the zero modification probability is constant.

    This is a special case of test_poisson_zeroinflation derived by
    van den Broek 1995.

    The test reports two-sided and one-sided alternatives based on
    the normal distribution of the test statistic.
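
    Examples
    --------
    A minimal usage sketch, assuming ``res_poi`` is a fitted Poisson results
    instance whose model includes a constant (the name is hypothetical).

    >>> stat, pval2, pval_upp, pval_low, pval_chi2 = (
    ...     test_poisson_zeroinflation_brock(res_poi))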
    """

    mu = results_poisson.predict()
    prob_zero = np.exp(-mu)
    endog = results_poisson.model.endog
    nobs = len(endog)

    score = ((endog == 0) / prob_zero).sum() - nobs
    var_score = (1 / prob_zero).sum() - nobs - endog.sum()
    statistic = score / np.sqrt(var_score)
    pvalue_two = 2 * stats.norm.sf(np.abs(statistic))
    pvalue_upp = stats.norm.sf(statistic)
    pvalue_low = stats.norm.cdf(statistic)
    return statistic, pvalue_two, pvalue_upp, pvalue_low, stats.chi2.sf(statistic**2, 1)