1"""
2Conditional logistic, Poisson, and multinomial logit regression
3"""
4
5import numpy as np
6import statsmodels.base.model as base
7import statsmodels.regression.linear_model as lm
8import statsmodels.base.wrapper as wrap
9from statsmodels.discrete.discrete_model import (MultinomialResults,
10      MultinomialResultsWrapper)
11import collections
12import warnings
13import itertools
14
15
16class _ConditionalModel(base.LikelihoodModel):
17
18    def __init__(self, endog, exog, missing='none', **kwargs):
19
20        if "groups" not in kwargs:
21            raise ValueError("'groups' is a required argument")
22        groups = kwargs["groups"]
23
24        if groups.size != endog.size:
25            msg = "'endog' and 'groups' should have the same dimensions"
26            raise ValueError(msg)
27
28        if exog.shape[0] != endog.size:
29            msg = "The leading dimension of 'exog' should equal the length of 'endog'"
30            raise ValueError(msg)
31
32        super(_ConditionalModel, self).__init__(
33            endog, exog, missing=missing, **kwargs)
34
35        if self.data.const_idx is not None:
36            msg = ("Conditional models should not have an intercept in the " +
37                  "design matrix")
38            raise ValueError(msg)
39
40        exog = self.exog
41        self.k_params = exog.shape[1]
42
43        # Get the row indices for each group
44        row_ix = {}
45        for i, g in enumerate(groups):
46            if g not in row_ix:
47                row_ix[g] = []
48            row_ix[g].append(i)
49
50        # Split the data into groups and remove groups with no variation
51        endog, exog = np.asarray(endog), np.asarray(exog)
52        offset = kwargs.get("offset")
53        self._endog_grp = []
54        self._exog_grp = []
55        self._groupsize = []
56        if offset is not None:
57            offset = np.asarray(offset)
58            self._offset_grp = []
59        self._offset = []
60        self._sumy = []
61        self.nobs = 0
62        drops = [0, 0]
63        for g, ix in row_ix.items():
64            y = endog[ix].flat
65            if np.std(y) == 0:
66                drops[0] += 1
67                drops[1] += len(y)
68                continue
69            self.nobs += len(y)
70            self._endog_grp.append(y)
71            if offset is not None:
72                self._offset_grp.append(offset[ix])
73            self._groupsize.append(len(y))
74            self._exog_grp.append(exog[ix, :])
75            self._sumy.append(np.sum(y))
76
77        if drops[0] > 0:
78            msg = ("Dropped %d groups and %d observations for having " +
79                   "no within-group variance") % tuple(drops)
80            warnings.warn(msg)
81
82        # This can be pre-computed
83        if offset is not None:
84            self._endofs = []
85            for k, ofs in enumerate(self._offset_grp):
86                self._endofs.append(np.dot(self._endog_grp[k], ofs))
87
88        # Number of groups
89        self._n_groups = len(self._endog_grp)
90
91        # These are the sufficient statistics
92        self._xy = []
93        self._n1 = []
94        for g in range(self._n_groups):
95            self._xy.append(np.dot(self._endog_grp[g], self._exog_grp[g]))
96            self._n1.append(np.sum(self._endog_grp[g]))
97
98    def hessian(self, params):
99
100        from statsmodels.tools.numdiff import approx_fprime
101        hess = approx_fprime(params, self.score)
102        hess = np.atleast_2d(hess)
103        return hess
104
105    def fit(self,
106            start_params=None,
107            method='BFGS',
108            maxiter=100,
109            full_output=True,
110            disp=False,
111            fargs=(),
112            callback=None,
113            retall=False,
114            skip_hessian=False,
115            **kwargs):
116
117        rslt = super(_ConditionalModel, self).fit(
118            start_params=start_params,
119            method=method,
120            maxiter=maxiter,
121            full_output=full_output,
122            disp=disp,
123            skip_hessian=skip_hessian)
124
125        crslt = ConditionalResults(self, rslt.params, rslt.cov_params(), 1)
126        crslt.method = method
127        crslt.nobs = self.nobs
128        crslt.n_groups = self._n_groups
129        crslt._group_stats = [
130            "%d" % min(self._groupsize),
131            "%d" % max(self._groupsize),
132            "%.1f" % np.mean(self._groupsize)
133        ]
134        rslt = ConditionalResultsWrapper(crslt)
135        return rslt
136
137    def fit_regularized(self,
138                        method="elastic_net",
139                        alpha=0.,
140                        start_params=None,
141                        refit=False,
142                        **kwargs):
143        """
144        Return a regularized fit to a linear regression model.
145
146        Parameters
147        ----------
148        method : {'elastic_net'}
149            Only the `elastic_net` approach is currently implemented.
150        alpha : scalar or array_like
151            The penalty weight.  If a scalar, the same penalty weight
152            applies to all variables in the model.  If a vector, it
153            must have the same length as `params`, and contains a
154            penalty weight for each coefficient.
155        start_params : array_like
156            Starting values for `params`.
157        refit : bool
158            If True, the model is refit using only the variables that
159            have non-zero coefficients in the regularized fit.  The
160            refitted model is not regularized.
161        **kwargs
162            Additional keyword argument that are used when fitting the model.
163
164        Returns
165        -------
166        Results
167            A results instance.
168        """
169
170        from statsmodels.base.elastic_net import fit_elasticnet
171
172        if method != "elastic_net":
173            raise ValueError("method for fit_regularied must be elastic_net")
174
175        defaults = {"maxiter": 50, "L1_wt": 1, "cnvrg_tol": 1e-10,
176                    "zero_tol": 1e-10}
177        defaults.update(kwargs)
178
179        return fit_elasticnet(self, method=method,
180                              alpha=alpha,
181                              start_params=start_params,
182                              refit=refit,
183                              **defaults)
184
185    # Override to allow groups to be passed as a variable name.
186    @classmethod
187    def from_formula(cls,
188                     formula,
189                     data,
190                     subset=None,
191                     drop_cols=None,
192                     *args,
193                     **kwargs):
194
195        try:
196            groups = kwargs["groups"]
197            del kwargs["groups"]
198        except KeyError:
199            raise ValueError("'groups' is a required argument")
200
201        if isinstance(groups, str):
202            groups = data[groups]
203
204        if "0+" not in formula.replace(" ", ""):
205            warnings.warn("Conditional models should not include an intercept")
206
207        model = super(_ConditionalModel, cls).from_formula(
208            formula, data=data, groups=groups, *args, **kwargs)
209
210        return model
211
212
213class ConditionalLogit(_ConditionalModel):
214    """
215    Fit a conditional logistic regression model to grouped data.
216
217    Every group is implicitly given an intercept, but the model is fit using
218    a conditional likelihood in which the intercepts are not present.  Thus,
219    intercept estimates are not given, but the other parameter estimates can
220    be interpreted as being adjusted for any group-level confounders.
221
222    Parameters
223    ----------
224    endog : array_like
225        The response variable, must contain only 0 and 1.
226    exog : array_like
227        The array of covariates.  Do not include an intercept
228        in this array.
229    groups : array_like
230        Codes defining the groups. This is a required keyword parameter.
231    """
232
233    def __init__(self, endog, exog, missing='none', **kwargs):
234
235        super(ConditionalLogit, self).__init__(
236            endog, exog, missing=missing, **kwargs)
237
238        if np.any(np.unique(self.endog) != np.r_[0, 1]):
239            msg = "endog must be coded as 0, 1"
240            raise ValueError(msg)
241
242        self.K = self.exog.shape[1]
243        # i.e. self.k_params, for compatibility with MNLogit
244
245    def loglike(self, params):
246
247        ll = 0
248        for g in range(len(self._endog_grp)):
249            ll += self.loglike_grp(g, params)
250
251        return ll
252
253    def score(self, params):
254
255        score = 0
256        for g in range(self._n_groups):
257            score += self.score_grp(g, params)
258
259        return score
260
261    def _denom(self, grp, params, ofs=None):
262
263        if ofs is None:
264            ofs = 0
265
266        exb = np.exp(np.dot(self._exog_grp[grp], params) + ofs)
267
268        # In the recursions, f may be called multiple times with the
269        # same arguments, so we memoize the results.
270        memo = {}
271
272        def f(t, k):
273            if t < k:
274                return 0
275            if k == 0:
276                return 1
277
278            try:
279                return memo[(t, k)]
280            except KeyError:
281                pass
282
283            v = f(t - 1, k) + f(t - 1, k - 1) * exb[t - 1]
284            memo[(t, k)] = v
285
286            return v
287
288        return f(self._groupsize[grp], self._n1[grp])
289
290    def _denom_grad(self, grp, params, ofs=None):
291
292        if ofs is None:
293            ofs = 0
294
295        ex = self._exog_grp[grp]
296        exb = np.exp(np.dot(ex, params) + ofs)
297
298        # s may be called multiple times in the recursions with the
299        # same arguments, so memoize the results.
300        memo = {}
301
302        def s(t, k):
303
304            if t < k:
305                return 0, np.zeros(self.k_params)
306            if k == 0:
307                return 1, 0
308
309            try:
310                return memo[(t, k)]
311            except KeyError:
312                pass
313
314            h = exb[t - 1]
315            a, b = s(t - 1, k)
316            c, e = s(t - 1, k - 1)
317            d = c * h * ex[t - 1, :]
318
319            u, v = a + c * h, b + d + e * h
320            memo[(t, k)] = (u, v)
321
322            return u, v
323
324        return s(self._groupsize[grp], self._n1[grp])
325
326    def loglike_grp(self, grp, params):
327
328        ofs = None
329        if hasattr(self, 'offset'):
330            ofs = self._offset_grp[grp]
331
332        llg = np.dot(self._xy[grp], params)
333
334        if ofs is not None:
335            llg += self._endofs[grp]
336
337        llg -= np.log(self._denom(grp, params, ofs))
338
339        return llg
340
341    def score_grp(self, grp, params):
342
343        ofs = 0
344        if hasattr(self, 'offset'):
345            ofs = self._offset_grp[grp]
346
347        d, h = self._denom_grad(grp, params, ofs)
348        return self._xy[grp] - h / d
349
350
351class ConditionalPoisson(_ConditionalModel):
352    """
353    Fit a conditional Poisson regression model to grouped data.
354
355    Every group is implicitly given an intercept, but the model is fit using
356    a conditional likelihood in which the intercepts are not present.  Thus,
357    intercept estimates are not given, but the other parameter estimates can
358    be interpreted as being adjusted for any group-level confounders.
359
360    Parameters
361    ----------
362    endog : array_like
363        The response variable
364    exog : array_like
365        The covariates
366    groups : array_like
367        Codes defining the groups. This is a required keyword parameter.
368    """
369
370    def loglike(self, params):
371
372        ofs = None
373        if hasattr(self, 'offset'):
374            ofs = self._offset_grp
375
376        ll = 0.0
377
378        for i in range(len(self._endog_grp)):
379
380            xb = np.dot(self._exog_grp[i], params)
381            if ofs is not None:
382                xb += ofs[i]
383            exb = np.exp(xb)
384            y = self._endog_grp[i]
385            ll += np.dot(y, xb)
386            s = exb.sum()
387            ll -= self._sumy[i] * np.log(s)
388
389        return ll
390
391    def score(self, params):
392
393        ofs = None
394        if hasattr(self, 'offset'):
395            ofs = self._offset_grp
396
397        score = 0.0
398
399        for i in range(len(self._endog_grp)):
400
401            x = self._exog_grp[i]
402            xb = np.dot(x, params)
403            if ofs is not None:
404                xb += ofs[i]
405            exb = np.exp(xb)
406            s = exb.sum()
407            y = self._endog_grp[i]
408            score += np.dot(y, x)
409            score -= self._sumy[i] * np.dot(exb, x) / s
410
411        return score
412
413
414class ConditionalResults(base.LikelihoodModelResults):
415    def __init__(self, model, params, normalized_cov_params, scale):
416
417        super(ConditionalResults, self).__init__(
418            model,
419            params,
420            normalized_cov_params=normalized_cov_params,
421            scale=scale)
422
423    def summary(self, yname=None, xname=None, title=None, alpha=.05):
424        """
425        Summarize the fitted model.
426
427        Parameters
428        ----------
429        yname : str, optional
430            Default is `y`
431        xname : list[str], optional
432            Names for the exogenous variables, default is "var_xx".
433            Must match the number of parameters in the model
434        title : str, optional
435            Title for the top table. If not None, then this replaces the
436            default title
437        alpha : float
438            Significance level for the confidence intervals
439
440        Returns
441        -------
442        smry : Summary instance
443            This holds the summary tables and text, which can be printed or
444            converted to various output formats.
445
446        See Also
447        --------
448        statsmodels.iolib.summary.Summary : class to hold summary
449            results
450        """
451
452        top_left = [
453            ('Dep. Variable:', None),
454            ('Model:', None),
455            ('Log-Likelihood:', None),
456            ('Method:', [self.method]),
457            ('Date:', None),
458            ('Time:', None),
459        ]
460
461        top_right = [
462            ('No. Observations:', None),
463            ('No. groups:', [self.n_groups]),
464            ('Min group size:', [self._group_stats[0]]),
465            ('Max group size:', [self._group_stats[1]]),
466            ('Mean group size:', [self._group_stats[2]]),
467        ]
468
469        if title is None:
470            title = "Conditional Logit Model Regression Results"
471
472        # create summary tables
473        from statsmodels.iolib.summary import Summary
474        smry = Summary()
475        smry.add_table_2cols(
476            self,
477            gleft=top_left,
478            gright=top_right,  # [],
479            yname=yname,
480            xname=xname,
481            title=title)
482        smry.add_table_params(
483            self, yname=yname, xname=xname, alpha=alpha, use_t=self.use_t)
484
485        return smry
486
487class ConditionalMNLogit(_ConditionalModel):
488    """
489    Fit a conditional multinomial logit model to grouped data.
490
491    Parameters
492    ----------
493    endog : array_like
494        The dependent variable, must be integer-valued, coded
495        0, 1, ..., c-1, where c is the number of response
496        categories.
497    exog : array_like
498        The independent variables.
499    groups : array_like
500        Codes defining the groups. This is a required keyword parameter.
501
502    Notes
503    -----
504    Equivalent to femlogit in Stata.
505
506    References
507    ----------
508    Gary Chamberlain (1980).  Analysis of covariance with qualitative
509    data. The Review of Economic Studies.  Vol. 47, No. 1, pp. 225-238.
510    """
511
512    def __init__(self, endog, exog, missing='none', **kwargs):
513
514        super(ConditionalMNLogit, self).__init__(
515            endog, exog, missing=missing, **kwargs)
516
517        # endog must be integers
518        self.endog = self.endog.astype(int)
519
520        self.k_cat = self.endog.max() + 1
521        self.df_model = (self.k_cat - 1) * self.exog.shape[1]
522        self.df_resid = self.nobs - self.df_model
523        self._ynames_map = {j: str(j) for j in range(self.k_cat)}
524        self.J = self.k_cat  # Unfortunate name, needed for results
525        self.K = self.exog.shape[1]  # for compatibility with MNLogit
526
527        if self.endog.min() < 0:
528            msg = "endog may not contain negative values"
529            raise ValueError(msg)
530
531        grx = collections.defaultdict(list)
532        for k, v in enumerate(self.groups):
533            grx[v].append(k)
534        self._group_labels = list(grx.keys())
535        self._group_labels.sort()
536        self._grp_ix = [grx[k] for k in self._group_labels]
537
538    def fit(self,
539            start_params=None,
540            method='BFGS',
541            maxiter=100,
542            full_output=True,
543            disp=False,
544            fargs=(),
545            callback=None,
546            retall=False,
547            skip_hessian=False,
548            **kwargs):
549
550        if start_params is None:
551            q = self.exog.shape[1]
552            c = self.k_cat - 1
553            start_params = np.random.normal(size=q * c)
554
555        # Do not call super(...).fit because it cannot handle the 2d-params.
556        rslt = base.LikelihoodModel.fit(
557            self,
558            start_params=start_params,
559            method=method,
560            maxiter=maxiter,
561            full_output=full_output,
562            disp=disp,
563            skip_hessian=skip_hessian)
564
565        rslt.params = rslt.params.reshape((self.exog.shape[1], -1))
566        rslt = MultinomialResults(self, rslt)
567
568        # Not clear what the null likelihood should be, there is no intercept
569        # so the null model is not clearly defined.  This is needed for summary
570        # to work.
571        rslt.set_null_options(llnull=np.nan)
572
573        return MultinomialResultsWrapper(rslt)
574
575    def loglike(self, params):
576
577        q = self.exog.shape[1]
578        c = self.k_cat - 1
579
580        pmat = params.reshape((q, c))
581        pmat = np.concatenate((np.zeros((q, 1)), pmat), axis=1)
582        lpr = np.dot(self.exog, pmat)
583
584        ll = 0.0
585        for ii in self._grp_ix:
586            x = lpr[ii, :]
587            jj = np.arange(x.shape[0], dtype=int)
588            y = self.endog[ii]
589            denom = 0.0
590            for p in itertools.permutations(y):
591                denom += np.exp(x[(jj, p)].sum())
592            ll += x[(jj, y)].sum() - np.log(denom)
593
594        return ll
595
596
597    def score(self, params):
598
599        q = self.exog.shape[1]
600        c = self.k_cat - 1
601
602        pmat = params.reshape((q, c))
603        pmat = np.concatenate((np.zeros((q, 1)), pmat), axis=1)
604        lpr = np.dot(self.exog, pmat)
605
606        grad = np.zeros((q, c))
607        for ii in self._grp_ix:
608            x = lpr[ii, :]
609            jj = np.arange(x.shape[0], dtype=int)
610            y = self.endog[ii]
611            denom = 0.0
612            denomg = np.zeros((q, c))
613            for p in itertools.permutations(y):
614                v = np.exp(x[(jj, p)].sum())
615                denom += v
616                for i, r in enumerate(p):
617                    if r != 0:
618                        denomg[:, r - 1] += v * self.exog[ii[i], :]
619
620            for i, r in enumerate(y):
621                if r != 0:
622                    grad[:, r - 1] += self.exog[ii[i], :]
623
624            grad -= denomg / denom
625
626        return grad.flatten()
627
628
629
630class ConditionalResultsWrapper(lm.RegressionResultsWrapper):
631    pass
632
633
634wrap.populate_wrapper(ConditionalResultsWrapper, ConditionalResults)
635