1"""
2Implementation of proportional hazards regression models for duration
3data that may be censored ("Cox models").
4
5References
6----------
7T Therneau (1996).  Extending the Cox model.  Technical report.
8http://www.mayo.edu/research/documents/biostat-58pdf/DOC-10027288
9
10G Rodriguez (2005).  Non-parametric estimation in survival models.
11http://data.princeton.edu/pop509/NonParametricSurvival.pdf
12
13B Gillespie (2006).  Checking the assumptions in the Cox proportional
14hazards model.
15http://www.mwsug.org/proceedings/2006/stats/MWSUG-2006-SD08.pdf
16"""
17import numpy as np
18
19from statsmodels.base import model
20import statsmodels.base.model as base
21from statsmodels.tools.decorators import cache_readonly
22from statsmodels.compat.pandas import Appender
23
24
25_predict_docstring = """
26    Returns predicted values from the proportional hazards
27    regression model.
28
29    Parameters
30    ----------%(params_doc)s
31    exog : array_like
32        Data to use as `exog` in forming predictions.  If not
33        provided, the `exog` values from the model used to fit the
34        data are used.%(cov_params_doc)s
35    endog : array_like
36        Duration (time) values at which the predictions are made.
37        Only used if pred_type is either 'cumhaz' or 'surv'.  If
38        using model `exog`, defaults to model `endog` (time), but
39        may be provided explicitly to make predictions at
40        alternative times.
41    strata : array_like
42        A vector of stratum values used to form the predictions.
43        Not used (may be 'None') if pred_type is 'lhr' or 'hr'.
44        If `exog` is None, the model stratum values are used.  If
45        `exog` is not None and pred_type is 'surv' or 'cumhaz',
46        stratum values must be provided (unless there is only one
47        stratum).
48    offset : array_like
49        Offset values used to create the predicted values.
50    pred_type : str
51        If 'lhr', returns log hazard ratios, if 'hr' returns
52        hazard ratios, if 'surv' returns the survival function, if
53        'cumhaz' returns the cumulative hazard function.
54    pred_only : bool
55        If True, returns only an array of predicted values.  Otherwise
56        returns a bunch containing the predicted values and standard
57        errors.
58
59    Returns
60    -------
61    A bunch containing two fields: `predicted_values` and
62    `standard_errors`.
63
64    Notes
65    -----
66    Standard errors are only returned when predicting the log
67    hazard ratio (pred_type is 'lhr').
68
69    Types `surv` and `cumhaz` require estimation of the cumulative
70    hazard function.
71"""
72
73_predict_params_doc = """
74    params : array_like
75        The proportional hazards model parameters."""
76
77_predict_cov_params_docstring = """
78    cov_params : array_like
79        The covariance matrix of the estimated `params` vector,
80        used to obtain prediction errors if pred_type='lhr',
81        otherwise optional."""
82
83
84
85class PHSurvivalTime(object):
86
87    def __init__(self, time, status, exog, strata=None, entry=None,
88                 offset=None):
89        """
90        Represent a collection of survival times with possible
91        stratification and left truncation.
92
93        Parameters
94        ----------
95        time : array_like
96            The times at which either the event (failure) occurs or
97            the observation is censored.
98        status : array_like
99            Indicates whether the event (failure) occurs at `time`
100            (`status` is 1), or if `time` is a censoring time (`status`
101            is 0).
102        exog : array_like
103            The exogeneous (covariate) data matrix, cases are rows and
104            variables are columns.
105        strata : array_like
106            Grouping variable defining the strata.  If None, all
107            observations are in a single stratum.
108        entry : array_like
109            Entry (left truncation) times.  The observation is not
110            part of the risk set for times before the entry time.  If
111            None, the entry time is treated as being zero, which
112            gives no left truncation.  The entry time must be less
113            than or equal to `time`.
114        offset : array_like
115            An optional array of offsets
116        """
117
118        # Default strata
119        if strata is None:
120            strata = np.zeros(len(time), dtype=np.int32)
121
122        # Default entry times
123        if entry is None:
124            entry = np.zeros(len(time))
125
126        # Parameter validity checks.
127        self._check(time, status, strata, entry)
128
129        # Get the row indices for the cases in each stratum
130        stu = np.unique(strata)
131        sth = {x: [] for x in stu}
132        for i,k in enumerate(strata):
133            sth[k].append(i)
134        stratum_rows = [np.asarray(sth[k], dtype=np.int32) for k in stu]
135        stratum_names = stu
136
137        # Remove strata with no events
138        ix = [i for i,ix in enumerate(stratum_rows) if status[ix].sum() > 0]
139        self.nstrat_orig = len(stratum_rows)
140        stratum_rows = [stratum_rows[i] for i in ix]
141        stratum_names = [stratum_names[i] for i in ix]
142
143        # The number of strata
144        nstrat = len(stratum_rows)
145        self.nstrat = nstrat
146
147        # Remove subjects whose entry time occurs after the last event
148        # in their stratum.
149        for stx,ix in enumerate(stratum_rows):
150            last_failure = max(time[ix][status[ix] == 1])
151
152            # Stata uses < here, R uses <=
153            ii = [i for i,t in enumerate(entry[ix]) if
154                  t <= last_failure]
155            stratum_rows[stx] = stratum_rows[stx][ii]
156
157        # Remove subjects who are censored before the first event in
158        # their stratum.
159        for stx,ix in enumerate(stratum_rows):
160            first_failure = min(time[ix][status[ix] == 1])
161
162            ii = [i for i,t in enumerate(time[ix]) if
163                  t >= first_failure]
164            stratum_rows[stx] = stratum_rows[stx][ii]
165
166        # Order by time within each stratum
167        for stx,ix in enumerate(stratum_rows):
168            ii = np.argsort(time[ix])
169            stratum_rows[stx] = stratum_rows[stx][ii]
170
171        if offset is not None:
172            self.offset_s = []
173            for stx in range(nstrat):
174                self.offset_s.append(offset[stratum_rows[stx]])
175        else:
176            self.offset_s = None
177
178        # Number of informative subjects
179        self.n_obs = sum([len(ix) for ix in stratum_rows])
180
181        self.stratum_rows = stratum_rows
182        self.stratum_names = stratum_names
183
184        # Split everything by stratum
185        self.time_s = self._split(time)
186        self.exog_s = self._split(exog)
187        self.status_s = self._split(status)
188        self.entry_s = self._split(entry)
189
190        # Precalculate some indices needed to fit Cox models.
191        # Distinct failure times within a stratum are always taken to
192        # be sorted in ascending order.
193        #
194        # ufailt_ix[stx][k] is a list of indices for subjects who fail
195        # at the k^th sorted unique failure time in stratum stx
196        #
197        # risk_enter[stx][k] is a list of indices for subjects who
198        # enter the risk set at the k^th sorted unique failure time in
199        # stratum stx
200        #
201        # risk_exit[stx][k] is a list of indices for subjects who exit
202        # the risk set at the k^th sorted unique failure time in
203        # stratum stx
204        self.ufailt_ix, self.risk_enter, self.risk_exit, self.ufailt =\
205            [], [], [], []
206
207        for stx in range(self.nstrat):
208
209            # All failure times
210            ift = np.flatnonzero(self.status_s[stx] == 1)
211            ft = self.time_s[stx][ift]
212
213            # Unique failure times
214            uft = np.unique(ft)
215            nuft = len(uft)
216
217            # Indices of cases that fail at each unique failure time
218            #uft_map = {x:i for i,x in enumerate(uft)} # requires >=2.7
219            uft_map = dict([(x, i) for i,x in enumerate(uft)]) # 2.6
220            uft_ix = [[] for k in range(nuft)]
221            for ix,ti in zip(ift,ft):
222                uft_ix[uft_map[ti]].append(ix)
223
224            # Indices of cases (failed or censored) that enter the
225            # risk set at each unique failure time.
226            risk_enter1 = [[] for k in range(nuft)]
227            for i,t in enumerate(self.time_s[stx]):
228                ix = np.searchsorted(uft, t, "right") - 1
229                if ix >= 0:
230                    risk_enter1[ix].append(i)
231
232            # Indices of cases (failed or censored) that exit the
233            # risk set at each unique failure time.
234            risk_exit1 = [[] for k in range(nuft)]
235            for i,t in enumerate(self.entry_s[stx]):
236                ix = np.searchsorted(uft, t)
237                risk_exit1[ix].append(i)
238
239            self.ufailt.append(uft)
240            self.ufailt_ix.append([np.asarray(x, dtype=np.int32)
241                                   for x in uft_ix])
242            self.risk_enter.append([np.asarray(x, dtype=np.int32)
243                                    for x in risk_enter1])
244            self.risk_exit.append([np.asarray(x, dtype=np.int32)
245                                   for x in risk_exit1])
246
247    def _split(self, x):
248        v = []
249        if x.ndim == 1:
250            for ix in self.stratum_rows:
251                v.append(x[ix])
252        else:
253            for ix in self.stratum_rows:
254                v.append(x[ix, :])
255        return v
256
257    def _check(self, time, status, strata, entry):
258        n1, n2, n3, n4 = len(time), len(status), len(strata),\
259            len(entry)
260        nv = [n1, n2, n3, n4]
261        if max(nv) != min(nv):
262            raise ValueError("endog, status, strata, and " +
263                             "entry must all have the same length")
264        if min(time) < 0:
265            raise ValueError("endog must be non-negative")
266        if min(entry) < 0:
267            raise ValueError("entry time must be non-negative")
268
269        # In Stata, this is entry >= time, in R it is >.
270        if np.any(entry > time):
271            raise ValueError("entry times may not occur " +
272                             "after event or censoring times")
273
274
275class PHReg(model.LikelihoodModel):
276    """
277    Cox Proportional Hazards Regression Model
278
279    The Cox PH Model is for right censored data.
280
281    Parameters
282    ----------
283    endog : array_like
284        The observed times (event or censoring)
285    exog : 2D array_like
286        The covariates or exogeneous variables
287    status : array_like
288        The censoring status values; status=1 indicates that an
289        event occurred (e.g. failure or death), status=0 indicates
290        that the observation was right censored. If None, defaults
291        to status=1 for all cases.
292    entry : array_like
293        The entry times, if left truncation occurs
294    strata : array_like
295        Stratum labels.  If None, all observations are taken to be
296        in a single stratum.
297    ties : str
298        The method used to handle tied times, must be either 'breslow'
299        or 'efron'.
300    offset : array_like
301        Array of offset values
302    missing : str
303        The method used to handle missing data
304
305    Notes
306    -----
307    Proportional hazards regression models should not include an
308    explicit or implicit intercept.  The effect of an intercept is
309    not identified using the partial likelihood approach.
310
311    `endog`, `event`, `strata`, `entry`, and the first dimension
312    of `exog` all must have the same length
313    """
314
315    def __init__(self, endog, exog, status=None, entry=None,
316                 strata=None, offset=None, ties='breslow',
317                 missing='drop', **kwargs):
318
319        # Default is no censoring
320        if status is None:
321            status = np.ones(len(endog))
322
323        super(PHReg, self).__init__(endog, exog, status=status,
324                                    entry=entry, strata=strata,
325                                    offset=offset, missing=missing,
326                                    **kwargs)
327
328        # endog and exog are automatically converted, but these are
329        # not
330        if self.status is not None:
331            self.status = np.asarray(self.status)
332        if self.entry is not None:
333            self.entry = np.asarray(self.entry)
334        if self.strata is not None:
335            self.strata = np.asarray(self.strata)
336        if self.offset is not None:
337            self.offset = np.asarray(self.offset)
338
339        self.surv = PHSurvivalTime(self.endog, self.status,
340                                    self.exog, self.strata,
341                                    self.entry, self.offset)
342        self.nobs = len(self.endog)
343        self.groups = None
344
345        # TODO: not used?
346        self.missing = missing
347
348        self.df_resid = float(self.exog.shape[0] -
349                              np.linalg.matrix_rank(self.exog))
350        self.df_model = float(np.linalg.matrix_rank(self.exog))
351
352        ties = ties.lower()
353        if ties not in ("efron", "breslow"):
354            raise ValueError("`ties` must be either `efron` or " +
355                             "`breslow`")
356
357        self.ties = ties
358
359    @classmethod
360    def from_formula(cls, formula, data, status=None, entry=None,
361                     strata=None, offset=None, subset=None,
362                     ties='breslow', missing='drop', *args, **kwargs):
363        """
364        Create a proportional hazards regression model from a formula
365        and dataframe.
366
367        Parameters
368        ----------
369        formula : str or generic Formula object
370            The formula specifying the model
371        data : array_like
372            The data for the model. See Notes.
373        status : array_like
374            The censoring status values; status=1 indicates that an
375            event occurred (e.g. failure or death), status=0 indicates
376            that the observation was right censored. If None, defaults
377            to status=1 for all cases.
378        entry : array_like
379            The entry times, if left truncation occurs
380        strata : array_like
381            Stratum labels.  If None, all observations are taken to be
382            in a single stratum.
383        offset : array_like
384            Array of offset values
385        subset : array_like
386            An array-like object of booleans, integers, or index
387            values that indicate the subset of df to use in the
388            model. Assumes df is a `pandas.DataFrame`
389        ties : str
390            The method used to handle tied times, must be either 'breslow'
391            or 'efron'.
392        missing : str
393            The method used to handle missing data
394        args : extra arguments
395            These are passed to the model
396        kwargs : extra keyword arguments
397            These are passed to the model with one exception. The
398            ``eval_env`` keyword is passed to patsy. It can be either a
399            :class:`patsy:patsy.EvalEnvironment` object or an integer
400            indicating the depth of the namespace to use. For example, the
401            default ``eval_env=0`` uses the calling namespace. If you wish
402            to use a "clean" environment set ``eval_env=-1``.
403
404        Returns
405        -------
406        model : PHReg model instance
407        """
408
409        # Allow array arguments to be passed by column name.
410        if isinstance(status, str):
411            status = data[status]
412        if isinstance(entry, str):
413            entry = data[entry]
414        if isinstance(strata, str):
415            strata = data[strata]
416        if isinstance(offset, str):
417            offset = data[offset]
418
419        import re
420        terms = re.split(r"[+\-~]", formula)
421        for term in terms:
422            term = term.strip()
423            if term in ("0", "1"):
424                import warnings
425                warnings.warn("PHReg formulas should not include any '0' or '1' terms")
426
427        mod = super(PHReg, cls).from_formula(formula, data,
428                    status=status, entry=entry, strata=strata,
429                    offset=offset, subset=subset, ties=ties,
430                    missing=missing, drop_cols=["Intercept"], *args,
431                    **kwargs)
432
433        return mod
434
435    def fit(self, groups=None, **args):
436        """
437        Fit a proportional hazards regression model.
438
439        Parameters
440        ----------
441        groups : array_like
442            Labels indicating groups of observations that may be
443            dependent.  If present, the standard errors account for
444            this dependence. Does not affect fitted values.
445
446        Returns
447        -------
448        PHRegResults
449            Returns a results instance.
450        """
451
452        # TODO process for missing values
453        if groups is not None:
454            if len(groups) != len(self.endog):
455                msg = ("len(groups) = %d and len(endog) = %d differ" %
456                       (len(groups), len(self.endog)))
457                raise ValueError(msg)
458            self.groups = np.asarray(groups)
459        else:
460            self.groups = None
461
462        if 'disp' not in args:
463            args['disp'] = False
464
465        fit_rslts = super(PHReg, self).fit(**args)
466
467        if self.groups is None:
468            cov_params = fit_rslts.cov_params()
469        else:
470            cov_params = self.robust_covariance(fit_rslts.params)
471
472        results = PHRegResults(self, fit_rslts.params, cov_params)
473
474        return results
475
476    def fit_regularized(self, method="elastic_net", alpha=0.,
477                        start_params=None, refit=False, **kwargs):
478        r"""
479        Return a regularized fit to a linear regression model.
480
481        Parameters
482        ----------
483        method : {'elastic_net'}
484            Only the `elastic_net` approach is currently implemented.
485        alpha : scalar or array_like
486            The penalty weight.  If a scalar, the same penalty weight
487            applies to all variables in the model.  If a vector, it
488            must have the same length as `params`, and contains a
489            penalty weight for each coefficient.
490        start_params : array_like
491            Starting values for `params`.
492        refit : bool
493            If True, the model is refit using only the variables that
494            have non-zero coefficients in the regularized fit.  The
495            refitted model is not regularized.
496        **kwargs
497            Additional keyword arguments used to fit the model.
498
499        Returns
500        -------
501        PHRegResults
502            Returns a results instance.
503
504        Notes
505        -----
506        The penalty is the ``elastic net`` penalty, which is a
507        combination of L1 and L2 penalties.
508
509        The function that is minimized is:
510
511        .. math::
512
513            -loglike/n + alpha*((1-L1\_wt)*|params|_2^2/2 + L1\_wt*|params|_1)
514
515        where :math:`|*|_1` and :math:`|*|_2` are the L1 and L2 norms.
516
517        Post-estimation results are based on the same data used to
518        select variables, hence may be subject to overfitting biases.
519
520        The elastic_net method uses the following keyword arguments:
521
522        maxiter : int
523            Maximum number of iterations
524        L1_wt  : float
525            Must be in [0, 1].  The L1 penalty has weight L1_wt and the
526            L2 penalty has weight 1 - L1_wt.
527        cnvrg_tol : float
528            Convergence threshold for line searches
529        zero_tol : float
530            Coefficients below this threshold are treated as zero.
531        """
532
533        from statsmodels.base.elastic_net import fit_elasticnet
534
535        if method != "elastic_net":
536            raise ValueError("method for fit_regularied must be elastic_net")
537
538        defaults = {"maxiter" : 50, "L1_wt" : 1, "cnvrg_tol" : 1e-10,
539                    "zero_tol" : 1e-10}
540        defaults.update(kwargs)
541
542        return fit_elasticnet(self, method=method,
543                              alpha=alpha,
544                              start_params=start_params,
545                              refit=refit,
546                              **defaults)
547
548
549    def loglike(self, params):
550        """
551        Returns the log partial likelihood function evaluated at
552        `params`.
553        """
554
555        if self.ties == "breslow":
556            return self.breslow_loglike(params)
557        elif self.ties == "efron":
558            return self.efron_loglike(params)
559
560    def score(self, params):
561        """
562        Returns the score function evaluated at `params`.
563        """
564
565        if self.ties == "breslow":
566            return self.breslow_gradient(params)
567        elif self.ties == "efron":
568            return self.efron_gradient(params)
569
570    def hessian(self, params):
571        """
572        Returns the Hessian matrix of the log partial likelihood
573        function evaluated at `params`.
574        """
575
576        if self.ties == "breslow":
577            return self.breslow_hessian(params)
578        else:
579            return self.efron_hessian(params)
580
581    def breslow_loglike(self, params):
582        """
583        Returns the value of the log partial likelihood function
584        evaluated at `params`, using the Breslow method to handle tied
585        times.
586        """
587
588        surv = self.surv
589
590        like = 0.
591
592        # Loop over strata
593        for stx in range(surv.nstrat):
594
595            uft_ix = surv.ufailt_ix[stx]
596            exog_s = surv.exog_s[stx]
597            nuft = len(uft_ix)
598
599            linpred = np.dot(exog_s, params)
600            if surv.offset_s is not None:
601                linpred += surv.offset_s[stx]
602            linpred -= linpred.max()
603            e_linpred = np.exp(linpred)
604
605            xp0 = 0.
606
607            # Iterate backward through the unique failure times.
608            for i in range(nuft)[::-1]:
609
610                # Update for new cases entering the risk set.
611                ix = surv.risk_enter[stx][i]
612                xp0 += e_linpred[ix].sum()
613
614                # Account for all cases that fail at this point.
615                ix = uft_ix[i]
616                like += (linpred[ix] - np.log(xp0)).sum()
617
618                # Update for cases leaving the risk set.
619                ix = surv.risk_exit[stx][i]
620                xp0 -= e_linpred[ix].sum()
621
622        return like
623
624    def efron_loglike(self, params):
625        """
626        Returns the value of the log partial likelihood function
627        evaluated at `params`, using the Efron method to handle tied
628        times.
629        """
630
631        surv = self.surv
632
633        like = 0.
634
635        # Loop over strata
636        for stx in range(surv.nstrat):
637
638            # exog and linear predictor for this stratum
639            exog_s = surv.exog_s[stx]
640            linpred = np.dot(exog_s, params)
641            if surv.offset_s is not None:
642                linpred += surv.offset_s[stx]
643            linpred -= linpred.max()
644            e_linpred = np.exp(linpred)
645
646            xp0 = 0.
647
648            # Iterate backward through the unique failure times.
649            uft_ix = surv.ufailt_ix[stx]
650            nuft = len(uft_ix)
651            for i in range(nuft)[::-1]:
652
653                # Update for new cases entering the risk set.
654                ix = surv.risk_enter[stx][i]
655                xp0 += e_linpred[ix].sum()
656                xp0f = e_linpred[uft_ix[i]].sum()
657
658                # Account for all cases that fail at this point.
659                ix = uft_ix[i]
660                like += linpred[ix].sum()
661
662                m = len(ix)
663                J = np.arange(m, dtype=np.float64) / m
664                like -= np.log(xp0 - J*xp0f).sum()
665
666                # Update for cases leaving the risk set.
667                ix = surv.risk_exit[stx][i]
668                xp0 -= e_linpred[ix].sum()
669
670        return like
671
672    def breslow_gradient(self, params):
673        """
674        Returns the gradient of the log partial likelihood, using the
675        Breslow method to handle tied times.
676        """
677
678        surv = self.surv
679
680        grad = 0.
681
682        # Loop over strata
683        for stx in range(surv.nstrat):
684
685            # Indices of subjects in the stratum
686            strat_ix = surv.stratum_rows[stx]
687
688            # Unique failure times in the stratum
689            uft_ix = surv.ufailt_ix[stx]
690            nuft = len(uft_ix)
691
692            # exog and linear predictor for the stratum
693            exog_s = surv.exog_s[stx]
694            linpred = np.dot(exog_s, params)
695            if surv.offset_s is not None:
696                linpred += surv.offset_s[stx]
697            linpred -= linpred.max()
698            e_linpred = np.exp(linpred)
699
700            xp0, xp1 = 0., 0.
701
702            # Iterate backward through the unique failure times.
703            for i in range(nuft)[::-1]:
704
705                # Update for new cases entering the risk set.
706                ix = surv.risk_enter[stx][i]
707                if len(ix) > 0:
708                    v = exog_s[ix,:]
709                    xp0 += e_linpred[ix].sum()
710                    xp1 += (e_linpred[ix][:,None] * v).sum(0)
711
712                # Account for all cases that fail at this point.
713                ix = uft_ix[i]
714                grad += (exog_s[ix,:] - xp1 / xp0).sum(0)
715
716                # Update for cases leaving the risk set.
717                ix = surv.risk_exit[stx][i]
718                if len(ix) > 0:
719                    v = exog_s[ix,:]
720                    xp0 -= e_linpred[ix].sum()
721                    xp1 -= (e_linpred[ix][:,None] * v).sum(0)
722
723        return grad
724
725    def efron_gradient(self, params):
726        """
727        Returns the gradient of the log partial likelihood evaluated
728        at `params`, using the Efron method to handle tied times.
729        """
730
731        surv = self.surv
732
733        grad = 0.
734
735        # Loop over strata
736        for stx in range(surv.nstrat):
737
738            # Indices of cases in the stratum
739            strat_ix = surv.stratum_rows[stx]
740
741            # exog and linear predictor of the stratum
742            exog_s = surv.exog_s[stx]
743            linpred = np.dot(exog_s, params)
744            if surv.offset_s is not None:
745                linpred += surv.offset_s[stx]
746            linpred -= linpred.max()
747            e_linpred = np.exp(linpred)
748
749            xp0, xp1 = 0., 0.
750
751            # Iterate backward through the unique failure times.
752            uft_ix = surv.ufailt_ix[stx]
753            nuft = len(uft_ix)
754            for i in range(nuft)[::-1]:
755
756                # Update for new cases entering the risk set.
757                ix = surv.risk_enter[stx][i]
758                if len(ix) > 0:
759                    v = exog_s[ix,:]
760                    xp0 += e_linpred[ix].sum()
761                    xp1 += (e_linpred[ix][:,None] * v).sum(0)
762                ixf = uft_ix[i]
763                if len(ixf) > 0:
764                    v = exog_s[ixf,:]
765                    xp0f = e_linpred[ixf].sum()
766                    xp1f = (e_linpred[ixf][:,None] * v).sum(0)
767
768                    # Consider all cases that fail at this point.
769                    grad += v.sum(0)
770
771                    m = len(ixf)
772                    J = np.arange(m, dtype=np.float64) / m
773                    numer = xp1 - np.outer(J, xp1f)
774                    denom = xp0 - np.outer(J, xp0f)
775                    ratio = numer / denom
776                    rsum = ratio.sum(0)
777                    grad -= rsum
778
779                # Update for cases leaving the risk set.
780                ix = surv.risk_exit[stx][i]
781                if len(ix) > 0:
782                    v = exog_s[ix,:]
783                    xp0 -= e_linpred[ix].sum()
784                    xp1 -= (e_linpred[ix][:,None] * v).sum(0)
785
786        return grad
787
788    def breslow_hessian(self, params):
789        """
790        Returns the Hessian of the log partial likelihood evaluated at
791        `params`, using the Breslow method to handle tied times.
792        """
793
794        surv = self.surv
795
796        hess = 0.
797
798        # Loop over strata
799        for stx in range(surv.nstrat):
800
801            uft_ix = surv.ufailt_ix[stx]
802            nuft = len(uft_ix)
803
804            exog_s = surv.exog_s[stx]
805
806            linpred = np.dot(exog_s, params)
807            if surv.offset_s is not None:
808                linpred += surv.offset_s[stx]
809            linpred -= linpred.max()
810            e_linpred = np.exp(linpred)
811
812            xp0, xp1, xp2 = 0., 0., 0.
813
814            # Iterate backward through the unique failure times.
815            for i in range(nuft)[::-1]:
816
817                # Update for new cases entering the risk set.
818                ix = surv.risk_enter[stx][i]
819                if len(ix) > 0:
820                    xp0 += e_linpred[ix].sum()
821                    v = exog_s[ix,:]
822                    xp1 += (e_linpred[ix][:,None] * v).sum(0)
823                    elx = e_linpred[ix]
824                    xp2 += np.einsum("ij,ik,i->jk", v, v, elx)
825
826                # Account for all cases that fail at this point.
827                m = len(uft_ix[i])
828                hess += m*(xp2 / xp0  - np.outer(xp1, xp1) / xp0**2)
829
830                # Update for new cases entering the risk set.
831                ix = surv.risk_exit[stx][i]
832                if len(ix) > 0:
833                    xp0 -= e_linpred[ix].sum()
834                    v = exog_s[ix,:]
835                    xp1 -= (e_linpred[ix][:,None] * v).sum(0)
836                    elx = e_linpred[ix]
837                    xp2 -= np.einsum("ij,ik,i->jk", v, v, elx)
838        return -hess
839
840    def efron_hessian(self, params):
841        """
842        Returns the Hessian matrix of the partial log-likelihood
843        evaluated at `params`, using the Efron method to handle tied
844        times.
845        """
846
847        surv = self.surv
848
849        hess = 0.
850
851        # Loop over strata
852        for stx in range(surv.nstrat):
853
854            exog_s = surv.exog_s[stx]
855
856            linpred = np.dot(exog_s, params)
857            if surv.offset_s is not None:
858                linpred += surv.offset_s[stx]
859            linpred -= linpred.max()
860            e_linpred = np.exp(linpred)
861
862            xp0, xp1, xp2 = 0., 0., 0.
863
864            # Iterate backward through the unique failure times.
865            uft_ix = surv.ufailt_ix[stx]
866            nuft = len(uft_ix)
867            for i in range(nuft)[::-1]:
868
869                # Update for new cases entering the risk set.
870                ix = surv.risk_enter[stx][i]
871                if len(ix) > 0:
872                    xp0 += e_linpred[ix].sum()
873                    v = exog_s[ix,:]
874                    xp1 += (e_linpred[ix][:,None] * v).sum(0)
875                    elx = e_linpred[ix]
876                    xp2 += np.einsum("ij,ik,i->jk", v, v, elx)
877
878                ixf = uft_ix[i]
879                if len(ixf) > 0:
880                    v = exog_s[ixf,:]
881                    xp0f = e_linpred[ixf].sum()
882                    xp1f = (e_linpred[ixf][:,None] * v).sum(0)
883                    elx = e_linpred[ixf]
884                    xp2f = np.einsum("ij,ik,i->jk", v, v, elx)
885
886                # Account for all cases that fail at this point.
887                m = len(uft_ix[i])
888                J = np.arange(m, dtype=np.float64) / m
889                c0 = xp0 - J*xp0f
890                hess += xp2 * np.sum(1 / c0)
891                hess -= xp2f * np.sum(J / c0)
892                mat = (xp1[None, :] - np.outer(J, xp1f)) / c0[:, None]
893                hess -= np.einsum("ij,ik->jk", mat, mat)
894
895                # Update for new cases entering the risk set.
896                ix = surv.risk_exit[stx][i]
897                if len(ix) > 0:
898                    xp0 -= e_linpred[ix].sum()
899                    v = exog_s[ix,:]
900                    xp1 -= (e_linpred[ix][:,None] * v).sum(0)
901                    elx = e_linpred[ix]
902                    xp2 -= np.einsum("ij,ik,i->jk", v, v, elx)
903
904        return -hess
905
906    def robust_covariance(self, params):
907        """
908        Returns a covariance matrix for the proportional hazards model
909        regresion coefficient estimates that is robust to certain
910        forms of model misspecification.
911
912        Parameters
913        ----------
914        params : ndarray
915            The parameter vector at which the covariance matrix is
916            calculated.
917
918        Returns
919        -------
920        The robust covariance matrix as a square ndarray.
921
922        Notes
923        -----
924        This function uses the `groups` argument to determine groups
925        within which observations may be dependent.  The covariance
926        matrix is calculated using the Huber-White "sandwich" approach.
927        """
928
929        if self.groups is None:
930            raise ValueError("`groups` must be specified to calculate the robust covariance matrix")
931
932        hess = self.hessian(params)
933
934        score_obs = self.score_residuals(params)
935
936        # Collapse
937        grads = {}
938        for i,g in enumerate(self.groups):
939            if g not in grads:
940                grads[g] = 0.
941            grads[g] += score_obs[i, :]
942        grads = np.asarray(list(grads.values()))
943
944        mat = grads[None, :, :]
945        mat = mat.T * mat
946        mat = mat.sum(1)
947
948        hess_inv = np.linalg.inv(hess)
949        cmat = np.dot(hess_inv, np.dot(mat, hess_inv))
950
951        return cmat
952
953    def score_residuals(self, params):
954        """
955        Returns the score residuals calculated at a given vector of
956        parameters.
957
958        Parameters
959        ----------
960        params : ndarray
961            The parameter vector at which the score residuals are
962            calculated.
963
964        Returns
965        -------
966        The score residuals, returned as a ndarray having the same
967        shape as `exog`.
968
969        Notes
970        -----
971        Observations in a stratum with no observed events have undefined
972        score residuals, and contain NaN in the returned matrix.
973        """
974
975        surv = self.surv
976
977        score_resid = np.zeros(self.exog.shape, dtype=np.float64)
978
979        # Use to set undefined values to NaN.
980        mask = np.zeros(self.exog.shape[0], dtype=np.int32)
981
982        w_avg = self.weighted_covariate_averages(params)
983
984        # Loop over strata
985        for stx in range(surv.nstrat):
986
987            uft_ix = surv.ufailt_ix[stx]
988            exog_s = surv.exog_s[stx]
989            nuft = len(uft_ix)
990            strat_ix = surv.stratum_rows[stx]
991
992            xp0 = 0.
993
994            linpred = np.dot(exog_s, params)
995            if surv.offset_s is not None:
996                linpred += surv.offset_s[stx]
997            linpred -= linpred.max()
998            e_linpred = np.exp(linpred)
999
1000            at_risk_ix = set([])
1001
1002            # Iterate backward through the unique failure times.
1003            for i in range(nuft)[::-1]:
1004
1005                # Update for new cases entering the risk set.
1006                ix = surv.risk_enter[stx][i]
1007                at_risk_ix |= set(ix)
1008                xp0 += e_linpred[ix].sum()
1009
1010                atr_ix = list(at_risk_ix)
1011                leverage = exog_s[atr_ix, :] - w_avg[stx][i, :]
1012
1013                # Event indicators
1014                d = np.zeros(exog_s.shape[0])
1015                d[uft_ix[i]] = 1
1016
1017                # The increment in the cumulative hazard
1018                dchaz = len(uft_ix[i]) / xp0
1019
1020                # Piece of the martingale residual
1021                mrp = d[atr_ix] - e_linpred[atr_ix] * dchaz
1022
1023                # Update the score residuals
1024                ii = strat_ix[atr_ix]
1025                score_resid[ii,:] += leverage * mrp[:, None]
1026                mask[ii] = 1
1027
1028                # Update for cases leaving the risk set.
1029                ix = surv.risk_exit[stx][i]
1030                at_risk_ix -= set(ix)
1031                xp0 -= e_linpred[ix].sum()
1032
1033        jj = np.flatnonzero(mask == 0)
1034        if len(jj) > 0:
1035            score_resid[jj, :] = np.nan
1036
1037        return score_resid
1038
1039    def weighted_covariate_averages(self, params):
1040        """
1041        Returns the hazard-weighted average of covariate values for
1042        subjects who are at-risk at a particular time.
1043
1044        Parameters
1045        ----------
1046        params : ndarray
1047            Parameter vector
1048
1049        Returns
1050        -------
1051        averages : list of ndarrays
1052            averages[stx][i,:] is a row vector containing the weighted
1053            average values (for all the covariates) of at-risk
1054            subjects a the i^th largest observed failure time in
1055            stratum `stx`, using the hazard multipliers as weights.
1056
1057        Notes
1058        -----
1059        Used to calculate leverages and score residuals.
1060        """
1061
1062        surv = self.surv
1063
1064        averages = []
1065        xp0, xp1 = 0., 0.
1066
1067        # Loop over strata
1068        for stx in range(surv.nstrat):
1069
1070            uft_ix = surv.ufailt_ix[stx]
1071            exog_s = surv.exog_s[stx]
1072            nuft = len(uft_ix)
1073
1074            average_s = np.zeros((len(uft_ix), exog_s.shape[1]),
1075                                  dtype=np.float64)
1076
1077            linpred = np.dot(exog_s, params)
1078            if surv.offset_s is not None:
1079                linpred += surv.offset_s[stx]
1080            linpred -= linpred.max()
1081            e_linpred = np.exp(linpred)
1082
1083            # Iterate backward through the unique failure times.
1084            for i in range(nuft)[::-1]:
1085
1086                # Update for new cases entering the risk set.
1087                ix = surv.risk_enter[stx][i]
1088                xp0 += e_linpred[ix].sum()
1089                xp1 += np.dot(e_linpred[ix], exog_s[ix, :])
1090
1091                average_s[i, :] = xp1 / xp0
1092
1093                # Update for cases leaving the risk set.
1094                ix = surv.risk_exit[stx][i]
1095                xp0 -= e_linpred[ix].sum()
1096                xp1 -= np.dot(e_linpred[ix], exog_s[ix, :])
1097
1098            averages.append(average_s)
1099
1100        return averages
1101
1102    def baseline_cumulative_hazard(self, params):
1103        """
1104        Estimate the baseline cumulative hazard and survival
1105        functions.
1106
1107        Parameters
1108        ----------
1109        params : ndarray
1110            The model parameters.
1111
1112        Returns
1113        -------
1114        A list of triples (time, hazard, survival) containing the time
1115        values and corresponding cumulative hazard and survival
1116        function values for each stratum.
1117
1118        Notes
1119        -----
1120        Uses the Nelson-Aalen estimator.
1121        """
1122
1123        # TODO: some disagreements with R, not the same algorithm but
1124        # hard to deduce what R is doing.  Our results are reasonable.
1125
1126        surv = self.surv
1127        rslt = []
1128
1129        # Loop over strata
1130        for stx in range(surv.nstrat):
1131
1132            uft = surv.ufailt[stx]
1133            uft_ix = surv.ufailt_ix[stx]
1134            exog_s = surv.exog_s[stx]
1135            nuft = len(uft_ix)
1136
1137            linpred = np.dot(exog_s, params)
1138            if surv.offset_s is not None:
1139                linpred += surv.offset_s[stx]
1140            e_linpred = np.exp(linpred)
1141
1142            xp0 = 0.
1143            h0 = np.zeros(nuft, dtype=np.float64)
1144
1145            # Iterate backward through the unique failure times.
1146            for i in range(nuft)[::-1]:
1147
1148                # Update for new cases entering the risk set.
1149                ix = surv.risk_enter[stx][i]
1150                xp0 += e_linpred[ix].sum()
1151
1152                # Account for all cases that fail at this point.
1153                ix = uft_ix[i]
1154                h0[i] = len(ix) / xp0
1155
1156                # Update for cases leaving the risk set.
1157                ix = surv.risk_exit[stx][i]
1158                xp0 -= e_linpred[ix].sum()
1159
1160            cumhaz = np.cumsum(h0) - h0
1161            current_strata_surv = np.exp(-cumhaz)
1162            rslt.append([uft, cumhaz, current_strata_surv])
1163
1164        return rslt
1165
1166    def baseline_cumulative_hazard_function(self, params):
1167        """
1168        Returns a function that calculates the baseline cumulative
1169        hazard function for each stratum.
1170
1171        Parameters
1172        ----------
1173        params : ndarray
1174            The model parameters.
1175
1176        Returns
1177        -------
1178        A dict mapping stratum names to the estimated baseline
1179        cumulative hazard function.
1180        """
1181
1182        from scipy.interpolate import interp1d
1183        surv = self.surv
1184        base = self.baseline_cumulative_hazard(params)
1185
1186        cumhaz_f = {}
1187        for stx in range(surv.nstrat):
1188            time_h = base[stx][0]
1189            cumhaz = base[stx][1]
1190            time_h = np.r_[-np.inf, time_h, np.inf]
1191            cumhaz = np.r_[cumhaz[0], cumhaz, cumhaz[-1]]
1192            func = interp1d(time_h, cumhaz, kind='zero')
1193            cumhaz_f[self.surv.stratum_names[stx]] = func
1194
1195        return cumhaz_f
1196
1197    @Appender(_predict_docstring % {
1198        'params_doc': _predict_params_doc,
1199        'cov_params_doc': _predict_cov_params_docstring})
1200    def predict(self, params, exog=None, cov_params=None, endog=None,
1201                strata=None, offset=None, pred_type="lhr", pred_only=False):
1202
1203        # This function breaks mediation, because it does not simply
1204        # return the predicted values as an array.
1205
1206        pred_type = pred_type.lower()
1207        if pred_type not in ["lhr", "hr", "surv", "cumhaz"]:
1208            msg = "Type %s not allowed for prediction" % pred_type
1209            raise ValueError(msg)
1210
1211        class bunch:
1212            predicted_values = None
1213            standard_errors = None
1214        ret_val = bunch()
1215
1216        # Do not do anything with offset here because we want to allow
1217        # different offsets to be specified even if exog is the model
1218        # exog.
1219        exog_provided = True
1220        if exog is None:
1221            exog = self.exog
1222            exog_provided = False
1223
1224        lhr = np.dot(exog, params)
1225        if offset is not None:
1226            lhr += offset
1227        # Never use self.offset unless we are also using self.exog
1228        elif self.offset is not None and not exog_provided:
1229            lhr += self.offset
1230
1231        # Handle lhr and hr prediction first, since they do not make
1232        # use of the hazard function.
1233
1234        if pred_type == "lhr":
1235            ret_val.predicted_values = lhr
1236            if cov_params is not None:
1237                mat = np.dot(exog, cov_params)
1238                va = (mat * exog).sum(1)
1239                ret_val.standard_errors = np.sqrt(va)
1240            if pred_only:
1241                return ret_val.predicted_values
1242            return ret_val
1243
1244        hr = np.exp(lhr)
1245
1246        if pred_type == "hr":
1247            ret_val.predicted_values = hr
1248            if pred_only:
1249                return ret_val.predicted_values
1250            return ret_val
1251
1252        # Makes sure endog is defined
1253        if endog is None and exog_provided:
1254            msg = "If `exog` is provided `endog` must be provided."
1255            raise ValueError(msg)
1256        # Use model endog if using model exog
1257        elif endog is None and not exog_provided:
1258            endog = self.endog
1259
1260        # Make sure strata is defined
1261        if strata is None:
1262            if exog_provided and self.surv.nstrat > 1:
1263                raise ValueError("`strata` must be provided")
1264            if self.strata is None:
1265                strata = [self.surv.stratum_names[0],] * len(endog)
1266            else:
1267                strata = self.strata
1268
1269        cumhaz = np.nan * np.ones(len(endog), dtype=np.float64)
1270        stv = np.unique(strata)
1271        bhaz = self.baseline_cumulative_hazard_function(params)
1272        for stx in stv:
1273            ix = np.flatnonzero(strata == stx)
1274            func = bhaz[stx]
1275            cumhaz[ix] = func(endog[ix]) * hr[ix]
1276
1277        if pred_type == "cumhaz":
1278            ret_val.predicted_values = cumhaz
1279
1280        elif pred_type == "surv":
1281            ret_val.predicted_values = np.exp(-cumhaz)
1282
1283        if pred_only:
1284            return ret_val.predicted_values
1285
1286        return ret_val
1287
1288    def get_distribution(self, params, scale=1.0, exog=None):
1289        """
1290        Returns a scipy distribution object corresponding to the
1291        distribution of uncensored endog (duration) values for each
1292        case.
1293
1294        Parameters
1295        ----------
1296        params : array_like
1297            The proportional hazards model parameters.
1298        scale : float
1299            Present for compatibility, not used.
1300        exog : array_like
1301            A design matrix, defaults to model.exog.
1302
1303        Returns
1304        -------
1305        A list of objects of type scipy.stats.distributions.rv_discrete
1306
1307        Notes
1308        -----
1309        The distributions are obtained from a simple discrete estimate
1310        of the survivor function that puts all mass on the observed
1311        failure times within a stratum.
1312        """
1313
1314        surv = self.surv
1315        bhaz = self.baseline_cumulative_hazard(params)
1316
1317        # The arguments to rv_discrete_float, first obtained by
1318        # stratum
1319        pk, xk = [], []
1320
1321        if exog is None:
1322            exog_split = surv.exog_s
1323        else:
1324            exog_split = self.surv._split(exog)
1325
1326        for stx in range(self.surv.nstrat):
1327
1328            exog_s = exog_split[stx]
1329
1330            linpred = np.dot(exog_s, params)
1331            if surv.offset_s is not None:
1332                linpred += surv.offset_s[stx]
1333            e_linpred = np.exp(linpred)
1334
1335            # The unique failure times for this stratum (the support
1336            # of the distribution).
1337            pts = bhaz[stx][0]
1338
1339            # The individual cumulative hazards for everyone in this
1340            # stratum.
1341            ichaz = np.outer(e_linpred, bhaz[stx][1])
1342
1343            # The individual survival functions.
1344            usurv = np.exp(-ichaz)
1345            z = np.zeros((usurv.shape[0], 1))
1346            usurv = np.concatenate((usurv, z), axis=1)
1347
1348            # The individual survival probability masses.
1349            probs = -np.diff(usurv, 1)
1350
1351            pk.append(probs)
1352            xk.append(np.outer(np.ones(probs.shape[0]), pts))
1353
1354        # Pad to make all strata have the same shape
1355        mxc = max([x.shape[1] for x in xk])
1356        for k in range(self.surv.nstrat):
1357            if xk[k].shape[1] < mxc:
1358                xk1 = np.zeros((xk[k].shape[0], mxc))
1359                pk1 = np.zeros((pk[k].shape[0], mxc))
1360                xk1[:, 0:xk[k].shape[1]] = xk[k]
1361                pk1[:, 0:pk[k].shape[1]] = pk[k]
1362                xk[k], pk[k] = xk1, pk1
1363
1364        # Put the support points and probabilities into single matrices
1365        xka = np.nan * np.ones((len(self.endog), mxc))
1366        pka = np.ones((len(self.endog), mxc), dtype=np.float64) / mxc
1367        for stx in range(self.surv.nstrat):
1368            ix = self.surv.stratum_rows[stx]
1369            xka[ix, :] = xk[stx]
1370            pka[ix, :] = pk[stx]
1371
1372        dist = rv_discrete_float(xka, pka)
1373
1374        return dist
1375
1376
1377class PHRegResults(base.LikelihoodModelResults):
1378    '''
1379    Class to contain results of fitting a Cox proportional hazards
1380    survival model.
1381
1382    PHregResults inherits from statsmodels.LikelihoodModelResults
1383
1384    Parameters
1385    ----------
1386    See statsmodels.LikelihoodModelResults
1387
1388    Attributes
1389    ----------
1390    model : class instance
1391        PHreg model instance that called fit.
1392    normalized_cov_params : ndarray
1393        The sampling covariance matrix of the estimates
1394    params : ndarray
1395        The coefficients of the fitted model.  Each coefficient is the
1396        log hazard ratio corresponding to a 1 unit difference in a
1397        single covariate while holding the other covariates fixed.
1398    bse : ndarray
1399        The standard errors of the fitted parameters.
1400
1401    See Also
1402    --------
1403    statsmodels.LikelihoodModelResults
1404    '''
1405
1406    def __init__(self, model, params, cov_params, scale=1., covariance_type="naive"):
1407
1408        # There is no scale parameter, but we need it for
1409        # meta-procedures that work with results.
1410
1411        self.covariance_type = covariance_type
1412        self.df_resid = model.df_resid
1413        self.df_model = model.df_model
1414
1415        super(PHRegResults, self).__init__(model, params, scale=1.,
1416           normalized_cov_params=cov_params)
1417
1418    @cache_readonly
1419    def standard_errors(self):
1420        """
1421        Returns the standard errors of the parameter estimates.
1422        """
1423        return np.sqrt(np.diag(self.cov_params()))
1424
1425    @cache_readonly
1426    def bse(self):
1427        """
1428        Returns the standard errors of the parameter estimates.
1429        """
1430        return self.standard_errors
1431
1432    def get_distribution(self):
1433        """
1434        Returns a scipy distribution object corresponding to the
1435        distribution of uncensored endog (duration) values for each
1436        case.
1437
1438        Returns
1439        -------
1440        A list of objects of type scipy.stats.distributions.rv_discrete
1441
1442        Notes
1443        -----
1444        The distributions are obtained from a simple discrete estimate
1445        of the survivor function that puts all mass on the observed
1446        failure times within a stratum.
1447        """
1448
1449        return self.model.get_distribution(self.params)
1450
1451    @Appender(_predict_docstring % {'params_doc': '', 'cov_params_doc': ''})
1452    def predict(self, endog=None, exog=None, strata=None,
1453                offset=None, transform=True, pred_type="lhr"):
1454        return super(PHRegResults, self).predict(exog=exog,
1455                                                 transform=transform,
1456                                                 cov_params=self.cov_params(),
1457                                                 endog=endog,
1458                                                 strata=strata,
1459                                                 offset=offset,
1460                                                 pred_type=pred_type)
1461
1462    def _group_stats(self, groups):
1463        """
1464        Descriptive statistics of the groups.
1465        """
1466        gsizes = np.unique(groups, return_counts=True)
1467        gsizes = gsizes[1]
1468        return gsizes.min(), gsizes.max(), gsizes.mean(), len(gsizes)
1469
1470    @cache_readonly
1471    def weighted_covariate_averages(self):
1472        """
1473        The average covariate values within the at-risk set at each
1474        event time point, weighted by hazard.
1475        """
1476        return self.model.weighted_covariate_averages(self.params)
1477
1478    @cache_readonly
1479    def score_residuals(self):
1480        """
1481        A matrix containing the score residuals.
1482        """
1483        return self.model.score_residuals(self.params)
1484
1485    @cache_readonly
1486    def baseline_cumulative_hazard(self):
1487        """
1488        A list (corresponding to the strata) containing the baseline
1489        cumulative hazard function evaluated at the event points.
1490        """
1491        return self.model.baseline_cumulative_hazard(self.params)
1492
1493    @cache_readonly
1494    def baseline_cumulative_hazard_function(self):
1495        """
1496        A list (corresponding to the strata) containing function
1497        objects that calculate the cumulative hazard function.
1498        """
1499        return self.model.baseline_cumulative_hazard_function(self.params)
1500
1501    @cache_readonly
1502    def schoenfeld_residuals(self):
1503        """
1504        A matrix containing the Schoenfeld residuals.
1505
1506        Notes
1507        -----
1508        Schoenfeld residuals for censored observations are set to zero.
1509        """
1510
1511        surv = self.model.surv
1512        w_avg = self.weighted_covariate_averages
1513
1514        # Initialize at NaN since rows that belong to strata with no
1515        # events have undefined residuals.
1516        sch_resid = np.nan*np.ones(self.model.exog.shape, dtype=np.float64)
1517
1518        # Loop over strata
1519        for stx in range(surv.nstrat):
1520
1521            uft = surv.ufailt[stx]
1522            exog_s = surv.exog_s[stx]
1523            time_s = surv.time_s[stx]
1524            strat_ix = surv.stratum_rows[stx]
1525
1526            ii = np.searchsorted(uft, time_s)
1527
1528            # These subjects are censored after the last event in
1529            # their stratum, so have empty risk sets and undefined
1530            # residuals.
1531            jj = np.flatnonzero(ii < len(uft))
1532
1533            sch_resid[strat_ix[jj], :] = exog_s[jj, :] - w_avg[stx][ii[jj], :]
1534
1535        jj = np.flatnonzero(self.model.status == 0)
1536        sch_resid[jj, :] = np.nan
1537
1538        return sch_resid
1539
1540    @cache_readonly
1541    def martingale_residuals(self):
1542        """
1543        The martingale residuals.
1544        """
1545
1546        surv = self.model.surv
1547
1548        # Initialize at NaN since rows that belong to strata with no
1549        # events have undefined residuals.
1550        mart_resid = np.nan*np.ones(len(self.model.endog), dtype=np.float64)
1551
1552        cumhaz_f_list = self.baseline_cumulative_hazard_function
1553
1554        # Loop over strata
1555        for stx in range(surv.nstrat):
1556
1557            cumhaz_f = cumhaz_f_list[stx]
1558
1559            exog_s = surv.exog_s[stx]
1560            time_s = surv.time_s[stx]
1561
1562            linpred = np.dot(exog_s, self.params)
1563            if surv.offset_s is not None:
1564                linpred += surv.offset_s[stx]
1565            e_linpred = np.exp(linpred)
1566
1567            ii = surv.stratum_rows[stx]
1568            chaz = cumhaz_f(time_s)
1569            mart_resid[ii] = self.model.status[ii] - e_linpred * chaz
1570
1571        return mart_resid
1572
1573    def summary(self, yname=None, xname=None, title=None, alpha=.05):
1574        """
1575        Summarize the proportional hazards regression results.
1576
1577        Parameters
1578        ----------
1579        yname : str, optional
1580            Default is `y`
1581        xname : list[str], optional
1582            Names for the exogenous variables, default is `x#` for ## in p the
1583            number of regressors. Must match the number of parameters in
1584            the model
1585        title : str, optional
1586            Title for the top table. If not None, then this replaces
1587            the default title
1588        alpha : float
1589            significance level for the confidence intervals
1590
1591        Returns
1592        -------
1593        smry : Summary instance
1594            this holds the summary tables and text, which can be
1595            printed or converted to various output formats.
1596
1597        See Also
1598        --------
1599        statsmodels.iolib.summary2.Summary : class to hold summary results
1600        """
1601
1602        from statsmodels.iolib import summary2
1603        smry = summary2.Summary()
1604        float_format = "%8.3f"
1605
1606        info = {}
1607        info["Model:"] = "PH Reg"
1608        if yname is None:
1609            yname = self.model.endog_names
1610        info["Dependent variable:"] = yname
1611        info["Ties:"] = self.model.ties.capitalize()
1612        info["Sample size:"] = str(self.model.surv.n_obs)
1613        info["Num. events:"] = str(int(sum(self.model.status)))
1614
1615        if self.model.groups is not None:
1616            mn, mx, avg, num = self._group_stats(self.model.groups)
1617            info["Num groups:"] = "%.0f" % num
1618            info["Min group size:"] = "%.0f" % mn
1619            info["Max group size:"] = "%.0f" % mx
1620            info["Avg group size:"] = "%.1f" % avg
1621
1622        if self.model.strata is not None:
1623            mn, mx, avg, num = self._group_stats(self.model.strata)
1624            info["Num strata:"] = "%.0f" % num
1625            info["Min stratum size:"] = "%.0f" % mn
1626            info["Max stratum size:"] = "%.0f" % mx
1627            info["Avg stratum size:"] = "%.1f" % avg
1628
1629        smry.add_dict(info, align='l', float_format=float_format)
1630
1631        param = summary2.summary_params(self, alpha=alpha)
1632        param = param.rename(columns={"Coef.": "log HR",
1633                                      "Std.Err.": "log HR SE"})
1634        param.insert(2, "HR", np.exp(param["log HR"]))
1635        a = "[%.3f" % (alpha / 2)
1636        param.loc[:, a] = np.exp(param.loc[:, a])
1637        a = "%.3f]" % (1 - alpha / 2)
1638        param.loc[:, a] = np.exp(param.loc[:, a])
1639        if xname is not None:
1640            param.index = xname
1641        smry.add_df(param, float_format=float_format)
1642        smry.add_title(title=title, results=self)
1643        smry.add_text("Confidence intervals are for the hazard ratios")
1644
1645        dstrat = self.model.surv.nstrat_orig - self.model.surv.nstrat
1646        if dstrat > 0:
1647            if dstrat == 1:
1648                smry.add_text("1 stratum dropped for having no events")
1649            else:
1650                smry.add_text("%d strata dropped for having no events" % dstrat)
1651
1652        if self.model.entry is not None:
1653            n_entry = sum(self.model.entry != 0)
1654            if n_entry == 1:
1655                smry.add_text("1 observation has a positive entry time")
1656            else:
1657                smry.add_text("%d observations have positive entry times" % n_entry)
1658
1659        if self.model.groups is not None:
1660            smry.add_text("Standard errors account for dependence within groups")
1661
1662        if hasattr(self, "regularized"):
1663            smry.add_text("Standard errors do not account for the regularization")
1664
1665        return smry
1666
1667
1668class rv_discrete_float(object):
1669    """
1670    A class representing a collection of discrete distributions.
1671
1672    Parameters
1673    ----------
1674    xk : 2d array_like
1675        The support points, should be non-decreasing within each
1676        row.
1677    pk : 2d array_like
1678        The probabilities, should sum to one within each row.
1679
1680    Notes
1681    -----
1682    Each row of `xk`, and the corresponding row of `pk` describe a
1683    discrete distribution.
1684
1685    `xk` and `pk` should both be two-dimensional ndarrays.  Each row
1686    of `pk` should sum to 1.
1687
1688    This class is used as a substitute for scipy.distributions.
1689    rv_discrete, since that class does not allow non-integer support
1690    points, or vectorized operations.
1691
1692    Only a limited number of methods are implemented here compared to
1693    the other scipy distribution classes.
1694    """
1695
1696    def __init__(self, xk, pk):
1697
1698        self.xk = xk
1699        self.pk = pk
1700        self.cpk = np.cumsum(self.pk, axis=1)
1701
1702    def rvs(self, n=None):
1703        """
1704        Returns a random sample from the discrete distribution.
1705
1706        A vector is returned containing a single draw from each row of
1707        `xk`, using the probabilities of the corresponding row of `pk`
1708
1709        Parameters
1710        ----------
1711        n : not used
1712            Present for signature compatibility
1713        """
1714
1715        n = self.xk.shape[0]
1716        u = np.random.uniform(size=n)
1717
1718        ix = (self.cpk < u[:, None]).sum(1)
1719        ii = np.arange(n, dtype=np.int32)
1720        return self.xk[(ii,ix)]
1721
1722    def mean(self):
1723        """
1724        Returns a vector containing the mean values of the discrete
1725        distributions.
1726
1727        A vector is returned containing the mean value of each row of
1728        `xk`, using the probabilities in the corresponding row of
1729        `pk`.
1730        """
1731
1732        return (self.xk * self.pk).sum(1)
1733
1734    def var(self):
1735        """
1736        Returns a vector containing the variances of the discrete
1737        distributions.
1738
1739        A vector is returned containing the variance for each row of
1740        `xk`, using the probabilities in the corresponding row of
1741        `pk`.
1742        """
1743
1744        mn = self.mean()
1745        xkc = self.xk - mn[:, None]
1746
1747        return (self.pk * (self.xk - xkc)**2).sum(1)
1748
1749    def std(self):
1750        """
1751        Returns a vector containing the standard deviations of the
1752        discrete distributions.
1753
1754        A vector is returned containing the standard deviation for
1755        each row of `xk`, using the probabilities in the corresponding
1756        row of `pk`.
1757        """
1758
1759        return np.sqrt(self.var())
1760