1""" 2Implementation of proportional hazards regression models for duration 3data that may be censored ("Cox models"). 4 5References 6---------- 7T Therneau (1996). Extending the Cox model. Technical report. 8http://www.mayo.edu/research/documents/biostat-58pdf/DOC-10027288 9 10G Rodriguez (2005). Non-parametric estimation in survival models. 11http://data.princeton.edu/pop509/NonParametricSurvival.pdf 12 13B Gillespie (2006). Checking the assumptions in the Cox proportional 14hazards model. 15http://www.mwsug.org/proceedings/2006/stats/MWSUG-2006-SD08.pdf 16""" 17import numpy as np 18 19from statsmodels.base import model 20import statsmodels.base.model as base 21from statsmodels.tools.decorators import cache_readonly 22from statsmodels.compat.pandas import Appender 23 24 25_predict_docstring = """ 26 Returns predicted values from the proportional hazards 27 regression model. 28 29 Parameters 30 ----------%(params_doc)s 31 exog : array_like 32 Data to use as `exog` in forming predictions. If not 33 provided, the `exog` values from the model used to fit the 34 data are used.%(cov_params_doc)s 35 endog : array_like 36 Duration (time) values at which the predictions are made. 37 Only used if pred_type is either 'cumhaz' or 'surv'. If 38 using model `exog`, defaults to model `endog` (time), but 39 may be provided explicitly to make predictions at 40 alternative times. 41 strata : array_like 42 A vector of stratum values used to form the predictions. 43 Not used (may be 'None') if pred_type is 'lhr' or 'hr'. 44 If `exog` is None, the model stratum values are used. If 45 `exog` is not None and pred_type is 'surv' or 'cumhaz', 46 stratum values must be provided (unless there is only one 47 stratum). 48 offset : array_like 49 Offset values used to create the predicted values. 50 pred_type : str 51 If 'lhr', returns log hazard ratios, if 'hr' returns 52 hazard ratios, if 'surv' returns the survival function, if 53 'cumhaz' returns the cumulative hazard function. 54 pred_only : bool 55 If True, returns only an array of predicted values. Otherwise 56 returns a bunch containing the predicted values and standard 57 errors. 58 59 Returns 60 ------- 61 A bunch containing two fields: `predicted_values` and 62 `standard_errors`. 63 64 Notes 65 ----- 66 Standard errors are only returned when predicting the log 67 hazard ratio (pred_type is 'lhr'). 68 69 Types `surv` and `cumhaz` require estimation of the cumulative 70 hazard function. 71""" 72 73_predict_params_doc = """ 74 params : array_like 75 The proportional hazards model parameters.""" 76 77_predict_cov_params_docstring = """ 78 cov_params : array_like 79 The covariance matrix of the estimated `params` vector, 80 used to obtain prediction errors if pred_type='lhr', 81 otherwise optional.""" 82 83 84 85class PHSurvivalTime(object): 86 87 def __init__(self, time, status, exog, strata=None, entry=None, 88 offset=None): 89 """ 90 Represent a collection of survival times with possible 91 stratification and left truncation. 92 93 Parameters 94 ---------- 95 time : array_like 96 The times at which either the event (failure) occurs or 97 the observation is censored. 98 status : array_like 99 Indicates whether the event (failure) occurs at `time` 100 (`status` is 1), or if `time` is a censoring time (`status` 101 is 0). 102 exog : array_like 103 The exogeneous (covariate) data matrix, cases are rows and 104 variables are columns. 105 strata : array_like 106 Grouping variable defining the strata. If None, all 107 observations are in a single stratum. 108 entry : array_like 109 Entry (left truncation) times. The observation is not 110 part of the risk set for times before the entry time. If 111 None, the entry time is treated as being zero, which 112 gives no left truncation. The entry time must be less 113 than or equal to `time`. 114 offset : array_like 115 An optional array of offsets 116 """ 117 118 # Default strata 119 if strata is None: 120 strata = np.zeros(len(time), dtype=np.int32) 121 122 # Default entry times 123 if entry is None: 124 entry = np.zeros(len(time)) 125 126 # Parameter validity checks. 127 self._check(time, status, strata, entry) 128 129 # Get the row indices for the cases in each stratum 130 stu = np.unique(strata) 131 sth = {x: [] for x in stu} 132 for i,k in enumerate(strata): 133 sth[k].append(i) 134 stratum_rows = [np.asarray(sth[k], dtype=np.int32) for k in stu] 135 stratum_names = stu 136 137 # Remove strata with no events 138 ix = [i for i,ix in enumerate(stratum_rows) if status[ix].sum() > 0] 139 self.nstrat_orig = len(stratum_rows) 140 stratum_rows = [stratum_rows[i] for i in ix] 141 stratum_names = [stratum_names[i] for i in ix] 142 143 # The number of strata 144 nstrat = len(stratum_rows) 145 self.nstrat = nstrat 146 147 # Remove subjects whose entry time occurs after the last event 148 # in their stratum. 149 for stx,ix in enumerate(stratum_rows): 150 last_failure = max(time[ix][status[ix] == 1]) 151 152 # Stata uses < here, R uses <= 153 ii = [i for i,t in enumerate(entry[ix]) if 154 t <= last_failure] 155 stratum_rows[stx] = stratum_rows[stx][ii] 156 157 # Remove subjects who are censored before the first event in 158 # their stratum. 159 for stx,ix in enumerate(stratum_rows): 160 first_failure = min(time[ix][status[ix] == 1]) 161 162 ii = [i for i,t in enumerate(time[ix]) if 163 t >= first_failure] 164 stratum_rows[stx] = stratum_rows[stx][ii] 165 166 # Order by time within each stratum 167 for stx,ix in enumerate(stratum_rows): 168 ii = np.argsort(time[ix]) 169 stratum_rows[stx] = stratum_rows[stx][ii] 170 171 if offset is not None: 172 self.offset_s = [] 173 for stx in range(nstrat): 174 self.offset_s.append(offset[stratum_rows[stx]]) 175 else: 176 self.offset_s = None 177 178 # Number of informative subjects 179 self.n_obs = sum([len(ix) for ix in stratum_rows]) 180 181 self.stratum_rows = stratum_rows 182 self.stratum_names = stratum_names 183 184 # Split everything by stratum 185 self.time_s = self._split(time) 186 self.exog_s = self._split(exog) 187 self.status_s = self._split(status) 188 self.entry_s = self._split(entry) 189 190 # Precalculate some indices needed to fit Cox models. 191 # Distinct failure times within a stratum are always taken to 192 # be sorted in ascending order. 193 # 194 # ufailt_ix[stx][k] is a list of indices for subjects who fail 195 # at the k^th sorted unique failure time in stratum stx 196 # 197 # risk_enter[stx][k] is a list of indices for subjects who 198 # enter the risk set at the k^th sorted unique failure time in 199 # stratum stx 200 # 201 # risk_exit[stx][k] is a list of indices for subjects who exit 202 # the risk set at the k^th sorted unique failure time in 203 # stratum stx 204 self.ufailt_ix, self.risk_enter, self.risk_exit, self.ufailt =\ 205 [], [], [], [] 206 207 for stx in range(self.nstrat): 208 209 # All failure times 210 ift = np.flatnonzero(self.status_s[stx] == 1) 211 ft = self.time_s[stx][ift] 212 213 # Unique failure times 214 uft = np.unique(ft) 215 nuft = len(uft) 216 217 # Indices of cases that fail at each unique failure time 218 #uft_map = {x:i for i,x in enumerate(uft)} # requires >=2.7 219 uft_map = dict([(x, i) for i,x in enumerate(uft)]) # 2.6 220 uft_ix = [[] for k in range(nuft)] 221 for ix,ti in zip(ift,ft): 222 uft_ix[uft_map[ti]].append(ix) 223 224 # Indices of cases (failed or censored) that enter the 225 # risk set at each unique failure time. 226 risk_enter1 = [[] for k in range(nuft)] 227 for i,t in enumerate(self.time_s[stx]): 228 ix = np.searchsorted(uft, t, "right") - 1 229 if ix >= 0: 230 risk_enter1[ix].append(i) 231 232 # Indices of cases (failed or censored) that exit the 233 # risk set at each unique failure time. 234 risk_exit1 = [[] for k in range(nuft)] 235 for i,t in enumerate(self.entry_s[stx]): 236 ix = np.searchsorted(uft, t) 237 risk_exit1[ix].append(i) 238 239 self.ufailt.append(uft) 240 self.ufailt_ix.append([np.asarray(x, dtype=np.int32) 241 for x in uft_ix]) 242 self.risk_enter.append([np.asarray(x, dtype=np.int32) 243 for x in risk_enter1]) 244 self.risk_exit.append([np.asarray(x, dtype=np.int32) 245 for x in risk_exit1]) 246 247 def _split(self, x): 248 v = [] 249 if x.ndim == 1: 250 for ix in self.stratum_rows: 251 v.append(x[ix]) 252 else: 253 for ix in self.stratum_rows: 254 v.append(x[ix, :]) 255 return v 256 257 def _check(self, time, status, strata, entry): 258 n1, n2, n3, n4 = len(time), len(status), len(strata),\ 259 len(entry) 260 nv = [n1, n2, n3, n4] 261 if max(nv) != min(nv): 262 raise ValueError("endog, status, strata, and " + 263 "entry must all have the same length") 264 if min(time) < 0: 265 raise ValueError("endog must be non-negative") 266 if min(entry) < 0: 267 raise ValueError("entry time must be non-negative") 268 269 # In Stata, this is entry >= time, in R it is >. 270 if np.any(entry > time): 271 raise ValueError("entry times may not occur " + 272 "after event or censoring times") 273 274 275class PHReg(model.LikelihoodModel): 276 """ 277 Cox Proportional Hazards Regression Model 278 279 The Cox PH Model is for right censored data. 280 281 Parameters 282 ---------- 283 endog : array_like 284 The observed times (event or censoring) 285 exog : 2D array_like 286 The covariates or exogeneous variables 287 status : array_like 288 The censoring status values; status=1 indicates that an 289 event occurred (e.g. failure or death), status=0 indicates 290 that the observation was right censored. If None, defaults 291 to status=1 for all cases. 292 entry : array_like 293 The entry times, if left truncation occurs 294 strata : array_like 295 Stratum labels. If None, all observations are taken to be 296 in a single stratum. 297 ties : str 298 The method used to handle tied times, must be either 'breslow' 299 or 'efron'. 300 offset : array_like 301 Array of offset values 302 missing : str 303 The method used to handle missing data 304 305 Notes 306 ----- 307 Proportional hazards regression models should not include an 308 explicit or implicit intercept. The effect of an intercept is 309 not identified using the partial likelihood approach. 310 311 `endog`, `event`, `strata`, `entry`, and the first dimension 312 of `exog` all must have the same length 313 """ 314 315 def __init__(self, endog, exog, status=None, entry=None, 316 strata=None, offset=None, ties='breslow', 317 missing='drop', **kwargs): 318 319 # Default is no censoring 320 if status is None: 321 status = np.ones(len(endog)) 322 323 super(PHReg, self).__init__(endog, exog, status=status, 324 entry=entry, strata=strata, 325 offset=offset, missing=missing, 326 **kwargs) 327 328 # endog and exog are automatically converted, but these are 329 # not 330 if self.status is not None: 331 self.status = np.asarray(self.status) 332 if self.entry is not None: 333 self.entry = np.asarray(self.entry) 334 if self.strata is not None: 335 self.strata = np.asarray(self.strata) 336 if self.offset is not None: 337 self.offset = np.asarray(self.offset) 338 339 self.surv = PHSurvivalTime(self.endog, self.status, 340 self.exog, self.strata, 341 self.entry, self.offset) 342 self.nobs = len(self.endog) 343 self.groups = None 344 345 # TODO: not used? 346 self.missing = missing 347 348 self.df_resid = float(self.exog.shape[0] - 349 np.linalg.matrix_rank(self.exog)) 350 self.df_model = float(np.linalg.matrix_rank(self.exog)) 351 352 ties = ties.lower() 353 if ties not in ("efron", "breslow"): 354 raise ValueError("`ties` must be either `efron` or " + 355 "`breslow`") 356 357 self.ties = ties 358 359 @classmethod 360 def from_formula(cls, formula, data, status=None, entry=None, 361 strata=None, offset=None, subset=None, 362 ties='breslow', missing='drop', *args, **kwargs): 363 """ 364 Create a proportional hazards regression model from a formula 365 and dataframe. 366 367 Parameters 368 ---------- 369 formula : str or generic Formula object 370 The formula specifying the model 371 data : array_like 372 The data for the model. See Notes. 373 status : array_like 374 The censoring status values; status=1 indicates that an 375 event occurred (e.g. failure or death), status=0 indicates 376 that the observation was right censored. If None, defaults 377 to status=1 for all cases. 378 entry : array_like 379 The entry times, if left truncation occurs 380 strata : array_like 381 Stratum labels. If None, all observations are taken to be 382 in a single stratum. 383 offset : array_like 384 Array of offset values 385 subset : array_like 386 An array-like object of booleans, integers, or index 387 values that indicate the subset of df to use in the 388 model. Assumes df is a `pandas.DataFrame` 389 ties : str 390 The method used to handle tied times, must be either 'breslow' 391 or 'efron'. 392 missing : str 393 The method used to handle missing data 394 args : extra arguments 395 These are passed to the model 396 kwargs : extra keyword arguments 397 These are passed to the model with one exception. The 398 ``eval_env`` keyword is passed to patsy. It can be either a 399 :class:`patsy:patsy.EvalEnvironment` object or an integer 400 indicating the depth of the namespace to use. For example, the 401 default ``eval_env=0`` uses the calling namespace. If you wish 402 to use a "clean" environment set ``eval_env=-1``. 403 404 Returns 405 ------- 406 model : PHReg model instance 407 """ 408 409 # Allow array arguments to be passed by column name. 410 if isinstance(status, str): 411 status = data[status] 412 if isinstance(entry, str): 413 entry = data[entry] 414 if isinstance(strata, str): 415 strata = data[strata] 416 if isinstance(offset, str): 417 offset = data[offset] 418 419 import re 420 terms = re.split(r"[+\-~]", formula) 421 for term in terms: 422 term = term.strip() 423 if term in ("0", "1"): 424 import warnings 425 warnings.warn("PHReg formulas should not include any '0' or '1' terms") 426 427 mod = super(PHReg, cls).from_formula(formula, data, 428 status=status, entry=entry, strata=strata, 429 offset=offset, subset=subset, ties=ties, 430 missing=missing, drop_cols=["Intercept"], *args, 431 **kwargs) 432 433 return mod 434 435 def fit(self, groups=None, **args): 436 """ 437 Fit a proportional hazards regression model. 438 439 Parameters 440 ---------- 441 groups : array_like 442 Labels indicating groups of observations that may be 443 dependent. If present, the standard errors account for 444 this dependence. Does not affect fitted values. 445 446 Returns 447 ------- 448 PHRegResults 449 Returns a results instance. 450 """ 451 452 # TODO process for missing values 453 if groups is not None: 454 if len(groups) != len(self.endog): 455 msg = ("len(groups) = %d and len(endog) = %d differ" % 456 (len(groups), len(self.endog))) 457 raise ValueError(msg) 458 self.groups = np.asarray(groups) 459 else: 460 self.groups = None 461 462 if 'disp' not in args: 463 args['disp'] = False 464 465 fit_rslts = super(PHReg, self).fit(**args) 466 467 if self.groups is None: 468 cov_params = fit_rslts.cov_params() 469 else: 470 cov_params = self.robust_covariance(fit_rslts.params) 471 472 results = PHRegResults(self, fit_rslts.params, cov_params) 473 474 return results 475 476 def fit_regularized(self, method="elastic_net", alpha=0., 477 start_params=None, refit=False, **kwargs): 478 r""" 479 Return a regularized fit to a linear regression model. 480 481 Parameters 482 ---------- 483 method : {'elastic_net'} 484 Only the `elastic_net` approach is currently implemented. 485 alpha : scalar or array_like 486 The penalty weight. If a scalar, the same penalty weight 487 applies to all variables in the model. If a vector, it 488 must have the same length as `params`, and contains a 489 penalty weight for each coefficient. 490 start_params : array_like 491 Starting values for `params`. 492 refit : bool 493 If True, the model is refit using only the variables that 494 have non-zero coefficients in the regularized fit. The 495 refitted model is not regularized. 496 **kwargs 497 Additional keyword arguments used to fit the model. 498 499 Returns 500 ------- 501 PHRegResults 502 Returns a results instance. 503 504 Notes 505 ----- 506 The penalty is the ``elastic net`` penalty, which is a 507 combination of L1 and L2 penalties. 508 509 The function that is minimized is: 510 511 .. math:: 512 513 -loglike/n + alpha*((1-L1\_wt)*|params|_2^2/2 + L1\_wt*|params|_1) 514 515 where :math:`|*|_1` and :math:`|*|_2` are the L1 and L2 norms. 516 517 Post-estimation results are based on the same data used to 518 select variables, hence may be subject to overfitting biases. 519 520 The elastic_net method uses the following keyword arguments: 521 522 maxiter : int 523 Maximum number of iterations 524 L1_wt : float 525 Must be in [0, 1]. The L1 penalty has weight L1_wt and the 526 L2 penalty has weight 1 - L1_wt. 527 cnvrg_tol : float 528 Convergence threshold for line searches 529 zero_tol : float 530 Coefficients below this threshold are treated as zero. 531 """ 532 533 from statsmodels.base.elastic_net import fit_elasticnet 534 535 if method != "elastic_net": 536 raise ValueError("method for fit_regularied must be elastic_net") 537 538 defaults = {"maxiter" : 50, "L1_wt" : 1, "cnvrg_tol" : 1e-10, 539 "zero_tol" : 1e-10} 540 defaults.update(kwargs) 541 542 return fit_elasticnet(self, method=method, 543 alpha=alpha, 544 start_params=start_params, 545 refit=refit, 546 **defaults) 547 548 549 def loglike(self, params): 550 """ 551 Returns the log partial likelihood function evaluated at 552 `params`. 553 """ 554 555 if self.ties == "breslow": 556 return self.breslow_loglike(params) 557 elif self.ties == "efron": 558 return self.efron_loglike(params) 559 560 def score(self, params): 561 """ 562 Returns the score function evaluated at `params`. 563 """ 564 565 if self.ties == "breslow": 566 return self.breslow_gradient(params) 567 elif self.ties == "efron": 568 return self.efron_gradient(params) 569 570 def hessian(self, params): 571 """ 572 Returns the Hessian matrix of the log partial likelihood 573 function evaluated at `params`. 574 """ 575 576 if self.ties == "breslow": 577 return self.breslow_hessian(params) 578 else: 579 return self.efron_hessian(params) 580 581 def breslow_loglike(self, params): 582 """ 583 Returns the value of the log partial likelihood function 584 evaluated at `params`, using the Breslow method to handle tied 585 times. 586 """ 587 588 surv = self.surv 589 590 like = 0. 591 592 # Loop over strata 593 for stx in range(surv.nstrat): 594 595 uft_ix = surv.ufailt_ix[stx] 596 exog_s = surv.exog_s[stx] 597 nuft = len(uft_ix) 598 599 linpred = np.dot(exog_s, params) 600 if surv.offset_s is not None: 601 linpred += surv.offset_s[stx] 602 linpred -= linpred.max() 603 e_linpred = np.exp(linpred) 604 605 xp0 = 0. 606 607 # Iterate backward through the unique failure times. 608 for i in range(nuft)[::-1]: 609 610 # Update for new cases entering the risk set. 611 ix = surv.risk_enter[stx][i] 612 xp0 += e_linpred[ix].sum() 613 614 # Account for all cases that fail at this point. 615 ix = uft_ix[i] 616 like += (linpred[ix] - np.log(xp0)).sum() 617 618 # Update for cases leaving the risk set. 619 ix = surv.risk_exit[stx][i] 620 xp0 -= e_linpred[ix].sum() 621 622 return like 623 624 def efron_loglike(self, params): 625 """ 626 Returns the value of the log partial likelihood function 627 evaluated at `params`, using the Efron method to handle tied 628 times. 629 """ 630 631 surv = self.surv 632 633 like = 0. 634 635 # Loop over strata 636 for stx in range(surv.nstrat): 637 638 # exog and linear predictor for this stratum 639 exog_s = surv.exog_s[stx] 640 linpred = np.dot(exog_s, params) 641 if surv.offset_s is not None: 642 linpred += surv.offset_s[stx] 643 linpred -= linpred.max() 644 e_linpred = np.exp(linpred) 645 646 xp0 = 0. 647 648 # Iterate backward through the unique failure times. 649 uft_ix = surv.ufailt_ix[stx] 650 nuft = len(uft_ix) 651 for i in range(nuft)[::-1]: 652 653 # Update for new cases entering the risk set. 654 ix = surv.risk_enter[stx][i] 655 xp0 += e_linpred[ix].sum() 656 xp0f = e_linpred[uft_ix[i]].sum() 657 658 # Account for all cases that fail at this point. 659 ix = uft_ix[i] 660 like += linpred[ix].sum() 661 662 m = len(ix) 663 J = np.arange(m, dtype=np.float64) / m 664 like -= np.log(xp0 - J*xp0f).sum() 665 666 # Update for cases leaving the risk set. 667 ix = surv.risk_exit[stx][i] 668 xp0 -= e_linpred[ix].sum() 669 670 return like 671 672 def breslow_gradient(self, params): 673 """ 674 Returns the gradient of the log partial likelihood, using the 675 Breslow method to handle tied times. 676 """ 677 678 surv = self.surv 679 680 grad = 0. 681 682 # Loop over strata 683 for stx in range(surv.nstrat): 684 685 # Indices of subjects in the stratum 686 strat_ix = surv.stratum_rows[stx] 687 688 # Unique failure times in the stratum 689 uft_ix = surv.ufailt_ix[stx] 690 nuft = len(uft_ix) 691 692 # exog and linear predictor for the stratum 693 exog_s = surv.exog_s[stx] 694 linpred = np.dot(exog_s, params) 695 if surv.offset_s is not None: 696 linpred += surv.offset_s[stx] 697 linpred -= linpred.max() 698 e_linpred = np.exp(linpred) 699 700 xp0, xp1 = 0., 0. 701 702 # Iterate backward through the unique failure times. 703 for i in range(nuft)[::-1]: 704 705 # Update for new cases entering the risk set. 706 ix = surv.risk_enter[stx][i] 707 if len(ix) > 0: 708 v = exog_s[ix,:] 709 xp0 += e_linpred[ix].sum() 710 xp1 += (e_linpred[ix][:,None] * v).sum(0) 711 712 # Account for all cases that fail at this point. 713 ix = uft_ix[i] 714 grad += (exog_s[ix,:] - xp1 / xp0).sum(0) 715 716 # Update for cases leaving the risk set. 717 ix = surv.risk_exit[stx][i] 718 if len(ix) > 0: 719 v = exog_s[ix,:] 720 xp0 -= e_linpred[ix].sum() 721 xp1 -= (e_linpred[ix][:,None] * v).sum(0) 722 723 return grad 724 725 def efron_gradient(self, params): 726 """ 727 Returns the gradient of the log partial likelihood evaluated 728 at `params`, using the Efron method to handle tied times. 729 """ 730 731 surv = self.surv 732 733 grad = 0. 734 735 # Loop over strata 736 for stx in range(surv.nstrat): 737 738 # Indices of cases in the stratum 739 strat_ix = surv.stratum_rows[stx] 740 741 # exog and linear predictor of the stratum 742 exog_s = surv.exog_s[stx] 743 linpred = np.dot(exog_s, params) 744 if surv.offset_s is not None: 745 linpred += surv.offset_s[stx] 746 linpred -= linpred.max() 747 e_linpred = np.exp(linpred) 748 749 xp0, xp1 = 0., 0. 750 751 # Iterate backward through the unique failure times. 752 uft_ix = surv.ufailt_ix[stx] 753 nuft = len(uft_ix) 754 for i in range(nuft)[::-1]: 755 756 # Update for new cases entering the risk set. 757 ix = surv.risk_enter[stx][i] 758 if len(ix) > 0: 759 v = exog_s[ix,:] 760 xp0 += e_linpred[ix].sum() 761 xp1 += (e_linpred[ix][:,None] * v).sum(0) 762 ixf = uft_ix[i] 763 if len(ixf) > 0: 764 v = exog_s[ixf,:] 765 xp0f = e_linpred[ixf].sum() 766 xp1f = (e_linpred[ixf][:,None] * v).sum(0) 767 768 # Consider all cases that fail at this point. 769 grad += v.sum(0) 770 771 m = len(ixf) 772 J = np.arange(m, dtype=np.float64) / m 773 numer = xp1 - np.outer(J, xp1f) 774 denom = xp0 - np.outer(J, xp0f) 775 ratio = numer / denom 776 rsum = ratio.sum(0) 777 grad -= rsum 778 779 # Update for cases leaving the risk set. 780 ix = surv.risk_exit[stx][i] 781 if len(ix) > 0: 782 v = exog_s[ix,:] 783 xp0 -= e_linpred[ix].sum() 784 xp1 -= (e_linpred[ix][:,None] * v).sum(0) 785 786 return grad 787 788 def breslow_hessian(self, params): 789 """ 790 Returns the Hessian of the log partial likelihood evaluated at 791 `params`, using the Breslow method to handle tied times. 792 """ 793 794 surv = self.surv 795 796 hess = 0. 797 798 # Loop over strata 799 for stx in range(surv.nstrat): 800 801 uft_ix = surv.ufailt_ix[stx] 802 nuft = len(uft_ix) 803 804 exog_s = surv.exog_s[stx] 805 806 linpred = np.dot(exog_s, params) 807 if surv.offset_s is not None: 808 linpred += surv.offset_s[stx] 809 linpred -= linpred.max() 810 e_linpred = np.exp(linpred) 811 812 xp0, xp1, xp2 = 0., 0., 0. 813 814 # Iterate backward through the unique failure times. 815 for i in range(nuft)[::-1]: 816 817 # Update for new cases entering the risk set. 818 ix = surv.risk_enter[stx][i] 819 if len(ix) > 0: 820 xp0 += e_linpred[ix].sum() 821 v = exog_s[ix,:] 822 xp1 += (e_linpred[ix][:,None] * v).sum(0) 823 elx = e_linpred[ix] 824 xp2 += np.einsum("ij,ik,i->jk", v, v, elx) 825 826 # Account for all cases that fail at this point. 827 m = len(uft_ix[i]) 828 hess += m*(xp2 / xp0 - np.outer(xp1, xp1) / xp0**2) 829 830 # Update for new cases entering the risk set. 831 ix = surv.risk_exit[stx][i] 832 if len(ix) > 0: 833 xp0 -= e_linpred[ix].sum() 834 v = exog_s[ix,:] 835 xp1 -= (e_linpred[ix][:,None] * v).sum(0) 836 elx = e_linpred[ix] 837 xp2 -= np.einsum("ij,ik,i->jk", v, v, elx) 838 return -hess 839 840 def efron_hessian(self, params): 841 """ 842 Returns the Hessian matrix of the partial log-likelihood 843 evaluated at `params`, using the Efron method to handle tied 844 times. 845 """ 846 847 surv = self.surv 848 849 hess = 0. 850 851 # Loop over strata 852 for stx in range(surv.nstrat): 853 854 exog_s = surv.exog_s[stx] 855 856 linpred = np.dot(exog_s, params) 857 if surv.offset_s is not None: 858 linpred += surv.offset_s[stx] 859 linpred -= linpred.max() 860 e_linpred = np.exp(linpred) 861 862 xp0, xp1, xp2 = 0., 0., 0. 863 864 # Iterate backward through the unique failure times. 865 uft_ix = surv.ufailt_ix[stx] 866 nuft = len(uft_ix) 867 for i in range(nuft)[::-1]: 868 869 # Update for new cases entering the risk set. 870 ix = surv.risk_enter[stx][i] 871 if len(ix) > 0: 872 xp0 += e_linpred[ix].sum() 873 v = exog_s[ix,:] 874 xp1 += (e_linpred[ix][:,None] * v).sum(0) 875 elx = e_linpred[ix] 876 xp2 += np.einsum("ij,ik,i->jk", v, v, elx) 877 878 ixf = uft_ix[i] 879 if len(ixf) > 0: 880 v = exog_s[ixf,:] 881 xp0f = e_linpred[ixf].sum() 882 xp1f = (e_linpred[ixf][:,None] * v).sum(0) 883 elx = e_linpred[ixf] 884 xp2f = np.einsum("ij,ik,i->jk", v, v, elx) 885 886 # Account for all cases that fail at this point. 887 m = len(uft_ix[i]) 888 J = np.arange(m, dtype=np.float64) / m 889 c0 = xp0 - J*xp0f 890 hess += xp2 * np.sum(1 / c0) 891 hess -= xp2f * np.sum(J / c0) 892 mat = (xp1[None, :] - np.outer(J, xp1f)) / c0[:, None] 893 hess -= np.einsum("ij,ik->jk", mat, mat) 894 895 # Update for new cases entering the risk set. 896 ix = surv.risk_exit[stx][i] 897 if len(ix) > 0: 898 xp0 -= e_linpred[ix].sum() 899 v = exog_s[ix,:] 900 xp1 -= (e_linpred[ix][:,None] * v).sum(0) 901 elx = e_linpred[ix] 902 xp2 -= np.einsum("ij,ik,i->jk", v, v, elx) 903 904 return -hess 905 906 def robust_covariance(self, params): 907 """ 908 Returns a covariance matrix for the proportional hazards model 909 regresion coefficient estimates that is robust to certain 910 forms of model misspecification. 911 912 Parameters 913 ---------- 914 params : ndarray 915 The parameter vector at which the covariance matrix is 916 calculated. 917 918 Returns 919 ------- 920 The robust covariance matrix as a square ndarray. 921 922 Notes 923 ----- 924 This function uses the `groups` argument to determine groups 925 within which observations may be dependent. The covariance 926 matrix is calculated using the Huber-White "sandwich" approach. 927 """ 928 929 if self.groups is None: 930 raise ValueError("`groups` must be specified to calculate the robust covariance matrix") 931 932 hess = self.hessian(params) 933 934 score_obs = self.score_residuals(params) 935 936 # Collapse 937 grads = {} 938 for i,g in enumerate(self.groups): 939 if g not in grads: 940 grads[g] = 0. 941 grads[g] += score_obs[i, :] 942 grads = np.asarray(list(grads.values())) 943 944 mat = grads[None, :, :] 945 mat = mat.T * mat 946 mat = mat.sum(1) 947 948 hess_inv = np.linalg.inv(hess) 949 cmat = np.dot(hess_inv, np.dot(mat, hess_inv)) 950 951 return cmat 952 953 def score_residuals(self, params): 954 """ 955 Returns the score residuals calculated at a given vector of 956 parameters. 957 958 Parameters 959 ---------- 960 params : ndarray 961 The parameter vector at which the score residuals are 962 calculated. 963 964 Returns 965 ------- 966 The score residuals, returned as a ndarray having the same 967 shape as `exog`. 968 969 Notes 970 ----- 971 Observations in a stratum with no observed events have undefined 972 score residuals, and contain NaN in the returned matrix. 973 """ 974 975 surv = self.surv 976 977 score_resid = np.zeros(self.exog.shape, dtype=np.float64) 978 979 # Use to set undefined values to NaN. 980 mask = np.zeros(self.exog.shape[0], dtype=np.int32) 981 982 w_avg = self.weighted_covariate_averages(params) 983 984 # Loop over strata 985 for stx in range(surv.nstrat): 986 987 uft_ix = surv.ufailt_ix[stx] 988 exog_s = surv.exog_s[stx] 989 nuft = len(uft_ix) 990 strat_ix = surv.stratum_rows[stx] 991 992 xp0 = 0. 993 994 linpred = np.dot(exog_s, params) 995 if surv.offset_s is not None: 996 linpred += surv.offset_s[stx] 997 linpred -= linpred.max() 998 e_linpred = np.exp(linpred) 999 1000 at_risk_ix = set([]) 1001 1002 # Iterate backward through the unique failure times. 1003 for i in range(nuft)[::-1]: 1004 1005 # Update for new cases entering the risk set. 1006 ix = surv.risk_enter[stx][i] 1007 at_risk_ix |= set(ix) 1008 xp0 += e_linpred[ix].sum() 1009 1010 atr_ix = list(at_risk_ix) 1011 leverage = exog_s[atr_ix, :] - w_avg[stx][i, :] 1012 1013 # Event indicators 1014 d = np.zeros(exog_s.shape[0]) 1015 d[uft_ix[i]] = 1 1016 1017 # The increment in the cumulative hazard 1018 dchaz = len(uft_ix[i]) / xp0 1019 1020 # Piece of the martingale residual 1021 mrp = d[atr_ix] - e_linpred[atr_ix] * dchaz 1022 1023 # Update the score residuals 1024 ii = strat_ix[atr_ix] 1025 score_resid[ii,:] += leverage * mrp[:, None] 1026 mask[ii] = 1 1027 1028 # Update for cases leaving the risk set. 1029 ix = surv.risk_exit[stx][i] 1030 at_risk_ix -= set(ix) 1031 xp0 -= e_linpred[ix].sum() 1032 1033 jj = np.flatnonzero(mask == 0) 1034 if len(jj) > 0: 1035 score_resid[jj, :] = np.nan 1036 1037 return score_resid 1038 1039 def weighted_covariate_averages(self, params): 1040 """ 1041 Returns the hazard-weighted average of covariate values for 1042 subjects who are at-risk at a particular time. 1043 1044 Parameters 1045 ---------- 1046 params : ndarray 1047 Parameter vector 1048 1049 Returns 1050 ------- 1051 averages : list of ndarrays 1052 averages[stx][i,:] is a row vector containing the weighted 1053 average values (for all the covariates) of at-risk 1054 subjects a the i^th largest observed failure time in 1055 stratum `stx`, using the hazard multipliers as weights. 1056 1057 Notes 1058 ----- 1059 Used to calculate leverages and score residuals. 1060 """ 1061 1062 surv = self.surv 1063 1064 averages = [] 1065 xp0, xp1 = 0., 0. 1066 1067 # Loop over strata 1068 for stx in range(surv.nstrat): 1069 1070 uft_ix = surv.ufailt_ix[stx] 1071 exog_s = surv.exog_s[stx] 1072 nuft = len(uft_ix) 1073 1074 average_s = np.zeros((len(uft_ix), exog_s.shape[1]), 1075 dtype=np.float64) 1076 1077 linpred = np.dot(exog_s, params) 1078 if surv.offset_s is not None: 1079 linpred += surv.offset_s[stx] 1080 linpred -= linpred.max() 1081 e_linpred = np.exp(linpred) 1082 1083 # Iterate backward through the unique failure times. 1084 for i in range(nuft)[::-1]: 1085 1086 # Update for new cases entering the risk set. 1087 ix = surv.risk_enter[stx][i] 1088 xp0 += e_linpred[ix].sum() 1089 xp1 += np.dot(e_linpred[ix], exog_s[ix, :]) 1090 1091 average_s[i, :] = xp1 / xp0 1092 1093 # Update for cases leaving the risk set. 1094 ix = surv.risk_exit[stx][i] 1095 xp0 -= e_linpred[ix].sum() 1096 xp1 -= np.dot(e_linpred[ix], exog_s[ix, :]) 1097 1098 averages.append(average_s) 1099 1100 return averages 1101 1102 def baseline_cumulative_hazard(self, params): 1103 """ 1104 Estimate the baseline cumulative hazard and survival 1105 functions. 1106 1107 Parameters 1108 ---------- 1109 params : ndarray 1110 The model parameters. 1111 1112 Returns 1113 ------- 1114 A list of triples (time, hazard, survival) containing the time 1115 values and corresponding cumulative hazard and survival 1116 function values for each stratum. 1117 1118 Notes 1119 ----- 1120 Uses the Nelson-Aalen estimator. 1121 """ 1122 1123 # TODO: some disagreements with R, not the same algorithm but 1124 # hard to deduce what R is doing. Our results are reasonable. 1125 1126 surv = self.surv 1127 rslt = [] 1128 1129 # Loop over strata 1130 for stx in range(surv.nstrat): 1131 1132 uft = surv.ufailt[stx] 1133 uft_ix = surv.ufailt_ix[stx] 1134 exog_s = surv.exog_s[stx] 1135 nuft = len(uft_ix) 1136 1137 linpred = np.dot(exog_s, params) 1138 if surv.offset_s is not None: 1139 linpred += surv.offset_s[stx] 1140 e_linpred = np.exp(linpred) 1141 1142 xp0 = 0. 1143 h0 = np.zeros(nuft, dtype=np.float64) 1144 1145 # Iterate backward through the unique failure times. 1146 for i in range(nuft)[::-1]: 1147 1148 # Update for new cases entering the risk set. 1149 ix = surv.risk_enter[stx][i] 1150 xp0 += e_linpred[ix].sum() 1151 1152 # Account for all cases that fail at this point. 1153 ix = uft_ix[i] 1154 h0[i] = len(ix) / xp0 1155 1156 # Update for cases leaving the risk set. 1157 ix = surv.risk_exit[stx][i] 1158 xp0 -= e_linpred[ix].sum() 1159 1160 cumhaz = np.cumsum(h0) - h0 1161 current_strata_surv = np.exp(-cumhaz) 1162 rslt.append([uft, cumhaz, current_strata_surv]) 1163 1164 return rslt 1165 1166 def baseline_cumulative_hazard_function(self, params): 1167 """ 1168 Returns a function that calculates the baseline cumulative 1169 hazard function for each stratum. 1170 1171 Parameters 1172 ---------- 1173 params : ndarray 1174 The model parameters. 1175 1176 Returns 1177 ------- 1178 A dict mapping stratum names to the estimated baseline 1179 cumulative hazard function. 1180 """ 1181 1182 from scipy.interpolate import interp1d 1183 surv = self.surv 1184 base = self.baseline_cumulative_hazard(params) 1185 1186 cumhaz_f = {} 1187 for stx in range(surv.nstrat): 1188 time_h = base[stx][0] 1189 cumhaz = base[stx][1] 1190 time_h = np.r_[-np.inf, time_h, np.inf] 1191 cumhaz = np.r_[cumhaz[0], cumhaz, cumhaz[-1]] 1192 func = interp1d(time_h, cumhaz, kind='zero') 1193 cumhaz_f[self.surv.stratum_names[stx]] = func 1194 1195 return cumhaz_f 1196 1197 @Appender(_predict_docstring % { 1198 'params_doc': _predict_params_doc, 1199 'cov_params_doc': _predict_cov_params_docstring}) 1200 def predict(self, params, exog=None, cov_params=None, endog=None, 1201 strata=None, offset=None, pred_type="lhr", pred_only=False): 1202 1203 # This function breaks mediation, because it does not simply 1204 # return the predicted values as an array. 1205 1206 pred_type = pred_type.lower() 1207 if pred_type not in ["lhr", "hr", "surv", "cumhaz"]: 1208 msg = "Type %s not allowed for prediction" % pred_type 1209 raise ValueError(msg) 1210 1211 class bunch: 1212 predicted_values = None 1213 standard_errors = None 1214 ret_val = bunch() 1215 1216 # Do not do anything with offset here because we want to allow 1217 # different offsets to be specified even if exog is the model 1218 # exog. 1219 exog_provided = True 1220 if exog is None: 1221 exog = self.exog 1222 exog_provided = False 1223 1224 lhr = np.dot(exog, params) 1225 if offset is not None: 1226 lhr += offset 1227 # Never use self.offset unless we are also using self.exog 1228 elif self.offset is not None and not exog_provided: 1229 lhr += self.offset 1230 1231 # Handle lhr and hr prediction first, since they do not make 1232 # use of the hazard function. 1233 1234 if pred_type == "lhr": 1235 ret_val.predicted_values = lhr 1236 if cov_params is not None: 1237 mat = np.dot(exog, cov_params) 1238 va = (mat * exog).sum(1) 1239 ret_val.standard_errors = np.sqrt(va) 1240 if pred_only: 1241 return ret_val.predicted_values 1242 return ret_val 1243 1244 hr = np.exp(lhr) 1245 1246 if pred_type == "hr": 1247 ret_val.predicted_values = hr 1248 if pred_only: 1249 return ret_val.predicted_values 1250 return ret_val 1251 1252 # Makes sure endog is defined 1253 if endog is None and exog_provided: 1254 msg = "If `exog` is provided `endog` must be provided." 1255 raise ValueError(msg) 1256 # Use model endog if using model exog 1257 elif endog is None and not exog_provided: 1258 endog = self.endog 1259 1260 # Make sure strata is defined 1261 if strata is None: 1262 if exog_provided and self.surv.nstrat > 1: 1263 raise ValueError("`strata` must be provided") 1264 if self.strata is None: 1265 strata = [self.surv.stratum_names[0],] * len(endog) 1266 else: 1267 strata = self.strata 1268 1269 cumhaz = np.nan * np.ones(len(endog), dtype=np.float64) 1270 stv = np.unique(strata) 1271 bhaz = self.baseline_cumulative_hazard_function(params) 1272 for stx in stv: 1273 ix = np.flatnonzero(strata == stx) 1274 func = bhaz[stx] 1275 cumhaz[ix] = func(endog[ix]) * hr[ix] 1276 1277 if pred_type == "cumhaz": 1278 ret_val.predicted_values = cumhaz 1279 1280 elif pred_type == "surv": 1281 ret_val.predicted_values = np.exp(-cumhaz) 1282 1283 if pred_only: 1284 return ret_val.predicted_values 1285 1286 return ret_val 1287 1288 def get_distribution(self, params, scale=1.0, exog=None): 1289 """ 1290 Returns a scipy distribution object corresponding to the 1291 distribution of uncensored endog (duration) values for each 1292 case. 1293 1294 Parameters 1295 ---------- 1296 params : array_like 1297 The proportional hazards model parameters. 1298 scale : float 1299 Present for compatibility, not used. 1300 exog : array_like 1301 A design matrix, defaults to model.exog. 1302 1303 Returns 1304 ------- 1305 A list of objects of type scipy.stats.distributions.rv_discrete 1306 1307 Notes 1308 ----- 1309 The distributions are obtained from a simple discrete estimate 1310 of the survivor function that puts all mass on the observed 1311 failure times within a stratum. 1312 """ 1313 1314 surv = self.surv 1315 bhaz = self.baseline_cumulative_hazard(params) 1316 1317 # The arguments to rv_discrete_float, first obtained by 1318 # stratum 1319 pk, xk = [], [] 1320 1321 if exog is None: 1322 exog_split = surv.exog_s 1323 else: 1324 exog_split = self.surv._split(exog) 1325 1326 for stx in range(self.surv.nstrat): 1327 1328 exog_s = exog_split[stx] 1329 1330 linpred = np.dot(exog_s, params) 1331 if surv.offset_s is not None: 1332 linpred += surv.offset_s[stx] 1333 e_linpred = np.exp(linpred) 1334 1335 # The unique failure times for this stratum (the support 1336 # of the distribution). 1337 pts = bhaz[stx][0] 1338 1339 # The individual cumulative hazards for everyone in this 1340 # stratum. 1341 ichaz = np.outer(e_linpred, bhaz[stx][1]) 1342 1343 # The individual survival functions. 1344 usurv = np.exp(-ichaz) 1345 z = np.zeros((usurv.shape[0], 1)) 1346 usurv = np.concatenate((usurv, z), axis=1) 1347 1348 # The individual survival probability masses. 1349 probs = -np.diff(usurv, 1) 1350 1351 pk.append(probs) 1352 xk.append(np.outer(np.ones(probs.shape[0]), pts)) 1353 1354 # Pad to make all strata have the same shape 1355 mxc = max([x.shape[1] for x in xk]) 1356 for k in range(self.surv.nstrat): 1357 if xk[k].shape[1] < mxc: 1358 xk1 = np.zeros((xk[k].shape[0], mxc)) 1359 pk1 = np.zeros((pk[k].shape[0], mxc)) 1360 xk1[:, 0:xk[k].shape[1]] = xk[k] 1361 pk1[:, 0:pk[k].shape[1]] = pk[k] 1362 xk[k], pk[k] = xk1, pk1 1363 1364 # Put the support points and probabilities into single matrices 1365 xka = np.nan * np.ones((len(self.endog), mxc)) 1366 pka = np.ones((len(self.endog), mxc), dtype=np.float64) / mxc 1367 for stx in range(self.surv.nstrat): 1368 ix = self.surv.stratum_rows[stx] 1369 xka[ix, :] = xk[stx] 1370 pka[ix, :] = pk[stx] 1371 1372 dist = rv_discrete_float(xka, pka) 1373 1374 return dist 1375 1376 1377class PHRegResults(base.LikelihoodModelResults): 1378 ''' 1379 Class to contain results of fitting a Cox proportional hazards 1380 survival model. 1381 1382 PHregResults inherits from statsmodels.LikelihoodModelResults 1383 1384 Parameters 1385 ---------- 1386 See statsmodels.LikelihoodModelResults 1387 1388 Attributes 1389 ---------- 1390 model : class instance 1391 PHreg model instance that called fit. 1392 normalized_cov_params : ndarray 1393 The sampling covariance matrix of the estimates 1394 params : ndarray 1395 The coefficients of the fitted model. Each coefficient is the 1396 log hazard ratio corresponding to a 1 unit difference in a 1397 single covariate while holding the other covariates fixed. 1398 bse : ndarray 1399 The standard errors of the fitted parameters. 1400 1401 See Also 1402 -------- 1403 statsmodels.LikelihoodModelResults 1404 ''' 1405 1406 def __init__(self, model, params, cov_params, scale=1., covariance_type="naive"): 1407 1408 # There is no scale parameter, but we need it for 1409 # meta-procedures that work with results. 1410 1411 self.covariance_type = covariance_type 1412 self.df_resid = model.df_resid 1413 self.df_model = model.df_model 1414 1415 super(PHRegResults, self).__init__(model, params, scale=1., 1416 normalized_cov_params=cov_params) 1417 1418 @cache_readonly 1419 def standard_errors(self): 1420 """ 1421 Returns the standard errors of the parameter estimates. 1422 """ 1423 return np.sqrt(np.diag(self.cov_params())) 1424 1425 @cache_readonly 1426 def bse(self): 1427 """ 1428 Returns the standard errors of the parameter estimates. 1429 """ 1430 return self.standard_errors 1431 1432 def get_distribution(self): 1433 """ 1434 Returns a scipy distribution object corresponding to the 1435 distribution of uncensored endog (duration) values for each 1436 case. 1437 1438 Returns 1439 ------- 1440 A list of objects of type scipy.stats.distributions.rv_discrete 1441 1442 Notes 1443 ----- 1444 The distributions are obtained from a simple discrete estimate 1445 of the survivor function that puts all mass on the observed 1446 failure times within a stratum. 1447 """ 1448 1449 return self.model.get_distribution(self.params) 1450 1451 @Appender(_predict_docstring % {'params_doc': '', 'cov_params_doc': ''}) 1452 def predict(self, endog=None, exog=None, strata=None, 1453 offset=None, transform=True, pred_type="lhr"): 1454 return super(PHRegResults, self).predict(exog=exog, 1455 transform=transform, 1456 cov_params=self.cov_params(), 1457 endog=endog, 1458 strata=strata, 1459 offset=offset, 1460 pred_type=pred_type) 1461 1462 def _group_stats(self, groups): 1463 """ 1464 Descriptive statistics of the groups. 1465 """ 1466 gsizes = np.unique(groups, return_counts=True) 1467 gsizes = gsizes[1] 1468 return gsizes.min(), gsizes.max(), gsizes.mean(), len(gsizes) 1469 1470 @cache_readonly 1471 def weighted_covariate_averages(self): 1472 """ 1473 The average covariate values within the at-risk set at each 1474 event time point, weighted by hazard. 1475 """ 1476 return self.model.weighted_covariate_averages(self.params) 1477 1478 @cache_readonly 1479 def score_residuals(self): 1480 """ 1481 A matrix containing the score residuals. 1482 """ 1483 return self.model.score_residuals(self.params) 1484 1485 @cache_readonly 1486 def baseline_cumulative_hazard(self): 1487 """ 1488 A list (corresponding to the strata) containing the baseline 1489 cumulative hazard function evaluated at the event points. 1490 """ 1491 return self.model.baseline_cumulative_hazard(self.params) 1492 1493 @cache_readonly 1494 def baseline_cumulative_hazard_function(self): 1495 """ 1496 A list (corresponding to the strata) containing function 1497 objects that calculate the cumulative hazard function. 1498 """ 1499 return self.model.baseline_cumulative_hazard_function(self.params) 1500 1501 @cache_readonly 1502 def schoenfeld_residuals(self): 1503 """ 1504 A matrix containing the Schoenfeld residuals. 1505 1506 Notes 1507 ----- 1508 Schoenfeld residuals for censored observations are set to zero. 1509 """ 1510 1511 surv = self.model.surv 1512 w_avg = self.weighted_covariate_averages 1513 1514 # Initialize at NaN since rows that belong to strata with no 1515 # events have undefined residuals. 1516 sch_resid = np.nan*np.ones(self.model.exog.shape, dtype=np.float64) 1517 1518 # Loop over strata 1519 for stx in range(surv.nstrat): 1520 1521 uft = surv.ufailt[stx] 1522 exog_s = surv.exog_s[stx] 1523 time_s = surv.time_s[stx] 1524 strat_ix = surv.stratum_rows[stx] 1525 1526 ii = np.searchsorted(uft, time_s) 1527 1528 # These subjects are censored after the last event in 1529 # their stratum, so have empty risk sets and undefined 1530 # residuals. 1531 jj = np.flatnonzero(ii < len(uft)) 1532 1533 sch_resid[strat_ix[jj], :] = exog_s[jj, :] - w_avg[stx][ii[jj], :] 1534 1535 jj = np.flatnonzero(self.model.status == 0) 1536 sch_resid[jj, :] = np.nan 1537 1538 return sch_resid 1539 1540 @cache_readonly 1541 def martingale_residuals(self): 1542 """ 1543 The martingale residuals. 1544 """ 1545 1546 surv = self.model.surv 1547 1548 # Initialize at NaN since rows that belong to strata with no 1549 # events have undefined residuals. 1550 mart_resid = np.nan*np.ones(len(self.model.endog), dtype=np.float64) 1551 1552 cumhaz_f_list = self.baseline_cumulative_hazard_function 1553 1554 # Loop over strata 1555 for stx in range(surv.nstrat): 1556 1557 cumhaz_f = cumhaz_f_list[stx] 1558 1559 exog_s = surv.exog_s[stx] 1560 time_s = surv.time_s[stx] 1561 1562 linpred = np.dot(exog_s, self.params) 1563 if surv.offset_s is not None: 1564 linpred += surv.offset_s[stx] 1565 e_linpred = np.exp(linpred) 1566 1567 ii = surv.stratum_rows[stx] 1568 chaz = cumhaz_f(time_s) 1569 mart_resid[ii] = self.model.status[ii] - e_linpred * chaz 1570 1571 return mart_resid 1572 1573 def summary(self, yname=None, xname=None, title=None, alpha=.05): 1574 """ 1575 Summarize the proportional hazards regression results. 1576 1577 Parameters 1578 ---------- 1579 yname : str, optional 1580 Default is `y` 1581 xname : list[str], optional 1582 Names for the exogenous variables, default is `x#` for ## in p the 1583 number of regressors. Must match the number of parameters in 1584 the model 1585 title : str, optional 1586 Title for the top table. If not None, then this replaces 1587 the default title 1588 alpha : float 1589 significance level for the confidence intervals 1590 1591 Returns 1592 ------- 1593 smry : Summary instance 1594 this holds the summary tables and text, which can be 1595 printed or converted to various output formats. 1596 1597 See Also 1598 -------- 1599 statsmodels.iolib.summary2.Summary : class to hold summary results 1600 """ 1601 1602 from statsmodels.iolib import summary2 1603 smry = summary2.Summary() 1604 float_format = "%8.3f" 1605 1606 info = {} 1607 info["Model:"] = "PH Reg" 1608 if yname is None: 1609 yname = self.model.endog_names 1610 info["Dependent variable:"] = yname 1611 info["Ties:"] = self.model.ties.capitalize() 1612 info["Sample size:"] = str(self.model.surv.n_obs) 1613 info["Num. events:"] = str(int(sum(self.model.status))) 1614 1615 if self.model.groups is not None: 1616 mn, mx, avg, num = self._group_stats(self.model.groups) 1617 info["Num groups:"] = "%.0f" % num 1618 info["Min group size:"] = "%.0f" % mn 1619 info["Max group size:"] = "%.0f" % mx 1620 info["Avg group size:"] = "%.1f" % avg 1621 1622 if self.model.strata is not None: 1623 mn, mx, avg, num = self._group_stats(self.model.strata) 1624 info["Num strata:"] = "%.0f" % num 1625 info["Min stratum size:"] = "%.0f" % mn 1626 info["Max stratum size:"] = "%.0f" % mx 1627 info["Avg stratum size:"] = "%.1f" % avg 1628 1629 smry.add_dict(info, align='l', float_format=float_format) 1630 1631 param = summary2.summary_params(self, alpha=alpha) 1632 param = param.rename(columns={"Coef.": "log HR", 1633 "Std.Err.": "log HR SE"}) 1634 param.insert(2, "HR", np.exp(param["log HR"])) 1635 a = "[%.3f" % (alpha / 2) 1636 param.loc[:, a] = np.exp(param.loc[:, a]) 1637 a = "%.3f]" % (1 - alpha / 2) 1638 param.loc[:, a] = np.exp(param.loc[:, a]) 1639 if xname is not None: 1640 param.index = xname 1641 smry.add_df(param, float_format=float_format) 1642 smry.add_title(title=title, results=self) 1643 smry.add_text("Confidence intervals are for the hazard ratios") 1644 1645 dstrat = self.model.surv.nstrat_orig - self.model.surv.nstrat 1646 if dstrat > 0: 1647 if dstrat == 1: 1648 smry.add_text("1 stratum dropped for having no events") 1649 else: 1650 smry.add_text("%d strata dropped for having no events" % dstrat) 1651 1652 if self.model.entry is not None: 1653 n_entry = sum(self.model.entry != 0) 1654 if n_entry == 1: 1655 smry.add_text("1 observation has a positive entry time") 1656 else: 1657 smry.add_text("%d observations have positive entry times" % n_entry) 1658 1659 if self.model.groups is not None: 1660 smry.add_text("Standard errors account for dependence within groups") 1661 1662 if hasattr(self, "regularized"): 1663 smry.add_text("Standard errors do not account for the regularization") 1664 1665 return smry 1666 1667 1668class rv_discrete_float(object): 1669 """ 1670 A class representing a collection of discrete distributions. 1671 1672 Parameters 1673 ---------- 1674 xk : 2d array_like 1675 The support points, should be non-decreasing within each 1676 row. 1677 pk : 2d array_like 1678 The probabilities, should sum to one within each row. 1679 1680 Notes 1681 ----- 1682 Each row of `xk`, and the corresponding row of `pk` describe a 1683 discrete distribution. 1684 1685 `xk` and `pk` should both be two-dimensional ndarrays. Each row 1686 of `pk` should sum to 1. 1687 1688 This class is used as a substitute for scipy.distributions. 1689 rv_discrete, since that class does not allow non-integer support 1690 points, or vectorized operations. 1691 1692 Only a limited number of methods are implemented here compared to 1693 the other scipy distribution classes. 1694 """ 1695 1696 def __init__(self, xk, pk): 1697 1698 self.xk = xk 1699 self.pk = pk 1700 self.cpk = np.cumsum(self.pk, axis=1) 1701 1702 def rvs(self, n=None): 1703 """ 1704 Returns a random sample from the discrete distribution. 1705 1706 A vector is returned containing a single draw from each row of 1707 `xk`, using the probabilities of the corresponding row of `pk` 1708 1709 Parameters 1710 ---------- 1711 n : not used 1712 Present for signature compatibility 1713 """ 1714 1715 n = self.xk.shape[0] 1716 u = np.random.uniform(size=n) 1717 1718 ix = (self.cpk < u[:, None]).sum(1) 1719 ii = np.arange(n, dtype=np.int32) 1720 return self.xk[(ii,ix)] 1721 1722 def mean(self): 1723 """ 1724 Returns a vector containing the mean values of the discrete 1725 distributions. 1726 1727 A vector is returned containing the mean value of each row of 1728 `xk`, using the probabilities in the corresponding row of 1729 `pk`. 1730 """ 1731 1732 return (self.xk * self.pk).sum(1) 1733 1734 def var(self): 1735 """ 1736 Returns a vector containing the variances of the discrete 1737 distributions. 1738 1739 A vector is returned containing the variance for each row of 1740 `xk`, using the probabilities in the corresponding row of 1741 `pk`. 1742 """ 1743 1744 mn = self.mean() 1745 xkc = self.xk - mn[:, None] 1746 1747 return (self.pk * (self.xk - xkc)**2).sum(1) 1748 1749 def std(self): 1750 """ 1751 Returns a vector containing the standard deviations of the 1752 discrete distributions. 1753 1754 A vector is returned containing the standard deviation for 1755 each row of `xk`, using the probabilities in the corresponding 1756 row of `pk`. 1757 """ 1758 1759 return np.sqrt(self.var()) 1760