1import numpy as np
2from statsmodels.duration.hazard_regression import PHReg
3
4
5def _kernel_cumincidence(time, status, exog, kfunc, freq_weights,
6                         dimred=True):
7    """
8    Calculates cumulative incidence functions using kernels.
9
10    Parameters
11    ----------
12    time : array_like
13        The observed time values
14    status : array_like
15        The status values.  status == 0 indicates censoring,
16        status == 1, 2, ... are the events.
17    exog : array_like
18        Covariates such that censoring becomes independent of
19        outcome times conditioned on the covariate values.
20    kfunc : function
21        A kernel function
22    freq_weights : array_like
23        Optional frequency weights
24    dimred : bool
25        If True, proportional hazards regression models are used to
26        reduce exog to two columns by predicting overall events and
27        censoring in two separate models.  If False, exog is used
28        directly for calculating kernel weights without dimension
29        reduction.
30    """
31
32    # Reorder so time is ascending
33    ii = np.argsort(time)
34    time = time[ii]
35    status = status[ii]
36    exog = exog[ii, :]
37    nobs = len(time)
38
39    # Convert the unique times to ranks (0, 1, 2, ...)
40    utime, rtime = np.unique(time, return_inverse=True)
41
42    # Last index where each unique time occurs.
43    ie = np.searchsorted(time, utime, side='right') - 1
44
45    ngrp = int(status.max())
46
47    # All-cause status
48    statusa = (status >= 1).astype(np.float64)
49
50    if freq_weights is not None:
51        freq_weights = freq_weights / freq_weights.sum()
52
53    ip = []
54    sp = [None] * nobs
55    n_risk = [None] * nobs
56    kd = [None] * nobs
57    for k in range(ngrp):
58        status0 = (status == k + 1).astype(np.float64)
59
60        # Dimension reduction step
61        if dimred:
62            sfe = PHReg(time, exog, status0).fit()
63            fitval_e = sfe.predict().predicted_values
64            sfc = PHReg(time, exog, 1 - status0).fit()
65            fitval_c = sfc.predict().predicted_values
66            exog2d = np.hstack((fitval_e[:, None], fitval_c[:, None]))
67            exog2d -= exog2d.mean(0)
68            exog2d /= exog2d.std(0)
69        else:
70            exog2d = exog
71
72        ip0 = 0
73        for i in range(nobs):
74
75            if k == 0:
76                kd1 = exog2d - exog2d[i, :]
77                kd1 = kfunc(kd1)
78                kd[i] = kd1
79
80            # Get the local all-causes survival function
81            if k == 0:
82                denom = np.cumsum(kd[i][::-1])[::-1]
83                num = kd[i] * statusa
84                rat = num / denom
85                tr = 1e-15
86                ii = np.flatnonzero((denom < tr) & (num < tr))
87                rat[ii] = 0
88                ratc = 1 - rat
89                ratc = np.clip(ratc, 1e-10, np.inf)
90                lrat = np.log(ratc)
91                prat = np.cumsum(lrat)[ie]
92                sf = np.exp(prat)
93                sp[i] = np.r_[1, sf[:-1]]
94                n_risk[i] = denom[ie]
95
96            # Number of cause-specific deaths at each unique time.
97            d0 = np.bincount(rtime, weights=status0*kd[i],
98                             minlength=len(utime))
99
100            # The cumulative incidence function probabilities.  Carry
101            # forward once the effective sample size drops below 1.
102            ip1 = np.cumsum(sp[i] * d0 / n_risk[i])
103            jj = len(ip1) - np.searchsorted(n_risk[i][::-1], 1)
104            if jj < len(ip1):
105                ip1[jj:] = ip1[jj - 1]
106            if freq_weights is None:
107                ip0 += ip1
108            else:
109                ip0 += freq_weights[i] * ip1
110
111        if freq_weights is None:
112            ip0 /= nobs
113
114        ip.append(ip0)
115
116    return utime, ip
117
118
119def _kernel_survfunc(time, status, exog, kfunc, freq_weights):
120    """
121    Estimate the marginal survival function under dependent censoring.
122
123    Parameters
124    ----------
125    time : array_like
126        The observed times for each subject
127    status : array_like
128        The status for each subject (1 indicates event, 0 indicates
129        censoring)
130    exog : array_like
131        Covariates such that censoring is independent conditional on
132        exog
133    kfunc : function
134        Kernel function
135    freq_weights : array_like
136        Optional frequency weights
137
138    Returns
139    -------
140    probs : array_like
141        The estimated survival probabilities
142    times : array_like
143        The times at which the survival probabilities are estimated
144
145    References
146    ----------
147    Zeng, Donglin 2004. Estimating Marginal Survival Function by
148    Adjusting for Dependent Censoring Using Many Covariates. The
149    Annals of Statistics 32 (4): 1533 55.
150    doi:10.1214/009053604000000508.
151    https://arxiv.org/pdf/math/0409180.pdf
152    """
153
154    # Dimension reduction step
155    sfe = PHReg(time, exog, status).fit()
156    fitval_e = sfe.predict().predicted_values
157    sfc = PHReg(time, exog, 1 - status).fit()
158    fitval_c = sfc.predict().predicted_values
159    exog2d = np.hstack((fitval_e[:, None], fitval_c[:, None]))
160
161    n = len(time)
162    ixd = np.flatnonzero(status == 1)
163
164    # For consistency with standard KM, only compute the survival
165    # function at the times of observed events.
166    utime = np.unique(time[ixd])
167
168    # Reorder everything so time is ascending
169    ii = np.argsort(time)
170    time = time[ii]
171    status = status[ii]
172    exog2d = exog2d[ii, :]
173
174    # Last index where each evaluation time occurs.
175    ie = np.searchsorted(time, utime, side='right') - 1
176
177    if freq_weights is not None:
178        freq_weights = freq_weights / freq_weights.sum()
179
180    sprob = 0.
181    for i in range(n):
182
183        kd = exog2d - exog2d[i, :]
184        kd = kfunc(kd)
185
186        denom = np.cumsum(kd[::-1])[::-1]
187        num = kd * status
188        rat = num / denom
189        tr = 1e-15
190        ii = np.flatnonzero((denom < tr) & (num < tr))
191        rat[ii] = 0
192        ratc = 1 - rat
193        ratc = np.clip(ratc, 1e-12, np.inf)
194        lrat = np.log(ratc)
195        prat = np.cumsum(lrat)[ie]
196        prat = np.exp(prat)
197
198        if freq_weights is None:
199            sprob += prat
200        else:
201            sprob += prat * freq_weights[i]
202
203    if freq_weights is None:
204        sprob /= n
205
206    return sprob, utime
207