1""" 2Conditional logistic, Poisson, and multinomial logit regression 3""" 4 5import numpy as np 6import statsmodels.base.model as base 7import statsmodels.regression.linear_model as lm 8import statsmodels.base.wrapper as wrap 9from statsmodels.discrete.discrete_model import (MultinomialResults, 10 MultinomialResultsWrapper) 11import collections 12import warnings 13import itertools 14 15 16class _ConditionalModel(base.LikelihoodModel): 17 18 def __init__(self, endog, exog, missing='none', **kwargs): 19 20 if "groups" not in kwargs: 21 raise ValueError("'groups' is a required argument") 22 groups = kwargs["groups"] 23 24 if groups.size != endog.size: 25 msg = "'endog' and 'groups' should have the same dimensions" 26 raise ValueError(msg) 27 28 if exog.shape[0] != endog.size: 29 msg = "The leading dimension of 'exog' should equal the length of 'endog'" 30 raise ValueError(msg) 31 32 super(_ConditionalModel, self).__init__( 33 endog, exog, missing=missing, **kwargs) 34 35 if self.data.const_idx is not None: 36 msg = ("Conditional models should not have an intercept in the " + 37 "design matrix") 38 raise ValueError(msg) 39 40 exog = self.exog 41 self.k_params = exog.shape[1] 42 43 # Get the row indices for each group 44 row_ix = {} 45 for i, g in enumerate(groups): 46 if g not in row_ix: 47 row_ix[g] = [] 48 row_ix[g].append(i) 49 50 # Split the data into groups and remove groups with no variation 51 endog, exog = np.asarray(endog), np.asarray(exog) 52 offset = kwargs.get("offset") 53 self._endog_grp = [] 54 self._exog_grp = [] 55 self._groupsize = [] 56 if offset is not None: 57 offset = np.asarray(offset) 58 self._offset_grp = [] 59 self._offset = [] 60 self._sumy = [] 61 self.nobs = 0 62 drops = [0, 0] 63 for g, ix in row_ix.items(): 64 y = endog[ix].flat 65 if np.std(y) == 0: 66 drops[0] += 1 67 drops[1] += len(y) 68 continue 69 self.nobs += len(y) 70 self._endog_grp.append(y) 71 if offset is not None: 72 self._offset_grp.append(offset[ix]) 73 self._groupsize.append(len(y)) 74 self._exog_grp.append(exog[ix, :]) 75 self._sumy.append(np.sum(y)) 76 77 if drops[0] > 0: 78 msg = ("Dropped %d groups and %d observations for having " + 79 "no within-group variance") % tuple(drops) 80 warnings.warn(msg) 81 82 # This can be pre-computed 83 if offset is not None: 84 self._endofs = [] 85 for k, ofs in enumerate(self._offset_grp): 86 self._endofs.append(np.dot(self._endog_grp[k], ofs)) 87 88 # Number of groups 89 self._n_groups = len(self._endog_grp) 90 91 # These are the sufficient statistics 92 self._xy = [] 93 self._n1 = [] 94 for g in range(self._n_groups): 95 self._xy.append(np.dot(self._endog_grp[g], self._exog_grp[g])) 96 self._n1.append(np.sum(self._endog_grp[g])) 97 98 def hessian(self, params): 99 100 from statsmodels.tools.numdiff import approx_fprime 101 hess = approx_fprime(params, self.score) 102 hess = np.atleast_2d(hess) 103 return hess 104 105 def fit(self, 106 start_params=None, 107 method='BFGS', 108 maxiter=100, 109 full_output=True, 110 disp=False, 111 fargs=(), 112 callback=None, 113 retall=False, 114 skip_hessian=False, 115 **kwargs): 116 117 rslt = super(_ConditionalModel, self).fit( 118 start_params=start_params, 119 method=method, 120 maxiter=maxiter, 121 full_output=full_output, 122 disp=disp, 123 skip_hessian=skip_hessian) 124 125 crslt = ConditionalResults(self, rslt.params, rslt.cov_params(), 1) 126 crslt.method = method 127 crslt.nobs = self.nobs 128 crslt.n_groups = self._n_groups 129 crslt._group_stats = [ 130 "%d" % min(self._groupsize), 131 "%d" % max(self._groupsize), 132 "%.1f" % np.mean(self._groupsize) 133 ] 134 rslt = ConditionalResultsWrapper(crslt) 135 return rslt 136 137 def fit_regularized(self, 138 method="elastic_net", 139 alpha=0., 140 start_params=None, 141 refit=False, 142 **kwargs): 143 """ 144 Return a regularized fit to a linear regression model. 145 146 Parameters 147 ---------- 148 method : {'elastic_net'} 149 Only the `elastic_net` approach is currently implemented. 150 alpha : scalar or array_like 151 The penalty weight. If a scalar, the same penalty weight 152 applies to all variables in the model. If a vector, it 153 must have the same length as `params`, and contains a 154 penalty weight for each coefficient. 155 start_params : array_like 156 Starting values for `params`. 157 refit : bool 158 If True, the model is refit using only the variables that 159 have non-zero coefficients in the regularized fit. The 160 refitted model is not regularized. 161 **kwargs 162 Additional keyword argument that are used when fitting the model. 163 164 Returns 165 ------- 166 Results 167 A results instance. 168 """ 169 170 from statsmodels.base.elastic_net import fit_elasticnet 171 172 if method != "elastic_net": 173 raise ValueError("method for fit_regularied must be elastic_net") 174 175 defaults = {"maxiter": 50, "L1_wt": 1, "cnvrg_tol": 1e-10, 176 "zero_tol": 1e-10} 177 defaults.update(kwargs) 178 179 return fit_elasticnet(self, method=method, 180 alpha=alpha, 181 start_params=start_params, 182 refit=refit, 183 **defaults) 184 185 # Override to allow groups to be passed as a variable name. 186 @classmethod 187 def from_formula(cls, 188 formula, 189 data, 190 subset=None, 191 drop_cols=None, 192 *args, 193 **kwargs): 194 195 try: 196 groups = kwargs["groups"] 197 del kwargs["groups"] 198 except KeyError: 199 raise ValueError("'groups' is a required argument") 200 201 if isinstance(groups, str): 202 groups = data[groups] 203 204 if "0+" not in formula.replace(" ", ""): 205 warnings.warn("Conditional models should not include an intercept") 206 207 model = super(_ConditionalModel, cls).from_formula( 208 formula, data=data, groups=groups, *args, **kwargs) 209 210 return model 211 212 213class ConditionalLogit(_ConditionalModel): 214 """ 215 Fit a conditional logistic regression model to grouped data. 216 217 Every group is implicitly given an intercept, but the model is fit using 218 a conditional likelihood in which the intercepts are not present. Thus, 219 intercept estimates are not given, but the other parameter estimates can 220 be interpreted as being adjusted for any group-level confounders. 221 222 Parameters 223 ---------- 224 endog : array_like 225 The response variable, must contain only 0 and 1. 226 exog : array_like 227 The array of covariates. Do not include an intercept 228 in this array. 229 groups : array_like 230 Codes defining the groups. This is a required keyword parameter. 231 """ 232 233 def __init__(self, endog, exog, missing='none', **kwargs): 234 235 super(ConditionalLogit, self).__init__( 236 endog, exog, missing=missing, **kwargs) 237 238 if np.any(np.unique(self.endog) != np.r_[0, 1]): 239 msg = "endog must be coded as 0, 1" 240 raise ValueError(msg) 241 242 self.K = self.exog.shape[1] 243 # i.e. self.k_params, for compatibility with MNLogit 244 245 def loglike(self, params): 246 247 ll = 0 248 for g in range(len(self._endog_grp)): 249 ll += self.loglike_grp(g, params) 250 251 return ll 252 253 def score(self, params): 254 255 score = 0 256 for g in range(self._n_groups): 257 score += self.score_grp(g, params) 258 259 return score 260 261 def _denom(self, grp, params, ofs=None): 262 263 if ofs is None: 264 ofs = 0 265 266 exb = np.exp(np.dot(self._exog_grp[grp], params) + ofs) 267 268 # In the recursions, f may be called multiple times with the 269 # same arguments, so we memoize the results. 270 memo = {} 271 272 def f(t, k): 273 if t < k: 274 return 0 275 if k == 0: 276 return 1 277 278 try: 279 return memo[(t, k)] 280 except KeyError: 281 pass 282 283 v = f(t - 1, k) + f(t - 1, k - 1) * exb[t - 1] 284 memo[(t, k)] = v 285 286 return v 287 288 return f(self._groupsize[grp], self._n1[grp]) 289 290 def _denom_grad(self, grp, params, ofs=None): 291 292 if ofs is None: 293 ofs = 0 294 295 ex = self._exog_grp[grp] 296 exb = np.exp(np.dot(ex, params) + ofs) 297 298 # s may be called multiple times in the recursions with the 299 # same arguments, so memoize the results. 300 memo = {} 301 302 def s(t, k): 303 304 if t < k: 305 return 0, np.zeros(self.k_params) 306 if k == 0: 307 return 1, 0 308 309 try: 310 return memo[(t, k)] 311 except KeyError: 312 pass 313 314 h = exb[t - 1] 315 a, b = s(t - 1, k) 316 c, e = s(t - 1, k - 1) 317 d = c * h * ex[t - 1, :] 318 319 u, v = a + c * h, b + d + e * h 320 memo[(t, k)] = (u, v) 321 322 return u, v 323 324 return s(self._groupsize[grp], self._n1[grp]) 325 326 def loglike_grp(self, grp, params): 327 328 ofs = None 329 if hasattr(self, 'offset'): 330 ofs = self._offset_grp[grp] 331 332 llg = np.dot(self._xy[grp], params) 333 334 if ofs is not None: 335 llg += self._endofs[grp] 336 337 llg -= np.log(self._denom(grp, params, ofs)) 338 339 return llg 340 341 def score_grp(self, grp, params): 342 343 ofs = 0 344 if hasattr(self, 'offset'): 345 ofs = self._offset_grp[grp] 346 347 d, h = self._denom_grad(grp, params, ofs) 348 return self._xy[grp] - h / d 349 350 351class ConditionalPoisson(_ConditionalModel): 352 """ 353 Fit a conditional Poisson regression model to grouped data. 354 355 Every group is implicitly given an intercept, but the model is fit using 356 a conditional likelihood in which the intercepts are not present. Thus, 357 intercept estimates are not given, but the other parameter estimates can 358 be interpreted as being adjusted for any group-level confounders. 359 360 Parameters 361 ---------- 362 endog : array_like 363 The response variable 364 exog : array_like 365 The covariates 366 groups : array_like 367 Codes defining the groups. This is a required keyword parameter. 368 """ 369 370 def loglike(self, params): 371 372 ofs = None 373 if hasattr(self, 'offset'): 374 ofs = self._offset_grp 375 376 ll = 0.0 377 378 for i in range(len(self._endog_grp)): 379 380 xb = np.dot(self._exog_grp[i], params) 381 if ofs is not None: 382 xb += ofs[i] 383 exb = np.exp(xb) 384 y = self._endog_grp[i] 385 ll += np.dot(y, xb) 386 s = exb.sum() 387 ll -= self._sumy[i] * np.log(s) 388 389 return ll 390 391 def score(self, params): 392 393 ofs = None 394 if hasattr(self, 'offset'): 395 ofs = self._offset_grp 396 397 score = 0.0 398 399 for i in range(len(self._endog_grp)): 400 401 x = self._exog_grp[i] 402 xb = np.dot(x, params) 403 if ofs is not None: 404 xb += ofs[i] 405 exb = np.exp(xb) 406 s = exb.sum() 407 y = self._endog_grp[i] 408 score += np.dot(y, x) 409 score -= self._sumy[i] * np.dot(exb, x) / s 410 411 return score 412 413 414class ConditionalResults(base.LikelihoodModelResults): 415 def __init__(self, model, params, normalized_cov_params, scale): 416 417 super(ConditionalResults, self).__init__( 418 model, 419 params, 420 normalized_cov_params=normalized_cov_params, 421 scale=scale) 422 423 def summary(self, yname=None, xname=None, title=None, alpha=.05): 424 """ 425 Summarize the fitted model. 426 427 Parameters 428 ---------- 429 yname : str, optional 430 Default is `y` 431 xname : list[str], optional 432 Names for the exogenous variables, default is "var_xx". 433 Must match the number of parameters in the model 434 title : str, optional 435 Title for the top table. If not None, then this replaces the 436 default title 437 alpha : float 438 Significance level for the confidence intervals 439 440 Returns 441 ------- 442 smry : Summary instance 443 This holds the summary tables and text, which can be printed or 444 converted to various output formats. 445 446 See Also 447 -------- 448 statsmodels.iolib.summary.Summary : class to hold summary 449 results 450 """ 451 452 top_left = [ 453 ('Dep. Variable:', None), 454 ('Model:', None), 455 ('Log-Likelihood:', None), 456 ('Method:', [self.method]), 457 ('Date:', None), 458 ('Time:', None), 459 ] 460 461 top_right = [ 462 ('No. Observations:', None), 463 ('No. groups:', [self.n_groups]), 464 ('Min group size:', [self._group_stats[0]]), 465 ('Max group size:', [self._group_stats[1]]), 466 ('Mean group size:', [self._group_stats[2]]), 467 ] 468 469 if title is None: 470 title = "Conditional Logit Model Regression Results" 471 472 # create summary tables 473 from statsmodels.iolib.summary import Summary 474 smry = Summary() 475 smry.add_table_2cols( 476 self, 477 gleft=top_left, 478 gright=top_right, # [], 479 yname=yname, 480 xname=xname, 481 title=title) 482 smry.add_table_params( 483 self, yname=yname, xname=xname, alpha=alpha, use_t=self.use_t) 484 485 return smry 486 487class ConditionalMNLogit(_ConditionalModel): 488 """ 489 Fit a conditional multinomial logit model to grouped data. 490 491 Parameters 492 ---------- 493 endog : array_like 494 The dependent variable, must be integer-valued, coded 495 0, 1, ..., c-1, where c is the number of response 496 categories. 497 exog : array_like 498 The independent variables. 499 groups : array_like 500 Codes defining the groups. This is a required keyword parameter. 501 502 Notes 503 ----- 504 Equivalent to femlogit in Stata. 505 506 References 507 ---------- 508 Gary Chamberlain (1980). Analysis of covariance with qualitative 509 data. The Review of Economic Studies. Vol. 47, No. 1, pp. 225-238. 510 """ 511 512 def __init__(self, endog, exog, missing='none', **kwargs): 513 514 super(ConditionalMNLogit, self).__init__( 515 endog, exog, missing=missing, **kwargs) 516 517 # endog must be integers 518 self.endog = self.endog.astype(int) 519 520 self.k_cat = self.endog.max() + 1 521 self.df_model = (self.k_cat - 1) * self.exog.shape[1] 522 self.df_resid = self.nobs - self.df_model 523 self._ynames_map = {j: str(j) for j in range(self.k_cat)} 524 self.J = self.k_cat # Unfortunate name, needed for results 525 self.K = self.exog.shape[1] # for compatibility with MNLogit 526 527 if self.endog.min() < 0: 528 msg = "endog may not contain negative values" 529 raise ValueError(msg) 530 531 grx = collections.defaultdict(list) 532 for k, v in enumerate(self.groups): 533 grx[v].append(k) 534 self._group_labels = list(grx.keys()) 535 self._group_labels.sort() 536 self._grp_ix = [grx[k] for k in self._group_labels] 537 538 def fit(self, 539 start_params=None, 540 method='BFGS', 541 maxiter=100, 542 full_output=True, 543 disp=False, 544 fargs=(), 545 callback=None, 546 retall=False, 547 skip_hessian=False, 548 **kwargs): 549 550 if start_params is None: 551 q = self.exog.shape[1] 552 c = self.k_cat - 1 553 start_params = np.random.normal(size=q * c) 554 555 # Do not call super(...).fit because it cannot handle the 2d-params. 556 rslt = base.LikelihoodModel.fit( 557 self, 558 start_params=start_params, 559 method=method, 560 maxiter=maxiter, 561 full_output=full_output, 562 disp=disp, 563 skip_hessian=skip_hessian) 564 565 rslt.params = rslt.params.reshape((self.exog.shape[1], -1)) 566 rslt = MultinomialResults(self, rslt) 567 568 # Not clear what the null likelihood should be, there is no intercept 569 # so the null model is not clearly defined. This is needed for summary 570 # to work. 571 rslt.set_null_options(llnull=np.nan) 572 573 return MultinomialResultsWrapper(rslt) 574 575 def loglike(self, params): 576 577 q = self.exog.shape[1] 578 c = self.k_cat - 1 579 580 pmat = params.reshape((q, c)) 581 pmat = np.concatenate((np.zeros((q, 1)), pmat), axis=1) 582 lpr = np.dot(self.exog, pmat) 583 584 ll = 0.0 585 for ii in self._grp_ix: 586 x = lpr[ii, :] 587 jj = np.arange(x.shape[0], dtype=int) 588 y = self.endog[ii] 589 denom = 0.0 590 for p in itertools.permutations(y): 591 denom += np.exp(x[(jj, p)].sum()) 592 ll += x[(jj, y)].sum() - np.log(denom) 593 594 return ll 595 596 597 def score(self, params): 598 599 q = self.exog.shape[1] 600 c = self.k_cat - 1 601 602 pmat = params.reshape((q, c)) 603 pmat = np.concatenate((np.zeros((q, 1)), pmat), axis=1) 604 lpr = np.dot(self.exog, pmat) 605 606 grad = np.zeros((q, c)) 607 for ii in self._grp_ix: 608 x = lpr[ii, :] 609 jj = np.arange(x.shape[0], dtype=int) 610 y = self.endog[ii] 611 denom = 0.0 612 denomg = np.zeros((q, c)) 613 for p in itertools.permutations(y): 614 v = np.exp(x[(jj, p)].sum()) 615 denom += v 616 for i, r in enumerate(p): 617 if r != 0: 618 denomg[:, r - 1] += v * self.exog[ii[i], :] 619 620 for i, r in enumerate(y): 621 if r != 0: 622 grad[:, r - 1] += self.exog[ii[i], :] 623 624 grad -= denomg / denom 625 626 return grad.flatten() 627 628 629 630class ConditionalResultsWrapper(lm.RegressionResultsWrapper): 631 pass 632 633 634wrap.populate_wrapper(ConditionalResultsWrapper, ConditionalResults) 635