from statsmodels.compat.python import lzip

from collections import defaultdict
import datetime
from functools import reduce
import re
import textwrap

import numpy as np
import pandas as pd

from .table import SimpleTable
from .tableformatting import fmt_latex, fmt_txt


class Summary:
    """Container of tables, notes and a title rendered as text/HTML/LaTeX.

    Tables are stored as DataFrames in ``self.tables`` with a parallel list
    of per-table display options in ``self.settings``.
    """

    def __init__(self):
        self.tables = []
        self.settings = []
        self.extra_txt = []
        self.title = None
        # Set by summary_col so that as_latex fuses the tabulars into one.
        self._merge_latex = False

    def __str__(self):
        return self.as_text()

    def __repr__(self):
        return str(type(self)) + '\n"""\n' + self.__str__() + '\n"""'

    def _repr_html_(self):
        """Display as HTML in IPython notebook."""
        return self.as_html()

    def add_df(self, df, index=True, header=True, float_format='%.4f',
               align='r'):
        """
        Add the contents of a DataFrame to summary table

        Parameters
        ----------
        df : DataFrame
        header : bool
            Reproduce the DataFrame column labels in summary table
        index : bool
            Reproduce the DataFrame row labels in summary table
        float_format : str
            Formatting to float data columns
        align : str
            Data alignment (l/c/r)
        """

        settings = {'index': index, 'header': header,
                    'float_format': float_format, 'align': align}
        self.tables.append(df)
        self.settings.append(settings)

    def add_array(self, array, align='r', float_format="%.4f"):
        """Add the contents of a Numpy array to summary table

        Parameters
        ----------
        array : numpy array (2D)
        float_format : str
            Formatting to array if type is float
        align : str
            Data alignment (l/c/r)
        """

        table = pd.DataFrame(array)
        self.add_df(table, index=False, header=False,
                    float_format=float_format, align=align)

    def add_dict(self, d, ncols=2, align='l', float_format="%.4f"):
        """Add the contents of a Dict to summary table

        Parameters
        ----------
        d : dict
            Keys and values are automatically coerced to strings with str().
            Users are encouraged to format them before using add_dict.
        ncols : int
            Number of columns of the output table
        align : str
            Data alignment (l/c/r)
        float_format : str
            Formatting to float data columns
        """

        keys = [_formatter(x, float_format) for x in d.keys()]
        vals = [_formatter(x, float_format) for x in d.values()]
        data = np.array(lzip(keys, vals))

        # Pad with empty (key, value) rows so the rows split evenly.
        remainder = data.shape[0] % ncols
        if remainder != 0:
            pad = ncols - remainder
            data = np.vstack([data, np.array(pad * [['', '']])])

        # Lay the ncols chunks of (key, value) pairs side by side.
        data = np.split(data, ncols)
        data = reduce(lambda x, y: np.hstack([x, y]), data)
        self.add_array(data, align=align)

    def add_text(self, string):
        """Append a note to the bottom of the summary table. In ASCII tables,
        the note will be wrapped to table width. Notes are not indented.
        """
        self.extra_txt.append(string)

    def add_title(self, title=None, results=None):
        """Insert a title on top of the summary table. If a string is provided
        in the title argument, that string is printed. If no title string is
        provided but a results instance is provided, statsmodels attempts
        to construct a useful title automatically.
        """
        if isinstance(title, str):
            self.title = title
        elif results is not None:
            model = results.model.__class__.__name__
            # Use the long model description when one is known.
            model = _model_types.get(model, model)
            self.title = 'Results: ' + model
        else:
            self.title = ''

    def add_base(self, results, alpha=0.05, float_format="%.4f", title=None,
                 xname=None, yname=None):
        """Try to construct a basic summary instance.

        Parameters
        ----------
        results : Model results instance
        alpha : float
            significance level for the confidence intervals (optional)
        float_format: str
            Float formatting for summary of parameters (optional)
        title : str
            Title of the summary table (optional)
        xname : list[str] of length equal to the number of parameters
            Names of the independent variables (optional)
        yname : str
            Name of the dependent variable (optional)
        """

        param = summary_params(results, alpha=alpha, use_t=results.use_t)
        info = summary_model(results)
        if xname is not None:
            param.index = xname
        if yname is not None:
            info['Dependent Variable:'] = yname
        self.add_dict(info, align='l')
        self.add_df(param, float_format=float_format)
        self.add_title(title=title, results=results)

    def as_text(self):
        """Generate ASCII Summary Table
        """

        tables = self.tables
        settings = self.settings

        pad_col, pad_index, widest = _measure_tables(tables, settings)
        rule_equal = widest * '='

        simple_tables = _simple_tables(tables, settings, pad_col, pad_index)
        tab = '\n'.join(x.as_text() for x in simple_tables).split('\n')
        # Replace the first rule and close the table with '=' rules.
        tab[0] = rule_equal
        tab.append(rule_equal)
        tab = '\n'.join(tab)

        title = self.title
        if title is None:
            title = ''
        elif len(title) < widest:
            # Center the title over the table.
            title = ' ' * ((widest - len(title)) // 2) + title

        txt = '\n'.join('\n'.join(textwrap.wrap(x, widest))
                        for x in self.extra_txt)

        return '\n'.join([title, tab, txt])

    def as_html(self):
        """Generate HTML Summary Table
        """

        simple_tables = _simple_tables(self.tables, self.settings)
        return '\n'.join(x.as_html() for x in simple_tables)

    def as_latex(self, label=''):
        """Generate LaTeX Summary Table

        Parameters
        ----------
        label : str
            Label of the summary table that can be referenced
            in a latex document (optional)
        """
        title = self.title
        if title is not None:
            title = '\\caption{' + title + '}'
        else:
            title = '\\caption{}'

        label = '\\label{' + label + '}'

        simple_tables = _simple_tables(self.tables, self.settings)
        tab = '\n\n'.join(x.as_latex_tabular() for x in simple_tables)

        to_replace = ('\\\\hline\\n\\\\hline\\n\\\\'
                      'end{tabular}\\n\\\\begin{tabular}{.*}\\n')

        if self._merge_latex:
            # create single tabular object for summary_col
            tab = re.sub(to_replace, r'\\midrule\n', tab)

        out = '\\begin{table}', title, label, tab, '\\end{table}'
        return '\n'.join(out)


def _measure_tables(tables, settings):
    """Compare width of ascii tables in a list and calculate padding values.
    We add space to each col_sep to get us as close as possible to the
    width of the largest table. Then, we add a few spaces to the first
    column to pad the rest.

    Returns
    -------
    pad_sep : list[int]
        Extra spaces to add to the column separator of each table.
    pad_index : list[int]
        Extra spaces to add to the first column of each table.
    len_max : int
        Width of the widest table.
    """

    simple_tables = _simple_tables(tables, settings)
    widths = [len(x.as_text().splitlines()[0]) for x in simple_tables]
    len_max = max(widths)

    pad_sep = []
    pad_index = []
    for table, width in zip(tables, widths):
        nsep = max(table.shape[1] - 1, 1)
        pad = (len_max - width) // nsep
        pad_sep.append(pad)
        # Whatever the separators cannot absorb goes to the first column.
        pad_index.append(len_max - (width + nsep * pad))

    return pad_sep, pad_index, len_max


# Useful stuff  # TODO: be more specific
_model_types = {'OLS': 'Ordinary least squares',
                'GLS': 'Generalized least squares',
                'GLSAR': 'Generalized least squares with AR(p)',
                'WLS': 'Weighted least squares',
                'RLM': 'Robust linear model',
                'NBin': 'Negative binomial model',
                'GLM': 'Generalized linear model'
                }


def summary_model(results):
    """
    Create a dict with information about the model

    Each entry is computed on a best-effort basis: entries whose attribute
    is missing from ``results`` are left out of the returned dict.
    """

    def time_now(*args, **kwds):
        now = datetime.datetime.now()
        return now.strftime('%Y-%m-%d %H:%M')

    info = {}
    info['Model:'] = lambda x: x.model.__class__.__name__
    info['Model Family:'] = lambda x: x.family.__class__.__name__
    info['Link Function:'] = lambda x: x.family.link.__class__.__name__
    info['Dependent Variable:'] = lambda x: x.model.endog_names
    info['Date:'] = time_now
    info['No. Observations:'] = lambda x: "%#6d" % x.nobs
    info['Df Model:'] = lambda x: "%#6d" % x.df_model
    info['Df Residuals:'] = lambda x: "%#6d" % x.df_resid
    info['Converged:'] = lambda x: x.mle_retvals['converged']
    info['No. Iterations:'] = lambda x: x.mle_retvals['iterations']
    info['Method:'] = lambda x: x.method
    info['Norm:'] = lambda x: x.fit_options['norm']
    info['Scale Est.:'] = lambda x: x.fit_options['scale_est']
    info['Cov. Type:'] = lambda x: x.fit_options['cov']

    # Results without a k_constant attribute are labelled like the
    # centered case instead of raising.
    k_constant = getattr(results, 'k_constant', True)
    rsquared_type = '' if k_constant else ' (uncentered)'
    info['R-squared' + rsquared_type + ':'] = lambda x: "%#8.3f" % x.rsquared
    info['Adj. R-squared' + rsquared_type + ':'] = lambda x: "%#8.3f" % x.rsquared_adj  # noqa:E501
    info['Pseudo R-squared:'] = lambda x: "%#8.3f" % x.prsquared
    info['AIC:'] = lambda x: "%8.4f" % x.aic
    info['BIC:'] = lambda x: "%8.4f" % x.bic
    info['Log-Likelihood:'] = lambda x: "%#8.5g" % x.llf
    info['LL-Null:'] = lambda x: "%#8.5g" % x.llnull
    info['LLR p-value:'] = lambda x: "%#8.5g" % x.llr_pvalue
    info['Deviance:'] = lambda x: "%#8.5g" % x.deviance
    info['Pearson chi2:'] = lambda x: "%#6.3g" % x.pearson_chi2
    info['F-statistic:'] = lambda x: "%#8.4g" % x.fvalue
    info['Prob (F-statistic):'] = lambda x: "%#6.3g" % x.f_pvalue
    info['Scale:'] = lambda x: "%#8.5g" % x.scale

    out = {}
    for key, func in info.items():
        try:
            out[key] = func(results)
        except (AttributeError, KeyError, NotImplementedError):
            # NOTE: some models do not have loglike defined (RLM),
            # so raise NotImplementedError
            pass
    return out


def summary_params(results, yname=None, xname=None, alpha=.05, use_t=True,
                   skip_header=False, float_format="%.4f"):
    """create a summary table of parameters from results instance

    Parameters
    ----------
    results : results instance or tuple
        some required information is directly taken from the result
        instance. A tuple ``(results, params, bse, tvalues, pvalues,
        conf_int)`` is also accepted, in which case the statistics are
        taken from the tuple.
    yname : {str, None}
        optional name for the endogenous variable (not used here)
    xname : {list[str], None}
        optional names for the exogenous variables; by default the
        parameter names of the model are used
    alpha : float
        significance level for the confidence intervals
    use_t : bool
        indicator whether the p-values are based on the Student-t
        distribution (if True) or on the normal distribution (if False)
    skip_header : bool
        not used here
    float_format : str
        not used here; the returned values are not formatted

    Returns
    -------
    params_table : DataFrame
        One row per parameter with columns for the coefficient, standard
        error, test statistic, p-value and confidence bounds.
    """

    if isinstance(results, tuple):
        results, params, bse, tvalues, pvalues, conf_int = results
    else:
        params = results.params
        bse = results.bse
        tvalues = results.tvalues
        pvalues = results.pvalues
        conf_int = results.conf_int(alpha)

    data = np.array([params, bse, tvalues, pvalues]).T
    data = np.hstack([data, conf_int])
    data = pd.DataFrame(data)

    # Only the label of the test statistic depends on use_t.
    stat = 't' if use_t else 'z'
    data.columns = ['Coef.', 'Std.Err.', stat, 'P>|' + stat + '|',
                    '[' + str(alpha / 2), str(1 - alpha / 2) + ']']

    if not xname:
        try:
            data.index = results.model.data.param_names
        except AttributeError:
            data.index = results.model.exog_names
    else:
        data.index = xname

    return data


# Vertical summary instance for multiple models
def _col_params(result, float_format='%.4f', stars=True):
    """Stack coefficients and standard errors in single column
    """

    # Extract parameters
    res = summary_params(result)
    # Format float
    for col in res.columns[:2]:
        res[col] = res[col].apply(lambda x: float_format % x)
    # Std.Errors in parentheses
    res.iloc[:, 1] = '(' + res.iloc[:, 1] + ')'
    # Significance stars: one more star per threshold the p-value is below
    if stars:
        coef = res.columns[0]
        for threshold in (.1, .05, .01):
            idx = res.iloc[:, 3] < threshold
            res.loc[idx, coef] = res.loc[idx, coef] + '*'
    # Stack Coefs and Std.Errors
    res = res.iloc[:, :2]
    res = res.stack()

    rsquared = getattr(result, 'rsquared', np.nan)
    rsquared_adj = getattr(result, 'rsquared_adj', np.nan)
    r2 = pd.Series({('R-squared', ""): rsquared,
                    ('R-squared Adj.', ""): rsquared_adj})

    if r2.notnull().any():
        r2 = r2.apply(lambda x: float_format % x)
        res = pd.concat([res, r2], axis=0)
    res = pd.DataFrame(res)
    res.columns = [str(result.model.endog_names)]
    return res


def _col_info(result, info_dict=None):
    """Stack model info in a column
    """

    if info_dict is None:
        info_dict = {}
    out = []
    index = []
    for key, func in info_dict.items():
        if isinstance(func, dict):
            # this is a specific model info_dict, but not for this result...
            continue
        try:
            out.append(func(result))
        except AttributeError:
            out.append('')
        index.append(key)
    out = pd.DataFrame({str(result.model.endog_names): out}, index=index)
    return out


def _make_unique(list_of_names):
    """Return the names unchanged if unique, else suffix each with I, II, ...
    """
    if len(set(list_of_names)) == len(list_of_names):
        return list_of_names
    # pandas does not like it if multiple columns have the same names
    name_counter = defaultdict(str)
    header = []
    for _name in list_of_names:
        name_counter[_name] += "I"
        header.append(_name + " " + name_counter[_name])
    return header


def summary_col(results, float_format='%.4f', model_names=(), stars=False,
                info_dict=None, regressor_order=(), drop_omitted=False):
    """
    Summarize multiple results instances side-by-side (coefs and SEs)

    Parameters
    ----------
    results : statsmodels results instance or list of result instances
    float_format : str, optional
        float format for coefficients and standard errors
        Default : '%.4f'
    model_names : list[str], optional
        Must have same length as the number of results. If the names are not
        unique, a roman number will be appended to all model names
    stars : bool
        print significance stars
    info_dict : dict, default None
        dict of functions to be applied to results instances to retrieve
        model info. To use specific information for different models, add a
        (nested) info_dict with model name as the key.
        Example: `info_dict = {"N":lambda x:(x.nobs), "R2": ..., "OLS":{
        "R2":...}}` would only show `R2` for OLS regression models, but
        additionally `N` for all other results.
        Default : None (use the info_dict specified in
        result.default_model_infos, if this property exists)
    regressor_order : list[str], optional
        list of names of the regressors in the desired order. All regressors
        not specified will be appended to the end of the list.
    drop_omitted : bool, optional
        Whether to drop regressors that are not specified in
        regressor_order. If False, regressors not specified will be
        appended to end of the list. If True, only regressors in
        regressor_order will be included.

    Returns
    -------
    Summary
        Summary instance holding the merged table and the notes.
    """

    def merg(x, y):
        return x.merge(y, how='outer', right_index=True,
                       left_index=True)

    if not isinstance(results, list):
        results = [results]

    cols = [_col_params(x, stars=stars, float_format=float_format)
            for x in results]

    # Unique column names (pandas has problems merging otherwise)
    if model_names:
        colnames = _make_unique(model_names)
    else:
        colnames = _make_unique([x.columns[0] for x in cols])
    for i, col in enumerate(cols):
        col.columns = [colnames[i]]

    summ = reduce(merg, cols)

    if regressor_order:
        varnames = summ.index.get_level_values(0).tolist()
        vc = pd.Series(varnames).value_counts()
        # Regressors occupy two rows each (coefficient and std. error).
        varnames = vc.loc[vc == 2].index.tolist()
        ordered = [x for x in regressor_order if x in varnames]
        unordered = [x for x in varnames if x not in regressor_order]
        new_order = ordered + unordered
        other = [x for x in summ.index.get_level_values(0)
                 if x not in new_order]
        new_order += other
        if drop_omitted:
            for uo in unordered:
                new_order.remove(uo)
        summ = summ.loc[new_order]

    # Show each name once: blank the stub of the std. error row below it.
    idx = []
    index = summ.index.get_level_values(0)
    n_rows = index.shape[0]
    for i in range(0, n_rows, 2):
        idx.append(index[i])
        if i + 1 >= n_rows:
            # Odd number of rows: the last row has no partner row.
            break
        idx.append("" if index[i] == index[i + 1] else index[i + 1])
    summ.index = idx

    # add infos about the models.
    if info_dict:
        cols = [_col_info(x, info_dict.get(x.model.__class__.__name__,
                                           info_dict)) for x in results]
    else:
        cols = [_col_info(x, getattr(x, "default_model_infos", None))
                for x in results]
    # use unique column names, otherwise the merge will not succeed
    for df, name in zip(cols, _make_unique([df.columns[0] for df in cols])):
        df.columns = [name]

    info = reduce(merg, cols)
    dat = pd.DataFrame(np.vstack([summ, info]))  # pd.concat better, but error
    dat.columns = summ.columns
    dat.index = pd.Index(summ.index.tolist() + info.index.tolist())
    summ = dat

    summ = summ.fillna('')

    smry = Summary()
    smry._merge_latex = True
    smry.add_df(summ, header=True, align='l')
    smry.add_text('Standard errors in parentheses.')
    if stars:
        smry.add_text('* p<.1, ** p<.05, ***p<.01')

    return smry


def _formatter(element, float_format='%.4f'):
    """Format element with float_format, falling back to str(); strip it.
    """
    try:
        out = float_format % element
    except (ValueError, TypeError):
        out = str(element)
    return out.strip()


def _df_to_simpletable(df, align='r', float_format="%.4f", header=True,
                       index=True, table_dec_above='-', table_dec_below=None,
                       header_dec_below='-', pad_col=0, pad_index=0):
    """Convert a DataFrame into a SimpleTable with the given display options.

    Every cell is turned into a string with ``_formatter``. ``pad_index``
    spaces are appended to the row labels (or to the first data column when
    ``index`` is False) and ``pad_col`` spaces are added to the text column
    separator.
    """
    dat = df.copy()
    # DataFrame.applymap is deprecated since pandas 2.1 in favor of
    # DataFrame.map. Check the class so that a column named "map" cannot
    # be mistaken for the method on older pandas.
    elementwise = dat.map if hasattr(pd.DataFrame, 'map') else dat.applymap
    dat = elementwise(lambda x: _formatter(x, float_format))
    if header:
        headers = [str(x) for x in dat.columns.tolist()]
    else:
        headers = None
    if index:
        stubs = [str(x) + int(pad_index) * ' ' for x in dat.index.tolist()]
    else:
        dat.iloc[:, 0] = [str(x) + int(pad_index) * ' '
                          for x in dat.iloc[:, 0]]
        stubs = None
    st = SimpleTable(np.array(dat), headers=headers, stubs=stubs,
                     ltx_fmt=fmt_latex, txt_fmt=fmt_txt)
    st.output_formats['latex']['data_aligns'] = align
    st.output_formats['latex']['header_align'] = align
    st.output_formats['txt']['data_aligns'] = align
    st.output_formats['txt']['table_dec_above'] = table_dec_above
    st.output_formats['txt']['table_dec_below'] = table_dec_below
    st.output_formats['txt']['header_dec_below'] = header_dec_below
    st.output_formats['txt']['colsep'] = ' ' * int(pad_col + 1)
    return st


def _simple_tables(tables, settings, pad_col=None, pad_index=None):
    """Build one SimpleTable per DataFrame using its own settings entry.

    Parameters
    ----------
    tables : list of DataFrame
    settings : list of dict
        Display options for each table, as stored by ``Summary.add_df``.
    pad_col, pad_index : list of int, optional
        Padding for each table; no padding when omitted.
    """
    if pad_col is None:
        pad_col = [0] * len(tables)
    if pad_index is None:
        pad_index = [0] * len(tables)
    simple_tables = []
    for i, table in enumerate(tables):
        opts = settings[i]
        # Each table is formatted with the float_format it was added with.
        simple_tables.append(_df_to_simpletable(
            table, align=opts['align'],
            float_format=opts.get('float_format', '%.4f'),
            header=opts['header'], index=opts['index'],
            pad_col=pad_col[i], pad_index=pad_index[i]))
    return simple_tables