1from statsmodels.compat.python import lzip
2
3import datetime
4from functools import reduce
5import re
6import textwrap
7
8import numpy as np
9import pandas as pd
10
11from .table import SimpleTable
12from .tableformatting import fmt_latex, fmt_txt
13
14
class Summary(object):
    """Collect tables, notes and a title, and render them together.

    Tables are stored as DataFrames alongside one settings dict per table.
    They are converted to SimpleTable objects only when one of the ``as_*``
    methods is called.
    """

    def __init__(self):
        # DataFrames to render, in the order they were added.
        self.tables = []
        # One dict per table: 'index', 'header', 'float_format', 'align'.
        self.settings = []
        # Notes printed below the tables by as_text.
        self.extra_txt = []
        # Title string; None until add_title is called.
        self.title = None
        # When True, as_latex merges the boundary between consecutive
        # tabular environments into a single rule (set by summary_col).
        self._merge_latex = False

    def __str__(self):
        return self.as_text()

    def __repr__(self):
        return str(type(self)) + '\n"""\n' + self.__str__() + '\n"""'

    def _repr_html_(self):
        """Display as HTML in IPython notebook."""
        return self.as_html()

    def add_df(self, df, index=True, header=True, float_format='%.4f',
               align='r'):
        """
        Add the contents of a DataFrame to summary table

        The DataFrame is stored as-is (not copied); formatting happens when
        the summary is rendered.

        Parameters
        ----------
        df : DataFrame
        header : bool
            Reproduce the DataFrame column labels in summary table
        index : bool
            Reproduce the DataFrame row labels in summary table
        float_format : str
            Formatting to float data columns
        align : str
            Data alignment (l/c/r)
        """

        settings = {'index': index, 'header': header,
                    'float_format': float_format, 'align': align}
        self.tables.append(df)
        self.settings.append(settings)

    def add_array(self, array, align='r', float_format="%.4f"):
        """Add the contents of a Numpy array to summary table

        The array is wrapped in a DataFrame and added without row or
        column labels.

        Parameters
        ----------
        array : numpy array (2D)
        float_format : str
            Formatting to array if type is float
        align : str
            Data alignment (l/c/r)
        """

        table = pd.DataFrame(array)
        self.add_df(table, index=False, header=False,
                    float_format=float_format, align=align)

    def add_dict(self, d, ncols=2, align='l', float_format="%.4f"):
        """Add the contents of a Dict to summary table

        Parameters
        ----------
        d : dict
            Keys and values are automatically coerced to strings with str().
            Users are encouraged to format them before using add_dict.
        ncols : int
            Number of columns of the output table
        align : str
            Data alignment (l/c/r)
        float_format : str
            Formatting applied to keys and values that are numbers
        """

        keys = [_formatter(x, float_format) for x in d.keys()]
        vals = [_formatter(x, float_format) for x in d.values()]
        data = np.array(lzip(keys, vals))

        # Pad with empty (key, value) rows so the rows divide evenly into
        # ``ncols`` vertical sections.
        if data.shape[0] % ncols != 0:
            pad = ncols - (data.shape[0] % ncols)
            data = np.vstack([data, np.array(pad * [['', '']])])

        # Lay the sections side by side: each contributes a key column and
        # a value column.
        data = np.hstack(np.split(data, ncols))
        self.add_array(data, align=align)

    def add_text(self, string):
        """Append a note to the bottom of the summary table. In ASCII tables,
        the note will be wrapped to table width. Notes are not indented.
        """
        self.extra_txt.append(string)

    def add_title(self, title=None, results=None):
        """Insert a title on top of the summary table. If a string is provided
        in the title argument, that string is printed. If no title string is
        provided but a results instance is provided, statsmodels attempts
        to construct a useful title automatically. If neither is provided,
        the title is set to the empty string.
        """
        if isinstance(title, str):
            self.title = title
        elif results is not None:
            model = results.model.__class__.__name__
            # Prefer the descriptive label when the class name is known.
            if model in _model_types:
                model = _model_types[model]
            self.title = 'Results: ' + model
        else:
            self.title = ''

    def add_base(self, results, alpha=0.05, float_format="%.4f", title=None,
                 xname=None, yname=None):
        """Try to construct a basic summary instance.

        Adds a model-information table, a parameter table and a title.

        Parameters
        ----------
        results : Model results instance
        alpha : float
            significance level for the confidence intervals (optional)
        float_format: str
            Float formatting for summary of parameters (optional)
        title : str
            Title of the summary table (optional)
        xname : list[str] of length equal to the number of parameters
            Names of the independent variables (optional)
        yname : str
            Name of the dependent variable (optional)
        """

        param = summary_params(results, alpha=alpha, use_t=results.use_t)
        info = summary_model(results)
        if xname is not None:
            param.index = xname
        if yname is not None:
            info['Dependent Variable:'] = yname
        self.add_dict(info, align='l')
        self.add_df(param, float_format=float_format)
        self.add_title(title=title, results=results)

    def as_text(self):
        """Generate ASCII Summary Table

        Returns
        -------
        str
            The title, the tables framed by ``=`` rules, and the notes,
            joined by newlines. When no table has been added the table
            part is empty and notes are emitted unwrapped, because there
            is no table width to wrap to.
        """

        tables = self.tables
        settings = self.settings
        title = self.title
        extra_txt = self.extra_txt

        if tables:
            pad_col, pad_index, widest = _measure_tables(tables, settings)
            rule_equal = widest * '='

            simple_tables = _simple_tables(tables, settings, pad_col,
                                           pad_index)
            tab = '\n'.join(x.as_text() for x in simple_tables).split('\n')
            # Replace the first rule and close the block with '=' rules.
            tab[0] = rule_equal
            tab.append(rule_equal)
            tab = '\n'.join(tab)
        else:
            # Nothing to measure: avoid max() over an empty sequence.
            widest = 0
            tab = ''

        if title is None:
            title = ''
        elif len(title) < widest:
            # Centre the title over the table.
            title = ' ' * int(widest / 2 - len(title) / 2) + title

        if widest > 0:
            txt = [textwrap.wrap(x, widest) for x in extra_txt]
            txt = '\n'.join('\n'.join(x) for x in txt)
        else:
            # textwrap.wrap rejects a non-positive width.
            txt = '\n'.join(extra_txt)

        return '\n'.join([title, tab, txt])

    def as_html(self):
        """Generate HTML Summary Table

        Returns the HTML of every table joined by newlines; the title and
        the notes are not included.
        """

        tables = self.tables
        settings = self.settings

        simple_tables = _simple_tables(tables, settings)
        tab = [x.as_html() for x in simple_tables]
        tab = '\n'.join(tab)

        return tab

    def as_latex(self, label=''):
        """Generate LaTeX Summary Table

        Parameters
        ----------
        label : str
            Label of the summary table that can be referenced
            in a latex document (optional)
        """
        tables = self.tables
        settings = self.settings
        title = self.title

        if title is not None:
            title = '\\caption{' + title + '}'
        else:
            title = '\\caption{}'

        label = '\\label{' + label + '}'

        simple_tables = _simple_tables(tables, settings)
        tab = [x.as_latex_tabular() for x in simple_tables]
        tab = '\n\n'.join(tab)

        # Regex matching the end of one tabular followed by the start of
        # the next one.
        to_replace = ('\\\\hline\\n\\\\hline\\n\\\\'
                      'end{tabular}\\n\\\\begin{tabular}{.*}\\n')

        if self._merge_latex:
            # create single tabular object for summary_col
            tab = re.sub(to_replace, r'\\midrule\n', tab)

        out = '\\begin{table}', title, label, tab, '\\end{table}'
        out = '\n'.join(out)
        return out
236
237
def _measure_tables(tables, settings):
    """Work out the padding that lines ASCII tables up to a common width.

    Each table is rendered once without padding and the length of its first
    line is taken as its width. For every table, extra spaces are spread
    evenly over its column separators to approach the widest table; the
    leftover characters are assigned to the first column.

    Returns
    -------
    pad_sep : list[int]
        Extra spaces per column separator, one entry per table.
    pad_index : list[int]
        Extra spaces for the first column, one entry per table.
    widest : int
        Width of the widest unpadded table.
    """

    rendered = [tbl.as_text() for tbl in _simple_tables(tables, settings)]
    widths = [len(text.splitlines()[0]) for text in rendered]
    widest = max(widths)

    pad_sep = []
    pad_index = []
    for table, width in zip(tables, widths):
        # A single-column table is still treated as having one separator.
        n_sep = max(table.shape[1] - 1, 1)
        per_sep = int((widest - width) / n_sep)
        pad_sep.append(per_sep)
        # Whatever the separators could not absorb goes to the first column.
        pad_index.append(widest - (width + n_sep * per_sep))

    return pad_sep, pad_index, widest
261
262
# Descriptive labels keyed by model class name. Summary.add_title looks up
# ``results.model.__class__.__name__`` here when it builds a title
# automatically; class names that are not listed are used unchanged.
# TODO: be more specific
_model_types = {'OLS': 'Ordinary least squares',
                'GLS': 'Generalized least squares',
                'GLSAR': 'Generalized least squares with AR(p)',
                'WLS': 'Weighted least squares',
                'RLM': 'Robust linear model',
                'NBin': 'Negative binomial model',
                'GLM': 'Generalized linear model'
                }
272
273
def summary_model(results):
    """
    Create a dict with information about the model

    Each entry is produced by a small getter applied to ``results``. A
    getter that raises AttributeError, KeyError or NotImplementedError is
    skipped, so the returned dict only holds the items that this
    particular results instance can provide.

    Parameters
    ----------
    results : Model results instance
        Must expose ``k_constant``; every other attribute is optional.

    Returns
    -------
    dict
        Maps a label such as 'Model:' or 'AIC:' to its value, in a fixed
        order. Most values are pre-formatted strings.
    """

    def time_now(*args, **kwds):
        now = datetime.datetime.now()
        return now.strftime('%Y-%m-%d %H:%M')

    info = {}
    info['Model:'] = lambda x: x.model.__class__.__name__
    # The attribute is ``__class__``; the previous spelling ``__class``
    # always raised AttributeError, so this entry was silently dropped.
    info['Model Family:'] = lambda x: x.family.__class__.__name__
    info['Link Function:'] = lambda x: x.family.link.__class__.__name__
    info['Dependent Variable:'] = lambda x: x.model.endog_names
    info['Date:'] = time_now
    info['No. Observations:'] = lambda x: "%#6d" % x.nobs
    info['Df Model:'] = lambda x: "%#6d" % x.df_model
    info['Df Residuals:'] = lambda x: "%#6d" % x.df_resid
    info['Converged:'] = lambda x: x.mle_retvals['converged']
    info['No. Iterations:'] = lambda x: x.mle_retvals['iterations']
    info['Method:'] = lambda x: x.method
    info['Norm:'] = lambda x: x.fit_options['norm']
    info['Scale Est.:'] = lambda x: x.fit_options['scale_est']
    info['Cov. Type:'] = lambda x: x.fit_options['cov']

    # Label R-squared as uncentered when the model has no constant.
    rsquared_type = '' if results.k_constant else ' (uncentered)'
    info['R-squared' + rsquared_type + ':'] = lambda x: "%#8.3f" % x.rsquared
    info['Adj. R-squared' + rsquared_type + ':'] = lambda x: "%#8.3f" % x.rsquared_adj  # noqa:E501
    info['Pseudo R-squared:'] = lambda x: "%#8.3f" % x.prsquared
    info['AIC:'] = lambda x: "%8.4f" % x.aic
    info['BIC:'] = lambda x: "%8.4f" % x.bic
    info['Log-Likelihood:'] = lambda x: "%#8.5g" % x.llf
    info['LL-Null:'] = lambda x: "%#8.5g" % x.llnull
    info['LLR p-value:'] = lambda x: "%#8.5g" % x.llr_pvalue
    info['Deviance:'] = lambda x: "%#8.5g" % x.deviance
    info['Pearson chi2:'] = lambda x: "%#6.3g" % x.pearson_chi2
    info['F-statistic:'] = lambda x: "%#8.4g" % x.fvalue
    info['Prob (F-statistic):'] = lambda x: "%#6.3g" % x.f_pvalue
    info['Scale:'] = lambda x: "%#8.5g" % x.scale
    out = {}
    for key, func in info.items():
        try:
            out[key] = func(results)
        except (AttributeError, KeyError, NotImplementedError):
            # NOTE: some models do not have loglike defined (RLM),
            #   so raise NotImplementedError
            pass
    return out
322
323
def summary_params(results, yname=None, xname=None, alpha=.05, use_t=True,
                   skip_header=False, float_format="%.4f"):
    """Build a table of parameter estimates from a results instance

    Parameters
    ----------
    results : results instance or tuple
        Either a results instance, from which ``params``, ``bse``,
        ``tvalues``, ``pvalues`` and ``conf_int(alpha)`` are read, or a
        tuple ``(results, params, bse, tvalues, pvalues, conf_int)`` that
        supplies those values directly.
    yname : {str, None}
        accepted for signature compatibility; not used by this function
    xname : {list[str], None}
        optional row labels; when empty or None the parameter names are
        taken from the model
    alpha : float
        significance level for the confidence intervals
    use_t : bool
        if True the statistic columns are labelled 't' and 'P>|t|',
        otherwise 'z' and 'P>|z|'
    skip_header : bool
        accepted for signature compatibility; not used by this function
    float_format : str
        accepted for signature compatibility; not used by this function

    Returns
    -------
    DataFrame
        One row per parameter with the columns coefficient, standard
        error, test statistic, p-value and the two confidence bounds.
    """

    if isinstance(results, tuple):
        results, params, bse, tvalues, pvalues, conf_int = results
    else:
        params, bse = results.params, results.bse
        tvalues, pvalues = results.tvalues, results.pvalues
        conf_int = results.conf_int(alpha)

    # One row per parameter: four statistics followed by the interval.
    stats = np.array([params, bse, tvalues, pvalues]).T
    table = pd.DataFrame(np.hstack([stats, conf_int]))

    stat = 't' if use_t else 'z'
    table.columns = ['Coef.', 'Std.Err.', stat, 'P>|' + stat + '|',
                     '[' + str(alpha / 2), str(1 - alpha / 2) + ']']

    if xname:
        table.index = xname
    else:
        try:
            table.index = results.model.data.param_names
        except AttributeError:
            # Fall back to the regressor names when the model carries no
            # ``data.param_names``.
            table.index = results.model.exog_names

    return table
382
383
384# Vertical summary instance for multiple models
def _col_params(result, float_format='%.4f', stars=True):
    """Build one column holding each coefficient above its standard error

    Coefficients and standard errors are formatted with ``float_format``,
    standard errors are wrapped in parentheses and, when ``stars`` is True,
    one '*' is appended to a coefficient for each of the cut-offs .1, .05
    and .01 that its p-value falls below. R-squared and adjusted R-squared
    rows are appended when at least one of them is available on ``result``.
    The single column is named after the model's ``endog_names``.
    """

    table = summary_params(result)

    # Turn the first two columns (coefficient, std. error) into strings.
    for col in table.columns[:2]:
        table[col] = table[col].apply(lambda x: float_format % x)
    table.iloc[:, 1] = '(' + table.iloc[:, 1] + ')'

    if stars:
        coef_col = table.columns[0]
        pvalues = table.iloc[:, 3]
        # Each cut-off that is passed adds one more star.
        for cutoff in (.1, .05, .01):
            significant = pvalues < cutoff
            table.loc[significant, coef_col] = (
                table.loc[significant, coef_col] + '*')

    # Interleave coefficients and standard errors in one long column.
    stacked = table.iloc[:, :2].stack()

    r2 = pd.Series({('R-squared', ""): getattr(result, 'rsquared', np.nan),
                    ('R-squared Adj.', ""): getattr(result, 'rsquared_adj',
                                                    np.nan)})
    if r2.notnull().any():
        r2 = r2.apply(lambda x: float_format % x)
        stacked = pd.concat([stacked, r2], axis=0)

    out = pd.DataFrame(stacked)
    out.columns = [str(result.model.endog_names)]
    return out
419
420
421def _col_info(result, info_dict=None):
422    """Stack model info in a column
423    """
424
425    if info_dict is None:
426        info_dict = {}
427    out = []
428    index = []
429    for i in info_dict:
430        if isinstance(info_dict[i], dict):
431            # this is a specific model info_dict, but not for this result...
432            continue
433        try:
434            out.append(info_dict[i](result))
435        except AttributeError:
436            out.append('')
437        index.append(i)
438    out = pd.DataFrame({str(result.model.endog_names): out}, index=index)
439    return out
440
441
442def _make_unique(list_of_names):
443    if len(set(list_of_names)) == len(list_of_names):
444        return list_of_names
445    # pandas does not like it if multiple columns have the same names
446    from collections import defaultdict
447    name_counter = defaultdict(str)
448    header = []
449    for _name in list_of_names:
450        name_counter[_name] += "I"
451        header.append(_name + " " + name_counter[_name])
452    return header
453
454
def summary_col(results, float_format='%.4f', model_names=(), stars=False,
                info_dict=None, regressor_order=(), drop_omitted=False):
    """
    Summarize multiple results instances side-by-side (coefs and SEs)

    Parameters
    ----------
    results : statsmodels results instance or list of result instances
    float_format : str, optional
        float format for coefficients and standard errors
        Default : '%.4f'
    model_names : list[str], optional
        Must have same length as the number of results. If the names are not
        unique, a run of 'I' characters ('I', 'II', ...) is appended to all
        model names
    stars : bool
        print significance stars
    info_dict : dict, default None
        dict of functions to be applied to results instances to retrieve
        model info. To use specific information for different models, add a
        (nested) info_dict with model name as the key.
        Example: `info_dict = {"N":lambda x:(x.nobs), "R2": ..., "OLS":{
        "R2":...}}` would only show `R2` for OLS regression models, but
        additionally `N` for all other results.
        Default : None (use the info_dict specified in
        result.default_model_infos, if this property exists)
    regressor_order : list[str], optional
        list of names of the regressors in the desired order. All regressors
        not specified will be appended to the end of the list.
    drop_omitted : bool, optional
        Only used together with regressor_order. If False (default),
        regressors not listed in regressor_order are appended after the
        listed ones. If True, they are left out of the table.

    Returns
    -------
    Summary
        A summary holding one table with a column per model, plus notes
        explaining the standard errors and, if requested, the stars.
    """

    def _outer_merge(left, right):
        # Align two single-model columns on their row labels.
        return left.merge(right, how='outer', right_index=True,
                          left_index=True)

    if not isinstance(results, list):
        results = [results]

    cols = [_col_params(x, stars=stars, float_format=float_format) for x in
            results]

    # Unique column names (pandas has problems merging otherwise)
    if model_names:
        colnames = _make_unique(model_names)
    else:
        colnames = _make_unique([x.columns[0] for x in cols])
    for i, col in enumerate(cols):
        col.columns = [colnames[i]]

    summ = reduce(_outer_merge, cols)

    if regressor_order:
        varnames = summ.index.get_level_values(0).tolist()
        vc = pd.Series(varnames).value_counts()
        # Regressors occupy exactly two rows (coefficient, std. error).
        varnames = vc.loc[vc == 2].index.tolist()
        ordered = [x for x in regressor_order if x in varnames]
        unordered = [x for x in varnames if x not in regressor_order]
        new_order = ordered + unordered
        # Rows that are not regressor pairs keep their place at the end.
        other = [x for x in summ.index.get_level_values(0)
                 if x not in new_order]
        new_order += other
        if drop_omitted:
            for uo in unordered:
                new_order.remove(uo)
        summ = summ.loc[new_order]

    # Flatten the row index: show a regressor name on its coefficient row
    # and leave the standard-error row below it unlabelled.
    idx = []
    index = summ.index.get_level_values(0)
    n_rows = index.shape[0]
    for i in range(0, n_rows, 2):
        idx.append(index[i])
        if i + 1 >= n_rows:
            # Odd number of rows: the last row has no partner to label.
            break
        if index[i] == index[i + 1]:
            idx.append("")
        else:
            idx.append(index[i + 1])
    summ.index = idx

    # add infos about the models.
    if info_dict:
        cols = [_col_info(x, info_dict.get(x.model.__class__.__name__,
                                           info_dict)) for x in results]
    else:
        cols = [_col_info(x, getattr(x, "default_model_infos", None)) for x in
                results]
    # use unique column names, otherwise the merge will not succeed
    for df, name in zip(cols, _make_unique([df.columns[0] for df in cols])):
        df.columns = [name]

    info = reduce(_outer_merge, cols)
    dat = pd.DataFrame(np.vstack([summ, info]))  # pd.concat better, but error
    dat.columns = summ.columns
    dat.index = pd.Index(summ.index.tolist() + info.index.tolist())
    summ = dat

    summ = summ.fillna('')

    smry = Summary()
    smry._merge_latex = True
    smry.add_df(summ, header=True, align='l')
    smry.add_text('Standard errors in parentheses.')
    if stars:
        smry.add_text('* p<.1, ** p<.05, ***p<.01')

    return smry
565
566
567def _formatter(element, float_format='%.4f'):
568    try:
569        out = float_format % element
570    except (ValueError, TypeError):
571        out = str(element)
572    return out.strip()
573
574
def _df_to_simpletable(df, align='r', float_format="%.4f", header=True,
                       index=True, table_dec_above='-', table_dec_below=None,
                       header_dec_below='-', pad_col=0, pad_index=0):
    """Convert a DataFrame into a SimpleTable of formatted strings

    Parameters
    ----------
    df : DataFrame
        Source data; it is not modified.
    align : str
        Data alignment (l/c/r), used for text and LaTeX output
    float_format : str
        Format applied to every cell through ``_formatter``
    header : bool
        Use the column labels as table headers
    index : bool
        Use the row labels as table stubs
    table_dec_above, table_dec_below, header_dec_below : str or None
        Passed through to the text output format under the same names
    pad_col : int
        Extra spaces added to the single-space column separator
    pad_index : int
        Spaces appended to each stub, or to the first data column when
        ``index`` is False

    Returns
    -------
    SimpleTable
    """

    def fmt(x):
        return _formatter(x, float_format)

    # DataFrame.applymap is deprecated in favour of DataFrame.map since
    # pandas 2.1; check the class so a column named 'map' cannot be
    # mistaken for the method on older versions. Both return a new frame,
    # so no explicit copy of ``df`` is needed.
    if hasattr(pd.DataFrame, 'map'):
        dat = df.map(fmt)
    else:
        dat = df.applymap(fmt)

    if header:
        headers = [str(x) for x in dat.columns.tolist()]
    else:
        headers = None
    if index:
        stubs = [str(x) + int(pad_index) * ' ' for x in dat.index.tolist()]
    else:
        # Without stubs the first data column takes the index padding.
        dat.iloc[:, 0] = [str(x) + int(pad_index) * ' '
                          for x in dat.iloc[:, 0]]
        stubs = None
    st = SimpleTable(np.array(dat), headers=headers, stubs=stubs,
                     ltx_fmt=fmt_latex, txt_fmt=fmt_txt)
    st.output_formats['latex']['data_aligns'] = align
    st.output_formats['latex']['header_align'] = align
    st.output_formats['txt']['data_aligns'] = align
    st.output_formats['txt']['table_dec_above'] = table_dec_above
    st.output_formats['txt']['table_dec_below'] = table_dec_below
    st.output_formats['txt']['header_dec_below'] = header_dec_below
    st.output_formats['txt']['colsep'] = ' ' * int(pad_col + 1)
    return st
600
601
def _simple_tables(tables, settings, pad_col=None, pad_index=None):
    """Convert every stored table into a SimpleTable

    Parameters
    ----------
    tables : list of DataFrame
    settings : list of dict
        One dict per table with the keys 'index', 'header', 'align' and
        'float_format' ('%.4f' is used when 'float_format' is missing).
    pad_col : list[int], optional
        Extra separator padding per table; defaults to no padding.
    pad_index : list[int], optional
        Extra first-column padding per table; defaults to no padding.

    Returns
    -------
    list of SimpleTable
        In the same order as ``tables``.
    """
    if pad_col is None:
        pad_col = [0] * len(tables)
    if pad_index is None:
        pad_index = [0] * len(tables)

    simple_tables = []
    for i, table in enumerate(tables):
        opts = settings[i]
        # Use each table's own float format; previously the format of the
        # first table was applied to all of them.
        simple_tables.append(_df_to_simpletable(
            table, align=opts['align'],
            float_format=opts.get('float_format', '%.4f'),
            header=opts['header'], index=opts['index'],
            pad_col=pad_col[i], pad_index=pad_index[i]))
    return simple_tables
619