"""
Base tools for handling various kinds of data structures, attaching metadata to
results, and doing data cleaning.
"""
5from statsmodels.compat.python import lmap
7from functools import reduce
9import numpy as np
10from pandas import DataFrame, Series, isnull, MultiIndex
12import statsmodels.tools.data as data_util
13from statsmodels.tools.decorators import cache_readonly, cache_writable
14from statsmodels.tools.sm_exceptions import MissingDataError
17def _asarray_2dcolumns(x):
18    if np.asarray(x).ndim > 1 and np.asarray(x).squeeze().ndim == 1:
19        return
22def _asarray_2d_null_rows(x):
23    """
24    Makes sure input is an array and is 2d. Makes sure output is 2d. True
25    indicates a null in the rows of 2d x.
26    """
27    #Have to have the asarrays because isnull does not account for array_like
28    #input
29    x = np.asarray(x)
30    if x.ndim == 1:
31        x = x[:, None]
32    return np.any(isnull(x), axis=1)[:, None]
def _nan_rows(*arrs):
    """
    Combine the per-row null flags of several arrays into one boolean array.

    The result is True where any row of any of the _2d_ arrays in ``arrs``
    holds a null. Inputs can be any mixture of Series, DataFrames or
    array_like. The arrays are folded pairwise from left to right and the
    final result is squeezed.

    NOTE(review): a left operand with boolean dtype is OR-ed into the result
    as if it were an already computed mask (this is how the running result is
    carried between steps). A boolean *data* array passed first is therefore
    indistinguishable from a mask -- confirm callers never do that.
    """
    if len(arrs) == 1:
        # reduce needs two operands; pad with an entry that is never null
        arrs = arrs + ([[False]],)

    def _fold(acc, nxt):
        # check for dtype bc dataframe has dtypes
        if hasattr(acc, 'dtype') and acc.dtype == bool:
            carried = acc
        else:
            carried = False
        acc_nulls = _asarray_2d_null_rows(acc)
        nxt_nulls = _asarray_2d_null_rows(nxt)
        return np.logical_or(acc_nulls, carried | nxt_nulls)

    return reduce(_fold, arrs).squeeze()
52class ModelData(object):
53    """
54    Class responsible for handling input data and extracting metadata into the
55    appropriate form
56    """
57    _param_names = None
58    _cov_names = None
60    def __init__(self, endog, exog=None, missing='none', hasconst=None,
61                 **kwargs):
62        if data_util._is_recarray(endog) or data_util._is_recarray(exog):
63            from statsmodels.tools.sm_exceptions import recarray_exception
64            raise NotImplementedError(recarray_exception)
65        if 'design_info' in kwargs:
66            self.design_info = kwargs.pop('design_info')
67        if 'formula' in kwargs:
68            self.formula = kwargs.pop('formula')
69        if missing != 'none':
70            arrays, nan_idx = self.handle_missing(endog, exog, missing,
71                                                  **kwargs)
72            self.missing_row_idx = nan_idx
73            self.__dict__.update(arrays)  # attach all the data arrays
74            self.orig_endog = self.endog
75            self.orig_exog = self.exog
76            self.endog, self.exog = self._convert_endog_exog(self.endog,
77                                                             self.exog)
78        else:
79            self.__dict__.update(kwargs)  # attach the extra arrays anyway
80            self.orig_endog = endog
81            self.orig_exog = exog
82            self.endog, self.exog = self._convert_endog_exog(endog, exog)
84        self.const_idx = None
85        self.k_constant = 0
86        self._handle_constant(hasconst)
87        self._check_integrity()
88        self._cache = {}
90    def __getstate__(self):
91        from copy import copy
92        d = copy(self.__dict__)
93        if "design_info" in d:
94            del d["design_info"]
95            d["restore_design_info"] = True
96        return d
98    def __setstate__(self, d):
99        if "restore_design_info" in d:
100            # NOTE: there may be a more performant way to do this
101            from patsy import dmatrices, PatsyError
102            exc = []
103            try:
104                data = d['frame']
105            except KeyError:
106                data = d['orig_endog'].join(d['orig_exog'])
108            for depth in [2, 3, 1, 0, 4]:  # sequence is a guess where to likely find it
109                try:
110                    _, design = dmatrices(d['formula'], data, eval_env=depth,
111                                          return_type='dataframe')
112                    break
113                except (NameError, PatsyError) as e:
114                    exc.append(e)   # why do I need a reference from outside except block
115                    pass
116            else:
117                raise exc[-1]
119            self.design_info = design.design_info
120            del d["restore_design_info"]
121        self.__dict__.update(d)
123    def _handle_constant(self, hasconst):
124        if hasconst is False or self.exog is None:
125            self.k_constant = 0
126            self.const_idx = None
127        else:
128            # detect where the constant is
129            check_implicit = False
130            exog_max = np.max(self.exog, axis=0)
131            if not np.isfinite(exog_max).all():
132                raise MissingDataError('exog contains inf or nans')
133            exog_min = np.min(self.exog, axis=0)
134            const_idx = np.where(exog_max == exog_min)[0].squeeze()
135            self.k_constant = const_idx.size
137            if self.k_constant == 1:
138                if self.exog[:, const_idx].mean() != 0:
139                    self.const_idx = int(const_idx)
140                else:
141                    # we only have a zero column and no other constant
142                    check_implicit = True
143            elif self.k_constant > 1:
144                # we have more than one constant column
145                # look for ones
146                values = []  # keep values if we need != 0
147                for idx in const_idx:
148                    value = self.exog[:, idx].mean()
149                    if value == 1:
150                        self.k_constant = 1
151                        self.const_idx = int(idx)
152                        break
153                    values.append(value)
154                else:
155                    # we did not break, no column of ones
156                    pos = (np.array(values) != 0)
157                    if pos.any():
158                        # take the first nonzero column
159                        self.k_constant = 1
160                        self.const_idx = int(const_idx[pos.argmax()])
161                    else:
162                        # only zero columns
163                        check_implicit = True
164            elif self.k_constant == 0:
165                check_implicit = True
166            else:
167                # should not be here
168                pass
170            if check_implicit and not hasconst:
171                # look for implicit constant
172                # Compute rank of augmented matrix
173                augmented_exog = np.column_stack(
174                            (np.ones(self.exog.shape[0]), self.exog))
175                rank_augm = np.linalg.matrix_rank(augmented_exog)
176                rank_orig = np.linalg.matrix_rank(self.exog)
177                self.k_constant = int(rank_orig == rank_augm)
178                self.const_idx = None
179            elif hasconst:
180                # Ensure k_constant is 1 any time hasconst is True
181                # even if one is not found
182                self.k_constant = 1
    @classmethod
    def _drop_nans(cls, x, nan_mask):
        """
        Return ``x`` indexed by ``nan_mask``.

        NOTE: despite the name, ``x`` is indexed directly with ``nan_mask``,
        so with a boolean mask the True entries are the ones that are kept
        (``handle_missing`` inverts its missing-row mask before calling).
        """
        return x[nan_mask]
188    @classmethod
189    def _drop_nans_2d(cls, x, nan_mask):
190        return x[nan_mask][:, nan_mask]
192    @classmethod
193    def handle_missing(cls, endog, exog, missing, **kwargs):
194        """
195        This returns a dictionary with keys endog, exog and the keys of
196        kwargs. It preserves Nones.
197        """
198        none_array_names = []
200        # patsy's already dropped NaNs in y/X
201        missing_idx = kwargs.pop('missing_idx', None)
203        if missing_idx is not None:
204            # y, X already handled by patsy. add back in later.
205            combined = ()
206            combined_names = []
207            if exog is None:
208                none_array_names += ['exog']
209        elif exog is not None:
210            combined = (endog, exog)
211            combined_names = ['endog', 'exog']
212        else:
213            combined = (endog,)
214            combined_names = ['endog']
215            none_array_names += ['exog']
217        # deal with other arrays
218        combined_2d = ()
219        combined_2d_names = []
220        if len(kwargs):
221            for key, value_array in kwargs.items():
222                if value_array is None or np.ndim(value_array) == 0:
223                    none_array_names += [key]
224                    continue
225                # grab 1d arrays
226                if value_array.ndim == 1:
227                    combined += (np.asarray(value_array),)
228                    combined_names += [key]
229                elif value_array.squeeze().ndim == 1:
230                    combined += (np.asarray(value_array),)
231                    combined_names += [key]
233                # grab 2d arrays that are _assumed_ to be symmetric
234                elif value_array.ndim == 2:
235                    combined_2d += (np.asarray(value_array),)
236                    combined_2d_names += [key]
237                else:
238                    raise ValueError("Arrays with more than 2 dimensions "
239                                     "are not yet handled")
241        if missing_idx is not None:
242            nan_mask = missing_idx
243            updated_row_mask = None
244            if combined:  # there were extra arrays not handled by patsy
245                combined_nans = _nan_rows(*combined)
246                if combined_nans.shape[0] != nan_mask.shape[0]:
247                    raise ValueError("Shape mismatch between endog/exog "
248                                     "and extra arrays given to model.")
249                # for going back and updated endog/exog
250                updated_row_mask = combined_nans[~nan_mask]
251                nan_mask |= combined_nans  # for updating extra arrays only
252            if combined_2d:
253                combined_2d_nans = _nan_rows(combined_2d)
254                if combined_2d_nans.shape[0] != nan_mask.shape[0]:
255                    raise ValueError("Shape mismatch between endog/exog "
256                                     "and extra 2d arrays given to model.")
257                if updated_row_mask is not None:
258                    updated_row_mask |= combined_2d_nans[~nan_mask]
259                else:
260                    updated_row_mask = combined_2d_nans[~nan_mask]
261                nan_mask |= combined_2d_nans
263        else:
264            nan_mask = _nan_rows(*combined)
265            if combined_2d:
266                nan_mask = _nan_rows(*(nan_mask[:, None],) + combined_2d)
268        if not np.any(nan_mask):  # no missing do not do anything
269            combined = dict(zip(combined_names, combined))
270            if combined_2d:
271                combined.update(dict(zip(combined_2d_names, combined_2d)))
272            if none_array_names:
273                combined.update({k: kwargs.get(k, None)
274                                 for k in none_array_names})
276            if missing_idx is not None:
277                combined.update({'endog': endog})
278                if exog is not None:
279                    combined.update({'exog': exog})
281            return combined, []
283        elif missing == 'raise':
284            raise MissingDataError("NaNs were encountered in the data")
286        elif missing == 'drop':
287            nan_mask = ~nan_mask
288            drop_nans = lambda x: cls._drop_nans(x, nan_mask)
289            drop_nans_2d = lambda x: cls._drop_nans_2d(x, nan_mask)
290            combined = dict(zip(combined_names, lmap(drop_nans, combined)))
292            if missing_idx is not None:
293                if updated_row_mask is not None:
294                    updated_row_mask = ~updated_row_mask
295                    # update endog/exog with this new information
296                    endog = cls._drop_nans(endog, updated_row_mask)
297                    if exog is not None:
298                        exog = cls._drop_nans(exog, updated_row_mask)
300                combined.update({'endog': endog})
301                if exog is not None:
302                    combined.update({'exog': exog})
304            if combined_2d:
305                combined.update(dict(zip(combined_2d_names,
306                                         lmap(drop_nans_2d, combined_2d))))
307            if none_array_names:
308                combined.update({k: kwargs.get(k, None)
309                                 for k in none_array_names})
311            return combined, np.where(~nan_mask)[0].tolist()
312        else:
313            raise ValueError("missing option %s not understood" % missing)
315    def _convert_endog_exog(self, endog, exog):
317        # for consistent outputs if endog is (n,1)
318        yarr = self._get_yarr(endog)
319        xarr = None
320        if exog is not None:
321            xarr = self._get_xarr(exog)
322            if xarr.ndim == 1:
323                xarr = xarr[:, None]
324            if xarr.ndim != 2:
325                raise ValueError("exog is not 1d or 2d")
327        return yarr, xarr
329    @cache_writable()
330    def ynames(self):
331        endog = self.orig_endog
332        ynames = self._get_names(endog)
333        if not ynames:
334            ynames = _make_endog_names(self.endog)
336        if len(ynames) == 1:
337            return ynames[0]
338        else:
339            return list(ynames)
341    @cache_writable()
342    def xnames(self):
343        exog = self.orig_exog
344        if exog is not None:
345            xnames = self._get_names(exog)
346            if not xnames:
347                xnames = _make_exog_names(self.exog)
348            return list(xnames)
349        return None
351    @property
352    def param_names(self):
353        # for handling names of 'extra' parameters in summary, etc.
354        return self._param_names or self.xnames
356    @param_names.setter
357    def param_names(self, values):
358        self._param_names = values
360    @property
361    def cov_names(self):
362        """
363        Labels for covariance matrices
365        In multidimensional models, each dimension of a covariance matrix
366        differs from the number of param_names.
368        If not set, returns param_names
369        """
370        # for handling names of covariance names in multidimensional models
371        if self._cov_names is not None:
372            return self._cov_names
373        return self.param_names
375    @cov_names.setter
376    def cov_names(self, value):
377        # for handling names of covariance names in multidimensional models
378        self._cov_names = value
380    @cache_readonly
381    def row_labels(self):
382        exog = self.orig_exog
383        if exog is not None:
384            row_labels = self._get_row_labels(exog)
385        else:
386            endog = self.orig_endog
387            row_labels = self._get_row_labels(endog)
388        return row_labels
    def _get_row_labels(self, arr):
        """Return None; this implementation does not look at ``arr``."""
        return None
393    def _get_names(self, arr):
394        if isinstance(arr, DataFrame):
395            if isinstance(arr.columns, MultiIndex):
396                # Flatten MultiIndexes into "simple" column names
397                return ['_'.join((level for level in c if level))
398                        for c in arr.columns]
399            else:
400                return list(arr.columns)
401        elif isinstance(arr, Series):
402            if arr.name:
403                return [arr.name]
404            else:
405                return
406        else:
407            try:
408                return arr.dtype.names
409            except AttributeError:
410                pass
412        return None
414    def _get_yarr(self, endog):
415        if data_util._is_structured_ndarray(endog):
416            endog = data_util.struct_to_ndarray(endog)
417        endog = np.asarray(endog)
418        if len(endog) == 1:  # never squeeze to a scalar
419            if endog.ndim == 1:
420                return endog
421            elif endog.ndim > 1:
422                return np.asarray([endog.squeeze()])
424        return endog.squeeze()
426    def _get_xarr(self, exog):
427        if data_util._is_structured_ndarray(exog):
428            exog = data_util.struct_to_ndarray(exog)
429        return np.asarray(exog)
431    def _check_integrity(self):
432        if self.exog is not None:
433            if len(self.exog) != len(self.endog):
434                raise ValueError("endog and exog matrices are different sizes")
436    def wrap_output(self, obj, how='columns', names=None):
437        if how == 'columns':
438            return self.attach_columns(obj)
439        elif how == 'rows':
440            return self.attach_rows(obj)
441        elif how == 'cov':
442            return self.attach_cov(obj)
443        elif how == 'dates':
444            return self.attach_dates(obj)
445        elif how == 'columns_eq':
446            return self.attach_columns_eq(obj)
447        elif how == 'cov_eq':
448            return self.attach_cov_eq(obj)
449        elif how == 'generic_columns':
450            return self.attach_generic_columns(obj, names)
451        elif how == 'generic_columns_2d':
452            return self.attach_generic_columns_2d(obj, names)
453        elif how == 'ynames':
454            return self.attach_ynames(obj)
455        elif how == 'multivariate_confint':
456            return self.attach_mv_confint(obj)
457        else:
458            return obj
    # The attach_* methods below are the targets of ``wrap_output``. In this
    # class each of them returns ``result`` unchanged.

    def attach_columns(self, result):
        """Return ``result`` unchanged."""
        return result

    def attach_columns_eq(self, result):
        """Return ``result`` unchanged."""
        return result

    def attach_cov(self, result):
        """Return ``result`` unchanged."""
        return result

    def attach_cov_eq(self, result):
        """Return ``result`` unchanged."""
        return result

    def attach_rows(self, result):
        """Return ``result`` unchanged."""
        return result

    def attach_dates(self, result):
        """Return ``result`` unchanged."""
        return result

    def attach_mv_confint(self, result):
        """Return ``result`` unchanged."""
        return result

    def attach_generic_columns(self, result, *args, **kwargs):
        """Return ``result`` unchanged; extra arguments are ignored."""
        return result

    def attach_generic_columns_2d(self, result, *args, **kwargs):
        """Return ``result`` unchanged; extra arguments are ignored."""
        return result

    def attach_ynames(self, result):
        """Return ``result`` unchanged."""
        return result
class PatsyData(ModelData):
    """
    Data handling class that reads names from an attached ``design_info``.
    """

    def _get_names(self, arr):
        """Return the ``column_names`` of ``arr.design_info``."""
        design_info = arr.design_info
        return design_info.column_names
496class PandasData(ModelData):
497    """
498    Data handling class which knows how to reattach pandas metadata to model
499    results
500    """
502    def _convert_endog_exog(self, endog, exog=None):
503        #TODO: remove this when we handle dtype systematically
504        endog = np.asarray(endog)
505        exog = exog if exog is None else np.asarray(exog)
506        if endog.dtype == object or exog is not None and exog.dtype == object:
507            raise ValueError("Pandas data cast to numpy dtype of object. "
508                             "Check input data with np.asarray(data).")
509        return super(PandasData, self)._convert_endog_exog(endog, exog)
511    @classmethod
512    def _drop_nans(cls, x, nan_mask):
513        if isinstance(x, (Series, DataFrame)):
514            return x.loc[nan_mask]
515        else:  # extra arguments could be plain ndarrays
516            return super(PandasData, cls)._drop_nans(x, nan_mask)
518    @classmethod
519    def _drop_nans_2d(cls, x, nan_mask):
520        if isinstance(x, (Series, DataFrame)):
521            return x.loc[nan_mask].loc[:, nan_mask]
522        else:  # extra arguments could be plain ndarrays
523            return super(PandasData, cls)._drop_nans_2d(x, nan_mask)
525    def _check_integrity(self):
526        endog, exog = self.orig_endog, self.orig_exog
527        # exog can be None and we could be upcasting one or the other
528        if (exog is not None and
529                (hasattr(endog, 'index') and hasattr(exog, 'index')) and
530                not self.orig_endog.index.equals(self.orig_exog.index)):
531            raise ValueError("The indices for endog and exog are not aligned")
532        super(PandasData, self)._check_integrity()
534    def _get_row_labels(self, arr):
535        try:
536            return arr.index
537        except AttributeError:
538            # if we've gotten here it's because endog is pandas and
539            # exog is not, so just return the row labels from endog
540            return self.orig_endog.index
542    def attach_generic_columns(self, result, names):
543        # get the attribute to use
544        column_names = getattr(self, names, None)
545        return Series(result, index=column_names)
547    def attach_generic_columns_2d(self, result, rownames, colnames=None):
548        colnames = colnames or rownames
549        rownames = getattr(self, rownames, None)
550        colnames = getattr(self, colnames, None)
551        return DataFrame(result, index=rownames, columns=colnames)
553    def attach_columns(self, result):
554        # this can either be a 1d array or a scalar
555        # do not squeeze because it might be a 2d row array
556        # if it needs a squeeze, the bug is elsewhere
557        if result.ndim <= 1:
558            return Series(result, index=self.param_names)
559        else:  # for e.g., confidence intervals
560            return DataFrame(result, index=self.param_names)
    def attach_columns_eq(self, result):
        """Return a DataFrame with ``xnames`` as index and ``ynames`` as
        columns."""
        return DataFrame(result, index=self.xnames, columns=self.ynames)

    def attach_cov(self, result):
        """Return a DataFrame labelled by ``cov_names`` on both axes."""
        return DataFrame(result, index=self.cov_names, columns=self.cov_names)

    def attach_cov_eq(self, result):
        """Return a DataFrame labelled by ``ynames`` on both axes."""
        return DataFrame(result, index=self.ynames, columns=self.ynames)
571    def attach_rows(self, result):
572        # assumes if len(row_labels) > len(result) it's bc it was truncated
573        # at the front, for AR lags, for example
574        squeezed = result.squeeze()
575        k_endog = np.array(self.ynames, ndmin=1).shape[0]
576        if k_endog > 1 and squeezed.shape == (k_endog,):
577            squeezed = squeezed[None, :]
578        # May be zero-dim, for example in the case of forecast one step in tsa
579        if squeezed.ndim < 2:
580            out = Series(squeezed)
581        else:
582            out = DataFrame(result)
583            out.columns = self.ynames
584        out.index = self.row_labels[-len(result):]
585        return out
587    def attach_dates(self, result):
588        squeezed = result.squeeze()
589        k_endog = np.array(self.ynames, ndmin=1).shape[0]
590        if k_endog > 1 and squeezed.shape == (k_endog,):
591            squeezed = np.asarray(squeezed)[None, :]
592        # May be zero-dim, for example in the case of forecast one step in tsa
593        if squeezed.ndim < 2:
594            return Series(squeezed, index=self.predict_dates)
595        else:
596            return DataFrame(np.asarray(result),
597                             index=self.predict_dates,
598                             columns=self.ynames)
600    def attach_mv_confint(self, result):
601        return DataFrame(result.reshape((-1, 2)),
602                         index=self.cov_names,
603                         columns=['lower', 'upper'])
605    def attach_ynames(self, result):
606        squeezed = result.squeeze()
607        # May be zero-dim, for example in the case of forecast one step in tsa
608        if squeezed.ndim < 2:
609            return Series(squeezed, name=self.ynames)
610        else:
611            return DataFrame(result, columns=self.ynames)
614def _make_endog_names(endog):
615    if endog.ndim == 1 or endog.shape[1] == 1:
616        ynames = ['y']
617    else:  # for VAR
618        ynames = ['y%d' % (i+1) for i in range(endog.shape[1])]
620    return ynames
623def _make_exog_names(exog):
624    exog_var = exog.var(0)
625    if (exog_var == 0).any():
626        # assumes one constant in first or last position
627        # avoid exception if more than one constant
628        const_idx = exog_var.argmin()
629        exog_names = ['x%d' % i for i in range(1, exog.shape[1])]
630        exog_names.insert(const_idx, 'const')
631    else:
632        exog_names = ['x%d' % i for i in range(1, exog.shape[1]+1)]
634    return exog_names
def handle_missing(endog, exog=None, missing='none', **kwargs):
    """
    Clean missing values using the data class that matches the inputs.

    The data class is resolved first, so unrecognized inputs raise even for
    ``missing='none'``. With ``missing='none'`` the inputs are returned
    untouched as ``(dict, None)``; otherwise the work is delegated to the
    class's ``handle_missing``.
    """
    klass = handle_data_class_factory(endog, exog)
    if missing != 'none':
        return klass.handle_missing(endog, exog, missing=missing, **kwargs)

    arrays = {'endog': endog, 'exog': exog}
    arrays.update(kwargs)
    return arrays, None
def handle_data_class_factory(endog, exog):
    """
    Given inputs, return the data handling class to use for them.

    The ``data_util`` type checks are tried in a fixed order and the class
    paired with the first one that succeeds is returned.

    Raises
    ------
    ValueError
        If none of the checks recognizes the inputs.
    """
    candidates = (
        (data_util._is_using_ndarray_type, ModelData),
        (data_util._is_using_pandas, PandasData),
        (data_util._is_using_patsy, PatsyData),
        # keep this check last
        (data_util._is_using_ndarray, ModelData),
    )
    for is_match, klass in candidates:
        if is_match(endog, exog):
            return klass
    raise ValueError('unrecognized data structures: %s / %s' %
                     (type(endog), type(exog)))
def handle_data(endog, exog, missing='none', hasconst=None, **kwargs):
    """
    Build the data handling object that matches the type of the inputs.

    Lists and tuples are converted to ndarrays up front; the class chosen
    by ``handle_data_class_factory`` is then instantiated with all
    arguments.
    """
    def _array_if_sequence(data):
        # deal with lists and tuples up-front
        return np.asarray(data) if isinstance(data, (list, tuple)) else data

    endog = _array_if_sequence(endog)
    exog = _array_if_sequence(exog)

    klass = handle_data_class_factory(endog, exog)
    return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
                 **kwargs)