1"""
2Base tools for handling various kinds of data structures, attaching metadata to
3results, and doing data cleaning
4"""
5from statsmodels.compat.python import lmap
6
7from functools import reduce
8
9import numpy as np
10from pandas import DataFrame, Series, isnull, MultiIndex
11
12import statsmodels.tools.data as data_util
13from statsmodels.tools.decorators import cache_readonly, cache_writable
14from statsmodels.tools.sm_exceptions import MissingDataError
15
16
17def _asarray_2dcolumns(x):
18    if np.asarray(x).ndim > 1 and np.asarray(x).squeeze().ndim == 1:
19        return
20
21
22def _asarray_2d_null_rows(x):
23    """
24    Makes sure input is an array and is 2d. Makes sure output is 2d. True
25    indicates a null in the rows of 2d x.
26    """
27    #Have to have the asarrays because isnull does not account for array_like
28    #input
29    x = np.asarray(x)
30    if x.ndim == 1:
31        x = x[:, None]
32    return np.any(isnull(x), axis=1)[:, None]
33
34
def _nan_rows(*arrs):
    """
    Combine the row-wise null indicators of all inputs into a single mask.

    Returns a boolean array which is True where any of the rows in any of the
    _2d_ arrays in ``arrs`` contain NaNs. Inputs can be any mixture of
    Series, DataFrames or array_like. The combined result is squeezed before
    it is returned.

    A left-hand operand that has a boolean ``dtype`` (such as the running
    result of the reduction) is additionally OR-ed in as it is, i.e. its True
    entries are carried over into the result.
    """
    if len(arrs) == 1:
        # the pairwise reduction needs a second operand; this one has no nulls
        arrs = arrs + ([[False]],)

    def _merge_pair(left, right):
        # hasattr check because a DataFrame has ``dtypes`` but no ``dtype``
        if hasattr(left, 'dtype') and left.dtype == bool:
            carried_mask = left
        else:
            carried_mask = False
        left_nulls = _asarray_2d_null_rows(left)
        right_nulls = _asarray_2d_null_rows(right)
        return np.logical_or(left_nulls, carried_mask | right_nulls)

    return reduce(_merge_pair, arrs).squeeze()
50
51
class ModelData(object):
    """
    Class responsible for handling input data and extracting metadata into the
    appropriate form
    """
    # explicit overrides returned by the param_names / cov_names properties
    _param_names = None
    _cov_names = None

    def __init__(self, endog, exog=None, missing='none', hasconst=None,
                 **kwargs):
        """
        Store the original endog/exog and their array versions.

        Parameters
        ----------
        endog : array_like
            Stored as ``orig_endog``; its converted form as ``endog``.
        exog : array_like, optional
            Stored as ``orig_exog``; its converted form as ``exog``.
        missing : str
            With 'none' no missing-data handling is done. Any other value is
            forwarded to ``handle_missing``.
        hasconst : bool or None
            Forwarded to ``_handle_constant``.
        **kwargs
            ``design_info`` and ``formula`` are stored as attributes of the
            same name. The remaining entries are attached to the instance,
            after missing-data handling when ``missing`` is not 'none'.

        Raises
        ------
        NotImplementedError
            If endog or exog is a recarray.
        """
        if data_util._is_recarray(endog) or data_util._is_recarray(exog):
            from statsmodels.tools.sm_exceptions import recarray_exception
            raise NotImplementedError(recarray_exception)
        if 'design_info' in kwargs:
            self.design_info = kwargs.pop('design_info')
        if 'formula' in kwargs:
            self.formula = kwargs.pop('formula')
        if missing != 'none':
            arrays, nan_idx = self.handle_missing(endog, exog, missing,
                                                  **kwargs)
            self.missing_row_idx = nan_idx
            self.__dict__.update(arrays)  # attach all the data arrays
            self.orig_endog = self.endog
            self.orig_exog = self.exog
            self.endog, self.exog = self._convert_endog_exog(self.endog,
                                                             self.exog)
        else:
            self.__dict__.update(kwargs)  # attach the extra arrays anyway
            self.orig_endog = endog
            self.orig_exog = exog
            self.endog, self.exog = self._convert_endog_exog(endog, exog)

        self.const_idx = None
        self.k_constant = 0
        self._handle_constant(hasconst)
        self._check_integrity()
        self._cache = {}

    def __getstate__(self):
        """
        Return a shallow copy of the instance dict for pickling.

        ``design_info`` is removed from the copy and replaced by the flag
        ``restore_design_info`` so that ``__setstate__`` rebuilds it.
        """
        from copy import copy
        d = copy(self.__dict__)
        if "design_info" in d:
            del d["design_info"]
            d["restore_design_info"] = True
        return d

    def __setstate__(self, d):
        """
        Restore the instance dict, rebuilding ``design_info`` if flagged.

        When ``restore_design_info`` is present in ``d``, the design matrices
        are recreated with patsy from ``d['formula']`` and ``d['frame']``
        (or, if there is no frame, from ``orig_endog`` joined with
        ``orig_exog``). Several ``eval_env`` depths are tried in turn; if all
        of them fail the last exception is re-raised.
        """
        if "restore_design_info" in d:
            # NOTE: there may be a more performant way to do this
            from patsy import dmatrices, PatsyError
            exc = []
            try:
                data = d['frame']
            except KeyError:
                data = d['orig_endog'].join(d['orig_exog'])

            # sequence is a guess where to likely find it
            for depth in [2, 3, 1, 0, 4]:
                try:
                    _, design = dmatrices(d['formula'], data, eval_env=depth,
                                          return_type='dataframe')
                    break
                except (NameError, PatsyError) as e:
                    # keep a reference, ``e`` is unbound after the except block
                    exc.append(e)
            else:
                raise exc[-1]

            self.design_info = design.design_info
            del d["restore_design_info"]
        self.__dict__.update(d)

    def _handle_constant(self, hasconst):
        """
        Set ``k_constant`` and ``const_idx`` from the columns of ``exog``.

        Parameters
        ----------
        hasconst : bool or None
            False skips the detection (``k_constant = 0``). True forces
            ``k_constant = 1`` even if no constant column is found. Otherwise
            the constant is detected: an explicit constant column sets
            ``const_idx``, and if none is found an implicit constant is
            looked for by comparing the rank of exog with the rank of exog
            augmented by a column of ones.

        Raises
        ------
        MissingDataError
            If exog contains non-finite values.
        """
        if hasconst is False or self.exog is None:
            self.k_constant = 0
            self.const_idx = None
        else:
            # detect where the constant is
            check_implicit = False
            exog_max = np.max(self.exog, axis=0)
            if not np.isfinite(exog_max).all():
                raise MissingDataError('exog contains inf or nans')
            exog_min = np.min(self.exog, axis=0)
            # a -inf entry does not show up in the column maxima, so the
            # minima have to be checked as well
            if not np.isfinite(exog_min).all():
                raise MissingDataError('exog contains inf or nans')
            const_idx = np.where(exog_max == exog_min)[0].squeeze()
            self.k_constant = const_idx.size

            if self.k_constant == 1:
                if self.exog[:, const_idx].mean() != 0:
                    self.const_idx = int(const_idx)
                else:
                    # we only have a zero column and no other constant
                    check_implicit = True
            elif self.k_constant > 1:
                # we have more than one constant column
                # look for ones
                values = []  # keep values if we need != 0
                for idx in const_idx:
                    value = self.exog[:, idx].mean()
                    if value == 1:
                        self.k_constant = 1
                        self.const_idx = int(idx)
                        break
                    values.append(value)
                else:
                    # we did not break, no column of ones
                    pos = (np.array(values) != 0)
                    if pos.any():
                        # take the first nonzero column
                        self.k_constant = 1
                        self.const_idx = int(const_idx[pos.argmax()])
                    else:
                        # only zero columns
                        check_implicit = True
            elif self.k_constant == 0:
                check_implicit = True
            else:
                # should not be here
                pass

            if check_implicit and not hasconst:
                # look for implicit constant
                # Compute rank of augmented matrix
                augmented_exog = np.column_stack(
                            (np.ones(self.exog.shape[0]), self.exog))
                rank_augm = np.linalg.matrix_rank(augmented_exog)
                rank_orig = np.linalg.matrix_rank(self.exog)
                self.k_constant = int(rank_orig == rank_augm)
                self.const_idx = None
            elif hasconst:
                # Ensure k_constant is 1 any time hasconst is True
                # even if one is not found
                self.k_constant = 1

    @classmethod
    def _drop_nans(cls, x, nan_mask):
        """Return the rows of ``x`` selected by the boolean ``nan_mask``."""
        return x[nan_mask]

    @classmethod
    def _drop_nans_2d(cls, x, nan_mask):
        """Return ``x`` with ``nan_mask`` applied to both rows and columns."""
        return x[nan_mask][:, nan_mask]

    @classmethod
    def handle_missing(cls, endog, exog, missing, **kwargs):
        """
        This returns a dictionary with keys endog, exog and the keys of
        kwargs. It preserves Nones.

        Parameters
        ----------
        endog : array_like
        exog : array_like or None
        missing : str
            'raise' raises if any row has missing values, 'drop' removes
            those rows. The option is only inspected when missing values are
            found.
        **kwargs
            Extra arrays. None and zero-dimensional values are passed
            through untouched. Arrays that are 1d (or squeeze to 1d) are
            filtered by row, 2d arrays are _assumed_ to be symmetric and are
            filtered on rows and columns. The optional ``missing_idx`` is a
            boolean mask of rows already dropped from endog/exog (by patsy);
            it is not modified.

        Returns
        -------
        arrays : dict
            endog, exog and the extra arrays after missing-value handling.
        dropped : list
            Positions of the dropped rows; empty if nothing was missing.

        Raises
        ------
        MissingDataError
            If missing is 'raise' and missing values are present.
        ValueError
            For extra arrays with more than 2 dimensions, for a length
            mismatch between ``missing_idx`` and the extra arrays, or for an
            unknown ``missing`` option when missing values are present.
        """
        none_array_names = []

        # patsy's already dropped NaNs in y/X
        missing_idx = kwargs.pop('missing_idx', None)

        if missing_idx is not None:
            # y, X already handled by patsy. add back in later.
            combined = ()
            combined_names = []
            if exog is None:
                none_array_names += ['exog']
        elif exog is not None:
            combined = (endog, exog)
            combined_names = ['endog', 'exog']
        else:
            combined = (endog,)
            combined_names = ['endog']
            none_array_names += ['exog']

        # deal with other arrays
        combined_2d = ()
        combined_2d_names = []
        for key, value_array in kwargs.items():
            if value_array is None or np.ndim(value_array) == 0:
                none_array_names += [key]
                continue
            # convert once up front so list-like extras work as well
            value_array = np.asarray(value_array)
            # grab 1d arrays and arrays that squeeze to 1d
            if value_array.ndim == 1 or value_array.squeeze().ndim == 1:
                combined += (value_array,)
                combined_names += [key]
            # grab 2d arrays that are _assumed_ to be symmetric
            elif value_array.ndim == 2:
                combined_2d += (value_array,)
                combined_2d_names += [key]
            else:
                raise ValueError("Arrays with more than 2 dimensions "
                                 "are not yet handled")

        if missing_idx is not None:
            nan_mask = np.asarray(missing_idx, dtype=bool)
            # endog/exog only hold the rows patsy kept, so masks applied to
            # them must be taken relative to this selection. It is fixed
            # here, before any extra-array NaNs are merged into nan_mask.
            kept_by_patsy = ~nan_mask
            updated_row_mask = None
            if combined:  # there were extra arrays not handled by patsy
                combined_nans = _nan_rows(*combined)
                if combined_nans.shape[0] != nan_mask.shape[0]:
                    raise ValueError("Shape mismatch between endog/exog "
                                     "and extra arrays given to model.")
                # for going back and updated endog/exog
                updated_row_mask = combined_nans[kept_by_patsy]
                # out-of-place so that the caller's missing_idx is untouched
                nan_mask = nan_mask | combined_nans
            if combined_2d:
                combined_2d_nans = _nan_rows(*combined_2d)
                if combined_2d_nans.shape[0] != nan_mask.shape[0]:
                    raise ValueError("Shape mismatch between endog/exog "
                                     "and extra 2d arrays given to model.")
                extra_2d_rows = combined_2d_nans[kept_by_patsy]
                if updated_row_mask is not None:
                    updated_row_mask = updated_row_mask | extra_2d_rows
                else:
                    updated_row_mask = extra_2d_rows
                nan_mask = nan_mask | combined_2d_nans

        else:
            nan_mask = _nan_rows(*combined)
            if combined_2d:
                # the boolean first argument is carried over as a mask
                nan_mask = _nan_rows(nan_mask[:, None], *combined_2d)

        if not np.any(nan_mask):  # no missing do not do anything
            combined = dict(zip(combined_names, combined))
            if combined_2d:
                combined.update(dict(zip(combined_2d_names, combined_2d)))
            if none_array_names:
                combined.update({k: kwargs.get(k, None)
                                 for k in none_array_names})

            if missing_idx is not None:
                combined.update({'endog': endog})
                if exog is not None:
                    combined.update({'exog': exog})

            return combined, []

        elif missing == 'raise':
            raise MissingDataError("NaNs were encountered in the data")

        elif missing == 'drop':
            keep_mask = ~nan_mask
            combined = dict(zip(combined_names,
                                [cls._drop_nans(arr, keep_mask)
                                 for arr in combined]))

            if missing_idx is not None:
                if updated_row_mask is not None:
                    keep_rows = ~updated_row_mask
                    # update endog/exog with this new information
                    endog = cls._drop_nans(endog, keep_rows)
                    if exog is not None:
                        exog = cls._drop_nans(exog, keep_rows)

                combined.update({'endog': endog})
                if exog is not None:
                    combined.update({'exog': exog})

            if combined_2d:
                combined.update(dict(zip(combined_2d_names,
                                         [cls._drop_nans_2d(arr, keep_mask)
                                          for arr in combined_2d])))
            if none_array_names:
                combined.update({k: kwargs.get(k, None)
                                 for k in none_array_names})

            return combined, np.where(~keep_mask)[0].tolist()
        else:
            raise ValueError("missing option %s not understood" % missing)

    def _convert_endog_exog(self, endog, exog):
        """
        Convert endog and exog to arrays.

        A 1d exog is turned into a single column. Returns ``(yarr, xarr)``
        where ``xarr`` is None if exog is None.

        Raises
        ------
        ValueError
            If the converted exog is not 2d.
        """

        # for consistent outputs if endog is (n,1)
        yarr = self._get_yarr(endog)
        xarr = None
        if exog is not None:
            xarr = self._get_xarr(exog)
            if xarr.ndim == 1:
                xarr = xarr[:, None]
            if xarr.ndim != 2:
                raise ValueError("exog is not 1d or 2d")

        return yarr, xarr

    @cache_writable()
    def ynames(self):
        """
        Names of endog: a single name, or a list if there are several.

        Taken from ``orig_endog`` where available, generated otherwise.
        """
        endog = self.orig_endog
        ynames = self._get_names(endog)
        if not ynames:
            ynames = _make_endog_names(self.endog)

        if len(ynames) == 1:
            return ynames[0]
        else:
            return list(ynames)

    @cache_writable()
    def xnames(self):
        """
        List of exog names, or None if there is no exog.

        Taken from ``orig_exog`` where available, generated otherwise.
        """
        exog = self.orig_exog
        if exog is not None:
            xnames = self._get_names(exog)
            if not xnames:
                xnames = _make_exog_names(self.exog)
            return list(xnames)
        return None

    @property
    def param_names(self):
        """Parameter names; falls back to ``xnames`` if not set."""
        # for handling names of 'extra' parameters in summary, etc.
        return self._param_names or self.xnames

    @param_names.setter
    def param_names(self, values):
        self._param_names = values

    @property
    def cov_names(self):
        """
        Labels for covariance matrices

        In multidimensional models, each dimension of a covariance matrix
        differs from the number of param_names.

        If not set, returns param_names
        """
        # for handling names of covariance names in multidimensional models
        if self._cov_names is not None:
            return self._cov_names
        return self.param_names

    @cov_names.setter
    def cov_names(self, value):
        # for handling names of covariance names in multidimensional models
        self._cov_names = value

    @cache_readonly
    def row_labels(self):
        """Row labels of ``orig_exog``, or of ``orig_endog`` without exog."""
        exog = self.orig_exog
        if exog is not None:
            row_labels = self._get_row_labels(exog)
        else:
            endog = self.orig_endog
            row_labels = self._get_row_labels(endog)
        return row_labels

    def _get_row_labels(self, arr):
        """Return None; this class does not provide row labels."""
        return None

    def _get_names(self, arr):
        """
        Extract variable names from ``arr``.

        Returns the column names of a DataFrame (MultiIndex columns are
        flattened by joining their non-empty levels with '_'), the name of a
        named Series in a list, or ``arr.dtype.names`` for other objects
        that have a dtype. Returns None if no names are found.
        """
        if isinstance(arr, DataFrame):
            if isinstance(arr.columns, MultiIndex):
                # Flatten MultiIndexes into "simple" column names
                # str() so that non-string levels can be joined as well
                return ['_'.join(str(level) for level in c if level)
                        for c in arr.columns]
            else:
                return list(arr.columns)
        elif isinstance(arr, Series):
            if arr.name:
                return [arr.name]
            else:
                return
        else:
            try:
                return arr.dtype.names
            except AttributeError:
                pass

        return None

    def _get_yarr(self, endog):
        """
        Convert endog to a squeezed array.

        An input of length one is never squeezed to a scalar; it is returned
        as an array of length one.
        """
        if data_util._is_structured_ndarray(endog):
            endog = data_util.struct_to_ndarray(endog)
        endog = np.asarray(endog)
        if len(endog) == 1:  # never squeeze to a scalar
            if endog.ndim == 1:
                return endog
            elif endog.ndim > 1:
                return np.asarray([endog.squeeze()])

        return endog.squeeze()

    def _get_xarr(self, exog):
        """Convert exog to an array."""
        if data_util._is_structured_ndarray(exog):
            exog = data_util.struct_to_ndarray(exog)
        return np.asarray(exog)

    def _check_integrity(self):
        """Raise ValueError if exog and endog differ in length."""
        if self.exog is not None:
            if len(self.exog) != len(self.endog):
                raise ValueError("endog and exog matrices are different sizes")

    def wrap_output(self, obj, how='columns', names=None):
        """
        Dispatch ``obj`` to the ``attach_*`` method selected by ``how``.

        ``names`` is only forwarded for 'generic_columns' and
        'generic_columns_2d'. An unrecognized ``how`` returns ``obj``
        unchanged.
        """
        if how == 'columns':
            return self.attach_columns(obj)
        elif how == 'rows':
            return self.attach_rows(obj)
        elif how == 'cov':
            return self.attach_cov(obj)
        elif how == 'dates':
            return self.attach_dates(obj)
        elif how == 'columns_eq':
            return self.attach_columns_eq(obj)
        elif how == 'cov_eq':
            return self.attach_cov_eq(obj)
        elif how == 'generic_columns':
            return self.attach_generic_columns(obj, names)
        elif how == 'generic_columns_2d':
            return self.attach_generic_columns_2d(obj, names)
        elif how == 'ynames':
            return self.attach_ynames(obj)
        elif how == 'multivariate_confint':
            return self.attach_mv_confint(obj)
        else:
            return obj

    # The attach_* methods of this class return the result unchanged.

    def attach_columns(self, result):
        return result

    def attach_columns_eq(self, result):
        return result

    def attach_cov(self, result):
        return result

    def attach_cov_eq(self, result):
        return result

    def attach_rows(self, result):
        return result

    def attach_dates(self, result):
        return result

    def attach_mv_confint(self, result):
        return result

    def attach_generic_columns(self, result, *args, **kwargs):
        return result

    def attach_generic_columns_2d(self, result, *args, **kwargs):
        return result

    def attach_ynames(self, result):
        return result
489
490
class PatsyData(ModelData):
    """
    Data handling class that reads variable names from the ``design_info``
    attribute of the data
    """

    def _get_names(self, arr):
        """Return the column names stored in ``arr.design_info``."""
        design_info = arr.design_info
        return design_info.column_names
494
495
class PandasData(ModelData):
    """
    Data handling class which knows how to reattach pandas metadata to model
    results
    """

    def _convert_endog_exog(self, endog, exog=None):
        """
        Convert endog/exog to arrays, rejecting object dtype.

        Raises
        ------
        ValueError
            If either input becomes an array of dtype object.
        """
        # TODO: remove this when we handle dtype systematically
        endog_arr = np.asarray(endog)
        exog_arr = None if exog is None else np.asarray(exog)
        cast_to_object = endog_arr.dtype == object or (
            exog_arr is not None and exog_arr.dtype == object)
        if cast_to_object:
            raise ValueError("Pandas data cast to numpy dtype of object. "
                             "Check input data with np.asarray(data).")
        parent = super(PandasData, self)
        return parent._convert_endog_exog(endog_arr, exog_arr)

    @classmethod
    def _drop_nans(cls, x, nan_mask):
        """Select the rows of ``x`` given by the boolean ``nan_mask``."""
        if not isinstance(x, (Series, DataFrame)):
            # extra arguments could be plain ndarrays
            return super(PandasData, cls)._drop_nans(x, nan_mask)
        return x.loc[nan_mask]

    @classmethod
    def _drop_nans_2d(cls, x, nan_mask):
        """Apply the boolean ``nan_mask`` to the rows and columns of ``x``."""
        if not isinstance(x, (Series, DataFrame)):
            # extra arguments could be plain ndarrays
            return super(PandasData, cls)._drop_nans_2d(x, nan_mask)
        rows_selected = x.loc[nan_mask]
        return rows_selected.loc[:, nan_mask]

    def _check_integrity(self):
        """
        Check that the indexes of endog and exog agree, then run the checks
        of the parent class.

        Raises
        ------
        ValueError
            If both original inputs have an index and the two differ.
        """
        endog, exog = self.orig_endog, self.orig_exog
        # exog can be None and we could be upcasting one or the other
        both_indexed = (exog is not None and hasattr(endog, 'index')
                        and hasattr(exog, 'index'))
        if both_indexed and not endog.index.equals(exog.index):
            raise ValueError("The indices for endog and exog are not aligned")
        super(PandasData, self)._check_integrity()

    def _get_row_labels(self, arr):
        """Return the index of ``arr``, or that of the original endog."""
        try:
            labels = arr.index
        except AttributeError:
            # if we've gotten here it's because endog is pandas and
            # exog is not, so just return the row labels from endog
            labels = self.orig_endog.index
        return labels

    def attach_generic_columns(self, result, names):
        """
        Wrap ``result`` in a Series indexed by the attribute called
        ``names`` (no index labels if that attribute does not exist).
        """
        # get the attribute to use
        labels = getattr(self, names, None)
        return Series(result, index=labels)

    def attach_generic_columns_2d(self, result, rownames, colnames=None):
        """
        Wrap ``result`` in a DataFrame labelled by the attributes called
        ``rownames`` and ``colnames``; ``colnames`` defaults to ``rownames``.
        """
        if not colnames:
            colnames = rownames
        row_labels = getattr(self, rownames, None)
        col_labels = getattr(self, colnames, None)
        return DataFrame(result, index=row_labels, columns=col_labels)

    def attach_columns(self, result):
        """Label ``result`` with ``param_names`` along its first axis."""
        # this can either be a 1d array or a scalar
        # do not squeeze because it might be a 2d row array
        # if it needs a squeeze, the bug is elsewhere
        # more than one dimension occurs for e.g., confidence intervals
        wrapper = Series if result.ndim <= 1 else DataFrame
        return wrapper(result, index=self.param_names)

    def attach_columns_eq(self, result):
        """Wrap ``result`` with ``xnames`` as rows and ``ynames`` as columns."""
        row_labels = self.xnames
        col_labels = self.ynames
        return DataFrame(result, index=row_labels, columns=col_labels)

    def attach_cov(self, result):
        """Wrap ``result`` with ``cov_names`` on both axes."""
        labels = self.cov_names
        return DataFrame(result, index=labels, columns=labels)

    def attach_cov_eq(self, result):
        """Wrap ``result`` with ``ynames`` on both axes."""
        labels = self.ynames
        return DataFrame(result, index=labels, columns=labels)

    def attach_rows(self, result):
        """
        Wrap ``result`` and label its rows with the trailing ``row_labels``.

        A result that squeezes to fewer than two dimensions becomes a
        Series, anything else a DataFrame with ``ynames`` as columns.
        """
        # assumes if len(row_labels) > len(result) it's bc it was truncated
        # at the front, for AR lags, for example
        flat = result.squeeze()
        n_endog = len(np.array(self.ynames, ndmin=1))
        if n_endog > 1 and flat.shape == (n_endog,):
            # a single row holding one value per endog variable
            flat = flat[np.newaxis, :]
        # May be zero-dim, for example in the case of forecast one step in tsa
        if flat.ndim >= 2:
            wrapped = DataFrame(result)
            wrapped.columns = self.ynames
        else:
            wrapped = Series(flat)
        wrapped.index = self.row_labels[-len(result):]
        return wrapped

    def attach_dates(self, result):
        """
        Wrap ``result`` with ``predict_dates`` as the index.

        A result that squeezes to fewer than two dimensions becomes a
        Series, anything else a DataFrame with ``ynames`` as columns.
        """
        flat = result.squeeze()
        n_endog = len(np.array(self.ynames, ndmin=1))
        if n_endog > 1 and flat.shape == (n_endog,):
            # a single row holding one value per endog variable
            flat = np.asarray(flat)[np.newaxis, :]
        # May be zero-dim, for example in the case of forecast one step in tsa
        if flat.ndim >= 2:
            return DataFrame(np.asarray(result),
                             index=self.predict_dates,
                             columns=self.ynames)
        return Series(flat, index=self.predict_dates)

    def attach_mv_confint(self, result):
        """
        Reshape ``result`` to two columns, 'lower' and 'upper', with
        ``cov_names`` as the index.
        """
        bounds = result.reshape((-1, 2))
        return DataFrame(bounds,
                         index=self.cov_names,
                         columns=['lower', 'upper'])

    def attach_ynames(self, result):
        """
        Label ``result`` with ``ynames``: as the name of a Series, or as the
        columns of a DataFrame if the squeezed result has two or more
        dimensions.
        """
        flat = result.squeeze()
        # May be zero-dim, for example in the case of forecast one step in tsa
        if flat.ndim >= 2:
            return DataFrame(result, columns=self.ynames)
        return Series(flat, name=self.ynames)
612
613
614def _make_endog_names(endog):
615    if endog.ndim == 1 or endog.shape[1] == 1:
616        ynames = ['y']
617    else:  # for VAR
618        ynames = ['y%d' % (i+1) for i in range(endog.shape[1])]
619
620    return ynames
621
622
623def _make_exog_names(exog):
624    exog_var = exog.var(0)
625    if (exog_var == 0).any():
626        # assumes one constant in first or last position
627        # avoid exception if more than one constant
628        const_idx = exog_var.argmin()
629        exog_names = ['x%d' % i for i in range(1, exog.shape[1])]
630        exog_names.insert(const_idx, 'const')
631    else:
632        exog_names = ['x%d' % i for i in range(1, exog.shape[1]+1)]
633
634    return exog_names
635
636
def handle_missing(endog, exog=None, missing='none', **kwargs):
    """
    Handle missing values with the data class matching endog/exog.

    With ``missing='none'`` the inputs are returned untouched as
    ``(dict, None)``, where the dict holds endog, exog and the extra keyword
    arguments. Otherwise the result of the ``handle_missing`` classmethod of
    the selected data class is returned.
    """
    # the class lookup comes first so unrecognized inputs always raise
    data_class = handle_data_class_factory(endog, exog)
    if missing != 'none':
        return data_class.handle_missing(endog, exog, missing=missing,
                                         **kwargs)
    arrays = {'endog': endog, 'exog': exog}
    arrays.update(kwargs)
    return arrays, None
644
645
def handle_data_class_factory(endog, exog):
    """
    Given inputs, return the data handling class to use for them.

    The checks are made in a fixed order and the first match wins.

    Raises
    ------
    ValueError
        If none of the checks recognizes the data structures.
    """
    if data_util._is_using_ndarray_type(endog, exog):
        return ModelData
    if data_util._is_using_pandas(endog, exog):
        return PandasData
    if data_util._is_using_patsy(endog, exog):
        return PatsyData
    # keep this check last
    if data_util._is_using_ndarray(endog, exog):
        return ModelData
    raise ValueError('unrecognized data structures: %s / %s' %
                     (type(endog), type(exog)))
663
664
def handle_data(endog, exog, missing='none', hasconst=None, **kwargs):
    """
    Create the data handling instance matching the type of endog/exog.

    Lists and tuples are converted to arrays before the data class is
    selected. All arguments are forwarded to the constructor of that class.
    """
    # deal with lists and tuples up-front
    endog, exog = [np.asarray(data) if isinstance(data, (list, tuple))
                   else data
                   for data in (endog, exog)]

    data_class = handle_data_class_factory(endog, exog)
    return data_class(endog, exog=exog, missing=missing, hasconst=hasconst,
                      **kwargs)
675