"""
Base tools for handling various kinds of data structures, attaching metadata to
results, and doing data cleaning
"""
from statsmodels.compat.python import lmap

from functools import reduce

import numpy as np
from pandas import DataFrame, Series, isnull, MultiIndex

import statsmodels.tools.data as data_util
from statsmodels.tools.decorators import cache_readonly, cache_writable
from statsmodels.tools.sm_exceptions import MissingDataError


def _asarray_2dcolumns(x):
    """
    Placeholder helper; returns None for every input.

    NOTE(review): both the explicit ``return`` and the implicit fall-through
    yield None, so this function has no effect. It is kept only so existing
    imports of the name keep working.
    """
    if np.asarray(x).ndim > 1 and np.asarray(x).squeeze().ndim == 1:
        return


def _asarray_2d_null_rows(x):
    """
    Makes sure input is an array and is 2d. Makes sure output is 2d. True
    indicates a null in the rows of 2d x.

    Parameters
    ----------
    x : array_like
        1d or 2d data; 1d input is treated as a single column.

    Returns
    -------
    ndarray
        Boolean array of shape (nrows, 1).
    """
    # Have to have the asarrays because isnull does not account for
    # array_like input
    x = np.asarray(x)
    if x.ndim == 1:
        x = x[:, None]
    return np.any(isnull(x), axis=1)[:, None]


def _nan_rows(*arrs):
    """
    Returns a boolean array which is True where any of the rows in any
    of the _2d_ arrays in arrs are NaNs. Inputs can be any mixture of Series,
    DataFrames or array_like.

    Returns
    -------
    ndarray
        The combined row mask after ``squeeze()``.
    """
    if len(arrs) == 1:
        # reduce needs two operands; pad with a mask that flags nothing
        arrs += ([[False]],)

    def _nan_row_maybe_two_inputs(x, y):
        # check for dtype bc dataframe has dtypes
        # When x is itself a boolean array (e.g. the accumulated mask from
        # the previous reduction step) its True entries are carried over;
        # otherwise this evaluates to False and contributes nothing.
        x_is_boolean_array = hasattr(x, 'dtype') and x.dtype == bool and x
        return np.logical_or(_asarray_2d_null_rows(x),
                             (x_is_boolean_array | _asarray_2d_null_rows(y)))
    return reduce(_nan_row_maybe_two_inputs, arrs).squeeze()


class ModelData(object):
    """
    Class responsible for handling input data and extracting metadata into the
    appropriate form

    Parameters
    ----------
    endog : array_like
        Dependent variable(s).
    exog : array_like, optional
        Regressors. May be None.
    missing : str
        'none' skips missing-data handling; any other value is forwarded to
        `handle_missing` ('drop' or 'raise').
    hasconst : bool, optional
        Forwarded to `_handle_constant`.
    **kwargs
        ``design_info`` and ``formula`` are popped and stored as attributes.
        The remaining entries are extra arrays that are attached to the
        instance (after missing-data handling when ``missing != 'none'``).
    """
    _param_names = None
    _cov_names = None

    def __init__(self, endog, exog=None, missing='none', hasconst=None,
                 **kwargs):
        if data_util._is_recarray(endog) or data_util._is_recarray(exog):
            from statsmodels.tools.sm_exceptions import recarray_exception
            raise NotImplementedError(recarray_exception)
        if 'design_info' in kwargs:
            self.design_info = kwargs.pop('design_info')
        if 'formula' in kwargs:
            self.formula = kwargs.pop('formula')
        if missing != 'none':
            arrays, nan_idx = self.handle_missing(endog, exog, missing,
                                                  **kwargs)
            self.missing_row_idx = nan_idx
            self.__dict__.update(arrays)  # attach all the data arrays
            self.orig_endog = self.endog
            self.orig_exog = self.exog
            self.endog, self.exog = self._convert_endog_exog(self.endog,
                                                             self.exog)
        else:
            self.__dict__.update(kwargs)  # attach the extra arrays anyway
            self.orig_endog = endog
            self.orig_exog = exog
            self.endog, self.exog = self._convert_endog_exog(endog, exog)

        self.const_idx = None
        self.k_constant = 0
        self._handle_constant(hasconst)
        self._check_integrity()
        self._cache = {}

    def __getstate__(self):
        """
        Return a shallow copy of the instance dict without ``design_info``.

        When ``design_info`` is present it is removed and replaced by a
        ``restore_design_info`` flag so `__setstate__` can rebuild it.
        """
        from copy import copy
        d = copy(self.__dict__)
        if "design_info" in d:
            del d["design_info"]
            d["restore_design_info"] = True
        return d

    def __setstate__(self, d):
        """
        Restore state, rebuilding ``design_info`` from the stored formula
        when `__getstate__` flagged that it was stripped.

        Raises
        ------
        NameError, patsy.PatsyError
            The last error seen if the formula cannot be evaluated at any of
            the attempted ``eval_env`` depths.
        """
        if "restore_design_info" in d:
            # NOTE: there may be a more performant way to do this
            from patsy import dmatrices, PatsyError
            # Python 3 unbinds the ``as`` name when the except block exits,
            # so the last error is kept in an outer variable for re-raising.
            last_exc = None
            try:
                data = d['frame']
            except KeyError:
                data = d['orig_endog'].join(d['orig_exog'])

            # sequence is a guess where to likely find it
            for depth in [2, 3, 1, 0, 4]:
                try:
                    _, design = dmatrices(d['formula'], data, eval_env=depth,
                                          return_type='dataframe')
                    break
                except (NameError, PatsyError) as e:
                    last_exc = e
            else:
                raise last_exc

            self.design_info = design.design_info
            del d["restore_design_info"]
        self.__dict__.update(d)

    def _handle_constant(self, hasconst):
        """
        Set ``k_constant`` and ``const_idx`` by inspecting ``self.exog``.

        A column counts as constant when its max equals its min. A constant
        column of ones is preferred, then the first nonzero constant column.
        If no usable explicit constant is found and `hasconst` is not truthy,
        an implicit constant is detected by checking whether prepending a
        column of ones leaves the matrix rank unchanged.

        Parameters
        ----------
        hasconst : bool or None
            False disables detection. A truthy value forces
            ``k_constant = 1`` when no explicit constant column was found.

        Raises
        ------
        MissingDataError
            If the column-wise maximum of exog is not finite.
        """
        if hasconst is False or self.exog is None:
            self.k_constant = 0
            self.const_idx = None
        else:
            # detect where the constant is
            check_implicit = False
            exog_max = np.max(self.exog, axis=0)
            if not np.isfinite(exog_max).all():
                raise MissingDataError('exog contains inf or nans')
            exog_min = np.min(self.exog, axis=0)
            const_idx = np.where(exog_max == exog_min)[0].squeeze()
            self.k_constant = const_idx.size

            if self.k_constant == 1:
                if self.exog[:, const_idx].mean() != 0:
                    self.const_idx = int(const_idx)
                else:
                    # we only have a zero column and no other constant
                    check_implicit = True
            elif self.k_constant > 1:
                # we have more than one constant column
                # look for ones
                values = []  # keep values if we need != 0
                for idx in const_idx:
                    value = self.exog[:, idx].mean()
                    if value == 1:
                        self.k_constant = 1
                        self.const_idx = int(idx)
                        break
                    values.append(value)
                else:
                    # we did not break, no column of ones
                    pos = (np.array(values) != 0)
                    if pos.any():
                        # take the first nonzero column
                        self.k_constant = 1
                        self.const_idx = int(const_idx[pos.argmax()])
                    else:
                        # only zero columns
                        check_implicit = True
            elif self.k_constant == 0:
                check_implicit = True
            else:
                # should not be here
                pass

            if check_implicit and not hasconst:
                # look for implicit constant
                # Compute rank of augmented matrix
                augmented_exog = np.column_stack(
                    (np.ones(self.exog.shape[0]), self.exog))
                rank_augm = np.linalg.matrix_rank(augmented_exog)
                rank_orig = np.linalg.matrix_rank(self.exog)
                self.k_constant = int(rank_orig == rank_augm)
                self.const_idx = None
            elif hasconst:
                # Ensure k_constant is 1 any time hasconst is True
                # even if one is not found
                self.k_constant = 1

    @classmethod
    def _drop_nans(cls, x, nan_mask):
        """Select the rows of `x` where `nan_mask` is True (a keep-mask)."""
        return x[nan_mask]

    @classmethod
    def _drop_nans_2d(cls, x, nan_mask):
        """Select rows and then columns of `x` with the same keep-mask."""
        return x[nan_mask][:, nan_mask]

    @classmethod
    def handle_missing(cls, endog, exog, missing, **kwargs):
        """
        This returns a dictionary with keys endog, exog and the keys of
        kwargs. It preserves Nones.

        Parameters
        ----------
        endog, exog : array_like
            Model data; `exog` may be None.
        missing : str
            'drop' removes rows with missing values, 'raise' raises when any
            are found.
        **kwargs
            Extra arrays. ``missing_idx`` (popped here) is a boolean row mask
            signalling that patsy already dropped NaN rows from endog/exog.
            None and 0-dimensional values are passed through untouched.
            Arrays that are 1d (or squeeze to 1d) are row-filtered; other 2d
            arrays are assumed symmetric and filtered on rows and columns.

        Returns
        -------
        arrays : dict
            The (possibly filtered) arrays keyed by name.
        dropped : list
            Positions of the dropped rows; empty when nothing was missing.

        Raises
        ------
        MissingDataError
            If ``missing == 'raise'`` and missing values were found.
        ValueError
            On an unknown `missing` option, an extra array with more than
            two dimensions, or a row-count mismatch with ``missing_idx``.
        """
        none_array_names = []

        # patsy's already dropped NaNs in y/X
        missing_idx = kwargs.pop('missing_idx', None)

        if missing_idx is not None:
            # y, X already handled by patsy. add back in later.
            combined = ()
            combined_names = []
            if exog is None:
                none_array_names += ['exog']
        elif exog is not None:
            combined = (endog, exog)
            combined_names = ['endog', 'exog']
        else:
            combined = (endog,)
            combined_names = ['endog']
            none_array_names += ['exog']

        # deal with other arrays
        combined_2d = ()
        combined_2d_names = []
        for key, value_array in kwargs.items():
            if value_array is None or np.ndim(value_array) == 0:
                none_array_names += [key]
                continue
            # grab 1d arrays
            if value_array.ndim == 1:
                combined += (np.asarray(value_array),)
                combined_names += [key]
            elif value_array.squeeze().ndim == 1:
                combined += (np.asarray(value_array),)
                combined_names += [key]

            # grab 2d arrays that are _assumed_ to be symmetric
            elif value_array.ndim == 2:
                combined_2d += (np.asarray(value_array),)
                combined_2d_names += [key]
            else:
                raise ValueError("Arrays with more than 2 dimensions "
                                 "are not yet handled")

        if missing_idx is not None:
            nan_mask = missing_idx
            updated_row_mask = None
            if combined or combined_2d:
                # Snapshot of the rows patsy kept. It must be taken before
                # nan_mask is widened below, because endog/exog only contain
                # these rows and updated_row_mask is expressed relative to
                # them.
                kept_by_patsy = ~nan_mask
            if combined:  # there were extra arrays not handled by patsy
                combined_nans = _nan_rows(*combined)
                if combined_nans.shape[0] != nan_mask.shape[0]:
                    raise ValueError("Shape mismatch between endog/exog "
                                     "and extra arrays given to model.")
                # for going back and updated endog/exog
                updated_row_mask = combined_nans[kept_by_patsy]
                # for updating extra arrays only
                # NOTE(review): this widens the caller's missing_idx array
                # in place; kept as-is since callers may rely on it.
                nan_mask |= combined_nans
            if combined_2d:
                # Unpack so each 2d array contributes its own row mask
                combined_2d_nans = _nan_rows(*combined_2d)
                if combined_2d_nans.shape[0] != nan_mask.shape[0]:
                    raise ValueError("Shape mismatch between endog/exog "
                                     "and extra 2d arrays given to model.")
                if updated_row_mask is not None:
                    updated_row_mask |= combined_2d_nans[kept_by_patsy]
                else:
                    updated_row_mask = combined_2d_nans[kept_by_patsy]
                nan_mask |= combined_2d_nans

        else:
            nan_mask = _nan_rows(*combined)
            if combined_2d:
                nan_mask = _nan_rows(*(nan_mask[:, None],) + combined_2d)

        if not np.any(nan_mask):  # no missing do not do anything
            combined = dict(zip(combined_names, combined))
            if combined_2d:
                combined.update(dict(zip(combined_2d_names, combined_2d)))
            if none_array_names:
                combined.update({k: kwargs.get(k, None)
                                 for k in none_array_names})

            if missing_idx is not None:
                combined.update({'endog': endog})
                if exog is not None:
                    combined.update({'exog': exog})

            return combined, []

        elif missing == 'raise':
            raise MissingDataError("NaNs were encountered in the data")

        elif missing == 'drop':
            keep_mask = ~nan_mask
            drop_nans = lambda x: cls._drop_nans(x, keep_mask)
            drop_nans_2d = lambda x: cls._drop_nans_2d(x, keep_mask)
            combined = dict(zip(combined_names, lmap(drop_nans, combined)))

            if missing_idx is not None:
                if updated_row_mask is not None:
                    updated_row_mask = ~updated_row_mask
                    # update endog/exog with this new information
                    endog = cls._drop_nans(endog, updated_row_mask)
                    if exog is not None:
                        exog = cls._drop_nans(exog, updated_row_mask)

                combined.update({'endog': endog})
                if exog is not None:
                    combined.update({'exog': exog})

            if combined_2d:
                combined.update(dict(zip(combined_2d_names,
                                         lmap(drop_nans_2d, combined_2d))))
            if none_array_names:
                combined.update({k: kwargs.get(k, None)
                                 for k in none_array_names})

            return combined, np.where(~keep_mask)[0].tolist()
        else:
            raise ValueError("missing option %s not understood" % missing)

    def _convert_endog_exog(self, endog, exog):
        """
        Convert endog/exog to ndarrays; exog is made 2d.

        Raises
        ------
        ValueError
            If exog has more than two dimensions.
        """
        # for consistent outputs if endog is (n,1)
        yarr = self._get_yarr(endog)
        xarr = None
        if exog is not None:
            xarr = self._get_xarr(exog)
            if xarr.ndim == 1:
                xarr = xarr[:, None]
            if xarr.ndim != 2:
                raise ValueError("exog is not 1d or 2d")

        return yarr, xarr

    @cache_writable()
    def ynames(self):
        """
        Name(s) of endog: taken from the original data when available,
        otherwise generated. A single name is returned as a scalar, several
        as a list.
        """
        endog = self.orig_endog
        ynames = self._get_names(endog)
        if not ynames:
            ynames = _make_endog_names(self.endog)

        if len(ynames) == 1:
            return ynames[0]
        else:
            return list(ynames)

    @cache_writable()
    def xnames(self):
        """
        List of exog column names (from the data or generated), or None when
        there is no exog.
        """
        exog = self.orig_exog
        if exog is not None:
            xnames = self._get_names(exog)
            if not xnames:
                xnames = _make_exog_names(self.exog)
            return list(xnames)
        return None

    @property
    def param_names(self):
        """Parameter names; falls back to `xnames` when not explicitly set."""
        # for handling names of 'extra' parameters in summary, etc.
        return self._param_names or self.xnames

    @param_names.setter
    def param_names(self, values):
        self._param_names = values

    @property
    def cov_names(self):
        """
        Labels for covariance matrices

        In multidimensional models, each dimension of a covariance matrix
        differs from the number of param_names.

        If not set, returns param_names
        """
        # for handling names of covariance names in multidimensional models
        if self._cov_names is not None:
            return self._cov_names
        return self.param_names

    @cov_names.setter
    def cov_names(self, value):
        # for handling names of covariance names in multidimensional models
        self._cov_names = value

    @cache_readonly
    def row_labels(self):
        """Row labels from the original exog, or from endog if exog is None."""
        exog = self.orig_exog
        if exog is not None:
            row_labels = self._get_row_labels(exog)
        else:
            endog = self.orig_endog
            row_labels = self._get_row_labels(endog)
        return row_labels

    def _get_row_labels(self, arr):
        """Base implementation: no row labels are available."""
        return None

    def _get_names(self, arr):
        """
        Extract column names from `arr`.

        Returns
        -------
        list, tuple or None
            DataFrame columns (MultiIndex levels joined with '_', skipping
            falsy levels), ``[name]`` for a named Series, ``dtype.names`` for
            anything else exposing a dtype, otherwise None.
        """
        if isinstance(arr, DataFrame):
            if isinstance(arr.columns, MultiIndex):
                # Flatten MultiIndexes into "simple" column names
                return ['_'.join((level for level in c if level))
                        for c in arr.columns]
            else:
                return list(arr.columns)
        elif isinstance(arr, Series):
            if arr.name:
                return [arr.name]
            else:
                return
        else:
            try:
                return arr.dtype.names
            except AttributeError:
                pass

        return None

    def _get_yarr(self, endog):
        """
        Convert endog to an ndarray with length-1 dimensions squeezed out,
        but never down to a scalar.
        """
        if data_util._is_structured_ndarray(endog):
            endog = data_util.struct_to_ndarray(endog)
        endog = np.asarray(endog)
        if len(endog) == 1:  # never squeeze to a scalar
            if endog.ndim == 1:
                return endog
            elif endog.ndim > 1:
                return np.asarray([endog.squeeze()])

        return endog.squeeze()

    def _get_xarr(self, exog):
        """Convert exog to an ndarray, unpacking structured arrays first."""
        if data_util._is_structured_ndarray(exog):
            exog = data_util.struct_to_ndarray(exog)
        return np.asarray(exog)

    def _check_integrity(self):
        """
        Raises
        ------
        ValueError
            If exog is present and its length differs from endog's.
        """
        if self.exog is not None:
            if len(self.exog) != len(self.endog):
                raise ValueError("endog and exog matrices are different sizes")

    def wrap_output(self, obj, how='columns', names=None):
        """
        Dispatch `obj` to the ``attach_*`` method selected by `how`.

        Parameters
        ----------
        obj : object
            Result to wrap.
        how : str
            One of 'columns', 'rows', 'cov', 'dates', 'columns_eq', 'cov_eq',
            'generic_columns', 'generic_columns_2d', 'ynames' or
            'multivariate_confint'. Any other value returns `obj` unchanged.
        names : optional
            Only forwarded for the two 'generic_columns*' options.
        """
        if how == 'columns':
            return self.attach_columns(obj)
        elif how == 'rows':
            return self.attach_rows(obj)
        elif how == 'cov':
            return self.attach_cov(obj)
        elif how == 'dates':
            return self.attach_dates(obj)
        elif how == 'columns_eq':
            return self.attach_columns_eq(obj)
        elif how == 'cov_eq':
            return self.attach_cov_eq(obj)
        elif how == 'generic_columns':
            return self.attach_generic_columns(obj, names)
        elif how == 'generic_columns_2d':
            return self.attach_generic_columns_2d(obj, names)
        elif how == 'ynames':
            return self.attach_ynames(obj)
        elif how == 'multivariate_confint':
            return self.attach_mv_confint(obj)
        else:
            return obj

    # The base class carries no metadata, so every attach_* hook below
    # returns the result unchanged; subclasses override them.

    def attach_columns(self, result):
        return result

    def attach_columns_eq(self, result):
        return result

    def attach_cov(self, result):
        return result

    def attach_cov_eq(self, result):
        return result

    def attach_rows(self, result):
        return result

    def attach_dates(self, result):
        return result

    def attach_mv_confint(self, result):
        return result

    def attach_generic_columns(self, result, *args, **kwargs):
        return result

    def attach_generic_columns_2d(self, result, *args, **kwargs):
        return result

    def attach_ynames(self, result):
        return result


class PatsyData(ModelData):
    """Data handler that reads column names from ``arr.design_info``."""

    def _get_names(self, arr):
        return arr.design_info.column_names


class PandasData(ModelData):
    """
    Data handling class which knows how to reattach pandas metadata to model
    results
    """

    def _convert_endog_exog(self, endog, exog=None):
        """
        Convert to ndarrays, rejecting data that casts to object dtype.

        Raises
        ------
        ValueError
            If endog or exog becomes an object-dtype array.
        """
        # TODO: remove this when we handle dtype systematically
        endog = np.asarray(endog)
        exog = exog if exog is None else np.asarray(exog)
        if endog.dtype == object or (exog is not None and
                                     exog.dtype == object):
            raise ValueError("Pandas data cast to numpy dtype of object. "
                             "Check input data with np.asarray(data).")
        return super(PandasData, self)._convert_endog_exog(endog, exog)

    @classmethod
    def _drop_nans(cls, x, nan_mask):
        """Row-filter with ``.loc`` for pandas objects, else as the base."""
        if isinstance(x, (Series, DataFrame)):
            return x.loc[nan_mask]
        else:  # extra arguments could be plain ndarrays
            return super(PandasData, cls)._drop_nans(x, nan_mask)

    @classmethod
    def _drop_nans_2d(cls, x, nan_mask):
        """Row- and column-filter with ``.loc`` for pandas objects."""
        if isinstance(x, (Series, DataFrame)):
            return x.loc[nan_mask].loc[:, nan_mask]
        else:  # extra arguments could be plain ndarrays
            return super(PandasData, cls)._drop_nans_2d(x, nan_mask)

    def _check_integrity(self):
        """
        Raises
        ------
        ValueError
            If both original inputs have an index and the indices differ,
            or if the base-class length check fails.
        """
        endog, exog = self.orig_endog, self.orig_exog
        # exog can be None and we could be upcasting one or the other
        if (exog is not None and
                (hasattr(endog, 'index') and hasattr(exog, 'index')) and
                not self.orig_endog.index.equals(self.orig_exog.index)):
            raise ValueError("The indices for endog and exog are not aligned")
        super(PandasData, self)._check_integrity()

    def _get_row_labels(self, arr):
        try:
            return arr.index
        except AttributeError:
            # if we've gotten here it's because endog is pandas and
            # exog is not, so just return the row labels from endog
            return self.orig_endog.index

    def attach_generic_columns(self, result, names):
        """Wrap `result` in a Series indexed by the attribute named `names`."""
        # get the attribute to use
        column_names = getattr(self, names, None)
        return Series(result, index=column_names)

    def attach_generic_columns_2d(self, result, rownames, colnames=None):
        """
        Wrap `result` in a DataFrame labelled by the attributes named
        `rownames` and `colnames` (`colnames` defaults to `rownames`).
        """
        colnames = colnames or rownames
        rownames = getattr(self, rownames, None)
        colnames = getattr(self, colnames, None)
        return DataFrame(result, index=rownames, columns=colnames)

    def attach_columns(self, result):
        # this can either be a 1d array or a scalar
        # do not squeeze because it might be a 2d row array
        # if it needs a squeeze, the bug is elsewhere
        if result.ndim <= 1:
            return Series(result, index=self.param_names)
        else:  # for e.g., confidence intervals
            return DataFrame(result, index=self.param_names)

    def attach_columns_eq(self, result):
        return DataFrame(result, index=self.xnames, columns=self.ynames)

    def attach_cov(self, result):
        return DataFrame(result, index=self.cov_names, columns=self.cov_names)

    def attach_cov_eq(self, result):
        return DataFrame(result, index=self.ynames, columns=self.ynames)

    def attach_rows(self, result):
        # assumes if len(row_labels) > len(result) it's bc it was truncated
        # at the front, for AR lags, for example
        squeezed = result.squeeze()
        k_endog = np.array(self.ynames, ndmin=1).shape[0]
        if k_endog > 1 and squeezed.shape == (k_endog,):
            # a single row of a multi-endog result: keep it 2d
            squeezed = squeezed[None, :]
        # May be zero-dim, for example in the case of forecast one step in tsa
        if squeezed.ndim < 2:
            out = Series(squeezed)
        else:
            out = DataFrame(result)
            out.columns = self.ynames
        out.index = self.row_labels[-len(result):]
        return out

    def attach_dates(self, result):
        # NOTE(review): self.predict_dates is not set anywhere in this
        # module; presumably assigned by the model before this is called.
        squeezed = result.squeeze()
        k_endog = np.array(self.ynames, ndmin=1).shape[0]
        if k_endog > 1 and squeezed.shape == (k_endog,):
            squeezed = np.asarray(squeezed)[None, :]
        # May be zero-dim, for example in the case of forecast one step in tsa
        if squeezed.ndim < 2:
            return Series(squeezed, index=self.predict_dates)
        else:
            return DataFrame(np.asarray(result),
                             index=self.predict_dates,
                             columns=self.ynames)

    def attach_mv_confint(self, result):
        return DataFrame(result.reshape((-1, 2)),
                         index=self.cov_names,
                         columns=['lower', 'upper'])

    def attach_ynames(self, result):
        squeezed = result.squeeze()
        # May be zero-dim, for example in the case of forecast one step in tsa
        if squeezed.ndim < 2:
            return Series(squeezed, name=self.ynames)
        else:
            return DataFrame(result, columns=self.ynames)


def _make_endog_names(endog):
    """Generate ``['y']`` for one endog column, else ``['y1', 'y2', ...]``."""
    if endog.ndim == 1 or endog.shape[1] == 1:
        ynames = ['y']
    else:  # for VAR
        ynames = ['y%d' % (i+1) for i in range(endog.shape[1])]

    return ynames


def _make_exog_names(exog):
    """
    Generate ``x1, x2, ...`` names for the exog columns, naming the first
    zero-variance column 'const' when there is one.
    """
    exog_var = exog.var(0)
    if (exog_var == 0).any():
        # assumes one constant in first or last position
        # avoid exception if more than one constant
        const_idx = exog_var.argmin()
        exog_names = ['x%d' % i for i in range(1, exog.shape[1])]
        exog_names.insert(const_idx, 'const')
    else:
        exog_names = ['x%d' % i for i in range(1, exog.shape[1]+1)]

    return exog_names


def handle_missing(endog, exog=None, missing='none', **kwargs):
    """
    Apply missing-data handling using the data class matching the inputs.

    Returns
    -------
    tuple
        ``(dict, None)`` holding the untouched inputs when
        ``missing == 'none'``; otherwise the result of the selected class's
        ``handle_missing``.
    """
    klass = handle_data_class_factory(endog, exog)
    if missing == 'none':
        ret_dict = dict(endog=endog, exog=exog)
        ret_dict.update(kwargs)
        return ret_dict, None
    return klass.handle_missing(endog, exog, missing=missing, **kwargs)


def handle_data_class_factory(endog, exog):
    """
    Given inputs, return the data-handling class that matches their type.

    Returns
    -------
    type
        `ModelData`, `PandasData` or `PatsyData`.

    Raises
    ------
    ValueError
        If none of the ``data_util`` type checks recognizes the inputs.
    """
    if data_util._is_using_ndarray_type(endog, exog):
        klass = ModelData
    elif data_util._is_using_pandas(endog, exog):
        klass = PandasData
    elif data_util._is_using_patsy(endog, exog):
        klass = PatsyData
    # keep this check last
    elif data_util._is_using_ndarray(endog, exog):
        klass = ModelData
    else:
        raise ValueError('unrecognized data structures: %s / %s' %
                         (type(endog), type(exog)))
    return klass


def handle_data(endog, exog, missing='none', hasconst=None, **kwargs):
    """
    Build the data-handler instance appropriate for `endog` and `exog`.

    Lists and tuples are converted to ndarrays before the handler class is
    chosen; all remaining arguments are forwarded to its constructor.
    """
    # deal with lists and tuples up-front
    if isinstance(endog, (list, tuple)):
        endog = np.asarray(endog)
    if isinstance(exog, (list, tuple)):
        exog = np.asarray(exog)

    klass = handle_data_class_factory(endog, exog)
    return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
                 **kwargs)