1# -*- coding: utf-8 -*- 2""" 3Dynamic factor model. 4 5Author: Chad Fulton 6License: BSD-3 7""" 8from collections import OrderedDict 9from warnings import warn 10 11import numpy as np 12import pandas as pd 13from scipy.linalg import cho_factor, cho_solve, LinAlgError 14 15from statsmodels.tools.data import _is_using_pandas 16from statsmodels.tools.validation import int_like 17from statsmodels.tools.decorators import cache_readonly 18from statsmodels.regression.linear_model import OLS 19from statsmodels.genmod.generalized_linear_model import GLM 20from statsmodels.multivariate.pca import PCA 21 22from statsmodels.tsa.statespace.sarimax import SARIMAX 23from statsmodels.tsa.statespace._quarterly_ar1 import QuarterlyAR1 24from statsmodels.tsa.vector_ar.var_model import VAR 25from statsmodels.tools.tools import Bunch 26from statsmodels.tools.validation import string_like 27from statsmodels.tsa.tsatools import lagmat 28from statsmodels.tsa.statespace import mlemodel, initialization 29from statsmodels.tsa.statespace.tools import ( 30 companion_matrix, is_invertible, constrain_stationary_univariate, 31 constrain_stationary_multivariate, unconstrain_stationary_univariate, 32 unconstrain_stationary_multivariate) 33from statsmodels.tsa.statespace.kalman_smoother import ( 34 SMOOTHER_STATE, SMOOTHER_STATE_COV, SMOOTHER_STATE_AUTOCOV) 35from statsmodels.base.data import PandasData 36 37from statsmodels.iolib.table import SimpleTable 38from statsmodels.iolib.summary import Summary 39from statsmodels.iolib.tableformatting import fmt_params 40 41 42class FactorBlock(dict): 43 """ 44 Helper class for describing and indexing a block of factors. 45 46 Parameters 47 ---------- 48 factor_names : tuple of str 49 Tuple of factor names in the block (in the order that they will appear 50 in the state vector). 51 factor_order : int 52 Order of the vector autoregression governing the factor block dynamics. 53 endog_factor_map : pd.DataFrame 54 Mapping from endog variable names to factor names. 55 state_offset : int 56 Offset of this factor block in the state vector. 57 has_endog_Q : bool 58 Flag if the model contains quarterly data. 59 60 Notes 61 ----- 62 The goal of this class is, in particular, to make it easier to retrieve 63 indexes of subsets of the state vector that are associated with a 64 particular block of factors. 65 66 - `factors_ix` is a matrix of indices, with rows corresponding to factors 67 in the block and columns corresponding to lags 68 - `factors` is vec(factors_ix) (i.e. it stacks columns, so that it is 69 `factors_ix.ravel(order='F')`). Thinking about a VAR system, the first 70 k*p elements correspond to the equation for the first variable. The next 71 k*p elements correspond to the equation for the second variable, and so 72 on. It contains all of the lags in the state vector, which is max(5, p) 73 - `factors_ar` is the subset of `factors` that have nonzero coefficients, 74 so it contains lags up to p. 75 - `factors_L1` only contains the first lag of the factors 76 - `factors_L1_5` contains the first - fifth lags of the factors 77 78 """ 79 80 def __init__(self, factor_names, factor_order, endog_factor_map, 81 state_offset, k_endog_Q): 82 self.factor_names = factor_names 83 self.k_factors = len(self.factor_names) 84 self.factor_order = factor_order 85 self.endog_factor_map = endog_factor_map.loc[:, factor_names] 86 self.state_offset = state_offset 87 self.k_endog_Q = k_endog_Q 88 if self.k_endog_Q > 0: 89 self._factor_order = max(5, self.factor_order) 90 else: 91 self._factor_order = self.factor_order 92 self.k_states = self.k_factors * self._factor_order 93 94 # Save items 95 self['factors'] = self.factors 96 self['factors_ar'] = self.factors_ar 97 self['factors_ix'] = self.factors_ix 98 self['factors_L1'] = self.factors_L1 99 self['factors_L1_5'] = self.factors_L1_5 100 101 @property 102 def factors_ix(self): 103 """Factor state index array, shaped (k_factors, lags).""" 104 # i.e. the position in the state vector of the second lag of the third 105 # factor is factors_ix[2, 1] 106 # ravel(order='F') gives e.g (f0.L1, f1.L1, f0.L2, f1.L2, f0.L3, ...) 107 # while 108 # ravel(order='C') gives e.g (f0.L1, f0.L2, f0.L3, f1.L1, f1.L2, ...) 109 o = self.state_offset 110 return np.reshape(o + np.arange(self.k_factors * self._factor_order), 111 (self._factor_order, self.k_factors)).T 112 113 @property 114 def factors(self): 115 """Factors and all lags in the state vector (max(5, p)).""" 116 # Note that this is equivalent to factors_ix with ravel(order='F') 117 o = self.state_offset 118 return np.s_[o:o + self.k_factors * self._factor_order] 119 120 @property 121 def factors_ar(self): 122 """Factors and all lags used in the factor autoregression (p).""" 123 o = self.state_offset 124 return np.s_[o:o + self.k_factors * self.factor_order] 125 126 @property 127 def factors_L1(self): 128 """Factors (first block / lag only).""" 129 o = self.state_offset 130 return np.s_[o:o + self.k_factors] 131 132 @property 133 def factors_L1_5(self): 134 """Factors plus four lags.""" 135 o = self.state_offset 136 return np.s_[o:o + self.k_factors * 5] 137 138 139class DynamicFactorMQStates(dict): 140 """ 141 Helper class for describing and indexing the state vector. 142 143 Parameters 144 ---------- 145 k_endog_M : int 146 Number of monthly (or non-time-specific, if k_endog_Q=0) variables. 147 k_endog_Q : int 148 Number of quarterly variables. 149 endog_names : list 150 Names of the endogenous variables. 151 factors : int, list, or dict 152 Integer giving the number of (global) factors, a list with the names of 153 (global) factors, or a dictionary with: 154 155 - keys : names of endogenous variables 156 - values : lists of factor names. 157 158 If this is an integer, then the factor names will be 0, 1, .... 159 factor_orders : int or dict 160 Integer describing the order of the vector autoregression (VAR) 161 governing all factor block dynamics or dictionary with: 162 163 - keys : factor name or tuples of factor names in a block 164 - values : integer describing the VAR order for that factor block 165 166 If a dictionary, this defines the order of the factor blocks in the 167 state vector. Otherwise, factors are ordered so that factors that load 168 on more variables come first (and then alphabetically, to break ties). 169 factor_multiplicities : int or dict 170 This argument provides a convenient way to specify multiple factors 171 that load identically on variables. For example, one may want two 172 "global" factors (factors that load on all variables) that evolve 173 jointly according to a VAR. One could specify two global factors in the 174 `factors` argument and specify that they are in the same block in the 175 `factor_orders` argument, but it is easier to specify a single global 176 factor in the `factors` argument, and set the order in the 177 `factor_orders` argument, and then set the factor multiplicity to 2. 178 179 This argument must be an integer describing the factor multiplicity for 180 all factors or dictionary with: 181 182 - keys : factor name 183 - values : integer describing the factor multiplicity for the factors 184 in the given block 185 idiosyncratic_ar1 : bool 186 Whether or not to model the idiosyncratic component for each series as 187 an AR(1) process. If False, the idiosyncratic component is instead 188 modeled as white noise. 189 190 Attributes 191 ---------- 192 k_endog : int 193 Total number of endogenous variables. 194 k_states : int 195 Total number of state variables (those associated with the factors and 196 those associated with the idiosyncratic disturbances). 197 k_posdef : int 198 Total number of state disturbance terms (those associated with the 199 factors and those associated with the idiosyncratic disturbances). 200 k_endog_M : int 201 Number of monthly (or non-time-specific, if k_endog_Q=0) variables. 202 k_endog_Q : int 203 Number of quarterly variables. 204 k_factors : int 205 Total number of factors. Note that factor multiplicities will have 206 already been expanded. 207 k_states_factors : int 208 The number of state variables associated with factors (includes both 209 factors and lags of factors included in the state vector). 210 k_posdef_factors : int 211 The number of state disturbance terms associated with factors. 212 k_states_idio : int 213 Total number of state variables associated with idiosyncratic 214 disturbances. 215 k_posdef_idio : int 216 Total number of state disturbance terms associated with idiosyncratic 217 disturbances. 218 k_states_idio_M : int 219 The number of state variables associated with idiosyncratic 220 disturbances for monthly (or non-time-specific if there are no 221 quarterly variables) variables. If the disturbances are AR(1), then 222 this will be equal to `k_endog_M`, otherwise it will be equal to zero. 223 k_states_idio_Q : int 224 The number of state variables associated with idiosyncratic 225 disturbances for quarterly variables. This will always be equal to 226 `k_endog_Q * 5`, even if the disturbances are not AR(1). 227 k_posdef_idio_M : int 228 The number of state disturbance terms associated with idiosyncratic 229 disturbances for monthly (or non-time-specific if there are no 230 quarterly variables) variables. If the disturbances are AR(1), then 231 this will be equal to `k_endog_M`, otherwise it will be equal to zero. 232 k_posdef_idio_Q : int 233 The number of state disturbance terms associated with idiosyncratic 234 disturbances for quarterly variables. This will always be equal to 235 `k_endog_Q`, even if the disturbances are not AR(1). 236 idiosyncratic_ar1 : bool 237 Whether or not to model the idiosyncratic component for each series as 238 an AR(1) process. 239 factor_blocks : list of FactorBlock 240 List of `FactorBlock` helper instances for each factor block. 241 factor_names : list of str 242 List of factor names. 243 factors : dict 244 Dictionary with: 245 246 - keys : names of endogenous variables 247 - values : lists of factor names. 248 249 Note that factor multiplicities will have already been expanded. 250 factor_orders : dict 251 Dictionary with: 252 253 - keys : tuple of factor names 254 - values : integer describing autoregression order 255 256 Note that factor multiplicities will have already been expanded. 257 max_factor_order : int 258 Maximum autoregression order across all factor blocks. 259 factor_block_orders : pd.Series 260 Series containing lag orders, with the factor block (a tuple of factor 261 names) as the index. 262 factor_multiplicities : dict 263 Dictionary with: 264 265 - keys : factor name 266 - values : integer describing the factor multiplicity for the factors 267 in the given block 268 endog_factor_map : dict 269 Dictionary with: 270 271 - keys : endog name 272 - values : list of factor names 273 loading_counts : pd.Series 274 Series containing number of endogenous variables loading on each 275 factor, with the factor name as the index. 276 block_loading_counts : dict 277 Dictionary with: 278 279 - keys : tuple of factor names 280 - values : average number of endogenous variables loading on the block 281 (note that average is over the factors in the block) 282 283 Notes 284 ----- 285 The goal of this class is, in particular, to make it easier to retrieve 286 indexes of subsets of the state vector. 287 288 Note that the ordering of the factor blocks in the state vector is 289 determined by the `factor_orders` argument if a dictionary. Otherwise, 290 factors are ordered so that factors that load on more variables come first 291 (and then alphabetically, to break ties). 292 293 - `factors_L1` is an array with the indexes of first lag of the factors 294 from each block. Ordered first by block, and then by lag. 295 - `factors_L1_5` is an array with the indexes contains the first - fifth 296 lags of the factors from each block. Ordered first by block, and then by 297 lag. 298 - `factors_L1_5_ix` is an array shaped (5, k_factors) with the indexes 299 of the first - fifth lags of the factors from each block. 300 - `idio_ar_L1` is an array with the indexes of the first lag of the 301 idiosyncratic AR states, both monthly (if appliable) and quarterly. 302 - `idio_ar_M` is a slice with the indexes of the idiosyncratic disturbance 303 states for the monthly (or non-time-specific if there are no quarterly 304 variables) variables. It is an empty slice if 305 `idiosyncratic_ar1 = False`. 306 - `idio_ar_Q` is a slice with the indexes of the idiosyncratic disturbance 307 states and all lags, for the quarterly variables. It is an empty slice if 308 there are no quarterly variable. 309 - `idio_ar_Q_ix` is an array shaped (k_endog_Q, 5) with the indexes of the 310 first - fifth lags of the idiosyncratic disturbance states for the 311 quarterly variables. 312 - `endog_factor_iloc` is a list of lists, with entries for each endogenous 313 variable. The entry for variable `i`, `endog_factor_iloc[i]` is a list of 314 indexes of the factors that variable `i` loads on. This does not include 315 any lags, but it can be used with e.g. `factors_L1_5_ix` to get lags. 316 317 """ 318 319 def __init__(self, k_endog_M, k_endog_Q, endog_names, factors, 320 factor_orders, factor_multiplicities, idiosyncratic_ar1): 321 # Save model parameterization 322 self.k_endog_M = k_endog_M 323 self.k_endog_Q = k_endog_Q 324 self.k_endog = self.k_endog_M + self.k_endog_Q 325 self.idiosyncratic_ar1 = idiosyncratic_ar1 326 327 # Validate factor-related inputs 328 factors_is_int = np.issubdtype(type(factors), np.integer) 329 factors_is_list = isinstance(factors, (list, tuple)) 330 orders_is_int = np.issubdtype(type(factor_orders), np.integer) 331 if factor_multiplicities is None: 332 factor_multiplicities = 1 333 mult_is_int = np.issubdtype(type(factor_multiplicities), np.integer) 334 335 if not (factors_is_int or factors_is_list or 336 isinstance(factors, dict)): 337 raise ValueError('`factors` argument must an integer number of' 338 ' factors, a list of global factor names, or a' 339 ' dictionary, mapping observed variables to' 340 ' factors.') 341 if not (orders_is_int or isinstance(factor_orders, dict)): 342 raise ValueError('`factor_orders` argument must either be an' 343 ' integer or a dictionary.') 344 if not (mult_is_int or isinstance(factor_multiplicities, dict)): 345 raise ValueError('`factor_multiplicities` argument must either be' 346 ' an integer or a dictionary.') 347 348 # Expand integers 349 # If `factors` is an integer, we assume that it denotes the number of 350 # global factors (factors that load on each variable) 351 if factors_is_int or factors_is_list: 352 # Validate this here for a more informative error message 353 if ((factors_is_int and factors == 0) or 354 (factors_is_list and len(factors) == 0)): 355 raise ValueError('The model must contain at least one factor.') 356 357 if factors_is_list: 358 factor_names = list(factors) 359 else: 360 factor_names = [f'{i}' for i in range(factors)] 361 factors = {name: factor_names[:] for name in endog_names} 362 _factor_names = [] 363 for val in factors.values(): 364 _factor_names.extend(val) 365 factor_names = set(_factor_names) 366 if orders_is_int: 367 factor_orders = {factor_name: factor_orders 368 for factor_name in factor_names} 369 if mult_is_int: 370 factor_multiplicities = {factor_name: factor_multiplicities 371 for factor_name in factor_names} 372 373 # Apply the factor multiplicities 374 factors, factor_orders = self._apply_factor_multiplicities( 375 factors, factor_orders, factor_multiplicities) 376 377 # Save the (potentially expanded) variables 378 self.factors = factors 379 self.factor_orders = factor_orders 380 self.factor_multiplicities = factor_multiplicities 381 382 # Get the mapping between endog and factors 383 self.endog_factor_map = self._construct_endog_factor_map( 384 factors, endog_names) 385 self.k_factors = self.endog_factor_map.shape[1] 386 387 # Validate number of factors 388 # TODO: could do more extensive validation here. 389 if self.k_factors > self.k_endog_M: 390 raise ValueError(f'Number of factors ({self.k_factors}) cannot be' 391 ' greater than the number of monthly endogenous' 392 f' variables ({self.k_endog_M}).') 393 394 # Get `loading_counts`: factor -> # endog loading on the factor 395 self.loading_counts = ( 396 self.endog_factor_map.sum(axis=0).rename('count') 397 .reset_index().sort_values(['count', 'factor'], 398 ascending=[False, True]) 399 .set_index('factor')) 400 # `block_loading_counts`: block -> average of (# loading on factor) 401 # across each factor in the block 402 block_loading_counts = { 403 block: np.atleast_1d( 404 self.loading_counts.loc[list(block), 'count']).mean(axis=0) 405 for block in factor_orders.keys()} 406 ix = pd.Index(block_loading_counts.keys(), tupleize_cols=False, 407 name='block') 408 self.block_loading_counts = pd.Series( 409 list(block_loading_counts.values()), 410 index=ix, name='count').to_frame().sort_values( 411 ['count', 'block'], ascending=[False, True])['count'] 412 413 # Get the mapping between factor blocks and VAR order 414 415 # `factor_block_orders`: pd.Series of factor block -> lag order 416 ix = pd.Index(factor_orders.keys(), tupleize_cols=False, name='block') 417 self.factor_block_orders = pd.Series( 418 list(factor_orders.values()), index=ix, name='order') 419 420 # If the `factor_orders` variable was an integer, then it did not 421 # define an ordering for the factor blocks. In this case, we use the 422 # loading counts to do so. This ensures that e.g. global factors are 423 # listed first. 424 if orders_is_int: 425 keys = self.block_loading_counts.keys() 426 self.factor_block_orders = self.factor_block_orders.loc[keys] 427 self.factor_block_orders.index.name = 'block' 428 429 # Define factor_names based on factor_block_orders (instead of on those 430 # from `endog_factor_map`) to (a) make sure that factors are allocated 431 # to only one block, and (b) order the factor names to be consistent 432 # with the block definitions. 433 factor_names = pd.Series( 434 np.concatenate(list(self.factor_block_orders.index))) 435 missing = [name for name in self.endog_factor_map.columns 436 if name not in factor_names.tolist()] 437 if len(missing): 438 ix = pd.Index([(factor_name,) for factor_name in missing], 439 tupleize_cols=False, name='block') 440 default_block_orders = pd.Series(np.ones(len(ix), dtype=int), 441 index=ix, name='order') 442 self.factor_block_orders = ( 443 self.factor_block_orders.append(default_block_orders)) 444 factor_names = pd.Series( 445 np.concatenate(list(self.factor_block_orders.index))) 446 duplicates = factor_names.duplicated() 447 if duplicates.any(): 448 duplicate_names = set(factor_names[duplicates]) 449 raise ValueError('Each factor can be assigned to at most one' 450 ' block of factors in `factor_orders`.' 451 f' Duplicate entries for {duplicate_names}') 452 self.factor_names = factor_names.tolist() 453 self.max_factor_order = np.max(self.factor_block_orders) 454 455 # Re-order the columns of the endog factor mapping to reflect the 456 # orderings of endog_names and factor_names 457 self.endog_factor_map = ( 458 self.endog_factor_map.loc[endog_names, factor_names]) 459 460 # Create factor block helpers, and get factor-related state and posdef 461 # dimensions 462 self.k_states_factors = 0 463 self.k_posdef_factors = 0 464 state_offset = 0 465 self.factor_blocks = [] 466 for factor_names, factor_order in self.factor_block_orders.items(): 467 block = FactorBlock(factor_names, factor_order, 468 self.endog_factor_map, state_offset, 469 self.k_endog_Q) 470 self.k_states_factors += block.k_states 471 self.k_posdef_factors += block.k_factors 472 state_offset += block.k_states 473 474 self.factor_blocks.append(block) 475 476 # Idiosyncratic state dimensions 477 self.k_states_idio_M = self.k_endog_M if idiosyncratic_ar1 else 0 478 self.k_states_idio_Q = self.k_endog_Q * 5 479 self.k_states_idio = self.k_states_idio_M + self.k_states_idio_Q 480 481 # Idiosyncratic posdef dimensions 482 self.k_posdef_idio_M = self.k_endog_M if self.idiosyncratic_ar1 else 0 483 self.k_posdef_idio_Q = self.k_endog_Q 484 self.k_posdef_idio = self.k_posdef_idio_M + self.k_posdef_idio_Q 485 486 # Total states, posdef 487 self.k_states = self.k_states_factors + self.k_states_idio 488 self.k_posdef = self.k_posdef_factors + self.k_posdef_idio 489 490 # Cache 491 self._endog_factor_iloc = None 492 493 def _apply_factor_multiplicities(self, factors, factor_orders, 494 factor_multiplicities): 495 """ 496 Expand `factors` and `factor_orders` to account for factor multiplity. 497 498 For example, if there is a `global` factor with multiplicity 2, then 499 this method expands that into `global.1` and `global.2` in both the 500 `factors` and `factor_orders` dictionaries. 501 502 Parameters 503 ---------- 504 factors : dict 505 Dictionary of {endog_name: list of factor names} 506 factor_orders : dict 507 Dictionary of {tuple of factor names: factor order} 508 factor_multiplicities : dict 509 Dictionary of {factor name: factor multiplicity} 510 511 Returns 512 ------- 513 new_factors : dict 514 Dictionary of {endog_name: list of factor names}, with factor names 515 expanded to incorporate multiplicities. 516 new_factors : dict 517 Dictionary of {tuple of factor names: factor order}, with factor 518 names in each tuple expanded to incorporate multiplicities. 519 """ 520 # Expand the factors to account for the multiplicities 521 new_factors = {} 522 for endog_name, factors_list in factors.items(): 523 new_factor_list = [] 524 for factor_name in factors_list: 525 n = factor_multiplicities.get(factor_name, 1) 526 if n > 1: 527 new_factor_list += [f'{factor_name}.{i + 1}' 528 for i in range(n)] 529 else: 530 new_factor_list.append(factor_name) 531 new_factors[endog_name] = new_factor_list 532 533 # Expand the factor orders to account for the multiplicities 534 new_factor_orders = {} 535 for block, factor_order in factor_orders.items(): 536 if not isinstance(block, tuple): 537 block = (block,) 538 new_block = [] 539 for factor_name in block: 540 n = factor_multiplicities.get(factor_name, 1) 541 if n > 1: 542 new_block += [f'{factor_name}.{i + 1}' 543 for i in range(n)] 544 else: 545 new_block += [factor_name] 546 new_factor_orders[tuple(new_block)] = factor_order 547 548 return new_factors, new_factor_orders 549 550 def _construct_endog_factor_map(self, factors, endog_names): 551 """ 552 Construct mapping of observed variables to factors. 553 554 Parameters 555 ---------- 556 factors : dict 557 Dictionary of {endog_name: list of factor names} 558 endog_names : list of str 559 List of the names of the observed variables. 560 561 Returns 562 ------- 563 endog_factor_map : pd.DataFrame 564 Boolean dataframe with `endog_names` as the index and the factor 565 names (computed from the `factors` input) as the columns. Each cell 566 is True if the associated factor is allowed to load on the 567 associated observed variable. 568 569 """ 570 # Validate that all entries in the factors dictionary have associated 571 # factors 572 missing = [] 573 for key, value in factors.items(): 574 if not isinstance(value, (list, tuple)) or len(value) == 0: 575 missing.append(key) 576 if len(missing): 577 raise ValueError('Each observed variable must be mapped to at' 578 ' least one factor in the `factors` dictionary.' 579 f' Variables missing factors are: {missing}.') 580 581 # Validate that we have been told about the factors for each endog 582 # variable. This is because it doesn't make sense to include an 583 # observed variable that doesn't load on any factor 584 missing = set(endog_names).difference(set(factors.keys())) 585 if len(missing): 586 raise ValueError('If a `factors` dictionary is provided, then' 587 ' it must include entries for each observed' 588 f' variable. Missing variables are: {missing}.') 589 590 # Figure out the set of factor names 591 # (0 is just a dummy value for the dict - we just do it this way to 592 # collect the keys, in order, without duplicates.) 593 factor_names = {} 594 for key, value in factors.items(): 595 if isinstance(value, str): 596 factor_names[value] = 0 597 else: 598 factor_names.update({v: 0 for v in value}) 599 factor_names = list(factor_names.keys()) 600 k_factors = len(factor_names) 601 602 endog_factor_map = pd.DataFrame( 603 np.zeros((self.k_endog, k_factors), dtype=bool), 604 index=pd.Index(endog_names, name='endog'), 605 columns=pd.Index(factor_names, name='factor')) 606 for key, value in factors.items(): 607 endog_factor_map.loc[key, value] = True 608 609 return endog_factor_map 610 611 @property 612 def factors_L1(self): 613 """Factors.""" 614 ix = np.arange(self.k_states_factors) 615 iloc = tuple(ix[block.factors_L1] for block in self.factor_blocks) 616 return np.concatenate(iloc) 617 618 @property 619 def factors_L1_5_ix(self): 620 """Factors plus any lags, index shaped (5, k_factors).""" 621 ix = np.arange(self.k_states_factors) 622 iloc = [] 623 for block in self.factor_blocks: 624 iloc.append(ix[block.factors_L1_5].reshape(5, block.k_factors)) 625 return np.concatenate(iloc, axis=1) 626 627 @property 628 def idio_ar_L1(self): 629 """Idiosyncratic AR states, (first block / lag only).""" 630 ix1 = self.k_states_factors 631 if self.idiosyncratic_ar1: 632 ix2 = ix1 + self.k_endog 633 else: 634 ix2 = ix1 + self.k_endog_Q 635 return np.s_[ix1:ix2] 636 637 @property 638 def idio_ar_M(self): 639 """Idiosyncratic AR states for monthly variables.""" 640 ix1 = self.k_states_factors 641 ix2 = ix1 642 if self.idiosyncratic_ar1: 643 ix2 += self.k_endog_M 644 return np.s_[ix1:ix2] 645 646 @property 647 def idio_ar_Q(self): 648 """Idiosyncratic AR states and all lags for quarterly variables.""" 649 # Note that this is equivalent to idio_ar_Q_ix with ravel(order='F') 650 ix1 = self.k_states_factors 651 if self.idiosyncratic_ar1: 652 ix1 += self.k_endog_M 653 ix2 = ix1 + self.k_endog_Q * 5 654 return np.s_[ix1:ix2] 655 656 @property 657 def idio_ar_Q_ix(self): 658 """Idiosyncratic AR (quarterly) state index, (k_endog_Q, lags).""" 659 # i.e. the position in the state vector of the second lag of the third 660 # quarterly variable is idio_ar_Q_ix[2, 1] 661 # ravel(order='F') gives e.g (y1.L1, y2.L1, y1.L2, y2.L3, y1.L3, ...) 662 # while 663 # ravel(order='C') gives e.g (y1.L1, y1.L2, y1.L3, y2.L1, y2.L2, ...) 664 start = self.k_states_factors 665 if self.idiosyncratic_ar1: 666 start += self.k_endog_M 667 return (start + np.reshape( 668 np.arange(5 * self.k_endog_Q), (5, self.k_endog_Q)).T) 669 670 @property 671 def endog_factor_iloc(self): 672 """List of list of int, factor indexes for each observed variable.""" 673 # i.e. endog_factor_iloc[i] is a list of integer locations of the 674 # factors that load on the ith observed variable 675 if self._endog_factor_iloc is None: 676 ilocs = [] 677 for i in range(self.k_endog): 678 ilocs.append(np.where(self.endog_factor_map.iloc[i])[0]) 679 self._endog_factor_iloc = ilocs 680 return self._endog_factor_iloc 681 682 def __getitem__(self, key): 683 """ 684 Use square brackets to access index / slice elements. 685 686 This is convenient in highlighting the indexing / slice quality of 687 these attributes in the code below. 688 """ 689 if key in ['factors_L1', 'factors_L1_5_ix', 'idio_ar_L1', 'idio_ar_M', 690 'idio_ar_Q', 'idio_ar_Q_ix']: 691 return getattr(self, key) 692 else: 693 raise KeyError(key) 694 695 696class DynamicFactorMQ(mlemodel.MLEModel): 697 r""" 698 Dynamic factor model with EM algorithm; option for monthly/quarterly data. 699 700 Implementation of the dynamic factor model of Bańbura and Modugno (2014) 701 ([1]_) and Bańbura, Giannone, and Reichlin (2011) ([2]_). Uses the EM 702 algorithm for parameter fitting, and so can accommodate a large number of 703 left-hand-side variables. Specifications can include any collection of 704 blocks of factors, including different factor autoregression orders, and 705 can include AR(1) processes for idiosyncratic disturbances. Can 706 incorporate monthly/quarterly mixed frequency data along the lines of 707 Mariano and Murasawa (2011) ([4]_). A special case of this model is the 708 Nowcasting model of Bok et al. (2017) ([3]_). Moreover, this model can be 709 used to compute the news associated with updated data releases. 710 711 Parameters 712 ---------- 713 endog : array_like 714 Observed time-series process :math:`y`. See the "Notes" section for 715 details on how to set up a model with monthly/quarterly mixed frequency 716 data. 717 k_endog_monthly : int, optional 718 If specifying a monthly/quarterly mixed frequency model in which the 719 provided `endog` dataset contains both the monthly and quarterly data, 720 this variable should be used to indicate how many of the variables 721 are monthly. Note that when using the `k_endog_monthly` argument, the 722 columns with monthly variables in `endog` should be ordered first, and 723 the columns with quarterly variables should come afterwards. See the 724 "Notes" section for details on how to set up a model with 725 monthly/quarterly mixed frequency data. 726 factors : int, list, or dict, optional 727 Integer giving the number of (global) factors, a list with the names of 728 (global) factors, or a dictionary with: 729 730 - keys : names of endogenous variables 731 - values : lists of factor names. 732 733 If this is an integer, then the factor names will be 0, 1, .... The 734 default is a single factor that loads on all variables. Note that there 735 cannot be more factors specified than there are monthly variables. 736 factor_orders : int or dict, optional 737 Integer describing the order of the vector autoregression (VAR) 738 governing all factor block dynamics or dictionary with: 739 740 - keys : factor name or tuples of factor names in a block 741 - values : integer describing the VAR order for that factor block 742 743 If a dictionary, this defines the order of the factor blocks in the 744 state vector. Otherwise, factors are ordered so that factors that load 745 on more variables come first (and then alphabetically, to break ties). 746 factor_multiplicities : int or dict, optional 747 This argument provides a convenient way to specify multiple factors 748 that load identically on variables. For example, one may want two 749 "global" factors (factors that load on all variables) that evolve 750 jointly according to a VAR. One could specify two global factors in the 751 `factors` argument and specify that they are in the same block in the 752 `factor_orders` argument, but it is easier to specify a single global 753 factor in the `factors` argument, and set the order in the 754 `factor_orders` argument, and then set the factor multiplicity to 2. 755 756 This argument must be an integer describing the factor multiplicity for 757 all factors or dictionary with: 758 759 - keys : factor name 760 - values : integer describing the factor multiplicity for the factors 761 in the given block 762 763 idiosyncratic_ar1 : bool 764 Whether or not to model the idiosyncratic component for each series as 765 an AR(1) process. If False, the idiosyncratic component is instead 766 modeled as white noise. 767 standardize : bool or tuple, optional 768 If a boolean, whether or not to standardize each endogenous variable to 769 have mean zero and standard deviation 1 before fitting the model. See 770 "Notes" for details about how this option works with postestimation 771 output. If a tuple (usually only used internally), then the tuple must 772 have length 2, with each element containing a Pandas series with index 773 equal to the names of the endogenous variables. The first element 774 should contain the mean values and the second element should contain 775 the standard deviations. Default is True. 776 endog_quarterly : pandas.Series or pandas.DataFrame 777 Observed quarterly variables. If provided, must be a Pandas Series or 778 DataFrame with a DatetimeIndex or PeriodIndex at the quarterly 779 frequency. See the "Notes" section for details on how to set up a model 780 with monthly/quarterly mixed frequency data. 781 init_t0 : bool, optional 782 If True, this option initializes the Kalman filter with the 783 distribution for :math:`\alpha_0` rather than :math:`\alpha_1`. See 784 the "Notes" section for more details. This option is rarely used except 785 for testing. Default is False. 786 obs_cov_diag : bool, optional 787 If True and if `idiosyncratic_ar1 is True`, then this option puts small 788 positive values in the observation disturbance covariance matrix. This 789 is not required for estimation and is rarely used except for testing. 790 (It is sometimes used to prevent numerical errors, for example those 791 associated with a positive semi-definite forecast error covariance 792 matrix at the first time step when using EM initialization, but state 793 space models in Statsmodels switch to the univariate approach in those 794 cases, and so do not need to use this trick). Default is False. 795 796 Notes 797 ----- 798 The basic model is: 799 800 .. math:: 801 802 y_t & = \Lambda f_t + \epsilon_t \\ 803 f_t & = A_1 f_{t-1} + \dots + A_p f_{t-p} + u_t 804 805 where: 806 807 - :math:`y_t` is observed data at time t 808 - :math:`\epsilon_t` is idiosyncratic disturbance at time t (see below for 809 details, including modeling serial correlation in this term) 810 - :math:`f_t` is the unobserved factor at time t 811 - :math:`u_t \sim N(0, Q)` is the factor disturbance at time t 812 813 and: 814 815 - :math:`\Lambda` is referred to as the matrix of factor loadings 816 - :math:`A_i` are matrices of autoregression coefficients 817 818 Furthermore, we allow the idiosyncratic disturbances to be serially 819 correlated, so that, if `idiosyncratic_ar1=True`, 820 :math:`\epsilon_{i,t} = \rho_i \epsilon_{i,t-1} + e_{i,t}`, where 821 :math:`e_{i,t} \sim N(0, \sigma_i^2)`. If `idiosyncratic_ar1=False`, 822 then we instead have :math:`\epsilon_{i,t} = e_{i,t}`. 823 824 This basic setup can be found in [1]_, [2]_, [3]_, and [4]_. 825 826 We allow for two generalizations of this model: 827 828 1. Following [2]_, we allow multiple "blocks" of factors, which are 829 independent from the other blocks of factors. Different blocks can be 830 set to load on different subsets of the observed variables, and can be 831 specified with different lag orders. 832 2. Following [4]_ and [2]_, we allow mixed frequency models in which both 833 monthly and quarterly data are used. See the section on "Mixed frequency 834 models", below, for more details. 835 836 Additional notes: 837 838 - The observed data may contain arbitrary patterns of missing entries. 839 840 **EM algorithm** 841 842 This model contains a potentially very large number of parameters, and it 843 can be difficult and take a prohibitively long time to numerically optimize 844 the likelihood function using quasi-Newton methods. Instead, the default 845 fitting method in this model uses the EM algorithm, as detailed in [1]_. 846 As a result, the model can accommodate datasets with hundreds of 847 observed variables. 848 849 **Mixed frequency data** 850 851 This model can handle mixed frequency data in two ways. In this section, 852 we only briefly describe this, and refer readers to [2]_ and [4]_ for all 853 details. 854 855 First, because there can be arbitrary patterns of missing data in the 856 observed vector, one can simply include lower frequency variables as 857 observed in a particular higher frequency period, and missing otherwise. 858 For example, in a monthly model, one could include quarterly data as 859 occurring on the third month of each quarter. To use this method, one 860 simply needs to combine the data into a single dataset at the higher 861 frequency that can be passed to this model as the `endog` argument. 862 However, depending on the type of variables used in the analysis and the 863 assumptions about the data generating process, this approach may not be 864 valid. 865 866 For example, suppose that we are interested in the growth rate of real GDP, 867 which is measured at a quarterly frequency. If the basic factor model is 868 specified at a monthly frequency, then the quarterly growth rate in the 869 third month of each quarter -- which is what we actually observe -- is 870 approximated by a particular weighted average of unobserved monthly growth 871 rates. We need to take this particular weight moving average into account 872 in constructing our model, and this is what the second approach does. 873 874 The second approach follows [2]_ and [4]_ in constructing a state space 875 form to explicitly model the quarterly growth rates in terms of the 876 unobserved monthly growth rates. To use this approach, there are two 877 methods: 878 879 1. Combine the monthly and quarterly data into a single dataset at the 880 monthly frequency, with the monthly data in the first columns and the 881 quarterly data in the last columns. Pass this dataset to the model as 882 the `endog` argument and give the number of the variables that are 883 monthly as the `k_endog_monthly` argument. 884 2. Construct a monthly dataset as a Pandas DataFrame with a DatetimeIndex 885 or PeriodIndex at the monthly frequency and separately construct a 886 quarterly dataset as a Pandas DataFrame with a DatetimeIndex or 887 PeriodIndex at the quarterly frequency. Pass the monthly DataFrame to 888 the model as the `endog` argument and pass the quarterly DataFrame to 889 the model as the `endog_quarterly` argument. 890 891 Note that this only incorporates one particular type of mixed frequency 892 data. See also Banbura et al. (2013). "Now-Casting and the Real-Time Data 893 Flow." for discussion about other types of mixed frequency data that are 894 not supported by this framework. 895 896 **Nowcasting and the news** 897 898 Through its support for monthly/quarterly mixed frequency data, this model 899 can allow for the nowcasting of quarterly variables based on monthly 900 observations. In particular, [2]_ and [3]_ use this model to construct 901 nowcasts of real GDP and analyze the impacts of "the news", derived from 902 incoming data on a real-time basis. This latter functionality can be 903 accessed through the `news` method of the results object. 904 905 **Standardizing data** 906 907 As is often the case in formulating a dynamic factor model, we do not 908 explicitly account for the mean of each observed variable. Instead, the 909 default behavior is to standardize each variable prior to estimation. Thus 910 if :math:`y_t` are the given observed data, the dynamic factor model is 911 actually estimated on the standardized data defined by: 912 913 .. math:: 914 915 x_{i, t} = (y_{i, t} - \bar y_i) / s_i 916 917 where :math:`\bar y_i` is the sample mean and :math:`s_i` is the sample 918 standard deviation. 919 920 By default, if standardization is applied prior to estimation, results such 921 as in-sample predictions, out-of-sample forecasts, and the computation of 922 the "news" are reported in the scale of the original data (i.e. the model 923 output has the reverse transformation applied before it is returned to the 924 user). 925 926 Standardization can be disabled by passing `standardization=False` to the 927 model constructor. 928 929 **Identification of factors and loadings** 930 931 The estimated factors and the factor loadings in this model are only 932 identified up to an invertible transformation. As described in (the working 933 paper version of) [2]_, while it is possible to impose normalizations to 934 achieve identification, the EM algorithm does will converge regardless. 935 Moreover, for nowcasting and forecasting purposes, identification is not 936 required. This model does not impose any normalization to identify the 937 factors and the factor loadings. 938 939 **Miscellaneous** 940 941 There are two arguments available in the model constructor that are rarely 942 used but which deserve a brief mention: `init_t0` and `obs_cov_diag`. These 943 arguments are provided to allow exactly matching the output of other 944 packages that have slight differences in how the underlying state space 945 model is set up / applied. 946 947 - `init_t0`: state space models in Statsmodels follow Durbin and Koopman in 948 initializing the model with :math:`\alpha_1 \sim N(a_1, P_1)`. Other 949 implementations sometimes initialize instead with 950 :math:`\alpha_0 \sim N(a_0, P_0)`. We can accommodate this by prepending 951 a row of NaNs to the observed dataset. 952 - `obs_cov_diag`: the state space form in [1]_ incorporates non-zero (but 953 very small) diagonal elements for the observation disturbance covariance 954 matrix. 955 956 Examples 957 -------- 958 Constructing and fitting a `DynamicFactorMQ` model. 959 960 >>> data = sm.datasets.macrodata.load_pandas().data.iloc[-100:] 961 >>> data.index = pd.period_range(start='1984Q4', end='2009Q3', freq='Q') 962 >>> endog = data[['infl', 'tbilrate']].resample('M').last() 963 >>> endog_Q = np.log(data[['realgdp', 'realcons']]).diff().iloc[1:] * 400 964 965 **Basic usage** 966 967 In the simplest case, passing only the `endog` argument results in a model 968 with a single factor that follows an AR(1) process. Note that because we 969 are not also providing an `endog_quarterly` dataset, `endog` can be a numpy 970 array or Pandas DataFrame with any index (it does not have to be monthly). 971 972 The `summary` method can be useful in checking the model specification. 973 974 >>> mod = sm.tsa.DynamicFactorMQ(endog) 975 >>> print(mod.summary()) 976 Model Specification: Dynamic Factor Model 977 ========================================================================== 978 Model: Dynamic Factor Model # of monthly variables: 2 979 + 1 factors in 1 blocks # of factors: 1 980 + AR(1) idiosyncratic Idiosyncratic disturbances: AR(1) 981 Sample: 1984-10 Standardize variables: True 982 - 2009-09 983 Observed variables / factor loadings 984 ======================== 985 Dep. variable 0 986 ------------------------ 987 infl X 988 tbilrate X 989 Factor blocks: 990 ===================== 991 block order 992 --------------------- 993 0 1 994 ===================== 995 996 **Factors** 997 998 With `factors=2`, there will be two independent factors that will each 999 evolve according to separate AR(1) processes. 1000 1001 >>> mod = sm.tsa.DynamicFactorMQ(endog, factors=2) 1002 >>> print(mod.summary()) 1003 Model Specification: Dynamic Factor Model 1004 ========================================================================== 1005 Model: Dynamic Factor Model # of monthly variables: 2 1006 + 2 factors in 2 blocks # of factors: 2 1007 + AR(1) idiosyncratic Idiosyncratic disturbances: AR(1) 1008 Sample: 1984-10 Standardize variables: True 1009 - 2009-09 1010 Observed variables / factor loadings 1011 =================================== 1012 Dep. variable 0 1 1013 ----------------------------------- 1014 infl X X 1015 tbilrate X X 1016 Factor blocks: 1017 ===================== 1018 block order 1019 --------------------- 1020 0 1 1021 1 1 1022 ===================== 1023 1024 **Factor multiplicities** 1025 1026 By instead specifying `factor_multiplicities=2`, we would still have two 1027 factors, but they would be dependent and would evolve jointly according 1028 to a VAR(1) process. 1029 1030 >>> mod = sm.tsa.DynamicFactorMQ(endog, factor_multiplicities=2) 1031 >>> print(mod.summary()) 1032 Model Specification: Dynamic Factor Model 1033 ========================================================================== 1034 Model: Dynamic Factor Model # of monthly variables: 2 1035 + 2 factors in 1 blocks # of factors: 2 1036 + AR(1) idiosyncratic Idiosyncratic disturbances: AR(1) 1037 Sample: 1984-10 Standardize variables: True 1038 - 2009-09 1039 Observed variables / factor loadings 1040 =================================== 1041 Dep. variable 0.1 0.2 1042 ----------------------------------- 1043 infl X X 1044 tbilrate X X 1045 Factor blocks: 1046 ===================== 1047 block order 1048 --------------------- 1049 0.1, 0.2 1 1050 ===================== 1051 1052 **Factor orders** 1053 1054 In either of the above cases, we could extend the order of the (vector) 1055 autoregressions by using the `factor_orders` argument. For example, the 1056 below model would contain two independent factors that each evolve 1057 according to a separate AR(2) process: 1058 1059 >>> mod = sm.tsa.DynamicFactorMQ(endog, factors=2, factor_orders=2) 1060 >>> print(mod.summary()) 1061 Model Specification: Dynamic Factor Model 1062 ========================================================================== 1063 Model: Dynamic Factor Model # of monthly variables: 2 1064 + 2 factors in 2 blocks # of factors: 2 1065 + AR(1) idiosyncratic Idiosyncratic disturbances: AR(1) 1066 Sample: 1984-10 Standardize variables: True 1067 - 2009-09 1068 Observed variables / factor loadings 1069 =================================== 1070 Dep. variable 0 1 1071 ----------------------------------- 1072 infl X X 1073 tbilrate X X 1074 Factor blocks: 1075 ===================== 1076 block order 1077 --------------------- 1078 0 2 1079 1 2 1080 ===================== 1081 1082 **Serial correlation in the idiosyncratic disturbances** 1083 1084 By default, the model allows each idiosyncratic disturbance terms to evolve 1085 according to an AR(1) process. If preferred, they can instead be specified 1086 to be serially independent by passing `ididosyncratic_ar1=False`. 1087 1088 >>> mod = sm.tsa.DynamicFactorMQ(endog, idiosyncratic_ar1=False) 1089 >>> print(mod.summary()) 1090 Model Specification: Dynamic Factor Model 1091 ========================================================================== 1092 Model: Dynamic Factor Model # of monthly variables: 2 1093 + 1 factors in 1 blocks # of factors: 1 1094 + iid idiosyncratic Idiosyncratic disturbances: iid 1095 Sample: 1984-10 Standardize variables: True 1096 - 2009-09 1097 Observed variables / factor loadings 1098 ======================== 1099 Dep. variable 0 1100 ------------------------ 1101 infl X 1102 tbilrate X 1103 Factor blocks: 1104 ===================== 1105 block order 1106 --------------------- 1107 0 1 1108 ===================== 1109 1110 *Monthly / Quarterly mixed frequency* 1111 1112 To specify a monthly / quarterly mixed frequency model see the (Notes 1113 section for more details about these models): 1114 1115 >>> mod = sm.tsa.DynamicFactorMQ(endog, endog_quarterly=endog_Q) 1116 >>> print(mod.summary()) 1117 Model Specification: Dynamic Factor Model 1118 ========================================================================== 1119 Model: Dynamic Factor Model # of monthly variables: 2 1120 + 1 factors in 1 blocks # of quarterly variables: 2 1121 + Mixed frequency (M/Q) # of factors: 1 1122 + AR(1) idiosyncratic Idiosyncratic disturbances: AR(1) 1123 Sample: 1984-10 Standardize variables: True 1124 - 2009-09 1125 Observed variables / factor loadings 1126 ======================== 1127 Dep. variable 0 1128 ------------------------ 1129 infl X 1130 tbilrate X 1131 realgdp X 1132 realcons X 1133 Factor blocks: 1134 ===================== 1135 block order 1136 --------------------- 1137 0 1 1138 ===================== 1139 1140 *Customize observed variable / factor loadings* 1141 1142 To specify that certain that certain observed variables only load on 1143 certain factors, it is possible to pass a dictionary to the `factors` 1144 argument. 1145 1146 >>> factors = {'infl': ['global'] 1147 ... 'tbilrate': ['global'] 1148 ... 'realgdp': ['global', 'real'] 1149 ... 'realcons': ['global', 'real']} 1150 >>> mod = sm.tsa.DynamicFactorMQ(endog, endog_quarterly=endog_Q) 1151 >>> print(mod.summary()) 1152 Model Specification: Dynamic Factor Model 1153 ========================================================================== 1154 Model: Dynamic Factor Model # of monthly variables: 2 1155 + 2 factors in 2 blocks # of quarterly variables: 2 1156 + Mixed frequency (M/Q) # of factor blocks: 2 1157 + AR(1) idiosyncratic Idiosyncratic disturbances: AR(1) 1158 Sample: 1984-10 Standardize variables: True 1159 - 2009-09 1160 Observed variables / factor loadings 1161 =================================== 1162 Dep. variable global real 1163 ----------------------------------- 1164 infl X 1165 tbilrate X 1166 realgdp X X 1167 realcons X X 1168 Factor blocks: 1169 ===================== 1170 block order 1171 --------------------- 1172 global 1 1173 real 1 1174 ===================== 1175 1176 **Fitting parameters** 1177 1178 To fit the model, use the `fit` method. This method uses the EM algorithm 1179 by default. 1180 1181 >>> mod = sm.tsa.DynamicFactorMQ(endog) 1182 >>> res = mod.fit() 1183 >>> print(res.summary()) 1184 Dynamic Factor Results 1185 ========================================================================== 1186 Dep. Variable: ['infl', 'tbilrate'] No. Observations: 300 1187 Model: Dynamic Factor Model Log Likelihood -127.909 1188 + 1 factors in 1 blocks AIC 271.817 1189 + AR(1) idiosyncratic BIC 301.447 1190 Date: Tue, 04 Aug 2020 HQIC 283.675 1191 Time: 15:59:11 EM Iterations 83 1192 Sample: 10-31-1984 1193 - 09-30-2009 1194 Covariance Type: Not computed 1195 Observation equation: 1196 ============================================================== 1197 Factor loadings: 0 idiosyncratic: AR(1) var. 1198 -------------------------------------------------------------- 1199 infl -0.67 0.39 0.73 1200 tbilrate -0.63 0.99 0.01 1201 Transition: Factor block 0 1202 ======================================= 1203 L1.0 error variance 1204 --------------------------------------- 1205 0 0.98 0.01 1206 ======================================= 1207 Warnings: 1208 [1] Covariance matrix not calculated. 1209 1210 *Displaying iteration progress* 1211 1212 To display information about the EM iterations, use the `disp` argument. 1213 1214 >>> mod = sm.tsa.DynamicFactorMQ(endog) 1215 >>> res = mod.fit(disp=10) 1216 EM start iterations, llf=-291.21 1217 EM iteration 10, llf=-157.17, convergence criterion=0.053801 1218 EM iteration 20, llf=-128.99, convergence criterion=0.0035545 1219 EM iteration 30, llf=-127.97, convergence criterion=0.00010224 1220 EM iteration 40, llf=-127.93, convergence criterion=1.3281e-05 1221 EM iteration 50, llf=-127.92, convergence criterion=5.4725e-06 1222 EM iteration 60, llf=-127.91, convergence criterion=2.8665e-06 1223 EM iteration 70, llf=-127.91, convergence criterion=1.6999e-06 1224 EM iteration 80, llf=-127.91, convergence criterion=1.1085e-06 1225 EM converged at iteration 83, llf=-127.91, 1226 convergence criterion=9.9004e-07 < tolerance=1e-06 1227 1228 **Results: forecasting, impulse responses, and more** 1229 1230 One the model is fitted, there are a number of methods available from the 1231 results object. Some examples include: 1232 1233 *Forecasting* 1234 1235 >>> mod = sm.tsa.DynamicFactorMQ(endog) 1236 >>> res = mod.fit() 1237 >>> print(res.forecast(steps=5)) 1238 infl tbilrate 1239 2009-10 1.784169 0.260401 1240 2009-11 1.735848 0.305981 1241 2009-12 1.730674 0.350968 1242 2010-01 1.742110 0.395369 1243 2010-02 1.759786 0.439194 1244 1245 *Impulse responses* 1246 1247 >>> mod = sm.tsa.DynamicFactorMQ(endog) 1248 >>> res = mod.fit() 1249 >>> print(res.impulse_responses(steps=5)) 1250 infl tbilrate 1251 0 -1.511956 -1.341498 1252 1 -1.483172 -1.315960 1253 2 -1.454937 -1.290908 1254 3 -1.427240 -1.266333 1255 4 -1.400069 -1.242226 1256 5 -1.373416 -1.218578 1257 1258 For other available methods (including in-sample prediction, simulation of 1259 time series, extending the results to incorporate new data, and the news), 1260 see the documentation for state space models. 1261 1262 References 1263 ---------- 1264 .. [1] Bańbura, Marta, and Michele Modugno. 1265 "Maximum likelihood estimation of factor models on datasets with 1266 arbitrary pattern of missing data." 1267 Journal of Applied Econometrics 29, no. 1 (2014): 133-160. 1268 .. [2] Bańbura, Marta, Domenico Giannone, and Lucrezia Reichlin. 1269 "Nowcasting." 1270 The Oxford Handbook of Economic Forecasting. July 8, 2011. 1271 .. [3] Bok, Brandyn, Daniele Caratelli, Domenico Giannone, 1272 Argia M. Sbordone, and Andrea Tambalotti. 2018. 1273 "Macroeconomic Nowcasting and Forecasting with Big Data." 1274 Annual Review of Economics 10 (1): 615-43. 1275 https://doi.org/10.1146/annurev-economics-080217-053214. 1276 .. [4] Mariano, Roberto S., and Yasutomo Murasawa. 1277 "A coincident index, common factors, and monthly real GDP." 1278 Oxford Bulletin of Economics and Statistics 72, no. 1 (2010): 27-46. 1279 1280 """ 1281 1282 def __init__(self, endog, k_endog_monthly=None, factors=1, factor_orders=1, 1283 factor_multiplicities=None, idiosyncratic_ar1=True, 1284 standardize=True, endog_quarterly=None, init_t0=False, 1285 obs_cov_diag=False, **kwargs): 1286 # Handle endog variables 1287 if endog_quarterly is not None: 1288 if k_endog_monthly is not None: 1289 raise ValueError('If `endog_quarterly` is specified, then' 1290 ' `endog` must contain only monthly' 1291 ' variables, and so `k_endog_monthly` cannot' 1292 ' be specified since it will be inferred from' 1293 ' the shape of `endog`.') 1294 endog, k_endog_monthly = self.construct_endog( 1295 endog, endog_quarterly) 1296 endog_is_pandas = _is_using_pandas(endog, None) 1297 1298 if endog_is_pandas: 1299 if isinstance(endog, pd.Series): 1300 endog = endog.to_frame() 1301 else: 1302 if np.ndim(endog) < 2: 1303 endog = np.atleast_2d(endog).T 1304 1305 if k_endog_monthly is None: 1306 k_endog_monthly = endog.shape[1] 1307 1308 if endog_is_pandas: 1309 endog_names = endog.columns.tolist() 1310 else: 1311 if endog.shape[1] == 1: 1312 endog_names = ['y'] 1313 else: 1314 endog_names = [f'y{i + 1}' for i in range(endog.shape[1])] 1315 1316 self.k_endog_M = int_like(k_endog_monthly, 'k_endog_monthly') 1317 self.k_endog_Q = endog.shape[1] - self.k_endog_M 1318 1319 # Compute helper for handling factors / state indexing 1320 s = self._s = DynamicFactorMQStates( 1321 self.k_endog_M, self.k_endog_Q, endog_names, factors, 1322 factor_orders, factor_multiplicities, idiosyncratic_ar1) 1323 1324 # Save parameterization 1325 self.factors = factors 1326 self.factor_orders = factor_orders 1327 self.factor_multiplicities = factor_multiplicities 1328 1329 self.endog_factor_map = self._s.endog_factor_map 1330 self.factor_block_orders = self._s.factor_block_orders 1331 self.factor_names = self._s.factor_names 1332 self.k_factors = self._s.k_factors 1333 self.k_factor_blocks = len(self.factor_block_orders) 1334 self.max_factor_order = self._s.max_factor_order 1335 self.idiosyncratic_ar1 = idiosyncratic_ar1 1336 self.init_t0 = init_t0 1337 self.obs_cov_diag = obs_cov_diag 1338 1339 if self.init_t0: 1340 # TODO: test each of these options 1341 if endog_is_pandas: 1342 ix = pd.period_range(endog.index[0] - 1, endog.index[-1], 1343 freq='M') 1344 endog = endog.reindex(ix) 1345 else: 1346 endog = np.c_[[np.nan] * endog.shape[1], endog.T].T 1347 1348 # Standardize endog, if requested 1349 # Note: endog_mean and endog_std will always each be 1-dimensional with 1350 # length equal to the number of endog variables 1351 if isinstance(standardize, tuple) and len(standardize) == 2: 1352 endog_mean, endog_std = standardize 1353 1354 # Validate the input 1355 n = endog.shape[1] 1356 if (isinstance(endog_mean, pd.Series) and not 1357 endog_mean.index.equals(pd.Index(endog_names))): 1358 raise ValueError('Invalid value passed for `standardize`:' 1359 ' if a Pandas Series, must have index' 1360 f' {endog_names}. Got {endog_mean.index}.') 1361 else: 1362 endog_mean = np.atleast_1d(endog_mean) 1363 if (isinstance(endog_std, pd.Series) and not 1364 endog_std.index.equals(pd.Index(endog_names))): 1365 raise ValueError('Invalid value passed for `standardize`:' 1366 ' if a Pandas Series, must have index' 1367 f' {endog_names}. Got {endog_std.index}.') 1368 else: 1369 endog_std = np.atleast_1d(endog_std) 1370 1371 if (np.shape(endog_mean) != (n,) or np.shape(endog_std) != (n,)): 1372 raise ValueError('Invalid value passed for `standardize`: each' 1373 f' element must be shaped ({n},).') 1374 standardize = True 1375 1376 # Make sure we have Pandas if endog is Pandas 1377 if endog_is_pandas: 1378 endog_mean = pd.Series(endog_mean, index=endog_names) 1379 endog_std = pd.Series(endog_std, index=endog_names) 1380 1381 elif standardize in [1, True]: 1382 endog_mean = endog.mean(axis=0) 1383 endog_std = endog.std(axis=0) 1384 elif standardize in [0, False]: 1385 endog_mean = np.zeros(endog.shape[1]) 1386 endog_std = np.ones(endog.shape[1]) 1387 else: 1388 raise ValueError('Invalid value passed for `standardize`.') 1389 self._endog_mean = endog_mean 1390 self._endog_std = endog_std 1391 self.standardize = standardize 1392 if np.any(self._endog_std < 1e-10): 1393 ix = np.where(self._endog_std < 1e-10) 1394 names = np.array(endog_names)[ix[0]].tolist() 1395 raise ValueError('Constant variable(s) found in observed' 1396 ' variables, but constants cannot be included' 1397 f' in this model. These variables are: {names}.') 1398 1399 if self.standardize: 1400 endog = (endog - self._endog_mean) / self._endog_std 1401 1402 # Observation / states slices 1403 o = self._o = { 1404 'M': np.s_[:self.k_endog_M], 1405 'Q': np.s_[self.k_endog_M:]} 1406 1407 # Construct the basic state space representation 1408 super().__init__(endog, k_states=s.k_states, k_posdef=s.k_posdef, 1409 **kwargs) 1410 1411 # Revert the standardization for orig_endog 1412 if self.standardize: 1413 self.data.orig_endog = ( 1414 self.data.orig_endog * self._endog_std + self._endog_mean) 1415 1416 # State initialization 1417 # Note: we could just initialize the entire thing as stationary, but 1418 # doing each block separately should be faster and avoid numerical 1419 # issues 1420 if 'initialization' not in kwargs: 1421 self.ssm.initialize(self._default_initialization()) 1422 1423 # Fixed components of the state space representation 1424 1425 # > design 1426 if self.idiosyncratic_ar1: 1427 self['design', o['M'], s['idio_ar_M']] = np.eye(self.k_endog_M) 1428 multipliers = [1, 2, 3, 2, 1] 1429 for i in range(len(multipliers)): 1430 m = multipliers[i] 1431 self['design', o['Q'], s['idio_ar_Q_ix'][:, i]] = ( 1432 m * np.eye(self.k_endog_Q)) 1433 1434 # > obs cov 1435 if self.obs_cov_diag: 1436 self['obs_cov'] = np.eye(self.k_endog) * 1e-4 1437 1438 # > transition 1439 for block in s.factor_blocks: 1440 if block.k_factors == 1: 1441 tmp = 0 1442 else: 1443 tmp = np.zeros((block.k_factors, block.k_factors)) 1444 self['transition', block['factors'], block['factors']] = ( 1445 companion_matrix([1] + [tmp] * block._factor_order).T) 1446 if self.k_endog_Q == 1: 1447 tmp = 0 1448 else: 1449 tmp = np.zeros((self.k_endog_Q, self.k_endog_Q)) 1450 self['transition', s['idio_ar_Q'], s['idio_ar_Q']] = ( 1451 companion_matrix([1] + [tmp] * 5).T) 1452 1453 # > selection 1454 ix1 = ix2 = 0 1455 for block in s.factor_blocks: 1456 ix2 += block.k_factors 1457 self['selection', block['factors_ix'][:, 0], ix1:ix2] = ( 1458 np.eye(block.k_factors)) 1459 ix1 = ix2 1460 if self.idiosyncratic_ar1: 1461 ix2 = ix1 + self.k_endog_M 1462 self['selection', s['idio_ar_M'], ix1:ix2] = np.eye(self.k_endog_M) 1463 ix1 = ix2 1464 1465 ix2 = ix1 + self.k_endog_Q 1466 self['selection', s['idio_ar_Q_ix'][:, 0], ix1:ix2] = ( 1467 np.eye(self.k_endog_Q)) 1468 1469 # Parameters 1470 self.params = OrderedDict([ 1471 ('loadings', np.sum(self.endog_factor_map.values)), 1472 ('factor_ar', np.sum([block.k_factors**2 * block.factor_order 1473 for block in s.factor_blocks])), 1474 ('factor_cov', np.sum([block.k_factors * (block.k_factors + 1) // 2 1475 for block in s.factor_blocks])), 1476 ('idiosyncratic_ar1', 1477 self.k_endog if self.idiosyncratic_ar1 else 0), 1478 ('idiosyncratic_var', self.k_endog)]) 1479 self.k_params = np.sum(list(self.params.values())) 1480 1481 # Parameter slices 1482 ix = np.split(np.arange(self.k_params), 1483 np.cumsum(list(self.params.values()))[:-1]) 1484 self._p = dict(zip(self.params.keys(), ix)) 1485 1486 # Cache 1487 self._loading_constraints = {} 1488 1489 # Initialization kwarg keys, e.g. for cloning 1490 self._init_keys += [ 1491 'factors', 'factor_orders', 'factor_multiplicities', 1492 'idiosyncratic_ar1', 'standardize', 'init_t0', 1493 'obs_cov_diag'] + list(kwargs.keys()) 1494 1495 @classmethod 1496 def construct_endog(cls, endog_monthly, endog_quarterly): 1497 """ 1498 Construct a combined dataset from separate monthly and quarterly data. 1499 1500 Parameters 1501 ---------- 1502 endog_monthly : array_like 1503 Monthly dataset. If a quarterly dataset is given, then this must 1504 be a Pandas object with a PeriodIndex or DatetimeIndex at a monthly 1505 frequency. 1506 endog_quarterly : array_like or None 1507 Quarterly dataset. If not None, then this must be a Pandas object 1508 with a PeriodIndex or DatetimeIndex at a quarterly frequency. 1509 1510 Returns 1511 ------- 1512 endog : array_like 1513 If both endog_monthly and endog_quarterly were given, this is a 1514 Pandas DataFrame with a PeriodIndex at the monthly frequency, with 1515 all of the columns from `endog_monthly` ordered first and the 1516 columns from `endog_quarterly` ordered afterwards. Otherwise it is 1517 simply the input `endog_monthly` dataset. 1518 k_endog_monthly : int 1519 The number of monthly variables (which are ordered first) in the 1520 returned `endog` dataset. 1521 """ 1522 # Create combined dataset 1523 if endog_quarterly is not None: 1524 # Validate endog_monthly 1525 base_msg = ('If given both monthly and quarterly data' 1526 ' then the monthly dataset must be a Pandas' 1527 ' object with a date index at a monthly frequency.') 1528 if not isinstance(endog_monthly, (pd.Series, pd.DataFrame)): 1529 raise ValueError('Given monthly dataset is not a' 1530 ' Pandas object. ' + base_msg) 1531 elif endog_monthly.index.inferred_type not in ("datetime64", 1532 "period"): 1533 raise ValueError('Given monthly dataset has an' 1534 ' index with non-date values. ' + base_msg) 1535 elif not getattr(endog_monthly.index, 'freqstr', 'N')[0] == 'M': 1536 freqstr = getattr(endog_monthly.index, 'freqstr', 'None') 1537 raise ValueError('Index of given monthly dataset has a' 1538 ' non-monthly frequency (to check this,' 1539 ' examine the `freqstr` attribute of the' 1540 ' index of the dataset - it should start with' 1541 ' M if it is monthly).' 1542 f' Got {freqstr}. ' + base_msg) 1543 1544 # Validate endog_quarterly 1545 base_msg = ('If a quarterly dataset is given, then it must be a' 1546 ' Pandas object with a date index at a quarterly' 1547 ' frequency.') 1548 if not isinstance(endog_quarterly, (pd.Series, pd.DataFrame)): 1549 raise ValueError('Given quarterly dataset is not a' 1550 ' Pandas object. ' + base_msg) 1551 elif endog_quarterly.index.inferred_type not in ("datetime64", 1552 "period"): 1553 raise ValueError('Given quarterly dataset has an' 1554 ' index with non-date values. ' + base_msg) 1555 elif not getattr(endog_quarterly.index, 'freqstr', 'N')[0] == 'Q': 1556 freqstr = getattr(endog_quarterly.index, 'freqstr', 'None') 1557 raise ValueError('Index of given quarterly dataset' 1558 ' has a non-quarterly frequency (to check' 1559 ' this, examine the `freqstr` attribute of' 1560 ' the index of the dataset - it should start' 1561 ' with Q if it is quarterly).' 1562 f' Got {freqstr}. ' + base_msg) 1563 1564 # Convert to PeriodIndex, if applicable 1565 if hasattr(endog_monthly.index, 'to_period'): 1566 endog_monthly = endog_monthly.to_period('M') 1567 if hasattr(endog_quarterly.index, 'to_period'): 1568 endog_quarterly = endog_quarterly.to_period('Q') 1569 1570 # Combine the datasets 1571 endog = pd.concat([ 1572 endog_monthly, 1573 endog_quarterly.resample('M', convention='end').first()], 1574 axis=1) 1575 1576 # Make sure we didn't accidentally get duplicate column names 1577 column_counts = endog.columns.value_counts() 1578 if column_counts.max() > 1: 1579 columns = endog.columns.values.astype(object) 1580 for name in column_counts.index: 1581 count = column_counts.loc[name] 1582 if count == 1: 1583 continue 1584 mask = columns == name 1585 columns[mask] = [f'{name}{i + 1}' for i in range(count)] 1586 endog.columns = columns 1587 else: 1588 endog = endog_monthly.copy() 1589 shape = endog_monthly.shape 1590 k_endog_monthly = shape[1] if len(shape) == 2 else 1 1591 1592 return endog, k_endog_monthly 1593 1594 def clone(self, endog, k_endog_monthly=None, endog_quarterly=None, 1595 retain_standardization=False, **kwargs): 1596 """ 1597 Clone state space model with new data and optionally new specification. 1598 1599 Parameters 1600 ---------- 1601 endog : array_like 1602 The observed time-series process :math:`y` 1603 k_endog_monthly : int, optional 1604 If specifying a monthly/quarterly mixed frequency model in which 1605 the provided `endog` dataset contains both the monthly and 1606 quarterly data, this variable should be used to indicate how many 1607 of the variables are monthly. 1608 endog_quarterly : array_like, optional 1609 Observations of quarterly variables. If provided, must be a 1610 Pandas Series or DataFrame with a DatetimeIndex or PeriodIndex at 1611 the quarterly frequency. 1612 kwargs 1613 Keyword arguments to pass to the new model class to change the 1614 model specification. 1615 1616 Returns 1617 ------- 1618 model : DynamicFactorMQ instance 1619 """ 1620 if retain_standardization and self.standardize: 1621 kwargs['standardize'] = (self._endog_mean, self._endog_std) 1622 mod = self._clone_from_init_kwds( 1623 endog, k_endog_monthly=k_endog_monthly, 1624 endog_quarterly=endog_quarterly, **kwargs) 1625 return mod 1626 1627 @property 1628 def _res_classes(self): 1629 return {'fit': (DynamicFactorMQResults, mlemodel.MLEResultsWrapper)} 1630 1631 def _default_initialization(self): 1632 s = self._s 1633 init = initialization.Initialization(self.k_states) 1634 for block in s.factor_blocks: 1635 init.set(block['factors'], 'stationary') 1636 if self.idiosyncratic_ar1: 1637 for i in range(s['idio_ar_M'].start, s['idio_ar_M'].stop): 1638 init.set(i, 'stationary') 1639 init.set(s['idio_ar_Q'], 'stationary') 1640 return init 1641 1642 def _get_endog_names(self, truncate=None, as_string=None): 1643 if truncate is None: 1644 truncate = False if as_string is False or self.k_endog == 1 else 24 1645 if as_string is False and truncate is not False: 1646 raise ValueError('Can only truncate endog names if they' 1647 ' are returned as a string.') 1648 if as_string is None: 1649 as_string = truncate is not False 1650 1651 # The base `endog_names` property is only a list if there are at least 1652 # two variables; often, we need it to be a list 1653 endog_names = self.endog_names 1654 if not isinstance(endog_names, list): 1655 endog_names = [endog_names] 1656 1657 if as_string: 1658 endog_names = [str(name) for name in endog_names] 1659 1660 if truncate is not False: 1661 n = truncate 1662 endog_names = [name if len(name) <= n else name[:n] + '...' 1663 for name in endog_names] 1664 1665 return endog_names 1666 1667 @property 1668 def _model_name(self): 1669 model_name = [ 1670 'Dynamic Factor Model', 1671 f'{self.k_factors} factors in {self.k_factor_blocks} blocks'] 1672 if self.k_endog_Q > 0: 1673 model_name.append('Mixed frequency (M/Q)') 1674 1675 error_type = 'AR(1)' if self.idiosyncratic_ar1 else 'iid' 1676 model_name.append(f'{error_type} idiosyncratic') 1677 1678 return model_name 1679 1680 def summary(self, truncate_endog_names=None): 1681 """ 1682 Create a summary table describing the model. 1683 1684 Parameters 1685 ---------- 1686 truncate_endog_names : int, optional 1687 The number of characters to show for names of observed variables. 1688 Default is 24 if there is more than one observed variable, or 1689 an unlimited number of there is only one. 1690 """ 1691 # Get endog names 1692 endog_names = self._get_endog_names(truncate=truncate_endog_names, 1693 as_string=True) 1694 1695 title = 'Model Specification: Dynamic Factor Model' 1696 1697 if self._index_dates: 1698 ix = self._index 1699 d = ix[0] 1700 sample = ['%s' % d] 1701 d = ix[-1] 1702 sample += ['- ' + '%s' % d] 1703 else: 1704 sample = [str(0), ' - ' + str(self.nobs)] 1705 1706 # Standardize the model name as a list of str 1707 model_name = self._model_name 1708 1709 # - Top summary table ------------------------------------------------ 1710 top_left = [] 1711 top_left.append(('Model:', [model_name[0]])) 1712 for i in range(1, len(model_name)): 1713 top_left.append(('', ['+ ' + model_name[i]])) 1714 top_left += [ 1715 ('Sample:', [sample[0]]), 1716 ('', [sample[1]])] 1717 1718 top_right = [] 1719 if self.k_endog_Q > 0: 1720 top_right += [ 1721 ('# of monthly variables:', [self.k_endog_M]), 1722 ('# of quarterly variables:', [self.k_endog_Q])] 1723 else: 1724 top_right += [('# of observed variables:', [self.k_endog])] 1725 if self.k_factor_blocks == 1: 1726 top_right += [('# of factors:', [self.k_factors])] 1727 else: 1728 top_right += [('# of factor blocks:', [self.k_factor_blocks])] 1729 top_right += [('Idiosyncratic disturbances:', 1730 ['AR(1)' if self.idiosyncratic_ar1 else 'iid']), 1731 ('Standardize variables:', [self.standardize])] 1732 1733 summary = Summary() 1734 self.model = self 1735 summary.add_table_2cols(self, gleft=top_left, gright=top_right, 1736 title=title) 1737 table_ix = 1 1738 del self.model 1739 1740 # - Endog / factor map ----------------------------------------------- 1741 data = self.endog_factor_map.replace({True: 'X', False: ''}) 1742 data.index = endog_names 1743 for name, col in data.iteritems(): 1744 data[name] = data[name] + (' ' * (len(name) // 2)) 1745 data.index.name = 'Dep. variable' 1746 data = data.reset_index() 1747 1748 params_data = data.values 1749 params_header = data.columns.map(str).tolist() 1750 params_stubs = None 1751 1752 title = 'Observed variables / factor loadings' 1753 table = SimpleTable( 1754 params_data, params_header, params_stubs, 1755 txt_fmt=fmt_params, title=title) 1756 1757 summary.tables.insert(table_ix, table) 1758 table_ix += 1 1759 1760 # - Factor blocks summary table -------------------------------------- 1761 data = self.factor_block_orders.reset_index() 1762 data['block'] = data['block'].map( 1763 lambda factor_names: ', '.join(factor_names)) 1764 data[['order']] = ( 1765 data[['order']].applymap(str)) 1766 1767 params_data = data.values 1768 params_header = data.columns.map(str).tolist() 1769 params_stubs = None 1770 1771 title = 'Factor blocks:' 1772 table = SimpleTable( 1773 params_data, params_header, params_stubs, 1774 txt_fmt=fmt_params, title=title) 1775 1776 summary.tables.insert(table_ix, table) 1777 table_ix += 1 1778 1779 return summary 1780 1781 def __str__(self): 1782 """Summary tables showing model specification.""" 1783 return str(self.summary()) 1784 1785 @property 1786 def state_names(self): 1787 """(list of str) List of human readable names for unobserved states.""" 1788 # Factors 1789 state_names = [] 1790 for block in self._s.factor_blocks: 1791 state_names += [f'{name}' for name in block.factor_names[:]] 1792 for s in range(1, block._factor_order): 1793 state_names += [f'L{s}.{name}' 1794 for name in block.factor_names] 1795 1796 # Monthly error 1797 endog_names = self._get_endog_names() 1798 if self.idiosyncratic_ar1: 1799 endog_names_M = endog_names[self._o['M']] 1800 state_names += [f'eps_M.{name}' for name in endog_names_M] 1801 endog_names_Q = endog_names[self._o['Q']] 1802 1803 # Quarterly error 1804 state_names += [f'eps_Q.{name}' for name in endog_names_Q] 1805 for s in range(1, 5): 1806 state_names += [f'L{s}.eps_Q.{name}' for name in endog_names_Q] 1807 return state_names 1808 1809 @property 1810 def param_names(self): 1811 """(list of str) List of human readable parameter names.""" 1812 param_names = [] 1813 # Loadings 1814 # So that Lambda = params[ix].reshape(self.k_endog, self.k_factors) 1815 # (where Lambda stacks Lambda_M and Lambda_Q) 1816 endog_names = self._get_endog_names(as_string=False) 1817 for endog_name in endog_names: 1818 for block in self._s.factor_blocks: 1819 for factor_name in block.factor_names: 1820 if self.endog_factor_map.loc[endog_name, factor_name]: 1821 param_names.append( 1822 f'loading.{factor_name}->{endog_name}') 1823 1824 # Factor VAR 1825 for block in self._s.factor_blocks: 1826 for to_factor in block.factor_names: 1827 param_names += [f'L{i}.{from_factor}->{to_factor}' 1828 for i in range(1, block.factor_order + 1) 1829 for from_factor in block.factor_names] 1830 1831 # Factor covariance 1832 for i in range(len(self._s.factor_blocks)): 1833 block = self._s.factor_blocks[i] 1834 param_names += [f'fb({i}).cov.chol[{j + 1},{k + 1}]' 1835 for j in range(block.k_factors) 1836 for k in range(j + 1)] 1837 1838 # Error AR(1) 1839 if self.idiosyncratic_ar1: 1840 endog_names_M = endog_names[self._o['M']] 1841 param_names += [f'L1.eps_M.{name}' for name in endog_names_M] 1842 1843 endog_names_Q = endog_names[self._o['Q']] 1844 param_names += [f'L1.eps_Q.{name}' for name in endog_names_Q] 1845 1846 # Error innovation variances 1847 param_names += [f'sigma2.{name}' for name in endog_names] 1848 1849 return param_names 1850 1851 @property 1852 def start_params(self): 1853 """(array) Starting parameters for maximum likelihood estimation.""" 1854 params = np.zeros(self.k_params, dtype=np.float64) 1855 1856 # (1) estimate factors one at a time, where the first step uses 1857 # PCA on all `endog` variables that load on the first factor, and 1858 # subsequent steps use residuals from the previous steps. 1859 # TODO: what about factors that only load on quarterly variables? 1860 endog_factor_map_M = self.endog_factor_map.iloc[:self.k_endog_M] 1861 factors = [] 1862 endog = (pd.DataFrame(self.endog).interpolate() 1863 .fillna(method='backfill') 1864 .values) 1865 for name in self.factor_names: 1866 # Try to retrieve this from monthly variables, which is most 1867 # consistent 1868 endog_ix = np.where(endog_factor_map_M.loc[:, name])[0] 1869 # But fall back to quarterly if necessary 1870 if len(endog_ix) == 0: 1871 endog_ix = np.where(self.endog_factor_map.loc[:, name])[0] 1872 factor_endog = endog[:, endog_ix] 1873 1874 res_pca = PCA(factor_endog, ncomp=1, method='eig', normalize=False) 1875 factors.append(res_pca.factors) 1876 endog[:, endog_ix] -= res_pca.projection 1877 factors = np.concatenate(factors, axis=1) 1878 1879 # (2) Estimate coefficients for each endog, one at a time (OLS for 1880 # monthly variables, restricted OLS for quarterly). Also, compute 1881 # residuals. 1882 loadings = [] 1883 resid = [] 1884 for i in range(self.k_endog_M): 1885 factor_ix = self._s.endog_factor_iloc[i] 1886 factor_exog = factors[:, factor_ix] 1887 mod_ols = OLS(self.endog[:, i], exog=factor_exog, missing='drop') 1888 res_ols = mod_ols.fit() 1889 loadings += res_ols.params.tolist() 1890 resid.append(res_ols.resid) 1891 for i in range(self.k_endog_M, self.k_endog): 1892 factor_ix = self._s.endog_factor_iloc[i] 1893 factor_exog = lagmat(factors[:, factor_ix], 4, original='in') 1894 mod_glm = GLM(self.endog[:, i], factor_exog, missing='drop') 1895 res_glm = mod_glm.fit_constrained(self.loading_constraints(i)) 1896 loadings += res_glm.params[:len(factor_ix)].tolist() 1897 resid.append(res_glm.resid_response) 1898 params[self._p['loadings']] = loadings 1899 1900 # (3) For each factor block, use an AR or VAR model to get coefficients 1901 # and covariance estimate 1902 # Factor transitions 1903 stationary = True 1904 1905 factor_ar = [] 1906 factor_cov = [] 1907 i = 0 1908 for block in self._s.factor_blocks: 1909 factors_endog = factors[:, i:i + block.k_factors] 1910 i += block.k_factors 1911 1912 if block.factor_order == 0: 1913 continue 1914 1915 if block.k_factors == 1: 1916 mod_factors = SARIMAX(factors_endog, 1917 order=(block.factor_order, 0, 0)) 1918 sp = mod_factors.start_params 1919 block_factor_ar = sp[:-1] 1920 block_factor_cov = sp[-1:] 1921 1922 coefficient_matrices = mod_factors.start_params[:-1] 1923 elif block.k_factors > 1: 1924 mod_factors = VAR(factors_endog) 1925 res_factors = mod_factors.fit( 1926 maxlags=block.factor_order, ic=None, trend='n') 1927 1928 block_factor_ar = res_factors.params.T.ravel() 1929 L = np.linalg.cholesky(res_factors.sigma_u) 1930 block_factor_cov = L[np.tril_indices_from(L)] 1931 1932 coefficient_matrices = np.transpose( 1933 np.reshape(block_factor_ar, 1934 (block.k_factors, block.k_factors, 1935 block.factor_order)), (2, 0, 1)) 1936 1937 # Test for stationarity 1938 stationary = is_invertible([1] + list(-coefficient_matrices)) 1939 1940 # Check for stationarity 1941 if not stationary: 1942 warn('Non-stationary starting factor autoregressive' 1943 ' parameters found for factor block' 1944 f' {block.factor_names}. Using zeros as starting' 1945 ' parameters.') 1946 block_factor_ar[:] = 0 1947 cov_factor = np.diag(factors_endog.std(axis=0)) 1948 block_factor_cov = ( 1949 cov_factor[np.tril_indices(block.k_factors)]) 1950 factor_ar += block_factor_ar.tolist() 1951 factor_cov += block_factor_cov.tolist() 1952 params[self._p['factor_ar']] = factor_ar 1953 params[self._p['factor_cov']] = factor_cov 1954 1955 # (4) Use residuals from step (2) to estimate the idiosyncratic 1956 # component 1957 # Idiosyncratic component 1958 if self.idiosyncratic_ar1: 1959 idio_ar1 = [] 1960 idio_var = [] 1961 for i in range(self.k_endog_M): 1962 mod_idio = SARIMAX(resid[i], order=(1, 0, 0), trend='c') 1963 sp = mod_idio.start_params 1964 idio_ar1.append(np.clip(sp[1], -0.99, 0.99)) 1965 idio_var.append(np.clip(sp[-1], 1e-5, np.inf)) 1966 for i in range(self.k_endog_M, self.k_endog): 1967 y = self.endog[:, i].copy() 1968 y[~np.isnan(y)] = resid[i] 1969 mod_idio = QuarterlyAR1(y) 1970 res_idio = mod_idio.fit(maxiter=10, return_params=True, 1971 disp=False) 1972 res_idio = mod_idio.fit_em(res_idio, maxiter=5, 1973 return_params=True) 1974 idio_ar1.append(np.clip(res_idio[0], -0.99, 0.99)) 1975 idio_var.append(np.clip(res_idio[1], 1e-5, np.inf)) 1976 params[self._p['idiosyncratic_ar1']] = idio_ar1 1977 params[self._p['idiosyncratic_var']] = idio_var 1978 else: 1979 idio_var = [np.var(resid[i]) for i in range(self.k_endog_M)] 1980 for i in range(self.k_endog_M, self.k_endog): 1981 y = self.endog[:, i].copy() 1982 y[~np.isnan(y)] = resid[i] 1983 mod_idio = QuarterlyAR1(y) 1984 res_idio = mod_idio.fit(return_params=True, disp=False) 1985 idio_var.append(np.clip(res_idio[1], 1e-5, np.inf)) 1986 params[self._p['idiosyncratic_var']] = idio_var 1987 1988 return params 1989 1990 def transform_params(self, unconstrained): 1991 """ 1992 Transform parameters from optimizer space to model space. 1993 1994 Transform unconstrained parameters used by the optimizer to constrained 1995 parameters used in likelihood evaluation. 1996 1997 Parameters 1998 ---------- 1999 unconstrained : array_like 2000 Array of unconstrained parameters used by the optimizer, to be 2001 transformed. 2002 2003 Returns 2004 ------- 2005 constrained : array_like 2006 Array of constrained parameters which may be used in likelihood 2007 evaluation. 2008 """ 2009 constrained = unconstrained.copy() 2010 2011 # Stationary factor VAR 2012 unconstrained_factor_ar = unconstrained[self._p['factor_ar']] 2013 constrained_factor_ar = [] 2014 i = 0 2015 for block in self._s.factor_blocks: 2016 length = block.k_factors**2 * block.factor_order 2017 tmp_coeff = np.reshape( 2018 unconstrained_factor_ar[i:i + length], 2019 (block.k_factors, block.k_factors * block.factor_order)) 2020 tmp_cov = np.eye(block.k_factors) 2021 tmp_coeff, _ = constrain_stationary_multivariate(tmp_coeff, 2022 tmp_cov) 2023 constrained_factor_ar += tmp_coeff.ravel().tolist() 2024 i += length 2025 constrained[self._p['factor_ar']] = constrained_factor_ar 2026 2027 # Stationary idiosyncratic AR(1) 2028 if self.idiosyncratic_ar1: 2029 idio_ar1 = unconstrained[self._p['idiosyncratic_ar1']] 2030 constrained[self._p['idiosyncratic_ar1']] = [ 2031 constrain_stationary_univariate(idio_ar1[i:i + 1])[0] 2032 for i in range(self.k_endog)] 2033 2034 # Positive idiosyncratic variances 2035 constrained[self._p['idiosyncratic_var']] = ( 2036 constrained[self._p['idiosyncratic_var']]**2) 2037 2038 return constrained 2039 2040 def untransform_params(self, constrained): 2041 """ 2042 Transform parameters from model space to optimizer space. 2043 2044 Transform constrained parameters used in likelihood evaluation 2045 to unconstrained parameters used by the optimizer. 2046 2047 Parameters 2048 ---------- 2049 constrained : array_like 2050 Array of constrained parameters used in likelihood evaluation, to 2051 be transformed. 2052 2053 Returns 2054 ------- 2055 unconstrained : array_like 2056 Array of unconstrained parameters used by the optimizer. 2057 """ 2058 unconstrained = constrained.copy() 2059 2060 # Stationary factor VAR 2061 constrained_factor_ar = constrained[self._p['factor_ar']] 2062 unconstrained_factor_ar = [] 2063 i = 0 2064 for block in self._s.factor_blocks: 2065 length = block.k_factors**2 * block.factor_order 2066 tmp_coeff = np.reshape( 2067 constrained_factor_ar[i:i + length], 2068 (block.k_factors, block.k_factors * block.factor_order)) 2069 tmp_cov = np.eye(block.k_factors) 2070 tmp_coeff, _ = unconstrain_stationary_multivariate(tmp_coeff, 2071 tmp_cov) 2072 unconstrained_factor_ar += tmp_coeff.ravel().tolist() 2073 i += length 2074 unconstrained[self._p['factor_ar']] = unconstrained_factor_ar 2075 2076 # Stationary idiosyncratic AR(1) 2077 if self.idiosyncratic_ar1: 2078 idio_ar1 = constrained[self._p['idiosyncratic_ar1']] 2079 unconstrained[self._p['idiosyncratic_ar1']] = [ 2080 unconstrain_stationary_univariate(idio_ar1[i:i + 1])[0] 2081 for i in range(self.k_endog)] 2082 2083 # Positive idiosyncratic variances 2084 unconstrained[self._p['idiosyncratic_var']] = ( 2085 unconstrained[self._p['idiosyncratic_var']]**0.5) 2086 2087 return unconstrained 2088 2089 def update(self, params, **kwargs): 2090 """ 2091 Update the parameters of the model. 2092 2093 Parameters 2094 ---------- 2095 params : array_like 2096 Array of new parameters. 2097 transformed : bool, optional 2098 Whether or not `params` is already transformed. If set to False, 2099 `transform_params` is called. Default is True. 2100 2101 """ 2102 params = super().update(params, **kwargs) 2103 2104 # Local copies 2105 o = self._o 2106 s = self._s 2107 p = self._p 2108 2109 # Loadings 2110 loadings = params[p['loadings']] 2111 start = 0 2112 for i in range(self.k_endog_M): 2113 iloc = self._s.endog_factor_iloc[i] 2114 k_factors = len(iloc) 2115 factor_ix = s['factors_L1'][iloc] 2116 self['design', i, factor_ix] = loadings[start:start + k_factors] 2117 start += k_factors 2118 multipliers = np.array([1, 2, 3, 2, 1])[:, None] 2119 for i in range(self.k_endog_M, self.k_endog): 2120 iloc = self._s.endog_factor_iloc[i] 2121 k_factors = len(iloc) 2122 factor_ix = s['factors_L1_5_ix'][:, iloc] 2123 self['design', i, factor_ix.ravel()] = np.ravel( 2124 loadings[start:start + k_factors] * multipliers) 2125 start += k_factors 2126 2127 # Factor VAR 2128 factor_ar = params[p['factor_ar']] 2129 start = 0 2130 for block in s.factor_blocks: 2131 k_params = block.k_factors**2 * block.factor_order 2132 A = np.reshape( 2133 factor_ar[start:start + k_params], 2134 (block.k_factors, block.k_factors * block.factor_order)) 2135 start += k_params 2136 self['transition', block['factors_L1'], block['factors_ar']] = A 2137 2138 # Factor covariance 2139 factor_cov = params[p['factor_cov']] 2140 start = 0 2141 ix1 = 0 2142 for block in s.factor_blocks: 2143 k_params = block.k_factors * (block.k_factors + 1) // 2 2144 L = np.zeros((block.k_factors, block.k_factors), 2145 dtype=params.dtype) 2146 L[np.tril_indices_from(L)] = factor_cov[start:start + k_params] 2147 start += k_params 2148 Q = L @ L.T 2149 ix2 = ix1 + block.k_factors 2150 self['state_cov', ix1:ix2, ix1:ix2] = Q 2151 ix1 = ix2 2152 2153 # Error AR(1) 2154 if self.idiosyncratic_ar1: 2155 alpha = np.diag(params[p['idiosyncratic_ar1']]) 2156 self['transition', s['idio_ar_L1'], s['idio_ar_L1']] = alpha 2157 2158 # Error variances 2159 if self.idiosyncratic_ar1: 2160 self['state_cov', self.k_factors:, self.k_factors:] = ( 2161 np.diag(params[p['idiosyncratic_var']])) 2162 else: 2163 idio_var = params[p['idiosyncratic_var']] 2164 self['obs_cov', o['M'], o['M']] = np.diag(idio_var[o['M']]) 2165 self['state_cov', self.k_factors:, self.k_factors:] = ( 2166 np.diag(idio_var[o['Q']])) 2167 2168 @property 2169 def loglike_constant(self): 2170 """ 2171 Constant term in the joint log-likelihood function. 2172 2173 Useful in facilitating comparisons to other packages that exclude the 2174 constant from the log-likelihood computation. 2175 """ 2176 return -0.5 * (1 - np.isnan(self.endog)).sum() * np.log(2 * np.pi) 2177 2178 def loading_constraints(self, i): 2179 r""" 2180 Matrix formulation of quarterly variables' factor loading constraints. 2181 2182 Parameters 2183 ---------- 2184 i : int 2185 Index of the `endog` variable to compute constraints for. 2186 2187 Returns 2188 ------- 2189 R : array (k_constraints, k_factors * 5) 2190 q : array (k_constraints,) 2191 2192 Notes 2193 ----- 2194 If the factors were known, then the factor loadings for the ith 2195 quarterly variable would be computed by a linear regression of the form 2196 2197 y_i = A_i' f + B_i' L1.f + C_i' L2.f + D_i' L3.f + E_i' L4.f 2198 2199 where: 2200 2201 - f is (k_i x 1) and collects all of the factors that load on y_i 2202 - L{j}.f is (k_i x 1) and collects the jth lag of each factor 2203 - A_i, ..., E_i are (k_i x 1) and collect factor loadings 2204 2205 As the observed variable is quarterly while the factors are monthly, we 2206 want to restrict the estimated regression coefficients to be: 2207 2208 y_i = A_i f + 2 A_i L1.f + 3 A_i L2.f + 2 A_i L3.f + A_i L4.f 2209 2210 Stack the unconstrained coefficients: \Lambda_i = [A_i' B_i' ... E_i']' 2211 2212 Then the constraints can be written as follows, for l = 1, ..., k_i 2213 2214 - 2 A_{i,l} - B_{i,l} = 0 2215 - 3 A_{i,l} - C_{i,l} = 0 2216 - 2 A_{i,l} - D_{i,l} = 0 2217 - A_{i,l} - E_{i,l} = 0 2218 2219 So that k_constraints = 4 * k_i. In matrix form the constraints are: 2220 2221 .. math:: 2222 2223 R \Lambda_i = q 2224 2225 where :math:`\Lambda_i` is shaped `(k_i * 5,)`, :math:`R` is shaped 2226 `(k_constraints, k_i * 5)`, and :math:`q` is shaped `(k_constraints,)`. 2227 2228 2229 For example, for the case that k_i = 2, we can write: 2230 2231 | 2 0 -1 0 0 0 0 0 0 0 | | A_{i,1} | | 0 | 2232 | 0 2 0 -1 0 0 0 0 0 0 | | A_{i,2} | | 0 | 2233 | 3 0 0 0 -1 0 0 0 0 0 | | B_{i,1} | | 0 | 2234 | 0 3 0 0 0 -1 0 0 0 0 | | B_{i,2} | | 0 | 2235 | 2 0 0 0 0 0 -1 0 0 0 | | C_{i,1} | = | 0 | 2236 | 0 2 0 0 0 0 0 -1 0 0 | | C_{i,2} | | 0 | 2237 | 1 0 0 0 0 0 0 0 -1 0 | | D_{i,1} | | 0 | 2238 | 0 1 0 0 0 0 0 0 0 -1 | | D_{i,2} | | 0 | 2239 | E_{i,1} | | 0 | 2240 | E_{i,2} | | 0 | 2241 2242 """ 2243 if i < self.k_endog_M: 2244 raise ValueError('No constraints for monthly variables.') 2245 if i not in self._loading_constraints: 2246 k_factors = self.endog_factor_map.iloc[i].sum() 2247 2248 R = np.zeros((k_factors * 4, k_factors * 5)) 2249 q = np.zeros(R.shape[0]) 2250 2251 # Let R = [R_1 R_2] 2252 # Then R_1 is multiples of the identity matrix 2253 multipliers = np.array([1, 2, 3, 2, 1]) 2254 R[:, :k_factors] = np.reshape( 2255 (multipliers[1:] * np.eye(k_factors)[..., None]).T, 2256 (k_factors * 4, k_factors)) 2257 2258 # And R_2 is the identity 2259 R[:, k_factors:] = np.diag([-1] * (k_factors * 4)) 2260 2261 self._loading_constraints[i] = (R, q) 2262 return self._loading_constraints[i] 2263 2264 def fit(self, start_params=None, transformed=True, includes_fixed=False, 2265 cov_type='none', cov_kwds=None, method='em', maxiter=500, 2266 tolerance=1e-6, em_initialization=True, mstep_method=None, 2267 full_output=1, disp=False, callback=None, return_params=False, 2268 optim_score=None, optim_complex_step=None, optim_hessian=None, 2269 flags=None, low_memory=False, llf_decrease_action='revert', 2270 llf_decrease_tolerance=1e-4, **kwargs): 2271 """ 2272 Fits the model by maximum likelihood via Kalman filter. 2273 2274 Parameters 2275 ---------- 2276 start_params : array_like, optional 2277 Initial guess of the solution for the loglikelihood maximization. 2278 If None, the default is given by Model.start_params. 2279 transformed : bool, optional 2280 Whether or not `start_params` is already transformed. Default is 2281 True. 2282 includes_fixed : bool, optional 2283 If parameters were previously fixed with the `fix_params` method, 2284 this argument describes whether or not `start_params` also includes 2285 the fixed parameters, in addition to the free parameters. Default 2286 is False. 2287 cov_type : str, optional 2288 The `cov_type` keyword governs the method for calculating the 2289 covariance matrix of parameter estimates. Can be one of: 2290 2291 - 'opg' for the outer product of gradient estimator 2292 - 'oim' for the observed information matrix estimator, calculated 2293 using the method of Harvey (1989) 2294 - 'approx' for the observed information matrix estimator, 2295 calculated using a numerical approximation of the Hessian matrix. 2296 - 'robust' for an approximate (quasi-maximum likelihood) covariance 2297 matrix that may be valid even in the presence of some 2298 misspecifications. Intermediate calculations use the 'oim' 2299 method. 2300 - 'robust_approx' is the same as 'robust' except that the 2301 intermediate calculations use the 'approx' method. 2302 - 'none' for no covariance matrix calculation. 2303 2304 Default is 'none', since computing this matrix can be very slow 2305 when there are a large number of parameters. 2306 cov_kwds : dict or None, optional 2307 A dictionary of arguments affecting covariance matrix computation. 2308 2309 **opg, oim, approx, robust, robust_approx** 2310 2311 - 'approx_complex_step' : bool, optional - If True, numerical 2312 approximations are computed using complex-step methods. If False, 2313 numerical approximations are computed using finite difference 2314 methods. Default is True. 2315 - 'approx_centered' : bool, optional - If True, numerical 2316 approximations computed using finite difference methods use a 2317 centered approximation. Default is False. 2318 method : str, optional 2319 The `method` determines which solver from `scipy.optimize` 2320 is used, and it can be chosen from among the following strings: 2321 2322 - 'em' for the EM algorithm 2323 - 'newton' for Newton-Raphson 2324 - 'nm' for Nelder-Mead 2325 - 'bfgs' for Broyden-Fletcher-Goldfarb-Shanno (BFGS) 2326 - 'lbfgs' for limited-memory BFGS with optional box constraints 2327 - 'powell' for modified Powell's method 2328 - 'cg' for conjugate gradient 2329 - 'ncg' for Newton-conjugate gradient 2330 - 'basinhopping' for global basin-hopping solver 2331 2332 The explicit arguments in `fit` are passed to the solver, 2333 with the exception of the basin-hopping solver. Each 2334 solver has several optional arguments that are not the same across 2335 solvers. See the notes section below (or scipy.optimize) for the 2336 available arguments and for the list of explicit arguments that the 2337 basin-hopping solver supports. 2338 maxiter : int, optional 2339 The maximum number of iterations to perform. 2340 tolerance : float, optional 2341 Tolerance to use for convergence checking when using the EM 2342 algorithm. To set the tolerance for other methods, pass 2343 the optimizer-specific keyword argument(s). 2344 full_output : bool, optional 2345 Set to True to have all available output in the Results object's 2346 mle_retvals attribute. The output is dependent on the solver. 2347 See LikelihoodModelResults notes section for more information. 2348 disp : bool, optional 2349 Set to True to print convergence messages. 2350 callback : callable callback(xk), optional 2351 Called after each iteration, as callback(xk), where xk is the 2352 current parameter vector. 2353 return_params : bool, optional 2354 Whether or not to return only the array of maximizing parameters. 2355 Default is False. 2356 optim_score : {'harvey', 'approx'} or None, optional 2357 The method by which the score vector is calculated. 'harvey' uses 2358 the method from Harvey (1989), 'approx' uses either finite 2359 difference or complex step differentiation depending upon the 2360 value of `optim_complex_step`, and None uses the built-in gradient 2361 approximation of the optimizer. Default is None. This keyword is 2362 only relevant if the optimization method uses the score. 2363 optim_complex_step : bool, optional 2364 Whether or not to use complex step differentiation when 2365 approximating the score; if False, finite difference approximation 2366 is used. Default is True. This keyword is only relevant if 2367 `optim_score` is set to 'harvey' or 'approx'. 2368 optim_hessian : {'opg','oim','approx'}, optional 2369 The method by which the Hessian is numerically approximated. 'opg' 2370 uses outer product of gradients, 'oim' uses the information 2371 matrix formula from Harvey (1989), and 'approx' uses numerical 2372 approximation. This keyword is only relevant if the 2373 optimization method uses the Hessian matrix. 2374 low_memory : bool, optional 2375 If set to True, techniques are applied to substantially reduce 2376 memory usage. If used, some features of the results object will 2377 not be available (including smoothed results and in-sample 2378 prediction), although out-of-sample forecasting is possible. 2379 Note that this option is not available when using the EM algorithm 2380 (which is the default for this model). Default is False. 2381 llf_decrease_action : {'ignore', 'warn', 'revert'}, optional 2382 Action to take if the log-likelihood decreases in an EM iteration. 2383 'ignore' continues the iterations, 'warn' issues a warning but 2384 continues the iterations, while 'revert' ends the iterations and 2385 returns the result from the last good iteration. Default is 'warn'. 2386 llf_decrease_tolerance : float, optional 2387 Minimum size of the log-likelihood decrease required to trigger a 2388 warning or to end the EM iterations. Setting this value slightly 2389 larger than zero allows small decreases in the log-likelihood that 2390 may be caused by numerical issues. If set to zero, then any 2391 decrease will trigger the `llf_decrease_action`. Default is 1e-4. 2392 **kwargs 2393 Additional keyword arguments to pass to the optimizer. 2394 2395 Returns 2396 ------- 2397 MLEResults 2398 2399 See Also 2400 -------- 2401 statsmodels.base.model.LikelihoodModel.fit 2402 statsmodels.tsa.statespace.mlemodel.MLEResults 2403 """ 2404 if method == 'em': 2405 return self.fit_em( 2406 start_params=start_params, transformed=transformed, 2407 cov_type=cov_type, cov_kwds=cov_kwds, maxiter=maxiter, 2408 tolerance=tolerance, em_initialization=em_initialization, 2409 mstep_method=mstep_method, full_output=full_output, disp=disp, 2410 return_params=return_params, low_memory=low_memory, 2411 llf_decrease_action=llf_decrease_action, 2412 llf_decrease_tolerance=llf_decrease_tolerance, **kwargs) 2413 else: 2414 return super().fit( 2415 start_params=start_params, transformed=transformed, 2416 includes_fixed=includes_fixed, cov_type=cov_type, 2417 cov_kwds=cov_kwds, method=method, maxiter=maxiter, 2418 full_output=full_output, disp=disp, 2419 callback=callback, return_params=return_params, 2420 optim_score=optim_score, 2421 optim_complex_step=optim_complex_step, 2422 optim_hessian=optim_hessian, flags=flags, 2423 low_memory=low_memory, **kwargs) 2424 2425 def fit_em(self, start_params=None, transformed=True, cov_type='none', 2426 cov_kwds=None, maxiter=500, tolerance=1e-6, disp=False, 2427 em_initialization=True, mstep_method=None, full_output=True, 2428 return_params=False, low_memory=False, 2429 llf_decrease_action='revert', llf_decrease_tolerance=1e-4): 2430 """ 2431 Fits the model by maximum likelihood via the EM algorithm. 2432 2433 Parameters 2434 ---------- 2435 start_params : array_like, optional 2436 Initial guess of the solution for the loglikelihood maximization. 2437 The default is to use `DynamicFactorMQ.start_params`. 2438 transformed : bool, optional 2439 Whether or not `start_params` is already transformed. Default is 2440 True. 2441 cov_type : str, optional 2442 The `cov_type` keyword governs the method for calculating the 2443 covariance matrix of parameter estimates. Can be one of: 2444 2445 - 'opg' for the outer product of gradient estimator 2446 - 'oim' for the observed information matrix estimator, calculated 2447 using the method of Harvey (1989) 2448 - 'approx' for the observed information matrix estimator, 2449 calculated using a numerical approximation of the Hessian matrix. 2450 - 'robust' for an approximate (quasi-maximum likelihood) covariance 2451 matrix that may be valid even in the presence of some 2452 misspecifications. Intermediate calculations use the 'oim' 2453 method. 2454 - 'robust_approx' is the same as 'robust' except that the 2455 intermediate calculations use the 'approx' method. 2456 - 'none' for no covariance matrix calculation. 2457 2458 Default is 'none', since computing this matrix can be very slow 2459 when there are a large number of parameters. 2460 cov_kwds : dict or None, optional 2461 A dictionary of arguments affecting covariance matrix computation. 2462 2463 **opg, oim, approx, robust, robust_approx** 2464 2465 - 'approx_complex_step' : bool, optional - If True, numerical 2466 approximations are computed using complex-step methods. If False, 2467 numerical approximations are computed using finite difference 2468 methods. Default is True. 2469 - 'approx_centered' : bool, optional - If True, numerical 2470 approximations computed using finite difference methods use a 2471 centered approximation. Default is False. 2472 maxiter : int, optional 2473 The maximum number of EM iterations to perform. 2474 tolerance : float, optional 2475 Parameter governing convergence of the EM algorithm. The 2476 `tolerance` is the minimum relative increase in the likelihood 2477 for which convergence will be declared. A smaller value for the 2478 `tolerance` will typically yield more precise parameter estimates, 2479 but will typically require more EM iterations. Default is 1e-6. 2480 disp : int or bool, optional 2481 Controls printing of EM iteration progress. If an integer, progress 2482 is printed at every `disp` iterations. A value of True is 2483 interpreted as the value of 1. Default is False (nothing will be 2484 printed). 2485 em_initialization : bool, optional 2486 Whether or not to also update the Kalman filter initialization 2487 using the EM algorithm. Default is True. 2488 mstep_method : {None, 'missing', 'nonmissing'}, optional 2489 The EM algorithm maximization step. If there are no NaN values 2490 in the dataset, this can be set to "nonmissing" (which is slightly 2491 faster) or "missing", otherwise it must be "missing". Default is 2492 "nonmissing" if there are no NaN values or "missing" if there are. 2493 full_output : bool, optional 2494 Set to True to have all available output from EM iterations in 2495 the Results object's mle_retvals attribute. 2496 return_params : bool, optional 2497 Whether or not to return only the array of maximizing parameters. 2498 Default is False. 2499 low_memory : bool, optional 2500 This option cannot be used with the EM algorithm and will raise an 2501 error if set to True. Default is False. 2502 llf_decrease_action : {'ignore', 'warn', 'revert'}, optional 2503 Action to take if the log-likelihood decreases in an EM iteration. 2504 'ignore' continues the iterations, 'warn' issues a warning but 2505 continues the iterations, while 'revert' ends the iterations and 2506 returns the result from the last good iteration. Default is 'warn'. 2507 llf_decrease_tolerance : float, optional 2508 Minimum size of the log-likelihood decrease required to trigger a 2509 warning or to end the EM iterations. Setting this value slightly 2510 larger than zero allows small decreases in the log-likelihood that 2511 may be caused by numerical issues. If set to zero, then any 2512 decrease will trigger the `llf_decrease_action`. Default is 1e-4. 2513 2514 Returns 2515 ------- 2516 DynamicFactorMQResults 2517 2518 See Also 2519 -------- 2520 statsmodels.tsa.statespace.mlemodel.MLEModel.fit 2521 statsmodels.tsa.statespace.mlemodel.MLEResults 2522 """ 2523 if self._has_fixed_params: 2524 raise NotImplementedError('Cannot fit using the EM algorithm while' 2525 ' holding some parameters fixed.') 2526 if low_memory: 2527 raise ValueError('Cannot fit using the EM algorithm when using' 2528 ' low_memory option.') 2529 2530 if start_params is None: 2531 start_params = self.start_params 2532 transformed = True 2533 else: 2534 start_params = np.array(start_params, ndmin=1) 2535 2536 if not transformed: 2537 start_params = self.transform_params(start_params) 2538 2539 llf_decrease_action = string_like( 2540 llf_decrease_action, 'llf_decrease_action', 2541 options=['ignore', 'warn', 'revert']) 2542 2543 disp = int(disp) 2544 2545 # Perform expectation-maximization 2546 s = self._s 2547 llf = [] 2548 params = [start_params] 2549 init = None 2550 inits = [self.ssm.initialization] 2551 i = 0 2552 delta = 0 2553 terminate = False 2554 # init_stationary = None if em_initialization else True 2555 while i < maxiter and not terminate and (i < 1 or (delta > tolerance)): 2556 out = self._em_iteration(params[-1], init=init, 2557 mstep_method=mstep_method) 2558 new_llf = out[0].llf_obs.sum() 2559 2560 # If we are not using EM initialization, then we need to check for 2561 # non-stationary parameters 2562 if not em_initialization: 2563 self.update(out[1]) 2564 switch_init = [] 2565 T = self['transition'] 2566 init = self.ssm.initialization 2567 iloc = np.arange(self.k_states) 2568 2569 # We may only have global initialization if we have no 2570 # quarterly variables and idiosyncratic_ar1=False 2571 if self.k_endog_Q == 0 and not self.idiosyncratic_ar1: 2572 block = s.factor_blocks[0] 2573 if init.initialization_type == 'stationary': 2574 Tb = T[block['factors'], block['factors']] 2575 if not np.all(np.linalg.eigvals(Tb) < (1 - 1e-10)): 2576 init.set(block['factors'], 'diffuse') 2577 switch_init.append( 2578 'factor block:' 2579 f' {tuple(block.factor_names)}') 2580 else: 2581 # Factor blocks 2582 for block in s.factor_blocks: 2583 b = tuple(iloc[block['factors']]) 2584 init_type = init.blocks[b].initialization_type 2585 if init_type == 'stationary': 2586 Tb = T[block['factors'], block['factors']] 2587 if not np.all(np.linalg.eigvals(Tb) < (1 - 1e-10)): 2588 init.set(block['factors'], 'diffuse') 2589 switch_init.append( 2590 'factor block:' 2591 f' {tuple(block.factor_names)}') 2592 2593 if self.idiosyncratic_ar1: 2594 endog_names = self._get_endog_names(as_string=True) 2595 # Monthly variables 2596 for j in range(s['idio_ar_M'].start, s['idio_ar_M'].stop): 2597 init_type = init.blocks[(j,)].initialization_type 2598 if init_type == 'stationary': 2599 if not np.abs(T[j, j]) < (1 - 1e-10): 2600 init.set(j, 'diffuse') 2601 name = endog_names[j - s['idio_ar_M'].start] 2602 switch_init.append( 2603 'idiosyncratic AR(1) for monthly' 2604 f' variable: {name}') 2605 2606 # Quarterly variables 2607 if self.k_endog_Q > 0: 2608 b = tuple(iloc[s['idio_ar_Q']]) 2609 init_type = init.blocks[b].initialization_type 2610 if init_type == 'stationary': 2611 Tb = T[s['idio_ar_Q'], s['idio_ar_Q']] 2612 if not np.all(np.linalg.eigvals(Tb) < (1 - 1e-10)): 2613 init.set(s['idio_ar_Q'], 'diffuse') 2614 switch_init.append( 2615 'idiosyncratic AR(1) for the' 2616 ' block of quarterly variables') 2617 2618 if len(switch_init) > 0: 2619 warn('Non-stationary parameters found at EM iteration' 2620 f' {i + 1}, which is not compatible with' 2621 ' stationary initialization. Initialization was' 2622 ' switched to diffuse for the following: ' 2623 f' {switch_init}, and fitting was restarted.') 2624 results = self.fit_em( 2625 start_params=params[-1], transformed=transformed, 2626 cov_type=cov_type, cov_kwds=cov_kwds, 2627 maxiter=maxiter, tolerance=tolerance, 2628 em_initialization=em_initialization, 2629 mstep_method=mstep_method, full_output=full_output, 2630 disp=disp, return_params=return_params, 2631 low_memory=low_memory, 2632 llf_decrease_action=llf_decrease_action, 2633 llf_decrease_tolerance=llf_decrease_tolerance) 2634 self.ssm.initialize(self._default_initialization()) 2635 return results 2636 2637 # Check for decrease in the log-likelihood 2638 # Note: allow a little numerical error before declaring a decrease 2639 llf_decrease = ( 2640 i > 0 and (new_llf - llf[-1]) < -llf_decrease_tolerance) 2641 2642 if llf_decrease_action == 'revert' and llf_decrease: 2643 warn(f'Log-likelihood decreased at EM iteration {i + 1}.' 2644 f' Reverting to the results from EM iteration {i}' 2645 ' (prior to the decrease) and returning the solution.') 2646 # Terminated iteration 2647 i -= 1 2648 terminate = True 2649 else: 2650 if llf_decrease_action == 'warn' and llf_decrease: 2651 warn(f'Log-likelihood decreased at EM iteration {i + 1},' 2652 ' which can indicate numerical issues.') 2653 llf.append(new_llf) 2654 params.append(out[1]) 2655 if em_initialization: 2656 init = initialization.Initialization( 2657 self.k_states, 'known', 2658 constant=out[0].smoothed_state[..., 0], 2659 stationary_cov=out[0].smoothed_state_cov[..., 0]) 2660 inits.append(init) 2661 if i > 0: 2662 delta = (2 * np.abs(llf[-1] - llf[-2]) / 2663 (np.abs(llf[-1]) + np.abs(llf[-2]))) 2664 else: 2665 delta = np.inf 2666 2667 # If `disp` is not False, display the first iteration 2668 if disp and i == 0: 2669 print(f'EM start iterations, llf={llf[-1]:.5g}') 2670 # Print output every `disp` observations 2671 elif disp and ((i + 1) % disp) == 0: 2672 print(f'EM iteration {i + 1}, llf={llf[-1]:.5g},' 2673 f' convergence criterion={delta:.5g}') 2674 2675 # Advance the iteration counter 2676 i += 1 2677 2678 # Check for convergence 2679 not_converged = (i == maxiter and delta > tolerance) 2680 2681 # If no convergence without explicit termination, warn users 2682 if not_converged: 2683 warn(f'EM reached maximum number of iterations ({maxiter}),' 2684 f' without achieving convergence: llf={llf[-1]:.5g},' 2685 f' convergence criterion={delta:.5g}' 2686 f' (while specified tolerance was {tolerance:.5g})') 2687 2688 # If `disp` is not False, display the final iteration 2689 if disp: 2690 if terminate: 2691 print(f'EM terminated at iteration {i}, llf={llf[-1]:.5g},' 2692 f' convergence criterion={delta:.5g}' 2693 f' (while specified tolerance was {tolerance:.5g})') 2694 elif not_converged: 2695 print(f'EM reached maximum number of iterations ({maxiter}),' 2696 f' without achieving convergence: llf={llf[-1]:.5g},' 2697 f' convergence criterion={delta:.5g}' 2698 f' (while specified tolerance was {tolerance:.5g})') 2699 else: 2700 print(f'EM converged at iteration {i}, llf={llf[-1]:.5g},' 2701 f' convergence criterion={delta:.5g}' 2702 f' < tolerance={tolerance:.5g}') 2703 2704 # Just return the fitted parameters if requested 2705 if return_params: 2706 result = params[-1] 2707 # Otherwise construct the results class if desired 2708 else: 2709 if em_initialization: 2710 base_init = self.ssm.initialization 2711 self.ssm.initialization = init 2712 # Note that because we are using params[-1], we are actually using 2713 # the results from one additional iteration compared to the 2714 # iteration at which we declared convergence. 2715 result = self.smooth(params[-1], transformed=True, 2716 cov_type=cov_type, cov_kwds=cov_kwds) 2717 if em_initialization: 2718 self.ssm.initialization = base_init 2719 2720 # Save the output 2721 if full_output: 2722 llf.append(result.llf) 2723 em_retvals = Bunch(**{'params': np.array(params), 2724 'llf': np.array(llf), 2725 'iter': i, 2726 'inits': inits}) 2727 em_settings = Bunch(**{'method': 'em', 2728 'tolerance': tolerance, 2729 'maxiter': maxiter}) 2730 else: 2731 em_retvals = None 2732 em_settings = None 2733 2734 result._results.mle_retvals = em_retvals 2735 result._results.mle_settings = em_settings 2736 2737 return result 2738 2739 def _em_iteration(self, params0, init=None, mstep_method=None): 2740 """EM iteration.""" 2741 # (E)xpectation step 2742 res = self._em_expectation_step(params0, init=init) 2743 2744 # (M)aximization step 2745 params1 = self._em_maximization_step(res, params0, 2746 mstep_method=mstep_method) 2747 2748 return res, params1 2749 2750 def _em_expectation_step(self, params0, init=None): 2751 """EM expectation step.""" 2752 # (E)xpectation step 2753 self.update(params0) 2754 # Re-initialize state, if new initialization is given 2755 if init is not None: 2756 base_init = self.ssm.initialization 2757 self.ssm.initialization = init 2758 # Perform smoothing, only saving what is required 2759 res = self.ssm.smooth( 2760 SMOOTHER_STATE | SMOOTHER_STATE_COV | SMOOTHER_STATE_AUTOCOV, 2761 update_filter=False) 2762 res.llf_obs = np.array( 2763 self.ssm._kalman_filter.loglikelihood, copy=True) 2764 # Reset initialization 2765 if init is not None: 2766 self.ssm.initialization = base_init 2767 2768 return res 2769 2770 def _em_maximization_step(self, res, params0, mstep_method=None): 2771 """EM maximization step.""" 2772 s = self._s 2773 2774 a = res.smoothed_state.T[..., None] 2775 cov_a = res.smoothed_state_cov.transpose(2, 0, 1) 2776 acov_a = res.smoothed_state_autocov.transpose(2, 0, 1) 2777 2778 # E[a_t a_t'], t = 0, ..., T 2779 Eaa = cov_a.copy() + np.matmul(a, a.transpose(0, 2, 1)) 2780 # E[a_t a_{t-1}'], t = 1, ..., T 2781 Eaa1 = acov_a[:-1] + np.matmul(a[1:], a[:-1].transpose(0, 2, 1)) 2782 2783 # Observation equation 2784 has_missing = np.any(res.nmissing) 2785 if mstep_method is None: 2786 mstep_method = 'missing' if has_missing else 'nonmissing' 2787 mstep_method = mstep_method.lower() 2788 if mstep_method == 'nonmissing' and has_missing: 2789 raise ValueError('Cannot use EM algorithm option' 2790 ' `mstep_method="nonmissing"` with missing data.') 2791 2792 if mstep_method == 'nonmissing': 2793 func = self._em_maximization_obs_nonmissing 2794 elif mstep_method == 'missing': 2795 func = self._em_maximization_obs_missing 2796 else: 2797 raise ValueError('Invalid maximization step method: "%s".' 2798 % mstep_method) 2799 # TODO: compute H is pretty slow 2800 Lambda, H = func(res, Eaa, a, compute_H=(not self.idiosyncratic_ar1)) 2801 2802 # Factor VAR and covariance 2803 factor_ar = [] 2804 factor_cov = [] 2805 for b in s.factor_blocks: 2806 A = Eaa[:-1, b['factors_ar'], b['factors_ar']].sum(axis=0) 2807 B = Eaa1[:, b['factors_L1'], b['factors_ar']].sum(axis=0) 2808 C = Eaa[1:, b['factors_L1'], b['factors_L1']].sum(axis=0) 2809 nobs = Eaa.shape[0] - 1 2810 2811 # want: x = B A^{-1}, so solve: x A = B or solve: A' x' = B' 2812 try: 2813 f_A = cho_solve(cho_factor(A), B.T).T 2814 except LinAlgError: 2815 # Fall back to general solver if there are problems with 2816 # postive-definiteness 2817 f_A = np.linalg.solve(A, B.T).T 2818 2819 f_Q = (C - f_A @ B.T) / nobs 2820 factor_ar += f_A.ravel().tolist() 2821 factor_cov += ( 2822 np.linalg.cholesky(f_Q)[np.tril_indices_from(f_Q)].tolist()) 2823 2824 # Idiosyncratic AR(1) and variances 2825 if self.idiosyncratic_ar1: 2826 ix = s['idio_ar_L1'] 2827 2828 Ad = Eaa[:-1, ix, ix].sum(axis=0).diagonal() 2829 Bd = Eaa1[:, ix, ix].sum(axis=0).diagonal() 2830 Cd = Eaa[1:, ix, ix].sum(axis=0).diagonal() 2831 nobs = Eaa.shape[0] - 1 2832 2833 alpha = Bd / Ad 2834 sigma2 = (Cd - alpha * Bd) / nobs 2835 else: 2836 ix = s['idio_ar_L1'] 2837 C = Eaa[:, ix, ix].sum(axis=0) 2838 sigma2 = np.r_[H.diagonal()[self._o['M']], 2839 C.diagonal() / Eaa.shape[0]] 2840 2841 # Save parameters 2842 params1 = np.zeros_like(params0) 2843 loadings = [] 2844 for i in range(self.k_endog): 2845 iloc = self._s.endog_factor_iloc[i] 2846 factor_ix = s['factors_L1'][iloc] 2847 loadings += Lambda[i, factor_ix].tolist() 2848 params1[self._p['loadings']] = loadings 2849 params1[self._p['factor_ar']] = factor_ar 2850 params1[self._p['factor_cov']] = factor_cov 2851 if self.idiosyncratic_ar1: 2852 params1[self._p['idiosyncratic_ar1']] = alpha 2853 params1[self._p['idiosyncratic_var']] = sigma2 2854 2855 return params1 2856 2857 def _em_maximization_obs_nonmissing(self, res, Eaa, a, compute_H=False): 2858 """EM maximization step, observation equation without missing data.""" 2859 s = self._s 2860 dtype = Eaa.dtype 2861 2862 # Observation equation (non-missing) 2863 # Note: we only compute loadings for monthly variables because 2864 # quarterly variables will always have missing entries, so we would 2865 # never choose this method in that case 2866 k = s.k_states_factors 2867 Lambda = np.zeros((self.k_endog, k), dtype=dtype) 2868 for i in range(self.k_endog): 2869 y = self.endog[:, i:i + 1] 2870 iloc = self._s.endog_factor_iloc[i] 2871 factor_ix = s['factors_L1'][iloc] 2872 2873 ix = (np.s_[:],) + np.ix_(factor_ix, factor_ix) 2874 A = Eaa[ix].sum(axis=0) 2875 B = y.T @ a[:, factor_ix, 0] 2876 if self.idiosyncratic_ar1: 2877 ix1 = s.k_states_factors + i 2878 ix2 = ix1 + 1 2879 B -= Eaa[:, ix1:ix2, factor_ix].sum(axis=0) 2880 2881 # want: x = B A^{-1}, so solve: x A = B or solve: A' x' = B' 2882 try: 2883 Lambda[i, factor_ix] = cho_solve(cho_factor(A), B.T).T 2884 except LinAlgError: 2885 # Fall back to general solver if there are problems with 2886 # postive-definiteness 2887 Lambda[i, factor_ix] = np.linalg.solve(A, B.T).T 2888 2889 # Compute new obs cov 2890 # Note: this is unnecessary if `idiosyncratic_ar1=True`. 2891 # This is written in a slightly more general way than 2892 # Banbura and Modugno (2014), equation (7); see instead equation (13) 2893 # of Wu et al. (1996) 2894 # "An algorithm for estimating parameters of state-space models" 2895 if compute_H: 2896 Z = self['design'].copy() 2897 Z[:, :k] = Lambda 2898 BL = self.endog.T @ a[..., 0] @ Z.T 2899 C = self.endog.T @ self.endog 2900 2901 H = (C + -BL - BL.T + Z @ Eaa.sum(axis=0) @ Z.T) / self.nobs 2902 else: 2903 H = np.zeros((self.k_endog, self.k_endog), dtype=dtype) * np.nan 2904 2905 return Lambda, H 2906 2907 def _em_maximization_obs_missing(self, res, Eaa, a, compute_H=False): 2908 """EM maximization step, observation equation with missing data.""" 2909 s = self._s 2910 dtype = Eaa.dtype 2911 2912 # Observation equation (missing) 2913 k = s.k_states_factors 2914 Lambda = np.zeros((self.k_endog, k), dtype=dtype) 2915 2916 W = (1 - res.missing.T) 2917 mask = W.astype(bool) 2918 2919 # Compute design for monthly 2920 # Note: the relevant A changes for each i 2921 for i in range(self.k_endog_M): 2922 iloc = self._s.endog_factor_iloc[i] 2923 factor_ix = s['factors_L1'][iloc] 2924 2925 m = mask[:, i] 2926 yt = self.endog[m, i:i + 1] 2927 2928 ix = np.ix_(m, factor_ix, factor_ix) 2929 Ai = Eaa[ix].sum(axis=0) 2930 Bi = yt.T @ a[np.ix_(m, factor_ix)][..., 0] 2931 if self.idiosyncratic_ar1: 2932 ix1 = s.k_states_factors + i 2933 ix2 = ix1 + 1 2934 Bi -= Eaa[m, ix1:ix2][..., factor_ix].sum(axis=0) 2935 # want: x = B A^{-1}, so solve: x A = B or solve: A' x' = B' 2936 try: 2937 Lambda[i, factor_ix] = cho_solve(cho_factor(Ai), Bi.T).T 2938 except LinAlgError: 2939 # Fall back to general solver if there are problems with 2940 # postive-definiteness 2941 Lambda[i, factor_ix] = np.linalg.solve(Ai, Bi.T).T 2942 2943 # Compute unrestricted design for quarterly 2944 # See Banbura at al. (2011), where this is described in Appendix C, 2945 # between equations (13) and (14). 2946 if self.k_endog_Q > 0: 2947 # Note: the relevant A changes for each i 2948 multipliers = np.array([1, 2, 3, 2, 1])[:, None] 2949 for i in range(self.k_endog_M, self.k_endog): 2950 iloc = self._s.endog_factor_iloc[i] 2951 factor_ix = s['factors_L1_5_ix'][:, iloc].ravel().tolist() 2952 2953 R, _ = self.loading_constraints(i) 2954 2955 iQ = i - self.k_endog_M 2956 m = mask[:, i] 2957 yt = self.endog[m, i:i + 1] 2958 ix = np.ix_(m, factor_ix, factor_ix) 2959 Ai = Eaa[ix].sum(axis=0) 2960 BiQ = yt.T @ a[np.ix_(m, factor_ix)][..., 0] 2961 if self.idiosyncratic_ar1: 2962 ix = (np.s_[:],) + np.ix_(s['idio_ar_Q_ix'][iQ], factor_ix) 2963 Eepsf = Eaa[ix] 2964 BiQ -= (multipliers * Eepsf[m].sum(axis=0)).sum(axis=0) 2965 2966 # Note that there was a typo in Banbura et al. (2011) for 2967 # the formula applying the restrictions. In their notation, 2968 # they show (C D C')^{-1} while it should be (C D^{-1} C')^{-1} 2969 # Note: in reality, this is: 2970 # unrestricted - Aii @ R.T @ RARi @ (R @ unrestricted - q) 2971 # where the restrictions are defined as: R @ unrestricted = q 2972 # However, here q = 0, so we can simplify. 2973 try: 2974 L_and_lower = cho_factor(Ai) 2975 # x = BQ A^{-1}, or x A = BQ, so solve A' x' = (BQ)' 2976 unrestricted = cho_solve(L_and_lower, BiQ.T).T[0] 2977 AiiRT = cho_solve(L_and_lower, R.T) 2978 2979 L_and_lower = cho_factor(R @ AiiRT) 2980 RAiiRTiR = cho_solve(L_and_lower, R) 2981 restricted = unrestricted - AiiRT @ RAiiRTiR @ unrestricted 2982 except LinAlgError: 2983 # Fall back to slower method if there are problems with 2984 # postive-definiteness 2985 Aii = np.linalg.inv(Ai) 2986 unrestricted = (BiQ @ Aii)[0] 2987 RARi = np.linalg.inv(R @ Aii @ R.T) 2988 restricted = (unrestricted - 2989 Aii @ R.T @ RARi @ R @ unrestricted) 2990 Lambda[i, factor_ix] = restricted 2991 2992 # Compute new obs cov 2993 # Note: this is unnecessary if `idiosyncratic_ar1=True`. 2994 # See Banbura and Modugno (2014), equation (12) 2995 # This does not literally follow their formula, e.g. multiplying by the 2996 # W_t selection matrices, because those formulas require loops that are 2997 # relatively slow. The formulation here is vectorized. 2998 if compute_H: 2999 Z = self['design'].copy() 3000 Z[:, :Lambda.shape[1]] = Lambda 3001 3002 y = np.nan_to_num(self.endog) 3003 C = y.T @ y 3004 W = W[..., None] 3005 IW = 1 - W 3006 3007 WL = W * Z 3008 WLT = WL.transpose(0, 2, 1) 3009 BL = y[..., None] @ a.transpose(0, 2, 1) @ WLT 3010 A = Eaa 3011 3012 BLT = BL.transpose(0, 2, 1) 3013 IWT = IW.transpose(0, 2, 1) 3014 3015 H = (C + (-BL - BLT + WL @ A @ WLT + 3016 IW * self['obs_cov'] * IWT).sum(axis=0)) / self.nobs 3017 else: 3018 H = np.zeros((self.k_endog, self.k_endog), dtype=dtype) * np.nan 3019 3020 return Lambda, H 3021 3022 def smooth(self, params, transformed=True, includes_fixed=False, 3023 complex_step=False, cov_type='none', cov_kwds=None, 3024 return_ssm=False, results_class=None, 3025 results_wrapper_class=None, **kwargs): 3026 """ 3027 Kalman smoothing. 3028 3029 Parameters 3030 ---------- 3031 params : array_like 3032 Array of parameters at which to evaluate the loglikelihood 3033 function. 3034 transformed : bool, optional 3035 Whether or not `params` is already transformed. Default is True. 3036 return_ssm : bool,optional 3037 Whether or not to return only the state space output or a full 3038 results object. Default is to return a full results object. 3039 cov_type : str, optional 3040 See `MLEResults.fit` for a description of covariance matrix types 3041 for results object. Default is None. 3042 cov_kwds : dict or None, optional 3043 See `MLEResults.get_robustcov_results` for a description required 3044 keywords for alternative covariance estimators 3045 **kwargs 3046 Additional keyword arguments to pass to the Kalman filter. See 3047 `KalmanFilter.filter` for more details. 3048 """ 3049 return super().smooth( 3050 params, transformed=transformed, includes_fixed=includes_fixed, 3051 complex_step=complex_step, cov_type=cov_type, cov_kwds=cov_kwds, 3052 return_ssm=return_ssm, results_class=results_class, 3053 results_wrapper_class=results_wrapper_class, **kwargs) 3054 3055 def filter(self, params, transformed=True, includes_fixed=False, 3056 complex_step=False, cov_type='none', cov_kwds=None, 3057 return_ssm=False, results_class=None, 3058 results_wrapper_class=None, low_memory=False, **kwargs): 3059 """ 3060 Kalman filtering. 3061 3062 Parameters 3063 ---------- 3064 params : array_like 3065 Array of parameters at which to evaluate the loglikelihood 3066 function. 3067 transformed : bool, optional 3068 Whether or not `params` is already transformed. Default is True. 3069 return_ssm : bool,optional 3070 Whether or not to return only the state space output or a full 3071 results object. Default is to return a full results object. 3072 cov_type : str, optional 3073 See `MLEResults.fit` for a description of covariance matrix types 3074 for results object. Default is 'none'. 3075 cov_kwds : dict or None, optional 3076 See `MLEResults.get_robustcov_results` for a description required 3077 keywords for alternative covariance estimators 3078 low_memory : bool, optional 3079 If set to True, techniques are applied to substantially reduce 3080 memory usage. If used, some features of the results object will 3081 not be available (including in-sample prediction), although 3082 out-of-sample forecasting is possible. Default is False. 3083 **kwargs 3084 Additional keyword arguments to pass to the Kalman filter. See 3085 `KalmanFilter.filter` for more details. 3086 """ 3087 return super().filter( 3088 params, transformed=transformed, includes_fixed=includes_fixed, 3089 complex_step=complex_step, cov_type=cov_type, cov_kwds=cov_kwds, 3090 return_ssm=return_ssm, results_class=results_class, 3091 results_wrapper_class=results_wrapper_class, **kwargs) 3092 3093 def simulate(self, params, nsimulations, measurement_shocks=None, 3094 state_shocks=None, initial_state=None, anchor=None, 3095 repetitions=None, exog=None, extend_model=None, 3096 extend_kwargs=None, transformed=True, includes_fixed=False, 3097 original_scale=True, **kwargs): 3098 r""" 3099 Simulate a new time series following the state space model. 3100 3101 Parameters 3102 ---------- 3103 params : array_like 3104 Array of parameters to use in constructing the state space 3105 representation to use when simulating. 3106 nsimulations : int 3107 The number of observations to simulate. If the model is 3108 time-invariant this can be any number. If the model is 3109 time-varying, then this number must be less than or equal to the 3110 number of observations. 3111 measurement_shocks : array_like, optional 3112 If specified, these are the shocks to the measurement equation, 3113 :math:`\varepsilon_t`. If unspecified, these are automatically 3114 generated using a pseudo-random number generator. If specified, 3115 must be shaped `nsimulations` x `k_endog`, where `k_endog` is the 3116 same as in the state space model. 3117 state_shocks : array_like, optional 3118 If specified, these are the shocks to the state equation, 3119 :math:`\eta_t`. If unspecified, these are automatically 3120 generated using a pseudo-random number generator. If specified, 3121 must be shaped `nsimulations` x `k_posdef` where `k_posdef` is the 3122 same as in the state space model. 3123 initial_state : array_like, optional 3124 If specified, this is the initial state vector to use in 3125 simulation, which should be shaped (`k_states` x 1), where 3126 `k_states` is the same as in the state space model. If unspecified, 3127 but the model has been initialized, then that initialization is 3128 used. This must be specified if `anchor` is anything other than 3129 "start" or 0 (or else you can use the `simulate` method on a 3130 results object rather than on the model object). 3131 anchor : int, str, or datetime, optional 3132 First period for simulation. The simulation will be conditional on 3133 all existing datapoints prior to the `anchor`. Type depends on the 3134 index of the given `endog` in the model. Two special cases are the 3135 strings 'start' and 'end'. `start` refers to beginning the 3136 simulation at the first period of the sample, and `end` refers to 3137 beginning the simulation at the first period after the sample. 3138 Integer values can run from 0 to `nobs`, or can be negative to 3139 apply negative indexing. Finally, if a date/time index was provided 3140 to the model, then this argument can be a date string to parse or a 3141 datetime type. Default is 'start'. 3142 repetitions : int, optional 3143 Number of simulated paths to generate. Default is 1 simulated path. 3144 exog : array_like, optional 3145 New observations of exogenous regressors, if applicable. 3146 transformed : bool, optional 3147 Whether or not `params` is already transformed. Default is 3148 True. 3149 includes_fixed : bool, optional 3150 If parameters were previously fixed with the `fix_params` method, 3151 this argument describes whether or not `params` also includes 3152 the fixed parameters, in addition to the free parameters. Default 3153 is False. 3154 original_scale : bool, optional 3155 If the model specification standardized the data, whether or not 3156 to return simulations in the original scale of the data (i.e. 3157 before it was standardized by the model). Default is True. 3158 3159 Returns 3160 ------- 3161 simulated_obs : ndarray 3162 An array of simulated observations. If `repetitions=None`, then it 3163 will be shaped (nsimulations x k_endog) or (nsimulations,) if 3164 `k_endog=1`. Otherwise it will be shaped 3165 (nsimulations x k_endog x repetitions). If the model was given 3166 Pandas input then the output will be a Pandas object. If 3167 `k_endog > 1` and `repetitions` is not None, then the output will 3168 be a Pandas DataFrame that has a MultiIndex for the columns, with 3169 the first level containing the names of the `endog` variables and 3170 the second level containing the repetition number. 3171 """ 3172 # Get usual simulations (in the possibly-standardized scale) 3173 sim = super().simulate( 3174 params, nsimulations, measurement_shocks=measurement_shocks, 3175 state_shocks=state_shocks, initial_state=initial_state, 3176 anchor=anchor, repetitions=repetitions, exog=exog, 3177 extend_model=extend_model, extend_kwargs=extend_kwargs, 3178 transformed=transformed, includes_fixed=includes_fixed, **kwargs) 3179 3180 # If applicable, convert predictions back to original space 3181 if self.standardize and original_scale: 3182 use_pandas = isinstance(self.data, PandasData) 3183 shape = sim.shape 3184 3185 if use_pandas: 3186 # pd.Series (k_endog=1, replications=None) 3187 if len(shape) == 1: 3188 sim = sim * self._endog_std[0] + self._endog_mean[0] 3189 # pd.DataFrame (k_endog > 1, replications=None) 3190 # [or] 3191 # pd.DataFrame with MultiIndex (replications > 0) 3192 elif len(shape) == 2: 3193 sim = (sim.multiply(self._endog_std, axis=1, level=0) 3194 .add(self._endog_mean, axis=1, level=0)) 3195 else: 3196 # 1-dim array (k_endog=1, replications=None) 3197 if len(shape) == 1: 3198 sim = sim * self._endog_std + self._endog_mean 3199 # 2-dim array (k_endog > 1, replications=None) 3200 elif len(shape) == 2: 3201 sim = sim * self._endog_std + self._endog_mean 3202 # 3-dim array with MultiIndex (replications > 0) 3203 else: 3204 # Get arrays into the form that can be used for 3205 # broadcasting 3206 std = np.atleast_2d(self._endog_std)[..., None] 3207 mean = np.atleast_2d(self._endog_mean)[..., None] 3208 sim = sim * std + mean 3209 3210 return sim 3211 3212 def impulse_responses(self, params, steps=1, impulse=0, 3213 orthogonalized=False, cumulative=False, anchor=None, 3214 exog=None, extend_model=None, extend_kwargs=None, 3215 transformed=True, includes_fixed=False, 3216 original_scale=True, **kwargs): 3217 """ 3218 Impulse response function. 3219 3220 Parameters 3221 ---------- 3222 params : array_like 3223 Array of model parameters. 3224 steps : int, optional 3225 The number of steps for which impulse responses are calculated. 3226 Default is 1. Note that for time-invariant models, the initial 3227 impulse is not counted as a step, so if `steps=1`, the output will 3228 have 2 entries. 3229 impulse : int or array_like 3230 If an integer, the state innovation to pulse; must be between 0 3231 and `k_posdef-1`. Alternatively, a custom impulse vector may be 3232 provided; must be shaped `k_posdef x 1`. 3233 orthogonalized : bool, optional 3234 Whether or not to perform impulse using orthogonalized innovations. 3235 Note that this will also affect custum `impulse` vectors. Default 3236 is False. 3237 cumulative : bool, optional 3238 Whether or not to return cumulative impulse responses. Default is 3239 False. 3240 anchor : int, str, or datetime, optional 3241 Time point within the sample for the state innovation impulse. Type 3242 depends on the index of the given `endog` in the model. Two special 3243 cases are the strings 'start' and 'end', which refer to setting the 3244 impulse at the first and last points of the sample, respectively. 3245 Integer values can run from 0 to `nobs - 1`, or can be negative to 3246 apply negative indexing. Finally, if a date/time index was provided 3247 to the model, then this argument can be a date string to parse or a 3248 datetime type. Default is 'start'. 3249 exog : array_like, optional 3250 New observations of exogenous regressors for our-of-sample periods, 3251 if applicable. 3252 transformed : bool, optional 3253 Whether or not `params` is already transformed. Default is 3254 True. 3255 includes_fixed : bool, optional 3256 If parameters were previously fixed with the `fix_params` method, 3257 this argument describes whether or not `params` also includes 3258 the fixed parameters, in addition to the free parameters. Default 3259 is False. 3260 original_scale : bool, optional 3261 If the model specification standardized the data, whether or not 3262 to return impulse responses in the original scale of the data (i.e. 3263 before it was standardized by the model). Default is True. 3264 **kwargs 3265 If the model has time-varying design or transition matrices and the 3266 combination of `anchor` and `steps` implies creating impulse 3267 responses for the out-of-sample period, then these matrices must 3268 have updated values provided for the out-of-sample steps. For 3269 example, if `design` is a time-varying component, `nobs` is 10, 3270 `anchor=1`, and `steps` is 15, a (`k_endog` x `k_states` x 7) 3271 matrix must be provided with the new design matrix values. 3272 3273 Returns 3274 ------- 3275 impulse_responses : ndarray 3276 Responses for each endogenous variable due to the impulse 3277 given by the `impulse` argument. For a time-invariant model, the 3278 impulse responses are given for `steps + 1` elements (this gives 3279 the "initial impulse" followed by `steps` responses for the 3280 important cases of VAR and SARIMAX models), while for time-varying 3281 models the impulse responses are only given for `steps` elements 3282 (to avoid having to unexpectedly provide updated time-varying 3283 matrices). 3284 3285 """ 3286 # Get usual simulations (in the possibly-standardized scale) 3287 irfs = super().impulse_responses( 3288 params, steps=steps, impulse=impulse, 3289 orthogonalized=orthogonalized, cumulative=cumulative, 3290 anchor=anchor, exog=exog, extend_model=extend_model, 3291 extend_kwargs=extend_kwargs, transformed=transformed, 3292 includes_fixed=includes_fixed, original_scale=original_scale, 3293 **kwargs) 3294 3295 # If applicable, convert predictions back to original space 3296 if self.standardize and original_scale: 3297 use_pandas = isinstance(self.data, PandasData) 3298 shape = irfs.shape 3299 3300 if use_pandas: 3301 # pd.Series (k_endog=1, replications=None) 3302 if len(shape) == 1: 3303 irfs = irfs * self._endog_std[0] 3304 # pd.DataFrame (k_endog > 1) 3305 # [or] 3306 # pd.DataFrame with MultiIndex (replications > 0) 3307 elif len(shape) == 2: 3308 irfs = irfs.multiply(self._endog_std, axis=1, level=0) 3309 else: 3310 # 1-dim array (k_endog=1) 3311 if len(shape) == 1: 3312 irfs = irfs * self._endog_std 3313 # 2-dim array (k_endog > 1) 3314 elif len(shape) == 2: 3315 irfs = irfs * self._endog_std 3316 3317 return irfs 3318 3319 3320class DynamicFactorMQResults(mlemodel.MLEResults): 3321 """ 3322 Results from fitting a dynamic factor model 3323 """ 3324 def __init__(self, model, params, filter_results, cov_type=None, **kwargs): 3325 super(DynamicFactorMQResults, self).__init__( 3326 model, params, filter_results, cov_type, **kwargs) 3327 3328 @property 3329 def factors(self): 3330 """ 3331 Estimates of unobserved factors. 3332 3333 Returns 3334 ------- 3335 out : Bunch 3336 Has the following attributes shown in Notes. 3337 3338 Notes 3339 ----- 3340 The output is a bunch of the following format: 3341 3342 - `filtered`: a time series array with the filtered estimate of 3343 the component 3344 - `filtered_cov`: a time series array with the filtered estimate of 3345 the variance/covariance of the component 3346 - `smoothed`: a time series array with the smoothed estimate of 3347 the component 3348 - `smoothed_cov`: a time series array with the smoothed estimate of 3349 the variance/covariance of the component 3350 - `offset`: an integer giving the offset in the state vector where 3351 this component begins 3352 """ 3353 out = None 3354 if self.model.k_factors > 0: 3355 iloc = self.model._s.factors_L1 3356 ix = np.array(self.model.state_names)[iloc].tolist() 3357 out = Bunch( 3358 filtered=self.states.filtered.loc[:, ix], 3359 filtered_cov=self.states.filtered_cov.loc[np.s_[ix, :], ix], 3360 smoothed=None, smoothed_cov=None) 3361 if self.smoothed_state is not None: 3362 out.smoothed = self.states.smoothed.loc[:, ix] 3363 if self.smoothed_state_cov is not None: 3364 out.smoothed_cov = ( 3365 self.states.smoothed_cov.loc[np.s_[ix, :], ix]) 3366 return out 3367 3368 def get_coefficients_of_determination(self, method='individual', 3369 which=None): 3370 """ 3371 Get coefficients of determination (R-squared) for variables / factors. 3372 3373 Parameters 3374 ---------- 3375 method : {'individual', 'joint', 'cumulative'}, optional 3376 The type of R-squared values to generate. "individual" plots 3377 the R-squared of each variable on each factor; "joint" plots the 3378 R-squared of each variable on each factor that it loads on; 3379 "cumulative" plots the successive R-squared values as each 3380 additional factor is added to the regression, for each variable. 3381 Default is 'individual'. 3382 which: {None, 'filtered', 'smoothed'}, optional 3383 Whether to compute R-squared values based on filtered or smoothed 3384 estimates of the factors. Default is 'smoothed' if smoothed results 3385 are available and 'filtered' otherwise. 3386 3387 Returns 3388 ------- 3389 rsquared : pd.DataFrame or pd.Series 3390 The R-squared values from regressions of observed variables on 3391 one or more of the factors. If method='individual' or 3392 method='cumulative', this will be a Pandas DataFrame with observed 3393 variables as the index and factors as the columns . If 3394 method='joint', will be a Pandas Series with observed variables as 3395 the index. 3396 3397 See Also 3398 -------- 3399 plot_coefficients_of_determination 3400 coefficients_of_determination 3401 """ 3402 from statsmodels.tools import add_constant 3403 3404 method = string_like(method, 'method', options=['individual', 'joint', 3405 'cumulative']) 3406 if which is None: 3407 which = 'filtered' if self.smoothed_state is None else 'smoothed' 3408 3409 k_endog = self.model.k_endog 3410 k_factors = self.model.k_factors 3411 ef_map = self.model._s.endog_factor_map 3412 endog_names = self.model.endog_names 3413 factor_names = self.model.factor_names 3414 3415 if method == 'individual': 3416 coefficients = np.zeros((k_endog, k_factors)) 3417 3418 for i in range(k_factors): 3419 exog = add_constant(self.factors[which].iloc[:, i]) 3420 for j in range(k_endog): 3421 if ef_map.iloc[j, i]: 3422 endog = self.filter_results.endog[j] 3423 coefficients[j, i] = ( 3424 OLS(endog, exog, missing='drop').fit().rsquared) 3425 else: 3426 coefficients[j, i] = np.nan 3427 3428 coefficients = pd.DataFrame(coefficients, index=endog_names, 3429 columns=factor_names) 3430 elif method == 'joint': 3431 coefficients = np.zeros((k_endog,)) 3432 exog = add_constant(self.factors[which]) 3433 for j in range(k_endog): 3434 endog = self.filter_results.endog[j] 3435 ix = np.r_[True, ef_map.iloc[j]].tolist() 3436 X = exog.loc[:, ix] 3437 coefficients[j] = ( 3438 OLS(endog, X, missing='drop').fit().rsquared) 3439 coefficients = pd.Series(coefficients, index=endog_names) 3440 elif method == 'cumulative': 3441 coefficients = np.zeros((k_endog, k_factors)) 3442 exog = add_constant(self.factors[which]) 3443 for j in range(k_endog): 3444 endog = self.filter_results.endog[j] 3445 3446 for i in range(k_factors): 3447 if self.model._s.endog_factor_map.iloc[j, i]: 3448 ix = np.r_[True, ef_map.iloc[j, :i + 1], 3449 [False] * (k_factors - i - 1)] 3450 X = exog.loc[:, ix.astype(bool).tolist()] 3451 coefficients[j, i] = ( 3452 OLS(endog, X, missing='drop').fit().rsquared) 3453 else: 3454 coefficients[j, i] = np.nan 3455 coefficients = pd.DataFrame(coefficients, index=endog_names, 3456 columns=factor_names) 3457 3458 return coefficients 3459 3460 @cache_readonly 3461 def coefficients_of_determination(self): 3462 """ 3463 Individual coefficients of determination (:math:`R^2`). 3464 3465 Coefficients of determination (:math:`R^2`) from regressions of 3466 endogenous variables on individual estimated factors. 3467 3468 Returns 3469 ------- 3470 coefficients_of_determination : ndarray 3471 A `k_endog` x `k_factors` array, where 3472 `coefficients_of_determination[i, j]` represents the :math:`R^2` 3473 value from a regression of factor `j` and a constant on endogenous 3474 variable `i`. 3475 3476 Notes 3477 ----- 3478 Although it can be difficult to interpret the estimated factor loadings 3479 and factors, it is often helpful to use the coefficients of 3480 determination from univariate regressions to assess the importance of 3481 each factor in explaining the variation in each endogenous variable. 3482 3483 In models with many variables and factors, this can sometimes lend 3484 interpretation to the factors (for example sometimes one factor will 3485 load primarily on real variables and another on nominal variables). 3486 3487 See Also 3488 -------- 3489 get_coefficients_of_determination 3490 plot_coefficients_of_determination 3491 """ 3492 return self.get_coefficients_of_determination(method='individual') 3493 3494 def plot_coefficients_of_determination(self, method='individual', 3495 which=None, endog_labels=None, 3496 fig=None, figsize=None): 3497 """ 3498 Plot coefficients of determination (R-squared) for variables / factors. 3499 3500 Parameters 3501 ---------- 3502 method : {'individual', 'joint', 'cumulative'}, optional 3503 The type of R-squared values to generate. "individual" plots 3504 the R-squared of each variable on each factor; "joint" plots the 3505 R-squared of each variable on each factor that it loads on; 3506 "cumulative" plots the successive R-squared values as each 3507 additional factor is added to the regression, for each variable. 3508 Default is 'individual'. 3509 which: {None, 'filtered', 'smoothed'}, optional 3510 Whether to compute R-squared values based on filtered or smoothed 3511 estimates of the factors. Default is 'smoothed' if smoothed results 3512 are available and 'filtered' otherwise. 3513 endog_labels : bool, optional 3514 Whether or not to label the endogenous variables along the x-axis 3515 of the plots. Default is to include labels if there are 5 or fewer 3516 endogenous variables. 3517 fig : Figure, optional 3518 If given, subplots are created in this figure instead of in a new 3519 figure. Note that the grid will be created in the provided 3520 figure using `fig.add_subplot()`. 3521 figsize : tuple, optional 3522 If a figure is created, this argument allows specifying a size. 3523 The tuple is (width, height). 3524 3525 Notes 3526 ----- 3527 The endogenous variables are arranged along the x-axis according to 3528 their position in the model's `endog` array. 3529 3530 See Also 3531 -------- 3532 get_coefficients_of_determination 3533 """ 3534 from statsmodels.graphics.utils import _import_mpl, create_mpl_fig 3535 _import_mpl() 3536 fig = create_mpl_fig(fig, figsize) 3537 3538 method = string_like(method, 'method', options=['individual', 'joint', 3539 'cumulative']) 3540 3541 # Should we label endogenous variables? 3542 if endog_labels is None: 3543 endog_labels = self.model.k_endog <= 5 3544 3545 # Plot the coefficients of determination 3546 rsquared = self.get_coefficients_of_determination(method=method, 3547 which=which) 3548 3549 if method in ['individual', 'cumulative']: 3550 plot_idx = 1 3551 for factor_name, coeffs in rsquared.T.iterrows(): 3552 # Create the new axis 3553 ax = fig.add_subplot(self.model.k_factors, 1, plot_idx) 3554 ax.set_ylim((0, 1)) 3555 ax.set(title=f'{factor_name}', ylabel=r'$R^2$') 3556 3557 coeffs.plot(ax=ax, kind='bar') 3558 if plot_idx < len(rsquared.columns) or not endog_labels: 3559 ax.xaxis.set_ticklabels([]) 3560 3561 plot_idx += 1 3562 elif method == 'joint': 3563 ax = fig.add_subplot(1, 1, 1) 3564 ax.set_ylim((0, 1)) 3565 ax.set(title=r'$R^2$ - regression on all loaded factors', 3566 ylabel=r'$R^2$') 3567 rsquared.plot(ax=ax, kind='bar') 3568 if not endog_labels: 3569 ax.xaxis.set_ticklabels([]) 3570 3571 return fig 3572 3573 def get_prediction(self, start=None, end=None, dynamic=False, 3574 index=None, exog=None, extend_model=None, 3575 extend_kwargs=None, original_scale=True, **kwargs): 3576 """ 3577 In-sample prediction and out-of-sample forecasting. 3578 3579 Parameters 3580 ---------- 3581 start : int, str, or datetime, optional 3582 Zero-indexed observation number at which to start forecasting, 3583 i.e., the first forecast is start. Can also be a date string to 3584 parse or a datetime type. Default is the the zeroth observation. 3585 end : int, str, or datetime, optional 3586 Zero-indexed observation number at which to end forecasting, i.e., 3587 the last forecast is end. Can also be a date string to 3588 parse or a datetime type. However, if the dates index does not 3589 have a fixed frequency, end must be an integer index if you 3590 want out of sample prediction. Default is the last observation in 3591 the sample. 3592 dynamic : bool, int, str, or datetime, optional 3593 Integer offset relative to `start` at which to begin dynamic 3594 prediction. Can also be an absolute date string to parse or a 3595 datetime type (these are not interpreted as offsets). 3596 Prior to this observation, true endogenous values will be used for 3597 prediction; starting with this observation and continuing through 3598 the end of prediction, forecasted endogenous values will be used 3599 instead. 3600 original_scale : bool, optional 3601 If the model specification standardized the data, whether or not 3602 to return predictions in the original scale of the data (i.e. 3603 before it was standardized by the model). Default is True. 3604 **kwargs 3605 Additional arguments may required for forecasting beyond the end 3606 of the sample. See `FilterResults.predict` for more details. 3607 3608 Returns 3609 ------- 3610 forecast : ndarray 3611 Array of out of in-sample predictions and / or out-of-sample 3612 forecasts. An (npredict x k_endog) array. 3613 """ 3614 # Get usual predictions (in the possibly-standardized scale) 3615 res = super().get_prediction(start=start, end=end, dynamic=dynamic, 3616 index=index, exog=exog, 3617 extend_model=extend_model, 3618 extend_kwargs=extend_kwargs, **kwargs) 3619 3620 # If applicable, convert predictions back to original space 3621 if self.model.standardize and original_scale: 3622 prediction_results = res.prediction_results 3623 k_endog, nobs = prediction_results.endog.shape 3624 3625 mean = np.array(self.model._endog_mean) 3626 std = np.array(self.model._endog_std) 3627 3628 if self.model.k_endog > 1: 3629 mean = mean[None, :] 3630 std = std[None, :] 3631 3632 if not prediction_results.results.memory_no_forecast_mean: 3633 res._results._predicted_mean = ( 3634 res._results._predicted_mean * std + mean) 3635 3636 if not prediction_results.results.memory_no_forecast_cov: 3637 if k_endog == 1: 3638 res._results._var_pred_mean *= std**2 3639 else: 3640 res._results._var_pred_mean = ( 3641 std * res._results._var_pred_mean * std.T) 3642 3643 return res 3644 3645 def news(self, comparison, impact_date=None, impacted_variable=None, 3646 start=None, end=None, periods=None, exog=None, 3647 comparison_type=None, return_raw=False, tolerance=1e-10, 3648 endog_quarterly=None, original_scale=True, **kwargs): 3649 """ 3650 Compute impacts from updated data (news and revisions). 3651 3652 Parameters 3653 ---------- 3654 comparison : array_like or MLEResults 3655 An updated dataset with updated and/or revised data from which the 3656 news can be computed, or an updated or previous results object 3657 to use in computing the news. 3658 impact_date : int, str, or datetime, optional 3659 A single specific period of impacts from news and revisions to 3660 compute. Can also be a date string to parse or a datetime type. 3661 This argument cannot be used in combination with `start`, `end`, or 3662 `periods`. Default is the first out-of-sample observation. 3663 impacted_variable : str, list, array, or slice, optional 3664 Observation variable label or slice of labels specifying that only 3665 specific impacted variables should be shown in the News output. The 3666 impacted variable(s) describe the variables that were *affected* by 3667 the news. If you do not know the labels for the variables, check 3668 the `endog_names` attribute of the model instance. 3669 start : int, str, or datetime, optional 3670 The first period of impacts from news and revisions to compute. 3671 Can also be a date string to parse or a datetime type. Default is 3672 the first out-of-sample observation. 3673 end : int, str, or datetime, optional 3674 The last period of impacts from news and revisions to compute. 3675 Can also be a date string to parse or a datetime type. Default is 3676 the first out-of-sample observation. 3677 periods : int, optional 3678 The number of periods of impacts from news and revisions to 3679 compute. 3680 exog : array_like, optional 3681 Array of exogenous regressors for the out-of-sample period, if 3682 applicable. 3683 comparison_type : {None, 'previous', 'updated'} 3684 This denotes whether the `comparison` argument represents a 3685 *previous* results object or dataset or an *updated* results object 3686 or dataset. If not specified, then an attempt is made to determine 3687 the comparison type. 3688 return_raw : bool, optional 3689 Whether or not to return only the specific output or a full 3690 results object. Default is to return a full results object. 3691 tolerance : float, optional 3692 The numerical threshold for determining zero impact. Default is 3693 that any impact less than 1e-10 is assumed to be zero. 3694 endog_quarterly : array_like, optional 3695 New observations of quarterly variables, if `comparison` was 3696 provided as an updated monthly dataset. If this argument is 3697 provided, it must be a Pandas Series or DataFrame with a 3698 DatetimeIndex or PeriodIndex at the quarterly frequency. 3699 3700 References 3701 ---------- 3702 .. [1] Bańbura, Marta, and Michele Modugno. 3703 "Maximum likelihood estimation of factor models on datasets with 3704 arbitrary pattern of missing data." 3705 Journal of Applied Econometrics 29, no. 1 (2014): 133-160. 3706 .. [2] Bańbura, Marta, Domenico Giannone, and Lucrezia Reichlin. 3707 "Nowcasting." 3708 The Oxford Handbook of Economic Forecasting. July 8, 2011. 3709 .. [3] Bańbura, Marta, Domenico Giannone, Michele Modugno, and Lucrezia 3710 Reichlin. 3711 "Now-casting and the real-time data flow." 3712 In Handbook of economic forecasting, vol. 2, pp. 195-237. 3713 Elsevier, 2013. 3714 """ 3715 news_results = super().news( 3716 comparison, impact_date=impact_date, 3717 impacted_variable=impacted_variable, start=start, end=end, 3718 periods=periods, exog=exog, comparison_type=comparison_type, 3719 return_raw=return_raw, tolerance=tolerance, 3720 endog_quarterly=endog_quarterly, **kwargs) 3721 3722 # If we have standardized the data, we may want to report the news in 3723 # the original scale. If so, we need to modify the data to "undo" the 3724 # standardization. 3725 if not return_raw and self.model.standardize and original_scale: 3726 endog_mean = self.model._endog_mean 3727 endog_std = self.model._endog_std 3728 3729 # Don't need to add in the mean for the impacts, since they are 3730 # the difference of two forecasts 3731 news_results.total_impacts = ( 3732 news_results.total_impacts * endog_std) 3733 news_results.update_impacts = ( 3734 news_results.update_impacts * endog_std) 3735 if news_results.revision_impacts is not None: 3736 news_results.revision_impacts = ( 3737 news_results.revision_impacts * endog_std) 3738 3739 # Update forecasts 3740 for name in ['prev_impacted_forecasts', 'news', 'update_realized', 3741 'update_forecasts', 'post_impacted_forecasts']: 3742 dta = getattr(news_results, name) 3743 3744 # for pd.Series, dta.multiply(...) removes the name attribute; 3745 # save it now so that we can add it back in 3746 orig_name = None 3747 if hasattr(dta, 'name'): 3748 orig_name = dta.name 3749 3750 dta = dta.multiply(endog_std, level=1) 3751 3752 # add back in the name attribute if it was removed 3753 if orig_name is not None: 3754 dta.name = orig_name 3755 3756 if name != 'news': 3757 dta = dta.add(endog_mean, level=1) 3758 setattr(news_results, name, dta) 3759 3760 # For the weights: rows correspond to update (date, variable) and 3761 # columns correspond to the impacted variable. 3762 # 1. Because we have modified the updates (realized, forecasts, and 3763 # forecast errors) to be in the scale of the original updated 3764 # variable, we need to essentially reverse that change for each 3765 # row of the weights by dividing by the standard deviation of 3766 # that row's updated variable 3767 # 2. Because we want the impacts to be in the scale of the original 3768 # impacted variable, we need to multiply each column by the 3769 # standard deviation of that column's impacted variable 3770 news_results.weights = ( 3771 news_results.weights.divide(endog_std, axis=0, level=1) 3772 .multiply(endog_std, axis=1, level=1)) 3773 3774 return news_results 3775 3776 def append(self, endog, endog_quarterly=None, refit=False, fit_kwargs=None, 3777 copy_initialization=True, retain_standardization=True, 3778 **kwargs): 3779 """ 3780 Recreate the results object with new data appended to original data. 3781 3782 Creates a new result object applied to a dataset that is created by 3783 appending new data to the end of the model's original data. The new 3784 results can then be used for analysis or forecasting. 3785 3786 Parameters 3787 ---------- 3788 endog : array_like 3789 New observations from the modeled time-series process. 3790 endog_quarterly : array_like, optional 3791 New observations of quarterly variables. If provided, must be a 3792 Pandas Series or DataFrame with a DatetimeIndex or PeriodIndex at 3793 the quarterly frequency. 3794 refit : bool, optional 3795 Whether to re-fit the parameters, based on the combined dataset. 3796 Default is False (so parameters from the current results object 3797 are used to create the new results object). 3798 fit_kwargs : dict, optional 3799 Keyword arguments to pass to `fit` (if `refit=True`) or `filter` / 3800 `smooth`. 3801 copy_initialization : bool, optional 3802 Whether or not to copy the initialization from the current results 3803 set to the new model. Default is True. 3804 retain_standardization : bool, optional 3805 Whether or not to use the mean and standard deviations that were 3806 used to standardize the data in the current model in the new model. 3807 Default is True. 3808 **kwargs 3809 Keyword arguments may be used to modify model specification 3810 arguments when created the new model object. 3811 3812 Returns 3813 ------- 3814 results 3815 Updated Results object, that includes results from both the 3816 original dataset and the new dataset. 3817 3818 Notes 3819 ----- 3820 The `endog` and `exog` arguments to this method must be formatted in 3821 the same way (e.g. Pandas Series versus Numpy array) as were the 3822 `endog` and `exog` arrays passed to the original model. 3823 3824 The `endog` (and, if applicable, `endog_quarterly`) arguments to this 3825 method should consist of new observations that occurred directly after 3826 the last element of `endog`. For any other kind of dataset, see the 3827 `apply` method. 3828 3829 This method will apply filtering to all of the original data as well 3830 as to the new data. To apply filtering only to the new data (which 3831 can be much faster if the original dataset is large), see the `extend` 3832 method. 3833 3834 See Also 3835 -------- 3836 extend 3837 apply 3838 """ 3839 # Construct the combined dataset, if necessary 3840 endog, k_endog_monthly = DynamicFactorMQ.construct_endog( 3841 endog, endog_quarterly) 3842 3843 # Check for compatible dimensions 3844 k_endog = endog.shape[1] if len(endog.shape) == 2 else 1 3845 if (k_endog_monthly != self.model.k_endog_M or 3846 k_endog != self.model.k_endog): 3847 raise ValueError('Cannot append data of a different dimension to' 3848 ' a model.') 3849 3850 kwargs['k_endog_monthly'] = k_endog_monthly 3851 3852 return super().append( 3853 endog, refit=refit, fit_kwargs=fit_kwargs, 3854 copy_initialization=copy_initialization, 3855 retain_standardization=retain_standardization, **kwargs) 3856 3857 def extend(self, endog, endog_quarterly=None, fit_kwargs=None, 3858 retain_standardization=True, **kwargs): 3859 """ 3860 Recreate the results object for new data that extends original data. 3861 3862 Creates a new result object applied to a new dataset that is assumed to 3863 follow directly from the end of the model's original data. The new 3864 results can then be used for analysis or forecasting. 3865 3866 Parameters 3867 ---------- 3868 endog : array_like 3869 New observations from the modeled time-series process. 3870 endog_quarterly : array_like, optional 3871 New observations of quarterly variables. If provided, must be a 3872 Pandas Series or DataFrame with a DatetimeIndex or PeriodIndex at 3873 the quarterly frequency. 3874 fit_kwargs : dict, optional 3875 Keyword arguments to pass to `filter` or `smooth`. 3876 retain_standardization : bool, optional 3877 Whether or not to use the mean and standard deviations that were 3878 used to standardize the data in the current model in the new model. 3879 Default is True. 3880 **kwargs 3881 Keyword arguments may be used to modify model specification 3882 arguments when created the new model object. 3883 3884 Returns 3885 ------- 3886 results 3887 Updated Results object, that includes results only for the new 3888 dataset. 3889 3890 See Also 3891 -------- 3892 append 3893 apply 3894 3895 Notes 3896 ----- 3897 The `endog` argument to this method should consist of new observations 3898 that occurred directly after the last element of the model's original 3899 `endog` array. For any other kind of dataset, see the `apply` method. 3900 3901 This method will apply filtering only to the new data provided by the 3902 `endog` argument, which can be much faster than re-filtering the entire 3903 dataset. However, the returned results object will only have results 3904 for the new data. To retrieve results for both the new data and the 3905 original data, see the `append` method. 3906 """ 3907 # Construct the combined dataset, if necessary 3908 endog, k_endog_monthly = DynamicFactorMQ.construct_endog( 3909 endog, endog_quarterly) 3910 3911 # Check for compatible dimensions 3912 k_endog = endog.shape[1] if len(endog.shape) == 2 else 1 3913 if (k_endog_monthly != self.model.k_endog_M or 3914 k_endog != self.model.k_endog): 3915 raise ValueError('Cannot append data of a different dimension to' 3916 ' a model.') 3917 3918 kwargs['k_endog_monthly'] = k_endog_monthly 3919 return super().extend( 3920 endog, fit_kwargs=fit_kwargs, 3921 retain_standardization=retain_standardization, **kwargs) 3922 3923 def apply(self, endog, k_endog_monthly=None, endog_quarterly=None, 3924 refit=False, fit_kwargs=None, copy_initialization=False, 3925 retain_standardization=True, **kwargs): 3926 """ 3927 Apply the fitted parameters to new data unrelated to the original data. 3928 3929 Creates a new result object using the current fitted parameters, 3930 applied to a completely new dataset that is assumed to be unrelated to 3931 the model's original data. The new results can then be used for 3932 analysis or forecasting. 3933 3934 Parameters 3935 ---------- 3936 endog : array_like 3937 New observations from the modeled time-series process. 3938 k_endog_monthly : int, optional 3939 If specifying a monthly/quarterly mixed frequency model in which 3940 the provided `endog` dataset contains both the monthly and 3941 quarterly data, this variable should be used to indicate how many 3942 of the variables are monthly. 3943 endog_quarterly : array_like, optional 3944 New observations of quarterly variables. If provided, must be a 3945 Pandas Series or DataFrame with a DatetimeIndex or PeriodIndex at 3946 the quarterly frequency. 3947 refit : bool, optional 3948 Whether to re-fit the parameters, using the new dataset. 3949 Default is False (so parameters from the current results object 3950 are used to create the new results object). 3951 fit_kwargs : dict, optional 3952 Keyword arguments to pass to `fit` (if `refit=True`) or `filter` / 3953 `smooth`. 3954 copy_initialization : bool, optional 3955 Whether or not to copy the initialization from the current results 3956 set to the new model. Default is False. 3957 retain_standardization : bool, optional 3958 Whether or not to use the mean and standard deviations that were 3959 used to standardize the data in the current model in the new model. 3960 Default is True. 3961 **kwargs 3962 Keyword arguments may be used to modify model specification 3963 arguments when created the new model object. 3964 3965 Returns 3966 ------- 3967 results 3968 Updated Results object, that includes results only for the new 3969 dataset. 3970 3971 See Also 3972 -------- 3973 statsmodels.tsa.statespace.mlemodel.MLEResults.append 3974 statsmodels.tsa.statespace.mlemodel.MLEResults.apply 3975 3976 Notes 3977 ----- 3978 The `endog` argument to this method should consist of new observations 3979 that are not necessarily related to the original model's `endog` 3980 dataset. For observations that continue that original dataset by follow 3981 directly after its last element, see the `append` and `extend` methods. 3982 """ 3983 mod = self.model.clone(endog, k_endog_monthly=k_endog_monthly, 3984 endog_quarterly=endog_quarterly, 3985 retain_standardization=retain_standardization, 3986 **kwargs) 3987 if copy_initialization: 3988 res = self.filter_results 3989 init = initialization.Initialization( 3990 self.model.k_states, 'known', constant=res.initial_state, 3991 stationary_cov=res.initial_state_cov) 3992 mod.ssm.initialization = init 3993 3994 res = self._apply(mod, refit=refit, fit_kwargs=fit_kwargs, **kwargs) 3995 3996 return res 3997 3998 def summary(self, alpha=.05, start=None, title=None, model_name=None, 3999 display_params=True, display_diagnostics=False, 4000 display_params_as_list=False, truncate_endog_names=None, 4001 display_max_endog=3): 4002 """ 4003 Summarize the Model. 4004 4005 Parameters 4006 ---------- 4007 alpha : float, optional 4008 Significance level for the confidence intervals. Default is 0.05. 4009 start : int, optional 4010 Integer of the start observation. Default is 0. 4011 title : str, optional 4012 The title used for the summary table. 4013 model_name : str, optional 4014 The name of the model used. Default is to use model class name. 4015 4016 Returns 4017 ------- 4018 summary : Summary instance 4019 This holds the summary table and text, which can be printed or 4020 converted to various output formats. 4021 4022 See Also 4023 -------- 4024 statsmodels.iolib.summary.Summary 4025 """ 4026 mod = self.model 4027 4028 # Default title / model name 4029 if title is None: 4030 title = 'Dynamic Factor Results' 4031 if model_name is None: 4032 model_name = self.model._model_name 4033 4034 # Get endog names 4035 endog_names = self.model._get_endog_names( 4036 truncate=truncate_endog_names) 4037 4038 # Get extra elements for top summary table 4039 extra_top_left = None 4040 extra_top_right = [] 4041 mle_retvals = getattr(self, 'mle_retvals', None) 4042 mle_settings = getattr(self, 'mle_settings', None) 4043 if mle_settings is not None and mle_settings.method == 'em': 4044 extra_top_right += [('EM Iterations', [f'{mle_retvals.iter}'])] 4045 4046 # Get the basic summary tables 4047 summary = super().summary( 4048 alpha=alpha, start=start, title=title, model_name=model_name, 4049 display_params=(display_params and display_params_as_list), 4050 display_diagnostics=display_diagnostics, 4051 truncate_endog_names=truncate_endog_names, 4052 display_max_endog=display_max_endog, 4053 extra_top_left=extra_top_left, extra_top_right=extra_top_right) 4054 4055 # Get tables of parameters 4056 table_ix = 1 4057 if not display_params_as_list: 4058 4059 # Observation equation table 4060 data = pd.DataFrame( 4061 self.filter_results.design[:, mod._s['factors_L1'], 0], 4062 index=endog_names, columns=mod.factor_names) 4063 data = data.applymap(lambda s: '%.2f' % s) 4064 4065 # Idiosyncratic terms 4066 # data[' '] = ' ' 4067 k_idio = 1 4068 if mod.idiosyncratic_ar1: 4069 data[' idiosyncratic: AR(1)'] = ( 4070 self.params[mod._p['idiosyncratic_ar1']]) 4071 k_idio += 1 4072 data['var.'] = self.params[mod._p['idiosyncratic_var']] 4073 data.iloc[:, -k_idio:] = data.iloc[:, -k_idio:].applymap( 4074 lambda s: '%.2f' % s) 4075 4076 data.index.name = 'Factor loadings:' 4077 4078 # Clear entries for non-loading factors 4079 base_iloc = np.arange(mod.k_factors) 4080 for i in range(mod.k_endog): 4081 iloc = [j for j in base_iloc 4082 if j not in mod._s.endog_factor_iloc[i]] 4083 data.iloc[i, iloc] = '.' 4084 4085 data = data.reset_index() 4086 4087 # Build the table 4088 params_data = data.values 4089 params_header = data.columns.tolist() 4090 params_stubs = None 4091 4092 title = 'Observation equation:' 4093 table = SimpleTable( 4094 params_data, params_header, params_stubs, 4095 txt_fmt=fmt_params, title=title) 4096 summary.tables.insert(table_ix, table) 4097 table_ix += 1 4098 4099 # Factor transitions 4100 ix1 = 0 4101 ix2 = 0 4102 for i in range(len(mod._s.factor_blocks)): 4103 block = mod._s.factor_blocks[i] 4104 ix2 += block.k_factors 4105 4106 T = self.filter_results.transition 4107 lag_names = [] 4108 for j in range(block.factor_order): 4109 lag_names += [f'L{j + 1}.{name}' 4110 for name in block.factor_names] 4111 data = pd.DataFrame(T[block.factors_L1, block.factors_ar, 0], 4112 index=block.factor_names, 4113 columns=lag_names) 4114 data.index.name = '' 4115 data = data.applymap(lambda s: '%.2f' % s) 4116 4117 Q = self.filter_results.state_cov 4118 # data[' '] = '' 4119 if block.k_factors == 1: 4120 data[' error variance'] = Q[ix1, ix1] 4121 else: 4122 data[' error covariance'] = block.factor_names 4123 for j in range(block.k_factors): 4124 data[block.factor_names[j]] = Q[ix1:ix2, ix1 + j] 4125 data.iloc[:, -block.k_factors:] = ( 4126 data.iloc[:, -block.k_factors:].applymap( 4127 lambda s: '%.2f' % s)) 4128 4129 data = data.reset_index() 4130 4131 params_data = data.values 4132 params_header = data.columns.tolist() 4133 params_stubs = None 4134 4135 title = f'Transition: Factor block {i}' 4136 table = SimpleTable( 4137 params_data, params_header, params_stubs, 4138 txt_fmt=fmt_params, title=title) 4139 summary.tables.insert(table_ix, table) 4140 table_ix += 1 4141 4142 ix1 = ix2 4143 4144 return summary 4145