#!/usr/local/bin/python3.8
# coding: utf-8

# DO NOT EDIT
# Autogenerated from the notebook statespace_sarimax_pymc3.ipynb.
# Edit the notebook and then sync the output with this file.
#
# flake8: noqa
# DO NOT EDIT

# # Fast Bayesian estimation of SARIMAX models

# ## Introduction
#
# This notebook will show how to use fast Bayesian methods to estimate
# SARIMAX (Seasonal AutoRegressive Integrated Moving Average with eXogenous
# regressors) models. These methods can also be parallelized across multiple
# cores.
#
# Here, fast methods means a version of Hamiltonian Monte Carlo called the
# No-U-Turn Sampler (NUTS) developed by Hoffman and Gelman: see [Hoffman,
# M. D., & Gelman, A. (2014). The No-U-Turn sampler: adaptively setting path
# lengths in Hamiltonian Monte Carlo. Journal of Machine Learning Research,
# 15(1), 1593-1623.](https://arxiv.org/abs/1111.4246). As they say, "the
# cost of HMC per independent sample from a target distribution of dimension
# $D$ is roughly $\mathcal{O}(D^{5/4})$, which stands in sharp contrast with
# the $\mathcal{O}(D^{2})$ cost of random-walk Metropolis". So for problems
# of larger dimension, the time saving with HMC is significant. However, it
# does require the gradient, or Jacobian, of the model to be provided.
#
# This notebook will combine the Python libraries
# [statsmodels](https://www.statsmodels.org/stable/index.html), which does
# econometrics, and [PyMC3](https://docs.pymc.io/), which is for Bayesian
# estimation, to perform fast Bayesian estimation of a simple SARIMAX model,
# in this case an ARMA(1, 1) model for US CPI.
#
# Note that, for simple models like AR(p), base PyMC3 is a quicker way to
# fit a model; there's an [example
# here](https://docs.pymc.io/notebooks/AR.html). The advantage of using
# statsmodels is that it gives access to methods that can solve a vast range
# of statespace models.
#
# The model we'll solve is given by
#
# $$
# y_t = \phi y_{t-1} + \varepsilon_t + \theta_1 \varepsilon_{t-1}, \qquad
# \varepsilon_t \sim N(0, \sigma^2)
# $$
#
# with 1 auto-regressive term and 1 moving average term. In statespace
# form it is written as:
#
# $$
# \begin{align}
# y_t & = \underbrace{\begin{bmatrix} 1 & \theta_1 \end{bmatrix}}_{Z}
# \underbrace{\begin{bmatrix} \alpha_{1,t} \\ \alpha_{2,t}
# \end{bmatrix}}_{\alpha_t} \\
# \begin{bmatrix} \alpha_{1,t+1} \\ \alpha_{2,t+1} \end{bmatrix} & =
# \underbrace{\begin{bmatrix}
# \phi & 0 \\
# 1 & 0 \\
# \end{bmatrix}}_{T} \begin{bmatrix} \alpha_{1,t} \\ \alpha_{2,t}
# \end{bmatrix} +
# \underbrace{\begin{bmatrix} 1 \\ 0 \end{bmatrix}}_{R}
# \underbrace{\varepsilon_{t+1}}_{\eta_t} \\
# \end{align}
# $$
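#
# To check that the two forms agree, note that the transition equation
# gives $\alpha_{1,t} = \phi \alpha_{1,t-1} + \varepsilon_t$ and
# $\alpha_{2,t} = \alpha_{1,t-1}$, so the observation equation reads
# $y_t = \alpha_{1,t} + \theta_1 \alpha_{1,t-1}$. Then
#
# $$
# y_t - \phi y_{t-1} = \left( \alpha_{1,t} - \phi \alpha_{1,t-1} \right)
# + \theta_1 \left( \alpha_{1,t-1} - \phi \alpha_{1,t-2} \right)
# = \varepsilon_t + \theta_1 \varepsilon_{t-1},
# $$
#
# which recovers the ARMA(1, 1) equation above.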

# The code will follow these steps:
# 1. Import external dependencies
# 2. Download and plot the data on US CPI
# 3. Simple maximum likelihood estimation (MLE) as an example
# 4. Definitions of helper functions to provide tensors to the library
# doing Bayesian estimation
# 5. Bayesian estimation via NUTS
# 6. Application to US CPI series
#
# Finally, Appendix A shows how to re-use the helper functions from step
# (4) to estimate a different state space model, `UnobservedComponents`,
# using the same Bayesian methods.

# ### 1. Import external dependencies

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import statsmodels.api as sm
import theano
import theano.tensor as tt
from pandas.plotting import register_matplotlib_converters
from pandas_datareader.data import DataReader

plt.style.use("seaborn")
register_matplotlib_converters()

# ### 2. Download and plot the data on US CPI
#
# We'll get the data from FRED:

cpi = DataReader("CPIAUCNS", "fred", start="1971-01", end="2018-12")
cpi.index = pd.DatetimeIndex(cpi.index, freq="MS")

# Define the inflation series that we'll use in analysis
inf = np.log(cpi).resample("QS").mean().diff()[1:] * 400
inf = inf.dropna()
print(inf.head())

# Plot the series
fig, ax = plt.subplots(figsize=(9, 4), dpi=300)
ax.plot(inf.index, inf, label=r"$\Delta \log CPI$", lw=2)
ax.legend(loc="lower left")
plt.show()

# ### 3. Fit the model with maximum likelihood
#
# Statsmodels does all of the hard work for us - creating and
# fitting the model takes just two lines of code. The model order parameters
# correspond to the auto-regressive, difference, and moving average orders
# respectively.

# Create a SARIMAX model instance - here we use it to estimate
# the parameters via MLE using the `fit` method, but we can
# also re-use it below for the Bayesian estimation
mod = sm.tsa.statespace.SARIMAX(inf, order=(1, 0, 1))

res_mle = mod.fit(disp=False)
print(res_mle.summary())

# It's a good fit. We can also get the series of one-step-ahead
# predictions and plot it next to the actual data, along with a confidence
# band.

predict_mle = res_mle.get_prediction()
predict_mle_ci = predict_mle.conf_int()
lower = predict_mle_ci["lower CPIAUCNS"]
upper = predict_mle_ci["upper CPIAUCNS"]

# Graph
fig, ax = plt.subplots(figsize=(9, 4), dpi=300)

# Plot data points
inf.plot(ax=ax, style="-", label="Observed")

# Plot predictions
predict_mle.predicted_mean.plot(ax=ax,
                                style="r.",
                                label="One-step-ahead forecast")
ax.fill_between(predict_mle_ci.index, lower, upper, color="r", alpha=0.1)
ax.legend(loc="lower left")
plt.show()
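
# As a quick, optional check on the claim that this is a good fit (an
# illustrative addition, not part of the original notebook), we can use the
# results object's built-in `plot_diagnostics` method, which shows the
# standardized residuals, their histogram, a Q-Q plot, and a correlogram:

res_mle.plot_diagnostics(figsize=(12, 8))
plt.show()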

# ### 4. Helper functions to provide tensors to the library doing Bayesian
# estimation
#
# We're almost on to the magic, but there are a few preliminaries. Feel
# free to skip this section if you're not interested in the technical
# details.

# ### Technical Details
#
# PyMC3 is a Bayesian estimation library ("Probabilistic Programming in
# Python: Bayesian Modeling and Probabilistic Machine Learning with Theano")
# that is a) fast and b) optimized for Bayesian machine learning, for
# instance [Bayesian neural networks](https://docs.pymc.io/notebooks/bayesian_neural_network_advi.html).
# To do all of this, it is built on top of Theano, a library that aims to
# evaluate tensors very efficiently and provide symbolic differentiation
# (necessary for any kind of deep learning). It is the symbolic
# differentiation that means PyMC3 can use NUTS on any problem formulated
# within PyMC3.
#
# We are not formulating a problem directly in PyMC3; we're using
# statsmodels to specify the statespace model and solve it with the Kalman
# filter. So we need to put the plumbing of statsmodels and PyMC3 together,
# which means wrapping the statsmodels SARIMAX model object in a
# Theano-flavored wrapper before passing information to PyMC3 for
# estimation.
#
# Because of this, we can't use the Theano auto-differentiation directly.
# Happily, statsmodels SARIMAX objects have a method to return the Jacobian
# evaluated at the parameter values. We'll be making use of this to provide
# gradients so that we can use NUTS.

# #### Defining helper functions to translate models into a PyMC3 friendly
# form
#
# First, we'll create the Theano wrappers. They will be in the form of
# 'Ops', operation objects, that 'perform' particular tasks. They are
# initialized with a statsmodels `model` instance.
#
# Although this code may look somewhat opaque, it is generic for any state
# space model in statsmodels.


class Loglike(tt.Op):

    itypes = [tt.dvector]  # expects a vector of parameter values when called
    otypes = [tt.dscalar]  # outputs a single scalar value (the log likelihood)

    def __init__(self, model):
        self.model = model
        self.score = Score(self.model)

    def perform(self, node, inputs, outputs):
        (theta, ) = inputs  # contains the vector of parameters
        llf = self.model.loglike(theta)
        outputs[0][0] = np.array(llf)  # output the log-likelihood

    def grad(self, inputs, g):
        # the method that calculates the gradients - it actually returns the
        # vector-Jacobian product - g[0] is the gradient flowing in from the
        # Op's (scalar) output
        (theta, ) = inputs  # our parameters
        out = [g[0] * self.score(theta)]
        return out


class Score(tt.Op):
    itypes = [tt.dvector]
    otypes = [tt.dvector]

    def __init__(self, model):
        self.model = model

    def perform(self, node, inputs, outputs):
        (theta, ) = inputs
        outputs[0][0] = self.model.score(theta)
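
# Before handing these wrappers to PyMC3, it can be worth a quick sanity
# check that the analytic score from statsmodels matches a simple
# finite-difference approximation of the log-likelihood gradient. This is
# an illustrative addition (not part of the original notebook), evaluated
# here at the MLE estimates; the step size and tolerance are arbitrary:

theta_check = res_mle.params.values  # check point: the MLE estimates
eps = 1e-6
# central finite differences of the log-likelihood, one parameter at a time
fd_score = np.array([
    (mod.loglike(theta_check + eps * e) - mod.loglike(theta_check - eps * e))
    / (2 * eps) for e in np.eye(len(theta_check))
])
print(np.allclose(fd_score, mod.score(theta_check), atol=1e-4))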

# ### 5. Bayesian estimation with NUTS
#
# The next step is to set the parameters for the Bayesian estimation,
# specify our priors, and run it.

# Set sampling params
ndraws = 3000  # number of draws from the distribution
nburn = 600  # number of "burn-in points" (which will be discarded)

# Now for the fun part! There are three parameters to estimate: $\phi$,
# $\theta_1$, and $\sigma^2$. We'll use uninformative uniform priors for the
# first two, and an inverse gamma for the last one. Then we'll run the
# inference; the sampling can optionally be parallelized across as many
# cores as are available (here we use `cores=1`).

# Construct an instance of the Theano wrapper defined above, which
# will allow PyMC3 to compute the likelihood and Jacobian in a way
# that it can make use of. Here we are using the same model instance
# created earlier for MLE analysis (we could also create a new model
# instance if we preferred)
loglike = Loglike(mod)

with pm.Model() as m:
    # Priors
    arL1 = pm.Uniform("ar.L1", -0.99, 0.99)
    maL1 = pm.Uniform("ma.L1", -0.99, 0.99)
    sigma2 = pm.InverseGamma("sigma2", 2, 4)

    # convert variables to tensor vectors
    theta = tt.as_tensor_variable([arL1, maL1, sigma2])

    # use a DensityDist, passing the Loglike Op (which is callable) as the
    # log-density function
    pm.DensityDist("likelihood", loglike, observed=theta)

    # Draw samples
    trace = pm.sample(
        ndraws,
        tune=nburn,
        return_inferencedata=True,
        cores=1,
        compute_convergence_checks=False,
    )

# Note that the NUTS sampler is auto-assigned because we provided
# gradients. PyMC3 will fall back to Metropolis or slice samplers if it
# does not find that gradients are available. The number of draws per
# second is impressive for a "black box" style computation! However, note
# that if the model can be represented directly by PyMC3 (like the AR(p)
# models mentioned above), then computation can be substantially faster.
#
# Inference is complete, but are the results any good? There are a number
# of ways to check. The first is to look at the posterior distributions
# (with lines showing the MLE values):

plt.tight_layout()
# Note: the syntax here for the lines argument is required for
# PyMC3 versions >= 3.7
# For version <= 3.6 you can use lines=dict(res_mle.params) instead
_ = pm.plot_trace(
    trace,
    lines=[(k, {}, [v]) for k, v in dict(res_mle.params).items()],
    combined=True,
    figsize=(12, 12),
)

# The estimated posteriors clearly peak close to the parameters found by
# MLE. We can also see a summary of the estimated values:

pm.summary(trace)

# Here $\hat{R}$ is the Gelman-Rubin statistic. It tests for lack of
# convergence by comparing the variance between multiple chains to the
# variance within each chain. If convergence has been achieved, the
# between-chain and within-chain variances should be identical. If
# $\hat{R}<1.2$ for all model parameters, we can have some confidence that
# convergence has been reached.
#
# Additionally, the highest posterior density interval (the gap between
# the two HPD values in the table) is small for each of the variables.
#
# ### 6. Application of Bayesian estimates of parameters
#
# We'll now construct a new set of results using the posterior means from
# the Bayesian estimation as parameter values, and again plot the
# one-step-ahead forecasts.

# Retrieve the posterior means
params = pm.summary(trace)["mean"].values

# Construct results using these posterior means as parameter values
res_bayes = mod.smooth(params)

predict_bayes = res_bayes.get_prediction()
predict_bayes_ci = predict_bayes.conf_int()
lower = predict_bayes_ci["lower CPIAUCNS"]
upper = predict_bayes_ci["upper CPIAUCNS"]

# Graph
fig, ax = plt.subplots(figsize=(9, 4), dpi=300)

# Plot data points
inf.plot(ax=ax, style="-", label="Observed")

# Plot predictions
predict_bayes.predicted_mean.plot(ax=ax,
                                  style="r.",
                                  label="One-step-ahead forecast")
ax.fill_between(predict_bayes_ci.index, lower, upper, color="r", alpha=0.1)
ax.legend(loc="lower left")
plt.show()
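
# The same results object can also produce genuine out-of-sample forecasts
# via `get_forecast`. This is an illustrative extension (not part of the
# original notebook); the horizon of 8 quarters is chosen arbitrarily:

forecast_bayes = res_bayes.get_forecast(steps=8)
print(forecast_bayes.predicted_mean)
print(forecast_bayes.conf_int())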

# ## Appendix A. Application to `UnobservedComponents` models

# We can reuse the `Loglike` and `Score` wrappers defined above to
# consider a different state space model. For example, we might want to
# model inflation as the combination of a random walk trend and an
# autoregressive error term:
#
# $$
# \begin{aligned}
# y_t & = \mu_t + \varepsilon_t \\
# \mu_t & = \mu_{t-1} + \eta_t \\
# \varepsilon_t & = \phi \varepsilon_{t-1} + \zeta_t
# \end{aligned}
# $$
#
# This model can be constructed in statsmodels with the
# `UnobservedComponents` class using the `rwalk` and `autoregressive`
# specifications. As before, we can fit the model using maximum likelihood
# via the `fit` method.

# Construct the model instance
mod_uc = sm.tsa.UnobservedComponents(inf, "rwalk", autoregressive=1)

# Fit the model via maximum likelihood
res_uc_mle = mod_uc.fit()
print(res_uc_mle.summary())

# As noted earlier, the Theano wrappers (`Loglike` and `Score`) that we
# created above are generic, so we can re-use essentially the same code to
# explore the model with Bayesian methods.

# Set sampling params
ndraws = 3000  # number of draws from the distribution
nburn = 600  # number of "burn-in points" (which will be discarded)

# Here we follow the same procedure as above, but now we instantiate the
# Theano wrapper `Loglike` with the UC model instance instead of the
# SARIMAX model instance
loglike_uc = Loglike(mod_uc)

with pm.Model():
    # Priors
    sigma2level = pm.InverseGamma("sigma2.level", 1, 1)
    sigma2ar = pm.InverseGamma("sigma2.ar", 1, 1)
    arL1 = pm.Uniform("ar.L1", -0.99, 0.99)

    # convert variables to tensor vectors
    theta_uc = tt.as_tensor_variable([sigma2level, sigma2ar, arL1])

    # use a DensityDist, passing the Loglike Op (which is callable) as the
    # log-density function
    pm.DensityDist("likelihood", loglike_uc, observed=theta_uc)

    # Draw samples
    trace_uc = pm.sample(
        ndraws,
        tune=nburn,
        return_inferencedata=True,
        cores=1,
        compute_convergence_checks=False,
    )

# And as before we can plot the marginal posteriors. In contrast to the
# SARIMAX example, here the posterior modes are somewhat different from the
# MLE estimates.

plt.tight_layout()
# Note: the syntax here for the lines argument is required for
# PyMC3 versions >= 3.7
# For version <= 3.6 you can use lines=dict(res_uc_mle.params) instead
_ = pm.plot_trace(
    trace_uc,
    lines=[(k, {}, [v]) for k, v in dict(res_uc_mle.params).items()],
    combined=True,
    figsize=(12, 12),
)

pm.summary(trace_uc)

# Retrieve the posterior means
params = pm.summary(trace_uc)["mean"].values

# Construct results using these posterior means as parameter values
res_uc_bayes = mod_uc.smooth(params)

# One benefit of this model is that it gives us an estimate of the
# underlying "level" of inflation, using the smoothed estimate of $\mu_t$,
# which we can access as the "level" column in the results object's
# `states.smoothed` attribute. In this case, because the Bayesian posterior
# mean of the level's variance is larger than the MLE estimate, its
# estimated level is a little more volatile.

# Graph
fig, ax = plt.subplots(figsize=(9, 4), dpi=300)

# Plot data points
inf["CPIAUCNS"].plot(ax=ax, style="-", label="Observed data")

# Plot estimate of the level term
res_uc_mle.states.smoothed["level"].plot(ax=ax, label="Smoothed level (MLE)")
res_uc_bayes.states.smoothed["level"].plot(ax=ax,
                                           label="Smoothed level (Bayesian)")

ax.legend(loc="lower left")
plt.show()
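
# A quick numeric illustration of the volatility claim above (an
# illustrative addition, not part of the original notebook): the Bayesian
# level is more volatile if the standard deviation of its period-to-period
# changes is larger than that of the MLE level.

print(res_uc_mle.states.smoothed["level"].diff().std())
print(res_uc_bayes.states.smoothed["level"].diff().std())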