#!/usr/local/bin/python3.8
# coding: utf-8

# DO NOT EDIT
# Autogenerated from the notebook statespace_sarimax_pymc3.ipynb.
# Edit the notebook and then sync the output with this file.
#
# flake8: noqa
# DO NOT EDIT

# # Fast Bayesian estimation of SARIMAX models

# ## Introduction
#
# This notebook shows how to use fast Bayesian methods to estimate
# SARIMAX (Seasonal AutoRegressive Integrated Moving Average with eXogenous
# regressors) models. These methods can also be parallelized across multiple
# cores.
#
# Here, "fast" means a version of Hamiltonian Monte Carlo (HMC) called the
# No-U-Turn Sampler (NUTS) developed by Hoffman and Gelman: see [Hoffman,
# M. D., & Gelman, A. (2014). The No-U-Turn sampler: adaptively setting path
# lengths in Hamiltonian Monte Carlo. Journal of Machine Learning Research,
# 15(1), 1593-1623.](https://arxiv.org/abs/1111.4246). As they say, "the
# cost of HMC per independent sample from a target distribution of dimension
# $D$ is roughly $\mathcal{O}(D^{5/4})$, which stands in sharp contrast with
# the $\mathcal{O}(D^{2})$ cost of random-walk Metropolis". So for problems
# of larger dimension, the time saving with HMC is significant. However, it
# does require gradients: in our case, the score (the gradient of the
# model's log-likelihood) must be provided.
#
# This notebook will combine the Python libraries
# [statsmodels](https://www.statsmodels.org/stable/index.html), which does
# econometrics, and [PyMC3](https://docs.pymc.io/), which is for Bayesian
# estimation, to perform fast Bayesian estimation of a simple SARIMAX model,
# in this case an ARMA(1, 1) model for US CPI.
#
# Note that, for simple models like AR(p), base PyMC3 is a quicker way to
# fit a model; there's an [example
# here](https://docs.pymc.io/notebooks/AR.html). The advantage of using
# statsmodels is that it gives access to methods that can solve a vast range
# of statespace models.
#
# The model we'll solve is given by
#
# $$
# y_t = \phi y_{t-1} + \varepsilon_t + \theta_1 \varepsilon_{t-1}, \qquad
# \varepsilon_t \sim N(0, \sigma^2)
# $$
#
# with 1 auto-regressive term and 1 moving average term. In statespace
# form it is written as:
#
# $$
# \begin{align}
# y_t & = \underbrace{\begin{bmatrix} 1 & \theta_1 \end{bmatrix}}_{Z}
# \underbrace{\begin{bmatrix} \alpha_{1,t} \\ \alpha_{2,t}
# \end{bmatrix}}_{\alpha_t} \\
#     \begin{bmatrix} \alpha_{1,t+1} \\ \alpha_{2,t+1} \end{bmatrix} & =
# \underbrace{\begin{bmatrix}
#         \phi & 0 \\
#         1      & 0     \\
#     \end{bmatrix}}_{T} \begin{bmatrix} \alpha_{1,t} \\ \alpha_{2,t}
# \end{bmatrix} +
#     \underbrace{\begin{bmatrix} 1 \\ 0 \end{bmatrix}}_{R}
# \underbrace{\varepsilon_{t+1}}_{\eta_t} \\
# \end{align}
# $$
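#
# Substituting the state equations back into the observation equation
# confirms that this is the same ARMA(1, 1) model: with $\alpha_{2,t} =
# \alpha_{1,t-1}$ and $\alpha_{1,t} = \phi \alpha_{1,t-1} + \varepsilon_t$,
# we have $y_t - \phi y_{t-1} = (\alpha_{1,t} - \phi \alpha_{1,t-1}) +
# \theta_1 (\alpha_{1,t-1} - \phi \alpha_{1,t-2}) = \varepsilon_t + \theta_1
# \varepsilon_{t-1}$.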
#
# The code will follow these steps:
# 1. Import external dependencies
# 2. Download and plot the data on US CPI
# 3. Simple maximum likelihood estimation (MLE) as an example
# 4. Definitions of helper functions to provide tensors to the library
# doing Bayesian estimation
# 5. Bayesian estimation via NUTS
# 6. Application to US CPI series
#
# Finally, Appendix A shows how to re-use the helper functions from step
# (4) to estimate a different state space model, `UnobservedComponents`,
# using the same Bayesian methods.

# ### 1. Import external dependencies

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import statsmodels.api as sm
import theano
import theano.tensor as tt
from pandas.plotting import register_matplotlib_converters
from pandas_datareader.data import DataReader

plt.style.use("seaborn")
register_matplotlib_converters()

# ### 2. Download and plot the data on US CPI
#
# We'll get the data from FRED:

cpi = DataReader("CPIAUCNS", "fred", start="1971-01", end="2018-12")
cpi.index = pd.DatetimeIndex(cpi.index, freq="MS")

# Define the inflation series that we'll use in the analysis: annualized
# quarterly CPI inflation in percent (400 times the quarterly log difference)
inf = np.log(cpi).resample("QS").mean().diff()[1:] * 400
inf = inf.dropna()
print(inf.head())

# Plot the series
fig, ax = plt.subplots(figsize=(9, 4), dpi=300)
ax.plot(inf.index, inf, label=r"$\Delta \log CPI$", lw=2)
ax.legend(loc="lower left")
plt.show()

# ### 3. Fit the model with maximum likelihood
#
# Statsmodels does all of the hard work for us - creating and
# fitting the model takes just two lines of code. The model order parameters
# correspond to the auto-regressive, difference, and moving average orders
# respectively.

# Create a SARIMAX model instance - here we use it to estimate
# the parameters via MLE using the `fit` method, but we can
# also re-use it below for the Bayesian estimation
mod = sm.tsa.statespace.SARIMAX(inf, order=(1, 0, 1))

res_mle = mod.fit(disp=False)
print(res_mle.summary())
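
# For reference, we can inspect the state space matrices that statsmodels has
# constructed for this model (a quick check, not part of the original
# notebook; note that the internal representation statsmodels uses for an
# ARMA(1, 1) model need not be laid out exactly like the $Z$, $T$, $R$
# matrices written above).
print(mod["design"])
print(mod["transition"])
print(mod["selection"])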

# It's a good fit. We can also get the series of one-step-ahead
# predictions and plot it next to the actual data, along with a confidence
# band.
#

predict_mle = res_mle.get_prediction()
predict_mle_ci = predict_mle.conf_int()
lower = predict_mle_ci["lower CPIAUCNS"]
upper = predict_mle_ci["upper CPIAUCNS"]

# Graph
fig, ax = plt.subplots(figsize=(9, 4), dpi=300)

# Plot data points
inf.plot(ax=ax, style="-", label="Observed")

# Plot predictions
predict_mle.predicted_mean.plot(ax=ax,
                                style="r.",
                                label="One-step-ahead forecast")
ax.fill_between(predict_mle_ci.index, lower, upper, color="r", alpha=0.1)
ax.legend(loc="lower left")
plt.show()

# ### 4. Helper functions to provide tensors to the library doing Bayesian
# estimation
#
# We're almost on to the magic but there are a few preliminaries. Feel
# free to skip this section if you're not interested in the technical
# details.

# ### Technical Details
#
# PyMC3 is a Bayesian estimation library ("Probabilistic Programming in
# Python: Bayesian Modeling and Probabilistic Machine Learning with Theano")
# that is a) fast and b) optimized for Bayesian machine learning, for
# instance [Bayesian neural
# networks](https://docs.pymc.io/notebooks/bayesian_neural_network_advi.html).
# To do all of this, it is built on top of Theano, a library that aims to
# evaluate tensors very efficiently and provide symbolic differentiation
# (necessary for any kind of deep learning). It is the symbolic
# differentiation that means PyMC3 can use NUTS on any problem formulated
# within PyMC3.
#
# We are not formulating a problem directly in PyMC3; we're using
# statsmodels to specify the statespace model and solve it with the Kalman
# filter. So we need to put the plumbing of statsmodels and PyMC3 together,
# which means wrapping the statsmodels SARIMAX model object in a Theano-
# flavored wrapper before passing information to PyMC3 for estimation.
#
# Because of this, we can't use Theano's auto-differentiation directly.
# Happily, statsmodels state space models have a `score` method that returns
# the gradient of the log-likelihood (the score vector) evaluated at a set
# of parameter values. We'll be making use of this to provide gradients so
# that we can use NUTS.
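
# To illustrate (a quick sketch, not part of the original notebook), the
# score reported by statsmodels should agree closely with a central
# finite-difference approximation of the log-likelihood gradient, here
# evaluated at the model's starting parameter values.
theta0 = mod.start_params
eps = 1e-5
fd_score = np.array([
    (mod.loglike(theta0 + eps * e) - mod.loglike(theta0 - eps * e)) / (2 * eps)
    for e in np.eye(len(theta0))
])
print(mod.score(theta0))
print(fd_score)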

# #### Defining helper functions to translate models into a PyMC3-friendly
# form
#
# First, we'll create the Theano wrappers. They will be in the form of
# 'Ops', operation objects, that 'perform' particular tasks. They are
# initialized with a statsmodels `model` instance.
#
# Although this code may look somewhat opaque, it is generic for any state
# space model in statsmodels.


class Loglike(tt.Op):

    itypes = [tt.dvector]  # expects a vector of parameter values when called
    otypes = [tt.dscalar]  # outputs a single scalar value (the log likelihood)

    def __init__(self, model):
        self.model = model
        self.score = Score(self.model)

    def perform(self, node, inputs, outputs):
        (theta, ) = inputs  # contains the vector of parameters
        llf = self.model.loglike(theta)
        outputs[0][0] = np.array(llf)  # output the log-likelihood

    def grad(self, inputs, g):
        # the method that calculates the gradients - it returns the
        # vector-Jacobian product: g[0] is the gradient of the objective with
        # respect to this Op's scalar output, and self.score(theta) is the
        # score (the gradient of the log-likelihood with respect to theta)
        (theta, ) = inputs  # our parameters
        out = [g[0] * self.score(theta)]
        return out


class Score(tt.Op):
    itypes = [tt.dvector]  # expects a vector of parameter values when called
    otypes = [tt.dvector]  # outputs the score (log-likelihood gradient)

    def __init__(self, model):
        self.model = model

    def perform(self, node, inputs, outputs):
        (theta, ) = inputs  # contains the vector of parameters
        outputs[0][0] = self.model.score(theta)
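
# As a minimal check of the wrappers (a sketch, not part of the original
# notebook), we can compile the `Loglike` Op into Theano functions and
# confirm that it reproduces the statsmodels log-likelihood, and that Theano
# can request its gradient (which is what NUTS will do internally).
theta_sym = tt.dvector("theta")
llf_sym = Loglike(mod)(theta_sym)
loglike_fn = theano.function([theta_sym], llf_sym)
score_fn = theano.function([theta_sym], tt.grad(llf_sym, theta_sym))
print(loglike_fn(mod.start_params), mod.loglike(mod.start_params))
print(score_fn(mod.start_params))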


# ### 5. Bayesian estimation with NUTS
#
# The next step is to set the parameters for the Bayesian estimation,
# specify our priors, and run it.

# Set sampling params
ndraws = 3000  # number of draws from the distribution
nburn = 600  # number of "burn-in points" (which will be discarded)

# Now for the fun part! There are three parameters to estimate: $\phi$,
# $\theta_1$, and $\sigma^2$. We'll use uninformative uniform priors for the
# first two, and an inverse gamma for the last one. Then we'll run the
# inference. Here we use a single core (`cores=1`), but the chains can be
# parallelized by increasing the `cores` argument to `pm.sample`.

# Construct an instance of the Theano wrapper defined above, which
# will allow PyMC3 to compute the log-likelihood and score in a way
# that it can make use of. Here we are using the same model instance
# created earlier for the MLE analysis (we could also create a new model
# instance if we preferred)
loglike = Loglike(mod)

with pm.Model() as m:
    # Priors
    arL1 = pm.Uniform("ar.L1", -0.99, 0.99)
    maL1 = pm.Uniform("ma.L1", -0.99, 0.99)
    sigma2 = pm.InverseGamma("sigma2", 2, 4)

    # convert the variables to a tensor vector
    theta = tt.as_tensor_variable([arL1, maL1, sigma2])

    # use a DensityDist to include the custom log-likelihood in the model
    # (passing the parameter tensor as `observed` means the Op is called on
    # theta when the model's log probability is evaluated)
    pm.DensityDist("likelihood", loglike, observed=theta)

    # Draw samples
    trace = pm.sample(
        ndraws,
        tune=nburn,
        return_inferencedata=True,
        cores=1,
        compute_convergence_checks=False,
    )

# Note that the NUTS sampler is auto-assigned because we provided
# gradients. PyMC3 will fall back to Metropolis or Slice samplers if it does
# not find that gradients are available. There are an impressive number of
# draws per second for a "black box" style computation! However, note that
# if the model can be represented directly by PyMC3 (like the AR(p) models
# mentioned above), then computation can be substantially faster.
#
# Inference is complete, but are the results any good? There are a number
# of ways to check. The first is to look at the posterior distributions
# (with lines showing the MLE values):

plt.tight_layout()
# Note: the syntax here for the lines argument is required for
# PyMC3 versions >= 3.7
# For versions <= 3.6 you can use lines=dict(res_mle.params) instead
_ = pm.plot_trace(
    trace,
    lines=[(k, {}, [v]) for k, v in dict(res_mle.params).items()],
    combined=True,
    figsize=(12, 12),
)

# The estimated posteriors clearly peak close to the parameters found by
# MLE. We can also see a summary of the estimated values:

pm.summary(trace)

# Here $\hat{R}$ is the Gelman-Rubin statistic. It tests for lack of
# convergence by comparing the variance between multiple chains to the
# variance within each chain. If convergence has been achieved, the between-
# chain and within-chain variances should be identical. If $\hat{R}<1.2$ for
# all model parameters, we can have some confidence that convergence has
# been reached.
#
# Additionally, the highest posterior density interval (the gap between
# the two values of HPD in the table) is small for each of the variables.
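
# As a quick programmatic check (a sketch, not part of the original notebook;
# the exact column names depend on the installed ArviZ/PyMC3 versions, e.g.
# "r_hat" and "hpd_3%"/"hdi_3%"), we can pull the convergence diagnostic and
# interval columns out of the summary table directly:
print(pm.summary(trace).filter(regex="r_hat|hpd|hdi"))
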
# ### 6. Application of Bayesian estimates of parameters
#
# We'll now construct a new results object from the same model, but using
# the posterior means from the Bayesian estimation as the parameter values,
# and again plot the one-step-ahead forecasts.

# Retrieve the posterior means
params = pm.summary(trace)["mean"].values

# Construct results using these posterior means as parameter values
res_bayes = mod.smooth(params)
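
# A quick side-by-side comparison (a sketch, not part of the original
# notebook): the posterior means can be matched up against the MLE estimates
# because the rows of `pm.summary(trace)` follow the order in which the
# priors were declared (ar.L1, ma.L1, sigma2), which is the same as the
# SARIMAX parameter order.
print(pd.DataFrame({"mle": res_mle.params.values, "bayes": params},
                   index=mod.param_names))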

predict_bayes = res_bayes.get_prediction()
predict_bayes_ci = predict_bayes.conf_int()
lower = predict_bayes_ci["lower CPIAUCNS"]
upper = predict_bayes_ci["upper CPIAUCNS"]

# Graph
fig, ax = plt.subplots(figsize=(9, 4), dpi=300)

# Plot data points
inf.plot(ax=ax, style="-", label="Observed")

# Plot predictions
predict_bayes.predicted_mean.plot(ax=ax,
                                  style="r.",
                                  label="One-step-ahead forecast")
ax.fill_between(predict_bayes_ci.index, lower, upper, color="r", alpha=0.1)
ax.legend(loc="lower left")
plt.show()

# ## Appendix A. Application to `UnobservedComponents` models

# We can reuse the `Loglike` and `Score` wrappers defined above to
# consider a different state space model. For example, we might want to
# model inflation as the combination of a random walk trend and an
# autoregressive error term:
#
# $$
# \begin{aligned}
# y_t & = \mu_t + \varepsilon_t \\
# \mu_t & = \mu_{t-1} + \eta_t \\
# \varepsilon_t & = \phi \varepsilon_{t-1} + \zeta_t
# \end{aligned}
# $$
#
# This model can be constructed in Statsmodels with the
# `UnobservedComponents` class using the `rwalk` and `autoregressive`
# specifications. As before, we can fit the model using maximum likelihood
# via the `fit` method.

# Construct the model instance
mod_uc = sm.tsa.UnobservedComponents(inf, "rwalk", autoregressive=1)

# Fit the model via maximum likelihood
res_uc_mle = mod_uc.fit()
print(res_uc_mle.summary())

# As noted earlier, the Theano wrappers (`Loglike` and `Score`) that we
# created above are generic, so we can re-use essentially the same code to
# explore the model with Bayesian methods.

# Set sampling params
ndraws = 3000  # number of draws from the distribution
nburn = 600  # number of "burn-in points" (which will be discarded)

# Here we follow the same procedure as above, but now we instantiate the
# Theano wrapper `Loglike` with the UC model instance instead of the
# SARIMAX model instance
loglike_uc = Loglike(mod_uc)

with pm.Model():
    # Priors
    sigma2level = pm.InverseGamma("sigma2.level", 1, 1)
    sigma2ar = pm.InverseGamma("sigma2.ar", 1, 1)
    arL1 = pm.Uniform("ar.L1", -0.99, 0.99)

    # convert the variables to a tensor vector
    theta_uc = tt.as_tensor_variable([sigma2level, sigma2ar, arL1])

    # use a DensityDist to include the custom log-likelihood in the model,
    # exactly as before
    pm.DensityDist("likelihood", loglike_uc, observed=theta_uc)

    # Draw samples
    trace_uc = pm.sample(
        ndraws,
        tune=nburn,
        return_inferencedata=True,
        cores=1,
        compute_convergence_checks=False,
    )

# And as before we can plot the marginal posteriors. In contrast to the
# SARIMAX example, here the posterior modes are somewhat different from the
# MLE estimates.

plt.tight_layout()
# Note: the syntax here for the lines argument is required for
# PyMC3 versions >= 3.7
# For versions <= 3.6 you can use lines=dict(res_uc_mle.params) instead
_ = pm.plot_trace(
    trace_uc,
    lines=[(k, {}, [v]) for k, v in dict(res_uc_mle.params).items()],
    combined=True,
    figsize=(12, 12),
)

pm.summary(trace_uc)

# Retrieve the posterior means
params = pm.summary(trace_uc)["mean"].values

# Construct results using these posterior means as parameter values
res_uc_bayes = mod_uc.smooth(params)

# One benefit of this model is that it gives us an estimate of the
# underlying "level" of inflation, using the smoothed estimate of $\mu_t$,
# which we can access as the "level" column in the results object's
# `states.smoothed` attribute. In this case, because the Bayesian posterior
# mean of the level's variance is larger than the MLE estimate, its
# estimated level is a little more volatile.

# Graph
fig, ax = plt.subplots(figsize=(9, 4), dpi=300)

# Plot data points
inf["CPIAUCNS"].plot(ax=ax, style="-", label="Observed data")

# Plot estimate of the level term
res_uc_mle.states.smoothed["level"].plot(ax=ax, label="Smoothed level (MLE)")
res_uc_bayes.states.smoothed["level"].plot(ax=ax,
                                           label="Smoothed level (Bayesian)")

ax.legend(loc="lower left")
plt.show()