1#!/usr/local/bin/python3.8
2# coding: utf-8
3
4# DO NOT EDIT
5# Autogenerated from the notebook quantile_regression.ipynb.
6# Edit the notebook and then sync the output with this file.
7#
8# flake8: noqa
9# DO NOT EDIT
10
11# # Quantile regression
12
13#
14# This example page shows how to use ``statsmodels``' ``QuantReg`` class
15# to replicate parts of the analysis published in
16#
17# * Koenker, Roger and Kevin F. Hallock. "Quantile Regression". Journal of
18# Economic Perspectives, Volume 15, Number 4, Fall 2001, Pages 143–156
19#
20# We are interested in the relationship between income and expenditures on
21# food for a sample of working class Belgian households in 1857 (the Engel
22# data).
23#
24# ## Setup
25#
26# We first need to load some modules and to retrieve the data.
27# Conveniently, the Engel dataset is shipped with ``statsmodels``.
28
29import numpy as np
30import pandas as pd
31import statsmodels.api as sm
32import statsmodels.formula.api as smf
33import matplotlib.pyplot as plt
34
35data = sm.datasets.engel.load_pandas().data
36data.head()
37
38# ## Least Absolute Deviation
39#
40# The LAD model is a special case of quantile regression where q=0.5
41
42mod = smf.quantreg("foodexp ~ income", data)
43res = mod.fit(q=0.5)
44print(res.summary())
45
46# ## Visualizing the results
47#
48# We estimate the quantile regression model for many quantiles between .05
49# and .95, and compare best fit line from each of these models to Ordinary
50# Least Squares results.
51
52# ### Prepare data for plotting
53#
54# For convenience, we place the quantile regression results in a Pandas
55# DataFrame, and the OLS results in a dictionary.
56
57quantiles = np.arange(0.05, 0.96, 0.1)
58
59
60def fit_model(q):
61    res = mod.fit(q=q)
62    return [q, res.params["Intercept"], res.params["income"]
63            ] + res.conf_int().loc["income"].tolist()
64
65
66models = [fit_model(x) for x in quantiles]
67models = pd.DataFrame(models, columns=["q", "a", "b", "lb", "ub"])
68
69ols = smf.ols("foodexp ~ income", data).fit()
70ols_ci = ols.conf_int().loc["income"].tolist()
71ols = dict(a=ols.params["Intercept"],
72           b=ols.params["income"],
73           lb=ols_ci[0],
74           ub=ols_ci[1])
75
76print(models)
77print(ols)
78
79# ### First plot
80#
81# This plot compares best fit lines for 10 quantile regression models to
82# the least squares fit. As Koenker and Hallock (2001) point out, we see
83# that:
84#
85# 1. Food expenditure increases with income
86# 2. The *dispersion* of food expenditure increases with income
87# 3. The least squares estimates fit low income observations quite poorly
88# (i.e. the OLS line passes over most low income households)
89
90x = np.arange(data.income.min(), data.income.max(), 50)
91get_y = lambda a, b: a + b * x
92
93fig, ax = plt.subplots(figsize=(8, 6))
94
95for i in range(models.shape[0]):
96    y = get_y(models.a[i], models.b[i])
97    ax.plot(x, y, linestyle="dotted", color="grey")
98
99y = get_y(ols["a"], ols["b"])
100
101ax.plot(x, y, color="red", label="OLS")
102ax.scatter(data.income, data.foodexp, alpha=0.2)
103ax.set_xlim((240, 3000))
104ax.set_ylim((240, 2000))
105legend = ax.legend()
106ax.set_xlabel("Income", fontsize=16)
107ax.set_ylabel("Food expenditure", fontsize=16)
108
109# ### Second plot
110#
111# The dotted black lines form 95% point-wise confidence band around 10
112# quantile regression estimates (solid black line). The red lines represent
113# OLS regression results along with their 95% confidence interval.
114#
115# In most cases, the quantile regression point estimates lie outside the
116# OLS confidence interval, which suggests that the effect of income on food
117# expenditure may not be constant across the distribution.
118
119n = models.shape[0]
120p1 = plt.plot(models.q, models.b, color="black", label="Quantile Reg.")
121p2 = plt.plot(models.q, models.ub, linestyle="dotted", color="black")
122p3 = plt.plot(models.q, models.lb, linestyle="dotted", color="black")
123p4 = plt.plot(models.q, [ols["b"]] * n, color="red", label="OLS")
124p5 = plt.plot(models.q, [ols["lb"]] * n, linestyle="dotted", color="red")
125p6 = plt.plot(models.q, [ols["ub"]] * n, linestyle="dotted", color="red")
126plt.ylabel(r"$\beta_{income}$")
127plt.xlabel("Quantiles of the conditional food expenditure distribution")
128plt.legend()
129plt.show()
130