1#!/usr/local/bin/python3.8 2# coding: utf-8 3 4# DO NOT EDIT 5# Autogenerated from the notebook quantile_regression.ipynb. 6# Edit the notebook and then sync the output with this file. 7# 8# flake8: noqa 9# DO NOT EDIT 10 11# # Quantile regression 12 13# 14# This example page shows how to use ``statsmodels``' ``QuantReg`` class 15# to replicate parts of the analysis published in 16# 17# * Koenker, Roger and Kevin F. Hallock. "Quantile Regression". Journal of 18# Economic Perspectives, Volume 15, Number 4, Fall 2001, Pages 143–156 19# 20# We are interested in the relationship between income and expenditures on 21# food for a sample of working class Belgian households in 1857 (the Engel 22# data). 23# 24# ## Setup 25# 26# We first need to load some modules and to retrieve the data. 27# Conveniently, the Engel dataset is shipped with ``statsmodels``. 28 29import numpy as np 30import pandas as pd 31import statsmodels.api as sm 32import statsmodels.formula.api as smf 33import matplotlib.pyplot as plt 34 35data = sm.datasets.engel.load_pandas().data 36data.head() 37 38# ## Least Absolute Deviation 39# 40# The LAD model is a special case of quantile regression where q=0.5 41 42mod = smf.quantreg("foodexp ~ income", data) 43res = mod.fit(q=0.5) 44print(res.summary()) 45 46# ## Visualizing the results 47# 48# We estimate the quantile regression model for many quantiles between .05 49# and .95, and compare best fit line from each of these models to Ordinary 50# Least Squares results. 51 52# ### Prepare data for plotting 53# 54# For convenience, we place the quantile regression results in a Pandas 55# DataFrame, and the OLS results in a dictionary. 56 57quantiles = np.arange(0.05, 0.96, 0.1) 58 59 60def fit_model(q): 61 res = mod.fit(q=q) 62 return [q, res.params["Intercept"], res.params["income"] 63 ] + res.conf_int().loc["income"].tolist() 64 65 66models = [fit_model(x) for x in quantiles] 67models = pd.DataFrame(models, columns=["q", "a", "b", "lb", "ub"]) 68 69ols = smf.ols("foodexp ~ income", data).fit() 70ols_ci = ols.conf_int().loc["income"].tolist() 71ols = dict(a=ols.params["Intercept"], 72 b=ols.params["income"], 73 lb=ols_ci[0], 74 ub=ols_ci[1]) 75 76print(models) 77print(ols) 78 79# ### First plot 80# 81# This plot compares best fit lines for 10 quantile regression models to 82# the least squares fit. As Koenker and Hallock (2001) point out, we see 83# that: 84# 85# 1. Food expenditure increases with income 86# 2. The *dispersion* of food expenditure increases with income 87# 3. The least squares estimates fit low income observations quite poorly 88# (i.e. the OLS line passes over most low income households) 89 90x = np.arange(data.income.min(), data.income.max(), 50) 91get_y = lambda a, b: a + b * x 92 93fig, ax = plt.subplots(figsize=(8, 6)) 94 95for i in range(models.shape[0]): 96 y = get_y(models.a[i], models.b[i]) 97 ax.plot(x, y, linestyle="dotted", color="grey") 98 99y = get_y(ols["a"], ols["b"]) 100 101ax.plot(x, y, color="red", label="OLS") 102ax.scatter(data.income, data.foodexp, alpha=0.2) 103ax.set_xlim((240, 3000)) 104ax.set_ylim((240, 2000)) 105legend = ax.legend() 106ax.set_xlabel("Income", fontsize=16) 107ax.set_ylabel("Food expenditure", fontsize=16) 108 109# ### Second plot 110# 111# The dotted black lines form 95% point-wise confidence band around 10 112# quantile regression estimates (solid black line). The red lines represent 113# OLS regression results along with their 95% confidence interval. 114# 115# In most cases, the quantile regression point estimates lie outside the 116# OLS confidence interval, which suggests that the effect of income on food 117# expenditure may not be constant across the distribution. 118 119n = models.shape[0] 120p1 = plt.plot(models.q, models.b, color="black", label="Quantile Reg.") 121p2 = plt.plot(models.q, models.ub, linestyle="dotted", color="black") 122p3 = plt.plot(models.q, models.lb, linestyle="dotted", color="black") 123p4 = plt.plot(models.q, [ols["b"]] * n, color="red", label="OLS") 124p5 = plt.plot(models.q, [ols["lb"]] * n, linestyle="dotted", color="red") 125p6 = plt.plot(models.q, [ols["ub"]] * n, linestyle="dotted", color="red") 126plt.ylabel(r"$\beta_{income}$") 127plt.xlabel("Quantiles of the conditional food expenditure distribution") 128plt.legend() 129plt.show() 130