#!/usr/local/bin/python3.8
# coding: utf-8

# DO NOT EDIT
# Autogenerated from the notebook kernel_density.ipynb.
# Edit the notebook and then sync the output with this file.
#
# flake8: noqa
# DO NOT EDIT

# # Kernel Density Estimation
#
# Kernel density estimation is the process of estimating an unknown
# probability density function using a *kernel function* $K(u)$. While a
# histogram counts the number of data points in somewhat arbitrary regions,
# a kernel density estimate is a function defined as the sum of a kernel
# function on every data point. The kernel function typically exhibits the
# following properties:
#
# 1. Symmetry such that $K(u) = K(-u)$.
# 2. Normalization such that $\int_{-\infty}^{\infty} K(u) \ du = 1$.
# 3. Monotonically decreasing such that $K'(u) < 0$ when $u > 0$.
# 4. Expected value equal to zero such that $\mathrm{E}[K] = 0$.
#
# For more information about kernel density estimation, see for instance
# [Wikipedia - Kernel density
# estimation](https://en.wikipedia.org/wiki/Kernel_density_estimation).
#
# A univariate kernel density estimator is implemented in
# `sm.nonparametric.KDEUnivariate`.
# In this example we will show the following:
#
# * Basic usage, how to fit the estimator.
# * The effect of varying the bandwidth of the kernel using the `bw`
#   argument.
# * The various kernel functions available using the `kernel` argument.

import numpy as np
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.distributions.mixture_rvs import mixture_rvs

# ## A univariate example

np.random.seed(12345)  # Seed the random number generator for reproducible results

# We create a bimodal distribution: a mixture of two normal distributions
# with locations at `-1` and `1`.

# Location, scale and weight for the two distributions
dist1_loc, dist1_scale, weight1 = -1, 0.5, 0.25
dist2_loc, dist2_scale, weight2 = 1, 0.5, 0.75

# Sample from a mixture of distributions
obs_dist = mixture_rvs(
    prob=[weight1, weight2],
    size=250,
    dist=[stats.norm, stats.norm],
    kwargs=(
        dict(loc=dist1_loc, scale=dist1_scale),
        dict(loc=dist2_loc, scale=dist2_scale),
    ),
)

# The simplest non-parametric technique for density estimation is the
# histogram.

fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(111)

# Scatter plot of data samples and histogram
ax.scatter(
    obs_dist,
    np.abs(np.random.randn(obs_dist.size)),
    zorder=15,
    color="red",
    marker="x",
    alpha=0.5,
    label="Samples",
)
lines = ax.hist(obs_dist, bins=20, edgecolor="k", label="Histogram")

ax.legend(loc="best")
ax.grid(True, zorder=-5)

# ## Fitting with the default arguments

# The histogram above is discontinuous. To compute a continuous
# probability density function, we can use kernel density estimation.
#
# We initialize a univariate kernel density estimator using
# `KDEUnivariate`.

kde = sm.nonparametric.KDEUnivariate(obs_dist)
kde.fit()  # Estimate the densities

# We present a figure of the fit, as well as the true distribution.
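# As a quick sanity check before plotting (a minimal sketch, assuming that
# `fit` stores the selected bandwidth in `kde.bw` alongside the estimate on
# the `kde.support` and `kde.density` grids), we can print the bandwidth
# chosen by the default rule and confirm that the estimated density
# integrates to roughly one.

print("Selected bandwidth:", kde.bw)
print("Integral of the estimated density:", np.trapz(kde.density, kde.support))

# The true density of the data is the weighted sum of the two component
# normal densities (weights $0.25$ and $0.75$), which `true_values` below
# evaluates on the KDE support grid.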

fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(111)

# Plot the histogram
ax.hist(
    obs_dist,
    bins=20,
    density=True,
    label="Histogram from samples",
    zorder=5,
    edgecolor="k",
    alpha=0.5,
)

# Plot the KDE as fitted using the default arguments
ax.plot(kde.support, kde.density, lw=3, label="KDE from samples", zorder=10)

# Plot the true distribution
true_values = (
    stats.norm.pdf(loc=dist1_loc, scale=dist1_scale, x=kde.support) * weight1 +
    stats.norm.pdf(loc=dist2_loc, scale=dist2_scale, x=kde.support) * weight2)
ax.plot(kde.support, true_values, lw=3, label="True distribution", zorder=15)

# Plot the samples
ax.scatter(
    obs_dist,
    np.abs(np.random.randn(obs_dist.size)) / 40,
    marker="x",
    color="red",
    zorder=20,
    label="Samples",
    alpha=0.5,
)

ax.legend(loc="best")
ax.grid(True, zorder=-5)

# In the code above, default arguments were used. We can also vary the
# bandwidth of the kernel, as we will now see.

# ## Varying the bandwidth using the `bw` argument

# The bandwidth of the kernel can be adjusted using the `bw` argument.
# In the following example, a bandwidth of `bw=0.2` seems to fit the data
# well.

fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(111)

# Plot the histogram
ax.hist(
    obs_dist,
    bins=25,
    label="Histogram from samples",
    zorder=5,
    edgecolor="k",
    density=True,
    alpha=0.5,
)

# Plot the KDE for various bandwidths
for bandwidth in [0.1, 0.2, 0.4]:
    kde.fit(bw=bandwidth)  # Estimate the densities
    ax.plot(
        kde.support,
        kde.density,
        "--",
        lw=2,
        color="k",
        zorder=10,
        label="KDE from samples, bw = {}".format(round(bandwidth, 2)),
    )

# Plot the true distribution
ax.plot(kde.support, true_values, lw=3, label="True distribution", zorder=15)

# Plot the samples
ax.scatter(
    obs_dist,
    np.abs(np.random.randn(obs_dist.size)) / 50,
    marker="x",
    color="red",
    zorder=20,
    label="Data samples",
    alpha=0.5,
)

ax.legend(loc="best")
ax.set_xlim([-3, 3])
ax.grid(True, zorder=-5)

# ## Comparing kernel functions

# In the example above, a Gaussian kernel was used. Several other kernels
# are also available.

from statsmodels.nonparametric.kde import kernel_switch

list(kernel_switch.keys())

# ### The available kernel functions

# Create a figure
fig = plt.figure(figsize=(12, 5))

# Enumerate every option for the kernel
for i, (ker_name, ker_class) in enumerate(kernel_switch.items()):

    # Initialize the kernel object
    kernel = ker_class()

    # Sample from the domain
    domain = kernel.domain or [-3, 3]
    x_vals = np.linspace(*domain, num=2**10)
    y_vals = kernel(x_vals)

    # Create a subplot, set the title
    ax = fig.add_subplot(3, 3, i + 1)
    ax.set_title('Kernel function "{}"'.format(ker_name))
    ax.plot(x_vals, y_vals, lw=3, label="{}".format(ker_name))
    ax.scatter([0], [0], marker="x", color="red")
    plt.grid(True, zorder=-5)
    ax.set_xlim(domain)

plt.tight_layout()

# ### The available kernel functions on three data points

# We now examine how the kernel density estimate will fit to three equally
# spaced data points.
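# Since the estimate is a sum of scaled kernel functions,
# $\hat{f}(x) = \frac{1}{nh} \sum_{i=1}^{n} K\left(\frac{x - x_i}{h}\right)$
# for bandwidth $h$, fitting on only three points places one kernel bump on
# each point, which makes the shape of each kernel function directly visible
# in the resulting estimate.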

# Create three equidistant points
data = np.linspace(-1, 1, 3)
kde = sm.nonparametric.KDEUnivariate(data)

# Create a figure
fig = plt.figure(figsize=(12, 5))

# Enumerate every option for the kernel
for i, kernel in enumerate(kernel_switch.keys()):

    # Create a subplot, set the title
    ax = fig.add_subplot(3, 3, i + 1)
    ax.set_title('Kernel function "{}"'.format(kernel))

    # Fit the model (estimate densities)
    kde.fit(kernel=kernel, fft=False, gridsize=2**10)

    # Create the plot
    ax.plot(kde.support,
            kde.density,
            lw=3,
            label="KDE from samples",
            zorder=10)
    ax.scatter(data, np.zeros_like(data), marker="x", color="red")
    plt.grid(True, zorder=-5)
    ax.set_xlim([-3, 3])

plt.tight_layout()

# ## A more difficult case
#
# The fit is not always perfect. See the example below for a harder case.

obs_dist = mixture_rvs(
    [0.25, 0.75],
    size=250,
    dist=[stats.norm, stats.beta],
    kwargs=(dict(loc=-1, scale=0.5), dict(loc=1, scale=1, args=(1, 0.5))),
)

kde = sm.nonparametric.KDEUnivariate(obs_dist)
kde.fit()

fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(111)
ax.hist(obs_dist, bins=20, density=True, edgecolor="k", zorder=4, alpha=0.5)
ax.plot(kde.support, kde.density, lw=3, zorder=7)
# Plot the samples
ax.scatter(
    obs_dist,
    np.abs(np.random.randn(obs_dist.size)) / 50,
    marker="x",
    color="red",
    zorder=20,
    label="Data samples",
    alpha=0.5,
)
ax.grid(True, zorder=-5)

# ## The KDE is a distribution
#
# Since the KDE is a distribution, we can access attributes and methods
# such as:
#
# - `entropy`
# - `evaluate`
# - `cdf`
# - `icdf`
# - `sf`
# - `cumhazard`

obs_dist = mixture_rvs(
    [0.25, 0.75],
    size=1000,
    dist=[stats.norm, stats.norm],
    kwargs=(dict(loc=-1, scale=0.5), dict(loc=1, scale=0.5)),
)
kde = sm.nonparametric.KDEUnivariate(obs_dist)
kde.fit(gridsize=2**10)

kde.entropy

kde.evaluate(-1)

# ### Cumulative distribution, its inverse, and the survival function

fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(111)

ax.plot(kde.support, kde.cdf, lw=3, label="CDF")
ax.plot(np.linspace(0, 1, num=kde.icdf.size),
        kde.icdf,
        lw=3,
        label="Inverse CDF")
ax.plot(kde.support, kde.sf, lw=3, label="Survival function")
ax.legend(loc="best")
ax.grid(True, zorder=-5)

# ### The cumulative hazard function

fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(111)
ax.plot(kde.support, kde.cumhazard, lw=3, label="Cumulative Hazard Function")
ax.legend(loc="best")
ax.grid(True, zorder=-5)
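
# A rough closing check (a sketch, assuming the `icdf` grid corresponds to
# equally spaced probabilities on [0, 1], as used in the inverse-CDF plot
# above): the midpoint of `icdf` gives an approximate median of the fitted
# distribution, and the survival function should be the complement of the CDF.

print("KDE median:   ", kde.icdf[kde.icdf.size // 2])
print("Sample median:", np.median(obs_dist))
print("max |sf - (1 - cdf)|:", np.max(np.abs(kde.sf - (1 - kde.cdf))))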