1#!/usr/local/bin/python3.8
2# coding: utf-8
3
4# DO NOT EDIT
5# Autogenerated from the notebook kernel_density.ipynb.
6# Edit the notebook and then sync the output with this file.
7#
8# flake8: noqa
9# DO NOT EDIT
10
11# # Kernel Density Estimation
12#
13# Kernel density estimation is the process of estimating an unknown
14# probability density function using a *kernel function* $K(u)$. While a
15# histogram counts the number of data points in somewhat arbitrary regions,
16# a kernel density estimate is a function defined as the sum of a kernel
17# function on every data point. The kernel function typically exhibits the
18# following properties:
19#
20# 1. Symmetry such that $K(u) = K(-u)$.
21# 2. Normalization such that $\int_{-\infty}^{\infty} K(u) \ du = 1$.
22# 3. Monotonically decreasing such that $K'(u) < 0$ when $u > 0$.
23# 4. Expected value equal to zero such that $\mathrm{E}[K] = 0$.
24#
25# For more information about kernel density estimation, see for instance
26# [Wikipedia - Kernel density
27# estimation](https://en.wikipedia.org/wiki/Kernel_density_estimation).
28#
29# A univariate kernel density estimator is implemented in
30# `sm.nonparametric.KDEUnivariate`.
31# In this example we will show the following:
32#
33# * Basic usage, how to fit the estimator.
34# * The effect of varying the bandwidth of the kernel using the `bw`
35# argument.
36# * The various kernel functions available using the `kernel` argument.
37
38import numpy as np
39from scipy import stats
40import statsmodels.api as sm
41import matplotlib.pyplot as plt
42from statsmodels.distributions.mixture_rvs import mixture_rvs
43
44# ## A univariate example
45
# Seed NumPy's global RNG so every run of this example produces the same data
np.random.seed(12345)
48
49# We create a bimodal distribution: a mixture of two normal distributions
50# with locations at `-1` and `1`.
51
52# Location, scale and weight for the two distributions
dist1_loc, dist1_scale, weight1 = -1, 0.5, 0.25
dist2_loc, dist2_scale, weight2 = 1, 0.5, 0.75

# Draw 250 observations from the weighted two-component normal mixture
component_kwargs = (
    {"loc": dist1_loc, "scale": dist1_scale},
    {"loc": dist2_loc, "scale": dist2_scale},
)
obs_dist = mixture_rvs(
    prob=[weight1, weight2],
    size=250,
    dist=[stats.norm, stats.norm],
    kwargs=component_kwargs,
)
66
67# The simplest non-parametric technique for density estimation is the
68# histogram.
69
fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(111)

# Show each observation as an "x" with random vertical jitter, so the
# individual samples are visible above the histogram bars
sample_style = dict(zorder=15, color="red", marker="x", alpha=0.5, label="Samples")
ax.scatter(obs_dist, np.abs(np.random.randn(obs_dist.size)), **sample_style)

# Raw-count histogram of the same observations
lines = ax.hist(obs_dist, bins=20, edgecolor="k", label="Histogram")

ax.legend(loc="best")
ax.grid(True, zorder=-5)
87
88# ## Fitting with the default arguments
89
90# The histogram above is discontinuous. To compute a continuous
91# probability density function,
92# we can use kernel density estimation.
93#
94# We initialize a univariate kernel density estimator using
95# `KDEUnivariate`.
96
# Build the univariate estimator from the samples and fit it with defaults
kde = sm.nonparametric.KDEUnivariate(obs_dist)
kde.fit()
99
100# We present a figure of the fit, as well as the true distribution.
101
fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(111)

# Normalized histogram of the samples, comparable to a density
ax.hist(
    obs_dist,
    bins=20,
    density=True,
    label="Histogram from samples",
    zorder=5,
    edgecolor="k",
    alpha=0.5,
)

# KDE fitted with the default arguments
ax.plot(kde.support, kde.density, lw=3, label="KDE from samples", zorder=10)

# The mixture density we actually sampled from, evaluated on the KDE grid
component1 = stats.norm.pdf(kde.support, loc=dist1_loc, scale=dist1_scale)
component2 = stats.norm.pdf(kde.support, loc=dist2_loc, scale=dist2_scale)
true_values = component1 * weight1 + component2 * weight2
ax.plot(kde.support, true_values, lw=3, label="True distribution", zorder=15)

# Sample markers, jittered vertically so they do not overlap
ax.scatter(
    obs_dist,
    np.abs(np.random.randn(obs_dist.size)) / 40,
    marker="x",
    color="red",
    zorder=20,
    label="Samples",
    alpha=0.5,
)

ax.legend(loc="best")
ax.grid(True, zorder=-5)
138
139# In the code above, default arguments were used. We can also vary the
140# bandwidth of the kernel, as we will now see.
141
142# ## Varying the bandwidth using the `bw` argument
143
144# The bandwidth of the kernel can be adjusted using the `bw` argument.
145# In the following example, a bandwidth of `bw=0.2` seems to fit the data
146# well.
147
fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(111)

# Normalized histogram of the samples
ax.hist(
    obs_dist,
    bins=25,
    label="Histogram from samples",
    zorder=5,
    edgecolor="k",
    density=True,
    alpha=0.5,
)

# Refit the KDE once per candidate bandwidth and overlay each estimate
for bw in (0.1, 0.2, 0.4):
    kde.fit(bw=bw)
    ax.plot(
        kde.support,
        kde.density,
        "--",
        lw=2,
        color="k",
        zorder=10,
        label="KDE from samples, bw = {}".format(round(bw, 2)),
    )

# The true mixture density for reference
ax.plot(kde.support, true_values, lw=3, label="True distribution", zorder=15)

# Sample markers, jittered vertically
ax.scatter(
    obs_dist,
    np.abs(np.random.randn(obs_dist.size)) / 50,
    marker="x",
    color="red",
    zorder=20,
    label="Data samples",
    alpha=0.5,
)

ax.legend(loc="best")
ax.set_xlim([-3, 3])
ax.grid(True, zorder=-5)
192
193# ## Comparing kernel functions
194
195# In the example above, a Gaussian kernel was used. Several other kernels
196# are also available.
197
from statsmodels.nonparametric.kde import kernel_switch

# The names of all registered kernels (displayed as notebook output)
list(kernel_switch)
201
202# ### The available kernel functions
203
204# Create a figure
fig = plt.figure(figsize=(12, 5))

# Draw every registered kernel function on its own subplot
for idx, (name, klass) in enumerate(kernel_switch.items(), start=1):
    ker = klass()

    # Kernels with unbounded support report no domain; fall back to [-3, 3]
    support = ker.domain or [-3, 3]
    grid = np.linspace(*support, num=2**10)
    weights = ker(grid)

    ax = fig.add_subplot(3, 3, idx)
    ax.set_title('Kernel function "{}"'.format(name))
    ax.plot(grid, weights, lw=3, label="{}".format(name))
    ax.scatter([0], [0], marker="x", color="red")
    plt.grid(True, zorder=-5)
    ax.set_xlim(support)

plt.tight_layout()
227
228# ### The available kernel functions on three data points
229
230# We now examine how the kernel density estimate will fit to three equally
231# spaced data points.
232
233# Create three equidistant points
data = np.linspace(-1, 1, 3)
kde = sm.nonparametric.KDEUnivariate(data)

# Create a figure
fig = plt.figure(figsize=(12, 5))

# Fit the estimator once per kernel and plot each resulting density
for idx, name in enumerate(kernel_switch, start=1):
    ax = fig.add_subplot(3, 3, idx)
    ax.set_title('Kernel function "{}"'.format(name))

    # fft=False so non-Gaussian kernels can be used — presumably the FFT
    # path is Gaussian-only; see the statsmodels KDEUnivariate.fit docs
    kde.fit(kernel=name, fft=False, gridsize=2**10)

    ax.plot(kde.support,
            kde.density,
            lw=3,
            label="KDE from samples",
            zorder=10)
    ax.scatter(data, np.zeros_like(data), marker="x", color="red")
    plt.grid(True, zorder=-5)
    ax.set_xlim([-3, 3])

plt.tight_layout()
261
262# ## A more difficult case
263#
264# The fit is not always perfect. See the example below for a harder case.
265
# A normal/beta mixture: a harder target for the default KDE settings
obs_dist = mixture_rvs(
    [0.25, 0.75],
    size=250,
    dist=[stats.norm, stats.beta],
    kwargs=({"loc": -1, "scale": 0.5}, {"loc": 1, "scale": 1, "args": (1, 0.5)}),
)

kde = sm.nonparametric.KDEUnivariate(obs_dist)
kde.fit()

fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(111)
ax.hist(obs_dist, bins=20, density=True, edgecolor="k", zorder=4, alpha=0.5)
ax.plot(kde.support, kde.density, lw=3, zorder=7)

# Sample markers, jittered vertically
ax.scatter(
    obs_dist,
    np.abs(np.random.randn(obs_dist.size)) / 50,
    marker="x",
    color="red",
    zorder=20,
    label="Data samples",
    alpha=0.5,
)
ax.grid(True, zorder=-5)
291
292# ## The KDE is a distribution
293#
294# Since the KDE is a distribution, we can access attributes and methods
295# such as:
296#
297# - `entropy`
298# - `evaluate`
299# - `cdf`
300# - `icdf`
301# - `sf`
302# - `cumhazard`
303
# A larger sample from a two-component normal mixture
obs_dist = mixture_rvs(
    [0.25, 0.75],
    size=1000,
    dist=[stats.norm, stats.norm],
    kwargs=({"loc": -1, "scale": 0.5}, {"loc": 1, "scale": 0.5}),
)
kde = sm.nonparametric.KDEUnivariate(obs_dist)
kde.fit(gridsize=2**10)

# Entropy of the fitted density (displayed as notebook output)
kde.entropy

# The density estimate evaluated at the point -1 (displayed as output)
kde.evaluate(-1)
316
317# ### Cumulative distribution, its inverse, and the survival function
318
fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(111)

# CDF on the estimation grid
ax.plot(kde.support, kde.cdf, lw=3, label="CDF")

# The inverse CDF is tabulated on a uniform [0, 1] grid of matching size
probs = np.linspace(0, 1, num=kde.icdf.size)
ax.plot(probs, kde.icdf, lw=3, label="Inverse CDF")

# Survival function on the same grid
ax.plot(kde.support, kde.sf, lw=3, label="Survival function")
ax.legend(loc="best")
ax.grid(True, zorder=-5)
330
331# ### The Cumulative Hazard Function
332
fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(111)

# Cumulative hazard on the estimation grid
ax.plot(kde.support, kde.cumhazard, lw=3, label="Cumulative Hazard Function")
ax.legend(loc="best")
ax.grid(True, zorder=-5)
338