1 //! Kernel density estimation
2 
3 pub mod kernel;
4 
5 use self::kernel::Kernel;
6 use crate::stats::float::Float;
7 use crate::stats::univariate::Sample;
8 use rayon::prelude::*;
9 
10 /// Univariate kernel density estimator
11 pub struct Kde<'a, A, K>
12 where
13     A: Float,
14     K: Kernel<A>,
15 {
16     bandwidth: A,
17     kernel: K,
18     sample: &'a Sample<A>,
19 }
20 
21 impl<'a, A, K> Kde<'a, A, K>
22 where
23     A: 'a + Float,
24     K: Kernel<A>,
25 {
26     /// Creates a new kernel density estimator from the `sample`, using a kernel and estimating
27     /// the bandwidth using the method `bw`
new(sample: &'a Sample<A>, kernel: K, bw: Bandwidth) -> Kde<'a, A, K>28     pub fn new(sample: &'a Sample<A>, kernel: K, bw: Bandwidth) -> Kde<'a, A, K> {
29         Kde {
30             bandwidth: bw.estimate(sample),
31             kernel,
32             sample,
33         }
34     }
35 
36     /// Returns the bandwidth used by the estimator
bandwidth(&self) -> A37     pub fn bandwidth(&self) -> A {
38         self.bandwidth
39     }
40 
41     /// Maps the KDE over `xs`
42     ///
43     /// - Multihreaded
map(&self, xs: &[A]) -> Box<[A]>44     pub fn map(&self, xs: &[A]) -> Box<[A]> {
45         xs.par_iter()
46             .map(|&x| self.estimate(x))
47             .collect::<Vec<_>>()
48             .into_boxed_slice()
49     }
50 
51     /// Estimates the probability density of `x`
estimate(&self, x: A) -> A52     pub fn estimate(&self, x: A) -> A {
53         let _0 = A::cast(0);
54         let slice = self.sample;
55         let h = self.bandwidth;
56         let n = A::cast(slice.len());
57 
58         let sum = slice
59             .iter()
60             .fold(_0, |acc, &x_i| acc + self.kernel.evaluate((x - x_i) / h));
61 
62         sum / (h * n)
63     }
64 }
65 
66 /// Method to estimate the bandwidth
67 pub enum Bandwidth {
68     /// Use Silverman's rule of thumb to estimate the bandwidth from the sample
69     Silverman,
70 }
71 
72 impl Bandwidth {
estimate<A: Float>(self, sample: &Sample<A>) -> A73     fn estimate<A: Float>(self, sample: &Sample<A>) -> A {
74         match self {
75             Bandwidth::Silverman => {
76                 let factor = A::cast(4. / 3.);
77                 let exponent = A::cast(1. / 5.);
78                 let n = A::cast(sample.len());
79                 let sigma = sample.std_dev(None);
80 
81                 sigma * (factor / n).powf(exponent)
82             }
83         }
84     }
85 }
86 
87 #[cfg(test)]
88 macro_rules! test {
89     ($ty:ident) => {
90         mod $ty {
91             use approx::relative_eq;
92             use quickcheck::quickcheck;
93             use quickcheck::TestResult;
94 
95             use crate::stats::univariate::kde::kernel::Gaussian;
96             use crate::stats::univariate::kde::{Bandwidth, Kde};
97             use crate::stats::univariate::Sample;
98 
99             // The [-inf inf] integral of the estimated PDF should be one
100             quickcheck! {
101                 fn integral(size: usize, start: usize) -> TestResult {
102                     const DX: $ty = 1e-3;
103 
104                     if let Some(v) = crate::stats::test::vec::<$ty>(size, start) {
105                         let slice = &v[start..];
106                         let data = Sample::new(slice);
107                         let kde = Kde::new(data, Gaussian, Bandwidth::Silverman);
108                         let h = kde.bandwidth();
109                         // NB Obviously a [-inf inf] integral is not feasible, but this range works
110                         // quite well
111                         let (a, b) = (data.min() - 5. * h, data.max() + 5. * h);
112 
113                         let mut acc = 0.;
114                         let mut x = a;
115                         let mut y = kde.estimate(a);
116 
117                         while x < b {
118                             acc += DX * y / 2.;
119 
120                             x += DX;
121                             y = kde.estimate(x);
122 
123                             acc += DX * y / 2.;
124                         }
125 
126                         TestResult::from_bool(relative_eq!(acc, 1., epsilon = 2e-5))
127                     } else {
128                         TestResult::discard()
129                     }
130                 }
131             }
132         }
133     };
134 }
135 
136 #[cfg(test)]
137 mod test {
138     test!(f32);
139     test!(f64);
140 }
141