1'''subclassing kde
2
3Author: josef pktd
4'''
5
6import numpy as np
7from numpy.testing import assert_almost_equal, assert_
8import scipy
9from scipy import stats
10import matplotlib.pylab as plt
11
12
13class gaussian_kde_set_covariance(stats.gaussian_kde):
14    '''
15    from Anne Archibald in mailinglist:
16    http://www.nabble.com/Width-of-the-gaussian-in-stats.kde.gaussian_kde---td19558924.html#a19558924
17    '''
18    def __init__(self, dataset, covariance):
19        self.covariance = covariance
20        scipy.stats.gaussian_kde.__init__(self, dataset)
21
22    def _compute_covariance(self):
23        self.inv_cov = np.linalg.inv(self.covariance)
24        self._norm_factor = np.sqrt(np.linalg.det(2*np.pi*self.covariance)) * self.n
25
26
class gaussian_kde_covfact(stats.gaussian_kde):
    '''Gaussian KDE with a selectable bandwidth (covariance) factor.

    Parameters
    ----------
    dataset : array_like
        Sample the density is estimated from.
    covfact : {'scotts', 'sc', 'silverman', 'si'} or number-like
        Bandwidth choice: Scott's rule, Silverman's rule, or an explicit
        numeric factor (anything accepted by ``float``).
    '''
    def __init__(self, dataset, covfact = 'scotts'):
        # Must be set before the base __init__, which calls
        # covariance_factor via _compute_covariance.
        self.covfact = covfact
        super(gaussian_kde_covfact, self).__init__(dataset)

    def _compute_covariance_(self):
        '''not used'''
        # NOTE(review): trailing underscore means this never overrides the
        # base class method; kept for reference only.
        self.inv_cov = np.linalg.inv(self.covariance)
        self._norm_factor = np.sqrt(np.linalg.det(2*np.pi*self.covariance)) * self.n

    def covariance_factor(self):
        '''Return the bandwidth factor selected by ``self.covfact``.'''
        choice = self.covfact
        if choice in ('sc', 'scotts'):
            return self.scotts_factor()
        elif choice in ('si', 'silverman'):
            return self.silverman_factor()
        elif choice:
            # anything truthy that is not a known keyword is treated as a
            # numeric factor
            return float(choice)
        else:
            raise ValueError('covariance factor has to be scotts, silverman or a number')

    def reset_covfact(self, covfact):
        '''Switch to a new bandwidth choice and recompute the covariance.'''
        self.covfact = covfact
        # validates the new choice (raises ValueError on bad input) before
        # the covariance is rebuilt
        self.covariance_factor()
        self._compute_covariance()
51
def plotkde(covfact):
    '''Plot the KDE for bandwidth choice ``covfact`` against the true density.

    Relies on the module-level globals set up in the ``__main__`` section:
    ``gkde``, ``ind``, ``xn``, ``alpha``, ``mlow`` and ``mhigh``.

    Parameters
    ----------
    covfact : str or number
        Bandwidth choice forwarded to ``gkde.reset_covfact``.
    '''
    gkde.reset_covfact(covfact)
    kdepdf = gkde.evaluate(ind)
    plt.figure()
    # plot histogram of sample; ``density=True`` replaces the ``normed``
    # keyword, which was removed in matplotlib 3.1
    plt.hist(xn, bins=20, density=True)
    # plot estimated density
    plt.plot(ind, kdepdf, label='kde', color="g")
    # plot data generating density
    plt.plot(ind, alpha * stats.norm.pdf(ind, loc=mlow) +
                  (1-alpha) * stats.norm.pdf(ind, loc=mhigh),
                  color="r", label='DGP: normal mix')
    plt.title('Kernel Density Estimation - ' + str(gkde.covfact))
    plt.legend()
66
67
def test_kde_1d():
    '''Accuracy checks for gaussian_kde on a standard-normal sample.'''
    np.random.seed(8765678)
    n_basesample = 500
    sample = np.random.randn(n_basesample)
    sample_mean = sample.mean()
    sample_std = sample.std(ddof=1)
    print(sample_mean, sample_std)

    # fit the kde to the sample
    kde = stats.gaussian_kde(sample)

    # evaluate the kde and the fitted normal density on a grid
    grid = np.linspace(-7, 7, 501)
    kde_vals = kde.evaluate(grid)
    norm_vals = stats.norm.pdf(grid, loc=sample_mean, scale=sample_std)
    print('MSE', np.sum((kde_vals - norm_vals)**2))
    print('axabserror', np.max(np.abs(kde_vals - norm_vals)))
    step = grid[1] - grid[0]
    # the integrated squared error should be small
    assert_(np.sum((kde_vals - norm_vals)**2) * step < 0.01)

    print(kde.integrate_gaussian(0.0, 1.0))
    print(kde.integrate_box_1d(-np.inf, 0.0))
    print(kde.integrate_box_1d(0.0, np.inf))
    print(kde.integrate_box_1d(-np.inf, sample_mean))
    print(kde.integrate_box_1d(sample_mean, np.inf))

    # roughly half the probability mass lies on each side of the mean
    assert_almost_equal(kde.integrate_box_1d(sample_mean, np.inf), 0.5, decimal=1)
    assert_almost_equal(kde.integrate_box_1d(-np.inf, sample_mean), 0.5, decimal=1)
    assert_almost_equal(kde.integrate_box(sample_mean, np.inf), 0.5, decimal=1)
    assert_almost_equal(kde.integrate_box(-np.inf, sample_mean), 0.5, decimal=1)

    # analytic integrals should match Riemann sums over the grid
    assert_almost_equal(kde.integrate_kde(kde),
                        (kde_vals**2).sum() * step, decimal=2)
    assert_almost_equal(kde.integrate_gaussian(sample_mean, sample_std**2),
                        (kde_vals * norm_vals).sum() * step, decimal=2)
105
106
107
108
if __name__ == '__main__':
    # generate a sample from a two-component gaussian mixture
    n_basesample = 1000
    np.random.seed(8765678)
    alpha = 0.6 #weight for (prob of) lower distribution
    mlow, mhigh = (-3,3)  #mean locations for gaussian mixture
    # sample sizes must be integers: the original passed float products
    # (600.0, 400.0), which recent numpy rejects with a TypeError
    n_low = int(alpha * n_basesample)
    xn = np.concatenate([mlow + np.random.randn(n_low),
                         mhigh + np.random.randn(n_basesample - n_low)])

    # get kde for original sample, fixed bandwidth factor 0.1
    #gkde = stats.gaussian_kde(xn)
    gkde = gaussian_kde_covfact(xn, 0.1)
    # evaluate the density function for the kde for some points
    ind = np.linspace(-7,7,101)
    kdepdf = gkde.evaluate(ind)

    plt.figure()
    # plot histogram of sample; ``density=True`` replaces the ``normed``
    # keyword, which was removed in matplotlib 3.1
    plt.hist(xn, bins=20, density=True)
    # plot estimated density
    plt.plot(ind, kdepdf, label='kde', color="g")
    # plot data generating density
    plt.plot(ind, alpha * stats.norm.pdf(ind, loc=mlow) +
                  (1-alpha) * stats.norm.pdf(ind, loc=mhigh),
                  color="r", label='DGP: normal mix')
    plt.title('Kernel Density Estimation')
    plt.legend()

    # repeat with Scott's rule of thumb for the bandwidth
    gkde = gaussian_kde_covfact(xn, 'scotts')
    kdepdf = gkde.evaluate(ind)
    plt.figure()
    # plot histogram of sample (see ``density`` note above)
    plt.hist(xn, bins=20, density=True)
    # plot estimated density
    plt.plot(ind, kdepdf, label='kde', color="g")
    # plot data generating density
    plt.plot(ind, alpha * stats.norm.pdf(ind, loc=mlow) +
                  (1-alpha) * stats.norm.pdf(ind, loc=mhigh),
                  color="r", label='DGP: normal mix')
    plt.title('Kernel Density Estimation')
    plt.legend()
    #plt.show()

    # compare several bandwidth choices in separate figures
    for cv in ['scotts', 'silverman', 0.05, 0.1, 0.5]:
        plotkde(cv)

    test_kde_1d()

    # kde of a plain standard-normal sample, left around for interactive use
    np.random.seed(8765678)
    n_basesample = 1000
    xn = np.random.randn(n_basesample)
    xnmean = xn.mean()
    xnstd = xn.std(ddof=1)

    # get kde for original sample
    gkde = stats.gaussian_kde(xn)
165