1# Natural Language Toolkit: Expectation Maximization Clusterer
2#
3# Copyright (C) 2001-2019 NLTK Project
4# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
5# URL: <http://nltk.org/>
6# For license information, see LICENSE.TXT
7from __future__ import print_function, unicode_literals
8
9try:
10    import numpy
11except ImportError:
12    pass
13
14from nltk.compat import python_2_unicode_compatible
15from nltk.cluster.util import VectorSpaceClusterer
16
17
18@python_2_unicode_compatible
19class EMClusterer(VectorSpaceClusterer):
20    """
21    The Gaussian EM clusterer models the vectors as being produced by
22    a mixture of k Gaussian sources. The parameters of these sources
23    (prior probability, mean and covariance matrix) are then found to
24    maximise the likelihood of the given data. This is done with the
25    expectation maximisation algorithm. It starts with k arbitrarily
26    chosen means, priors and covariance matrices. It then calculates
27    the membership probabilities for each vector in each of the
28    clusters; this is the 'E' step. The cluster parameters are then
29    updated in the 'M' step using the maximum likelihood estimate from
30    the cluster membership probabilities. This process continues until
31    the likelihood of the data does not significantly increase.
32    """
33
34    def __init__(
35        self,
36        initial_means,
37        priors=None,
38        covariance_matrices=None,
39        conv_threshold=1e-6,
40        bias=0.1,
41        normalise=False,
42        svd_dimensions=None,
43    ):
44        """
45        Creates an EM clusterer with the given starting parameters,
46        convergence threshold and vector mangling parameters.
47
48        :param  initial_means: the means of the gaussian cluster centers
49        :type   initial_means: [seq of] numpy array or seq of SparseArray
50        :param  priors: the prior probability for each cluster
51        :type   priors: numpy array or seq of float
52        :param  covariance_matrices: the covariance matrix for each cluster
53        :type   covariance_matrices: [seq of] numpy array
54        :param  conv_threshold: maximum change in likelihood before deemed
55                    convergent
56        :type   conv_threshold: int or float
57        :param  bias: variance bias used to ensure non-singular covariance
58                      matrices
59        :type   bias: float
60        :param  normalise:  should vectors be normalised to length 1
61        :type   normalise:  boolean
62        :param  svd_dimensions: number of dimensions to use in reducing vector
63                               dimensionsionality with SVD
64        :type   svd_dimensions: int
65        """
66        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
67        self._means = numpy.array(initial_means, numpy.float64)
68        self._num_clusters = len(initial_means)
69        self._conv_threshold = conv_threshold
70        self._covariance_matrices = covariance_matrices
71        self._priors = priors
72        self._bias = bias
73
74    def num_clusters(self):
75        return self._num_clusters
76
77    def cluster_vectorspace(self, vectors, trace=False):
78        assert len(vectors) > 0
79
80        # set the parameters to initial values
81        dimensions = len(vectors[0])
82        means = self._means
83        priors = self._priors
84        if not priors:
85            priors = self._priors = (
86                numpy.ones(self._num_clusters, numpy.float64) / self._num_clusters
87            )
88        covariances = self._covariance_matrices
89        if not covariances:
90            covariances = self._covariance_matrices = [
91                numpy.identity(dimensions, numpy.float64)
92                for i in range(self._num_clusters)
93            ]
94
95        # do the E and M steps until the likelihood plateaus
96        lastl = self._loglikelihood(vectors, priors, means, covariances)
97        converged = False
98
99        while not converged:
100            if trace:
101                print('iteration; loglikelihood', lastl)
102            # E-step, calculate hidden variables, h[i,j]
103            h = numpy.zeros((len(vectors), self._num_clusters), numpy.float64)
104            for i in range(len(vectors)):
105                for j in range(self._num_clusters):
106                    h[i, j] = priors[j] * self._gaussian(
107                        means[j], covariances[j], vectors[i]
108                    )
109                h[i, :] /= sum(h[i, :])
110
111            # M-step, update parameters - cvm, p, mean
112            for j in range(self._num_clusters):
113                covariance_before = covariances[j]
114                new_covariance = numpy.zeros((dimensions, dimensions), numpy.float64)
115                new_mean = numpy.zeros(dimensions, numpy.float64)
116                sum_hj = 0.0
117                for i in range(len(vectors)):
118                    delta = vectors[i] - means[j]
119                    new_covariance += h[i, j] * numpy.multiply.outer(delta, delta)
120                    sum_hj += h[i, j]
121                    new_mean += h[i, j] * vectors[i]
122                covariances[j] = new_covariance / sum_hj
123                means[j] = new_mean / sum_hj
124                priors[j] = sum_hj / len(vectors)
125
126                # bias term to stop covariance matrix being singular
127                covariances[j] += self._bias * numpy.identity(dimensions, numpy.float64)
128
129            # calculate likelihood - FIXME: may be broken
130            l = self._loglikelihood(vectors, priors, means, covariances)
131
132            # check for convergence
133            if abs(lastl - l) < self._conv_threshold:
134                converged = True
135            lastl = l
136
137    def classify_vectorspace(self, vector):
138        best = None
139        for j in range(self._num_clusters):
140            p = self._priors[j] * self._gaussian(
141                self._means[j], self._covariance_matrices[j], vector
142            )
143            if not best or p > best[0]:
144                best = (p, j)
145        return best[1]
146
147    def likelihood_vectorspace(self, vector, cluster):
148        cid = self.cluster_names().index(cluster)
149        return self._priors[cluster] * self._gaussian(
150            self._means[cluster], self._covariance_matrices[cluster], vector
151        )
152
153    def _gaussian(self, mean, cvm, x):
154        m = len(mean)
155        assert cvm.shape == (m, m), 'bad sized covariance matrix, %s' % str(cvm.shape)
156        try:
157            det = numpy.linalg.det(cvm)
158            inv = numpy.linalg.inv(cvm)
159            a = det ** -0.5 * (2 * numpy.pi) ** (-m / 2.0)
160            dx = x - mean
161            print(dx, inv)
162            b = -0.5 * numpy.dot(numpy.dot(dx, inv), dx)
163            return a * numpy.exp(b)
164        except OverflowError:
165            # happens when the exponent is negative infinity - i.e. b = 0
166            # i.e. the inverse of cvm is huge (cvm is almost zero)
167            return 0
168
169    def _loglikelihood(self, vectors, priors, means, covariances):
170        llh = 0.0
171        for vector in vectors:
172            p = 0
173            for j in range(len(priors)):
174                p += priors[j] * self._gaussian(means[j], covariances[j], vector)
175            llh += numpy.log(p)
176        return llh
177
178    def __repr__(self):
179        return '<EMClusterer means=%s>' % list(self._means)
180
181
182def demo():
183    """
184    Non-interactive demonstration of the clusterers with simple 2-D data.
185    """
186
187    from nltk import cluster
188
189    # example from figure 14.10, page 519, Manning and Schutze
190
191    vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]]
192    means = [[4, 2], [4, 2.01]]
193
194    clusterer = cluster.EMClusterer(means, bias=0.1)
195    clusters = clusterer.cluster(vectors, True, trace=True)
196
197    print('Clustered:', vectors)
198    print('As:       ', clusters)
199    print()
200
201    for c in range(2):
202        print('Cluster:', c)
203        print('Prior:  ', clusterer._priors[c])
204        print('Mean:   ', clusterer._means[c])
205        print('Covar:  ', clusterer._covariance_matrices[c])
206        print()
207
208    # classify a new vector
209    vector = numpy.array([2, 2])
210    print('classify(%s):' % vector, end=' ')
211    print(clusterer.classify(vector))
212
213    # show the classification probabilities
214    vector = numpy.array([2, 2])
215    print('classification_probdist(%s):' % vector)
216    pdist = clusterer.classification_probdist(vector)
217    for sample in pdist.samples():
218        print('%s => %.0f%%' % (sample, pdist.prob(sample) * 100))
219
220
221#
222#     The following demo code is broken.
223#
224#     # use a set of tokens with 2D indices
225#     vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
226
227#     # test the EM clusterer with means given by k-means (2) and
228#     # dimensionality reduction
229#     clusterer = cluster.KMeans(2, euclidean_distance, svd_dimensions=1)
230#     print 'Clusterer:', clusterer
231#     clusters = clusterer.cluster(vectors)
232#     means = clusterer.means()
233#     print 'Means:', clusterer.means()
234#     print
235
236#     clusterer = cluster.EMClusterer(means, svd_dimensions=1)
237#     clusters = clusterer.cluster(vectors, True)
238#     print 'Clusterer:', clusterer
239#     print 'Clustered:', str(vectors)[:60], '...'
240#     print 'As:', str(clusters)[:60], '...'
241#     print
242
243#     # classify a new vector
244#     vector = numpy.array([3, 3])
245#     print 'classify(%s):' % vector,
246#     print clusterer.classify(vector)
247#     print
248
249#     # show the classification probabilities
250#     vector = numpy.array([2.2, 2])
251#     print 'classification_probdist(%s)' % vector
252#     pdist = clusterer.classification_probdist(vector)
253#     for sample in pdist:
254#         print '%s => %.0f%%' % (sample, pdist.prob(sample) *100)
255
256if __name__ == '__main__':
257    demo()
258