1# Natural Language Toolkit: Expectation Maximization Clusterer 2# 3# Copyright (C) 2001-2019 NLTK Project 4# Author: Trevor Cohn <tacohn@cs.mu.oz.au> 5# URL: <http://nltk.org/> 6# For license information, see LICENSE.TXT 7from __future__ import print_function, unicode_literals 8 9try: 10 import numpy 11except ImportError: 12 pass 13 14from nltk.compat import python_2_unicode_compatible 15from nltk.cluster.util import VectorSpaceClusterer 16 17 18@python_2_unicode_compatible 19class EMClusterer(VectorSpaceClusterer): 20 """ 21 The Gaussian EM clusterer models the vectors as being produced by 22 a mixture of k Gaussian sources. The parameters of these sources 23 (prior probability, mean and covariance matrix) are then found to 24 maximise the likelihood of the given data. This is done with the 25 expectation maximisation algorithm. It starts with k arbitrarily 26 chosen means, priors and covariance matrices. It then calculates 27 the membership probabilities for each vector in each of the 28 clusters; this is the 'E' step. The cluster parameters are then 29 updated in the 'M' step using the maximum likelihood estimate from 30 the cluster membership probabilities. This process continues until 31 the likelihood of the data does not significantly increase. 32 """ 33 34 def __init__( 35 self, 36 initial_means, 37 priors=None, 38 covariance_matrices=None, 39 conv_threshold=1e-6, 40 bias=0.1, 41 normalise=False, 42 svd_dimensions=None, 43 ): 44 """ 45 Creates an EM clusterer with the given starting parameters, 46 convergence threshold and vector mangling parameters. 47 48 :param initial_means: the means of the gaussian cluster centers 49 :type initial_means: [seq of] numpy array or seq of SparseArray 50 :param priors: the prior probability for each cluster 51 :type priors: numpy array or seq of float 52 :param covariance_matrices: the covariance matrix for each cluster 53 :type covariance_matrices: [seq of] numpy array 54 :param conv_threshold: maximum change in likelihood before deemed 55 convergent 56 :type conv_threshold: int or float 57 :param bias: variance bias used to ensure non-singular covariance 58 matrices 59 :type bias: float 60 :param normalise: should vectors be normalised to length 1 61 :type normalise: boolean 62 :param svd_dimensions: number of dimensions to use in reducing vector 63 dimensionsionality with SVD 64 :type svd_dimensions: int 65 """ 66 VectorSpaceClusterer.__init__(self, normalise, svd_dimensions) 67 self._means = numpy.array(initial_means, numpy.float64) 68 self._num_clusters = len(initial_means) 69 self._conv_threshold = conv_threshold 70 self._covariance_matrices = covariance_matrices 71 self._priors = priors 72 self._bias = bias 73 74 def num_clusters(self): 75 return self._num_clusters 76 77 def cluster_vectorspace(self, vectors, trace=False): 78 assert len(vectors) > 0 79 80 # set the parameters to initial values 81 dimensions = len(vectors[0]) 82 means = self._means 83 priors = self._priors 84 if not priors: 85 priors = self._priors = ( 86 numpy.ones(self._num_clusters, numpy.float64) / self._num_clusters 87 ) 88 covariances = self._covariance_matrices 89 if not covariances: 90 covariances = self._covariance_matrices = [ 91 numpy.identity(dimensions, numpy.float64) 92 for i in range(self._num_clusters) 93 ] 94 95 # do the E and M steps until the likelihood plateaus 96 lastl = self._loglikelihood(vectors, priors, means, covariances) 97 converged = False 98 99 while not converged: 100 if trace: 101 print('iteration; loglikelihood', lastl) 102 # E-step, calculate hidden variables, h[i,j] 103 h = numpy.zeros((len(vectors), self._num_clusters), numpy.float64) 104 for i in range(len(vectors)): 105 for j in range(self._num_clusters): 106 h[i, j] = priors[j] * self._gaussian( 107 means[j], covariances[j], vectors[i] 108 ) 109 h[i, :] /= sum(h[i, :]) 110 111 # M-step, update parameters - cvm, p, mean 112 for j in range(self._num_clusters): 113 covariance_before = covariances[j] 114 new_covariance = numpy.zeros((dimensions, dimensions), numpy.float64) 115 new_mean = numpy.zeros(dimensions, numpy.float64) 116 sum_hj = 0.0 117 for i in range(len(vectors)): 118 delta = vectors[i] - means[j] 119 new_covariance += h[i, j] * numpy.multiply.outer(delta, delta) 120 sum_hj += h[i, j] 121 new_mean += h[i, j] * vectors[i] 122 covariances[j] = new_covariance / sum_hj 123 means[j] = new_mean / sum_hj 124 priors[j] = sum_hj / len(vectors) 125 126 # bias term to stop covariance matrix being singular 127 covariances[j] += self._bias * numpy.identity(dimensions, numpy.float64) 128 129 # calculate likelihood - FIXME: may be broken 130 l = self._loglikelihood(vectors, priors, means, covariances) 131 132 # check for convergence 133 if abs(lastl - l) < self._conv_threshold: 134 converged = True 135 lastl = l 136 137 def classify_vectorspace(self, vector): 138 best = None 139 for j in range(self._num_clusters): 140 p = self._priors[j] * self._gaussian( 141 self._means[j], self._covariance_matrices[j], vector 142 ) 143 if not best or p > best[0]: 144 best = (p, j) 145 return best[1] 146 147 def likelihood_vectorspace(self, vector, cluster): 148 cid = self.cluster_names().index(cluster) 149 return self._priors[cluster] * self._gaussian( 150 self._means[cluster], self._covariance_matrices[cluster], vector 151 ) 152 153 def _gaussian(self, mean, cvm, x): 154 m = len(mean) 155 assert cvm.shape == (m, m), 'bad sized covariance matrix, %s' % str(cvm.shape) 156 try: 157 det = numpy.linalg.det(cvm) 158 inv = numpy.linalg.inv(cvm) 159 a = det ** -0.5 * (2 * numpy.pi) ** (-m / 2.0) 160 dx = x - mean 161 print(dx, inv) 162 b = -0.5 * numpy.dot(numpy.dot(dx, inv), dx) 163 return a * numpy.exp(b) 164 except OverflowError: 165 # happens when the exponent is negative infinity - i.e. b = 0 166 # i.e. the inverse of cvm is huge (cvm is almost zero) 167 return 0 168 169 def _loglikelihood(self, vectors, priors, means, covariances): 170 llh = 0.0 171 for vector in vectors: 172 p = 0 173 for j in range(len(priors)): 174 p += priors[j] * self._gaussian(means[j], covariances[j], vector) 175 llh += numpy.log(p) 176 return llh 177 178 def __repr__(self): 179 return '<EMClusterer means=%s>' % list(self._means) 180 181 182def demo(): 183 """ 184 Non-interactive demonstration of the clusterers with simple 2-D data. 185 """ 186 187 from nltk import cluster 188 189 # example from figure 14.10, page 519, Manning and Schutze 190 191 vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]] 192 means = [[4, 2], [4, 2.01]] 193 194 clusterer = cluster.EMClusterer(means, bias=0.1) 195 clusters = clusterer.cluster(vectors, True, trace=True) 196 197 print('Clustered:', vectors) 198 print('As: ', clusters) 199 print() 200 201 for c in range(2): 202 print('Cluster:', c) 203 print('Prior: ', clusterer._priors[c]) 204 print('Mean: ', clusterer._means[c]) 205 print('Covar: ', clusterer._covariance_matrices[c]) 206 print() 207 208 # classify a new vector 209 vector = numpy.array([2, 2]) 210 print('classify(%s):' % vector, end=' ') 211 print(clusterer.classify(vector)) 212 213 # show the classification probabilities 214 vector = numpy.array([2, 2]) 215 print('classification_probdist(%s):' % vector) 216 pdist = clusterer.classification_probdist(vector) 217 for sample in pdist.samples(): 218 print('%s => %.0f%%' % (sample, pdist.prob(sample) * 100)) 219 220 221# 222# The following demo code is broken. 223# 224# # use a set of tokens with 2D indices 225# vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]] 226 227# # test the EM clusterer with means given by k-means (2) and 228# # dimensionality reduction 229# clusterer = cluster.KMeans(2, euclidean_distance, svd_dimensions=1) 230# print 'Clusterer:', clusterer 231# clusters = clusterer.cluster(vectors) 232# means = clusterer.means() 233# print 'Means:', clusterer.means() 234# print 235 236# clusterer = cluster.EMClusterer(means, svd_dimensions=1) 237# clusters = clusterer.cluster(vectors, True) 238# print 'Clusterer:', clusterer 239# print 'Clustered:', str(vectors)[:60], '...' 240# print 'As:', str(clusters)[:60], '...' 241# print 242 243# # classify a new vector 244# vector = numpy.array([3, 3]) 245# print 'classify(%s):' % vector, 246# print clusterer.classify(vector) 247# print 248 249# # show the classification probabilities 250# vector = numpy.array([2.2, 2]) 251# print 'classification_probdist(%s)' % vector 252# pdist = clusterer.classification_probdist(vector) 253# for sample in pdist: 254# print '%s => %.0f%%' % (sample, pdist.prob(sample) *100) 255 256if __name__ == '__main__': 257 demo() 258