1 /*
2  * This file is part of ELKI:
3  * Environment for Developing KDD-Applications Supported by Index-Structures
4  *
5  * Copyright (C) 2018
6  * ELKI Development Team
7  *
8  * This program is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Affero General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU Affero General Public License for more details.
17  *
18  * You should have received a copy of the GNU Affero General Public License
19  * along with this program. If not, see <http://www.gnu.org/licenses/>.
20  */
21 package de.lmu.ifi.dbs.elki.algorithm.clustering.em;
22 
23 import static de.lmu.ifi.dbs.elki.math.linearalgebra.VMath.argmax;
24 
25 import java.util.ArrayList;
26 import java.util.List;
27 
28 import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
29 import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
30 import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeans;
31 import de.lmu.ifi.dbs.elki.data.Cluster;
32 import de.lmu.ifi.dbs.elki.data.Clustering;
33 import de.lmu.ifi.dbs.elki.data.NumberVector;
34 import de.lmu.ifi.dbs.elki.data.model.MeanModel;
35 import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
36 import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
37 import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
38 import de.lmu.ifi.dbs.elki.database.Database;
39 import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
40 import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
41 import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
42 import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
43 import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
44 import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
45 import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
46 import de.lmu.ifi.dbs.elki.database.relation.Relation;
47 import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
48 import de.lmu.ifi.dbs.elki.logging.Logging;
49 import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
50 import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;
51 import de.lmu.ifi.dbs.elki.utilities.Alias;
52 import de.lmu.ifi.dbs.elki.utilities.Priority;
53 import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
54 import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
55 import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
56 import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
57 import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
58 import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
59 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
60 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
61 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
62 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
63 
64 import net.jafama.FastMath;
65 
66 /**
67  * Clustering by expectation maximization (EM-Algorithm), also known as Gaussian
68  * Mixture Modeling (GMM), with optional MAP regularization.
69  * <p>
70  * Reference:
71  * <p>
72  * A. P. Dempster, N. M. Laird, D. B. Rubin:<br>
73  * Maximum Likelihood from Incomplete Data via the EM algorithm.<br>
74  * Journal of the Royal Statistical Society, Series B, 39(1), 1977, pp. 1-31
75  * <p>
76  * The MAP estimation is derived from
77  * <p>
78  * C. Fraley and A. E. Raftery<br>
79  * Bayesian Regularization for Normal Mixture Estimation and Model-Based
80  * Clustering<br>
81  * J. Classification 24(2)
82  *
83  * @author Arthur Zimek
84  * @author Erich Schubert
85  * @since 0.1
86  *
87  * @composed - - - EMClusterModelFactory
88  *
89  * @param <V> vector type to analyze
90  * @param <M> model type to produce
91  */
92 @Title("EM-Clustering: Clustering by Expectation Maximization")
93 @Description("Cluster data via Gaussian mixture modeling and the EM algorithm")
94 @Reference(authors = "A. P. Dempster, N. M. Laird, D. B. Rubin", //
95     title = "Maximum Likelihood from Incomplete Data via the EM algorithm", //
96     booktitle = "Journal of the Royal Statistical Society, Series B, 39(1)", //
97     url = "http://www.jstor.org/stable/2984875", //
98     bibkey = "journals/jroyastatsocise2/DempsterLR77")
99 @Reference(title = "Bayesian Regularization for Normal Mixture Estimation and Model-Based Clustering", //
100     authors = "C. Fraley, A. E. Raftery", //
101     booktitle = "J. Classification 24(2)", //
102     url = "https://doi.org/10.1007/s00357-007-0004-5", //
103     bibkey = "DBLP:journals/classification/FraleyR07")
104 @Alias("de.lmu.ifi.dbs.elki.algorithm.clustering.EM")
105 @Priority(Priority.RECOMMENDED)
106 public class EM<V extends NumberVector, M extends MeanModel> extends AbstractAlgorithm<Clustering<M>> implements ClusteringAlgorithm<Clustering<M>> {
107   /**
108    * The logger for this class.
109    */
110   private static final Logging LOG = Logging.getLogger(EM.class);
111 
112   /**
113    * Key for statistics logging.
114    */
115   private static final String KEY = EM.class.getName();
116 
117   /**
118    * Number of clusters
119    */
120   private int k;
121 
122   /**
123    * Delta parameter
124    */
125   private double delta;
126 
127   /**
128    * Factory for producing the initial cluster model.
129    */
130   private EMClusterModelFactory<V, M> mfactory;
131 
132   /**
133    * Maximum number of iterations to allow
134    */
135   private int maxiter;
136 
137   /**
138    * Prior to enable MAP estimation (use 0 for MLE)
139    */
140   private double prior = 0.;
141 
142   /**
143    * Retain soft assignments.
144    */
145   private boolean soft;
146 
147   /**
148    * Minimum loglikelihood to avoid -infinity.
149    */
150   private static final double MIN_LOGLIKELIHOOD = -100000;
151 
152   /**
153    * Soft assignment result type.
154    */
155   public static final SimpleTypeInformation<double[]> SOFT_TYPE = new SimpleTypeInformation<>(double[].class);
156 
157   /**
158    * Constructor.
159    *
160    * @param k k parameter
161    * @param delta delta parameter
162    * @param mfactory EM cluster model factory
163    */
EM(int k, double delta, EMClusterModelFactory<V, M> mfactory)164   public EM(int k, double delta, EMClusterModelFactory<V, M> mfactory) {
165     this(k, delta, mfactory, -1, 0., false);
166   }
167 
168   /**
169    * Constructor.
170    *
171    * @param k k parameter
172    * @param delta delta parameter
173    * @param mfactory EM cluster model factory
174    * @param maxiter Maximum number of iterations
175    * @param soft Include soft assignments
176    */
EM(int k, double delta, EMClusterModelFactory<V, M> mfactory, int maxiter, boolean soft)177   public EM(int k, double delta, EMClusterModelFactory<V, M> mfactory, int maxiter, boolean soft) {
178     this(k, delta, mfactory, maxiter, 0., soft);
179   }
180 
181   /**
182    * Constructor.
183    *
184    * @param k k parameter
185    * @param delta delta parameter
186    * @param mfactory EM cluster model factory
187    * @param maxiter Maximum number of iterations
188    * @param prior MAP prior
189    * @param soft Include soft assignments
190    */
EM(int k, double delta, EMClusterModelFactory<V, M> mfactory, int maxiter, double prior, boolean soft)191   public EM(int k, double delta, EMClusterModelFactory<V, M> mfactory, int maxiter, double prior, boolean soft) {
192     super();
193     this.k = k;
194     this.delta = delta;
195     this.mfactory = mfactory;
196     this.maxiter = maxiter;
197     this.prior = prior;
198     this.soft = soft;
199   }
200 
201   /**
202    * Performs the EM clustering algorithm on the given database.
203    *
204    * Finally a hard clustering is provided where each clusters gets assigned the
205    * points exhibiting the highest probability to belong to this cluster. But
206    * still, the database objects hold associated the complete probability-vector
207    * for all models.
208    *
209    * @param database Database
210    * @param relation Relation
211    * @return Result
212    */
run(Database database, Relation<V> relation)213   public Clustering<M> run(Database database, Relation<V> relation) {
214     if(relation.size() == 0) {
215       throw new IllegalArgumentException("database empty: must contain elements");
216     }
217     // initial models
218     List<? extends EMClusterModel<M>> models = mfactory.buildInitialModels(database, relation, k, SquaredEuclideanDistanceFunction.STATIC);
219     WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
220     double loglikelihood = assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
221     DoubleStatistic likestat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".loglikelihood") : null;
222     if(LOG.isStatistics()) {
223       LOG.statistics(likestat.setDouble(loglikelihood));
224     }
225 
226     // iteration unless no change
227     int it = 0, lastimprovement = 0;
228     double bestloglikelihood = loglikelihood; // For detecting instabilities.
229     for(++it; it < maxiter || maxiter < 0; it++) {
230       final double oldloglikelihood = loglikelihood;
231       recomputeCovarianceMatrices(relation, probClusterIGivenX, models, prior);
232       // reassign probabilities
233       loglikelihood = assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
234 
235       if(LOG.isStatistics()) {
236         LOG.statistics(likestat.setDouble(loglikelihood));
237       }
238       if(loglikelihood - bestloglikelihood > delta) {
239         lastimprovement = it;
240         bestloglikelihood = loglikelihood;
241       }
242       if(Math.abs(loglikelihood - oldloglikelihood) <= delta || lastimprovement < it >> 1) {
243         break;
244       }
245     }
246     if(LOG.isStatistics()) {
247       LOG.statistics(new LongStatistic(KEY + ".iterations", it));
248     }
249 
250     // fill result with clusters and models
251     List<ModifiableDBIDs> hardClusters = new ArrayList<>(k);
252     for(int i = 0; i < k; i++) {
253       hardClusters.add(DBIDUtil.newArray());
254     }
255 
256     // provide a hard clustering
257     for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
258       hardClusters.get(argmax(probClusterIGivenX.get(iditer))).add(iditer);
259     }
260     Clustering<M> result = new Clustering<>("EM Clustering", "em-clustering");
261     // provide models within the result
262     for(int i = 0; i < k; i++) {
263       result.addToplevelCluster(new Cluster<>(hardClusters.get(i), models.get(i).finalizeCluster()));
264     }
265     if(isSoft()) {
266       result.addChildResult(new MaterializedRelation<>("cluster assignments", "em-soft-score", SOFT_TYPE, probClusterIGivenX, relation.getDBIDs()));
267     }
268     else {
269       probClusterIGivenX.destroy();
270     }
271     return result;
272   }
273 
274   /**
275    * Recompute the covariance matrixes.
276    *
277    * @param relation Vector data
278    * @param probClusterIGivenX Object probabilities
279    * @param models Cluster models to update
280    * @param prior MAP prior (use 0 for MLE)
281    */
recomputeCovarianceMatrices(Relation<? extends NumberVector> relation, WritableDataStore<double[]> probClusterIGivenX, List<? extends EMClusterModel<?>> models, double prior)282   public static void recomputeCovarianceMatrices(Relation<? extends NumberVector> relation, WritableDataStore<double[]> probClusterIGivenX, List<? extends EMClusterModel<?>> models, double prior) {
283     final int k = models.size();
284     boolean needsTwoPass = false;
285     for(EMClusterModel<?> m : models) {
286       m.beginEStep();
287       needsTwoPass |= m.needsTwoPass();
288     }
289     // First pass, only for two-pass models.
290     if(needsTwoPass) {
291       for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
292         double[] clusterProbabilities = probClusterIGivenX.get(iditer);
293         NumberVector instance = relation.get(iditer);
294         for(int i = 0; i < clusterProbabilities.length; i++) {
295           final double prob = clusterProbabilities[i];
296           if(prob > 1e-10) {
297             models.get(i).firstPassE(instance, prob);
298           }
299         }
300       }
301       for(EMClusterModel<?> m : models) {
302         m.finalizeFirstPassE();
303       }
304     }
305     double[] wsum = new double[k];
306     for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
307       double[] clusterProbabilities = probClusterIGivenX.get(iditer);
308       NumberVector instance = relation.get(iditer);
309       for(int i = 0; i < clusterProbabilities.length; i++) {
310         final double prob = clusterProbabilities[i];
311         if(prob > 1e-10) {
312           models.get(i).updateE(instance, prob);
313         }
314         wsum[i] += prob;
315       }
316     }
317     for(int i = 0; i < models.size(); i++) {
318       // MLE / MAP
319       final double weight = prior <= 0. ? wsum[i] / relation.size() : (wsum[i] + prior - 1) / (relation.size() + prior * k - k);
320       models.get(i).finalizeEStep(weight, prior);
321     }
322   }
323 
324   /**
325    * Assigns the current probability values to the instances in the database and
326    * compute the expectation value of the current mixture of distributions.
327    *
328    * Computed as the sum of the logarithms of the prior probability of each
329    * instance.
330    *
331    * @param relation the database used for assignment to instances
332    * @param models Cluster models
333    * @param probClusterIGivenX Output storage for cluster probabilities
334    * @return the expectation value of the current mixture of distributions
335    */
assignProbabilitiesToInstances(Relation<? extends NumberVector> relation, List<? extends EMClusterModel<?>> models, WritableDataStore<double[]> probClusterIGivenX)336   public static double assignProbabilitiesToInstances(Relation<? extends NumberVector> relation, List<? extends EMClusterModel<?>> models, WritableDataStore<double[]> probClusterIGivenX) {
337     final int k = models.size();
338     double emSum = 0.;
339 
340     for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
341       NumberVector vec = relation.get(iditer);
342       double[] probs = new double[k];
343       for(int i = 0; i < k; i++) {
344         double v = models.get(i).estimateLogDensity(vec);
345         probs[i] = v > MIN_LOGLIKELIHOOD ? v : MIN_LOGLIKELIHOOD;
346       }
347       final double logP = logSumExp(probs);
348       for(int i = 0; i < k; i++) {
349         probs[i] = FastMath.exp(probs[i] - logP);
350       }
351       probClusterIGivenX.put(iditer, probs);
352       emSum += logP;
353     }
354     return emSum / relation.size();
355   }
356 
357   /**
358    * Compute log(sum(exp(x_i)), with attention to numerical issues.
359    *
360    * @param x Input
361    * @return Result
362    */
logSumExp(double[] x)363   private static double logSumExp(double[] x) {
364     double max = x[0];
365     for(int i = 1; i < x.length; i++) {
366       final double v = x[i];
367       max = v > max ? v : max;
368     }
369     final double cutoff = max - 35.350506209; // log_e(2**51)
370     double acc = 0.;
371     for(int i = 0; i < x.length; i++) {
372       final double v = x[i];
373       if(v > cutoff) {
374         acc += v < max ? FastMath.exp(v - max) : 1.;
375       }
376     }
377     return acc > 1. ? (max + FastMath.log(acc)) : max;
378   }
379 
380   @Override
getInputTypeRestriction()381   public TypeInformation[] getInputTypeRestriction() {
382     return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
383   }
384 
385   @Override
getLogger()386   protected Logging getLogger() {
387     return LOG;
388   }
389 
390   /**
391    * @return the soft
392    */
isSoft()393   public boolean isSoft() {
394     return soft;
395   }
396 
397   /**
398    * @param soft the soft to set
399    */
setSoft(boolean soft)400   public void setSoft(boolean soft) {
401     this.soft = soft;
402   }
403 
404   /**
405    * Parameterization class.
406    *
407    * @author Erich Schubert
408    */
409   public static class Parameterizer<V extends NumberVector, M extends MeanModel> extends AbstractParameterizer {
410     /**
411      * Parameter to specify the number of clusters to find, must be an integer
412      * greater than 0.
413      */
414     public static final OptionID K_ID = new OptionID("em.k", "The number of clusters to find.");
415 
416     /**
417      * Parameter to specify the termination criterion for maximization of E(M):
418      * E(M) - E(M') &lt; em.delta, must be a double equal to or greater than 0.
419      */
420     public static final OptionID DELTA_ID = new OptionID("em.delta", //
421         "The termination criterion for maximization of E(M): " + //
422             "E(M) - E(M') < em.delta");
423 
424     /**
425      * Parameter to specify the EM cluster models to use.
426      */
427     public static final OptionID INIT_ID = new OptionID("em.model", //
428         "Model factory.");
429 
430     /**
431      * Parameter to specify the MAP prior
432      */
433     public static final OptionID PRIOR_ID = new OptionID("em.map.prior", //
434         "Regularization factor for MAP estimation.");
435 
436     /**
437      * Number of clusters.
438      */
439     protected int k;
440 
441     /**
442      * Stopping threshold
443      */
444     protected double delta;
445 
446     /**
447      * Initialization method
448      */
449     protected EMClusterModelFactory<V, M> initializer;
450 
451     /**
452      * Maximum number of iterations.
453      */
454     protected int maxiter = -1;
455 
456     /**
457      * Prior to enable MAP estimation (use 0 for MLE)
458      */
459     double prior = 0.;
460 
461     @Override
makeOptions(Parameterization config)462     protected void makeOptions(Parameterization config) {
463       super.makeOptions(config);
464       IntParameter kP = new IntParameter(K_ID) //
465           .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
466       if(config.grab(kP)) {
467         k = kP.getValue();
468       }
469 
470       ObjectParameter<EMClusterModelFactory<V, M>> initialP = new ObjectParameter<>(INIT_ID, EMClusterModelFactory.class, MultivariateGaussianModelFactory.class);
471       if(config.grab(initialP)) {
472         initializer = initialP.instantiateClass(config);
473       }
474 
475       DoubleParameter deltaP = new DoubleParameter(DELTA_ID, 1e-7)//
476           .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
477       if(config.grab(deltaP)) {
478         delta = deltaP.getValue();
479       }
480 
481       IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID)//
482           .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT) //
483           .setOptional(true);
484       if(config.grab(maxiterP)) {
485         maxiter = maxiterP.getValue();
486       }
487 
488       DoubleParameter priorP = new DoubleParameter(PRIOR_ID) //
489           .setOptional(true) //
490           .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
491       if(config.grab(priorP)) {
492         prior = priorP.doubleValue();
493       }
494     }
495 
496     @Override
makeInstance()497     protected EM<V, M> makeInstance() {
498       return new EM<>(k, delta, initializer, maxiter, prior, false);
499     }
500   }
501 }
502