1 /* 2 * This file is part of ELKI: 3 * Environment for Developing KDD-Applications Supported by Index-Structures 4 * 5 * Copyright (C) 2018 6 * ELKI Development Team 7 * 8 * This program is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU Affero General Public License as published by 10 * the Free Software Foundation, either version 3 of the License, or 11 * (at your option) any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU Affero General Public License for more details. 17 * 18 * You should have received a copy of the GNU Affero General Public License 19 * along with this program. If not, see <http://www.gnu.org/licenses/>. 20 */ 21 package de.lmu.ifi.dbs.elki.algorithm.clustering.em; 22 23 import static de.lmu.ifi.dbs.elki.math.linearalgebra.VMath.argmax; 24 25 import java.util.ArrayList; 26 import java.util.List; 27 28 import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; 29 import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; 30 import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeans; 31 import de.lmu.ifi.dbs.elki.data.Cluster; 32 import de.lmu.ifi.dbs.elki.data.Clustering; 33 import de.lmu.ifi.dbs.elki.data.NumberVector; 34 import de.lmu.ifi.dbs.elki.data.model.MeanModel; 35 import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; 36 import de.lmu.ifi.dbs.elki.data.type.TypeInformation; 37 import de.lmu.ifi.dbs.elki.data.type.TypeUtil; 38 import de.lmu.ifi.dbs.elki.database.Database; 39 import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; 40 import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; 41 import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; 42 import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; 43 import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; 44 import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; 45 import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; 46 import de.lmu.ifi.dbs.elki.database.relation.Relation; 47 import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; 48 import de.lmu.ifi.dbs.elki.logging.Logging; 49 import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; 50 import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic; 51 import de.lmu.ifi.dbs.elki.utilities.Alias; 52 import de.lmu.ifi.dbs.elki.utilities.Priority; 53 import de.lmu.ifi.dbs.elki.utilities.documentation.Description; 54 import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; 55 import de.lmu.ifi.dbs.elki.utilities.documentation.Title; 56 import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; 57 import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; 58 import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; 59 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; 60 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; 61 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; 62 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; 63 64 import net.jafama.FastMath; 65 66 /** 67 * Clustering by expectation maximization (EM-Algorithm), also known as Gaussian 68 * Mixture Modeling (GMM), with optional MAP regularization. 69 * <p> 70 * Reference: 71 * <p> 72 * A. P. Dempster, N. M. Laird, D. B. Rubin:<br> 73 * Maximum Likelihood from Incomplete Data via the EM algorithm.<br> 74 * Journal of the Royal Statistical Society, Series B, 39(1), 1977, pp. 1-31 75 * <p> 76 * The MAP estimation is derived from 77 * <p> 78 * C. Fraley and A. E. Raftery<br> 79 * Bayesian Regularization for Normal Mixture Estimation and Model-Based 80 * Clustering<br> 81 * J. Classification 24(2) 82 * 83 * @author Arthur Zimek 84 * @author Erich Schubert 85 * @since 0.1 86 * 87 * @composed - - - EMClusterModelFactory 88 * 89 * @param <V> vector type to analyze 90 * @param <M> model type to produce 91 */ 92 @Title("EM-Clustering: Clustering by Expectation Maximization") 93 @Description("Cluster data via Gaussian mixture modeling and the EM algorithm") 94 @Reference(authors = "A. P. Dempster, N. M. Laird, D. B. Rubin", // 95 title = "Maximum Likelihood from Incomplete Data via the EM algorithm", // 96 booktitle = "Journal of the Royal Statistical Society, Series B, 39(1)", // 97 url = "http://www.jstor.org/stable/2984875", // 98 bibkey = "journals/jroyastatsocise2/DempsterLR77") 99 @Reference(title = "Bayesian Regularization for Normal Mixture Estimation and Model-Based Clustering", // 100 authors = "C. Fraley, A. E. Raftery", // 101 booktitle = "J. Classification 24(2)", // 102 url = "https://doi.org/10.1007/s00357-007-0004-5", // 103 bibkey = "DBLP:journals/classification/FraleyR07") 104 @Alias("de.lmu.ifi.dbs.elki.algorithm.clustering.EM") 105 @Priority(Priority.RECOMMENDED) 106 public class EM<V extends NumberVector, M extends MeanModel> extends AbstractAlgorithm<Clustering<M>> implements ClusteringAlgorithm<Clustering<M>> { 107 /** 108 * The logger for this class. 109 */ 110 private static final Logging LOG = Logging.getLogger(EM.class); 111 112 /** 113 * Key for statistics logging. 114 */ 115 private static final String KEY = EM.class.getName(); 116 117 /** 118 * Number of clusters 119 */ 120 private int k; 121 122 /** 123 * Delta parameter 124 */ 125 private double delta; 126 127 /** 128 * Factory for producing the initial cluster model. 129 */ 130 private EMClusterModelFactory<V, M> mfactory; 131 132 /** 133 * Maximum number of iterations to allow 134 */ 135 private int maxiter; 136 137 /** 138 * Prior to enable MAP estimation (use 0 for MLE) 139 */ 140 private double prior = 0.; 141 142 /** 143 * Retain soft assignments. 144 */ 145 private boolean soft; 146 147 /** 148 * Minimum loglikelihood to avoid -infinity. 149 */ 150 private static final double MIN_LOGLIKELIHOOD = -100000; 151 152 /** 153 * Soft assignment result type. 154 */ 155 public static final SimpleTypeInformation<double[]> SOFT_TYPE = new SimpleTypeInformation<>(double[].class); 156 157 /** 158 * Constructor. 159 * 160 * @param k k parameter 161 * @param delta delta parameter 162 * @param mfactory EM cluster model factory 163 */ EM(int k, double delta, EMClusterModelFactory<V, M> mfactory)164 public EM(int k, double delta, EMClusterModelFactory<V, M> mfactory) { 165 this(k, delta, mfactory, -1, 0., false); 166 } 167 168 /** 169 * Constructor. 170 * 171 * @param k k parameter 172 * @param delta delta parameter 173 * @param mfactory EM cluster model factory 174 * @param maxiter Maximum number of iterations 175 * @param soft Include soft assignments 176 */ EM(int k, double delta, EMClusterModelFactory<V, M> mfactory, int maxiter, boolean soft)177 public EM(int k, double delta, EMClusterModelFactory<V, M> mfactory, int maxiter, boolean soft) { 178 this(k, delta, mfactory, maxiter, 0., soft); 179 } 180 181 /** 182 * Constructor. 183 * 184 * @param k k parameter 185 * @param delta delta parameter 186 * @param mfactory EM cluster model factory 187 * @param maxiter Maximum number of iterations 188 * @param prior MAP prior 189 * @param soft Include soft assignments 190 */ EM(int k, double delta, EMClusterModelFactory<V, M> mfactory, int maxiter, double prior, boolean soft)191 public EM(int k, double delta, EMClusterModelFactory<V, M> mfactory, int maxiter, double prior, boolean soft) { 192 super(); 193 this.k = k; 194 this.delta = delta; 195 this.mfactory = mfactory; 196 this.maxiter = maxiter; 197 this.prior = prior; 198 this.soft = soft; 199 } 200 201 /** 202 * Performs the EM clustering algorithm on the given database. 203 * 204 * Finally a hard clustering is provided where each clusters gets assigned the 205 * points exhibiting the highest probability to belong to this cluster. But 206 * still, the database objects hold associated the complete probability-vector 207 * for all models. 208 * 209 * @param database Database 210 * @param relation Relation 211 * @return Result 212 */ run(Database database, Relation<V> relation)213 public Clustering<M> run(Database database, Relation<V> relation) { 214 if(relation.size() == 0) { 215 throw new IllegalArgumentException("database empty: must contain elements"); 216 } 217 // initial models 218 List<? extends EMClusterModel<M>> models = mfactory.buildInitialModels(database, relation, k, SquaredEuclideanDistanceFunction.STATIC); 219 WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class); 220 double loglikelihood = assignProbabilitiesToInstances(relation, models, probClusterIGivenX); 221 DoubleStatistic likestat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".loglikelihood") : null; 222 if(LOG.isStatistics()) { 223 LOG.statistics(likestat.setDouble(loglikelihood)); 224 } 225 226 // iteration unless no change 227 int it = 0, lastimprovement = 0; 228 double bestloglikelihood = loglikelihood; // For detecting instabilities. 229 for(++it; it < maxiter || maxiter < 0; it++) { 230 final double oldloglikelihood = loglikelihood; 231 recomputeCovarianceMatrices(relation, probClusterIGivenX, models, prior); 232 // reassign probabilities 233 loglikelihood = assignProbabilitiesToInstances(relation, models, probClusterIGivenX); 234 235 if(LOG.isStatistics()) { 236 LOG.statistics(likestat.setDouble(loglikelihood)); 237 } 238 if(loglikelihood - bestloglikelihood > delta) { 239 lastimprovement = it; 240 bestloglikelihood = loglikelihood; 241 } 242 if(Math.abs(loglikelihood - oldloglikelihood) <= delta || lastimprovement < it >> 1) { 243 break; 244 } 245 } 246 if(LOG.isStatistics()) { 247 LOG.statistics(new LongStatistic(KEY + ".iterations", it)); 248 } 249 250 // fill result with clusters and models 251 List<ModifiableDBIDs> hardClusters = new ArrayList<>(k); 252 for(int i = 0; i < k; i++) { 253 hardClusters.add(DBIDUtil.newArray()); 254 } 255 256 // provide a hard clustering 257 for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { 258 hardClusters.get(argmax(probClusterIGivenX.get(iditer))).add(iditer); 259 } 260 Clustering<M> result = new Clustering<>("EM Clustering", "em-clustering"); 261 // provide models within the result 262 for(int i = 0; i < k; i++) { 263 result.addToplevelCluster(new Cluster<>(hardClusters.get(i), models.get(i).finalizeCluster())); 264 } 265 if(isSoft()) { 266 result.addChildResult(new MaterializedRelation<>("cluster assignments", "em-soft-score", SOFT_TYPE, probClusterIGivenX, relation.getDBIDs())); 267 } 268 else { 269 probClusterIGivenX.destroy(); 270 } 271 return result; 272 } 273 274 /** 275 * Recompute the covariance matrixes. 276 * 277 * @param relation Vector data 278 * @param probClusterIGivenX Object probabilities 279 * @param models Cluster models to update 280 * @param prior MAP prior (use 0 for MLE) 281 */ recomputeCovarianceMatrices(Relation<? extends NumberVector> relation, WritableDataStore<double[]> probClusterIGivenX, List<? extends EMClusterModel<?>> models, double prior)282 public static void recomputeCovarianceMatrices(Relation<? extends NumberVector> relation, WritableDataStore<double[]> probClusterIGivenX, List<? extends EMClusterModel<?>> models, double prior) { 283 final int k = models.size(); 284 boolean needsTwoPass = false; 285 for(EMClusterModel<?> m : models) { 286 m.beginEStep(); 287 needsTwoPass |= m.needsTwoPass(); 288 } 289 // First pass, only for two-pass models. 290 if(needsTwoPass) { 291 for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { 292 double[] clusterProbabilities = probClusterIGivenX.get(iditer); 293 NumberVector instance = relation.get(iditer); 294 for(int i = 0; i < clusterProbabilities.length; i++) { 295 final double prob = clusterProbabilities[i]; 296 if(prob > 1e-10) { 297 models.get(i).firstPassE(instance, prob); 298 } 299 } 300 } 301 for(EMClusterModel<?> m : models) { 302 m.finalizeFirstPassE(); 303 } 304 } 305 double[] wsum = new double[k]; 306 for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { 307 double[] clusterProbabilities = probClusterIGivenX.get(iditer); 308 NumberVector instance = relation.get(iditer); 309 for(int i = 0; i < clusterProbabilities.length; i++) { 310 final double prob = clusterProbabilities[i]; 311 if(prob > 1e-10) { 312 models.get(i).updateE(instance, prob); 313 } 314 wsum[i] += prob; 315 } 316 } 317 for(int i = 0; i < models.size(); i++) { 318 // MLE / MAP 319 final double weight = prior <= 0. ? wsum[i] / relation.size() : (wsum[i] + prior - 1) / (relation.size() + prior * k - k); 320 models.get(i).finalizeEStep(weight, prior); 321 } 322 } 323 324 /** 325 * Assigns the current probability values to the instances in the database and 326 * compute the expectation value of the current mixture of distributions. 327 * 328 * Computed as the sum of the logarithms of the prior probability of each 329 * instance. 330 * 331 * @param relation the database used for assignment to instances 332 * @param models Cluster models 333 * @param probClusterIGivenX Output storage for cluster probabilities 334 * @return the expectation value of the current mixture of distributions 335 */ assignProbabilitiesToInstances(Relation<? extends NumberVector> relation, List<? extends EMClusterModel<?>> models, WritableDataStore<double[]> probClusterIGivenX)336 public static double assignProbabilitiesToInstances(Relation<? extends NumberVector> relation, List<? extends EMClusterModel<?>> models, WritableDataStore<double[]> probClusterIGivenX) { 337 final int k = models.size(); 338 double emSum = 0.; 339 340 for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { 341 NumberVector vec = relation.get(iditer); 342 double[] probs = new double[k]; 343 for(int i = 0; i < k; i++) { 344 double v = models.get(i).estimateLogDensity(vec); 345 probs[i] = v > MIN_LOGLIKELIHOOD ? v : MIN_LOGLIKELIHOOD; 346 } 347 final double logP = logSumExp(probs); 348 for(int i = 0; i < k; i++) { 349 probs[i] = FastMath.exp(probs[i] - logP); 350 } 351 probClusterIGivenX.put(iditer, probs); 352 emSum += logP; 353 } 354 return emSum / relation.size(); 355 } 356 357 /** 358 * Compute log(sum(exp(x_i)), with attention to numerical issues. 359 * 360 * @param x Input 361 * @return Result 362 */ logSumExp(double[] x)363 private static double logSumExp(double[] x) { 364 double max = x[0]; 365 for(int i = 1; i < x.length; i++) { 366 final double v = x[i]; 367 max = v > max ? v : max; 368 } 369 final double cutoff = max - 35.350506209; // log_e(2**51) 370 double acc = 0.; 371 for(int i = 0; i < x.length; i++) { 372 final double v = x[i]; 373 if(v > cutoff) { 374 acc += v < max ? FastMath.exp(v - max) : 1.; 375 } 376 } 377 return acc > 1. ? (max + FastMath.log(acc)) : max; 378 } 379 380 @Override getInputTypeRestriction()381 public TypeInformation[] getInputTypeRestriction() { 382 return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); 383 } 384 385 @Override getLogger()386 protected Logging getLogger() { 387 return LOG; 388 } 389 390 /** 391 * @return the soft 392 */ isSoft()393 public boolean isSoft() { 394 return soft; 395 } 396 397 /** 398 * @param soft the soft to set 399 */ setSoft(boolean soft)400 public void setSoft(boolean soft) { 401 this.soft = soft; 402 } 403 404 /** 405 * Parameterization class. 406 * 407 * @author Erich Schubert 408 */ 409 public static class Parameterizer<V extends NumberVector, M extends MeanModel> extends AbstractParameterizer { 410 /** 411 * Parameter to specify the number of clusters to find, must be an integer 412 * greater than 0. 413 */ 414 public static final OptionID K_ID = new OptionID("em.k", "The number of clusters to find."); 415 416 /** 417 * Parameter to specify the termination criterion for maximization of E(M): 418 * E(M) - E(M') < em.delta, must be a double equal to or greater than 0. 419 */ 420 public static final OptionID DELTA_ID = new OptionID("em.delta", // 421 "The termination criterion for maximization of E(M): " + // 422 "E(M) - E(M') < em.delta"); 423 424 /** 425 * Parameter to specify the EM cluster models to use. 426 */ 427 public static final OptionID INIT_ID = new OptionID("em.model", // 428 "Model factory."); 429 430 /** 431 * Parameter to specify the MAP prior 432 */ 433 public static final OptionID PRIOR_ID = new OptionID("em.map.prior", // 434 "Regularization factor for MAP estimation."); 435 436 /** 437 * Number of clusters. 438 */ 439 protected int k; 440 441 /** 442 * Stopping threshold 443 */ 444 protected double delta; 445 446 /** 447 * Initialization method 448 */ 449 protected EMClusterModelFactory<V, M> initializer; 450 451 /** 452 * Maximum number of iterations. 453 */ 454 protected int maxiter = -1; 455 456 /** 457 * Prior to enable MAP estimation (use 0 for MLE) 458 */ 459 double prior = 0.; 460 461 @Override makeOptions(Parameterization config)462 protected void makeOptions(Parameterization config) { 463 super.makeOptions(config); 464 IntParameter kP = new IntParameter(K_ID) // 465 .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); 466 if(config.grab(kP)) { 467 k = kP.getValue(); 468 } 469 470 ObjectParameter<EMClusterModelFactory<V, M>> initialP = new ObjectParameter<>(INIT_ID, EMClusterModelFactory.class, MultivariateGaussianModelFactory.class); 471 if(config.grab(initialP)) { 472 initializer = initialP.instantiateClass(config); 473 } 474 475 DoubleParameter deltaP = new DoubleParameter(DELTA_ID, 1e-7)// 476 .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE); 477 if(config.grab(deltaP)) { 478 delta = deltaP.getValue(); 479 } 480 481 IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID)// 482 .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT) // 483 .setOptional(true); 484 if(config.grab(maxiterP)) { 485 maxiter = maxiterP.getValue(); 486 } 487 488 DoubleParameter priorP = new DoubleParameter(PRIOR_ID) // 489 .setOptional(true) // 490 .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); 491 if(config.grab(priorP)) { 492 prior = priorP.doubleValue(); 493 } 494 } 495 496 @Override makeInstance()497 protected EM<V, M> makeInstance() { 498 return new EM<>(k, delta, initializer, maxiter, prior, false); 499 } 500 } 501 } 502