1 /* 2 * This file is part of ELKI: 3 * Environment for Developing KDD-Applications Supported by Index-Structures 4 * 5 * Copyright (C) 2018 6 * ELKI Development Team 7 * 8 * This program is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU Affero General Public License as published by 10 * the Free Software Foundation, either version 3 of the License, or 11 * (at your option) any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU Affero General Public License for more details. 17 * 18 * You should have received a copy of the GNU Affero General Public License 19 * along with this program. If not, see <http://www.gnu.org/licenses/>. 20 */ 21 package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; 22 23 import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMedoidsInitialization; 24 import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; 25 import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; 26 import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; 27 import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; 28 import de.lmu.ifi.dbs.elki.database.ids.*; 29 import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; 30 import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; 31 import de.lmu.ifi.dbs.elki.logging.Logging; 32 import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; 33 import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; 34 import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic; 35 import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; 36 37 /** 38 * The Partitioning Around Medoids (PAM) algorithm with some additional 39 * optimizations proposed by Reynolds et al. 40 * <p> 41 * In our implementation, we could not observe a substantial improvement over 42 * the original PAM algorithm. This may be because of modern CPU architectures, 43 * where saving an addition may be neglibile compared to caching and pipelining. 44 * <p> 45 * Reference: 46 * <p> 47 * A. P. Reynolds, G. Richards, B. de la Iglesia, V. J. Rayward-Smith<br> 48 * Clustering Rules: A Comparison of Partitioning and Hierarchical Clustering 49 * Algorithms<br> 50 * J. Math. Model. Algorithms 5(4) 51 * 52 * @author Erich Schubert 53 * 54 * @param <V> vector datatype 55 */ 56 @Reference(authors = "A. P. Reynolds, G. Richards, B. de la Iglesia, V. J. Rayward-Smith", // 57 title = "Clustering Rules: A Comparison of Partitioning and Hierarchical Clustering Algorithms", // 58 booktitle = "J. Math. Model. Algorithms 5(4)", // 59 url = "https://doi.org/10.1007/s10852-005-9022-1", // 60 bibkey = "DBLP:journals/jmma/ReynoldsRIR06") 61 public class KMedoidsPAMReynolds<V> extends KMedoidsPAM<V> { 62 /** 63 * The logger for this class. 64 */ 65 private static final Logging LOG = Logging.getLogger(KMedoidsPAMReynolds.class); 66 67 /** 68 * Key for statistics logging. 69 */ 70 private static final String KEY = KMedoidsPAMReynolds.class.getName(); 71 72 /** 73 * Constructor. 74 * 75 * @param distanceFunction distance function 76 * @param k k parameter 77 * @param maxiter Maxiter parameter 78 * @param initializer Function to generate the initial means 79 */ KMedoidsPAMReynolds(DistanceFunction<? super V> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer)80 public KMedoidsPAMReynolds(DistanceFunction<? super V> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) { 81 super(distanceFunction, k, maxiter, initializer); 82 } 83 84 @Override run(DistanceQuery<V> distQ, DBIDs ids, ArrayModifiableDBIDs medoids, WritableIntegerDataStore assignment)85 protected void run(DistanceQuery<V> distQ, DBIDs ids, ArrayModifiableDBIDs medoids, WritableIntegerDataStore assignment) { 86 new Instance(distQ, ids, assignment).run(medoids, maxiter); 87 } 88 89 /** 90 * Instance for a single dataset. 91 * 92 * @author Erich Schubert 93 */ 94 protected static class Instance extends KMedoidsPAM.Instance { 95 /** 96 * Constructor. 97 * 98 * @param distQ Distance query 99 * @param ids IDs to process 100 * @param assignment Cluster assignment 101 */ Instance(DistanceQuery<?> distQ, DBIDs ids, WritableIntegerDataStore assignment)102 public Instance(DistanceQuery<?> distQ, DBIDs ids, WritableIntegerDataStore assignment) { 103 super(distQ, ids, assignment); 104 } 105 106 /** 107 * Run the PAM optimization phase. 108 * 109 * @param medoids Medoids list 110 * @param maxiter 111 * @return final cost 112 */ run(ArrayModifiableDBIDs medoids, int maxiter)113 protected double run(ArrayModifiableDBIDs medoids, int maxiter) { 114 final int k = medoids.size(); 115 // Initial assignment to nearest medoids 116 // TODO: reuse distance information, from the build phase, when possible? 117 double tc = assignToNearestCluster(medoids); 118 if(LOG.isStatistics()) { 119 LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + 0 + ".cost", tc)); 120 } 121 122 WritableDoubleDataStore tnearest = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); 123 124 IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("PAM iteration", LOG) : null; 125 // Swap phase 126 DBIDVar bestid = DBIDUtil.newVar(); 127 int iteration = 1; 128 for(; maxiter <= 0 || iteration <= maxiter; iteration++) { 129 LOG.incrementProcessed(prog); 130 // Try to swap a non-medoid with a medoid member: 131 double best = Double.POSITIVE_INFINITY; 132 int bestcluster = -1; 133 // Iterate over each medoid: 134 for(int pi = 0; pi < k; pi++) { 135 // Compute medoid removal costs only once, c.f., Reynolds et al. 136 double basecost = computeRemovalCost(pi, tnearest); 137 // Iterate over all non-medoids: 138 for(DBIDIter h = ids.iter(); h.valid(); h.advance()) { 139 // h is a non-medoid currently in cluster of medoid m. 140 // hdist is the cost we get back by making the non-medoid h medoid. 141 double cpi = basecost + computeReassignmentCost(h, tnearest); 142 if(cpi < best) { 143 best = cpi; 144 bestid.set(h); 145 bestcluster = pi; 146 } 147 } 148 } 149 if(!(best < -1e-12 * tc)) { 150 break; 151 } 152 medoids.set(bestcluster, bestid); 153 // Reassign 154 double nc = assignToNearestCluster(medoids); 155 if(LOG.isStatistics()) { 156 LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + iteration + ".cost", nc)); 157 } 158 if(nc > tc) { 159 if(nc - tc < 1e-7 * tc) { 160 LOG.warning("PAM failed to converge (numerical instability?)"); 161 break; 162 } 163 LOG.warning("PAM failed to converge: costs increased by: " + (nc - tc) + " exepected a decrease by " + best); 164 break; 165 } 166 tc = nc; 167 } 168 LOG.setCompleted(prog); 169 if(LOG.isStatistics()) { 170 LOG.statistics(new LongStatistic(KEY + ".iterations", iteration)); 171 } 172 return tc; 173 } 174 175 /** 176 * Compute the cost of removing a medoid just once. This can then be reused 177 * for every point, thus decreasing the runtime cost at low memory overhead. 178 * 179 * The output array contains for each medoid the cost of removing all its 180 * points, and reassigning them to the second nearest medoid instead. 181 * 182 * @return Cost 183 */ computeRemovalCost(int i, WritableDoubleDataStore tnearest)184 protected double computeRemovalCost(int i, WritableDoubleDataStore tnearest) { 185 double cost = 0; 186 // Compute costs of reassigning to the second closest medoid. 187 for(DBIDIter j = ids.iter(); j.valid(); j.advance()) { 188 final double n = nearest.doubleValue(j); 189 if(assignment.intValue(j) == i) { 190 final double s = second.doubleValue(j); 191 cost += s - n; 192 tnearest.put(j, s); 193 } 194 else { 195 tnearest.put(j, n); 196 } 197 } 198 return cost; 199 } 200 201 /** 202 * Compute the reassignment cost, for all medoids in one pass. 203 * 204 * @param h Current object to swap with any medoid. 205 * @param tnearest Distance to the nearest except the removed medoid 206 * @return cost change 207 */ computeReassignmentCost(DBIDRef h, WritableDoubleDataStore tnearest)208 protected double computeReassignmentCost(DBIDRef h, WritableDoubleDataStore tnearest) { 209 double cost = 0.; 210 // Compute costs of reassigning other objects j: 211 for(DBIDIter j = ids.iter(); j.valid(); j.advance()) { 212 // New medoid is closest. Reassignment to second nearest was 213 // precomputed already, in {@link #computeRemovalCost} 214 // Case 1c: j is closer to h than its current medoid 215 double dist = distQ.distance(h, j), cur = tnearest.doubleValue(j); 216 if(dist < cur) { 217 cost += dist - cur; 218 } 219 } 220 return cost; 221 } 222 } 223 224 @Override getLogger()225 protected Logging getLogger() { 226 return LOG; 227 } 228 229 /** 230 * Parameterization class. 231 * 232 * @author Erich Schubert 233 */ 234 public static class Parameterizer<V> extends KMedoidsPAM.Parameterizer<V> { 235 @Override makeInstance()236 protected KMedoidsPAMReynolds<V> makeInstance() { 237 return new KMedoidsPAMReynolds<>(distanceFunction, k, maxiter, initializer); 238 } 239 } 240 } 241