1 /*
2  * This file is part of ELKI:
3  * Environment for Developing KDD-Applications Supported by Index-Structures
4  *
5  * Copyright (C) 2018
6  * ELKI Development Team
7  *
8  * This program is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Affero General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU Affero General Public License for more details.
17  *
18  * You should have received a copy of the GNU Affero General Public License
19  * along with this program. If not, see <http://www.gnu.org/licenses/>.
20  */
21 package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
22 
23 import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMedoidsInitialization;
24 import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
25 import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
26 import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
27 import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
28 import de.lmu.ifi.dbs.elki.database.ids.*;
29 import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
30 import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
31 import de.lmu.ifi.dbs.elki.logging.Logging;
32 import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
33 import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
34 import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;
35 import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
36 
37 /**
38  * The Partitioning Around Medoids (PAM) algorithm with some additional
39  * optimizations proposed by Reynolds et al.
40  * <p>
41  * In our implementation, we could not observe a substantial improvement over
42  * the original PAM algorithm. This may be because of modern CPU architectures,
43  * where saving an addition may be neglibile compared to caching and pipelining.
44  * <p>
45  * Reference:
46  * <p>
47  * A. P. Reynolds, G. Richards, B. de la Iglesia, V. J. Rayward-Smith<br>
48  * Clustering Rules: A Comparison of Partitioning and Hierarchical Clustering
49  * Algorithms<br>
50  * J. Math. Model. Algorithms 5(4)
51  *
52  * @author Erich Schubert
53  *
54  * @param <V> vector datatype
55  */
56 @Reference(authors = "A. P. Reynolds, G. Richards, B. de la Iglesia, V. J. Rayward-Smith", //
57     title = "Clustering Rules: A Comparison of Partitioning and Hierarchical Clustering Algorithms", //
58     booktitle = "J. Math. Model. Algorithms 5(4)", //
59     url = "https://doi.org/10.1007/s10852-005-9022-1", //
60     bibkey = "DBLP:journals/jmma/ReynoldsRIR06")
61 public class KMedoidsPAMReynolds<V> extends KMedoidsPAM<V> {
62   /**
63    * The logger for this class.
64    */
65   private static final Logging LOG = Logging.getLogger(KMedoidsPAMReynolds.class);
66 
67   /**
68    * Key for statistics logging.
69    */
70   private static final String KEY = KMedoidsPAMReynolds.class.getName();
71 
72   /**
73    * Constructor.
74    *
75    * @param distanceFunction distance function
76    * @param k k parameter
77    * @param maxiter Maxiter parameter
78    * @param initializer Function to generate the initial means
79    */
KMedoidsPAMReynolds(DistanceFunction<? super V> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer)80   public KMedoidsPAMReynolds(DistanceFunction<? super V> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) {
81     super(distanceFunction, k, maxiter, initializer);
82   }
83 
84   @Override
run(DistanceQuery<V> distQ, DBIDs ids, ArrayModifiableDBIDs medoids, WritableIntegerDataStore assignment)85   protected void run(DistanceQuery<V> distQ, DBIDs ids, ArrayModifiableDBIDs medoids, WritableIntegerDataStore assignment) {
86     new Instance(distQ, ids, assignment).run(medoids, maxiter);
87   }
88 
89   /**
90    * Instance for a single dataset.
91    *
92    * @author Erich Schubert
93    */
94   protected static class Instance extends KMedoidsPAM.Instance {
95     /**
96      * Constructor.
97      *
98      * @param distQ Distance query
99      * @param ids IDs to process
100      * @param assignment Cluster assignment
101      */
Instance(DistanceQuery<?> distQ, DBIDs ids, WritableIntegerDataStore assignment)102     public Instance(DistanceQuery<?> distQ, DBIDs ids, WritableIntegerDataStore assignment) {
103       super(distQ, ids, assignment);
104     }
105 
106     /**
107      * Run the PAM optimization phase.
108      *
109      * @param medoids Medoids list
110      * @param maxiter
111      * @return final cost
112      */
run(ArrayModifiableDBIDs medoids, int maxiter)113     protected double run(ArrayModifiableDBIDs medoids, int maxiter) {
114       final int k = medoids.size();
115       // Initial assignment to nearest medoids
116       // TODO: reuse distance information, from the build phase, when possible?
117       double tc = assignToNearestCluster(medoids);
118       if(LOG.isStatistics()) {
119         LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + 0 + ".cost", tc));
120       }
121 
122       WritableDoubleDataStore tnearest = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
123 
124       IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("PAM iteration", LOG) : null;
125       // Swap phase
126       DBIDVar bestid = DBIDUtil.newVar();
127       int iteration = 1;
128       for(; maxiter <= 0 || iteration <= maxiter; iteration++) {
129         LOG.incrementProcessed(prog);
130         // Try to swap a non-medoid with a medoid member:
131         double best = Double.POSITIVE_INFINITY;
132         int bestcluster = -1;
133         // Iterate over each medoid:
134         for(int pi = 0; pi < k; pi++) {
135           // Compute medoid removal costs only once, c.f., Reynolds et al.
136           double basecost = computeRemovalCost(pi, tnearest);
137           // Iterate over all non-medoids:
138           for(DBIDIter h = ids.iter(); h.valid(); h.advance()) {
139             // h is a non-medoid currently in cluster of medoid m.
140             // hdist is the cost we get back by making the non-medoid h medoid.
141             double cpi = basecost + computeReassignmentCost(h, tnearest);
142             if(cpi < best) {
143               best = cpi;
144               bestid.set(h);
145               bestcluster = pi;
146             }
147           }
148         }
149         if(!(best < -1e-12 * tc)) {
150           break;
151         }
152         medoids.set(bestcluster, bestid);
153         // Reassign
154         double nc = assignToNearestCluster(medoids);
155         if(LOG.isStatistics()) {
156           LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + iteration + ".cost", nc));
157         }
158         if(nc > tc) {
159           if(nc - tc < 1e-7 * tc) {
160             LOG.warning("PAM failed to converge (numerical instability?)");
161             break;
162           }
163           LOG.warning("PAM failed to converge: costs increased by: " + (nc - tc) + " exepected a decrease by " + best);
164           break;
165         }
166         tc = nc;
167       }
168       LOG.setCompleted(prog);
169       if(LOG.isStatistics()) {
170         LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
171       }
172       return tc;
173     }
174 
175     /**
176      * Compute the cost of removing a medoid just once. This can then be reused
177      * for every point, thus decreasing the runtime cost at low memory overhead.
178      *
179      * The output array contains for each medoid the cost of removing all its
180      * points, and reassigning them to the second nearest medoid instead.
181      *
182      * @return Cost
183      */
computeRemovalCost(int i, WritableDoubleDataStore tnearest)184     protected double computeRemovalCost(int i, WritableDoubleDataStore tnearest) {
185       double cost = 0;
186       // Compute costs of reassigning to the second closest medoid.
187       for(DBIDIter j = ids.iter(); j.valid(); j.advance()) {
188         final double n = nearest.doubleValue(j);
189         if(assignment.intValue(j) == i) {
190           final double s = second.doubleValue(j);
191           cost += s - n;
192           tnearest.put(j, s);
193         }
194         else {
195           tnearest.put(j, n);
196         }
197       }
198       return cost;
199     }
200 
201     /**
202      * Compute the reassignment cost, for all medoids in one pass.
203      *
204      * @param h Current object to swap with any medoid.
205      * @param tnearest Distance to the nearest except the removed medoid
206      * @return cost change
207      */
computeReassignmentCost(DBIDRef h, WritableDoubleDataStore tnearest)208     protected double computeReassignmentCost(DBIDRef h, WritableDoubleDataStore tnearest) {
209       double cost = 0.;
210       // Compute costs of reassigning other objects j:
211       for(DBIDIter j = ids.iter(); j.valid(); j.advance()) {
212         // New medoid is closest. Reassignment to second nearest was
213         // precomputed already, in {@link #computeRemovalCost}
214         // Case 1c: j is closer to h than its current medoid
215         double dist = distQ.distance(h, j), cur = tnearest.doubleValue(j);
216         if(dist < cur) {
217           cost += dist - cur;
218         }
219       }
220       return cost;
221     }
222   }
223 
224   @Override
getLogger()225   protected Logging getLogger() {
226     return LOG;
227   }
228 
229   /**
230    * Parameterization class.
231    *
232    * @author Erich Schubert
233    */
234   public static class Parameterizer<V> extends KMedoidsPAM.Parameterizer<V> {
235     @Override
makeInstance()236     protected KMedoidsPAMReynolds<V> makeInstance() {
237       return new KMedoidsPAMReynolds<>(distanceFunction, k, maxiter, initializer);
238     }
239   }
240 }
241