1 /*
2  * This file is part of ELKI:
3  * Environment for Developing KDD-Applications Supported by Index-Structures
4  *
5  * Copyright (C) 2018
6  * ELKI Development Team
7  *
8  * This program is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Affero General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU Affero General Public License for more details.
17  *
18  * You should have received a copy of the GNU Affero General Public License
19  * along with this program. If not, see <http://www.gnu.org/licenses/>.
20  */
21 package de.lmu.ifi.dbs.elki.index.distancematrix;
22 
23 import java.util.ArrayList;
24 import java.util.List;
25 
26 import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
27 import de.lmu.ifi.dbs.elki.database.ids.*;
28 import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
29 import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
30 import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
31 import de.lmu.ifi.dbs.elki.database.relation.Relation;
32 import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
33 import de.lmu.ifi.dbs.elki.index.DistanceIndex;
34 import de.lmu.ifi.dbs.elki.index.IndexFactory;
35 import de.lmu.ifi.dbs.elki.index.KNNIndex;
36 import de.lmu.ifi.dbs.elki.index.RangeIndex;
37 import de.lmu.ifi.dbs.elki.logging.Logging;
38 import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
39 import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;
40 import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
41 import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
42 import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
43 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
44 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
45 
46 /**
47  * Distance matrix, for precomputing similarity for a small data set.
48  * <p>
49  * This class uses a linear memory layout (not a ragged array), and assumes
50  * symmetry as well as strictness. This way, it only stores the upper triangle
51  * matrix with double precision. It has to store (n-1) * (n-2) distance values
52  * in memory, requiring 8 * (n-1) * (n-2) bytes. Since Java has a size limit of
53  * arrays of 31 bits (signed integer), we can store at most \(2^16\) objects
54  * (precisely, 65536 objects) in a single array, which needs about 16 GB of RAM.
55  *
56  * @author Erich Schubert
57  * @since 0.7.0
58  *
59  * @has - - - PrecomputedDistanceQuery
60  * @has - - - PrecomputedKNNQuery
61  * @has - - - PrecomputedRangeQuery
62  *
63  * @param <O> Object type
64  */
65 public class PrecomputedDistanceMatrix<O> implements DistanceIndex<O>, RangeIndex<O>, KNNIndex<O> {
66   /**
67    * Class logger.
68    */
69   private static final Logging LOG = Logging.getLogger(PrecomputedDistanceMatrix.class);
70 
71   /**
72    * Data relation.
73    */
74   protected final Relation<O> relation;
75 
76   /**
77    * Nested distance function.
78    */
79   protected final DistanceFunction<? super O> distanceFunction;
80 
81   /**
82    * Nested distance query.
83    */
84   protected DistanceQuery<O> distanceQuery;
85 
86   /**
87    * Distance matrix.
88    */
89   private double[] matrix = null;
90 
91   /**
92    * DBID range.
93    */
94   private DBIDRange ids;
95 
96   /**
97    * Size of DBID range.
98    */
99   private int size;
100 
101   /**
102    * Constructor.
103    *
104    * @param relation Data relation
105    * @param range DBID range
106    * @param distanceFunction Distance function
107    */
PrecomputedDistanceMatrix(Relation<O> relation, DBIDRange range, DistanceFunction<? super O> distanceFunction)108   public PrecomputedDistanceMatrix(Relation<O> relation, DBIDRange range, DistanceFunction<? super O> distanceFunction) {
109     super();
110     this.relation = relation;
111     this.ids = range;
112     this.distanceFunction = distanceFunction;
113 
114     if(!distanceFunction.isSymmetric()) {
115       throw new AbortException("Distance matrixes currently only support symmetric distance functions (Patches welcome).");
116     }
117   }
118 
119   @Override
initialize()120   public void initialize() {
121     size = ids.size();
122     if(size > 65536) {
123       throw new AbortException("Distance matrixes currently have a limit of 65536 objects (~16 GB). After this, the array size exceeds the Java integer range, and a different data structure needs to be used.");
124     }
125 
126     distanceQuery = distanceFunction.instantiate(relation);
127 
128     final int msize = triangleSize(size);
129     matrix = new double[msize];
130     DBIDArrayIter ix = ids.iter(), iy = ids.iter();
131 
132     FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Precomputing distance matrix", msize, LOG) : null;
133     int pos = 0;
134     for(ix.seek(0); ix.valid(); ix.advance()) {
135       // y < x -- must match {@link #getOffset}!
136       for(iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) {
137         matrix[pos] = distanceQuery.distance(ix, iy);
138         pos++;
139       }
140       if(prog != null) {
141         prog.setProcessed(prog.getProcessed() + ix.getOffset(), LOG);
142       }
143     }
144     LOG.ensureCompleted(prog);
145   }
146 
147   /**
148    * Compute the size of a complete x by x triangle (minus diagonal)
149    *
150    * @param x Offset
151    * @return Size of complete triangle
152    */
triangleSize(int x)153   protected static int triangleSize(int x) {
154     return (x * (x - 1)) >>> 1;
155   }
156 
157   /**
158    * Array offset computation.
159    *
160    * @param x X parameter
161    * @param y Y parameter
162    * @return Array offset
163    */
getOffset(int x, int y)164   private int getOffset(int x, int y) {
165     return (y < x) ? (triangleSize(x) + y) : (triangleSize(y) + x);
166   }
167 
168   @Override
logStatistics()169   public void logStatistics() {
170     if(matrix != null) {
171       LOG.statistics(new LongStatistic(this.getClass().getName() + ".matrix-size", matrix.length));
172     }
173   }
174 
175   @Override
getLongName()176   public String getLongName() {
177     return "Precomputed Distance Matrix";
178   }
179 
180   @Override
getShortName()181   public String getShortName() {
182     return "distance-matrix";
183   }
184 
185   @Override
getDistanceQuery(DistanceFunction<? super O> distanceFunction, Object... hints)186   public DistanceQuery<O> getDistanceQuery(DistanceFunction<? super O> distanceFunction, Object... hints) {
187     if(this.distanceFunction.equals(distanceFunction)) {
188       return new PrecomputedDistanceQuery();
189     }
190     return null;
191   }
192 
193   @Override
getKNNQuery(DistanceQuery<O> distanceQuery, Object... hints)194   public KNNQuery<O> getKNNQuery(DistanceQuery<O> distanceQuery, Object... hints) {
195     if(this.distanceFunction.equals(distanceQuery.getDistanceFunction())) {
196       return new PrecomputedKNNQuery();
197     }
198     return null;
199   }
200 
201   @Override
getRangeQuery(DistanceQuery<O> distanceQuery, Object... hints)202   public RangeQuery<O> getRangeQuery(DistanceQuery<O> distanceQuery, Object... hints) {
203     if(this.distanceFunction.equals(distanceQuery.getDistanceFunction())) {
204       return new PrecomputedRangeQuery();
205     }
206     return null;
207   }
208 
209   /**
210    * Distance query using the precomputed matrix.
211    *
212    * @author Erich Schubert
213    */
214   private class PrecomputedDistanceQuery implements DistanceQuery<O> {
215     @Override
distance(DBIDRef id1, DBIDRef id2)216     public double distance(DBIDRef id1, DBIDRef id2) {
217       final int x = ids.getOffset(id1), y = ids.getOffset(id2);
218       return (x != y) ? matrix[getOffset(x, y)] : 0.;
219     }
220 
221     @Override
distance(O o1, DBIDRef id2)222     public double distance(O o1, DBIDRef id2) {
223       return distanceQuery.distance(o1, id2);
224     }
225 
226     @Override
distance(DBIDRef id1, O o2)227     public double distance(DBIDRef id1, O o2) {
228       return distanceQuery.distance(id1, o2);
229     }
230 
231     @Override
distance(O o1, O o2)232     public double distance(O o1, O o2) {
233       return distanceQuery.distance(o1, o2);
234     }
235 
236     @Override
getDistanceFunction()237     public DistanceFunction<? super O> getDistanceFunction() {
238       return distanceQuery.getDistanceFunction();
239     }
240 
241     @Override
getRelation()242     public Relation<? extends O> getRelation() {
243       return relation;
244     }
245   }
246 
247   /**
248    * Range query using the distance matrix.
249    *
250    * @author Erich Schubert
251    */
252   private class PrecomputedRangeQuery implements RangeQuery<O> {
253     @Override
getRangeForDBID(DBIDRef id, double range)254     public DoubleDBIDList getRangeForDBID(DBIDRef id, double range) {
255       ModifiableDoubleDBIDList ret = DBIDUtil.newDistanceDBIDList();
256       getRangeForDBID(id, range, ret);
257       ret.sort();
258       return ret;
259     }
260 
261     @Override
getRangeForDBID(DBIDRef id, double range, ModifiableDoubleDBIDList result)262     public void getRangeForDBID(DBIDRef id, double range, ModifiableDoubleDBIDList result) {
263       result.add(0., id);
264       DBIDArrayIter it = ids.iter();
265 
266       final int x = ids.getOffset(id);
267       // Case y < x: triangleSize(x) + y
268       int pos = triangleSize(x);
269       for(int y = 0; y < x; y++) {
270         final double dist = matrix[pos];
271         if(dist <= range) {
272           result.add(dist, it.seek(y));
273         }
274         pos++;
275       }
276       assert (pos == triangleSize(x + 1));
277       // Case y > x: triangleSize(y) + x
278       pos = triangleSize(x + 1) + x;
279       for(int y = x + 1; y < size; y++) {
280         final double dist = matrix[pos];
281         if(dist <= range) {
282           result.add(dist, it.seek(y));
283         }
284         pos += y;
285       }
286     }
287 
288     @Override
getRangeForObject(O obj, double range)289     public DoubleDBIDList getRangeForObject(O obj, double range) {
290       throw new AbortException("Preprocessor KNN query only supports ID queries.");
291     }
292 
293     @Override
getRangeForObject(O obj, double range, ModifiableDoubleDBIDList result)294     public void getRangeForObject(O obj, double range, ModifiableDoubleDBIDList result) {
295       throw new AbortException("Preprocessor KNN query only supports ID queries.");
296     }
297   }
298 
299   /**
300    * kNN query using the distance matrix.
301    *
302    * @author Erich Schubert
303    */
304   private class PrecomputedKNNQuery implements KNNQuery<O> {
305     @Override
getKNNForDBID(DBIDRef id, int k)306     public KNNList getKNNForDBID(DBIDRef id, int k) {
307       KNNHeap heap = DBIDUtil.newHeap(k);
308       heap.insert(0., id);
309       DBIDArrayIter it = ids.iter();
310       double max = Double.POSITIVE_INFINITY;
311       final int x = ids.getOffset(id);
312       // Case y < x: triangleSize(x) + y
313       int pos = triangleSize(x);
314       for(int y = 0; y < x; y++) {
315         final double dist = matrix[pos];
316         if(dist <= max) {
317           max = heap.insert(dist, it.seek(y));
318         }
319         pos++;
320       }
321       assert (pos == triangleSize(x + 1));
322       // Case y > x: triangleSize(y) + x
323       pos = triangleSize(x + 1) + x;
324       for(int y = x + 1; y < size; y++) {
325         final double dist = matrix[pos];
326         if(dist <= max) {
327           max = heap.insert(dist, it.seek(y));
328         }
329         pos += y;
330       }
331       return heap.toKNNList();
332     }
333 
334     @Override
getKNNForBulkDBIDs(ArrayDBIDs ids, int k)335     public List<? extends KNNList> getKNNForBulkDBIDs(ArrayDBIDs ids, int k) {
336       // TODO: optimize
337       List<KNNList> ret = new ArrayList<>(ids.size());
338       for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
339         ret.add(getKNNForDBID(iter, k));
340       }
341       return ret;
342     }
343 
344     @Override
getKNNForObject(O obj, int k)345     public KNNList getKNNForObject(O obj, int k) {
346       throw new AbortException("Preprocessor KNN query only supports ID queries.");
347     }
348   }
349 
350   /**
351    * Factory for the index.
352    *
353    * @author Erich Schubert
354    *
355    * @has - - - PrecomputedDistanceMatrix
356    *
357    * @param <O> Object type
358    */
359   public static class Factory<O> implements IndexFactory<O> {
360     /**
361      * Nested distance function.
362      */
363     final protected DistanceFunction<? super O> distanceFunction;
364 
365     /**
366      * Constructor.
367      *
368      * @param distanceFunction Distance function
369      */
Factory(DistanceFunction<? super O> distanceFunction)370     public Factory(DistanceFunction<? super O> distanceFunction) {
371       super();
372       this.distanceFunction = distanceFunction;
373     }
374 
375     @Override
instantiate(Relation<O> relation)376     public PrecomputedDistanceMatrix<O> instantiate(Relation<O> relation) {
377       DBIDs rids = relation.getDBIDs();
378       if(!(rids instanceof DBIDRange)) {
379         throw new AbortException("Distance matrixes are currently only supported for DBID ranges (as used by static databases; not on modifiable databases) for performance reasons (Patches welcome).");
380       }
381       return new PrecomputedDistanceMatrix<>(relation, (DBIDRange) rids, distanceFunction);
382     }
383 
384     @Override
getInputTypeRestriction()385     public TypeInformation getInputTypeRestriction() {
386       return distanceFunction.getInputTypeRestriction();
387     }
388 
389     /**
390      * Parameterizer.
391      *
392      * @author Erich Schubert
393      *
394      * @hidden
395      *
396      * @param <O> Object type
397      */
398     public static class Parameterizer<O> extends AbstractParameterizer {
399       /**
400        * Option parameter for the precomputed distance matrix.
401        */
402       public static final OptionID DISTANCE_ID = new OptionID("matrix.distance", "Distance function for the precomputed distance matrix.");
403 
404       /**
405        * Nested distance function.
406        */
407       protected DistanceFunction<? super O> distanceFunction;
408 
409       @Override
makeOptions(Parameterization config)410       protected void makeOptions(Parameterization config) {
411         super.makeOptions(config);
412         ObjectParameter<DistanceFunction<? super O>> distanceP = new ObjectParameter<>(DISTANCE_ID, DistanceFunction.class);
413         if(config.grab(distanceP)) {
414           distanceFunction = distanceP.instantiateClass(config);
415         }
416       }
417 
418       @Override
makeInstance()419       protected Factory<O> makeInstance() {
420         return new Factory<>(distanceFunction);
421       }
422     }
423   }
424 }
425