1 /* 2 * This file is part of ELKI: 3 * Environment for Developing KDD-Applications Supported by Index-Structures 4 * 5 * Copyright (C) 2018 6 * ELKI Development Team 7 * 8 * This program is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU Affero General Public License as published by 10 * the Free Software Foundation, either version 3 of the License, or 11 * (at your option) any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU Affero General Public License for more details. 17 * 18 * You should have received a copy of the GNU Affero General Public License 19 * along with this program. If not, see <http://www.gnu.org/licenses/>. 20 */ 21 package de.lmu.ifi.dbs.elki.index.distancematrix; 22 23 import java.util.ArrayList; 24 import java.util.List; 25 26 import de.lmu.ifi.dbs.elki.data.type.TypeInformation; 27 import de.lmu.ifi.dbs.elki.database.ids.*; 28 import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; 29 import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; 30 import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; 31 import de.lmu.ifi.dbs.elki.database.relation.Relation; 32 import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; 33 import de.lmu.ifi.dbs.elki.index.DistanceIndex; 34 import de.lmu.ifi.dbs.elki.index.IndexFactory; 35 import de.lmu.ifi.dbs.elki.index.KNNIndex; 36 import de.lmu.ifi.dbs.elki.index.RangeIndex; 37 import de.lmu.ifi.dbs.elki.logging.Logging; 38 import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; 39 import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic; 40 import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; 41 import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; 42 import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; 43 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; 44 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; 45 46 /** 47 * Distance matrix, for precomputing similarity for a small data set. 48 * <p> 49 * This class uses a linear memory layout (not a ragged array), and assumes 50 * symmetry as well as strictness. This way, it only stores the upper triangle 51 * matrix with double precision. It has to store (n-1) * (n-2) distance values 52 * in memory, requiring 8 * (n-1) * (n-2) bytes. Since Java has a size limit of 53 * arrays of 31 bits (signed integer), we can store at most \(2^16\) objects 54 * (precisely, 65536 objects) in a single array, which needs about 16 GB of RAM. 55 * 56 * @author Erich Schubert 57 * @since 0.7.0 58 * 59 * @has - - - PrecomputedDistanceQuery 60 * @has - - - PrecomputedKNNQuery 61 * @has - - - PrecomputedRangeQuery 62 * 63 * @param <O> Object type 64 */ 65 public class PrecomputedDistanceMatrix<O> implements DistanceIndex<O>, RangeIndex<O>, KNNIndex<O> { 66 /** 67 * Class logger. 68 */ 69 private static final Logging LOG = Logging.getLogger(PrecomputedDistanceMatrix.class); 70 71 /** 72 * Data relation. 73 */ 74 protected final Relation<O> relation; 75 76 /** 77 * Nested distance function. 78 */ 79 protected final DistanceFunction<? super O> distanceFunction; 80 81 /** 82 * Nested distance query. 83 */ 84 protected DistanceQuery<O> distanceQuery; 85 86 /** 87 * Distance matrix. 88 */ 89 private double[] matrix = null; 90 91 /** 92 * DBID range. 93 */ 94 private DBIDRange ids; 95 96 /** 97 * Size of DBID range. 98 */ 99 private int size; 100 101 /** 102 * Constructor. 103 * 104 * @param relation Data relation 105 * @param range DBID range 106 * @param distanceFunction Distance function 107 */ PrecomputedDistanceMatrix(Relation<O> relation, DBIDRange range, DistanceFunction<? super O> distanceFunction)108 public PrecomputedDistanceMatrix(Relation<O> relation, DBIDRange range, DistanceFunction<? super O> distanceFunction) { 109 super(); 110 this.relation = relation; 111 this.ids = range; 112 this.distanceFunction = distanceFunction; 113 114 if(!distanceFunction.isSymmetric()) { 115 throw new AbortException("Distance matrixes currently only support symmetric distance functions (Patches welcome)."); 116 } 117 } 118 119 @Override initialize()120 public void initialize() { 121 size = ids.size(); 122 if(size > 65536) { 123 throw new AbortException("Distance matrixes currently have a limit of 65536 objects (~16 GB). After this, the array size exceeds the Java integer range, and a different data structure needs to be used."); 124 } 125 126 distanceQuery = distanceFunction.instantiate(relation); 127 128 final int msize = triangleSize(size); 129 matrix = new double[msize]; 130 DBIDArrayIter ix = ids.iter(), iy = ids.iter(); 131 132 FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Precomputing distance matrix", msize, LOG) : null; 133 int pos = 0; 134 for(ix.seek(0); ix.valid(); ix.advance()) { 135 // y < x -- must match {@link #getOffset}! 136 for(iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) { 137 matrix[pos] = distanceQuery.distance(ix, iy); 138 pos++; 139 } 140 if(prog != null) { 141 prog.setProcessed(prog.getProcessed() + ix.getOffset(), LOG); 142 } 143 } 144 LOG.ensureCompleted(prog); 145 } 146 147 /** 148 * Compute the size of a complete x by x triangle (minus diagonal) 149 * 150 * @param x Offset 151 * @return Size of complete triangle 152 */ triangleSize(int x)153 protected static int triangleSize(int x) { 154 return (x * (x - 1)) >>> 1; 155 } 156 157 /** 158 * Array offset computation. 159 * 160 * @param x X parameter 161 * @param y Y parameter 162 * @return Array offset 163 */ getOffset(int x, int y)164 private int getOffset(int x, int y) { 165 return (y < x) ? (triangleSize(x) + y) : (triangleSize(y) + x); 166 } 167 168 @Override logStatistics()169 public void logStatistics() { 170 if(matrix != null) { 171 LOG.statistics(new LongStatistic(this.getClass().getName() + ".matrix-size", matrix.length)); 172 } 173 } 174 175 @Override getLongName()176 public String getLongName() { 177 return "Precomputed Distance Matrix"; 178 } 179 180 @Override getShortName()181 public String getShortName() { 182 return "distance-matrix"; 183 } 184 185 @Override getDistanceQuery(DistanceFunction<? super O> distanceFunction, Object... hints)186 public DistanceQuery<O> getDistanceQuery(DistanceFunction<? super O> distanceFunction, Object... hints) { 187 if(this.distanceFunction.equals(distanceFunction)) { 188 return new PrecomputedDistanceQuery(); 189 } 190 return null; 191 } 192 193 @Override getKNNQuery(DistanceQuery<O> distanceQuery, Object... hints)194 public KNNQuery<O> getKNNQuery(DistanceQuery<O> distanceQuery, Object... hints) { 195 if(this.distanceFunction.equals(distanceQuery.getDistanceFunction())) { 196 return new PrecomputedKNNQuery(); 197 } 198 return null; 199 } 200 201 @Override getRangeQuery(DistanceQuery<O> distanceQuery, Object... hints)202 public RangeQuery<O> getRangeQuery(DistanceQuery<O> distanceQuery, Object... hints) { 203 if(this.distanceFunction.equals(distanceQuery.getDistanceFunction())) { 204 return new PrecomputedRangeQuery(); 205 } 206 return null; 207 } 208 209 /** 210 * Distance query using the precomputed matrix. 211 * 212 * @author Erich Schubert 213 */ 214 private class PrecomputedDistanceQuery implements DistanceQuery<O> { 215 @Override distance(DBIDRef id1, DBIDRef id2)216 public double distance(DBIDRef id1, DBIDRef id2) { 217 final int x = ids.getOffset(id1), y = ids.getOffset(id2); 218 return (x != y) ? matrix[getOffset(x, y)] : 0.; 219 } 220 221 @Override distance(O o1, DBIDRef id2)222 public double distance(O o1, DBIDRef id2) { 223 return distanceQuery.distance(o1, id2); 224 } 225 226 @Override distance(DBIDRef id1, O o2)227 public double distance(DBIDRef id1, O o2) { 228 return distanceQuery.distance(id1, o2); 229 } 230 231 @Override distance(O o1, O o2)232 public double distance(O o1, O o2) { 233 return distanceQuery.distance(o1, o2); 234 } 235 236 @Override getDistanceFunction()237 public DistanceFunction<? super O> getDistanceFunction() { 238 return distanceQuery.getDistanceFunction(); 239 } 240 241 @Override getRelation()242 public Relation<? extends O> getRelation() { 243 return relation; 244 } 245 } 246 247 /** 248 * Range query using the distance matrix. 249 * 250 * @author Erich Schubert 251 */ 252 private class PrecomputedRangeQuery implements RangeQuery<O> { 253 @Override getRangeForDBID(DBIDRef id, double range)254 public DoubleDBIDList getRangeForDBID(DBIDRef id, double range) { 255 ModifiableDoubleDBIDList ret = DBIDUtil.newDistanceDBIDList(); 256 getRangeForDBID(id, range, ret); 257 ret.sort(); 258 return ret; 259 } 260 261 @Override getRangeForDBID(DBIDRef id, double range, ModifiableDoubleDBIDList result)262 public void getRangeForDBID(DBIDRef id, double range, ModifiableDoubleDBIDList result) { 263 result.add(0., id); 264 DBIDArrayIter it = ids.iter(); 265 266 final int x = ids.getOffset(id); 267 // Case y < x: triangleSize(x) + y 268 int pos = triangleSize(x); 269 for(int y = 0; y < x; y++) { 270 final double dist = matrix[pos]; 271 if(dist <= range) { 272 result.add(dist, it.seek(y)); 273 } 274 pos++; 275 } 276 assert (pos == triangleSize(x + 1)); 277 // Case y > x: triangleSize(y) + x 278 pos = triangleSize(x + 1) + x; 279 for(int y = x + 1; y < size; y++) { 280 final double dist = matrix[pos]; 281 if(dist <= range) { 282 result.add(dist, it.seek(y)); 283 } 284 pos += y; 285 } 286 } 287 288 @Override getRangeForObject(O obj, double range)289 public DoubleDBIDList getRangeForObject(O obj, double range) { 290 throw new AbortException("Preprocessor KNN query only supports ID queries."); 291 } 292 293 @Override getRangeForObject(O obj, double range, ModifiableDoubleDBIDList result)294 public void getRangeForObject(O obj, double range, ModifiableDoubleDBIDList result) { 295 throw new AbortException("Preprocessor KNN query only supports ID queries."); 296 } 297 } 298 299 /** 300 * kNN query using the distance matrix. 301 * 302 * @author Erich Schubert 303 */ 304 private class PrecomputedKNNQuery implements KNNQuery<O> { 305 @Override getKNNForDBID(DBIDRef id, int k)306 public KNNList getKNNForDBID(DBIDRef id, int k) { 307 KNNHeap heap = DBIDUtil.newHeap(k); 308 heap.insert(0., id); 309 DBIDArrayIter it = ids.iter(); 310 double max = Double.POSITIVE_INFINITY; 311 final int x = ids.getOffset(id); 312 // Case y < x: triangleSize(x) + y 313 int pos = triangleSize(x); 314 for(int y = 0; y < x; y++) { 315 final double dist = matrix[pos]; 316 if(dist <= max) { 317 max = heap.insert(dist, it.seek(y)); 318 } 319 pos++; 320 } 321 assert (pos == triangleSize(x + 1)); 322 // Case y > x: triangleSize(y) + x 323 pos = triangleSize(x + 1) + x; 324 for(int y = x + 1; y < size; y++) { 325 final double dist = matrix[pos]; 326 if(dist <= max) { 327 max = heap.insert(dist, it.seek(y)); 328 } 329 pos += y; 330 } 331 return heap.toKNNList(); 332 } 333 334 @Override getKNNForBulkDBIDs(ArrayDBIDs ids, int k)335 public List<? extends KNNList> getKNNForBulkDBIDs(ArrayDBIDs ids, int k) { 336 // TODO: optimize 337 List<KNNList> ret = new ArrayList<>(ids.size()); 338 for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { 339 ret.add(getKNNForDBID(iter, k)); 340 } 341 return ret; 342 } 343 344 @Override getKNNForObject(O obj, int k)345 public KNNList getKNNForObject(O obj, int k) { 346 throw new AbortException("Preprocessor KNN query only supports ID queries."); 347 } 348 } 349 350 /** 351 * Factory for the index. 352 * 353 * @author Erich Schubert 354 * 355 * @has - - - PrecomputedDistanceMatrix 356 * 357 * @param <O> Object type 358 */ 359 public static class Factory<O> implements IndexFactory<O> { 360 /** 361 * Nested distance function. 362 */ 363 final protected DistanceFunction<? super O> distanceFunction; 364 365 /** 366 * Constructor. 367 * 368 * @param distanceFunction Distance function 369 */ Factory(DistanceFunction<? super O> distanceFunction)370 public Factory(DistanceFunction<? super O> distanceFunction) { 371 super(); 372 this.distanceFunction = distanceFunction; 373 } 374 375 @Override instantiate(Relation<O> relation)376 public PrecomputedDistanceMatrix<O> instantiate(Relation<O> relation) { 377 DBIDs rids = relation.getDBIDs(); 378 if(!(rids instanceof DBIDRange)) { 379 throw new AbortException("Distance matrixes are currently only supported for DBID ranges (as used by static databases; not on modifiable databases) for performance reasons (Patches welcome)."); 380 } 381 return new PrecomputedDistanceMatrix<>(relation, (DBIDRange) rids, distanceFunction); 382 } 383 384 @Override getInputTypeRestriction()385 public TypeInformation getInputTypeRestriction() { 386 return distanceFunction.getInputTypeRestriction(); 387 } 388 389 /** 390 * Parameterizer. 391 * 392 * @author Erich Schubert 393 * 394 * @hidden 395 * 396 * @param <O> Object type 397 */ 398 public static class Parameterizer<O> extends AbstractParameterizer { 399 /** 400 * Option parameter for the precomputed distance matrix. 401 */ 402 public static final OptionID DISTANCE_ID = new OptionID("matrix.distance", "Distance function for the precomputed distance matrix."); 403 404 /** 405 * Nested distance function. 406 */ 407 protected DistanceFunction<? super O> distanceFunction; 408 409 @Override makeOptions(Parameterization config)410 protected void makeOptions(Parameterization config) { 411 super.makeOptions(config); 412 ObjectParameter<DistanceFunction<? super O>> distanceP = new ObjectParameter<>(DISTANCE_ID, DistanceFunction.class); 413 if(config.grab(distanceP)) { 414 distanceFunction = distanceP.instantiateClass(config); 415 } 416 } 417 418 @Override makeInstance()419 protected Factory<O> makeInstance() { 420 return new Factory<>(distanceFunction); 421 } 422 } 423 } 424 } 425