1 /* 2 * This file is part of ELKI: 3 * Environment for Developing KDD-Applications Supported by Index-Structures 4 * 5 * Copyright (C) 2018 6 * ELKI Development Team 7 * 8 * This program is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU Affero General Public License as published by 10 * the Free Software Foundation, either version 3 of the License, or 11 * (at your option) any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU Affero General Public License for more details. 17 * 18 * You should have received a copy of the GNU Affero General Public License 19 * along with this program. If not, see <http://www.gnu.org/licenses/>. 20 */ 21 package de.lmu.ifi.dbs.elki.index.tree.metrical.covertree; 22 23 import java.util.ArrayList; 24 25 import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; 26 import de.lmu.ifi.dbs.elki.database.ids.DBID; 27 import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; 28 import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; 29 import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; 30 import de.lmu.ifi.dbs.elki.database.ids.DBIDVar; 31 import de.lmu.ifi.dbs.elki.database.ids.DBIDs; 32 import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList; 33 import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter; 34 import de.lmu.ifi.dbs.elki.database.ids.KNNHeap; 35 import de.lmu.ifi.dbs.elki.database.ids.KNNList; 36 import de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList; 37 import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; 38 import de.lmu.ifi.dbs.elki.database.query.knn.AbstractDistanceKNNQuery; 39 import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; 40 import de.lmu.ifi.dbs.elki.database.query.range.AbstractDistanceRangeQuery; 41 import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; 42 import de.lmu.ifi.dbs.elki.database.relation.Relation; 43 import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; 44 import de.lmu.ifi.dbs.elki.index.KNNIndex; 45 import de.lmu.ifi.dbs.elki.index.RangeIndex; 46 import de.lmu.ifi.dbs.elki.logging.Logging; 47 import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; 48 import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic; 49 import de.lmu.ifi.dbs.elki.utilities.Priority; 50 import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleObjectMinHeap; 51 52 /** 53 * Simplified cover tree data structure (in-memory). This is a <i>metrical</i> 54 * data structure that is similar to the M-tree, but not as balanced and 55 * disk-oriented. However, by not having these requirements it does not require 56 * the expensive splitting procedures of M-tree. 57 * <p> 58 * This version does not store the distance to the parent, so it needs only 59 * about 40% of the memory of {@link CoverTree} but does more distance 60 * computations for search. 61 * <p> 62 * Reference: 63 * <p> 64 * A. Beygelzimer, S. Kakade, J. Langford<br> 65 * Cover trees for nearest neighbor<br> 66 * In Proc. 23rd International Conference on Machine Learning (ICML). 67 * <p> 68 * TODO: allow insertions and removals, as in the original publication. 69 * 70 * @author Erich Schubert 71 * @since 0.7.0 72 * 73 * @has - - - CoverTreeRangeQuery 74 * @has - - - CoverTreeKNNQuery 75 */ 76 @Priority(Priority.RECOMMENDED) 77 public class SimplifiedCoverTree<O> extends AbstractCoverTree<O> implements RangeIndex<O>, KNNIndex<O> { 78 /** 79 * Class logger. 80 */ 81 private static final Logging LOG = Logging.getLogger(SimplifiedCoverTree.class); 82 83 /** 84 * Tree root. 85 */ 86 private Node root = null; 87 88 /** 89 * Constructor. 90 * 91 * @param relation data relation 92 * @param distanceFunction distance function 93 * @param expansion Expansion rate 94 * @param truncate Truncate branches with less than this number of instances. 95 */ SimplifiedCoverTree(Relation<O> relation, DistanceFunction<? super O> distanceFunction, double expansion, int truncate)96 public SimplifiedCoverTree(Relation<O> relation, DistanceFunction<? super O> distanceFunction, double expansion, int truncate) { 97 super(relation, distanceFunction, expansion, truncate); 98 } 99 100 /** 101 * Node object. 102 * 103 * @author Erich Schubert 104 */ 105 private static final class Node { 106 /** 107 * Objects in this node. Except for the first, which is the routing object. 108 */ 109 ArrayModifiableDBIDs singletons; 110 111 /** 112 * Maximum distance to descendants. 113 */ 114 double maxDist = 0.; 115 116 /** 117 * Child nodes. 118 */ 119 ArrayList<Node> children; 120 121 /** 122 * Constructor. 123 * 124 * @param r Object. 125 * @param maxDist Maximum distance to any descendant. 126 */ Node(DBIDRef r, double maxDist)127 public Node(DBIDRef r, double maxDist) { 128 this.singletons = DBIDUtil.newArray(); 129 this.singletons.add(r); 130 this.children = new ArrayList<>(); 131 this.maxDist = maxDist; 132 } 133 134 /** 135 * Constructor for leaf node. 136 * 137 * @param r Object. 138 * @param maxDist Maximum distance to any descendant. 139 * @param singletons Singletons. 140 */ Node(DBIDRef r, double maxDist, DoubleDBIDList singletons)141 public Node(DBIDRef r, double maxDist, DoubleDBIDList singletons) { 142 assert (!singletons.contains(r)); 143 this.singletons = DBIDUtil.newArray(singletons.size() + 1); 144 this.singletons.add(r); 145 this.singletons.addDBIDs(singletons); 146 this.children = null; 147 this.maxDist = maxDist; 148 } 149 150 /** 151 * True, if the node is a leaf. 152 * 153 * @return {@code true}, if this is a leaf node. 154 */ isLeaf()155 public boolean isLeaf() { 156 return children == null || children.isEmpty(); 157 } 158 } 159 160 @Override initialize()161 public void initialize() { 162 bulkLoad(relation.getDBIDs()); 163 if(LOG.isVerbose()) { 164 int[] counts = new int[5]; 165 checkCoverTree(root, counts, 0); 166 LOG.statistics(new LongStatistic(this.getClass().getName() + ".nodes", counts[0])); 167 LOG.statistics(new DoubleStatistic(this.getClass().getName() + ".avg-depth", counts[1] / (double) counts[0])); 168 LOG.statistics(new LongStatistic(this.getClass().getName() + ".max-depth", counts[2])); 169 LOG.statistics(new LongStatistic(this.getClass().getName() + ".singletons", counts[3])); 170 LOG.statistics(new LongStatistic(this.getClass().getName() + ".entries", counts[4])); 171 } 172 } 173 174 /** 175 * Bulk-load the index. 176 * 177 * @param ids IDs to load 178 */ bulkLoad(DBIDs ids)179 public void bulkLoad(DBIDs ids) { 180 if(ids.size() == 0) { 181 return; 182 } 183 assert (root == null) : "Tree already initialized."; 184 DBIDIter it = ids.iter(); 185 DBID first = DBIDUtil.deref(it); 186 // Compute distances to all neighbors: 187 ModifiableDoubleDBIDList candidates = DBIDUtil.newDistanceDBIDList(ids.size() - 1); 188 for(it.advance(); it.valid(); it.advance()) { 189 candidates.add(distance(first, it), it); 190 } 191 root = bulkConstruct(first, Integer.MAX_VALUE, candidates); 192 } 193 194 /** 195 * Bulk-load the cover tree. 196 * <p> 197 * This bulk-load is slightly simpler than the one used in the original 198 * cover-tree source: We do not look back into the "far" set of candidates. 199 * 200 * @param cur Current routing object 201 * @param maxScale Maximum scale 202 * @param elems Candidates 203 * @return Root node of subtree 204 */ bulkConstruct(DBIDRef cur, int maxScale, ModifiableDoubleDBIDList elems)205 protected Node bulkConstruct(DBIDRef cur, int maxScale, ModifiableDoubleDBIDList elems) { 206 assert (!elems.contains(cur)); 207 final double max = maxDistance(elems); 208 final int scale = Math.min(distToScale(max) - 1, maxScale); 209 final int nextScale = scale - 1; 210 // Leaf node, because points coincide, we are too deep, or have too few 211 // elements remaining: 212 if(max <= 0 || scale <= scaleBottom || elems.size() < truncate) { 213 return new Node(cur, max, elems); 214 } 215 // Find neighbors in the cover of the current object: 216 ModifiableDoubleDBIDList candidates = DBIDUtil.newDistanceDBIDList(); 217 excludeNotCovered(elems, scaleToDist(scale), candidates); 218 // If no elements were not in the cover, build a compact tree: 219 if(candidates.size() == 0) { 220 LOG.warning("Scale not chosen appropriately? " + max + " " + scaleToDist(scale)); 221 return bulkConstruct(cur, nextScale, elems); 222 } 223 // We will have at least one other child, so build the parent: 224 Node node = new Node(cur, max); 225 // Routing element now is a singleton: 226 final boolean curSingleton = elems.size() == 0; 227 if(!curSingleton) { 228 // Add node for the routing object: 229 node.children.add(bulkConstruct(cur, nextScale, elems)); 230 } 231 final double fmax = scaleToDist(nextScale); 232 // Build additional cover nodes: 233 for(DoubleDBIDListIter it = candidates.iter(); it.valid();) { 234 assert (it.getOffset() == 0); 235 DBID t = DBIDUtil.deref(it); 236 elems.clear(); // Recycle. 237 collectByCover(it, candidates, fmax, elems); 238 assert (DBIDUtil.equal(t, it)) : "First element in candidates must not change!"; 239 if(elems.size() == 0) { // Singleton 240 node.singletons.add(it); 241 } 242 else { 243 // Build a full child node: 244 node.children.add(bulkConstruct(it, nextScale, elems)); 245 } 246 candidates.removeSwap(0); 247 } 248 assert (candidates.size() == 0); 249 // Routing object is not yet handled: 250 if(curSingleton) { 251 if(node.isLeaf()) { 252 node.children = null; // First in leaf is enough. 253 } 254 else { 255 node.singletons.add(cur); // Add as regular singleton. 256 } 257 } 258 // TODO: improve recycling of lists? 259 return node; 260 } 261 262 /** 263 * Collect some statistics on the tree. 264 * 265 * @param cur Current node 266 * @param counts Counter set 267 * @param depth Current depth 268 */ checkCoverTree(Node cur, int[] counts, int depth)269 private void checkCoverTree(Node cur, int[] counts, int depth) { 270 counts[0] += 1; // Node count 271 counts[1] += depth; // Sum of depth 272 counts[2] = depth > counts[2] ? depth : counts[2]; // Max depth 273 counts[3] += cur.singletons.size() - 1; 274 counts[4] += cur.singletons.size() - (cur.children == null ? 0 : 1); 275 if(cur.children != null) { 276 ++depth; 277 for(Node chi : cur.children) { 278 checkCoverTree(chi, counts, depth); 279 } 280 assert (!cur.children.isEmpty()) : "Empty childs list."; 281 } 282 } 283 284 @Override getRangeQuery(DistanceQuery<O> distanceQuery, Object... hints)285 public RangeQuery<O> getRangeQuery(DistanceQuery<O> distanceQuery, Object... hints) { 286 // Query on the relation we index 287 if(distanceQuery.getRelation() != relation) { 288 return null; 289 } 290 DistanceFunction<? super O> distanceFunction = (DistanceFunction<? super O>) distanceQuery.getDistanceFunction(); 291 if(!this.distanceFunction.equals(distanceFunction)) { 292 LOG.debug("Distance function not supported by index - or 'equals' not implemented right!"); 293 return null; 294 } 295 DistanceQuery<O> dq = distanceFunction.instantiate(relation); 296 return new CoverTreeRangeQuery(dq); 297 } 298 299 @Override getKNNQuery(DistanceQuery<O> distanceQuery, Object... hints)300 public KNNQuery<O> getKNNQuery(DistanceQuery<O> distanceQuery, Object... hints) { 301 // Query on the relation we index 302 if(distanceQuery.getRelation() != relation) { 303 return null; 304 } 305 DistanceFunction<? super O> distanceFunction = (DistanceFunction<? super O>) distanceQuery.getDistanceFunction(); 306 if(!this.distanceFunction.equals(distanceFunction)) { 307 return null; 308 } 309 DistanceQuery<O> dq = distanceFunction.instantiate(relation); 310 return new CoverTreeKNNQuery(dq); 311 } 312 313 @Override getLogger()314 protected Logging getLogger() { 315 return LOG; 316 } 317 318 /** 319 * Range query class. 320 * 321 * @author Erich Schubert 322 */ 323 public class CoverTreeRangeQuery extends AbstractDistanceRangeQuery<O> implements RangeQuery<O> { 324 /** 325 * Constructor. 326 * 327 * @param distanceQuery Distance query 328 */ CoverTreeRangeQuery(DistanceQuery<O> distanceQuery)329 public CoverTreeRangeQuery(DistanceQuery<O> distanceQuery) { 330 super(distanceQuery); 331 } 332 333 @Override getRangeForObject(O obj, double range, ModifiableDoubleDBIDList ret)334 public void getRangeForObject(O obj, double range, ModifiableDoubleDBIDList ret) { 335 ArrayList<Node> open = new ArrayList<Node>(); // LIFO stack 336 open.add(root); 337 DBIDVar r = DBIDUtil.newVar(); 338 while(!open.isEmpty()) { 339 final Node cur = open.remove(open.size() - 1); // pop() 340 cur.singletons.assignVar(0, r); 341 final double d = distance(obj, r); 342 // Covered area not in range (metric assumption!): 343 if(d - cur.maxDist > range) { 344 continue; 345 } 346 if(!cur.isLeaf()) { // Inner node: 347 for(int i = 0, l = cur.children.size(); i < l; i++) { 348 open.add(cur.children.get(i)); 349 } 350 } 351 else { // Leaf node 352 // Consider routing object, too: 353 if(d <= range) { 354 ret.add(d, r); // First element is a candidate now 355 } 356 } 357 // For remaining singletons, compute the distances: 358 for(int i = 1, l = cur.singletons.size(); i < l; i++) { 359 cur.singletons.assignVar(i, r); 360 final double d2 = distance(obj, r); 361 if(d2 <= range) { 362 ret.add(d2, r); 363 } 364 } 365 } 366 } 367 } 368 369 /** 370 * KNN Query class. 371 * 372 * @author Erich Schubert 373 */ 374 public class CoverTreeKNNQuery extends AbstractDistanceKNNQuery<O> implements KNNQuery<O> { 375 /** 376 * Constructor. 377 * 378 * @param distanceQuery Distance 379 */ CoverTreeKNNQuery(DistanceQuery<O> distanceQuery)380 public CoverTreeKNNQuery(DistanceQuery<O> distanceQuery) { 381 super(distanceQuery); 382 } 383 384 @Override getKNNForObject(O obj, int k)385 public KNNList getKNNForObject(O obj, int k) { 386 if(k < 1) { 387 throw new IllegalArgumentException("At least one object has to be requested!"); 388 } 389 390 KNNHeap knnList = DBIDUtil.newHeap(k); 391 double d_k = Double.POSITIVE_INFINITY; 392 393 final DoubleObjectMinHeap<Node> pq = new DoubleObjectMinHeap<>(); 394 395 // Push the root node 396 final double rootdist = distance(obj, root.singletons.iter()); 397 pq.add(rootdist - root.maxDist, root); 398 399 // search in tree 400 while(!pq.isEmpty()) { 401 final Node cur = pq.peekValue(); 402 final double prio = pq.peekKey(); // Minimum distance to cover 403 final double d = prio + cur.maxDist; // Restore distance to center. 404 pq.poll(); // Remove 405 406 if(knnList.size() >= k && prio > d_k) { 407 continue; 408 } 409 410 final DBIDIter it = cur.singletons.iter(); 411 412 if(!cur.isLeaf()) { // Inner node: 413 for(Node c : cur.children) { 414 final DBIDIter f = c.singletons.iter(); 415 final double dist = DBIDUtil.equal(f, it) ? d : distance(obj, f); 416 final double newprio = dist - c.maxDist; // Minimum distance 417 if(newprio <= d_k) { 418 pq.add(newprio, c); 419 } 420 } 421 } 422 else { // Leaf node 423 // Consider routing object, too: 424 if(d <= d_k) { 425 d_k = knnList.insert(d, it); // First element is a candidate now 426 } 427 } 428 it.advance(); // Skip routing object. 429 // For remaining singletons, compute the distances: 430 while(it.valid()) { 431 final double d2 = distance(obj, it); 432 if(d2 <= d_k) { 433 d_k = knnList.insert(d2, it); 434 } 435 it.advance(); 436 } 437 } 438 return knnList.toKNNList(); 439 } 440 } 441 442 /** 443 * Index factory. 444 * 445 * @author Erich Schubert 446 * 447 * @has - - - SimplifiedCoverTree 448 * 449 * @param <O> Object type 450 */ 451 public static class Factory<O> extends AbstractCoverTree.Factory<O> { 452 /** 453 * Constructor. 454 * 455 * @param distanceFunction Distance function 456 * @param expansion Expansion rate 457 * @param truncate Truncate branches with less than this number of 458 * instances. 459 */ Factory(DistanceFunction<? super O> distanceFunction, double expansion, int truncate)460 public Factory(DistanceFunction<? super O> distanceFunction, double expansion, int truncate) { 461 super(distanceFunction, expansion, truncate); 462 } 463 464 @Override instantiate(Relation<O> relation)465 public SimplifiedCoverTree<O> instantiate(Relation<O> relation) { 466 return new SimplifiedCoverTree<O>(relation, distanceFunction, expansion, truncate); 467 } 468 469 /** 470 * Parameterization class. 471 * 472 * @author Erich Schubert 473 */ 474 public static class Parameterizer<O> extends AbstractCoverTree.Factory.Parameterizer<O> { 475 @Override makeInstance()476 protected SimplifiedCoverTree.Factory<O> makeInstance() { 477 return new SimplifiedCoverTree.Factory<>(distanceFunction, expansion, truncate); 478 } 479 } 480 } 481 } 482