1 /*
2  * This file is part of ELKI:
3  * Environment for Developing KDD-Applications Supported by Index-Structures
4  *
5  * Copyright (C) 2018
6  * ELKI Development Team
7  *
8  * This program is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Affero General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU Affero General Public License for more details.
17  *
18  * You should have received a copy of the GNU Affero General Public License
19  * along with this program. If not, see <http://www.gnu.org/licenses/>.
20  */
21 package de.lmu.ifi.dbs.elki.index.tree.metrical.covertree;
22 
23 import java.util.ArrayList;
24 
25 import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
26 import de.lmu.ifi.dbs.elki.database.ids.DBID;
27 import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
28 import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
29 import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
30 import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
31 import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
32 import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList;
33 import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter;
34 import de.lmu.ifi.dbs.elki.database.ids.KNNHeap;
35 import de.lmu.ifi.dbs.elki.database.ids.KNNList;
36 import de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList;
37 import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
38 import de.lmu.ifi.dbs.elki.database.query.knn.AbstractDistanceKNNQuery;
39 import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
40 import de.lmu.ifi.dbs.elki.database.query.range.AbstractDistanceRangeQuery;
41 import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
42 import de.lmu.ifi.dbs.elki.database.relation.Relation;
43 import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
44 import de.lmu.ifi.dbs.elki.index.KNNIndex;
45 import de.lmu.ifi.dbs.elki.index.RangeIndex;
46 import de.lmu.ifi.dbs.elki.logging.Logging;
47 import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
48 import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;
49 import de.lmu.ifi.dbs.elki.utilities.Priority;
50 import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleObjectMinHeap;
51 
52 /**
53  * Simplified cover tree data structure (in-memory). This is a <i>metrical</i>
54  * data structure that is similar to the M-tree, but not as balanced and
55  * disk-oriented. However, by not having these requirements it does not require
56  * the expensive splitting procedures of M-tree.
57  * <p>
58  * This version does not store the distance to the parent, so it needs only
59  * about 40% of the memory of {@link CoverTree} but does more distance
60  * computations for search.
61  * <p>
62  * Reference:
63  * <p>
64  * A. Beygelzimer, S. Kakade, J. Langford<br>
65  * Cover trees for nearest neighbor<br>
66  * In Proc. 23rd International Conference on Machine Learning (ICML).
67  * <p>
68  * TODO: allow insertions and removals, as in the original publication.
69  *
70  * @author Erich Schubert
71  * @since 0.7.0
72  *
73  * @has - - - CoverTreeRangeQuery
74  * @has - - - CoverTreeKNNQuery
75  */
76 @Priority(Priority.RECOMMENDED)
77 public class SimplifiedCoverTree<O> extends AbstractCoverTree<O> implements RangeIndex<O>, KNNIndex<O> {
78   /**
79    * Class logger.
80    */
81   private static final Logging LOG = Logging.getLogger(SimplifiedCoverTree.class);
82 
83   /**
84    * Tree root.
85    */
86   private Node root = null;
87 
88   /**
89    * Constructor.
90    *
91    * @param relation data relation
92    * @param distanceFunction distance function
93    * @param expansion Expansion rate
94    * @param truncate Truncate branches with less than this number of instances.
95    */
SimplifiedCoverTree(Relation<O> relation, DistanceFunction<? super O> distanceFunction, double expansion, int truncate)96   public SimplifiedCoverTree(Relation<O> relation, DistanceFunction<? super O> distanceFunction, double expansion, int truncate) {
97     super(relation, distanceFunction, expansion, truncate);
98   }
99 
100   /**
101    * Node object.
102    *
103    * @author Erich Schubert
104    */
105   private static final class Node {
106     /**
107      * Objects in this node. Except for the first, which is the routing object.
108      */
109     ArrayModifiableDBIDs singletons;
110 
111     /**
112      * Maximum distance to descendants.
113      */
114     double maxDist = 0.;
115 
116     /**
117      * Child nodes.
118      */
119     ArrayList<Node> children;
120 
121     /**
122      * Constructor.
123      *
124      * @param r Object.
125      * @param maxDist Maximum distance to any descendant.
126      */
Node(DBIDRef r, double maxDist)127     public Node(DBIDRef r, double maxDist) {
128       this.singletons = DBIDUtil.newArray();
129       this.singletons.add(r);
130       this.children = new ArrayList<>();
131       this.maxDist = maxDist;
132     }
133 
134     /**
135      * Constructor for leaf node.
136      *
137      * @param r Object.
138      * @param maxDist Maximum distance to any descendant.
139      * @param singletons Singletons.
140      */
Node(DBIDRef r, double maxDist, DoubleDBIDList singletons)141     public Node(DBIDRef r, double maxDist, DoubleDBIDList singletons) {
142       assert (!singletons.contains(r));
143       this.singletons = DBIDUtil.newArray(singletons.size() + 1);
144       this.singletons.add(r);
145       this.singletons.addDBIDs(singletons);
146       this.children = null;
147       this.maxDist = maxDist;
148     }
149 
150     /**
151      * True, if the node is a leaf.
152      *
153      * @return {@code true}, if this is a leaf node.
154      */
isLeaf()155     public boolean isLeaf() {
156       return children == null || children.isEmpty();
157     }
158   }
159 
160   @Override
initialize()161   public void initialize() {
162     bulkLoad(relation.getDBIDs());
163     if(LOG.isVerbose()) {
164       int[] counts = new int[5];
165       checkCoverTree(root, counts, 0);
166       LOG.statistics(new LongStatistic(this.getClass().getName() + ".nodes", counts[0]));
167       LOG.statistics(new DoubleStatistic(this.getClass().getName() + ".avg-depth", counts[1] / (double) counts[0]));
168       LOG.statistics(new LongStatistic(this.getClass().getName() + ".max-depth", counts[2]));
169       LOG.statistics(new LongStatistic(this.getClass().getName() + ".singletons", counts[3]));
170       LOG.statistics(new LongStatistic(this.getClass().getName() + ".entries", counts[4]));
171     }
172   }
173 
174   /**
175    * Bulk-load the index.
176    *
177    * @param ids IDs to load
178    */
bulkLoad(DBIDs ids)179   public void bulkLoad(DBIDs ids) {
180     if(ids.size() == 0) {
181       return;
182     }
183     assert (root == null) : "Tree already initialized.";
184     DBIDIter it = ids.iter();
185     DBID first = DBIDUtil.deref(it);
186     // Compute distances to all neighbors:
187     ModifiableDoubleDBIDList candidates = DBIDUtil.newDistanceDBIDList(ids.size() - 1);
188     for(it.advance(); it.valid(); it.advance()) {
189       candidates.add(distance(first, it), it);
190     }
191     root = bulkConstruct(first, Integer.MAX_VALUE, candidates);
192   }
193 
194   /**
195    * Bulk-load the cover tree.
196    * <p>
197    * This bulk-load is slightly simpler than the one used in the original
198    * cover-tree source: We do not look back into the "far" set of candidates.
199    *
200    * @param cur Current routing object
201    * @param maxScale Maximum scale
202    * @param elems Candidates
203    * @return Root node of subtree
204    */
bulkConstruct(DBIDRef cur, int maxScale, ModifiableDoubleDBIDList elems)205   protected Node bulkConstruct(DBIDRef cur, int maxScale, ModifiableDoubleDBIDList elems) {
206     assert (!elems.contains(cur));
207     final double max = maxDistance(elems);
208     final int scale = Math.min(distToScale(max) - 1, maxScale);
209     final int nextScale = scale - 1;
210     // Leaf node, because points coincide, we are too deep, or have too few
211     // elements remaining:
212     if(max <= 0 || scale <= scaleBottom || elems.size() < truncate) {
213       return new Node(cur, max, elems);
214     }
215     // Find neighbors in the cover of the current object:
216     ModifiableDoubleDBIDList candidates = DBIDUtil.newDistanceDBIDList();
217     excludeNotCovered(elems, scaleToDist(scale), candidates);
218     // If no elements were not in the cover, build a compact tree:
219     if(candidates.size() == 0) {
220       LOG.warning("Scale not chosen appropriately? " + max + " " + scaleToDist(scale));
221       return bulkConstruct(cur, nextScale, elems);
222     }
223     // We will have at least one other child, so build the parent:
224     Node node = new Node(cur, max);
225     // Routing element now is a singleton:
226     final boolean curSingleton = elems.size() == 0;
227     if(!curSingleton) {
228       // Add node for the routing object:
229       node.children.add(bulkConstruct(cur, nextScale, elems));
230     }
231     final double fmax = scaleToDist(nextScale);
232     // Build additional cover nodes:
233     for(DoubleDBIDListIter it = candidates.iter(); it.valid();) {
234       assert (it.getOffset() == 0);
235       DBID t = DBIDUtil.deref(it);
236       elems.clear(); // Recycle.
237       collectByCover(it, candidates, fmax, elems);
238       assert (DBIDUtil.equal(t, it)) : "First element in candidates must not change!";
239       if(elems.size() == 0) { // Singleton
240         node.singletons.add(it);
241       }
242       else {
243         // Build a full child node:
244         node.children.add(bulkConstruct(it, nextScale, elems));
245       }
246       candidates.removeSwap(0);
247     }
248     assert (candidates.size() == 0);
249     // Routing object is not yet handled:
250     if(curSingleton) {
251       if(node.isLeaf()) {
252         node.children = null; // First in leaf is enough.
253       }
254       else {
255         node.singletons.add(cur); // Add as regular singleton.
256       }
257     }
258     // TODO: improve recycling of lists?
259     return node;
260   }
261 
262   /**
263    * Collect some statistics on the tree.
264    *
265    * @param cur Current node
266    * @param counts Counter set
267    * @param depth Current depth
268    */
checkCoverTree(Node cur, int[] counts, int depth)269   private void checkCoverTree(Node cur, int[] counts, int depth) {
270     counts[0] += 1; // Node count
271     counts[1] += depth; // Sum of depth
272     counts[2] = depth > counts[2] ? depth : counts[2]; // Max depth
273     counts[3] += cur.singletons.size() - 1;
274     counts[4] += cur.singletons.size() - (cur.children == null ? 0 : 1);
275     if(cur.children != null) {
276       ++depth;
277       for(Node chi : cur.children) {
278         checkCoverTree(chi, counts, depth);
279       }
280       assert (!cur.children.isEmpty()) : "Empty childs list.";
281     }
282   }
283 
284   @Override
getRangeQuery(DistanceQuery<O> distanceQuery, Object... hints)285   public RangeQuery<O> getRangeQuery(DistanceQuery<O> distanceQuery, Object... hints) {
286     // Query on the relation we index
287     if(distanceQuery.getRelation() != relation) {
288       return null;
289     }
290     DistanceFunction<? super O> distanceFunction = (DistanceFunction<? super O>) distanceQuery.getDistanceFunction();
291     if(!this.distanceFunction.equals(distanceFunction)) {
292       LOG.debug("Distance function not supported by index - or 'equals' not implemented right!");
293       return null;
294     }
295     DistanceQuery<O> dq = distanceFunction.instantiate(relation);
296     return new CoverTreeRangeQuery(dq);
297   }
298 
299   @Override
getKNNQuery(DistanceQuery<O> distanceQuery, Object... hints)300   public KNNQuery<O> getKNNQuery(DistanceQuery<O> distanceQuery, Object... hints) {
301     // Query on the relation we index
302     if(distanceQuery.getRelation() != relation) {
303       return null;
304     }
305     DistanceFunction<? super O> distanceFunction = (DistanceFunction<? super O>) distanceQuery.getDistanceFunction();
306     if(!this.distanceFunction.equals(distanceFunction)) {
307       return null;
308     }
309     DistanceQuery<O> dq = distanceFunction.instantiate(relation);
310     return new CoverTreeKNNQuery(dq);
311   }
312 
313   @Override
getLogger()314   protected Logging getLogger() {
315     return LOG;
316   }
317 
318   /**
319    * Range query class.
320    *
321    * @author Erich Schubert
322    */
323   public class CoverTreeRangeQuery extends AbstractDistanceRangeQuery<O> implements RangeQuery<O> {
324     /**
325      * Constructor.
326      *
327      * @param distanceQuery Distance query
328      */
CoverTreeRangeQuery(DistanceQuery<O> distanceQuery)329     public CoverTreeRangeQuery(DistanceQuery<O> distanceQuery) {
330       super(distanceQuery);
331     }
332 
333     @Override
getRangeForObject(O obj, double range, ModifiableDoubleDBIDList ret)334     public void getRangeForObject(O obj, double range, ModifiableDoubleDBIDList ret) {
335       ArrayList<Node> open = new ArrayList<Node>(); // LIFO stack
336       open.add(root);
337       DBIDVar r = DBIDUtil.newVar();
338       while(!open.isEmpty()) {
339         final Node cur = open.remove(open.size() - 1); // pop()
340         cur.singletons.assignVar(0, r);
341         final double d = distance(obj, r);
342         // Covered area not in range (metric assumption!):
343         if(d - cur.maxDist > range) {
344           continue;
345         }
346         if(!cur.isLeaf()) { // Inner node:
347           for(int i = 0, l = cur.children.size(); i < l; i++) {
348             open.add(cur.children.get(i));
349           }
350         }
351         else { // Leaf node
352           // Consider routing object, too:
353           if(d <= range) {
354             ret.add(d, r); // First element is a candidate now
355           }
356         }
357         // For remaining singletons, compute the distances:
358         for(int i = 1, l = cur.singletons.size(); i < l; i++) {
359           cur.singletons.assignVar(i, r);
360           final double d2 = distance(obj, r);
361           if(d2 <= range) {
362             ret.add(d2, r);
363           }
364         }
365       }
366     }
367   }
368 
369   /**
370    * KNN Query class.
371    *
372    * @author Erich Schubert
373    */
374   public class CoverTreeKNNQuery extends AbstractDistanceKNNQuery<O> implements KNNQuery<O> {
375     /**
376      * Constructor.
377      *
378      * @param distanceQuery Distance
379      */
CoverTreeKNNQuery(DistanceQuery<O> distanceQuery)380     public CoverTreeKNNQuery(DistanceQuery<O> distanceQuery) {
381       super(distanceQuery);
382     }
383 
384     @Override
getKNNForObject(O obj, int k)385     public KNNList getKNNForObject(O obj, int k) {
386       if(k < 1) {
387         throw new IllegalArgumentException("At least one object has to be requested!");
388       }
389 
390       KNNHeap knnList = DBIDUtil.newHeap(k);
391       double d_k = Double.POSITIVE_INFINITY;
392 
393       final DoubleObjectMinHeap<Node> pq = new DoubleObjectMinHeap<>();
394 
395       // Push the root node
396       final double rootdist = distance(obj, root.singletons.iter());
397       pq.add(rootdist - root.maxDist, root);
398 
399       // search in tree
400       while(!pq.isEmpty()) {
401         final Node cur = pq.peekValue();
402         final double prio = pq.peekKey(); // Minimum distance to cover
403         final double d = prio + cur.maxDist; // Restore distance to center.
404         pq.poll(); // Remove
405 
406         if(knnList.size() >= k && prio > d_k) {
407           continue;
408         }
409 
410         final DBIDIter it = cur.singletons.iter();
411 
412         if(!cur.isLeaf()) { // Inner node:
413           for(Node c : cur.children) {
414             final DBIDIter f = c.singletons.iter();
415             final double dist = DBIDUtil.equal(f, it) ? d : distance(obj, f);
416             final double newprio = dist - c.maxDist; // Minimum distance
417             if(newprio <= d_k) {
418               pq.add(newprio, c);
419             }
420           }
421         }
422         else { // Leaf node
423           // Consider routing object, too:
424           if(d <= d_k) {
425             d_k = knnList.insert(d, it); // First element is a candidate now
426           }
427         }
428         it.advance(); // Skip routing object.
429         // For remaining singletons, compute the distances:
430         while(it.valid()) {
431           final double d2 = distance(obj, it);
432           if(d2 <= d_k) {
433             d_k = knnList.insert(d2, it);
434           }
435           it.advance();
436         }
437       }
438       return knnList.toKNNList();
439     }
440   }
441 
442   /**
443    * Index factory.
444    *
445    * @author Erich Schubert
446    *
447    * @has - - - SimplifiedCoverTree
448    *
449    * @param <O> Object type
450    */
451   public static class Factory<O> extends AbstractCoverTree.Factory<O> {
452     /**
453      * Constructor.
454      *
455      * @param distanceFunction Distance function
456      * @param expansion Expansion rate
457      * @param truncate Truncate branches with less than this number of
458      *        instances.
459      */
Factory(DistanceFunction<? super O> distanceFunction, double expansion, int truncate)460     public Factory(DistanceFunction<? super O> distanceFunction, double expansion, int truncate) {
461       super(distanceFunction, expansion, truncate);
462     }
463 
464     @Override
instantiate(Relation<O> relation)465     public SimplifiedCoverTree<O> instantiate(Relation<O> relation) {
466       return new SimplifiedCoverTree<O>(relation, distanceFunction, expansion, truncate);
467     }
468 
469     /**
470      * Parameterization class.
471      *
472      * @author Erich Schubert
473      */
474     public static class Parameterizer<O> extends AbstractCoverTree.Factory.Parameterizer<O> {
475       @Override
makeInstance()476       protected SimplifiedCoverTree.Factory<O> makeInstance() {
477         return new SimplifiedCoverTree.Factory<>(distanceFunction, expansion, truncate);
478       }
479     }
480   }
481 }
482