1 /**********************************************************************
2  * Software License Agreement (BSD License)
3  *
4  * Copyright 2011  Andreas Muetzel (amuetzel@uni-koblenz.de). All rights reserved.
5  *
6  * THE BSD LICENSE
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *************************************************************************/
29 #ifndef FLANN_UTIL_CUDA_RESULTSET_H
30 #define FLANN_UTIL_CUDA_RESULTSET_H
31 
32 #include <flann/util/cuda/heap.h>
33 #include <limits>
34 
35 __device__ __forceinline__
infinity()36 float infinity()
37 {
38        return __int_as_float(0x7f800000);
39 }
40 
41 #ifndef INFINITY
42 #define INFINITY infinity()
43 #endif
44 
45 namespace flann
46 {
47 namespace cuda
48 {
49 //! result set for the 1nn search. Doesn't do any global memory accesses on its own,
50 template< typename DistanceType >
51 struct SingleResultSet
52 {
53     int bestIndex;
54     DistanceType bestDist;
55     const DistanceType epsError;
56 
57     __device__ __host__
SingleResultSetSingleResultSet58     SingleResultSet( DistanceType eps ) : bestIndex(-1),bestDist(INFINITY), epsError(eps){ }
59 
60     __device__
61     inline float
worstDistSingleResultSet62     worstDist()
63     {
64         return bestDist;
65     }
66 
67     __device__
68     inline void
insertSingleResultSet69     insert(int index, DistanceType dist)
70     {
71         if( dist <= bestDist ) {
72             bestIndex=index;
73             bestDist=dist;
74         }
75     }
76 
77     DistanceType* resultDist;
78     int* resultIndex;
79 
80     __device__
81     inline void
setResultLocationSingleResultSet82     setResultLocation( DistanceType* dists, int* index, int thread, int stride )
83     {
84         resultDist=dists+thread*stride;
85         resultIndex=index+thread*stride;
86         if( stride != 1 ) {
87             for( int i=1; i<stride; i++ ) {
88                 resultDist[i]=INFINITY;
89                 resultIndex[i]=-1;
90             }
91         }
92     }
93 
94     __device__
95     inline void
finishSingleResultSet96     finish()
97     {
98         resultDist[0]=bestDist;
99         resultIndex[0]=bestIndex;
100     }
101 };
102 
103 template< typename DistanceType >
104 struct GreaterThan
105 {
106     __device__
operatorGreaterThan107     bool operator()(DistanceType a, DistanceType b)
108     {
109         return a>b;
110     }
111 };
112 
113 
114 // using this and the template uses 2 or 3 registers more than the direct implementation in the kNearestKernel, but
115 // there is no speed difference.
116 // Setting useHeap as a template parameter leads to a whole lot of things being
117 // optimized away by nvcc.
118 // Register counts are the same as when removing not-needed variables in explicit specializations
119 // and the "if( useHeap )" branches are eliminated at compile time.
120 // The downside of this: a bit more complex kernel launch code.
121 template< typename DistanceType, bool useHeap >
122 struct KnnResultSet
123 {
124     int foundNeighbors;
125     DistanceType largestHeapDist;
126     int maxDistIndex;
127     const int k;
128     const bool sorted;
129     const DistanceType epsError;
130 
131 
132     __device__ __host__
KnnResultSetKnnResultSet133     KnnResultSet(int knn, bool sortResults, DistanceType eps) : foundNeighbors(0),largestHeapDist(INFINITY),k(knn), sorted(sortResults), epsError(eps){ }
134 
135     //          __host__ __device__
136     //          KnnResultSet(const KnnResultSet& o):foundNeighbors(o.foundNeighbors),largestHeapDist(o.largestHeapDist),k(o.k){ }
137 
138     __device__
139     inline DistanceType
worstDistKnnResultSet140     worstDist()
141     {
142         return largestHeapDist;
143     }
144 
145     __device__
146     inline void
insertKnnResultSet147     insert(int index, DistanceType dist)
148     {
149         if( foundNeighbors<k ) {
150             resultDist[foundNeighbors]=dist;
151             resultIndex[foundNeighbors]=index;
152             if( foundNeighbors==k-1) {
153                 if( useHeap ) {
154                     flann::cuda::heap::make_heap(resultDist,resultIndex,k,GreaterThan<DistanceType>());
155                     largestHeapDist=resultDist[0];
156                 }
157                 else {
158                     findLargestDistIndex();
159                 }
160 
161             }
162             foundNeighbors++;
163         }
164         else if( dist < largestHeapDist ) {
165             if( useHeap ) {
166                 resultDist[0]=dist;
167                 resultIndex[0]=index;
168                 flann::cuda::heap::sift_down(resultDist,resultIndex,0,k,GreaterThan<DistanceType>());
169                 largestHeapDist=resultDist[0];
170             }
171             else {
172                 resultDist[maxDistIndex]=dist;
173                 resultIndex[maxDistIndex]=index;
174                 findLargestDistIndex();
175             }
176 
177         }
178     }
179 
180     __device__
181     void
findLargestDistIndexKnnResultSet182     findLargestDistIndex( )
183     {
184         largestHeapDist=resultDist[0];
185         maxDistIndex=0;
186         for( int i=1; i<k; i++ )
187             if( resultDist[i] > largestHeapDist ) {
188                 maxDistIndex=i;
189                 largestHeapDist=resultDist[i];
190             }
191     }
192 
193     float* resultDist;
194     int* resultIndex;
195 
196     __device__
197     inline void
setResultLocationKnnResultSet198     setResultLocation( DistanceType* dists, int* index, int thread, int stride )
199     {
200         resultDist=dists+stride*thread;
201         resultIndex=index+stride*thread;
202         for( int i=0; i<stride; i++ ) {
203             resultDist[i]=INFINITY;
204             resultIndex[i]=-1;
205             //                  resultIndex[tid+i*blockDim.x]=-1;
206             //                  resultDist[tid+i*blockDim.x]=INFINITY;
207         }
208     }
209 
210     __host__ __device__
211     inline void
finishKnnResultSet212     finish()
213     {
214         if( sorted ) {
215             if( !useHeap ) flann::cuda::heap::make_heap(resultDist,resultIndex,k,GreaterThan<DistanceType>());
216             for( int i=k-1; i>0; i-- ) {
217                 flann::cuda::swap( resultDist[0], resultDist[i] );
218                 flann::cuda::swap( resultIndex[0], resultIndex[i] );
219                 flann::cuda::heap::sift_down( resultDist,resultIndex, 0, i, GreaterThan<DistanceType>() );
220             }
221         }
222     }
223 };
224 
225 template <typename DistanceType>
226 struct CountingRadiusResultSet
227 {
228     int count_;
229     DistanceType radius_sq_;
230     int max_neighbors_;
231 
232     __device__ __host__
CountingRadiusResultSetCountingRadiusResultSet233     CountingRadiusResultSet(DistanceType radius, int max_neighbors) : count_(0),radius_sq_(radius), max_neighbors_(max_neighbors){ }
234 
235     __device__
236     inline DistanceType
worstDistCountingRadiusResultSet237     worstDist()
238     {
239         return radius_sq_;
240     }
241 
242     __device__
243     inline void
insertCountingRadiusResultSet244     insert(int index, float dist)
245     {
246         if( dist < radius_sq_ ) {
247             count_++;
248         }
249     }
250 
251     int* resultIndex;
252 
253     __device__
254     inline void
setResultLocationCountingRadiusResultSet255     setResultLocation( DistanceType* /*dists*/, int* count, int thread, int stride )
256     {
257         resultIndex=count+thread*stride;
258     }
259 
260     __device__
261     inline void
finishCountingRadiusResultSet262     finish()
263     {
264         if(( max_neighbors_<=0) ||( count_<=max_neighbors_) ) resultIndex[0]=count_;
265         else resultIndex[0]=max_neighbors_;
266     }
267 };
268 
269 template<typename DistanceType, bool useHeap>
270 struct RadiusKnnResultSet
271 {
272     int foundNeighbors;
273     DistanceType largestHeapDist;
274     int maxDistElem;
275     const int k;
276     const bool sorted;
277     const DistanceType radius_sq_;
278     int* segment_starts_;
279     //          int count_;
280 
281 
282     __device__ __host__
RadiusKnnResultSetRadiusKnnResultSet283     RadiusKnnResultSet(DistanceType radius, int knn, int* segment_starts, bool sortResults) : foundNeighbors(0),largestHeapDist(radius),k(knn), sorted(sortResults), radius_sq_(radius),segment_starts_(segment_starts) { }
284 
285     //          __host__ __device__
286     //          KnnResultSet(const KnnResultSet& o):foundNeighbors(o.foundNeighbors),largestHeapDist(o.largestHeapDist),k(o.k){ }
287 
288     __device__
289     inline DistanceType
worstDistRadiusKnnResultSet290     worstDist()
291     {
292         return largestHeapDist;
293     }
294 
295     __device__
296     inline void
insertRadiusKnnResultSet297     insert(int index, DistanceType dist)
298     {
299         if( dist < radius_sq_ ) {
300             if( foundNeighbors<k ) {
301                 resultDist[foundNeighbors]=dist;
302                 resultIndex[foundNeighbors]=index;
303                 if(( foundNeighbors==k-1) && useHeap) {
304                     if( useHeap ) {
305                         flann::cuda::heap::make_heap(resultDist,resultIndex,k,GreaterThan<DistanceType>());
306                         largestHeapDist=resultDist[0];
307                     }
308                     else {
309                         findLargestDistIndex();
310                     }
311                 }
312                 foundNeighbors++;
313 
314             }
315             else if( dist < largestHeapDist ) {
316                 if( useHeap ) {
317                     resultDist[0]=dist;
318                     resultIndex[0]=index;
319                     flann::cuda::heap::sift_down(resultDist,resultIndex,0,k,GreaterThan<DistanceType>());
320                     largestHeapDist=resultDist[0];
321                 }
322                 else {
323                     resultDist[maxDistElem]=dist;
324                     resultIndex[maxDistElem]=index;
325                     findLargestDistIndex();
326                 }
327             }
328         }
329     }
330 
331     __device__
332     void
findLargestDistIndexRadiusKnnResultSet333     findLargestDistIndex( )
334     {
335         largestHeapDist=resultDist[0];
336         maxDistElem=0;
337         for( int i=1; i<k; i++ )
338             if( resultDist[i] > largestHeapDist ) {
339                 maxDistElem=i;
340                 largestHeapDist=resultDist[i];
341             }
342     }
343 
344 
345     DistanceType* resultDist;
346     int* resultIndex;
347 
348     __device__
349     inline void
setResultLocationRadiusKnnResultSet350     setResultLocation( DistanceType* dists, int* index, int thread, int /*stride*/ )
351     {
352         resultDist=dists+segment_starts_[thread];
353         resultIndex=index+segment_starts_[thread];
354     }
355 
356     __device__
357     inline void
finishRadiusKnnResultSet358     finish()
359     {
360         if( sorted ) {
361             if( !useHeap ) flann::cuda::heap::make_heap(resultDist,resultIndex,k,GreaterThan<DistanceType>());
362             for( int i=foundNeighbors-1; i>0; i-- ) {
363                 flann::cuda::swap( resultDist[0], resultDist[i] );
364                 flann::cuda::swap( resultIndex[0], resultIndex[i] );
365                 flann::cuda::heap::sift_down( resultDist,resultIndex, 0, i, GreaterThan<DistanceType>() );
366             }
367         }
368     }
369 };
370 
371 // Difference to RadiusKnnResultSet: Works like KnnResultSet, doesn't pack the results densely (as the RadiusResultSet does)
372 template <typename DistanceType, bool useHeap>
373 struct KnnRadiusResultSet
374 {
375     int foundNeighbors;
376     DistanceType largestHeapDist;
377     int maxDistIndex;
378     const int k;
379     const bool sorted;
380     const DistanceType epsError;
381     const DistanceType radius_sq;
382 
383 
384     __device__ __host__
KnnRadiusResultSetKnnRadiusResultSet385     KnnRadiusResultSet(int knn, bool sortResults, DistanceType eps, DistanceType radius) : foundNeighbors(0),largestHeapDist(radius),k(knn), sorted(sortResults), epsError(eps),radius_sq(radius){ }
386 
387     //          __host__ __device__
388     //          KnnResultSet(const KnnResultSet& o):foundNeighbors(o.foundNeighbors),largestHeapDist(o.largestHeapDist),k(o.k){ }
389 
390     __device__
391     inline DistanceType
worstDistKnnRadiusResultSet392     worstDist()
393     {
394         return largestHeapDist;
395     }
396 
397     __device__
398     inline void
insertKnnRadiusResultSet399     insert(int index, DistanceType dist)
400     {
401         if( dist < largestHeapDist ) {
402             if( foundNeighbors<k ) {
403                 resultDist[foundNeighbors]=dist;
404                 resultIndex[foundNeighbors]=index;
405                 if( foundNeighbors==k-1 ) {
406                     if( useHeap ) {
407                         flann::cuda::heap::make_heap(resultDist,resultIndex,k,GreaterThan<DistanceType>());
408                         largestHeapDist=resultDist[0];
409                     }
410                     else {
411                         findLargestDistIndex();
412                     }
413                 }
414                 foundNeighbors++;
415             }
416             else { //if( dist < largestHeapDist )
417                 if( useHeap ) {
418                     resultDist[0]=dist;
419                     resultIndex[0]=index;
420                     flann::cuda::heap::sift_down(resultDist,resultIndex,0,k,GreaterThan<DistanceType>());
421                     largestHeapDist=resultDist[0];
422                 }
423                 else {
424                     resultDist[maxDistIndex]=dist;
425                     resultIndex[maxDistIndex]=index;
426                     findLargestDistIndex();
427                 }
428             }
429         }
430     }
431     __device__
432     void
findLargestDistIndexKnnRadiusResultSet433     findLargestDistIndex( )
434     {
435         largestHeapDist=resultDist[0];
436         maxDistIndex=0;
437         for( int i=1; i<k; i++ )
438             if( resultDist[i] > largestHeapDist ) {
439                 maxDistIndex=i;
440                 largestHeapDist=resultDist[i];
441             }
442     }
443 
444     DistanceType* resultDist;
445     int* resultIndex;
446 
447     __device__
448     inline void
setResultLocationKnnRadiusResultSet449     setResultLocation( DistanceType* dists, int* index, int thread, int stride )
450     {
451         resultDist=dists+stride*thread;
452         resultIndex=index+stride*thread;
453         for( int i=0; i<stride; i++ ) {
454             resultDist[i]=INFINITY;
455             resultIndex[i]=-1;
456             //                  resultIndex[tid+i*blockDim.x]=-1;
457             //                  resultDist[tid+i*blockDim.x]=INFINITY;
458         }
459     }
460 
461     __device__
462     inline void
finishKnnRadiusResultSet463     finish()
464     {
465         if( sorted ) {
466             if( !useHeap ) flann::cuda::heap::make_heap(resultDist,resultIndex,k,GreaterThan<DistanceType>());
467             for( int i=k-1; i>0; i-- ) {
468                 flann::cuda::swap( resultDist[0], resultDist[i] );
469                 flann::cuda::swap( resultIndex[0], resultIndex[i] );
470                 flann::cuda::heap::sift_down( resultDist,resultIndex, 0, i, GreaterThan<DistanceType>() );
471             }
472         }
473     }
474 };
475 
476 //! fills the radius output buffer.
477 //! IMPORTANT ASSERTION: ASSUMES THAT THERE IS ENOUGH SPACE FOR EVERY NEIGHBOR! IF THIS ISN'T
478 //! TRUE, USE KnnRadiusResultSet! (Otherwise, the neighbors of one element might overflow into the next element, or past the buffer.)
479 template< typename DistanceType >
480 struct RadiusResultSet
481 {
482     DistanceType radius_sq_;
483     int* segment_starts_;
484     int count_;
485     bool sorted_;
486 
487     __device__ __host__
RadiusResultSetRadiusResultSet488     RadiusResultSet(DistanceType radius, int* segment_starts, bool sorted) : radius_sq_(radius), segment_starts_(segment_starts), count_(0), sorted_(sorted){ }
489 
490     __device__
491     inline DistanceType
worstDistRadiusResultSet492     worstDist()
493     {
494         return radius_sq_;
495     }
496 
497     __device__
498     inline void
insertRadiusResultSet499     insert(int index, DistanceType dist)
500     {
501         if( dist < radius_sq_ ) {
502             resultIndex[count_]=index;
503             resultDist[count_]=dist;
504             count_++;
505         }
506     }
507 
508     int* resultIndex;
509     DistanceType* resultDist;
510 
511     __device__
512     inline void
setResultLocationRadiusResultSet513     setResultLocation( DistanceType* dists, int* index, int thread, int /*stride*/ )
514     {
515         resultIndex=index+segment_starts_[thread];
516         resultDist=dists+segment_starts_[thread];
517     }
518 
519     __device__
520     inline void
finishRadiusResultSet521     finish()
522     {
523         if( sorted_ ) {
524             flann::cuda::heap::make_heap( resultDist,resultIndex, count_, GreaterThan<DistanceType>() );
525             for( int i=count_-1; i>0; i-- ) {
526                 flann::cuda::swap( resultDist[0], resultDist[i] );
527                 flann::cuda::swap( resultIndex[0], resultIndex[i] );
528                 flann::cuda::heap::sift_down( resultDist,resultIndex, 0, i, GreaterThan<DistanceType>() );
529             }
530         }
531     }
532 };
533 }
534 }
535 
536 #endif
537