1 /**********************************************************************
2  * Software License Agreement (BSD License)
3  *
4  * Copyright 2011       Andreas Muetzel (amuetzel@uni-koblenz.de). All rights reserved.
5  *
6  * THE BSD LICENSE
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *************************************************************************/
29 
30 #include "kdtree_cuda_3d_index.h"
31 #include <flann/algorithms/dist.h>
32 #include <flann/util/cuda/result_set.h>
33 // #define THRUST_DEBUG 1
34 #include <cuda.h>
35 #include <thrust/gather.h>
36 #include <thrust/copy.h>
37 #include <thrust/device_vector.h>
38 #include <vector_types.h>
39 #include <flann/util/cutil_math.h>
40 #include <thrust/host_vector.h>
41 #include <thrust/copy.h>
42 #include <flann/util/cuda/heap.h>
43 #include <thrust/scan.h>
44 #include <thrust/count.h>
45 #include <flann/algorithms/kdtree_cuda_builder.h>
46 #include <vector_types.h>
47 namespace flann
48 {
49 
50 namespace KdTreeCudaPrivate
51 {
52 template< typename GPUResultSet, typename Distance >
53 __device__
searchNeighbors(const cuda::kd_tree_builder_detail::SplitInfo * splits,const int * child1,const int * parent,const float4 * aabbLow,const float4 * aabbHigh,const float4 * elements,const float4 & q,GPUResultSet & result,const Distance & distance=Distance ())54 void searchNeighbors(const cuda::kd_tree_builder_detail::SplitInfo* splits,
55                      const int* child1,
56                      const int* parent,
57                      const float4* aabbLow,
58                      const float4* aabbHigh, const float4* elements, const float4& q, GPUResultSet& result, const Distance& distance = Distance() )
59 {
60 
61     bool backtrack=false;
62     int lastNode=-1;
63     int current=0;
64 
65     cuda::kd_tree_builder_detail::SplitInfo split;
66     while(true) {
67         if( current==-1 ) break;
68         split = splits[current];
69 
70         float diff1;
71         if( split.split_dim==0 ) diff1=q.x- split.split_val;
72         else if( split.split_dim==1 ) diff1=q.y- split.split_val;
73         else if( split.split_dim==2 ) diff1=q.z- split.split_val;
74 
75         // children are next to each other: leftChild+1 == rightChild
76         int leftChild= child1[current];
77         int bestChild=leftChild;
78         int otherChild=leftChild;
79 
80         if ((diff1)<0) {
81             otherChild++;
82         }
83         else {
84             bestChild++;
85         }
86 
87         if( !backtrack ) {
88             /* If this is a leaf node, then do check and return. */
89             if (leftChild==-1) {
90                 for (int i=split.left; i<split.right; ++i) {
91                     float dist=distance.dist(elements[i],q);
92                     result.insert(i,dist);
93                 }
94                 backtrack=true;
95                 lastNode=current;
96                 current=parent[current];
97             }
98             else { // go to closer child node
99                 lastNode=current;
100                 current=bestChild;
101             }
102         }
103         else { // continue moving back up the tree or visit far node?
104               // minimum possible distance between query point and a point inside the AABB
105             float mindistsq=0;
106             float4 aabbMin=aabbLow[otherChild];
107             float4 aabbMax=aabbHigh[otherChild];
108 
109             if( q.x < aabbMin.x ) mindistsq+=distance.axisDist(q.x, aabbMin.x);
110             else if( q.x > aabbMax.x ) mindistsq+=distance.axisDist(q.x, aabbMax.x);
111             if( q.y < aabbMin.y ) mindistsq+=distance.axisDist(q.y, aabbMin.y);
112             else if( q.y > aabbMax.y ) mindistsq+=distance.axisDist(q.y, aabbMax.y);
113             if( q.z < aabbMin.z ) mindistsq+=distance.axisDist(q.z, aabbMin.z);
114             else if( q.z > aabbMax.z ) mindistsq+=distance.axisDist(q.z, aabbMax.z);
115 
116             //  the far node was NOT the last node (== not visited yet) AND there could be a closer point in it
117             if(( lastNode==bestChild) && (mindistsq <= result.worstDist() ) ) {
118                 lastNode=current;
119                 current=otherChild;
120                 backtrack=false;
121             }
122             else {
123                 lastNode=current;
124                 current=parent[current];
125             }
126         }
127 
128     }
129 }
130 
131 
132 template< typename GPUResultSet, typename Distance >
133 __global__
nearestKernel(const cuda::kd_tree_builder_detail::SplitInfo * splits,const int * child1,const int * parent,const float4 * aabbMin,const float4 * aabbMax,const float4 * elements,const float * query,int stride,int resultStride,int * resultIndex,float * resultDist,int querysize,GPUResultSet result,Distance dist=Distance ())134 void nearestKernel(const cuda::kd_tree_builder_detail::SplitInfo* splits,
135                    const int* child1,
136                    const int* parent,
137                    const float4* aabbMin,
138                    const float4* aabbMax, const float4* elements, const float* query, int stride, int resultStride, int* resultIndex, float* resultDist, int querysize, GPUResultSet result, Distance dist = Distance())
139 {
140     typedef float DistanceType;
141     typedef float ElementType;
142     //                  typedef DistanceType float;
143     size_t tid = blockDim.x*blockIdx.x + threadIdx.x;
144 
145     if( tid >= querysize ) return;
146 
147     float4 q = make_float4(query[tid*stride],query[tid*stride+1],query[tid*stride+2],0);
148 
149     result.setResultLocation( resultDist, resultIndex, tid, resultStride );
150 
151     searchNeighbors(splits,child1,parent,aabbMin,aabbMax,elements, q, result, dist);
152 
153     result.finish();
154 }
155 
156 }
157 
158 //! contains some pointers that use cuda data types and that cannot be easily
159 //! forward-declared.
160 //! basically it contains all GPU buffers
161 template<typename Distance>
162 struct KDTreeCuda3dIndex<Distance>::GpuHelper
163 {
164     thrust::device_vector< cuda::kd_tree_builder_detail::SplitInfo >* gpu_splits_;
165     thrust::device_vector< int >* gpu_parent_;
166     thrust::device_vector< int >* gpu_child1_;
167     thrust::device_vector< float4 >* gpu_aabb_min_;
168     thrust::device_vector< float4 >* gpu_aabb_max_;
169     thrust::device_vector<float4>* gpu_points_;
170     thrust::device_vector<int>* gpu_vind_;
GpuHelperflann::KDTreeCuda3dIndex::GpuHelper171     GpuHelper() :  gpu_splits_(0), gpu_parent_(0), gpu_child1_(0), gpu_aabb_min_(0), gpu_aabb_max_(0), gpu_points_(0), gpu_vind_(0){
172     }
~GpuHelperflann::KDTreeCuda3dIndex::GpuHelper173     ~GpuHelper()
174     {
175         delete gpu_splits_;
176         gpu_splits_=0;
177         delete gpu_parent_;
178         gpu_parent_=0;
179         delete gpu_child1_;
180         gpu_child1_=0;
181         delete gpu_aabb_max_;
182         gpu_aabb_max_=0;
183         delete gpu_aabb_min_;
184         gpu_aabb_min_=0;
185         delete gpu_vind_;
186         gpu_vind_=0;
187 
188         delete gpu_points_;
189         gpu_points_=0;
190     }
191 };
192 
193 //! thrust transform functor
194 //! transforms indices in the internal data set back to the original indices
195 struct map_indices
196 {
197     const int* v_;
198 
map_indicesflann::map_indices199     map_indices(const int* v) : v_(v) {
200     }
201 
202     __host__ __device__
operator ()flann::map_indices203     float operator() (const int&i) const
204     {
205         if( i>= 0 ) return v_[i];
206         else return i;
207     }
208 };
209 
210 //! implementation of L2 distance for the CUDA kernels
211 struct CudaL2
212 {
213 
214     static float
215     __host__ __device__
axisDistflann::CudaL2216     axisDist( float a, float b )
217     {
218         return (a-b)*(a-b);
219     }
220 
221     static float
222     __host__ __device__
distflann::CudaL2223     dist( float4 a, float4 b )
224     {
225         float4 diff = a-b;
226         return dot(diff,diff);
227     }
228 };
229 
230 //! implementation of L1 distance for the CUDA kernels
231 //! NOT TESTED!
232 struct CudaL1
233 {
234 
235     static float
236     __host__ __device__
axisDistflann::CudaL1237     axisDist( float a, float b )
238     {
239         return fabs(a-b);
240     }
241 
242     static float
243     __host__ __device__
distflann::CudaL1244     dist( float4 a, float4 b )
245     {
246         return fabs(a.x-b.x)+fabs (a.y-b.y)+( a.z-b.z)+(a.w-b.w);
247     }
248 };
249 
250 //! used to adapt CPU and GPU distance types.
251 //! specializations define the ::type as their corresponding GPU distance type
252 //! \see GpuDistance< L2<float> >, GpuDistance< L2_Simple<float> >
253 template< class Distance >
254 struct GpuDistance
255 {
256 };
257 
258 template<>
259 struct GpuDistance< L2<float> >
260 {
261     typedef CudaL2 type;
262 };
263 
264 template<>
265 struct GpuDistance< L2_Simple<float> >
266 {
267     typedef CudaL2 type;
268 };
269 template<>
270 struct GpuDistance< L1<float> >
271 {
272     typedef CudaL1 type;
273 };
274 
275 
276 template< typename Distance >
knnSearchGpu(const Matrix<ElementType> & queries,Matrix<int> & indices,Matrix<DistanceType> & dists,size_t knn,const SearchParams & params) const277 void KDTreeCuda3dIndex<Distance>::knnSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, size_t knn, const SearchParams& params) const
278 {
279     assert(indices.rows >= queries.rows);
280     assert(dists.rows >= queries.rows);
281     assert(int(indices.cols) >= knn);
282     assert( dists.cols == indices.cols && dists.stride==indices.stride );
283 
284     int istride=queries.stride/sizeof(ElementType);
285     int ostride=indices.stride/4;
286 
287     bool matrices_on_gpu = params.matrices_in_gpu_ram;
288 
289     int threadsPerBlock = 128;
290     int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock;
291 
292     float epsError = 1+params.eps;
293     bool sorted = params.sorted;
294     bool use_heap = params.use_heap;
295 
296     typename GpuDistance<Distance>::type distance;
297 //       std::cout<<" search: "<<std::endl;
298 //       std::cout<<"  rows: "<<indices.rows<<" "<<dists.rows<<" "<<queries.rows<<std::endl;
299 //       std::cout<<"  cols: "<<indices.cols<<" "<<dists.cols<<" "<<queries.cols<<std::endl;
300 //       std::cout<<"  stride: "<<indices.stride<<" "<<dists.stride<<" "<<queries.stride<<std::endl;
301 //       std::cout<<"  stride2:"<<istride<<" "<<ostride<<std::endl;
302 //       std::cout<<"  knn:"<<knn<<"  matrices_on_gpu:"<<matrices_on_gpu<<std::endl;
303 
304     if( !matrices_on_gpu ) {
305         thrust::device_vector<float> queriesDev(istride* queries.rows,0);
306         thrust::copy( queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() );
307         thrust::device_vector<float> distsDev(queries.rows* ostride);
308         thrust::device_vector<int> indicesDev(queries.rows* ostride);
309 
310 
311 
312         if( knn==1  ) {
313             KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
314                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
315                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
316                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
317                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
318                                                                                   thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
319                                                                                   thrust::raw_pointer_cast(&queriesDev[0]),
320                                                                                   istride,
321                                                                                   ostride,
322                                                                                   thrust::raw_pointer_cast(&indicesDev[0]),
323                                                                                   thrust::raw_pointer_cast(&distsDev[0]),
324                                                                                   queries.rows, flann::cuda::SingleResultSet<float>(epsError),distance);
325             //                          KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_nodes_)[0])),
326             //                                                                                                                                                                                                                                                                                          thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
327             //                                                                                                                                                                                                                                                                                          thrust::raw_pointer_cast(&queriesDev[0]),
328             //                                                                                                                                                                                                                                                                                          queries.stride,
329             //                                                                                                                                                                                                                                                                                          thrust::raw_pointer_cast(&indicesDev[0]),
330             //                                                                                                                                                                                                                                                                                          thrust::raw_pointer_cast(&distsDev[0]),
331             //                                                                                                                                                                                                                                                                                          queries.rows, epsError);
332             //
333         }
334         else {
335             if( use_heap ) {
336                 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
337                                                                                       thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
338                                                                                       thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
339                                                                                       thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
340                                                                                       thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
341                                                                                       thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
342                                                                                       thrust::raw_pointer_cast(&queriesDev[0]),
343                                                                                       istride,
344                                                                                       ostride,
345                                                                                       thrust::raw_pointer_cast(&indicesDev[0]),
346                                                                                       thrust::raw_pointer_cast(&distsDev[0]),
347                                                                                       queries.rows, flann::cuda::KnnResultSet<float, true>(knn,sorted,epsError)
348                                                                                       , distance);
349             }
350             else {
351                 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
352                                                                                       thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
353                                                                                       thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
354                                                                                       thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
355                                                                                       thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
356                                                                                       thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
357                                                                                       thrust::raw_pointer_cast(&queriesDev[0]),
358                                                                                       istride,
359                                                                                       ostride,
360                                                                                       thrust::raw_pointer_cast(&indicesDev[0]),
361                                                                                       thrust::raw_pointer_cast(&distsDev[0]),
362                                                                                       queries.rows, flann::cuda::KnnResultSet<float, false>(knn,sorted,epsError),
363                                                                                       distance
364                                                                                       );
365             }
366         }
367         thrust::copy( distsDev.begin(), distsDev.end(), dists.ptr() );
368         thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) ));
369         thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() );
370     }
371     else {
372         thrust::device_ptr<float> qd = thrust::device_pointer_cast(queries.ptr());
373         thrust::device_ptr<float> dd = thrust::device_pointer_cast(dists.ptr());
374         thrust::device_ptr<int> id = thrust::device_pointer_cast(indices.ptr());
375 
376 
377 
378         if( knn==1  ) {
379             KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
380                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
381                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
382                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
383                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
384                                                                                   thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
385                                                                                   qd.get(),
386                                                                                   istride,
387                                                                                   ostride,
388                                                                                   id.get(),
389                                                                                   dd.get(),
390                                                                                   queries.rows, flann::cuda::SingleResultSet<float>(epsError),distance);
391             //                          KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_nodes_)[0])),
392             //                                                                                                                                                                                                                                                                                          thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
393             //                                                                                                                                                                                                                                                                                          thrust::raw_pointer_cast(&queriesDev[0]),
394             //                                                                                                                                                                                                                                                                                          queries.stride,
395             //                                                                                                                                                                                                                                                                                          thrust::raw_pointer_cast(&indicesDev[0]),
396             //                                                                                                                                                                                                                                                                                          thrust::raw_pointer_cast(&distsDev[0]),
397             //                                                                                                                                                                                                                                                                                          queries.rows, epsError);
398             //
399         }
400         else {
401             if( use_heap ) {
402                 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
403                                                                                       thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
404                                                                                       thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
405                                                                                       thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
406                                                                                       thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
407                                                                                       thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
408                                                                                       qd.get(),
409                                                                                       istride,
410                                                                                       ostride,
411                                                                                       id.get(),
412                                                                                       dd.get(),
413                                                                                       queries.rows, flann::cuda::KnnResultSet<float, true>(knn,sorted,epsError)
414                                                                                       , distance);
415             }
416             else {
417                 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
418                                                                                       thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
419                                                                                       thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
420                                                                                       thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
421                                                                                       thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
422                                                                                       thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
423                                                                                       qd.get(),
424                                                                                       istride,
425                                                                                       ostride,
426                                                                                       id.get(),
427                                                                                       dd.get(),
428                                                                                       queries.rows, flann::cuda::KnnResultSet<float, false>(knn,sorted,epsError),
429                                                                                       distance
430                                                                                       );
431             }
432         }
433         thrust::transform(id, id+knn*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) ));
434     }
435 }
436 
437 
438 template< typename Distance>
radiusSearchGpu(const Matrix<ElementType> & queries,std::vector<std::vector<int>> & indices,std::vector<std::vector<DistanceType>> & dists,float radius,const SearchParams & params) const439 int KDTreeCuda3dIndex<Distance >::radiusSearchGpu(const Matrix<ElementType>& queries, std::vector< std::vector<int> >& indices,
440                                                   std::vector<std::vector<DistanceType> >& dists, float radius, const SearchParams& params) const
441 {
442     //  assert(indices.roasdfws >= queries.rows);
443     //  assert(dists.rows >= queries.rows);
444 
445     int max_neighbors = params.max_neighbors;
446     bool sorted = params.sorted;
447     bool use_heap = params.use_heap;
448     if (indices.size() < queries.rows ) indices.resize(queries.rows);
449     if (dists.size() < queries.rows ) dists.resize(queries.rows);
450 
451     int istride=queries.stride/sizeof(ElementType);
452 
453     thrust::device_vector<float> queriesDev(istride* queries.rows,0);
454     thrust::copy( queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() );
455     thrust::device_vector<int> countsDev(queries.rows);
456 
457     typename GpuDistance<Distance>::type distance;
458 
459     int threadsPerBlock = 128;
460     int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock;
461 
462 
463     KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
464                                                                           thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
465                                                                           thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
466                                                                           thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
467                                                                           thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
468                                                                           thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
469                                                                           thrust::raw_pointer_cast(&queriesDev[0]),
470                                                                           istride,
471                                                                           1,
472                                                                           thrust::raw_pointer_cast(&countsDev[0]),
473                                                                           0,
474                                                                           queries.rows, flann::cuda::CountingRadiusResultSet<float>(radius,max_neighbors),
475                                                                           distance
476                                                                           );
477 
478     thrust::host_vector<int> counts_host=countsDev;
479 
480     if( max_neighbors!=0 ) { // we'll need this later, but the exclusive_scan will change the array
481         for( size_t i=0; i<queries.rows; i++ ) {
482             int count = counts_host[i];
483             if( count > 0 ) {
484                 indices[i].resize(count);
485                 dists[i].resize(count);
486             }
487             else {
488                 indices[i].clear();
489                 dists[i].clear();
490             }
491 
492         }
493     }
494 
495     int neighbors_last_elem = countsDev.back();
496     thrust::exclusive_scan( countsDev.begin(), countsDev.end(), countsDev.begin() );
497 
498     size_t total_neighbors=neighbors_last_elem+countsDev.back();
499     if( max_neighbors==0 ) return total_neighbors;
500 
501     thrust::device_vector<int> indicesDev(total_neighbors,-1);
502     thrust::device_vector<float> distsDev(total_neighbors,std::numeric_limits<float>::infinity());
503 
504     if( max_neighbors<0 ) {
505         KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
506                                                                               thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
507                                                                               thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
508                                                                               thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
509                                                                               thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
510                                                                               thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
511                                                                               thrust::raw_pointer_cast(&queriesDev[0]),
512                                                                               istride,
513                                                                               1,
514                                                                               thrust::raw_pointer_cast(&indicesDev[0]),
515                                                                               thrust::raw_pointer_cast(&distsDev[0]),
516                                                                               queries.rows, flann::cuda::RadiusResultSet<float>(radius,thrust::raw_pointer_cast(&countsDev[0]),sorted), distance);
517     }
518     else {
519         if( use_heap ) {
520             KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
521                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
522                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
523                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
524                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
525                                                                                   thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
526                                                                                   thrust::raw_pointer_cast(&queriesDev[0]),
527                                                                                   istride,
528                                                                                   1,
529                                                                                   thrust::raw_pointer_cast(&indicesDev[0]),
530                                                                                   thrust::raw_pointer_cast(&distsDev[0]),
531                                                                                   queries.rows, flann::cuda::RadiusKnnResultSet<float, true>(radius,max_neighbors, thrust::raw_pointer_cast(&countsDev[0]),sorted), distance);
532         }
533         else {
534             KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
535                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
536                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
537                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
538                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
539                                                                                   thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
540                                                                                   thrust::raw_pointer_cast(&queriesDev[0]),
541                                                                                   istride,
542                                                                                   1,
543                                                                                   thrust::raw_pointer_cast(&indicesDev[0]),
544                                                                                   thrust::raw_pointer_cast(&distsDev[0]),
545                                                                                   queries.rows, flann::cuda::RadiusKnnResultSet<float, false>(radius,max_neighbors, thrust::raw_pointer_cast(&countsDev[0]),sorted), distance);
546         }
547     }
548     thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) ));
549     thrust::host_vector<int> indices_temp = indicesDev;
550     thrust::host_vector<float> dists_temp = distsDev;
551 
552     int buffer_index=0;
553     for( size_t i=0; i<queries.rows; i++ ) {
554         for( size_t j=0; j<counts_host[i]; j++ ) {
555             dists[i][j]=dists_temp[buffer_index];
556             indices[i][j]=indices_temp[buffer_index];
557             ++buffer_index;
558         }
559     }
560 
561     return buffer_index;
562 }
563 
564 //! used in the radius search to count the total number of neighbors
565 struct isNotMinusOne
566 {
567     __host__ __device__
operator ()flann::isNotMinusOne568     bool operator() ( int i ){
569         return i!=-1;
570     }
571 };
572 
573 template< typename Distance>
radiusSearchGpu(const Matrix<ElementType> & queries,Matrix<int> & indices,Matrix<DistanceType> & dists,float radius,const SearchParams & params) const574 int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, float radius, const SearchParams& params) const
575 {
576 	int max_neighbors = params.max_neighbors;
577     assert(indices.rows >= queries.rows);
578     assert(dists.rows >= queries.rows || max_neighbors==0 );
579     assert(indices.stride==dists.stride  || max_neighbors==0 );
580     assert( indices.cols==indices.stride/sizeof(int) );
581     assert(dists.rows >= queries.rows || max_neighbors==0 );
582 
583     bool sorted = params.sorted;
584     bool matrices_on_gpu = params.matrices_in_gpu_ram;
585     float epsError = 1+params.eps;
586     bool use_heap = params.use_heap;
587     int istride=queries.stride/sizeof(ElementType);
588     int ostride= indices.stride/4;
589 
590 
591     if( max_neighbors<0 ) max_neighbors=indices.cols;
592 
593     if( !matrices_on_gpu ) {
594         thrust::device_vector<float> queriesDev(istride* queries.rows,0);
595         thrust::copy( queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() );
596         typename GpuDistance<Distance>::type distance;
597         int threadsPerBlock = 128;
598         int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock;
599         if( max_neighbors== 0 ) {
600             thrust::device_vector<int> indicesDev(queries.rows* ostride);
601             KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
602                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
603                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
604                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
605                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
606                                                                                   thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
607                                                                                   thrust::raw_pointer_cast(&queriesDev[0]),
608                                                                                   istride,
609                                                                                   ostride,
610                                                                                   thrust::raw_pointer_cast(&indicesDev[0]),
611                                                                                   0,
612                                                                                   queries.rows, flann::cuda::CountingRadiusResultSet<float>(radius,-1),
613                                                                                   distance
614                                                                                   );
615             thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() );
616             return thrust::reduce(indicesDev.begin(), indicesDev.end() );
617         }
618 
619 
620         thrust::device_vector<float> distsDev(queries.rows* max_neighbors);
621         thrust::device_vector<int> indicesDev(queries.rows* max_neighbors);
622 
623         if( use_heap ) {
624             KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
625                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
626                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
627                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
628                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
629                                                                                   thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
630                                                                                   thrust::raw_pointer_cast(&queriesDev[0]),
631                                                                                   istride,
632                                                                                   ostride,
633                                                                                   thrust::raw_pointer_cast(&indicesDev[0]),
634                                                                                   thrust::raw_pointer_cast(&distsDev[0]),
635                                                                                   queries.rows, flann::cuda::KnnRadiusResultSet<float, true>(max_neighbors,sorted,epsError, radius), distance);
636         }
637         else {
638             KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
639                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
640                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
641                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
642                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
643                                                                                   thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
644                                                                                   thrust::raw_pointer_cast(&queriesDev[0]),
645                                                                                   istride,
646                                                                                   ostride,
647                                                                                   thrust::raw_pointer_cast(&indicesDev[0]),
648                                                                                   thrust::raw_pointer_cast(&distsDev[0]),
649                                                                                   queries.rows, flann::cuda::KnnRadiusResultSet<float, false>(max_neighbors,sorted,epsError, radius), distance);
650         }
651 
652         thrust::copy( distsDev.begin(), distsDev.end(), dists.ptr() );
653         thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) ));
654         thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() );
655 
656         return thrust::count_if(indicesDev.begin(), indicesDev.end(), isNotMinusOne() );
657     }
658     else {
659 
660         thrust::device_ptr<float> qd=thrust::device_pointer_cast(queries.ptr());
661         thrust::device_ptr<float> dd=thrust::device_pointer_cast(dists.ptr());
662         thrust::device_ptr<int> id=thrust::device_pointer_cast(indices.ptr());
663         typename GpuDistance<Distance>::type distance;
664         int threadsPerBlock = 128;
665         int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock;
666 
667         if( max_neighbors== 0 ) {
668             thrust::device_vector<int> indicesDev(queries.rows* indices.stride);
669             KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
670                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
671                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
672                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
673                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
674                                                                                   thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
675                                                                                   qd.get(),
676                                                                                   istride,
677                                                                                   ostride,
678                                                                                   id.get(),
679                                                                                   0,
680                                                                                   queries.rows, flann::cuda::CountingRadiusResultSet<float>(radius,-1),
681                                                                                   distance
682                                                                                   );
683             thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() );
684             return thrust::reduce(indicesDev.begin(), indicesDev.end() );
685         }
686 
687         if( use_heap ) {
688             KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
689                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
690                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
691                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
692                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
693                                                                                   thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
694                                                                                   qd.get(),
695                                                                                   istride,
696                                                                                   ostride,
697                                                                                   id.get(),
698                                                                                   dd.get(),
699                                                                                   queries.rows, flann::cuda::KnnRadiusResultSet<float, true>(max_neighbors,sorted,epsError, radius), distance);
700         }
701         else {
702             KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
703                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
704                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
705                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
706                                                                                   thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
707                                                                                   thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
708                                                                                   qd.get(),
709                                                                                   istride,
710                                                                                   ostride,
711                                                                                   id.get(),
712                                                                                   dd.get(),
713                                                                                   queries.rows, flann::cuda::KnnRadiusResultSet<float, false>(max_neighbors,sorted,epsError, radius), distance);
714         }
715 
716         thrust::transform(id, id+max_neighbors*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) ));
717 
718         return thrust::count_if(id, id+max_neighbors*queries.rows, isNotMinusOne() );
719     }
720 }
721 
722 template<typename Distance>
uploadTreeToGpu()723 void KDTreeCuda3dIndex<Distance>::uploadTreeToGpu()
724 {
725     // just make sure that no weird alignment stuff is going on...
726     // shouldn't, but who knows
727     // (I would make this a (boost) static assertion, but so far flann seems to avoid boost
728     //  assert( sizeof( KdTreeCudaPrivate::GpuNode)==sizeof( Node ) );
729     delete gpu_helper_;
730     gpu_helper_ = new GpuHelper;
731     gpu_helper_->gpu_points_=new thrust::device_vector<float4>(size_);
732     thrust::device_vector<float4> tmp(size_);
733     if( get_param(index_params_,"input_is_gpu_float4",false) ) {
734 		assert( dataset_.cols == 3 && dataset_.stride==4*sizeof(float));
735         thrust::copy( thrust::device_pointer_cast((float4*)dataset_.ptr()),thrust::device_pointer_cast((float4*)(dataset_.ptr()))+size_,tmp.begin());
736 
737     }
738     else {
739         // k is limited to 4 -> use 128bit-alignment regardless of dimensionality
740         // makes cpu search about 5% slower, but gpu can read a float4 w/ a single instruction
741         // (vs a float2 and a float load for a float3 value)
742         // pad data directly to avoid having to copy and re-format the data when
743         // copying it to the GPU
744         data_ = flann::Matrix<ElementType>(new ElementType[size_*4], size_, dim_,4*4);
745         for (size_t i=0; i<size_; ++i) {
746             for (size_t j=0; j<dim_; ++j) {
747                 data_[i][j] = dataset_[i][j];
748             }
749             for (size_t j=dim_; j<4; ++j) {
750                 data_[i][j] = 0;
751             }
752         }
753         thrust::copy((float4*)data_.ptr(),(float4*)(data_.ptr())+size_,tmp.begin());
754     }
755 
756     CudaKdTreeBuilder builder( tmp, leaf_max_size_ );
757     builder.buildTree();
758 
759     gpu_helper_->gpu_splits_ = builder.splits_;
760     gpu_helper_->gpu_aabb_min_ = builder.aabb_min_;
761     gpu_helper_->gpu_aabb_max_ = builder.aabb_max_;
762     gpu_helper_->gpu_child1_ = builder.child1_;
763     gpu_helper_->gpu_parent_=builder.parent_;
764     gpu_helper_->gpu_vind_=builder.index_x_;
765     thrust::gather( builder.index_x_->begin(), builder.index_x_->end(), tmp.begin(), gpu_helper_->gpu_points_->begin());
766 
767     //  gpu_helper_->gpu_nodes_=new thrust::device_vector<KdTreeCudaPrivate::GpuNode>(node_count_);
768 
769 
770     //  gpu_helper_->gpu_vind_=new thrust::device_vector<int>(size_);
771     //  thrust::copy( (KdTreeCudaPrivate::GpuNode*)&(tree_[0]), ((KdTreeCudaPrivate::GpuNode*)&(tree_[0]))+tree_.size(),  gpu_helper_->gpu_nodes_->begin());
772 
773     //  thrust::copy(vind_.begin(),vind_.end(),gpu_helper_->gpu_vind_->begin());
774 
775     //  buildGpuTree();
776 }
777 
778 
779 template<typename Distance>
clearGpuBuffers()780 void KDTreeCuda3dIndex<Distance>::clearGpuBuffers()
781 {
782     delete gpu_helper_;
783     gpu_helper_=0;
784 }
785 
786 // explicit instantiations for distance-independent functions
787 template
788 void KDTreeCuda3dIndex<flann::L2<float> >::uploadTreeToGpu();
789 
790 template
791 void KDTreeCuda3dIndex<flann::L2<float> >::clearGpuBuffers();
792 
793 template
794 struct KDTreeCuda3dIndex<flann::L2<float> >::GpuHelper;
795 
796 template
797 void KDTreeCuda3dIndex<flann::L2<float> >::knnSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, size_t knn, const SearchParams& params) const;
798 
799 template
800 int KDTreeCuda3dIndex< flann::L2<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, float radius, const SearchParams& params) const;
801 template
802 int KDTreeCuda3dIndex< flann::L2<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, std::vector< std::vector<int> >& indices,
803                                                            std::vector<std::vector<DistanceType> >& dists, float radius, const SearchParams& params) const;
804 
805 // explicit instantiations for distance-independent functions
806 template
807 void KDTreeCuda3dIndex<flann::L2_Simple<float> >::uploadTreeToGpu();
808 
809 template
810 void KDTreeCuda3dIndex<flann::L2_Simple<float> >::clearGpuBuffers();
811 
812 template
813 struct KDTreeCuda3dIndex<flann::L2_Simple<float> >::GpuHelper;
814 
815 template
816 void KDTreeCuda3dIndex<flann::L2_Simple<float> >::knnSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, size_t knn, const SearchParams& params) const;
817 
818 template
819 int KDTreeCuda3dIndex< flann::L2_Simple<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, float radius, const SearchParams& params) const;
820 template
821 int KDTreeCuda3dIndex< flann::L2_Simple<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, std::vector< std::vector<int> >& indices,
822                                                                   std::vector<std::vector<DistanceType> >& dists, float radius, const SearchParams& params) const;
823 
824 
825 // explicit instantiations for distance-independent functions
826 template
827 void KDTreeCuda3dIndex<flann::L1<float> >::uploadTreeToGpu();
828 
829 template
830 void KDTreeCuda3dIndex<flann::L1<float> >::clearGpuBuffers();
831 
832 template
833 struct KDTreeCuda3dIndex<flann::L1<float> >::GpuHelper;
834 
835 template
836 void KDTreeCuda3dIndex<flann::L1<float> >::knnSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, size_t knn, const SearchParams& params) const;
837 
838 template
839 int KDTreeCuda3dIndex< flann::L1<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, float radius, const SearchParams& params) const;
840 template
841 int KDTreeCuda3dIndex< flann::L1<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, std::vector< std::vector<int> >& indices,
842                                                            std::vector<std::vector<DistanceType> >& dists, float radius, const SearchParams& params) const;
843 }
844