1 /**********************************************************************
2 * Software License Agreement (BSD License)
3 *
4 * Copyright 2011 Andreas Muetzel (amuetzel@uni-koblenz.de). All rights reserved.
5 *
6 * THE BSD LICENSE
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 *************************************************************************/
29
30 #include "kdtree_cuda_3d_index.h"
31 #include <flann/algorithms/dist.h>
32 #include <flann/util/cuda/result_set.h>
33 // #define THRUST_DEBUG 1
34 #include <cuda.h>
35 #include <thrust/gather.h>
36 #include <thrust/copy.h>
37 #include <thrust/device_vector.h>
38 #include <vector_types.h>
39 #include <flann/util/cutil_math.h>
40 #include <thrust/host_vector.h>
41 #include <thrust/copy.h>
42 #include <flann/util/cuda/heap.h>
43 #include <thrust/scan.h>
44 #include <thrust/count.h>
45 #include <flann/algorithms/kdtree_cuda_builder.h>
46 #include <vector_types.h>
47 namespace flann
48 {
49
50 namespace KdTreeCudaPrivate
51 {
52 template< typename GPUResultSet, typename Distance >
53 __device__
searchNeighbors(const cuda::kd_tree_builder_detail::SplitInfo * splits,const int * child1,const int * parent,const float4 * aabbLow,const float4 * aabbHigh,const float4 * elements,const float4 & q,GPUResultSet & result,const Distance & distance=Distance ())54 void searchNeighbors(const cuda::kd_tree_builder_detail::SplitInfo* splits,
55 const int* child1,
56 const int* parent,
57 const float4* aabbLow,
58 const float4* aabbHigh, const float4* elements, const float4& q, GPUResultSet& result, const Distance& distance = Distance() )
59 {
60
61 bool backtrack=false;
62 int lastNode=-1;
63 int current=0;
64
65 cuda::kd_tree_builder_detail::SplitInfo split;
66 while(true) {
67 if( current==-1 ) break;
68 split = splits[current];
69
70 float diff1;
71 if( split.split_dim==0 ) diff1=q.x- split.split_val;
72 else if( split.split_dim==1 ) diff1=q.y- split.split_val;
73 else if( split.split_dim==2 ) diff1=q.z- split.split_val;
74
75 // children are next to each other: leftChild+1 == rightChild
76 int leftChild= child1[current];
77 int bestChild=leftChild;
78 int otherChild=leftChild;
79
80 if ((diff1)<0) {
81 otherChild++;
82 }
83 else {
84 bestChild++;
85 }
86
87 if( !backtrack ) {
88 /* If this is a leaf node, then do check and return. */
89 if (leftChild==-1) {
90 for (int i=split.left; i<split.right; ++i) {
91 float dist=distance.dist(elements[i],q);
92 result.insert(i,dist);
93 }
94 backtrack=true;
95 lastNode=current;
96 current=parent[current];
97 }
98 else { // go to closer child node
99 lastNode=current;
100 current=bestChild;
101 }
102 }
103 else { // continue moving back up the tree or visit far node?
104 // minimum possible distance between query point and a point inside the AABB
105 float mindistsq=0;
106 float4 aabbMin=aabbLow[otherChild];
107 float4 aabbMax=aabbHigh[otherChild];
108
109 if( q.x < aabbMin.x ) mindistsq+=distance.axisDist(q.x, aabbMin.x);
110 else if( q.x > aabbMax.x ) mindistsq+=distance.axisDist(q.x, aabbMax.x);
111 if( q.y < aabbMin.y ) mindistsq+=distance.axisDist(q.y, aabbMin.y);
112 else if( q.y > aabbMax.y ) mindistsq+=distance.axisDist(q.y, aabbMax.y);
113 if( q.z < aabbMin.z ) mindistsq+=distance.axisDist(q.z, aabbMin.z);
114 else if( q.z > aabbMax.z ) mindistsq+=distance.axisDist(q.z, aabbMax.z);
115
116 // the far node was NOT the last node (== not visited yet) AND there could be a closer point in it
117 if(( lastNode==bestChild) && (mindistsq <= result.worstDist() ) ) {
118 lastNode=current;
119 current=otherChild;
120 backtrack=false;
121 }
122 else {
123 lastNode=current;
124 current=parent[current];
125 }
126 }
127
128 }
129 }
130
131
132 template< typename GPUResultSet, typename Distance >
133 __global__
nearestKernel(const cuda::kd_tree_builder_detail::SplitInfo * splits,const int * child1,const int * parent,const float4 * aabbMin,const float4 * aabbMax,const float4 * elements,const float * query,int stride,int resultStride,int * resultIndex,float * resultDist,int querysize,GPUResultSet result,Distance dist=Distance ())134 void nearestKernel(const cuda::kd_tree_builder_detail::SplitInfo* splits,
135 const int* child1,
136 const int* parent,
137 const float4* aabbMin,
138 const float4* aabbMax, const float4* elements, const float* query, int stride, int resultStride, int* resultIndex, float* resultDist, int querysize, GPUResultSet result, Distance dist = Distance())
139 {
140 typedef float DistanceType;
141 typedef float ElementType;
142 // typedef DistanceType float;
143 size_t tid = blockDim.x*blockIdx.x + threadIdx.x;
144
145 if( tid >= querysize ) return;
146
147 float4 q = make_float4(query[tid*stride],query[tid*stride+1],query[tid*stride+2],0);
148
149 result.setResultLocation( resultDist, resultIndex, tid, resultStride );
150
151 searchNeighbors(splits,child1,parent,aabbMin,aabbMax,elements, q, result, dist);
152
153 result.finish();
154 }
155
156 }
157
158 //! contains some pointers that use cuda data types and that cannot be easily
159 //! forward-declared.
160 //! basically it contains all GPU buffers
161 template<typename Distance>
162 struct KDTreeCuda3dIndex<Distance>::GpuHelper
163 {
164 thrust::device_vector< cuda::kd_tree_builder_detail::SplitInfo >* gpu_splits_;
165 thrust::device_vector< int >* gpu_parent_;
166 thrust::device_vector< int >* gpu_child1_;
167 thrust::device_vector< float4 >* gpu_aabb_min_;
168 thrust::device_vector< float4 >* gpu_aabb_max_;
169 thrust::device_vector<float4>* gpu_points_;
170 thrust::device_vector<int>* gpu_vind_;
GpuHelperflann::KDTreeCuda3dIndex::GpuHelper171 GpuHelper() : gpu_splits_(0), gpu_parent_(0), gpu_child1_(0), gpu_aabb_min_(0), gpu_aabb_max_(0), gpu_points_(0), gpu_vind_(0){
172 }
~GpuHelperflann::KDTreeCuda3dIndex::GpuHelper173 ~GpuHelper()
174 {
175 delete gpu_splits_;
176 gpu_splits_=0;
177 delete gpu_parent_;
178 gpu_parent_=0;
179 delete gpu_child1_;
180 gpu_child1_=0;
181 delete gpu_aabb_max_;
182 gpu_aabb_max_=0;
183 delete gpu_aabb_min_;
184 gpu_aabb_min_=0;
185 delete gpu_vind_;
186 gpu_vind_=0;
187
188 delete gpu_points_;
189 gpu_points_=0;
190 }
191 };
192
193 //! thrust transform functor
194 //! transforms indices in the internal data set back to the original indices
195 struct map_indices
196 {
197 const int* v_;
198
map_indicesflann::map_indices199 map_indices(const int* v) : v_(v) {
200 }
201
202 __host__ __device__
operator ()flann::map_indices203 float operator() (const int&i) const
204 {
205 if( i>= 0 ) return v_[i];
206 else return i;
207 }
208 };
209
210 //! implementation of L2 distance for the CUDA kernels
211 struct CudaL2
212 {
213
214 static float
215 __host__ __device__
axisDistflann::CudaL2216 axisDist( float a, float b )
217 {
218 return (a-b)*(a-b);
219 }
220
221 static float
222 __host__ __device__
distflann::CudaL2223 dist( float4 a, float4 b )
224 {
225 float4 diff = a-b;
226 return dot(diff,diff);
227 }
228 };
229
230 //! implementation of L1 distance for the CUDA kernels
231 //! NOT TESTED!
232 struct CudaL1
233 {
234
235 static float
236 __host__ __device__
axisDistflann::CudaL1237 axisDist( float a, float b )
238 {
239 return fabs(a-b);
240 }
241
242 static float
243 __host__ __device__
distflann::CudaL1244 dist( float4 a, float4 b )
245 {
246 return fabs(a.x-b.x)+fabs (a.y-b.y)+( a.z-b.z)+(a.w-b.w);
247 }
248 };
249
250 //! used to adapt CPU and GPU distance types.
251 //! specializations define the ::type as their corresponding GPU distance type
252 //! \see GpuDistance< L2<float> >, GpuDistance< L2_Simple<float> >
253 template< class Distance >
254 struct GpuDistance
255 {
256 };
257
258 template<>
259 struct GpuDistance< L2<float> >
260 {
261 typedef CudaL2 type;
262 };
263
264 template<>
265 struct GpuDistance< L2_Simple<float> >
266 {
267 typedef CudaL2 type;
268 };
269 template<>
270 struct GpuDistance< L1<float> >
271 {
272 typedef CudaL1 type;
273 };
274
275
276 template< typename Distance >
knnSearchGpu(const Matrix<ElementType> & queries,Matrix<int> & indices,Matrix<DistanceType> & dists,size_t knn,const SearchParams & params) const277 void KDTreeCuda3dIndex<Distance>::knnSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, size_t knn, const SearchParams& params) const
278 {
279 assert(indices.rows >= queries.rows);
280 assert(dists.rows >= queries.rows);
281 assert(int(indices.cols) >= knn);
282 assert( dists.cols == indices.cols && dists.stride==indices.stride );
283
284 int istride=queries.stride/sizeof(ElementType);
285 int ostride=indices.stride/4;
286
287 bool matrices_on_gpu = params.matrices_in_gpu_ram;
288
289 int threadsPerBlock = 128;
290 int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock;
291
292 float epsError = 1+params.eps;
293 bool sorted = params.sorted;
294 bool use_heap = params.use_heap;
295
296 typename GpuDistance<Distance>::type distance;
297 // std::cout<<" search: "<<std::endl;
298 // std::cout<<" rows: "<<indices.rows<<" "<<dists.rows<<" "<<queries.rows<<std::endl;
299 // std::cout<<" cols: "<<indices.cols<<" "<<dists.cols<<" "<<queries.cols<<std::endl;
300 // std::cout<<" stride: "<<indices.stride<<" "<<dists.stride<<" "<<queries.stride<<std::endl;
301 // std::cout<<" stride2:"<<istride<<" "<<ostride<<std::endl;
302 // std::cout<<" knn:"<<knn<<" matrices_on_gpu:"<<matrices_on_gpu<<std::endl;
303
304 if( !matrices_on_gpu ) {
305 thrust::device_vector<float> queriesDev(istride* queries.rows,0);
306 thrust::copy( queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() );
307 thrust::device_vector<float> distsDev(queries.rows* ostride);
308 thrust::device_vector<int> indicesDev(queries.rows* ostride);
309
310
311
312 if( knn==1 ) {
313 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
314 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
315 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
316 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
317 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
318 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
319 thrust::raw_pointer_cast(&queriesDev[0]),
320 istride,
321 ostride,
322 thrust::raw_pointer_cast(&indicesDev[0]),
323 thrust::raw_pointer_cast(&distsDev[0]),
324 queries.rows, flann::cuda::SingleResultSet<float>(epsError),distance);
325 // KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_nodes_)[0])),
326 // thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
327 // thrust::raw_pointer_cast(&queriesDev[0]),
328 // queries.stride,
329 // thrust::raw_pointer_cast(&indicesDev[0]),
330 // thrust::raw_pointer_cast(&distsDev[0]),
331 // queries.rows, epsError);
332 //
333 }
334 else {
335 if( use_heap ) {
336 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
337 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
338 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
339 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
340 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
341 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
342 thrust::raw_pointer_cast(&queriesDev[0]),
343 istride,
344 ostride,
345 thrust::raw_pointer_cast(&indicesDev[0]),
346 thrust::raw_pointer_cast(&distsDev[0]),
347 queries.rows, flann::cuda::KnnResultSet<float, true>(knn,sorted,epsError)
348 , distance);
349 }
350 else {
351 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
352 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
353 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
354 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
355 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
356 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
357 thrust::raw_pointer_cast(&queriesDev[0]),
358 istride,
359 ostride,
360 thrust::raw_pointer_cast(&indicesDev[0]),
361 thrust::raw_pointer_cast(&distsDev[0]),
362 queries.rows, flann::cuda::KnnResultSet<float, false>(knn,sorted,epsError),
363 distance
364 );
365 }
366 }
367 thrust::copy( distsDev.begin(), distsDev.end(), dists.ptr() );
368 thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) ));
369 thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() );
370 }
371 else {
372 thrust::device_ptr<float> qd = thrust::device_pointer_cast(queries.ptr());
373 thrust::device_ptr<float> dd = thrust::device_pointer_cast(dists.ptr());
374 thrust::device_ptr<int> id = thrust::device_pointer_cast(indices.ptr());
375
376
377
378 if( knn==1 ) {
379 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
380 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
381 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
382 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
383 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
384 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
385 qd.get(),
386 istride,
387 ostride,
388 id.get(),
389 dd.get(),
390 queries.rows, flann::cuda::SingleResultSet<float>(epsError),distance);
391 // KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_nodes_)[0])),
392 // thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
393 // thrust::raw_pointer_cast(&queriesDev[0]),
394 // queries.stride,
395 // thrust::raw_pointer_cast(&indicesDev[0]),
396 // thrust::raw_pointer_cast(&distsDev[0]),
397 // queries.rows, epsError);
398 //
399 }
400 else {
401 if( use_heap ) {
402 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
403 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
404 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
405 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
406 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
407 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
408 qd.get(),
409 istride,
410 ostride,
411 id.get(),
412 dd.get(),
413 queries.rows, flann::cuda::KnnResultSet<float, true>(knn,sorted,epsError)
414 , distance);
415 }
416 else {
417 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
418 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
419 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
420 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
421 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
422 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
423 qd.get(),
424 istride,
425 ostride,
426 id.get(),
427 dd.get(),
428 queries.rows, flann::cuda::KnnResultSet<float, false>(knn,sorted,epsError),
429 distance
430 );
431 }
432 }
433 thrust::transform(id, id+knn*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) ));
434 }
435 }
436
437
438 template< typename Distance>
radiusSearchGpu(const Matrix<ElementType> & queries,std::vector<std::vector<int>> & indices,std::vector<std::vector<DistanceType>> & dists,float radius,const SearchParams & params) const439 int KDTreeCuda3dIndex<Distance >::radiusSearchGpu(const Matrix<ElementType>& queries, std::vector< std::vector<int> >& indices,
440 std::vector<std::vector<DistanceType> >& dists, float radius, const SearchParams& params) const
441 {
442 // assert(indices.roasdfws >= queries.rows);
443 // assert(dists.rows >= queries.rows);
444
445 int max_neighbors = params.max_neighbors;
446 bool sorted = params.sorted;
447 bool use_heap = params.use_heap;
448 if (indices.size() < queries.rows ) indices.resize(queries.rows);
449 if (dists.size() < queries.rows ) dists.resize(queries.rows);
450
451 int istride=queries.stride/sizeof(ElementType);
452
453 thrust::device_vector<float> queriesDev(istride* queries.rows,0);
454 thrust::copy( queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() );
455 thrust::device_vector<int> countsDev(queries.rows);
456
457 typename GpuDistance<Distance>::type distance;
458
459 int threadsPerBlock = 128;
460 int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock;
461
462
463 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
464 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
465 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
466 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
467 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
468 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
469 thrust::raw_pointer_cast(&queriesDev[0]),
470 istride,
471 1,
472 thrust::raw_pointer_cast(&countsDev[0]),
473 0,
474 queries.rows, flann::cuda::CountingRadiusResultSet<float>(radius,max_neighbors),
475 distance
476 );
477
478 thrust::host_vector<int> counts_host=countsDev;
479
480 if( max_neighbors!=0 ) { // we'll need this later, but the exclusive_scan will change the array
481 for( size_t i=0; i<queries.rows; i++ ) {
482 int count = counts_host[i];
483 if( count > 0 ) {
484 indices[i].resize(count);
485 dists[i].resize(count);
486 }
487 else {
488 indices[i].clear();
489 dists[i].clear();
490 }
491
492 }
493 }
494
495 int neighbors_last_elem = countsDev.back();
496 thrust::exclusive_scan( countsDev.begin(), countsDev.end(), countsDev.begin() );
497
498 size_t total_neighbors=neighbors_last_elem+countsDev.back();
499 if( max_neighbors==0 ) return total_neighbors;
500
501 thrust::device_vector<int> indicesDev(total_neighbors,-1);
502 thrust::device_vector<float> distsDev(total_neighbors,std::numeric_limits<float>::infinity());
503
504 if( max_neighbors<0 ) {
505 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
506 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
507 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
508 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
509 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
510 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
511 thrust::raw_pointer_cast(&queriesDev[0]),
512 istride,
513 1,
514 thrust::raw_pointer_cast(&indicesDev[0]),
515 thrust::raw_pointer_cast(&distsDev[0]),
516 queries.rows, flann::cuda::RadiusResultSet<float>(radius,thrust::raw_pointer_cast(&countsDev[0]),sorted), distance);
517 }
518 else {
519 if( use_heap ) {
520 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
521 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
522 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
523 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
524 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
525 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
526 thrust::raw_pointer_cast(&queriesDev[0]),
527 istride,
528 1,
529 thrust::raw_pointer_cast(&indicesDev[0]),
530 thrust::raw_pointer_cast(&distsDev[0]),
531 queries.rows, flann::cuda::RadiusKnnResultSet<float, true>(radius,max_neighbors, thrust::raw_pointer_cast(&countsDev[0]),sorted), distance);
532 }
533 else {
534 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
535 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
536 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
537 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
538 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
539 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
540 thrust::raw_pointer_cast(&queriesDev[0]),
541 istride,
542 1,
543 thrust::raw_pointer_cast(&indicesDev[0]),
544 thrust::raw_pointer_cast(&distsDev[0]),
545 queries.rows, flann::cuda::RadiusKnnResultSet<float, false>(radius,max_neighbors, thrust::raw_pointer_cast(&countsDev[0]),sorted), distance);
546 }
547 }
548 thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) ));
549 thrust::host_vector<int> indices_temp = indicesDev;
550 thrust::host_vector<float> dists_temp = distsDev;
551
552 int buffer_index=0;
553 for( size_t i=0; i<queries.rows; i++ ) {
554 for( size_t j=0; j<counts_host[i]; j++ ) {
555 dists[i][j]=dists_temp[buffer_index];
556 indices[i][j]=indices_temp[buffer_index];
557 ++buffer_index;
558 }
559 }
560
561 return buffer_index;
562 }
563
564 //! used in the radius search to count the total number of neighbors
565 struct isNotMinusOne
566 {
567 __host__ __device__
operator ()flann::isNotMinusOne568 bool operator() ( int i ){
569 return i!=-1;
570 }
571 };
572
573 template< typename Distance>
radiusSearchGpu(const Matrix<ElementType> & queries,Matrix<int> & indices,Matrix<DistanceType> & dists,float radius,const SearchParams & params) const574 int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, float radius, const SearchParams& params) const
575 {
576 int max_neighbors = params.max_neighbors;
577 assert(indices.rows >= queries.rows);
578 assert(dists.rows >= queries.rows || max_neighbors==0 );
579 assert(indices.stride==dists.stride || max_neighbors==0 );
580 assert( indices.cols==indices.stride/sizeof(int) );
581 assert(dists.rows >= queries.rows || max_neighbors==0 );
582
583 bool sorted = params.sorted;
584 bool matrices_on_gpu = params.matrices_in_gpu_ram;
585 float epsError = 1+params.eps;
586 bool use_heap = params.use_heap;
587 int istride=queries.stride/sizeof(ElementType);
588 int ostride= indices.stride/4;
589
590
591 if( max_neighbors<0 ) max_neighbors=indices.cols;
592
593 if( !matrices_on_gpu ) {
594 thrust::device_vector<float> queriesDev(istride* queries.rows,0);
595 thrust::copy( queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() );
596 typename GpuDistance<Distance>::type distance;
597 int threadsPerBlock = 128;
598 int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock;
599 if( max_neighbors== 0 ) {
600 thrust::device_vector<int> indicesDev(queries.rows* ostride);
601 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
602 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
603 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
604 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
605 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
606 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
607 thrust::raw_pointer_cast(&queriesDev[0]),
608 istride,
609 ostride,
610 thrust::raw_pointer_cast(&indicesDev[0]),
611 0,
612 queries.rows, flann::cuda::CountingRadiusResultSet<float>(radius,-1),
613 distance
614 );
615 thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() );
616 return thrust::reduce(indicesDev.begin(), indicesDev.end() );
617 }
618
619
620 thrust::device_vector<float> distsDev(queries.rows* max_neighbors);
621 thrust::device_vector<int> indicesDev(queries.rows* max_neighbors);
622
623 if( use_heap ) {
624 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
625 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
626 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
627 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
628 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
629 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
630 thrust::raw_pointer_cast(&queriesDev[0]),
631 istride,
632 ostride,
633 thrust::raw_pointer_cast(&indicesDev[0]),
634 thrust::raw_pointer_cast(&distsDev[0]),
635 queries.rows, flann::cuda::KnnRadiusResultSet<float, true>(max_neighbors,sorted,epsError, radius), distance);
636 }
637 else {
638 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
639 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
640 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
641 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
642 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
643 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
644 thrust::raw_pointer_cast(&queriesDev[0]),
645 istride,
646 ostride,
647 thrust::raw_pointer_cast(&indicesDev[0]),
648 thrust::raw_pointer_cast(&distsDev[0]),
649 queries.rows, flann::cuda::KnnRadiusResultSet<float, false>(max_neighbors,sorted,epsError, radius), distance);
650 }
651
652 thrust::copy( distsDev.begin(), distsDev.end(), dists.ptr() );
653 thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) ));
654 thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() );
655
656 return thrust::count_if(indicesDev.begin(), indicesDev.end(), isNotMinusOne() );
657 }
658 else {
659
660 thrust::device_ptr<float> qd=thrust::device_pointer_cast(queries.ptr());
661 thrust::device_ptr<float> dd=thrust::device_pointer_cast(dists.ptr());
662 thrust::device_ptr<int> id=thrust::device_pointer_cast(indices.ptr());
663 typename GpuDistance<Distance>::type distance;
664 int threadsPerBlock = 128;
665 int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock;
666
667 if( max_neighbors== 0 ) {
668 thrust::device_vector<int> indicesDev(queries.rows* indices.stride);
669 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
670 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
671 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
672 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
673 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
674 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
675 qd.get(),
676 istride,
677 ostride,
678 id.get(),
679 0,
680 queries.rows, flann::cuda::CountingRadiusResultSet<float>(radius,-1),
681 distance
682 );
683 thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() );
684 return thrust::reduce(indicesDev.begin(), indicesDev.end() );
685 }
686
687 if( use_heap ) {
688 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
689 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
690 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
691 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
692 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
693 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
694 qd.get(),
695 istride,
696 ostride,
697 id.get(),
698 dd.get(),
699 queries.rows, flann::cuda::KnnRadiusResultSet<float, true>(max_neighbors,sorted,epsError, radius), distance);
700 }
701 else {
702 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
703 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
704 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
705 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
706 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
707 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
708 qd.get(),
709 istride,
710 ostride,
711 id.get(),
712 dd.get(),
713 queries.rows, flann::cuda::KnnRadiusResultSet<float, false>(max_neighbors,sorted,epsError, radius), distance);
714 }
715
716 thrust::transform(id, id+max_neighbors*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) ));
717
718 return thrust::count_if(id, id+max_neighbors*queries.rows, isNotMinusOne() );
719 }
720 }
721
722 template<typename Distance>
uploadTreeToGpu()723 void KDTreeCuda3dIndex<Distance>::uploadTreeToGpu()
724 {
725 // just make sure that no weird alignment stuff is going on...
726 // shouldn't, but who knows
727 // (I would make this a (boost) static assertion, but so far flann seems to avoid boost
728 // assert( sizeof( KdTreeCudaPrivate::GpuNode)==sizeof( Node ) );
729 delete gpu_helper_;
730 gpu_helper_ = new GpuHelper;
731 gpu_helper_->gpu_points_=new thrust::device_vector<float4>(size_);
732 thrust::device_vector<float4> tmp(size_);
733 if( get_param(index_params_,"input_is_gpu_float4",false) ) {
734 assert( dataset_.cols == 3 && dataset_.stride==4*sizeof(float));
735 thrust::copy( thrust::device_pointer_cast((float4*)dataset_.ptr()),thrust::device_pointer_cast((float4*)(dataset_.ptr()))+size_,tmp.begin());
736
737 }
738 else {
739 // k is limited to 4 -> use 128bit-alignment regardless of dimensionality
740 // makes cpu search about 5% slower, but gpu can read a float4 w/ a single instruction
741 // (vs a float2 and a float load for a float3 value)
742 // pad data directly to avoid having to copy and re-format the data when
743 // copying it to the GPU
744 data_ = flann::Matrix<ElementType>(new ElementType[size_*4], size_, dim_,4*4);
745 for (size_t i=0; i<size_; ++i) {
746 for (size_t j=0; j<dim_; ++j) {
747 data_[i][j] = dataset_[i][j];
748 }
749 for (size_t j=dim_; j<4; ++j) {
750 data_[i][j] = 0;
751 }
752 }
753 thrust::copy((float4*)data_.ptr(),(float4*)(data_.ptr())+size_,tmp.begin());
754 }
755
756 CudaKdTreeBuilder builder( tmp, leaf_max_size_ );
757 builder.buildTree();
758
759 gpu_helper_->gpu_splits_ = builder.splits_;
760 gpu_helper_->gpu_aabb_min_ = builder.aabb_min_;
761 gpu_helper_->gpu_aabb_max_ = builder.aabb_max_;
762 gpu_helper_->gpu_child1_ = builder.child1_;
763 gpu_helper_->gpu_parent_=builder.parent_;
764 gpu_helper_->gpu_vind_=builder.index_x_;
765 thrust::gather( builder.index_x_->begin(), builder.index_x_->end(), tmp.begin(), gpu_helper_->gpu_points_->begin());
766
767 // gpu_helper_->gpu_nodes_=new thrust::device_vector<KdTreeCudaPrivate::GpuNode>(node_count_);
768
769
770 // gpu_helper_->gpu_vind_=new thrust::device_vector<int>(size_);
771 // thrust::copy( (KdTreeCudaPrivate::GpuNode*)&(tree_[0]), ((KdTreeCudaPrivate::GpuNode*)&(tree_[0]))+tree_.size(), gpu_helper_->gpu_nodes_->begin());
772
773 // thrust::copy(vind_.begin(),vind_.end(),gpu_helper_->gpu_vind_->begin());
774
775 // buildGpuTree();
776 }
777
778
779 template<typename Distance>
clearGpuBuffers()780 void KDTreeCuda3dIndex<Distance>::clearGpuBuffers()
781 {
782 delete gpu_helper_;
783 gpu_helper_=0;
784 }
785
786 // explicit instantiations for distance-independent functions
787 template
788 void KDTreeCuda3dIndex<flann::L2<float> >::uploadTreeToGpu();
789
790 template
791 void KDTreeCuda3dIndex<flann::L2<float> >::clearGpuBuffers();
792
793 template
794 struct KDTreeCuda3dIndex<flann::L2<float> >::GpuHelper;
795
796 template
797 void KDTreeCuda3dIndex<flann::L2<float> >::knnSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, size_t knn, const SearchParams& params) const;
798
799 template
800 int KDTreeCuda3dIndex< flann::L2<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, float radius, const SearchParams& params) const;
801 template
802 int KDTreeCuda3dIndex< flann::L2<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, std::vector< std::vector<int> >& indices,
803 std::vector<std::vector<DistanceType> >& dists, float radius, const SearchParams& params) const;
804
805 // explicit instantiations for distance-independent functions
806 template
807 void KDTreeCuda3dIndex<flann::L2_Simple<float> >::uploadTreeToGpu();
808
809 template
810 void KDTreeCuda3dIndex<flann::L2_Simple<float> >::clearGpuBuffers();
811
812 template
813 struct KDTreeCuda3dIndex<flann::L2_Simple<float> >::GpuHelper;
814
815 template
816 void KDTreeCuda3dIndex<flann::L2_Simple<float> >::knnSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, size_t knn, const SearchParams& params) const;
817
818 template
819 int KDTreeCuda3dIndex< flann::L2_Simple<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, float radius, const SearchParams& params) const;
820 template
821 int KDTreeCuda3dIndex< flann::L2_Simple<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, std::vector< std::vector<int> >& indices,
822 std::vector<std::vector<DistanceType> >& dists, float radius, const SearchParams& params) const;
823
824
825 // explicit instantiations for distance-independent functions
826 template
827 void KDTreeCuda3dIndex<flann::L1<float> >::uploadTreeToGpu();
828
829 template
830 void KDTreeCuda3dIndex<flann::L1<float> >::clearGpuBuffers();
831
832 template
833 struct KDTreeCuda3dIndex<flann::L1<float> >::GpuHelper;
834
835 template
836 void KDTreeCuda3dIndex<flann::L1<float> >::knnSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, size_t knn, const SearchParams& params) const;
837
838 template
839 int KDTreeCuda3dIndex< flann::L1<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, float radius, const SearchParams& params) const;
840 template
841 int KDTreeCuda3dIndex< flann::L1<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, std::vector< std::vector<int> >& indices,
842 std::vector<std::vector<DistanceType> >& dists, float radius, const SearchParams& params) const;
843 }
844