1 /******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
29 /******************************************************************************
30 * Test of DeviceSelect::If and DevicePartition::If utilities
31 ******************************************************************************/
32
33 // Ensure printing of CUDA runtime errors to console
34 #define CUB_STDERR
35
36 #include <stdio.h>
37 #include <typeinfo>
38
39 #include <thrust/device_ptr.h>
40 #include <thrust/copy.h>
41 #include <thrust/partition.h>
42 #include <thrust/iterator/reverse_iterator.h>
43
44 #include <cub/util_allocator.cuh>
45 #include <cub/device/device_select.cuh>
46 #include <cub/device/device_partition.cuh>
47 #include <cub/iterator/counting_input_iterator.cuh>
48
49 #include "test_util.h"
50
51 using namespace cub;
52
53
54 //---------------------------------------------------------------------
55 // Globals, constants and typedefs
56 //---------------------------------------------------------------------
57
58 bool g_verbose = false;
59 int g_timing_iterations = 0;
60 int g_repeat = 0;
61 float g_device_giga_bandwidth;
62 CachingDeviceAllocator g_allocator(true);
63
64 // Dispatch types
65 enum Backend
66 {
67 CUB, // CUB method
68 THRUST, // Thrust method
69 CDP, // GPU-based (dynamic parallelism) dispatch to CUB method
70 };
71
72
73 // Selection functor type
74 template <typename T>
75 struct LessThan
76 {
77 T compare;
78
79 __host__ __device__ __forceinline__
LessThanLessThan80 LessThan(T compare) : compare(compare) {}
81
82 __host__ __device__ __forceinline__
operator ()LessThan83 bool operator()(const T &a) const {
84 return (a < compare);
85 }
86 };
87
88 //---------------------------------------------------------------------
89 // Dispatch to different CUB DeviceSelect entrypoints
90 //---------------------------------------------------------------------
91
92
93 /**
94 * Dispatch to select if entrypoint
95 */
96 template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
97 CUB_RUNTIME_FUNCTION __forceinline__
Dispatch(Int2Type<CUB>,Int2Type<false>,Int2Type<false>,int timing_timing_iterations,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,InputIteratorT d_in,FlagIteratorT,OutputIteratorT d_out,NumSelectedIteratorT d_num_selected_out,OffsetT num_items,SelectOpT select_op,cudaStream_t stream,bool debug_synchronous)98 cudaError_t Dispatch(
99 Int2Type<CUB> /*dispatch_to*/,
100 Int2Type<false> /*is_flagged*/,
101 Int2Type<false> /*is_partition*/,
102 int timing_timing_iterations,
103 size_t* /*d_temp_storage_bytes*/,
104 cudaError_t* /*d_cdp_error*/,
105
106 void* d_temp_storage,
107 size_t& temp_storage_bytes,
108 InputIteratorT d_in,
109 FlagIteratorT /*d_flags*/,
110 OutputIteratorT d_out,
111 NumSelectedIteratorT d_num_selected_out,
112 OffsetT num_items,
113 SelectOpT select_op,
114 cudaStream_t stream,
115 bool debug_synchronous)
116 {
117 cudaError_t error = cudaSuccess;
118 for (int i = 0; i < timing_timing_iterations; ++i)
119 {
120 error = DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream, debug_synchronous);
121 }
122 return error;
123 }
124
125
126 /**
127 * Dispatch to partition if entrypoint
128 */
129 template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
130 CUB_RUNTIME_FUNCTION __forceinline__
Dispatch(Int2Type<CUB>,Int2Type<false>,Int2Type<true>,int timing_timing_iterations,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,InputIteratorT d_in,FlagIteratorT,OutputIteratorT d_out,NumSelectedIteratorT d_num_selected_out,OffsetT num_items,SelectOpT select_op,cudaStream_t stream,bool debug_synchronous)131 cudaError_t Dispatch(
132 Int2Type<CUB> /*dispatch_to*/,
133 Int2Type<false> /*is_flagged*/,
134 Int2Type<true> /*is_partition*/,
135 int timing_timing_iterations,
136 size_t* /*d_temp_storage_bytes*/,
137 cudaError_t* /*d_cdp_error*/,
138
139 void* d_temp_storage,
140 size_t& temp_storage_bytes,
141 InputIteratorT d_in,
142 FlagIteratorT /*d_flags*/,
143 OutputIteratorT d_out,
144 NumSelectedIteratorT d_num_selected_out,
145 OffsetT num_items,
146 SelectOpT select_op,
147 cudaStream_t stream,
148 bool debug_synchronous)
149 {
150 cudaError_t error = cudaSuccess;
151 for (int i = 0; i < timing_timing_iterations; ++i)
152 {
153 error = DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream, debug_synchronous);
154 }
155 return error;
156 }
157
158
159 /**
160 * Dispatch to select flagged entrypoint
161 */
162 template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
163 CUB_RUNTIME_FUNCTION __forceinline__
Dispatch(Int2Type<CUB>,Int2Type<true>,Int2Type<false>,int timing_timing_iterations,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,InputIteratorT d_in,FlagIteratorT d_flags,OutputIteratorT d_out,NumSelectedIteratorT d_num_selected_out,OffsetT num_items,SelectOpT,cudaStream_t stream,bool debug_synchronous)164 cudaError_t Dispatch(
165 Int2Type<CUB> /*dispatch_to*/,
166 Int2Type<true> /*is_flagged*/,
167 Int2Type<false> /*partition*/,
168 int timing_timing_iterations,
169 size_t* /*d_temp_storage_bytes*/,
170 cudaError_t* /*d_cdp_error*/,
171
172 void* d_temp_storage,
173 size_t& temp_storage_bytes,
174 InputIteratorT d_in,
175 FlagIteratorT d_flags,
176 OutputIteratorT d_out,
177 NumSelectedIteratorT d_num_selected_out,
178 OffsetT num_items,
179 SelectOpT /*select_op*/,
180 cudaStream_t stream,
181 bool debug_synchronous)
182 {
183 cudaError_t error = cudaSuccess;
184 for (int i = 0; i < timing_timing_iterations; ++i)
185 {
186 error = DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream, debug_synchronous);
187 }
188 return error;
189 }
190
191
192 /**
193 * Dispatch to partition flagged entrypoint
194 */
195 template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
196 CUB_RUNTIME_FUNCTION __forceinline__
Dispatch(Int2Type<CUB>,Int2Type<true>,Int2Type<true>,int timing_timing_iterations,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,InputIteratorT d_in,FlagIteratorT d_flags,OutputIteratorT d_out,NumSelectedIteratorT d_num_selected_out,OffsetT num_items,SelectOpT,cudaStream_t stream,bool debug_synchronous)197 cudaError_t Dispatch(
198 Int2Type<CUB> /*dispatch_to*/,
199 Int2Type<true> /*is_flagged*/,
200 Int2Type<true> /*partition*/,
201 int timing_timing_iterations,
202 size_t* /*d_temp_storage_bytes*/,
203 cudaError_t* /*d_cdp_error*/,
204
205 void* d_temp_storage,
206 size_t& temp_storage_bytes,
207 InputIteratorT d_in,
208 FlagIteratorT d_flags,
209 OutputIteratorT d_out,
210 NumSelectedIteratorT d_num_selected_out,
211 OffsetT num_items,
212 SelectOpT /*select_op*/,
213 cudaStream_t stream,
214 bool debug_synchronous)
215 {
216 cudaError_t error = cudaSuccess;
217 for (int i = 0; i < timing_timing_iterations; ++i)
218 {
219 error = DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream, debug_synchronous);
220 }
221 return error;
222 }
223
224
225 //---------------------------------------------------------------------
226 // Dispatch to different Thrust entrypoints
227 //---------------------------------------------------------------------
228
229 /**
230 * Dispatch to select if entrypoint
231 */
232 template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
233 __host__ __forceinline__
Dispatch(Int2Type<THRUST>,Int2Type<false>,Int2Type<false>,int timing_timing_iterations,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,InputIteratorT d_in,FlagIteratorT,OutputIteratorT d_out,NumSelectedIteratorT d_num_selected_out,OffsetT num_items,SelectOpT select_op,cudaStream_t,bool)234 cudaError_t Dispatch(
235 Int2Type<THRUST> /*dispatch_to*/,
236 Int2Type<false> /*is_flagged*/,
237 Int2Type<false> /*is_partition*/,
238 int timing_timing_iterations,
239 size_t* /*d_temp_storage_bytes*/,
240 cudaError_t* /*d_cdp_error*/,
241
242 void* d_temp_storage,
243 size_t& temp_storage_bytes,
244 InputIteratorT d_in,
245 FlagIteratorT /*d_flags*/,
246 OutputIteratorT d_out,
247 NumSelectedIteratorT d_num_selected_out,
248 OffsetT num_items,
249 SelectOpT select_op,
250 cudaStream_t /*stream*/,
251 bool /*debug_synchronous*/)
252 {
253 // The input value type
254 typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
255
256 // The output value type
257 typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
258 typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
259 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
260
261 if (d_temp_storage == 0)
262 {
263 temp_storage_bytes = 1;
264 }
265 else
266 {
267 thrust::device_ptr<OutputT> d_out_wrapper_end;
268 thrust::device_ptr<InputT> d_in_wrapper(d_in);
269 thrust::device_ptr<OutputT> d_out_wrapper(d_out);
270
271 for (int i = 0; i < timing_timing_iterations; ++i)
272 {
273 d_out_wrapper_end = thrust::copy_if(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper, select_op);
274 }
275
276 OffsetT num_selected = OffsetT(d_out_wrapper_end - d_out_wrapper);
277 CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
278 }
279
280 return cudaSuccess;
281 }
282
283
284 /**
285 * Dispatch to partition if entrypoint
286 */
287 template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
288 __host__ __forceinline__
Dispatch(Int2Type<THRUST>,Int2Type<false>,Int2Type<true>,int timing_timing_iterations,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,InputIteratorT d_in,FlagIteratorT,OutputIteratorT d_out,NumSelectedIteratorT d_num_selected_out,OffsetT num_items,SelectOpT select_op,cudaStream_t,bool)289 cudaError_t Dispatch(
290 Int2Type<THRUST> /*dispatch_to*/,
291 Int2Type<false> /*is_flagged*/,
292 Int2Type<true> /*is_partition*/,
293 int timing_timing_iterations,
294 size_t* /*d_temp_storage_bytes*/,
295 cudaError_t* /*d_cdp_error*/,
296
297 void* d_temp_storage,
298 size_t& temp_storage_bytes,
299 InputIteratorT d_in,
300 FlagIteratorT /*d_flags*/,
301 OutputIteratorT d_out,
302 NumSelectedIteratorT d_num_selected_out,
303 OffsetT num_items,
304 SelectOpT select_op,
305 cudaStream_t /*stream*/,
306 bool /*debug_synchronous*/)
307 {
308 // The input value type
309 typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
310
311 // The output value type
312 typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
313 typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
314 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
315
316 typedef thrust::reverse_iterator<thrust::device_ptr<OutputT> > ReverseOutputIteratorT;
317
318 if (d_temp_storage == 0)
319 {
320 temp_storage_bytes = 1;
321 }
322 else
323 {
324 thrust::pair<thrust::device_ptr<OutputT>, ReverseOutputIteratorT> d_out_wrapper_end;
325
326 thrust::device_ptr<InputT> d_in_wrapper(d_in);
327 thrust::device_ptr<OutputT> d_out_wrapper(d_out);
328
329 ReverseOutputIteratorT d_out_unselected(d_out_wrapper + num_items);
330
331 for (int i = 0; i < timing_timing_iterations; ++i)
332 {
333 d_out_wrapper_end = thrust::partition_copy(
334 d_in_wrapper,
335 d_in_wrapper + num_items,
336 d_out_wrapper,
337 d_out_unselected,
338 select_op);
339 }
340
341 OffsetT num_selected = OffsetT(d_out_wrapper_end.first - d_out_wrapper);
342 CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
343 }
344
345 return cudaSuccess;
346 }
347
348
349 /**
350 * Dispatch to select flagged entrypoint
351 */
352 template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
353 __host__ __forceinline__
Dispatch(Int2Type<THRUST>,Int2Type<true>,Int2Type<false>,int timing_timing_iterations,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,InputIteratorT d_in,FlagIteratorT d_flags,OutputIteratorT d_out,NumSelectedIteratorT d_num_selected_out,OffsetT num_items,SelectOpT,cudaStream_t,bool)354 cudaError_t Dispatch(
355 Int2Type<THRUST> /*dispatch_to*/,
356 Int2Type<true> /*is_flagged*/,
357 Int2Type<false> /*is_partition*/,
358 int timing_timing_iterations,
359 size_t* /*d_temp_storage_bytes*/,
360 cudaError_t* /*d_cdp_error*/,
361
362 void* d_temp_storage,
363 size_t& temp_storage_bytes,
364 InputIteratorT d_in,
365 FlagIteratorT d_flags,
366 OutputIteratorT d_out,
367 NumSelectedIteratorT d_num_selected_out,
368 OffsetT num_items,
369 SelectOpT /*select_op*/,
370 cudaStream_t /*stream*/,
371 bool /*debug_synchronous*/)
372 {
373 // The flag type
374 typedef typename std::iterator_traits<FlagIteratorT>::value_type FlagT;
375
376 // The input value type
377 typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
378
379 // The output value type
380 typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
381 typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
382 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
383
384 if (d_temp_storage == 0)
385 {
386 temp_storage_bytes = 1;
387 }
388 else
389 {
390 thrust::device_ptr<OutputT> d_out_wrapper_end;
391 thrust::device_ptr<InputT> d_in_wrapper(d_in);
392 thrust::device_ptr<OutputT> d_out_wrapper(d_out);
393 thrust::device_ptr<FlagT> d_flags_wrapper(d_flags);
394
395 for (int i = 0; i < timing_timing_iterations; ++i)
396 {
397 d_out_wrapper_end = thrust::copy_if(d_in_wrapper, d_in_wrapper + num_items, d_flags_wrapper, d_out_wrapper, CastOp<bool>());
398 }
399
400 OffsetT num_selected = OffsetT(d_out_wrapper_end - d_out_wrapper);
401 CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
402 }
403
404 return cudaSuccess;
405 }
406
407
408 /**
409 * Dispatch to partition flagged entrypoint
410 */
411 template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
412 __host__ __forceinline__
Dispatch(Int2Type<THRUST>,Int2Type<true>,Int2Type<true>,int timing_timing_iterations,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,InputIteratorT d_in,FlagIteratorT d_flags,OutputIteratorT d_out,NumSelectedIteratorT d_num_selected_out,OffsetT num_items,SelectOpT,cudaStream_t,bool)413 cudaError_t Dispatch(
414 Int2Type<THRUST> /*dispatch_to*/,
415 Int2Type<true> /*is_flagged*/,
416 Int2Type<true> /*is_partition*/,
417 int timing_timing_iterations,
418 size_t* /*d_temp_storage_bytes*/,
419 cudaError_t* /*d_cdp_error*/,
420
421 void* d_temp_storage,
422 size_t& temp_storage_bytes,
423 InputIteratorT d_in,
424 FlagIteratorT d_flags,
425 OutputIteratorT d_out,
426 NumSelectedIteratorT d_num_selected_out,
427 OffsetT num_items,
428 SelectOpT /*select_op*/,
429 cudaStream_t /*stream*/,
430 bool /*debug_synchronous*/)
431 {
432 // The flag type
433 typedef typename std::iterator_traits<FlagIteratorT>::value_type FlagT;
434
435 // The input value type
436 typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
437
438 // The output value type
439 typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
440 typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
441 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
442
443 typedef thrust::reverse_iterator<thrust::device_ptr<OutputT> > ReverseOutputIteratorT;
444
445 if (d_temp_storage == 0)
446 {
447 temp_storage_bytes = 1;
448 }
449 else
450 {
451 thrust::pair<thrust::device_ptr<OutputT>, ReverseOutputIteratorT> d_out_wrapper_end;
452
453 thrust::device_ptr<InputT> d_in_wrapper(d_in);
454 thrust::device_ptr<OutputT> d_out_wrapper(d_out);
455 thrust::device_ptr<FlagT> d_flags_wrapper(d_flags);
456 ReverseOutputIteratorT d_out_unselected(d_out_wrapper + num_items);
457
458 for (int i = 0; i < timing_timing_iterations; ++i)
459 {
460 d_out_wrapper_end = thrust::partition_copy(
461 d_in_wrapper,
462 d_in_wrapper + num_items,
463 d_flags_wrapper,
464 d_out_wrapper,
465 d_out_unselected,
466 CastOp<bool>());
467 }
468
469 OffsetT num_selected = OffsetT(d_out_wrapper_end.first - d_out_wrapper);
470 CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
471 }
472
473 return cudaSuccess;
474 }
475
476
477 //---------------------------------------------------------------------
478 // CUDA Nested Parallelism Test Kernel
479 //---------------------------------------------------------------------
480
481 /**
482 * Simple wrapper kernel to invoke DeviceSelect
483 */
484 template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT, typename IsFlaggedTag, typename IsPartitionTag>
CnpDispatchKernel(IsFlaggedTag is_flagged,IsPartitionTag is_partition,int timing_timing_iterations,size_t * d_temp_storage_bytes,cudaError_t * d_cdp_error,void * d_temp_storage,size_t temp_storage_bytes,InputIteratorT d_in,FlagIteratorT d_flags,OutputIteratorT d_out,NumSelectedIteratorT d_num_selected_out,OffsetT num_items,SelectOpT select_op,bool debug_synchronous)485 __global__ void CnpDispatchKernel(
486 IsFlaggedTag is_flagged,
487 IsPartitionTag is_partition,
488 int timing_timing_iterations,
489 size_t* d_temp_storage_bytes,
490 cudaError_t* d_cdp_error,
491
492 void* d_temp_storage,
493 size_t temp_storage_bytes,
494 InputIteratorT d_in,
495 FlagIteratorT d_flags,
496 OutputIteratorT d_out,
497 NumSelectedIteratorT d_num_selected_out,
498 OffsetT num_items,
499 SelectOpT select_op,
500 bool debug_synchronous)
501 {
502
503 #ifndef CUB_CDP
504 (void)is_flagged;
505 (void)is_partition;
506 (void)timing_timing_iterations;
507 (void)d_temp_storage_bytes;
508 (void)d_temp_storage;
509 (void)temp_storage_bytes;
510 (void)d_in;
511 (void)d_flags;
512 (void)d_out;
513 (void)d_num_selected_out;
514 (void)num_items;
515 (void)select_op;
516 (void)debug_synchronous;
517 *d_cdp_error = cudaErrorNotSupported;
518 #else
519 *d_cdp_error = Dispatch(Int2Type<CUB>(), is_flagged, is_partition, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
520 d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, debug_synchronous);
521 *d_temp_storage_bytes = temp_storage_bytes;
522 #endif
523 }
524
525
526 /**
527 * Dispatch to CDP kernel
528 */
529 template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT, typename IsFlaggedTag, typename IsPartitionTag>
Dispatch(Int2Type<CDP> dispatch_to,IsFlaggedTag is_flagged,IsPartitionTag is_partition,int timing_timing_iterations,size_t * d_temp_storage_bytes,cudaError_t * d_cdp_error,void * d_temp_storage,size_t & temp_storage_bytes,InputIteratorT d_in,FlagIteratorT d_flags,OutputIteratorT d_out,NumSelectedIteratorT d_num_selected_out,OffsetT num_items,SelectOpT select_op,cudaStream_t stream,bool debug_synchronous)530 cudaError_t Dispatch(
531 Int2Type<CDP> dispatch_to,
532 IsFlaggedTag is_flagged,
533 IsPartitionTag is_partition,
534 int timing_timing_iterations,
535 size_t* d_temp_storage_bytes,
536 cudaError_t* d_cdp_error,
537
538 void* d_temp_storage,
539 size_t& temp_storage_bytes,
540 InputIteratorT d_in,
541 FlagIteratorT d_flags,
542 OutputIteratorT d_out,
543 NumSelectedIteratorT d_num_selected_out,
544 OffsetT num_items,
545 SelectOpT select_op,
546 cudaStream_t stream,
547 bool debug_synchronous)
548 {
549 // Invoke kernel to invoke device-side dispatch
550 CnpDispatchKernel<<<1,1>>>(is_flagged, is_partition, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
551 d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, debug_synchronous);
552
553 // Copy out temp_storage_bytes
554 CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
555
556 // Copy out error
557 cudaError_t retval;
558 CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
559 return retval;
560 }
561
562
563
564 //---------------------------------------------------------------------
565 // Test generation
566 //---------------------------------------------------------------------
567
568
569 /**
570 * Initialize problem
571 */
572 template <typename T>
Initialize(T * h_in,int num_items)573 void Initialize(
574 T* h_in,
575 int num_items)
576 {
577 for (int i = 0; i < num_items; ++i)
578 {
579 // Initialize each item to a randomly selected value from [0..126]
580 unsigned int value;
581 RandomBits(value, 0, 0, 7);
582 if (value == 127)
583 value = 126;
584 InitValue(INTEGER_SEED, h_in[i], value);
585 }
586
587 if (g_verbose)
588 {
589 printf("Input:\n");
590 DisplayResults(h_in, num_items);
591 printf("\n\n");
592 }
593 }
594
595
596 /**
597 * Solve selection problem (and set corresponding flags)
598 */
599 template <
600 typename InputIteratorT,
601 typename FlagIteratorT,
602 typename SelectOpT,
603 typename T>
Solve(InputIteratorT h_in,SelectOpT select_op,T * h_reference,FlagIteratorT h_flags,int num_items)604 int Solve(
605 InputIteratorT h_in,
606 SelectOpT select_op,
607 T* h_reference,
608 FlagIteratorT h_flags,
609 int num_items)
610 {
611 int num_selected = 0;
612 for (int i = 0; i < num_items; ++i)
613 {
614 if ((h_flags[i] = select_op(h_in[i])))
615 {
616 h_reference[num_selected] = h_in[i];
617 num_selected++;
618 }
619 else
620 {
621 h_reference[num_items - (i - num_selected) - 1] = h_in[i];
622 }
623 }
624
625 return num_selected;
626 }
627
628
629
630 /**
631 * Test DeviceSelect for a given problem input
632 */
633 template <
634 Backend BACKEND,
635 bool IS_FLAGGED,
636 bool IS_PARTITION,
637 typename DeviceInputIteratorT,
638 typename FlagT,
639 typename SelectOpT,
640 typename T>
Test(DeviceInputIteratorT d_in,FlagT * h_flags,SelectOpT select_op,T * h_reference,int num_selected,int num_items)641 void Test(
642 DeviceInputIteratorT d_in,
643 FlagT* h_flags,
644 SelectOpT select_op,
645 T* h_reference,
646 int num_selected,
647 int num_items)
648 {
649 // Allocate device flags, output, and num-selected
650 FlagT* d_flags = NULL;
651 T* d_out = NULL;
652 int* d_num_selected_out = NULL;
653 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(FlagT) * num_items));
654 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * num_items));
655 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
656
657 // Allocate CDP device arrays
658 size_t* d_temp_storage_bytes = NULL;
659 cudaError_t* d_cdp_error = NULL;
660 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
661 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));
662
663 // Allocate temporary storage
664 void *d_temp_storage = NULL;
665 size_t temp_storage_bytes = 0;
666 CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), 1, d_temp_storage_bytes, d_cdp_error,
667 d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, true));
668 CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
669
670 // Copy flags and clear device output array
671 CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(FlagT) * num_items, cudaMemcpyHostToDevice));
672 CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * num_items));
673 CubDebugExit(cudaMemset(d_num_selected_out, 0, sizeof(int)));
674
675 // Run warmup/correctness iteration
676 CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), 1, d_temp_storage_bytes, d_cdp_error,
677 d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, true));
678
679 // Check for correctness (and display results, if specified)
680 int compare1 = (IS_PARTITION) ?
681 CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose) :
682 CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
683 printf("\t Data %s\n", compare1 ? "FAIL" : "PASS");
684
685 int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
686 printf("\t Count %s\n", compare2 ? "FAIL" : "PASS");
687
688 // Flush any stdout/stderr
689 fflush(stdout);
690 fflush(stderr);
691
692 // Performance
693 GpuTimer gpu_timer;
694 gpu_timer.Start();
695 CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
696 d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, false));
697 gpu_timer.Stop();
698 float elapsed_millis = gpu_timer.ElapsedMillis();
699
700 // Display performance
701 if (g_timing_iterations > 0)
702 {
703 float avg_millis = elapsed_millis / g_timing_iterations;
704 float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
705 int num_output_items = (IS_PARTITION) ? num_items : num_selected;
706 int num_flag_items = (IS_FLAGGED) ? num_items : 0;
707 size_t num_bytes = sizeof(T) * (num_items + num_output_items) + sizeof(FlagT) * num_flag_items;
708 float giga_bandwidth = float(num_bytes) / avg_millis / 1000.0f / 1000.0f;
709
710 printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
711 }
712 printf("\n\n");
713
714 // Flush any stdout/stderr
715 fflush(stdout);
716 fflush(stderr);
717
718 // Cleanup
719 if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
720 if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
721 if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
722 if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
723 if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
724 if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
725
726 // Correctness asserts
727 AssertEquals(0, compare1 | compare2);
728 }
729
730
731 /**
732 * Test on pointer type
733 */
734 template <
735 Backend BACKEND,
736 bool IS_FLAGGED,
737 bool IS_PARTITION,
738 typename T>
TestPointer(int num_items,float select_ratio)739 void TestPointer(
740 int num_items,
741 float select_ratio)
742 {
743 typedef char FlagT;
744
745 // Allocate host arrays
746 T* h_in = new T[num_items];
747 FlagT* h_flags = new FlagT[num_items];
748 T* h_reference = new T[num_items];
749
750 // Initialize input
751 Initialize(h_in, num_items);
752
753 // Select a comparison value that is select_ratio through the space of [0,127]
754 T compare;
755 if (select_ratio <= 0.0)
756 InitValue(INTEGER_SEED, compare, 0); // select none
757 else if (select_ratio >= 1.0)
758 InitValue(INTEGER_SEED, compare, 127); // select all
759 else
760 InitValue(INTEGER_SEED, compare, int(double(double(127) * select_ratio)));
761
762 LessThan<T> select_op(compare);
763 int num_selected = Solve(h_in, select_op, h_reference, h_flags, num_items);
764
765 if (g_verbose) std::cout << "\nComparison item: " << compare << "\n";
766 printf("\nPointer %s cub::%s::%s %d items, %d selected (select ratio %.3f), %s %d-byte elements\n",
767 (IS_PARTITION) ? "DevicePartition" : "DeviceSelect",
768 (IS_FLAGGED) ? "Flagged" : "If",
769 (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
770 num_items, num_selected, float(num_selected) / num_items, typeid(T).name(), (int) sizeof(T));
771 fflush(stdout);
772
773 // Allocate problem device arrays
774 T *d_in = NULL;
775
776 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items));
777
778 // Initialize device input
779 CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));
780
781 // Run Test
782 Test<BACKEND, IS_FLAGGED, IS_PARTITION>(d_in, h_flags, select_op, h_reference, num_selected, num_items);
783
784 // Cleanup
785 if (h_in) delete[] h_in;
786 if (h_reference) delete[] h_reference;
787 if (h_flags) delete[] h_flags;
788 if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
789 }
790
791
792 /**
793 * Test on iterator type
794 */
795 template <
796 Backend BACKEND,
797 bool IS_FLAGGED,
798 bool IS_PARTITION,
799 typename T>
TestIterator(int num_items,float select_ratio)800 void TestIterator(
801 int num_items,
802 float select_ratio)
803 {
804 typedef char FlagT;
805
806 // Allocate host arrays
807 T* h_reference = new T[num_items];
808 FlagT* h_flags = new FlagT[num_items];
809
810 // Use counting iterator as the input
811 CountingInputIterator<T, int> h_in(0);
812
813 // Select a comparison value that is select_ratio through the space of [0,127]
814 T compare;
815 if (select_ratio <= 0.0)
816 InitValue(INTEGER_SEED, compare, 0); // select none
817 else if (select_ratio >= 1.0)
818 InitValue(INTEGER_SEED, compare, 127); // select all
819 else
820 InitValue(INTEGER_SEED, compare, int(double(double(127) * select_ratio)));
821
822 LessThan<T> select_op(compare);
823 int num_selected = Solve(h_in, select_op, h_reference, h_flags, num_items);
824
825 if (g_verbose) std::cout << "\nComparison item: " << compare << "\n";
826 printf("\nIterator %s cub::%s::%s %d items, %d selected (select ratio %.3f), %s %d-byte elements\n",
827 (IS_PARTITION) ? "DevicePartition" : "DeviceSelect",
828 (IS_FLAGGED) ? "Flagged" : "If",
829 (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
830 num_items, num_selected, float(num_selected) / num_items, typeid(T).name(), (int) sizeof(T));
831 fflush(stdout);
832
833 // Run Test
834 Test<BACKEND, IS_FLAGGED, IS_PARTITION>(h_in, h_flags, select_op, h_reference, num_selected, num_items);
835
836 // Cleanup
837 if (h_reference) delete[] h_reference;
838 if (h_flags) delete[] h_flags;
839 }
840
841
842 /**
843 * Test different selection ratios
844 */
845 template <
846 Backend BACKEND,
847 bool IS_FLAGGED,
848 bool IS_PARTITION,
849 typename T>
Test(int num_items)850 void Test(
851 int num_items)
852 {
853 for (float select_ratio = 0.0f; select_ratio <= 1.0f; select_ratio += 0.2f)
854 {
855 TestPointer<BACKEND, IS_FLAGGED, IS_PARTITION, T>(num_items, select_ratio);
856 }
857 }
858
859
860 /**
861 * Test (select vs. partition) and (flagged vs. functor)
862 */
863 template <
864 Backend BACKEND,
865 typename T>
TestMethod(int num_items)866 void TestMethod(
867 int num_items)
868 {
869 // Functor
870 Test<BACKEND, false, false, T>(num_items);
871 Test<BACKEND, false, true, T>(num_items);
872
873 // Flagged
874 Test<BACKEND, true, false, T>(num_items);
875 Test<BACKEND, true, true, T>(num_items);
876 }
877
878
879 /**
880 * Test different dispatch
881 */
882 template <
883 typename T>
TestOp(int num_items)884 void TestOp(
885 int num_items)
886 {
887 TestMethod<CUB, T>(num_items);
888 #ifdef CUB_CDP
889 TestMethod<CDP, T>(num_items);
890 #endif
891 }
892
893
894 /**
895 * Test different input sizes
896 */
897 template <typename T>
Test(int num_items)898 void Test(
899 int num_items)
900 {
901 if (num_items < 0)
902 {
903 TestOp<T>(0);
904 TestOp<T>(1);
905 TestOp<T>(100);
906 TestOp<T>(10000);
907 TestOp<T>(1000000);
908 }
909 else
910 {
911 TestOp<T>(num_items);
912 }
913 }
914
915 /**
916 * Test select/partition on pointer types
917 */
918 template <typename T>
ComparePointer(int num_items,float select_ratio)919 void ComparePointer(
920 int num_items,
921 float select_ratio)
922 {
923 printf("-- Select-if ----------------------------\n");
924 TestPointer<CUB, false, false, T>(num_items, select_ratio);
925 TestPointer<THRUST, false, false, T>(num_items, select_ratio);
926
927 printf("-- Partition-if ----------------------------\n");
928 TestPointer<CUB, false, true, T>(num_items, select_ratio);
929 TestPointer<THRUST, false, true, T>(num_items, select_ratio);
930
931 printf("-- Select-flagged ----------------------------\n");
932 TestPointer<CUB, true, false, T>(num_items, select_ratio);
933 TestPointer<THRUST, true, false, T>(num_items, select_ratio);
934
935 printf("-- Partition-flagged ----------------------------\n");
936 TestPointer<CUB, true, true, T>(num_items, select_ratio);
937 TestPointer<THRUST, true, true, T>(num_items, select_ratio);
938
939 }
940
941 //---------------------------------------------------------------------
942 // Main
943 //---------------------------------------------------------------------
944
945 /**
946 * Main
947 */
main(int argc,char ** argv)948 int main(int argc, char** argv)
949 {
950 int num_items = -1;
951 float select_ratio = 0.5;
952
953 // Initialize command line
954 CommandLineArgs args(argc, argv);
955 g_verbose = args.CheckCmdLineFlag("v");
956 args.GetCmdLineArgument("n", num_items);
957 args.GetCmdLineArgument("i", g_timing_iterations);
958 args.GetCmdLineArgument("repeat", g_repeat);
959 args.GetCmdLineArgument("ratio", select_ratio);
960
961 // Print usage
962 if (args.CheckCmdLineFlag("help"))
963 {
964 printf("%s "
965 "[--n=<input items> "
966 "[--i=<timing iterations> "
967 "[--device=<device-id>] "
968 "[--ratio=<selection ratio, default 0.5>] "
969 "[--repeat=<repetitions of entire test suite>] "
970 "[--v] "
971 "[--cdp] "
972 "\n", argv[0]);
973 exit(0);
974 }
975
976 // Initialize device
977 CubDebugExit(args.DeviceInit());
978 g_device_giga_bandwidth = args.device_giga_bandwidth;
979 printf("\n");
980
981 #ifdef QUICKER_TEST
982
983 // Compile/run basic CUB test
984 if (num_items < 0) num_items = 32000000;
985
986 printf("-- Select-if ----------------------------\n");
987 TestPointer<CUB, false, false, int>(num_items, select_ratio);
988
989 printf("-- Partition-if ----------------------------\n");
990 TestPointer<CUB, false, true, int>(num_items, select_ratio);
991
992 printf("-- Select-flagged ----------------------------\n");
993 TestPointer<CUB, true, false, int>(num_items, select_ratio);
994
995 printf("-- Partition-flagged ----------------------------\n");
996 TestPointer<CUB, true, true, int>(num_items, select_ratio);
997
998
999 #elif defined(QUICK_TEST)
1000
1001 // Get device ordinal
1002 int device_ordinal;
1003 CubDebugExit(cudaGetDevice(&device_ordinal));
1004
1005 // Get device SM version
1006 int sm_version;
1007 CubDebugExit(SmVersion(sm_version, device_ordinal));
1008
1009 // Compile/run quick tests
1010 if (num_items < 0) num_items = 32000000;
1011
1012 printf("-- Iterator ----------------------------\n");
1013 TestIterator<CUB, false, false, int>(num_items, select_ratio);
1014
1015 ComparePointer<char>( num_items * ((sm_version <= 130) ? 1 : 4), select_ratio);
1016 ComparePointer<short>( num_items * ((sm_version <= 130) ? 1 : 2), select_ratio);
1017 ComparePointer<int>( num_items, select_ratio);
1018 ComparePointer<long long>( num_items / 2, select_ratio);
1019 ComparePointer<TestFoo>( num_items / 4, select_ratio);
1020
1021 #else
1022
1023 // Compile/run thorough tests
1024 for (int i = 0; i <= g_repeat; ++i)
1025 {
1026 // Test different input types
1027 Test<unsigned char>(num_items);
1028 Test<unsigned short>(num_items);
1029 Test<unsigned int>(num_items);
1030 Test<unsigned long long>(num_items);
1031
1032 Test<uchar2>(num_items);
1033 Test<ushort2>(num_items);
1034 Test<uint2>(num_items);
1035 Test<ulonglong2>(num_items);
1036
1037 Test<uchar4>(num_items);
1038 Test<ushort4>(num_items);
1039 Test<uint4>(num_items);
1040 Test<ulonglong4>(num_items);
1041
1042 Test<TestFoo>(num_items);
1043 Test<TestBar>(num_items);
1044 }
1045
1046 #endif
1047
1048 return 0;
1049 }
1050
1051
1052
1053