1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of the NVIDIA CORPORATION nor the
13  *       names of its contributors may be used to endorse or promote products
14  *       derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
29 /******************************************************************************
30  * Test of DeviceSelect::If and DevicePartition::If utilities
31  ******************************************************************************/
32 
33 // Ensure printing of CUDA runtime errors to console
34 #define CUB_STDERR
35 
36 #include <stdio.h>
37 #include <typeinfo>
38 
39 #include <thrust/device_ptr.h>
40 #include <thrust/copy.h>
41 #include <thrust/partition.h>
42 #include <thrust/iterator/reverse_iterator.h>
43 
44 #include <cub/util_allocator.cuh>
45 #include <cub/device/device_select.cuh>
46 #include <cub/device/device_partition.cuh>
47 #include <cub/iterator/counting_input_iterator.cuh>
48 
49 #include "test_util.h"
50 
51 using namespace cub;
52 
53 
54 //---------------------------------------------------------------------
55 // Globals, constants and typedefs
56 //---------------------------------------------------------------------
57 
58 bool                    g_verbose               = false;
59 int                     g_timing_iterations     = 0;
60 int                     g_repeat                = 0;
61 float                   g_device_giga_bandwidth;
62 CachingDeviceAllocator  g_allocator(true);
63 
64 // Dispatch types
65 enum Backend
66 {
67     CUB,        // CUB method
68     THRUST,     // Thrust method
69     CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
70 };
71 
72 
73 // Selection functor type
74 template <typename T>
75 struct LessThan
76 {
77     T compare;
78 
79     __host__ __device__ __forceinline__
LessThanLessThan80     LessThan(T compare) : compare(compare) {}
81 
82     __host__ __device__ __forceinline__
operator ()LessThan83     bool operator()(const T &a) const {
84         return (a < compare);
85     }
86 };
87 
88 //---------------------------------------------------------------------
89 // Dispatch to different CUB DeviceSelect entrypoints
90 //---------------------------------------------------------------------
91 
92 
93 /**
94  * Dispatch to select if entrypoint
95  */
96 template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
97 CUB_RUNTIME_FUNCTION __forceinline__
Dispatch(Int2Type<CUB>,Int2Type<false>,Int2Type<false>,int timing_timing_iterations,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,InputIteratorT d_in,FlagIteratorT,OutputIteratorT d_out,NumSelectedIteratorT d_num_selected_out,OffsetT num_items,SelectOpT select_op,cudaStream_t stream,bool debug_synchronous)98 cudaError_t Dispatch(
99     Int2Type<CUB>               /*dispatch_to*/,
100     Int2Type<false>             /*is_flagged*/,
101     Int2Type<false>             /*is_partition*/,
102     int                         timing_timing_iterations,
103     size_t*                     /*d_temp_storage_bytes*/,
104     cudaError_t*                /*d_cdp_error*/,
105 
106     void*                       d_temp_storage,
107     size_t&                     temp_storage_bytes,
108     InputIteratorT              d_in,
109     FlagIteratorT               /*d_flags*/,
110     OutputIteratorT             d_out,
111     NumSelectedIteratorT        d_num_selected_out,
112     OffsetT                     num_items,
113     SelectOpT                   select_op,
114     cudaStream_t                stream,
115     bool                        debug_synchronous)
116 {
117     cudaError_t error = cudaSuccess;
118     for (int i = 0; i < timing_timing_iterations; ++i)
119     {
120         error = DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream, debug_synchronous);
121     }
122     return error;
123 }
124 
125 
126 /**
127  * Dispatch to partition if entrypoint
128  */
129 template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
130 CUB_RUNTIME_FUNCTION __forceinline__
Dispatch(Int2Type<CUB>,Int2Type<false>,Int2Type<true>,int timing_timing_iterations,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,InputIteratorT d_in,FlagIteratorT,OutputIteratorT d_out,NumSelectedIteratorT d_num_selected_out,OffsetT num_items,SelectOpT select_op,cudaStream_t stream,bool debug_synchronous)131 cudaError_t Dispatch(
132     Int2Type<CUB>               /*dispatch_to*/,
133     Int2Type<false>             /*is_flagged*/,
134     Int2Type<true>              /*is_partition*/,
135     int                         timing_timing_iterations,
136     size_t*                     /*d_temp_storage_bytes*/,
137     cudaError_t*                /*d_cdp_error*/,
138 
139     void*                       d_temp_storage,
140     size_t&                     temp_storage_bytes,
141     InputIteratorT              d_in,
142     FlagIteratorT               /*d_flags*/,
143     OutputIteratorT             d_out,
144     NumSelectedIteratorT        d_num_selected_out,
145     OffsetT                     num_items,
146     SelectOpT                   select_op,
147     cudaStream_t                stream,
148     bool                        debug_synchronous)
149 {
150     cudaError_t error = cudaSuccess;
151     for (int i = 0; i < timing_timing_iterations; ++i)
152     {
153         error = DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream, debug_synchronous);
154     }
155     return error;
156 }
157 
158 
159 /**
160  * Dispatch to select flagged entrypoint
161  */
162 template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
163 CUB_RUNTIME_FUNCTION __forceinline__
Dispatch(Int2Type<CUB>,Int2Type<true>,Int2Type<false>,int timing_timing_iterations,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,InputIteratorT d_in,FlagIteratorT d_flags,OutputIteratorT d_out,NumSelectedIteratorT d_num_selected_out,OffsetT num_items,SelectOpT,cudaStream_t stream,bool debug_synchronous)164 cudaError_t Dispatch(
165     Int2Type<CUB>               /*dispatch_to*/,
166     Int2Type<true>              /*is_flagged*/,
167     Int2Type<false>             /*partition*/,
168     int                         timing_timing_iterations,
169     size_t*                     /*d_temp_storage_bytes*/,
170     cudaError_t*                /*d_cdp_error*/,
171 
172     void*                       d_temp_storage,
173     size_t&                     temp_storage_bytes,
174     InputIteratorT              d_in,
175     FlagIteratorT               d_flags,
176     OutputIteratorT             d_out,
177     NumSelectedIteratorT        d_num_selected_out,
178     OffsetT                     num_items,
179     SelectOpT                   /*select_op*/,
180     cudaStream_t                stream,
181     bool                        debug_synchronous)
182 {
183     cudaError_t error = cudaSuccess;
184     for (int i = 0; i < timing_timing_iterations; ++i)
185     {
186         error = DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream, debug_synchronous);
187     }
188     return error;
189 }
190 
191 
192 /**
193  * Dispatch to partition flagged entrypoint
194  */
195 template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
196 CUB_RUNTIME_FUNCTION __forceinline__
Dispatch(Int2Type<CUB>,Int2Type<true>,Int2Type<true>,int timing_timing_iterations,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,InputIteratorT d_in,FlagIteratorT d_flags,OutputIteratorT d_out,NumSelectedIteratorT d_num_selected_out,OffsetT num_items,SelectOpT,cudaStream_t stream,bool debug_synchronous)197 cudaError_t Dispatch(
198     Int2Type<CUB>               /*dispatch_to*/,
199     Int2Type<true>              /*is_flagged*/,
200     Int2Type<true>              /*partition*/,
201     int                         timing_timing_iterations,
202     size_t*                     /*d_temp_storage_bytes*/,
203     cudaError_t*                /*d_cdp_error*/,
204 
205     void*                       d_temp_storage,
206     size_t&                     temp_storage_bytes,
207     InputIteratorT              d_in,
208     FlagIteratorT               d_flags,
209     OutputIteratorT             d_out,
210     NumSelectedIteratorT        d_num_selected_out,
211     OffsetT                     num_items,
212     SelectOpT                   /*select_op*/,
213     cudaStream_t                stream,
214     bool                        debug_synchronous)
215 {
216     cudaError_t error = cudaSuccess;
217     for (int i = 0; i < timing_timing_iterations; ++i)
218     {
219         error = DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream, debug_synchronous);
220     }
221     return error;
222 }
223 
224 
225 //---------------------------------------------------------------------
226 // Dispatch to different Thrust entrypoints
227 //---------------------------------------------------------------------
228 
229 /**
230  * Dispatch to select if entrypoint
231  */
232 template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
233 __host__ __forceinline__
Dispatch(Int2Type<THRUST>,Int2Type<false>,Int2Type<false>,int timing_timing_iterations,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,InputIteratorT d_in,FlagIteratorT,OutputIteratorT d_out,NumSelectedIteratorT d_num_selected_out,OffsetT num_items,SelectOpT select_op,cudaStream_t,bool)234 cudaError_t Dispatch(
235     Int2Type<THRUST>            /*dispatch_to*/,
236     Int2Type<false>             /*is_flagged*/,
237     Int2Type<false>             /*is_partition*/,
238     int                         timing_timing_iterations,
239     size_t*                     /*d_temp_storage_bytes*/,
240     cudaError_t*                /*d_cdp_error*/,
241 
242     void*                       d_temp_storage,
243     size_t&                     temp_storage_bytes,
244     InputIteratorT              d_in,
245     FlagIteratorT               /*d_flags*/,
246     OutputIteratorT             d_out,
247     NumSelectedIteratorT        d_num_selected_out,
248     OffsetT                     num_items,
249     SelectOpT                   select_op,
250     cudaStream_t                /*stream*/,
251     bool                        /*debug_synchronous*/)
252 {
253     // The input value type
254     typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
255 
256     // The output value type
257     typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
258         typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
259         typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
260 
261     if (d_temp_storage == 0)
262     {
263         temp_storage_bytes = 1;
264     }
265     else
266     {
267         thrust::device_ptr<OutputT>         d_out_wrapper_end;
268         thrust::device_ptr<InputT>          d_in_wrapper(d_in);
269         thrust::device_ptr<OutputT>         d_out_wrapper(d_out);
270 
271         for (int i = 0; i < timing_timing_iterations; ++i)
272         {
273             d_out_wrapper_end = thrust::copy_if(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper, select_op);
274         }
275 
276         OffsetT num_selected = OffsetT(d_out_wrapper_end - d_out_wrapper);
277         CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
278     }
279 
280     return cudaSuccess;
281 }
282 
283 
284 /**
285  * Dispatch to partition if entrypoint
286  */
287 template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
288 __host__ __forceinline__
Dispatch(Int2Type<THRUST>,Int2Type<false>,Int2Type<true>,int timing_timing_iterations,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,InputIteratorT d_in,FlagIteratorT,OutputIteratorT d_out,NumSelectedIteratorT d_num_selected_out,OffsetT num_items,SelectOpT select_op,cudaStream_t,bool)289 cudaError_t Dispatch(
290     Int2Type<THRUST>            /*dispatch_to*/,
291     Int2Type<false>             /*is_flagged*/,
292     Int2Type<true>              /*is_partition*/,
293     int                         timing_timing_iterations,
294     size_t*                     /*d_temp_storage_bytes*/,
295     cudaError_t*                /*d_cdp_error*/,
296 
297     void*                       d_temp_storage,
298     size_t&                     temp_storage_bytes,
299     InputIteratorT              d_in,
300     FlagIteratorT               /*d_flags*/,
301     OutputIteratorT             d_out,
302     NumSelectedIteratorT        d_num_selected_out,
303     OffsetT                     num_items,
304     SelectOpT                   select_op,
305     cudaStream_t                /*stream*/,
306     bool                        /*debug_synchronous*/)
307 {
308     // The input value type
309     typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
310 
311     // The output value type
312     typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
313         typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
314         typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
315 
316     typedef thrust::reverse_iterator<thrust::device_ptr<OutputT> > ReverseOutputIteratorT;
317 
318     if (d_temp_storage == 0)
319     {
320         temp_storage_bytes = 1;
321     }
322     else
323     {
324         thrust::pair<thrust::device_ptr<OutputT>, ReverseOutputIteratorT> d_out_wrapper_end;
325 
326         thrust::device_ptr<InputT>       d_in_wrapper(d_in);
327         thrust::device_ptr<OutputT>       d_out_wrapper(d_out);
328 
329         ReverseOutputIteratorT d_out_unselected(d_out_wrapper + num_items);
330 
331         for (int i = 0; i < timing_timing_iterations; ++i)
332         {
333             d_out_wrapper_end = thrust::partition_copy(
334                 d_in_wrapper,
335                 d_in_wrapper + num_items,
336                 d_out_wrapper,
337                 d_out_unselected,
338                 select_op);
339         }
340 
341         OffsetT num_selected = OffsetT(d_out_wrapper_end.first - d_out_wrapper);
342         CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
343     }
344 
345     return cudaSuccess;
346 }
347 
348 
349 /**
350  * Dispatch to select flagged entrypoint
351  */
352 template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
353 __host__ __forceinline__
Dispatch(Int2Type<THRUST>,Int2Type<true>,Int2Type<false>,int timing_timing_iterations,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,InputIteratorT d_in,FlagIteratorT d_flags,OutputIteratorT d_out,NumSelectedIteratorT d_num_selected_out,OffsetT num_items,SelectOpT,cudaStream_t,bool)354 cudaError_t Dispatch(
355     Int2Type<THRUST>            /*dispatch_to*/,
356     Int2Type<true>              /*is_flagged*/,
357     Int2Type<false>             /*is_partition*/,
358     int                         timing_timing_iterations,
359     size_t*                     /*d_temp_storage_bytes*/,
360     cudaError_t*                /*d_cdp_error*/,
361 
362     void*                       d_temp_storage,
363     size_t&                     temp_storage_bytes,
364     InputIteratorT              d_in,
365     FlagIteratorT               d_flags,
366     OutputIteratorT             d_out,
367     NumSelectedIteratorT        d_num_selected_out,
368     OffsetT                     num_items,
369     SelectOpT                   /*select_op*/,
370     cudaStream_t                /*stream*/,
371     bool                        /*debug_synchronous*/)
372 {
373     // The flag type
374     typedef typename std::iterator_traits<FlagIteratorT>::value_type FlagT;
375 
376     // The input value type
377     typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
378 
379     // The output value type
380     typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
381         typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
382         typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
383 
384     if (d_temp_storage == 0)
385     {
386         temp_storage_bytes = 1;
387     }
388     else
389     {
390         thrust::device_ptr<OutputT>     d_out_wrapper_end;
391         thrust::device_ptr<InputT>      d_in_wrapper(d_in);
392         thrust::device_ptr<OutputT>     d_out_wrapper(d_out);
393         thrust::device_ptr<FlagT>       d_flags_wrapper(d_flags);
394 
395         for (int i = 0; i < timing_timing_iterations; ++i)
396         {
397             d_out_wrapper_end = thrust::copy_if(d_in_wrapper, d_in_wrapper + num_items, d_flags_wrapper, d_out_wrapper, CastOp<bool>());
398         }
399 
400         OffsetT num_selected = OffsetT(d_out_wrapper_end - d_out_wrapper);
401         CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
402     }
403 
404     return cudaSuccess;
405 }
406 
407 
408 /**
409  * Dispatch to partition flagged entrypoint
410  */
411 template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
412 __host__ __forceinline__
Dispatch(Int2Type<THRUST>,Int2Type<true>,Int2Type<true>,int timing_timing_iterations,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,InputIteratorT d_in,FlagIteratorT d_flags,OutputIteratorT d_out,NumSelectedIteratorT d_num_selected_out,OffsetT num_items,SelectOpT,cudaStream_t,bool)413 cudaError_t Dispatch(
414     Int2Type<THRUST>            /*dispatch_to*/,
415     Int2Type<true>              /*is_flagged*/,
416     Int2Type<true>              /*is_partition*/,
417     int                         timing_timing_iterations,
418     size_t*                     /*d_temp_storage_bytes*/,
419     cudaError_t*                /*d_cdp_error*/,
420 
421     void*                       d_temp_storage,
422     size_t&                     temp_storage_bytes,
423     InputIteratorT              d_in,
424     FlagIteratorT               d_flags,
425     OutputIteratorT             d_out,
426     NumSelectedIteratorT        d_num_selected_out,
427     OffsetT                     num_items,
428     SelectOpT                   /*select_op*/,
429     cudaStream_t                /*stream*/,
430     bool                        /*debug_synchronous*/)
431 {
432     // The flag type
433     typedef typename std::iterator_traits<FlagIteratorT>::value_type FlagT;
434 
435     // The input value type
436     typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
437 
438     // The output value type
439     typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
440         typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
441         typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
442 
443     typedef thrust::reverse_iterator<thrust::device_ptr<OutputT> > ReverseOutputIteratorT;
444 
445     if (d_temp_storage == 0)
446     {
447         temp_storage_bytes = 1;
448     }
449     else
450     {
451         thrust::pair<thrust::device_ptr<OutputT>, ReverseOutputIteratorT> d_out_wrapper_end;
452 
453         thrust::device_ptr<InputT>  d_in_wrapper(d_in);
454         thrust::device_ptr<OutputT> d_out_wrapper(d_out);
455         thrust::device_ptr<FlagT>   d_flags_wrapper(d_flags);
456         ReverseOutputIteratorT      d_out_unselected(d_out_wrapper + num_items);
457 
458         for (int i = 0; i < timing_timing_iterations; ++i)
459         {
460             d_out_wrapper_end = thrust::partition_copy(
461                 d_in_wrapper,
462                 d_in_wrapper + num_items,
463                 d_flags_wrapper,
464                 d_out_wrapper,
465                 d_out_unselected,
466                 CastOp<bool>());
467         }
468 
469         OffsetT num_selected = OffsetT(d_out_wrapper_end.first - d_out_wrapper);
470         CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
471     }
472 
473     return cudaSuccess;
474 }
475 
476 
477 //---------------------------------------------------------------------
478 // CUDA Nested Parallelism Test Kernel
479 //---------------------------------------------------------------------
480 
481 /**
482  * Simple wrapper kernel to invoke DeviceSelect
483  */
484 template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT, typename IsFlaggedTag, typename IsPartitionTag>
CnpDispatchKernel(IsFlaggedTag is_flagged,IsPartitionTag is_partition,int timing_timing_iterations,size_t * d_temp_storage_bytes,cudaError_t * d_cdp_error,void * d_temp_storage,size_t temp_storage_bytes,InputIteratorT d_in,FlagIteratorT d_flags,OutputIteratorT d_out,NumSelectedIteratorT d_num_selected_out,OffsetT num_items,SelectOpT select_op,bool debug_synchronous)485 __global__ void CnpDispatchKernel(
486     IsFlaggedTag                is_flagged,
487     IsPartitionTag              is_partition,
488     int                         timing_timing_iterations,
489     size_t*                     d_temp_storage_bytes,
490     cudaError_t*                d_cdp_error,
491 
492     void*                       d_temp_storage,
493     size_t                      temp_storage_bytes,
494     InputIteratorT              d_in,
495     FlagIteratorT               d_flags,
496     OutputIteratorT             d_out,
497     NumSelectedIteratorT        d_num_selected_out,
498     OffsetT                     num_items,
499     SelectOpT                   select_op,
500     bool                        debug_synchronous)
501 {
502 
503 #ifndef CUB_CDP
504     (void)is_flagged;
505     (void)is_partition;
506     (void)timing_timing_iterations;
507     (void)d_temp_storage_bytes;
508     (void)d_temp_storage;
509     (void)temp_storage_bytes;
510     (void)d_in;
511     (void)d_flags;
512     (void)d_out;
513     (void)d_num_selected_out;
514     (void)num_items;
515     (void)select_op;
516     (void)debug_synchronous;
517     *d_cdp_error = cudaErrorNotSupported;
518 #else
519     *d_cdp_error = Dispatch(Int2Type<CUB>(), is_flagged, is_partition, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
520         d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, debug_synchronous);
521     *d_temp_storage_bytes = temp_storage_bytes;
522 #endif
523 }
524 
525 
526 /**
527  * Dispatch to CDP kernel
528  */
529 template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT, typename IsFlaggedTag, typename IsPartitionTag>
Dispatch(Int2Type<CDP> dispatch_to,IsFlaggedTag is_flagged,IsPartitionTag is_partition,int timing_timing_iterations,size_t * d_temp_storage_bytes,cudaError_t * d_cdp_error,void * d_temp_storage,size_t & temp_storage_bytes,InputIteratorT d_in,FlagIteratorT d_flags,OutputIteratorT d_out,NumSelectedIteratorT d_num_selected_out,OffsetT num_items,SelectOpT select_op,cudaStream_t stream,bool debug_synchronous)530 cudaError_t Dispatch(
531     Int2Type<CDP>               dispatch_to,
532     IsFlaggedTag                is_flagged,
533     IsPartitionTag              is_partition,
534     int                         timing_timing_iterations,
535     size_t*                     d_temp_storage_bytes,
536     cudaError_t*                d_cdp_error,
537 
538     void*                       d_temp_storage,
539     size_t&                     temp_storage_bytes,
540     InputIteratorT              d_in,
541     FlagIteratorT               d_flags,
542     OutputIteratorT             d_out,
543     NumSelectedIteratorT        d_num_selected_out,
544     OffsetT                     num_items,
545     SelectOpT                   select_op,
546     cudaStream_t                stream,
547     bool                        debug_synchronous)
548 {
549     // Invoke kernel to invoke device-side dispatch
550     CnpDispatchKernel<<<1,1>>>(is_flagged, is_partition, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
551         d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, debug_synchronous);
552 
553     // Copy out temp_storage_bytes
554     CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
555 
556     // Copy out error
557     cudaError_t retval;
558     CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
559     return retval;
560 }
561 
562 
563 
564 //---------------------------------------------------------------------
565 // Test generation
566 //---------------------------------------------------------------------
567 
568 
569 /**
570  * Initialize problem
571  */
572 template <typename T>
Initialize(T * h_in,int num_items)573 void Initialize(
574     T*  h_in,
575     int num_items)
576 {
577     for (int i = 0; i < num_items; ++i)
578     {
579         // Initialize each item to a randomly selected value from [0..126]
580         unsigned int value;
581         RandomBits(value, 0, 0, 7);
582         if (value == 127)
583             value = 126;
584         InitValue(INTEGER_SEED, h_in[i], value);
585     }
586 
587     if (g_verbose)
588     {
589         printf("Input:\n");
590         DisplayResults(h_in, num_items);
591         printf("\n\n");
592     }
593 }
594 
595 
596 /**
597  * Solve selection problem (and set corresponding flags)
598  */
599 template <
600     typename        InputIteratorT,
601     typename        FlagIteratorT,
602     typename        SelectOpT,
603     typename        T>
Solve(InputIteratorT h_in,SelectOpT select_op,T * h_reference,FlagIteratorT h_flags,int num_items)604 int Solve(
605     InputIteratorT  h_in,
606     SelectOpT       select_op,
607     T*              h_reference,
608     FlagIteratorT   h_flags,
609     int             num_items)
610 {
611     int num_selected = 0;
612     for (int i = 0; i < num_items; ++i)
613     {
614         if ((h_flags[i] = select_op(h_in[i])))
615         {
616             h_reference[num_selected] = h_in[i];
617             num_selected++;
618         }
619         else
620         {
621             h_reference[num_items - (i - num_selected) - 1] = h_in[i];
622         }
623     }
624 
625     return num_selected;
626 }
627 
628 
629 
630 /**
631  * Test DeviceSelect for a given problem input
632  */
633 template <
634     Backend             BACKEND,
635     bool                IS_FLAGGED,
636     bool                IS_PARTITION,
637     typename            DeviceInputIteratorT,
638     typename            FlagT,
639     typename            SelectOpT,
640     typename            T>
Test(DeviceInputIteratorT d_in,FlagT * h_flags,SelectOpT select_op,T * h_reference,int num_selected,int num_items)641 void Test(
642     DeviceInputIteratorT    d_in,
643     FlagT*                  h_flags,
644     SelectOpT               select_op,
645     T*                      h_reference,
646     int                     num_selected,
647     int                     num_items)
648 {
649     // Allocate device flags, output, and num-selected
650     FlagT*      d_flags = NULL;
651     T*          d_out = NULL;
652     int*        d_num_selected_out = NULL;
653     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(FlagT) * num_items));
654     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * num_items));
655     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
656 
657     // Allocate CDP device arrays
658     size_t*         d_temp_storage_bytes = NULL;
659     cudaError_t*    d_cdp_error = NULL;
660     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
661     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
662 
663     // Allocate temporary storage
664     void            *d_temp_storage = NULL;
665     size_t          temp_storage_bytes = 0;
666     CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), 1, d_temp_storage_bytes, d_cdp_error,
667     d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, true));
668     CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
669 
670     // Copy flags and clear device output array
671     CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(FlagT) * num_items, cudaMemcpyHostToDevice));
672     CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * num_items));
673     CubDebugExit(cudaMemset(d_num_selected_out, 0, sizeof(int)));
674 
675     // Run warmup/correctness iteration
676     CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), 1, d_temp_storage_bytes, d_cdp_error,
677         d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, true));
678 
679     // Check for correctness (and display results, if specified)
680     int compare1 = (IS_PARTITION) ?
681         CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose) :
682         CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
683     printf("\t Data %s\n", compare1 ? "FAIL" : "PASS");
684 
685     int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
686     printf("\t Count %s\n", compare2 ? "FAIL" : "PASS");
687 
688     // Flush any stdout/stderr
689     fflush(stdout);
690     fflush(stderr);
691 
692     // Performance
693     GpuTimer gpu_timer;
694     gpu_timer.Start();
695     CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
696         d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, false));
697     gpu_timer.Stop();
698     float elapsed_millis = gpu_timer.ElapsedMillis();
699 
700     // Display performance
701     if (g_timing_iterations > 0)
702     {
703         float   avg_millis          = elapsed_millis / g_timing_iterations;
704         float   giga_rate           = float(num_items) / avg_millis / 1000.0f / 1000.0f;
705         int     num_output_items    = (IS_PARTITION) ? num_items : num_selected;
706         int     num_flag_items      = (IS_FLAGGED) ? num_items : 0;
707         size_t  num_bytes           = sizeof(T) * (num_items + num_output_items) + sizeof(FlagT) * num_flag_items;
708         float   giga_bandwidth      = float(num_bytes) / avg_millis / 1000.0f / 1000.0f;
709 
710         printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
711     }
712     printf("\n\n");
713 
714     // Flush any stdout/stderr
715     fflush(stdout);
716     fflush(stderr);
717 
718     // Cleanup
719     if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
720     if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
721     if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
722     if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
723     if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
724     if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
725 
726     // Correctness asserts
727     AssertEquals(0, compare1 | compare2);
728 }
729 
730 
731 /**
732  * Test on pointer type
733  */
734 template <
735     Backend         BACKEND,
736     bool            IS_FLAGGED,
737     bool            IS_PARTITION,
738     typename        T>
TestPointer(int num_items,float select_ratio)739 void TestPointer(
740     int             num_items,
741     float           select_ratio)
742 {
743     typedef char FlagT;
744 
745     // Allocate host arrays
746     T*      h_in        = new T[num_items];
747     FlagT*  h_flags     = new FlagT[num_items];
748     T*      h_reference = new T[num_items];
749 
750     // Initialize input
751     Initialize(h_in, num_items);
752 
753     // Select a comparison value that is select_ratio through the space of [0,127]
754     T compare;
755     if (select_ratio <= 0.0)
756         InitValue(INTEGER_SEED, compare, 0);        // select none
757     else if (select_ratio >= 1.0)
758         InitValue(INTEGER_SEED, compare, 127);      // select all
759     else
760         InitValue(INTEGER_SEED, compare, int(double(double(127) * select_ratio)));
761 
762     LessThan<T> select_op(compare);
763     int num_selected = Solve(h_in, select_op, h_reference, h_flags, num_items);
764 
765     if (g_verbose) std::cout << "\nComparison item: " << compare << "\n";
766     printf("\nPointer %s cub::%s::%s %d items, %d selected (select ratio %.3f), %s %d-byte elements\n",
767         (IS_PARTITION) ? "DevicePartition" : "DeviceSelect",
768         (IS_FLAGGED) ? "Flagged" : "If",
769         (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
770         num_items, num_selected, float(num_selected) / num_items, typeid(T).name(), (int) sizeof(T));
771     fflush(stdout);
772 
773     // Allocate problem device arrays
774     T *d_in = NULL;
775 
776     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items));
777 
778     // Initialize device input
779     CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));
780 
781     // Run Test
782     Test<BACKEND, IS_FLAGGED, IS_PARTITION>(d_in, h_flags, select_op, h_reference, num_selected, num_items);
783 
784     // Cleanup
785     if (h_in) delete[] h_in;
786     if (h_reference) delete[] h_reference;
787     if (h_flags) delete[] h_flags;
788     if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
789 }
790 
791 
792 /**
793  * Test on iterator type
794  */
795 template <
796     Backend         BACKEND,
797     bool            IS_FLAGGED,
798     bool            IS_PARTITION,
799     typename        T>
TestIterator(int num_items,float select_ratio)800 void TestIterator(
801     int             num_items,
802     float           select_ratio)
803 {
804     typedef char FlagT;
805 
806     // Allocate host arrays
807     T*      h_reference = new T[num_items];
808     FlagT*  h_flags = new FlagT[num_items];
809 
810     // Use counting iterator as the input
811     CountingInputIterator<T, int> h_in(0);
812 
813     // Select a comparison value that is select_ratio through the space of [0,127]
814     T compare;
815     if (select_ratio <= 0.0)
816         InitValue(INTEGER_SEED, compare, 0);        // select none
817     else if (select_ratio >= 1.0)
818         InitValue(INTEGER_SEED, compare, 127);      // select all
819     else
820         InitValue(INTEGER_SEED, compare, int(double(double(127) * select_ratio)));
821 
822     LessThan<T> select_op(compare);
823     int num_selected = Solve(h_in, select_op, h_reference, h_flags, num_items);
824 
825     if (g_verbose) std::cout << "\nComparison item: " << compare << "\n";
826     printf("\nIterator %s cub::%s::%s %d items, %d selected (select ratio %.3f), %s %d-byte elements\n",
827         (IS_PARTITION) ? "DevicePartition" : "DeviceSelect",
828         (IS_FLAGGED) ? "Flagged" : "If",
829         (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
830         num_items, num_selected, float(num_selected) / num_items, typeid(T).name(), (int) sizeof(T));
831     fflush(stdout);
832 
833     // Run Test
834     Test<BACKEND, IS_FLAGGED, IS_PARTITION>(h_in, h_flags, select_op, h_reference, num_selected, num_items);
835 
836     // Cleanup
837     if (h_reference) delete[] h_reference;
838     if (h_flags) delete[] h_flags;
839 }
840 
841 
842 /**
843  * Test different selection ratios
844  */
845 template <
846     Backend         BACKEND,
847     bool            IS_FLAGGED,
848     bool            IS_PARTITION,
849     typename        T>
Test(int num_items)850 void Test(
851     int             num_items)
852 {
853     for (float select_ratio = 0.0f; select_ratio <= 1.0f; select_ratio += 0.2f)
854     {
855         TestPointer<BACKEND, IS_FLAGGED, IS_PARTITION, T>(num_items, select_ratio);
856     }
857 }
858 
859 
860 /**
861  * Test (select vs. partition) and (flagged vs. functor)
862  */
863 template <
864     Backend         BACKEND,
865     typename        T>
TestMethod(int num_items)866 void TestMethod(
867     int             num_items)
868 {
869     // Functor
870     Test<BACKEND, false, false, T>(num_items);
871     Test<BACKEND, false, true, T>(num_items);
872 
873     // Flagged
874     Test<BACKEND, true, false, T>(num_items);
875     Test<BACKEND, true, true, T>(num_items);
876 }
877 
878 
879 /**
880  * Test different dispatch
881  */
882 template <
883     typename        T>
TestOp(int num_items)884 void TestOp(
885     int             num_items)
886 {
887     TestMethod<CUB, T>(num_items);
888 #ifdef CUB_CDP
889     TestMethod<CDP, T>(num_items);
890 #endif
891 }
892 
893 
894 /**
895  * Test different input sizes
896  */
897 template <typename T>
Test(int num_items)898 void Test(
899     int             num_items)
900 {
901     if (num_items < 0)
902     {
903         TestOp<T>(0);
904         TestOp<T>(1);
905         TestOp<T>(100);
906         TestOp<T>(10000);
907         TestOp<T>(1000000);
908     }
909     else
910     {
911         TestOp<T>(num_items);
912     }
913 }
914 
915 /**
916  * Test select/partition on pointer types
917  */
918 template <typename T>
ComparePointer(int num_items,float select_ratio)919 void ComparePointer(
920     int             num_items,
921     float           select_ratio)
922 {
923     printf("-- Select-if ----------------------------\n");
924     TestPointer<CUB, false, false, T>(num_items, select_ratio);
925     TestPointer<THRUST, false, false, T>(num_items, select_ratio);
926 
927     printf("-- Partition-if ----------------------------\n");
928     TestPointer<CUB, false, true, T>(num_items, select_ratio);
929     TestPointer<THRUST, false, true, T>(num_items, select_ratio);
930 
931     printf("-- Select-flagged ----------------------------\n");
932     TestPointer<CUB, true, false, T>(num_items, select_ratio);
933     TestPointer<THRUST, true, false, T>(num_items, select_ratio);
934 
935     printf("-- Partition-flagged ----------------------------\n");
936     TestPointer<CUB, true, true, T>(num_items, select_ratio);
937     TestPointer<THRUST, true, true, T>(num_items, select_ratio);
938 
939 }
940 
941 //---------------------------------------------------------------------
942 // Main
943 //---------------------------------------------------------------------
944 
945 /**
946  * Main
947  */
main(int argc,char ** argv)948 int main(int argc, char** argv)
949 {
950     int num_items           = -1;
951     float select_ratio      = 0.5;
952 
953     // Initialize command line
954     CommandLineArgs args(argc, argv);
955     g_verbose = args.CheckCmdLineFlag("v");
956     args.GetCmdLineArgument("n", num_items);
957     args.GetCmdLineArgument("i", g_timing_iterations);
958     args.GetCmdLineArgument("repeat", g_repeat);
959     args.GetCmdLineArgument("ratio", select_ratio);
960 
961     // Print usage
962     if (args.CheckCmdLineFlag("help"))
963     {
964         printf("%s "
965             "[--n=<input items> "
966             "[--i=<timing iterations> "
967             "[--device=<device-id>] "
968             "[--ratio=<selection ratio, default 0.5>] "
969             "[--repeat=<repetitions of entire test suite>] "
970             "[--v] "
971             "[--cdp] "
972             "\n", argv[0]);
973         exit(0);
974     }
975 
976     // Initialize device
977     CubDebugExit(args.DeviceInit());
978     g_device_giga_bandwidth = args.device_giga_bandwidth;
979     printf("\n");
980 
981 #ifdef QUICKER_TEST
982 
983     // Compile/run basic CUB test
984     if (num_items < 0) num_items = 32000000;
985 
986     printf("-- Select-if ----------------------------\n");
987     TestPointer<CUB, false, false, int>(num_items, select_ratio);
988 
989     printf("-- Partition-if ----------------------------\n");
990     TestPointer<CUB, false, true, int>(num_items, select_ratio);
991 
992     printf("-- Select-flagged ----------------------------\n");
993     TestPointer<CUB, true, false, int>(num_items, select_ratio);
994 
995     printf("-- Partition-flagged ----------------------------\n");
996     TestPointer<CUB, true, true, int>(num_items, select_ratio);
997 
998 
999 #elif defined(QUICK_TEST)
1000 
1001     // Get device ordinal
1002     int device_ordinal;
1003     CubDebugExit(cudaGetDevice(&device_ordinal));
1004 
1005     // Get device SM version
1006     int sm_version;
1007     CubDebugExit(SmVersion(sm_version, device_ordinal));
1008 
1009     // Compile/run quick tests
1010     if (num_items < 0) num_items = 32000000;
1011 
1012     printf("-- Iterator ----------------------------\n");
1013     TestIterator<CUB, false, false, int>(num_items, select_ratio);
1014 
1015     ComparePointer<char>(       num_items * ((sm_version <= 130) ? 1 : 4),  select_ratio);
1016     ComparePointer<short>(      num_items * ((sm_version <= 130) ? 1 : 2),  select_ratio);
1017     ComparePointer<int>(        num_items,                                  select_ratio);
1018     ComparePointer<long long>(  num_items / 2,                              select_ratio);
1019     ComparePointer<TestFoo>(    num_items / 4,                              select_ratio);
1020 
1021 #else
1022 
1023     // Compile/run thorough tests
1024     for (int i = 0; i <= g_repeat; ++i)
1025     {
1026         // Test different input types
1027         Test<unsigned char>(num_items);
1028         Test<unsigned short>(num_items);
1029         Test<unsigned int>(num_items);
1030         Test<unsigned long long>(num_items);
1031 
1032         Test<uchar2>(num_items);
1033         Test<ushort2>(num_items);
1034         Test<uint2>(num_items);
1035         Test<ulonglong2>(num_items);
1036 
1037         Test<uchar4>(num_items);
1038         Test<ushort4>(num_items);
1039         Test<uint4>(num_items);
1040         Test<ulonglong4>(num_items);
1041 
1042         Test<TestFoo>(num_items);
1043         Test<TestBar>(num_items);
1044     }
1045 
1046 #endif
1047 
1048     return 0;
1049 }
1050 
1051 
1052 
1053