/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Test of DeviceReduce utilities
 ******************************************************************************/

// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <stdio.h>
#include <limits>
#include <typeinfo>

#include <thrust/device_ptr.h>
#include <thrust/reduce.h>

#include <cub/util_allocator.cuh>
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_segmented_reduce.cuh>
#include <cub/iterator/constant_input_iterator.cuh>
#include <cub/iterator/discard_output_iterator.cuh>
#include <cub/iterator/transform_input_iterator.cuh>

#include "test_util.h"

using namespace cub;


//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

int                     g_ptx_version;
int                     g_sm_count;
double                  g_device_giga_bandwidth;
bool                    g_verbose           = false;
bool                    g_verbose_input     = false;
int                     g_timing_iterations = 0;
int                     g_repeat            = 0;
CachingDeviceAllocator  g_allocator(true);


// Dispatch types
enum Backend
{
    CUB,            // CUB method
    CUB_SEGMENTED,  // CUB segmented method
    CUB_CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
    THRUST,         // Thrust method
};
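
// Each Backend value is lifted to a distinct type via cub::Int2Type so that
// the overloaded Dispatch() entrypoints below are selected at compile time,
// e.g. Dispatch(Int2Type<CUB>(), ...) versus Dispatch(Int2Type<THRUST>(), ...).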


// Custom max functor
struct CustomMax
{
    /// Binary max operator, returns <tt>(a > b) ? a : b</tt>
    template <typename OutputT>
    __host__ __device__ __forceinline__ OutputT operator()(const OutputT &a, const OutputT &b)
    {
        return CUB_MAX(a, b);
    }
};
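
// For reference, a minimal standalone use of such a functor follows CUB's
// two-phase protocol (a first call with a NULL temp-storage pointer only
// computes the required size). This is an illustrative sketch with
// hypothetical buffer names, not part of the test harness:
//
//   void   *d_temp_storage     = NULL;
//   size_t  temp_storage_bytes = 0;
//   int     init               = std::numeric_limits<int>::lowest();
//   cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
//       d_in, d_out, num_items, CustomMax(), init);          // size query
//   cudaMalloc(&d_temp_storage, temp_storage_bytes);
//   cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
//       d_in, d_out, num_items, CustomMax(), init);          // actual reduction
//   cudaFree(d_temp_storage);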


//---------------------------------------------------------------------
// Dispatch to different CUB DeviceReduce entrypoints
//---------------------------------------------------------------------

/**
 * Dispatch to reduce entrypoint (custom-max)
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       /*dispatch_to*/,
    int                 timing_iterations,
    size_t              */*d_temp_storage_bytes*/,
    cudaError_t         */*d_cdp_error*/,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 /*max_segments*/,
    OffsetIteratorT     /*d_segment_offsets*/,
    ReductionOpT        reduction_op,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

    // The output value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
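    // (A void output value type signals a discard iterator; in that case the
    //  reduction is computed in the input iterator's value type.)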

    // Max-identity
    OutputT identity = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++11 support is more prevalent

    // Invoke the device-wide reduction entrypoint directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_iterations; ++i)
    {
        error = DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, reduction_op, identity,
            stream, debug_synchronous);
    }

    return error;
}

/**
 * Dispatch to sum entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       /*dispatch_to*/,
    int                 timing_iterations,
    size_t              */*d_temp_storage_bytes*/,
    cudaError_t         */*d_cdp_error*/,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 /*max_segments*/,
    OffsetIteratorT     /*d_segment_offsets*/,
    cub::Sum            /*reduction_op*/,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke the device-wide reduction entrypoint directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_iterations; ++i)
    {
        error = DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
    }

    return error;
}

/**
 * Dispatch to min entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       /*dispatch_to*/,
    int                 timing_iterations,
    size_t              */*d_temp_storage_bytes*/,
    cudaError_t         */*d_cdp_error*/,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 /*max_segments*/,
    OffsetIteratorT     /*d_segment_offsets*/,
    cub::Min            /*reduction_op*/,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke the device-wide reduction entrypoint directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_iterations; ++i)
    {
        error = DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
    }

    return error;
}

/**
 * Dispatch to max entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       /*dispatch_to*/,
    int                 timing_iterations,
    size_t              */*d_temp_storage_bytes*/,
    cudaError_t         */*d_cdp_error*/,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 /*max_segments*/,
    OffsetIteratorT     /*d_segment_offsets*/,
    cub::Max            /*reduction_op*/,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke the device-wide reduction entrypoint directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_iterations; ++i)
    {
        error = DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
    }

    return error;
}

/**
 * Dispatch to argmin entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       /*dispatch_to*/,
    int                 timing_iterations,
    size_t              */*d_temp_storage_bytes*/,
    cudaError_t         */*d_cdp_error*/,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 /*max_segments*/,
    OffsetIteratorT     /*d_segment_offsets*/,
    cub::ArgMin         /*reduction_op*/,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke the device-wide reduction entrypoint directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_iterations; ++i)
    {
        error = DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
    }

    return error;
}

/**
 * Dispatch to argmax entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       /*dispatch_to*/,
    int                 timing_iterations,
    size_t              */*d_temp_storage_bytes*/,
    cudaError_t         */*d_cdp_error*/,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 /*max_segments*/,
    OffsetIteratorT     /*d_segment_offsets*/,
    cub::ArgMax         /*reduction_op*/,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke the device-wide reduction entrypoint directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_iterations; ++i)
    {
        error = DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
    }

    return error;
}


//---------------------------------------------------------------------
// Dispatch to different CUB DeviceSegmentedReduce entrypoints
//---------------------------------------------------------------------

/**
 * Dispatch to reduce entrypoint (custom-max)
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED>       /*dispatch_to*/,
    int                 timing_iterations,
    size_t              */*d_temp_storage_bytes*/,
    cudaError_t         */*d_cdp_error*/,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 /*num_items*/,
    int                 max_segments,
    OffsetIteratorT     d_segment_offsets,
    ReductionOpT        reduction_op,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

    // The output value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    // Max-identity
    OutputT identity = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++11 support is more prevalent
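
    // Segment i covers the half-open range
    // [d_segment_offsets[i], d_segment_offsets[i + 1]), so the same offsets
    // sequence, shifted by one, serves as both the begin- and end-offset iterator.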

    // Invoke the segmented reduction entrypoint directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_iterations; ++i)
    {
        error = DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1, reduction_op, identity,
            stream, debug_synchronous);
    }
    return error;
}

/**
 * Dispatch to sum entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED>       /*dispatch_to*/,
    int                 timing_iterations,
    size_t              */*d_temp_storage_bytes*/,
    cudaError_t         */*d_cdp_error*/,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 /*num_items*/,
    int                 max_segments,
    OffsetIteratorT     d_segment_offsets,
    cub::Sum            /*reduction_op*/,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke the segmented reduction entrypoint directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_iterations; ++i)
    {
        error = DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
            stream, debug_synchronous);
    }
    return error;
}

/**
 * Dispatch to min entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED>       /*dispatch_to*/,
    int                 timing_iterations,
    size_t              */*d_temp_storage_bytes*/,
    cudaError_t         */*d_cdp_error*/,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 /*num_items*/,
    int                 max_segments,
    OffsetIteratorT     d_segment_offsets,
    cub::Min            /*reduction_op*/,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke the segmented reduction entrypoint directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_iterations; ++i)
    {
        error = DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
            stream, debug_synchronous);
    }
    return error;
}

/**
 * Dispatch to max entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED>       /*dispatch_to*/,
    int                 timing_iterations,
    size_t              */*d_temp_storage_bytes*/,
    cudaError_t         */*d_cdp_error*/,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 /*num_items*/,
    int                 max_segments,
    OffsetIteratorT     d_segment_offsets,
    cub::Max            /*reduction_op*/,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke the segmented reduction entrypoint directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_iterations; ++i)
    {
        error = DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
            stream, debug_synchronous);
    }
    return error;
}

/**
 * Dispatch to argmin entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED>       /*dispatch_to*/,
    int                 timing_iterations,
    size_t              */*d_temp_storage_bytes*/,
    cudaError_t         */*d_cdp_error*/,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 /*num_items*/,
    int                 max_segments,
    OffsetIteratorT     d_segment_offsets,
    cub::ArgMin         /*reduction_op*/,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke the segmented reduction entrypoint directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_iterations; ++i)
    {
        error = DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
            stream, debug_synchronous);
    }
    return error;
}

/**
 * Dispatch to argmax entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED>       /*dispatch_to*/,
    int                 timing_iterations,
    size_t              */*d_temp_storage_bytes*/,
    cudaError_t         */*d_cdp_error*/,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 /*num_items*/,
    int                 max_segments,
    OffsetIteratorT     d_segment_offsets,
    cub::ArgMax         /*reduction_op*/,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke the segmented reduction entrypoint directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_iterations; ++i)
    {
        error = DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
            stream, debug_synchronous);
    }
    return error;
}


//---------------------------------------------------------------------
// Dispatch to different Thrust entrypoints
//---------------------------------------------------------------------

/**
 * Dispatch to reduction entrypoint (min or max specialization)
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
cudaError_t Dispatch(
    Int2Type<THRUST>    /*dispatch_to*/,
    int                 timing_iterations,
    size_t              */*d_temp_storage_bytes*/,
    cudaError_t         */*d_cdp_error*/,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 /*max_segments*/,
    OffsetIteratorT     /*d_segment_offsets*/,
    ReductionOpT        reduction_op,
    cudaStream_t        /*stream*/,
    bool                /*debug_synchronous*/)
{
    // The output value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    if (d_temp_storage == 0)
    {
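        // Thrust manages its own temporaries; return a nonzero dummy size so
        // the harness's CUB-style size-query phase still allocates and proceeds.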
        temp_storage_bytes = 1;
    }
    else
    {
        OutputT init;
        CubDebugExit(cudaMemcpy(&init, d_in + 0, sizeof(OutputT), cudaMemcpyDeviceToHost));

        thrust::device_ptr<OutputT> d_in_wrapper(d_in);
        OutputT retval;
        for (int i = 0; i < timing_iterations; ++i)
        {
            retval = thrust::reduce(d_in_wrapper, d_in_wrapper + num_items, init, reduction_op);
        }

        if (!Equals<OutputIteratorT, DiscardOutputIterator<int> >::VALUE)
            CubDebugExit(cudaMemcpy(d_out, &retval, sizeof(OutputT), cudaMemcpyHostToDevice));
    }

    return cudaSuccess;
}

/**
 * Dispatch to reduction entrypoint (sum specialization)
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
cudaError_t Dispatch(
    Int2Type<THRUST>    /*dispatch_to*/,
    int                 timing_iterations,
    size_t              */*d_temp_storage_bytes*/,
    cudaError_t         */*d_cdp_error*/,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 /*max_segments*/,
    OffsetIteratorT     /*d_segment_offsets*/,
    Sum                 /*reduction_op*/,
    cudaStream_t        /*stream*/,
    bool                /*debug_synchronous*/)
{
    // The output value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    if (d_temp_storage == 0)
    {
        temp_storage_bytes = 1;
    }
    else
    {
        thrust::device_ptr<OutputT> d_in_wrapper(d_in);
        OutputT retval;
        for (int i = 0; i < timing_iterations; ++i)
        {
            retval = thrust::reduce(d_in_wrapper, d_in_wrapper + num_items);
        }

        if (!Equals<OutputIteratorT, DiscardOutputIterator<int> >::VALUE)
            CubDebugExit(cudaMemcpy(d_out, &retval, sizeof(OutputT), cudaMemcpyHostToDevice));
    }

    return cudaSuccess;
}


//---------------------------------------------------------------------
// CUDA nested-parallelism test kernel
//---------------------------------------------------------------------

/**
 * Simple wrapper kernel to invoke DeviceReduce
 */
template <
    typename            InputIteratorT,
    typename            OutputIteratorT,
    typename            OffsetIteratorT,
    typename            ReductionOpT>
__global__ void CnpDispatchKernel(
    int                 timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t              temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 max_segments,
    OffsetIteratorT     d_segment_offsets,
    ReductionOpT        reduction_op,
    bool                debug_synchronous)
{
#ifndef CUB_CDP
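    // CDP (CUDA Dynamic Parallelism) support was not compiled in: silence
    // unused-parameter warnings and report the launch as unsupported.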
    (void)timing_iterations;
    (void)d_temp_storage_bytes;
    (void)d_cdp_error;
    (void)d_temp_storage;
    (void)temp_storage_bytes;
    (void)d_in;
    (void)d_out;
    (void)num_items;
    (void)max_segments;
    (void)d_segment_offsets;
    (void)reduction_op;
    (void)debug_synchronous;
    *d_cdp_error = cudaErrorNotSupported;
#else
    *d_cdp_error = Dispatch(Int2Type<CUB>(), timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
        d_in, d_out, num_items, max_segments, d_segment_offsets, reduction_op, 0, debug_synchronous);
    *d_temp_storage_bytes = temp_storage_bytes;
#endif
}


/**
 * Dispatch to CUB_CDP kernel
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_CDP>       dispatch_to,
    int                 timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 max_segments,
    OffsetIteratorT     d_segment_offsets,
    ReductionOpT        reduction_op,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Launch a single-thread kernel that performs the dispatch on the device
    // (which in turn launches its own child grids)
    CnpDispatchKernel<<<1,1>>>(timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
        d_in, d_out, num_items, max_segments, d_segment_offsets, reduction_op, debug_synchronous);

    // Copy out temp_storage_bytes
    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));

    // Copy out error
    cudaError_t retval;
    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
    return retval;
}
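
// (An aside, not part of the harness: exercising this CDP path requires
// compiling with -DCUB_CDP and relocatable device code, e.g. something like
// `nvcc -rdc=true -lcudadevrt -DCUB_CDP ...`.)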



//---------------------------------------------------------------------
// Problem generation
//---------------------------------------------------------------------

/// Initialize problem
template <typename InputT>
void Initialize(
    GenMode         gen_mode,
    InputT          *h_in,
    int             num_items)
{
    for (int i = 0; i < num_items; ++i)
    {
        InitValue(gen_mode, h_in[i], i);
    }

    if (g_verbose_input)
    {
        printf("Input:\n");
        DisplayResults(h_in, num_items);
        printf("\n\n");
    }
}


/// Solve problem (max/custom-max functor)
template <typename ReductionOpT, typename InputT, typename _OutputT>
struct Solution
{
    typedef _OutputT OutputT;

    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
        ReductionOpT reduction_op)
    {
        for (int i = 0; i < num_segments; ++i)
        {
            OutputT aggregate = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++11 support is more prevalent
            for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
                aggregate = reduction_op(aggregate, OutputT(h_in[j]));
            h_reference[i] = aggregate;
        }
    }
};

/// Solve problem (min functor)
template <typename InputT, typename _OutputT>
struct Solution<cub::Min, InputT, _OutputT>
{
    typedef _OutputT OutputT;

    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
        cub::Min reduction_op)
    {
        for (int i = 0; i < num_segments; ++i)
        {
            OutputT aggregate = Traits<InputT>::Max();    // replace with std::numeric_limits<OutputT>::max() when C++11 support is more prevalent
            for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
                aggregate = reduction_op(aggregate, OutputT(h_in[j]));
            h_reference[i] = aggregate;
        }
    }
};


/// Solve problem (sum functor)
template <typename InputT, typename _OutputT>
struct Solution<cub::Sum, InputT, _OutputT>
{
    typedef _OutputT OutputT;

    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
        cub::Sum reduction_op)
    {
        for (int i = 0; i < num_segments; ++i)
        {
            OutputT aggregate;
            InitValue(INTEGER_SEED, aggregate, 0);
            for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
                aggregate = reduction_op(aggregate, OutputT(h_in[j]));
            h_reference[i] = aggregate;
        }
    }
};

/// Solve problem (argmin functor)
template <typename InputValueT, typename OutputValueT>
struct Solution<cub::ArgMin, InputValueT, OutputValueT>
{
    typedef KeyValuePair<int, OutputValueT> OutputT;

    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
        cub::ArgMin reduction_op)
    {
        for (int i = 0; i < num_segments; ++i)
        {
            OutputT aggregate(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<OutputT>::max() when C++11 support is more prevalent
            for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
            {
                OutputT item(j - h_segment_offsets[i], OutputValueT(h_in[j]));
                aggregate = reduction_op(aggregate, item);
            }
            h_reference[i] = aggregate;
        }
    }
};


/// Solve problem (argmax functor)
template <typename InputValueT, typename OutputValueT>
struct Solution<cub::ArgMax, InputValueT, OutputValueT>
{
    typedef KeyValuePair<int, OutputValueT> OutputT;

    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
        cub::ArgMax reduction_op)
    {
        for (int i = 0; i < num_segments; ++i)
        {
            OutputT aggregate(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<OutputT>::lowest() when C++11 support is more prevalent
            for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
            {
                OutputT item(j - h_segment_offsets[i], OutputValueT(h_in[j]));
                aggregate = reduction_op(aggregate, item);
            }
            h_reference[i] = aggregate;
        }
    }
};


//---------------------------------------------------------------------
// Test generation
//---------------------------------------------------------------------

/// Test DeviceReduce for a given problem input
template <
    typename                BackendT,
    typename                DeviceInputIteratorT,
    typename                DeviceOutputIteratorT,
    typename                HostReferenceIteratorT,
    typename                OffsetT,
    typename                OffsetIteratorT,
    typename                ReductionOpT>
void Test(
    BackendT                backend,
    DeviceInputIteratorT    d_in,
    DeviceOutputIteratorT   d_out,
    OffsetT                 num_items,
    OffsetT                 num_segments,
    OffsetIteratorT         d_segment_offsets,
    ReductionOpT            reduction_op,
    HostReferenceIteratorT  h_reference)
{
    // Input data types
    typedef typename std::iterator_traits<DeviceInputIteratorT>::value_type InputT;

    // Allocate CUB_CDP device arrays for temp storage size and error
    size_t          *d_temp_storage_bytes = NULL;
    cudaError_t     *d_cdp_error = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));

    // Inquire temp device storage
    void            *d_temp_storage = NULL;
    size_t          temp_storage_bytes = 0;
    CubDebugExit(Dispatch(backend, 1,
        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
        d_in, d_out, num_items, num_segments, d_segment_offsets,
        reduction_op, 0, true));

    // Allocate temp device storage
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

    // Run warmup/correctness iteration
    CubDebugExit(Dispatch(backend, 1,
        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
        d_in, d_out, num_items, num_segments, d_segment_offsets,
        reduction_op, 0, true));

    // Check for correctness (and display results, if specified)
    int compare = CompareDeviceResults(h_reference, d_out, num_segments, g_verbose, g_verbose);
    printf("\t%s", compare ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    if (g_timing_iterations > 0)
    {
        GpuTimer gpu_timer;
        gpu_timer.Start();

        CubDebugExit(Dispatch(backend, g_timing_iterations,
            d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, num_segments, d_segment_offsets,
            reduction_op, 0, false));

        gpu_timer.Stop();
        float elapsed_millis = gpu_timer.ElapsedMillis();

        // Display performance
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        float giga_bandwidth = giga_rate * sizeof(InputT);
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak",
            avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
    }

    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    // Correctness asserts
    AssertEquals(0, compare);
}


/// Test DeviceReduce
template <
    Backend                 BACKEND,
    typename                OutputValueT,
    typename                HostInputIteratorT,
    typename                DeviceInputIteratorT,
    typename                OffsetT,
    typename                OffsetIteratorT,
    typename                ReductionOpT>
void SolveAndTest(
    HostInputIteratorT      h_in,
    DeviceInputIteratorT    d_in,
    OffsetT                 num_items,
    OffsetT                 num_segments,
    OffsetIteratorT         h_segment_offsets,
    OffsetIteratorT         d_segment_offsets,
    ReductionOpT            reduction_op)
{
    typedef typename std::iterator_traits<DeviceInputIteratorT>::value_type     InputValueT;
    typedef Solution<ReductionOpT, InputValueT, OutputValueT>                   SolutionT;
    typedef typename SolutionT::OutputT                                         OutputT;

    printf("\n\n%s cub::DeviceReduce<%s> %d items (%s), %d segments\n",
        (BACKEND == CUB_CDP) ? "CUB_CDP" : (BACKEND == THRUST) ? "Thrust" : (BACKEND == CUB_SEGMENTED) ? "CUB_SEGMENTED" : "CUB",
        typeid(ReductionOpT).name(), num_items, typeid(HostInputIteratorT).name(), num_segments);
    fflush(stdout);

    // Allocate and solve solution
    OutputT *h_reference = new OutputT[num_segments];
    SolutionT::Solve(h_in, h_reference, num_segments, h_segment_offsets, reduction_op);

//    // Run with discard iterator
//    DiscardOutputIterator<OffsetT> discard_itr;
//    Test(Int2Type<BACKEND>(), d_in, discard_itr, num_items, num_segments, d_segment_offsets, reduction_op, h_reference);

    // Run with output data (cleared for sanity-check)
    OutputT *d_out = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(OutputT) * num_segments));
    CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_segments));
    Test(Int2Type<BACKEND>(), d_in, d_out, num_items, num_segments, d_segment_offsets, reduction_op, h_reference);

    // Cleanup
    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
    if (h_reference) delete[] h_reference;
}


/// Test specific problem type
template <
    Backend         BACKEND,
    typename        InputT,
    typename        OutputT,
    typename        OffsetT,
    typename        ReductionOpT>
void TestProblem(
    OffsetT         num_items,
    OffsetT         num_segments,
    GenMode         gen_mode,
    ReductionOpT    reduction_op)
{
    printf("\n\nInitializing %d %s->%s (gen mode %d)... ", num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode);
    fflush(stdout);

    // Initialize value data
    InputT* h_in = new InputT[num_items];
    Initialize(gen_mode, h_in, num_items);

    // Initialize segment data
    OffsetT *h_segment_offsets = new OffsetT[num_segments + 1];
    InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);

    // Initialize device data
    OffsetT *d_segment_offsets      = NULL;
    InputT  *d_in                   = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in,              sizeof(InputT) * num_items));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (num_segments + 1)));
    CubDebugExit(cudaMemcpy(d_in,               h_in,                   sizeof(InputT) * num_items, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(d_segment_offsets,  h_segment_offsets,      sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));

    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, reduction_op);

    if (h_segment_offsets)  delete[] h_segment_offsets;
    if (d_segment_offsets)  CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
    if (h_in)               delete[] h_in;
    if (d_in)               CubDebugExit(g_allocator.DeviceFree(d_in));
}


/// Test different operators
template <
    Backend             BACKEND,
    typename            OutputT,
    typename            HostInputIteratorT,
    typename            DeviceInputIteratorT,
    typename            OffsetT,
    typename            OffsetIteratorT>
void TestByOp(
    HostInputIteratorT      h_in,
    DeviceInputIteratorT    d_in,
    OffsetT                 num_items,
    OffsetT                 num_segments,
    OffsetIteratorT         h_segment_offsets,
    OffsetIteratorT         d_segment_offsets)
{
    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, CustomMax());
    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Sum());
    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Min());
    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, ArgMin());
    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Max());
    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, ArgMax());
}


/// Test different backends
template <
    typename    InputT,
    typename    OutputT,
    typename    OffsetT>
void TestByBackend(
    OffsetT     num_items,
    OffsetT     max_segments,
    GenMode     gen_mode)
{
    // Initialize host data
    printf("\n\nInitializing %d %s -> %s (gen mode %d)... ",
        num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);

    InputT  *h_in               = new InputT[num_items];
    OffsetT *h_segment_offsets  = new OffsetT[max_segments + 1];
    Initialize(gen_mode, h_in, num_items);

    // Initialize device data
    InputT  *d_in               = NULL;
    OffsetT *d_segment_offsets  = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (max_segments + 1)));
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice));

    //
    // Test single-segment implementations
    //

    InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input);

    // Page-aligned-input tests
    TestByOp<CUB, OutputT>(h_in, d_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL);                 // Host-dispatch
#ifdef CUB_CDP
    TestByOp<CUB_CDP, OutputT>(h_in, d_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL);             // Device-dispatch
#endif

    // Non-page-aligned-input tests
    if (num_items > 1)
    {
        InitializeSegments(num_items - 1, 1, h_segment_offsets, g_verbose_input);
        TestByOp<CUB, OutputT>(h_in + 1, d_in + 1, num_items - 1, 1, h_segment_offsets, (OffsetT*) NULL);
    }

    //
    // Test segmented implementation
    //

    // Right now we assign a single thread block to each segment, so let's keep it under 128K items per segment
    int max_items_per_segment = 128000;

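    // Sweep segment counts geometrically (x32 + 1 per step), starting from the
    // minimum count that respects the per-segment cap, up to max_segments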
    for (int num_segments = (num_items + max_items_per_segment - 1) / max_items_per_segment;
        num_segments < max_segments;
        num_segments = (num_segments * 32) + 1)
    {
        // Test with segment pointer
        InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);
        CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));
        TestByOp<CUB_SEGMENTED, OutputT>(
            h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets);

        // Test with segment iterator
        typedef CastOp<OffsetT> IdentityOpT;
        IdentityOpT identity_op;
        TransformInputIterator<OffsetT, IdentityOpT, OffsetT*, OffsetT> h_segment_offsets_itr(
            h_segment_offsets,
            identity_op);
        TransformInputIterator<OffsetT, IdentityOpT, OffsetT*, OffsetT> d_segment_offsets_itr(
            d_segment_offsets,
            identity_op);

        TestByOp<CUB_SEGMENTED, OutputT>(
            h_in, d_in, num_items, num_segments, h_segment_offsets_itr, d_segment_offsets_itr);
    }

    if (h_in)               delete[] h_in;
    if (h_segment_offsets)  delete[] h_segment_offsets;
    if (d_in)               CubDebugExit(g_allocator.DeviceFree(d_in));
    if (d_segment_offsets)  CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
}


/// Test different input-generation modes
template <
    typename InputT,
    typename OutputT,
    typename OffsetT>
void TestByGenMode(
    OffsetT num_items,
    OffsetT max_segments)
{
    //
    // Test pointer support using different input-generation modes
    //

    TestByBackend<InputT, OutputT>(num_items, max_segments, UNIFORM);
    TestByBackend<InputT, OutputT>(num_items, max_segments, INTEGER_SEED);
    TestByBackend<InputT, OutputT>(num_items, max_segments, RANDOM);

    //
    // Test iterator support using a constant-iterator and SUM
    //

    InputT val;
    InitValue(UNIFORM, val, 0);
    ConstantInputIterator<InputT, OffsetT> h_in(val);

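    // Even a single segment needs two offsets: [begin, end)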
    OffsetT *h_segment_offsets = new OffsetT[1 + 1];
    InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input);

    SolveAndTest<CUB, OutputT>(h_in, h_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL, Sum());
#ifdef CUB_CDP
    SolveAndTest<CUB_CDP, OutputT>(h_in, h_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL, Sum());
#endif

    if (h_segment_offsets) delete[] h_segment_offsets;
}


/// Test different problem sizes
template <
    typename InputT,
    typename OutputT,
    typename OffsetT>
struct TestBySize
{
    OffsetT max_items;
    OffsetT max_segments;

    TestBySize(OffsetT max_items, OffsetT max_segments) :
        max_items(max_items),
        max_segments(max_segments)
    {}

    template <typename ActivePolicyT>
    cudaError_t Invoke()
    {
        //
        // Black-box testing on all backends
        //

        // Test 0, 1, many
        TestByGenMode<InputT, OutputT>(0,           max_segments);
        TestByGenMode<InputT, OutputT>(1,           max_segments);
        TestByGenMode<InputT, OutputT>(max_items,   max_segments);

        // Test random problem sizes from a log-distribution [8, max_items-ish)
        int     num_iterations = 8;
        double  max_exp = log(double(max_items)) / log(double(2.0));
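        // Draw exponents uniformly from [3, max_exp) so problem sizes are
        // log-distributed across roughly [8, max_items)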
        for (int i = 0; i < num_iterations; ++i)
        {
            OffsetT num_items = (OffsetT) pow(2.0, RandomValue(max_exp - 3.0) + 3.0);
            TestByGenMode<InputT, OutputT>(num_items, max_segments);
        }

        //
        // White-box testing of single-segment problems around specific sizes
        //

        // Tile-boundaries: multiple blocks, one tile per block
        OffsetT tile_size = ActivePolicyT::ReducePolicy::BLOCK_THREADS * ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD;
        TestProblem<CUB, InputT, OutputT>(tile_size * 4,     1, RANDOM, Sum());
        TestProblem<CUB, InputT, OutputT>(tile_size * 4 + 1, 1, RANDOM, Sum());
        TestProblem<CUB, InputT, OutputT>(tile_size * 4 - 1, 1, RANDOM, Sum());

        // Tile-boundaries: multiple blocks, multiple tiles per block
        OffsetT sm_occupancy = 32;
        OffsetT occupancy = tile_size * sm_occupancy * g_sm_count;
        TestProblem<CUB, InputT, OutputT>(occupancy,     1, RANDOM, Sum());
        TestProblem<CUB, InputT, OutputT>(occupancy + 1, 1, RANDOM, Sum());
        TestProblem<CUB, InputT, OutputT>(occupancy - 1, 1, RANDOM, Sum());

        return cudaSuccess;
    }
};


/// Test problem type
template <
    typename    InputT,
    typename    OutputT,
    typename    OffsetT>
void TestType(
    OffsetT     max_items,
    OffsetT     max_segments)
{
    typedef typename DeviceReducePolicy<InputT, OutputT, OffsetT, cub::Sum>::MaxPolicy MaxPolicyT;

    TestBySize<InputT, OutputT, OffsetT> dispatch(max_items, max_segments);
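
    // The policy chain selects the tuning policy matching the runtime PTX
    // version and calls dispatch.Invoke<ActivePolicyT>() with it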
    MaxPolicyT::Invoke(g_ptx_version, dispatch);
}


//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------

/**
 * Main
 */
int main(int argc, char** argv)
{
    typedef int OffsetT;

    OffsetT max_items       = 27000000;
    OffsetT max_segments    = 34000;

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    g_verbose_input = args.CheckCmdLineFlag("v2");
    args.GetCmdLineArgument("n", max_items);
    args.GetCmdLineArgument("s", max_segments);
    args.GetCmdLineArgument("i", g_timing_iterations);
    args.GetCmdLineArgument("repeat", g_repeat);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--n=<input items>] "
            "[--s=<num segments>] "
            "[--i=<timing iterations>] "
            "[--device=<device-id>] "
            "[--repeat=<repetitions of entire test suite>] "
            "[--v] "
            "[--cdp]"
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());
    g_device_giga_bandwidth = args.device_giga_bandwidth;

    // Get ptx version
    CubDebugExit(PtxVersion(g_ptx_version));

    // Get SM count
    g_sm_count = args.deviceProp.multiProcessorCount;

#ifdef QUICKER_TEST

    // Compile/run basic test

    TestProblem<CUB, char, int>(            max_items, 1, RANDOM_BIT, Sum());
    TestProblem<CUB, short, int>(           max_items, 1, RANDOM_BIT, Sum());

    printf("\n-------------------------------\n");

    TestProblem<CUB, int, int>(             max_items, 1, RANDOM_BIT, Sum());
    TestProblem<CUB, long long, long long>( max_items, 1, RANDOM_BIT, Sum());

    printf("\n-------------------------------\n");

    TestProblem<CUB, float, float>(         max_items, 1, RANDOM_BIT, Sum());
    TestProblem<CUB, double, double>(       max_items, 1, RANDOM_BIT, Sum());

    printf("\n-------------------------------\n");

    TestProblem<CUB_SEGMENTED, int, int>(max_items, max_segments, RANDOM_BIT, Sum());

#elif defined(QUICK_TEST)

    // Compile/run quick comparison tests

    TestProblem<CUB, char, char>(           max_items * 4, 1, UNIFORM, Sum());
    TestProblem<THRUST, char, char>(        max_items * 4, 1, UNIFORM, Sum());

    printf("\n----------------------------\n");
    TestProblem<CUB, short, short>(         max_items * 2, 1, UNIFORM, Sum());
    TestProblem<THRUST, short, short>(      max_items * 2, 1, UNIFORM, Sum());

    printf("\n----------------------------\n");
    TestProblem<CUB, int, int>(             max_items,     1, UNIFORM, Sum());
    TestProblem<THRUST, int, int>(          max_items,     1, UNIFORM, Sum());

    printf("\n----------------------------\n");
    TestProblem<CUB, long long, long long>(    max_items / 2, 1, UNIFORM, Sum());
    TestProblem<THRUST, long long, long long>( max_items / 2, 1, UNIFORM, Sum());

    printf("\n----------------------------\n");
    TestProblem<CUB, TestFoo, TestFoo>(     max_items / 4, 1, UNIFORM, Max());
    TestProblem<THRUST, TestFoo, TestFoo>(  max_items / 4, 1, UNIFORM, Max());

#else

    // Compile/run thorough tests
    for (int i = 0; i <= g_repeat; ++i)
    {
        // Test different input types
        TestType<char, char>(max_items, max_segments);
        TestType<unsigned char, unsigned char>(max_items, max_segments);
        TestType<char, int>(max_items, max_segments);

        TestType<short, short>(max_items, max_segments);
        TestType<int, int>(max_items, max_segments);
        TestType<long, long>(max_items, max_segments);
        TestType<long long, long long>(max_items, max_segments);

        TestType<uchar2, uchar2>(max_items, max_segments);
        TestType<uint2, uint2>(max_items, max_segments);
        TestType<ulonglong2, ulonglong2>(max_items, max_segments);
        TestType<ulonglong4, ulonglong4>(max_items, max_segments);

        TestType<TestFoo, TestFoo>(max_items, max_segments);
        TestType<TestBar, TestBar>(max_items, max_segments);
    }

#endif

    printf("\n");
    return 0;
}