1 /******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
29 /******************************************************************************
30 * Test of DeviceReduce utilities
31 ******************************************************************************/
32
33 // Ensure printing of CUDA runtime errors to console
34 #define CUB_STDERR
35
36 #include <stdio.h>
37 #include <limits>
38 #include <typeinfo>
39
40 #include <thrust/device_ptr.h>
41 #include <thrust/reduce.h>
42
43 #include <cub/util_allocator.cuh>
44 #include <cub/device/device_reduce.cuh>
45 #include <cub/device/device_segmented_reduce.cuh>
46 #include <cub/iterator/constant_input_iterator.cuh>
47 #include <cub/iterator/discard_output_iterator.cuh>
48 #include <cub/iterator/transform_input_iterator.cuh>
49
50 #include "test_util.h"
51
52 using namespace cub;
53
54
55 //---------------------------------------------------------------------
56 // Globals, constants and typedefs
57 //---------------------------------------------------------------------
58
int g_ptx_version;                          // PTX version targeted for the current device
int g_sm_count;                             // Number of SMs on the current device
double g_device_giga_bandwidth;             // Peak device memory bandwidth (GB/s), used for perf reporting
bool g_verbose = false;                     // Whether to print detailed results
bool g_verbose_input = false;               // Whether to echo the generated input data
int g_timing_iterations = 0;                // Number of timed invocations per test configuration
int g_repeat = 0;                           // Number of times to repeat the whole test battery
CachingDeviceAllocator g_allocator(true);   // Caching allocator for device memory (skips re-cudaMalloc across tests)
67
68
69 // Dispatch types
// Dispatch types: which reduction implementation a test run exercises
enum Backend
{
    CUB,            // CUB method (host-side dispatch to DeviceReduce)
    CUB_SEGMENTED,  // CUB segmented method (DeviceSegmentedReduce)
    CUB_CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
    THRUST,         // Thrust method (host-side thrust::reduce reference path)
};
77
78
79 // Custom max functor
80 struct CustomMax
81 {
82 /// Boolean max operator, returns <tt>(a > b) ? a : b</tt>
83 template <typename OutputT>
operator ()CustomMax84 __host__ __device__ __forceinline__ OutputT operator()(const OutputT &a, const OutputT &b)
85 {
86 return CUB_MAX(a, b);
87 }
88 };
89
90
91 //---------------------------------------------------------------------
92 // Dispatch to different CUB DeviceReduce entrypoints
93 //---------------------------------------------------------------------
94
95 /**
96 * Dispatch to reduce entrypoint (custom-max)
97 */
/**
 * Dispatch to DeviceReduce::Reduce entrypoint (custom-max operator).
 * Repeats the reduction timing_iterations times and returns the last status.
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       /*dispatch_to*/,
    int                 timing_iterations,
    size_t              * /*d_temp_storage_bytes*/,
    cudaError_t         * /*d_cdp_error*/,

    void                *d_temp_storage,
    size_t              &temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 /*max_segments*/,
    OffsetIteratorT     /*d_segment_offsets*/,
    ReductionOpT        reduction_op,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

    // The output value type: fall back to the input iterator's value type when
    // the output iterator's value type is void (e.g., discard/raw-pointer-to-void)
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),
        typename std::iterator_traits<InputIteratorT>::value_type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;

    // Identity for the max reduction
    // (replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent)
    OutputT identity = Traits<InputT>::Lowest();

    // Invoke the device-wide reduction directly
    cudaError_t status = cudaSuccess;
    for (int iter = 0; iter < timing_iterations; ++iter)
    {
        status = DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, reduction_op, identity,
            stream, debug_synchronous);
    }
    return status;
}
138
139 /**
140 * Dispatch to sum entrypoint
141 */
/**
 * Dispatch to DeviceReduce::Sum entrypoint.
 * Repeats the reduction timing_iterations times and returns the last status.
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       /*dispatch_to*/,
    int                 timing_iterations,
    size_t              * /*d_temp_storage_bytes*/,
    cudaError_t         * /*d_cdp_error*/,

    void                *d_temp_storage,
    size_t              &temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 /*max_segments*/,
    OffsetIteratorT     /*d_segment_offsets*/,
    cub::Sum            /*reduction_op*/,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke the device-wide sum directly
    cudaError_t status = cudaSuccess;
    for (int iter = 0; iter < timing_iterations; ++iter)
    {
        status = DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, stream, debug_synchronous);
    }
    return status;
}
170
171 /**
172 * Dispatch to min entrypoint
173 */
/**
 * Dispatch to DeviceReduce::Min entrypoint.
 * Repeats the reduction timing_iterations times and returns the last status.
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       /*dispatch_to*/,
    int                 timing_iterations,
    size_t              * /*d_temp_storage_bytes*/,
    cudaError_t         * /*d_cdp_error*/,

    void                *d_temp_storage,
    size_t              &temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 /*max_segments*/,
    OffsetIteratorT     /*d_segment_offsets*/,
    cub::Min            /*reduction_op*/,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke the device-wide min directly
    cudaError_t status = cudaSuccess;
    for (int iter = 0; iter < timing_iterations; ++iter)
    {
        status = DeviceReduce::Min(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, stream, debug_synchronous);
    }
    return status;
}
202
203 /**
204 * Dispatch to max entrypoint
205 */
/**
 * Dispatch to DeviceReduce::Max entrypoint.
 * Repeats the reduction timing_iterations times and returns the last status.
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       /*dispatch_to*/,
    int                 timing_iterations,
    size_t              * /*d_temp_storage_bytes*/,
    cudaError_t         * /*d_cdp_error*/,

    void                *d_temp_storage,
    size_t              &temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 /*max_segments*/,
    OffsetIteratorT     /*d_segment_offsets*/,
    cub::Max            /*reduction_op*/,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke the device-wide max directly
    cudaError_t status = cudaSuccess;
    for (int iter = 0; iter < timing_iterations; ++iter)
    {
        status = DeviceReduce::Max(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, stream, debug_synchronous);
    }
    return status;
}
234
235 /**
236 * Dispatch to argmin entrypoint
237 */
/**
 * Dispatch to DeviceReduce::ArgMin entrypoint.
 * Repeats the reduction timing_iterations times and returns the last status.
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       /*dispatch_to*/,
    int                 timing_iterations,
    size_t              * /*d_temp_storage_bytes*/,
    cudaError_t         * /*d_cdp_error*/,

    void                *d_temp_storage,
    size_t              &temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 /*max_segments*/,
    OffsetIteratorT     /*d_segment_offsets*/,
    cub::ArgMin         /*reduction_op*/,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke the device-wide argmin directly
    cudaError_t status = cudaSuccess;
    for (int iter = 0; iter < timing_iterations; ++iter)
    {
        status = DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, stream, debug_synchronous);
    }
    return status;
}
266
267 /**
268 * Dispatch to argmax entrypoint
269 */
/**
 * Dispatch to DeviceReduce::ArgMax entrypoint.
 * Repeats the reduction timing_iterations times and returns the last status.
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       /*dispatch_to*/,
    int                 timing_iterations,
    size_t              * /*d_temp_storage_bytes*/,
    cudaError_t         * /*d_cdp_error*/,

    void                *d_temp_storage,
    size_t              &temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 /*max_segments*/,
    OffsetIteratorT     /*d_segment_offsets*/,
    cub::ArgMax         /*reduction_op*/,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke the device-wide argmax directly
    cudaError_t status = cudaSuccess;
    for (int iter = 0; iter < timing_iterations; ++iter)
    {
        status = DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, stream, debug_synchronous);
    }
    return status;
}
298
299
300 //---------------------------------------------------------------------
301 // Dispatch to different CUB DeviceSegmentedReduce entrypoints
302 //---------------------------------------------------------------------
303
304 /**
305 * Dispatch to reduce entrypoint (custom-max)
306 */
/**
 * Dispatch to DeviceSegmentedReduce::Reduce entrypoint (custom-max operator).
 * Segment i spans [d_segment_offsets[i], d_segment_offsets[i + 1]).
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
    int                 timing_iterations,
    size_t              * /*d_temp_storage_bytes*/,
    cudaError_t         * /*d_cdp_error*/,

    void                *d_temp_storage,
    size_t              &temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 /*num_items*/,
    int                 max_segments,
    OffsetIteratorT     d_segment_offsets,
    ReductionOpT        reduction_op,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

    // The output value type: fall back to the input iterator's value type when
    // the output iterator's value type is void
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),
        typename std::iterator_traits<InputIteratorT>::value_type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;

    // Identity for the max reduction
    // (replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent)
    OutputT identity = Traits<InputT>::Lowest();

    // Invoke the segmented reduction directly
    cudaError_t status = cudaSuccess;
    for (int iter = 0; iter < timing_iterations; ++iter)
    {
        status = DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
            reduction_op, identity, stream, debug_synchronous);
    }
    return status;
}
347
348 /**
349 * Dispatch to sum entrypoint
350 */
/**
 * Dispatch to DeviceSegmentedReduce::Sum entrypoint.
 * Segment i spans [d_segment_offsets[i], d_segment_offsets[i + 1]).
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
    int                 timing_iterations,
    size_t              * /*d_temp_storage_bytes*/,
    cudaError_t         * /*d_cdp_error*/,

    void                *d_temp_storage,
    size_t              &temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 /*num_items*/,
    int                 max_segments,
    OffsetIteratorT     d_segment_offsets,
    cub::Sum            /*reduction_op*/,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke the segmented sum directly
    cudaError_t status = cudaSuccess;
    for (int iter = 0; iter < timing_iterations; ++iter)
    {
        status = DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
            stream, debug_synchronous);
    }
    return status;
}
380
381 /**
382 * Dispatch to min entrypoint
383 */
/**
 * Dispatch to DeviceSegmentedReduce::Min entrypoint.
 * Segment i spans [d_segment_offsets[i], d_segment_offsets[i + 1]).
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
    int                 timing_iterations,
    size_t              * /*d_temp_storage_bytes*/,
    cudaError_t         * /*d_cdp_error*/,

    void                *d_temp_storage,
    size_t              &temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 /*num_items*/,
    int                 max_segments,
    OffsetIteratorT     d_segment_offsets,
    cub::Min            /*reduction_op*/,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke the segmented min directly
    cudaError_t status = cudaSuccess;
    for (int iter = 0; iter < timing_iterations; ++iter)
    {
        status = DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
            stream, debug_synchronous);
    }
    return status;
}
413
414 /**
415 * Dispatch to max entrypoint
416 */
/**
 * Dispatch to DeviceSegmentedReduce::Max entrypoint.
 * Segment i spans [d_segment_offsets[i], d_segment_offsets[i + 1]).
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
    int                 timing_iterations,
    size_t              * /*d_temp_storage_bytes*/,
    cudaError_t         * /*d_cdp_error*/,

    void                *d_temp_storage,
    size_t              &temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 /*num_items*/,
    int                 max_segments,
    OffsetIteratorT     d_segment_offsets,
    cub::Max            /*reduction_op*/,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke the segmented max directly
    cudaError_t status = cudaSuccess;
    for (int iter = 0; iter < timing_iterations; ++iter)
    {
        status = DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
            stream, debug_synchronous);
    }
    return status;
}
446
447 /**
448 * Dispatch to argmin entrypoint
449 */
/**
 * Dispatch to DeviceSegmentedReduce::ArgMin entrypoint.
 * Segment i spans [d_segment_offsets[i], d_segment_offsets[i + 1]).
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
    int                 timing_iterations,
    size_t              * /*d_temp_storage_bytes*/,
    cudaError_t         * /*d_cdp_error*/,

    void                *d_temp_storage,
    size_t              &temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 /*num_items*/,
    int                 max_segments,
    OffsetIteratorT     d_segment_offsets,
    cub::ArgMin         /*reduction_op*/,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke the segmented argmin directly
    cudaError_t status = cudaSuccess;
    for (int iter = 0; iter < timing_iterations; ++iter)
    {
        status = DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
            stream, debug_synchronous);
    }
    return status;
}
479
480 /**
481 * Dispatch to argmax entrypoint
482 */
/**
 * Dispatch to DeviceSegmentedReduce::ArgMax entrypoint.
 * Segment i spans [d_segment_offsets[i], d_segment_offsets[i + 1]).
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
    int                 timing_iterations,
    size_t              * /*d_temp_storage_bytes*/,
    cudaError_t         * /*d_cdp_error*/,

    void                *d_temp_storage,
    size_t              &temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 /*num_items*/,
    int                 max_segments,
    OffsetIteratorT     d_segment_offsets,
    cub::ArgMax         /*reduction_op*/,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke the segmented argmax directly
    cudaError_t status = cudaSuccess;
    for (int iter = 0; iter < timing_iterations; ++iter)
    {
        status = DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
            stream, debug_synchronous);
    }
    return status;
}
512
513
514 //---------------------------------------------------------------------
515 // Dispatch to different Thrust entrypoints
516 //---------------------------------------------------------------------
517
518 /**
519 * Dispatch to reduction entrypoint (min or max specialization)
520 */
/**
 * Dispatch to Thrust reduction entrypoint (min or max specialization).
 *
 * On the size-inquiry pass (d_temp_storage == NULL) this only reports a token
 * 1-byte allocation, since Thrust manages its own temporaries. On the work
 * pass it seeds the reduction with the first input element and writes the
 * result to d_out (unless d_out is a discard iterator).
 *
 * Note: the stream and debug_synchronous arguments are ignored; thrust::reduce
 * runs on its default execution policy here.
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
cudaError_t Dispatch(
    Int2Type<THRUST> /*dispatch_to*/,
    int timing_iterations,
    size_t */*d_temp_storage_bytes*/,
    cudaError_t */*d_cdp_error*/,

    void* d_temp_storage,
    size_t& temp_storage_bytes,
    InputIteratorT d_in,
    OutputIteratorT d_out,
    int num_items,
    int /*max_segments*/,
    OffsetIteratorT /*d_segment_offsets*/,
    ReductionOpT reduction_op,
    cudaStream_t /*stream*/,
    bool /*debug_synchronous*/)
{
    // The output value type: fall back to the input iterator's value type when
    // the output iterator's value type is void
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),
        typename std::iterator_traits<InputIteratorT>::value_type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;

    if (d_temp_storage == 0)
    {
        // Size-inquiry pass: request a token allocation
        temp_storage_bytes = 1;
    }
    else
    {
        // Seed the reduction with the first input element
        OutputT init;
        CubDebugExit(cudaMemcpy(&init, d_in + 0, sizeof(OutputT), cudaMemcpyDeviceToHost));

        thrust::device_ptr<OutputT> d_in_wrapper(d_in);

        // Initialize retval so that a zero-iteration timing run does not copy
        // an indeterminate value to d_out below (previously uninitialized).
        OutputT retval = init;
        for (int i = 0; i < timing_iterations; ++i)
        {
            retval = thrust::reduce(d_in_wrapper, d_in_wrapper + num_items, init, reduction_op);
        }

        if (!Equals<OutputIteratorT, DiscardOutputIterator<int> >::VALUE)
            CubDebugExit(cudaMemcpy(d_out, &retval, sizeof(OutputT), cudaMemcpyHostToDevice));
    }

    return cudaSuccess;
}
566
567 /**
568 * Dispatch to reduction entrypoint (sum specialization)
569 */
/**
 * Dispatch to Thrust reduction entrypoint (sum specialization).
 *
 * On the size-inquiry pass (d_temp_storage == NULL) this only reports a token
 * 1-byte allocation, since Thrust manages its own temporaries. On the work
 * pass it sums the input and writes the result to d_out (unless d_out is a
 * discard iterator).
 *
 * Note: the stream and debug_synchronous arguments are ignored; thrust::reduce
 * runs on its default execution policy here.
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
cudaError_t Dispatch(
    Int2Type<THRUST> /*dispatch_to*/,
    int timing_iterations,
    size_t */*d_temp_storage_bytes*/,
    cudaError_t */*d_cdp_error*/,

    void* d_temp_storage,
    size_t& temp_storage_bytes,
    InputIteratorT d_in,
    OutputIteratorT d_out,
    int num_items,
    int /*max_segments*/,
    OffsetIteratorT /*d_segment_offsets*/,
    Sum /*reduction_op*/,
    cudaStream_t /*stream*/,
    bool /*debug_synchronous*/)
{
    // The output value type: fall back to the input iterator's value type when
    // the output iterator's value type is void
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),
        typename std::iterator_traits<InputIteratorT>::value_type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;

    if (d_temp_storage == 0)
    {
        // Size-inquiry pass: request a token allocation
        temp_storage_bytes = 1;
    }
    else
    {
        thrust::device_ptr<OutputT> d_in_wrapper(d_in);

        // Value-initialize retval so that a zero-iteration timing run does not
        // copy an indeterminate value to d_out below (previously uninitialized).
        OutputT retval = OutputT();
        for (int i = 0; i < timing_iterations; ++i)
        {
            retval = thrust::reduce(d_in_wrapper, d_in_wrapper + num_items);
        }

        if (!Equals<OutputIteratorT, DiscardOutputIterator<int> >::VALUE)
            CubDebugExit(cudaMemcpy(d_out, &retval, sizeof(OutputT), cudaMemcpyHostToDevice));
    }

    return cudaSuccess;
}
612
613
614 //---------------------------------------------------------------------
615 // CUDA nested-parallelism test kernel
616 //---------------------------------------------------------------------
617
618 /**
619 * Simple wrapper kernel to invoke DeviceReduce
620 */
/**
 * Simple single-thread wrapper kernel to invoke DeviceReduce from device code
 * (CUDA dynamic parallelism). Launched as <<<1,1>>>; writes the dispatch
 * status to *d_cdp_error and the required temp-storage size to
 * *d_temp_storage_bytes. When CUB_CDP is not defined, device-side dispatch is
 * unsupported and cudaErrorNotSupported is reported instead.
 */
template <
    typename            InputIteratorT,
    typename            OutputIteratorT,
    typename            OffsetIteratorT,
    typename            ReductionOpT>
__global__ void CnpDispatchKernel(
    int                 timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t              temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 max_segments,
    OffsetIteratorT     d_segment_offsets,
    ReductionOpT        reduction_op,
    bool                debug_synchronous)
{
#ifndef CUB_CDP
    // Dynamic parallelism not compiled in: silence unused-parameter warnings
    // and report the failure through the device-side error slot.
    (void)timing_iterations;
    (void)d_temp_storage_bytes;
    (void)d_cdp_error;
    (void)d_temp_storage;
    (void)temp_storage_bytes;
    (void)d_in;
    (void)d_out;
    (void)num_items;
    (void)max_segments;
    (void)d_segment_offsets;
    (void)reduction_op;
    (void)debug_synchronous;
    *d_cdp_error = cudaErrorNotSupported;
#else
    // Device-side dispatch through the host-style CUB entrypoint (stream 0)
    *d_cdp_error = Dispatch(Int2Type<CUB>(), timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
        d_in, d_out, num_items, max_segments, d_segment_offsets, reduction_op, 0, debug_synchronous);
    // Publish the temp-storage requirement for the host to copy back
    *d_temp_storage_bytes = temp_storage_bytes;
#endif
}
661
662
663 /**
664 * Dispatch to CUB_CDP kernel
665 */
/**
 * Dispatch to the CUB_CDP backend: launches CnpDispatchKernel on the device,
 * then copies back the temp-storage requirement and the device-side dispatch
 * status. The blocking cudaMemcpy calls also serve to synchronize with the
 * single-thread wrapper kernel before its results are read.
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_CDP> dispatch_to,
    int timing_iterations,
    size_t *d_temp_storage_bytes,
    cudaError_t *d_cdp_error,

    void* d_temp_storage,
    size_t& temp_storage_bytes,
    InputIteratorT d_in,
    OutputIteratorT d_out,
    int num_items,
    int max_segments,
    OffsetIteratorT d_segment_offsets,
    ReductionOpT reduction_op,
    cudaStream_t stream,
    bool debug_synchronous)
{
    // Invoke kernel to invoke device-side dispatch
    CnpDispatchKernel<<<1,1>>>(timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
        d_in, d_out, num_items, max_segments, d_segment_offsets, reduction_op, debug_synchronous);

    // Copy out temp_storage_bytes (blocking copy synchronizes with the kernel)
    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));

    // Copy out the device-side dispatch status
    cudaError_t retval;
    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
    return retval;
}
697
698
699
700 //---------------------------------------------------------------------
701 // Problem generation
702 //---------------------------------------------------------------------
703
704 /// Initialize problem
705 template <typename InputT>
Initialize(GenMode gen_mode,InputT * h_in,int num_items)706 void Initialize(
707 GenMode gen_mode,
708 InputT *h_in,
709 int num_items)
710 {
711 for (int i = 0; i < num_items; ++i)
712 {
713 InitValue(gen_mode, h_in[i], i);
714 }
715
716 if (g_verbose_input)
717 {
718 printf("Input:\n");
719 DisplayResults(h_in, num_items);
720 printf("\n\n");
721 }
722 }
723
724
725 /// Solve problem (max/custom-max functor)
726 template <typename ReductionOpT, typename InputT, typename _OutputT>
727 struct Solution
728 {
729 typedef _OutputT OutputT;
730
731 template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
SolveSolution732 static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
733 ReductionOpT reduction_op)
734 {
735 for (int i = 0; i < num_segments; ++i)
736 {
737 OutputT aggregate = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
738 for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
739 aggregate = reduction_op(aggregate, OutputT(h_in[j]));
740 h_reference[i] = aggregate;
741 }
742 }
743 };
744
745 /// Solve problem (min functor)
746 template <typename InputT, typename _OutputT>
747 struct Solution<cub::Min, InputT, _OutputT>
748 {
749 typedef _OutputT OutputT;
750
751 template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
SolveSolution752 static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
753 cub::Min reduction_op)
754 {
755 for (int i = 0; i < num_segments; ++i)
756 {
757 OutputT aggregate = Traits<InputT>::Max(); // replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent
758 for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
759 aggregate = reduction_op(aggregate, OutputT(h_in[j]));
760 h_reference[i] = aggregate;
761 }
762 }
763 };
764
765
/// Solve problem (sum functor): host-side reference computation
template <typename InputT, typename _OutputT>
struct Solution<cub::Sum, InputT, _OutputT>
{
    typedef _OutputT OutputT;

    /// Compute per-segment reference sums; segment i spans
    /// [h_segment_offsets[i], h_segment_offsets[i + 1])
    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
        cub::Sum reduction_op)
    {
        for (int seg = 0; seg < num_segments; ++seg)
        {
            // Seed with zero (via the test-harness initializer)
            OutputT aggregate;
            InitValue(INTEGER_SEED, aggregate, 0);
            for (int item = h_segment_offsets[seg]; item < h_segment_offsets[seg + 1]; ++item)
                aggregate = reduction_op(aggregate, OutputT(h_in[item]));
            h_reference[seg] = aggregate;
        }
    }
};
786
787 /// Solve problem (argmin functor)
788 template <typename InputValueT, typename OutputValueT>
789 struct Solution<cub::ArgMin, InputValueT, OutputValueT>
790 {
791 typedef KeyValuePair<int, OutputValueT> OutputT;
792
793 template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
SolveSolution794 static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
795 cub::ArgMin reduction_op)
796 {
797 for (int i = 0; i < num_segments; ++i)
798 {
799 OutputT aggregate(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent
800 for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
801 {
802 OutputT item(j - h_segment_offsets[i], OutputValueT(h_in[j]));
803 aggregate = reduction_op(aggregate, item);
804 }
805 h_reference[i] = aggregate;
806 }
807 }
808 };
809
810
811 /// Solve problem (argmax functor)
812 template <typename InputValueT, typename OutputValueT>
813 struct Solution<cub::ArgMax, InputValueT, OutputValueT>
814 {
815 typedef KeyValuePair<int, OutputValueT> OutputT;
816
817 template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
SolveSolution818 static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
819 cub::ArgMax reduction_op)
820 {
821 for (int i = 0; i < num_segments; ++i)
822 {
823 OutputT aggregate(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
824 for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
825 {
826 OutputT item(j - h_segment_offsets[i], OutputValueT(h_in[j]));
827 aggregate = reduction_op(aggregate, item);
828 }
829 h_reference[i] = aggregate;
830 }
831 }
832 };
833
834
835 //---------------------------------------------------------------------
836 // Problem generation
837 //---------------------------------------------------------------------
838
/// Test DeviceReduce for a given problem input.
///
/// Queries temp-storage requirements, runs one warmup/correctness pass that is
/// compared against h_reference, then (if g_timing_iterations > 0) times
/// back-to-back runs and reports throughput. Asserts on mismatch at the end.
template <
    typename BackendT,
    typename DeviceInputIteratorT,
    typename DeviceOutputIteratorT,
    typename HostReferenceIteratorT,
    typename OffsetT,
    typename OffsetIteratorT,
    typename ReductionOpT>
void Test(
    BackendT backend,
    DeviceInputIteratorT d_in,
    DeviceOutputIteratorT d_out,
    OffsetT num_items,
    OffsetT num_segments,
    OffsetIteratorT d_segment_offsets,
    ReductionOpT reduction_op,
    HostReferenceIteratorT h_reference)
{
    // Value type read from the input iterator (used for bandwidth accounting)
    typedef typename std::iterator_traits<DeviceInputIteratorT>::value_type InputT;

    // Device-side scratch required by the CUB_CDP (device-launch) dispatch path
    size_t      *d_temp_storage_bytes = NULL;
    cudaError_t *d_cdp_error          = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));

    // First dispatch (d_temp_storage == NULL) only sizes the temporary storage
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    CubDebugExit(Dispatch(backend, 1,
        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
        d_in, d_out, num_items, num_segments, d_segment_offsets,
        reduction_op, 0, true));

    // Allocate the requested temporary storage
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

    // Warmup pass doubles as the correctness run
    CubDebugExit(Dispatch(backend, 1,
        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
        d_in, d_out, num_items, num_segments, d_segment_offsets,
        reduction_op, 0, true));

    // Compare device output against the host reference (display if verbose)
    int compare = CompareDeviceResults(h_reference, d_out, num_segments, g_verbose, g_verbose);
    printf("\t%s", compare ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Optional performance measurement
    if (g_timing_iterations > 0)
    {
        GpuTimer gpu_timer;
        gpu_timer.Start();

        CubDebugExit(Dispatch(backend, g_timing_iterations,
            d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, num_segments, d_segment_offsets,
            reduction_op, 0, false));

        gpu_timer.Stop();
        float elapsed_millis = gpu_timer.ElapsedMillis();

        // Report average latency, item rate, and achieved bandwidth vs. peak
        float avg_millis     = elapsed_millis / g_timing_iterations;
        float giga_rate      = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        float giga_bandwidth = giga_rate * sizeof(InputT);
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak",
            avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
    }

    // Release device scratch
    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    // Correctness asserts
    AssertEquals(0, compare);
}
922
923
/// Test DeviceReduce: compute a sequential host reference for the problem,
/// then validate the device results produced by the requested backend.
template <
    Backend BACKEND,
    typename OutputValueT,
    typename HostInputIteratorT,
    typename DeviceInputIteratorT,
    typename OffsetT,
    typename OffsetIteratorT,
    typename ReductionOpT>
void SolveAndTest(
    HostInputIteratorT h_in,
    DeviceInputIteratorT d_in,
    OffsetT num_items,
    OffsetT num_segments,
    OffsetIteratorT h_segment_offsets,
    OffsetIteratorT d_segment_offsets,
    ReductionOpT reduction_op)
{
    typedef typename std::iterator_traits<DeviceInputIteratorT>::value_type InputValueT;
    typedef Solution<ReductionOpT, InputValueT, OutputValueT>               SolutionT;
    typedef typename SolutionT::OutputT                                     OutputT;

    // Announce the test configuration
    printf("\n\n%s cub::DeviceReduce<%s> %d items (%s), %d segments\n",
        (BACKEND == CUB_CDP) ? "CUB_CDP" : (BACKEND == THRUST) ? "Thrust" : (BACKEND == CUB_SEGMENTED) ? "CUB_SEGMENTED" : "CUB",
        typeid(ReductionOpT).name(), num_items, typeid(HostInputIteratorT).name(), num_segments);
    fflush(stdout);

    // Sequential host-side reference
    OutputT *h_reference = new OutputT[num_segments];
    SolutionT::Solve(h_in, h_reference, num_segments, h_segment_offsets, reduction_op);

    // Device output buffer, zeroed so stale data can't mask a failure
    OutputT *d_out = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(OutputT) * num_segments));
    CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_segments));
    Test(Int2Type<BACKEND>(), d_in, d_out, num_items, num_segments, d_segment_offsets, reduction_op, h_reference);

    // Cleanup
    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
    if (h_reference) delete[] h_reference;
}
969
970
/// Test specific problem type: generate host input/segments, mirror them onto
/// the device, and run SolveAndTest for the given backend and operator.
template <
    Backend BACKEND,
    typename InputT,
    typename OutputT,
    typename OffsetT,
    typename ReductionOpT>
void TestProblem(
    OffsetT num_items,
    OffsetT num_segments,
    GenMode gen_mode,
    ReductionOpT reduction_op)
{
    printf("\n\nInitializing %d %s->%s (gen mode %d)... ", num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode);
    fflush(stdout);

    // Host-side input values
    InputT *h_in = new InputT[num_items];
    Initialize(gen_mode, h_in, num_items);

    // Host-side segment boundaries (num_segments + 1 offsets)
    OffsetT *h_segment_offsets = new OffsetT[num_segments + 1];
    InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);

    // Device mirrors of the input and offsets
    InputT  *d_in              = NULL;
    OffsetT *d_segment_offsets = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (num_segments + 1)));
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));

    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, reduction_op);

    // Cleanup
    if (h_segment_offsets) delete[] h_segment_offsets;
    if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
    if (h_in) delete[] h_in;
    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
}
1010
1011
/// Test different operators: runs the same problem through a user-defined
/// functor, the built-in value reductions, and the key-value (arg) variants.
template <
    Backend BACKEND,
    typename OutputT,
    typename HostInputIteratorT,
    typename DeviceInputIteratorT,
    typename OffsetT,
    typename OffsetIteratorT>
void TestByOp(
    HostInputIteratorT h_in,
    DeviceInputIteratorT d_in,
    OffsetT num_items,
    OffsetT num_segments,
    OffsetIteratorT h_segment_offsets,
    OffsetIteratorT d_segment_offsets)
{
    // User-defined max functor
    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, CustomMax());

    // Built-in value reductions
    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Sum());
    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Min());

    // Key-value (index-reporting) reductions interleaved with their value twins
    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, ArgMin());
    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Max());
    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, ArgMax());
}
1035
1036
/// Test different backends: one host input is generated and reused for the
/// single-segment (full-problem) dispatchers and the segmented dispatcher,
/// with both pointer- and iterator-typed segment offsets.
template <
    typename InputT,
    typename OutputT,
    typename OffsetT>
void TestByBackend(
    OffsetT num_items,
    OffsetT max_segments,
    GenMode gen_mode)
{
    // Generate host input once
    printf("\n\nInitializing %d %s -> %s (gen mode %d)... ",
        num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);

    InputT  *h_in              = new InputT[num_items];
    OffsetT *h_segment_offsets = new OffsetT[max_segments + 1];
    Initialize(gen_mode, h_in, num_items);

    // Mirror the input on the device (offsets copied per-iteration below)
    InputT  *d_in              = NULL;
    OffsetT *d_segment_offsets = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (max_segments + 1)));
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice));

    //
    // Single-segment implementations
    //

    InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input);

    // Page-aligned input
    TestByOp<CUB, OutputT>(h_in, d_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL);        // Host-dispatch
#ifdef CUB_CDP
    TestByOp<CUB_CDP, OutputT>(h_in, d_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL);    // Device-dispatch
#endif

    // Non-page-aligned input (skip the first item)
    if (num_items > 1)
    {
        InitializeSegments(num_items - 1, 1, h_segment_offsets, g_verbose_input);
        TestByOp<CUB, OutputT>(h_in + 1, d_in + 1, num_items - 1, 1, h_segment_offsets, (OffsetT*) NULL);
    }

    //
    // Segmented implementation
    //

    // One thread block is assigned per segment, so keep segments under 128K items apiece
    int max_items_per_segment = 128000;

    for (int num_segments = (num_items + max_items_per_segment - 1) / max_items_per_segment;
        num_segments < max_segments;
        num_segments = (num_segments * 32) + 1)
    {
        // Raw segment-offset pointers
        InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);
        CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));
        TestByOp<CUB_SEGMENTED, OutputT>(
            h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets);

        // Segment-offset iterators (identity transform wrapped around the same pointers)
        typedef CastOp<OffsetT> IdentityOpT;
        IdentityOpT identity_op;
        TransformInputIterator<OffsetT, IdentityOpT, OffsetT*, OffsetT> h_segment_offsets_itr(
            h_segment_offsets,
            identity_op);
        TransformInputIterator<OffsetT, IdentityOpT, OffsetT*, OffsetT> d_segment_offsets_itr(
            d_segment_offsets,
            identity_op);

        TestByOp<CUB_SEGMENTED, OutputT>(
            h_in, d_in, num_items, num_segments, h_segment_offsets_itr, d_segment_offsets_itr);
    }

    // Cleanup
    if (h_in) delete[] h_in;
    if (h_segment_offsets) delete[] h_segment_offsets;
    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
    if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
}
1117
1118
/// Test different input-generation modes, plus iterator-typed input.
template <
    typename InputT,
    typename OutputT,
    typename OffsetT>
void TestByGenMode(
    OffsetT num_items,
    OffsetT max_segments)
{
    //
    // Pointer-based input across all generation modes
    //

    TestByBackend<InputT, OutputT>(num_items, max_segments, UNIFORM);
    TestByBackend<InputT, OutputT>(num_items, max_segments, INTEGER_SEED);
    TestByBackend<InputT, OutputT>(num_items, max_segments, RANDOM);

    //
    // Iterator-based input: a constant iterator reduced with SUM
    //

    InputT val;
    InitValue(UNIFORM, val, 0);
    ConstantInputIterator<InputT, OffsetT> h_in(val);

    // Single segment covering the whole problem
    OffsetT *h_segment_offsets = new OffsetT[1 + 1];
    InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input);

    SolveAndTest<CUB, OutputT>(h_in, h_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL, Sum());
#ifdef CUB_CDP
    SolveAndTest<CUB_CDP, OutputT>(h_in, h_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL, Sum());
#endif

    if (h_segment_offsets) delete[] h_segment_offsets;
}
1154
1155
/// Test different problem sizes.
///
/// Functor invoked by the tuning-policy dispatcher so that Invoke() sees the
/// kernel policy active for the current PTX version (used to derive the exact
/// tile size for white-box boundary tests).
template <
    typename InputT,
    typename OutputT,
    typename OffsetT>
struct TestBySize
{
    OffsetT max_items;      // largest problem size to generate
    OffsetT max_segments;   // largest segment count to generate

    TestBySize(OffsetT max_items, OffsetT max_segments) :
        max_items(max_items),
        max_segments(max_segments)
    {}

    template <typename ActivePolicyT>
    cudaError_t Invoke()
    {
        //
        // Black-box testing on all backends
        //

        // Degenerate and maximal sizes: 0, 1, many
        TestByGenMode<InputT, OutputT>(0, max_segments);
        TestByGenMode<InputT, OutputT>(1, max_segments);
        TestByGenMode<InputT, OutputT>(max_items, max_segments);

        // Random problem sizes drawn from a log-distribution [8, max_items-ish)
        int num_iterations = 8;
        double max_exp = log(double(max_items)) / log(double(2.0));
        for (int i = 0; i < num_iterations; ++i)
        {
            OffsetT num_items = (OffsetT) pow(2.0, RandomValue(max_exp - 3.0) + 3.0);
            TestByGenMode<InputT, OutputT>(num_items, max_segments);
        }

        //
        // White-box testing of single-segment problems around specific sizes
        //

        // Tile boundaries: multiple blocks, one tile per block
        OffsetT tile_size = ActivePolicyT::ReducePolicy::BLOCK_THREADS * ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD;
        TestProblem<CUB, InputT, OutputT>(tile_size * 4,     1, RANDOM, Sum());
        TestProblem<CUB, InputT, OutputT>(tile_size * 4 + 1, 1, RANDOM, Sum());
        TestProblem<CUB, InputT, OutputT>(tile_size * 4 - 1, 1, RANDOM, Sum());

        // Occupancy boundaries: multiple blocks, multiple tiles per block
        OffsetT sm_occupancy = 32;
        OffsetT occupancy = tile_size * sm_occupancy * g_sm_count;
        TestProblem<CUB, InputT, OutputT>(occupancy,     1, RANDOM, Sum());
        TestProblem<CUB, InputT, OutputT>(occupancy + 1, 1, RANDOM, Sum());
        TestProblem<CUB, InputT, OutputT>(occupancy - 1, 1, RANDOM, Sum());

        return cudaSuccess;
    }
};
1212
1213
/// Test problem type: routes through the tuned policy chain so that
/// TestBySize::Invoke runs with the policy matching the device's PTX version.
template <
    typename InputT,
    typename OutputT,
    typename OffsetT>
void TestType(
    OffsetT max_items,
    OffsetT max_segments)
{
    typedef typename DeviceReducePolicy<InputT, OutputT, OffsetT, cub::Sum>::MaxPolicy MaxPolicyT;

    TestBySize<InputT, OutputT, OffsetT> dispatch(max_items, max_segments);

    MaxPolicyT::Invoke(g_ptx_version, dispatch);
}
1229
1230
1231 //---------------------------------------------------------------------
1232 // Main
1233 //---------------------------------------------------------------------
1234
1235
/**
 * Main
 *
 * Parses command-line options (--n, --s, --i, --repeat, --v/--v2, --device),
 * initializes the device, and runs one of three suites selected at compile
 * time: QUICKER_TEST (basic), QUICK_TEST (CUB-vs-Thrust comparison), or the
 * thorough type sweep.
 *
 * Fix: the --help usage string had unbalanced brackets ("[--n=<input items> "
 * etc. never closed their '[') and no separator before "[--v]"; the brackets
 * are now balanced and space-separated.
 */
int main(int argc, char** argv)
{
    typedef int OffsetT;

    // Default problem-size caps (overridable via --n / --s)
    OffsetT max_items       = 27000000;
    OffsetT max_segments    = 34000;

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    g_verbose_input = args.CheckCmdLineFlag("v2");
    args.GetCmdLineArgument("n", max_items);
    args.GetCmdLineArgument("s", max_segments);
    args.GetCmdLineArgument("i", g_timing_iterations);
    args.GetCmdLineArgument("repeat", g_repeat);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--n=<input items>] "
            "[--s=<num segments>] "
            "[--i=<timing iterations>] "
            "[--device=<device-id>] "
            "[--repeat=<repetitions of entire test suite>] "
            "[--v] "
            "[--cdp]"
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());
    g_device_giga_bandwidth = args.device_giga_bandwidth;

    // Get ptx version
    CubDebugExit(PtxVersion(g_ptx_version));

    // Get SM count
    g_sm_count = args.deviceProp.multiProcessorCount;

#ifdef QUICKER_TEST

    // Compile/run basic test

    TestProblem<CUB, char, int>(            max_items, 1, RANDOM_BIT, Sum());
    TestProblem<CUB, short, int>(           max_items, 1, RANDOM_BIT, Sum());

    printf("\n-------------------------------\n");

    TestProblem<CUB, int, int>(             max_items, 1, RANDOM_BIT, Sum());
    TestProblem<CUB, long long, long long>( max_items, 1, RANDOM_BIT, Sum());

    printf("\n-------------------------------\n");

    TestProblem<CUB, float, float>(         max_items, 1, RANDOM_BIT, Sum());
    TestProblem<CUB, double, double>(       max_items, 1, RANDOM_BIT, Sum());

    printf("\n-------------------------------\n");

    TestProblem<CUB_SEGMENTED, int, int>(max_items, max_segments, RANDOM_BIT, Sum());

#elif defined(QUICK_TEST)

    // Compile/run quick comparison tests

    TestProblem<CUB, char, char>(           max_items * 4, 1, UNIFORM, Sum());
    TestProblem<THRUST, char, char>(        max_items * 4, 1, UNIFORM, Sum());

    printf("\n----------------------------\n");
    TestProblem<CUB, short, short>(         max_items * 2, 1, UNIFORM, Sum());
    TestProblem<THRUST, short, short>(      max_items * 2, 1, UNIFORM, Sum());

    printf("\n----------------------------\n");
    TestProblem<CUB, int, int>(             max_items,     1, UNIFORM, Sum());
    TestProblem<THRUST, int, int>(          max_items,     1, UNIFORM, Sum());

    printf("\n----------------------------\n");
    TestProblem<CUB, long long, long long>( max_items / 2, 1, UNIFORM, Sum());
    TestProblem<THRUST, long long, long long>(max_items / 2, 1, UNIFORM, Sum());

    printf("\n----------------------------\n");
    TestProblem<CUB, TestFoo, TestFoo>(     max_items / 4, 1, UNIFORM, Max());
    TestProblem<THRUST, TestFoo, TestFoo>(  max_items / 4, 1, UNIFORM, Max());

#else

    // Compile/run thorough tests
    for (int i = 0; i <= g_repeat; ++i)
    {
        // Test different input types
        TestType<char, char>(max_items, max_segments);

        TestType<unsigned char, unsigned char>(max_items, max_segments);

        TestType<char, int>(max_items, max_segments);

        TestType<short, short>(max_items, max_segments);
        TestType<int, int>(max_items, max_segments);
        TestType<long, long>(max_items, max_segments);
        TestType<long long, long long>(max_items, max_segments);

        TestType<uchar2, uchar2>(max_items, max_segments);
        TestType<uint2, uint2>(max_items, max_segments);
        TestType<ulonglong2, ulonglong2>(max_items, max_segments);
        TestType<ulonglong4, ulonglong4>(max_items, max_segments);

        TestType<TestFoo, TestFoo>(max_items, max_segments);
        TestType<TestBar, TestBar>(max_items, max_segments);
    }

#endif

    printf("\n");
    return 0;
}
1358
1359
1360
1361