1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of the NVIDIA CORPORATION nor the
13  *       names of its contributors may be used to endorse or promote products
14  *       derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
29 /******************************************************************************
30  * Test of DeviceReduce::ReduceByKey utilities
31  ******************************************************************************/
32 
33 // Ensure printing of CUDA runtime errors to console
34 #define CUB_STDERR
35 
36 #include <stdio.h>
37 #include <typeinfo>
38 
39 #include <thrust/device_ptr.h>
40 #include <thrust/reduce.h>
41 #include <thrust/iterator/constant_iterator.h>
42 
43 #include <cub/util_allocator.cuh>
44 #include <cub/iterator/constant_input_iterator.cuh>
45 #include <cub/device/device_reduce.cuh>
46 #include <cub/device/device_run_length_encode.cuh>
47 #include <cub/thread/thread_operators.cuh>
48 
49 #include "test_util.h"
50 
51 using namespace cub;
52 
53 
54 //---------------------------------------------------------------------
55 // Globals, constants and typedefs
56 //---------------------------------------------------------------------
57 
58 bool                    g_verbose           = false;
59 int                     g_timing_iterations = 0;
60 int                     g_repeat            = 0;
61 CachingDeviceAllocator  g_allocator(true);
62 
63 // Dispatch types
64 enum Backend
65 {
66     CUB,        // CUB method
67     THRUST,     // Thrust method
68     CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
69 };
70 
71 
72 //---------------------------------------------------------------------
73 // Dispatch to different CUB entrypoints
74 //---------------------------------------------------------------------
75 
76 /**
77  * Dispatch to reduce-by-key entrypoint
78  */
79 template <
80     typename                    KeyInputIteratorT,
81     typename                    KeyOutputIteratorT,
82     typename                    ValueInputIteratorT,
83     typename                    ValueOutputIteratorT,
84     typename                    NumRunsIteratorT,
85     typename                    EqualityOpT,
86     typename                    ReductionOpT,
87     typename                    OffsetT>
88 CUB_RUNTIME_FUNCTION __forceinline__
Dispatch(Int2Type<CUB>,int timing_timing_iterations,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,KeyInputIteratorT d_keys_in,KeyOutputIteratorT d_keys_out,ValueInputIteratorT d_values_in,ValueOutputIteratorT d_values_out,NumRunsIteratorT d_num_runs,EqualityOpT,ReductionOpT reduction_op,OffsetT num_items,cudaStream_t stream,bool debug_synchronous)89 cudaError_t Dispatch(
90     Int2Type<CUB>               /*dispatch_to*/,
91     int                         timing_timing_iterations,
92     size_t                      */*d_temp_storage_bytes*/,
93     cudaError_t                 */*d_cdp_error*/,
94 
95     void                        *d_temp_storage,
96     size_t                      &temp_storage_bytes,
97     KeyInputIteratorT           d_keys_in,
98     KeyOutputIteratorT          d_keys_out,
99     ValueInputIteratorT         d_values_in,
100     ValueOutputIteratorT        d_values_out,
101     NumRunsIteratorT            d_num_runs,
102     EqualityOpT                  /*equality_op*/,
103     ReductionOpT                 reduction_op,
104     OffsetT                     num_items,
105     cudaStream_t                stream,
106     bool                        debug_synchronous)
107 {
108     cudaError_t error = cudaSuccess;
109     for (int i = 0; i < timing_timing_iterations; ++i)
110     {
111         error = DeviceReduce::ReduceByKey(
112             d_temp_storage,
113             temp_storage_bytes,
114             d_keys_in,
115             d_keys_out,
116             d_values_in,
117             d_values_out,
118             d_num_runs,
119             reduction_op,
120             num_items,
121             stream,
122             debug_synchronous);
123     }
124     return error;
125 }
126 
127 
128 //---------------------------------------------------------------------
129 // Dispatch to different Thrust entrypoints
130 //---------------------------------------------------------------------
131 
132 /**
133  * Dispatch to reduce-by-key entrypoint
134  */
135 template <
136     typename                    KeyInputIteratorT,
137     typename                    KeyOutputIteratorT,
138     typename                    ValueInputIteratorT,
139     typename                    ValueOutputIteratorT,
140     typename                    NumRunsIteratorT,
141     typename                    EqualityOpT,
142     typename                    ReductionOpT,
143     typename                    OffsetT>
Dispatch(Int2Type<THRUST>,int timing_timing_iterations,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,KeyInputIteratorT d_keys_in,KeyOutputIteratorT d_keys_out,ValueInputIteratorT d_values_in,ValueOutputIteratorT d_values_out,NumRunsIteratorT d_num_runs,EqualityOpT,ReductionOpT,OffsetT num_items,cudaStream_t,bool)144 cudaError_t Dispatch(
145     Int2Type<THRUST>            /*dispatch_to*/,
146     int                         timing_timing_iterations,
147     size_t                      */*d_temp_storage_bytes*/,
148     cudaError_t                 */*d_cdp_error*/,
149 
150     void                        *d_temp_storage,
151     size_t                      &temp_storage_bytes,
152     KeyInputIteratorT           d_keys_in,
153     KeyOutputIteratorT          d_keys_out,
154     ValueInputIteratorT         d_values_in,
155     ValueOutputIteratorT        d_values_out,
156     NumRunsIteratorT            d_num_runs,
157     EqualityOpT                 /*equality_op*/,
158     ReductionOpT                /*reduction_op*/,
159     OffsetT                     num_items,
160     cudaStream_t                /*stream*/,
161     bool                        /*debug_synchronous*/)
162 {
163     // The input keys type
164     typedef typename std::iterator_traits<KeyInputIteratorT>::value_type KeyInputT;
165 
166     // The output keys type
167     typedef typename If<(Equals<typename std::iterator_traits<KeyOutputIteratorT>::value_type, void>::VALUE),   // OutputT =  (if output iterator's value type is void) ?
168         typename std::iterator_traits<KeyInputIteratorT>::value_type,                                           // ... then the input iterator's value type,
169         typename std::iterator_traits<KeyOutputIteratorT>::value_type>::Type KeyOutputT;                        // ... else the output iterator's value type
170 
171     // The input values type
172     typedef typename std::iterator_traits<ValueInputIteratorT>::value_type ValueInputT;
173 
174     // The output values type
175     typedef typename If<(Equals<typename std::iterator_traits<ValueOutputIteratorT>::value_type, void>::VALUE), // OutputT =  (if output iterator's value type is void) ?
176         typename std::iterator_traits<ValueInputIteratorT>::value_type,                                         // ... then the input iterator's value type,
177         typename std::iterator_traits<ValueOutputIteratorT>::value_type>::Type ValueOuputT;                     // ... else the output iterator's value type
178 
179     if (d_temp_storage == 0)
180     {
181         temp_storage_bytes = 1;
182     }
183     else
184     {
185         thrust::device_ptr<KeyInputT> d_keys_in_wrapper(d_keys_in);
186         thrust::device_ptr<KeyOutputT> d_keys_out_wrapper(d_keys_out);
187 
188         thrust::device_ptr<ValueInputT> d_values_in_wrapper(d_values_in);
189         thrust::device_ptr<ValueOuputT> d_values_out_wrapper(d_values_out);
190 
191         thrust::pair<thrust::device_ptr<KeyOutputT>, thrust::device_ptr<ValueOuputT> > d_out_ends;
192 
193         for (int i = 0; i < timing_timing_iterations; ++i)
194         {
195             d_out_ends = thrust::reduce_by_key(
196                 d_keys_in_wrapper,
197                 d_keys_in_wrapper + num_items,
198                 d_values_in_wrapper,
199                 d_keys_out_wrapper,
200                 d_values_out_wrapper);
201         }
202 
203         OffsetT num_segments = OffsetT(d_out_ends.first - d_keys_out_wrapper);
204         CubDebugExit(cudaMemcpy(d_num_runs, &num_segments, sizeof(OffsetT), cudaMemcpyHostToDevice));
205 
206     }
207 
208     return cudaSuccess;
209 }
210 
211 
212 
213 //---------------------------------------------------------------------
214 // CUDA Nested Parallelism Test Kernel
215 //---------------------------------------------------------------------
216 
217 /**
218  * Simple wrapper kernel to invoke DeviceSelect
219  */
220 template <
221     typename                    KeyInputIteratorT,
222     typename                    KeyOutputIteratorT,
223     typename                    ValueInputIteratorT,
224     typename                    ValueOutputIteratorT,
225     typename                    NumRunsIteratorT,
226     typename                    EqualityOpT,
227     typename                    ReductionOpT,
228     typename                    OffsetT>
CnpDispatchKernel(int timing_timing_iterations,size_t * d_temp_storage_bytes,cudaError_t * d_cdp_error,void * d_temp_storage,size_t temp_storage_bytes,KeyInputIteratorT d_keys_in,KeyOutputIteratorT d_keys_out,ValueInputIteratorT d_values_in,ValueOutputIteratorT d_values_out,NumRunsIteratorT d_num_runs,EqualityOpT equality_op,ReductionOpT reduction_op,OffsetT num_items,cudaStream_t stream,bool debug_synchronous)229 __global__ void CnpDispatchKernel(
230     int                         timing_timing_iterations,
231     size_t                      *d_temp_storage_bytes,
232     cudaError_t                 *d_cdp_error,
233 
234     void                        *d_temp_storage,
235     size_t                      temp_storage_bytes,
236     KeyInputIteratorT           d_keys_in,
237     KeyOutputIteratorT          d_keys_out,
238     ValueInputIteratorT         d_values_in,
239     ValueOutputIteratorT        d_values_out,
240     NumRunsIteratorT            d_num_runs,
241     EqualityOpT                 equality_op,
242     ReductionOpT                reduction_op,
243     OffsetT                     num_items,
244     cudaStream_t                stream,
245     bool                        debug_synchronous)
246 {
247 
248 #ifndef CUB_CDP
249     *d_cdp_error = cudaErrorNotSupported;
250 #else
251     *d_cdp_error = Dispatch(Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
252         d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, debug_synchronous);
253 
254     *d_temp_storage_bytes = temp_storage_bytes;
255 #endif
256 }
257 
258 
259 /**
260  * Dispatch to CDP kernel
261  */
262 template <
263     typename                    KeyInputIteratorT,
264     typename                    KeyOutputIteratorT,
265     typename                    ValueInputIteratorT,
266     typename                    ValueOutputIteratorT,
267     typename                    NumRunsIteratorT,
268     typename                    EqualityOpT,
269     typename                    ReductionOpT,
270     typename                    OffsetT>
271 CUB_RUNTIME_FUNCTION __forceinline__
Dispatch(Int2Type<CDP> dispatch_to,int timing_timing_iterations,size_t * d_temp_storage_bytes,cudaError_t * d_cdp_error,void * d_temp_storage,size_t & temp_storage_bytes,KeyInputIteratorT d_keys_in,KeyOutputIteratorT d_keys_out,ValueInputIteratorT d_values_in,ValueOutputIteratorT d_values_out,NumRunsIteratorT d_num_runs,EqualityOpT equality_op,ReductionOpT reduction_op,OffsetT num_items,cudaStream_t stream,bool debug_synchronous)272 cudaError_t Dispatch(
273     Int2Type<CDP>               dispatch_to,
274     int                         timing_timing_iterations,
275     size_t                      *d_temp_storage_bytes,
276     cudaError_t                 *d_cdp_error,
277 
278     void                        *d_temp_storage,
279     size_t                      &temp_storage_bytes,
280     KeyInputIteratorT           d_keys_in,
281     KeyOutputIteratorT          d_keys_out,
282     ValueInputIteratorT         d_values_in,
283     ValueOutputIteratorT        d_values_out,
284     NumRunsIteratorT            d_num_runs,
285     EqualityOpT                 equality_op,
286     ReductionOpT                reduction_op,
287     OffsetT                     num_items,
288     cudaStream_t                stream,
289     bool                        debug_synchronous)
290 {
291     // Invoke kernel to invoke device-side dispatch
292     CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
293         d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, debug_synchronous);
294 
295     // Copy out temp_storage_bytes
296     CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
297 
298     // Copy out error
299     cudaError_t retval;
300     CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
301     return retval;
302 }
303 
304 
305 
306 //---------------------------------------------------------------------
307 // Test generation
308 //---------------------------------------------------------------------
309 
310 
311 /**
312  * Initialize problem
313  */
314 template <typename T>
Initialize(int entropy_reduction,T * h_in,int num_items,int max_segment)315 void Initialize(
316     int         entropy_reduction,
317     T           *h_in,
318     int         num_items,
319     int         max_segment)
320 {
321     unsigned int max_int = (unsigned int) -1;
322 
323     int key = 0;
324     int i = 0;
325     while (i < num_items)
326     {
327         // Select number of repeating occurrences
328 
329         int repeat;
330 
331         if (max_segment < 0)
332         {
333             repeat = num_items;
334         }
335         else if (max_segment < 2)
336         {
337             repeat = 1;
338         }
339         else
340         {
341             RandomBits(repeat, entropy_reduction);
342             repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int));
343             repeat = CUB_MAX(1, repeat);
344         }
345 
346         int j = i;
347         while (j < CUB_MIN(i + repeat, num_items))
348         {
349             InitValue(INTEGER_SEED, h_in[j], key);
350             j++;
351         }
352 
353         i = j;
354         key++;
355     }
356 
357     if (g_verbose)
358     {
359         printf("Input:\n");
360         DisplayResults(h_in, num_items);
361         printf("\n\n");
362     }
363 }
364 
365 
366 /**
367  * Solve problem.  Returns total number of segments identified
368  */
369 template <
370     typename        KeyInputIteratorT,
371     typename        ValueInputIteratorT,
372     typename        KeyT,
373     typename        ValueT,
374     typename        EqualityOpT,
375     typename        ReductionOpT>
Solve(KeyInputIteratorT h_keys_in,KeyT * h_keys_reference,ValueInputIteratorT h_values_in,ValueT * h_values_reference,EqualityOpT equality_op,ReductionOpT reduction_op,int num_items)376 int Solve(
377     KeyInputIteratorT       h_keys_in,
378     KeyT                    *h_keys_reference,
379     ValueInputIteratorT     h_values_in,
380     ValueT                  *h_values_reference,
381     EqualityOpT             equality_op,
382     ReductionOpT            reduction_op,
383     int                     num_items)
384 {
385     // First item
386     KeyT previous        = h_keys_in[0];
387     ValueT aggregate     = h_values_in[0];
388     int num_segments    = 0;
389 
390     // Subsequent items
391     for (int i = 1; i < num_items; ++i)
392     {
393         if (!equality_op(previous, h_keys_in[i]))
394         {
395             h_keys_reference[num_segments] = previous;
396             h_values_reference[num_segments] = aggregate;
397             num_segments++;
398             aggregate = h_values_in[i];
399         }
400         else
401         {
402             aggregate = reduction_op(aggregate, h_values_in[i]);
403         }
404         previous = h_keys_in[i];
405     }
406 
407     h_keys_reference[num_segments] = previous;
408     h_values_reference[num_segments] = aggregate;
409     num_segments++;
410 
411     return num_segments;
412 }
413 
414 
415 
416 /**
417  * Test DeviceSelect for a given problem input
418  */
419 template <
420     Backend             BACKEND,
421     typename            DeviceKeyInputIteratorT,
422     typename            DeviceValueInputIteratorT,
423     typename            KeyT,
424     typename            ValueT,
425     typename            EqualityOpT,
426     typename            ReductionOpT>
Test(DeviceKeyInputIteratorT d_keys_in,DeviceValueInputIteratorT d_values_in,KeyT * h_keys_reference,ValueT * h_values_reference,EqualityOpT equality_op,ReductionOpT reduction_op,int num_segments,int num_items)427 void Test(
428     DeviceKeyInputIteratorT     d_keys_in,
429     DeviceValueInputIteratorT   d_values_in,
430     KeyT*                       h_keys_reference,
431     ValueT*                     h_values_reference,
432     EqualityOpT                 equality_op,
433     ReductionOpT                reduction_op,
434     int                         num_segments,
435     int                         num_items)
436 {
437     // Allocate device output arrays and number of segments
438     KeyT*   d_keys_out             = NULL;
439     ValueT* d_values_out           = NULL;
440     int*    d_num_runs         = NULL;
441     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_out, sizeof(KeyT) * num_items));
442     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_out, sizeof(ValueT) * num_items));
443     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int)));
444 
445     // Allocate CDP device arrays
446     size_t          *d_temp_storage_bytes = NULL;
447     cudaError_t     *d_cdp_error = NULL;
448     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
449     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
450 
451     // Allocate temporary storage
452     void            *d_temp_storage = NULL;
453     size_t          temp_storage_bytes = 0;
454     CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, true));
455     CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
456 
457     // Clear device output arrays
458     CubDebugExit(cudaMemset(d_keys_out, 0, sizeof(KeyT) * num_items));
459     CubDebugExit(cudaMemset(d_values_out, 0, sizeof(ValueT) * num_items));
460     CubDebugExit(cudaMemset(d_num_runs, 0, sizeof(int)));
461 
462     // Run warmup/correctness iteration
463     CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, true));
464 
465     // Check for correctness (and display results, if specified)
466     int compare1 = CompareDeviceResults(h_keys_reference, d_keys_out, num_segments, true, g_verbose);
467     printf("\t Keys %s ", compare1 ? "FAIL" : "PASS");
468 
469     int compare2 = CompareDeviceResults(h_values_reference, d_values_out, num_segments, true, g_verbose);
470     printf("\t Values %s ", compare2 ? "FAIL" : "PASS");
471 
472     int compare3 = CompareDeviceResults(&num_segments, d_num_runs, 1, true, g_verbose);
473     printf("\t Count %s ", compare3 ? "FAIL" : "PASS");
474 
475     // Flush any stdout/stderr
476     fflush(stdout);
477     fflush(stderr);
478 
479     // Performance
480     GpuTimer gpu_timer;
481     gpu_timer.Start();
482     CubDebugExit(Dispatch(Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, false));
483     gpu_timer.Stop();
484     float elapsed_millis = gpu_timer.ElapsedMillis();
485 
486     // Display performance
487     if (g_timing_iterations > 0)
488     {
489         float   avg_millis  = elapsed_millis / g_timing_iterations;
490         float   giga_rate   = float(num_items) / avg_millis / 1000.0f / 1000.0f;
491         int     bytes_moved = ((num_items + num_segments) * sizeof(KeyT)) + ((num_items + num_segments) * sizeof(ValueT));
492         float   giga_bandwidth  = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
493         printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
494     }
495     printf("\n\n");
496 
497     // Flush any stdout/stderr
498     fflush(stdout);
499     fflush(stderr);
500 
501     // Cleanup
502     if (d_keys_out) CubDebugExit(g_allocator.DeviceFree(d_keys_out));
503     if (d_values_out) CubDebugExit(g_allocator.DeviceFree(d_values_out));
504     if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
505     if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
506     if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
507     if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
508 
509     // Correctness asserts
510     AssertEquals(0, compare1 | compare2 | compare3);
511 }
512 
513 
514 /**
515  * Test DeviceSelect on pointer type
516  */
517 template <
518     Backend         BACKEND,
519     typename        KeyT,
520     typename        ValueT,
521     typename        ReductionOpT>
TestPointer(int num_items,int entropy_reduction,int max_segment,ReductionOpT reduction_op)522 void TestPointer(
523     int             num_items,
524     int             entropy_reduction,
525     int             max_segment,
526     ReductionOpT    reduction_op)
527 {
528     // Allocate host arrays
529     KeyT* h_keys_in        = new KeyT[num_items];
530     KeyT* h_keys_reference = new KeyT[num_items];
531 
532     ValueT* h_values_in        = new ValueT[num_items];
533     ValueT* h_values_reference = new ValueT[num_items];
534 
535     for (int i = 0; i < num_items; ++i)
536         InitValue(INTEGER_SEED, h_values_in[i], 1);
537 
538     // Initialize problem and solution
539     Equality equality_op;
540     Initialize(entropy_reduction, h_keys_in, num_items, max_segment);
541     int num_segments = Solve(h_keys_in, h_keys_reference, h_values_in, h_values_reference, equality_op, reduction_op, num_items);
542 
543     printf("\nPointer %s cub::DeviceReduce::ReduceByKey %s reduction of %d items, %d segments (avg run length %.3f), {%s,%s} key value pairs, max_segment %d, entropy_reduction %d\n",
544         (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
545         (Equals<ReductionOpT, Sum>::VALUE) ? "Sum" : "Max",
546         num_items, num_segments, float(num_items) / num_segments,
547         typeid(KeyT).name(), typeid(ValueT).name(),
548         max_segment, entropy_reduction);
549     fflush(stdout);
550 
551     // Allocate problem device arrays
552     KeyT     *d_keys_in = NULL;
553     ValueT   *d_values_in = NULL;
554     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, sizeof(KeyT) * num_items));
555     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_in, sizeof(ValueT) * num_items));
556 
557     // Initialize device input
558     CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
559     CubDebugExit(cudaMemcpy(d_values_in, h_values_in, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
560 
561     // Run Test
562     Test<BACKEND>(d_keys_in, d_values_in, h_keys_reference, h_values_reference, equality_op, reduction_op, num_segments, num_items);
563 
564     // Cleanup
565     if (h_keys_in) delete[] h_keys_in;
566     if (h_values_in) delete[] h_values_in;
567     if (h_keys_reference) delete[] h_keys_reference;
568     if (h_values_reference) delete[] h_values_reference;
569     if (d_keys_in) CubDebugExit(g_allocator.DeviceFree(d_keys_in));
570     if (d_values_in) CubDebugExit(g_allocator.DeviceFree(d_values_in));
571 }
572 
573 
574 /**
575  * Test on iterator type
576  */
577 template <
578     Backend         BACKEND,
579     typename        KeyT,
580     typename        ValueT,
581     typename        ReductionOpT>
TestIterator(int num_items,int entropy_reduction,int max_segment,ReductionOpT reduction_op)582 void TestIterator(
583     int             num_items,
584     int             entropy_reduction,
585     int             max_segment,
586     ReductionOpT    reduction_op)
587 {
588     // Allocate host arrays
589     KeyT* h_keys_in        = new KeyT[num_items];
590     KeyT* h_keys_reference = new KeyT[num_items];
591 
592     ValueT one_val;
593     InitValue(INTEGER_SEED, one_val, 1);
594     ConstantInputIterator<ValueT, int> h_values_in(one_val);
595     ValueT* h_values_reference = new ValueT[num_items];
596 
597     // Initialize problem and solution
598     Equality equality_op;
599     Initialize(entropy_reduction, h_keys_in, num_items, max_segment);
600     int num_segments = Solve(h_keys_in, h_keys_reference, h_values_in, h_values_reference, equality_op, reduction_op, num_items);
601 
602     printf("\nIterator %s cub::DeviceReduce::ReduceByKey %s reduction of %d items, %d segments (avg run length %.3f), {%s,%s} key value pairs, max_segment %d, entropy_reduction %d\n",
603         (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
604         (Equals<ReductionOpT, Sum>::VALUE) ? "Sum" : "Max",
605         num_items, num_segments, float(num_items) / num_segments,
606         typeid(KeyT).name(), typeid(ValueT).name(),
607         max_segment, entropy_reduction);
608     fflush(stdout);
609 
610     // Allocate problem device arrays
611     KeyT     *d_keys_in = NULL;
612     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, sizeof(KeyT) * num_items));
613 
614     // Initialize device input
615     CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
616 
617     // Run Test
618     Test<BACKEND>(d_keys_in, h_values_in, h_keys_reference, h_values_reference, equality_op, reduction_op, num_segments, num_items);
619 
620     // Cleanup
621     if (h_keys_in) delete[] h_keys_in;
622     if (h_keys_reference) delete[] h_keys_reference;
623     if (h_values_reference) delete[] h_values_reference;
624     if (d_keys_in) CubDebugExit(g_allocator.DeviceFree(d_keys_in));
625 }
626 
627 
628 /**
629  * Test different gen modes
630  */
631 template <
632     Backend         BACKEND,
633     typename        KeyT,
634     typename        ValueT,
635     typename        ReductionOpT>
Test(int num_items,ReductionOpT reduction_op,int max_segment)636 void Test(
637     int             num_items,
638     ReductionOpT    reduction_op,
639     int             max_segment)
640 {
641     // 0 key-bit entropy reduction rounds
642     TestPointer<BACKEND, KeyT, ValueT>(num_items, 0, max_segment, reduction_op);
643 
644     if (max_segment > 1)
645     {
646         // 2 key-bit entropy reduction rounds
647         TestPointer<BACKEND, KeyT, ValueT>(num_items, 2, max_segment, reduction_op);
648 
649         // 7 key-bit entropy reduction rounds
650         TestPointer<BACKEND, KeyT, ValueT>(num_items, 7, max_segment, reduction_op);
651     }
652 }
653 
654 
655 /**
656  * Test different avg segment lengths modes
657  */
658 template <
659     Backend         BACKEND,
660     typename        KeyT,
661     typename        ValueT,
662     typename        ReductionOpT>
Test(int num_items,ReductionOpT reduction_op)663 void Test(
664     int             num_items,
665     ReductionOpT    reduction_op)
666 {
667     Test<BACKEND, KeyT, ValueT>(num_items, reduction_op, -1);
668     Test<BACKEND, KeyT, ValueT>(num_items, reduction_op, 1);
669 
670     // Evaluate different max-segment lengths
671     for (int max_segment = 3; max_segment < CUB_MIN(num_items, (unsigned short) -1); max_segment *= 11)
672     {
673         Test<BACKEND, KeyT, ValueT>(num_items, reduction_op, max_segment);
674     }
675 }
676 
677 
678 
679 /**
680  * Test different dispatch
681  */
682 template <
683     typename        KeyT,
684     typename        ValueT,
685     typename        ReductionOpT>
TestDispatch(int num_items,ReductionOpT reduction_op)686 void TestDispatch(
687     int             num_items,
688     ReductionOpT    reduction_op)
689 {
690     Test<CUB, KeyT, ValueT>(num_items, reduction_op);
691 #ifdef CUB_CDP
692     Test<CDP, KeyT, ValueT>(num_items, reduction_op);
693 #endif
694 }
695 
696 
697 /**
698  * Test different input sizes
699  */
700 template <
701     typename        KeyT,
702     typename        ValueT,
703     typename        ReductionOpT>
TestSize(int num_items,ReductionOpT reduction_op)704 void TestSize(
705     int             num_items,
706     ReductionOpT    reduction_op)
707 {
708     if (num_items < 0)
709     {
710         TestDispatch<KeyT, ValueT>(1,        reduction_op);
711         TestDispatch<KeyT, ValueT>(100,      reduction_op);
712         TestDispatch<KeyT, ValueT>(10000,    reduction_op);
713         TestDispatch<KeyT, ValueT>(1000000,  reduction_op);
714     }
715     else
716     {
717         TestDispatch<KeyT, ValueT>(num_items, reduction_op);
718     }
719 
720 }
721 
722 
723 template <
724     typename        KeyT,
725     typename        ValueT>
TestOp(int num_items)726 void TestOp(
727     int             num_items)
728 {
729     TestSize<KeyT, ValueT>(num_items, cub::Sum());
730     TestSize<KeyT, ValueT>(num_items, cub::Max());
731 }
732 
733 
734 
735 //---------------------------------------------------------------------
736 // Main
737 //---------------------------------------------------------------------
738 
739 /**
740  * Main
741  */
main(int argc,char ** argv)742 int main(int argc, char** argv)
743 {
744     int num_items           = -1;
745     int entropy_reduction   = 0;
746     int maxseg              = 1000;
747 
748     // Initialize command line
749     CommandLineArgs args(argc, argv);
750     g_verbose = args.CheckCmdLineFlag("v");
751     args.GetCmdLineArgument("n", num_items);
752     args.GetCmdLineArgument("i", g_timing_iterations);
753     args.GetCmdLineArgument("repeat", g_repeat);
754     args.GetCmdLineArgument("maxseg", maxseg);
755     args.GetCmdLineArgument("entropy", entropy_reduction);
756 
757     // Print usage
758     if (args.CheckCmdLineFlag("help"))
759     {
760         printf("%s "
761             "[--n=<input items> "
762             "[--i=<timing iterations> "
763             "[--device=<device-id>] "
764             "[--maxseg=<max segment length>]"
765             "[--entropy=<segment length bit entropy reduction rounds>]"
766             "[--repeat=<repetitions of entire test suite>]"
767             "[--v] "
768             "[--cdp]"
769             "\n", argv[0]);
770         exit(0);
771     }
772 
773     // Initialize device
774     CubDebugExit(args.DeviceInit());
775     printf("\n");
776 
777     // Get ptx version
778     int ptx_version = 0;
779     CubDebugExit(PtxVersion(ptx_version));
780 
781 #ifdef QUICKER_TEST
782 
783     // Compile/run basic CUB test
784     if (num_items < 0) num_items = 32000000;
785 
786     TestPointer<CUB, int, double>(num_items, entropy_reduction, maxseg, cub::Sum());
787     TestPointer<CUB, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
788     TestIterator<CUB, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
789 
790 #elif defined(QUICK_TEST)
791 
792     // Compile/run quick tests
793     if (num_items < 0) num_items = 32000000;
794 
795     printf("---- RLE int ---- \n");
796     TestIterator<CUB, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
797 
798     printf("---- RLE long long ---- \n");
799     TestIterator<CUB, long long, int>(num_items, entropy_reduction, maxseg, cub::Sum());
800 
801     printf("---- int ---- \n");
802     TestPointer<CUB, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
803     TestPointer<THRUST, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
804 
805     printf("---- float ---- \n");
806     TestPointer<CUB, int, float>(num_items, entropy_reduction, maxseg, cub::Sum());
807     TestPointer<THRUST, int, float>(num_items, entropy_reduction, maxseg, cub::Sum());
808 
809     if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
810     {
811         printf("---- double ---- \n");
812         TestPointer<CUB, int, double>(num_items, entropy_reduction, maxseg, cub::Sum());
813         TestPointer<THRUST, int, double>(num_items, entropy_reduction, maxseg, cub::Sum());
814     }
815 
816 #else
817 
818     // Compile/run thorough tests
819     for (int i = 0; i <= g_repeat; ++i)
820     {
821 
822         // Test different input types
823         TestOp<int, char>(num_items);
824         TestOp<int, short>(num_items);
825         TestOp<int, int>(num_items);
826         TestOp<int, long>(num_items);
827         TestOp<int, long long>(num_items);
828         TestOp<int, float>(num_items);
829         if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
830             TestOp<int, double>(num_items);
831 
832         TestOp<int, uchar2>(num_items);
833         TestOp<int, uint2>(num_items);
834         TestOp<int, uint3>(num_items);
835         TestOp<int, uint4>(num_items);
836         TestOp<int, ulonglong4>(num_items);
837         TestOp<int, TestFoo>(num_items);
838         TestOp<int, TestBar>(num_items);
839 
840         TestOp<char, int>(num_items);
841         TestOp<long long, int>(num_items);
842         TestOp<TestFoo, int>(num_items);
843         TestOp<TestBar, int>(num_items);
844 
845     }
846 
847 #endif
848 
849     return 0;
850 }
851 
852 
853 
854