/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Test of DeviceRunLengthEncode utilities
 ******************************************************************************/

// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <stdio.h>
#include <typeinfo>

#include <thrust/device_ptr.h>
#include <thrust/reduce.h>
#include <thrust/iterator/constant_iterator.h>

#include <cub/util_allocator.cuh>
#include <cub/iterator/constant_input_iterator.cuh>
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_run_length_encode.cuh>
#include <cub/thread/thread_operators.cuh>

#include "test_util.h"

using namespace cub;


//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

bool                    g_verbose           = false;
int                     g_timing_iterations = 0;
int                     g_repeat            = 0;
CachingDeviceAllocator  g_allocator(true);

// Dispatch types
enum Backend
{
    CUB,        // CUB method
    THRUST,     // Thrust method
    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
};

// Operation types
enum RleMethod
{
    RLE,            // Run-length encode: unique keys + run lengths
    NON_TRIVIAL,    // Non-trivial runs only: run offsets + lengths of runs longer than one item
    CSR,            // CSR-style run offsets (not dispatched by this test)
};
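
// A small worked example of the two entry points exercised below (values are
// illustrative only): for the input sequence [0, 0, 1, 2, 2, 2, 3],
// DeviceRunLengthEncode::Encode reports every run,
//     uniques = [0, 1, 2, 3], lengths = [2, 1, 3, 1], num_runs = 4,
// while DeviceRunLengthEncode::NonTrivialRuns keeps only runs longer than one
// item and reports their starting offsets instead of their keys,
//     offsets = [0, 3],       lengths = [2, 3],       num_runs = 2.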


//---------------------------------------------------------------------
// Dispatch to different CUB entrypoints
//---------------------------------------------------------------------


/**
 * Dispatch to run-length encode entrypoint
 */
template <
    typename                    InputIteratorT,
    typename                    UniqueOutputIteratorT,
    typename                    OffsetsOutputIteratorT,
    typename                    LengthsOutputIteratorT,
    typename                    NumRunsIterator,
    typename                    OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<RLE>               /*method*/,
    Int2Type<CUB>               /*dispatch_to*/,
    int                         timing_timing_iterations,
    size_t                      */*d_temp_storage_bytes*/,
    cudaError_t                 */*d_cdp_error*/,

    void*                       d_temp_storage,
    size_t                      &temp_storage_bytes,
    InputIteratorT              d_in,
    UniqueOutputIteratorT       d_unique_out,
    OffsetsOutputIteratorT      /*d_offsets_out*/,
    LengthsOutputIteratorT      d_lengths_out,
    NumRunsIterator             d_num_runs,
    cub::Equality               /*equality_op*/,
    OffsetT                     num_items,
    cudaStream_t                stream,
    bool                        debug_synchronous)
{
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceRunLengthEncode::Encode(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_unique_out,
            d_lengths_out,
            d_num_runs,
            num_items,
            stream,
            debug_synchronous);
    }
    return error;
}
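
// Like the other cub::Device* entry points, Encode follows the two-phase
// temp-storage convention, which is why the harness below invokes Dispatch
// twice. A minimal host-side sketch of that pattern (the cudaMalloc call is
// illustrative; the harness uses g_allocator instead):
//
//     void   *d_temp_storage     = NULL;
//     size_t  temp_storage_bytes = 0;
//     // First call: d_temp_storage == NULL, only temp_storage_bytes is written
//     DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
//         d_in, d_unique_out, d_lengths_out, d_num_runs, num_items);
//     cudaMalloc(&d_temp_storage, temp_storage_bytes);
//     // Second call: performs the actual encoding
//     DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
//         d_in, d_unique_out, d_lengths_out, d_num_runs, num_items);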


/**
 * Dispatch to non-trivial runs entrypoint
 */
template <
    typename                    InputIteratorT,
    typename                    UniqueOutputIteratorT,
    typename                    OffsetsOutputIteratorT,
    typename                    LengthsOutputIteratorT,
    typename                    NumRunsIterator,
    typename                    OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<NON_TRIVIAL>       /*method*/,
    Int2Type<CUB>               /*dispatch_to*/,
    int                         timing_timing_iterations,
    size_t                      */*d_temp_storage_bytes*/,
    cudaError_t                 */*d_cdp_error*/,

    void*                       d_temp_storage,
    size_t                      &temp_storage_bytes,
    InputIteratorT              d_in,
    UniqueOutputIteratorT       /*d_unique_out*/,
    OffsetsOutputIteratorT      d_offsets_out,
    LengthsOutputIteratorT      d_lengths_out,
    NumRunsIterator             d_num_runs,
    cub::Equality               /*equality_op*/,
    OffsetT                     num_items,
    cudaStream_t                stream,
    bool                        debug_synchronous)
{
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceRunLengthEncode::NonTrivialRuns(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_offsets_out,
            d_lengths_out,
            d_num_runs,
            num_items,
            stream,
            debug_synchronous);
    }
    return error;
}



//---------------------------------------------------------------------
// Dispatch to different Thrust entrypoints
//---------------------------------------------------------------------

/**
 * Dispatch to run-length encode entrypoint
 */
template <
    typename                    InputIteratorT,
    typename                    UniqueOutputIteratorT,
    typename                    OffsetsOutputIteratorT,
    typename                    LengthsOutputIteratorT,
    typename                    NumRunsIterator,
    typename                    OffsetT>
cudaError_t Dispatch(
    Int2Type<RLE>               /*method*/,
    Int2Type<THRUST>            /*dispatch_to*/,
    int                         timing_timing_iterations,
    size_t                      */*d_temp_storage_bytes*/,
    cudaError_t                 */*d_cdp_error*/,

    void                        *d_temp_storage,
    size_t                      &temp_storage_bytes,
    InputIteratorT              d_in,
    UniqueOutputIteratorT       d_unique_out,
    OffsetsOutputIteratorT      /*d_offsets_out*/,
    LengthsOutputIteratorT      d_lengths_out,
    NumRunsIterator             d_num_runs,
    cub::Equality               /*equality_op*/,
    OffsetT                     num_items,
    cudaStream_t                /*stream*/,
    bool                        /*debug_synchronous*/)
{
    // The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

    // The unique-keys output value type
    typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),   // UniqueT =  (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                                 // ... then the input iterator's value type,
        typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type UniqueT;                           // ... else the output iterator's value type

    // The lengths output value type
    typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),  // LengthT =  (if output iterator's value type is void) ?
        OffsetT,                                                                                                   // ... then the OffsetT type,
        typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                          // ... else the output iterator's value type

    if (d_temp_storage == 0)
    {
        temp_storage_bytes = 1;
    }
    else
    {
        thrust::device_ptr<InputT>      d_in_wrapper(d_in);
        thrust::device_ptr<UniqueT>     d_unique_out_wrapper(d_unique_out);
        thrust::device_ptr<LengthT>     d_lengths_out_wrapper(d_lengths_out);

        thrust::pair<thrust::device_ptr<UniqueT>, thrust::device_ptr<LengthT> > d_out_ends;

        LengthT one_val;
        InitValue(INTEGER_SEED, one_val, 1);
        thrust::constant_iterator<LengthT> constant_one(one_val);

        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            d_out_ends = thrust::reduce_by_key(
                d_in_wrapper,
                d_in_wrapper + num_items,
                constant_one,
                d_unique_out_wrapper,
                d_lengths_out_wrapper);
        }

        OffsetT num_runs = OffsetT(d_out_ends.first - d_unique_out_wrapper);
        CubDebugExit(cudaMemcpy(d_num_runs, &num_runs, sizeof(OffsetT), cudaMemcpyHostToDevice));
    }

    return cudaSuccess;
}
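
// The Thrust path above emulates run-length encoding with reduce_by_key:
// pairing the input keys with a constant_iterator of ones makes the per-run
// sum equal the run length. A small illustrative example (values are not
// taken from the test data):
//
//     keys   = [5, 5, 7, 7, 7]
//     values = [1, 1, 1, 1, 1]          // constant_one
//     thrust::reduce_by_key -> unique keys [5, 7], sums (run lengths) [2, 3]
//
// The returned iterator pair marks the ends of the two output ranges, which
// is how the number of runs is recovered before being copied to d_num_runs.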


//---------------------------------------------------------------------
// CUDA Nested Parallelism Test Kernel
//---------------------------------------------------------------------

/**
 * Simple wrapper kernel to invoke DeviceRunLengthEncode
 */
template <
    int                         RLE_METHOD,
    typename                    InputIteratorT,
    typename                    UniqueOutputIteratorT,
    typename                    OffsetsOutputIteratorT,
    typename                    LengthsOutputIteratorT,
    typename                    NumRunsIterator,
    typename                    EqualityOp,
    typename                    OffsetT>
__global__ void CnpDispatchKernel(
    Int2Type<RLE_METHOD>        method,
    int                         timing_timing_iterations,
    size_t                      *d_temp_storage_bytes,
    cudaError_t                 *d_cdp_error,

    void*                       d_temp_storage,
    size_t                      temp_storage_bytes,
    InputIteratorT              d_in,
    UniqueOutputIteratorT       d_unique_out,
    OffsetsOutputIteratorT      d_offsets_out,
    LengthsOutputIteratorT      d_lengths_out,
    NumRunsIterator             d_num_runs,
    cub::Equality               equality_op,
    OffsetT                     num_items,
    cudaStream_t                stream,
    bool                        debug_synchronous)
{
#ifndef CUB_CDP
    *d_cdp_error = cudaErrorNotSupported;
#else
    *d_cdp_error = Dispatch(method, Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, debug_synchronous);

    *d_temp_storage_bytes = temp_storage_bytes;
#endif
}


/**
 * Dispatch to CDP kernel
 */
template <
    int                         RLE_METHOD,
    typename                    InputIteratorT,
    typename                    UniqueOutputIteratorT,
    typename                    OffsetsOutputIteratorT,
    typename                    LengthsOutputIteratorT,
    typename                    NumRunsIterator,
    typename                    EqualityOp,
    typename                    OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<RLE_METHOD>        method,
    Int2Type<CDP>               dispatch_to,
    int                         timing_timing_iterations,
    size_t                      *d_temp_storage_bytes,
    cudaError_t                 *d_cdp_error,

    void*                       d_temp_storage,
    size_t                      &temp_storage_bytes,
    InputIteratorT              d_in,
    UniqueOutputIteratorT       d_unique_out,
    OffsetsOutputIteratorT      d_offsets_out,
    LengthsOutputIteratorT      d_lengths_out,
    NumRunsIterator             d_num_runs,
    EqualityOp                  equality_op,
    OffsetT                     num_items,
    cudaStream_t                stream,
    bool                        debug_synchronous)
{
    // Invoke kernel to invoke device-side dispatch
    CnpDispatchKernel<<<1,1>>>(method, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, debug_synchronous);

    // Copy out temp_storage_bytes
    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));

    // Copy out error
    cudaError_t retval;
    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
    return retval;
}



//---------------------------------------------------------------------
// Test generation
//---------------------------------------------------------------------


/**
 * Initialize problem
 */
template <typename T>
void Initialize(
    int         entropy_reduction,
    T           *h_in,
    int         num_items,
    int         max_segment)
{
    unsigned int max_int = (unsigned int) -1;

    int key = 0;
    int i = 0;
    while (i < num_items)
    {
        // Select number of repeating occurrences for the current run
        int repeat;
        if (max_segment < 0)
        {
            repeat = num_items;
        }
        else if (max_segment < 2)
        {
            repeat = 1;
        }
        else
        {
            RandomBits(repeat, entropy_reduction);
            repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int));
            repeat = CUB_MAX(1, repeat);
        }

        int j = i;
        while (j < CUB_MIN(i + repeat, num_items))
        {
            InitValue(INTEGER_SEED, h_in[j], key);
            j++;
        }

        i = j;
        key++;
    }

    if (g_verbose)
    {
        printf("Input:\n");
        DisplayResults(h_in, num_items);
        printf("\n\n");
    }
}
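
// Note on the generator above: each run is given a fresh key, and its length
// is drawn by RandomBits and then scaled into [1, max_segment]. A nonzero
// entropy_reduction requests lower-entropy bits from the RandomBits helper in
// test_util.h (which, as far as this sketch assumes, ANDs together additional
// random words), so the drawn lengths skew low and runs tend to be shorter.
// max_segment < 0 (one run spanning the whole input) and max_segment < 2
// (every item its own run) are the two degenerate cases handled explicitly.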


/**
 * Solve problem.  Returns total number of segments identified
 */
template <
    RleMethod       RLE_METHOD,
    typename        InputIteratorT,
    typename        T,
    typename        OffsetT,
    typename        LengthT,
    typename        EqualityOp>
int Solve(
    InputIteratorT  h_in,
    T               *h_unique_reference,
    OffsetT         *h_offsets_reference,
    LengthT         *h_lengths_reference,
    EqualityOp      equality_op,
    int             num_items)
{
    if (num_items == 0)
        return 0;

    // First item
    T       previous    = h_in[0];
    LengthT length      = 1;
    int     num_runs    = 0;
    int     run_begin   = 0;

    // Subsequent items
    for (int i = 1; i < num_items; ++i)
    {
        if (!equality_op(previous, h_in[i]))
        {
            if ((RLE_METHOD != NON_TRIVIAL) || (length > 1))
            {
                h_unique_reference[num_runs]    = previous;
                h_offsets_reference[num_runs]   = run_begin;
                h_lengths_reference[num_runs]   = length;
                num_runs++;
            }
            length = 1;
            run_begin = i;
        }
        else
        {
            length++;
        }
        previous = h_in[i];
    }

    if ((RLE_METHOD != NON_TRIVIAL) || (length > 1))
    {
        h_unique_reference[num_runs]    = previous;
        h_offsets_reference[num_runs]   = run_begin;
        h_lengths_reference[num_runs]   = length;
        num_runs++;
    }

    return num_runs;
}
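
// As a usage sketch of the reference above (with the default cub::Equality
// operator), Solve<RLE> over the input [8, 8, 8, 9, 8] would write
//     h_unique_reference  = [8, 9, 8]
//     h_offsets_reference = [0, 3, 4]
//     h_lengths_reference = [3, 1, 1]
// and return 3, whereas Solve<NON_TRIVIAL> would keep only the first run and
// return 1. The device results produced by Dispatch are compared against
// these host arrays in Test() below.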


/**
 * Test DeviceRunLengthEncode for a given problem input
 */
template <
    RleMethod           RLE_METHOD,
    Backend             BACKEND,
    typename            DeviceInputIteratorT,
    typename            T,
    typename            OffsetT,
    typename            LengthT,
    typename            EqualityOp>
void Test(
    DeviceInputIteratorT d_in,
    T                   *h_unique_reference,
    OffsetT             *h_offsets_reference,
    LengthT             *h_lengths_reference,
    EqualityOp          equality_op,
    int                 num_runs,
    int                 num_items)
{
    // Allocate device output arrays and number of segments
    T*          d_unique_out    = NULL;
    OffsetT*    d_offsets_out   = NULL;
    LengthT*    d_lengths_out   = NULL;
    int*        d_num_runs      = NULL;

    if (RLE_METHOD == RLE)
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_unique_out, sizeof(T) * num_items));
    if (RLE_METHOD == NON_TRIVIAL)
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_offsets_out, sizeof(OffsetT) * num_items));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_lengths_out, sizeof(LengthT) * num_items));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int)));

    // Allocate CDP device arrays
    size_t*         d_temp_storage_bytes    = NULL;
    cudaError_t*    d_cdp_error             = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));

    // Allocate temporary storage
    void*   d_temp_storage      = NULL;
    size_t  temp_storage_bytes  = 0;
    CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, true));
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

    // Clear device output arrays
    if (RLE_METHOD == RLE)
        CubDebugExit(cudaMemset(d_unique_out,   0, sizeof(T) * num_items));
    if (RLE_METHOD == NON_TRIVIAL)
        CubDebugExit(cudaMemset(d_offsets_out,  0, sizeof(OffsetT) * num_items));
    CubDebugExit(cudaMemset(d_lengths_out,  0, sizeof(LengthT) * num_items));
    CubDebugExit(cudaMemset(d_num_runs,     0, sizeof(int)));

    // Run warmup/correctness iteration
    CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, true));

    // Check for correctness (and display results, if specified)
    int compare0 = 0;
    int compare1 = 0;
    int compare2 = 0;
    int compare3 = 0;

    if (RLE_METHOD == RLE)
    {
        compare0 = CompareDeviceResults(h_unique_reference, d_unique_out, num_runs, true, g_verbose);
        printf("\t Keys %s\n", compare0 ? "FAIL" : "PASS");
    }

    if (RLE_METHOD != RLE)
    {
        compare1 = CompareDeviceResults(h_offsets_reference, d_offsets_out, num_runs, true, g_verbose);
        printf("\t Offsets %s\n", compare1 ? "FAIL" : "PASS");
    }

    if (RLE_METHOD != CSR)
    {
        compare2 = CompareDeviceResults(h_lengths_reference, d_lengths_out, num_runs, true, g_verbose);
        printf("\t Lengths %s\n", compare2 ? "FAIL" : "PASS");
    }

    compare3 = CompareDeviceResults(&num_runs, d_num_runs, 1, true, g_verbose);
    printf("\t Count %s\n", compare3 ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();
    CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, false));
    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        size_t bytes_moved = (num_items * sizeof(T)) + (num_runs * (sizeof(OffsetT) + sizeof(LengthT)));
        float giga_bandwidth = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
    }
    printf("\n\n");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Cleanup
    if (d_unique_out) CubDebugExit(g_allocator.DeviceFree(d_unique_out));
    if (d_offsets_out) CubDebugExit(g_allocator.DeviceFree(d_offsets_out));
    if (d_lengths_out) CubDebugExit(g_allocator.DeviceFree(d_lengths_out));
    if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    // Correctness asserts
    AssertEquals(0, compare0 | compare1 | compare2 | compare3);
}


/**
 * Test DeviceRunLengthEncode on pointer type
 */
template <
    RleMethod       RLE_METHOD,
    Backend         BACKEND,
    typename        T,
    typename        OffsetT,
    typename        LengthT>
void TestPointer(
    int             num_items,
    int             entropy_reduction,
    int             max_segment)
{
    // Allocate host arrays
    T*       h_in                   = new T[num_items];
    T*       h_unique_reference     = new T[num_items];
    OffsetT* h_offsets_reference    = new OffsetT[num_items];
    LengthT* h_lengths_reference    = new LengthT[num_items];

    for (int i = 0; i < num_items; ++i)
        InitValue(INTEGER_SEED, h_offsets_reference[i], 1);

    // Initialize problem and solution
    Equality equality_op;
    Initialize(entropy_reduction, h_in, num_items, max_segment);

    int num_runs = Solve<RLE_METHOD>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_items);

    printf("\nPointer %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}, max_segment %d, entropy_reduction %d\n",
        (RLE_METHOD == RLE) ? "DeviceRunLengthEncode::Encode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other",
        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
        num_items, num_runs, float(num_items) / num_runs,
        typeid(T).name(), typeid(OffsetT).name(), typeid(LengthT).name(),
        max_segment, entropy_reduction);
    fflush(stdout);

    // Allocate problem device arrays
    T* d_in = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items));

    // Initialize device input
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));

    // Run Test
    Test<RLE_METHOD, BACKEND>(d_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_runs, num_items);

    // Cleanup
    if (h_in) delete[] h_in;
    if (h_unique_reference) delete[] h_unique_reference;
    if (h_offsets_reference) delete[] h_offsets_reference;
    if (h_lengths_reference) delete[] h_lengths_reference;
    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
}


/**
 * Test on iterator type
 */
template <
    RleMethod       RLE_METHOD,
    Backend         BACKEND,
    typename        T,
    typename        OffsetT,
    typename        LengthT>
void TestIterator(
    int             num_items,
    Int2Type<true>  /*is_primitive*/)
{
    // Allocate host arrays
    T*       h_unique_reference    = new T[num_items];
    OffsetT* h_offsets_reference   = new OffsetT[num_items];
    LengthT* h_lengths_reference   = new LengthT[num_items];

    T one_val;
    InitValue(INTEGER_SEED, one_val, 1);
    ConstantInputIterator<T, int> h_in(one_val);

    // Initialize problem and solution
    Equality equality_op;
    int num_runs = Solve<RLE_METHOD>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_items);

    printf("\nIterator %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}\n",
        (RLE_METHOD == RLE) ? "DeviceRunLengthEncode::Encode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other",
        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
        num_items, num_runs, float(num_items) / num_runs,
        typeid(T).name(), typeid(OffsetT).name(), typeid(LengthT).name());
    fflush(stdout);

    // Run Test
    Test<RLE_METHOD, BACKEND>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_runs, num_items);

    // Cleanup
    if (h_unique_reference) delete[] h_unique_reference;
    if (h_offsets_reference) delete[] h_offsets_reference;
    if (h_lengths_reference) delete[] h_lengths_reference;
}


template <
    RleMethod       RLE_METHOD,
    Backend         BACKEND,
    typename        T,
    typename        OffsetT,
    typename        LengthT>
void TestIterator(
    int             /*num_items*/,
    Int2Type<false> /*is_primitive*/)
{}


/**
 * Test different gen modes
 */
template <
    RleMethod       RLE_METHOD,
    Backend         BACKEND,
    typename        T,
    typename        OffsetT,
    typename        LengthT>
void Test(
    int             num_items)
{
    // Test iterator (one run)
    TestIterator<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, Int2Type<Traits<T>::PRIMITIVE>());

    // num_items runs
    TestPointer<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, 0, 1);

    // Evaluate different run lengths
    for (int max_segment = 3; max_segment < CUB_MIN(num_items, (unsigned short) -1); max_segment *= 3)
    {
        // Uniform selection run length
        TestPointer<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, 0, max_segment);

        // Reduced-entropy run length
        TestPointer<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, 4, max_segment);
    }
}


/**
 * Test different dispatch
 */
template <
    typename        T,
    typename        OffsetT,
    typename        LengthT>
void TestDispatch(
    int             num_items)
{
    Test<RLE,           CUB, T, OffsetT, LengthT>(num_items);
    Test<NON_TRIVIAL,   CUB, T, OffsetT, LengthT>(num_items);

#ifdef CUB_CDP
    Test<RLE,           CDP, T, OffsetT, LengthT>(num_items);
    Test<NON_TRIVIAL,   CDP, T, OffsetT, LengthT>(num_items);
#endif
}


/**
 * Test different input sizes
 */
template <
    typename        T,
    typename        OffsetT,
    typename        LengthT>
void TestSize(
    int             num_items)
{
    if (num_items < 0)
    {
        TestDispatch<T, OffsetT, LengthT>(0);
        TestDispatch<T, OffsetT, LengthT>(1);
        TestDispatch<T, OffsetT, LengthT>(100);
        TestDispatch<T, OffsetT, LengthT>(10000);
        TestDispatch<T, OffsetT, LengthT>(1000000);

        // Randomly select problem sizes between 1 and 10,000,000
        unsigned int max_int = (unsigned int) -1;
        for (int i = 0; i < 10; ++i)
        {
            unsigned int num_items;
            RandomBits(num_items);
            num_items = (unsigned int) ((double(num_items) * double(10000000)) / double(max_int));
            num_items = CUB_MAX(1, num_items);
            TestDispatch<T, OffsetT, LengthT>(num_items);
        }
    }
    else
    {
        TestDispatch<T, OffsetT, LengthT>(num_items);
    }
}


//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------

/**
 * Main
 */
int main(int argc, char** argv)
{
    int num_items           = -1;
    int entropy_reduction   = 0;
    int max_segment         = 1000;

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("n", num_items);
    args.GetCmdLineArgument("i", g_timing_iterations);
    args.GetCmdLineArgument("repeat", g_repeat);
    args.GetCmdLineArgument("maxseg", max_segment);
    args.GetCmdLineArgument("entropy", entropy_reduction);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--n=<input items>] "
            "[--i=<timing iterations>] "
            "[--device=<device-id>] "
            "[--maxseg=<max segment length>] "
            "[--entropy=<segment length bit entropy reduction rounds>] "
            "[--repeat=<repetitions of entire test suite>] "
            "[--v] "
            "[--cdp] "
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());
    printf("\n");

    // Get ptx version
    int ptx_version = 0;
    CubDebugExit(PtxVersion(ptx_version));

#ifdef QUICKER_TEST

    // Compile/run basic CUB test
    if (num_items < 0) num_items = 32000000;

    TestPointer<RLE,            CUB, int, int, int>     (num_items, entropy_reduction, max_segment);
    TestPointer<NON_TRIVIAL,    CUB, int, int, int>     (num_items, entropy_reduction, max_segment);
    TestIterator<RLE,           CUB, float, int, int>   (num_items, Int2Type<Traits<float>::PRIMITIVE>());


#elif defined(QUICK_TEST)

    // Compile/run quick tests
    if (num_items < 0) num_items = 32000000;

    TestPointer<RLE,            CUB, int, int, int>     (num_items, entropy_reduction, max_segment);
    TestPointer<RLE,            THRUST, int, int, int>  (num_items, entropy_reduction, max_segment);

#else

    // Compile/run thorough tests
    for (int i = 0; i <= g_repeat; ++i)
    {
        // Test different input types
        TestSize<char,          int, int>(num_items);
        TestSize<short,         int, int>(num_items);
        TestSize<int,           int, int>(num_items);
        TestSize<long,          int, int>(num_items);
        TestSize<long long,     int, int>(num_items);
        TestSize<float,         int, int>(num_items);
        TestSize<double,        int, int>(num_items);

        TestSize<uchar2,        int, int>(num_items);
        TestSize<uint2,         int, int>(num_items);
        TestSize<uint3,         int, int>(num_items);
        TestSize<uint4,         int, int>(num_items);
        TestSize<ulonglong4,    int, int>(num_items);
        TestSize<TestFoo,       int, int>(num_items);
        TestSize<TestBar,       int, int>(num_items);
    }

#endif

    return 0;
}