1 /******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
29 /******************************************************************************
30 * Test of DeviceReduce::RunLengthEncode utilities
31 ******************************************************************************/
32
33 // Ensure printing of CUDA runtime errors to console
34 #define CUB_STDERR
35
36 #include <stdio.h>
37 #include <typeinfo>
38
39 #include <thrust/device_ptr.h>
40 #include <thrust/reduce.h>
41 #include <thrust/iterator/constant_iterator.h>
42
43 #include <cub/util_allocator.cuh>
44 #include <cub/iterator/constant_input_iterator.cuh>
45 #include <cub/device/device_reduce.cuh>
46 #include <cub/device/device_run_length_encode.cuh>
47 #include <cub/thread/thread_operators.cuh>
48
49 #include "test_util.h"
50
51 using namespace cub;
52
53
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

bool                    g_verbose = false;          // Whether to echo inputs/outputs to the console
int                     g_timing_iterations = 0;    // Number of timed iterations for the perf measurement
int                     g_repeat = 0;               // Extra repetitions of the entire (thorough) test suite
CachingDeviceAllocator  g_allocator(true);          // Caching allocator for device memory

// Dispatch types
enum Backend
{
    CUB,        // CUB method
    THRUST,     // Thrust method
    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
};

// Operation types
enum RleMethod
{
    RLE,            // Run length encode (all runs, including trivial length-1 runs)
    NON_TRIVIAL,    // Only runs of length > 1
    CSR,            // CSR-style variant (not dispatched by this test; used only to gate result comparisons)
};


//---------------------------------------------------------------------
// Dispatch to different CUB entrypoints
//---------------------------------------------------------------------
83
84
/**
 * Dispatch to the CUB run-length-encode entrypoint.
 * Invokes DeviceRunLengthEncode::Encode once per timing iteration and
 * returns the status of the final invocation.
 */
template <
    typename InputIteratorT,
    typename UniqueOutputIteratorT,
    typename OffsetsOutputIteratorT,
    typename LengthsOutputIteratorT,
    typename NumRunsIterator,
    typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<RLE> /*method*/,
    Int2Type<CUB> /*dispatch_to*/,
    int timing_timing_iterations,
    size_t */*d_temp_storage_bytes*/,
    cudaError_t */*d_cdp_error*/,

    void* d_temp_storage,
    size_t &temp_storage_bytes,
    InputIteratorT d_in,
    UniqueOutputIteratorT d_unique_out,
    OffsetsOutputIteratorT /*d_offsets_out*/,
    LengthsOutputIteratorT d_lengths_out,
    NumRunsIterator d_num_runs,
    cub::Equality /*equality_op*/,
    OffsetT num_items,
    cudaStream_t stream,
    bool debug_synchronous)
{
    cudaError_t retval = cudaSuccess;
    int iteration = 0;
    while (iteration < timing_timing_iterations)
    {
        retval = DeviceRunLengthEncode::Encode(
            d_temp_storage, temp_storage_bytes,
            d_in, d_unique_out, d_lengths_out, d_num_runs,
            num_items, stream, debug_synchronous);
        ++iteration;
    }
    return retval;
}
131
132
/**
 * Dispatch to the CUB non-trivial-runs entrypoint.
 * Invokes DeviceRunLengthEncode::NonTrivialRuns once per timing iteration
 * and returns the status of the final invocation.
 */
template <
    typename InputIteratorT,
    typename UniqueOutputIteratorT,
    typename OffsetsOutputIteratorT,
    typename LengthsOutputIteratorT,
    typename NumRunsIterator,
    typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<NON_TRIVIAL> /*method*/,
    Int2Type<CUB> /*dispatch_to*/,
    int timing_timing_iterations,
    size_t */*d_temp_storage_bytes*/,
    cudaError_t */*d_cdp_error*/,

    void* d_temp_storage,
    size_t &temp_storage_bytes,
    InputIteratorT d_in,
    UniqueOutputIteratorT /*d_unique_out*/,
    OffsetsOutputIteratorT d_offsets_out,
    LengthsOutputIteratorT d_lengths_out,
    NumRunsIterator d_num_runs,
    cub::Equality /*equality_op*/,
    OffsetT num_items,
    cudaStream_t stream,
    bool debug_synchronous)
{
    cudaError_t retval = cudaSuccess;
    int iteration = 0;
    while (iteration < timing_timing_iterations)
    {
        retval = DeviceRunLengthEncode::NonTrivialRuns(
            d_temp_storage, temp_storage_bytes,
            d_in, d_offsets_out, d_lengths_out, d_num_runs,
            num_items, stream, debug_synchronous);
        ++iteration;
    }
    return retval;
}
179
180
181
182 //---------------------------------------------------------------------
183 // Dispatch to different Thrust entrypoints
184 //---------------------------------------------------------------------
185
/**
 * Dispatch to the equivalent Thrust implementation of run-length encode:
 * a reduce_by_key over the input keys with a constant-one values sequence,
 * so each key's "sum" is its run length.  The resulting number of runs is
 * copied back to d_num_runs on the device.
 */
template <
    typename InputIteratorT,
    typename UniqueOutputIteratorT,
    typename OffsetsOutputIteratorT,
    typename LengthsOutputIteratorT,
    typename NumRunsIterator,
    typename OffsetT>
cudaError_t Dispatch(
    Int2Type<RLE> /*method*/,
    Int2Type<THRUST> /*dispatch_to*/,
    int timing_timing_iterations,
    size_t */*d_temp_storage_bytes*/,
    cudaError_t */*d_cdp_error*/,

    void *d_temp_storage,
    size_t &temp_storage_bytes,
    InputIteratorT d_in,
    UniqueOutputIteratorT d_unique_out,
    OffsetsOutputIteratorT /*d_offsets_out*/,
    LengthsOutputIteratorT d_lengths_out,
    NumRunsIterator d_num_runs,
    cub::Equality /*equality_op*/,
    OffsetT num_items,
    cudaStream_t /*stream*/,
    bool /*debug_synchronous*/)
{
    // The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

    // The unique-keys output value type:
    // if the output iterator's value type is void, fall back to the input
    // iterator's value type; otherwise use the output iterator's value type
    typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),
        typename std::iterator_traits<InputIteratorT>::value_type,
        typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type UniqueT;

    // The lengths output value type:
    // if the output iterator's value type is void, fall back to OffsetT;
    // otherwise use the output iterator's value type
    typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),
        OffsetT,
        typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;

    if (d_temp_storage == 0)
    {
        // Sizing pass: mimic CUB's two-phase interface with a nominal
        // requirement (Thrust manages its own scratch storage internally)
        temp_storage_bytes = 1;
    }
    else
    {
        // Wrap raw device pointers for Thrust
        thrust::device_ptr<InputT> d_in_wrapper(d_in);
        thrust::device_ptr<UniqueT> d_unique_out_wrapper(d_unique_out);
        thrust::device_ptr<LengthT> d_lengths_out_wrapper(d_lengths_out);

        thrust::pair<thrust::device_ptr<UniqueT>, thrust::device_ptr<LengthT> > d_out_ends;

        // Constant "1" value for each key, so reduce_by_key sums to run lengths
        LengthT one_val;
        InitValue(INTEGER_SEED, one_val, 1);
        thrust::constant_iterator<LengthT> constant_one(one_val);

        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            d_out_ends = thrust::reduce_by_key(
                d_in_wrapper,
                d_in_wrapper + num_items,
                constant_one,
                d_unique_out_wrapper,
                d_lengths_out_wrapper);
        }

        // Number of runs = distance from the start of the unique-keys output
        // to the returned end iterator; publish it to the device
        OffsetT num_runs = OffsetT(d_out_ends.first - d_unique_out_wrapper);
        CubDebugExit(cudaMemcpy(d_num_runs, &num_runs, sizeof(OffsetT), cudaMemcpyHostToDevice));
    }

    return cudaSuccess;
}
260
261
262
263 //---------------------------------------------------------------------
264 // CUDA Nested Parallelism Test Kernel
265 //---------------------------------------------------------------------
266
/**
 * Simple wrapper kernel to invoke DeviceRunLengthEncode from device code
 * (CUDA dynamic parallelism).  Launched <<<1,1>>>.  Writes the dispatch
 * status to d_cdp_error and, when CDP is compiled in, the queried
 * temp-storage requirement to d_temp_storage_bytes.
 */
template <
    int RLE_METHOD,
    typename InputIteratorT,
    typename UniqueOutputIteratorT,
    typename OffsetsOutputIteratorT,
    typename LengthsOutputIteratorT,
    typename NumRunsIterator,
    typename EqualityOp,
    typename OffsetT>
__global__ void CnpDispatchKernel(
    Int2Type<RLE_METHOD> method,
    int timing_timing_iterations,
    size_t *d_temp_storage_bytes,
    cudaError_t *d_cdp_error,

    void* d_temp_storage,
    size_t temp_storage_bytes,
    InputIteratorT d_in,
    UniqueOutputIteratorT d_unique_out,
    OffsetsOutputIteratorT d_offsets_out,
    LengthsOutputIteratorT d_lengths_out,
    NumRunsIterator d_num_runs,
    cub::Equality equality_op,
    OffsetT num_items,
    cudaStream_t stream,
    bool debug_synchronous)
{

#ifndef CUB_CDP
    // Dynamic parallelism not compiled in: report "not supported"
    *d_cdp_error = cudaErrorNotSupported;
#else
    // Device-side dispatch to the CUB entrypoint (on the default stream)
    *d_cdp_error = Dispatch(method, Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, debug_synchronous);

    // Publish the temp-storage requirement back for the host to read
    *d_temp_storage_bytes = temp_storage_bytes;
#endif
}
307
308
/**
 * Dispatch to CDP kernel: launches a single-thread kernel that performs the
 * dispatch on the device, then copies back the temp-storage requirement and
 * the device-side error code (the blocking cudaMemcpy also synchronizes
 * with the kernel launch).
 */
template <
    int RLE_METHOD,
    typename InputIteratorT,
    typename UniqueOutputIteratorT,
    typename OffsetsOutputIteratorT,
    typename LengthsOutputIteratorT,
    typename NumRunsIterator,
    typename EqualityOp,
    typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<RLE_METHOD> method,
    Int2Type<CDP> dispatch_to,
    int timing_timing_iterations,
    size_t *d_temp_storage_bytes,
    cudaError_t *d_cdp_error,

    void* d_temp_storage,
    size_t &temp_storage_bytes,
    InputIteratorT d_in,
    UniqueOutputIteratorT d_unique_out,
    OffsetsOutputIteratorT d_offsets_out,
    LengthsOutputIteratorT d_lengths_out,
    NumRunsIterator d_num_runs,
    EqualityOp equality_op,
    OffsetT num_items,
    cudaStream_t stream,
    bool debug_synchronous)
{
    // Invoke kernel to invoke device-side dispatch
    CnpDispatchKernel<<<1,1>>>(method, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, debug_synchronous);

    // Copy out temp_storage_bytes (written by the kernel during the sizing pass)
    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));

    // Copy out the error code produced by the device-side dispatch
    cudaError_t retval;
    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
    return retval;
}
353
354
355
356 //---------------------------------------------------------------------
357 // Test generation
358 //---------------------------------------------------------------------
359
360
/**
 * Initialize problem: fill h_in with consecutive runs of equal keys.
 *
 * entropy_reduction biases RandomBits toward smaller words (more AND rounds),
 * which skews run lengths shorter.  max_segment bounds each run's length:
 *   max_segment < 0  -> a single run covering all items
 *   max_segment < 2  -> every item is its own run
 *   otherwise        -> random length scaled into [1, max_segment]
 */
template <typename T>
void Initialize(
    int entropy_reduction,
    T *h_in,
    int num_items,
    int max_segment)
{
    unsigned int max_int = (unsigned int) -1;

    int key = 0;
    int i = 0;
    while (i < num_items)
    {
        // Select number of repeating occurrences for the current run
        int repeat;
        if (max_segment < 0)
        {
            // One run for the whole input
            repeat = num_items;
        }
        else if (max_segment < 2)
        {
            repeat = 1;
        }
        else
        {
            // Scale a random word into [0, max_segment], clamped to at least 1
            RandomBits(repeat, entropy_reduction);
            repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int));
            repeat = CUB_MAX(1, repeat);
        }

        // Emit `repeat` copies of the current key (truncated at num_items)
        int j = i;
        while (j < CUB_MIN(i + repeat, num_items))
        {
            InitValue(INTEGER_SEED, h_in[j], key);
            j++;
        }

        i = j;
        key++;
    }

    if (g_verbose)
    {
        printf("Input:\n");
        DisplayResults(h_in, num_items);
        printf("\n\n");
    }
}
412
413
/**
 * Solve the problem on the host.  Returns the total number of runs identified.
 *
 * For each run it records the run's value, starting offset, and length into
 * the corresponding reference arrays.  When RLE_METHOD == NON_TRIVIAL, only
 * runs of length > 1 are recorded; otherwise every run (including trivial
 * length-1 runs) is recorded.
 */
template <
    RleMethod RLE_METHOD,
    typename InputIteratorT,
    typename T,
    typename OffsetT,
    typename LengthT,
    typename EqualityOp>
int Solve(
    InputIteratorT h_in,
    T *h_unique_reference,
    OffsetT *h_offsets_reference,
    LengthT *h_lengths_reference,
    EqualityOp equality_op,
    int num_items)
{
    // Empty input has no runs
    if (num_items == 0)
        return 0;

    // First item opens the first run
    T previous = h_in[0];
    LengthT length = 1;
    int num_runs = 0;
    int run_begin = 0;

    // Subsequent items
    for (int i = 1; i < num_items; ++i)
    {
        if (!equality_op(previous, h_in[i]))
        {
            // Close the current run (skip length-1 runs for NON_TRIVIAL)
            if ((RLE_METHOD != NON_TRIVIAL) || (length > 1))
            {
                h_unique_reference[num_runs] = previous;
                h_offsets_reference[num_runs] = run_begin;
                h_lengths_reference[num_runs] = length;
                num_runs++;
            }
            length = 1;
            run_begin = i;
        }
        else
        {
            length++;
        }
        previous = h_in[i];
    }

    // Flush the final run
    if ((RLE_METHOD != NON_TRIVIAL) || (length > 1))
    {
        h_unique_reference[num_runs] = previous;
        h_offsets_reference[num_runs] = run_begin;
        h_lengths_reference[num_runs] = length;
        num_runs++;
    }

    return num_runs;
}
473
474
475
/**
 * Test DeviceRunLengthEncode for a given problem input.
 *
 * Allocates device output arrays, sizes/allocates temporary storage, runs a
 * warmup/correctness pass verified against the host reference solution, then
 * runs g_timing_iterations timed invocations and reports throughput.
 * Asserts on any comparison failure (after freeing device resources).
 */
template <
    RleMethod RLE_METHOD,
    Backend BACKEND,
    typename DeviceInputIteratorT,
    typename T,
    typename OffsetT,
    typename LengthT,
    typename EqualityOp>
void Test(
    DeviceInputIteratorT d_in,
    T *h_unique_reference,
    OffsetT *h_offsets_reference,
    LengthT *h_lengths_reference,
    EqualityOp equality_op,
    int num_runs,
    int num_items)
{
    // Allocate device output arrays and number of segments.
    // Fix: d_offsets_out is OffsetT* and d_lengths_out is LengthT* -- the
    // declarations previously had the two element types swapped relative to
    // their allocations, memsets, and host reference arrays.
    T* d_unique_out = NULL;
    OffsetT* d_offsets_out = NULL;
    LengthT* d_lengths_out = NULL;
    int* d_num_runs = NULL;

    if (RLE_METHOD == RLE)
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_unique_out, sizeof(T) * num_items));
    if (RLE_METHOD == NON_TRIVIAL)
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_offsets_out, sizeof(OffsetT) * num_items));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_lengths_out, sizeof(LengthT) * num_items));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int)));

    // Allocate CDP device arrays (temp-storage size and error code written by
    // the device-side dispatch path)
    size_t* d_temp_storage_bytes = NULL;
    cudaError_t* d_cdp_error = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));

    // Query temp-storage requirement, then allocate it
    void* d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, true));
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

    // Clear device output arrays
    if (RLE_METHOD == RLE)
        CubDebugExit(cudaMemset(d_unique_out, 0, sizeof(T) * num_items));
    if (RLE_METHOD == NON_TRIVIAL)
        CubDebugExit(cudaMemset(d_offsets_out, 0, sizeof(OffsetT) * num_items));
    CubDebugExit(cudaMemset(d_lengths_out, 0, sizeof(LengthT) * num_items));
    CubDebugExit(cudaMemset(d_num_runs, 0, sizeof(int)));

    // Run warmup/correctness iteration
    CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, true));

    // Check for correctness (and display results, if specified)
    int compare0 = 0;
    int compare1 = 0;
    int compare2 = 0;
    int compare3 = 0;

    if (RLE_METHOD == RLE)
    {
        compare0 = CompareDeviceResults(h_unique_reference, d_unique_out, num_runs, true, g_verbose);
        printf("\t Keys %s\n", compare0 ? "FAIL" : "PASS");
    }

    if (RLE_METHOD != RLE)
    {
        compare1 = CompareDeviceResults(h_offsets_reference, d_offsets_out, num_runs, true, g_verbose);
        printf("\t Offsets %s\n", compare1 ? "FAIL" : "PASS");
    }

    if (RLE_METHOD != CSR)
    {
        compare2 = CompareDeviceResults(h_lengths_reference, d_lengths_out, num_runs, true, g_verbose);
        printf("\t Lengths %s\n", compare2 ? "FAIL" : "PASS");
    }

    compare3 = CompareDeviceResults(&num_runs, d_num_runs, 1, true, g_verbose);
    printf("\t Count %s\n", compare3 ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();
    CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, false));
    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        // Fix: keep the byte count in size_t (the previous int could overflow
        // for large num_items * sizeof(T))
        size_t bytes_moved = (num_items * sizeof(T)) + (num_runs * (sizeof(OffsetT) + sizeof(LengthT)));
        float giga_bandwidth = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
    }
    printf("\n\n");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Cleanup
    if (d_unique_out) CubDebugExit(g_allocator.DeviceFree(d_unique_out));
    if (d_offsets_out) CubDebugExit(g_allocator.DeviceFree(d_offsets_out));
    if (d_lengths_out) CubDebugExit(g_allocator.DeviceFree(d_lengths_out));
    if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    // Correctness asserts
    AssertEquals(0, compare0 | compare1 | compare2 | compare3);
}
597
598
/**
 * Test DeviceRunLengthEncode on pointer type.
 *
 * Generates a host problem with the requested entropy/run-length profile,
 * solves it on the host for reference, copies the input to the device, and
 * runs Test().
 */
template <
    RleMethod RLE_METHOD,
    Backend BACKEND,
    typename T,
    typename OffsetT,
    typename LengthT>
void TestPointer(
    int num_items,
    int entropy_reduction,
    int max_segment)
{
    // Allocate host arrays
    T* h_in = new T[num_items];
    T* h_unique_reference = new T[num_items];
    OffsetT* h_offsets_reference = new OffsetT[num_items];
    LengthT* h_lengths_reference = new LengthT[num_items];

    // Pre-fill offsets so entries beyond the solved runs are deterministic
    for (int i = 0; i < num_items; ++i)
        InitValue(INTEGER_SEED, h_offsets_reference[i], 1);

    // Initialize problem and solution
    Equality equality_op;
    Initialize(entropy_reduction, h_in, num_items, max_segment);

    int num_runs = Solve<RLE_METHOD>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_items);

    // Fix: guard the average-run-length computation against num_runs == 0
    // (empty input), which previously divided by zero and printed NaN
    float avg_run_length = (num_runs > 0) ? float(num_items) / float(num_runs) : 0.0f;

    printf("\nPointer %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}, max_segment %d, entropy_reduction %d\n",
        (RLE_METHOD == RLE) ? "DeviceReduce::RunLengthEncode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other",
        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
        num_items, num_runs, avg_run_length,
        typeid(T).name(), typeid(OffsetT).name(), typeid(LengthT).name(),
        max_segment, entropy_reduction);
    fflush(stdout);

    // Allocate problem device arrays
    T* d_in = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items));

    // Initialize device input
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));

    // Run Test
    Test<RLE_METHOD, BACKEND>(d_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_runs, num_items);

    // Cleanup
    if (h_in) delete[] h_in;
    if (h_unique_reference) delete[] h_unique_reference;
    if (h_offsets_reference) delete[] h_offsets_reference;
    if (h_lengths_reference) delete[] h_lengths_reference;
    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
}
653
654
/**
 * Test on iterator type: a constant input iterator, so the entire input is a
 * single run of one repeated value.  Primitive-type overload.
 */
template <
    RleMethod RLE_METHOD,
    Backend BACKEND,
    typename T,
    typename OffsetT,
    typename LengthT>
void TestIterator(
    int num_items,
    Int2Type<true> /*is_primitive*/)
{
    // Build the constant input: every item carries the same value
    T run_value;
    InitValue(INTEGER_SEED, run_value, 1);
    ConstantInputIterator<T, int> h_in(run_value);

    // Host reference arrays
    T* h_unique_reference = new T[num_items];
    OffsetT* h_offsets_reference = new OffsetT[num_items];
    LengthT* h_lengths_reference = new LengthT[num_items];

    // Solve on the host for reference
    Equality equality_op;
    int num_runs = Solve<RLE_METHOD>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_items);

    printf("\nIterator %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}\n",
        (RLE_METHOD == RLE) ? "DeviceReduce::RunLengthEncode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other",
        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
        num_items, num_runs, float(num_items) / num_runs,
        typeid(T).name(), typeid(OffsetT).name(), typeid(LengthT).name());
    fflush(stdout);

    // Run Test
    Test<RLE_METHOD, BACKEND>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_runs, num_items);

    // Cleanup
    delete[] h_lengths_reference;
    delete[] h_offsets_reference;
    delete[] h_unique_reference;
}
696
697
// Non-primitive-type overload: iterator tests only apply to primitive types
// (ConstantInputIterator requires them), so this is a no-op.
template <
    RleMethod RLE_METHOD,
    Backend BACKEND,
    typename T,
    typename OffsetT,
    typename LengthT>
void TestIterator(
    int /*num_items*/,
    Int2Type<false> /*is_primitive*/)
{}
708
709
/**
 * Test different generation modes: a single long run (iterator), one run per
 * item, and a sweep of increasing maximum run lengths at two entropy levels.
 */
template <
    RleMethod RLE_METHOD,
    Backend BACKEND,
    typename T,
    typename OffsetT,
    typename LengthT>
void Test(
    int num_items)
{
    // Test iterator (one run)
    TestIterator<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, Int2Type<Traits<T>::PRIMITIVE>());

    // num_items runs (every item its own run)
    TestPointer<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, 0, 1);

    // Evaluate different run lengths (capped at 65535)
    const int segment_limit = CUB_MIN(num_items, (unsigned short) -1);
    for (int max_segment = 3; max_segment < segment_limit; max_segment *= 3)
    {
        // Uniform selection run length
        TestPointer<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, 0, max_segment);

        // Reduced-entropy run length
        TestPointer<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, 4, max_segment);
    }
}
738
739
/**
 * Test different dispatch methods: host-side CUB dispatch for both RLE
 * variants, plus device-side (dynamic parallelism) dispatch when compiled
 * with CUB_CDP.
 */
template <
    typename T,
    typename OffsetT,
    typename LengthT>
void TestDispatch(
    int num_items)
{
    Test<RLE, CUB, T, OffsetT, LengthT>(num_items);
    Test<NON_TRIVIAL, CUB, T, OffsetT, LengthT>(num_items);

#ifdef CUB_CDP
    Test<RLE, CDP, T, OffsetT, LengthT>(num_items);
    Test<NON_TRIVIAL, CDP, T, OffsetT, LengthT>(num_items);
#endif
}
758
759
/**
 * Test different input sizes.
 *
 * A negative num_items requests the full sweep: several fixed sizes plus ten
 * randomly drawn sizes in [1, 10,000,000].  Otherwise only the given size is
 * tested.
 */
template <
    typename T,
    typename OffsetT,
    typename LengthT>
void TestSize(
    int num_items)
{
    if (num_items < 0)
    {
        TestDispatch<T, OffsetT, LengthT>(0);
        TestDispatch<T, OffsetT, LengthT>(1);
        TestDispatch<T, OffsetT, LengthT>(100);
        TestDispatch<T, OffsetT, LengthT>(10000);
        TestDispatch<T, OffsetT, LengthT>(1000000);

        // Randomly select problem size between 1:10,000,000
        unsigned int max_int = (unsigned int) -1;
        for (int i = 0; i < 10; ++i)
        {
            // Fix: use a distinct name instead of shadowing the num_items parameter
            unsigned int random_items;
            RandomBits(random_items);
            random_items = (unsigned int) ((double(random_items) * double(10000000)) / double(max_int));
            random_items = CUB_MAX(1, random_items);
            TestDispatch<T, OffsetT, LengthT>(random_items);
        }
    }
    else
    {
        TestDispatch<T, OffsetT, LengthT>(num_items);
    }

}
795
796
797 //---------------------------------------------------------------------
798 // Main
799 //---------------------------------------------------------------------
800
/**
 * Main: parses the command line, initializes the device, and runs either the
 * quick smoke tests (QUICKER_TEST / QUICK_TEST compile-time switches) or the
 * full type/size sweep.
 */
int main(int argc, char** argv)
{
    // Defaults: num_items < 0 triggers the full size sweep in TestSize
    int num_items = -1;
    int entropy_reduction = 0;
    int max_segment = 1000;

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("n", num_items);
    args.GetCmdLineArgument("i", g_timing_iterations);
    args.GetCmdLineArgument("repeat", g_repeat);
    args.GetCmdLineArgument("maxseg", max_segment);
    args.GetCmdLineArgument("entropy", entropy_reduction);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--n=<input items> "
            "[--i=<timing iterations> "
            "[--device=<device-id>] "
            "[--maxseg=<max segment length>]"
            "[--entropy=<segment length bit entropy reduction rounds>]"
            "[--repeat=<repetitions of entire test suite>]"
            "[--v] "
            "[--cdp]"
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());
    printf("\n");

    // Get ptx version
    int ptx_version = 0;
    CubDebugExit(PtxVersion(ptx_version));

#ifdef QUICKER_TEST

    // Compile/run basic CUB test
    if (num_items < 0) num_items = 32000000;

    TestPointer<RLE, CUB, int, int, int>( num_items, entropy_reduction, max_segment);
    TestPointer<NON_TRIVIAL, CUB, int, int, int>( num_items, entropy_reduction, max_segment);
    TestIterator<RLE, CUB, float, int, int>( num_items, Int2Type<Traits<float>::PRIMITIVE>());


#elif defined(QUICK_TEST)

    // Compile/run quick tests (CUB vs. the equivalent Thrust implementation)
    if (num_items < 0) num_items = 32000000;

    TestPointer<RLE, CUB, int, int, int>( num_items, entropy_reduction, max_segment);
    TestPointer<RLE, THRUST, int, int, int>( num_items, entropy_reduction, max_segment);

#else

    // Compile/run thorough tests
    for (int i = 0; i <= g_repeat; ++i)
    {
        // Test different input types
        TestSize<char, int, int>(num_items);
        TestSize<short, int, int>(num_items);
        TestSize<int, int, int>(num_items);
        TestSize<long, int, int>(num_items);
        TestSize<long long, int, int>(num_items);
        TestSize<float, int, int>(num_items);
        TestSize<double, int, int>(num_items);

        TestSize<uchar2, int, int>(num_items);
        TestSize<uint2, int, int>(num_items);
        TestSize<uint3, int, int>(num_items);
        TestSize<uint4, int, int>(num_items);
        TestSize<ulonglong4, int, int>(num_items);
        TestSize<TestFoo, int, int>(num_items);
        TestSize<TestBar, int, int>(num_items);
    }

#endif

    return 0;
}
888
889
890
891