1 /******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
29 /******************************************************************************
30 * Test of DeviceReduce::ReduceByKey utilities
31 ******************************************************************************/
32
33 // Ensure printing of CUDA runtime errors to console
34 #define CUB_STDERR
35
36 #include <stdio.h>
37 #include <typeinfo>
38
39 #include <thrust/device_ptr.h>
40 #include <thrust/reduce.h>
41 #include <thrust/iterator/constant_iterator.h>
42
43 #include <cub/util_allocator.cuh>
44 #include <cub/iterator/constant_input_iterator.cuh>
45 #include <cub/device/device_reduce.cuh>
46 #include <cub/device/device_run_length_encode.cuh>
47 #include <cub/thread/thread_operators.cuh>
48
49 #include "test_util.h"
50
51 using namespace cub;
52
53
54 //---------------------------------------------------------------------
55 // Globals, constants and typedefs
56 //---------------------------------------------------------------------
57
58 bool g_verbose = false;
59 int g_timing_iterations = 0;
60 int g_repeat = 0;
61 CachingDeviceAllocator g_allocator(true);
62
63 // Dispatch types
64 enum Backend
65 {
66 CUB, // CUB method
67 THRUST, // Thrust method
68 CDP, // GPU-based (dynamic parallelism) dispatch to CUB method
69 };
70
71
72 //---------------------------------------------------------------------
73 // Dispatch to different CUB entrypoints
74 //---------------------------------------------------------------------
75
76 /**
77 * Dispatch to reduce-by-key entrypoint
78 */
79 template <
80 typename KeyInputIteratorT,
81 typename KeyOutputIteratorT,
82 typename ValueInputIteratorT,
83 typename ValueOutputIteratorT,
84 typename NumRunsIteratorT,
85 typename EqualityOpT,
86 typename ReductionOpT,
87 typename OffsetT>
88 CUB_RUNTIME_FUNCTION __forceinline__
Dispatch(Int2Type<CUB>,int timing_timing_iterations,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,KeyInputIteratorT d_keys_in,KeyOutputIteratorT d_keys_out,ValueInputIteratorT d_values_in,ValueOutputIteratorT d_values_out,NumRunsIteratorT d_num_runs,EqualityOpT,ReductionOpT reduction_op,OffsetT num_items,cudaStream_t stream,bool debug_synchronous)89 cudaError_t Dispatch(
90 Int2Type<CUB> /*dispatch_to*/,
91 int timing_timing_iterations,
92 size_t */*d_temp_storage_bytes*/,
93 cudaError_t */*d_cdp_error*/,
94
95 void *d_temp_storage,
96 size_t &temp_storage_bytes,
97 KeyInputIteratorT d_keys_in,
98 KeyOutputIteratorT d_keys_out,
99 ValueInputIteratorT d_values_in,
100 ValueOutputIteratorT d_values_out,
101 NumRunsIteratorT d_num_runs,
102 EqualityOpT /*equality_op*/,
103 ReductionOpT reduction_op,
104 OffsetT num_items,
105 cudaStream_t stream,
106 bool debug_synchronous)
107 {
108 cudaError_t error = cudaSuccess;
109 for (int i = 0; i < timing_timing_iterations; ++i)
110 {
111 error = DeviceReduce::ReduceByKey(
112 d_temp_storage,
113 temp_storage_bytes,
114 d_keys_in,
115 d_keys_out,
116 d_values_in,
117 d_values_out,
118 d_num_runs,
119 reduction_op,
120 num_items,
121 stream,
122 debug_synchronous);
123 }
124 return error;
125 }
126
127
128 //---------------------------------------------------------------------
129 // Dispatch to different Thrust entrypoints
130 //---------------------------------------------------------------------
131
132 /**
133 * Dispatch to reduce-by-key entrypoint
134 */
135 template <
136 typename KeyInputIteratorT,
137 typename KeyOutputIteratorT,
138 typename ValueInputIteratorT,
139 typename ValueOutputIteratorT,
140 typename NumRunsIteratorT,
141 typename EqualityOpT,
142 typename ReductionOpT,
143 typename OffsetT>
Dispatch(Int2Type<THRUST>,int timing_timing_iterations,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,KeyInputIteratorT d_keys_in,KeyOutputIteratorT d_keys_out,ValueInputIteratorT d_values_in,ValueOutputIteratorT d_values_out,NumRunsIteratorT d_num_runs,EqualityOpT,ReductionOpT,OffsetT num_items,cudaStream_t,bool)144 cudaError_t Dispatch(
145 Int2Type<THRUST> /*dispatch_to*/,
146 int timing_timing_iterations,
147 size_t */*d_temp_storage_bytes*/,
148 cudaError_t */*d_cdp_error*/,
149
150 void *d_temp_storage,
151 size_t &temp_storage_bytes,
152 KeyInputIteratorT d_keys_in,
153 KeyOutputIteratorT d_keys_out,
154 ValueInputIteratorT d_values_in,
155 ValueOutputIteratorT d_values_out,
156 NumRunsIteratorT d_num_runs,
157 EqualityOpT /*equality_op*/,
158 ReductionOpT /*reduction_op*/,
159 OffsetT num_items,
160 cudaStream_t /*stream*/,
161 bool /*debug_synchronous*/)
162 {
163 // The input keys type
164 typedef typename std::iterator_traits<KeyInputIteratorT>::value_type KeyInputT;
165
166 // The output keys type
167 typedef typename If<(Equals<typename std::iterator_traits<KeyOutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
168 typename std::iterator_traits<KeyInputIteratorT>::value_type, // ... then the input iterator's value type,
169 typename std::iterator_traits<KeyOutputIteratorT>::value_type>::Type KeyOutputT; // ... else the output iterator's value type
170
171 // The input values type
172 typedef typename std::iterator_traits<ValueInputIteratorT>::value_type ValueInputT;
173
174 // The output values type
175 typedef typename If<(Equals<typename std::iterator_traits<ValueOutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
176 typename std::iterator_traits<ValueInputIteratorT>::value_type, // ... then the input iterator's value type,
177 typename std::iterator_traits<ValueOutputIteratorT>::value_type>::Type ValueOuputT; // ... else the output iterator's value type
178
179 if (d_temp_storage == 0)
180 {
181 temp_storage_bytes = 1;
182 }
183 else
184 {
185 thrust::device_ptr<KeyInputT> d_keys_in_wrapper(d_keys_in);
186 thrust::device_ptr<KeyOutputT> d_keys_out_wrapper(d_keys_out);
187
188 thrust::device_ptr<ValueInputT> d_values_in_wrapper(d_values_in);
189 thrust::device_ptr<ValueOuputT> d_values_out_wrapper(d_values_out);
190
191 thrust::pair<thrust::device_ptr<KeyOutputT>, thrust::device_ptr<ValueOuputT> > d_out_ends;
192
193 for (int i = 0; i < timing_timing_iterations; ++i)
194 {
195 d_out_ends = thrust::reduce_by_key(
196 d_keys_in_wrapper,
197 d_keys_in_wrapper + num_items,
198 d_values_in_wrapper,
199 d_keys_out_wrapper,
200 d_values_out_wrapper);
201 }
202
203 OffsetT num_segments = OffsetT(d_out_ends.first - d_keys_out_wrapper);
204 CubDebugExit(cudaMemcpy(d_num_runs, &num_segments, sizeof(OffsetT), cudaMemcpyHostToDevice));
205
206 }
207
208 return cudaSuccess;
209 }
210
211
212
213 //---------------------------------------------------------------------
214 // CUDA Nested Parallelism Test Kernel
215 //---------------------------------------------------------------------
216
217 /**
218 * Simple wrapper kernel to invoke DeviceSelect
219 */
220 template <
221 typename KeyInputIteratorT,
222 typename KeyOutputIteratorT,
223 typename ValueInputIteratorT,
224 typename ValueOutputIteratorT,
225 typename NumRunsIteratorT,
226 typename EqualityOpT,
227 typename ReductionOpT,
228 typename OffsetT>
CnpDispatchKernel(int timing_timing_iterations,size_t * d_temp_storage_bytes,cudaError_t * d_cdp_error,void * d_temp_storage,size_t temp_storage_bytes,KeyInputIteratorT d_keys_in,KeyOutputIteratorT d_keys_out,ValueInputIteratorT d_values_in,ValueOutputIteratorT d_values_out,NumRunsIteratorT d_num_runs,EqualityOpT equality_op,ReductionOpT reduction_op,OffsetT num_items,cudaStream_t stream,bool debug_synchronous)229 __global__ void CnpDispatchKernel(
230 int timing_timing_iterations,
231 size_t *d_temp_storage_bytes,
232 cudaError_t *d_cdp_error,
233
234 void *d_temp_storage,
235 size_t temp_storage_bytes,
236 KeyInputIteratorT d_keys_in,
237 KeyOutputIteratorT d_keys_out,
238 ValueInputIteratorT d_values_in,
239 ValueOutputIteratorT d_values_out,
240 NumRunsIteratorT d_num_runs,
241 EqualityOpT equality_op,
242 ReductionOpT reduction_op,
243 OffsetT num_items,
244 cudaStream_t stream,
245 bool debug_synchronous)
246 {
247
248 #ifndef CUB_CDP
249 *d_cdp_error = cudaErrorNotSupported;
250 #else
251 *d_cdp_error = Dispatch(Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
252 d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, debug_synchronous);
253
254 *d_temp_storage_bytes = temp_storage_bytes;
255 #endif
256 }
257
258
259 /**
260 * Dispatch to CDP kernel
261 */
262 template <
263 typename KeyInputIteratorT,
264 typename KeyOutputIteratorT,
265 typename ValueInputIteratorT,
266 typename ValueOutputIteratorT,
267 typename NumRunsIteratorT,
268 typename EqualityOpT,
269 typename ReductionOpT,
270 typename OffsetT>
271 CUB_RUNTIME_FUNCTION __forceinline__
Dispatch(Int2Type<CDP> dispatch_to,int timing_timing_iterations,size_t * d_temp_storage_bytes,cudaError_t * d_cdp_error,void * d_temp_storage,size_t & temp_storage_bytes,KeyInputIteratorT d_keys_in,KeyOutputIteratorT d_keys_out,ValueInputIteratorT d_values_in,ValueOutputIteratorT d_values_out,NumRunsIteratorT d_num_runs,EqualityOpT equality_op,ReductionOpT reduction_op,OffsetT num_items,cudaStream_t stream,bool debug_synchronous)272 cudaError_t Dispatch(
273 Int2Type<CDP> dispatch_to,
274 int timing_timing_iterations,
275 size_t *d_temp_storage_bytes,
276 cudaError_t *d_cdp_error,
277
278 void *d_temp_storage,
279 size_t &temp_storage_bytes,
280 KeyInputIteratorT d_keys_in,
281 KeyOutputIteratorT d_keys_out,
282 ValueInputIteratorT d_values_in,
283 ValueOutputIteratorT d_values_out,
284 NumRunsIteratorT d_num_runs,
285 EqualityOpT equality_op,
286 ReductionOpT reduction_op,
287 OffsetT num_items,
288 cudaStream_t stream,
289 bool debug_synchronous)
290 {
291 // Invoke kernel to invoke device-side dispatch
292 CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
293 d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, debug_synchronous);
294
295 // Copy out temp_storage_bytes
296 CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
297
298 // Copy out error
299 cudaError_t retval;
300 CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
301 return retval;
302 }
303
304
305
306 //---------------------------------------------------------------------
307 // Test generation
308 //---------------------------------------------------------------------
309
310
311 /**
312 * Initialize problem
313 */
314 template <typename T>
Initialize(int entropy_reduction,T * h_in,int num_items,int max_segment)315 void Initialize(
316 int entropy_reduction,
317 T *h_in,
318 int num_items,
319 int max_segment)
320 {
321 unsigned int max_int = (unsigned int) -1;
322
323 int key = 0;
324 int i = 0;
325 while (i < num_items)
326 {
327 // Select number of repeating occurrences
328
329 int repeat;
330
331 if (max_segment < 0)
332 {
333 repeat = num_items;
334 }
335 else if (max_segment < 2)
336 {
337 repeat = 1;
338 }
339 else
340 {
341 RandomBits(repeat, entropy_reduction);
342 repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int));
343 repeat = CUB_MAX(1, repeat);
344 }
345
346 int j = i;
347 while (j < CUB_MIN(i + repeat, num_items))
348 {
349 InitValue(INTEGER_SEED, h_in[j], key);
350 j++;
351 }
352
353 i = j;
354 key++;
355 }
356
357 if (g_verbose)
358 {
359 printf("Input:\n");
360 DisplayResults(h_in, num_items);
361 printf("\n\n");
362 }
363 }
364
365
366 /**
367 * Solve problem. Returns total number of segments identified
368 */
369 template <
370 typename KeyInputIteratorT,
371 typename ValueInputIteratorT,
372 typename KeyT,
373 typename ValueT,
374 typename EqualityOpT,
375 typename ReductionOpT>
Solve(KeyInputIteratorT h_keys_in,KeyT * h_keys_reference,ValueInputIteratorT h_values_in,ValueT * h_values_reference,EqualityOpT equality_op,ReductionOpT reduction_op,int num_items)376 int Solve(
377 KeyInputIteratorT h_keys_in,
378 KeyT *h_keys_reference,
379 ValueInputIteratorT h_values_in,
380 ValueT *h_values_reference,
381 EqualityOpT equality_op,
382 ReductionOpT reduction_op,
383 int num_items)
384 {
385 // First item
386 KeyT previous = h_keys_in[0];
387 ValueT aggregate = h_values_in[0];
388 int num_segments = 0;
389
390 // Subsequent items
391 for (int i = 1; i < num_items; ++i)
392 {
393 if (!equality_op(previous, h_keys_in[i]))
394 {
395 h_keys_reference[num_segments] = previous;
396 h_values_reference[num_segments] = aggregate;
397 num_segments++;
398 aggregate = h_values_in[i];
399 }
400 else
401 {
402 aggregate = reduction_op(aggregate, h_values_in[i]);
403 }
404 previous = h_keys_in[i];
405 }
406
407 h_keys_reference[num_segments] = previous;
408 h_values_reference[num_segments] = aggregate;
409 num_segments++;
410
411 return num_segments;
412 }
413
414
415
416 /**
417 * Test DeviceSelect for a given problem input
418 */
419 template <
420 Backend BACKEND,
421 typename DeviceKeyInputIteratorT,
422 typename DeviceValueInputIteratorT,
423 typename KeyT,
424 typename ValueT,
425 typename EqualityOpT,
426 typename ReductionOpT>
Test(DeviceKeyInputIteratorT d_keys_in,DeviceValueInputIteratorT d_values_in,KeyT * h_keys_reference,ValueT * h_values_reference,EqualityOpT equality_op,ReductionOpT reduction_op,int num_segments,int num_items)427 void Test(
428 DeviceKeyInputIteratorT d_keys_in,
429 DeviceValueInputIteratorT d_values_in,
430 KeyT* h_keys_reference,
431 ValueT* h_values_reference,
432 EqualityOpT equality_op,
433 ReductionOpT reduction_op,
434 int num_segments,
435 int num_items)
436 {
437 // Allocate device output arrays and number of segments
438 KeyT* d_keys_out = NULL;
439 ValueT* d_values_out = NULL;
440 int* d_num_runs = NULL;
441 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_out, sizeof(KeyT) * num_items));
442 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_out, sizeof(ValueT) * num_items));
443 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int)));
444
445 // Allocate CDP device arrays
446 size_t *d_temp_storage_bytes = NULL;
447 cudaError_t *d_cdp_error = NULL;
448 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
449 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));
450
451 // Allocate temporary storage
452 void *d_temp_storage = NULL;
453 size_t temp_storage_bytes = 0;
454 CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, true));
455 CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
456
457 // Clear device output arrays
458 CubDebugExit(cudaMemset(d_keys_out, 0, sizeof(KeyT) * num_items));
459 CubDebugExit(cudaMemset(d_values_out, 0, sizeof(ValueT) * num_items));
460 CubDebugExit(cudaMemset(d_num_runs, 0, sizeof(int)));
461
462 // Run warmup/correctness iteration
463 CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, true));
464
465 // Check for correctness (and display results, if specified)
466 int compare1 = CompareDeviceResults(h_keys_reference, d_keys_out, num_segments, true, g_verbose);
467 printf("\t Keys %s ", compare1 ? "FAIL" : "PASS");
468
469 int compare2 = CompareDeviceResults(h_values_reference, d_values_out, num_segments, true, g_verbose);
470 printf("\t Values %s ", compare2 ? "FAIL" : "PASS");
471
472 int compare3 = CompareDeviceResults(&num_segments, d_num_runs, 1, true, g_verbose);
473 printf("\t Count %s ", compare3 ? "FAIL" : "PASS");
474
475 // Flush any stdout/stderr
476 fflush(stdout);
477 fflush(stderr);
478
479 // Performance
480 GpuTimer gpu_timer;
481 gpu_timer.Start();
482 CubDebugExit(Dispatch(Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, false));
483 gpu_timer.Stop();
484 float elapsed_millis = gpu_timer.ElapsedMillis();
485
486 // Display performance
487 if (g_timing_iterations > 0)
488 {
489 float avg_millis = elapsed_millis / g_timing_iterations;
490 float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
491 int bytes_moved = ((num_items + num_segments) * sizeof(KeyT)) + ((num_items + num_segments) * sizeof(ValueT));
492 float giga_bandwidth = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
493 printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
494 }
495 printf("\n\n");
496
497 // Flush any stdout/stderr
498 fflush(stdout);
499 fflush(stderr);
500
501 // Cleanup
502 if (d_keys_out) CubDebugExit(g_allocator.DeviceFree(d_keys_out));
503 if (d_values_out) CubDebugExit(g_allocator.DeviceFree(d_values_out));
504 if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
505 if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
506 if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
507 if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
508
509 // Correctness asserts
510 AssertEquals(0, compare1 | compare2 | compare3);
511 }
512
513
514 /**
515 * Test DeviceSelect on pointer type
516 */
517 template <
518 Backend BACKEND,
519 typename KeyT,
520 typename ValueT,
521 typename ReductionOpT>
TestPointer(int num_items,int entropy_reduction,int max_segment,ReductionOpT reduction_op)522 void TestPointer(
523 int num_items,
524 int entropy_reduction,
525 int max_segment,
526 ReductionOpT reduction_op)
527 {
528 // Allocate host arrays
529 KeyT* h_keys_in = new KeyT[num_items];
530 KeyT* h_keys_reference = new KeyT[num_items];
531
532 ValueT* h_values_in = new ValueT[num_items];
533 ValueT* h_values_reference = new ValueT[num_items];
534
535 for (int i = 0; i < num_items; ++i)
536 InitValue(INTEGER_SEED, h_values_in[i], 1);
537
538 // Initialize problem and solution
539 Equality equality_op;
540 Initialize(entropy_reduction, h_keys_in, num_items, max_segment);
541 int num_segments = Solve(h_keys_in, h_keys_reference, h_values_in, h_values_reference, equality_op, reduction_op, num_items);
542
543 printf("\nPointer %s cub::DeviceReduce::ReduceByKey %s reduction of %d items, %d segments (avg run length %.3f), {%s,%s} key value pairs, max_segment %d, entropy_reduction %d\n",
544 (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
545 (Equals<ReductionOpT, Sum>::VALUE) ? "Sum" : "Max",
546 num_items, num_segments, float(num_items) / num_segments,
547 typeid(KeyT).name(), typeid(ValueT).name(),
548 max_segment, entropy_reduction);
549 fflush(stdout);
550
551 // Allocate problem device arrays
552 KeyT *d_keys_in = NULL;
553 ValueT *d_values_in = NULL;
554 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, sizeof(KeyT) * num_items));
555 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_in, sizeof(ValueT) * num_items));
556
557 // Initialize device input
558 CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
559 CubDebugExit(cudaMemcpy(d_values_in, h_values_in, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
560
561 // Run Test
562 Test<BACKEND>(d_keys_in, d_values_in, h_keys_reference, h_values_reference, equality_op, reduction_op, num_segments, num_items);
563
564 // Cleanup
565 if (h_keys_in) delete[] h_keys_in;
566 if (h_values_in) delete[] h_values_in;
567 if (h_keys_reference) delete[] h_keys_reference;
568 if (h_values_reference) delete[] h_values_reference;
569 if (d_keys_in) CubDebugExit(g_allocator.DeviceFree(d_keys_in));
570 if (d_values_in) CubDebugExit(g_allocator.DeviceFree(d_values_in));
571 }
572
573
574 /**
575 * Test on iterator type
576 */
577 template <
578 Backend BACKEND,
579 typename KeyT,
580 typename ValueT,
581 typename ReductionOpT>
TestIterator(int num_items,int entropy_reduction,int max_segment,ReductionOpT reduction_op)582 void TestIterator(
583 int num_items,
584 int entropy_reduction,
585 int max_segment,
586 ReductionOpT reduction_op)
587 {
588 // Allocate host arrays
589 KeyT* h_keys_in = new KeyT[num_items];
590 KeyT* h_keys_reference = new KeyT[num_items];
591
592 ValueT one_val;
593 InitValue(INTEGER_SEED, one_val, 1);
594 ConstantInputIterator<ValueT, int> h_values_in(one_val);
595 ValueT* h_values_reference = new ValueT[num_items];
596
597 // Initialize problem and solution
598 Equality equality_op;
599 Initialize(entropy_reduction, h_keys_in, num_items, max_segment);
600 int num_segments = Solve(h_keys_in, h_keys_reference, h_values_in, h_values_reference, equality_op, reduction_op, num_items);
601
602 printf("\nIterator %s cub::DeviceReduce::ReduceByKey %s reduction of %d items, %d segments (avg run length %.3f), {%s,%s} key value pairs, max_segment %d, entropy_reduction %d\n",
603 (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
604 (Equals<ReductionOpT, Sum>::VALUE) ? "Sum" : "Max",
605 num_items, num_segments, float(num_items) / num_segments,
606 typeid(KeyT).name(), typeid(ValueT).name(),
607 max_segment, entropy_reduction);
608 fflush(stdout);
609
610 // Allocate problem device arrays
611 KeyT *d_keys_in = NULL;
612 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, sizeof(KeyT) * num_items));
613
614 // Initialize device input
615 CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
616
617 // Run Test
618 Test<BACKEND>(d_keys_in, h_values_in, h_keys_reference, h_values_reference, equality_op, reduction_op, num_segments, num_items);
619
620 // Cleanup
621 if (h_keys_in) delete[] h_keys_in;
622 if (h_keys_reference) delete[] h_keys_reference;
623 if (h_values_reference) delete[] h_values_reference;
624 if (d_keys_in) CubDebugExit(g_allocator.DeviceFree(d_keys_in));
625 }
626
627
628 /**
629 * Test different gen modes
630 */
631 template <
632 Backend BACKEND,
633 typename KeyT,
634 typename ValueT,
635 typename ReductionOpT>
Test(int num_items,ReductionOpT reduction_op,int max_segment)636 void Test(
637 int num_items,
638 ReductionOpT reduction_op,
639 int max_segment)
640 {
641 // 0 key-bit entropy reduction rounds
642 TestPointer<BACKEND, KeyT, ValueT>(num_items, 0, max_segment, reduction_op);
643
644 if (max_segment > 1)
645 {
646 // 2 key-bit entropy reduction rounds
647 TestPointer<BACKEND, KeyT, ValueT>(num_items, 2, max_segment, reduction_op);
648
649 // 7 key-bit entropy reduction rounds
650 TestPointer<BACKEND, KeyT, ValueT>(num_items, 7, max_segment, reduction_op);
651 }
652 }
653
654
655 /**
656 * Test different avg segment lengths modes
657 */
658 template <
659 Backend BACKEND,
660 typename KeyT,
661 typename ValueT,
662 typename ReductionOpT>
Test(int num_items,ReductionOpT reduction_op)663 void Test(
664 int num_items,
665 ReductionOpT reduction_op)
666 {
667 Test<BACKEND, KeyT, ValueT>(num_items, reduction_op, -1);
668 Test<BACKEND, KeyT, ValueT>(num_items, reduction_op, 1);
669
670 // Evaluate different max-segment lengths
671 for (int max_segment = 3; max_segment < CUB_MIN(num_items, (unsigned short) -1); max_segment *= 11)
672 {
673 Test<BACKEND, KeyT, ValueT>(num_items, reduction_op, max_segment);
674 }
675 }
676
677
678
679 /**
680 * Test different dispatch
681 */
682 template <
683 typename KeyT,
684 typename ValueT,
685 typename ReductionOpT>
TestDispatch(int num_items,ReductionOpT reduction_op)686 void TestDispatch(
687 int num_items,
688 ReductionOpT reduction_op)
689 {
690 Test<CUB, KeyT, ValueT>(num_items, reduction_op);
691 #ifdef CUB_CDP
692 Test<CDP, KeyT, ValueT>(num_items, reduction_op);
693 #endif
694 }
695
696
697 /**
698 * Test different input sizes
699 */
700 template <
701 typename KeyT,
702 typename ValueT,
703 typename ReductionOpT>
TestSize(int num_items,ReductionOpT reduction_op)704 void TestSize(
705 int num_items,
706 ReductionOpT reduction_op)
707 {
708 if (num_items < 0)
709 {
710 TestDispatch<KeyT, ValueT>(1, reduction_op);
711 TestDispatch<KeyT, ValueT>(100, reduction_op);
712 TestDispatch<KeyT, ValueT>(10000, reduction_op);
713 TestDispatch<KeyT, ValueT>(1000000, reduction_op);
714 }
715 else
716 {
717 TestDispatch<KeyT, ValueT>(num_items, reduction_op);
718 }
719
720 }
721
722
723 template <
724 typename KeyT,
725 typename ValueT>
TestOp(int num_items)726 void TestOp(
727 int num_items)
728 {
729 TestSize<KeyT, ValueT>(num_items, cub::Sum());
730 TestSize<KeyT, ValueT>(num_items, cub::Max());
731 }
732
733
734
735 //---------------------------------------------------------------------
736 // Main
737 //---------------------------------------------------------------------
738
739 /**
740 * Main
741 */
main(int argc,char ** argv)742 int main(int argc, char** argv)
743 {
744 int num_items = -1;
745 int entropy_reduction = 0;
746 int maxseg = 1000;
747
748 // Initialize command line
749 CommandLineArgs args(argc, argv);
750 g_verbose = args.CheckCmdLineFlag("v");
751 args.GetCmdLineArgument("n", num_items);
752 args.GetCmdLineArgument("i", g_timing_iterations);
753 args.GetCmdLineArgument("repeat", g_repeat);
754 args.GetCmdLineArgument("maxseg", maxseg);
755 args.GetCmdLineArgument("entropy", entropy_reduction);
756
757 // Print usage
758 if (args.CheckCmdLineFlag("help"))
759 {
760 printf("%s "
761 "[--n=<input items> "
762 "[--i=<timing iterations> "
763 "[--device=<device-id>] "
764 "[--maxseg=<max segment length>]"
765 "[--entropy=<segment length bit entropy reduction rounds>]"
766 "[--repeat=<repetitions of entire test suite>]"
767 "[--v] "
768 "[--cdp]"
769 "\n", argv[0]);
770 exit(0);
771 }
772
773 // Initialize device
774 CubDebugExit(args.DeviceInit());
775 printf("\n");
776
777 // Get ptx version
778 int ptx_version = 0;
779 CubDebugExit(PtxVersion(ptx_version));
780
781 #ifdef QUICKER_TEST
782
783 // Compile/run basic CUB test
784 if (num_items < 0) num_items = 32000000;
785
786 TestPointer<CUB, int, double>(num_items, entropy_reduction, maxseg, cub::Sum());
787 TestPointer<CUB, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
788 TestIterator<CUB, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
789
790 #elif defined(QUICK_TEST)
791
792 // Compile/run quick tests
793 if (num_items < 0) num_items = 32000000;
794
795 printf("---- RLE int ---- \n");
796 TestIterator<CUB, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
797
798 printf("---- RLE long long ---- \n");
799 TestIterator<CUB, long long, int>(num_items, entropy_reduction, maxseg, cub::Sum());
800
801 printf("---- int ---- \n");
802 TestPointer<CUB, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
803 TestPointer<THRUST, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
804
805 printf("---- float ---- \n");
806 TestPointer<CUB, int, float>(num_items, entropy_reduction, maxseg, cub::Sum());
807 TestPointer<THRUST, int, float>(num_items, entropy_reduction, maxseg, cub::Sum());
808
809 if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted
810 {
811 printf("---- double ---- \n");
812 TestPointer<CUB, int, double>(num_items, entropy_reduction, maxseg, cub::Sum());
813 TestPointer<THRUST, int, double>(num_items, entropy_reduction, maxseg, cub::Sum());
814 }
815
816 #else
817
818 // Compile/run thorough tests
819 for (int i = 0; i <= g_repeat; ++i)
820 {
821
822 // Test different input types
823 TestOp<int, char>(num_items);
824 TestOp<int, short>(num_items);
825 TestOp<int, int>(num_items);
826 TestOp<int, long>(num_items);
827 TestOp<int, long long>(num_items);
828 TestOp<int, float>(num_items);
829 if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted
830 TestOp<int, double>(num_items);
831
832 TestOp<int, uchar2>(num_items);
833 TestOp<int, uint2>(num_items);
834 TestOp<int, uint3>(num_items);
835 TestOp<int, uint4>(num_items);
836 TestOp<int, ulonglong4>(num_items);
837 TestOp<int, TestFoo>(num_items);
838 TestOp<int, TestBar>(num_items);
839
840 TestOp<char, int>(num_items);
841 TestOp<long long, int>(num_items);
842 TestOp<TestFoo, int>(num_items);
843 TestOp<TestBar, int>(num_items);
844
845 }
846
847 #endif
848
849 return 0;
850 }
851
852
853
854