1 /******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
29 /******************************************************************************
30 * Test of DeviceRadixSort utilities
31 ******************************************************************************/
32
33 // Ensure printing of CUDA runtime errors to console
34 #define CUB_STDERR
35
36 #include <stdio.h>
37 #include <algorithm>
38 #include <typeinfo>
39
40 #if (__CUDACC_VER_MAJOR__ >= 9)
41 #include <cuda_fp16.h>
42 #endif
43
44 #include <cub/util_allocator.cuh>
45 #include <cub/device/device_radix_sort.cuh>
46 #include <cub/device/device_segmented_radix_sort.cuh>
47
48 #include "test_util.h"
49
50 #include <thrust/device_ptr.h>
51 #include <thrust/sort.h>
52 #include <thrust/reverse.h>
53
54 using namespace cub;
55
56
57 //---------------------------------------------------------------------
58 // Globals, constants and typedefs
59 //---------------------------------------------------------------------
60
// Whether to display per-element input/output arrays for each test
bool g_verbose = false;
// Number of timed sort iterations to average for perf reporting (0 = correctness only)
int g_timing_iterations = 0;
// Test repetition count (consumed by driver code elsewhere in this file)
int g_repeat = 0;
// Caching device allocator: avoids cudaMalloc/cudaFree churn between tests
CachingDeviceAllocator g_allocator(true);

// Dispatch types: which sorting backend a test routes through
enum Backend
{
    CUB, // CUB method (allows overwriting of input)
    CUB_NO_OVERWRITE, // CUB method (disallows overwriting of input)

    CUB_SEGMENTED, // CUB segmented method (allows overwriting of input)
    CUB_SEGMENTED_NO_OVERWRITE, // CUB segmented method (disallows overwriting of input)

    THRUST, // Thrust method
    CDP, // GPU-based (dynamic parallelism) dispatch to CUB method
};
78
79
80 //---------------------------------------------------------------------
81 // Dispatch to different DeviceRadixSort entrypoints
82 //---------------------------------------------------------------------
83
84 /**
85 * Dispatch to CUB sorting entrypoint (specialized for ascending)
86 */
87 template <typename KeyT, typename ValueT>
88 CUB_RUNTIME_FUNCTION
89 __forceinline__
Dispatch(Int2Type<false>,Int2Type<CUB>,int *,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,DoubleBuffer<KeyT> & d_keys,DoubleBuffer<ValueT> & d_values,int num_items,int,const int *,int begin_bit,int end_bit,cudaStream_t stream,bool debug_synchronous)90 cudaError_t Dispatch(
91 Int2Type<false> /*is_descending*/,
92 Int2Type<CUB> /*dispatch_to*/,
93 int */*d_selector*/,
94 size_t */*d_temp_storage_bytes*/,
95 cudaError_t */*d_cdp_error*/,
96
97 void* d_temp_storage,
98 size_t& temp_storage_bytes,
99 DoubleBuffer<KeyT> &d_keys,
100 DoubleBuffer<ValueT> &d_values,
101 int num_items,
102 int /*num_segments*/,
103 const int */*d_segment_offsets*/,
104 int begin_bit,
105 int end_bit,
106 cudaStream_t stream,
107 bool debug_synchronous)
108 {
109 return DeviceRadixSort::SortPairs(
110 d_temp_storage, temp_storage_bytes,
111 d_keys, d_values,
112 num_items, begin_bit, end_bit, stream, debug_synchronous);
113 }
114
115 /**
116 * Dispatch to CUB_NO_OVERWRITE sorting entrypoint (specialized for ascending)
117 */
118 template <typename KeyT, typename ValueT>
119 CUB_RUNTIME_FUNCTION
120 __forceinline__
Dispatch(Int2Type<false>,Int2Type<CUB_NO_OVERWRITE>,int *,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,DoubleBuffer<KeyT> & d_keys,DoubleBuffer<ValueT> & d_values,int num_items,int,const int *,int begin_bit,int end_bit,cudaStream_t stream,bool debug_synchronous)121 cudaError_t Dispatch(
122 Int2Type<false> /*is_descending*/,
123 Int2Type<CUB_NO_OVERWRITE> /*dispatch_to*/,
124 int */*d_selector*/,
125 size_t */*d_temp_storage_bytes*/,
126 cudaError_t */*d_cdp_error*/,
127
128 void* d_temp_storage,
129 size_t& temp_storage_bytes,
130 DoubleBuffer<KeyT> &d_keys,
131 DoubleBuffer<ValueT> &d_values,
132 int num_items,
133 int /*num_segments*/,
134 const int */*d_segment_offsets*/,
135 int begin_bit,
136 int end_bit,
137 cudaStream_t stream,
138 bool debug_synchronous)
139 {
140 KeyT const *const_keys_itr = d_keys.Current();
141 ValueT const *const_values_itr = d_values.Current();
142
143 cudaError_t retval = DeviceRadixSort::SortPairs(
144 d_temp_storage, temp_storage_bytes,
145 const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(),
146 num_items, begin_bit, end_bit, stream, debug_synchronous);
147
148 d_keys.selector ^= 1;
149 d_values.selector ^= 1;
150 return retval;
151 }
152
153 /**
154 * Dispatch to CUB sorting entrypoint (specialized for descending)
155 */
156 template <typename KeyT, typename ValueT>
157 CUB_RUNTIME_FUNCTION
158 __forceinline__
Dispatch(Int2Type<true>,Int2Type<CUB>,int *,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,DoubleBuffer<KeyT> & d_keys,DoubleBuffer<ValueT> & d_values,int num_items,int,const int *,int begin_bit,int end_bit,cudaStream_t stream,bool debug_synchronous)159 cudaError_t Dispatch(
160 Int2Type<true> /*is_descending*/,
161 Int2Type<CUB> /*dispatch_to*/,
162 int */*d_selector*/,
163 size_t */*d_temp_storage_bytes*/,
164 cudaError_t */*d_cdp_error*/,
165
166 void* d_temp_storage,
167 size_t& temp_storage_bytes,
168 DoubleBuffer<KeyT> &d_keys,
169 DoubleBuffer<ValueT> &d_values,
170 int num_items,
171 int /*num_segments*/,
172 const int */*d_segment_offsets*/,
173 int begin_bit,
174 int end_bit,
175 cudaStream_t stream,
176 bool debug_synchronous)
177 {
178 return DeviceRadixSort::SortPairsDescending(
179 d_temp_storage, temp_storage_bytes,
180 d_keys, d_values,
181 num_items, begin_bit, end_bit, stream, debug_synchronous);
182 }
183
184
185 /**
186 * Dispatch to CUB_NO_OVERWRITE sorting entrypoint (specialized for descending)
187 */
188 template <typename KeyT, typename ValueT>
189 CUB_RUNTIME_FUNCTION
190 __forceinline__
Dispatch(Int2Type<true>,Int2Type<CUB_NO_OVERWRITE>,int *,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,DoubleBuffer<KeyT> & d_keys,DoubleBuffer<ValueT> & d_values,int num_items,int,const int *,int begin_bit,int end_bit,cudaStream_t stream,bool debug_synchronous)191 cudaError_t Dispatch(
192 Int2Type<true> /*is_descending*/,
193 Int2Type<CUB_NO_OVERWRITE> /*dispatch_to*/,
194 int */*d_selector*/,
195 size_t */*d_temp_storage_bytes*/,
196 cudaError_t */*d_cdp_error*/,
197
198 void* d_temp_storage,
199 size_t& temp_storage_bytes,
200 DoubleBuffer<KeyT> &d_keys,
201 DoubleBuffer<ValueT> &d_values,
202 int num_items,
203 int /*num_segments*/,
204 const int */*d_segment_offsets*/,
205 int begin_bit,
206 int end_bit,
207 cudaStream_t stream,
208 bool debug_synchronous)
209 {
210 KeyT const *const_keys_itr = d_keys.Current();
211 ValueT const *const_values_itr = d_values.Current();
212
213 cudaError_t retval = DeviceRadixSort::SortPairsDescending(
214 d_temp_storage, temp_storage_bytes,
215 const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(),
216 num_items, begin_bit, end_bit, stream, debug_synchronous);
217
218 d_keys.selector ^= 1;
219 d_values.selector ^= 1;
220 return retval;
221 }
222
223 //---------------------------------------------------------------------
224 // Dispatch to different DeviceRadixSort entrypoints
225 //---------------------------------------------------------------------
226
227 /**
228 * Dispatch to CUB_SEGMENTED sorting entrypoint (specialized for ascending)
229 */
230 template <typename KeyT, typename ValueT>
231 CUB_RUNTIME_FUNCTION
232 __forceinline__
Dispatch(Int2Type<false>,Int2Type<CUB_SEGMENTED>,int *,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,DoubleBuffer<KeyT> & d_keys,DoubleBuffer<ValueT> & d_values,int num_items,int num_segments,const int * d_segment_offsets,int begin_bit,int end_bit,cudaStream_t stream,bool debug_synchronous)233 cudaError_t Dispatch(
234 Int2Type<false> /*is_descending*/,
235 Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
236 int */*d_selector*/,
237 size_t */*d_temp_storage_bytes*/,
238 cudaError_t */*d_cdp_error*/,
239
240 void* d_temp_storage,
241 size_t& temp_storage_bytes,
242 DoubleBuffer<KeyT> &d_keys,
243 DoubleBuffer<ValueT> &d_values,
244 int num_items,
245 int num_segments,
246 const int *d_segment_offsets,
247 int begin_bit,
248 int end_bit,
249 cudaStream_t stream,
250 bool debug_synchronous)
251 {
252 return DeviceSegmentedRadixSort::SortPairs(
253 d_temp_storage, temp_storage_bytes,
254 d_keys, d_values,
255 num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
256 begin_bit, end_bit, stream, debug_synchronous);
257 }
258
259 /**
260 * Dispatch to CUB_SEGMENTED_NO_OVERWRITE sorting entrypoint (specialized for ascending)
261 */
262 template <typename KeyT, typename ValueT>
263 CUB_RUNTIME_FUNCTION
264 __forceinline__
Dispatch(Int2Type<false>,Int2Type<CUB_SEGMENTED_NO_OVERWRITE>,int *,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,DoubleBuffer<KeyT> & d_keys,DoubleBuffer<ValueT> & d_values,int num_items,int num_segments,const int * d_segment_offsets,int begin_bit,int end_bit,cudaStream_t stream,bool debug_synchronous)265 cudaError_t Dispatch(
266 Int2Type<false> /*is_descending*/,
267 Int2Type<CUB_SEGMENTED_NO_OVERWRITE> /*dispatch_to*/,
268 int */*d_selector*/,
269 size_t */*d_temp_storage_bytes*/,
270 cudaError_t */*d_cdp_error*/,
271
272 void* d_temp_storage,
273 size_t& temp_storage_bytes,
274 DoubleBuffer<KeyT> &d_keys,
275 DoubleBuffer<ValueT> &d_values,
276 int num_items,
277 int num_segments,
278 const int *d_segment_offsets,
279 int begin_bit,
280 int end_bit,
281 cudaStream_t stream,
282 bool debug_synchronous)
283 {
284 KeyT const *const_keys_itr = d_keys.Current();
285 ValueT const *const_values_itr = d_values.Current();
286
287 cudaError_t retval = DeviceSegmentedRadixSort::SortPairs(
288 d_temp_storage, temp_storage_bytes,
289 const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(),
290 num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
291 begin_bit, end_bit, stream, debug_synchronous);
292
293 d_keys.selector ^= 1;
294 d_values.selector ^= 1;
295 return retval;
296 }
297
298
299 /**
300 * Dispatch to CUB_SEGMENTED sorting entrypoint (specialized for descending)
301 */
302 template <typename KeyT, typename ValueT>
303 CUB_RUNTIME_FUNCTION
304 __forceinline__
Dispatch(Int2Type<true>,Int2Type<CUB_SEGMENTED>,int *,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,DoubleBuffer<KeyT> & d_keys,DoubleBuffer<ValueT> & d_values,int num_items,int num_segments,const int * d_segment_offsets,int begin_bit,int end_bit,cudaStream_t stream,bool debug_synchronous)305 cudaError_t Dispatch(
306 Int2Type<true> /*is_descending*/,
307 Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
308 int */*d_selector*/,
309 size_t */*d_temp_storage_bytes*/,
310 cudaError_t */*d_cdp_error*/,
311
312 void* d_temp_storage,
313 size_t& temp_storage_bytes,
314 DoubleBuffer<KeyT> &d_keys,
315 DoubleBuffer<ValueT> &d_values,
316 int num_items,
317 int num_segments,
318 const int *d_segment_offsets,
319 int begin_bit,
320 int end_bit,
321 cudaStream_t stream,
322 bool debug_synchronous)
323 {
324 return DeviceSegmentedRadixSort::SortPairsDescending(
325 d_temp_storage, temp_storage_bytes,
326 d_keys, d_values,
327 num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
328 begin_bit, end_bit, stream, debug_synchronous);
329 }
330
331 /**
332 * Dispatch to CUB_SEGMENTED_NO_OVERWRITE sorting entrypoint (specialized for descending)
333 */
334 template <typename KeyT, typename ValueT>
335 CUB_RUNTIME_FUNCTION
336 __forceinline__
Dispatch(Int2Type<true>,Int2Type<CUB_SEGMENTED_NO_OVERWRITE>,int *,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,DoubleBuffer<KeyT> & d_keys,DoubleBuffer<ValueT> & d_values,int num_items,int num_segments,const int * d_segment_offsets,int begin_bit,int end_bit,cudaStream_t stream,bool debug_synchronous)337 cudaError_t Dispatch(
338 Int2Type<true> /*is_descending*/,
339 Int2Type<CUB_SEGMENTED_NO_OVERWRITE> /*dispatch_to*/,
340 int */*d_selector*/,
341 size_t */*d_temp_storage_bytes*/,
342 cudaError_t */*d_cdp_error*/,
343
344 void* d_temp_storage,
345 size_t& temp_storage_bytes,
346 DoubleBuffer<KeyT> &d_keys,
347 DoubleBuffer<ValueT> &d_values,
348 int num_items,
349 int num_segments,
350 const int *d_segment_offsets,
351 int begin_bit,
352 int end_bit,
353 cudaStream_t stream,
354 bool debug_synchronous)
355 {
356 KeyT const *const_keys_itr = d_keys.Current();
357 ValueT const *const_values_itr = d_values.Current();
358
359 cudaError_t retval = DeviceSegmentedRadixSort::SortPairsDescending(
360 d_temp_storage, temp_storage_bytes,
361 const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(),
362 num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
363 begin_bit, end_bit, stream, debug_synchronous);
364
365 d_keys.selector ^= 1;
366 d_values.selector ^= 1;
367 return retval;
368 }
369
370
371 //---------------------------------------------------------------------
372 // Dispatch to different Thrust entrypoints
373 //---------------------------------------------------------------------
374
375 /**
376 * Dispatch keys-only to Thrust sorting entrypoint
377 */
378 template <int IS_DESCENDING, typename KeyT>
Dispatch(Int2Type<IS_DESCENDING>,Int2Type<THRUST>,int *,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,DoubleBuffer<KeyT> & d_keys,DoubleBuffer<NullType> &,int num_items,int,const int *,int,int,cudaStream_t,bool)379 cudaError_t Dispatch(
380 Int2Type<IS_DESCENDING> /*is_descending*/,
381 Int2Type<THRUST> /*dispatch_to*/,
382 int */*d_selector*/,
383 size_t */*d_temp_storage_bytes*/,
384 cudaError_t */*d_cdp_error*/,
385
386 void *d_temp_storage,
387 size_t &temp_storage_bytes,
388 DoubleBuffer<KeyT> &d_keys,
389 DoubleBuffer<NullType> &/*d_values*/,
390 int num_items,
391 int /*num_segments*/,
392 const int */*d_segment_offsets*/,
393 int /*begin_bit*/,
394 int /*end_bit*/,
395 cudaStream_t /*stream*/,
396 bool /*debug_synchronous*/)
397 {
398
399 if (d_temp_storage == 0)
400 {
401 temp_storage_bytes = 1;
402 }
403 else
404 {
405 thrust::device_ptr<KeyT> d_keys_wrapper(d_keys.Current());
406
407 if (IS_DESCENDING) thrust::reverse(d_keys_wrapper, d_keys_wrapper + num_items);
408 thrust::sort(d_keys_wrapper, d_keys_wrapper + num_items);
409 if (IS_DESCENDING) thrust::reverse(d_keys_wrapper, d_keys_wrapper + num_items);
410 }
411
412 return cudaSuccess;
413 }
414
415
416 /**
417 * Dispatch key-value pairs to Thrust sorting entrypoint
418 */
419 template <int IS_DESCENDING, typename KeyT, typename ValueT>
Dispatch(Int2Type<IS_DESCENDING>,Int2Type<THRUST>,int *,size_t *,cudaError_t *,void * d_temp_storage,size_t & temp_storage_bytes,DoubleBuffer<KeyT> & d_keys,DoubleBuffer<ValueT> & d_values,int num_items,int,const int *,int,int,cudaStream_t,bool)420 cudaError_t Dispatch(
421 Int2Type<IS_DESCENDING> /*is_descending*/,
422 Int2Type<THRUST> /*dispatch_to*/,
423 int */*d_selector*/,
424 size_t */*d_temp_storage_bytes*/,
425 cudaError_t */*d_cdp_error*/,
426
427 void *d_temp_storage,
428 size_t &temp_storage_bytes,
429 DoubleBuffer<KeyT> &d_keys,
430 DoubleBuffer<ValueT> &d_values,
431 int num_items,
432 int /*num_segments*/,
433 const int */*d_segment_offsets*/,
434 int /*begin_bit*/,
435 int /*end_bit*/,
436 cudaStream_t /*stream*/,
437 bool /*debug_synchronous*/)
438 {
439
440 if (d_temp_storage == 0)
441 {
442 temp_storage_bytes = 1;
443 }
444 else
445 {
446 thrust::device_ptr<KeyT> d_keys_wrapper(d_keys.Current());
447 thrust::device_ptr<ValueT> d_values_wrapper(d_values.Current());
448
449 if (IS_DESCENDING) {
450 thrust::reverse(d_keys_wrapper, d_keys_wrapper + num_items);
451 thrust::reverse(d_values_wrapper, d_values_wrapper + num_items);
452 }
453
454 thrust::sort_by_key(d_keys_wrapper, d_keys_wrapper + num_items, d_values_wrapper);
455
456 if (IS_DESCENDING) {
457 thrust::reverse(d_keys_wrapper, d_keys_wrapper + num_items);
458 thrust::reverse(d_values_wrapper, d_values_wrapper + num_items);
459 }
460 }
461
462 return cudaSuccess;
463 }
464
465
466 //---------------------------------------------------------------------
467 // CUDA Nested Parallelism Test Kernel
468 //---------------------------------------------------------------------
469
470 /**
471 * Simple wrapper kernel to invoke DeviceRadixSort
472 */
473 template <int IS_DESCENDING, typename KeyT, typename ValueT>
CnpDispatchKernel(Int2Type<IS_DESCENDING> is_descending,int * d_selector,size_t * d_temp_storage_bytes,cudaError_t * d_cdp_error,void * d_temp_storage,size_t temp_storage_bytes,DoubleBuffer<KeyT> d_keys,DoubleBuffer<ValueT> d_values,int num_items,int num_segments,const int * d_segment_offsets,int begin_bit,int end_bit,bool debug_synchronous)474 __global__ void CnpDispatchKernel(
475 Int2Type<IS_DESCENDING> is_descending,
476 int *d_selector,
477 size_t *d_temp_storage_bytes,
478 cudaError_t *d_cdp_error,
479
480 void *d_temp_storage,
481 size_t temp_storage_bytes,
482 DoubleBuffer<KeyT> d_keys,
483 DoubleBuffer<ValueT> d_values,
484 int num_items,
485 int num_segments,
486 const int *d_segment_offsets,
487 int begin_bit,
488 int end_bit,
489 bool debug_synchronous)
490 {
491 #ifndef CUB_CDP
492 (void)is_descending;
493 (void)d_selector;
494 (void)d_temp_storage_bytes;
495 (void)d_cdp_error;
496 (void)d_temp_storage;
497 (void)temp_storage_bytes;
498 (void)d_keys;
499 (void)d_values;
500 (void)num_items;
501 (void)num_segments;
502 (void)d_segment_offsets;
503 (void)begin_bit;
504 (void)end_bit;
505 (void)debug_synchronous;
506 *d_cdp_error = cudaErrorNotSupported;
507 #else
508 *d_cdp_error = Dispatch(
509 is_descending, Int2Type<CUB>(), d_selector, d_temp_storage_bytes, d_cdp_error,
510 d_temp_storage, temp_storage_bytes, d_keys, d_values,
511 num_items, num_segments, d_segment_offsets,
512 begin_bit, end_bit, 0, debug_synchronous);
513 *d_temp_storage_bytes = temp_storage_bytes;
514 *d_selector = d_keys.selector;
515 #endif
516 }
517
518
519 /**
520 * Dispatch to CDP kernel
521 */
522 template <int IS_DESCENDING, typename KeyT, typename ValueT>
Dispatch(Int2Type<IS_DESCENDING> is_descending,Int2Type<CDP> dispatch_to,int * d_selector,size_t * d_temp_storage_bytes,cudaError_t * d_cdp_error,void * d_temp_storage,size_t & temp_storage_bytes,DoubleBuffer<KeyT> & d_keys,DoubleBuffer<ValueT> & d_values,int num_items,int num_segments,const int * d_segment_offsets,int begin_bit,int end_bit,cudaStream_t stream,bool debug_synchronous)523 cudaError_t Dispatch(
524 Int2Type<IS_DESCENDING> is_descending,
525 Int2Type<CDP> dispatch_to,
526 int *d_selector,
527 size_t *d_temp_storage_bytes,
528 cudaError_t *d_cdp_error,
529
530 void *d_temp_storage,
531 size_t &temp_storage_bytes,
532 DoubleBuffer<KeyT> &d_keys,
533 DoubleBuffer<ValueT> &d_values,
534 int num_items,
535 int num_segments,
536 const int *d_segment_offsets,
537 int begin_bit,
538 int end_bit,
539 cudaStream_t stream,
540 bool debug_synchronous)
541 {
542 // Invoke kernel to invoke device-side dispatch
543 CnpDispatchKernel<<<1,1>>>(
544 is_descending, d_selector, d_temp_storage_bytes, d_cdp_error,
545 d_temp_storage, temp_storage_bytes, d_keys, d_values,
546 num_items, num_segments, d_segment_offsets,
547 begin_bit, end_bit, debug_synchronous);
548
549 // Copy out selector
550 CubDebugExit(cudaMemcpy(&d_keys.selector, d_selector, sizeof(int) * 1, cudaMemcpyDeviceToHost));
551 d_values.selector = d_keys.selector;
552
553 // Copy out temp_storage_bytes
554 CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
555
556 // Copy out error
557 cudaError_t retval;
558 CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
559 return retval;
560 }
561
562
563
564 //---------------------------------------------------------------------
565 // Problem generation
566 //---------------------------------------------------------------------
567
568
569 /**
570 * Simple key-value pairing
571 */
572 template <
573 typename KeyT,
574 typename ValueT,
575 bool IS_FLOAT = (Traits<KeyT>::CATEGORY == FLOATING_POINT)>
576 struct Pair
577 {
578 KeyT key;
579 ValueT value;
580
operator <Pair581 bool operator<(const Pair &b) const
582 {
583 return (key < b.key);
584 }
585 };
586
587
588 /**
589 * Simple key-value pairing (specialized for bool types)
590 */
591 template <typename ValueT>
592 struct Pair<bool, ValueT, false>
593 {
594 bool key;
595 ValueT value;
596
operator <Pair597 bool operator<(const Pair &b) const
598 {
599 return (!key && b.key);
600 }
601 };
602
603
604 /**
605 * Simple key-value pairing (specialized for floating point types)
606 */
607 template <typename KeyT, typename ValueT>
608 struct Pair<KeyT, ValueT, true>
609 {
610 KeyT key;
611 ValueT value;
612
operator <Pair613 bool operator<(const Pair &b) const
614 {
615 if (key < b.key)
616 return true;
617
618 if (key > b.key)
619 return false;
620
621 // KeyT in unsigned bits
622 typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
623
624 // Return true if key is negative zero and b.key is positive zero
625 UnsignedBits key_bits = SafeBitCast<UnsignedBits>(key);
626 UnsignedBits b_key_bits = SafeBitCast<UnsignedBits>(b.key);
627 UnsignedBits HIGH_BIT = Traits<KeyT>::HIGH_BIT;
628
629 return ((key_bits & HIGH_BIT) != 0) && ((b_key_bits & HIGH_BIT) == 0);
630 }
631 };
632
633
634 /**
635 * Initialize key data
636 */
637 template <typename KeyT>
InitializeKeyBits(GenMode gen_mode,KeyT * h_keys,int num_items,int)638 void InitializeKeyBits(
639 GenMode gen_mode,
640 KeyT *h_keys,
641 int num_items,
642 int /*entropy_reduction*/)
643 {
644 for (int i = 0; i < num_items; ++i)
645 InitValue(gen_mode, h_keys[i], i);
646 }
647
648
649 /**
650 * Initialize solution
651 */
652 template <bool IS_DESCENDING, typename KeyT>
InitializeSolution(KeyT * h_keys,int num_items,int num_segments,int * h_segment_offsets,int begin_bit,int end_bit,int * & h_reference_ranks,KeyT * & h_reference_keys)653 void InitializeSolution(
654 KeyT *h_keys,
655 int num_items,
656 int num_segments,
657 int *h_segment_offsets,
658 int begin_bit,
659 int end_bit,
660 int *&h_reference_ranks,
661 KeyT *&h_reference_keys)
662 {
663 typedef Pair<KeyT, int> PairT;
664
665 PairT *h_pairs = new PairT[num_items];
666
667 int num_bits = end_bit - begin_bit;
668 for (int i = 0; i < num_items; ++i)
669 {
670
671 // Mask off unwanted portions
672 if (num_bits < static_cast<int>(sizeof(KeyT) * 8))
673 {
674 unsigned long long base = 0;
675 memcpy(&base, &h_keys[i], sizeof(KeyT));
676 base &= ((1ull << num_bits) - 1) << begin_bit;
677 memcpy(&h_pairs[i].key, &base, sizeof(KeyT));
678 }
679 else
680 {
681 h_pairs[i].key = h_keys[i];
682 }
683
684 h_pairs[i].value = i;
685 }
686
687 printf("\nSorting reference solution on CPU (%d segments)...", num_segments); fflush(stdout);
688
689 for (int i = 0; i < num_segments; ++i)
690 {
691 if (IS_DESCENDING) std::reverse(h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
692 std::stable_sort( h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
693 if (IS_DESCENDING) std::reverse(h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
694 }
695
696 printf(" Done.\n"); fflush(stdout);
697
698 h_reference_ranks = new int[num_items];
699 h_reference_keys = new KeyT[num_items];
700
701 for (int i = 0; i < num_items; ++i)
702 {
703 h_reference_ranks[i] = h_pairs[i].value;
704 h_reference_keys[i] = h_keys[h_pairs[i].value];
705 }
706
707 if (h_pairs) delete[] h_pairs;
708 }
709
710
711 //---------------------------------------------------------------------
712 // Test generation
713 //---------------------------------------------------------------------
714
715
716 /**
717 * Test DeviceRadixSort
718 */
719 template <
720 Backend BACKEND,
721 bool IS_DESCENDING,
722 typename KeyT,
723 typename ValueT>
Test(KeyT * h_keys,ValueT * h_values,int num_items,int num_segments,int * h_segment_offsets,int begin_bit,int end_bit,KeyT * h_reference_keys,ValueT * h_reference_values)724 void Test(
725 KeyT *h_keys,
726 ValueT *h_values,
727 int num_items,
728 int num_segments,
729 int *h_segment_offsets,
730 int begin_bit,
731 int end_bit,
732 KeyT *h_reference_keys,
733 ValueT *h_reference_values)
734 {
735 // Key alias type
736 #if (__CUDACC_VER_MAJOR__ >= 9)
737 typedef typename If<Equals<KeyT, half_t>::VALUE, __half, KeyT>::Type KeyAliasT;
738 #else
739 typedef KeyT KeyAliasT;
740 #endif
741
742 const bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;
743
744 printf("%s %s cub::DeviceRadixSort %d items, %d segments, %d-byte keys (%s) %d-byte values (%s), descending %d, begin_bit %d, end_bit %d\n",
745 (BACKEND == CUB_NO_OVERWRITE) ? "CUB_NO_OVERWRITE" : (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
746 (KEYS_ONLY) ? "keys-only" : "key-value",
747 num_items, num_segments,
748 (int) sizeof(KeyT), typeid(KeyT).name(), (KEYS_ONLY) ? 0 : (int) sizeof(ValueT), typeid(ValueT).name(),
749 IS_DESCENDING, begin_bit, end_bit);
750 fflush(stdout);
751
752 if (g_verbose)
753 {
754 printf("Input keys:\n");
755 DisplayResults(h_keys, num_items);
756 printf("\n\n");
757 }
758
759 // Allocate device arrays
760 DoubleBuffer<KeyAliasT> d_keys;
761 DoubleBuffer<ValueT> d_values;
762 int *d_selector;
763 int *d_segment_offsets;
764 size_t *d_temp_storage_bytes;
765 cudaError_t *d_cdp_error;
766 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(KeyT) * num_items));
767 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(KeyT) * num_items));
768 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_selector, sizeof(int) * 1));
769 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(int) * (num_segments + 1)));
770 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
771 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));
772 if (!KEYS_ONLY)
773 {
774 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(ValueT) * num_items));
775 CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(ValueT) * num_items));
776 }
777
778 // Allocate temporary storage (and make it un-aligned)
779 size_t temp_storage_bytes = 0;
780 void *d_temp_storage = NULL;
781 CubDebugExit(Dispatch(
782 Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
783 d_temp_storage, temp_storage_bytes, d_keys, d_values,
784 num_items, num_segments, d_segment_offsets,
785 begin_bit, end_bit, 0, true));
786
787 CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + 1));
788 void* mis_aligned_temp = static_cast<char*>(d_temp_storage) + 1;
789
790 // Initialize/clear device arrays
791 d_keys.selector = 0;
792 CubDebugExit(cudaMemcpy(d_keys.d_buffers[0], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
793 CubDebugExit(cudaMemset(d_keys.d_buffers[1], 0, sizeof(KeyT) * num_items));
794 if (!KEYS_ONLY)
795 {
796 d_values.selector = 0;
797 CubDebugExit(cudaMemcpy(d_values.d_buffers[0], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
798 CubDebugExit(cudaMemset(d_values.d_buffers[1], 0, sizeof(ValueT) * num_items));
799 }
800 CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(int) * (num_segments + 1), cudaMemcpyHostToDevice));
801
802 // Run warmup/correctness iteration
803 CubDebugExit(Dispatch(
804 Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
805 mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
806 num_items, num_segments, d_segment_offsets,
807 begin_bit, end_bit, 0, true));
808
809 // Flush any stdout/stderr
810 fflush(stdout);
811 fflush(stderr);
812
813 // Check for correctness (and display results, if specified)
814 printf("Warmup done. Checking results:\n"); fflush(stdout);
815 int compare = CompareDeviceResults(h_reference_keys, reinterpret_cast<KeyT*>(d_keys.Current()), num_items, true, g_verbose);
816 printf("\t Compare keys (selector %d): %s ", d_keys.selector, compare ? "FAIL" : "PASS"); fflush(stdout);
817 if (!KEYS_ONLY)
818 {
819 int values_compare = CompareDeviceResults(h_reference_values, d_values.Current(), num_items, true, g_verbose);
820 compare |= values_compare;
821 printf("\t Compare values (selector %d): %s ", d_values.selector, values_compare ? "FAIL" : "PASS"); fflush(stdout);
822 }
823 if (BACKEND == CUB_NO_OVERWRITE)
824 {
825 // Check that input isn't overwritten
826 int input_compare = CompareDeviceResults(h_keys, reinterpret_cast<KeyT*>(d_keys.d_buffers[0]), num_items, true, g_verbose);
827 compare |= input_compare;
828 printf("\t Compare input keys: %s ", input_compare ? "FAIL" : "PASS"); fflush(stdout);
829 }
830
831 // Performance
832 if (g_timing_iterations)
833 printf("\nPerforming timing iterations:\n"); fflush(stdout);
834
835 GpuTimer gpu_timer;
836 float elapsed_millis = 0.0f;
837 for (int i = 0; i < g_timing_iterations; ++i)
838 {
839 // Initialize/clear device arrays
840 CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
841 CubDebugExit(cudaMemset(d_keys.d_buffers[d_keys.selector ^ 1], 0, sizeof(KeyT) * num_items));
842 if (!KEYS_ONLY)
843 {
844 CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
845 CubDebugExit(cudaMemset(d_values.d_buffers[d_values.selector ^ 1], 0, sizeof(ValueT) * num_items));
846 }
847
848 gpu_timer.Start();
849 CubDebugExit(Dispatch(
850 Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
851 mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
852 num_items, num_segments, d_segment_offsets,
853 begin_bit, end_bit, 0, false));
854 gpu_timer.Stop();
855 elapsed_millis += gpu_timer.ElapsedMillis();
856 }
857
858 // Display performance
859 if (g_timing_iterations > 0)
860 {
861 float avg_millis = elapsed_millis / g_timing_iterations;
862 float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
863 float giga_bandwidth = (KEYS_ONLY) ?
864 giga_rate * sizeof(KeyT) * 2 :
865 giga_rate * (sizeof(KeyT) + sizeof(ValueT)) * 2;
866 printf("\n%.3f elapsed ms, %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", elapsed_millis, avg_millis, giga_rate, giga_bandwidth);
867 }
868
869 printf("\n\n");
870
871 // Cleanup
872 if (d_keys.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0]));
873 if (d_keys.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1]));
874 if (d_values.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[0]));
875 if (d_values.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[1]));
876 if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
877 if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
878 if (d_selector) CubDebugExit(g_allocator.DeviceFree(d_selector));
879 if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
880 if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
881
882 // Correctness asserts
883 AssertEquals(0, compare);
884 }
885
886
887 /**
888 * Test backend
889 */
template <bool IS_DESCENDING, typename KeyT, typename ValueT>
void TestBackend(
    KeyT            *h_keys,
    int             num_items,
    int             num_segments,
    int             *h_segment_offsets,
    int             begin_bit,
    int             end_bit,
    KeyT            *h_reference_keys,
    int             *h_reference_ranks)
{
    // ValueT == NullType selects the keys-only code paths
    const bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;

    // Host-side value arrays (only materialized for key-value sorting)
    ValueT *sort_values     = NULL;
    ValueT *expected_values = NULL;

    if (!KEYS_ONLY)
    {
        sort_values     = new ValueT[num_items];
        expected_values = new ValueT[num_items];

        // Input values are seeded from their index; expected output values are
        // seeded from the reference ranks computed by the host-side solution
        for (int item = 0; item < num_items; ++item)
        {
            InitValue(INTEGER_SEED, sort_values[item], item);
            InitValue(INTEGER_SEED, expected_values[item], h_reference_ranks[item]);
        }
    }

#ifdef SEGMENTED_SORT
    // Test multi-segment implementations
    Test<CUB_SEGMENTED, IS_DESCENDING>(             h_keys, sort_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, expected_values);
    Test<CUB_SEGMENTED_NO_OVERWRITE, IS_DESCENDING>(h_keys, sort_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, expected_values);
#else   // SEGMENTED_SORT
    if (num_segments == 1)
    {
        // Test single-segment implementations
        Test<CUB, IS_DESCENDING>(             h_keys, sort_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, expected_values);
        Test<CUB_NO_OVERWRITE, IS_DESCENDING>(h_keys, sort_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, expected_values);
#ifdef CUB_CDP
        Test<CDP, IS_DESCENDING>(             h_keys, sort_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, expected_values);
#endif
    }
#endif  // SEGMENTED_SORT

    if (sort_values)     delete[] sort_values;
    if (expected_values) delete[] expected_values;
}
937
938
939
940
941 /**
942 * Test value type
943 */
template <bool IS_DESCENDING, typename KeyT>
void TestValueTypes(
    KeyT            *h_keys,
    int             num_items,
    int             num_segments,
    int             *h_segment_offsets,
    int             begin_bit,
    int             end_bit)
{
    // Compute the host-side reference solution (sorted keys + rank mapping)
    int  *ref_ranks = NULL;
    KeyT *ref_keys  = NULL;
    InitializeSolution<IS_DESCENDING>(h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, ref_ranks, ref_keys);

    // Keys-only
    TestBackend<IS_DESCENDING, KeyT, NullType>(          h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, ref_keys, ref_ranks);

    // 8b values
    TestBackend<IS_DESCENDING, KeyT, unsigned char>(     h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, ref_keys, ref_ranks);

    // 32b values
    TestBackend<IS_DESCENDING, KeyT, unsigned int>(      h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, ref_keys, ref_ranks);

    // 64b values
    TestBackend<IS_DESCENDING, KeyT, unsigned long long>(h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, ref_keys, ref_ranks);

    // Non-trivially-constructable values
    TestBackend<IS_DESCENDING, KeyT, TestBar>(           h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, ref_keys, ref_ranks);

    // Cleanup
    if (ref_ranks) delete[] ref_ranks;
    if (ref_keys)  delete[] ref_keys;
}
978
979
980
981 /**
982 * Test ascending/descending
983 */
template <typename KeyT>
void TestDirection(
    KeyT            *h_keys,
    int             num_items,
    int             num_segments,
    int             *h_segment_offsets,
    int             begin_bit,
    int             end_bit)
{
    // Exercise both sort orders over the same inputs:
    // descending first, then ascending
    TestValueTypes<true>( h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit);
    TestValueTypes<false>(h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit);
}
996
997
998 /**
999 * Test different bit ranges
1000 */
template <typename KeyT>
void TestBits(
    KeyT            *h_keys,
    int             num_items,
    int             num_segments,
    int             *h_segment_offsets)
{
    // Don't test partial-word sorting for boolean, fp, or signed types
    // (the bit-flipping techniques get in the way)
    if ((Traits<KeyT>::CATEGORY == UNSIGNED_INTEGER) && (!Equals<KeyT, bool>::VALUE))
    {
        // All key bits except the first and last
        int lo_bit = 1;
        int hi_bit = (sizeof(KeyT) * 8) - 1;
        printf("Testing key bits [%d,%d)\n", lo_bit, hi_bit); fflush(stdout);
        TestDirection(h_keys, num_items, num_segments, h_segment_offsets, lo_bit, hi_bit);

        // A two-bit window straddling the mid-word subword boundary
        int mid_bit = sizeof(KeyT) * 4;
        printf("Testing key bits [%d,%d)\n", mid_bit - 1, mid_bit + 1); fflush(stdout);
        TestDirection(h_keys, num_items, num_segments, h_segment_offsets, mid_bit - 1, mid_bit + 1);
    }

    // Full key width is always tested
    printf("Testing key bits [%d,%d)\n", 0, int(sizeof(KeyT)) * 8); fflush(stdout);
    TestDirection(h_keys, num_items, num_segments, h_segment_offsets, 0, sizeof(KeyT) * 8);
}
1026
1027
1028 /**
1029 * Test different segment compositions
1030 */
template <typename KeyT>
void TestSegments(
    KeyT            *h_keys,
    int             num_items,
    int             max_segments)
{
    // One extra slot: offsets array stores num_segments + 1 boundaries
    int *h_segment_offsets = new int[max_segments + 1];

#ifdef SEGMENTED_SORT
    // Sweep segment counts downward from max_segments, shrinking by ~32x each step
    for (int num_segments = max_segments; num_segments > 1; num_segments = (num_segments + 32 - 1) / 32)
    {
        // Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment
        if (num_items / num_segments < 128 * 1000)
        {
            InitializeSegments(num_items, num_segments, h_segment_offsets);
            TestBits(h_keys, num_items, num_segments, h_segment_offsets);
        }
    }
#else
    // Test single segment
    // Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment
    if (num_items < 128 * 1000)
    {
        InitializeSegments(num_items, 1, h_segment_offsets);
        TestBits(h_keys, num_items, 1, h_segment_offsets);
    }
#endif

    delete[] h_segment_offsets;
}
1058
1059
1060 /**
1061 * Test different (sub)lengths and number of segments
1062 */
template <typename KeyT>
void TestSizes(
    KeyT            *h_keys,
    int             max_items,
    int             max_segments)
{
    // Sweep problem sizes downward from max_items (shrinking by ~32x each step),
    // then explicitly cover the one-item and empty cases
    int num_items = max_items;
    while (num_items > 1)
    {
        TestSegments(h_keys, num_items, max_segments);
        num_items = (num_items + 32 - 1) / 32;
    }
    TestSegments(h_keys, 1, max_segments);
    TestSegments(h_keys, 0, max_segments);
}
1076
1077
1078 /**
1079 * Test key sampling distributions
1080 */
template <typename KeyT>
void TestGen(
    int             max_items,
    int             max_segments)
{
    // Negative sizes select the default (large) problem dimensions
    if (max_items < 0)
        max_items = 9000003;

    if (max_segments < 0)
        max_segments = 5003;

    KeyT *h_keys = new KeyT[max_items];

    // Random keys at several entropy-reduction levels (0, 3, 6)
    for (int entropy_reduction = 0; entropy_reduction <= 6; entropy_reduction += 3)
    {
        printf("\nTesting random %s keys with entropy reduction factor %d\n", typeid(KeyT).name(), entropy_reduction); fflush(stdout);
        InitializeKeyBits(RANDOM, h_keys, max_items, entropy_reduction);
        TestSizes(h_keys, max_items, max_segments);
    }

    // Uniform keys
    printf("\nTesting uniform %s keys\n", typeid(KeyT).name()); fflush(stdout);
    InitializeKeyBits(UNIFORM, h_keys, max_items, 0);
    TestSizes(h_keys, max_items, max_segments);

    // Natural-number keys
    printf("\nTesting natural number %s keys\n", typeid(KeyT).name()); fflush(stdout);
    InitializeKeyBits(INTEGER_SEED, h_keys, max_items, 0);
    TestSizes(h_keys, max_items, max_segments);

    delete[] h_keys;
}
1111
1112
1113 //---------------------------------------------------------------------
1114 // Simple test
1115 //---------------------------------------------------------------------
1116
template <
    Backend     BACKEND,
    typename    KeyT,
    typename    ValueT,
    bool        IS_DESCENDING>
void Test(
    int         num_items,
    int         num_segments,
    GenMode     gen_mode,
    int         entropy_reduction,
    int         begin_bit,
    int         end_bit)
{
    // ValueT == NullType selects the keys-only code paths
    const bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;

    // Host-side problem and reference arrays
    KeyT    *h_keys             = new KeyT[num_items];
    int     *h_segment_offsets  = new int[num_segments + 1];
    int     *h_reference_ranks  = NULL;
    KeyT    *h_reference_keys   = NULL;
    ValueT  *h_values           = NULL;
    ValueT  *h_reference_values = NULL;

    // A negative end_bit means "sort on all key bits"
    if (end_bit < 0)
        end_bit = sizeof(KeyT) * 8;

    // Generate the input and compute the host-side reference solution
    InitializeKeyBits(gen_mode, h_keys, num_items, entropy_reduction);
    InitializeSegments(num_items, num_segments, h_segment_offsets);
    InitializeSolution<IS_DESCENDING>(
        h_keys, num_items, num_segments, h_segment_offsets,
        begin_bit, end_bit, h_reference_ranks, h_reference_keys);

    if (!KEYS_ONLY)
    {
        h_values           = new ValueT[num_items];
        h_reference_values = new ValueT[num_items];

        // Input values seeded from index; expected values seeded from reference ranks
        for (int i = 0; i < num_items; ++i)
        {
            InitValue(INTEGER_SEED, h_values[i], i);
            InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]);
        }
    }

    // The rank mapping is only needed to seed the reference values
    if (h_reference_ranks) delete[] h_reference_ranks;

    printf("\nTesting bits [%d,%d) of %s keys with gen-mode %d\n", begin_bit, end_bit, typeid(KeyT).name(), gen_mode); fflush(stdout);
    Test<BACKEND, IS_DESCENDING>(
        h_keys, h_values,
        num_items, num_segments, h_segment_offsets,
        begin_bit, end_bit, h_reference_keys, h_reference_values);

    // Cleanup
    if (h_keys)             delete[] h_keys;
    if (h_reference_keys)   delete[] h_reference_keys;
    if (h_values)           delete[] h_values;
    if (h_reference_values) delete[] h_reference_values;
    if (h_segment_offsets)  delete[] h_segment_offsets;
}
1173
1174
1175
1176 //---------------------------------------------------------------------
1177 // Main
1178 //---------------------------------------------------------------------
1179
1180 /**
1181 * Main
1182 */
/**
 * Main entry point.  Parses command-line options, initializes the device,
 * and dispatches one of three test suites selected at compile time:
 * QUICKER_TEST (basic CUB coverage), QUICK_TEST (CUB vs. Thrust comparison),
 * or the default thorough sweep over key types / sizes / distributions.
 */
int main(int argc, char** argv)
{
    int bits = -1;              // -1 => sort on all key bits
    int num_items = -1;         // -1 => suite-specific default
    int num_segments = -1;      // -1 => suite-specific default
    int entropy_reduction = 0;  // 0 => full-entropy random keys

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("n", num_items);
    args.GetCmdLineArgument("s", num_segments);
    args.GetCmdLineArgument("i", g_timing_iterations);
    args.GetCmdLineArgument("repeat", g_repeat);
    args.GetCmdLineArgument("bits", bits);
    args.GetCmdLineArgument("entropy", entropy_reduction);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        // Fixed: the --n, --s, and --i options were missing their closing ']'
        printf("%s "
            "[--bits=<valid key bits>]"
            "[--n=<input items>] "
            "[--s=<num segments>] "
            "[--i=<timing iterations>] "
            "[--device=<device-id>] "
            "[--repeat=<repetitions of entire test suite>]"
            "[--v] "
            "[--entropy=<entropy-reduction factor (default 0)>]"
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());

    // Get ptx version
    int ptx_version = 0;
    CubDebugExit(PtxVersion(ptx_version));

#ifdef QUICKER_TEST

    enum {
        IS_DESCENDING = false
    };

    // Compile/run basic CUB test
    if (num_items < 0) num_items = 48000000;
    if (num_segments < 0) num_segments = 5000;

    Test<CUB_SEGMENTED, unsigned int,       NullType, IS_DESCENDING>(num_items, num_segments, RANDOM, entropy_reduction, 0, bits);

    printf("\n-------------------------------\n");

    Test<CUB, unsigned char,                NullType, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
    Test<CUB, unsigned int,                 NullType, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
    Test<CUB, unsigned long long,           NullType, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);

    printf("\n-------------------------------\n");

#if (__CUDACC_VER_MAJOR__ >= 9)
    Test<CUB, half_t,                       NullType, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
#endif
    Test<CUB, float,                        NullType, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
    Test<CUB, double,                       NullType, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);

    printf("\n-------------------------------\n");

    Test<CUB, unsigned char,        unsigned int, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
    Test<CUB, unsigned int,         unsigned int, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
    Test<CUB, unsigned long long,   unsigned int, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);

#elif defined(QUICK_TEST)

    // Compile/run quick tests
    if (num_items < 0) num_items = 48000000;
    if (num_segments < 0) num_segments = 5000;

    // Compare CUB and thrust on 32b keys-only
    Test<CUB, unsigned int, NullType, false> (          num_items, 1, RANDOM, entropy_reduction, 0, bits);
    Test<THRUST, unsigned int, NullType, false> (       num_items, 1, RANDOM, entropy_reduction, 0, bits);

    // Compare CUB and thrust on 64b keys-only
    Test<CUB, unsigned long long, NullType, false> (    num_items, 1, RANDOM, entropy_reduction, 0, bits);
    Test<THRUST, unsigned long long, NullType, false> ( num_items, 1, RANDOM, entropy_reduction, 0, bits);


    // Compare CUB and thrust on 32b key-value pairs
    Test<CUB, unsigned int, unsigned int, false> (      num_items, 1, RANDOM, entropy_reduction, 0, bits);
    Test<THRUST, unsigned int, unsigned int, false> (   num_items, 1, RANDOM, entropy_reduction, 0, bits);

    // Compare CUB and thrust on 64b key + 32b value pairs
    Test<CUB, unsigned long long, unsigned int, false> (    num_items, 1, RANDOM, entropy_reduction, 0, bits);
    Test<THRUST, unsigned long long, unsigned int, false> ( num_items, 1, RANDOM, entropy_reduction, 0, bits);


#else

    // Compile/run thorough tests over all key types
    for (int i = 0; i <= g_repeat; ++i)
    {
        TestGen<bool>                 (num_items, num_segments);

        TestGen<char>                 (num_items, num_segments);
        TestGen<signed char>          (num_items, num_segments);
        TestGen<unsigned char>        (num_items, num_segments);

        TestGen<short>                (num_items, num_segments);
        TestGen<unsigned short>       (num_items, num_segments);

        TestGen<int>                  (num_items, num_segments);
        TestGen<unsigned int>         (num_items, num_segments);

        TestGen<long>                 (num_items, num_segments);
        TestGen<unsigned long>        (num_items, num_segments);

        TestGen<long long>            (num_items, num_segments);
        TestGen<unsigned long long>   (num_items, num_segments);

#if (__CUDACC_VER_MAJOR__ >= 9)
        TestGen<half_t>               (num_items, num_segments);
#endif
        TestGen<float>                (num_items, num_segments);

        if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted
            TestGen<double>           (num_items, num_segments);

    }

#endif

    return 0;
}
1316
1317