1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of the NVIDIA CORPORATION nor the
13  *       names of its contributors may be used to endorse or promote products
14  *       derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
29 /******************************************************************************
30  * Test of DeviceRadixSort utilities
31  ******************************************************************************/
32 
33 // Ensure printing of CUDA runtime errors to console
34 #define CUB_STDERR
35 
36 #include <stdio.h>
37 #include <algorithm>
38 #include <typeinfo>
39 
40 #if (__CUDACC_VER_MAJOR__ >= 9)
41     #include <cuda_fp16.h>
42 #endif
43 
44 #include <cub/util_allocator.cuh>
45 #include <cub/device/device_radix_sort.cuh>
46 #include <cub/device/device_segmented_radix_sort.cuh>
47 
48 #include "test_util.h"
49 
50 #include <thrust/device_ptr.h>
51 #include <thrust/sort.h>
52 #include <thrust/reverse.h>
53 
54 using namespace cub;
55 
56 
57 //---------------------------------------------------------------------
58 // Globals, constants and typedefs
59 //---------------------------------------------------------------------
60 
61 bool                    g_verbose           = false;
62 int                     g_timing_iterations = 0;
63 int                     g_repeat            = 0;
64 CachingDeviceAllocator  g_allocator(true);
65 
66 // Dispatch types
67 enum Backend
68 {
69     CUB,                        // CUB method (allows overwriting of input)
70     CUB_NO_OVERWRITE,           // CUB method (disallows overwriting of input)
71 
72     CUB_SEGMENTED,              // CUB method (allows overwriting of input)
73     CUB_SEGMENTED_NO_OVERWRITE, // CUB method (disallows overwriting of input)
74 
75     THRUST,                     // Thrust method
76     CDP,                        // GPU-based (dynamic parallelism) dispatch to CUB method
77 };
78 
79 
80 //---------------------------------------------------------------------
81 // Dispatch to different DeviceRadixSort entrypoints
82 //---------------------------------------------------------------------
83 
84 /**
85  * Dispatch to CUB sorting entrypoint (specialized for ascending)
86  */
87 template <typename KeyT, typename ValueT>
88 CUB_RUNTIME_FUNCTION
89 __forceinline__
90 cudaError_t Dispatch(
91     Int2Type<false>         /*is_descending*/,
92     Int2Type<CUB>           /*dispatch_to*/,
93     int                     */*d_selector*/,
94     size_t                  */*d_temp_storage_bytes*/,
95     cudaError_t             */*d_cdp_error*/,
96 
97     void*                   d_temp_storage,
98     size_t&                 temp_storage_bytes,
99     DoubleBuffer<KeyT>      &d_keys,
100     DoubleBuffer<ValueT>    &d_values,
101     int                     num_items,
102     int                     /*num_segments*/,
103     const int               */*d_segment_offsets*/,
104     int                     begin_bit,
105     int                     end_bit,
106     cudaStream_t            stream,
107     bool                    debug_synchronous)
108 {
109     return DeviceRadixSort::SortPairs(
110         d_temp_storage, temp_storage_bytes,
111         d_keys, d_values,
112         num_items, begin_bit, end_bit, stream, debug_synchronous);
113 }
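
/**
 * Illustrative sketch (an addition for exposition, not called by the test
 * harness): the two-phase pattern that every Dispatch overload in this file
 * relies on.  The first SortPairs call with d_temp_storage == NULL only
 * reports the required temp_storage_bytes; the second call performs the sort.
 * With the DoubleBuffer overload the sorted results may land in either buffer,
 * so callers must consult d_keys.Current() afterwards.  Function and parameter
 * names here are hypothetical.
 */
template <typename KeyT, typename ValueT>
cudaError_t ExampleTwoPhaseSortPairs(
    DoubleBuffer<KeyT>      &d_keys,        // device double-buffer of keys
    DoubleBuffer<ValueT>    &d_values,      // device double-buffer of values
    int                     num_items)
{
    void    *d_temp_storage     = NULL;
    size_t  temp_storage_bytes  = 0;

    // Phase 1: size the temporary storage
    cudaError_t error = DeviceRadixSort::SortPairs(
        d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
    if (error != cudaSuccess) return error;

    // Phase 2: allocate scratch space and sort
    error = g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes);
    if (error != cudaSuccess) return error;

    error = DeviceRadixSort::SortPairs(
        d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);

    g_allocator.DeviceFree(d_temp_storage);
    return error;
}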
114 
115 /**
116  * Dispatch to CUB_NO_OVERWRITE sorting entrypoint (specialized for ascending)
117  */
118 template <typename KeyT, typename ValueT>
119 CUB_RUNTIME_FUNCTION
120 __forceinline__
121 cudaError_t Dispatch(
122     Int2Type<false>             /*is_descending*/,
123     Int2Type<CUB_NO_OVERWRITE>  /*dispatch_to*/,
124     int                         */*d_selector*/,
125     size_t                      */*d_temp_storage_bytes*/,
126     cudaError_t                 */*d_cdp_error*/,
127 
128     void*                   d_temp_storage,
129     size_t&                 temp_storage_bytes,
130     DoubleBuffer<KeyT>      &d_keys,
131     DoubleBuffer<ValueT>    &d_values,
132     int                     num_items,
133     int                     /*num_segments*/,
134     const int               */*d_segment_offsets*/,
135     int                     begin_bit,
136     int                     end_bit,
137     cudaStream_t            stream,
138     bool                    debug_synchronous)
139 {
140     KeyT      const *const_keys_itr     = d_keys.Current();
141     ValueT    const *const_values_itr   = d_values.Current();
142 
143     cudaError_t retval = DeviceRadixSort::SortPairs(
144         d_temp_storage, temp_storage_bytes,
145         const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(),
146         num_items, begin_bit, end_bit, stream, debug_synchronous);
147 
148     d_keys.selector ^= 1;
149     d_values.selector ^= 1;
150     return retval;
151 }
152 
153 /**
154  * Dispatch to CUB sorting entrypoint (specialized for descending)
155  */
156 template <typename KeyT, typename ValueT>
157 CUB_RUNTIME_FUNCTION
158 __forceinline__
159 cudaError_t Dispatch(
160     Int2Type<true>          /*is_descending*/,
161     Int2Type<CUB>           /*dispatch_to*/,
162     int                     */*d_selector*/,
163     size_t                  */*d_temp_storage_bytes*/,
164     cudaError_t             */*d_cdp_error*/,
165 
166     void*                   d_temp_storage,
167     size_t&                 temp_storage_bytes,
168     DoubleBuffer<KeyT>      &d_keys,
169     DoubleBuffer<ValueT>    &d_values,
170     int                     num_items,
171     int                     /*num_segments*/,
172     const int               */*d_segment_offsets*/,
173     int                     begin_bit,
174     int                     end_bit,
175     cudaStream_t            stream,
176     bool                    debug_synchronous)
177 {
178     return DeviceRadixSort::SortPairsDescending(
179         d_temp_storage, temp_storage_bytes,
180         d_keys, d_values,
181         num_items, begin_bit, end_bit, stream, debug_synchronous);
182 }
183 
184 
185 /**
186  * Dispatch to CUB_NO_OVERWRITE sorting entrypoint (specialized for descending)
187  */
188 template <typename KeyT, typename ValueT>
189 CUB_RUNTIME_FUNCTION
190 __forceinline__
191 cudaError_t Dispatch(
192     Int2Type<true>              /*is_descending*/,
193     Int2Type<CUB_NO_OVERWRITE>  /*dispatch_to*/,
194     int                         */*d_selector*/,
195     size_t                      */*d_temp_storage_bytes*/,
196     cudaError_t                 */*d_cdp_error*/,
197 
198     void*                   d_temp_storage,
199     size_t&                 temp_storage_bytes,
200     DoubleBuffer<KeyT>      &d_keys,
201     DoubleBuffer<ValueT>    &d_values,
202     int                     num_items,
203     int                     /*num_segments*/,
204     const int               */*d_segment_offsets*/,
205     int                     begin_bit,
206     int                     end_bit,
207     cudaStream_t            stream,
208     bool                    debug_synchronous)
209 {
210     KeyT      const *const_keys_itr     = d_keys.Current();
211     ValueT    const *const_values_itr   = d_values.Current();
212 
213     cudaError_t retval = DeviceRadixSort::SortPairsDescending(
214         d_temp_storage, temp_storage_bytes,
215         const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(),
216         num_items, begin_bit, end_bit, stream, debug_synchronous);
217 
218     d_keys.selector ^= 1;
219     d_values.selector ^= 1;
220     return retval;
221 }
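
/**
 * Illustrative sketch (exposition only; names are hypothetical): the
 * CUB_NO_OVERWRITE variants above exercise the overloads that take separate
 * const input and output arrays instead of a DoubleBuffer.  Those overloads
 * never modify the input arrays and always write the results to the output
 * arrays, which is why the dispatchers flip each DoubleBuffer's selector so
 * that Current() points at the sorted data afterwards.
 */
template <typename KeyT>
cudaError_t ExampleSortKeysNoOverwrite(
    const KeyT  *d_keys_in,     // input; left untouched by the sort
    KeyT        *d_keys_out,    // output; receives the sorted keys
    int         num_items)
{
    void    *d_temp_storage     = NULL;
    size_t  temp_storage_bytes  = 0;

    // Size request
    cudaError_t error = DeviceRadixSort::SortKeys(
        d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
    if (error != cudaSuccess) return error;

    error = g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes);
    if (error != cudaSuccess) return error;

    error = DeviceRadixSort::SortKeys(
        d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);

    g_allocator.DeviceFree(d_temp_storage);
    return error;
}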
222 
223 //---------------------------------------------------------------------
224 // Dispatch to different DeviceSegmentedRadixSort entrypoints
225 //---------------------------------------------------------------------
226 
227 /**
228  * Dispatch to CUB_SEGMENTED sorting entrypoint (specialized for ascending)
229  */
230 template <typename KeyT, typename ValueT>
231 CUB_RUNTIME_FUNCTION
232 __forceinline__
233 cudaError_t Dispatch(
234     Int2Type<false>         /*is_descending*/,
235     Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
236     int                     */*d_selector*/,
237     size_t                  */*d_temp_storage_bytes*/,
238     cudaError_t             */*d_cdp_error*/,
239 
240     void*                   d_temp_storage,
241     size_t&                 temp_storage_bytes,
242     DoubleBuffer<KeyT>      &d_keys,
243     DoubleBuffer<ValueT>    &d_values,
244     int                     num_items,
245     int                     num_segments,
246     const int               *d_segment_offsets,
247     int                     begin_bit,
248     int                     end_bit,
249     cudaStream_t            stream,
250     bool                    debug_synchronous)
251 {
252     return DeviceSegmentedRadixSort::SortPairs(
253         d_temp_storage, temp_storage_bytes,
254         d_keys, d_values,
255         num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
256         begin_bit, end_bit, stream, debug_synchronous);
257 }
258 
259 /**
260  * Dispatch to CUB_SEGMENTED_NO_OVERWRITE sorting entrypoint (specialized for ascending)
261  */
262 template <typename KeyT, typename ValueT>
263 CUB_RUNTIME_FUNCTION
264 __forceinline__
265 cudaError_t Dispatch(
266     Int2Type<false>                         /*is_descending*/,
267     Int2Type<CUB_SEGMENTED_NO_OVERWRITE>    /*dispatch_to*/,
268     int                                     */*d_selector*/,
269     size_t                                  */*d_temp_storage_bytes*/,
270     cudaError_t                             */*d_cdp_error*/,
271 
272     void*                   d_temp_storage,
273     size_t&                 temp_storage_bytes,
274     DoubleBuffer<KeyT>      &d_keys,
275     DoubleBuffer<ValueT>    &d_values,
276     int                     num_items,
277     int                     num_segments,
278     const int               *d_segment_offsets,
279     int                     begin_bit,
280     int                     end_bit,
281     cudaStream_t            stream,
282     bool                    debug_synchronous)
283 {
284     KeyT      const *const_keys_itr     = d_keys.Current();
285     ValueT    const *const_values_itr   = d_values.Current();
286 
287     cudaError_t retval = DeviceSegmentedRadixSort::SortPairs(
288         d_temp_storage, temp_storage_bytes,
289         const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(),
290         num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
291         begin_bit, end_bit, stream, debug_synchronous);
292 
293     d_keys.selector ^= 1;
294     d_values.selector ^= 1;
295     return retval;
296 }
297 
298 
299 /**
300  * Dispatch to CUB_SEGMENTED sorting entrypoint (specialized for descending)
301  */
302 template <typename KeyT, typename ValueT>
303 CUB_RUNTIME_FUNCTION
304 __forceinline__
305 cudaError_t Dispatch(
306     Int2Type<true>          /*is_descending*/,
307     Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
308     int                     */*d_selector*/,
309     size_t                  */*d_temp_storage_bytes*/,
310     cudaError_t             */*d_cdp_error*/,
311 
312     void*                   d_temp_storage,
313     size_t&                 temp_storage_bytes,
314     DoubleBuffer<KeyT>      &d_keys,
315     DoubleBuffer<ValueT>    &d_values,
316     int                     num_items,
317     int                     num_segments,
318     const int               *d_segment_offsets,
319     int                     begin_bit,
320     int                     end_bit,
321     cudaStream_t            stream,
322     bool                    debug_synchronous)
323 {
324     return DeviceSegmentedRadixSort::SortPairsDescending(
325         d_temp_storage, temp_storage_bytes,
326         d_keys, d_values,
327         num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
328         begin_bit, end_bit, stream, debug_synchronous);
329 }
330 
331 /**
332  * Dispatch to CUB_SEGMENTED_NO_OVERWRITE sorting entrypoint (specialized for descending)
333  */
334 template <typename KeyT, typename ValueT>
335 CUB_RUNTIME_FUNCTION
336 __forceinline__
337 cudaError_t Dispatch(
338     Int2Type<true>                          /*is_descending*/,
339     Int2Type<CUB_SEGMENTED_NO_OVERWRITE>    /*dispatch_to*/,
340     int                                     */*d_selector*/,
341     size_t                                  */*d_temp_storage_bytes*/,
342     cudaError_t                             */*d_cdp_error*/,
343 
344     void*                   d_temp_storage,
345     size_t&                 temp_storage_bytes,
346     DoubleBuffer<KeyT>      &d_keys,
347     DoubleBuffer<ValueT>    &d_values,
348     int                     num_items,
349     int                     num_segments,
350     const int               *d_segment_offsets,
351     int                     begin_bit,
352     int                     end_bit,
353     cudaStream_t            stream,
354     bool                    debug_synchronous)
355 {
356     KeyT      const *const_keys_itr     = d_keys.Current();
357     ValueT    const *const_values_itr   = d_values.Current();
358 
359     cudaError_t retval = DeviceSegmentedRadixSort::SortPairsDescending(
360         d_temp_storage, temp_storage_bytes,
361         const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(),
362         num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
363         begin_bit, end_bit, stream, debug_synchronous);
364 
365     d_keys.selector ^= 1;
366     d_values.selector ^= 1;
367     return retval;
368 }
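
/**
 * Illustrative sketch (exposition only; names are hypothetical): the segmented
 * overloads above pass the same offsets array twice -- d_segment_offsets as
 * the begin offsets and d_segment_offsets + 1 as the end offsets -- so that
 * segment i covers the half-open range
 * [d_segment_offsets[i], d_segment_offsets[i + 1]).
 */
template <typename KeyT>
cudaError_t ExampleSegmentedSortKeys(
    DoubleBuffer<KeyT>  &d_keys,
    int                 num_items,
    int                 num_segments,
    const int           *d_segment_offsets)    // num_segments + 1 entries
{
    void    *d_temp_storage     = NULL;
    size_t  temp_storage_bytes  = 0;

    // Size request
    cudaError_t error = DeviceSegmentedRadixSort::SortKeys(
        d_temp_storage, temp_storage_bytes, d_keys,
        num_items, num_segments, d_segment_offsets, d_segment_offsets + 1);
    if (error != cudaSuccess) return error;

    error = g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes);
    if (error != cudaSuccess) return error;

    error = DeviceSegmentedRadixSort::SortKeys(
        d_temp_storage, temp_storage_bytes, d_keys,
        num_items, num_segments, d_segment_offsets, d_segment_offsets + 1);

    g_allocator.DeviceFree(d_temp_storage);
    return error;
}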
369 
370 
371 //---------------------------------------------------------------------
372 // Dispatch to different Thrust entrypoints
373 //---------------------------------------------------------------------
374 
375 /**
376  * Dispatch keys-only to Thrust sorting entrypoint
377  */
378 template <int IS_DESCENDING, typename KeyT>
379 cudaError_t Dispatch(
380     Int2Type<IS_DESCENDING> /*is_descending*/,
381     Int2Type<THRUST>        /*dispatch_to*/,
382     int                     */*d_selector*/,
383     size_t                  */*d_temp_storage_bytes*/,
384     cudaError_t             */*d_cdp_error*/,
385 
386     void                    *d_temp_storage,
387     size_t                  &temp_storage_bytes,
388     DoubleBuffer<KeyT>      &d_keys,
389     DoubleBuffer<NullType>  &/*d_values*/,
390     int                     num_items,
391     int                     /*num_segments*/,
392     const int               */*d_segment_offsets*/,
393     int                     /*begin_bit*/,
394     int                     /*end_bit*/,
395     cudaStream_t            /*stream*/,
396     bool                    /*debug_synchronous*/)
397 {
398 
399     if (d_temp_storage == 0)
400     {
401         temp_storage_bytes = 1;
402     }
403     else
404     {
405         thrust::device_ptr<KeyT> d_keys_wrapper(d_keys.Current());
406 
407         if (IS_DESCENDING) thrust::reverse(d_keys_wrapper, d_keys_wrapper + num_items);
408         thrust::sort(d_keys_wrapper, d_keys_wrapper + num_items);
409         if (IS_DESCENDING) thrust::reverse(d_keys_wrapper, d_keys_wrapper + num_items);
410     }
411 
412     return cudaSuccess;
413 }
414 
415 
416 /**
417  * Dispatch key-value pairs to Thrust sorting entrypoint
418  */
419 template <int IS_DESCENDING, typename KeyT, typename ValueT>
420 cudaError_t Dispatch(
421     Int2Type<IS_DESCENDING> /*is_descending*/,
422     Int2Type<THRUST>        /*dispatch_to*/,
423     int                     */*d_selector*/,
424     size_t                  */*d_temp_storage_bytes*/,
425     cudaError_t             */*d_cdp_error*/,
426 
427     void                    *d_temp_storage,
428     size_t                  &temp_storage_bytes,
429     DoubleBuffer<KeyT>      &d_keys,
430     DoubleBuffer<ValueT>    &d_values,
431     int                     num_items,
432     int                     /*num_segments*/,
433     const int               */*d_segment_offsets*/,
434     int                     /*begin_bit*/,
435     int                     /*end_bit*/,
436     cudaStream_t            /*stream*/,
437     bool                    /*debug_synchronous*/)
438 {
439 
440     if (d_temp_storage == 0)
441     {
442         temp_storage_bytes = 1;
443     }
444     else
445     {
446         thrust::device_ptr<KeyT>     d_keys_wrapper(d_keys.Current());
447         thrust::device_ptr<ValueT>   d_values_wrapper(d_values.Current());
448 
449         if (IS_DESCENDING) {
450             thrust::reverse(d_keys_wrapper, d_keys_wrapper + num_items);
451             thrust::reverse(d_values_wrapper, d_values_wrapper + num_items);
452         }
453 
454         thrust::sort_by_key(d_keys_wrapper, d_keys_wrapper + num_items, d_values_wrapper);
455 
456         if (IS_DESCENDING) {
457             thrust::reverse(d_keys_wrapper, d_keys_wrapper + num_items);
458             thrust::reverse(d_values_wrapper, d_values_wrapper + num_items);
459         }
460     }
461 
462     return cudaSuccess;
463 }
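
/**
 * Illustrative alternative (exposition only; not what the harness does): a
 * descending reference sort can also be expressed directly with a
 * thrust::greater<> comparator rather than the reverse/sort/reverse sequence
 * used above.  The extra include and the helper below are assumptions added
 * for illustration.
 */
#include <thrust/functional.h>

template <typename KeyT, typename ValueT>
void ExampleThrustSortPairsDescending(
    KeyT    *d_keys,
    ValueT  *d_values,
    int     num_items)
{
    thrust::device_ptr<KeyT>   d_keys_wrapper(d_keys);
    thrust::device_ptr<ValueT> d_values_wrapper(d_values);

    // Sort keys (and reorder values accordingly) into descending key order
    thrust::sort_by_key(
        d_keys_wrapper, d_keys_wrapper + num_items, d_values_wrapper,
        thrust::greater<KeyT>());
}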
464 
465 
466 //---------------------------------------------------------------------
467 // CUDA Nested Parallelism Test Kernel
468 //---------------------------------------------------------------------
469 
470 /**
471  * Simple wrapper kernel to invoke DeviceRadixSort
472  */
473 template <int IS_DESCENDING, typename KeyT, typename ValueT>
474 __global__ void CnpDispatchKernel(
475     Int2Type<IS_DESCENDING> is_descending,
476     int                     *d_selector,
477     size_t                  *d_temp_storage_bytes,
478     cudaError_t             *d_cdp_error,
479 
480     void                    *d_temp_storage,
481     size_t                  temp_storage_bytes,
482     DoubleBuffer<KeyT>      d_keys,
483     DoubleBuffer<ValueT>    d_values,
484     int                     num_items,
485     int                     num_segments,
486     const int               *d_segment_offsets,
487     int                     begin_bit,
488     int                     end_bit,
489     bool                    debug_synchronous)
490 {
491 #ifndef CUB_CDP
492   (void)is_descending;
493   (void)d_selector;
494   (void)d_temp_storage_bytes;
495   (void)d_cdp_error;
496   (void)d_temp_storage;
497   (void)temp_storage_bytes;
498   (void)d_keys;
499   (void)d_values;
500   (void)num_items;
501   (void)num_segments;
502   (void)d_segment_offsets;
503   (void)begin_bit;
504   (void)end_bit;
505   (void)debug_synchronous;
506     *d_cdp_error            = cudaErrorNotSupported;
507 #else
508     *d_cdp_error            = Dispatch(
509                                 is_descending, Int2Type<CUB>(), d_selector, d_temp_storage_bytes, d_cdp_error,
510                                 d_temp_storage, temp_storage_bytes, d_keys, d_values,
511                                 num_items, num_segments, d_segment_offsets,
512                                 begin_bit, end_bit, 0, debug_synchronous);
513     *d_temp_storage_bytes   = temp_storage_bytes;
514     *d_selector             = d_keys.selector;
515 #endif
516 }
517 
518 
519 /**
520  * Dispatch to CDP kernel
521  */
522 template <int IS_DESCENDING, typename KeyT, typename ValueT>
523 cudaError_t Dispatch(
524     Int2Type<IS_DESCENDING> is_descending,
525     Int2Type<CDP>           dispatch_to,
526     int                     *d_selector,
527     size_t                  *d_temp_storage_bytes,
528     cudaError_t             *d_cdp_error,
529 
530     void                    *d_temp_storage,
531     size_t                  &temp_storage_bytes,
532     DoubleBuffer<KeyT>      &d_keys,
533     DoubleBuffer<ValueT>    &d_values,
534     int                     num_items,
535     int                     num_segments,
536     const int               *d_segment_offsets,
537     int                     begin_bit,
538     int                     end_bit,
539     cudaStream_t            stream,
540     bool                    debug_synchronous)
541 {
542     // Invoke kernel to invoke device-side dispatch
543     CnpDispatchKernel<<<1,1>>>(
544         is_descending, d_selector, d_temp_storage_bytes, d_cdp_error,
545         d_temp_storage, temp_storage_bytes, d_keys, d_values,
546         num_items, num_segments, d_segment_offsets,
547         begin_bit, end_bit, debug_synchronous);
548 
549     // Copy out selector
550     CubDebugExit(cudaMemcpy(&d_keys.selector, d_selector, sizeof(int) * 1, cudaMemcpyDeviceToHost));
551     d_values.selector = d_keys.selector;
552 
553     // Copy out temp_storage_bytes
554     CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
555 
556     // Copy out error
557     cudaError_t retval;
558     CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
559     return retval;
560 }
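
/**
 * Note on the CDP path above (exposition only): CnpDispatchKernel forwards to
 * the device-side Dispatch overloads only when this file is compiled with
 * CUB_CDP defined and with relocatable device code enabled for dynamic
 * parallelism; otherwise the kernel simply reports cudaErrorNotSupported.
 * One possible way to enable it is to add flags along the lines of
 * -rdc=true -DCUB_CDP to the nvcc invocation (the exact flag set is an
 * assumption and depends on the build setup and target architecture).
 */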
561 
562 
563 
564 //---------------------------------------------------------------------
565 // Problem generation
566 //---------------------------------------------------------------------
567 
568 
569 /**
570  * Simple key-value pairing
571  */
572 template <
573     typename KeyT,
574     typename ValueT,
575     bool IS_FLOAT = (Traits<KeyT>::CATEGORY == FLOATING_POINT)>
576 struct Pair
577 {
578     KeyT     key;
579     ValueT   value;
580 
581     bool operator<(const Pair &b) const
582     {
583         return (key < b.key);
584     }
585 };
586 
587 
588 /**
589  * Simple key-value pairing (specialized for bool types)
590  */
591 template <typename ValueT>
592 struct Pair<bool, ValueT, false>
593 {
594     bool     key;
595     ValueT   value;
596 
597     bool operator<(const Pair &b) const
598     {
599         return (!key && b.key);
600     }
601 };
602 
603 
604 /**
605  * Simple key-value pairing (specialized for floating point types)
606  */
607 template <typename KeyT, typename ValueT>
608 struct Pair<KeyT, ValueT, true>
609 {
610     KeyT     key;
611     ValueT   value;
612 
613     bool operator<(const Pair &b) const
614     {
615         if (key < b.key)
616             return true;
617 
618         if (key > b.key)
619             return false;
620 
621         // KeyT in unsigned bits
622         typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
623 
624         // Return true if key is negative zero and b.key is positive zero
625         UnsignedBits key_bits   = SafeBitCast<UnsignedBits>(key);
626         UnsignedBits b_key_bits = SafeBitCast<UnsignedBits>(b.key);
627         UnsignedBits HIGH_BIT   = Traits<KeyT>::HIGH_BIT;
628 
629         return ((key_bits & HIGH_BIT) != 0) && ((b_key_bits & HIGH_BIT) == 0);
630     }
631 };
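
/**
 * Illustrative check (exposition only; hypothetical helper, never called): the
 * floating-point specialization above orders -0.0 ahead of +0.0 even though
 * the two compare equal, matching a radix sort that ranks keys by their
 * bit-twiddled bit patterns.
 */
template <typename KeyT>
bool ExampleNegativeZeroSortsFirst()
{
    Pair<KeyT, int> neg_zero = { KeyT(-0.0), 0 };
    Pair<KeyT, int> pos_zero = { KeyT(0.0),  1 };

    // Expect -0.0 < +0.0 under the tie-breaking rule, but not the reverse
    return (neg_zero < pos_zero) && !(pos_zero < neg_zero);
}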
632 
633 
634 /**
635  * Initialize key data
636  */
637 template <typename KeyT>
638 void InitializeKeyBits(
639     GenMode         gen_mode,
640     KeyT            *h_keys,
641     int             num_items,
642     int             /*entropy_reduction*/)
643 {
644     for (int i = 0; i < num_items; ++i)
645         InitValue(gen_mode, h_keys[i], i);
646 }
647 
648 
649 /**
650  * Initialize solution
651  */
652 template <bool IS_DESCENDING, typename KeyT>
653 void InitializeSolution(
654     KeyT    *h_keys,
655     int     num_items,
656     int     num_segments,
657     int     *h_segment_offsets,
658     int     begin_bit,
659     int     end_bit,
660     int     *&h_reference_ranks,
661     KeyT    *&h_reference_keys)
662 {
663     typedef Pair<KeyT, int> PairT;
664 
665     PairT *h_pairs = new PairT[num_items];
666 
667     int num_bits = end_bit - begin_bit;
668     for (int i = 0; i < num_items; ++i)
669     {
670 
671         // Mask off unwanted portions
672         if (num_bits < static_cast<int>(sizeof(KeyT) * 8))
673         {
674             unsigned long long base = 0;
675             memcpy(&base, &h_keys[i], sizeof(KeyT));
676             base &= ((1ull << num_bits) - 1) << begin_bit;
677             memcpy(&h_pairs[i].key, &base, sizeof(KeyT));
678         }
679         else
680         {
681             h_pairs[i].key = h_keys[i];
682         }
683 
684         h_pairs[i].value = i;
685     }
686 
687     printf("\nSorting reference solution on CPU (%d segments)...", num_segments); fflush(stdout);
688 
689     for (int i = 0; i < num_segments; ++i)
690     {
691         if (IS_DESCENDING) std::reverse(h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
692         std::stable_sort(               h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
693         if (IS_DESCENDING) std::reverse(h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
694     }
695 
696     printf(" Done.\n"); fflush(stdout);
697 
698     h_reference_ranks  = new int[num_items];
699     h_reference_keys   = new KeyT[num_items];
700 
701     for (int i = 0; i < num_items; ++i)
702     {
703         h_reference_ranks[i]    = h_pairs[i].value;
704         h_reference_keys[i]     = h_keys[h_pairs[i].value];
705     }
706 
707     if (h_pairs) delete[] h_pairs;
708 }
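
/**
 * Worked illustration of the masking step in InitializeSolution() above
 * (hypothetical helper, exposition only): keeping only bits
 * [begin_bit, end_bit) zeroes everything outside that window so the CPU
 * reference agrees with a partial-bit radix sort.  For example, masking
 * 0xABCD to bits [4, 12) yields 0x0BC0.  As in the caller, this assumes
 * num_bits < 64 so the shift below is well defined.
 */
inline unsigned long long ExampleMaskKeyBits(
    unsigned long long  key,
    int                 begin_bit,
    int                 end_bit)
{
    int num_bits = end_bit - begin_bit;
    return key & (((1ull << num_bits) - 1) << begin_bit);
}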
709 
710 
711 //---------------------------------------------------------------------
712 // Test generation
713 //---------------------------------------------------------------------
714 
715 
716 /**
717  * Test DeviceRadixSort
718  */
719 template <
720     Backend     BACKEND,
721     bool        IS_DESCENDING,
722     typename    KeyT,
723     typename    ValueT>
724 void Test(
725     KeyT        *h_keys,
726     ValueT      *h_values,
727     int         num_items,
728     int         num_segments,
729     int         *h_segment_offsets,
730     int         begin_bit,
731     int         end_bit,
732     KeyT        *h_reference_keys,
733     ValueT      *h_reference_values)
734 {
735     // Key alias type
736 #if (__CUDACC_VER_MAJOR__ >= 9)
737     typedef typename If<Equals<KeyT, half_t>::VALUE, __half, KeyT>::Type KeyAliasT;
738 #else
739     typedef KeyT KeyAliasT;
740 #endif
741 
742     const bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;
743 
744     printf("%s %s cub::DeviceRadixSort %d items, %d segments, %d-byte keys (%s) %d-byte values (%s), descending %d, begin_bit %d, end_bit %d\n",
745         (BACKEND == CUB_NO_OVERWRITE) ? "CUB_NO_OVERWRITE" : (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
746         (KEYS_ONLY) ? "keys-only" : "key-value",
747         num_items, num_segments,
748         (int) sizeof(KeyT), typeid(KeyT).name(), (KEYS_ONLY) ? 0 : (int) sizeof(ValueT), typeid(ValueT).name(),
749         IS_DESCENDING, begin_bit, end_bit);
750     fflush(stdout);
751 
752     if (g_verbose)
753     {
754         printf("Input keys:\n");
755         DisplayResults(h_keys, num_items);
756         printf("\n\n");
757     }
758 
759     // Allocate device arrays
760     DoubleBuffer<KeyAliasT> d_keys;
761     DoubleBuffer<ValueT>    d_values;
762     int                     *d_selector;
763     int                     *d_segment_offsets;
764     size_t                  *d_temp_storage_bytes;
765     cudaError_t             *d_cdp_error;
766     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(KeyT) * num_items));
767     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(KeyT) * num_items));
768     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_selector, sizeof(int) * 1));
769     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(int) * (num_segments + 1)));
770     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
771     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));
772     if (!KEYS_ONLY)
773     {
774         CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(ValueT) * num_items));
775         CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(ValueT) * num_items));
776     }
777 
778     // Allocate temporary storage (and make it un-aligned)
779     size_t  temp_storage_bytes  = 0;
780     void    *d_temp_storage     = NULL;
781     CubDebugExit(Dispatch(
782         Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
783         d_temp_storage, temp_storage_bytes, d_keys, d_values,
784         num_items, num_segments, d_segment_offsets,
785         begin_bit, end_bit, 0, true));
786 
787     CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + 1));
788     void* mis_aligned_temp = static_cast<char*>(d_temp_storage) + 1;
789 
790     // Initialize/clear device arrays
791     d_keys.selector = 0;
792     CubDebugExit(cudaMemcpy(d_keys.d_buffers[0], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
793     CubDebugExit(cudaMemset(d_keys.d_buffers[1], 0, sizeof(KeyT) * num_items));
794     if (!KEYS_ONLY)
795     {
796         d_values.selector = 0;
797         CubDebugExit(cudaMemcpy(d_values.d_buffers[0], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
798         CubDebugExit(cudaMemset(d_values.d_buffers[1], 0, sizeof(ValueT) * num_items));
799     }
800     CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(int) * (num_segments + 1), cudaMemcpyHostToDevice));
801 
802     // Run warmup/correctness iteration
803     CubDebugExit(Dispatch(
804         Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
805         mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
806         num_items, num_segments, d_segment_offsets,
807         begin_bit, end_bit, 0, true));
808 
809     // Flush any stdout/stderr
810     fflush(stdout);
811     fflush(stderr);
812 
813     // Check for correctness (and display results, if specified)
814     printf("Warmup done.  Checking results:\n"); fflush(stdout);
815     int compare = CompareDeviceResults(h_reference_keys, reinterpret_cast<KeyT*>(d_keys.Current()), num_items, true, g_verbose);
816     printf("\t Compare keys (selector %d): %s ", d_keys.selector, compare ? "FAIL" : "PASS"); fflush(stdout);
817     if (!KEYS_ONLY)
818     {
819         int values_compare = CompareDeviceResults(h_reference_values, d_values.Current(), num_items, true, g_verbose);
820         compare |= values_compare;
821         printf("\t Compare values (selector %d): %s ", d_values.selector, values_compare ? "FAIL" : "PASS"); fflush(stdout);
822     }
823     if (BACKEND == CUB_NO_OVERWRITE)
824     {
825         // Check that input isn't overwritten
826         int input_compare = CompareDeviceResults(h_keys, reinterpret_cast<KeyT*>(d_keys.d_buffers[0]), num_items, true, g_verbose);
827         compare |= input_compare;
828         printf("\t Compare input keys: %s ", input_compare ? "FAIL" : "PASS"); fflush(stdout);
829     }
830 
831     // Performance
832     if (g_timing_iterations)
833         printf("\nPerforming timing iterations:\n"); fflush(stdout);
834 
835     GpuTimer gpu_timer;
836     float elapsed_millis = 0.0f;
837     for (int i = 0; i < g_timing_iterations; ++i)
838     {
839         // Initialize/clear device arrays
840         CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
841         CubDebugExit(cudaMemset(d_keys.d_buffers[d_keys.selector ^ 1], 0, sizeof(KeyT) * num_items));
842         if (!KEYS_ONLY)
843         {
844             CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
845             CubDebugExit(cudaMemset(d_values.d_buffers[d_values.selector ^ 1], 0, sizeof(ValueT) * num_items));
846         }
847 
848         gpu_timer.Start();
849         CubDebugExit(Dispatch(
850             Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
851             mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
852             num_items, num_segments, d_segment_offsets,
853             begin_bit, end_bit, 0, false));
854         gpu_timer.Stop();
855         elapsed_millis += gpu_timer.ElapsedMillis();
856     }
857 
858     // Display performance
859     if (g_timing_iterations > 0)
860     {
861         float avg_millis = elapsed_millis / g_timing_iterations;
862         float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
863         float giga_bandwidth = (KEYS_ONLY) ?
864             giga_rate * sizeof(KeyT) * 2 :
865             giga_rate * (sizeof(KeyT) + sizeof(ValueT)) * 2;
866         printf("\n%.3f elapsed ms, %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", elapsed_millis, avg_millis, giga_rate, giga_bandwidth);
867     }
868 
869     printf("\n\n");
870 
871     // Cleanup
872     if (d_keys.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0]));
873     if (d_keys.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1]));
874     if (d_values.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[0]));
875     if (d_values.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[1]));
876     if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
877     if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
878     if (d_selector) CubDebugExit(g_allocator.DeviceFree(d_selector));
879     if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
880     if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
881 
882     // Correctness asserts
883     AssertEquals(0, compare);
884 }
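
/**
 * Illustrative helper (exposition only; not used above): the "logical GB/s"
 * figure printed by Test() charges each key -- and each value, for key-value
 * sorts -- with exactly one read and one write, i.e.
 * bytes_moved = 2 * num_items * (sizeof(KeyT) [+ sizeof(ValueT)]).
 */
inline float ExampleLogicalGigabandwidth(
    float   giga_items_per_sec,     // billions of items sorted per second
    int     key_bytes,
    int     value_bytes)            // pass 0 for keys-only sorts
{
    return giga_items_per_sec * (key_bytes + value_bytes) * 2;
}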
885 
886 
887 /**
888  * Test backend
889  */
890 template <bool IS_DESCENDING, typename KeyT, typename ValueT>
891 void TestBackend(
892     KeyT    *h_keys,
893     int     num_items,
894     int     num_segments,
895     int     *h_segment_offsets,
896     int     begin_bit,
897     int     end_bit,
898     KeyT    *h_reference_keys,
899     int     *h_reference_ranks)
900 {
901     const bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;
902 
903     ValueT *h_values             = NULL;
904     ValueT *h_reference_values   = NULL;
905 
906     if (!KEYS_ONLY)
907     {
908         h_values            = new ValueT[num_items];
909         h_reference_values  = new ValueT[num_items];
910 
911         for (int i = 0; i < num_items; ++i)
912         {
913             InitValue(INTEGER_SEED, h_values[i], i);
914             InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]);
915         }
916     }
917 
918 #ifdef SEGMENTED_SORT
919     // Test multi-segment implementations
920     Test<CUB_SEGMENTED, IS_DESCENDING>(               h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
921     Test<CUB_SEGMENTED_NO_OVERWRITE, IS_DESCENDING>(  h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
922 #else   // SEGMENTED_SORT
923     if (num_segments == 1)
924     {
925         // Test single-segment implementations
926         Test<CUB, IS_DESCENDING>(               h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
927         Test<CUB_NO_OVERWRITE, IS_DESCENDING>(  h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
928     #ifdef CUB_CDP
929         Test<CDP, IS_DESCENDING>(               h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
930     #endif
931     }
932 #endif  // SEGMENTED_SORT
933 
934     if (h_values) delete[] h_values;
935     if (h_reference_values) delete[] h_reference_values;
936 }
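
/**
 * Note on the SEGMENTED_SORT switch above (exposition only): the segmented
 * backends are chosen at compile time, so the same source builds either the
 * single-segment or the segmented test binary.  Defining SEGMENTED_SORT on
 * the compiler command line (e.g. -DSEGMENTED_SORT with nvcc) selects the
 * segmented variant; the exact build invocation is an assumption and depends
 * on the project's build setup.
 */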
937 
938 
939 
940 
941 /**
942  * Test value type
943  */
944 template <bool IS_DESCENDING, typename KeyT>
945 void TestValueTypes(
946     KeyT    *h_keys,
947     int     num_items,
948     int     num_segments,
949     int     *h_segment_offsets,
950     int     begin_bit,
951     int     end_bit)
952 {
953     // Initialize the solution
954 
955     int *h_reference_ranks = NULL;
956     KeyT *h_reference_keys = NULL;
957     InitializeSolution<IS_DESCENDING>(h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_ranks, h_reference_keys);
958 
959     // Test keys-only
960     TestBackend<IS_DESCENDING, KeyT, NullType>          (h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
961 
962     // Test with 8b value
963     TestBackend<IS_DESCENDING, KeyT, unsigned char>     (h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
964 
965     // Test with 32b value
966     TestBackend<IS_DESCENDING, KeyT, unsigned int>      (h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
967 
968     // Test with 64b value
969     TestBackend<IS_DESCENDING, KeyT, unsigned long long>(h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
970 
971     // Test with non-trivially-constructable value
972     TestBackend<IS_DESCENDING, KeyT, TestBar>           (h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
973 
974     // Cleanup
975     if (h_reference_ranks) delete[] h_reference_ranks;
976     if (h_reference_keys) delete[] h_reference_keys;
977 }
978 
979 
980 
981 /**
982  * Test ascending/descending
983  */
984 template <typename KeyT>
985 void TestDirection(
986     KeyT    *h_keys,
987     int     num_items,
988     int     num_segments,
989     int     *h_segment_offsets,
990     int     begin_bit,
991     int     end_bit)
992 {
993     TestValueTypes<true>(h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit);
994     TestValueTypes<false>(h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit);
995 }
996 
997 
998 /**
999  * Test different bit ranges
1000  */
1001 template <typename KeyT>
1002 void TestBits(
1003     KeyT    *h_keys,
1004     int     num_items,
1005     int     num_segments,
1006     int     *h_segment_offsets)
1007 {
1008     // Don't test partial-word sorting for boolean, fp, or signed types (the bit-flipping techniques get in the way)
1009     if ((Traits<KeyT>::CATEGORY == UNSIGNED_INTEGER) && (!Equals<KeyT, bool>::VALUE))
1010     {
1011         // Partial bits
1012         int begin_bit = 1;
1013         int end_bit = (sizeof(KeyT) * 8) - 1;
1014         printf("Testing key bits [%d,%d)\n", begin_bit, end_bit); fflush(stdout);
1015         TestDirection(h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit);
1016 
1017         // Across subword boundaries
1018         int mid_bit = sizeof(KeyT) * 4;
1019         printf("Testing key bits [%d,%d)\n", mid_bit - 1, mid_bit + 1); fflush(stdout);
1020         TestDirection(h_keys, num_items, num_segments, h_segment_offsets, mid_bit - 1, mid_bit + 1);
1021     }
1022 
1023     printf("Testing key bits [%d,%d)\n", 0, int(sizeof(KeyT)) * 8); fflush(stdout);
1024     TestDirection(h_keys, num_items, num_segments, h_segment_offsets, 0, sizeof(KeyT) * 8);
1025 }
1026 
1027 
1028 /**
1029  * Test different segment compositions
1030  */
1031 template <typename KeyT>
1032 void TestSegments(
1033     KeyT    *h_keys,
1034     int     num_items,
1035     int     max_segments)
1036 {
1037     int *h_segment_offsets = new int[max_segments + 1];
1038 
1039 #ifdef SEGMENTED_SORT
1040     for (int num_segments = max_segments; num_segments > 1; num_segments = (num_segments + 32 - 1) / 32)
1041     {
1042         if (num_items / num_segments < 128 * 1000) {
1043             // Right now we assign a single thread block to each segment, so let's keep it to under 128K items per segment
1044             InitializeSegments(num_items, num_segments, h_segment_offsets);
1045             TestBits(h_keys, num_items, num_segments, h_segment_offsets);
1046         }
1047     }
1048 #else
1049     // Test single segment
1050     if (num_items < 128 * 1000) {
1051         // Right now we assign a single thread block to each segment, so let's keep it to under 128K items per segment
1052         InitializeSegments(num_items, 1, h_segment_offsets);
1053         TestBits(h_keys, num_items, 1, h_segment_offsets);
1054     }
1055 #endif
1056     if (h_segment_offsets) delete[] h_segment_offsets;
1057 }
1058 
1059 
1060 /**
1061  * Test different (sub)lengths and number of segments
1062  */
1063 template <typename KeyT>
1064 void TestSizes(
1065     KeyT    *h_keys,
1066     int     max_items,
1067     int     max_segments)
1068 {
1069     for (int num_items = max_items; num_items > 1; num_items = (num_items + 32 - 1) / 32)
1070     {
1071         TestSegments(h_keys, num_items, max_segments);
1072     }
1073     TestSegments(h_keys, 1, max_segments);
1074     TestSegments(h_keys, 0, max_segments);
1075 }
1076 
1077 
1078 /**
1079  * Test key sampling distributions
1080  */
1081 template <typename KeyT>
1082 void TestGen(
1083     int             max_items,
1084     int             max_segments)
1085 {
1086     if (max_items < 0)
1087         max_items = 9000003;
1088 
1089     if (max_segments < 0)
1090         max_segments = 5003;
1091 
1092     KeyT *h_keys = new KeyT[max_items];
1093 
1094     for (int entropy_reduction = 0; entropy_reduction <= 6; entropy_reduction += 3)
1095     {
1096         printf("\nTesting random %s keys with entropy reduction factor %d\n", typeid(KeyT).name(), entropy_reduction); fflush(stdout);
1097         InitializeKeyBits(RANDOM, h_keys, max_items, entropy_reduction);
1098         TestSizes(h_keys, max_items, max_segments);
1099     }
1100 
1101     printf("\nTesting uniform %s keys\n", typeid(KeyT).name()); fflush(stdout);
1102     InitializeKeyBits(UNIFORM, h_keys, max_items, 0);
1103     TestSizes(h_keys, max_items, max_segments);
1104 
1105     printf("\nTesting natural number %s keys\n", typeid(KeyT).name()); fflush(stdout);
1106     InitializeKeyBits(INTEGER_SEED, h_keys, max_items, 0);
1107     TestSizes(h_keys, max_items, max_segments);
1108 
1109     if (h_keys) delete[] h_keys;
1110 }
1111 
1112 
1113 //---------------------------------------------------------------------
1114 // Simple test
1115 //---------------------------------------------------------------------
1116 
1117 template <
1118     Backend     BACKEND,
1119     typename    KeyT,
1120     typename    ValueT,
1121     bool        IS_DESCENDING>
1122 void Test(
1123     int         num_items,
1124     int         num_segments,
1125     GenMode     gen_mode,
1126     int         entropy_reduction,
1127     int         begin_bit,
1128     int         end_bit)
1129 {
1130     const bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;
1131 
1132     KeyT    *h_keys             = new KeyT[num_items];
1133     int     *h_reference_ranks  = NULL;
1134     KeyT    *h_reference_keys   = NULL;
1135     ValueT  *h_values           = NULL;
1136     ValueT  *h_reference_values = NULL;
1137     int     *h_segment_offsets  = new int[num_segments + 1];
1138 
1139     if (end_bit < 0)
1140         end_bit = sizeof(KeyT) * 8;
1141 
1142     InitializeKeyBits(gen_mode, h_keys, num_items, entropy_reduction);
1143     InitializeSegments(num_items, num_segments, h_segment_offsets);
1144     InitializeSolution<IS_DESCENDING>(
1145         h_keys, num_items, num_segments, h_segment_offsets,
1146         begin_bit, end_bit, h_reference_ranks, h_reference_keys);
1147 
1148     if (!KEYS_ONLY)
1149     {
1150         h_values            = new ValueT[num_items];
1151         h_reference_values  = new ValueT[num_items];
1152 
1153         for (int i = 0; i < num_items; ++i)
1154         {
1155             InitValue(INTEGER_SEED, h_values[i], i);
1156             InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]);
1157         }
1158     }
1159     if (h_reference_ranks) delete[] h_reference_ranks;
1160 
1161     printf("\nTesting bits [%d,%d) of %s keys with gen-mode %d\n", begin_bit, end_bit, typeid(KeyT).name(), gen_mode); fflush(stdout);
1162     Test<BACKEND, IS_DESCENDING>(
1163         h_keys, h_values,
1164         num_items, num_segments, h_segment_offsets,
1165         begin_bit, end_bit, h_reference_keys, h_reference_values);
1166 
1167     if (h_keys)             delete[] h_keys;
1168     if (h_reference_keys)   delete[] h_reference_keys;
1169     if (h_values)           delete[] h_values;
1170     if (h_reference_values) delete[] h_reference_values;
1171     if (h_segment_offsets)  delete[] h_segment_offsets;
1172 }
1173 
1174 
1175 
1176 //---------------------------------------------------------------------
1177 // Main
1178 //---------------------------------------------------------------------
1179 
1180 /**
1181  * Main
1182  */
1183 int main(int argc, char** argv)
1184 {
1185     int bits = -1;
1186     int num_items = -1;
1187     int num_segments = -1;
1188     int entropy_reduction = 0;
1189 
1190     // Initialize command line
1191     CommandLineArgs args(argc, argv);
1192     g_verbose = args.CheckCmdLineFlag("v");
1193     args.GetCmdLineArgument("n", num_items);
1194     args.GetCmdLineArgument("s", num_segments);
1195     args.GetCmdLineArgument("i", g_timing_iterations);
1196     args.GetCmdLineArgument("repeat", g_repeat);
1197     args.GetCmdLineArgument("bits", bits);
1198     args.GetCmdLineArgument("entropy", entropy_reduction);
1199 
1200     // Print usage
1201     if (args.CheckCmdLineFlag("help"))
1202     {
1203         printf("%s "
1204             "[--bits=<valid key bits>]"
1205             "[--n=<input items>] "
1206             "[--s=<num segments>] "
1207             "[--i=<timing iterations>] "
1208             "[--device=<device-id>] "
1209             "[--repeat=<repetitions of entire test suite>]"
1210             "[--v] "
1211             "[--entropy=<entropy-reduction factor (default 0)>]"
1212             "\n", argv[0]);
1213         exit(0);
1214     }
1215 
1216     // Initialize device
1217     CubDebugExit(args.DeviceInit());
1218 
1219     // Get ptx version
1220     int ptx_version = 0;
1221     CubDebugExit(PtxVersion(ptx_version));
1222 
1223 #ifdef QUICKER_TEST
1224 
1225     enum {
1226         IS_DESCENDING   = false
1227     };
1228 
1229     // Compile/run basic CUB test
1230     if (num_items < 0)      num_items       = 48000000;
1231     if (num_segments < 0)   num_segments    = 5000;
1232 
1233     Test<CUB_SEGMENTED, unsigned int,       NullType, IS_DESCENDING>(num_items, num_segments, RANDOM, entropy_reduction, 0, bits);
1234 
1235     printf("\n-------------------------------\n");
1236 
1237     Test<CUB,           unsigned char,      NullType, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
1238     Test<CUB,           unsigned int,       NullType, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
1239     Test<CUB,           unsigned long long, NullType, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
1240 
1241     printf("\n-------------------------------\n");
1242 
1243 #if (__CUDACC_VER_MAJOR__ >= 9)
1244     Test<CUB,           half_t,             NullType, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
1245 #endif
1246     Test<CUB,           float,              NullType, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
1247     Test<CUB,           double,             NullType, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
1248 
1249     printf("\n-------------------------------\n");
1250 
1251     Test<CUB,           unsigned char,      unsigned int, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
1252     Test<CUB,           unsigned int,       unsigned int, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
1253     Test<CUB,           unsigned long long, unsigned int, IS_DESCENDING>(num_items, 1, RANDOM, entropy_reduction, 0, bits);
1254 
1255 #elif defined(QUICK_TEST)
1256 
1257     // Compile/run quick tests
1258     if (num_items < 0)      num_items       = 48000000;
1259     if (num_segments < 0)   num_segments    = 5000;
1260 
1261     // Compare CUB and thrust on 32b keys-only
1262     Test<CUB, unsigned int, NullType, false> (                      num_items, 1, RANDOM, entropy_reduction, 0, bits);
1263     Test<THRUST, unsigned int, NullType, false> (                   num_items, 1, RANDOM, entropy_reduction, 0, bits);
1264 
1265     // Compare CUB and thrust on 64b keys-only
1266     Test<CUB, unsigned long long, NullType, false> (                num_items, 1, RANDOM, entropy_reduction, 0, bits);
1267     Test<THRUST, unsigned long long, NullType, false> (             num_items, 1, RANDOM, entropy_reduction, 0, bits);
1268 
1269 
1270     // Compare CUB and thrust on 32b key-value pairs
1271     Test<CUB, unsigned int, unsigned int, false> (                  num_items, 1, RANDOM, entropy_reduction, 0, bits);
1272     Test<THRUST, unsigned int, unsigned int, false> (               num_items, 1, RANDOM, entropy_reduction, 0, bits);
1273 
1274     // Compare CUB and thrust on 64b key + 32b value pairs
1275     Test<CUB, unsigned long long, unsigned int, false> (      num_items, 1, RANDOM, entropy_reduction, 0, bits);
1276     Test<THRUST, unsigned long long, unsigned int, false> (   num_items, 1, RANDOM, entropy_reduction, 0, bits);
1277 
1278 
1279 #else
1280 
1281     // Compile/run thorough tests
1282     for (int i = 0; i <= g_repeat; ++i)
1283     {
1284         TestGen<bool>                 (num_items, num_segments);
1285 
1286         TestGen<char>                 (num_items, num_segments);
1287         TestGen<signed char>          (num_items, num_segments);
1288         TestGen<unsigned char>        (num_items, num_segments);
1289 
1290         TestGen<short>                (num_items, num_segments);
1291         TestGen<unsigned short>       (num_items, num_segments);
1292 
1293         TestGen<int>                  (num_items, num_segments);
1294         TestGen<unsigned int>         (num_items, num_segments);
1295 
1296         TestGen<long>                 (num_items, num_segments);
1297         TestGen<unsigned long>        (num_items, num_segments);
1298 
1299         TestGen<long long>            (num_items, num_segments);
1300         TestGen<unsigned long long>   (num_items, num_segments);
1301 
1302 #if (__CUDACC_VER_MAJOR__ >= 9)
1303         TestGen<half_t>                (num_items, num_segments);
1304 #endif
1305         TestGen<float>                (num_items, num_segments);
1306 
1307         if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
1308             TestGen<double>           (num_items, num_segments);
1309 
1310     }
1311 
1312 #endif
1313 
1314     return 0;
1315 }
1316 
1317