1 
2 /******************************************************************************
3  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
4  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *     * Redistributions of source code must retain the above copyright
9  *       notice, this list of conditions and the following disclaimer.
10  *     * Redistributions in binary form must reproduce the above copyright
11  *       notice, this list of conditions and the following disclaimer in the
12  *       documentation and/or other materials provided with the distribution.
13  *     * Neither the name of the NVIDIA CORPORATION nor the
14  *       names of its contributors may be used to endorse or promote products
15  *       derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
30 /**
31  * \file
32  * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory.
33  */
34 
35 #pragma once
36 
37 #include <stdio.h>
38 #include <iterator>
39 
40 #include "dispatch/dispatch_radix_sort.cuh"
41 #include "../util_arch.cuh"
42 #include "../util_namespace.cuh"
43 
44 /// Optional outer namespace(s)
45 CUB_NS_PREFIX
46 
47 /// CUB namespace
48 namespace cub {
49 
50 
51 /**
52  * \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. ![](sorting_logo.png)
53  * \ingroup SingleModule
54  *
55  * \par Overview
56  * The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
57  * items into ascending (or descending) order.  The algorithm relies upon a positional representation for
58  * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
59  * characters, etc.) specified from least-significant to most-significant.  For a
60  * given input sequence of keys and a set of rules specifying a total ordering
61  * of the symbolic alphabet, the radix sorting method produces a lexicographic
62  * ordering of those keys.
63  *
64  * \par
65  * DeviceRadixSort can sort all of the built-in C++ numeric primitive types
66  * (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half
67  * half-precision floating-point type.  Although the direct radix sorting
68  * method can only be applied to unsigned integral types, DeviceRadixSort
69  * is able to sort signed and floating-point types via simple bit-wise transformations
70  * that ensure lexicographic key ordering.
71  *
72  * \par Usage Considerations
73  * \cdp_class{DeviceRadixSort}
74  *
75  * \par Performance
76  * \linear_performance{radix sort} The following chart illustrates DeviceRadixSort::SortKeys
77  * performance across different CUDA architectures for uniform-random \p uint32 keys.
78  * \plots_below
79  *
80  * \image html lsb_radix_sort_int32_keys.png
81  *
82  */
83 struct DeviceRadixSort
84 {
85 
86     /******************************************************************//**
87      * \name KeyT-value pairs
88      *********************************************************************/
89     //@{
90 
91     /**
92      * \brief Sorts key-value pairs into ascending order. (~<em>2N </em>auxiliary storage required)
93      *
94      * \par
95      * - The contents of the input data are not altered by the sorting operation
96      * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
97      * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
98      * - \devicestorage
99      *
100      * \par Performance
101      * The following charts illustrate saturated sorting performance across different
102      * CUDA architectures for uniform-random <tt>uint32,uint32</tt> and
103      * <tt>uint64,uint64</tt> pairs, respectively.
104      *
105      * \image html lsb_radix_sort_int32_pairs.png
106      * \image html lsb_radix_sort_int64_pairs.png
107      *
108      * \par Snippet
109      * The code snippet below illustrates the sorting of a device vector of \p int keys
110      * with associated vector of \p int values.
111      * \par
112      * \code
113      * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
114      *
115      * // Declare, allocate, and initialize device-accessible pointers for sorting data
116      * int  num_items;          // e.g., 7
117      * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
118      * int  *d_keys_out;        // e.g., [        ...        ]
119      * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
120      * int  *d_values_out;      // e.g., [        ...        ]
121      * ...
122      *
123      * // Determine temporary device storage requirements
124      * void     *d_temp_storage = NULL;
125      * size_t   temp_storage_bytes = 0;
126      * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
127      *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
128      *
129      * // Allocate temporary storage
130      * cudaMalloc(&d_temp_storage, temp_storage_bytes);
131      *
132      * // Run sorting operation
133      * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
134      *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
135      *
136      * // d_keys_out            <-- [0, 3, 5, 6, 7, 8, 9]
137      * // d_values_out          <-- [5, 4, 3, 1, 2, 0, 6]
138      *
139      * \endcode
140      *
141      * \tparam KeyT      <b>[inferred]</b> KeyT type
142      * \tparam ValueT    <b>[inferred]</b> ValueT type
143      */
144     template <
145         typename            KeyT,
146         typename            ValueT>
147     CUB_RUNTIME_FUNCTION
SortPairscub::DeviceRadixSort148     static cudaError_t SortPairs(
149         void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
150         size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
151         const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
152         KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
153         const ValueT        *d_values_in,                           ///< [in] Pointer to the corresponding input sequence of associated value items
154         ValueT              *d_values_out,                          ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
155         int                 num_items,                              ///< [in] Number of items to sort
156         int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
157         int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
158         cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
159         bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
160     {
161         // Signed integer type for global offsets
162         typedef int OffsetT;
163 
164         DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
165         DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
166 
167         return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
168             d_temp_storage,
169             temp_storage_bytes,
170             d_keys,
171             d_values,
172             num_items,
173             begin_bit,
174             end_bit,
175             false,
176             stream,
177             debug_synchronous);
178     }
179 
180 
181     /**
182      * \brief Sorts key-value pairs into ascending order. (~<em>N </em>auxiliary storage required)
183      *
184      * \par
185      * - The sorting operation is given a pair of key buffers and a corresponding
186      *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
187      *   structure that indicates which of the two buffers is "current" (and thus
188      *   contains the input data to be sorted).
189      * - The contents of both buffers within each pair may be altered by the sorting
190      *   operation.
191      * - Upon completion, the sorting operation will update the "current" indicator
192      *   within each DoubleBuffer wrapper to reference which of the two buffers
193      *   now contains the sorted output sequence (a function of the number of key bits
194      *   specified and the targeted device architecture).
195      * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
196      * - \devicestorageP
197      * - \devicestorage
198      *
199      * \par Performance
200      * The following charts illustrate saturated sorting performance across different
201      * CUDA architectures for uniform-random <tt>uint32,uint32</tt> and
202      * <tt>uint64,uint64</tt> pairs, respectively.
203      *
204      * \image html lsb_radix_sort_int32_pairs.png
205      * \image html lsb_radix_sort_int64_pairs.png
206      *
207      * \par Snippet
208      * The code snippet below illustrates the sorting of a device vector of \p int keys
209      * with associated vector of \p int values.
210      * \par
211      * \code
212      * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
213      *
214      * // Declare, allocate, and initialize device-accessible pointers for sorting data
215      * int  num_items;          // e.g., 7
216      * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
217      * int  *d_key_alt_buf;     // e.g., [        ...        ]
218      * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
219      * int  *d_value_alt_buf;   // e.g., [        ...        ]
220      * ...
221      *
222      * // Create a set of DoubleBuffers to wrap pairs of device pointers
223      * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
224      * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
225      *
226      * // Determine temporary device storage requirements
227      * void     *d_temp_storage = NULL;
228      * size_t   temp_storage_bytes = 0;
229      * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
230      *
231      * // Allocate temporary storage
232      * cudaMalloc(&d_temp_storage, temp_storage_bytes);
233      *
234      * // Run sorting operation
235      * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
236      *
237      * // d_keys.Current()      <-- [0, 3, 5, 6, 7, 8, 9]
238      * // d_values.Current()    <-- [5, 4, 3, 1, 2, 0, 6]
239      *
240      * \endcode
241      *
242      * \tparam KeyT      <b>[inferred]</b> KeyT type
243      * \tparam ValueT    <b>[inferred]</b> ValueT type
244      */
245     template <
246         typename            KeyT,
247         typename            ValueT>
248     CUB_RUNTIME_FUNCTION
SortPairscub::DeviceRadixSort249     static cudaError_t SortPairs(
250         void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
251         size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
252         DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
253         DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
254         int                     num_items,                              ///< [in] Number of items to sort
255         int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
256         int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
257         cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
258         bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
259     {
260         // Signed integer type for global offsets
261         typedef int OffsetT;
262 
263         return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
264             d_temp_storage,
265             temp_storage_bytes,
266             d_keys,
267             d_values,
268             num_items,
269             begin_bit,
270             end_bit,
271             true,
272             stream,
273             debug_synchronous);
274     }
275 
276 
277     /**
278      * \brief Sorts key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
279      *
280      * \par
281      * - The contents of the input data are not altered by the sorting operation
282      * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
283      * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
284      * - \devicestorage
285      *
286      * \par Performance
287      * Performance is similar to DeviceRadixSort::SortPairs.
288      *
289      * \par Snippet
290      * The code snippet below illustrates the sorting of a device vector of \p int keys
291      * with associated vector of \p int values.
292      * \par
293      * \code
294      * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
295      *
296      * // Declare, allocate, and initialize device-accessible pointers for sorting data
297      * int  num_items;          // e.g., 7
298      * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
299      * int  *d_keys_out;        // e.g., [        ...        ]
300      * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
301      * int  *d_values_out;      // e.g., [        ...        ]
302      * ...
303      *
304      * // Determine temporary device storage requirements
305      * void     *d_temp_storage = NULL;
306      * size_t   temp_storage_bytes = 0;
307      * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
308      *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
309      *
310      * // Allocate temporary storage
311      * cudaMalloc(&d_temp_storage, temp_storage_bytes);
312      *
313      * // Run sorting operation
314      * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
315      *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
316      *
317      * // d_keys_out            <-- [9, 8, 7, 6, 5, 3, 0]
318      * // d_values_out          <-- [6, 0, 2, 1, 3, 4, 5]
319      *
320      * \endcode
321      *
322      * \tparam KeyT      <b>[inferred]</b> KeyT type
323      * \tparam ValueT    <b>[inferred]</b> ValueT type
324      */
325     template <
326         typename            KeyT,
327         typename            ValueT>
328     CUB_RUNTIME_FUNCTION
SortPairsDescendingcub::DeviceRadixSort329     static cudaError_t SortPairsDescending(
330         void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
331         size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
332         const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
333         KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
334         const ValueT        *d_values_in,                           ///< [in] Pointer to the corresponding input sequence of associated value items
335         ValueT              *d_values_out,                          ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
336         int                 num_items,                              ///< [in] Number of items to sort
337         int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
338         int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
339         cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
340         bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
341     {
342         // Signed integer type for global offsets
343         typedef int OffsetT;
344 
345         DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
346         DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
347 
348         return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
349             d_temp_storage,
350             temp_storage_bytes,
351             d_keys,
352             d_values,
353             num_items,
354             begin_bit,
355             end_bit,
356             false,
357             stream,
358             debug_synchronous);
359     }
360 
361 
362     /**
363      * \brief Sorts key-value pairs into descending order. (~<em>N </em>auxiliary storage required).
364      *
365      * \par
366      * - The sorting operation is given a pair of key buffers and a corresponding
367      *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
368      *   structure that indicates which of the two buffers is "current" (and thus
369      *   contains the input data to be sorted).
370      * - The contents of both buffers within each pair may be altered by the sorting
371      *   operation.
372      * - Upon completion, the sorting operation will update the "current" indicator
373      *   within each DoubleBuffer wrapper to reference which of the two buffers
374      *   now contains the sorted output sequence (a function of the number of key bits
375      *   specified and the targeted device architecture).
376      * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
377      * - \devicestorageP
378      * - \devicestorage
379      *
380      * \par Performance
381      * Performance is similar to DeviceRadixSort::SortPairs.
382      *
383      * \par Snippet
384      * The code snippet below illustrates the sorting of a device vector of \p int keys
385      * with associated vector of \p int values.
386      * \par
387      * \code
388      * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
389      *
390      * // Declare, allocate, and initialize device-accessible pointers for sorting data
391      * int  num_items;          // e.g., 7
392      * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
393      * int  *d_key_alt_buf;     // e.g., [        ...        ]
394      * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
395      * int  *d_value_alt_buf;   // e.g., [        ...        ]
396      * ...
397      *
398      * // Create a set of DoubleBuffers to wrap pairs of device pointers
399      * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
400      * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
401      *
402      * // Determine temporary device storage requirements
403      * void     *d_temp_storage = NULL;
404      * size_t   temp_storage_bytes = 0;
405      * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
406      *
407      * // Allocate temporary storage
408      * cudaMalloc(&d_temp_storage, temp_storage_bytes);
409      *
410      * // Run sorting operation
411      * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
412      *
413      * // d_keys.Current()      <-- [9, 8, 7, 6, 5, 3, 0]
414      * // d_values.Current()    <-- [6, 0, 2, 1, 3, 4, 5]
415      *
416      * \endcode
417      *
418      * \tparam KeyT      <b>[inferred]</b> KeyT type
419      * \tparam ValueT    <b>[inferred]</b> ValueT type
420      */
421     template <
422         typename            KeyT,
423         typename            ValueT>
424     CUB_RUNTIME_FUNCTION
SortPairsDescendingcub::DeviceRadixSort425     static cudaError_t SortPairsDescending(
426         void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
427         size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
428         DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
429         DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
430         int                     num_items,                              ///< [in] Number of items to sort
431         int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
432         int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
433         cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
434         bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
435     {
436         // Signed integer type for global offsets
437         typedef int OffsetT;
438 
439         return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
440             d_temp_storage,
441             temp_storage_bytes,
442             d_keys,
443             d_values,
444             num_items,
445             begin_bit,
446             end_bit,
447             true,
448             stream,
449             debug_synchronous);
450     }
451 
452 
453     //@}  end member group
454     /******************************************************************//**
455      * \name Keys-only
456      *********************************************************************/
457     //@{
458 
459 
460     /**
461      * \brief Sorts keys into ascending order. (~<em>2N </em>auxiliary storage required)
462      *
463      * \par
464      * - The contents of the input data are not altered by the sorting operation
465      * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
466      * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
467      * - \devicestorage
468      *
469      * \par Performance
470      * The following charts illustrate saturated sorting performance across different
471      * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively.
472      *
473      * \image html lsb_radix_sort_int32_keys.png
474      * \image html lsb_radix_sort_int64_keys.png
475      *
476      * \par Snippet
477      * The code snippet below illustrates the sorting of a device vector of \p int keys.
478      * \par
479      * \code
480      * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
481      *
482      * // Declare, allocate, and initialize device-accessible pointers for sorting data
483      * int  num_items;          // e.g., 7
484      * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
485      * int  *d_keys_out;        // e.g., [        ...        ]
486      * ...
487      *
488      * // Determine temporary device storage requirements
489      * void     *d_temp_storage = NULL;
490      * size_t   temp_storage_bytes = 0;
491      * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
492      *
493      * // Allocate temporary storage
494      * cudaMalloc(&d_temp_storage, temp_storage_bytes);
495      *
496      * // Run sorting operation
497      * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
498      *
499      * // d_keys_out            <-- [0, 3, 5, 6, 7, 8, 9]
500      *
501      * \endcode
502      *
503      * \tparam KeyT      <b>[inferred]</b> KeyT type
504      */
505     template <typename KeyT>
506     CUB_RUNTIME_FUNCTION
SortKeyscub::DeviceRadixSort507     static cudaError_t SortKeys(
508         void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
509         size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
510         const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
511         KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
512         int                 num_items,                              ///< [in] Number of items to sort
513         int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
514         int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
515         cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
516         bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
517     {
518         // Signed integer type for global offsets
519         typedef int OffsetT;
520 
521         // Null value type
522         DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
523         DoubleBuffer<NullType>  d_values;
524 
525         return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
526             d_temp_storage,
527             temp_storage_bytes,
528             d_keys,
529             d_values,
530             num_items,
531             begin_bit,
532             end_bit,
533             false,
534             stream,
535             debug_synchronous);
536     }
537 
538 
539     /**
540      * \brief Sorts keys into ascending order. (~<em>N </em>auxiliary storage required).
541      *
542      * \par
543      * - The sorting operation is given a pair of key buffers managed by a
544      *   DoubleBuffer structure that indicates which of the two buffers is
545      *   "current" (and thus contains the input data to be sorted).
546      * - The contents of both buffers may be altered by the sorting operation.
547      * - Upon completion, the sorting operation will update the "current" indicator
548      *   within the DoubleBuffer wrapper to reference which of the two buffers
549      *   now contains the sorted output sequence (a function of the number of key bits
550      *   specified and the targeted device architecture).
551      * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
552      * - \devicestorageP
553      * - \devicestorage
554      *
555      * \par Performance
556      * The following charts illustrate saturated sorting performance across different
557      * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively.
558      *
559      * \image html lsb_radix_sort_int32_keys.png
560      * \image html lsb_radix_sort_int64_keys.png
561      *
562      * \par Snippet
563      * The code snippet below illustrates the sorting of a device vector of \p int keys.
564      * \par
565      * \code
566      * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
567      *
568      * // Declare, allocate, and initialize device-accessible pointers for sorting data
569      * int  num_items;          // e.g., 7
570      * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
571      * int  *d_key_alt_buf;     // e.g., [        ...        ]
572      * ...
573      *
574      * // Create a DoubleBuffer to wrap the pair of device pointers
575      * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
576      *
577      * // Determine temporary device storage requirements
578      * void     *d_temp_storage = NULL;
579      * size_t   temp_storage_bytes = 0;
580      * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
581      *
582      * // Allocate temporary storage
583      * cudaMalloc(&d_temp_storage, temp_storage_bytes);
584      *
585      * // Run sorting operation
586      * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
587      *
588      * // d_keys.Current()      <-- [0, 3, 5, 6, 7, 8, 9]
589      *
590      * \endcode
591      *
592      * \tparam KeyT      <b>[inferred]</b> KeyT type
593      */
594     template <typename KeyT>
595     CUB_RUNTIME_FUNCTION
SortKeyscub::DeviceRadixSort596     static cudaError_t SortKeys(
597         void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
598         size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
599         DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
600         int                 num_items,                              ///< [in] Number of items to sort
601         int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
602         int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
603         cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
604         bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
605     {
606         // Signed integer type for global offsets
607         typedef int OffsetT;
608 
609         // Null value type
610         DoubleBuffer<NullType> d_values;
611 
612         return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
613             d_temp_storage,
614             temp_storage_bytes,
615             d_keys,
616             d_values,
617             num_items,
618             begin_bit,
619             end_bit,
620             true,
621             stream,
622             debug_synchronous);
623     }
624 
625     /**
626      * \brief Sorts keys into descending order. (~<em>2N</em> auxiliary storage required).
627      *
628      * \par
629      * - The contents of the input data are not altered by the sorting operation
630      * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
631      * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
632      * - \devicestorage
633      *
634      * \par Performance
635      * Performance is similar to DeviceRadixSort::SortKeys.
636      *
637      * \par Snippet
638      * The code snippet below illustrates the sorting of a device vector of \p int keys.
639      * \par
640      * \code
641      * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
642      *
643      * // Declare, allocate, and initialize device-accessible pointers for sorting data
644      * int  num_items;          // e.g., 7
645      * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
646      * int  *d_keys_out;        // e.g., [        ...        ]
647      * ...
648      *
649      * // Create a DoubleBuffer to wrap the pair of device pointers
650      * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
651      *
652      * // Determine temporary device storage requirements
653      * void     *d_temp_storage = NULL;
654      * size_t   temp_storage_bytes = 0;
655      * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
656      *
657      * // Allocate temporary storage
658      * cudaMalloc(&d_temp_storage, temp_storage_bytes);
659      *
660      * // Run sorting operation
661      * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
662      *
663      * // d_keys_out            <-- [9, 8, 7, 6, 5, 3, 0]s
664      *
665      * \endcode
666      *
667      * \tparam KeyT      <b>[inferred]</b> KeyT type
668      */
669     template <typename KeyT>
670     CUB_RUNTIME_FUNCTION
SortKeysDescendingcub::DeviceRadixSort671     static cudaError_t SortKeysDescending(
672         void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
673         size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
674         const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
675         KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
676         int                 num_items,                              ///< [in] Number of items to sort
677         int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
678         int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
679         cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
680         bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
681     {
682         // Signed integer type for global offsets
683         typedef int OffsetT;
684 
685         DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
686         DoubleBuffer<NullType>  d_values;
687 
688         return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
689             d_temp_storage,
690             temp_storage_bytes,
691             d_keys,
692             d_values,
693             num_items,
694             begin_bit,
695             end_bit,
696             false,
697             stream,
698             debug_synchronous);
699     }
700 
701 
702     /**
703      * \brief Sorts keys into descending order. (~<em>N </em>auxiliary storage required).
704      *
705      * \par
706      * - The sorting operation is given a pair of key buffers managed by a
707      *   DoubleBuffer structure that indicates which of the two buffers is
708      *   "current" (and thus contains the input data to be sorted).
709      * - The contents of both buffers may be altered by the sorting operation.
710      * - Upon completion, the sorting operation will update the "current" indicator
711      *   within the DoubleBuffer wrapper to reference which of the two buffers
712      *   now contains the sorted output sequence (a function of the number of key bits
713      *   specified and the targeted device architecture).
714      * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
715      * - \devicestorageP
716      * - \devicestorage
717      *
718      * \par Performance
719      * Performance is similar to DeviceRadixSort::SortKeys.
720      *
721      * \par Snippet
722      * The code snippet below illustrates the sorting of a device vector of \p int keys.
723      * \par
724      * \code
725      * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
726      *
727      * // Declare, allocate, and initialize device-accessible pointers for sorting data
728      * int  num_items;          // e.g., 7
729      * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
730      * int  *d_key_alt_buf;     // e.g., [        ...        ]
731      * ...
732      *
733      * // Create a DoubleBuffer to wrap the pair of device pointers
734      * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
735      *
736      * // Determine temporary device storage requirements
737      * void     *d_temp_storage = NULL;
738      * size_t   temp_storage_bytes = 0;
739      * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items);
740      *
741      * // Allocate temporary storage
742      * cudaMalloc(&d_temp_storage, temp_storage_bytes);
743      *
744      * // Run sorting operation
745      * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items);
746      *
747      * // d_keys.Current()      <-- [9, 8, 7, 6, 5, 3, 0]
748      *
749      * \endcode
750      *
751      * \tparam KeyT      <b>[inferred]</b> KeyT type
752      */
753     template <typename KeyT>
754     CUB_RUNTIME_FUNCTION
SortKeysDescendingcub::DeviceRadixSort755     static cudaError_t SortKeysDescending(
756         void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
757         size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
758         DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
759         int                 num_items,                              ///< [in] Number of items to sort
760         int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
761         int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
762         cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
763         bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
764     {
765         // Signed integer type for global offsets
766         typedef int OffsetT;
767 
768         // Null value type
769         DoubleBuffer<NullType> d_values;
770 
771         return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
772             d_temp_storage,
773             temp_storage_bytes,
774             d_keys,
775             d_values,
776             num_items,
777             begin_bit,
778             end_bit,
779             true,
780             stream,
781             debug_synchronous);
782     }
783 
784 
785     //@}  end member group
786 
787 
788 };
789 
790 /**
791  * \example example_device_radix_sort.cu
792  */
793 
794 }               // CUB namespace
795 CUB_NS_POSTFIX  // Optional outer namespace(s)
796 
797 
798