1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of the NVIDIA CORPORATION nor the
13  *       names of its contributors may be used to endorse or promote products
14  *       derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
29 /**
30  * \file
31  * The cub::BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block.
32  */
33 
34 
35 #pragma once
36 
37 #include "block_exchange.cuh"
38 #include "block_radix_rank.cuh"
39 #include "../util_ptx.cuh"
40 #include "../util_arch.cuh"
41 #include "../util_type.cuh"
42 #include "../util_namespace.cuh"
43 
44 /// Optional outer namespace(s)
45 CUB_NS_PREFIX
46 
47 /// CUB namespace
48 namespace cub {
49 
50 /**
51  * \brief The BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method.  ![](sorting_logo.png)
52  * \ingroup BlockModule
53  *
54  * \tparam KeyT                 KeyT type
55  * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
56  * \tparam ITEMS_PER_THREAD     The number of items per thread
57  * \tparam ValueT               <b>[optional]</b> ValueT type (default: cub::NullType, which indicates a keys-only sort)
58  * \tparam RADIX_BITS           <b>[optional]</b> The number of radix bits per digit place (default: 4 bits)
59  * \tparam MEMOIZE_OUTER_SCAN   <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise).
60  * \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
61  * \tparam SMEM_CONFIG          <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
62  * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
63  * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
64  * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
65  *
66  * \par Overview
67  * - The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
68  *   items into ascending order.  It relies upon a positional representation for
69  *   keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
70  *   characters, etc.) specified from least-significant to most-significant.  For a
71  *   given input sequence of keys and a set of rules specifying a total ordering
72  *   of the symbolic alphabet, the radix sorting method produces a lexicographic
73  *   ordering of those keys.
74  * - BlockRadixSort can sort all of the built-in C++ numeric primitive types
75  *   (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half
76  *   half-precision floating-point type. Within each key, the implementation treats fixed-length
77  *   bit-sequences of \p RADIX_BITS as radix digit places.  Although the direct radix sorting
78  *   method can only be applied to unsigned integral types, BlockRadixSort
79  *   is able to sort signed and floating-point types via simple bit-wise transformations
80  *   that ensure lexicographic key ordering.
81  * - \rowmajor
82  *
83  * \par Performance Considerations
84  * - \granularity
85  *
86  * \par A Simple Example
87  * \blockcollective{BlockRadixSort}
88  * \par
89  * The code snippet below illustrates a sort of 512 integer keys that
90  * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
91  * where each thread owns 4 consecutive items.
92  * \par
93  * \code
94  * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
95  *
96  * __global__ void ExampleKernel(...)
97  * {
98  *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
99  *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
100  *
101  *     // Allocate shared memory for BlockRadixSort
102  *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
103  *
104  *     // Obtain a segment of consecutive items that are blocked across threads
105  *     int thread_keys[4];
106  *     ...
107  *
108  *     // Collectively sort the keys
109  *     BlockRadixSort(temp_storage).Sort(thread_keys);
110  *
111  *     ...
112  * \endcode
113  * \par
114  * Suppose the set of input \p thread_keys across the block of threads is
115  * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
116  * corresponding output \p thread_keys in those threads will be
117  * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
118  *
119  */
120 template <
121     typename                KeyT,
122     int                     BLOCK_DIM_X,
123     int                     ITEMS_PER_THREAD,
124     typename                ValueT                   = NullType,
125     int                     RADIX_BITS              = 4,
126     bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
127     BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
128     cudaSharedMemConfig     SMEM_CONFIG             = cudaSharedMemBankSizeFourByte,
129     int                     BLOCK_DIM_Y             = 1,
130     int                     BLOCK_DIM_Z             = 1,
131     int                     PTX_ARCH                = CUB_PTX_ARCH>
132 class BlockRadixSort
133 {
134 private:
135 
136     /******************************************************************************
137      * Constants and type definitions
138      ******************************************************************************/
139 
140     enum
141     {
142         // The thread block size in threads
143         BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
144 
145         // Whether or not there are values to be trucked along with keys
146         KEYS_ONLY                   = Equals<ValueT, NullType>::VALUE,
147     };
148 
149     // KeyT traits and unsigned bits type
150     typedef Traits<KeyT>                        KeyTraits;
151     typedef typename KeyTraits::UnsignedBits    UnsignedBits;
152 
153     /// Ascending BlockRadixRank utility type
154     typedef BlockRadixRank<
155             BLOCK_DIM_X,
156             RADIX_BITS,
157             false,
158             MEMOIZE_OUTER_SCAN,
159             INNER_SCAN_ALGORITHM,
160             SMEM_CONFIG,
161             BLOCK_DIM_Y,
162             BLOCK_DIM_Z,
163             PTX_ARCH>
164         AscendingBlockRadixRank;
165 
166     /// Descending BlockRadixRank utility type
167     typedef BlockRadixRank<
168             BLOCK_DIM_X,
169             RADIX_BITS,
170             true,
171             MEMOIZE_OUTER_SCAN,
172             INNER_SCAN_ALGORITHM,
173             SMEM_CONFIG,
174             BLOCK_DIM_Y,
175             BLOCK_DIM_Z,
176             PTX_ARCH>
177         DescendingBlockRadixRank;
178 
179     /// BlockExchange utility type for keys
180     typedef BlockExchange<KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeKeys;
181 
182     /// BlockExchange utility type for values
183     typedef BlockExchange<ValueT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeValues;
184 
185     /// Shared memory storage layout type
186     union _TempStorage
187     {
188         typename AscendingBlockRadixRank::TempStorage  asending_ranking_storage;
189         typename DescendingBlockRadixRank::TempStorage descending_ranking_storage;
190         typename BlockExchangeKeys::TempStorage        exchange_keys;
191         typename BlockExchangeValues::TempStorage      exchange_values;
192     };
193 
194 
195     /******************************************************************************
196      * Thread fields
197      ******************************************************************************/
198 
199     /// Shared storage reference
200     _TempStorage &temp_storage;
201 
202     /// Linear thread-id
203     unsigned int linear_tid;
204 
205     /******************************************************************************
206      * Utility methods
207      ******************************************************************************/
208 
209     /// Internal storage allocator
PrivateStorage()210     __device__ __forceinline__ _TempStorage& PrivateStorage()
211     {
212         __shared__ _TempStorage private_storage;
213         return private_storage;
214     }
215 
216     /// Rank keys (specialized for ascending sort)
RankKeys(UnsignedBits (& unsigned_keys)[ITEMS_PER_THREAD],int (& ranks)[ITEMS_PER_THREAD],int begin_bit,int pass_bits,Int2Type<false>)217     __device__ __forceinline__ void RankKeys(
218         UnsignedBits    (&unsigned_keys)[ITEMS_PER_THREAD],
219         int             (&ranks)[ITEMS_PER_THREAD],
220         int             begin_bit,
221         int             pass_bits,
222         Int2Type<false> /*is_descending*/)
223     {
224         AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys(
225             unsigned_keys,
226             ranks,
227             begin_bit,
228             pass_bits);
229     }
230 
231     /// Rank keys (specialized for descending sort)
RankKeys(UnsignedBits (& unsigned_keys)[ITEMS_PER_THREAD],int (& ranks)[ITEMS_PER_THREAD],int begin_bit,int pass_bits,Int2Type<true>)232     __device__ __forceinline__ void RankKeys(
233         UnsignedBits    (&unsigned_keys)[ITEMS_PER_THREAD],
234         int             (&ranks)[ITEMS_PER_THREAD],
235         int             begin_bit,
236         int             pass_bits,
237         Int2Type<true>  /*is_descending*/)
238     {
239         DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys(
240             unsigned_keys,
241             ranks,
242             begin_bit,
243             pass_bits);
244     }
245 
246     /// ExchangeValues (specialized for key-value sort, to-blocked arrangement)
ExchangeValues(ValueT (& values)[ITEMS_PER_THREAD],int (& ranks)[ITEMS_PER_THREAD],Int2Type<false>,Int2Type<true>)247     __device__ __forceinline__ void ExchangeValues(
248         ValueT          (&values)[ITEMS_PER_THREAD],
249         int             (&ranks)[ITEMS_PER_THREAD],
250         Int2Type<false> /*is_keys_only*/,
251         Int2Type<true>  /*is_blocked*/)
252     {
253         CTA_SYNC();
254 
255         // Exchange values through shared memory in blocked arrangement
256         BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks);
257     }
258 
259     /// ExchangeValues (specialized for key-value sort, to-striped arrangement)
ExchangeValues(ValueT (& values)[ITEMS_PER_THREAD],int (& ranks)[ITEMS_PER_THREAD],Int2Type<false>,Int2Type<false>)260     __device__ __forceinline__ void ExchangeValues(
261         ValueT          (&values)[ITEMS_PER_THREAD],
262         int             (&ranks)[ITEMS_PER_THREAD],
263         Int2Type<false> /*is_keys_only*/,
264         Int2Type<false> /*is_blocked*/)
265     {
266         CTA_SYNC();
267 
268         // Exchange values through shared memory in blocked arrangement
269         BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks);
270     }
271 
272     /// ExchangeValues (specialized for keys-only sort)
273     template <int IS_BLOCKED>
ExchangeValues(ValueT (&)[ITEMS_PER_THREAD],int (&)[ITEMS_PER_THREAD],Int2Type<true>,Int2Type<IS_BLOCKED>)274     __device__ __forceinline__ void ExchangeValues(
275         ValueT                  (&/*values*/)[ITEMS_PER_THREAD],
276         int                     (&/*ranks*/)[ITEMS_PER_THREAD],
277         Int2Type<true>          /*is_keys_only*/,
278         Int2Type<IS_BLOCKED>    /*is_blocked*/)
279     {}
280 
281     /// Sort blocked arrangement
282     template <int DESCENDING, int KEYS_ONLY>
SortBlocked(KeyT (& keys)[ITEMS_PER_THREAD],ValueT (& values)[ITEMS_PER_THREAD],int begin_bit,int end_bit,Int2Type<DESCENDING> is_descending,Int2Type<KEYS_ONLY> is_keys_only)283     __device__ __forceinline__ void SortBlocked(
284         KeyT                    (&keys)[ITEMS_PER_THREAD],          ///< Keys to sort
285         ValueT                  (&values)[ITEMS_PER_THREAD],        ///< Values to sort
286         int                     begin_bit,                          ///< The beginning (least-significant) bit index needed for key comparison
287         int                     end_bit,                            ///< The past-the-end (most-significant) bit index needed for key comparison
288         Int2Type<DESCENDING>    is_descending,                      ///< Tag whether is a descending-order sort
289         Int2Type<KEYS_ONLY>     is_keys_only)                       ///< Tag whether is keys-only sort
290     {
291         UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
292             reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
293 
294         // Twiddle bits if necessary
295         #pragma unroll
296         for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
297         {
298             unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
299         }
300 
301         // Radix sorting passes
302         while (true)
303         {
304             int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
305 
306             // Rank the blocked keys
307             int ranks[ITEMS_PER_THREAD];
308             RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending);
309             begin_bit += RADIX_BITS;
310 
311             CTA_SYNC();
312 
313             // Exchange keys through shared memory in blocked arrangement
314             BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
315 
316             // Exchange values through shared memory in blocked arrangement
317             ExchangeValues(values, ranks, is_keys_only, Int2Type<true>());
318 
319             // Quit if done
320             if (begin_bit >= end_bit) break;
321 
322             CTA_SYNC();
323         }
324 
325         // Untwiddle bits if necessary
326         #pragma unroll
327         for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
328         {
329             unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
330         }
331     }
332 
333 public:
334 
335 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
336 
337     /// Sort blocked -> striped arrangement
338     template <int DESCENDING, int KEYS_ONLY>
SortBlockedToStriped(KeyT (& keys)[ITEMS_PER_THREAD],ValueT (& values)[ITEMS_PER_THREAD],int begin_bit,int end_bit,Int2Type<DESCENDING> is_descending,Int2Type<KEYS_ONLY> is_keys_only)339     __device__ __forceinline__ void SortBlockedToStriped(
340         KeyT                    (&keys)[ITEMS_PER_THREAD],          ///< Keys to sort
341         ValueT                  (&values)[ITEMS_PER_THREAD],        ///< Values to sort
342         int                     begin_bit,                          ///< The beginning (least-significant) bit index needed for key comparison
343         int                     end_bit,                            ///< The past-the-end (most-significant) bit index needed for key comparison
344         Int2Type<DESCENDING>    is_descending,                      ///< Tag whether is a descending-order sort
345         Int2Type<KEYS_ONLY>     is_keys_only)                       ///< Tag whether is keys-only sort
346     {
347         UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
348             reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
349 
350         // Twiddle bits if necessary
351         #pragma unroll
352         for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
353         {
354             unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
355         }
356 
357         // Radix sorting passes
358         while (true)
359         {
360             int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
361 
362             // Rank the blocked keys
363             int ranks[ITEMS_PER_THREAD];
364             RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending);
365             begin_bit += RADIX_BITS;
366 
367             CTA_SYNC();
368 
369             // Check if this is the last pass
370             if (begin_bit >= end_bit)
371             {
372                 // Last pass exchanges keys through shared memory in striped arrangement
373                 BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks);
374 
375                 // Last pass exchanges through shared memory in striped arrangement
376                 ExchangeValues(values, ranks, is_keys_only, Int2Type<false>());
377 
378                 // Quit
379                 break;
380             }
381 
382             // Exchange keys through shared memory in blocked arrangement
383             BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
384 
385             // Exchange values through shared memory in blocked arrangement
386             ExchangeValues(values, ranks, is_keys_only, Int2Type<true>());
387 
388             CTA_SYNC();
389         }
390 
391         // Untwiddle bits if necessary
392         #pragma unroll
393         for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
394         {
395             unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
396         }
397     }
398 
399 #endif // DOXYGEN_SHOULD_SKIP_THIS
400 
401     /// \smemstorage{BlockRadixSort}
402     struct TempStorage : Uninitialized<_TempStorage> {};
403 
404 
405     /******************************************************************//**
406      * \name Collective constructors
407      *********************************************************************/
408     //@{
409 
410     /**
411      * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
412      */
BlockRadixSort()413     __device__ __forceinline__ BlockRadixSort()
414     :
415         temp_storage(PrivateStorage()),
416         linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
417     {}
418 
419 
420     /**
421      * \brief Collective constructor using the specified memory allocation as temporary storage.
422      */
BlockRadixSort(TempStorage & temp_storage)423     __device__ __forceinline__ BlockRadixSort(
424         TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
425     :
426         temp_storage(temp_storage.Alias()),
427         linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
428     {}
429 
430 
431     //@}  end member group
432     /******************************************************************//**
433      * \name Sorting (blocked arrangements)
434      *********************************************************************/
435     //@{
436 
437     /**
438      * \brief Performs an ascending block-wide radix sort over a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys.
439      *
440      * \par
441      * - \granularity
442      * - \smemreuse
443      *
444      * \par Snippet
445      * The code snippet below illustrates a sort of 512 integer keys that
446      * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
447      * where each thread owns 4 consecutive keys.
448      * \par
449      * \code
450      * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
451      *
452      * __global__ void ExampleKernel(...)
453      * {
454      *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
455      *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
456      *
457      *     // Allocate shared memory for BlockRadixSort
458      *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
459      *
460      *     // Obtain a segment of consecutive items that are blocked across threads
461      *     int thread_keys[4];
462      *     ...
463      *
464      *     // Collectively sort the keys
465      *     BlockRadixSort(temp_storage).Sort(thread_keys);
466      *
467      * \endcode
468      * \par
469      * Suppose the set of input \p thread_keys across the block of threads is
470      * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.
471      * The corresponding output \p thread_keys in those threads will be
472      * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
473      */
Sort(KeyT (& keys)[ITEMS_PER_THREAD],int begin_bit=0,int end_bit=sizeof (KeyT)* 8)474     __device__ __forceinline__ void Sort(
475         KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
476         int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
477         int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
478     {
479         NullType values[ITEMS_PER_THREAD];
480 
481         SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
482     }
483 
484 
485     /**
486      * \brief Performs an ascending block-wide radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values.
487      *
488      * \par
489      * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
490      *   more than one tile of values, simply perform a key-value sort of the keys paired
491      *   with a temporary value array that enumerates the key indices.  The reordered indices
492      *   can then be used as a gather-vector for exchanging other associated tile data through
493      *   shared memory.
494      * - \granularity
495      * - \smemreuse
496      *
497      * \par Snippet
498      * The code snippet below illustrates a sort of 512 integer keys and values that
499      * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
500      * where each thread owns 4 consecutive pairs.
501      * \par
502      * \code
503      * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
504      *
505      * __global__ void ExampleKernel(...)
506      * {
507      *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
508      *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
509      *
510      *     // Allocate shared memory for BlockRadixSort
511      *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
512      *
513      *     // Obtain a segment of consecutive items that are blocked across threads
514      *     int thread_keys[4];
515      *     int thread_values[4];
516      *     ...
517      *
518      *     // Collectively sort the keys and values among block threads
519      *     BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
520      *
521      * \endcode
522      * \par
523      * Suppose the set of input \p thread_keys across the block of threads is
524      * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
525      * corresponding output \p thread_keys in those threads will be
526      * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
527      *
528      */
Sort(KeyT (& keys)[ITEMS_PER_THREAD],ValueT (& values)[ITEMS_PER_THREAD],int begin_bit=0,int end_bit=sizeof (KeyT)* 8)529     __device__ __forceinline__ void Sort(
530         KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
531         ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
532         int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
533         int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
534     {
535         SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
536     }
537 
538     /**
539      * \brief Performs a descending block-wide radix sort over a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys.
540      *
541      * \par
542      * - \granularity
543      * - \smemreuse
544      *
545      * \par Snippet
546      * The code snippet below illustrates a sort of 512 integer keys that
547      * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
548      * where each thread owns 4 consecutive keys.
549      * \par
550      * \code
551      * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
552      *
553      * __global__ void ExampleKernel(...)
554      * {
555      *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
556      *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
557      *
558      *     // Allocate shared memory for BlockRadixSort
559      *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
560      *
561      *     // Obtain a segment of consecutive items that are blocked across threads
562      *     int thread_keys[4];
563      *     ...
564      *
565      *     // Collectively sort the keys
566      *     BlockRadixSort(temp_storage).Sort(thread_keys);
567      *
568      * \endcode
569      * \par
570      * Suppose the set of input \p thread_keys across the block of threads is
571      * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.
572      * The corresponding output \p thread_keys in those threads will be
573      * <tt>{ [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }</tt>.
574      */
SortDescending(KeyT (& keys)[ITEMS_PER_THREAD],int begin_bit=0,int end_bit=sizeof (KeyT)* 8)575     __device__ __forceinline__ void SortDescending(
576         KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
577         int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
578         int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
579     {
580         NullType values[ITEMS_PER_THREAD];
581 
582         SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
583     }
584 
585 
586     /**
587      * \brief Performs a descending block-wide radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values.
588      *
589      * \par
590      * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
591      *   more than one tile of values, simply perform a key-value sort of the keys paired
592      *   with a temporary value array that enumerates the key indices.  The reordered indices
593      *   can then be used as a gather-vector for exchanging other associated tile data through
594      *   shared memory.
595      * - \granularity
596      * - \smemreuse
597      *
598      * \par Snippet
599      * The code snippet below illustrates a sort of 512 integer keys and values that
600      * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
601      * where each thread owns 4 consecutive pairs.
602      * \par
603      * \code
604      * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
605      *
606      * __global__ void ExampleKernel(...)
607      * {
608      *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
609      *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
610      *
611      *     // Allocate shared memory for BlockRadixSort
612      *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
613      *
614      *     // Obtain a segment of consecutive items that are blocked across threads
615      *     int thread_keys[4];
616      *     int thread_values[4];
617      *     ...
618      *
619      *     // Collectively sort the keys and values among block threads
620      *     BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
621      *
622      * \endcode
623      * \par
624      * Suppose the set of input \p thread_keys across the block of threads is
625      * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
626      * corresponding output \p thread_keys in those threads will be
627      * <tt>{ [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }</tt>.
628      *
629      */
SortDescending(KeyT (& keys)[ITEMS_PER_THREAD],ValueT (& values)[ITEMS_PER_THREAD],int begin_bit=0,int end_bit=sizeof (KeyT)* 8)630     __device__ __forceinline__ void SortDescending(
631         KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
632         ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
633         int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
634         int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
635     {
636         SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
637     }
638 
639 
640     //@}  end member group
641     /******************************************************************//**
642      * \name Sorting (blocked arrangement -> striped arrangement)
643      *********************************************************************/
644     //@{
645 
646 
647     /**
648      * \brief Performs an ascending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
649      *
650      * \par
651      * - \granularity
652      * - \smemreuse
653      *
654      * \par Snippet
655      * The code snippet below illustrates a sort of 512 integer keys that
656      * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
657      * where each thread owns 4 consecutive keys.  The final partitioning is striped.
658      * \par
659      * \code
660      * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
661      *
662      * __global__ void ExampleKernel(...)
663      * {
664      *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
665      *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
666      *
667      *     // Allocate shared memory for BlockRadixSort
668      *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
669      *
670      *     // Obtain a segment of consecutive items that are blocked across threads
671      *     int thread_keys[4];
672      *     ...
673      *
674      *     // Collectively sort the keys
675      *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
676      *
677      * \endcode
678      * \par
679      * Suppose the set of input \p thread_keys across the block of threads is
680      * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
681      * corresponding output \p thread_keys in those threads will be
682      * <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>.
683      *
684      */
SortBlockedToStriped(KeyT (& keys)[ITEMS_PER_THREAD],int begin_bit=0,int end_bit=sizeof (KeyT)* 8)685     __device__ __forceinline__ void SortBlockedToStriped(
686         KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
687         int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
688         int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
689     {
690         NullType values[ITEMS_PER_THREAD];
691 
692         SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
693     }
694 
695 
696     /**
697      * \brief Performs an ascending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
698      *
699      * \par
700      * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
701      *   more than one tile of values, simply perform a key-value sort of the keys paired
702      *   with a temporary value array that enumerates the key indices.  The reordered indices
703      *   can then be used as a gather-vector for exchanging other associated tile data through
704      *   shared memory.
705      * - \granularity
706      * - \smemreuse
707      *
708      * \par Snippet
709      * The code snippet below illustrates a sort of 512 integer keys and values that
710      * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
711      * where each thread owns 4 consecutive pairs.  The final partitioning is striped.
712      * \par
713      * \code
714      * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
715      *
716      * __global__ void ExampleKernel(...)
717      * {
718      *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
719      *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
720      *
721      *     // Allocate shared memory for BlockRadixSort
722      *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
723      *
724      *     // Obtain a segment of consecutive items that are blocked across threads
725      *     int thread_keys[4];
726      *     int thread_values[4];
727      *     ...
728      *
729      *     // Collectively sort the keys and values among block threads
730      *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
731      *
732      * \endcode
733      * \par
734      * Suppose the set of input \p thread_keys across the block of threads is
735      * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
736      * corresponding output \p thread_keys in those threads will be
737      * <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>.
738      *
739      */
SortBlockedToStriped(KeyT (& keys)[ITEMS_PER_THREAD],ValueT (& values)[ITEMS_PER_THREAD],int begin_bit=0,int end_bit=sizeof (KeyT)* 8)740     __device__ __forceinline__ void SortBlockedToStriped(
741         KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
742         ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
743         int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
744         int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
745     {
746         SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
747     }
748 
749 
750     /**
751      * \brief Performs a descending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
752      *
753      * \par
754      * - \granularity
755      * - \smemreuse
756      *
757      * \par Snippet
758      * The code snippet below illustrates a sort of 512 integer keys that
759      * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
760      * where each thread owns 4 consecutive keys.  The final partitioning is striped.
761      * \par
762      * \code
763      * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
764      *
765      * __global__ void ExampleKernel(...)
766      * {
767      *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
768      *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
769      *
770      *     // Allocate shared memory for BlockRadixSort
771      *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
772      *
773      *     // Obtain a segment of consecutive items that are blocked across threads
774      *     int thread_keys[4];
775      *     ...
776      *
777      *     // Collectively sort the keys
778      *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
779      *
780      * \endcode
781      * \par
782      * Suppose the set of input \p thread_keys across the block of threads is
783      * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
784      * corresponding output \p thread_keys in those threads will be
785      * <tt>{ [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }</tt>.
786      *
787      */
SortDescendingBlockedToStriped(KeyT (& keys)[ITEMS_PER_THREAD],int begin_bit=0,int end_bit=sizeof (KeyT)* 8)788     __device__ __forceinline__ void SortDescendingBlockedToStriped(
789         KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
790         int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
791         int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
792     {
793         NullType values[ITEMS_PER_THREAD];
794 
795         SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
796     }
797 
798 
799     /**
800      * \brief Performs a descending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
801      *
802      * \par
803      * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
804      *   more than one tile of values, simply perform a key-value sort of the keys paired
805      *   with a temporary value array that enumerates the key indices.  The reordered indices
806      *   can then be used as a gather-vector for exchanging other associated tile data through
807      *   shared memory.
808      * - \granularity
809      * - \smemreuse
810      *
811      * \par Snippet
812      * The code snippet below illustrates a sort of 512 integer keys and values that
813      * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
814      * where each thread owns 4 consecutive pairs.  The final partitioning is striped.
815      * \par
816      * \code
817      * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
818      *
819      * __global__ void ExampleKernel(...)
820      * {
821      *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
822      *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
823      *
824      *     // Allocate shared memory for BlockRadixSort
825      *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
826      *
827      *     // Obtain a segment of consecutive items that are blocked across threads
828      *     int thread_keys[4];
829      *     int thread_values[4];
830      *     ...
831      *
832      *     // Collectively sort the keys and values among block threads
833      *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
834      *
835      * \endcode
836      * \par
837      * Suppose the set of input \p thread_keys across the block of threads is
838      * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
839      * corresponding output \p thread_keys in those threads will be
840      * <tt>{ [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }</tt>.
841      *
842      */
SortDescendingBlockedToStriped(KeyT (& keys)[ITEMS_PER_THREAD],ValueT (& values)[ITEMS_PER_THREAD],int begin_bit=0,int end_bit=sizeof (KeyT)* 8)843     __device__ __forceinline__ void SortDescendingBlockedToStriped(
844         KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
845         ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
846         int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
847         int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
848     {
849         SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
850     }
851 
852 
853     //@}  end member group
854 
855 };
856 
857 /**
858  * \example example_block_radix_sort.cu
859  */
860 
861 }               // CUB namespace
862 CUB_NS_POSTFIX  // Optional outer namespace(s)
863 
864