1 /****************************************************************************** 2 * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * * Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * * Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * * Neither the name of the NVIDIA CORPORATION nor the 13 * names of its contributors may be used to endorse or promote products 14 * derived from this software without specific prior written permission. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 * 27 ******************************************************************************/ 28 29 /** 30 * \file 31 * The cub::BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block. 32 */ 33 34 35 #pragma once 36 37 #include "block_exchange.cuh" 38 #include "block_radix_rank.cuh" 39 #include "../util_ptx.cuh" 40 #include "../util_arch.cuh" 41 #include "../util_type.cuh" 42 #include "../util_namespace.cuh" 43 44 /// Optional outer namespace(s) 45 CUB_NS_PREFIX 46 47 /// CUB namespace 48 namespace cub { 49 50 /** 51 * \brief The BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method. ![](sorting_logo.png) 52 * \ingroup BlockModule 53 * 54 * \tparam KeyT KeyT type 55 * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension 56 * \tparam ITEMS_PER_THREAD The number of items per thread 57 * \tparam ValueT <b>[optional]</b> ValueT type (default: cub::NullType, which indicates a keys-only sort) 58 * \tparam RADIX_BITS <b>[optional]</b> The number of radix bits per digit place (default: 4 bits) 59 * \tparam MEMOIZE_OUTER_SCAN <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). 60 * \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) 61 * \tparam SMEM_CONFIG <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) 62 * \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1) 63 * \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1) 64 * \tparam PTX_ARCH <b>[optional]</b> \ptxversion 65 * 66 * \par Overview 67 * - The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges 68 * items into ascending order. It relies upon a positional representation for 69 * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, 70 * characters, etc.) specified from least-significant to most-significant. For a 71 * given input sequence of keys and a set of rules specifying a total ordering 72 * of the symbolic alphabet, the radix sorting method produces a lexicographic 73 * ordering of those keys. 74 * - BlockRadixSort can sort all of the built-in C++ numeric primitive types 75 * (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half 76 * half-precision floating-point type. Within each key, the implementation treats fixed-length 77 * bit-sequences of \p RADIX_BITS as radix digit places. Although the direct radix sorting 78 * method can only be applied to unsigned integral types, BlockRadixSort 79 * is able to sort signed and floating-point types via simple bit-wise transformations 80 * that ensure lexicographic key ordering. 81 * - \rowmajor 82 * 83 * \par Performance Considerations 84 * - \granularity 85 * 86 * \par A Simple Example 87 * \blockcollective{BlockRadixSort} 88 * \par 89 * The code snippet below illustrates a sort of 512 integer keys that 90 * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads 91 * where each thread owns 4 consecutive items. 92 * \par 93 * \code 94 * #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh> 95 * 96 * __global__ void ExampleKernel(...) 97 * { 98 * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each 99 * typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort; 100 * 101 * // Allocate shared memory for BlockRadixSort 102 * __shared__ typename BlockRadixSort::TempStorage temp_storage; 103 * 104 * // Obtain a segment of consecutive items that are blocked across threads 105 * int thread_keys[4]; 106 * ... 107 * 108 * // Collectively sort the keys 109 * BlockRadixSort(temp_storage).Sort(thread_keys); 110 * 111 * ... 112 * \endcode 113 * \par 114 * Suppose the set of input \p thread_keys across the block of threads is 115 * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>. The 116 * corresponding output \p thread_keys in those threads will be 117 * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>. 118 * 119 */ 120 template < 121 typename KeyT, 122 int BLOCK_DIM_X, 123 int ITEMS_PER_THREAD, 124 typename ValueT = NullType, 125 int RADIX_BITS = 4, 126 bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, 127 BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, 128 cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, 129 int BLOCK_DIM_Y = 1, 130 int BLOCK_DIM_Z = 1, 131 int PTX_ARCH = CUB_PTX_ARCH> 132 class BlockRadixSort 133 { 134 private: 135 136 /****************************************************************************** 137 * Constants and type definitions 138 ******************************************************************************/ 139 140 enum 141 { 142 // The thread block size in threads 143 BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, 144 145 // Whether or not there are values to be trucked along with keys 146 KEYS_ONLY = Equals<ValueT, NullType>::VALUE, 147 }; 148 149 // KeyT traits and unsigned bits type 150 typedef Traits<KeyT> KeyTraits; 151 typedef typename KeyTraits::UnsignedBits UnsignedBits; 152 153 /// Ascending BlockRadixRank utility type 154 typedef BlockRadixRank< 155 BLOCK_DIM_X, 156 RADIX_BITS, 157 false, 158 MEMOIZE_OUTER_SCAN, 159 INNER_SCAN_ALGORITHM, 160 SMEM_CONFIG, 161 BLOCK_DIM_Y, 162 BLOCK_DIM_Z, 163 PTX_ARCH> 164 AscendingBlockRadixRank; 165 166 /// Descending BlockRadixRank utility type 167 typedef BlockRadixRank< 168 BLOCK_DIM_X, 169 RADIX_BITS, 170 true, 171 MEMOIZE_OUTER_SCAN, 172 INNER_SCAN_ALGORITHM, 173 SMEM_CONFIG, 174 BLOCK_DIM_Y, 175 BLOCK_DIM_Z, 176 PTX_ARCH> 177 DescendingBlockRadixRank; 178 179 /// BlockExchange utility type for keys 180 typedef BlockExchange<KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeKeys; 181 182 /// BlockExchange utility type for values 183 typedef BlockExchange<ValueT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeValues; 184 185 /// Shared memory storage layout type 186 union _TempStorage 187 { 188 typename AscendingBlockRadixRank::TempStorage asending_ranking_storage; 189 typename DescendingBlockRadixRank::TempStorage descending_ranking_storage; 190 typename BlockExchangeKeys::TempStorage exchange_keys; 191 typename BlockExchangeValues::TempStorage exchange_values; 192 }; 193 194 195 /****************************************************************************** 196 * Thread fields 197 ******************************************************************************/ 198 199 /// Shared storage reference 200 _TempStorage &temp_storage; 201 202 /// Linear thread-id 203 unsigned int linear_tid; 204 205 /****************************************************************************** 206 * Utility methods 207 ******************************************************************************/ 208 209 /// Internal storage allocator PrivateStorage()210 __device__ __forceinline__ _TempStorage& PrivateStorage() 211 { 212 __shared__ _TempStorage private_storage; 213 return private_storage; 214 } 215 216 /// Rank keys (specialized for ascending sort) RankKeys(UnsignedBits (& unsigned_keys)[ITEMS_PER_THREAD],int (& ranks)[ITEMS_PER_THREAD],int begin_bit,int pass_bits,Int2Type<false>)217 __device__ __forceinline__ void RankKeys( 218 UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], 219 int (&ranks)[ITEMS_PER_THREAD], 220 int begin_bit, 221 int pass_bits, 222 Int2Type<false> /*is_descending*/) 223 { 224 AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys( 225 unsigned_keys, 226 ranks, 227 begin_bit, 228 pass_bits); 229 } 230 231 /// Rank keys (specialized for descending sort) RankKeys(UnsignedBits (& unsigned_keys)[ITEMS_PER_THREAD],int (& ranks)[ITEMS_PER_THREAD],int begin_bit,int pass_bits,Int2Type<true>)232 __device__ __forceinline__ void RankKeys( 233 UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], 234 int (&ranks)[ITEMS_PER_THREAD], 235 int begin_bit, 236 int pass_bits, 237 Int2Type<true> /*is_descending*/) 238 { 239 DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys( 240 unsigned_keys, 241 ranks, 242 begin_bit, 243 pass_bits); 244 } 245 246 /// ExchangeValues (specialized for key-value sort, to-blocked arrangement) ExchangeValues(ValueT (& values)[ITEMS_PER_THREAD],int (& ranks)[ITEMS_PER_THREAD],Int2Type<false>,Int2Type<true>)247 __device__ __forceinline__ void ExchangeValues( 248 ValueT (&values)[ITEMS_PER_THREAD], 249 int (&ranks)[ITEMS_PER_THREAD], 250 Int2Type<false> /*is_keys_only*/, 251 Int2Type<true> /*is_blocked*/) 252 { 253 CTA_SYNC(); 254 255 // Exchange values through shared memory in blocked arrangement 256 BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks); 257 } 258 259 /// ExchangeValues (specialized for key-value sort, to-striped arrangement) ExchangeValues(ValueT (& values)[ITEMS_PER_THREAD],int (& ranks)[ITEMS_PER_THREAD],Int2Type<false>,Int2Type<false>)260 __device__ __forceinline__ void ExchangeValues( 261 ValueT (&values)[ITEMS_PER_THREAD], 262 int (&ranks)[ITEMS_PER_THREAD], 263 Int2Type<false> /*is_keys_only*/, 264 Int2Type<false> /*is_blocked*/) 265 { 266 CTA_SYNC(); 267 268 // Exchange values through shared memory in blocked arrangement 269 BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks); 270 } 271 272 /// ExchangeValues (specialized for keys-only sort) 273 template <int IS_BLOCKED> ExchangeValues(ValueT (&)[ITEMS_PER_THREAD],int (&)[ITEMS_PER_THREAD],Int2Type<true>,Int2Type<IS_BLOCKED>)274 __device__ __forceinline__ void ExchangeValues( 275 ValueT (&/*values*/)[ITEMS_PER_THREAD], 276 int (&/*ranks*/)[ITEMS_PER_THREAD], 277 Int2Type<true> /*is_keys_only*/, 278 Int2Type<IS_BLOCKED> /*is_blocked*/) 279 {} 280 281 /// Sort blocked arrangement 282 template <int DESCENDING, int KEYS_ONLY> SortBlocked(KeyT (& keys)[ITEMS_PER_THREAD],ValueT (& values)[ITEMS_PER_THREAD],int begin_bit,int end_bit,Int2Type<DESCENDING> is_descending,Int2Type<KEYS_ONLY> is_keys_only)283 __device__ __forceinline__ void SortBlocked( 284 KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort 285 ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort 286 int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison 287 int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison 288 Int2Type<DESCENDING> is_descending, ///< Tag whether is a descending-order sort 289 Int2Type<KEYS_ONLY> is_keys_only) ///< Tag whether is keys-only sort 290 { 291 UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = 292 reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys); 293 294 // Twiddle bits if necessary 295 #pragma unroll 296 for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) 297 { 298 unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); 299 } 300 301 // Radix sorting passes 302 while (true) 303 { 304 int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); 305 306 // Rank the blocked keys 307 int ranks[ITEMS_PER_THREAD]; 308 RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); 309 begin_bit += RADIX_BITS; 310 311 CTA_SYNC(); 312 313 // Exchange keys through shared memory in blocked arrangement 314 BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); 315 316 // Exchange values through shared memory in blocked arrangement 317 ExchangeValues(values, ranks, is_keys_only, Int2Type<true>()); 318 319 // Quit if done 320 if (begin_bit >= end_bit) break; 321 322 CTA_SYNC(); 323 } 324 325 // Untwiddle bits if necessary 326 #pragma unroll 327 for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) 328 { 329 unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); 330 } 331 } 332 333 public: 334 335 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 336 337 /// Sort blocked -> striped arrangement 338 template <int DESCENDING, int KEYS_ONLY> SortBlockedToStriped(KeyT (& keys)[ITEMS_PER_THREAD],ValueT (& values)[ITEMS_PER_THREAD],int begin_bit,int end_bit,Int2Type<DESCENDING> is_descending,Int2Type<KEYS_ONLY> is_keys_only)339 __device__ __forceinline__ void SortBlockedToStriped( 340 KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort 341 ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort 342 int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison 343 int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison 344 Int2Type<DESCENDING> is_descending, ///< Tag whether is a descending-order sort 345 Int2Type<KEYS_ONLY> is_keys_only) ///< Tag whether is keys-only sort 346 { 347 UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = 348 reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys); 349 350 // Twiddle bits if necessary 351 #pragma unroll 352 for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) 353 { 354 unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); 355 } 356 357 // Radix sorting passes 358 while (true) 359 { 360 int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); 361 362 // Rank the blocked keys 363 int ranks[ITEMS_PER_THREAD]; 364 RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); 365 begin_bit += RADIX_BITS; 366 367 CTA_SYNC(); 368 369 // Check if this is the last pass 370 if (begin_bit >= end_bit) 371 { 372 // Last pass exchanges keys through shared memory in striped arrangement 373 BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks); 374 375 // Last pass exchanges through shared memory in striped arrangement 376 ExchangeValues(values, ranks, is_keys_only, Int2Type<false>()); 377 378 // Quit 379 break; 380 } 381 382 // Exchange keys through shared memory in blocked arrangement 383 BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); 384 385 // Exchange values through shared memory in blocked arrangement 386 ExchangeValues(values, ranks, is_keys_only, Int2Type<true>()); 387 388 CTA_SYNC(); 389 } 390 391 // Untwiddle bits if necessary 392 #pragma unroll 393 for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) 394 { 395 unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); 396 } 397 } 398 399 #endif // DOXYGEN_SHOULD_SKIP_THIS 400 401 /// \smemstorage{BlockRadixSort} 402 struct TempStorage : Uninitialized<_TempStorage> {}; 403 404 405 /******************************************************************//** 406 * \name Collective constructors 407 *********************************************************************/ 408 //@{ 409 410 /** 411 * \brief Collective constructor using a private static allocation of shared memory as temporary storage. 412 */ BlockRadixSort()413 __device__ __forceinline__ BlockRadixSort() 414 : 415 temp_storage(PrivateStorage()), 416 linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) 417 {} 418 419 420 /** 421 * \brief Collective constructor using the specified memory allocation as temporary storage. 422 */ BlockRadixSort(TempStorage & temp_storage)423 __device__ __forceinline__ BlockRadixSort( 424 TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage 425 : 426 temp_storage(temp_storage.Alias()), 427 linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) 428 {} 429 430 431 //@} end member group 432 /******************************************************************//** 433 * \name Sorting (blocked arrangements) 434 *********************************************************************/ 435 //@{ 436 437 /** 438 * \brief Performs an ascending block-wide radix sort over a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys. 439 * 440 * \par 441 * - \granularity 442 * - \smemreuse 443 * 444 * \par Snippet 445 * The code snippet below illustrates a sort of 512 integer keys that 446 * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads 447 * where each thread owns 4 consecutive keys. 448 * \par 449 * \code 450 * #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh> 451 * 452 * __global__ void ExampleKernel(...) 453 * { 454 * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each 455 * typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort; 456 * 457 * // Allocate shared memory for BlockRadixSort 458 * __shared__ typename BlockRadixSort::TempStorage temp_storage; 459 * 460 * // Obtain a segment of consecutive items that are blocked across threads 461 * int thread_keys[4]; 462 * ... 463 * 464 * // Collectively sort the keys 465 * BlockRadixSort(temp_storage).Sort(thread_keys); 466 * 467 * \endcode 468 * \par 469 * Suppose the set of input \p thread_keys across the block of threads is 470 * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>. 471 * The corresponding output \p thread_keys in those threads will be 472 * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>. 473 */ Sort(KeyT (& keys)[ITEMS_PER_THREAD],int begin_bit=0,int end_bit=sizeof (KeyT)* 8)474 __device__ __forceinline__ void Sort( 475 KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort 476 int begin_bit = 0, ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison 477 int end_bit = sizeof(KeyT) * 8) ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison 478 { 479 NullType values[ITEMS_PER_THREAD]; 480 481 SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>()); 482 } 483 484 485 /** 486 * \brief Performs an ascending block-wide radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values. 487 * 488 * \par 489 * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" 490 * more than one tile of values, simply perform a key-value sort of the keys paired 491 * with a temporary value array that enumerates the key indices. The reordered indices 492 * can then be used as a gather-vector for exchanging other associated tile data through 493 * shared memory. 494 * - \granularity 495 * - \smemreuse 496 * 497 * \par Snippet 498 * The code snippet below illustrates a sort of 512 integer keys and values that 499 * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads 500 * where each thread owns 4 consecutive pairs. 501 * \par 502 * \code 503 * #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh> 504 * 505 * __global__ void ExampleKernel(...) 506 * { 507 * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each 508 * typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort; 509 * 510 * // Allocate shared memory for BlockRadixSort 511 * __shared__ typename BlockRadixSort::TempStorage temp_storage; 512 * 513 * // Obtain a segment of consecutive items that are blocked across threads 514 * int thread_keys[4]; 515 * int thread_values[4]; 516 * ... 517 * 518 * // Collectively sort the keys and values among block threads 519 * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); 520 * 521 * \endcode 522 * \par 523 * Suppose the set of input \p thread_keys across the block of threads is 524 * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>. The 525 * corresponding output \p thread_keys in those threads will be 526 * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>. 527 * 528 */ Sort(KeyT (& keys)[ITEMS_PER_THREAD],ValueT (& values)[ITEMS_PER_THREAD],int begin_bit=0,int end_bit=sizeof (KeyT)* 8)529 __device__ __forceinline__ void Sort( 530 KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort 531 ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort 532 int begin_bit = 0, ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison 533 int end_bit = sizeof(KeyT) * 8) ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison 534 { 535 SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>()); 536 } 537 538 /** 539 * \brief Performs a descending block-wide radix sort over a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys. 540 * 541 * \par 542 * - \granularity 543 * - \smemreuse 544 * 545 * \par Snippet 546 * The code snippet below illustrates a sort of 512 integer keys that 547 * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads 548 * where each thread owns 4 consecutive keys. 549 * \par 550 * \code 551 * #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh> 552 * 553 * __global__ void ExampleKernel(...) 554 * { 555 * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each 556 * typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort; 557 * 558 * // Allocate shared memory for BlockRadixSort 559 * __shared__ typename BlockRadixSort::TempStorage temp_storage; 560 * 561 * // Obtain a segment of consecutive items that are blocked across threads 562 * int thread_keys[4]; 563 * ... 564 * 565 * // Collectively sort the keys 566 * BlockRadixSort(temp_storage).Sort(thread_keys); 567 * 568 * \endcode 569 * \par 570 * Suppose the set of input \p thread_keys across the block of threads is 571 * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>. 572 * The corresponding output \p thread_keys in those threads will be 573 * <tt>{ [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }</tt>. 574 */ SortDescending(KeyT (& keys)[ITEMS_PER_THREAD],int begin_bit=0,int end_bit=sizeof (KeyT)* 8)575 __device__ __forceinline__ void SortDescending( 576 KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort 577 int begin_bit = 0, ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison 578 int end_bit = sizeof(KeyT) * 8) ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison 579 { 580 NullType values[ITEMS_PER_THREAD]; 581 582 SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>()); 583 } 584 585 586 /** 587 * \brief Performs a descending block-wide radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values. 588 * 589 * \par 590 * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" 591 * more than one tile of values, simply perform a key-value sort of the keys paired 592 * with a temporary value array that enumerates the key indices. The reordered indices 593 * can then be used as a gather-vector for exchanging other associated tile data through 594 * shared memory. 595 * - \granularity 596 * - \smemreuse 597 * 598 * \par Snippet 599 * The code snippet below illustrates a sort of 512 integer keys and values that 600 * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads 601 * where each thread owns 4 consecutive pairs. 602 * \par 603 * \code 604 * #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh> 605 * 606 * __global__ void ExampleKernel(...) 607 * { 608 * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each 609 * typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort; 610 * 611 * // Allocate shared memory for BlockRadixSort 612 * __shared__ typename BlockRadixSort::TempStorage temp_storage; 613 * 614 * // Obtain a segment of consecutive items that are blocked across threads 615 * int thread_keys[4]; 616 * int thread_values[4]; 617 * ... 618 * 619 * // Collectively sort the keys and values among block threads 620 * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); 621 * 622 * \endcode 623 * \par 624 * Suppose the set of input \p thread_keys across the block of threads is 625 * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>. The 626 * corresponding output \p thread_keys in those threads will be 627 * <tt>{ [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }</tt>. 628 * 629 */ SortDescending(KeyT (& keys)[ITEMS_PER_THREAD],ValueT (& values)[ITEMS_PER_THREAD],int begin_bit=0,int end_bit=sizeof (KeyT)* 8)630 __device__ __forceinline__ void SortDescending( 631 KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort 632 ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort 633 int begin_bit = 0, ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison 634 int end_bit = sizeof(KeyT) * 8) ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison 635 { 636 SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>()); 637 } 638 639 640 //@} end member group 641 /******************************************************************//** 642 * \name Sorting (blocked arrangement -> striped arrangement) 643 *********************************************************************/ 644 //@{ 645 646 647 /** 648 * \brief Performs an ascending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3). 649 * 650 * \par 651 * - \granularity 652 * - \smemreuse 653 * 654 * \par Snippet 655 * The code snippet below illustrates a sort of 512 integer keys that 656 * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads 657 * where each thread owns 4 consecutive keys. The final partitioning is striped. 658 * \par 659 * \code 660 * #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh> 661 * 662 * __global__ void ExampleKernel(...) 663 * { 664 * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each 665 * typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort; 666 * 667 * // Allocate shared memory for BlockRadixSort 668 * __shared__ typename BlockRadixSort::TempStorage temp_storage; 669 * 670 * // Obtain a segment of consecutive items that are blocked across threads 671 * int thread_keys[4]; 672 * ... 673 * 674 * // Collectively sort the keys 675 * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); 676 * 677 * \endcode 678 * \par 679 * Suppose the set of input \p thread_keys across the block of threads is 680 * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>. The 681 * corresponding output \p thread_keys in those threads will be 682 * <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>. 683 * 684 */ SortBlockedToStriped(KeyT (& keys)[ITEMS_PER_THREAD],int begin_bit=0,int end_bit=sizeof (KeyT)* 8)685 __device__ __forceinline__ void SortBlockedToStriped( 686 KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort 687 int begin_bit = 0, ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison 688 int end_bit = sizeof(KeyT) * 8) ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison 689 { 690 NullType values[ITEMS_PER_THREAD]; 691 692 SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>()); 693 } 694 695 696 /** 697 * \brief Performs an ascending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3). 698 * 699 * \par 700 * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" 701 * more than one tile of values, simply perform a key-value sort of the keys paired 702 * with a temporary value array that enumerates the key indices. The reordered indices 703 * can then be used as a gather-vector for exchanging other associated tile data through 704 * shared memory. 705 * - \granularity 706 * - \smemreuse 707 * 708 * \par Snippet 709 * The code snippet below illustrates a sort of 512 integer keys and values that 710 * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads 711 * where each thread owns 4 consecutive pairs. The final partitioning is striped. 712 * \par 713 * \code 714 * #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh> 715 * 716 * __global__ void ExampleKernel(...) 717 * { 718 * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each 719 * typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort; 720 * 721 * // Allocate shared memory for BlockRadixSort 722 * __shared__ typename BlockRadixSort::TempStorage temp_storage; 723 * 724 * // Obtain a segment of consecutive items that are blocked across threads 725 * int thread_keys[4]; 726 * int thread_values[4]; 727 * ... 728 * 729 * // Collectively sort the keys and values among block threads 730 * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); 731 * 732 * \endcode 733 * \par 734 * Suppose the set of input \p thread_keys across the block of threads is 735 * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>. The 736 * corresponding output \p thread_keys in those threads will be 737 * <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>. 738 * 739 */ SortBlockedToStriped(KeyT (& keys)[ITEMS_PER_THREAD],ValueT (& values)[ITEMS_PER_THREAD],int begin_bit=0,int end_bit=sizeof (KeyT)* 8)740 __device__ __forceinline__ void SortBlockedToStriped( 741 KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort 742 ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort 743 int begin_bit = 0, ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison 744 int end_bit = sizeof(KeyT) * 8) ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison 745 { 746 SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>()); 747 } 748 749 750 /** 751 * \brief Performs a descending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3). 752 * 753 * \par 754 * - \granularity 755 * - \smemreuse 756 * 757 * \par Snippet 758 * The code snippet below illustrates a sort of 512 integer keys that 759 * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads 760 * where each thread owns 4 consecutive keys. The final partitioning is striped. 761 * \par 762 * \code 763 * #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh> 764 * 765 * __global__ void ExampleKernel(...) 766 * { 767 * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each 768 * typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort; 769 * 770 * // Allocate shared memory for BlockRadixSort 771 * __shared__ typename BlockRadixSort::TempStorage temp_storage; 772 * 773 * // Obtain a segment of consecutive items that are blocked across threads 774 * int thread_keys[4]; 775 * ... 776 * 777 * // Collectively sort the keys 778 * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); 779 * 780 * \endcode 781 * \par 782 * Suppose the set of input \p thread_keys across the block of threads is 783 * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>. The 784 * corresponding output \p thread_keys in those threads will be 785 * <tt>{ [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }</tt>. 786 * 787 */ SortDescendingBlockedToStriped(KeyT (& keys)[ITEMS_PER_THREAD],int begin_bit=0,int end_bit=sizeof (KeyT)* 8)788 __device__ __forceinline__ void SortDescendingBlockedToStriped( 789 KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort 790 int begin_bit = 0, ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison 791 int end_bit = sizeof(KeyT) * 8) ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison 792 { 793 NullType values[ITEMS_PER_THREAD]; 794 795 SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>()); 796 } 797 798 799 /** 800 * \brief Performs a descending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3). 801 * 802 * \par 803 * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" 804 * more than one tile of values, simply perform a key-value sort of the keys paired 805 * with a temporary value array that enumerates the key indices. The reordered indices 806 * can then be used as a gather-vector for exchanging other associated tile data through 807 * shared memory. 808 * - \granularity 809 * - \smemreuse 810 * 811 * \par Snippet 812 * The code snippet below illustrates a sort of 512 integer keys and values that 813 * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads 814 * where each thread owns 4 consecutive pairs. The final partitioning is striped. 815 * \par 816 * \code 817 * #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh> 818 * 819 * __global__ void ExampleKernel(...) 820 * { 821 * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each 822 * typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort; 823 * 824 * // Allocate shared memory for BlockRadixSort 825 * __shared__ typename BlockRadixSort::TempStorage temp_storage; 826 * 827 * // Obtain a segment of consecutive items that are blocked across threads 828 * int thread_keys[4]; 829 * int thread_values[4]; 830 * ... 831 * 832 * // Collectively sort the keys and values among block threads 833 * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); 834 * 835 * \endcode 836 * \par 837 * Suppose the set of input \p thread_keys across the block of threads is 838 * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>. The 839 * corresponding output \p thread_keys in those threads will be 840 * <tt>{ [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }</tt>. 841 * 842 */ SortDescendingBlockedToStriped(KeyT (& keys)[ITEMS_PER_THREAD],ValueT (& values)[ITEMS_PER_THREAD],int begin_bit=0,int end_bit=sizeof (KeyT)* 8)843 __device__ __forceinline__ void SortDescendingBlockedToStriped( 844 KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort 845 ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort 846 int begin_bit = 0, ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison 847 int end_bit = sizeof(KeyT) * 8) ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison 848 { 849 SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>()); 850 } 851 852 853 //@} end member group 854 855 }; 856 857 /** 858 * \example example_block_radix_sort.cu 859 */ 860 861 } // CUB namespace 862 CUB_NS_POSTFIX // Optional outer namespace(s) 863 864