1 2 /****************************************************************************** 3 * Copyright (c) 2011, Duane Merrill. All rights reserved. 4 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * * Neither the name of the NVIDIA CORPORATION nor the 14 * names of its contributors may be used to endorse or promote products 15 * derived from this software without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 * 28 ******************************************************************************/ 29 30 /** 31 * \file 32 * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. 33 */ 34 35 #pragma once 36 37 #include <stdio.h> 38 #include <iterator> 39 40 #include "dispatch/dispatch_radix_sort.cuh" 41 #include "../util_arch.cuh" 42 #include "../util_namespace.cuh" 43 44 /// Optional outer namespace(s) 45 CUB_NS_PREFIX 46 47 /// CUB namespace 48 namespace cub { 49 50 51 /** 52 * \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. ![](sorting_logo.png) 53 * \ingroup SingleModule 54 * 55 * \par Overview 56 * The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges 57 * items into ascending (or descending) order. The algorithm relies upon a positional representation for 58 * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, 59 * characters, etc.) specified from least-significant to most-significant. For a 60 * given input sequence of keys and a set of rules specifying a total ordering 61 * of the symbolic alphabet, the radix sorting method produces a lexicographic 62 * ordering of those keys. 63 * 64 * \par 65 * DeviceRadixSort can sort all of the built-in C++ numeric primitive types 66 * (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half 67 * half-precision floating-point type. Although the direct radix sorting 68 * method can only be applied to unsigned integral types, DeviceRadixSort 69 * is able to sort signed and floating-point types via simple bit-wise transformations 70 * that ensure lexicographic key ordering. 71 * 72 * \par Usage Considerations 73 * \cdp_class{DeviceRadixSort} 74 * 75 * \par Performance 76 * \linear_performance{radix sort} The following chart illustrates DeviceRadixSort::SortKeys 77 * performance across different CUDA architectures for uniform-random \p uint32 keys. 78 * \plots_below 79 * 80 * \image html lsb_radix_sort_int32_keys.png 81 * 82 */ 83 struct DeviceRadixSort 84 { 85 86 /******************************************************************//** 87 * \name KeyT-value pairs 88 *********************************************************************/ 89 //@{ 90 91 /** 92 * \brief Sorts key-value pairs into ascending order. (~<em>2N </em>auxiliary storage required) 93 * 94 * \par 95 * - The contents of the input data are not altered by the sorting operation 96 * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. 97 * - \devicestorageNP For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below. 98 * - \devicestorage 99 * 100 * \par Performance 101 * The following charts illustrate saturated sorting performance across different 102 * CUDA architectures for uniform-random <tt>uint32,uint32</tt> and 103 * <tt>uint64,uint64</tt> pairs, respectively. 104 * 105 * \image html lsb_radix_sort_int32_pairs.png 106 * \image html lsb_radix_sort_int64_pairs.png 107 * 108 * \par Snippet 109 * The code snippet below illustrates the sorting of a device vector of \p int keys 110 * with associated vector of \p int values. 111 * \par 112 * \code 113 * #include <cub/cub.cuh> // or equivalently <cub/device/device_radix_sort.cuh> 114 * 115 * // Declare, allocate, and initialize device-accessible pointers for sorting data 116 * int num_items; // e.g., 7 117 * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] 118 * int *d_keys_out; // e.g., [ ... ] 119 * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] 120 * int *d_values_out; // e.g., [ ... ] 121 * ... 122 * 123 * // Determine temporary device storage requirements 124 * void *d_temp_storage = NULL; 125 * size_t temp_storage_bytes = 0; 126 * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, 127 * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); 128 * 129 * // Allocate temporary storage 130 * cudaMalloc(&d_temp_storage, temp_storage_bytes); 131 * 132 * // Run sorting operation 133 * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, 134 * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); 135 * 136 * // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] 137 * // d_values_out <-- [5, 4, 3, 1, 2, 0, 6] 138 * 139 * \endcode 140 * 141 * \tparam KeyT <b>[inferred]</b> KeyT type 142 * \tparam ValueT <b>[inferred]</b> ValueT type 143 */ 144 template < 145 typename KeyT, 146 typename ValueT> 147 CUB_RUNTIME_FUNCTION SortPairscub::DeviceRadixSort148 static cudaError_t SortPairs( 149 void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 150 size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 151 const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort 152 KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data 153 const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items 154 ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items 155 int num_items, ///< [in] Number of items to sort 156 int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison 157 int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) 158 cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 159 bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 160 { 161 // Signed integer type for global offsets 162 typedef int OffsetT; 163 164 DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); 165 DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out); 166 167 return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch( 168 d_temp_storage, 169 temp_storage_bytes, 170 d_keys, 171 d_values, 172 num_items, 173 begin_bit, 174 end_bit, 175 false, 176 stream, 177 debug_synchronous); 178 } 179 180 181 /** 182 * \brief Sorts key-value pairs into ascending order. (~<em>N </em>auxiliary storage required) 183 * 184 * \par 185 * - The sorting operation is given a pair of key buffers and a corresponding 186 * pair of associated value buffers. Each pair is managed by a DoubleBuffer 187 * structure that indicates which of the two buffers is "current" (and thus 188 * contains the input data to be sorted). 189 * - The contents of both buffers within each pair may be altered by the sorting 190 * operation. 191 * - Upon completion, the sorting operation will update the "current" indicator 192 * within each DoubleBuffer wrapper to reference which of the two buffers 193 * now contains the sorted output sequence (a function of the number of key bits 194 * specified and the targeted device architecture). 195 * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. 196 * - \devicestorageP 197 * - \devicestorage 198 * 199 * \par Performance 200 * The following charts illustrate saturated sorting performance across different 201 * CUDA architectures for uniform-random <tt>uint32,uint32</tt> and 202 * <tt>uint64,uint64</tt> pairs, respectively. 203 * 204 * \image html lsb_radix_sort_int32_pairs.png 205 * \image html lsb_radix_sort_int64_pairs.png 206 * 207 * \par Snippet 208 * The code snippet below illustrates the sorting of a device vector of \p int keys 209 * with associated vector of \p int values. 210 * \par 211 * \code 212 * #include <cub/cub.cuh> // or equivalently <cub/device/device_radix_sort.cuh> 213 * 214 * // Declare, allocate, and initialize device-accessible pointers for sorting data 215 * int num_items; // e.g., 7 216 * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] 217 * int *d_key_alt_buf; // e.g., [ ... ] 218 * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] 219 * int *d_value_alt_buf; // e.g., [ ... ] 220 * ... 221 * 222 * // Create a set of DoubleBuffers to wrap pairs of device pointers 223 * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf); 224 * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf); 225 * 226 * // Determine temporary device storage requirements 227 * void *d_temp_storage = NULL; 228 * size_t temp_storage_bytes = 0; 229 * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); 230 * 231 * // Allocate temporary storage 232 * cudaMalloc(&d_temp_storage, temp_storage_bytes); 233 * 234 * // Run sorting operation 235 * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); 236 * 237 * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] 238 * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] 239 * 240 * \endcode 241 * 242 * \tparam KeyT <b>[inferred]</b> KeyT type 243 * \tparam ValueT <b>[inferred]</b> ValueT type 244 */ 245 template < 246 typename KeyT, 247 typename ValueT> 248 CUB_RUNTIME_FUNCTION SortPairscub::DeviceRadixSort249 static cudaError_t SortPairs( 250 void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 251 size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 252 DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys 253 DoubleBuffer<ValueT> &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values 254 int num_items, ///< [in] Number of items to sort 255 int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison 256 int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) 257 cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 258 bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 259 { 260 // Signed integer type for global offsets 261 typedef int OffsetT; 262 263 return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch( 264 d_temp_storage, 265 temp_storage_bytes, 266 d_keys, 267 d_values, 268 num_items, 269 begin_bit, 270 end_bit, 271 true, 272 stream, 273 debug_synchronous); 274 } 275 276 277 /** 278 * \brief Sorts key-value pairs into descending order. (~<em>2N</em> auxiliary storage required). 279 * 280 * \par 281 * - The contents of the input data are not altered by the sorting operation 282 * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. 283 * - \devicestorageNP For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below. 284 * - \devicestorage 285 * 286 * \par Performance 287 * Performance is similar to DeviceRadixSort::SortPairs. 288 * 289 * \par Snippet 290 * The code snippet below illustrates the sorting of a device vector of \p int keys 291 * with associated vector of \p int values. 292 * \par 293 * \code 294 * #include <cub/cub.cuh> // or equivalently <cub/device/device_radix_sort.cuh> 295 * 296 * // Declare, allocate, and initialize device-accessible pointers for sorting data 297 * int num_items; // e.g., 7 298 * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] 299 * int *d_keys_out; // e.g., [ ... ] 300 * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] 301 * int *d_values_out; // e.g., [ ... ] 302 * ... 303 * 304 * // Determine temporary device storage requirements 305 * void *d_temp_storage = NULL; 306 * size_t temp_storage_bytes = 0; 307 * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, 308 * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); 309 * 310 * // Allocate temporary storage 311 * cudaMalloc(&d_temp_storage, temp_storage_bytes); 312 * 313 * // Run sorting operation 314 * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, 315 * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); 316 * 317 * // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0] 318 * // d_values_out <-- [6, 0, 2, 1, 3, 4, 5] 319 * 320 * \endcode 321 * 322 * \tparam KeyT <b>[inferred]</b> KeyT type 323 * \tparam ValueT <b>[inferred]</b> ValueT type 324 */ 325 template < 326 typename KeyT, 327 typename ValueT> 328 CUB_RUNTIME_FUNCTION SortPairsDescendingcub::DeviceRadixSort329 static cudaError_t SortPairsDescending( 330 void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 331 size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 332 const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort 333 KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data 334 const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items 335 ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items 336 int num_items, ///< [in] Number of items to sort 337 int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison 338 int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) 339 cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 340 bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 341 { 342 // Signed integer type for global offsets 343 typedef int OffsetT; 344 345 DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); 346 DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out); 347 348 return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch( 349 d_temp_storage, 350 temp_storage_bytes, 351 d_keys, 352 d_values, 353 num_items, 354 begin_bit, 355 end_bit, 356 false, 357 stream, 358 debug_synchronous); 359 } 360 361 362 /** 363 * \brief Sorts key-value pairs into descending order. (~<em>N </em>auxiliary storage required). 364 * 365 * \par 366 * - The sorting operation is given a pair of key buffers and a corresponding 367 * pair of associated value buffers. Each pair is managed by a DoubleBuffer 368 * structure that indicates which of the two buffers is "current" (and thus 369 * contains the input data to be sorted). 370 * - The contents of both buffers within each pair may be altered by the sorting 371 * operation. 372 * - Upon completion, the sorting operation will update the "current" indicator 373 * within each DoubleBuffer wrapper to reference which of the two buffers 374 * now contains the sorted output sequence (a function of the number of key bits 375 * specified and the targeted device architecture). 376 * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. 377 * - \devicestorageP 378 * - \devicestorage 379 * 380 * \par Performance 381 * Performance is similar to DeviceRadixSort::SortPairs. 382 * 383 * \par Snippet 384 * The code snippet below illustrates the sorting of a device vector of \p int keys 385 * with associated vector of \p int values. 386 * \par 387 * \code 388 * #include <cub/cub.cuh> // or equivalently <cub/device/device_radix_sort.cuh> 389 * 390 * // Declare, allocate, and initialize device-accessible pointers for sorting data 391 * int num_items; // e.g., 7 392 * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] 393 * int *d_key_alt_buf; // e.g., [ ... ] 394 * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] 395 * int *d_value_alt_buf; // e.g., [ ... ] 396 * ... 397 * 398 * // Create a set of DoubleBuffers to wrap pairs of device pointers 399 * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf); 400 * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf); 401 * 402 * // Determine temporary device storage requirements 403 * void *d_temp_storage = NULL; 404 * size_t temp_storage_bytes = 0; 405 * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); 406 * 407 * // Allocate temporary storage 408 * cudaMalloc(&d_temp_storage, temp_storage_bytes); 409 * 410 * // Run sorting operation 411 * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); 412 * 413 * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] 414 * // d_values.Current() <-- [6, 0, 2, 1, 3, 4, 5] 415 * 416 * \endcode 417 * 418 * \tparam KeyT <b>[inferred]</b> KeyT type 419 * \tparam ValueT <b>[inferred]</b> ValueT type 420 */ 421 template < 422 typename KeyT, 423 typename ValueT> 424 CUB_RUNTIME_FUNCTION SortPairsDescendingcub::DeviceRadixSort425 static cudaError_t SortPairsDescending( 426 void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 427 size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 428 DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys 429 DoubleBuffer<ValueT> &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values 430 int num_items, ///< [in] Number of items to sort 431 int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison 432 int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) 433 cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 434 bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 435 { 436 // Signed integer type for global offsets 437 typedef int OffsetT; 438 439 return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch( 440 d_temp_storage, 441 temp_storage_bytes, 442 d_keys, 443 d_values, 444 num_items, 445 begin_bit, 446 end_bit, 447 true, 448 stream, 449 debug_synchronous); 450 } 451 452 453 //@} end member group 454 /******************************************************************//** 455 * \name Keys-only 456 *********************************************************************/ 457 //@{ 458 459 460 /** 461 * \brief Sorts keys into ascending order. (~<em>2N </em>auxiliary storage required) 462 * 463 * \par 464 * - The contents of the input data are not altered by the sorting operation 465 * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. 466 * - \devicestorageNP For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below. 467 * - \devicestorage 468 * 469 * \par Performance 470 * The following charts illustrate saturated sorting performance across different 471 * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively. 472 * 473 * \image html lsb_radix_sort_int32_keys.png 474 * \image html lsb_radix_sort_int64_keys.png 475 * 476 * \par Snippet 477 * The code snippet below illustrates the sorting of a device vector of \p int keys. 478 * \par 479 * \code 480 * #include <cub/cub.cuh> // or equivalently <cub/device/device_radix_sort.cuh> 481 * 482 * // Declare, allocate, and initialize device-accessible pointers for sorting data 483 * int num_items; // e.g., 7 484 * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] 485 * int *d_keys_out; // e.g., [ ... ] 486 * ... 487 * 488 * // Determine temporary device storage requirements 489 * void *d_temp_storage = NULL; 490 * size_t temp_storage_bytes = 0; 491 * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); 492 * 493 * // Allocate temporary storage 494 * cudaMalloc(&d_temp_storage, temp_storage_bytes); 495 * 496 * // Run sorting operation 497 * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); 498 * 499 * // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] 500 * 501 * \endcode 502 * 503 * \tparam KeyT <b>[inferred]</b> KeyT type 504 */ 505 template <typename KeyT> 506 CUB_RUNTIME_FUNCTION SortKeyscub::DeviceRadixSort507 static cudaError_t SortKeys( 508 void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 509 size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 510 const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort 511 KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data 512 int num_items, ///< [in] Number of items to sort 513 int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison 514 int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) 515 cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 516 bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 517 { 518 // Signed integer type for global offsets 519 typedef int OffsetT; 520 521 // Null value type 522 DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); 523 DoubleBuffer<NullType> d_values; 524 525 return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch( 526 d_temp_storage, 527 temp_storage_bytes, 528 d_keys, 529 d_values, 530 num_items, 531 begin_bit, 532 end_bit, 533 false, 534 stream, 535 debug_synchronous); 536 } 537 538 539 /** 540 * \brief Sorts keys into ascending order. (~<em>N </em>auxiliary storage required). 541 * 542 * \par 543 * - The sorting operation is given a pair of key buffers managed by a 544 * DoubleBuffer structure that indicates which of the two buffers is 545 * "current" (and thus contains the input data to be sorted). 546 * - The contents of both buffers may be altered by the sorting operation. 547 * - Upon completion, the sorting operation will update the "current" indicator 548 * within the DoubleBuffer wrapper to reference which of the two buffers 549 * now contains the sorted output sequence (a function of the number of key bits 550 * specified and the targeted device architecture). 551 * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. 552 * - \devicestorageP 553 * - \devicestorage 554 * 555 * \par Performance 556 * The following charts illustrate saturated sorting performance across different 557 * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively. 558 * 559 * \image html lsb_radix_sort_int32_keys.png 560 * \image html lsb_radix_sort_int64_keys.png 561 * 562 * \par Snippet 563 * The code snippet below illustrates the sorting of a device vector of \p int keys. 564 * \par 565 * \code 566 * #include <cub/cub.cuh> // or equivalently <cub/device/device_radix_sort.cuh> 567 * 568 * // Declare, allocate, and initialize device-accessible pointers for sorting data 569 * int num_items; // e.g., 7 570 * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] 571 * int *d_key_alt_buf; // e.g., [ ... ] 572 * ... 573 * 574 * // Create a DoubleBuffer to wrap the pair of device pointers 575 * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf); 576 * 577 * // Determine temporary device storage requirements 578 * void *d_temp_storage = NULL; 579 * size_t temp_storage_bytes = 0; 580 * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); 581 * 582 * // Allocate temporary storage 583 * cudaMalloc(&d_temp_storage, temp_storage_bytes); 584 * 585 * // Run sorting operation 586 * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); 587 * 588 * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] 589 * 590 * \endcode 591 * 592 * \tparam KeyT <b>[inferred]</b> KeyT type 593 */ 594 template <typename KeyT> 595 CUB_RUNTIME_FUNCTION SortKeyscub::DeviceRadixSort596 static cudaError_t SortKeys( 597 void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 598 size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 599 DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys 600 int num_items, ///< [in] Number of items to sort 601 int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison 602 int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) 603 cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 604 bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 605 { 606 // Signed integer type for global offsets 607 typedef int OffsetT; 608 609 // Null value type 610 DoubleBuffer<NullType> d_values; 611 612 return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch( 613 d_temp_storage, 614 temp_storage_bytes, 615 d_keys, 616 d_values, 617 num_items, 618 begin_bit, 619 end_bit, 620 true, 621 stream, 622 debug_synchronous); 623 } 624 625 /** 626 * \brief Sorts keys into descending order. (~<em>2N</em> auxiliary storage required). 627 * 628 * \par 629 * - The contents of the input data are not altered by the sorting operation 630 * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. 631 * - \devicestorageNP For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below. 632 * - \devicestorage 633 * 634 * \par Performance 635 * Performance is similar to DeviceRadixSort::SortKeys. 636 * 637 * \par Snippet 638 * The code snippet below illustrates the sorting of a device vector of \p int keys. 639 * \par 640 * \code 641 * #include <cub/cub.cuh> // or equivalently <cub/device/device_radix_sort.cuh> 642 * 643 * // Declare, allocate, and initialize device-accessible pointers for sorting data 644 * int num_items; // e.g., 7 645 * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] 646 * int *d_keys_out; // e.g., [ ... ] 647 * ... 648 * 649 * // Create a DoubleBuffer to wrap the pair of device pointers 650 * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf); 651 * 652 * // Determine temporary device storage requirements 653 * void *d_temp_storage = NULL; 654 * size_t temp_storage_bytes = 0; 655 * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); 656 * 657 * // Allocate temporary storage 658 * cudaMalloc(&d_temp_storage, temp_storage_bytes); 659 * 660 * // Run sorting operation 661 * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); 662 * 663 * // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]s 664 * 665 * \endcode 666 * 667 * \tparam KeyT <b>[inferred]</b> KeyT type 668 */ 669 template <typename KeyT> 670 CUB_RUNTIME_FUNCTION SortKeysDescendingcub::DeviceRadixSort671 static cudaError_t SortKeysDescending( 672 void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 673 size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 674 const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort 675 KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data 676 int num_items, ///< [in] Number of items to sort 677 int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison 678 int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) 679 cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 680 bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 681 { 682 // Signed integer type for global offsets 683 typedef int OffsetT; 684 685 DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); 686 DoubleBuffer<NullType> d_values; 687 688 return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch( 689 d_temp_storage, 690 temp_storage_bytes, 691 d_keys, 692 d_values, 693 num_items, 694 begin_bit, 695 end_bit, 696 false, 697 stream, 698 debug_synchronous); 699 } 700 701 702 /** 703 * \brief Sorts keys into descending order. (~<em>N </em>auxiliary storage required). 704 * 705 * \par 706 * - The sorting operation is given a pair of key buffers managed by a 707 * DoubleBuffer structure that indicates which of the two buffers is 708 * "current" (and thus contains the input data to be sorted). 709 * - The contents of both buffers may be altered by the sorting operation. 710 * - Upon completion, the sorting operation will update the "current" indicator 711 * within the DoubleBuffer wrapper to reference which of the two buffers 712 * now contains the sorted output sequence (a function of the number of key bits 713 * specified and the targeted device architecture). 714 * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. 715 * - \devicestorageP 716 * - \devicestorage 717 * 718 * \par Performance 719 * Performance is similar to DeviceRadixSort::SortKeys. 720 * 721 * \par Snippet 722 * The code snippet below illustrates the sorting of a device vector of \p int keys. 723 * \par 724 * \code 725 * #include <cub/cub.cuh> // or equivalently <cub/device/device_radix_sort.cuh> 726 * 727 * // Declare, allocate, and initialize device-accessible pointers for sorting data 728 * int num_items; // e.g., 7 729 * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] 730 * int *d_key_alt_buf; // e.g., [ ... ] 731 * ... 732 * 733 * // Create a DoubleBuffer to wrap the pair of device pointers 734 * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf); 735 * 736 * // Determine temporary device storage requirements 737 * void *d_temp_storage = NULL; 738 * size_t temp_storage_bytes = 0; 739 * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); 740 * 741 * // Allocate temporary storage 742 * cudaMalloc(&d_temp_storage, temp_storage_bytes); 743 * 744 * // Run sorting operation 745 * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); 746 * 747 * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] 748 * 749 * \endcode 750 * 751 * \tparam KeyT <b>[inferred]</b> KeyT type 752 */ 753 template <typename KeyT> 754 CUB_RUNTIME_FUNCTION SortKeysDescendingcub::DeviceRadixSort755 static cudaError_t SortKeysDescending( 756 void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 757 size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 758 DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys 759 int num_items, ///< [in] Number of items to sort 760 int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison 761 int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) 762 cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 763 bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 764 { 765 // Signed integer type for global offsets 766 typedef int OffsetT; 767 768 // Null value type 769 DoubleBuffer<NullType> d_values; 770 771 return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch( 772 d_temp_storage, 773 temp_storage_bytes, 774 d_keys, 775 d_values, 776 num_items, 777 begin_bit, 778 end_bit, 779 true, 780 stream, 781 debug_synchronous); 782 } 783 784 785 //@} end member group 786 787 788 }; 789 790 /** 791 * \example example_device_radix_sort.cu 792 */ 793 794 } // CUB namespace 795 CUB_NS_POSTFIX // Optional outer namespace(s) 796 797 798