1 /****************************************************************************** 2 * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * * Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * * Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * * Neither the name of the NVIDIA CORPORATION nor the 13 * names of its contributors may be used to endorse or promote products 14 * derived from this software without specific prior written permission. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 * 27 ******************************************************************************/ 28 29 /** 30 * \file 31 * The cub::WarpScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. 
32 */ 33 34 #pragma once 35 36 #include "specializations/warp_scan_shfl.cuh" 37 #include "specializations/warp_scan_smem.cuh" 38 #include "../thread/thread_operators.cuh" 39 #include "../util_arch.cuh" 40 #include "../util_type.cuh" 41 #include "../util_namespace.cuh" 42 43 /// Optional outer namespace(s) 44 CUB_NS_PREFIX 45 46 /// CUB namespace 47 namespace cub { 48 49 /** 50 * \addtogroup WarpModule 51 * @{ 52 */ 53 54 /** 55 * \brief The WarpScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. ![](warp_scan_logo.png) 56 * 57 * \tparam T The scan input/output element type 58 * \tparam LOGICAL_WARP_THREADS <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20). 59 * \tparam PTX_ARCH <b>[optional]</b> \ptxversion 60 * 61 * \par Overview 62 * - Given a list of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum) 63 * produces an output list where each element is computed to be the reduction 64 * of the elements occurring earlier in the input list. <em>Prefix sum</em> 65 * connotes a prefix scan with the addition operator. The term \em inclusive indicates 66 * that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input. 67 * The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into 68 * the <em>i</em><sup>th</sup> output reduction. 
 * - Supports non-commutative scan operators
 * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads)
 * - The number of entrant threads must be a multiple of \p LOGICAL_WARP_THREADS
 *
 * \par Performance Considerations
 * - Uses special instructions when applicable (e.g., warp \p SHFL)
 * - Uses synchronization-free communication between warp lanes when applicable
 * - Incurs zero bank conflicts for most types
 * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
 *     - Summation (<b><em>vs.</em></b> generic scan)
 *     - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
 *
 * \par Simple Examples
 * \warpcollective{WarpScan}
 * \par
 * The code snippet below illustrates four concurrent warp prefix sums within a block of
 * 128 threads (one per each of the 32-thread warps).
 * \par
 * \code
 * #include <cub/cub.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize WarpScan for type int
 *     typedef cub::WarpScan<int> WarpScan;
 *
 *     // Allocate WarpScan shared memory for 4 warps
 *     __shared__ typename WarpScan::TempStorage temp_storage[4];
 *
 *     // Obtain one input item per thread
 *     int thread_data = ...
 *
 *     // Compute warp-wide prefix sums
 *     int warp_id = threadIdx.x / 32;
 *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
 *
 * \endcode
 * \par
 * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
 * The corresponding output \p thread_data in each of the four warps of threads will be
 * <tt>{0, 1, 2, 3, ..., 31}</tt>.
 *
 * \par
 * The code snippet below illustrates a single warp prefix sum within a block of
 * 128 threads.
 * \par
 * \code
 * #include <cub/cub.cuh>
 *
 * __global__ void ExampleKernel(...)
119 * { 120 * // Specialize WarpScan for type int 121 * typedef cub::WarpScan<int> WarpScan; 122 * 123 * // Allocate WarpScan shared memory for one warp 124 * __shared__ typename WarpScan::TempStorage temp_storage; 125 * ... 126 * 127 * // Only the first warp performs a prefix sum 128 * if (threadIdx.x < 32) 129 * { 130 * // Obtain one input item per thread 131 * int thread_data = ... 132 * 133 * // Compute warp-wide prefix sums 134 * WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data); 135 * 136 * \endcode 137 * \par 138 * Suppose the set of input \p thread_data across the warp of threads is <tt>{1, 1, 1, 1, ...}</tt>. 139 * The corresponding output \p thread_data will be <tt>{0, 1, 2, 3, ..., 31}</tt>. 140 * 141 */ 142 template < 143 typename T, 144 int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, 145 int PTX_ARCH = CUB_PTX_ARCH> 146 class WarpScan 147 { 148 private: 149 150 /****************************************************************************** 151 * Constants and type definitions 152 ******************************************************************************/ 153 154 enum 155 { 156 /// Whether the logical warp size and the PTX warp size coincide 157 IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), 158 159 /// Whether the logical warp size is a power-of-two 160 IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), 161 162 /// Whether the data type is an integer (which has fully-associative addition) 163 IS_INTEGER = ((Traits<T>::CATEGORY == SIGNED_INTEGER) || (Traits<T>::CATEGORY == UNSIGNED_INTEGER)) 164 }; 165 166 /// Internal specialization. 
Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two) 167 typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO), 168 WarpScanShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>, 169 WarpScanSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpScan; 170 171 /// Shared memory storage layout type for WarpScan 172 typedef typename InternalWarpScan::TempStorage _TempStorage; 173 174 175 /****************************************************************************** 176 * Thread fields 177 ******************************************************************************/ 178 179 /// Shared storage reference 180 _TempStorage &temp_storage; 181 unsigned int lane_id; 182 183 184 185 /****************************************************************************** 186 * Public types 187 ******************************************************************************/ 188 189 public: 190 191 /// \smemstorage{WarpScan} 192 struct TempStorage : Uninitialized<_TempStorage> {}; 193 194 195 /******************************************************************//** 196 * \name Collective constructors 197 *********************************************************************/ 198 //@{ 199 200 /** 201 * \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>. 202 */ WarpScan(TempStorage & temp_storage)203 __device__ __forceinline__ WarpScan( 204 TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage 205 : 206 temp_storage(temp_storage.Alias()), 207 lane_id(IS_ARCH_WARP ? 
208 LaneId() : 209 LaneId() % LOGICAL_WARP_THREADS) 210 {} 211 212 213 //@} end member group 214 /******************************************************************//** 215 * \name Inclusive prefix sums 216 *********************************************************************/ 217 //@{ 218 219 220 /** 221 * \brief Computes an inclusive prefix sum across the calling warp. 222 * 223 * \par 224 * - \smemreuse 225 * 226 * \par Snippet 227 * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of 228 * 128 threads (one per each of the 32-thread warps). 229 * \par 230 * \code 231 * #include <cub/cub.cuh> 232 * 233 * __global__ void ExampleKernel(...) 234 * { 235 * // Specialize WarpScan for type int 236 * typedef cub::WarpScan<int> WarpScan; 237 * 238 * // Allocate WarpScan shared memory for 4 warps 239 * __shared__ typename WarpScan::TempStorage temp_storage[4]; 240 * 241 * // Obtain one input item per thread 242 * int thread_data = ... 243 * 244 * // Compute inclusive warp-wide prefix sums 245 * int warp_id = threadIdx.x / 32; 246 * WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data); 247 * 248 * \endcode 249 * \par 250 * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>. 251 * The corresponding output \p thread_data in each of the four warps of threads will be 252 * <tt>1, 2, 3, ..., 32}</tt>. 253 */ InclusiveSum(T input,T & inclusive_output)254 __device__ __forceinline__ void InclusiveSum( 255 T input, ///< [in] Calling thread's input item. 256 T &inclusive_output) ///< [out] Calling thread's output item. May be aliased with \p input. 257 { 258 InclusiveScan(input, inclusive_output, cub::Sum()); 259 } 260 261 262 /** 263 * \brief Computes an inclusive prefix sum across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. 
264 * 265 * \par 266 * - \smemreuse 267 * 268 * \par Snippet 269 * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of 270 * 128 threads (one per each of the 32-thread warps). 271 * \par 272 * \code 273 * #include <cub/cub.cuh> 274 * 275 * __global__ void ExampleKernel(...) 276 * { 277 * // Specialize WarpScan for type int 278 * typedef cub::WarpScan<int> WarpScan; 279 * 280 * // Allocate WarpScan shared memory for 4 warps 281 * __shared__ typename WarpScan::TempStorage temp_storage[4]; 282 * 283 * // Obtain one input item per thread 284 * int thread_data = ... 285 * 286 * // Compute inclusive warp-wide prefix sums 287 * int warp_aggregate; 288 * int warp_id = threadIdx.x / 32; 289 * WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate); 290 * 291 * \endcode 292 * \par 293 * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>. 294 * The corresponding output \p thread_data in each of the four warps of threads will be 295 * <tt>1, 2, 3, ..., 32}</tt>. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. 296 */ InclusiveSum(T input,T & inclusive_output,T & warp_aggregate)297 __device__ __forceinline__ void InclusiveSum( 298 T input, ///< [in] Calling thread's input item. 299 T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 300 T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 301 { 302 InclusiveScan(input, inclusive_output, cub::Sum(), warp_aggregate); 303 } 304 305 306 //@} end member group 307 /******************************************************************//** 308 * \name Exclusive prefix sums 309 *********************************************************************/ 310 //@{ 311 312 313 /** 314 * \brief Computes an exclusive prefix sum across the calling warp. 
The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in <em>thread</em><sub>0</sub>.
     *
     * \par
     * - \identityzero
     * - \smemreuse
     *
     * \par
     * Given inputs <tt>{1, 1, 1, 1, ...}</tt> across each 32-thread warp,
     * the corresponding outputs will be <tt>{0, 1, 2, ..., 31}</tt>.
     */
    __device__ __forceinline__ void ExclusiveSum(
        T       input,              ///< [in] Calling thread's input item.
        T       &exclusive_output)  ///< [out] Calling thread's output item.  May be aliased with \p input.
    {
        // Zero is the identity for addition, so seed the exclusive scan with it
        T initial_value = 0;
        ExclusiveScan(input, exclusive_output, initial_value, cub::Sum());
    }


    /**
     * \brief Computes an exclusive prefix sum across the calling warp.  The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in <em>thread</em><sub>0</sub>.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
360 * 361 * \par 362 * - \identityzero 363 * - \smemreuse 364 * 365 * \par Snippet 366 * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of 367 * 128 threads (one per each of the 32-thread warps). 368 * \par 369 * \code 370 * #include <cub/cub.cuh> 371 * 372 * __global__ void ExampleKernel(...) 373 * { 374 * // Specialize WarpScan for type int 375 * typedef cub::WarpScan<int> WarpScan; 376 * 377 * // Allocate WarpScan shared memory for 4 warps 378 * __shared__ typename WarpScan::TempStorage temp_storage[4]; 379 * 380 * // Obtain one input item per thread 381 * int thread_data = ... 382 * 383 * // Compute exclusive warp-wide prefix sums 384 * int warp_aggregate; 385 * int warp_id = threadIdx.x / 32; 386 * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data, warp_aggregate); 387 * 388 * \endcode 389 * \par 390 * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>. 391 * The corresponding output \p thread_data in each of the four warps of threads will be 392 * <tt>0, 1, 2, ..., 31}</tt>. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. 393 */ ExclusiveSum(T input,T & exclusive_output,T & warp_aggregate)394 __device__ __forceinline__ void ExclusiveSum( 395 T input, ///< [in] Calling thread's input item. 396 T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 397 T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 
398 { 399 T initial_value = 0; 400 ExclusiveScan(input, exclusive_output, initial_value, cub::Sum(), warp_aggregate); 401 } 402 403 404 //@} end member group 405 /******************************************************************//** 406 * \name Inclusive prefix scans 407 *********************************************************************/ 408 //@{ 409 410 /** 411 * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. 412 * 413 * \par 414 * - \smemreuse 415 * 416 * \par Snippet 417 * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of 418 * 128 threads (one per each of the 32-thread warps). 419 * \par 420 * \code 421 * #include <cub/cub.cuh> 422 * 423 * __global__ void ExampleKernel(...) 424 * { 425 * // Specialize WarpScan for type int 426 * typedef cub::WarpScan<int> WarpScan; 427 * 428 * // Allocate WarpScan shared memory for 4 warps 429 * __shared__ typename WarpScan::TempStorage temp_storage[4]; 430 * 431 * // Obtain one input item per thread 432 * int thread_data = ... 433 * 434 * // Compute inclusive warp-wide prefix max scans 435 * int warp_id = threadIdx.x / 32; 436 * WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cub::Max()); 437 * 438 * \endcode 439 * \par 440 * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>. 441 * The corresponding output \p thread_data in the first warp would be 442 * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc. 443 * 444 * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> 445 */ 446 template <typename ScanOp> InclusiveScan(T input,T & inclusive_output,ScanOp scan_op)447 __device__ __forceinline__ void InclusiveScan( 448 T input, ///< [in] Calling thread's input item. 
449 T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 450 ScanOp scan_op) ///< [in] Binary scan operator 451 { 452 InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op); 453 } 454 455 456 /** 457 * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. 458 * 459 * \par 460 * - \smemreuse 461 * 462 * \par Snippet 463 * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of 464 * 128 threads (one per each of the 32-thread warps). 465 * \par 466 * \code 467 * #include <cub/cub.cuh> 468 * 469 * __global__ void ExampleKernel(...) 470 * { 471 * // Specialize WarpScan for type int 472 * typedef cub::WarpScan<int> WarpScan; 473 * 474 * // Allocate WarpScan shared memory for 4 warps 475 * __shared__ typename WarpScan::TempStorage temp_storage[4]; 476 * 477 * // Obtain one input item per thread 478 * int thread_data = ... 479 * 480 * // Compute inclusive warp-wide prefix max scans 481 * int warp_aggregate; 482 * int warp_id = threadIdx.x / 32; 483 * WarpScan(temp_storage[warp_id]).InclusiveScan( 484 * thread_data, thread_data, cub::Max(), warp_aggregate); 485 * 486 * \endcode 487 * \par 488 * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>. 489 * The corresponding output \p thread_data in the first warp would be 490 * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc. 491 * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads 492 * in the second warp, etc. 
493 * 494 * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> 495 */ 496 template <typename ScanOp> InclusiveScan(T input,T & inclusive_output,ScanOp scan_op,T & warp_aggregate)497 __device__ __forceinline__ void InclusiveScan( 498 T input, ///< [in] Calling thread's input item. 499 T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 500 ScanOp scan_op, ///< [in] Binary scan operator 501 T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 502 { 503 InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate); 504 } 505 506 507 //@} end member group 508 /******************************************************************//** 509 * \name Exclusive prefix scans 510 *********************************************************************/ 511 //@{ 512 513 /** 514 * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p output computed for <em>warp-lane</em><sub>0</sub> is undefined. 515 * 516 * \par 517 * - \smemreuse 518 * 519 * \par Snippet 520 * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of 521 * 128 threads (one per each of the 32-thread warps). 522 * \par 523 * \code 524 * #include <cub/cub.cuh> 525 * 526 * __global__ void ExampleKernel(...) 527 * { 528 * // Specialize WarpScan for type int 529 * typedef cub::WarpScan<int> WarpScan; 530 * 531 * // Allocate WarpScan shared memory for 4 warps 532 * __shared__ typename WarpScan::TempStorage temp_storage[4]; 533 * 534 * // Obtain one input item per thread 535 * int thread_data = ... 
536 * 537 * // Compute exclusive warp-wide prefix max scans 538 * int warp_id = threadIdx.x / 32; 539 * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max()); 540 * 541 * \endcode 542 * \par 543 * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>. 544 * The corresponding output \p thread_data in the first warp would be 545 * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc. 546 * (The output \p thread_data in warp lane<sub>0</sub> is undefined.) 547 * 548 * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> 549 */ 550 template <typename ScanOp> ExclusiveScan(T input,T & exclusive_output,ScanOp scan_op)551 __device__ __forceinline__ void ExclusiveScan( 552 T input, ///< [in] Calling thread's input item. 553 T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 554 ScanOp scan_op) ///< [in] Binary scan operator 555 { 556 InternalWarpScan internal(temp_storage); 557 558 T inclusive_output; 559 internal.InclusiveScan(input, inclusive_output, scan_op); 560 561 internal.Update( 562 input, 563 inclusive_output, 564 exclusive_output, 565 scan_op, 566 Int2Type<IS_INTEGER>()); 567 } 568 569 570 /** 571 * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. 572 * 573 * \par 574 * - \smemreuse 575 * 576 * \par Snippet 577 * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of 578 * 128 threads (one per each of the 32-thread warps). 579 * \par 580 * \code 581 * #include <cub/cub.cuh> 582 * 583 * __global__ void ExampleKernel(...) 
584 * { 585 * // Specialize WarpScan for type int 586 * typedef cub::WarpScan<int> WarpScan; 587 * 588 * // Allocate WarpScan shared memory for 4 warps 589 * __shared__ typename WarpScan::TempStorage temp_storage[4]; 590 * 591 * // Obtain one input item per thread 592 * int thread_data = ... 593 * 594 * // Compute exclusive warp-wide prefix max scans 595 * int warp_id = threadIdx.x / 32; 596 * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); 597 * 598 * \endcode 599 * \par 600 * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>. 601 * The corresponding output \p thread_data in the first warp would be 602 * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc. 603 * 604 * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> 605 */ 606 template <typename ScanOp> ExclusiveScan(T input,T & exclusive_output,T initial_value,ScanOp scan_op)607 __device__ __forceinline__ void ExclusiveScan( 608 T input, ///< [in] Calling thread's input item. 609 T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 610 T initial_value, ///< [in] Initial value to seed the exclusive scan 611 ScanOp scan_op) ///< [in] Binary scan operator 612 { 613 InternalWarpScan internal(temp_storage); 614 615 T inclusive_output; 616 internal.InclusiveScan(input, inclusive_output, scan_op); 617 618 internal.Update( 619 input, 620 inclusive_output, 621 exclusive_output, 622 scan_op, 623 initial_value, 624 Int2Type<IS_INTEGER>()); 625 } 626 627 628 /** 629 * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p output computed for <em>warp-lane</em><sub>0</sub> is undefined. 
Also provides every thread with the warp-wide \p warp_aggregate of all inputs. 630 * 631 * \par 632 * - \smemreuse 633 * 634 * \par Snippet 635 * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of 636 * 128 threads (one per each of the 32-thread warps). 637 * \par 638 * \code 639 * #include <cub/cub.cuh> 640 * 641 * __global__ void ExampleKernel(...) 642 * { 643 * // Specialize WarpScan for type int 644 * typedef cub::WarpScan<int> WarpScan; 645 * 646 * // Allocate WarpScan shared memory for 4 warps 647 * __shared__ typename WarpScan::TempStorage temp_storage[4]; 648 * 649 * // Obtain one input item per thread 650 * int thread_data = ... 651 * 652 * // Compute exclusive warp-wide prefix max scans 653 * int warp_aggregate; 654 * int warp_id = threadIdx.x / 32; 655 * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max(), warp_aggregate); 656 * 657 * \endcode 658 * \par 659 * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>. 660 * The corresponding output \p thread_data in the first warp would be 661 * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc. 662 * (The output \p thread_data in warp lane<sub>0</sub> is undefined.) Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads 663 * in the second warp, etc. 664 * 665 * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> 666 */ 667 template <typename ScanOp> ExclusiveScan(T input,T & exclusive_output,ScanOp scan_op,T & warp_aggregate)668 __device__ __forceinline__ void ExclusiveScan( 669 T input, ///< [in] Calling thread's input item. 670 T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 
671 ScanOp scan_op, ///< [in] Binary scan operator 672 T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 673 { 674 InternalWarpScan internal(temp_storage); 675 676 T inclusive_output; 677 internal.InclusiveScan(input, inclusive_output, scan_op); 678 679 internal.Update( 680 input, 681 inclusive_output, 682 exclusive_output, 683 warp_aggregate, 684 scan_op, 685 Int2Type<IS_INTEGER>()); 686 } 687 688 689 /** 690 * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. 691 * 692 * \par 693 * - \smemreuse 694 * 695 * \par Snippet 696 * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of 697 * 128 threads (one per each of the 32-thread warps). 698 * \par 699 * \code 700 * #include <cub/cub.cuh> 701 * 702 * __global__ void ExampleKernel(...) 703 * { 704 * // Specialize WarpScan for type int 705 * typedef cub::WarpScan<int> WarpScan; 706 * 707 * // Allocate WarpScan shared memory for 4 warps 708 * __shared__ typename WarpScan::TempStorage temp_storage[4]; 709 * 710 * // Obtain one input item per thread 711 * int thread_data = ... 712 * 713 * // Compute exclusive warp-wide prefix max scans 714 * int warp_aggregate; 715 * int warp_id = threadIdx.x / 32; 716 * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate); 717 * 718 * \endcode 719 * \par 720 * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>. 721 * The corresponding output \p thread_data in the first warp would be 722 * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc. 723 * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads 724 * in the second warp, etc. 
725 * 726 * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> 727 */ 728 template <typename ScanOp> ExclusiveScan(T input,T & exclusive_output,T initial_value,ScanOp scan_op,T & warp_aggregate)729 __device__ __forceinline__ void ExclusiveScan( 730 T input, ///< [in] Calling thread's input item. 731 T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 732 T initial_value, ///< [in] Initial value to seed the exclusive scan 733 ScanOp scan_op, ///< [in] Binary scan operator 734 T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 735 { 736 InternalWarpScan internal(temp_storage); 737 738 T inclusive_output; 739 internal.InclusiveScan(input, inclusive_output, scan_op); 740 741 internal.Update( 742 input, 743 inclusive_output, 744 exclusive_output, 745 warp_aggregate, 746 scan_op, 747 initial_value, 748 Int2Type<IS_INTEGER>()); 749 } 750 751 752 //@} end member group 753 /******************************************************************//** 754 * \name Combination (inclusive & exclusive) prefix scans 755 *********************************************************************/ 756 //@{ 757 758 759 /** 760 * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p exclusive_output computed for <em>warp-lane</em><sub>0</sub> is undefined. 761 * 762 * \par 763 * - \smemreuse 764 * 765 * \par Snippet 766 * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of 767 * 128 threads (one per each of the 32-thread warps). 768 * \par 769 * \code 770 * #include <cub/cub.cuh> 771 * 772 * __global__ void ExampleKernel(...) 
     * {
     *     // Specialize WarpScan for type int
     *     typedef cub::WarpScan<int> WarpScan;
     *
     *     // Allocate WarpScan shared memory for 4 warps
     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
     *
     *     // Obtain one input item per thread
     *     int thread_data = ...
     *
     *     // Compute inclusive and exclusive warp-wide prefix max scans
     *     int warp_id = threadIdx.x / 32;
     *     int inclusive_partial, exclusive_partial;
     *     WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, cub::Max());
     *
     * \endcode
     * \par
     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
     * The corresponding output \p inclusive_partial in the first warp would be
     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
     * The corresponding output \p exclusive_partial in the first warp would be
     * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
     * (The output \p exclusive_partial in warp lane<sub>0</sub> is undefined.)
     *
     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
     */
    template <typename ScanOp>
    __device__ __forceinline__ void Scan(
        T               input,              ///< [in] Calling thread's input item.
        T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
        T               &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
        ScanOp          scan_op)            ///< [in] Binary scan operator
    {
        InternalWarpScan internal(temp_storage);

        // Compute the inclusive scan first; Update() then derives the exclusive
        // result from it (lane 0's exclusive output is undefined — no seed given).
        internal.InclusiveScan(input, inclusive_output, scan_op);

        internal.Update(
            input,
            inclusive_output,
            exclusive_output,
            scan_op,
            Int2Type<IS_INTEGER>());    // dispatch on integer-ness of T
    }


    /**
     * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp.
     *
     * \par
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates four concurrent warp-wide inclusive and exclusive prefix max scans within a block of
     * 128 threads (one per each of the 32-thread warps).
     * \par
     * \code
     * #include <cub/cub.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize WarpScan for type int
     *     typedef cub::WarpScan<int> WarpScan;
     *
     *     // Allocate WarpScan shared memory for 4 warps
     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
     *
     *     // Obtain one input item per thread
     *     int thread_data = ...
     *
     *     // Compute inclusive and exclusive warp-wide prefix max scans
     *     int warp_id = threadIdx.x / 32;
     *     int inclusive_partial, exclusive_partial;
     *     WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, INT_MIN, cub::Max());
     *
     * \endcode
     * \par
     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
     * The corresponding output \p inclusive_partial in the first warp would be
     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
     * The corresponding output \p exclusive_partial in the first warp would be
     * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>INT_MIN, 32, 32, 34, ..., 60, 62</tt>, etc.
     *
     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
     */
    template <typename ScanOp>
    __device__ __forceinline__ void Scan(
        T               input,              ///< [in] Calling thread's input item.
        T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
        T               &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
        T               initial_value,      ///< [in] Initial value to seed the exclusive scan
        ScanOp          scan_op)            ///< [in] Binary scan operator
    {
        InternalWarpScan internal(temp_storage);

        // Compute the inclusive scan first; Update() then derives the exclusive
        // result, seeding lane 0 with initial_value.
        internal.InclusiveScan(input, inclusive_output, scan_op);

        internal.Update(
            input,
            inclusive_output,
            exclusive_output,
            scan_op,
            initial_value,
            Int2Type<IS_INTEGER>());    // dispatch on integer-ness of T
    }



    //@}  end member group
    /******************************************************************//**
     * \name Data exchange
     *********************************************************************/
    //@{

    /**
     * \brief Broadcast the value \p input from <em>warp-lane</em><sub><tt>src_lane</tt></sub> to all lanes in the warp
     *
     * \par
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the warp-wide broadcasts of values from
     * lanes<sub>0</sub> in each of four warps to all other threads in those warps.
     * \par
     * \code
     * #include <cub/cub.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize WarpScan for type int
     *     typedef cub::WarpScan<int> WarpScan;
     *
     *     // Allocate WarpScan shared memory for 4 warps
     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
     *
     *     // Obtain one input item per thread
     *     int thread_data = ...
     *
     *     // Broadcast from lane0 in each warp to all other threads in the warp
     *     int warp_id = threadIdx.x / 32;
     *     thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0);
     *
     * \endcode
     * \par
     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
     * The corresponding output \p thread_data will be
     * <tt>{0, 0, ..., 0}</tt> in warp<sub>0</sub>,
     * <tt>{32, 32, ..., 32}</tt> in warp<sub>1</sub>,
     * <tt>{64, 64, ..., 64}</tt> in warp<sub>2</sub>, etc.
     */
    __device__ __forceinline__ T Broadcast(
        T               input,              ///< [in] The value to broadcast
        unsigned int    src_lane)           ///< [in] Which warp lane is to do the broadcasting
    {
        // Delegate directly to the architecture-specialized internal implementation.
        return InternalWarpScan(temp_storage).Broadcast(input, src_lane);
    }

    //@}  end member group

};

/** @} */       // end group WarpModule

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)