/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/


/**
 * \file
 * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
 */

#pragma once

#include "../../util_ptx.cuh"
#include "../../util_arch.cuh"
#include "../../block/block_raking_layout.cuh"
#include "../../thread/thread_reduce.cuh"
#include "../../thread/thread_scan.cuh"
#include "../../warp/warp_scan.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
 *
 * Each thread deposits one partial into a padded shared-memory "raking grid";
 * a subset of RAKING_THREADS threads then serially reduces (upsweep), performs
 * a warp-synchronous scan over the per-segment totals, and serially re-scans
 * its segment (downsweep) before all threads read back their results.
 */
template <
    typename    T,              ///< Data type being scanned
    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
    bool        MEMOIZE,        ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure
    int         PTX_ARCH>       ///< The PTX compute capability for which to specialize this collective
struct BlockScanRaking
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };

    /// Layout type for padded thread block raking grid
    typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;

    /// Constants
    enum
    {
        /// Number of raking threads
        RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,

        /// Number of raking elements per warp synchronous raking thread
        SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,

        /// Cooperative work can be entirely warp synchronous
        WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS),
    };

    /// WarpScan utility type
    typedef WarpScan<T, RAKING_THREADS, PTX_ARCH> WarpScan;

    /// Shared memory storage layout type
    struct _TempStorage
    {
        typename WarpScan::TempStorage          warp_scan;      ///< Buffer for warp-synchronous scan
        typename BlockRakingLayout::TempStorage raking_grid;    ///< Padded thread block raking grid
        T                                       block_aggregate;///< Block aggregate
    };


    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};


    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------

    // Thread fields
    _TempStorage    &temp_storage;                  // Reference to the block's shared storage
    unsigned int    linear_tid;                     // Row-major linearized thread rank within the block
    T               cached_segment[SEGMENT_LENGTH]; // Register copy of this raking thread's segment (used when MEMOIZE)


    //---------------------------------------------------------------------
    // Utility methods
    //---------------------------------------------------------------------

    /// Templated reduction: folds raking_ptr[ITERATION] into raking_partial
    /// (guarded against reading past the last valid partial), then recurses.
    template <int ITERATION, typename ScanOp>
    __device__ __forceinline__ T GuardedReduce(
        T*                  raking_ptr,     ///< [in] Input array
        ScanOp              scan_op,        ///< [in] Binary reduction operator
        T                   raking_partial, ///< [in] Prefix to seed reduction with
        Int2Type<ITERATION> /*iteration*/)
    {
        // Skip the bounds test entirely when the raking grid is known to be full
        if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS))
        {
            T addend = raking_ptr[ITERATION];
            raking_partial = scan_op(raking_partial, addend);
        }

        return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type<ITERATION + 1>());
    }


    /// Templated reduction (base case): recursion terminates at SEGMENT_LENGTH
    template <typename ScanOp>
    __device__ __forceinline__ T GuardedReduce(
        T*                          /*raking_ptr*/, ///< [in] Input array
        ScanOp                      /*scan_op*/,    ///< [in] Binary reduction operator
        T                           raking_partial, ///< [in] Prefix to seed reduction with
        Int2Type<SEGMENT_LENGTH>    /*iteration*/)
    {
        return raking_partial;
    }


    /// Templated copy: element-wise copy of a raking segment, fully unrolled via recursion
    template <int ITERATION>
    __device__ __forceinline__ void CopySegment(
        T*                  out,            ///< [out] Out array
        T*                  in,             ///< [in] Input array
        Int2Type<ITERATION> /*iteration*/)
    {
        out[ITERATION] = in[ITERATION];
        CopySegment(out, in, Int2Type<ITERATION + 1>());
    }


    /// Templated copy (base case)
    __device__ __forceinline__ void CopySegment(
        T*                          /*out*/,        ///< [out] Out array
        T*                          /*in*/,         ///< [in] Input array
        Int2Type<SEGMENT_LENGTH>    /*iteration*/)
    {}


    /// Performs upsweep raking reduction, returning the aggregate
    template <typename ScanOp>
    __device__ __forceinline__ T Upsweep(
        ScanOp scan_op)
    {
        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);

        // Read data into registers
        CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());

        T raking_partial = cached_segment[0];

        // Fold in the remaining (possibly guarded) elements of the segment
        return GuardedReduce(cached_segment, scan_op, raking_partial, Int2Type<1>());
    }


    /// Performs exclusive downsweep raking scan
    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveDownsweep(
        ScanOp          scan_op,
        T               raking_partial,
        bool            apply_prefix = true)
    {
        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);

        // Read data back into registers (unless the upsweep's copy was memoized)
        if (!MEMOIZE)
        {
            CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
        }

        internal::ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);

        // Write data back to smem
        CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
    }


    /// Performs inclusive downsweep raking scan
    template <typename ScanOp>
    __device__ __forceinline__ void InclusiveDownsweep(
        ScanOp          scan_op,
        T               raking_partial,
        bool            apply_prefix = true)
    {
        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);

        // Read data back into registers (unless the upsweep's copy was memoized)
        if (!MEMOIZE)
        {
            CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
        }

        internal::ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);

        // Write data back to smem
        CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
    }


    //---------------------------------------------------------------------
    // Constructors
    //---------------------------------------------------------------------

    /// Constructor
    __device__ __forceinline__ BlockScanRaking(
        TempStorage &temp_storage)
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    //---------------------------------------------------------------------
    // Exclusive scans
    //---------------------------------------------------------------------

    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
249 template <typename ScanOp> ExclusiveScancub::BlockScanRaking250 __device__ __forceinline__ void ExclusiveScan( 251 T input, ///< [in] Calling thread's input item 252 T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) 253 ScanOp scan_op) ///< [in] Binary scan operator 254 { 255 if (WARP_SYNCHRONOUS) 256 { 257 // Short-circuit directly to warp-synchronous scan 258 WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op); 259 } 260 else 261 { 262 // Place thread partial into shared memory raking grid 263 T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); 264 *placement_ptr = input; 265 266 CTA_SYNC(); 267 268 // Reduce parallelism down to just raking threads 269 if (linear_tid < RAKING_THREADS) 270 { 271 // Raking upsweep reduction across shared partials 272 T upsweep_partial = Upsweep(scan_op); 273 274 // Warp-synchronous scan 275 T exclusive_partial; 276 WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); 277 278 // Exclusive raking downsweep scan 279 ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); 280 } 281 282 CTA_SYNC(); 283 284 // Grab thread prefix from shared memory 285 exclusive_output = *placement_ptr; 286 } 287 } 288 289 /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
290 template <typename ScanOp> ExclusiveScancub::BlockScanRaking291 __device__ __forceinline__ void ExclusiveScan( 292 T input, ///< [in] Calling thread's input items 293 T &output, ///< [out] Calling thread's output items (may be aliased to \p input) 294 const T &initial_value, ///< [in] Initial value to seed the exclusive scan 295 ScanOp scan_op) ///< [in] Binary scan operator 296 { 297 if (WARP_SYNCHRONOUS) 298 { 299 // Short-circuit directly to warp-synchronous scan 300 WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op); 301 } 302 else 303 { 304 // Place thread partial into shared memory raking grid 305 T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); 306 *placement_ptr = input; 307 308 CTA_SYNC(); 309 310 // Reduce parallelism down to just raking threads 311 if (linear_tid < RAKING_THREADS) 312 { 313 // Raking upsweep reduction across shared partials 314 T upsweep_partial = Upsweep(scan_op); 315 316 // Exclusive Warp-synchronous scan 317 T exclusive_partial; 318 WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op); 319 320 // Exclusive raking downsweep scan 321 ExclusiveDownsweep(scan_op, exclusive_partial); 322 } 323 324 CTA_SYNC(); 325 326 // Grab exclusive partial from shared memory 327 output = *placement_ptr; 328 } 329 } 330 331 332 /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined. 
333 template <typename ScanOp> ExclusiveScancub::BlockScanRaking334 __device__ __forceinline__ void ExclusiveScan( 335 T input, ///< [in] Calling thread's input item 336 T &output, ///< [out] Calling thread's output item (may be aliased to \p input) 337 ScanOp scan_op, ///< [in] Binary scan operator 338 T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items 339 { 340 if (WARP_SYNCHRONOUS) 341 { 342 // Short-circuit directly to warp-synchronous scan 343 WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate); 344 } 345 else 346 { 347 // Place thread partial into shared memory raking grid 348 T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); 349 *placement_ptr = input; 350 351 CTA_SYNC(); 352 353 // Reduce parallelism down to just raking threads 354 if (linear_tid < RAKING_THREADS) 355 { 356 // Raking upsweep reduction across shared partials 357 T upsweep_partial= Upsweep(scan_op); 358 359 // Warp-synchronous scan 360 T inclusive_partial; 361 T exclusive_partial; 362 WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); 363 364 // Exclusive raking downsweep scan 365 ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); 366 367 // Broadcast aggregate to all threads 368 if (linear_tid == RAKING_THREADS - 1) 369 temp_storage.block_aggregate = inclusive_partial; 370 } 371 372 CTA_SYNC(); 373 374 // Grab thread prefix from shared memory 375 output = *placement_ptr; 376 377 // Retrieve block aggregate 378 block_aggregate = temp_storage.block_aggregate; 379 } 380 } 381 382 383 /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
384 template <typename ScanOp> ExclusiveScancub::BlockScanRaking385 __device__ __forceinline__ void ExclusiveScan( 386 T input, ///< [in] Calling thread's input items 387 T &output, ///< [out] Calling thread's output items (may be aliased to \p input) 388 const T &initial_value, ///< [in] Initial value to seed the exclusive scan 389 ScanOp scan_op, ///< [in] Binary scan operator 390 T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items 391 { 392 if (WARP_SYNCHRONOUS) 393 { 394 // Short-circuit directly to warp-synchronous scan 395 WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); 396 } 397 else 398 { 399 // Place thread partial into shared memory raking grid 400 T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); 401 *placement_ptr = input; 402 403 CTA_SYNC(); 404 405 // Reduce parallelism down to just raking threads 406 if (linear_tid < RAKING_THREADS) 407 { 408 // Raking upsweep reduction across shared partials 409 T upsweep_partial = Upsweep(scan_op); 410 411 // Warp-synchronous scan 412 T exclusive_partial; 413 WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate); 414 415 // Exclusive raking downsweep scan 416 ExclusiveDownsweep(scan_op, exclusive_partial); 417 418 // Broadcast aggregate to other threads 419 if (linear_tid == 0) 420 temp_storage.block_aggregate = block_aggregate; 421 } 422 423 CTA_SYNC(); 424 425 // Grab exclusive partial from shared memory 426 output = *placement_ptr; 427 428 // Retrieve block aggregate 429 block_aggregate = temp_storage.block_aggregate; 430 } 431 } 432 433 434 /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 435 template < 436 typename ScanOp, 437 typename BlockPrefixCallbackOp> ExclusiveScancub::BlockScanRaking438 __device__ __forceinline__ void ExclusiveScan( 439 T input, ///< [in] Calling thread's input item 440 T &output, ///< [out] Calling thread's output item (may be aliased to \p input) 441 ScanOp scan_op, ///< [in] Binary scan operator 442 BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. 443 { 444 if (WARP_SYNCHRONOUS) 445 { 446 // Short-circuit directly to warp-synchronous scan 447 T block_aggregate; 448 WarpScan warp_scan(temp_storage.warp_scan); 449 warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate); 450 451 // Obtain warp-wide prefix in lane0, then broadcast to other lanes 452 T block_prefix = block_prefix_callback_op(block_aggregate); 453 block_prefix = warp_scan.Broadcast(block_prefix, 0); 454 455 output = scan_op(block_prefix, output); 456 if (linear_tid == 0) 457 output = block_prefix; 458 } 459 else 460 { 461 // Place thread partial into shared memory raking grid 462 T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); 463 *placement_ptr = input; 464 465 CTA_SYNC(); 466 467 // Reduce parallelism down to just raking threads 468 if (linear_tid < RAKING_THREADS) 469 { 470 WarpScan warp_scan(temp_storage.warp_scan); 471 472 // Raking upsweep reduction across shared partials 473 T upsweep_partial = Upsweep(scan_op); 474 475 // Warp-synchronous scan 476 T exclusive_partial, block_aggregate; 477 warp_scan.ExclusiveScan(upsweep_partial, 
exclusive_partial, scan_op, block_aggregate); 478 479 // Obtain block-wide prefix in lane0, then broadcast to other lanes 480 T block_prefix = block_prefix_callback_op(block_aggregate); 481 block_prefix = warp_scan.Broadcast(block_prefix, 0); 482 483 // Update prefix with warpscan exclusive partial 484 T downsweep_prefix = scan_op(block_prefix, exclusive_partial); 485 if (linear_tid == 0) 486 downsweep_prefix = block_prefix; 487 488 // Exclusive raking downsweep scan 489 ExclusiveDownsweep(scan_op, downsweep_prefix); 490 } 491 492 CTA_SYNC(); 493 494 // Grab thread prefix from shared memory 495 output = *placement_ptr; 496 } 497 } 498 499 500 //--------------------------------------------------------------------- 501 // Inclusive scans 502 //--------------------------------------------------------------------- 503 504 /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 505 template <typename ScanOp> InclusiveScancub::BlockScanRaking506 __device__ __forceinline__ void InclusiveScan( 507 T input, ///< [in] Calling thread's input item 508 T &output, ///< [out] Calling thread's output item (may be aliased to \p input) 509 ScanOp scan_op) ///< [in] Binary scan operator 510 { 511 if (WARP_SYNCHRONOUS) 512 { 513 // Short-circuit directly to warp-synchronous scan 514 WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op); 515 } 516 else 517 { 518 // Place thread partial into shared memory raking grid 519 T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); 520 *placement_ptr = input; 521 522 CTA_SYNC(); 523 524 // Reduce parallelism down to just raking threads 525 if (linear_tid < RAKING_THREADS) 526 { 527 // Raking upsweep reduction across shared partials 528 T upsweep_partial = Upsweep(scan_op); 529 530 // Exclusive Warp-synchronous scan 531 T exclusive_partial; 532 
WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); 533 534 // Inclusive raking downsweep scan 535 InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); 536 } 537 538 CTA_SYNC(); 539 540 // Grab thread prefix from shared memory 541 output = *placement_ptr; 542 } 543 } 544 545 546 /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 547 template <typename ScanOp> InclusiveScancub::BlockScanRaking548 __device__ __forceinline__ void InclusiveScan( 549 T input, ///< [in] Calling thread's input item 550 T &output, ///< [out] Calling thread's output item (may be aliased to \p input) 551 ScanOp scan_op, ///< [in] Binary scan operator 552 T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items 553 { 554 if (WARP_SYNCHRONOUS) 555 { 556 // Short-circuit directly to warp-synchronous scan 557 WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate); 558 } 559 else 560 { 561 // Place thread partial into shared memory raking grid 562 T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); 563 *placement_ptr = input; 564 565 CTA_SYNC(); 566 567 // Reduce parallelism down to just raking threads 568 if (linear_tid < RAKING_THREADS) 569 { 570 // Raking upsweep reduction across shared partials 571 T upsweep_partial = Upsweep(scan_op); 572 573 // Warp-synchronous scan 574 T inclusive_partial; 575 T exclusive_partial; 576 WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); 577 578 // Inclusive raking downsweep scan 579 InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); 580 581 // Broadcast aggregate to all threads 582 if (linear_tid == RAKING_THREADS - 1) 583 temp_storage.block_aggregate = 
inclusive_partial; 584 } 585 586 CTA_SYNC(); 587 588 // Grab thread prefix from shared memory 589 output = *placement_ptr; 590 591 // Retrieve block aggregate 592 block_aggregate = temp_storage.block_aggregate; 593 } 594 } 595 596 597 /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 598 template < 599 typename ScanOp, 600 typename BlockPrefixCallbackOp> InclusiveScancub::BlockScanRaking601 __device__ __forceinline__ void InclusiveScan( 602 T input, ///< [in] Calling thread's input item 603 T &output, ///< [out] Calling thread's output item (may be aliased to \p input) 604 ScanOp scan_op, ///< [in] Binary scan operator 605 BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. 
606 { 607 if (WARP_SYNCHRONOUS) 608 { 609 // Short-circuit directly to warp-synchronous scan 610 T block_aggregate; 611 WarpScan warp_scan(temp_storage.warp_scan); 612 warp_scan.InclusiveScan(input, output, scan_op, block_aggregate); 613 614 // Obtain warp-wide prefix in lane0, then broadcast to other lanes 615 T block_prefix = block_prefix_callback_op(block_aggregate); 616 block_prefix = warp_scan.Broadcast(block_prefix, 0); 617 618 // Update prefix with exclusive warpscan partial 619 output = scan_op(block_prefix, output); 620 } 621 else 622 { 623 // Place thread partial into shared memory raking grid 624 T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); 625 *placement_ptr = input; 626 627 CTA_SYNC(); 628 629 // Reduce parallelism down to just raking threads 630 if (linear_tid < RAKING_THREADS) 631 { 632 WarpScan warp_scan(temp_storage.warp_scan); 633 634 // Raking upsweep reduction across shared partials 635 T upsweep_partial = Upsweep(scan_op); 636 637 // Warp-synchronous scan 638 T exclusive_partial, block_aggregate; 639 warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); 640 641 // Obtain block-wide prefix in lane0, then broadcast to other lanes 642 T block_prefix = block_prefix_callback_op(block_aggregate); 643 block_prefix = warp_scan.Broadcast(block_prefix, 0); 644 645 // Update prefix with warpscan exclusive partial 646 T downsweep_prefix = scan_op(block_prefix, exclusive_partial); 647 if (linear_tid == 0) 648 downsweep_prefix = block_prefix; 649 650 // Inclusive raking downsweep scan 651 InclusiveDownsweep(scan_op, downsweep_prefix); 652 } 653 654 CTA_SYNC(); 655 656 // Grab thread prefix from shared memory 657 output = *placement_ptr; 658 } 659 } 660 661 }; 662 663 664 } // CUB namespace 665 CUB_NS_POSTFIX // Optional outer namespace(s) 666 667