/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/


/**
 * \file
 * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
 */

#pragma once

#include "../../util_ptx.cuh"
#include "../../util_arch.cuh"
#include "../../block/block_raking_layout.cuh"
#include "../../thread/thread_reduce.cuh"
#include "../../thread/thread_scan.cuh"
#include "../../warp/warp_scan.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
 */
template <
    typename    T,              ///< Data type being scanned
    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
    bool        MEMOIZE,        ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure
    int         PTX_ARCH>       ///< The PTX compute capability for which to specialize this collective
struct BlockScanRaking
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };

    /// Layout type for padded thread block raking grid
    typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;

    /// Constants
    enum
    {
        /// Number of raking threads
        RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,

        /// Number of raking elements per warp synchronous raking thread
        SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,

        /// Cooperative work can be entirely warp synchronous
        WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS),
    };
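
    // Illustrative sizing (a sketch; values assume BLOCK_THREADS == 128 raking
    // over a single 32-thread warp, while the actual values are supplied by
    // BlockRakingLayout for the target PTX_ARCH):
    //
    //     RAKING_THREADS   = 32     // one warp rakes the shared grid
    //     SEGMENT_LENGTH   = 4      // each raking thread owns 4 shared partials
    //     WARP_SYNCHRONOUS = false  // 128 != 32, so the full raking path is used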

    ///  WarpScan utility type
    typedef WarpScan<T, RAKING_THREADS, PTX_ARCH> WarpScan;

    /// Shared memory storage layout type
    struct _TempStorage
    {
        typename WarpScan::TempStorage              warp_scan;          ///< Buffer for warp-synchronous scan
        typename BlockRakingLayout::TempStorage     raking_grid;        ///< Padded thread block raking grid
        T                                           block_aggregate;    ///< Block aggregate
    };


    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};


    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------

    // Thread fields
    _TempStorage    &temp_storage;
    unsigned int    linear_tid;
    T               cached_segment[SEGMENT_LENGTH];


    //---------------------------------------------------------------------
    // Utility methods
    //---------------------------------------------------------------------

    /// Templated reduction
    template <int ITERATION, typename ScanOp>
    __device__ __forceinline__ T GuardedReduce(
        T*                  raking_ptr,         ///< [in] Input array
        ScanOp              scan_op,            ///< [in] Binary reduction operator
        T                   raking_partial,     ///< [in] Prefix to seed reduction with
        Int2Type<ITERATION> /*iteration*/)
    {
        if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS))
        {
            T addend = raking_ptr[ITERATION];
            raking_partial = scan_op(raking_partial, addend);
        }

        return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type<ITERATION + 1>());
    }


    /// Templated reduction (base case)
    template <typename ScanOp>
    __device__ __forceinline__ T GuardedReduce(
        T*                          /*raking_ptr*/,    ///< [in] Input array
        ScanOp                      /*scan_op*/,       ///< [in] Binary reduction operator
        T                           raking_partial,    ///< [in] Prefix to seed reduction with
        Int2Type<SEGMENT_LENGTH>    /*iteration*/)
    {
        return raking_partial;
    }
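
    // For reference, when invoked from Upsweep() with Int2Type<1> and
    // SEGMENT_LENGTH == 4, the recursion above unrolls into the equivalent of
    // the following (a sketch that omits the bounds guard):
    //
    //     raking_partial = scan_op(raking_partial, raking_ptr[1]);
    //     raking_partial = scan_op(raking_partial, raking_ptr[2]);
    //     raking_partial = scan_op(raking_partial, raking_ptr[3]);
    //     return raking_partial;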


    /// Templated copy
    template <int ITERATION>
    __device__ __forceinline__ void CopySegment(
        T*                  out,            ///< [out] Out array
        T*                  in,             ///< [in] Input array
        Int2Type<ITERATION> /*iteration*/)
    {
        out[ITERATION] = in[ITERATION];
        CopySegment(out, in, Int2Type<ITERATION + 1>());
    }


    /// Templated copy (base case)
    __device__ __forceinline__ void CopySegment(
        T*                  /*out*/,            ///< [out] Out array
        T*                  /*in*/,             ///< [in] Input array
        Int2Type<SEGMENT_LENGTH> /*iteration*/)
    {}


    /// Performs upsweep raking reduction, returning the aggregate
    template <typename ScanOp>
    __device__ __forceinline__ T Upsweep(
        ScanOp scan_op)
    {
        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);

        // Read data into registers
        CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());

        T raking_partial = cached_segment[0];

        return GuardedReduce(cached_segment, scan_op, raking_partial, Int2Type<1>());
    }


    /// Performs exclusive downsweep raking scan
    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveDownsweep(
        ScanOp          scan_op,
        T               raking_partial,
        bool            apply_prefix = true)
    {
        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);

        // Read data back into registers
        if (!MEMOIZE)
        {
            CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
        }

        internal::ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);

        // Write data back to smem
        CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
    }


    /// Performs inclusive downsweep raking scan
    template <typename ScanOp>
    __device__ __forceinline__ void InclusiveDownsweep(
        ScanOp          scan_op,
        T               raking_partial,
        bool            apply_prefix = true)
    {
        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);

        // Read data back into registers
        if (!MEMOIZE)
        {
            CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
        }

        internal::ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);

        // Write data back to smem
        CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
    }
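
    // Worked example of the two downsweeps (illustrative; '+' denotes scan_op).
    // Given a raking thread's segment {a, b, c, d} and raking prefix p:
    //
    //     ExclusiveDownsweep writes back {p, p+a, p+a+b, p+a+b+c}
    //     InclusiveDownsweep writes back {p+a, p+a+b, p+a+b+c, p+a+b+c+d}
    //
    // When apply_prefix is false (e.g., raking thread 0 of an exclusive scan
    // with no initial value), p is not folded in, which is why thread0's
    // output is documented as undefined in that case.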


    //---------------------------------------------------------------------
    // Constructors
    //---------------------------------------------------------------------

    /// Constructor
    __device__ __forceinline__ BlockScanRaking(
        TempStorage &temp_storage)
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}
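
    // A minimal usage sketch (illustrative only; this internal specialization
    // is normally reached through cub::BlockScan's BLOCK_SCAN_RAKING algorithm
    // rather than instantiated directly).  Assumes a 1D block of 128 threads:
    //
    //     __global__ void ExampleKernel(int *d_data)
    //     {
    //         typedef cub::BlockScanRaking<int, 128, 1, 1, false, CUB_PTX_ARCH> BlockScanT;
    //
    //         // Shared memory for the collective
    //         __shared__ typename BlockScanT::TempStorage temp_storage;
    //
    //         int thread_data = d_data[threadIdx.x];
    //
    //         // Exclusive prefix sum across the block, seeded with 0
    //         BlockScanT(temp_storage).ExclusiveScan(thread_data, thread_data, 0, cub::Sum());
    //
    //         d_data[threadIdx.x] = thread_data;
    //     }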


    //---------------------------------------------------------------------
    // Exclusive scans
    //---------------------------------------------------------------------

    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveScan(
        T               input,                          ///< [in] Calling thread's input item
        T               &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
        ScanOp          scan_op)                        ///< [in] Binary scan operator
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op);
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Warp-synchronous scan
                T exclusive_partial;
                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);

                // Exclusive raking downsweep scan
                ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
            }

            CTA_SYNC();

            // Grab thread prefix from shared memory
            exclusive_output = *placement_ptr;
        }
    }

    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveScan(
        T               input,              ///< [in] Calling thread's input item
        T               &output,            ///< [out] Calling thread's output item (may be aliased to \p input)
        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
        ScanOp          scan_op)            ///< [in] Binary scan operator
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op);
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Exclusive warp-synchronous scan
                T exclusive_partial;
                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op);

                // Exclusive raking downsweep scan
                ExclusiveDownsweep(scan_op, exclusive_partial);
            }

            CTA_SYNC();

            // Grab exclusive partial from shared memory
            output = *placement_ptr;
        }
    }
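
    // For example (illustrative), an exclusive prefix sum (scan_op = cub::Sum())
    // over all-ones input with initial_value == 0 produces
    // {0, 1, 2, ..., BLOCK_THREADS - 1} across the block's threads.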


    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveScan(
        T               input,                          ///< [in] Calling thread's input item
        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
        ScanOp          scan_op,                        ///< [in] Binary scan operator
        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate);
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Warp-synchronous scan
                T inclusive_partial;
                T exclusive_partial;
                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);

                // Exclusive raking downsweep scan
                ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));

                // Broadcast aggregate to all threads
                if (linear_tid == RAKING_THREADS - 1)
                    temp_storage.block_aggregate = inclusive_partial;
            }

            CTA_SYNC();

            // Grab thread prefix from shared memory
            output = *placement_ptr;

            // Retrieve block aggregate
            block_aggregate = temp_storage.block_aggregate;
        }
    }


    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveScan(
        T               input,              ///< [in] Calling thread's input item
        T               &output,            ///< [out] Calling thread's output item (may be aliased to \p input)
        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
        ScanOp          scan_op,            ///< [in] Binary scan operator
        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Warp-synchronous scan
                T exclusive_partial;
                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate);

                // Exclusive raking downsweep scan
                ExclusiveDownsweep(scan_op, exclusive_partial);

                // Broadcast aggregate to other threads
                if (linear_tid == 0)
                    temp_storage.block_aggregate = block_aggregate;
            }

            CTA_SYNC();

            // Grab exclusive partial from shared memory
            output = *placement_ptr;

            // Retrieve block aggregate
            block_aggregate = temp_storage.block_aggregate;
        }
    }


    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
    template <
        typename ScanOp,
        typename BlockPrefixCallbackOp>
    __device__ __forceinline__ void ExclusiveScan(
        T                       input,                          ///< [in] Calling thread's input item
        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
        ScanOp                  scan_op,                        ///< [in] Binary scan operator
        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            T block_aggregate;
            WarpScan warp_scan(temp_storage.warp_scan);
            warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate);

            // Obtain warp-wide prefix in lane0, then broadcast to other lanes
            T block_prefix = block_prefix_callback_op(block_aggregate);
            block_prefix = warp_scan.Broadcast(block_prefix, 0);

            output = scan_op(block_prefix, output);
            if (linear_tid == 0)
                output = block_prefix;
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                WarpScan warp_scan(temp_storage.warp_scan);

                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Warp-synchronous scan
                T exclusive_partial, block_aggregate;
                warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);

                // Obtain block-wide prefix in lane0, then broadcast to other lanes
                T block_prefix = block_prefix_callback_op(block_aggregate);
                block_prefix = warp_scan.Broadcast(block_prefix, 0);

                // Update prefix with warpscan exclusive partial
                T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
                if (linear_tid == 0)
                    downsweep_prefix = block_prefix;

                // Exclusive raking downsweep scan
                ExclusiveDownsweep(scan_op, downsweep_prefix);
            }

            CTA_SYNC();

            // Grab thread prefix from shared memory
            output = *placement_ptr;
        }
    }
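
    // A sketch of a compatible prefix functor (hypothetical name; the shape
    // follows the BlockPrefixCallbackOp concept: warp0 invokes it with the
    // block aggregate, and lane0's return value seeds the block's scan):
    //
    //     struct RunningPrefix
    //     {
    //         int running_total;
    //
    //         __device__ RunningPrefix(int initial) : running_total(initial) {}
    //
    //         // Returns the old total as this tile's block-wide prefix and
    //         // folds the new block aggregate into the running total
    //         __device__ int operator()(int block_aggregate)
    //         {
    //             int old_prefix = running_total;
    //             running_total += block_aggregate;
    //             return old_prefix;
    //         }
    //     };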


    //---------------------------------------------------------------------
    // Inclusive scans
    //---------------------------------------------------------------------

    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
    template <typename ScanOp>
    __device__ __forceinline__ void InclusiveScan(
        T               input,                          ///< [in] Calling thread's input item
        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
        ScanOp          scan_op)                        ///< [in] Binary scan operator
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op);
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Exclusive warp-synchronous scan
                T exclusive_partial;
                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);

                // Inclusive raking downsweep scan
                InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
            }

            CTA_SYNC();

            // Grab thread prefix from shared memory
            output = *placement_ptr;
        }
    }
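
    // For example (illustrative), an inclusive prefix sum (scan_op = cub::Sum())
    // over all-ones input produces {1, 2, 3, ..., BLOCK_THREADS} across the
    // block's threads.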


    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
    template <typename ScanOp>
    __device__ __forceinline__ void InclusiveScan(
        T               input,                          ///< [in] Calling thread's input item
        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
        ScanOp          scan_op,                        ///< [in] Binary scan operator
        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate);
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Warp-synchronous scan
                T inclusive_partial;
                T exclusive_partial;
                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);

                // Inclusive raking downsweep scan
                InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));

                // Broadcast aggregate to all threads
                if (linear_tid == RAKING_THREADS - 1)
                    temp_storage.block_aggregate = inclusive_partial;
            }

            CTA_SYNC();

            // Grab thread prefix from shared memory
            output = *placement_ptr;

            // Retrieve block aggregate
            block_aggregate = temp_storage.block_aggregate;
        }
    }


    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
    template <
        typename ScanOp,
        typename BlockPrefixCallbackOp>
    __device__ __forceinline__ void InclusiveScan(
        T                       input,                          ///< [in] Calling thread's input item
        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
        ScanOp                  scan_op,                        ///< [in] Binary scan operator
        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            T block_aggregate;
            WarpScan warp_scan(temp_storage.warp_scan);
            warp_scan.InclusiveScan(input, output, scan_op, block_aggregate);

            // Obtain warp-wide prefix in lane0, then broadcast to other lanes
            T block_prefix = block_prefix_callback_op(block_aggregate);
            block_prefix = warp_scan.Broadcast(block_prefix, 0);
            // Update the inclusive output with the block-wide prefix
            output = scan_op(block_prefix, output);
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                WarpScan warp_scan(temp_storage.warp_scan);

                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Warp-synchronous scan
                T exclusive_partial, block_aggregate;
                warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);

                // Obtain block-wide prefix in lane0, then broadcast to other lanes
                T block_prefix = block_prefix_callback_op(block_aggregate);
                block_prefix = warp_scan.Broadcast(block_prefix, 0);

                // Update prefix with warpscan exclusive partial
                T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
                if (linear_tid == 0)
                    downsweep_prefix = block_prefix;

                // Inclusive raking downsweep scan
                InclusiveDownsweep(scan_op, downsweep_prefix);
            }

            CTA_SYNC();

            // Grab thread prefix from shared memory
            output = *placement_ptr;
        }
    }
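
    // Sketch of scanning successive tiles with a persistent prefix functor
    // (hypothetical kernel; assumes the RunningPrefix functor sketched above
    // and a 1D block of 128 threads):
    //
    //     __global__ void MultiTileKernel(int *d_data, int num_tiles)
    //     {
    //         typedef cub::BlockScanRaking<int, 128, 1, 1, false, CUB_PTX_ARCH> BlockScanT;
    //         __shared__ typename BlockScanT::TempStorage temp_storage;
    //
    //         RunningPrefix prefix_op(0);
    //         for (int tile = 0; tile < num_tiles; ++tile)
    //         {
    //             int item = d_data[tile * 128 + threadIdx.x];
    //             BlockScanT(temp_storage).InclusiveScan(item, item, cub::Sum(), prefix_op);
    //             __syncthreads();    // temp_storage is reused across iterations
    //             d_data[tile * 128 + threadIdx.x] = item;
    //         }
    //     }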

};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
