1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of the NVIDIA CORPORATION nor the
13  *       names of its contributors may be used to endorse or promote products
14  *       derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
29 /**
30  * \file
31  * The cub::WarpScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.
32  */
33 
34 #pragma once
35 
36 #include "specializations/warp_scan_shfl.cuh"
37 #include "specializations/warp_scan_smem.cuh"
38 #include "../thread/thread_operators.cuh"
39 #include "../util_arch.cuh"
40 #include "../util_type.cuh"
41 #include "../util_namespace.cuh"
42 
43 /// Optional outer namespace(s)
44 CUB_NS_PREFIX
45 
46 /// CUB namespace
47 namespace cub {
48 
49 /**
50  * \addtogroup WarpModule
51  * @{
52  */
53 
54 /**
55  * \brief The WarpScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.  ![](warp_scan_logo.png)
56  *
57  * \tparam T                        The scan input/output element type
58  * \tparam LOGICAL_WARP_THREADS     <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads).  Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20).
59  * \tparam PTX_ARCH                 <b>[optional]</b> \ptxversion
60  *
61  * \par Overview
62  * - Given a list of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
63  *   produces an output list where each element is computed to be the reduction
64  *   of the elements occurring earlier in the input list.  <em>Prefix sum</em>
65  *   connotes a prefix scan with the addition operator. The term \em inclusive indicates
66  *   that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
67  *   The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
68  *   the <em>i</em><sup>th</sup> output reduction.
69  * - Supports non-commutative scan operators
70  * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads)
71  * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS
72  *
73  * \par Performance Considerations
74  * - Uses special instructions when applicable (e.g., warp \p SHFL)
75  * - Uses synchronization-free communication between warp lanes when applicable
76  * - Incurs zero bank conflicts for most types
77  * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
78  *     - Summation (<b><em>vs.</em></b> generic scan)
79  *     - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
80  *
81  * \par Simple Examples
82  * \warpcollective{WarpScan}
83  * \par
84  * The code snippet below illustrates four concurrent warp prefix sums within a block of
85  * 128 threads (one per each of the 32-thread warps).
86  * \par
87  * \code
88  * #include <cub/cub.cuh>
89  *
90  * __global__ void ExampleKernel(...)
91  * {
92  *     // Specialize WarpScan for type int
93  *     typedef cub::WarpScan<int> WarpScan;
94  *
95  *     // Allocate WarpScan shared memory for 4 warps
96  *     __shared__ typename WarpScan::TempStorage temp_storage[4];
97  *
98  *     // Obtain one input item per thread
99  *     int thread_data = ...
100  *
101  *     // Compute warp-wide prefix sums
102  *     int warp_id = threadIdx.x / 32;
103  *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
104  *
105  * \endcode
106  * \par
107  * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
108  * The corresponding output \p thread_data in each of the four warps of threads will be
109  * <tt>0, 1, 2, 3, ..., 31}</tt>.
110  *
111  * \par
112  * The code snippet below illustrates a single warp prefix sum within a block of
113  * 128 threads.
114  * \par
115  * \code
116  * #include <cub/cub.cuh>
117  *
118  * __global__ void ExampleKernel(...)
119  * {
120  *     // Specialize WarpScan for type int
121  *     typedef cub::WarpScan<int> WarpScan;
122  *
123  *     // Allocate WarpScan shared memory for one warp
124  *     __shared__ typename WarpScan::TempStorage temp_storage;
125  *     ...
126  *
127  *     // Only the first warp performs a prefix sum
128  *     if (threadIdx.x < 32)
129  *     {
130  *         // Obtain one input item per thread
131  *         int thread_data = ...
132  *
133  *         // Compute warp-wide prefix sums
134  *         WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data);
135  *
136  * \endcode
137  * \par
138  * Suppose the set of input \p thread_data across the warp of threads is <tt>{1, 1, 1, 1, ...}</tt>.
139  * The corresponding output \p thread_data will be <tt>{0, 1, 2, 3, ..., 31}</tt>.
140  *
141  */
142 template <
143     typename    T,
144     int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
145     int         PTX_ARCH                = CUB_PTX_ARCH>
146 class WarpScan
147 {
148 private:
149 
150     /******************************************************************************
151      * Constants and type definitions
152      ******************************************************************************/
153 
154     enum
155     {
156         /// Whether the logical warp size and the PTX warp size coincide
157         IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
158 
159         /// Whether the logical warp size is a power-of-two
160         IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
161 
162         /// Whether the data type is an integer (which has fully-associative addition)
163         IS_INTEGER = ((Traits<T>::CATEGORY == SIGNED_INTEGER) || (Traits<T>::CATEGORY == UNSIGNED_INTEGER))
164     };
165 
166     /// Internal specialization.  Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two)
167     typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
168         WarpScanShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
169         WarpScanSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpScan;
170 
171     /// Shared memory storage layout type for WarpScan
172     typedef typename InternalWarpScan::TempStorage _TempStorage;
173 
174 
175     /******************************************************************************
176      * Thread fields
177      ******************************************************************************/
178 
179     /// Shared storage reference
180     _TempStorage    &temp_storage;
181     unsigned int    lane_id;
182 
183 
184 
185     /******************************************************************************
186      * Public types
187      ******************************************************************************/
188 
189 public:
190 
191     /// \smemstorage{WarpScan}
192     struct TempStorage : Uninitialized<_TempStorage> {};
193 
194 
195     /******************************************************************//**
196      * \name Collective constructors
197      *********************************************************************/
198     //@{
199 
200     /**
201      * \brief Collective constructor using the specified memory allocation as temporary storage.  Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
202      */
WarpScan(TempStorage & temp_storage)203     __device__ __forceinline__ WarpScan(
204         TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
205     :
206         temp_storage(temp_storage.Alias()),
207         lane_id(IS_ARCH_WARP ?
208             LaneId() :
209             LaneId() % LOGICAL_WARP_THREADS)
210     {}
211 
212 
213     //@}  end member group
214     /******************************************************************//**
215      * \name Inclusive prefix sums
216      *********************************************************************/
217     //@{
218 
219 
220     /**
221      * \brief Computes an inclusive prefix sum across the calling warp.
222      *
223      * \par
224      * - \smemreuse
225      *
226      * \par Snippet
227      * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of
228      * 128 threads (one per each of the 32-thread warps).
229      * \par
230      * \code
231      * #include <cub/cub.cuh>
232      *
233      * __global__ void ExampleKernel(...)
234      * {
235      *     // Specialize WarpScan for type int
236      *     typedef cub::WarpScan<int> WarpScan;
237      *
238      *     // Allocate WarpScan shared memory for 4 warps
239      *     __shared__ typename WarpScan::TempStorage temp_storage[4];
240      *
241      *     // Obtain one input item per thread
242      *     int thread_data = ...
243      *
244      *     // Compute inclusive warp-wide prefix sums
245      *     int warp_id = threadIdx.x / 32;
246      *     WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data);
247      *
248      * \endcode
249      * \par
250      * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
251      * The corresponding output \p thread_data in each of the four warps of threads will be
252      * <tt>1, 2, 3, ..., 32}</tt>.
253      */
InclusiveSum(T input,T & inclusive_output)254     __device__ __forceinline__ void InclusiveSum(
255         T               input,              ///< [in] Calling thread's input item.
256         T               &inclusive_output)  ///< [out] Calling thread's output item.  May be aliased with \p input.
257     {
258         InclusiveScan(input, inclusive_output, cub::Sum());
259     }
260 
261 
262     /**
263      * \brief Computes an inclusive prefix sum across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
264      *
265      * \par
266      * - \smemreuse
267      *
268      * \par Snippet
269      * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of
270      * 128 threads (one per each of the 32-thread warps).
271      * \par
272      * \code
273      * #include <cub/cub.cuh>
274      *
275      * __global__ void ExampleKernel(...)
276      * {
277      *     // Specialize WarpScan for type int
278      *     typedef cub::WarpScan<int> WarpScan;
279      *
280      *     // Allocate WarpScan shared memory for 4 warps
281      *     __shared__ typename WarpScan::TempStorage temp_storage[4];
282      *
283      *     // Obtain one input item per thread
284      *     int thread_data = ...
285      *
286      *     // Compute inclusive warp-wide prefix sums
287      *     int warp_aggregate;
288      *     int warp_id = threadIdx.x / 32;
289      *     WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate);
290      *
291      * \endcode
292      * \par
293      * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
294      * The corresponding output \p thread_data in each of the four warps of threads will be
295      * <tt>1, 2, 3, ..., 32}</tt>.  Furthermore, \p warp_aggregate for all threads in all warps will be \p 32.
296      */
InclusiveSum(T input,T & inclusive_output,T & warp_aggregate)297     __device__ __forceinline__ void InclusiveSum(
298         T               input,              ///< [in] Calling thread's input item.
299         T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
300         T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
301     {
302         InclusiveScan(input, inclusive_output, cub::Sum(), warp_aggregate);
303     }
304 
305 
306     //@}  end member group
307     /******************************************************************//**
308      * \name Exclusive prefix sums
309      *********************************************************************/
310     //@{
311 
312 
313     /**
314      * \brief Computes an exclusive prefix sum across the calling warp.  The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in <em>thread</em><sub>0</sub>.
315      *
316      * \par
317      *  - \identityzero
318      *  - \smemreuse
319      *
320      * \par Snippet
321      * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of
322      * 128 threads (one per each of the 32-thread warps).
323      * \par
324      * \code
325      * #include <cub/cub.cuh>
326      *
327      * __global__ void ExampleKernel(...)
328      * {
329      *     // Specialize WarpScan for type int
330      *     typedef cub::WarpScan<int> WarpScan;
331      *
332      *     // Allocate WarpScan shared memory for 4 warps
333      *     __shared__ typename WarpScan::TempStorage temp_storage[4];
334      *
335      *     // Obtain one input item per thread
336      *     int thread_data = ...
337      *
338      *     // Compute exclusive warp-wide prefix sums
339      *     int warp_id = threadIdx.x / 32;
340      *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
341      *
342      * \endcode
343      * \par
344      * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
345      * The corresponding output \p thread_data in each of the four warps of threads will be
346      * <tt>0, 1, 2, ..., 31}</tt>.
347      *
348      */
ExclusiveSum(T input,T & exclusive_output)349     __device__ __forceinline__ void ExclusiveSum(
350         T               input,              ///< [in] Calling thread's input item.
351         T               &exclusive_output)  ///< [out] Calling thread's output item.  May be aliased with \p input.
352     {
353         T initial_value = 0;
354         ExclusiveScan(input, exclusive_output, initial_value, cub::Sum());
355     }
356 
357 
358     /**
359      * \brief Computes an exclusive prefix sum across the calling warp.  The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in <em>thread</em><sub>0</sub>.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
360      *
361      * \par
362      *  - \identityzero
363      *  - \smemreuse
364      *
365      * \par Snippet
366      * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of
367      * 128 threads (one per each of the 32-thread warps).
368      * \par
369      * \code
370      * #include <cub/cub.cuh>
371      *
372      * __global__ void ExampleKernel(...)
373      * {
374      *     // Specialize WarpScan for type int
375      *     typedef cub::WarpScan<int> WarpScan;
376      *
377      *     // Allocate WarpScan shared memory for 4 warps
378      *     __shared__ typename WarpScan::TempStorage temp_storage[4];
379      *
380      *     // Obtain one input item per thread
381      *     int thread_data = ...
382      *
383      *     // Compute exclusive warp-wide prefix sums
384      *     int warp_aggregate;
385      *     int warp_id = threadIdx.x / 32;
386      *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data, warp_aggregate);
387      *
388      * \endcode
389      * \par
390      * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
391      * The corresponding output \p thread_data in each of the four warps of threads will be
392      * <tt>0, 1, 2, ..., 31}</tt>.  Furthermore, \p warp_aggregate for all threads in all warps will be \p 32.
393      */
ExclusiveSum(T input,T & exclusive_output,T & warp_aggregate)394     __device__ __forceinline__ void ExclusiveSum(
395         T               input,              ///< [in] Calling thread's input item.
396         T               &exclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
397         T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
398     {
399         T initial_value = 0;
400         ExclusiveScan(input, exclusive_output, initial_value, cub::Sum(), warp_aggregate);
401     }
402 
403 
404     //@}  end member group
405     /******************************************************************//**
406      * \name Inclusive prefix scans
407      *********************************************************************/
408     //@{
409 
410     /**
411      * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.
412      *
413      * \par
414      *  - \smemreuse
415      *
416      * \par Snippet
417      * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of
418      * 128 threads (one per each of the 32-thread warps).
419      * \par
420      * \code
421      * #include <cub/cub.cuh>
422      *
423      * __global__ void ExampleKernel(...)
424      * {
425      *     // Specialize WarpScan for type int
426      *     typedef cub::WarpScan<int> WarpScan;
427      *
428      *     // Allocate WarpScan shared memory for 4 warps
429      *     __shared__ typename WarpScan::TempStorage temp_storage[4];
430      *
431      *     // Obtain one input item per thread
432      *     int thread_data = ...
433      *
434      *     // Compute inclusive warp-wide prefix max scans
435      *     int warp_id = threadIdx.x / 32;
436      *     WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cub::Max());
437      *
438      * \endcode
439      * \par
440      * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
441      * The corresponding output \p thread_data in the first warp would be
442      * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
443      *
444      * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
445      */
446     template <typename ScanOp>
InclusiveScan(T input,T & inclusive_output,ScanOp scan_op)447     __device__ __forceinline__ void InclusiveScan(
448         T               input,              ///< [in] Calling thread's input item.
449         T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
450         ScanOp          scan_op)            ///< [in] Binary scan operator
451     {
452         InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op);
453     }
454 
455 
456     /**
457      * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
458      *
459      * \par
460      * - \smemreuse
461      *
462      * \par Snippet
463      * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of
464      * 128 threads (one per each of the 32-thread warps).
465      * \par
466      * \code
467      * #include <cub/cub.cuh>
468      *
469      * __global__ void ExampleKernel(...)
470      * {
471      *     // Specialize WarpScan for type int
472      *     typedef cub::WarpScan<int> WarpScan;
473      *
474      *     // Allocate WarpScan shared memory for 4 warps
475      *     __shared__ typename WarpScan::TempStorage temp_storage[4];
476      *
477      *     // Obtain one input item per thread
478      *     int thread_data = ...
479      *
480      *     // Compute inclusive warp-wide prefix max scans
481      *     int warp_aggregate;
482      *     int warp_id = threadIdx.x / 32;
483      *     WarpScan(temp_storage[warp_id]).InclusiveScan(
484      *         thread_data, thread_data, cub::Max(), warp_aggregate);
485      *
486      * \endcode
487      * \par
488      * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
489      * The corresponding output \p thread_data in the first warp would be
490      * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
491      * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
492      * in the second warp, etc.
493      *
494      * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
495      */
496     template <typename ScanOp>
InclusiveScan(T input,T & inclusive_output,ScanOp scan_op,T & warp_aggregate)497     __device__ __forceinline__ void InclusiveScan(
498         T               input,              ///< [in] Calling thread's input item.
499         T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
500         ScanOp          scan_op,            ///< [in] Binary scan operator
501         T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
502     {
503         InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate);
504     }
505 
506 
507     //@}  end member group
508     /******************************************************************//**
509      * \name Exclusive prefix scans
510      *********************************************************************/
511     //@{
512 
513     /**
514      * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Because no initial value is supplied, the \p output computed for <em>warp-lane</em><sub>0</sub> is undefined.
515      *
516      * \par
517      * - \smemreuse
518      *
519      * \par Snippet
520      * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
521      * 128 threads (one per each of the 32-thread warps).
522      * \par
523      * \code
524      * #include <cub/cub.cuh>
525      *
526      * __global__ void ExampleKernel(...)
527      * {
528      *     // Specialize WarpScan for type int
529      *     typedef cub::WarpScan<int> WarpScan;
530      *
531      *     // Allocate WarpScan shared memory for 4 warps
532      *     __shared__ typename WarpScan::TempStorage temp_storage[4];
533      *
534      *     // Obtain one input item per thread
535      *     int thread_data = ...
536      *
537      *     // Compute exclusive warp-wide prefix max scans
538      *     int warp_id = threadIdx.x / 32;
539      *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max());
540      *
541      * \endcode
542      * \par
543      * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
544      * The corresponding output \p thread_data in the first warp would be
545      * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
546      * (The output \p thread_data in warp lane<sub>0</sub> is undefined.)
547      *
548      * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
549      */
550     template <typename ScanOp>
ExclusiveScan(T input,T & exclusive_output,ScanOp scan_op)551     __device__ __forceinline__ void ExclusiveScan(
552         T               input,              ///< [in] Calling thread's input item.
553         T               &exclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
554         ScanOp          scan_op)            ///< [in] Binary scan operator
555     {
556         InternalWarpScan internal(temp_storage);
557 
558         T inclusive_output;
559         internal.InclusiveScan(input, inclusive_output, scan_op);
560 
561         internal.Update(
562             input,
563             inclusive_output,
564             exclusive_output,
565             scan_op,
566             Int2Type<IS_INTEGER>());
567     }
568 
569 
570     /**
571      * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.
572      *
573      * \par
574      * - \smemreuse
575      *
576      * \par Snippet
577      * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
578      * 128 threads (one per each of the 32-thread warps).
579      * \par
580      * \code
581      * #include <cub/cub.cuh>
582      *
583      * __global__ void ExampleKernel(...)
584      * {
585      *     // Specialize WarpScan for type int
586      *     typedef cub::WarpScan<int> WarpScan;
587      *
588      *     // Allocate WarpScan shared memory for 4 warps
589      *     __shared__ typename WarpScan::TempStorage temp_storage[4];
590      *
591      *     // Obtain one input item per thread
592      *     int thread_data = ...
593      *
594      *     // Compute exclusive warp-wide prefix max scans
595      *     int warp_id = threadIdx.x / 32;
596      *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
597      *
598      * \endcode
599      * \par
600      * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
601      * The corresponding output \p thread_data in the first warp would be
602      * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc.
603      *
604      * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
605      */
606     template <typename ScanOp>
ExclusiveScan(T input,T & exclusive_output,T initial_value,ScanOp scan_op)607     __device__ __forceinline__ void ExclusiveScan(
608         T               input,              ///< [in] Calling thread's input item.
609         T               &exclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
610         T               initial_value,      ///< [in] Initial value to seed the exclusive scan
611         ScanOp          scan_op)            ///< [in] Binary scan operator
612     {
613         InternalWarpScan internal(temp_storage);
614 
615         T inclusive_output;
616         internal.InclusiveScan(input, inclusive_output, scan_op);
617 
618         internal.Update(
619             input,
620             inclusive_output,
621             exclusive_output,
622             scan_op,
623             initial_value,
624             Int2Type<IS_INTEGER>());
625     }
626 
627 
628     /**
629      * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Because no initial value is supplied, the \p output computed for <em>warp-lane</em><sub>0</sub> is undefined.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
630      *
631      * \par
632      * - \smemreuse
633      *
634      * \par Snippet
635      * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
636      * 128 threads (one per each of the 32-thread warps).
637      * \par
638      * \code
639      * #include <cub/cub.cuh>
640      *
641      * __global__ void ExampleKernel(...)
642      * {
643      *     // Specialize WarpScan for type int
644      *     typedef cub::WarpScan<int> WarpScan;
645      *
646      *     // Allocate WarpScan shared memory for 4 warps
647      *     __shared__ typename WarpScan::TempStorage temp_storage[4];
648      *
649      *     // Obtain one input item per thread
650      *     int thread_data = ...
651      *
652      *     // Compute exclusive warp-wide prefix max scans
653      *     int warp_aggregate;
654      *     int warp_id = threadIdx.x / 32;
655      *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max(), warp_aggregate);
656      *
657      * \endcode
658      * \par
659      * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
660      * The corresponding output \p thread_data in the first warp would be
661      * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
662      * (The output \p thread_data in warp lane<sub>0</sub> is undefined.)  Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
663      * in the second warp, etc.
664      *
665      * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
666      */
667     template <typename ScanOp>
ExclusiveScan(T input,T & exclusive_output,ScanOp scan_op,T & warp_aggregate)668     __device__ __forceinline__ void ExclusiveScan(
669         T               input,              ///< [in] Calling thread's input item.
670         T               &exclusive_output,   ///< [out] Calling thread's output item.  May be aliased with \p input.
671         ScanOp          scan_op,            ///< [in] Binary scan operator
672         T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
673     {
674         InternalWarpScan internal(temp_storage);
675 
676         T inclusive_output;
677         internal.InclusiveScan(input, inclusive_output, scan_op);
678 
679         internal.Update(
680             input,
681             inclusive_output,
682             exclusive_output,
683             warp_aggregate,
684             scan_op,
685             Int2Type<IS_INTEGER>());
686     }
687 
688 
689     /**
690      * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
691      *
692      * \par
693      * - \smemreuse
694      *
695      * \par Snippet
696      * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
697      * 128 threads (one per each of the 32-thread warps).
698      * \par
699      * \code
700      * #include <cub/cub.cuh>
701      *
702      * __global__ void ExampleKernel(...)
703      * {
704      *     // Specialize WarpScan for type int
705      *     typedef cub::WarpScan<int> WarpScan;
706      *
707      *     // Allocate WarpScan shared memory for 4 warps
708      *     __shared__ typename WarpScan::TempStorage temp_storage[4];
709      *
710      *     // Obtain one input item per thread
711      *     int thread_data = ...
712      *
713      *     // Compute exclusive warp-wide prefix max scans
714      *     int warp_aggregate;
715      *     int warp_id = threadIdx.x / 32;
716      *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate);
717      *
718      * \endcode
719      * \par
720      * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
721      * The corresponding output \p thread_data in the first warp would be
722      * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc.
723      * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
724      * in the second warp, etc.
725      *
726      * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
727      */
728     template <typename ScanOp>
ExclusiveScan(T input,T & exclusive_output,T initial_value,ScanOp scan_op,T & warp_aggregate)729     __device__ __forceinline__ void ExclusiveScan(
730         T               input,              ///< [in] Calling thread's input item.
731         T               &exclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
732         T               initial_value,      ///< [in] Initial value to seed the exclusive scan
733         ScanOp          scan_op,            ///< [in] Binary scan operator
734         T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
735     {
736         InternalWarpScan internal(temp_storage);
737 
738         T inclusive_output;
739         internal.InclusiveScan(input, inclusive_output, scan_op);
740 
741         internal.Update(
742             input,
743             inclusive_output,
744             exclusive_output,
745             warp_aggregate,
746             scan_op,
747             initial_value,
748             Int2Type<IS_INTEGER>());
749     }
750 
751 
752     //@}  end member group
753     /******************************************************************//**
754      * \name Combination (inclusive & exclusive) prefix scans
755      *********************************************************************/
756     //@{
757 
758 
759     /**
760      * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp.  Because no initial value is supplied, the \p exclusive_output computed for <em>warp-lane</em><sub>0</sub> is undefined.
761      *
762      * \par
763      * - \smemreuse
764      *
765      * \par Snippet
766      * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
767      * 128 threads (one per each of the 32-thread warps).
768      * \par
769      * \code
770      * #include <cub/cub.cuh>
771      *
772      * __global__ void ExampleKernel(...)
773      * {
774      *     // Specialize WarpScan for type int
775      *     typedef cub::WarpScan<int> WarpScan;
776      *
777      *     // Allocate WarpScan shared memory for 4 warps
778      *     __shared__ typename WarpScan::TempStorage temp_storage[4];
779      *
780      *     // Obtain one input item per thread
781      *     int thread_data = ...
782      *
783      *     // Compute exclusive warp-wide prefix max scans
784      *     int inclusive_partial, exclusive_partial;
785      *     WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, cub::Max());
786      *
787      * \endcode
788      * \par
789      * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
790      * The corresponding output \p inclusive_partial in the first warp would be
791      * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
792      * The corresponding output \p exclusive_partial in the first warp would be
793      * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
794      * (The output \p thread_data in warp lane<sub>0</sub> is undefined.)
795      *
796      * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
797      */
798     template <typename ScanOp>
Scan(T input,T & inclusive_output,T & exclusive_output,ScanOp scan_op)799     __device__ __forceinline__ void Scan(
800         T               input,              ///< [in] Calling thread's input item.
801         T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
802         T               &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
803         ScanOp          scan_op)            ///< [in] Binary scan operator
804     {
805         InternalWarpScan internal(temp_storage);
806 
807         internal.InclusiveScan(input, inclusive_output, scan_op);
808 
809         internal.Update(
810             input,
811             inclusive_output,
812             exclusive_output,
813             scan_op,
814             Int2Type<IS_INTEGER>());
815     }
816 
817 
818     /**
819      * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp.
820      *
821      * \par
822      *  - \smemreuse
823      *
824      * \par Snippet
825      * The code snippet below illustrates four concurrent warp-wide prefix max scans within a block of
826      * 128 threads (one per each of the 32-thread warps).
827      * \par
828      * \code
829      * #include <cub/cub.cuh>
830      *
831      * __global__ void ExampleKernel(...)
832      * {
833      *     // Specialize WarpScan for type int
834      *     typedef cub::WarpScan<int> WarpScan;
835      *
836      *     // Allocate WarpScan shared memory for 4 warps
837      *     __shared__ typename WarpScan::TempStorage temp_storage[4];
838      *
839      *     // Obtain one input item per thread
840      *     int thread_data = ...
841      *
842      *     // Compute inclusive warp-wide prefix max scans
843      *     int warp_id = threadIdx.x / 32;
844      *     int inclusive_partial, exclusive_partial;
845      *     WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, INT_MIN, cub::Max());
846      *
847      * \endcode
848      * \par
849      * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
850      * The corresponding output \p inclusive_partial in the first warp would be
851      * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
852      * The corresponding output \p exclusive_partial in the first warp would be
853      * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc.
854      *
855      * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
856      */
857     template <typename ScanOp>
Scan(T input,T & inclusive_output,T & exclusive_output,T initial_value,ScanOp scan_op)858     __device__ __forceinline__ void Scan(
859         T               input,              ///< [in] Calling thread's input item.
860         T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
861         T               &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
862         T               initial_value,      ///< [in] Initial value to seed the exclusive scan
863         ScanOp          scan_op)            ///< [in] Binary scan operator
864     {
865         InternalWarpScan internal(temp_storage);
866 
867         internal.InclusiveScan(input, inclusive_output, scan_op);
868 
869         internal.Update(
870             input,
871             inclusive_output,
872             exclusive_output,
873             scan_op,
874             initial_value,
875             Int2Type<IS_INTEGER>());
876     }
877 
878 
879 
880     //@}  end member group
881     /******************************************************************//**
882      * \name Data exchange
883      *********************************************************************/
884     //@{
885 
886     /**
887      * \brief Broadcast the value \p input from <em>warp-lane</em><sub><tt>src_lane</tt></sub> to all lanes in the warp
888      *
889      * \par
890      * - \smemreuse
891      *
892      * \par Snippet
893      * The code snippet below illustrates the warp-wide broadcasts of values from
894      * lanes<sub>0</sub> in each of four warps to all other threads in those warps.
895      * \par
896      * \code
897      * #include <cub/cub.cuh>
898      *
899      * __global__ void ExampleKernel(...)
900      * {
901      *     // Specialize WarpScan for type int
902      *     typedef cub::WarpScan<int> WarpScan;
903      *
904      *     // Allocate WarpScan shared memory for 4 warps
905      *     __shared__ typename WarpScan::TempStorage temp_storage[4];
906      *
907      *     // Obtain one input item per thread
908      *     int thread_data = ...
909      *
910      *     // Broadcast from lane0 in each warp to all other threads in the warp
911      *     int warp_id = threadIdx.x / 32;
912      *     thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0);
913      *
914      * \endcode
915      * \par
916      * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
917      * The corresponding output \p thread_data will be
918      * <tt>{0, 0, ..., 0}</tt> in warp<sub>0</sub>,
919      * <tt>{32, 32, ..., 32}</tt> in warp<sub>1</sub>,
920      * <tt>{64, 64, ..., 64}</tt> in warp<sub>2</sub>, etc.
921      */
Broadcast(T input,unsigned int src_lane)922     __device__ __forceinline__ T Broadcast(
923         T               input,              ///< [in] The value to broadcast
924         unsigned int    src_lane)           ///< [in] Which warp lane is to do the broadcasting
925     {
926         return InternalWarpScan(temp_storage).Broadcast(input, src_lane);
927     }
928 
929     //@}  end member group
930 
931 };
932 
933 /** @} */       // end group WarpModule
934 
935 }               // CUB namespace
936 CUB_NS_POSTFIX  // Optional outer namespace(s)
937