/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
 */

#pragma once

#include "../../thread/thread_operators.cuh"
#include "../../thread/thread_load.cuh"
#include "../../thread/thread_store.cuh"
#include "../../util_type.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
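 *
 * \par
 * A minimal usage sketch (illustrative only: WarpReduceSmem is an internal
 * specialization detail normally reached through cub::WarpReduce; the
 * snippet assumes one 32-thread logical warp per block and the
 * CUB_PTX_ARCH macro from util_arch.cuh):
 *
 * \par
 * \code
 * typedef cub::WarpReduceSmem<int, 32, CUB_PTX_ARCH> WarpReduceSmemT;
 *
 * __shared__ typename WarpReduceSmemT::TempStorage temp_storage;
 *
 * // Sum thread_data across the warp (all 32 lanes contribute valid items)
 * int warp_aggregate = WarpReduceSmemT(temp_storage).Reduce<true>(
 *     thread_data, 32, cub::Sum());
 * \endcode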
 */
template <
    typename    T,                      ///< Data type being reduced
    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
    int         PTX_ARCH>               ///< The PTX compute capability for which to specialize this collective
struct WarpReduceSmem
{
    /******************************************************************************
     * Constants and type definitions
     ******************************************************************************/

    enum
    {
        /// Whether the logical warp size and the PTX warp size coincide
        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),

        /// Whether the logical warp size is a power-of-two
        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,

        /// The number of warp scan steps
        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,

        /// The number of threads in half a warp
        HALF_WARP_THREADS = 1 << (STEPS - 1),

        /// The number of shared memory elements per warp
        WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS,

        /// FlagT status (when not using ballot)
        UNSET   = 0x0,  // Is initially unset
        SET     = 0x1,  // Is initially set
        SEEN    = 0x2,  // Has seen another head flag from a successor peer
    };
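
    // Example constants (derived): for LOGICAL_WARP_THREADS == 16,
    // STEPS == 4, HALF_WARP_THREADS == 8, and WARP_SMEM_ELEMENTS == 24.
    // The extra half-warp of buffer keeps the peer load at
    // [lane_id + OFFSET] in bounds even at the largest step offset.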

    /// Shared memory flag type
    typedef unsigned char SmemFlag;

    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
    struct _TempStorage
    {
        T           reduce[WARP_SMEM_ELEMENTS];
        SmemFlag    flags[WARP_SMEM_ELEMENTS];
    };

    // Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    _TempStorage    &temp_storage;
    unsigned int    lane_id;
    unsigned int    member_mask;


    /******************************************************************************
     * Construction
     ******************************************************************************/

    /// Constructor
    __device__ __forceinline__ WarpReduceSmem(
        TempStorage     &temp_storage)
    :
        temp_storage(temp_storage.Alias()),

        lane_id(IS_ARCH_WARP ?
            LaneId() :
            LaneId() % LOGICAL_WARP_THREADS),

        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO) ?
            0 : // arch-width and non-power-of-two sub-warps cannot be tiled within the arch warp, so no shift is applied
            ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
    {}
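
    // Example mask values (derived): for LOGICAL_WARP_THREADS == 32,
    // member_mask == 0xffffffff; for LOGICAL_WARP_THREADS == 8, the second
    // sub-warp (physical lanes 8..15) receives member_mask == 0x0000ff00.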

    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    //---------------------------------------------------------------------
    // Regular reduction
    //---------------------------------------------------------------------

    /**
     * Reduction step
     */
    template <
        bool                ALL_LANES_VALID,        ///< Whether all lanes in each warp are contributing a valid fold of items
        typename            ReductionOp,
        int                 STEP>
    __device__ __forceinline__ T ReduceStep(
        T                   input,                  ///< [in] Calling thread's input
        int                 valid_items,            ///< [in] Total number of valid items across the logical warp
        ReductionOp         reduction_op,           ///< [in] Reduction operator
        Int2Type<STEP>      /*step*/)
    {
        const int OFFSET = 1 << STEP;

        // Share input through buffer
        ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);

        WARP_SYNC(member_mask);

        // Update input if peer_addend is in range
        if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items))
        {
            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
            input = reduction_op(input, peer_addend);
        }

        WARP_SYNC(member_mask);

        return ReduceStep<ALL_LANES_VALID>(input, valid_items, reduction_op, Int2Type<STEP + 1>());
    }
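
    // Example (derived): for LOGICAL_WARP_THREADS == 4 and ALL_LANES_VALID,
    // step 0 (OFFSET == 1) leaves lane i holding x[i] (+) x[i+1], and
    // step 1 (OFFSET == 2) leaves lane 0 holding the full reduction
    // (x[0] (+) x[1]) (+) (x[2] (+) x[3]).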


    /**
     * Reduction step (terminate)
     */
    template <
        bool                ALL_LANES_VALID,            ///< Whether all lanes in each warp are contributing a valid fold of items
        typename            ReductionOp>
    __device__ __forceinline__ T ReduceStep(
        T                   input,                      ///< [in] Calling thread's input
        int                 valid_items,                ///< [in] Total number of valid items across the logical warp
        ReductionOp         /*reduction_op*/,           ///< [in] Reduction operator
        Int2Type<STEPS>     /*step*/)
    {
        return input;
    }


    //---------------------------------------------------------------------
    // Segmented reduction
    //---------------------------------------------------------------------


    /**
     * Ballot-based segmented reduce
     */
    template <
        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
        typename        FlagT,
        typename        ReductionOp>
    __device__ __forceinline__ T SegmentedReduce(
        T               input,                  ///< [in] Calling thread's input
        FlagT           flag,                   ///< [in] Whether or not the current lane is a segment head/tail
        ReductionOp     reduction_op,           ///< [in] Reduction operator
        Int2Type<true>  /*has_ballot*/)         ///< [in] Marker type for whether the target arch has ballot functionality
    {
        // Get the start flags for each thread in the warp.
        int warp_flags = WARP_BALLOT(flag, member_mask);

        if (!HEAD_SEGMENTED)
            warp_flags <<= 1;
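
        // (Shifting tail flags left by one bit turns each tail flag into the
        // head flag of the successor segment.)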

        // Keep bits above the current thread.
        warp_flags &= LaneMaskGt();

        // Accommodate packing of multiple logical warps in a single physical warp
        if (!IS_ARCH_WARP)
        {
            warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS;
        }

        // Find next flag
        int next_flag = __clz(__brev(warp_flags));

        // Clip the next segment at the warp boundary if necessary
        if (LOGICAL_WARP_THREADS != 32)
            next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS);
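
        // Example (derived): with LOGICAL_WARP_THREADS == 32 and head flags
        // at lanes 0 and 5, lane 2 masks away bits 0..2, leaving bit 5 set;
        // __brev/__clz then yield next_flag == 5, so lane 2 only folds in
        // peers from lanes 3 and 4 (its own segment).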

        #pragma unroll
        for (int STEP = 0; STEP < STEPS; STEP++)
        {
            const int OFFSET = 1 << STEP;

            // Share input into buffer
            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);

            WARP_SYNC(member_mask);

            // Update input if peer_addend is in range
            if (OFFSET + lane_id < next_flag)
            {
                T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
                input = reduction_op(input, peer_addend);
            }

            WARP_SYNC(member_mask);
        }

        return input;
    }


    /**
     * Smem-based segmented reduce
     */
    template <
        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
        typename        FlagT,
        typename        ReductionOp>
    __device__ __forceinline__ T SegmentedReduce(
        T               input,                  ///< [in] Calling thread's input
        FlagT           flag,                   ///< [in] Whether or not the current lane is a segment head/tail
        ReductionOp     reduction_op,           ///< [in] Reduction operator
        Int2Type<false> /*has_ballot*/)         ///< [in] Marker type for whether the target arch has ballot functionality
    {
        enum
        {
            UNSET   = 0x0,  // Is initially unset
            SET     = 0x1,  // Is initially set
            SEEN    = 0x2,  // Has seen another head flag from a successor peer
        };

        // Alias flags onto shared data storage
        volatile SmemFlag *flag_storage = temp_storage.flags;

        SmemFlag flag_status = (flag) ? SET : UNSET;
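
        // The flag acts as a small state machine: in the head-segmented case
        // a lane stops accumulating once it has SEEN a head flag among its
        // successors; in the tail-segmented case it stops once its own
        // flag_status becomes nonzero.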

        for (int STEP = 0; STEP < STEPS; STEP++)
        {
            const int OFFSET = 1 << STEP;

            // Share input through buffer
            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);

            WARP_SYNC(member_mask);

            // Get peer from buffer
            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);

            WARP_SYNC(member_mask);

            // Share flag through buffer
            flag_storage[lane_id] = flag_status;

            // Get peer flag from buffer
            SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET];

            // Update input if peer was in range
            if (lane_id < LOGICAL_WARP_THREADS - OFFSET)
            {
                if (HEAD_SEGMENTED)
                {
                    // Head-segmented
                    if ((flag_status & SEEN) == 0)
                    {
                        // Has not seen a more distant head flag
                        if (peer_flag_status & SET)
                        {
                            // Has now seen a head flag
                            flag_status |= SEEN;
                        }
                        else
                        {
                            // Peer is not a head flag: grab its count
                            input = reduction_op(input, peer_addend);
                        }

                        // Update seen status to include that of peer
                        flag_status |= (peer_flag_status & SEEN);
                    }
                }
                else
                {
                    // Tail-segmented.  Simply propagate flag status
                    if (!flag_status)
                    {
                        input = reduction_op(input, peer_addend);
                        flag_status |= peer_flag_status;
                    }
                }
            }
        }

        return input;
    }


    /******************************************************************************
     * Interface
     ******************************************************************************/

    /**
     * Reduction
     */
    template <
        bool                ALL_LANES_VALID,        ///< Whether all lanes in each warp are contributing a valid fold of items
        typename            ReductionOp>
    __device__ __forceinline__ T Reduce(
        T                   input,                  ///< [in] Calling thread's input
        int                 valid_items,            ///< [in] Total number of valid items across the logical warp
        ReductionOp         reduction_op)           ///< [in] Reduction operator
    {
        return ReduceStep<ALL_LANES_VALID>(input, valid_items, reduction_op, Int2Type<0>());
    }


    /**
     * Segmented reduction
     */
    template <
        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
        typename        FlagT,
        typename        ReductionOp>
    __device__ __forceinline__ T SegmentedReduce(
        T               input,              ///< [in] Calling thread's input
        FlagT           flag,               ///< [in] Whether or not the current lane is a segment head/tail
        ReductionOp     reduction_op)       ///< [in] Reduction operator
    {
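        // Dispatch on ballot availability: sm_20+ targets use the
        // ballot-based variant; earlier targets fall back to the
        // smem flag-based variant.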
        return SegmentedReduce<HEAD_SEGMENTED>(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>());
    }


};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
