/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
 */

#pragma once

#include <iterator>

#include "../block/block_load.cuh"
#include "../block/block_reduce.cuh"
#include "../grid/grid_mapping.cuh"
#include "../grid/grid_even_share.cuh"
#include "../util_type.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"


/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/******************************************************************************
 * Tuning policy types
 ******************************************************************************/

/**
 * Parameterizable tuning policy type for AgentReduce
 */
template <
    int                     _BLOCK_THREADS,         ///< Threads per thread block
    int                     _ITEMS_PER_THREAD,      ///< Items per thread (per tile of input)
    int                     _VECTOR_LOAD_LENGTH,    ///< Number of items per vectorized load
    BlockReduceAlgorithm    _BLOCK_ALGORITHM,       ///< Cooperative block-wide reduction algorithm to use
    CacheLoadModifier       _LOAD_MODIFIER>         ///< Cache load modifier for reading input elements
struct AgentReducePolicy
{
    enum
    {
        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Items per thread (per tile of input)
        VECTOR_LOAD_LENGTH  = _VECTOR_LOAD_LENGTH,  ///< Number of items per vectorized load
    };

    static const BlockReduceAlgorithm  BLOCK_ALGORITHM      = _BLOCK_ALGORITHM;     ///< Cooperative block-wide reduction algorithm to use
    static const CacheLoadModifier     LOAD_MODIFIER        = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
};
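
// Example (illustrative sketch only, not part of the library): a policy
// instantiation resembling those selected by the device-level reduction
// dispatch.  The specific values below are hypothetical.
//
//     typedef AgentReducePolicy<
//             256,                            // 256 threads per block
//             16,                             // 16 items per thread per tile
//             4,                              // 4 items per vectorized load
//             BLOCK_REDUCE_WARP_REDUCTIONS,   // warp-synchronous block reduction
//             LOAD_LDG>                       // read-only (LDG) cache load modifier
//         CustomReducePolicy;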



/******************************************************************************
 * Thread block abstractions
 ******************************************************************************/

/**
 * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
 *
 * Each thread reduces only the values it loads. If \p IS_FIRST_TILE, this
 * partial reduction is stored into \p thread_aggregate.  Otherwise it is
 * accumulated into \p thread_aggregate.
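 *
 * \par Snippet
 * The sketch below is illustrative only (assumed names such as \p AgentReduceT,
 * \p d_out, and \p even_share are not defined in this file); it shows how a
 * reduction kernel might drive this agent:
 * \code
 * // Specialize and alias the agent for the active tuning policy
 * typedef AgentReduce<PolicyT, InputIteratorT, OutputIteratorT, OffsetT, ReductionOp> AgentReduceT;
 *
 * // Shared memory required by the block-wide reduction
 * __shared__ typename AgentReduceT::TempStorage temp_storage;
 *
 * // Each thread block consumes its even-share of the input tiles
 * OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);
 *
 * // Thread0 records the block-wide aggregate
 * if (threadIdx.x == 0)
 *     d_out[blockIdx.x] = block_aggregate;
 * \endcode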
 */
template <
    typename AgentReducePolicy,        ///< Parameterized AgentReducePolicy tuning policy type
    typename InputIteratorT,           ///< Random-access iterator type for input
    typename OutputIteratorT,          ///< Random-access iterator type for output
    typename OffsetT,                  ///< Signed integer type for global offsets
    typename ReductionOp>              ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
struct AgentReduce
{

    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

    /// The output value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    /// Vector type of InputT for data movement
    typedef typename CubVector<InputT, AgentReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT;

    /// Input iterator wrapper type (for applying cache modifier)
    typedef typename If<IsPointer<InputIteratorT>::VALUE,
            CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>,      // Wrap the native input pointer with CacheModifiedInputIterator
            InputIteratorT>::Type                                                               // Directly use the supplied input iterator type
        WrappedInputIteratorT;

    /// Constants
    enum
    {
        BLOCK_THREADS       = AgentReducePolicy::BLOCK_THREADS,
        ITEMS_PER_THREAD    = AgentReducePolicy::ITEMS_PER_THREAD,
        VECTOR_LOAD_LENGTH  = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH),
        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,

        // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type
        ATTEMPT_VECTORIZATION   = (VECTOR_LOAD_LENGTH > 1) &&
                                    (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) &&
                                    (IsPointer<InputIteratorT>::VALUE) && Traits<InputT>::PRIMITIVE,

    };

    static const CacheLoadModifier    LOAD_MODIFIER   = AgentReducePolicy::LOAD_MODIFIER;
    static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM;

    /// Parameterized BlockReduce primitive
    typedef BlockReduce<OutputT, BLOCK_THREADS, AgentReducePolicy::BLOCK_ALGORITHM> BlockReduceT;

    /// Shared memory type required by this thread block
    struct _TempStorage
    {
        typename BlockReduceT::TempStorage  reduce;
    };

    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};


    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------

    _TempStorage&           temp_storage;       ///< Reference to temp_storage
    InputIteratorT          d_in;               ///< Input data to reduce
    WrappedInputIteratorT   d_wrapped_in;       ///< Wrapped input data to reduce
    ReductionOp             reduction_op;       ///< Binary reduction operator


    //---------------------------------------------------------------------
    // Utility
    //---------------------------------------------------------------------


    // Whether or not the input is aligned with the vector type (specialized for types we can vectorize)
    template <typename Iterator>
    static __device__ __forceinline__ bool IsAligned(
        Iterator        d_in,
        Int2Type<true>  /*can_vectorize*/)
    {
        return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
    }

    // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize)
    template <typename Iterator>
    static __device__ __forceinline__ bool IsAligned(
        Iterator        /*d_in*/,
        Int2Type<false> /*can_vectorize*/)
    {
        return false;
    }


    //---------------------------------------------------------------------
    // Constructor
    //---------------------------------------------------------------------

    /**
     * Constructor
     */
    __device__ __forceinline__ AgentReduce(
        TempStorage&            temp_storage,       ///< Reference to temp_storage
        InputIteratorT          d_in,               ///< Input data to reduce
        ReductionOp             reduction_op)       ///< Binary reduction operator
    :
        temp_storage(temp_storage.Alias()),
        d_in(d_in),
        d_wrapped_in(d_in),
        reduction_op(reduction_op)
    {}


    //---------------------------------------------------------------------
    // Tile consumption
    //---------------------------------------------------------------------

    /**
     * Consume a full tile of input (non-vectorized)
     */
    template <int IS_FIRST_TILE>
    __device__ __forceinline__ void ConsumeTile(
        OutputT                 &thread_aggregate,
        OffsetT                 block_offset,       ///< The offset of the tile to consume
        int                     /*valid_items*/,    ///< The number of valid items in the tile
        Int2Type<true>          /*is_full_tile*/,   ///< Whether or not this is a full tile
        Int2Type<false>         /*can_vectorize*/)  ///< Whether or not we can vectorize loads
    {
        OutputT items[ITEMS_PER_THREAD];

        // Load items in striped fashion
        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);

        // Reduce items within each thread stripe
        thread_aggregate = (IS_FIRST_TILE) ?
            internal::ThreadReduce(items, reduction_op) :
            internal::ThreadReduce(items, reduction_op, thread_aggregate);
    }


    /**
     * Consume a full tile of input (vectorized)
     */
    template <int IS_FIRST_TILE>
    __device__ __forceinline__ void ConsumeTile(
        OutputT                 &thread_aggregate,
        OffsetT                 block_offset,       ///< The offset of the tile to consume
        int                     /*valid_items*/,    ///< The number of valid items in the tile
        Int2Type<true>          /*is_full_tile*/,   ///< Whether or not this is a full tile
        Int2Type<true>          /*can_vectorize*/)  ///< Whether or not we can vectorize loads
    {
        // Alias items as an array of VectorT and load it in striped fashion
        enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };

        // Fabricate a vectorized input iterator
        InputT *d_in_unqualified = const_cast<InputT*>(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
        CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, VectorT, OffsetT> d_vec_in(
            reinterpret_cast<VectorT*>(d_in_unqualified));

        // Load items as vector items
        InputT input_items[ITEMS_PER_THREAD];
        VectorT *vec_items = reinterpret_cast<VectorT*>(input_items);
        #pragma unroll
        for (int i = 0; i < WORDS; ++i)
            vec_items[i] = d_vec_in[BLOCK_THREADS * i];

        // Convert from input type to output type
        OutputT items[ITEMS_PER_THREAD];
        #pragma unroll
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            items[i] = input_items[i];

        // Reduce items within each thread stripe
        thread_aggregate = (IS_FIRST_TILE) ?
            internal::ThreadReduce(items, reduction_op) :
            internal::ThreadReduce(items, reduction_op, thread_aggregate);
    }


    /**
     * Consume a partial tile of input
     */
    template <int IS_FIRST_TILE, int CAN_VECTORIZE>
    __device__ __forceinline__ void ConsumeTile(
        OutputT                 &thread_aggregate,
        OffsetT                 block_offset,       ///< The offset of the tile to consume
        int                     valid_items,        ///< The number of valid items in the tile
        Int2Type<false>         /*is_full_tile*/,   ///< Whether or not this is a full tile
        Int2Type<CAN_VECTORIZE> /*can_vectorize*/)  ///< Whether or not we can vectorize loads
    {
        // Partial tile
        int thread_offset = threadIdx.x;

        // Read first item
        if ((IS_FIRST_TILE) && (thread_offset < valid_items))
        {
            thread_aggregate = d_wrapped_in[block_offset + thread_offset];
            thread_offset += BLOCK_THREADS;
        }

        // Continue reading items (block-striped)
        while (thread_offset < valid_items)
        {
            OutputT item        = d_wrapped_in[block_offset + thread_offset];
            thread_aggregate    = reduction_op(thread_aggregate, item);
            thread_offset       += BLOCK_THREADS;
        }
    }


    //---------------------------------------------------------------------
    // Consume a contiguous segment of tiles
    //---------------------------------------------------------------------

    /**
     * \brief Reduce a contiguous segment of input tiles
     */
    template <int CAN_VECTORIZE>
    __device__ __forceinline__ OutputT ConsumeRange(
        GridEvenShare<OffsetT>  &even_share,        ///< GridEvenShare descriptor
        Int2Type<CAN_VECTORIZE> can_vectorize)      ///< Whether or not we can vectorize loads
    {
        OutputT thread_aggregate;

        if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
        {
            // First tile isn't full (not all threads have valid items)
            int valid_items = even_share.block_end - even_share.block_offset;
            ConsumeTile<true>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
            return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
        }

        // At least one full block
        ConsumeTile<true>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
        even_share.block_offset += even_share.block_stride;

        // Consume subsequent full tiles of input
        while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
        {
            ConsumeTile<false>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
            even_share.block_offset += even_share.block_stride;
        }

        // Consume a partially-full tile
        if (even_share.block_offset < even_share.block_end)
        {
            int valid_items = even_share.block_end - even_share.block_offset;
            ConsumeTile<false>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
        }

        // Compute block-wide reduction (all threads have valid items)
        return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);
    }


    /**
     * \brief Reduce a contiguous segment of input tiles
     */
    __device__ __forceinline__ OutputT ConsumeRange(
        OffsetT block_offset,                       ///< [in] Threadblock begin offset (inclusive)
        OffsetT block_end)                          ///< [in] Threadblock end offset (exclusive)
    {
        GridEvenShare<OffsetT> even_share;
        even_share.template BlockInit<TILE_ITEMS>(block_offset, block_end);

        return (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>())) ?
            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
    }
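
    // Illustrative note (an assumption, not part of this file): a single-block
    // kernel could reduce an entire input in one call, e.g.
    //
    //     OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op)
    //         .ConsumeRange(OffsetT(0), num_items);
    //
    // where AgentReduceT, temp_storage, and num_items are defined by the caller.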


    /**
     * Reduce a contiguous segment of input tiles
     */
    __device__ __forceinline__ OutputT ConsumeTiles(
        GridEvenShare<OffsetT> &even_share)        ///< [in] GridEvenShare descriptor
    {
        // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
        even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();

        return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
    }

};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)