/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
 */

#pragma once

#include <iterator>

#include "../block/block_load.cuh"
#include "../block/block_reduce.cuh"
#include "../grid/grid_mapping.cuh"
#include "../grid/grid_even_share.cuh"
#include "../util_type.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"


/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/******************************************************************************
 * Tuning policy types
 ******************************************************************************/

/**
 * Parameterizable tuning policy type for AgentReduce
 */
template <
    int                     _BLOCK_THREADS,         ///< Threads per thread block
    int                     _ITEMS_PER_THREAD,      ///< Items per thread (per tile of input)
    int                     _VECTOR_LOAD_LENGTH,    ///< Number of items per vectorized load
    BlockReduceAlgorithm    _BLOCK_ALGORITHM,       ///< Cooperative block-wide reduction algorithm to use
    CacheLoadModifier       _LOAD_MODIFIER>         ///< Cache load modifier for reading input elements
struct AgentReducePolicy
{
    enum
    {
        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Items per thread (per tile of input)
        VECTOR_LOAD_LENGTH  = _VECTOR_LOAD_LENGTH,  ///< Number of items per vectorized load
    };

    static const BlockReduceAlgorithm   BLOCK_ALGORITHM = _BLOCK_ALGORITHM;     ///< Cooperative block-wide reduction algorithm to use
    static const CacheLoadModifier      LOAD_MODIFIER   = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
};

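
/*
 * Illustrative only (not part of CUB): a policy instantiation might look like the
 * sketch below, e.g. 256 threads per block, 16 items per thread, 4-item vectorized
 * loads, warp-reduction-based block reduction, and LDG loads.  The alias name
 * ExamplePolicy is hypothetical; the tuned policies CUB actually uses are selected
 * in the dispatch layer.
 *
 *     typedef AgentReducePolicy<
 *         256,                            // _BLOCK_THREADS
 *         16,                             // _ITEMS_PER_THREAD
 *         4,                              // _VECTOR_LOAD_LENGTH
 *         BLOCK_REDUCE_WARP_REDUCTIONS,   // _BLOCK_ALGORITHM
 *         LOAD_LDG>                       // _LOAD_MODIFIER
 *     ExamplePolicy;
 */
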

/******************************************************************************
 * Thread block abstractions
 ******************************************************************************/

/**
 * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
 *
 * Each thread reduces only the values it loads. If \p IS_FIRST_TILE, this
 * partial reduction is stored into \p thread_aggregate.  Otherwise it is
 * accumulated into \p thread_aggregate.
 */
template <
    typename AgentReducePolicy,     ///< Parameterized AgentReducePolicy tuning policy type
    typename InputIteratorT,        ///< Random-access iterator type for input
    typename OutputIteratorT,       ///< Random-access iterator type for output
    typename OffsetT,               ///< Signed integer type for global offsets
    typename ReductionOp>           ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
struct AgentReduce
{

    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

    /// The output value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),     // OutputT = (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                             // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                             // ... else the output iterator's value type

    /// Vector type of InputT for data movement
    typedef typename CubVector<InputT, AgentReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT;

    /// Input iterator wrapper type (for applying cache modifier)
    typedef typename If<IsPointer<InputIteratorT>::VALUE,
            CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>,  // Wrap the native input pointer with CacheModifiedInputIterator
            InputIteratorT>::Type                                                           // Directly use the supplied input iterator type
        WrappedInputIteratorT;

    /// Constants
    enum
    {
        BLOCK_THREADS       = AgentReducePolicy::BLOCK_THREADS,
        ITEMS_PER_THREAD    = AgentReducePolicy::ITEMS_PER_THREAD,
        VECTOR_LOAD_LENGTH  = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH),
        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,

        // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type
        ATTEMPT_VECTORIZATION   = (VECTOR_LOAD_LENGTH > 1) &&
                                  (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) &&
                                  (IsPointer<InputIteratorT>::VALUE) && Traits<InputT>::PRIMITIVE,
    };

    static const CacheLoadModifier      LOAD_MODIFIER   = AgentReducePolicy::LOAD_MODIFIER;
    static const BlockReduceAlgorithm   BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM;

    /// Parameterized BlockReduce primitive
    typedef BlockReduce<OutputT, BLOCK_THREADS, AgentReducePolicy::BLOCK_ALGORITHM> BlockReduceT;

    /// Shared memory type required by this thread block
    struct _TempStorage
    {
        typename BlockReduceT::TempStorage reduce;
    };

    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};


    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------

    _TempStorage&           temp_storage;       ///< Reference to temp_storage
    InputIteratorT          d_in;               ///< Input data to reduce
    WrappedInputIteratorT   d_wrapped_in;       ///< Wrapped input data to reduce
    ReductionOp             reduction_op;       ///< Binary reduction operator


    //---------------------------------------------------------------------
    // Utility
    //---------------------------------------------------------------------


    // Whether or not the input is aligned with the vector type (specialized for types we can vectorize)
    template <typename Iterator>
    static __device__ __forceinline__ bool IsAligned(
        Iterator        d_in,
        Int2Type<true>  /*can_vectorize*/)
    {
        return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
    }

    // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize)
    template <typename Iterator>
    static __device__ __forceinline__ bool IsAligned(
        Iterator        /*d_in*/,
        Int2Type<false> /*can_vectorize*/)
    {
        return false;
    }

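    /*
     * Illustrative only (not part of CUB): a worked instance of the alignment test
     * above.  With InputT = int and VECTOR_LOAD_LENGTH = 4, VectorT is int4
     * (16 bytes), so IsAligned() simply checks that the low four address bits are
     * zero:
     *
     *     (size_t(d_in) & (sizeof(int4) - 1)) == 0    // true only for 16-byte-aligned d_in
     */
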

    //---------------------------------------------------------------------
    // Constructor
    //---------------------------------------------------------------------

    /**
     * Constructor
     */
    __device__ __forceinline__ AgentReduce(
        TempStorage&    temp_storage,       ///< Reference to temp_storage
        InputIteratorT  d_in,               ///< Input data to reduce
        ReductionOp     reduction_op)       ///< Binary reduction operator
    :
        temp_storage(temp_storage.Alias()),
        d_in(d_in),
        d_wrapped_in(d_in),
        reduction_op(reduction_op)
    {}


    //---------------------------------------------------------------------
    // Tile consumption
    //---------------------------------------------------------------------

    /**
     * Consume a full tile of input (non-vectorized)
     */
    template <int IS_FIRST_TILE>
    __device__ __forceinline__ void ConsumeTile(
        OutputT                 &thread_aggregate,
        OffsetT                 block_offset,       ///< The offset of the tile to consume
        int                     /*valid_items*/,    ///< The number of valid items in the tile
        Int2Type<true>          /*is_full_tile*/,   ///< Whether or not this is a full tile
        Int2Type<false>         /*can_vectorize*/)  ///< Whether or not we can vectorize loads
    {
        OutputT items[ITEMS_PER_THREAD];

        // Load items in striped fashion
        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);

        // Reduce items within each thread stripe
        thread_aggregate = (IS_FIRST_TILE) ?
            internal::ThreadReduce(items, reduction_op) :
            internal::ThreadReduce(items, reduction_op, thread_aggregate);
    }


    /**
     * Consume a full tile of input (vectorized)
     */
    template <int IS_FIRST_TILE>
    __device__ __forceinline__ void ConsumeTile(
        OutputT                 &thread_aggregate,
        OffsetT                 block_offset,       ///< The offset of the tile to consume
        int                     /*valid_items*/,    ///< The number of valid items in the tile
        Int2Type<true>          /*is_full_tile*/,   ///< Whether or not this is a full tile
        Int2Type<true>          /*can_vectorize*/)  ///< Whether or not we can vectorize loads
    {
        // Alias items as an array of VectorT and load it in striped fashion
        enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };

        // Fabricate a vectorized input iterator
        InputT *d_in_unqualified = const_cast<InputT*>(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
        CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, VectorT, OffsetT> d_vec_in(
            reinterpret_cast<VectorT*>(d_in_unqualified));

        // Load items as vector items
        InputT input_items[ITEMS_PER_THREAD];
        VectorT *vec_items = reinterpret_cast<VectorT*>(input_items);
        #pragma unroll
        for (int i = 0; i < WORDS; ++i)
            vec_items[i] = d_vec_in[BLOCK_THREADS * i];

        // Convert from input type to output type
        OutputT items[ITEMS_PER_THREAD];
        #pragma unroll
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            items[i] = input_items[i];

        // Reduce items within each thread stripe
        thread_aggregate = (IS_FIRST_TILE) ?
            internal::ThreadReduce(items, reduction_op) :
            internal::ThreadReduce(items, reduction_op, thread_aggregate);
    }

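    /*
     * Illustrative only (not part of CUB): the block-striped vectorized loads above,
     * traced for BLOCK_THREADS = 128, ITEMS_PER_THREAD = 8, VECTOR_LOAD_LENGTH = 4
     * (so WORDS = 2).  Thread t's base lies at element offset t * 4 within the tile,
     * and its second vector comes BLOCK_THREADS vectors (128 * 4 elements) further on:
     *
     *     vec_items[0] = d_vec_in[0];              // elements t*4       .. t*4 + 3
     *     vec_items[1] = d_vec_in[BLOCK_THREADS];  // elements (t+128)*4 .. (t+128)*4 + 3
     */
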
    /**
     * Consume a partial tile of input
     */
    template <int IS_FIRST_TILE, int CAN_VECTORIZE>
    __device__ __forceinline__ void ConsumeTile(
        OutputT                 &thread_aggregate,
        OffsetT                 block_offset,       ///< The offset of the tile to consume
        int                     valid_items,        ///< The number of valid items in the tile
        Int2Type<false>         /*is_full_tile*/,   ///< Whether or not this is a full tile
        Int2Type<CAN_VECTORIZE> /*can_vectorize*/)  ///< Whether or not we can vectorize loads
    {
        // Partial tile
        int thread_offset = threadIdx.x;

        // Read first item
        if ((IS_FIRST_TILE) && (thread_offset < valid_items))
        {
            thread_aggregate = d_wrapped_in[block_offset + thread_offset];
            thread_offset += BLOCK_THREADS;
        }

        // Continue reading items (block-striped)
        while (thread_offset < valid_items)
        {
            OutputT item = d_wrapped_in[block_offset + thread_offset];
            thread_aggregate = reduction_op(thread_aggregate, item);
            thread_offset += BLOCK_THREADS;
        }
    }


    //---------------------------------------------------------------------
    // Consume a contiguous segment of tiles
    //---------------------------------------------------------------------

    /**
     * \brief Reduce a contiguous segment of input tiles
     */
    template <int CAN_VECTORIZE>
    __device__ __forceinline__ OutputT ConsumeRange(
        GridEvenShare<OffsetT>  &even_share,        ///< GridEvenShare descriptor
        Int2Type<CAN_VECTORIZE> can_vectorize)      ///< Whether or not we can vectorize loads
    {
        OutputT thread_aggregate;

        if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
        {
            // First tile isn't full (not all threads have valid items)
            int valid_items = even_share.block_end - even_share.block_offset;
            ConsumeTile<true>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
            return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
        }

        // At least one full block
        ConsumeTile<true>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
        even_share.block_offset += even_share.block_stride;

        // Consume subsequent full tiles of input
        while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
        {
            ConsumeTile<false>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
            even_share.block_offset += even_share.block_stride;
        }

        // Consume a partially-full tile
        if (even_share.block_offset < even_share.block_end)
        {
            int valid_items = even_share.block_end - even_share.block_offset;
            ConsumeTile<false>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
        }

        // Compute block-wide reduction (all threads have valid items)
        return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);
    }

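    /*
     * Illustrative only (not part of CUB): a trace of ConsumeRange() above, assuming
     * TILE_ITEMS = 512, block_stride = TILE_ITEMS (a contiguous even-share range),
     * and a range of [0, 1200):
     *
     *     full tile at offset    0    (first tile, IS_FIRST_TILE = true)
     *     full tile at offset  512
     *     partial tile at     1024    (valid_items = 1200 - 1024 = 176)
     *     BlockReduceT::Reduce(...)   // combine the per-thread partials block-wide
     */
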
    /**
     * \brief Reduce a contiguous segment of input tiles
     */
    __device__ __forceinline__ OutputT ConsumeRange(
        OffsetT block_offset,                       ///< [in] Threadblock begin offset (inclusive)
        OffsetT block_end)                          ///< [in] Threadblock end offset (exclusive)
    {
        GridEvenShare<OffsetT> even_share;
        even_share.template BlockInit<TILE_ITEMS>(block_offset, block_end);

        return (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>())) ?
            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
    }


    /**
     * Reduce a contiguous segment of input tiles
     */
    __device__ __forceinline__ OutputT ConsumeTiles(
        GridEvenShare<OffsetT> &even_share)         ///< [in] GridEvenShare descriptor
    {
        // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
        even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();

        return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
    }

};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

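
/*
 * Illustrative only (not part of CUB): a sketch, under assumed names, of how a
 * device-wide reduction kernel might drive this agent.  ExampleReduceKernel and
 * d_block_results are hypothetical; the kernels CUB actually launches are defined
 * in the dispatch layer.  (cub namespace qualification omitted for brevity.)
 *
 *     template <typename PolicyT, typename InputIteratorT, typename OutputIteratorT,
 *               typename OffsetT, typename ReductionOp>
 *     __global__ void ExampleReduceKernel(
 *         InputIteratorT          d_in,               // Input data to reduce
 *         OutputIteratorT         d_block_results,    // One partial result per thread block
 *         GridEvenShare<OffsetT>  even_share,         // Even-share work distribution
 *         ReductionOp             reduction_op)       // Binary reduction operator
 *     {
 *         typedef AgentReduce<PolicyT, InputIteratorT, OutputIteratorT, OffsetT, ReductionOp> AgentReduceT;
 *
 *         // Shared memory storage required by the agent
 *         __shared__ typename AgentReduceT::TempStorage temp_storage;
 *
 *         // Consume this thread block's even-share of input tiles
 *         typename AgentReduceT::OutputT block_aggregate =
 *             AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);
 *
 *         // Thread 0 records the block-wide partial reduction
 *         if (threadIdx.x == 0)
 *             d_block_results[blockIdx.x] = block_aggregate;
 *     }
 */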