/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
 */

#pragma once

#include "../../thread/thread_operators.cuh"
#include "../../thread/thread_load.cuh"
#include "../../thread/thread_store.cuh"
#include "../../util_type.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
 */
template <
    typename    T,                      ///< Data type being reduced
    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
    int         PTX_ARCH>               ///< The PTX compute capability for which to specialize this collective
struct WarpReduceSmem
{
    /******************************************************************************
     * Constants and type definitions
     ******************************************************************************/

    enum
    {
        /// Whether the logical warp size and the PTX warp size coincide
        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),

        /// Whether the logical warp size is a power-of-two
        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,

        /// The number of warp scan steps
        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,

        /// The number of threads in half a warp
        HALF_WARP_THREADS = 1 << (STEPS - 1),

        /// The number of shared memory elements per warp.  The extra
        /// HALF_WARP_THREADS elements of padding allow lanes to read
        /// reduce[lane_id + OFFSET] (OFFSET <= HALF_WARP_THREADS) without an
        /// explicit bounds check going out of the array.
        WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS,

        /// FlagT status (when not using ballot)
        UNSET   = 0x0,  // Is initially unset
        SET     = 0x1,  // Is initially set
        SEEN    = 0x2,  // Has seen another head flag from a successor peer
    };

    /// Shared memory flag type
    typedef unsigned char SmemFlag;

    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
    struct _TempStorage
    {
        T           reduce[WARP_SMEM_ELEMENTS];
        SmemFlag    flags[WARP_SMEM_ELEMENTS];
    };

    // Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    _TempStorage    &temp_storage;      ///< Reference to this warp's shared-memory scratch
    unsigned int    lane_id;            ///< Calling thread's rank within the logical warp
    unsigned int    member_mask;        ///< Bitmask of physical-warp lanes belonging to this logical warp


    /******************************************************************************
     * Construction
     ******************************************************************************/

    /// Constructor
    __device__ __forceinline__ WarpReduceSmem(
        TempStorage     &temp_storage)
    :
        temp_storage(temp_storage.Alias()),

        // Lane rank: physical lane id for an arch-width warp, otherwise the
        // rank within the packed logical sub-warp
        lane_id(IS_ARCH_WARP ?
            LaneId() :
            LaneId() % LOGICAL_WARP_THREADS),

        // Participant mask of LOGICAL_WARP_THREADS contiguous bits, shifted to
        // this logical warp's position within the physical warp
        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?
            0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
            ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
    {}

    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    //---------------------------------------------------------------------
    // Regular reduction
    //---------------------------------------------------------------------

    /**
     * Reduction step.  Recursive template instantiation unrolls the STEPS
     * pairwise-combine rounds; each round doubles the lane offset.  Volatile
     * stores/loads keep the shared-memory exchange ordered with respect to the
     * surrounding warp synchronizations.
     */
    template <
        bool                ALL_LANES_VALID,    ///< Whether all lanes in each warp are contributing a valid fold of items
        typename            ReductionOp,
        int                 STEP>
    __device__ __forceinline__ T ReduceStep(
        T                   input,              ///< [in] Calling thread's input
        int                 valid_items,        ///< [in] Total number of valid items across the logical warp
        ReductionOp         reduction_op,       ///< [in] Reduction operator
        Int2Type<STEP>      /*step*/)
    {
        const int OFFSET = 1 << STEP;

        // Share input through buffer
        ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);

        WARP_SYNC(member_mask);

        // Update input if peer_addend is in range.  When all lanes are valid
        // and the warp is a power-of-two, no bounds test against valid_items
        // is needed (the half-warp padding keeps the load in the array).
        if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items))
        {
            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
            input = reduction_op(input, peer_addend);
        }

        WARP_SYNC(member_mask);

        return ReduceStep<ALL_LANES_VALID>(input, valid_items, reduction_op, Int2Type<STEP + 1>());
    }


    /**
     * Reduction step (terminate).  Selected when STEP reaches STEPS; the
     * lane's accumulated partial is the result.
     */
    template <
        bool                ALL_LANES_VALID,    ///< Whether all lanes in each warp are contributing a valid fold of items
        typename            ReductionOp>
    __device__ __forceinline__ T ReduceStep(
        T                   input,              ///< [in] Calling thread's input
        int                 valid_items,        ///< [in] Total number of valid items across the logical warp
        ReductionOp         /*reduction_op*/,   ///< [in] Reduction operator
        Int2Type<STEPS>     /*step*/)
    {
        return input;
    }


    //---------------------------------------------------------------------
    // Segmented reduction
    //---------------------------------------------------------------------


    /**
     * Ballot-based segmented reduce.  Uses a warp ballot of the segment flags
     * to compute, per lane, the distance to the next segment boundary, then
     * runs the usual offset-doubling combine restricted to that distance.
     */
    template <
        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
        typename        FlagT,
        typename        ReductionOp>
    __device__ __forceinline__ T SegmentedReduce(
        T               input,              ///< [in] Calling thread's input
        FlagT           flag,               ///< [in] Whether or not the current lane is a segment head/tail
        ReductionOp     reduction_op,       ///< [in] Reduction operator
        Int2Type<true>  /*has_ballot*/)     ///< [in] Marker type for whether the target arch has ballot functionality
    {
        // Get the start flags for each thread in the warp.
        int warp_flags = WARP_BALLOT(flag, member_mask);

        // Tail flags: shift left one so a tail at lane i reads as a boundary
        // at lane i+1 (i.e., lane i's segment includes lane i itself)
        if (!HEAD_SEGMENTED)
            warp_flags <<= 1;

        // Keep bits above the current thread.
        warp_flags &= LaneMaskGt();

        // Accommodate packing of multiple logical warps in a single physical warp
        if (!IS_ARCH_WARP)
        {
            warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS;
        }

        // Find next flag: position of the lowest set bit at/above this lane
        int next_flag = __clz(__brev(warp_flags));

        // Clip the next segment at the warp boundary if necessary
        if (LOGICAL_WARP_THREADS != 32)
            next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS);

        #pragma unroll
        for (int STEP = 0; STEP < STEPS; STEP++)
        {
            const int OFFSET = 1 << STEP;

            // Share input into buffer
            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);

            WARP_SYNC(member_mask);

            // Update input if peer_addend is in range (within this lane's segment)
            if (OFFSET + lane_id < next_flag)
            {
                T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
                input = reduction_op(input, peer_addend);
            }

            WARP_SYNC(member_mask);
        }

        return input;
    }


    /**
     * Smem-based segmented reduce (for targets without ballot).  Each lane
     * tracks a flag status (UNSET/SET/SEEN) exchanged through shared memory
     * alongside the partials, so a lane stops accumulating once it has
     * observed a segment boundary between itself and its peer.
     */
    template <
        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
        typename        FlagT,
        typename        ReductionOp>
    __device__ __forceinline__ T SegmentedReduce(
        T               input,              ///< [in] Calling thread's input
        FlagT           flag,               ///< [in] Whether or not the current lane is a segment head/tail
        ReductionOp     reduction_op,       ///< [in] Reduction operator
        Int2Type<false> /*has_ballot*/)     ///< [in] Marker type for whether the target arch has ballot functionality
    {
        enum
        {
            UNSET   = 0x0,  // Is initially unset
            SET     = 0x1,  // Is initially set
            SEEN    = 0x2,  // Has seen another head flag from a successor peer
        };

        // Alias flags onto shared data storage (volatile: exchanged between
        // lanes without an intervening barrier after the second WARP_SYNC)
        volatile SmemFlag *flag_storage = temp_storage.flags;

        SmemFlag flag_status = (flag) ? SET : UNSET;

        for (int STEP = 0; STEP < STEPS; STEP++)
        {
            const int OFFSET = 1 << STEP;

            // Share input through buffer
            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);

            WARP_SYNC(member_mask);

            // Get peer from buffer (padding makes the unguarded load safe)
            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);

            WARP_SYNC(member_mask);

            // Share flag through buffer
            flag_storage[lane_id] = flag_status;

            // Get peer flag from buffer
            SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET];

            // Update input if peer was in range
            if (lane_id < LOGICAL_WARP_THREADS - OFFSET)
            {
                if (HEAD_SEGMENTED)
                {
                    // Head-segmented
                    if ((flag_status & SEEN) == 0)
                    {
                        // Has not seen a more distant head flag
                        if (peer_flag_status & SET)
                        {
                            // Has now seen a head flag
                            flag_status |= SEEN;
                        }
                        else
                        {
                            // Peer is not a head flag: grab its count
                            input = reduction_op(input, peer_addend);
                        }

                        // Update seen status to include that of peer
                        flag_status |= (peer_flag_status & SEEN);
                    }
                }
                else
                {
                    // Tail-segmented.  Simply propagate flag status
                    if (!flag_status)
                    {
                        input = reduction_op(input, peer_addend);
                        flag_status |= peer_flag_status;
                    }

                }
            }
        }

        return input;
    }


    /******************************************************************************
     * Interface
     ******************************************************************************/

    /**
     * Reduction
     */
    template <
        bool            ALL_LANES_VALID,    ///< Whether all lanes in each warp are contributing a valid fold of items
        typename        ReductionOp>
    __device__ __forceinline__ T Reduce(
        T               input,              ///< [in] Calling thread's input
        int             valid_items,        ///< [in] Total number of valid items across the logical warp
        ReductionOp     reduction_op)       ///< [in] Reduction operator
    {
        return ReduceStep<ALL_LANES_VALID>(input, valid_items, reduction_op, Int2Type<0>());
    }


    /**
     * Segmented reduction.  Dispatches to the ballot-based specialization on
     * SM20+ targets and the smem-flag specialization otherwise.
     */
    template <
        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
        typename        FlagT,
        typename        ReductionOp>
    __device__ __forceinline__ T SegmentedReduce(
        T               input,              ///< [in] Calling thread's input
        FlagT           flag,               ///< [in] Whether or not the current lane is a segment head/tail
        ReductionOp     reduction_op)       ///< [in] Reduction operator
    {
        return SegmentedReduce<HEAD_SEGMENTED>(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>());
    }


};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)