1 /****************************************************************************** 2 * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * * Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * * Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * * Neither the name of the NVIDIA CORPORATION nor the 13 * names of its contributors may be used to endorse or promote products 14 * derived from this software without specific prior written permission. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 * 27 ******************************************************************************/ 28 29 /** 30 * \file 31 * The cub::BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. 
32 */ 33 34 #pragma once 35 36 #include "../util_ptx.cuh" 37 #include "../util_arch.cuh" 38 #include "../util_macro.cuh" 39 #include "../util_type.cuh" 40 #include "../util_namespace.cuh" 41 42 /// Optional outer namespace(s) 43 CUB_NS_PREFIX 44 45 /// CUB namespace 46 namespace cub { 47 48 /** 49 * \brief The BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png) 50 * \ingroup BlockModule 51 * 52 * \tparam T The data type to be exchanged. 53 * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension 54 * \tparam ITEMS_PER_THREAD The number of items partitioned onto each thread. 55 * \tparam WARP_TIME_SLICING <b>[optional]</b> When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) 56 * \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1) 57 * \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1) 58 * \tparam PTX_ARCH <b>[optional]</b> \ptxversion 59 * 60 * \par Overview 61 * - It is commonplace for blocks of threads to rearrange data items between 62 * threads. For example, the device-accessible memory subsystem prefers access patterns 63 * where data items are "striped" across threads (where consecutive threads access consecutive items), 64 * yet most block-wide operations prefer a "blocked" partitioning of items across threads 65 * (where consecutive items belong to a single thread). 
66 * - BlockExchange supports the following types of data exchanges: 67 * - Transposing between [<em>blocked</em>](index.html#sec5sec3) and [<em>striped</em>](index.html#sec5sec3) arrangements 68 * - Transposing between [<em>blocked</em>](index.html#sec5sec3) and [<em>warp-striped</em>](index.html#sec5sec3) arrangements 69 * - Scattering ranked items to a [<em>blocked arrangement</em>](index.html#sec5sec3) 70 * - Scattering ranked items to a [<em>striped arrangement</em>](index.html#sec5sec3) 71 * - \rowmajor 72 * 73 * \par A Simple Example 74 * \blockcollective{BlockExchange} 75 * \par 76 * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement 77 * of 512 integer items partitioned across 128 threads where each thread owns 4 items. 78 * \par 79 * \code 80 * #include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh> 81 * 82 * __global__ void ExampleKernel(int *d_data, ...) 83 * { 84 * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each 85 * typedef cub::BlockExchange<int, 128, 4> BlockExchange; 86 * 87 * // Allocate shared memory for BlockExchange 88 * __shared__ typename BlockExchange::TempStorage temp_storage; 89 * 90 * // Load a tile of data striped across threads 91 * int thread_data[4]; 92 * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data); 93 * 94 * // Collectively exchange data into a blocked arrangement across threads 95 * BlockExchange(temp_storage).StripedToBlocked(thread_data); 96 * 97 * \endcode 98 * \par 99 * Suppose the set of striped input \p thread_data across the block of threads is 100 * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt>. 101 * The corresponding output \p thread_data in those threads will be 102 * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>. 103 * 104 * \par Performance Considerations 105 * - Proper device-specific padding ensures zero bank conflicts for most types. 
 *
 */
template <
    typename    InputT,
    int         BLOCK_DIM_X,
    int         ITEMS_PER_THREAD,
    bool        WARP_TIME_SLICING   = false,
    int         BLOCK_DIM_Y         = 1,
    int         BLOCK_DIM_Z         = 1,
    int         PTX_ARCH            = CUB_PTX_ARCH>
class BlockExchange
{
private:

    /******************************************************************************
     * Constants
     ******************************************************************************/

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
        WARP_THREADS                = 1 << LOG_WARP_THREADS,
        // Ceiling-divide: number of warps needed to cover the block
        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

        LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),
        SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,

        TILE_ITEMS                  = BLOCK_THREADS * ITEMS_PER_THREAD,

        // Number of synchronized rounds the block-wide exchange is sliced into
        // (one round per warp when time-slicing, otherwise a single round)
        TIME_SLICES                 = (WARP_TIME_SLICING) ? WARPS : 1,

        // Threads/items participating in any single round
        TIME_SLICED_THREADS         = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS,
        TIME_SLICED_ITEMS           = TIME_SLICED_THREADS * ITEMS_PER_THREAD,

        // Per-warp segment sizes (used by the warp-striped variants regardless of time-slicing)
        WARP_TIME_SLICED_THREADS    = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
        WARP_TIME_SLICED_ITEMS      = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD,

        // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
        INSERT_PADDING              = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
        PADDING_ITEMS               = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0,
    };

    /******************************************************************************
     * Type definitions
     ******************************************************************************/

    /// Shared memory storage layout type
    struct __align__(16) _TempStorage
    {
        // Staging buffer; sized for one time slice plus optional anti-bank-conflict padding
        InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS];
    };

public:

    /// \smemstorage{BlockExchange}
    struct TempStorage : Uninitialized<_TempStorage> {};

private:


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Shared storage reference
    _TempStorage &temp_storage;

    /// Linear thread-id
    unsigned int linear_tid;
    /// Lane id within the warp
    unsigned int lane_id;
    /// Warp id within the block (0 when the block is a single warp)
    unsigned int warp_id;
    /// This warp's starting offset into the staging buffer (in items)
    unsigned int warp_offset;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator (block-shared; used when no TempStorage is supplied)
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


    /**
     * Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.  Specialized for no timeslicing.
     */
    template <typename OutputT>
    __device__ __forceinline__ void BlockedToStriped(
        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        Int2Type<false> /*time_slicing*/)
    {
        // Stage 1: write this thread's blocked segment into shared memory
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
            temp_storage.buff[item_offset] = input_items[ITEM];
        }

        // Block-wide barrier between the write and read phases of the staging buffer
        CTA_SYNC();

        // Stage 2: read back in striped order (consecutive threads read consecutive items)
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
            output_items[ITEM] = temp_storage.buff[item_offset];
        }
    }


    /**
     * Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.  Specialized for warp-timeslicing.
     */
    template <typename OutputT>
    __device__ __forceinline__ void BlockedToStriped(
        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        Int2Type<true>  /*time_slicing*/)
    {
        InputT temp_items[ITEMS_PER_THREAD];

        #pragma unroll
        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
        {
            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;

            // Ensure reads from the previous slice have completed before the
            // owning warp overwrites the (shared) staging buffer
            CTA_SYNC();

            // Only the warp that owns this slice writes its blocked segment
            if (warp_id == SLICE)
            {
                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    temp_storage.buff[item_offset] = input_items[ITEM];
                }
            }

            CTA_SYNC();

            // All threads gather the portion of their striped output that
            // falls within this slice's item range
            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                // Read a strip of items
                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;

                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
                {
                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
                    {
                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                        temp_items[ITEM] = temp_storage.buff[item_offset];
                    }
                }
            }
        }

        // Copy
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            output_items[ITEM] = temp_items[ITEM];
        }
    }


    /**
     * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. Specialized for no timeslicing
     */
    template <typename OutputT>
    __device__ __forceinline__ void BlockedToWarpStriped(
        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        Int2Type<false> /*time_slicing*/)
    {
        // Each warp exchanges within its own disjoint segment of the staging
        // buffer, so a warp-level barrier suffices (no CTA_SYNC needed)
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
            temp_storage.buff[item_offset] = input_items[ITEM];
        }

        WARP_SYNC(0xffffffff);

        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
            output_items[ITEM] = temp_storage.buff[item_offset];
        }
    }

    /**
     * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. Specialized for warp-timeslicing
     */
    template <typename OutputT>
    __device__ __forceinline__ void BlockedToWarpStriped(
        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        Int2Type<true>  /*time_slicing*/)
    {
        // Warp 0 goes first (no preceding block barrier needed for slice 0)
        if (warp_id == 0)
        {
            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
                if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                temp_storage.buff[item_offset] = input_items[ITEM];
            }

            WARP_SYNC(0xffffffff);

            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
                if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                output_items[ITEM] = temp_storage.buff[item_offset];
            }
        }

        // Remaining warps take turns reusing the single-warp staging buffer
        #pragma unroll
        for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE)
        {
            // Block barrier so the previous warp's reads finish before this
            // warp overwrites the buffer
            CTA_SYNC();

            if (warp_id == SLICE)
            {
                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    temp_storage.buff[item_offset] = input_items[ITEM];
                }

                WARP_SYNC(0xffffffff);

                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    output_items[ITEM] = temp_storage.buff[item_offset];
                }
            }
        }
    }


    /**
     * Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for no timeslicing.
     */
    template <typename OutputT>
    __device__ __forceinline__ void StripedToBlocked(
        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        Int2Type<false> /*time_slicing*/)
    {
        // Stage 1: scatter this thread's striped items into shared memory
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
            temp_storage.buff[item_offset] = input_items[ITEM];
        }

        CTA_SYNC();

        // No timeslicing
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
            output_items[ITEM] = temp_storage.buff[item_offset];
        }
    }


    /**
     * Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for warp-timeslicing.
     */
    template <typename OutputT>
    __device__ __forceinline__ void StripedToBlocked(
        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        Int2Type<true>  /*time_slicing*/)
    {
        // Warp time-slicing
        InputT temp_items[ITEMS_PER_THREAD];

        #pragma unroll
        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
        {
            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;

            // Ensure the previous slice's reads finished before overwriting
            CTA_SYNC();

            // All threads write the portion of their striped input that falls
            // within this slice's item range
            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                // Write a strip of items
                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;

                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
                {
                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
                    {
                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                        temp_storage.buff[item_offset] = input_items[ITEM];
                    }
                }
            }

            CTA_SYNC();

            // Only the warp owning this slice reads back its blocked segment
            if (warp_id == SLICE)
            {
                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    temp_items[ITEM] = temp_storage.buff[item_offset];
                }
            }
        }

        // Copy
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            output_items[ITEM] = temp_items[ITEM];
        }
    }


    /**
     * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement. Specialized for no timeslicing
     */
    template <typename OutputT>
    __device__ __forceinline__ void WarpStripedToBlocked(
        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        Int2Type<false> /*time_slicing*/)
    {
        // Inverse of BlockedToWarpStriped: warp-local exchange within this
        // warp's segment, warp-level barrier between write and read phases
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
            temp_storage.buff[item_offset] = input_items[ITEM];
        }

        WARP_SYNC(0xffffffff);

        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
            output_items[ITEM] = temp_storage.buff[item_offset];
        }
    }


    /**
     * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement. Specialized for warp-timeslicing
     */
    template <typename OutputT>
    __device__ __forceinline__ void WarpStripedToBlocked(
        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        Int2Type<true>  /*time_slicing*/)
    {
        // Warps take turns using the single-warp staging buffer
        #pragma unroll
        for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
        {
            CTA_SYNC();

            if (warp_id == SLICE)
            {
                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    temp_storage.buff[item_offset] = input_items[ITEM];
                }

                WARP_SYNC(0xffffffff);

                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    output_items[ITEM] = temp_storage.buff[item_offset];
                }
            }
        }
    }


    /**
     * Exchanges data items annotated by rank into <em>blocked</em> arrangement.  Specialized for no timeslicing.
     */
    template <typename OutputT, typename OffsetT>
    __device__ __forceinline__ void ScatterToBlocked(
        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        OffsetT     ranks[ITEMS_PER_THREAD],            ///< [in] Corresponding scatter ranks
        Int2Type<false> /*time_slicing*/)
    {
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = ranks[ITEM];
            // SHR_ADD computes the same padded offset as
            // item_offset += item_offset >> LOG_SMEM_BANKS (see util_ptx)
            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
            temp_storage.buff[item_offset] = input_items[ITEM];
        }

        CTA_SYNC();

        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
            output_items[ITEM] = temp_storage.buff[item_offset];
        }
    }

    /**
     * Exchanges data items annotated by rank into <em>blocked</em> arrangement.  Specialized for warp-timeslicing.
     */
    template <typename OutputT, typename OffsetT>
    __device__ __forceinline__ void ScatterToBlocked(
        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        OffsetT     ranks[ITEMS_PER_THREAD],            ///< [in] Corresponding scatter ranks
        Int2Type<true>  /*time_slicing*/)
    {
        InputT temp_items[ITEMS_PER_THREAD];

        #pragma unroll
        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
        {
            CTA_SYNC();

            const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE;

            // Scatter only the items whose ranks land in this slice's range
            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                int item_offset = ranks[ITEM] - SLICE_OFFSET;
                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
                {
                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
                    temp_storage.buff[item_offset] = input_items[ITEM];
                }
            }

            CTA_SYNC();

            // Owning warp gathers its blocked segment
            if (warp_id == SLICE)
            {
                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
                    temp_items[ITEM] = temp_storage.buff[item_offset];
                }
            }
        }

        // Copy
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            output_items[ITEM] = temp_items[ITEM];
        }
    }


    /**
     * Exchanges data items annotated by rank into <em>striped</em> arrangement.  Specialized for no timeslicing.
     */
    template <typename OutputT, typename OffsetT>
    __device__ __forceinline__ void ScatterToStriped(
        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        OffsetT     ranks[ITEMS_PER_THREAD],            ///< [in] Corresponding scatter ranks
        Int2Type<false> /*time_slicing*/)
    {
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = ranks[ITEM];
            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
            temp_storage.buff[item_offset] = input_items[ITEM];
        }

        CTA_SYNC();

        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
            output_items[ITEM] = temp_storage.buff[item_offset];
        }
    }


    /**
     * Exchanges data items annotated by rank into <em>striped</em> arrangement. Specialized for warp-timeslicing.
     */
    template <typename OutputT, typename OffsetT>
    __device__ __forceinline__ void ScatterToStriped(
        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
        OffsetT     ranks[ITEMS_PER_THREAD],            ///< [in] Corresponding scatter ranks
        Int2Type<true> /*time_slicing*/)
    {
        InputT temp_items[ITEMS_PER_THREAD];

        #pragma unroll
        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
        {
            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;

            CTA_SYNC();

            // Scatter only the items whose ranks land in this slice's range
            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                int item_offset = ranks[ITEM] - SLICE_OFFSET;
                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
                {
                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
                    temp_storage.buff[item_offset] = input_items[ITEM];
                }
            }

            CTA_SYNC();

            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                // Read a strip of items
                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;

                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
                {
                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
                    {
                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                        temp_items[ITEM] = temp_storage.buff[item_offset];
                    }
                }
            }
        }

        // Copy
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            output_items[ITEM] = temp_items[ITEM];
        }
    }


public:

    /******************************************************************//**
     * \name Collective constructors
     *********************************************************************/
    //@{

    /**
     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
     */
    __device__ __forceinline__ BlockExchange()
    :
        // NOTE(review): this mem-initializer list is written out of declaration
        // order (members are declared linear_tid, lane_id, warp_id, warp_offset
        // and are initialized in that order regardless).  Still safe: warp_id
        // depends only on linear_tid, and warp_offset only on warp_id.
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
        lane_id(LaneId()),
        warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
    {}


    /**
     * \brief Collective constructor using the specified memory allocation as temporary storage.
     */
    __device__ __forceinline__ BlockExchange(
        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
        lane_id(LaneId()),
        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
        warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
    {}


    //@}  end member group
    /******************************************************************//**
     * \name Structured exchanges
     *********************************************************************/
    //@{

    /**
     * \brief Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.
     *
     * \par
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement
     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, ...)
     * {
     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
     *
     *     // Allocate shared memory for BlockExchange
     *     __shared__ typename BlockExchange::TempStorage temp_storage;
     *
     *     // Load a tile of ordered data into a striped arrangement across block threads
     *     int thread_data[4];
     *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
     *
     *     // Collectively exchange data into a blocked arrangement across threads
     *     BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
     *
     * \endcode
     * \par
     * Suppose the set of striped input \p thread_data across the block of threads is
     * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> after loading from device-accessible memory.
     * The corresponding output \p thread_data in those threads will be
     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
     *
     */
    template <typename OutputT>
    __device__ __forceinline__ void StripedToBlocked(
        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
        OutputT     output_items[ITEMS_PER_THREAD])     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
    {
        // Dispatch to the timesliced or non-timesliced specialization at compile time
        StripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
    }


    /**
     * \brief Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.
     *
     * \par
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, ...)
     * {
     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
     *
     *     // Allocate shared memory for BlockExchange
     *     __shared__ typename BlockExchange::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
     *
     *     // Collectively exchange data into a striped arrangement across threads
     *     BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data);
     *
     *     // Store data striped across block threads into an ordered tile
     *     cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
     *
     * \endcode
     * \par
     * Suppose the set of blocked input \p thread_data across the block of threads is
     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
     * The corresponding output \p thread_data in those threads will be
     * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> in
     * preparation for storing to device-accessible memory.
     *
     */
    template <typename OutputT>
    __device__ __forceinline__ void BlockedToStriped(
        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
        OutputT     output_items[ITEMS_PER_THREAD])     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
    {
        // Dispatch to the timesliced or non-timesliced specialization at compile time
        BlockedToStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
    }



    /**
     * \brief Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.
     *
     * \par
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement
     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, ...)
     * {
     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
     *
     *     // Allocate shared memory for BlockExchange
     *     __shared__ typename BlockExchange::TempStorage temp_storage;
     *
     *     // Load a tile of ordered data into a warp-striped arrangement across warp threads
     *     int thread_data[4];
     *     cub::LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_data, thread_data);
     *
     *     // Collectively exchange data into a blocked arrangement across threads
     *     BlockExchange(temp_storage).WarpStripedToBlocked(thread_data);
     *
     * \endcode
     * \par
     * Suppose the set of warp-striped input \p thread_data across the block of threads is
     * <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt>
     * after loading from device-accessible memory.  (The first 128 items are striped across
     * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
     * The corresponding output \p thread_data in those threads will be
     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
     *
     */
    template <typename OutputT>
    __device__ __forceinline__ void WarpStripedToBlocked(
        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
        OutputT     output_items[ITEMS_PER_THREAD])     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
    {
        // Dispatch to the timesliced or non-timesliced specialization at compile time
        WarpStripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
    }



    /**
     * \brief Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement.
     *
     * \par
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement
     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, ...)
     * {
     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
     *
     *     // Allocate shared memory for BlockExchange
     *     __shared__ typename BlockExchange::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
     *
     *     // Collectively exchange data into a warp-striped arrangement across threads
     *     BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data);
     *
     *     // Store data striped across warp threads into an ordered tile
     *     cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
     *
     * \endcode
     * \par
     * Suppose the set of blocked input \p thread_data across the block of threads is
     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
     * The corresponding output \p thread_data in those threads will be
     * <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt>
     * in preparation for storing to device-accessible memory.
(The first 128 items are striped across 924 * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) 925 * 926 */ 927 template <typename OutputT> BlockedToWarpStriped(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD])928 __device__ __forceinline__ void BlockedToWarpStriped( 929 InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. 930 OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. 931 { 932 BlockedToWarpStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>()); 933 } 934 935 936 937 //@} end member group 938 /******************************************************************//** 939 * \name Scatter exchanges 940 *********************************************************************/ 941 //@{ 942 943 944 /** 945 * \brief Exchanges data items annotated by rank into <em>blocked</em> arrangement. 946 * 947 * \par 948 * - \smemreuse 949 * 950 * \tparam OffsetT <b>[inferred]</b> Signed integer type for local offsets 951 */ 952 template <typename OutputT, typename OffsetT> ScatterToBlocked(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD])953 __device__ __forceinline__ void ScatterToBlocked( 954 InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. 955 OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. 956 OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks 957 { 958 ScatterToBlocked(input_items, output_items, ranks, Int2Type<WARP_TIME_SLICING>()); 959 } 960 961 962 963 /** 964 * \brief Exchanges data items annotated by rank into <em>striped</em> arrangement. 
965 * 966 * \par 967 * - \smemreuse 968 * 969 * \tparam OffsetT <b>[inferred]</b> Signed integer type for local offsets 970 */ 971 template <typename OutputT, typename OffsetT> ScatterToStriped(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD])972 __device__ __forceinline__ void ScatterToStriped( 973 InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. 974 OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. 975 OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks 976 { 977 ScatterToStriped(input_items, output_items, ranks, Int2Type<WARP_TIME_SLICING>()); 978 } 979 980 981 982 /** 983 * \brief Exchanges data items annotated by rank into <em>striped</em> arrangement. Items with rank -1 are not exchanged. 984 * 985 * \par 986 * - \smemreuse 987 * 988 * \tparam OffsetT <b>[inferred]</b> Signed integer type for local offsets 989 */ 990 template <typename OutputT, typename OffsetT> ScatterToStripedGuarded(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD])991 __device__ __forceinline__ void ScatterToStripedGuarded( 992 InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. 993 OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. 
994 OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks 995 { 996 #pragma unroll 997 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) 998 { 999 int item_offset = ranks[ITEM]; 1000 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); 1001 if (ranks[ITEM] >= 0) 1002 temp_storage.buff[item_offset] = input_items[ITEM]; 1003 } 1004 1005 CTA_SYNC(); 1006 1007 #pragma unroll 1008 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) 1009 { 1010 int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; 1011 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); 1012 output_items[ITEM] = temp_storage.buff[item_offset]; 1013 } 1014 } 1015 1016 1017 1018 1019 /** 1020 * \brief Exchanges valid data items annotated by rank into <em>striped</em> arrangement. 1021 * 1022 * \par 1023 * - \smemreuse 1024 * 1025 * \tparam OffsetT <b>[inferred]</b> Signed integer type for local offsets 1026 * \tparam ValidFlag <b>[inferred]</b> FlagT type denoting which items are valid 1027 */ 1028 template <typename OutputT, typename OffsetT, typename ValidFlag> ScatterToStripedFlagged(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD],ValidFlag is_valid[ITEMS_PER_THREAD])1029 __device__ __forceinline__ void ScatterToStripedFlagged( 1030 InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. 1031 OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. 
1032 OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks 1033 ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity 1034 { 1035 #pragma unroll 1036 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) 1037 { 1038 int item_offset = ranks[ITEM]; 1039 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); 1040 if (is_valid[ITEM]) 1041 temp_storage.buff[item_offset] = input_items[ITEM]; 1042 } 1043 1044 CTA_SYNC(); 1045 1046 #pragma unroll 1047 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) 1048 { 1049 int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; 1050 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); 1051 output_items[ITEM] = temp_storage.buff[item_offset]; 1052 } 1053 } 1054 1055 1056 //@} end member group 1057 1058 1059 1060 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 1061 1062 StripedToBlocked(InputT items[ITEMS_PER_THREAD])1063 __device__ __forceinline__ void StripedToBlocked( 1064 InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. 1065 { 1066 StripedToBlocked(items, items); 1067 } 1068 BlockedToStriped(InputT items[ITEMS_PER_THREAD])1069 __device__ __forceinline__ void BlockedToStriped( 1070 InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. 1071 { 1072 BlockedToStriped(items, items); 1073 } 1074 WarpStripedToBlocked(InputT items[ITEMS_PER_THREAD])1075 __device__ __forceinline__ void WarpStripedToBlocked( 1076 InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. 
1077 { 1078 WarpStripedToBlocked(items, items); 1079 } 1080 BlockedToWarpStriped(InputT items[ITEMS_PER_THREAD])1081 __device__ __forceinline__ void BlockedToWarpStriped( 1082 InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. 1083 { 1084 BlockedToWarpStriped(items, items); 1085 } 1086 1087 template <typename OffsetT> ScatterToBlocked(InputT items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD])1088 __device__ __forceinline__ void ScatterToBlocked( 1089 InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. 1090 OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks 1091 { 1092 ScatterToBlocked(items, items, ranks); 1093 } 1094 1095 template <typename OffsetT> ScatterToStriped(InputT items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD])1096 __device__ __forceinline__ void ScatterToStriped( 1097 InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. 1098 OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks 1099 { 1100 ScatterToStriped(items, items, ranks); 1101 } 1102 1103 template <typename OffsetT> ScatterToStripedGuarded(InputT items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD])1104 __device__ __forceinline__ void ScatterToStripedGuarded( 1105 InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. 
1106 OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks 1107 { 1108 ScatterToStripedGuarded(items, items, ranks); 1109 } 1110 1111 template <typename OffsetT, typename ValidFlag> ScatterToStripedFlagged(InputT items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD],ValidFlag is_valid[ITEMS_PER_THREAD])1112 __device__ __forceinline__ void ScatterToStripedFlagged( 1113 InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. 1114 OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks 1115 ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity 1116 { 1117 ScatterToStriped(items, items, ranks, is_valid); 1118 } 1119 1120 #endif // DOXYGEN_SHOULD_SKIP_THIS 1121 1122 1123 }; 1124 1125 1126 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 1127 1128 1129 template < 1130 typename T, 1131 int ITEMS_PER_THREAD, 1132 int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, 1133 int PTX_ARCH = CUB_PTX_ARCH> 1134 class WarpExchange 1135 { 1136 private: 1137 1138 /****************************************************************************** 1139 * Constants 1140 ******************************************************************************/ 1141 1142 /// Constants 1143 enum 1144 { 1145 // Whether the logical warp size and the PTX warp size coincide 1146 IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), 1147 1148 WARP_ITEMS = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1, 1149 1150 LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH), 1151 SMEM_BANKS = 1 << LOG_SMEM_BANKS, 1152 1153 // Insert padding if the number of items per thread is a power of two and > 4 (otherwise we can typically use 128b loads) 1154 INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE), 1155 PADDING_ITEMS = (INSERT_PADDING) ? 
(WARP_ITEMS >> LOG_SMEM_BANKS) : 0, 1156 }; 1157 1158 /****************************************************************************** 1159 * Type definitions 1160 ******************************************************************************/ 1161 1162 /// Shared memory storage layout type 1163 struct _TempStorage 1164 { 1165 T buff[WARP_ITEMS + PADDING_ITEMS]; 1166 }; 1167 1168 public: 1169 1170 /// \smemstorage{WarpExchange} 1171 struct TempStorage : Uninitialized<_TempStorage> {}; 1172 1173 private: 1174 1175 1176 /****************************************************************************** 1177 * Thread fields 1178 ******************************************************************************/ 1179 1180 _TempStorage &temp_storage; 1181 int lane_id; 1182 1183 public: 1184 1185 /****************************************************************************** 1186 * Construction 1187 ******************************************************************************/ 1188 1189 /// Constructor WarpExchange(TempStorage & temp_storage)1190 __device__ __forceinline__ WarpExchange( 1191 TempStorage &temp_storage) 1192 : 1193 temp_storage(temp_storage.Alias()), 1194 lane_id(IS_ARCH_WARP ? 1195 LaneId() : 1196 LaneId() % LOGICAL_WARP_THREADS) 1197 {} 1198 1199 1200 /****************************************************************************** 1201 * Interface 1202 ******************************************************************************/ 1203 1204 /** 1205 * \brief Exchanges valid data items annotated by rank into <em>striped</em> arrangement. 
1206 * 1207 * \par 1208 * - \smemreuse 1209 * 1210 * \tparam OffsetT <b>[inferred]</b> Signed integer type for local offsets 1211 */ 1212 template <typename OffsetT> ScatterToStriped(T items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD])1213 __device__ __forceinline__ void ScatterToStriped( 1214 T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange 1215 OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks 1216 { 1217 #pragma unroll 1218 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) 1219 { 1220 if (INSERT_PADDING) ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]); 1221 temp_storage.buff[ranks[ITEM]] = items[ITEM]; 1222 } 1223 1224 WARP_SYNC(0xffffffff); 1225 1226 #pragma unroll 1227 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) 1228 { 1229 int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id; 1230 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); 1231 items[ITEM] = temp_storage.buff[item_offset]; 1232 } 1233 } 1234 1235 }; 1236 1237 1238 1239 1240 #endif // DOXYGEN_SHOULD_SKIP_THIS 1241 1242 1243 1244 1245 1246 } // CUB namespace 1247 CUB_NS_POSTFIX // Optional outer namespace(s) 1248 1249