1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of the NVIDIA CORPORATION nor the
13  *       names of its contributors may be used to endorse or promote products
14  *       derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
29 /**
30  * \file
31  * The cub::BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block.
32  */
33 
34 #pragma once
35 
36 #include "../util_ptx.cuh"
37 #include "../util_arch.cuh"
38 #include "../util_macro.cuh"
39 #include "../util_type.cuh"
40 #include "../util_namespace.cuh"
41 
42 /// Optional outer namespace(s)
43 CUB_NS_PREFIX
44 
45 /// CUB namespace
46 namespace cub {
47 
48 /**
49  * \brief The BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png)
50  * \ingroup BlockModule
51  *
52  * \tparam T                    The data type to be exchanged.
53  * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
54  * \tparam ITEMS_PER_THREAD     The number of items partitioned onto each thread.
55  * \tparam WARP_TIME_SLICING    <b>[optional]</b> When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds.  Yields a smaller memory footprint at the expense of decreased parallelism.  (Default: false)
56  * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
57  * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
58  * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
59  *
60  * \par Overview
61  * - It is commonplace for blocks of threads to rearrange data items between
62  *   threads.  For example, the device-accessible memory subsystem prefers access patterns
63  *   where data items are "striped" across threads (where consecutive threads access consecutive items),
64  *   yet most block-wide operations prefer a "blocked" partitioning of items across threads
65  *   (where consecutive items belong to a single thread).
66  * - BlockExchange supports the following types of data exchanges:
67  *   - Transposing between [<em>blocked</em>](index.html#sec5sec3) and [<em>striped</em>](index.html#sec5sec3) arrangements
68  *   - Transposing between [<em>blocked</em>](index.html#sec5sec3) and [<em>warp-striped</em>](index.html#sec5sec3) arrangements
69  *   - Scattering ranked items to a [<em>blocked arrangement</em>](index.html#sec5sec3)
70  *   - Scattering ranked items to a [<em>striped arrangement</em>](index.html#sec5sec3)
71  * - \rowmajor
72  *
73  * \par A Simple Example
74  * \blockcollective{BlockExchange}
75  * \par
76  * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
77  * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
78  * \par
79  * \code
80  * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
81  *
82  * __global__ void ExampleKernel(int *d_data, ...)
83  * {
84  *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
85  *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
86  *
87  *     // Allocate shared memory for BlockExchange
88  *     __shared__ typename BlockExchange::TempStorage temp_storage;
89  *
90  *     // Load a tile of data striped across threads
91  *     int thread_data[4];
92  *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
93  *
94  *     // Collectively exchange data into a blocked arrangement across threads
95  *     BlockExchange(temp_storage).StripedToBlocked(thread_data);
96  *
97  * \endcode
98  * \par
99  * Suppose the set of striped input \p thread_data across the block of threads is
100  * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt>.
101  * The corresponding output \p thread_data in those threads will be
102  * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
103  *
104  * \par Performance Considerations
105  * - Proper device-specific padding ensures zero bank conflicts for most types.
106  *
107  */
108 template <
109     typename    InputT,
110     int         BLOCK_DIM_X,
111     int         ITEMS_PER_THREAD,
112     bool        WARP_TIME_SLICING   = false,
113     int         BLOCK_DIM_Y         = 1,
114     int         BLOCK_DIM_Z         = 1,
115     int         PTX_ARCH            = CUB_PTX_ARCH>
116 class BlockExchange
117 {
118 private:
119 
120     /******************************************************************************
121      * Constants
122      ******************************************************************************/
123 
124     /// Constants
125     enum
126     {
127         /// The thread block size in threads
128         BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
129 
130         LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
131         WARP_THREADS                = 1 << LOG_WARP_THREADS,
132         WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
133 
134         LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),
135         SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,
136 
137         TILE_ITEMS                  = BLOCK_THREADS * ITEMS_PER_THREAD,
138 
139         TIME_SLICES                 = (WARP_TIME_SLICING) ? WARPS : 1,
140 
141         TIME_SLICED_THREADS         = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS,
142         TIME_SLICED_ITEMS           = TIME_SLICED_THREADS * ITEMS_PER_THREAD,
143 
144         WARP_TIME_SLICED_THREADS    = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
145         WARP_TIME_SLICED_ITEMS      = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD,
146 
147         // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
148         INSERT_PADDING              = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
149         PADDING_ITEMS               = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0,
150     };
151 
152     /******************************************************************************
153      * Type definitions
154      ******************************************************************************/
155 
156     /// Shared memory storage layout type
157     struct __align__(16) _TempStorage
158     {
159         InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS];
160     };
161 
162 public:
163 
164     /// \smemstorage{BlockExchange}
165     struct TempStorage : Uninitialized<_TempStorage> {};
166 
167 private:
168 
169 
170     /******************************************************************************
171      * Thread fields
172      ******************************************************************************/
173 
174     /// Shared storage reference
175     _TempStorage &temp_storage;
176 
177     /// Linear thread-id
178     unsigned int linear_tid;
179     unsigned int lane_id;
180     unsigned int warp_id;
181     unsigned int warp_offset;
182 
183 
184     /******************************************************************************
185      * Utility methods
186      ******************************************************************************/
187 
188     /// Internal storage allocator
PrivateStorage()189     __device__ __forceinline__ _TempStorage& PrivateStorage()
190     {
191         __shared__ _TempStorage private_storage;
192         return private_storage;
193     }
194 
195 
196     /**
197      * Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.  Specialized for no timeslicing.
198      */
199     template <typename OutputT>
BlockedToStriped(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD],Int2Type<false>)200     __device__ __forceinline__ void BlockedToStriped(
201         InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
202         OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
203         Int2Type<false> /*time_slicing*/)
204     {
205         #pragma unroll
206         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
207         {
208             int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
209             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
210             temp_storage.buff[item_offset] = input_items[ITEM];
211         }
212 
213         CTA_SYNC();
214 
215         #pragma unroll
216         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
217         {
218             int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
219             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
220             output_items[ITEM] = temp_storage.buff[item_offset];
221         }
222     }
223 
224 
225     /**
226      * Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.  Specialized for warp-timeslicing.
227      */
228     template <typename OutputT>
BlockedToStriped(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD],Int2Type<true>)229     __device__ __forceinline__ void BlockedToStriped(
230         InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
231         OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
232         Int2Type<true>  /*time_slicing*/)
233     {
234         InputT temp_items[ITEMS_PER_THREAD];
235 
236         #pragma unroll
237         for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
238         {
239             const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
240             const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;
241 
242             CTA_SYNC();
243 
244             if (warp_id == SLICE)
245             {
246                 #pragma unroll
247                 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
248                 {
249                     int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
250                     if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
251                     temp_storage.buff[item_offset] = input_items[ITEM];
252                 }
253             }
254 
255             CTA_SYNC();
256 
257             #pragma unroll
258             for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
259             {
260                 // Read a strip of items
261                 const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
262                 const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
263 
264                 if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
265                 {
266                     int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
267                     if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
268                     {
269                         if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
270                         temp_items[ITEM] = temp_storage.buff[item_offset];
271                     }
272                 }
273             }
274         }
275 
276         // Copy
277         #pragma unroll
278         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
279         {
280             output_items[ITEM] = temp_items[ITEM];
281         }
282     }
283 
284 
285     /**
286      * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. Specialized for no timeslicing
287      */
288     template <typename OutputT>
BlockedToWarpStriped(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD],Int2Type<false>)289     __device__ __forceinline__ void BlockedToWarpStriped(
290         InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
291         OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
292         Int2Type<false> /*time_slicing*/)
293     {
294         #pragma unroll
295         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
296         {
297             int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
298             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
299             temp_storage.buff[item_offset] = input_items[ITEM];
300         }
301 
302         WARP_SYNC(0xffffffff);
303 
304         #pragma unroll
305         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
306         {
307             int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
308             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
309             output_items[ITEM] = temp_storage.buff[item_offset];
310         }
311     }
312 
313     /**
314      * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. Specialized for warp-timeslicing
315      */
316     template <typename OutputT>
BlockedToWarpStriped(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD],Int2Type<true>)317     __device__ __forceinline__ void BlockedToWarpStriped(
318         InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
319         OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
320         Int2Type<true>  /*time_slicing*/)
321     {
322         if (warp_id == 0)
323         {
324             #pragma unroll
325             for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
326             {
327                 int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
328                 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
329                 temp_storage.buff[item_offset] = input_items[ITEM];
330             }
331 
332             WARP_SYNC(0xffffffff);
333 
334             #pragma unroll
335             for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
336             {
337                 int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
338                 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
339                 output_items[ITEM] = temp_storage.buff[item_offset];
340             }
341         }
342 
343         #pragma unroll
344         for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE)
345         {
346             CTA_SYNC();
347 
348             if (warp_id == SLICE)
349             {
350                 #pragma unroll
351                 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
352                 {
353                     int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
354                     if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
355                     temp_storage.buff[item_offset] = input_items[ITEM];
356                 }
357 
358                 WARP_SYNC(0xffffffff);
359 
360                 #pragma unroll
361                 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
362                 {
363                     int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
364                     if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
365                     output_items[ITEM] = temp_storage.buff[item_offset];
366                 }
367             }
368         }
369     }
370 
371 
372     /**
373      * Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for no timeslicing.
374      */
375     template <typename OutputT>
StripedToBlocked(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD],Int2Type<false>)376     __device__ __forceinline__ void StripedToBlocked(
377         InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
378         OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
379         Int2Type<false> /*time_slicing*/)
380     {
381         #pragma unroll
382         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
383         {
384             int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
385             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
386             temp_storage.buff[item_offset] = input_items[ITEM];
387         }
388 
389         CTA_SYNC();
390 
391         // No timeslicing
392         #pragma unroll
393         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
394         {
395             int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
396             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
397             output_items[ITEM] = temp_storage.buff[item_offset];
398         }
399     }
400 
401 
402     /**
403      * Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for warp-timeslicing.
404      */
405     template <typename OutputT>
StripedToBlocked(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD],Int2Type<true>)406     __device__ __forceinline__ void StripedToBlocked(
407         InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
408         OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
409         Int2Type<true>  /*time_slicing*/)
410     {
411         // Warp time-slicing
412         InputT temp_items[ITEMS_PER_THREAD];
413 
414         #pragma unroll
415         for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
416         {
417             const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
418             const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;
419 
420             CTA_SYNC();
421 
422             #pragma unroll
423             for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
424             {
425                 // Write a strip of items
426                 const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
427                 const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
428 
429                 if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
430                 {
431                     int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
432                     if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
433                     {
434                         if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
435                         temp_storage.buff[item_offset] = input_items[ITEM];
436                     }
437                 }
438             }
439 
440             CTA_SYNC();
441 
442             if (warp_id == SLICE)
443             {
444                 #pragma unroll
445                 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
446                 {
447                     int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
448                     if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
449                     temp_items[ITEM] = temp_storage.buff[item_offset];
450                 }
451             }
452         }
453 
454         // Copy
455         #pragma unroll
456         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
457         {
458             output_items[ITEM] = temp_items[ITEM];
459         }
460     }
461 
462 
463     /**
464      * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for no timeslicing
465      */
466     template <typename OutputT>
WarpStripedToBlocked(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD],Int2Type<false>)467     __device__ __forceinline__ void WarpStripedToBlocked(
468         InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
469         OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
470         Int2Type<false> /*time_slicing*/)
471     {
472         #pragma unroll
473         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
474         {
475             int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
476             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
477             temp_storage.buff[item_offset] = input_items[ITEM];
478         }
479 
480         WARP_SYNC(0xffffffff);
481 
482         #pragma unroll
483         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
484         {
485             int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
486             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
487             output_items[ITEM] = temp_storage.buff[item_offset];
488         }
489     }
490 
491 
492     /**
493      * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for warp-timeslicing
494      */
495     template <typename OutputT>
WarpStripedToBlocked(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD],Int2Type<true>)496     __device__ __forceinline__ void WarpStripedToBlocked(
497         InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
498         OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
499         Int2Type<true>  /*time_slicing*/)
500     {
501         #pragma unroll
502         for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
503         {
504             CTA_SYNC();
505 
506             if (warp_id == SLICE)
507             {
508                 #pragma unroll
509                 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
510                 {
511                     int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
512                     if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
513                     temp_storage.buff[item_offset] = input_items[ITEM];
514                 }
515 
516                 WARP_SYNC(0xffffffff);
517 
518                 #pragma unroll
519                 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
520                 {
521                     int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
522                     if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
523                     output_items[ITEM] = temp_storage.buff[item_offset];
524                 }
525             }
526         }
527     }
528 
529 
530     /**
531      * Exchanges data items annotated by rank into <em>blocked</em> arrangement.  Specialized for no timeslicing.
532      */
533     template <typename OutputT, typename OffsetT>
ScatterToBlocked(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD],Int2Type<false>)534     __device__ __forceinline__ void ScatterToBlocked(
535         InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
536         OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
537         OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
538         Int2Type<false> /*time_slicing*/)
539     {
540         #pragma unroll
541         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
542         {
543             int item_offset = ranks[ITEM];
544             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
545             temp_storage.buff[item_offset] = input_items[ITEM];
546         }
547 
548         CTA_SYNC();
549 
550         #pragma unroll
551         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
552         {
553             int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
554             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
555             output_items[ITEM] = temp_storage.buff[item_offset];
556         }
557     }
558 
559     /**
560      * Exchanges data items annotated by rank into <em>blocked</em> arrangement.  Specialized for warp-timeslicing.
561      */
562     template <typename OutputT, typename OffsetT>
ScatterToBlocked(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD],Int2Type<true>)563     __device__ __forceinline__ void ScatterToBlocked(
564         InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
565         OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
566         OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
567         Int2Type<true>  /*time_slicing*/)
568     {
569         InputT temp_items[ITEMS_PER_THREAD];
570 
571         #pragma unroll
572         for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
573         {
574             CTA_SYNC();
575 
576             const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE;
577 
578             #pragma unroll
579             for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
580             {
581                 int item_offset = ranks[ITEM] - SLICE_OFFSET;
582                 if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
583                 {
584                     if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
585                     temp_storage.buff[item_offset] = input_items[ITEM];
586                 }
587             }
588 
589             CTA_SYNC();
590 
591             if (warp_id == SLICE)
592             {
593                 #pragma unroll
594                 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
595                 {
596                     int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
597                     if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
598                     temp_items[ITEM] = temp_storage.buff[item_offset];
599                 }
600             }
601         }
602 
603         // Copy
604         #pragma unroll
605         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
606         {
607             output_items[ITEM] = temp_items[ITEM];
608         }
609     }
610 
611 
612     /**
613      * Exchanges data items annotated by rank into <em>striped</em> arrangement.  Specialized for no timeslicing.
614      */
615     template <typename OutputT, typename OffsetT>
ScatterToStriped(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD],Int2Type<false>)616     __device__ __forceinline__ void ScatterToStriped(
617         InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
618         OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
619         OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
620         Int2Type<false> /*time_slicing*/)
621     {
622         #pragma unroll
623         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
624         {
625             int item_offset = ranks[ITEM];
626             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
627             temp_storage.buff[item_offset] = input_items[ITEM];
628         }
629 
630         CTA_SYNC();
631 
632         #pragma unroll
633         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
634         {
635             int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
636             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
637             output_items[ITEM] = temp_storage.buff[item_offset];
638         }
639     }
640 
641 
642     /**
643      * Exchanges data items annotated by rank into <em>striped</em> arrangement.  Specialized for warp-timeslicing.
644      */
645     template <typename OutputT, typename OffsetT>
ScatterToStriped(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD],Int2Type<true>)646     __device__ __forceinline__ void ScatterToStriped(
647         InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
648         OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
649         OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
650         Int2Type<true> /*time_slicing*/)
651     {
652         InputT temp_items[ITEMS_PER_THREAD];
653 
654         #pragma unroll
655         for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
656         {
657             const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
658             const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;
659 
660             CTA_SYNC();
661 
662             #pragma unroll
663             for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
664             {
665                 int item_offset = ranks[ITEM] - SLICE_OFFSET;
666                 if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
667                 {
668                     if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
669                     temp_storage.buff[item_offset] = input_items[ITEM];
670                 }
671             }
672 
673             CTA_SYNC();
674 
675             #pragma unroll
676             for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
677             {
678                 // Read a strip of items
679                 const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
680                 const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
681 
682                 if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
683                 {
684                     int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
685                     if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
686                     {
687                         if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
688                         temp_items[ITEM] = temp_storage.buff[item_offset];
689                     }
690                 }
691             }
692         }
693 
694         // Copy
695         #pragma unroll
696         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
697         {
698             output_items[ITEM] = temp_items[ITEM];
699         }
700     }
701 
702 
703 public:
704 
705     /******************************************************************//**
706      * \name Collective constructors
707      *********************************************************************/
708     //@{
709 
710     /**
711      * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
712      */
BlockExchange()713     __device__ __forceinline__ BlockExchange()
714     :
715         temp_storage(PrivateStorage()),
716         linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
717         warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
718         lane_id(LaneId()),
719         warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
720     {}
721 
722 
723     /**
724      * \brief Collective constructor using the specified memory allocation as temporary storage.
725      */
BlockExchange(TempStorage & temp_storage)726     __device__ __forceinline__ BlockExchange(
727         TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
728     :
729         temp_storage(temp_storage.Alias()),
730         linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
731         lane_id(LaneId()),
732         warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
733         warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
734     {}
735 
736 
737     //@}  end member group
738     /******************************************************************//**
739      * \name Structured exchanges
740      *********************************************************************/
741     //@{
742 
743     /**
744      * \brief Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.
745      *
746      * \par
747      * - \smemreuse
748      *
749      * \par Snippet
750      * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement
751      * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
752      * \par
753      * \code
754      * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
755      *
756      * __global__ void ExampleKernel(int *d_data, ...)
757      * {
758      *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
759      *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
760      *
761      *     // Allocate shared memory for BlockExchange
762      *     __shared__ typename BlockExchange::TempStorage temp_storage;
763      *
764      *     // Load a tile of ordered data into a striped arrangement across block threads
765      *     int thread_data[4];
766      *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
767      *
768      *     // Collectively exchange data into a blocked arrangement across threads
769      *     BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
770      *
771      * \endcode
772      * \par
773      * Suppose the set of striped input \p thread_data across the block of threads is
774      * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> after loading from device-accessible memory.
775      * The corresponding output \p thread_data in those threads will be
776      * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
777      *
778      */
779     template <typename OutputT>
StripedToBlocked(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD])780     __device__ __forceinline__ void StripedToBlocked(
781         InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
782         OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
783     {
784         StripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
785     }
786 
787 
788     /**
789      * \brief Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.
790      *
791      * \par
792      * - \smemreuse
793      *
794      * \par Snippet
795      * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
796      * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
797      * \par
798      * \code
799      * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
800      *
801      * __global__ void ExampleKernel(int *d_data, ...)
802      * {
803      *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
804      *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
805      *
806      *     // Allocate shared memory for BlockExchange
807      *     __shared__ typename BlockExchange::TempStorage temp_storage;
808      *
809      *     // Obtain a segment of consecutive items that are blocked across threads
810      *     int thread_data[4];
811      *     ...
812      *
813      *     // Collectively exchange data into a striped arrangement across threads
814      *     BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data);
815      *
816      *     // Store data striped across block threads into an ordered tile
817      *     cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
818      *
819      * \endcode
820      * \par
821      * Suppose the set of blocked input \p thread_data across the block of threads is
822      * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
823      * The corresponding output \p thread_data in those threads will be
824      * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> in
825      * preparation for storing to device-accessible memory.
826      *
827      */
828     template <typename OutputT>
BlockedToStriped(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD])829     __device__ __forceinline__ void BlockedToStriped(
830         InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
831         OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
832     {
833         BlockedToStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
834     }
835 
836 
837 
838     /**
839      * \brief Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.
840      *
841      * \par
842      * - \smemreuse
843      *
844      * \par Snippet
845      * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement
846      * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
847      * \par
848      * \code
849      * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
850      *
851      * __global__ void ExampleKernel(int *d_data, ...)
852      * {
853      *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
854      *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
855      *
856      *     // Allocate shared memory for BlockExchange
857      *     __shared__ typename BlockExchange::TempStorage temp_storage;
858      *
859      *     // Load a tile of ordered data into a warp-striped arrangement across warp threads
860      *     int thread_data[4];
861      *     cub::LoadSWarptriped<LOAD_DEFAULT>(threadIdx.x, d_data, thread_data);
862      *
863      *     // Collectively exchange data into a blocked arrangement across threads
864      *     BlockExchange(temp_storage).WarpStripedToBlocked(thread_data);
865      *
866      * \endcode
867      * \par
868      * Suppose the set of warp-striped input \p thread_data across the block of threads is
869      * <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt>
870      * after loading from device-accessible memory.  (The first 128 items are striped across
871      * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
872      * The corresponding output \p thread_data in those threads will be
873      * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
874      *
875      */
876     template <typename OutputT>
WarpStripedToBlocked(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD])877     __device__ __forceinline__ void WarpStripedToBlocked(
878         InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
879         OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
880     {
881         WarpStripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
882     }
883 
884 
885 
886     /**
887      * \brief Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement.
888      *
889      * \par
890      * - \smemreuse
891      *
892      * \par Snippet
893      * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement
894      * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
895      * \par
896      * \code
897      * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
898      *
899      * __global__ void ExampleKernel(int *d_data, ...)
900      * {
901      *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
902      *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
903      *
904      *     // Allocate shared memory for BlockExchange
905      *     __shared__ typename BlockExchange::TempStorage temp_storage;
906      *
907      *     // Obtain a segment of consecutive items that are blocked across threads
908      *     int thread_data[4];
909      *     ...
910      *
911      *     // Collectively exchange data into a warp-striped arrangement across threads
912      *     BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data);
913      *
914      *     // Store data striped across warp threads into an ordered tile
915      *     cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
916      *
917      * \endcode
918      * \par
919      * Suppose the set of blocked input \p thread_data across the block of threads is
920      * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
921      * The corresponding output \p thread_data in those threads will be
922      * <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt>
923      * in preparation for storing to device-accessible memory. (The first 128 items are striped across
924      * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
925      *
926      */
927     template <typename OutputT>
BlockedToWarpStriped(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD])928     __device__ __forceinline__ void BlockedToWarpStriped(
929         InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
930         OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
931     {
932         BlockedToWarpStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
933     }
934 
935 
936 
937     //@}  end member group
938     /******************************************************************//**
939      * \name Scatter exchanges
940      *********************************************************************/
941     //@{
942 
943 
944     /**
945      * \brief Exchanges data items annotated by rank into <em>blocked</em> arrangement.
946      *
947      * \par
948      * - \smemreuse
949      *
950      * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
951      */
952     template <typename OutputT, typename OffsetT>
ScatterToBlocked(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD])953     __device__ __forceinline__ void ScatterToBlocked(
954         InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
955         OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
956         OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Corresponding scatter ranks
957     {
958         ScatterToBlocked(input_items, output_items, ranks, Int2Type<WARP_TIME_SLICING>());
959     }
960 
961 
962 
963     /**
964      * \brief Exchanges data items annotated by rank into <em>striped</em> arrangement.
965      *
966      * \par
967      * - \smemreuse
968      *
969      * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
970      */
971     template <typename OutputT, typename OffsetT>
ScatterToStriped(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD])972     __device__ __forceinline__ void ScatterToStriped(
973         InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
974         OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
975         OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Corresponding scatter ranks
976     {
977         ScatterToStriped(input_items, output_items, ranks, Int2Type<WARP_TIME_SLICING>());
978     }
979 
980 
981 
982     /**
983      * \brief Exchanges data items annotated by rank into <em>striped</em> arrangement.  Items with rank -1 are not exchanged.
984      *
985      * \par
986      * - \smemreuse
987      *
988      * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
989      */
990     template <typename OutputT, typename OffsetT>
ScatterToStripedGuarded(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD])991     __device__ __forceinline__ void ScatterToStripedGuarded(
992         InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
993         OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
994         OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Corresponding scatter ranks
995     {
996         #pragma unroll
997         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
998         {
999             int item_offset = ranks[ITEM];
1000             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
1001             if (ranks[ITEM] >= 0)
1002                 temp_storage.buff[item_offset] = input_items[ITEM];
1003         }
1004 
1005         CTA_SYNC();
1006 
1007         #pragma unroll
1008         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
1009         {
1010             int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
1011             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
1012             output_items[ITEM] = temp_storage.buff[item_offset];
1013         }
1014     }
1015 
1016 
1017 
1018 
1019     /**
1020      * \brief Exchanges valid data items annotated by rank into <em>striped</em> arrangement.
1021      *
1022      * \par
1023      * - \smemreuse
1024      *
1025      * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
1026      * \tparam ValidFlag                            <b>[inferred]</b> FlagT type denoting which items are valid
1027      */
1028     template <typename OutputT, typename OffsetT, typename ValidFlag>
ScatterToStripedFlagged(InputT input_items[ITEMS_PER_THREAD],OutputT output_items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD],ValidFlag is_valid[ITEMS_PER_THREAD])1029     __device__ __forceinline__ void ScatterToStripedFlagged(
1030         InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
1031         OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
1032         OffsetT     ranks[ITEMS_PER_THREAD],            ///< [in] Corresponding scatter ranks
1033         ValidFlag   is_valid[ITEMS_PER_THREAD])         ///< [in] Corresponding flag denoting item validity
1034     {
1035         #pragma unroll
1036         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
1037         {
1038             int item_offset = ranks[ITEM];
1039             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
1040             if (is_valid[ITEM])
1041                 temp_storage.buff[item_offset] = input_items[ITEM];
1042         }
1043 
1044         CTA_SYNC();
1045 
1046         #pragma unroll
1047         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
1048         {
1049             int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
1050             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
1051             output_items[ITEM] = temp_storage.buff[item_offset];
1052         }
1053     }
1054 
1055 
1056     //@}  end member group
1057 
1058 
1059 
1060 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
1061 
1062 
StripedToBlocked(InputT items[ITEMS_PER_THREAD])1063     __device__ __forceinline__ void StripedToBlocked(
1064         InputT      items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
1065     {
1066         StripedToBlocked(items, items);
1067     }
1068 
BlockedToStriped(InputT items[ITEMS_PER_THREAD])1069     __device__ __forceinline__ void BlockedToStriped(
1070         InputT      items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
1071     {
1072         BlockedToStriped(items, items);
1073     }
1074 
WarpStripedToBlocked(InputT items[ITEMS_PER_THREAD])1075     __device__ __forceinline__ void WarpStripedToBlocked(
1076         InputT      items[ITEMS_PER_THREAD])    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
1077     {
1078         WarpStripedToBlocked(items, items);
1079     }
1080 
BlockedToWarpStriped(InputT items[ITEMS_PER_THREAD])1081     __device__ __forceinline__ void BlockedToWarpStriped(
1082         InputT      items[ITEMS_PER_THREAD])    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
1083     {
1084         BlockedToWarpStriped(items, items);
1085     }
1086 
1087     template <typename OffsetT>
ScatterToBlocked(InputT items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD])1088     __device__ __forceinline__ void ScatterToBlocked(
1089         InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
1090         OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
1091     {
1092         ScatterToBlocked(items, items, ranks);
1093     }
1094 
1095     template <typename OffsetT>
ScatterToStriped(InputT items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD])1096     __device__ __forceinline__ void ScatterToStriped(
1097         InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
1098         OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
1099     {
1100         ScatterToStriped(items, items, ranks);
1101     }
1102 
1103     template <typename OffsetT>
ScatterToStripedGuarded(InputT items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD])1104     __device__ __forceinline__ void ScatterToStripedGuarded(
1105         InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
1106         OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
1107     {
1108         ScatterToStripedGuarded(items, items, ranks);
1109     }
1110 
1111     template <typename OffsetT, typename ValidFlag>
ScatterToStripedFlagged(InputT items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD],ValidFlag is_valid[ITEMS_PER_THREAD])1112     __device__ __forceinline__ void ScatterToStripedFlagged(
1113         InputT      items[ITEMS_PER_THREAD],        ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
1114         OffsetT     ranks[ITEMS_PER_THREAD],        ///< [in] Corresponding scatter ranks
1115         ValidFlag   is_valid[ITEMS_PER_THREAD])     ///< [in] Corresponding flag denoting item validity
1116     {
1117         ScatterToStriped(items, items, ranks, is_valid);
1118     }
1119 
1120 #endif // DOXYGEN_SHOULD_SKIP_THIS
1121 
1122 
1123 };
1124 
1125 
1126 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
1127 
1128 
1129 template <
1130     typename    T,
1131     int         ITEMS_PER_THREAD,
1132     int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
1133     int         PTX_ARCH                = CUB_PTX_ARCH>
1134 class WarpExchange
1135 {
1136 private:
1137 
1138     /******************************************************************************
1139      * Constants
1140      ******************************************************************************/
1141 
1142     /// Constants
1143     enum
1144     {
1145         // Whether the logical warp size and the PTX warp size coincide
1146         IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
1147 
1148         WARP_ITEMS                  = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1,
1149 
1150         LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),
1151         SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,
1152 
1153         // Insert padding if the number of items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
1154         INSERT_PADDING              = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
1155         PADDING_ITEMS               = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0,
1156     };
1157 
1158     /******************************************************************************
1159      * Type definitions
1160      ******************************************************************************/
1161 
1162     /// Shared memory storage layout type
1163     struct _TempStorage
1164     {
1165         T buff[WARP_ITEMS + PADDING_ITEMS];
1166     };
1167 
1168 public:
1169 
1170     /// \smemstorage{WarpExchange}
1171     struct TempStorage : Uninitialized<_TempStorage> {};
1172 
1173 private:
1174 
1175 
1176     /******************************************************************************
1177      * Thread fields
1178      ******************************************************************************/
1179 
1180     _TempStorage    &temp_storage;
1181     int             lane_id;
1182 
1183 public:
1184 
1185     /******************************************************************************
1186      * Construction
1187      ******************************************************************************/
1188 
1189     /// Constructor
WarpExchange(TempStorage & temp_storage)1190     __device__ __forceinline__ WarpExchange(
1191         TempStorage &temp_storage)
1192     :
1193         temp_storage(temp_storage.Alias()),
1194         lane_id(IS_ARCH_WARP ?
1195             LaneId() :
1196             LaneId() % LOGICAL_WARP_THREADS)
1197     {}
1198 
1199 
1200     /******************************************************************************
1201      * Interface
1202      ******************************************************************************/
1203 
1204     /**
1205      * \brief Exchanges valid data items annotated by rank into <em>striped</em> arrangement.
1206      *
1207      * \par
1208      * - \smemreuse
1209      *
1210      * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
1211      */
1212     template <typename OffsetT>
ScatterToStriped(T items[ITEMS_PER_THREAD],OffsetT ranks[ITEMS_PER_THREAD])1213     __device__ __forceinline__ void ScatterToStriped(
1214         T               items[ITEMS_PER_THREAD],        ///< [in-out] Items to exchange
1215         OffsetT         ranks[ITEMS_PER_THREAD])        ///< [in] Corresponding scatter ranks
1216     {
1217         #pragma unroll
1218         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
1219         {
1220             if (INSERT_PADDING) ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]);
1221             temp_storage.buff[ranks[ITEM]] = items[ITEM];
1222         }
1223 
1224         WARP_SYNC(0xffffffff);
1225 
1226         #pragma unroll
1227         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
1228         {
1229             int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id;
1230             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
1231             items[ITEM] = temp_storage.buff[item_offset];
1232         }
1233     }
1234 
1235 };
1236 
1237 
1238 
1239 
1240 #endif // DOXYGEN_SHOULD_SKIP_THIS
1241 
1242 
1243 
1244 
1245 
1246 }               // CUB namespace
1247 CUB_NS_POSTFIX  // Optional outer namespace(s)
1248 
1249