/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * Operations for writing linear segments of data from the CUDA thread block
 */

#pragma once

#include <iterator>

#include "block_exchange.cuh"
#include "../util_ptx.cuh"
#include "../util_macro.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \addtogroup UtilIo
 * @{
 */


/******************************************************************//**
 * \name Blocked arrangement I/O (direct)
 *********************************************************************/
//@{

/**
 * \brief Store a blocked arrangement of items across a thread block into a linear segment of items.
 *
 * \blocked
 *
 * \tparam T                    <b>[inferred]</b> The data type to store.
 * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
 * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
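 *
 * \par Snippet
 * As an illustrative sketch (not part of the original documentation), the call below
 * writes a blocked arrangement of 4 integers per thread from a hypothetical 1D kernel,
 * where \p d_data is an assumed device output pointer:
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
 *
 * __global__ void ExampleKernel(int *d_data)
 * {
 *     // Each thread owns 4 consecutive output items (initialized here for illustration)
 *     int thread_data[4] = {0, 1, 2, 3};
 *
 *     // Thread i writes its items to d_data[i*4 .. i*4+3]
 *     cub::StoreDirectBlocked(threadIdx.x, d_data, thread_data);
 * }
 * \endcode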
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectBlocked(
    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
    OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);

    // Store directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        thread_itr[ITEM] = items[ITEM];
    }
}


/**
 * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range
 *
 * \blocked
 *
 * \tparam T                    <b>[inferred]</b> The data type to store.
 * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
 * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
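 *
 * \par Snippet
 * As an illustrative sketch (not part of the original documentation), the guarded call
 * below writes only the first \p valid_items items of the tile; \p d_data and
 * \p valid_items are assumed kernel arguments:
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
 *
 * __global__ void ExampleKernel(int *d_data, int valid_items)
 * {
 *     // Each thread owns 4 consecutive output items
 *     int thread_data[4] = {0, 1, 2, 3};
 *
 *     // Writes at or beyond valid_items are suppressed
 *     cub::StoreDirectBlocked(threadIdx.x, d_data, thread_data, valid_items);
 * }
 * \endcode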
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectBlocked(
    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
    int                 valid_items)                ///< [in] Number of valid items to write
{
    OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);

    // Store directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items)
        {
            thread_itr[ITEM] = items[ITEM];
        }
    }
}


/**
 * \brief Store a blocked arrangement of items across a thread block into a linear segment of items.
 *
 * \blocked
 *
 * The output pointer \p block_ptr must be quad-item aligned, an alignment
 * guaranteed by the default allocations returned by \p cudaMalloc()
 *
 * \par
 * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT:
 *   - \p ITEMS_PER_THREAD is odd
 *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
 *
 * \tparam T                    <b>[inferred]</b> The data type to store.
 * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
 *
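 * \par Snippet
 * As an illustrative sketch (not part of the original documentation), the call below uses
 * vectorized stores for 4 integers per thread; \p d_data is an assumed device pointer with
 * the alignment provided by \p cudaMalloc():
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
 *
 * __global__ void ExampleKernel(int *d_data)
 * {
 *     // Each thread owns 4 consecutive output items
 *     int thread_data[4] = {0, 1, 2, 3};
 *
 *     // ITEMS_PER_THREAD == 4, so a single vectorized store may be emitted per thread
 *     cub::StoreDirectBlockedVectorized(threadIdx.x, d_data, thread_data);
 * }
 * \endcode
 *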
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD>
__device__ __forceinline__ void StoreDirectBlockedVectorized(
    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
    T                   *block_ptr,                 ///< [in] The thread block's base output pointer for storing to
    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
    enum
    {
        // Maximum CUDA vector size is 4 elements
        MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),

        // Vector size must be a power of two and an even divisor of the items per thread
        VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
            MAX_VEC_SIZE :
            1,

        VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
    };

    // Vector type
    typedef typename CubVector<T, VEC_SIZE>::Type Vector;

    // Alias global pointer
    Vector *block_ptr_vectors = reinterpret_cast<Vector*>(const_cast<T*>(block_ptr));

    // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
    Vector raw_vector[VECTORS_PER_THREAD];
    T *raw_items = reinterpret_cast<T*>(raw_vector);

    // Copy
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        raw_items[ITEM] = items[ITEM];
    }

    // Direct-store using vector types
    StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector);
}



//@}  end member group
/******************************************************************//**
 * \name Striped arrangement I/O (direct)
 *********************************************************************/
//@{


/**
 * \brief Store a striped arrangement of data across the thread block into a linear segment of items.
 *
 * \striped
 *
 * \tparam BLOCK_THREADS        The thread block size in threads
 * \tparam T                    <b>[inferred]</b> The data type to store.
 * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
 * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
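 *
 * \par Snippet
 * As an illustrative sketch (not part of the original documentation), the call below writes
 * a striped arrangement from a hypothetical block of 128 threads; the \p BLOCK_THREADS
 * template argument must match the block size:
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
 *
 * __global__ void ExampleKernel(int *d_data)
 * {
 *     // Each thread owns 4 items, strided by the block size in memory
 *     int thread_data[4] = {0, 1, 2, 3};
 *
 *     // Thread i writes to d_data[i], d_data[i + 128], d_data[i + 256], d_data[i + 384]
 *     cub::StoreDirectStriped<128>(threadIdx.x, d_data, thread_data);
 * }
 * \endcode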
 */
template <
    int                 BLOCK_THREADS,
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectStriped(
    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
    OutputIteratorT thread_itr = block_itr + linear_tid;

    // Store directly in striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
    }
}


/**
 * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range
 *
 * \striped
 *
 * \tparam BLOCK_THREADS        The thread block size in threads
 * \tparam T                    <b>[inferred]</b> The data type to store.
 * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
 * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
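 *
 * \par Snippet
 * As an illustrative sketch (not part of the original documentation), the guarded call
 * below is identical to the unguarded variant except that writes at or beyond
 * \p valid_items (an assumed kernel argument) are suppressed:
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
 *
 * __global__ void ExampleKernel(int *d_data, int valid_items)
 * {
 *     int thread_data[4] = {0, 1, 2, 3};
 *
 *     // Assuming a block of 128 threads; thread i's k-th item targets d_data[k*128 + i]
 *     cub::StoreDirectStriped<128>(threadIdx.x, d_data, thread_data, valid_items);
 * }
 * \endcode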
 */
template <
    int                 BLOCK_THREADS,
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectStriped(
    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
    int                 valid_items)                ///< [in] Number of valid items to write
{
    OutputIteratorT thread_itr = block_itr + linear_tid;

    // Store directly in striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items)
        {
            thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
        }
    }
}



//@}  end member group
/******************************************************************//**
 * \name Warp-striped arrangement I/O (direct)
 *********************************************************************/
//@{


/**
 * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items.
 *
 * \warpstriped
 *
 * \par Usage Considerations
 * The number of threads in the thread block must be a multiple of the architecture's warp size.
 *
 * \tparam T                    <b>[inferred]</b> The data type to store.
 * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
 * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
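 *
 * \par Snippet
 * As an illustrative sketch (not part of the original documentation), the call below writes
 * a warp-striped arrangement; within each warp, lane \e l writes its item \e k to offset
 * <tt>warp_offset + (k * warp_size) + l</tt>:
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
 *
 * __global__ void ExampleKernel(int *d_data)
 * {
 *     // Each thread owns 4 items laid out in warp-striped order
 *     int thread_data[4] = {0, 1, 2, 3};
 *
 *     cub::StoreDirectWarpStriped(threadIdx.x, d_data, thread_data);
 * }
 * \endcode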
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectWarpStriped(
    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    OutputIteratorT thread_itr = block_itr + warp_offset + tid;

    // Store directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
    }
}


/**
 * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range
 *
 * \warpstriped
 *
 * \par Usage Considerations
 * The number of threads in the thread block must be a multiple of the architecture's warp size.
 *
 * \tparam T                    <b>[inferred]</b> The data type to store.
 * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
 * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
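 *
 * \par Snippet
 * As an illustrative sketch (not part of the original documentation), the guarded call
 * below suppresses writes at or beyond \p valid_items (an assumed kernel argument):
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
 *
 * __global__ void ExampleKernel(int *d_data, int valid_items)
 * {
 *     int thread_data[4] = {0, 1, 2, 3};
 *
 *     cub::StoreDirectWarpStriped(threadIdx.x, d_data, thread_data, valid_items);
 * }
 * \endcode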
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectWarpStriped(
    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
    int                 valid_items)                ///< [in] Number of valid items to write
{
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    OutputIteratorT thread_itr = block_itr + warp_offset + tid;

    // Store directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
        {
            thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
        }
    }
}


//@}  end member group


/** @} */       // end group UtilIo


//-----------------------------------------------------------------------------
// Generic BlockStore abstraction
//-----------------------------------------------------------------------------

/**
 * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory.
 */
enum BlockStoreAlgorithm
{
    /**
     * \par Overview
     *
     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
     * directly to memory.
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) decreases as the
     *   access stride between threads increases (i.e., the number of items per thread).
     */
    BLOCK_STORE_DIRECT,

    /**
     * \par Overview
     *
     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written directly
     * to memory using CUDA's built-in vectorized stores as a coalescing optimization.
     * For example, <tt>st.global.v4.s32</tt> instructions will be generated
     * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) remains high until the
     *   access stride between threads (i.e., the number of items per thread) exceeds the
     *   maximum vector store width (typically 4 items or 64B, whichever is lower).
     * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT:
     *   - \p ITEMS_PER_THREAD is odd
     *   - The \p OutputIteratorT is not a simple pointer type
     *   - The block output offset is not quadword-aligned
     *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
     */
    BLOCK_STORE_VECTORIZE,

    /**
     * \par Overview
     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
     * transposed and then efficiently written to memory as a [<em>striped arrangement</em>](index.html#sec5sec3).
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) remains high regardless
     *   of items written per thread.
     * - The local reordering incurs slightly longer latencies and lower throughput than the
     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
     */
    BLOCK_STORE_TRANSPOSE,

    /**
     * \par Overview
     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
     * transposed and then efficiently written to memory as a
     * [<em>warp-striped arrangement</em>](index.html#sec5sec3).
     *
     * \par Usage Considerations
     * - BLOCK_THREADS must be a multiple of WARP_THREADS
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) remains high regardless
     *   of items written per thread.
     * - The local reordering incurs slightly longer latencies and lower throughput than the
     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
     */
    BLOCK_STORE_WARP_TRANSPOSE,

    /**
     * \par Overview
     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
     * transposed and then efficiently written to memory as a
     * [<em>warp-striped arrangement</em>](index.html#sec5sec3).
     * To reduce the shared memory requirement, only one warp's worth of shared
     * memory is provisioned and is subsequently time-sliced among warps.
     *
     * \par Usage Considerations
     * - BLOCK_THREADS must be a multiple of WARP_THREADS
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) remains high regardless
     *   of items written per thread.
     * - Provisions less shared memory temporary storage, but incurs larger
     *   latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative.
     */
    BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,

};


/**
 * \brief The BlockStore class provides [<em>collective</em>](index.html#sec0) data movement methods for writing a [<em>blocked arrangement</em>](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory.  ![](block_store_logo.png)
 * \ingroup BlockModule
 * \ingroup UtilIo
 *
 * \tparam T                    The type of data to be written.
 * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
 * \tparam ITEMS_PER_THREAD     The number of consecutive items partitioned onto each thread.
 * \tparam ALGORITHM            <b>[optional]</b> cub::BlockStoreAlgorithm tuning policy enumeration.  default: cub::BLOCK_STORE_DIRECT.
 * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
 * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
 * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
 *
 * \par Overview
 * - The BlockStore class provides a single data movement abstraction that can be specialized
 *   to implement different cub::BlockStoreAlgorithm strategies.  This facilitates different
 *   performance policies for different architectures, data types, granularity sizes, etc.
 * - BlockStore can be optionally specialized by different data movement strategies:
 *   -# <b>cub::BLOCK_STORE_DIRECT</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
 *      directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
 *   -# <b>cub::BLOCK_STORE_VECTORIZE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
 *      of data is written directly to memory using CUDA's built-in vectorized stores as a
 *      coalescing optimization.  [More...](\ref cub::BlockStoreAlgorithm)
 *   -# <b>cub::BLOCK_STORE_TRANSPOSE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
 *      is locally transposed into a [<em>striped arrangement</em>](index.html#sec5sec3) which is
 *      then written to memory.  [More...](\ref cub::BlockStoreAlgorithm)
 *   -# <b>cub::BLOCK_STORE_WARP_TRANSPOSE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
 *      is locally transposed into a [<em>warp-striped arrangement</em>](index.html#sec5sec3) which is
 *      then written to memory.  [More...](\ref cub::BlockStoreAlgorithm)
 *   -# <b>cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
 *      is locally transposed into a [<em>warp-striped arrangement</em>](index.html#sec5sec3) which is
 *      then written to memory, provisioning only one warp's worth of shared memory that is
 *      time-sliced among warps.  [More...](\ref cub::BlockStoreAlgorithm)
 * - \rowmajor
 *
 * \par A Simple Example
 * \blockcollective{BlockStore}
 * \par
 * The code snippet below illustrates the storing of a "blocked" arrangement
 * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
 * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
 * meaning items are locally reordered among threads so that memory references will be
 * efficiently coalesced using a warp-striped access pattern.
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
 *
 * __global__ void ExampleKernel(int *d_data, ...)
 * {
 *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
 *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
 *
 *     // Allocate shared memory for BlockStore
 *     __shared__ typename BlockStore::TempStorage temp_storage;
 *
 *     // Obtain a segment of consecutive items that are blocked across threads
 *     int thread_data[4];
 *     ...
 *
 *     // Store items to linear memory
 *     BlockStore(temp_storage).Store(d_data, thread_data);
 * }
 * \endcode
 * \par
 * Suppose the set of \p thread_data across the block of threads is
 * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
 * The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
 *
 */
template <
    typename                T,
    int                     BLOCK_DIM_X,
    int                     ITEMS_PER_THREAD,
    BlockStoreAlgorithm     ALGORITHM           = BLOCK_STORE_DIRECT,
    int                     BLOCK_DIM_Y         = 1,
    int                     BLOCK_DIM_Z         = 1,
    int                     PTX_ARCH            = CUB_PTX_ARCH>
class BlockStore
{
private:
    /******************************************************************************
     * Constants and type definitions
     ******************************************************************************/

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };


    /******************************************************************************
     * Algorithmic variants
     ******************************************************************************/

    /// Store helper
    template <BlockStoreAlgorithm _POLICY, int DUMMY>
    struct StoreInternal;


    /**
     * BLOCK_STORE_DIRECT specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &/*temp_storage*/,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
        {
            StoreDirectBlocked(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
            int                 valid_items)                ///< [in] Number of valid items to write
        {
            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
        }
    };


    /**
     * BLOCK_STORE_VECTORIZE specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_VECTORIZE, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &/*temp_storage*/,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization)
        __device__ __forceinline__ void Store(
            T                   *block_ptr,                 ///< [in] The thread block's base output pointer for storing to
            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
        {
            StoreDirectBlockedVectorized(linear_tid, block_ptr, items);
        }

        /// Store items into a linear segment of memory, specialized for opaque output iterators (skips vectorization)
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
        {
            StoreDirectBlocked(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
            int                 valid_items)                ///< [in] Number of valid items to write
        {
            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
        }
    };


    /**
     * BLOCK_STORE_TRANSPOSE specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_TRANSPOSE, DUMMY>
    {
        // BlockExchange utility type for keys
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {
            /// Temporary storage for partially-full block guard
            volatile int valid_items;
        };

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
        {
            BlockExchange(temp_storage).BlockedToStriped(items);
            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
            int                 valid_items)                ///< [in] Number of valid items to write
        {
            BlockExchange(temp_storage).BlockedToStriped(items);
            if (linear_tid == 0)
                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
            CTA_SYNC();
            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, temp_storage.valid_items);
        }
    };


    /**
     * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE, DUMMY>
    {
        enum
        {
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        // BlockExchange utility type for keys
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {
            /// Temporary storage for partially-full block guard
            volatile int valid_items;
        };

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT   block_itr,                    ///< [in] The thread block's base output iterator for storing to
            T                 (&items)[ITEMS_PER_THREAD])   ///< [in] Data to store
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            StoreDirectWarpStriped(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT   block_itr,                    ///< [in] The thread block's base output iterator for storing to
            T                 (&items)[ITEMS_PER_THREAD],   ///< [in] Data to store
            int               valid_items)                  ///< [in] Number of valid items to write
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            if (linear_tid == 0)
                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
            CTA_SYNC();
            StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
        }
    };


    /**
     * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY>
    {
        enum
        {
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        // BlockExchange utility type for keys
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {
            /// Temporary storage for partially-full block guard
            volatile int valid_items;
        };

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            StoreDirectWarpStriped(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
            int                 valid_items)                ///< [in] Number of valid items to write
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            if (linear_tid == 0)
                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
            CTA_SYNC();
            StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
        }
    };

    /******************************************************************************
     * Type definitions
     ******************************************************************************/

    /// Internal store implementation to use
    typedef StoreInternal<ALGORITHM, 0> InternalStore;


    /// Shared memory storage layout type
    typedef typename InternalStore::TempStorage _TempStorage;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Thread reference to shared storage
    _TempStorage &temp_storage;

    /// Linear thread-id
    int linear_tid;

public:


    /// \smemstorage{BlockStore}
    struct TempStorage : Uninitialized<_TempStorage> {};


    /******************************************************************//**
     * \name Collective constructors
     *********************************************************************/
    //@{

    /**
     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
     */
    __device__ __forceinline__ BlockStore()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    /**
     * \brief Collective constructor using the specified memory allocation as temporary storage.
     */
    __device__ __forceinline__ BlockStore(
        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    //@}  end member group
    /******************************************************************//**
     * \name Data movement
     *********************************************************************/
    //@{


    /**
     * \brief Store items into a linear segment of memory.
     *
     * \par
     * - \blocked
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the storing of a "blocked" arrangement
     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
     * meaning items are locally reordered among threads so that memory references will be
     * efficiently coalesced using a warp-striped access pattern.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, ...)
     * {
     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
     *
     *     // Allocate shared memory for BlockStore
     *     __shared__ typename BlockStore::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
     *
     *     // Store items to linear memory
     *     BlockStore(temp_storage).Store(d_data, thread_data);
     * }
     * \endcode
     * \par
     * Suppose the set of \p thread_data across the block of threads is
     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
     * The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
     *
     */
    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(
        OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
        T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
    {
        InternalStore(temp_storage, linear_tid).Store(block_itr, items);
    }

    /**
     * \brief Store items into a linear segment of memory, guarded by range.
     *
     * \par
     * - \blocked
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the guarded storing of a "blocked" arrangement
     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
     * meaning items are locally reordered among threads so that memory references will be
     * efficiently coalesced using a warp-striped access pattern.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
     * {
     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
     *
     *     // Allocate shared memory for BlockStore
     *     __shared__ typename BlockStore::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
     *
     *     // Store items to linear memory
     *     BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
     * }
     * \endcode
     * \par
     * Suppose the set of \p thread_data across the block of threads is
     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt> and \p valid_items is \p 5.
     * The output \p d_data will be <tt>0, 1, 2, 3, 4, ?, ?, ?, ...</tt>, with
     * only the first two threads being unmasked to store portions of valid data.
     *
     */
    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(
        OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
        T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
        int                 valid_items)                ///< [in] Number of valid items to write
    {
        InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
    }
};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
