1 /******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
29 /**
30 * \file
31 * Operations for writing linear segments of data from the CUDA thread block
32 */
33
34 #pragma once
35
36 #include <iterator>
37
38 #include "block_exchange.cuh"
39 #include "../util_ptx.cuh"
40 #include "../util_macro.cuh"
41 #include "../util_type.cuh"
42 #include "../util_namespace.cuh"
43
44 /// Optional outer namespace(s)
45 CUB_NS_PREFIX
46
47 /// CUB namespace
48 namespace cub {
49
50 /**
51 * \addtogroup UtilIo
52 * @{
53 */
54
55
56 /******************************************************************//**
57 * \name Blocked arrangement I/O (direct)
58 *********************************************************************/
59 //@{
60
61 /**
62 * \brief Store a blocked arrangement of items across a thread block into a linear segment of items.
63 *
64 * \blocked
65 *
66 * \tparam T <b>[inferred]</b> The data type to store.
67 * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
68 * \tparam OutputIteratorT <b>[inferred]</b> The random-access iterator type for output \iterator.
69 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectBlocked(
    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
    // In a blocked arrangement each thread owns ITEMS_PER_THREAD consecutive
    // output slots; locate this thread's base position within the segment
    OutputIteratorT dst = block_itr + (linear_tid * ITEMS_PER_THREAD);

    // Write the thread's items in blocked (consecutive) order
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        dst[i] = items[i];
    }
}
88
89
90 /**
91 * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range
92 *
93 * \blocked
94 *
95 * \tparam T <b>[inferred]</b> The data type to store.
96 * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
97 * \tparam OutputIteratorT <b>[inferred]</b> The random-access iterator type for output \iterator.
98 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectBlocked(
    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
    int                 valid_items)                ///< [in] Number of valid items to write
{
    // This thread's starting offset within the block's output segment
    int base_offset = linear_tid * ITEMS_PER_THREAD;

    // Write only those items that fall inside the valid range
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        if (base_offset + i < valid_items)
        {
            block_itr[base_offset + i] = items[i];
        }
    }
}
121
122
123 /**
124 * \brief Store a blocked arrangement of items across a thread block into a linear segment of items.
125 *
126 * \blocked
127 *
 * The output offset (\p block_ptr) must be quad-item aligned,
 * which is the default alignment of allocations returned by \p cudaMalloc()
130 *
131 * \par
132 * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT:
133 * - \p ITEMS_PER_THREAD is odd
134 * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
135 *
136 * \tparam T <b>[inferred]</b> The data type to store.
137 * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
138 *
139 */
template <
    typename        T,
    int             ITEMS_PER_THREAD>
__device__ __forceinline__ void StoreDirectBlockedVectorized(
    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
    T               *block_ptr,                 ///< [in] The thread block's base output pointer for storing to
    T               (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
    enum
    {
        // Maximum CUDA vector size is 4 elements
        MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),

        // Vector size must be a power of two and an even divisor of the items per thread
        // (otherwise fall back to scalar stores with VEC_SIZE = 1)
        VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
            MAX_VEC_SIZE :
            1,

        // Number of vector-sized stores each thread issues
        VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
    };

    // Vector type used to widen the stores
    typedef typename CubVector<T, VEC_SIZE>::Type Vector;

    // Alias global pointer as a pointer-to-vector
    Vector *block_ptr_vectors = reinterpret_cast<Vector*>(const_cast<T*>(block_ptr));

    // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
    Vector raw_vector[VECTORS_PER_THREAD];
    T *raw_items = reinterpret_cast<T*>(raw_vector);

    // Copy the caller's items into the vector-aliased staging array
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        raw_items[ITEM] = items[ITEM];
    }

    // Direct-store using vector types (reuses the blocked store with Vector as the element type)
    StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector);
}
181
182
183
184 //@} end member group
185 /******************************************************************//**
186 * \name Striped arrangement I/O (direct)
187 *********************************************************************/
188 //@{
189
190
191 /**
192 * \brief Store a striped arrangement of data across the thread block into a linear segment of items.
193 *
194 * \striped
195 *
196 * \tparam BLOCK_THREADS The thread block size in threads
197 * \tparam T <b>[inferred]</b> The data type to store.
198 * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
199 * \tparam OutputIteratorT <b>[inferred]</b> The random-access iterator type for output \iterator.
200 */
template <
    int                 BLOCK_THREADS,
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectStriped(
    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
    // In a striped arrangement, this thread's item i lands at
    // offset (i * BLOCK_THREADS) + linear_tid within the segment
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        block_itr[(i * BLOCK_THREADS) + linear_tid] = items[i];
    }
}
220
221
222 /**
223 * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range
224 *
225 * \striped
226 *
227 * \tparam BLOCK_THREADS The thread block size in threads
228 * \tparam T <b>[inferred]</b> The data type to store.
229 * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
230 * \tparam OutputIteratorT <b>[inferred]</b> The random-access iterator type for output \iterator.
231 */
template <
    int                 BLOCK_THREADS,
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectStriped(
    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
    int                 valid_items)                ///< [in] Number of valid items to write
{
    // Guarded striped store: item i of this thread maps to
    // offset (i * BLOCK_THREADS) + linear_tid, written only while in range
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        int offset = (i * BLOCK_THREADS) + linear_tid;
        if (offset < valid_items)
        {
            block_itr[offset] = items[i];
        }
    }
}
255
256
257
258 //@} end member group
259 /******************************************************************//**
260 * \name Warp-striped arrangement I/O (direct)
261 *********************************************************************/
262 //@{
263
264
265 /**
266 * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items.
267 *
268 * \warpstriped
269 *
270 * \par Usage Considerations
271 * The number of threads in the thread block must be a multiple of the architecture's warp size.
272 *
273 * \tparam T <b>[inferred]</b> The data type to store.
274 * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
275 * \tparam OutputIteratorT <b>[inferred]</b> The random-access iterator type for output \iterator.
276 */
template <
    typename        T,
    int             ITEMS_PER_THREAD,
    typename        OutputIteratorT>
__device__ __forceinline__ void StoreDirectWarpStriped(
    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
    OutputIteratorT block_itr,                  ///< [in] The thread block's base output iterator for storing to
    T               (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
    // Lane within warp, warp id, and the warp's base offset within the segment
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    OutputIteratorT thread_itr = block_itr + warp_offset + tid;

    // Store directly in warp-striped order (item i at stride i * warp-size from the lane's base)
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
    }
}
299
300
301 /**
302 * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range
303 *
304 * \warpstriped
305 *
306 * \par Usage Considerations
307 * The number of threads in the thread block must be a multiple of the architecture's warp size.
308 *
309 * \tparam T <b>[inferred]</b> The data type to store.
310 * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
311 * \tparam OutputIteratorT <b>[inferred]</b> The random-access iterator type for output \iterator.
312 */
template <
    typename        T,
    int             ITEMS_PER_THREAD,
    typename        OutputIteratorT>
__device__ __forceinline__ void StoreDirectWarpStriped(
    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
    OutputIteratorT block_itr,                  ///< [in] The thread block's base output iterator for storing to
    T               (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
    int             valid_items)                ///< [in] Number of valid items to write
{
    // Decompose the linear thread id into lane and warp, and locate the
    // warp's base offset within the block's output segment
    int lane      = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int warp      = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_base = warp * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    // Guarded store in warp-striped order
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        int offset = warp_base + lane + (i * CUB_PTX_WARP_THREADS);
        if (offset < valid_items)
        {
            block_itr[offset] = items[i];
        }
    }
}
339
340
341 //@} end member group
342
343
344 /** @} */ // end group UtilIo
345
346
347 //-----------------------------------------------------------------------------
348 // Generic BlockStore abstraction
349 //-----------------------------------------------------------------------------
350
351 /**
352 * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory.
353 */
enum BlockStoreAlgorithm
{
    /**
     * \par Overview
     *
     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
     * directly to memory.
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) decreases as the
     *   access stride between threads increases (i.e., the number of items per thread).
     */
    BLOCK_STORE_DIRECT,

    /**
     * \par Overview
     *
     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written directly
     * to memory using CUDA's built-in vectorized stores as a coalescing optimization.
     * For example, <tt>st.global.v4.s32</tt> instructions will be generated
     * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) remains high until the
     *   access stride between threads (i.e., the number of items per thread) exceeds the
     *   maximum vector store width (typically 4 items or 64B, whichever is lower).
     * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT:
     *   - \p ITEMS_PER_THREAD is odd
     *   - The \p OutputIteratorT is not a simple pointer type
     *   - The block output offset is not quadword-aligned
     *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
     */
    BLOCK_STORE_VECTORIZE,

    /**
     * \par Overview
     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
     * transposed and then efficiently written to memory as a [<em>striped arrangement</em>](index.html#sec5sec3).
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) remains high regardless
     *   of items written per thread.
     * - The local reordering incurs slightly longer latencies and throughput than the
     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
     */
    BLOCK_STORE_TRANSPOSE,

    /**
     * \par Overview
     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
     * transposed and then efficiently written to memory as a
     * [<em>warp-striped arrangement</em>](index.html#sec5sec3).
     *
     * \par Usage Considerations
     * - BLOCK_THREADS must be a multiple of WARP_THREADS
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) remains high regardless
     *   of items written per thread.
     * - The local reordering incurs slightly longer latencies and throughput than the
     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
     */
    BLOCK_STORE_WARP_TRANSPOSE,

    /**
     * \par Overview
     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
     * transposed and then efficiently written to memory as a
     * [<em>warp-striped arrangement</em>](index.html#sec5sec3).
     * To reduce the shared memory requirement, only one warp's worth of shared
     * memory is provisioned and is subsequently time-sliced among warps.
     *
     * \par Usage Considerations
     * - BLOCK_THREADS must be a multiple of WARP_THREADS
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) remains high regardless
     *   of items written per thread.
     * - Provisions less shared memory temporary storage, but incurs larger
     *   latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative.
     */
    BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,

};
438
439
440 /**
441 * \brief The BlockStore class provides [<em>collective</em>](index.html#sec0) data movement methods for writing a [<em>blocked arrangement</em>](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory. ![](block_store_logo.png)
442 * \ingroup BlockModule
443 * \ingroup UtilIo
444 *
445 * \tparam T The type of data to be written.
446 * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
447 * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread.
448 * \tparam ALGORITHM <b>[optional]</b> cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT.
449 * \tparam WARP_TIME_SLICING <b>[optional]</b> Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false)
450 * \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
451 * \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
452 * \tparam PTX_ARCH <b>[optional]</b> \ptxversion
453 *
454 * \par Overview
455 * - The BlockStore class provides a single data movement abstraction that can be specialized
456 * to implement different cub::BlockStoreAlgorithm strategies. This facilitates different
457 * performance policies for different architectures, data types, granularity sizes, etc.
458 * - BlockStore can be optionally specialized by different data movement strategies:
459 * -# <b>cub::BLOCK_STORE_DIRECT</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
460 * directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
461 * -# <b>cub::BLOCK_STORE_VECTORIZE</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3)
462 * of data is written directly to memory using CUDA's built-in vectorized stores as a
463 * coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm)
464 * -# <b>cub::BLOCK_STORE_TRANSPOSE</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3)
465 * is locally transposed into a [<em>striped arrangement</em>](index.html#sec5sec3) which is
466 * then written to memory. [More...](\ref cub::BlockStoreAlgorithm)
467 * -# <b>cub::BLOCK_STORE_WARP_TRANSPOSE</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3)
468 * is locally transposed into a [<em>warp-striped arrangement</em>](index.html#sec5sec3) which is
469 * then written to memory. [More...](\ref cub::BlockStoreAlgorithm)
470 * - \rowmajor
471 *
472 * \par A Simple Example
473 * \blockcollective{BlockStore}
474 * \par
475 * The code snippet below illustrates the storing of a "blocked" arrangement
476 * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
477 * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
478 * meaning items are locally reordered among threads so that memory references will be
479 * efficiently coalesced using a warp-striped access pattern.
480 * \par
481 * \code
482 * #include <cub/cub.cuh> // or equivalently <cub/block/block_store.cuh>
483 *
484 * __global__ void ExampleKernel(int *d_data, ...)
485 * {
486 * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
487 * typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
488 *
489 * // Allocate shared memory for BlockStore
490 * __shared__ typename BlockStore::TempStorage temp_storage;
491 *
492 * // Obtain a segment of consecutive items that are blocked across threads
493 * int thread_data[4];
494 * ...
495 *
 *     // Store items to linear memory
 *     BlockStore(temp_storage).Store(d_data, thread_data);
499 *
500 * \endcode
501 * \par
502 * Suppose the set of \p thread_data across the block of threads is
503 * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
504 * The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
505 *
506 */
507 template <
508 typename T,
509 int BLOCK_DIM_X,
510 int ITEMS_PER_THREAD,
511 BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT,
512 int BLOCK_DIM_Y = 1,
513 int BLOCK_DIM_Z = 1,
514 int PTX_ARCH = CUB_PTX_ARCH>
515 class BlockStore
516 {
517 private:
518 /******************************************************************************
519 * Constants and typed definitions
520 ******************************************************************************/
521
    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };
528
529
530 /******************************************************************************
531 * Algorithmic variants
532 ******************************************************************************/
533
    /// Store helper, specialized per BlockStoreAlgorithm policy (the extra DUMMY
    /// parameter permits explicit specialization within the enclosing class)
    template <BlockStoreAlgorithm _POLICY, int DUMMY>
    struct StoreInternal;
537
538
539 /**
540 * BLOCK_STORE_DIRECT specialization of store helper
541 */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
    {
        /// Shared memory storage layout type (direct stores need no shared memory)
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &/*temp_storage*/,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT block_itr,                  ///< [in] The thread block's base output iterator for storing to
            T               (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
        {
            StoreDirectBlocked(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT block_itr,                  ///< [in] The thread block's base output iterator for storing to
            T               (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
            int             valid_items)                ///< [in] Number of valid items to write
        {
            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
        }
    };
578
579
580 /**
581 * BLOCK_STORE_VECTORIZE specialization of store helper
582 */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_VECTORIZE, DUMMY>
    {
        /// Shared memory storage layout type (vectorized stores need no shared memory)
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &/*temp_storage*/,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization)
        __device__ __forceinline__ void Store(
            T               *block_ptr,                 ///< [in] The thread block's base output pointer for storing to
            T               (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
        {
            StoreDirectBlockedVectorized(linear_tid, block_ptr, items);
        }

        /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization)
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT block_itr,                  ///< [in] The thread block's base output iterator for storing to
            T               (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
        {
            StoreDirectBlocked(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        /// (guarded stores cannot be vectorized, so fall back to scalar stores)
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT block_itr,                  ///< [in] The thread block's base output iterator for storing to
            T               (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
            int             valid_items)                ///< [in] Number of valid items to write
        {
            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
        }
    };
627
628
629 /**
630 * BLOCK_STORE_TRANSPOSE specialization of store helper
631 */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_TRANSPOSE, DUMMY>
    {
        // BlockExchange utility type for keys
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {
            /// Temporary storage for partially-full block guard
            volatile int valid_items;
        };

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT block_itr,                  ///< [in] The thread block's base output iterator for storing to
            T               (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
        {
            // Reorder blocked -> striped in shared memory, then write coalesced
            BlockExchange(temp_storage).BlockedToStriped(items);
            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT block_itr,                  ///< [in] The thread block's base output iterator for storing to
            T               (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
            int             valid_items)                ///< [in] Number of valid items to write
        {
            BlockExchange(temp_storage).BlockedToStriped(items);
            if (linear_tid == 0)
                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
            CTA_SYNC();
            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, temp_storage.valid_items);
        }
    };
687
688
689 /**
690 * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper
691 */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE, DUMMY>
    {
        enum
        {
            /// Number of threads per warp for the targeted PTX architecture
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        // BlockExchange utility type for keys
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {
            /// Temporary storage for partially-full block guard
            volatile int valid_items;
        };

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT block_itr,                  ///< [in] The thread block's base output iterator for storing to
            T               (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
        {
            // Reorder blocked -> warp-striped in shared memory, then write coalesced
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            StoreDirectWarpStriped(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT block_itr,                  ///< [in] The thread block's base output iterator for storing to
            T               (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
            int             valid_items)                ///< [in] Number of valid items to write
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            if (linear_tid == 0)
                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
            CTA_SYNC();
            StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
        }
    };
755
756
757 /**
758 * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper
759 */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY>
    {
        enum
        {
            /// Number of threads per warp for the targeted PTX architecture
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        // BlockExchange utility type for keys (time-sliced variant: one warp's worth of
        // shared memory, indicated by the 'true' WARP_TIME_SLICING template argument)
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {
            /// Temporary storage for partially-full block guard
            volatile int valid_items;
        };

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT block_itr,                  ///< [in] The thread block's base output iterator for storing to
            T               (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
        {
            // Reorder blocked -> warp-striped in shared memory, then write coalesced
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            StoreDirectWarpStriped(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT block_itr,                  ///< [in] The thread block's base output iterator for storing to
            T               (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
            int             valid_items)                ///< [in] Number of valid items to write
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            if (linear_tid == 0)
                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
            CTA_SYNC();
            StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
        }
    };
823
824 /******************************************************************************
825 * Type definitions
826 ******************************************************************************/
827
    /// Internal store implementation to use (selected by the ALGORITHM policy parameter)
    typedef StoreInternal<ALGORITHM, 0> InternalStore;


    /// Shared memory storage layout type (that of the selected internal implementation)
    typedef typename InternalStore::TempStorage _TempStorage;
834
835
836 /******************************************************************************
837 * Utility methods
838 ******************************************************************************/
839
    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        // Statically-allocated shared memory, used when the caller does not supply
        // a TempStorage instance.  Returning a reference to __shared__ storage is
        // intentional here: its lifetime spans the whole thread block.
        __shared__ _TempStorage private_storage;
        return private_storage;
    }
846
847
848 /******************************************************************************
849 * Thread fields
850 ******************************************************************************/
851
    /// Thread reference to shared storage
    _TempStorage &temp_storage;

    /// Linear thread-id (row-major rank within the thread block, as computed by RowMajorTid)
    int linear_tid;

public:


    /// \smemstorage{BlockStore}
    struct TempStorage : Uninitialized<_TempStorage> {};
863
864
865 /******************************************************************//**
866 * \name Collective constructors
867 *********************************************************************/
868 //@{
869
    /**
     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
     */
    __device__ __forceinline__ BlockStore()
    :
        temp_storage(PrivateStorage()),     // block-private __shared__ allocation
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}
878
879
    /**
     * \brief Collective constructor using the specified memory allocation as temporary storage.
     */
    __device__ __forceinline__ BlockStore(
        TempStorage &temp_storage)          ///< [in] Reference to memory allocation having layout type TempStorage
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}
889
890
891 //@} end member group
892 /******************************************************************//**
893 * \name Data movement
894 *********************************************************************/
895 //@{
896
897
    /**
     * \brief Store items into a linear segment of memory.
     *
     * \par
     * - \blocked
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the storing of a "blocked" arrangement
     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
     * meaning items are locally reordered among threads so that memory references will be
     * efficiently coalesced using a warp-striped access pattern.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, ...)
     * {
     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
     *
     *     // Allocate shared memory for BlockStore
     *     __shared__ typename BlockStore::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
     *
     *     // Store items to linear memory
     *     BlockStore(temp_storage).Store(d_data, thread_data);
     *
     * \endcode
     * \par
     * Suppose the set of \p thread_data across the block of threads is
     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
     * The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
     *
     */
    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(
        OutputIteratorT block_itr,                  ///< [in] The thread block's base output iterator for storing to
        T (&items)[ITEMS_PER_THREAD])               ///< [in] Data to store
    {
        // Dispatch to the ALGORITHM-selected implementation
        InternalStore(temp_storage, linear_tid).Store(block_itr, items);
    }
945
    /**
     * \brief Store items into a linear segment of memory, guarded by range.
     *
     * \par
     * - \blocked
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the guarded storing of a "blocked" arrangement
     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
     * meaning items are locally reordered among threads so that memory references will be
     * efficiently coalesced using a warp-striped access pattern.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
     * {
     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
     *
     *     // Allocate shared memory for BlockStore
     *     __shared__ typename BlockStore::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
     *
     *     // Store items to linear memory
     *     BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
     *
     * \endcode
     * \par
     * Suppose the set of \p thread_data across the block of threads is
     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt> and \p valid_items is \p 5.
     * The output \p d_data will be <tt>0, 1, 2, 3, 4, ?, ?, ?, ...</tt>, with
     * only the first two threads being unmasked to store portions of valid data.
     *
     */
    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(
        OutputIteratorT block_itr,                  ///< [in] The thread block's base output iterator for storing to
        T (&items)[ITEMS_PER_THREAD],               ///< [in] Data to store
        int valid_items)                            ///< [in] Number of valid items to write
    {
        // Dispatch to the ALGORITHM-selected implementation (range-guarded overload)
        InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
    }
995 };
996
997
998 } // CUB namespace
999 CUB_NS_POSTFIX // Optional outer namespace(s)
1000
1001