1 /******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
29 /**
30 * \file
31 * Operations for reading linear tiles of data into the CUDA thread block.
32 */
33
34 #pragma once
35
36 #include <iterator>
37
38 #include "block_exchange.cuh"
39 #include "../iterator/cache_modified_input_iterator.cuh"
40 #include "../util_ptx.cuh"
41 #include "../util_macro.cuh"
42 #include "../util_type.cuh"
43 #include "../util_namespace.cuh"
44
45 /// Optional outer namespace(s)
46 CUB_NS_PREFIX
47
48 /// CUB namespace
49 namespace cub {
50
51 /**
52 * \addtogroup UtilIo
53 * @{
54 */
55
56
57 /******************************************************************//**
58 * \name Blocked arrangement I/O (direct)
59 *********************************************************************/
60 //@{
61
62
63 /**
64 * \brief Load a linear segment of items into a blocked arrangement across the thread block.
65 *
66 * \blocked
67 *
68 * \tparam T <b>[inferred]</b> The data type to load.
69 * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
70 * \tparam InputIteratorT <b>[inferred]</b> The random-access iterator type for input \iterator.
71 */
72 template <
73 typename InputT,
74 int ITEMS_PER_THREAD,
75 typename InputIteratorT>
76 __device__ __forceinline__ void LoadDirectBlocked(
77 int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
78 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
79 InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
80 {
81 InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);
82
83 // Load directly in thread-blocked order
84 #pragma unroll
85 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
86 {
87 items[ITEM] = thread_itr[ITEM];
88 }
89 }
90
91
92 /**
93 * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.
94 *
95 * \blocked
96 *
97 * \tparam T <b>[inferred]</b> The data type to load.
98 * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
99 * \tparam InputIteratorT <b>[inferred]</b> The random-access iterator type for input \iterator.
100 */
101 template <
102 typename InputT,
103 int ITEMS_PER_THREAD,
104 typename InputIteratorT>
105 __device__ __forceinline__ void LoadDirectBlocked(
106 int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
107 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
108 InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
109 int valid_items) ///< [in] Number of valid items to load
110 {
111 InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);
112
113 #pragma unroll
114 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
115 {
116 if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items)
117 {
118 items[ITEM] = thread_itr[ITEM];
119 }
120 }
121 }
122
123
124 /**
125 * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
126 *
127 * \blocked
128 *
129 * \tparam T <b>[inferred]</b> The data type to load.
130 * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
131 * \tparam InputIteratorT <b>[inferred]</b> The random-access iterator type for input \iterator.
132 */
133 template <
134 typename InputT,
135 typename DefaultT,
136 int ITEMS_PER_THREAD,
137 typename InputIteratorT>
138 __device__ __forceinline__ void LoadDirectBlocked(
139 int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
140 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
141 InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
142 int valid_items, ///< [in] Number of valid items to load
143 DefaultT oob_default) ///< [in] Default value to assign out-of-bound items
144 {
145 #pragma unroll
146 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
147 items[ITEM] = oob_default;
148
149 LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
150 }
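//
// Illustrative sketch (an assumption for exposition, not part of the CUB API surface):
// a guarded blocked load of one partial tile, padding out-of-bound items with a default
// value. The kernel name, tile sizing, and padding value below are hypothetical.
//
//     __global__ void ExampleBlockedLoadKernel(const int *d_in, int num_items)
//     {
//         enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 4 };
//
//         int thread_items[ITEMS_PER_THREAD];
//         int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);
//
//         // Each thread loads ITEMS_PER_THREAD consecutive items; out-of-bound items become -1
//         cub::LoadDirectBlocked(
//             threadIdx.x, d_in + block_offset, thread_items,
//             num_items - block_offset, -1);
//     }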
151
152
153 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
154
155 /**
156 * Internal implementation for load vectorization
157 */
158 template <
159 CacheLoadModifier MODIFIER,
160 typename T,
161 int ITEMS_PER_THREAD>
162 __device__ __forceinline__ void InternalLoadDirectBlockedVectorized(
163 int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
164 T *block_ptr, ///< [in] Input pointer for loading from
165 T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
166 {
167 // Biggest memory access word that T is a whole multiple of
168 typedef typename UnitWord<T>::DeviceWord DeviceWord;
169
170 enum
171 {
172 TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord),
173
174 VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ?
175 4 :
176 (TOTAL_WORDS % 2 == 0) ?
177 2 :
178 1,
179
180 VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE,
181 };
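// Worked example of the constants above (an illustrative assumption, not an additional
// code path): for T = int and ITEMS_PER_THREAD = 4, DeviceWord is int, so TOTAL_WORDS = 4,
// VECTOR_SIZE = 4, and VECTORS_PER_THREAD = 1, i.e., each thread issues one int4-sized load.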
182
183 // Vector type
184 typedef typename CubVector<DeviceWord, VECTOR_SIZE>::Type Vector;
185
186 // Vector items
187 Vector vec_items[VECTORS_PER_THREAD];
188
189 // Aliased input ptr
190 Vector* vec_ptr = reinterpret_cast<Vector*>(block_ptr) + (linear_tid * VECTORS_PER_THREAD);
191
192 // Load directly in thread-blocked order
193 #pragma unroll
194 for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++)
195 {
196 vec_items[ITEM] = ThreadLoad<MODIFIER>(vec_ptr + ITEM);
197 }
198
199 // Copy
200 #pragma unroll
201 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
202 {
203 items[ITEM] = *(reinterpret_cast<T*>(vec_items) + ITEM);
204 }
205 }
206
207 #endif // DOXYGEN_SHOULD_SKIP_THIS
208
209
210 /**
211 * \brief Load a linear segment of items into a blocked arrangement across the thread block.
212 *
213 * \blocked
214 *
215 * The input pointer \p block_ptr must be quad-item aligned.
216 *
217 * The following conditions will prevent vectorization, causing loading to fall back to cub::BLOCK_LOAD_DIRECT:
218 * - \p ITEMS_PER_THREAD is odd
219 * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
220 *
221 * \tparam T <b>[inferred]</b> The data type to load.
222 * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
223 */
224 template <
225 typename T,
226 int ITEMS_PER_THREAD>
227 __device__ __forceinline__ void LoadDirectBlockedVectorized(
228 int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
229 T *block_ptr, ///< [in] Input pointer for loading from
230 T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
231 {
232 InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
233 }
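//
// Illustrative sketch (an assumption for exposition, not part of the CUB API surface):
// a vectorized blocked load from a pointer assumed to be quad-item (16-byte) aligned and
// to reference at least blockDim.x * ITEMS_PER_THREAD items. The kernel name is hypothetical.
//
//     __global__ void ExampleVectorizedLoadKernel(int *d_in)
//     {
//         enum { ITEMS_PER_THREAD = 4 };
//
//         int thread_items[ITEMS_PER_THREAD];
//
//         // With T = int and ITEMS_PER_THREAD = 4, this can compile to a single ld.global.v4
//         cub::LoadDirectBlockedVectorized(threadIdx.x, d_in, thread_items);
//     }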
234
235
236 //@} end member group
237 /******************************************************************//**
238 * \name Striped arrangement I/O (direct)
239 *********************************************************************/
240 //@{
241
242
243 /**
244 * \brief Load a linear segment of items into a striped arrangement across the thread block.
245 *
246 * \striped
247 *
248 * \tparam BLOCK_THREADS The thread block size in threads
249 * \tparam T <b>[inferred]</b> The data type to load.
250 * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
251 * \tparam InputIteratorT <b>[inferred]</b> The random-access iterator type for input \iterator.
252 */
253 template <
254 int BLOCK_THREADS,
255 typename InputT,
256 int ITEMS_PER_THREAD,
257 typename InputIteratorT>
258 __device__ __forceinline__ void LoadDirectStriped(
259 int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
260 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
261 InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
262 {
263 InputIteratorT thread_itr = block_itr + linear_tid;
264
265 #pragma unroll
266 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
267 {
268 items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
269 }
270 }
271
272
273 /**
274 * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range
275 *
276 * \striped
277 *
278 * \tparam BLOCK_THREADS The thread block size in threads
279 * \tparam T <b>[inferred]</b> The data type to load.
280 * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
281 * \tparam InputIteratorT <b>[inferred]</b> The random-access iterator type for input \iterator.
282 */
283 template <
284 int BLOCK_THREADS,
285 typename InputT,
286 int ITEMS_PER_THREAD,
287 typename InputIteratorT>
288 __device__ __forceinline__ void LoadDirectStriped(
289 int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
290 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
291 InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
292 int valid_items) ///< [in] Number of valid items to load
293 {
294 InputIteratorT thread_itr = block_itr + linear_tid;
295
296 #pragma unroll
297 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
298 {
299 if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items)
300 {
301 items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
302 }
303 }
304 }
305
306
307 /**
308 * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
309 *
310 * \striped
311 *
312 * \tparam BLOCK_THREADS The thread block size in threads
313 * \tparam T <b>[inferred]</b> The data type to load.
314 * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
315 * \tparam InputIteratorT <b>[inferred]</b> The random-access iterator type for input \iterator.
316 */
317 template <
318 int BLOCK_THREADS,
319 typename InputT,
320 typename DefaultT,
321 int ITEMS_PER_THREAD,
322 typename InputIteratorT>
323 __device__ __forceinline__ void LoadDirectStriped(
324 int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
325 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
326 InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
327 int valid_items, ///< [in] Number of valid items to load
328 DefaultT oob_default) ///< [in] Default value to assign out-of-bound items
329 {
330 #pragma unroll
331 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
332 items[ITEM] = oob_default;
333
334 LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
335 }
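//
// Illustrative sketch (an assumption for exposition, not part of the CUB API surface):
// a guarded striped load in which thread i reads items i, i + BLOCK_THREADS,
// i + 2 * BLOCK_THREADS, ... of the tile. The kernel name and sizing below are hypothetical.
//
//     __global__ void ExampleStripedLoadKernel(const int *d_in, int num_items)
//     {
//         enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 4 };
//
//         int thread_items[ITEMS_PER_THREAD];
//         int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);
//
//         // Out-of-bound items are assigned 0
//         cub::LoadDirectStriped<BLOCK_THREADS>(
//             threadIdx.x, d_in + block_offset, thread_items,
//             num_items - block_offset, 0);
//     }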
336
337
338
339 //@} end member group
340 /******************************************************************//**
341 * \name Warp-striped arrangement I/O (direct)
342 *********************************************************************/
343 //@{
344
345
346 /**
347 * \brief Load a linear segment of items into a warp-striped arrangement across the thread block.
348 *
349 * \warpstriped
350 *
351 * \par Usage Considerations
352 * The number of threads in the thread block must be a multiple of the architecture's warp size.
353 *
354 * \tparam T <b>[inferred]</b> The data type to load.
355 * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
356 * \tparam InputIteratorT <b>[inferred]</b> The random-access iterator type for input \iterator.
357 */
358 template <
359 typename InputT,
360 int ITEMS_PER_THREAD,
361 typename InputIteratorT>
362 __device__ __forceinline__ void LoadDirectWarpStriped(
363 int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
364 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
365 InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
366 {
367 int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1);
368 int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
369 int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
370
371 InputIteratorT thread_itr = block_itr + warp_offset + tid;
372
373 // Load directly in warp-striped order
374 #pragma unroll
375 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
376 {
377 items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)];
378 }
379 }
380
381
382 /**
383 * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range
384 *
385 * \warpstriped
386 *
387 * \par Usage Considerations
388 * The number of threads in the thread block must be a multiple of the architecture's warp size.
389 *
390 * \tparam T <b>[inferred]</b> The data type to load.
391 * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
392 * \tparam InputIteratorT <b>[inferred]</b> The random-access iterator type for input \iterator.
393 */
394 template <
395 typename InputT,
396 int ITEMS_PER_THREAD,
397 typename InputIteratorT>
398 __device__ __forceinline__ void LoadDirectWarpStriped(
399 int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
400 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
401 InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
402 int valid_items) ///< [in] Number of valid items to load
403 {
404 int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1);
405 int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
406 int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
407
408 InputIteratorT thread_itr = block_itr + warp_offset + tid;
409
410 // Load directly in warp-striped order
411 #pragma unroll
412 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
413 {
414 if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
415 {
416 items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)];
417 }
418 }
419 }
420
421
422 /**
423 * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
424 *
425 * \warpstriped
426 *
427 * \par Usage Considerations
428 * The number of threads in the thread block must be a multiple of the architecture's warp size.
429 *
430 * \tparam T <b>[inferred]</b> The data type to load.
431 * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
432 * \tparam InputIteratorT <b>[inferred]</b> The random-access iterator type for input \iterator.
433 */
434 template <
435 typename InputT,
436 typename DefaultT,
437 int ITEMS_PER_THREAD,
438 typename InputIteratorT>
439 __device__ __forceinline__ void LoadDirectWarpStriped(
440 int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
441 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
442 InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
443 int valid_items, ///< [in] Number of valid items to load
444 DefaultT oob_default) ///< [in] Default value to assign out-of-bound items
445 {
446 // Load directly in warp-striped order
447 #pragma unroll
448 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
449 items[ITEM] = oob_default;
450
451 LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
452 }
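//
// Illustrative sketch (an assumption for exposition, not part of the CUB API surface):
// a guarded warp-striped load in which each warp reads a contiguous sub-segment of the
// tile, with lanes striding by the warp width. The block size must be a multiple of the
// warp size; the kernel name and sizing below are hypothetical.
//
//     __global__ void ExampleWarpStripedLoadKernel(const int *d_in, int num_items)
//     {
//         enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 4 };
//
//         int thread_items[ITEMS_PER_THREAD];
//         int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);
//
//         // Out-of-bound items are assigned 0
//         cub::LoadDirectWarpStriped(
//             threadIdx.x, d_in + block_offset, thread_items,
//             num_items - block_offset, 0);
//     }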
453
454
455
456 //@} end member group
457
458 /** @} */ // end group UtilIo
459
460
461
462 //-----------------------------------------------------------------------------
463 // Generic BlockLoad abstraction
464 //-----------------------------------------------------------------------------
465
470 /**
471 * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
472 */
473 enum BlockLoadAlgorithm
474 {
475 /**
476 * \par Overview
477 *
478 * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
479 * directly from memory.
480 *
481 * \par Performance Considerations
482 * - The utilization of memory transactions (coalescing) decreases as the
483 * access stride between threads increases (i.e., the number of items per thread).
484 */
485 BLOCK_LOAD_DIRECT,
486
487 /**
488 * \par Overview
489 *
490 * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
491 * from memory using CUDA's built-in vectorized loads as a coalescing optimization.
492 * For example, <tt>ld.global.v4.s32</tt> instructions will be generated
493 * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
494 *
495 * \par Performance Considerations
496 * - The utilization of memory transactions (coalescing) remains high until the
497 * access stride between threads (i.e., the number of items per thread) exceeds the
498 * maximum vector load width (typically 4 items or 64B, whichever is lower).
499 * - The following conditions will prevent vectorization, causing loading to fall back to cub::BLOCK_LOAD_DIRECT:
500 * - \p ITEMS_PER_THREAD is odd
501 * - The \p InputIteratorT is not a simple pointer type
502 * - The block input offset is not quadword-aligned
503 * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
504 */
505 BLOCK_LOAD_VECTORIZE,
506
507 /**
508 * \par Overview
509 *
510 * A [<em>striped arrangement</em>](index.html#sec5sec3) of data is read
511 * efficiently from memory and then locally transposed into a
512 * [<em>blocked arrangement</em>](index.html#sec5sec3).
513 *
514 * \par Performance Considerations
515 * - The utilization of memory transactions (coalescing) remains high regardless
516 * of items loaded per thread.
517 * - The local reordering incurs slightly higher latencies and lower throughput than the
518 * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
519 */
520 BLOCK_LOAD_TRANSPOSE,
521
522
523 /**
524 * \par Overview
525 *
526 * A [<em>warp-striped arrangement</em>](index.html#sec5sec3) of data is
527 * read efficiently from memory and then locally transposed into a
528 * [<em>blocked arrangement</em>](index.html#sec5sec3).
529 *
530 * \par Usage Considerations
531 * - BLOCK_THREADS must be a multiple of WARP_THREADS
532 *
533 * \par Performance Considerations
534 * - The utilization of memory transactions (coalescing) remains high regardless
535 * of items loaded per thread.
536 * - The local reordering incurs slightly larger latencies than the
537 * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
538 * - Provisions more shared storage, but incurs smaller latencies than the
539 * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative.
540 */
541 BLOCK_LOAD_WARP_TRANSPOSE,
542
543
544 /**
545 * \par Overview
546 *
547 * Like \p BLOCK_LOAD_WARP_TRANSPOSE, a [<em>warp-striped arrangement</em>](index.html#sec5sec3)
548 * of data is read directly from memory and then is locally transposed into a
549 * [<em>blocked arrangement</em>](index.html#sec5sec3). To reduce the shared memory
550 * requirement, only one warp's worth of shared memory is provisioned and is
551 * subsequently time-sliced among warps.
552 *
553 * \par Usage Considerations
554 * - BLOCK_THREADS must be a multiple of WARP_THREADS
555 *
556 * \par Performance Considerations
557 * - The utilization of memory transactions (coalescing) remains high regardless
558 * of items loaded per thread.
559 * - Provisions less shared memory temporary storage, but incurs larger
560 * latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative.
561 */
562 BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,
563 };
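//
// Illustrative sketch (an assumption for exposition): the algorithm is selected as a
// template argument when specializing cub::BlockLoad; the typedef names below are hypothetical.
//
//     // Direct blocked loads (no shared-memory exchange required)
//     typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_DIRECT>         DirectLoadT;
//
//     // Warp-striped loads followed by a local transpose into blocked order
//     typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> TransposeLoadT;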
564
565
566 /**
567 * \brief The BlockLoad class provides [<em>collective</em>](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [<em>blocked arrangement</em>](index.html#sec5sec3) across a CUDA thread block. ![](block_load_logo.png)
568 * \ingroup BlockModule
569 * \ingroup UtilIo
570 *
571 * \tparam InputT The data type to read into (which must be convertible from the input iterator's value type).
572 * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
573 * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread.
574 * \tparam ALGORITHM <b>[optional]</b> cub::BlockLoadAlgorithm tuning policy. default: cub::BLOCK_LOAD_DIRECT.
576 * \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
577 * \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
578 * \tparam PTX_ARCH <b>[optional]</b> \ptxversion
579 *
580 * \par Overview
581 * - The BlockLoad class provides a single data movement abstraction that can be specialized
582 * to implement different cub::BlockLoadAlgorithm strategies. This facilitates different
583 * performance policies for different architectures, data types, granularity sizes, etc.
584 * - BlockLoad can be optionally specialized by different data movement strategies:
585 * -# <b>cub::BLOCK_LOAD_DIRECT</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3)
586 * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm)
587 * -# <b>cub::BLOCK_LOAD_VECTORIZE</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3)
588 * of data is read directly from memory using CUDA's built-in vectorized loads as a
589 * coalescing optimization. [More...](\ref cub::BlockLoadAlgorithm)
590 * -# <b>cub::BLOCK_LOAD_TRANSPOSE</b>. A [<em>striped arrangement</em>](index.html#sec5sec3)
591 * of data is read directly from memory and is then locally transposed into a
592 * [<em>blocked arrangement</em>](index.html#sec5sec3). [More...](\ref cub::BlockLoadAlgorithm)
593 * -# <b>cub::BLOCK_LOAD_WARP_TRANSPOSE</b>. A [<em>warp-striped arrangement</em>](index.html#sec5sec3)
594 * of data is read directly from memory and is then locally transposed into a
595 * [<em>blocked arrangement</em>](index.html#sec5sec3). [More...](\ref cub::BlockLoadAlgorithm)
596 * -# <b>cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED</b>. A [<em>warp-striped arrangement</em>](index.html#sec5sec3)
597 * of data is read directly from memory and is then locally transposed into a
598 * [<em>blocked arrangement</em>](index.html#sec5sec3) one warp at a time. [More...](\ref cub::BlockLoadAlgorithm)
599 * - \rowmajor
600 *
601 * \par A Simple Example
602 * \blockcollective{BlockLoad}
603 * \par
604 * The code snippet below illustrates the loading of a linear
605 * segment of 512 integers into a "blocked" arrangement across 128 threads where each
606 * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
607 * meaning memory references are efficiently coalesced using a warp-striped access
608 * pattern (after which items are locally reordered among threads).
609 * \par
610 * \code
611 * #include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
612 *
613 * __global__ void ExampleKernel(int *d_data, ...)
614 * {
615 * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
616 * typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
617 *
618 * // Allocate shared memory for BlockLoad
619 * __shared__ typename BlockLoad::TempStorage temp_storage;
620 *
621 * // Load a segment of consecutive items that are blocked across threads
622 * int thread_data[4];
623 * BlockLoad(temp_storage).Load(d_data, thread_data);
624 *
625 * \endcode
626 * \par
627 * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>.
628 * The set of \p thread_data across the block of threads will be
629 * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
630 *
631 */
632 template <
633 typename InputT,
634 int BLOCK_DIM_X,
635 int ITEMS_PER_THREAD,
636 BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT,
637 int BLOCK_DIM_Y = 1,
638 int BLOCK_DIM_Z = 1,
639 int PTX_ARCH = CUB_PTX_ARCH>
640 class BlockLoad
641 {
642 private:
643
644 /******************************************************************************
645 * Constants and type definitions
646 ******************************************************************************/
647
648 /// Constants
649 enum
650 {
651 /// The thread block size in threads
652 BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
653 };
654
655
656 /******************************************************************************
657 * Algorithmic variants
658 ******************************************************************************/
659
660 /// Load helper
661 template <BlockLoadAlgorithm _POLICY, int DUMMY>
662 struct LoadInternal;
663
664
665 /**
666 * BLOCK_LOAD_DIRECT specialization of load helper
667 */
668 template <int DUMMY>
669 struct LoadInternal<BLOCK_LOAD_DIRECT, DUMMY>
670 {
671 /// Shared memory storage layout type
672 typedef NullType TempStorage;
673
674 /// Linear thread-id
675 int linear_tid;
676
677 /// Constructor
678 __device__ __forceinline__ LoadInternal(
679 TempStorage &/*temp_storage*/,
680 int linear_tid)
681 :
682 linear_tid(linear_tid)
683 {}
684
685 /// Load a linear segment of items from memory
686 template <typename InputIteratorT>
687 __device__ __forceinline__ void Load(
688 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
689 InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
690 {
691 LoadDirectBlocked(linear_tid, block_itr, items);
692 }
693
694 /// Load a linear segment of items from memory, guarded by range
695 template <typename InputIteratorT>
696 __device__ __forceinline__ void Load(
697 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
698 InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
699 int valid_items) ///< [in] Number of valid items to load
700 {
701 LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
702 }
703
704 /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
705 template <typename InputIteratorT, typename DefaultT>
706 __device__ __forceinline__ void Load(
707 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
708 InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
709 int valid_items, ///< [in] Number of valid items to load
710 DefaultT oob_default) ///< [in] Default value to assign out-of-bound items
711 {
712 LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
713 }
714
715 };
716
717
718 /**
719 * BLOCK_LOAD_VECTORIZE specialization of load helper
720 */
721 template <int DUMMY>
722 struct LoadInternal<BLOCK_LOAD_VECTORIZE, DUMMY>
723 {
724 /// Shared memory storage layout type
725 typedef NullType TempStorage;
726
727 /// Linear thread-id
728 int linear_tid;
729
730 /// Constructor
731 __device__ __forceinline__ LoadInternal(
732 TempStorage &/*temp_storage*/,
733 int linear_tid)
734 :
735 linear_tid(linear_tid)
736 {}
737
738 /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
739 template <typename InputIteratorT>
740 __device__ __forceinline__ void Load(
741 InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from
742 InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
743 {
744 InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
745 }
746
747 /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
748 template <typename InputIteratorT>
749 __device__ __forceinline__ void Load(
750 const InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from
751 InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
752 {
753 InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
754 }
755
756 /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
757 template <
758 CacheLoadModifier MODIFIER,
759 typename ValueType,
760 typename OffsetT>
761 __device__ __forceinline__ void Load(
762 CacheModifiedInputIterator<MODIFIER, ValueType, OffsetT> block_itr, ///< [in] The thread block's base input iterator for loading from
763 InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
764 {
765 InternalLoadDirectBlockedVectorized<MODIFIER>(linear_tid, block_itr.ptr, items);
766 }
767
768 /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization)
769 template <typename _InputIteratorT>
770 __device__ __forceinline__ void Load(
771 _InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
772 InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
773 {
774 LoadDirectBlocked(linear_tid, block_itr, items);
775 }
776
777 /// Load a linear segment of items from memory, guarded by range (skips vectorization)
778 template <typename InputIteratorT>
779 __device__ __forceinline__ void Load(
780 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
781 InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
782 int valid_items) ///< [in] Number of valid items to load
783 {
784 LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
785 }
786
787 /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization)
788 template <typename InputIteratorT, typename DefaultT>
789 __device__ __forceinline__ void Load(
790 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
791 InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
792 int valid_items, ///< [in] Number of valid items to load
793 DefaultT oob_default) ///< [in] Default value to assign out-of-bound items
794 {
795 LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
796 }
797
798 };
799
800
801 /**
802 * BLOCK_LOAD_TRANSPOSE specialization of load helper
803 */
804 template <int DUMMY>
805 struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
806 {
807 // BlockExchange utility type for keys
808 typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
809
810 /// Shared memory storage layout type
811 struct _TempStorage : BlockExchange::TempStorage
812 {};
813
814 /// Alias wrapper allowing storage to be unioned
815 struct TempStorage : Uninitialized<_TempStorage> {};
816
817 /// Thread reference to shared storage
818 _TempStorage &temp_storage;
819
820 /// Linear thread-id
821 int linear_tid;
822
823 /// Constructor
824 __device__ __forceinline__ LoadInternal(
825 TempStorage &temp_storage,
826 int linear_tid)
827 :
828 temp_storage(temp_storage.Alias()),
829 linear_tid(linear_tid)
830 {}
831
832 /// Load a linear segment of items from memory
833 template <typename InputIteratorT>
834 __device__ __forceinline__ void Load(
835 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
836 InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
837 {
838 LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
839 BlockExchange(temp_storage).StripedToBlocked(items, items);
840 }
841
842 /// Load a linear segment of items from memory, guarded by range
843 template <typename InputIteratorT>
844 __device__ __forceinline__ void Load(
845 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
846 InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
847 int valid_items) ///< [in] Number of valid items to load
848 {
849 LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
850 BlockExchange(temp_storage).StripedToBlocked(items, items);
851 }
852
853 /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
854 template <typename InputIteratorT, typename DefaultT>
855 __device__ __forceinline__ void Load(
856 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
857 InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
858 int valid_items, ///< [in] Number of valid items to load
859 DefaultT oob_default) ///< [in] Default value to assign out-of-bound items
860 {
861 LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);
862 BlockExchange(temp_storage).StripedToBlocked(items, items);
863 }
864
865 };
866
867
868 /**
869 * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper
870 */
871 template <int DUMMY>
872 struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE, DUMMY>
873 {
874 enum
875 {
876 WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
877 };
878
879 // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
880 CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
881
882 // BlockExchange utility type for keys
883 typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
884
885 /// Shared memory storage layout type
886 struct _TempStorage : BlockExchange::TempStorage
887 {};
888
889 /// Alias wrapper allowing storage to be unioned
890 struct TempStorage : Uninitialized<_TempStorage> {};
891
892 /// Thread reference to shared storage
893 _TempStorage &temp_storage;
894
895 /// Linear thread-id
896 int linear_tid;
897
898 /// Constructor
899 __device__ __forceinline__ LoadInternal(
900 TempStorage &temp_storage,
901 int linear_tid)
902 :
903 temp_storage(temp_storage.Alias()),
904 linear_tid(linear_tid)
905 {}
906
907 /// Load a linear segment of items from memory
908 template <typename InputIteratorT>
909 __device__ __forceinline__ void Load(
910 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
911 InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
912 {
913 LoadDirectWarpStriped(linear_tid, block_itr, items);
914 BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
915 }
916
917 /// Load a linear segment of items from memory, guarded by range
918 template <typename InputIteratorT>
919 __device__ __forceinline__ void Load(
920 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
921 InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
922 int valid_items) ///< [in] Number of valid items to load
923 {
924 LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
925 BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
926 }
927
928
929 /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
930 template <typename InputIteratorT, typename DefaultT>
931 __device__ __forceinline__ void Load(
932 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
933 InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
934 int valid_items, ///< [in] Number of valid items to load
935 DefaultT oob_default) ///< [in] Default value to assign out-of-bound items
936 {
937 LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
938 BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
939 }
940 };
941
942
943 /**
944 * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper
945 */
946 template <int DUMMY>
947 struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY>
948 {
949 enum
950 {
951 WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
952 };
953
954 // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
955 CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
956
957 // BlockExchange utility type for keys
958 typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
959
960 /// Shared memory storage layout type
961 struct _TempStorage : BlockExchange::TempStorage
962 {};
963
964 /// Alias wrapper allowing storage to be unioned
965 struct TempStorage : Uninitialized<_TempStorage> {};
966
967 /// Thread reference to shared storage
968 _TempStorage &temp_storage;
969
970 /// Linear thread-id
971 int linear_tid;
972
973 /// Constructor
974 __device__ __forceinline__ LoadInternal(
975 TempStorage &temp_storage,
976 int linear_tid)
977 :
978 temp_storage(temp_storage.Alias()),
979 linear_tid(linear_tid)
980 {}
981
982 /// Load a linear segment of items from memory
983 template <typename InputIteratorT>
984 __device__ __forceinline__ void Load(
985 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
986 InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
987 {
988 LoadDirectWarpStriped(linear_tid, block_itr, items);
989 BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
990 }
991
992 /// Load a linear segment of items from memory, guarded by range
993 template <typename InputIteratorT>
994 __device__ __forceinline__ void Load(
995 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
996 InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
997 int valid_items) ///< [in] Number of valid items to load
998 {
999 LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
1000 BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
1001 }
1002
1003
1004 /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
1005 template <typename InputIteratorT, typename DefaultT>
1006 __device__ __forceinline__ void Load(
1007 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
1008 InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
1009 int valid_items, ///< [in] Number of valid items to load
1010 DefaultT oob_default) ///< [in] Default value to assign out-of-bound items
1011 {
1012 LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
1013 BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
1014 }
1015 };
1016
1017
1018 /******************************************************************************
1019 * Type definitions
1020 ******************************************************************************/
1021
1022 /// Internal load implementation to use
1023 typedef LoadInternal<ALGORITHM, 0> InternalLoad;
1024
1025
1026 /// Shared memory storage layout type
1027 typedef typename InternalLoad::TempStorage _TempStorage;
1028
1029
1030 /******************************************************************************
1031 * Utility methods
1032 ******************************************************************************/
1033
1034 /// Internal storage allocator
1035 __device__ __forceinline__ _TempStorage& PrivateStorage()
1036 {
1037 __shared__ _TempStorage private_storage;
1038 return private_storage;
1039 }
1040
1041
1042 /******************************************************************************
1043 * Thread fields
1044 ******************************************************************************/
1045
1046 /// Thread reference to shared storage
1047 _TempStorage &temp_storage;
1048
1049 /// Linear thread-id
1050 int linear_tid;
1051
1052 public:
1053
1054 /// \smemstorage{BlockLoad}
1055 struct TempStorage : Uninitialized<_TempStorage> {};
1056
1057
1058 /******************************************************************//**
1059 * \name Collective constructors
1060 *********************************************************************/
1061 //@{
1062
1063 /**
1064 * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
1065 */
1066 __device__ __forceinline__ BlockLoad()
1067 :
1068 temp_storage(PrivateStorage()),
1069 linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
1070 {}
1071
1072
1073 /**
1074 * \brief Collective constructor using the specified memory allocation as temporary storage.
1075 */
1076 __device__ __forceinline__ BlockLoad(
1077 TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
1078 :
1079 temp_storage(temp_storage.Alias()),
1080 linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
1081 {}
1082
1083
1084
1085
1086 //@} end member group
1087 /******************************************************************//**
1088 * \name Data movement
1089 *********************************************************************/
1090 //@{
1091
1092
1093 /**
1094 * \brief Load a linear segment of items from memory.
1095 *
1096 * \par
1097 * - \blocked
1098 * - \smemreuse
1099 *
1100 * \par Snippet
1101 * The code snippet below illustrates the loading of a linear
1102 * segment of 512 integers into a "blocked" arrangement across 128 threads where each
1103 * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
1104 * meaning memory references are efficiently coalesced using a warp-striped access
1105 * pattern (after which items are locally reordered among threads).
1106 * \par
1107 * \code
1108 * #include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
1109 *
1110 * __global__ void ExampleKernel(int *d_data, ...)
1111 * {
1112 * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
1113 * typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
1114 *
1115 * // Allocate shared memory for BlockLoad
1116 * __shared__ typename BlockLoad::TempStorage temp_storage;
1117 *
1118 * // Load a segment of consecutive items that are blocked across threads
1119 * int thread_data[4];
1120 * BlockLoad(temp_storage).Load(d_data, thread_data);
1121 *
1122 * \endcode
1123 * \par
1124 * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>.
1125 * The set of \p thread_data across the block of threads will be
1126 * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
1127 *
1128 */
1129 template <typename InputIteratorT>
1130 __device__ __forceinline__ void Load(
1131 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
1132 InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
1133 {
1134 InternalLoad(temp_storage, linear_tid).Load(block_itr, items);
1135 }
1136
1137
1138 /**
1139 * \brief Load a linear segment of items from memory, guarded by range.
1140 *
1141 * \par
1142 * - \blocked
1143 * - \smemreuse
1144 *
1145 * \par Snippet
1146 * The code snippet below illustrates the guarded loading of a linear
1147 * segment of 512 integers into a "blocked" arrangement across 128 threads where each
1148 * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
1149 * meaning memory references are efficiently coalesced using a warp-striped access
1150 * pattern (after which items are locally reordered among threads).
1151 * \par
1152 * \code
1153 * #include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
1154 *
1155 * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
1156 * {
1157 * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
1158 * typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
1159 *
1160 * // Allocate shared memory for BlockLoad
1161 * __shared__ typename BlockLoad::TempStorage temp_storage;
1162 *
1163 * // Load a segment of consecutive items that are blocked across threads
1164 * int thread_data[4];
1165 * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items);
1166 *
1167 * \endcode
1168 * \par
1169 * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, 6...</tt> and \p valid_items is \p 5.
1170 * The set of \p thread_data across the block of threads will be
1171 * <tt>{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }</tt>, with only the first two threads
1172 * being unmasked to load portions of valid data (and other items remaining unassigned).
1173 *
1174 */
1175 template <typename InputIteratorT>
1176 __device__ __forceinline__ void Load(
1177 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
1178 InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
1179 int valid_items) ///< [in] Number of valid items to load
1180 {
1181 InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items);
1182 }
1183
1184
1185 /**
1186 * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
1187 *
1188 * \par
1189 * - \blocked
1190 * - \smemreuse
1191 *
1192 * \par Snippet
1193 * The code snippet below illustrates the guarded loading of a linear
1194 * segment of 512 integers into a "blocked" arrangement across 128 threads where each
1195 * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
1196 * meaning memory references are efficiently coalesced using a warp-striped access
1197 * pattern (after which items are locally reordered among threads).
1198 * \par
1199 * \code
1200 * #include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
1201 *
1202 * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
1203 * {
1204 * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
1205 * typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
1206 *
1207 * // Allocate shared memory for BlockLoad
1208 * __shared__ typename BlockLoad::TempStorage temp_storage;
1209 *
1210 * // Load a segment of consecutive items that are blocked across threads
1211 * int thread_data[4];
1212 * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1);
1213 *
1214 * \endcode
1215 * \par
1216 * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, 6...</tt>,
1217 * \p valid_items is \p 5, and the out-of-bounds default is \p -1.
1218 * The set of \p thread_data across the block of threads will be
1219 * <tt>{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }</tt>, with only the first two threads
1220 * being unmasked to load portions of valid data (and other items are assigned \p -1)
1221 *
1222 */
1223 template <typename InputIteratorT, typename DefaultT>
1224 __device__ __forceinline__ void Load(
1225 InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
1226 InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
1227 int valid_items, ///< [in] Number of valid items to load
1228 DefaultT oob_default) ///< [in] Default value to assign out-of-bound items
1229 {
1230 InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default);
1231 }
1232
1233
1234 //@} end member group
1235
1236 };
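//
// Illustrative sketch (an assumption for exposition, not part of the CUB API surface):
// a typical tile-processing loop that uses the unguarded Load() for full tiles and the
// guarded, defaulted overload for the final partial tile. The kernel name, tile sizing,
// and default value below are hypothetical.
//
//     __global__ void ExampleTileLoopKernel(const int *d_in, int num_items)
//     {
//         enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 4, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD };
//
//         typedef cub::BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadT;
//         __shared__ typename BlockLoadT::TempStorage temp_storage;
//
//         int thread_items[ITEMS_PER_THREAD];
//         for (int tile_offset = blockIdx.x * TILE_ITEMS;
//              tile_offset < num_items;
//              tile_offset += gridDim.x * TILE_ITEMS)
//         {
//             int valid_items = num_items - tile_offset;
//             if (valid_items >= TILE_ITEMS)
//                 BlockLoadT(temp_storage).Load(d_in + tile_offset, thread_items);                   // full tile
//             else
//                 BlockLoadT(temp_storage).Load(d_in + tile_offset, thread_items, valid_items, 0);   // partial tile
//
//             __syncthreads();   // temp_storage is reused by the next iteration's exchange
//
//             // ... process thread_items ...
//         }
//     }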
1237
1238
1239 } // CUB namespace
1240 CUB_NS_POSTFIX // Optional outer namespace(s)
1241
1242