/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2016, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * Operations for reading linear tiles of data into the CUDA thread block.
 */

#pragma once

#include <iterator>

#include "block_exchange.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_ptx.cuh"
#include "../util_macro.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \addtogroup UtilIo
 * @{
 */


/******************************************************************//**
 * \name Blocked arrangement I/O (direct)
 *********************************************************************/
//@{


/**
 * \brief Load a linear segment of items into a blocked arrangement across the thread block.
 *
 * \blocked
 *
 * \tparam InputT               <b>[inferred]</b> The data type to load.
 * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
 * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
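 *
 * \par Snippet
 * A minimal usage sketch (hypothetical kernel; assumes a 1D block of 128
 * threads and an input segment of at least 512 integers):
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
 *
 * __global__ void ExampleKernel(int *d_data)
 * {
 *     // Each thread loads 4 consecutive items starting at its own offset
 *     int thread_data[4];
 *     cub::LoadDirectBlocked(threadIdx.x, d_data, thread_data);
 * }
 * \endcode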
 */
template <
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectBlocked(
    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
    InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);

    // Load directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        items[ITEM] = thread_itr[ITEM];
    }
}


/**
 * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.
 *
 * \blocked
 *
 * \tparam InputT               <b>[inferred]</b> The data type to load.
 * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
 * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
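 *
 * \par Snippet
 * A minimal usage sketch (hypothetical kernel; \p num_items guards the final,
 * partially-full tile, and out-of-range items are left unassigned):
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
 *
 * __global__ void ExampleKernel(int *d_data, int num_items)
 * {
 *     // Threads whose items fall past the end of the input skip those loads
 *     int thread_data[4];
 *     cub::LoadDirectBlocked(threadIdx.x, d_data, thread_data, num_items);
 * }
 * \endcode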
 */
template <
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectBlocked(
    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
    int             valid_items)                ///< [in] Number of valid items to load
{
    InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);

    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items)
        {
            items[ITEM] = thread_itr[ITEM];
        }
    }
}


/**
 * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
 *
 * \blocked
 *
 * \tparam InputT               <b>[inferred]</b> The data type to load.
 * \tparam DefaultT             <b>[inferred]</b> The data type of the default value to assign out-of-bound items.
 * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
 * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
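 *
 * \par Snippet
 * A minimal usage sketch (hypothetical kernel; every item is first assigned
 * the out-of-bounds default \p -1, then the guarded load overwrites the valid prefix):
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
 *
 * __global__ void ExampleKernel(int *d_data, int num_items)
 * {
 *     // Out-of-range items end up holding -1
 *     int thread_data[4];
 *     cub::LoadDirectBlocked(threadIdx.x, d_data, thread_data, num_items, -1);
 * }
 * \endcode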
 */
template <
    typename        InputT,
    typename        DefaultT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectBlocked(
    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
    int             valid_items,                ///< [in] Number of valid items to load
    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
{
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        items[ITEM] = oob_default;

    LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
}


#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Internal implementation for load vectorization
 */
template <
    CacheLoadModifier   MODIFIER,
    typename            T,
    int                 ITEMS_PER_THREAD>
__device__ __forceinline__ void InternalLoadDirectBlockedVectorized(
    int    linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
    T      *block_ptr,                 ///< [in] Input pointer for loading from
    T      (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
    // Biggest memory access word that T is a whole multiple of
    typedef typename UnitWord<T>::DeviceWord DeviceWord;

    enum
    {
        TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord),

        VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ?
            4 :
            (TOTAL_WORDS % 2 == 0) ?
                2 :
                1,

        VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE,
    };

    // Vector type
    typedef typename CubVector<DeviceWord, VECTOR_SIZE>::Type Vector;

    // Vector items
    Vector vec_items[VECTORS_PER_THREAD];

    // Aliased input ptr
    Vector* vec_ptr = reinterpret_cast<Vector*>(block_ptr) + (linear_tid * VECTORS_PER_THREAD);

    // Load directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++)
    {
        vec_items[ITEM] = ThreadLoad<MODIFIER>(vec_ptr + ITEM);
    }

    // Copy
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        items[ITEM] = *(reinterpret_cast<T*>(vec_items) + ITEM);
    }
}

#endif // DOXYGEN_SHOULD_SKIP_THIS


/**
 * \brief Load a linear segment of items into a blocked arrangement across the thread block.
 *
 * \blocked
 *
 * The input pointer \p block_ptr must be quad-item aligned.
 *
 * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
 *   - \p ITEMS_PER_THREAD is odd
 *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
 *
 * \tparam T                    <b>[inferred]</b> The data type to load.
 * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
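 *
 * \par Snippet
 * A minimal usage sketch (hypothetical kernel; \p d_data must be a native
 * pointer satisfying the alignment requirement above, e.g., as returned by cudaMalloc):
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
 *
 * __global__ void ExampleKernel(int *d_data)
 * {
 *     // With 4 ints per thread, this can compile to a single ld.global.v4 per thread
 *     int thread_data[4];
 *     cub::LoadDirectBlockedVectorized(threadIdx.x, d_data, thread_data);
 * }
 * \endcode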
 */
template <
    typename        T,
    int             ITEMS_PER_THREAD>
__device__ __forceinline__ void LoadDirectBlockedVectorized(
    int linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
    T   *block_ptr,                 ///< [in] Input pointer for loading from
    T   (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
    InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
}


//@}  end member group
/******************************************************************//**
 * \name Striped arrangement I/O (direct)
 *********************************************************************/
//@{


/**
 * \brief Load a linear segment of items into a striped arrangement across the thread block.
 *
 * \striped
 *
 * \tparam BLOCK_THREADS        The thread block size in threads
 * \tparam InputT               <b>[inferred]</b> The data type to load.
 * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
 * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
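 *
 * \par Snippet
 * A minimal usage sketch (hypothetical kernel; \p BLOCK_THREADS is supplied
 * explicitly because it cannot be inferred from the arguments):
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
 *
 * __global__ void ExampleKernel(int *d_data)
 * {
 *     // With 128 threads, thread i loads items i, i+128, i+256, and i+384
 *     int thread_data[4];
 *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
 * }
 * \endcode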
 */
template <
    int             BLOCK_THREADS,
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectStriped(
    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
    InputIteratorT thread_itr = block_itr + linear_tid;

    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
    }
}


/**
 * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range
 *
 * \striped
 *
 * \tparam BLOCK_THREADS        The thread block size in threads
 * \tparam InputT               <b>[inferred]</b> The data type to load.
 * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
 * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
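 *
 * \par Snippet
 * A minimal usage sketch (hypothetical kernel): because the stripe guard tests
 * <tt>linear_tid + (ITEM * BLOCK_THREADS)</tt> against \p valid_items, the valid
 * items remain densely packed across the low-order stripes:
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
 *
 * __global__ void ExampleKernel(int *d_data, int num_items)
 * {
 *     int thread_data[4];
 *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data, num_items);
 * }
 * \endcode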
 */
template <
    int             BLOCK_THREADS,
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectStriped(
    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
    int             valid_items)                ///< [in] Number of valid items to load
{
    InputIteratorT thread_itr = block_itr + linear_tid;

    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items)
        {
            items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
        }
    }
}


/**
 * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
 *
 * \striped
 *
 * \tparam BLOCK_THREADS        The thread block size in threads
 * \tparam InputT               <b>[inferred]</b> The data type to load.
 * \tparam DefaultT             <b>[inferred]</b> The data type of the default value to assign out-of-bound items.
 * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
 * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
 */
template <
    int             BLOCK_THREADS,
    typename        InputT,
    typename        DefaultT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectStriped(
    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
    int             valid_items,                ///< [in] Number of valid items to load
    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
{
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        items[ITEM] = oob_default;

    LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
}



//@}  end member group
/******************************************************************//**
 * \name Warp-striped arrangement I/O (direct)
 *********************************************************************/
//@{


/**
 * \brief Load a linear segment of items into a warp-striped arrangement across the thread block.
 *
 * \warpstriped
 *
 * \par Usage Considerations
 * The number of threads in the thread block must be a multiple of the architecture's warp size.
 *
 * \tparam InputT               <b>[inferred]</b> The data type to load.
 * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
 * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
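 *
 * \par Snippet
 * A minimal usage sketch (hypothetical kernel; the block size of 128 threads
 * is a multiple of the warp size, as required):
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
 *
 * __global__ void ExampleKernel(int *d_data)
 * {
 *     // Each warp reads its own contiguous segment of 32 * 4 items in striped order
 *     int thread_data[4];
 *     cub::LoadDirectWarpStriped(threadIdx.x, d_data, thread_data);
 * }
 * \endcode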
 */
template <
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectWarpStriped(
    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
    int tid                = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid                = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset        = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    InputIteratorT thread_itr = block_itr + warp_offset + tid;

    // Load directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)];
    }
}


/**
 * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range
 *
 * \warpstriped
 *
 * \par Usage Considerations
 * The number of threads in the thread block must be a multiple of the architecture's warp size.
 *
 * \tparam InputT               <b>[inferred]</b> The data type to load.
 * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
 * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
 */
template <
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectWarpStriped(
    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
    int             valid_items)                ///< [in] Number of valid items to load
{
    int tid                = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid                = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset        = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    InputIteratorT thread_itr = block_itr + warp_offset + tid;

    // Load directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
        {
            items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)];
        }
    }
}


/**
 * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
 *
 * \warpstriped
 *
 * \par Usage Considerations
 * The number of threads in the thread block must be a multiple of the architecture's warp size.
 *
 * \tparam InputT               <b>[inferred]</b> The data type to load.
 * \tparam DefaultT             <b>[inferred]</b> The data type of the default value to assign out-of-bound items.
 * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
 * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
 */
template <
    typename        InputT,
    typename        DefaultT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectWarpStriped(
    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
    int             valid_items,                ///< [in] Number of valid items to load
    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
{
    // Initialize items with the out-of-bounds default
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        items[ITEM] = oob_default;

    LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
}



//@}  end member group

/** @} */       // end group UtilIo



//-----------------------------------------------------------------------------
// Generic BlockLoad abstraction
//-----------------------------------------------------------------------------

/**
 * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
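 *
 * \par
 * A minimal sketch of how the algorithm parameter is selected (hypothetical
 * parameterization; see cub::BlockLoad below for a complete usage snippet):
 * \code
 * // Same collective Load() interface, different underlying access strategy
 * typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_DIRECT>         DirectLoad;
 * typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> TransposeLoad;
 * \endcode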
 */
enum BlockLoadAlgorithm
{
    /**
     * \par Overview
     *
     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
     * directly from memory.
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) decreases as the
     *   access stride between threads increases (i.e., the number of items per thread).
     */
    BLOCK_LOAD_DIRECT,

    /**
     * \par Overview
     *
     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
     * from memory using CUDA's built-in vectorized loads as a coalescing optimization.
     * For example, <tt>ld.global.v4.s32</tt> instructions will be generated
     * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) remains high until the
     *   access stride between threads (i.e., the number of items per thread) exceeds the
     *   maximum vector load width (typically 4 items or 64B, whichever is lower).
     * - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
     *   - \p ITEMS_PER_THREAD is odd
     *   - The \p InputIteratorT is not a simple pointer type
     *   - The block input offset is not quadword-aligned
     *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
     */
    BLOCK_LOAD_VECTORIZE,

    /**
     * \par Overview
     *
     * A [<em>striped arrangement</em>](index.html#sec5sec3) of data is read
     * efficiently from memory and then locally transposed into a
     * [<em>blocked arrangement</em>](index.html#sec5sec3).
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) remains high regardless
     *   of items loaded per thread.
     * - The local reordering incurs slightly higher latencies and lower throughput than the
     *   direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
     */
    BLOCK_LOAD_TRANSPOSE,


    /**
     * \par Overview
     *
     * A [<em>warp-striped arrangement</em>](index.html#sec5sec3) of data is
     * read efficiently from memory and then locally transposed into a
     * [<em>blocked arrangement</em>](index.html#sec5sec3).
     *
     * \par Usage Considerations
     * - BLOCK_THREADS must be a multiple of WARP_THREADS
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) remains high regardless
     *   of items loaded per thread.
     * - The local reordering incurs slightly larger latencies than the
     *   direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
     * - Provisions more shared storage, but incurs smaller latencies than the
     *   BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative.
     */
    BLOCK_LOAD_WARP_TRANSPOSE,


    /**
     * \par Overview
     *
     * Like \p BLOCK_LOAD_WARP_TRANSPOSE, a [<em>warp-striped arrangement</em>](index.html#sec5sec3)
     * of data is read directly from memory and then is locally transposed into a
     * [<em>blocked arrangement</em>](index.html#sec5sec3). To reduce the shared memory
     * requirement, only one warp's worth of shared memory is provisioned and is
     * subsequently time-sliced among warps.
     *
     * \par Usage Considerations
     * - BLOCK_THREADS must be a multiple of WARP_THREADS
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) remains high regardless
     *   of items loaded per thread.
     * - Provisions less shared memory temporary storage, but incurs larger
     *   latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative.
     */
    BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,
};


/**
 * \brief The BlockLoad class provides [<em>collective</em>](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [<em>blocked arrangement</em>](index.html#sec5sec3) across a CUDA thread block.  ![](block_load_logo.png)
 * \ingroup BlockModule
 * \ingroup UtilIo
 *
 * \tparam InputT               The data type to read into (which must be convertible from the input iterator's value type).
 * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
 * \tparam ITEMS_PER_THREAD     The number of consecutive items partitioned onto each thread.
 * \tparam ALGORITHM            <b>[optional]</b> cub::BlockLoadAlgorithm tuning policy.  default: cub::BLOCK_LOAD_DIRECT.
 * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
 * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
 * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
 *
 * \par Overview
 * - The BlockLoad class provides a single data movement abstraction that can be specialized
 *   to implement different cub::BlockLoadAlgorithm strategies.  This facilitates different
 *   performance policies for different architectures, data types, granularity sizes, etc.
 * - BlockLoad can be optionally specialized by different data movement strategies:
 *   -# <b>cub::BLOCK_LOAD_DIRECT</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
 *      of data is read directly from memory.  [More...](\ref cub::BlockLoadAlgorithm)
 *   -# <b>cub::BLOCK_LOAD_VECTORIZE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
 *      of data is read directly from memory using CUDA's built-in vectorized loads as a
 *      coalescing optimization.  [More...](\ref cub::BlockLoadAlgorithm)
 *   -# <b>cub::BLOCK_LOAD_TRANSPOSE</b>.  A [<em>striped arrangement</em>](index.html#sec5sec3)
 *      of data is read directly from memory and is then locally transposed into a
 *      [<em>blocked arrangement</em>](index.html#sec5sec3).  [More...](\ref cub::BlockLoadAlgorithm)
 *   -# <b>cub::BLOCK_LOAD_WARP_TRANSPOSE</b>.  A [<em>warp-striped arrangement</em>](index.html#sec5sec3)
 *      of data is read directly from memory and is then locally transposed into a
 *      [<em>blocked arrangement</em>](index.html#sec5sec3).  [More...](\ref cub::BlockLoadAlgorithm)
 *   -# <b>cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED</b>.  A [<em>warp-striped arrangement</em>](index.html#sec5sec3)
 *      of data is read directly from memory and is then locally transposed into a
 *      [<em>blocked arrangement</em>](index.html#sec5sec3) one warp at a time.  [More...](\ref cub::BlockLoadAlgorithm)
 * - \rowmajor
 *
 * \par A Simple Example
 * \blockcollective{BlockLoad}
 * \par
 * The code snippet below illustrates the loading of a linear
 * segment of 512 integers into a "blocked" arrangement across 128 threads where each
 * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
 * meaning memory references are efficiently coalesced using a warp-striped access
 * pattern (after which items are locally reordered among threads).
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
 *
 * __global__ void ExampleKernel(int *d_data, ...)
 * {
 *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
 *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
 *
 *     // Allocate shared memory for BlockLoad
 *     __shared__ typename BlockLoad::TempStorage temp_storage;
 *
 *     // Load a segment of consecutive items that are blocked across threads
 *     int thread_data[4];
 *     BlockLoad(temp_storage).Load(d_data, thread_data);
 * }
 * \endcode
 * \par
 * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>.
 * The set of \p thread_data across the block of threads will be
 * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
 *
 */
template <
    typename            InputT,
    int                 BLOCK_DIM_X,
    int                 ITEMS_PER_THREAD,
    BlockLoadAlgorithm  ALGORITHM           = BLOCK_LOAD_DIRECT,
    int                 BLOCK_DIM_Y         = 1,
    int                 BLOCK_DIM_Z         = 1,
    int                 PTX_ARCH            = CUB_PTX_ARCH>
class BlockLoad
{
private:

    /******************************************************************************
     * Constants and type definitions
     ******************************************************************************/

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };


    /******************************************************************************
     * Algorithmic variants
     ******************************************************************************/

    /// Load helper
    template <BlockLoadAlgorithm _POLICY, int DUMMY>
    struct LoadInternal;


    /**
     * BLOCK_LOAD_DIRECT specialization of load helper
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_DIRECT, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(
            TempStorage &/*temp_storage*/,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Load a linear segment of items from memory
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
        {
            LoadDirectBlocked(linear_tid, block_itr, items);
        }

        /// Load a linear segment of items from memory, guarded by range
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
            int             valid_items)                    ///< [in] Number of valid items to load
        {
            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
            int             valid_items,                    ///< [in] Number of valid items to load
            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
        {
            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
        }

    };


    /**
     * BLOCK_LOAD_VECTORIZE specialization of load helper
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_VECTORIZE, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(
            TempStorage &/*temp_storage*/,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputT          *block_ptr,                     ///< [in] The thread block's base input pointer for loading from
            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
        {
            InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
        }

        /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            const InputT    *block_ptr,                     ///< [in] The thread block's base input pointer for loading from
            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
        {
            InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
        }

        /// Load a linear segment of items from memory, specialized for cache-modified iterators over native pointers (attempts vectorization)
        template <
            CacheLoadModifier   MODIFIER,
            typename            ValueType,
            typename            OffsetT>
        __device__ __forceinline__ void Load(
            CacheModifiedInputIterator<MODIFIER, ValueType, OffsetT>    block_itr,                      ///< [in] The thread block's base input iterator for loading from
            InputT                                                     (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
        {
            InternalLoadDirectBlockedVectorized<MODIFIER>(linear_tid, block_itr.ptr, items);
        }

        /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization)
        template <typename _InputIteratorT>
        __device__ __forceinline__ void Load(
            _InputIteratorT  block_itr,                     ///< [in] The thread block's base input iterator for loading from
            InputT           (&items)[ITEMS_PER_THREAD])    ///< [out] Data to load
        {
            LoadDirectBlocked(linear_tid, block_itr, items);
        }

        /// Load a linear segment of items from memory, guarded by range (skips vectorization)
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
            int             valid_items)                    ///< [in] Number of valid items to load
        {
            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization)
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
            int             valid_items,                    ///< [in] Number of valid items to load
            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
        {
            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
        }

    };


    /**
     * BLOCK_LOAD_TRANSPOSE specialization of load helper
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
    {
        // BlockExchange utility type for keys
        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {};

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Load a linear segment of items from memory
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
        {
            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
            BlockExchange(temp_storage).StripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
            int             valid_items)                    ///< [in] Number of valid items to load
        {
            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
            BlockExchange(temp_storage).StripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
            int             valid_items,                    ///< [in] Number of valid items to load
            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
        {
            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);
            BlockExchange(temp_storage).StripedToBlocked(items, items);
        }

    };


    /**
     * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE, DUMMY>
    {
        enum
        {
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        // BlockExchange utility type for keys
        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {};

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Load a linear segment of items from memory
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items);
            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
            int             valid_items)                    ///< [in] Number of valid items to load
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
        }


        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
            int             valid_items,                    ///< [in] Number of valid items to load
            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
        }
    };


    /**
     * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY>
    {
        enum
        {
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        // BlockExchange utility type for keys
        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {};

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Load a linear segment of items from memory
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items);
            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
            int             valid_items)                    ///< [in] Number of valid items to load
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
        }


        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
            int             valid_items,                    ///< [in] Number of valid items to load
            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
        }
    };


    /******************************************************************************
     * Type definitions
     ******************************************************************************/

    /// Internal load implementation to use
    typedef LoadInternal<ALGORITHM, 0> InternalLoad;


    /// Shared memory storage layout type
    typedef typename InternalLoad::TempStorage _TempStorage;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Thread reference to shared storage
    _TempStorage &temp_storage;

    /// Linear thread-id
    int linear_tid;

public:

    /// \smemstorage{BlockLoad}
    struct TempStorage : Uninitialized<_TempStorage> {};


    /******************************************************************//**
     * \name Collective constructors
     *********************************************************************/
    //@{

    /**
     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
     */
    __device__ __forceinline__ BlockLoad()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    /**
     * \brief Collective constructor using the specified memory allocation as temporary storage.
     */
    __device__ __forceinline__ BlockLoad(
        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}




    //@}  end member group
    /******************************************************************//**
     * \name Data movement
     *********************************************************************/
    //@{


    /**
     * \brief Load a linear segment of items from memory.
     *
     * \par
     * - \blocked
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the loading of a linear
     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
     * meaning memory references are efficiently coalesced using a warp-striped access
     * pattern (after which items are locally reordered among threads).
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, ...)
     * {
     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
     *
     *     // Allocate shared memory for BlockLoad
     *     __shared__ typename BlockLoad::TempStorage temp_storage;
     *
     *     // Load a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     BlockLoad(temp_storage).Load(d_data, thread_data);
     * }
     * \endcode
     * \par
     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>.
     * The set of \p thread_data across the block of threads will be
     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
     *
     */
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
        InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
    {
        InternalLoad(temp_storage, linear_tid).Load(block_itr, items);
    }


    /**
     * \brief Load a linear segment of items from memory, guarded by range.
     *
     * \par
     * - \blocked
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the guarded loading of a linear
     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
     * meaning memory references are efficiently coalesced using a warp-striped access
     * pattern (after which items are locally reordered among threads).
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
     * {
     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
     *
     *     // Allocate shared memory for BlockLoad
     *     __shared__ typename BlockLoad::TempStorage temp_storage;
     *
     *     // Load a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     BlockLoad(temp_storage).Load(d_data, thread_data, valid_items);
     * }
     * \endcode
     * \par
     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, 6...</tt> and \p valid_items is \p 5.
     * The set of \p thread_data across the block of threads will be
     * <tt>{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }</tt>, with only the first two threads
     * being unmasked to load portions of valid data (and other items remaining unassigned).
     *
     */
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
        InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
        int             valid_items)                ///< [in] Number of valid items to load
    {
        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items);
    }


    /**
     * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
     *
     * \par
     * - \blocked
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the guarded loading of a linear
     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
     * meaning memory references are efficiently coalesced using a warp-striped access
     * pattern (after which items are locally reordered among threads).
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
     * {
     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
     *
     *     // Allocate shared memory for BlockLoad
     *     __shared__ typename BlockLoad::TempStorage temp_storage;
     *
     *     // Load a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1);
     * }
     * \endcode
     * \par
     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, 6...</tt>,
     * \p valid_items is \p 5, and the out-of-bounds default is \p -1.
     * The set of \p thread_data across the block of threads will be
     * <tt>{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }</tt>, with only the first two threads
     * being unmasked to load portions of valid data (and other items are assigned \p -1).
     *
     */
    template <typename InputIteratorT, typename DefaultT>
    __device__ __forceinline__ void Load(
        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
        InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
        int             valid_items,                ///< [in] Number of valid items to load
        DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
    {
        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default);
    }


    //@}  end member group

};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
