1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of the NVIDIA CORPORATION nor the
13  *       names of its contributors may be used to endorse or promote products
14  *       derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
29 /**
30  * \file
31  * Thread utilities for reading memory using PTX cache modifiers.
32  */
33 
34 #pragma once
35 
36 #include <cuda.h>
37 
38 #include <iterator>
39 
40 #include "../util_ptx.cuh"
41 #include "../util_type.cuh"
42 #include "../util_namespace.cuh"
43 
44 /// Optional outer namespace(s)
45 CUB_NS_PREFIX
46 
47 /// CUB namespace
48 namespace cub {
49 
50 /**
51  * \addtogroup UtilIo
52  * @{
53  */
54 
55 //-----------------------------------------------------------------------------
56 // Tags and constants
57 //-----------------------------------------------------------------------------
58 
59 /**
60  * \brief Enumeration of cache modifiers for memory load operations.
61  */
62 enum CacheLoadModifier
63 {
64     LOAD_DEFAULT,       ///< Default (no modifier)
65     LOAD_CA,            ///< Cache at all levels
66     LOAD_CG,            ///< Cache at global level
67     LOAD_CS,            ///< Cache streaming (likely to be accessed once)
68     LOAD_CV,            ///< Cache as volatile (including cached system lines)
69     LOAD_LDG,           ///< Cache as texture
70     LOAD_VOLATILE,      ///< Volatile (any memory space)
71 };
72 
73 
74 /**
75  * \name Thread I/O (cache modified)
76  * @{
77  */
78 
79 /**
80  * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers.  Can be used to load any data type.
81  *
82  * \par Example
83  * \code
84  * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_load.cuh>
85  *
86  * // 32-bit load using cache-global modifier:
87  * int *d_in;
88  * int val = cub::ThreadLoad<cub::LOAD_CA>(d_in + threadIdx.x);
89  *
90  * // 16-bit load using default modifier
91  * short *d_in;
92  * short val = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in + threadIdx.x);
93  *
94  * // 256-bit load using cache-volatile modifier
95  * double4 *d_in;
96  * double4 val = cub::ThreadLoad<cub::LOAD_CV>(d_in + threadIdx.x);
97  *
98  * // 96-bit load using cache-streaming modifier
99  * struct TestFoo { bool a; short b; };
100  * TestFoo *d_struct;
101  * TestFoo val = cub::ThreadLoad<cub::LOAD_CS>(d_in + threadIdx.x);
102  * \endcode
103  *
104  * \tparam MODIFIER             <b>[inferred]</b> CacheLoadModifier enumeration
105  * \tparam InputIteratorT       <b>[inferred]</b> Input iterator type \iterator
106  */
107 template <
108     CacheLoadModifier MODIFIER,
109     typename InputIteratorT>
110 __device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr);
111 
112 
113 //@}  end member group
114 
115 
116 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
117 
118 
/// Helper structure for templated load iteration (inductive case).
/// Compile-time-unrolls COUNT..MAX-1 loads into the vals[] array.
template <int COUNT, int MAX>
struct IterateThreadLoad
{
    /// Load vals[COUNT] from ptr + COUNT using the given cache modifier, then recurse.
    template <CacheLoadModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Load(T const *ptr, T *vals)
    {
        vals[COUNT] = ThreadLoad<MODIFIER>(ptr + COUNT);
        IterateThreadLoad<COUNT + 1, MAX>::template Load<MODIFIER>(ptr, vals);
    }

    /// Load vals[COUNT] by plain dereference of itr[COUNT], then recurse.
    template <typename InputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(InputIteratorT itr, T *vals)
    {
        vals[COUNT] = itr[COUNT];
        IterateThreadLoad<COUNT + 1, MAX>::Dereference(itr, vals);
    }
};
137 
138 
/// Helper structure for templated load iteration (termination case).
/// When COUNT == MAX the recursion stops; both methods are no-ops.
template <int MAX>
struct IterateThreadLoad<MAX, MAX>
{
    template <CacheLoadModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Load(T const * /*ptr*/, T * /*vals*/) {}

    template <typename InputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(InputIteratorT /*itr*/, T * /*vals*/) {}
};
149 
150 
151 /**
152  * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier
153  */
154 #define _CUB_LOAD_16(cub_modifier, ptx_modifier)                                             \
155     template<>                                                                              \
156     __device__ __forceinline__ uint4 ThreadLoad<cub_modifier, uint4 const *>(uint4 const *ptr)                   \
157     {                                                                                       \
158         uint4 retval;                                                                       \
159         asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" :                 \
160             "=r"(retval.x),                                                                 \
161             "=r"(retval.y),                                                                 \
162             "=r"(retval.z),                                                                 \
163             "=r"(retval.w) :                                                                \
164             _CUB_ASM_PTR_(ptr));                                                            \
165         return retval;                                                                      \
166     }                                                                                       \
167     template<>                                                                              \
168     __device__ __forceinline__ ulonglong2 ThreadLoad<cub_modifier, ulonglong2 const *>(ulonglong2 const *ptr)    \
169     {                                                                                       \
170         ulonglong2 retval;                                                                  \
171         asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" :                         \
172             "=l"(retval.x),                                                                 \
173             "=l"(retval.y) :                                                                \
174             _CUB_ASM_PTR_(ptr));                                                            \
175         return retval;                                                                      \
176     }
177 
178 /**
179  * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier
180  */
181 #define _CUB_LOAD_8(cub_modifier, ptx_modifier)                                              \
182     template<>                                                                              \
183     __device__ __forceinline__ ushort4 ThreadLoad<cub_modifier, ushort4 const *>(ushort4 const *ptr)             \
184     {                                                                                       \
185         ushort4 retval;                                                                     \
186         asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" :                 \
187             "=h"(retval.x),                                                                 \
188             "=h"(retval.y),                                                                 \
189             "=h"(retval.z),                                                                 \
190             "=h"(retval.w) :                                                                \
191             _CUB_ASM_PTR_(ptr));                                                            \
192         return retval;                                                                      \
193     }                                                                                       \
194     template<>                                                                              \
195     __device__ __forceinline__ uint2 ThreadLoad<cub_modifier, uint2 const *>(uint2 const *ptr)                   \
196     {                                                                                       \
197         uint2 retval;                                                                       \
198         asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" :                         \
199             "=r"(retval.x),                                                                 \
200             "=r"(retval.y) :                                                                \
201             _CUB_ASM_PTR_(ptr));                                                            \
202         return retval;                                                                      \
203     }                                                                                       \
204     template<>                                                                              \
205     __device__ __forceinline__ unsigned long long ThreadLoad<cub_modifier, unsigned long long const *>(unsigned long long const *ptr)    \
206     {                                                                                       \
207         unsigned long long retval;                                                          \
208         asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" :                                  \
209             "=l"(retval) :                                                                  \
210             _CUB_ASM_PTR_(ptr));                                                            \
211         return retval;                                                                      \
212     }
213 
214 /**
215  * Define a uint (4B) ThreadLoad specialization for the given Cache load modifier
216  */
217 #define _CUB_LOAD_4(cub_modifier, ptx_modifier)                                              \
218     template<>                                                                              \
219     __device__ __forceinline__ unsigned int ThreadLoad<cub_modifier, unsigned int const *>(unsigned int const *ptr)                      \
220     {                                                                                       \
221         unsigned int retval;                                                                \
222         asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" :                                  \
223             "=r"(retval) :                                                                  \
224             _CUB_ASM_PTR_(ptr));                                                            \
225         return retval;                                                                      \
226     }
227 
228 
229 /**
230  * Define a unsigned short (2B) ThreadLoad specialization for the given Cache load modifier
231  */
232 #define _CUB_LOAD_2(cub_modifier, ptx_modifier)                                              \
233     template<>                                                                              \
234     __device__ __forceinline__ unsigned short ThreadLoad<cub_modifier, unsigned short const *>(unsigned short const *ptr)                \
235     {                                                                                       \
236         unsigned short retval;                                                              \
237         asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" :                                  \
238             "=h"(retval) :                                                                  \
239             _CUB_ASM_PTR_(ptr));                                                            \
240         return retval;                                                                      \
241     }
242 
243 
244 /**
245  * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier
246  */
247 #define _CUB_LOAD_1(cub_modifier, ptx_modifier)                                              \
248     template<>                                                                              \
249     __device__ __forceinline__ unsigned char ThreadLoad<cub_modifier, unsigned char const *>(unsigned char const *ptr)                   \
250     {                                                                                       \
251         unsigned short retval;                                                              \
252         asm volatile (                                                                      \
253         "{"                                                                                 \
254         "   .reg .u8 datum;"                                                                \
255         "    ld."#ptx_modifier".u8 datum, [%1];"                                            \
256         "    cvt.u16.u8 %0, datum;"                                                         \
257         "}" :                                                                               \
258             "=h"(retval) :                                                                  \
259             _CUB_ASM_PTR_(ptr));                                                            \
260         return (unsigned char) retval;                                                      \
261     }
262 
263 
264 /**
265  * Define powers-of-two ThreadLoad specializations for the given Cache load modifier
266  */
267 #define _CUB_LOAD_ALL(cub_modifier, ptx_modifier)                                            \
268     _CUB_LOAD_16(cub_modifier, ptx_modifier)                                                 \
269     _CUB_LOAD_8(cub_modifier, ptx_modifier)                                                  \
270     _CUB_LOAD_4(cub_modifier, ptx_modifier)                                                  \
271     _CUB_LOAD_2(cub_modifier, ptx_modifier)                                                  \
272     _CUB_LOAD_1(cub_modifier, ptx_modifier)                                                  \
273 
274 
275 /**
276  * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers
277  */
278 #if CUB_PTX_ARCH >= 200
_CUB_LOAD_ALL(LOAD_CA,ca)279     _CUB_LOAD_ALL(LOAD_CA, ca)
280     _CUB_LOAD_ALL(LOAD_CG, cg)
281     _CUB_LOAD_ALL(LOAD_CS, cs)
282     _CUB_LOAD_ALL(LOAD_CV, cv)
283 #else
284     _CUB_LOAD_ALL(LOAD_CA, global)
285     // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1
286     _CUB_LOAD_ALL(LOAD_CG, volatile.global)
287     _CUB_LOAD_ALL(LOAD_CS, global)
288     _CUB_LOAD_ALL(LOAD_CV, volatile.global)
289 #endif
290 
291 #if CUB_PTX_ARCH >= 350
292     _CUB_LOAD_ALL(LOAD_LDG, global.nc)
293 #else
294     _CUB_LOAD_ALL(LOAD_LDG, global)
295 #endif
296 
297 
298 // Macro cleanup
299 #undef _CUB_LOAD_ALL
300 #undef _CUB_LOAD_1
301 #undef _CUB_LOAD_2
302 #undef _CUB_LOAD_4
303 #undef _CUB_LOAD_8
304 #undef _CUB_LOAD_16
305 
306 
307 
308 /**
309  * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types
310  */
311 template <typename InputIteratorT>
312 __device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(
313     InputIteratorT          itr,
314     Int2Type<LOAD_DEFAULT>  /*modifier*/,
315     Int2Type<false>         /*is_pointer*/)
316 {
317     return *itr;
318 }
319 
320 
321 /**
322  * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types
323  */
324 template <typename T>
ThreadLoad(T * ptr,Int2Type<LOAD_DEFAULT>,Int2Type<true>)325 __device__ __forceinline__ T ThreadLoad(
326     T                       *ptr,
327     Int2Type<LOAD_DEFAULT>  /*modifier*/,
328     Int2Type<true>          /*is_pointer*/)
329 {
330     return *ptr;
331 }
332 
333 
334 /**
335  * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types
336  */
337 template <typename T>
ThreadLoadVolatilePointer(T * ptr,Int2Type<true>)338 __device__ __forceinline__ T ThreadLoadVolatilePointer(
339     T                       *ptr,
340     Int2Type<true>          /*is_primitive*/)
341 {
342     T retval = *reinterpret_cast<volatile T*>(ptr);
343     return retval;
344 }
345 
346 
347 /**
348  * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types
349  */
350 template <typename T>
ThreadLoadVolatilePointer(T * ptr,Int2Type<false>)351 __device__ __forceinline__ T ThreadLoadVolatilePointer(
352     T                       *ptr,
353     Int2Type<false>         /*is_primitive*/)
354 {
355     typedef typename UnitWord<T>::VolatileWord VolatileWord;   // Word type for memcopying
356 
357     const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
358 /*
359     VolatileWord words[VOLATILE_MULTIPLE];
360 
361     IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference(
362         reinterpret_cast<volatile VolatileWord*>(ptr),
363         words);
364 
365     return *reinterpret_cast<T*>(words);
366 */
367 
368     T retval;
369     VolatileWord *words = reinterpret_cast<VolatileWord*>(&retval);
370     IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference(
371         reinterpret_cast<volatile VolatileWord*>(ptr),
372         words);
373     return retval;
374 }
375 
376 
377 /**
378  * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types
379  */
380 template <typename T>
ThreadLoad(T * ptr,Int2Type<LOAD_VOLATILE>,Int2Type<true>)381 __device__ __forceinline__ T ThreadLoad(
382     T                       *ptr,
383     Int2Type<LOAD_VOLATILE> /*modifier*/,
384     Int2Type<true>          /*is_pointer*/)
385 {
386     // Apply tags for partial-specialization
387     return ThreadLoadVolatilePointer(ptr, Int2Type<Traits<T>::PRIMITIVE>());
388 }
389 
390 
391 /**
392  * ThreadLoad definition for generic modifiers on pointer types
393  */
394 template <typename T, int MODIFIER>
ThreadLoad(T const * ptr,Int2Type<MODIFIER>,Int2Type<true>)395 __device__ __forceinline__ T ThreadLoad(
396     T const                 *ptr,
397     Int2Type<MODIFIER>      /*modifier*/,
398     Int2Type<true>          /*is_pointer*/)
399 {
400     typedef typename UnitWord<T>::DeviceWord DeviceWord;
401 
402     const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);
403 
404     DeviceWord words[DEVICE_MULTIPLE];
405 
406     IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load<CacheLoadModifier(MODIFIER)>(
407         reinterpret_cast<DeviceWord*>(const_cast<T*>(ptr)),
408         words);
409 
410     return *reinterpret_cast<T*>(words);
411 }
412 
413 
414 /**
415  * ThreadLoad definition for generic modifiers
416  */
417 template <
418     CacheLoadModifier MODIFIER,
419     typename InputIteratorT>
ThreadLoad(InputIteratorT itr)420 __device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr)
421 {
422     // Apply tags for partial-specialization
423     return ThreadLoad(
424         itr,
425         Int2Type<MODIFIER>(),
426         Int2Type<IsPointer<InputIteratorT>::VALUE>());
427 }
428 
429 
430 
431 #endif // DOXYGEN_SHOULD_SKIP_THIS
432 
433 
434 /** @} */       // end group UtilIo
435 
436 
437 }               // CUB namespace
438 CUB_NS_POSTFIX  // Optional outer namespace(s)
439