/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/
/**
 * \file
 * Thread utilities for reading memory using PTX cache modifiers.
 */

#pragma once

#include <cuda.h>

#include <iterator>

#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \addtogroup UtilIo
 * @{
 */

//-----------------------------------------------------------------------------
// Tags and constants
//-----------------------------------------------------------------------------
/**
 * \brief Enumeration of cache modifiers for memory load operations.
 */
enum CacheLoadModifier
{
    LOAD_DEFAULT,       ///< Default (no modifier)
    LOAD_CA,            ///< Cache at all levels
    LOAD_CG,            ///< Cache at global level
    LOAD_CS,            ///< Cache streaming (likely to be accessed once)
    LOAD_CV,            ///< Cache as volatile (including cached system lines)
    LOAD_LDG,           ///< Cache as texture
    LOAD_VOLATILE,      ///< Volatile (any memory space)
};
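
// Note: LOAD_LDG lowers to ld.global.nc (the read-only data cache path) and is
// only supported on sm_35 and later; the addressed memory must not be written
// by any thread for the duration of the kernel, otherwise the loaded values
// are undefined.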


/**
 * \name Thread I/O (cache modified)
 * @{
 */

/**
 * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers.  Can be used to load any data type.
 *
 * \par Example
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_load.cuh>
 *
 * // 32-bit load using cache-global modifier:
 * int *d_in;
 * int val = cub::ThreadLoad<cub::LOAD_CG>(d_in + threadIdx.x);
 *
 * // 16-bit load using default modifier
 * short *d_in;
 * short val = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in + threadIdx.x);
 *
 * // 256-bit load using cache-volatile modifier
 * double4 *d_in;
 * double4 val = cub::ThreadLoad<cub::LOAD_CV>(d_in + threadIdx.x);
 *
 * // Struct load using cache-streaming modifier
 * struct TestFoo { bool a; short b; };
 * TestFoo *d_struct;
 * TestFoo val = cub::ThreadLoad<cub::LOAD_CS>(d_struct + threadIdx.x);
 * \endcode
 *
 * \tparam MODIFIER             CacheLoadModifier enumeration (explicitly specified)
 * \tparam InputIteratorT       <b>[inferred]</b> Input iterator type \iterator
 */
template <
    CacheLoadModifier   MODIFIER,
    typename            InputIteratorT>
__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr);


//@}  end member group


#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document


/// Helper structure for templated load iteration (inductive case)
template <int COUNT, int MAX>
struct IterateThreadLoad
{
    template <CacheLoadModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Load(T const *ptr, T *vals)
    {
        vals[COUNT] = ThreadLoad<MODIFIER>(ptr + COUNT);
        IterateThreadLoad<COUNT + 1, MAX>::template Load<MODIFIER>(ptr, vals);
    }

    template <typename InputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(InputIteratorT itr, T *vals)
    {
        vals[COUNT] = itr[COUNT];
        IterateThreadLoad<COUNT + 1, MAX>::Dereference(itr, vals);
    }
};


/// Helper structure for templated load iteration (termination case)
template <int MAX>
struct IterateThreadLoad<MAX, MAX>
{
    template <CacheLoadModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Load(T const * /*ptr*/, T * /*vals*/) {}

    template <typename InputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(InputIteratorT /*itr*/, T * /*vals*/) {}
};
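
// Example (illustrative): IterateThreadLoad<0, 4>::template Load<LOAD_CG>(ptr, vals)
// unrolls at compile time into four independent cache-global loads:
//
//     vals[0] = ThreadLoad<LOAD_CG>(ptr + 0);
//     vals[1] = ThreadLoad<LOAD_CG>(ptr + 1);
//     vals[2] = ThreadLoad<LOAD_CG>(ptr + 2);
//     vals[3] = ThreadLoad<LOAD_CG>(ptr + 3);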


/**
 * Define 16B ThreadLoad specializations (uint4, ulonglong2) for the given Cache load modifier
 */
#define _CUB_LOAD_16(cub_modifier, ptx_modifier)                                            \
    template<>                                                                              \
    __device__ __forceinline__ uint4 ThreadLoad<cub_modifier, uint4 const *>(uint4 const *ptr) \
    {                                                                                       \
        uint4 retval;                                                                       \
        asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" :                 \
            "=r"(retval.x),                                                                 \
            "=r"(retval.y),                                                                 \
            "=r"(retval.z),                                                                 \
            "=r"(retval.w) :                                                                \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }                                                                                       \
    template<>                                                                              \
    __device__ __forceinline__ ulonglong2 ThreadLoad<cub_modifier, ulonglong2 const *>(ulonglong2 const *ptr) \
    {                                                                                       \
        ulonglong2 retval;                                                                  \
        asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" :                         \
            "=l"(retval.x),                                                                 \
            "=l"(retval.y) :                                                                \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }
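
// Register constraints used in the inline PTX above and below: "h" binds a
// 16-bit register, "r" a 32-bit register, and "l" a 64-bit register;
// _CUB_ASM_PTR_ expands to the constraint matching the target's pointer width
// ("l" for 64-bit builds, "r" for 32-bit builds).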

/**
 * Define 8B ThreadLoad specializations (ushort4, uint2, unsigned long long) for the given Cache load modifier
 */
#define _CUB_LOAD_8(cub_modifier, ptx_modifier)                                             \
    template<>                                                                              \
    __device__ __forceinline__ ushort4 ThreadLoad<cub_modifier, ushort4 const *>(ushort4 const *ptr) \
    {                                                                                       \
        ushort4 retval;                                                                     \
        asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" :                 \
            "=h"(retval.x),                                                                 \
            "=h"(retval.y),                                                                 \
            "=h"(retval.z),                                                                 \
            "=h"(retval.w) :                                                                \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }                                                                                       \
    template<>                                                                              \
    __device__ __forceinline__ uint2 ThreadLoad<cub_modifier, uint2 const *>(uint2 const *ptr) \
    {                                                                                       \
        uint2 retval;                                                                       \
        asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" :                         \
            "=r"(retval.x),                                                                 \
            "=r"(retval.y) :                                                                \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }                                                                                       \
    template<>                                                                              \
    __device__ __forceinline__ unsigned long long ThreadLoad<cub_modifier, unsigned long long const *>(unsigned long long const *ptr) \
    {                                                                                       \
        unsigned long long retval;                                                          \
        asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" :                                  \
            "=l"(retval) :                                                                  \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }

/**
 * Define a uint (4B) ThreadLoad specialization for the given Cache load modifier
 */
#define _CUB_LOAD_4(cub_modifier, ptx_modifier)                                             \
    template<>                                                                              \
    __device__ __forceinline__ unsigned int ThreadLoad<cub_modifier, unsigned int const *>(unsigned int const *ptr) \
    {                                                                                       \
        unsigned int retval;                                                                \
        asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" :                                  \
            "=r"(retval) :                                                                  \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }


/**
 * Define an unsigned short (2B) ThreadLoad specialization for the given Cache load modifier
 */
#define _CUB_LOAD_2(cub_modifier, ptx_modifier)                                             \
    template<>                                                                              \
    __device__ __forceinline__ unsigned short ThreadLoad<cub_modifier, unsigned short const *>(unsigned short const *ptr) \
    {                                                                                       \
        unsigned short retval;                                                              \
        asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" :                                  \
            "=h"(retval) :                                                                  \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }


/**
 * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier
 */
#define _CUB_LOAD_1(cub_modifier, ptx_modifier)                                             \
    template<>                                                                              \
    __device__ __forceinline__ unsigned char ThreadLoad<cub_modifier, unsigned char const *>(unsigned char const *ptr) \
    {                                                                                       \
        unsigned short retval;                                                              \
        asm volatile (                                                                      \
            "{"                                                                             \
            "   .reg .u8 datum;"                                                            \
            "    ld."#ptx_modifier".u8 datum, [%1];"                                        \
            "    cvt.u16.u8 %0, datum;"                                                     \
            "}" :                                                                           \
            "=h"(retval) :                                                                  \
            _CUB_ASM_PTR_(ptr));                                                            \
        return (unsigned char) retval;                                                      \
    }
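
// There is no 8-bit register constraint in CUDA inline PTX, so the 1B
// specialization loads into a .u8 scratch register, widens it with
// cvt.u16.u8, and returns the result through the 16-bit "h" constraint.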


/**
 * Define powers-of-two ThreadLoad specializations for the given Cache load modifier
 */
#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier)                                           \
    _CUB_LOAD_16(cub_modifier, ptx_modifier)                                                \
    _CUB_LOAD_8(cub_modifier, ptx_modifier)                                                 \
    _CUB_LOAD_4(cub_modifier, ptx_modifier)                                                 \
    _CUB_LOAD_2(cub_modifier, ptx_modifier)                                                 \
    _CUB_LOAD_1(cub_modifier, ptx_modifier)


/**
 * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers
 */
#if CUB_PTX_ARCH >= 200
    _CUB_LOAD_ALL(LOAD_CA, ca)
    _CUB_LOAD_ALL(LOAD_CG, cg)
    _CUB_LOAD_ALL(LOAD_CS, cs)
    _CUB_LOAD_ALL(LOAD_CV, cv)
#else
    _CUB_LOAD_ALL(LOAD_CA, global)
    // Use volatile reads to ensure coherence when this PTX is JIT-compiled to run on newer architectures with L1 caches
    _CUB_LOAD_ALL(LOAD_CG, volatile.global)
    _CUB_LOAD_ALL(LOAD_CS, global)
    _CUB_LOAD_ALL(LOAD_CV, volatile.global)
#endif

#if CUB_PTX_ARCH >= 350
    _CUB_LOAD_ALL(LOAD_LDG, global.nc)
#else
    _CUB_LOAD_ALL(LOAD_LDG, global)
#endif
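
// On sm_35+, LOAD_LDG therefore lowers to ld.global.nc, the same non-coherent
// read-only data path used by the __ldg() intrinsic; older targets fall back
// to ordinary ld.global.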


// Macro cleanup
#undef _CUB_LOAD_ALL
#undef _CUB_LOAD_1
#undef _CUB_LOAD_2
#undef _CUB_LOAD_4
#undef _CUB_LOAD_8
#undef _CUB_LOAD_16


/**
 * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types
 */
template <typename InputIteratorT>
__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(
    InputIteratorT          itr,
    Int2Type<LOAD_DEFAULT>  /*modifier*/,
    Int2Type<false>         /*is_pointer*/)
{
    return *itr;
}


/**
 * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types
 */
template <typename T>
__device__ __forceinline__ T ThreadLoad(
    T                       *ptr,
    Int2Type<LOAD_DEFAULT>  /*modifier*/,
    Int2Type<true>          /*is_pointer*/)
{
    return *ptr;
}


/**
 * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types
 */
template <typename T>
__device__ __forceinline__ T ThreadLoadVolatilePointer(
    T                       *ptr,
    Int2Type<true>          /*is_primitive*/)
{
    T retval = *reinterpret_cast<volatile T*>(ptr);
    return retval;
}
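
// The volatile reinterpretation above compiles to ld.volatile, so the read is
// never satisfied from a stale register or L1 copy.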


/**
 * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types
 */
template <typename T>
__device__ __forceinline__ T ThreadLoadVolatilePointer(
    T                       *ptr,
    Int2Type<false>         /*is_primitive*/)
{
    typedef typename UnitWord<T>::VolatileWord VolatileWord;   // Word type for memcopying

    const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);

    T retval;
    VolatileWord *words = reinterpret_cast<VolatileWord*>(&retval);
    IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference(
        reinterpret_cast<volatile VolatileWord*>(ptr),
        words);
    return retval;
}
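
// Example (illustrative): for a 12-byte struct whose VolatileWord is a 4-byte
// word, VOLATILE_MULTIPLE == 3 and the value is assembled from three volatile
// 32-bit reads written directly into the bytes of retval.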


/**
 * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types
 */
template <typename T>
__device__ __forceinline__ T ThreadLoad(
    T                       *ptr,
    Int2Type<LOAD_VOLATILE> /*modifier*/,
    Int2Type<true>          /*is_pointer*/)
{
    // Apply tags for partial-specialization
    return ThreadLoadVolatilePointer(ptr, Int2Type<Traits<T>::PRIMITIVE>());
}


/**
 * ThreadLoad definition for generic modifiers on pointer types
 */
template <typename T, int MODIFIER>
__device__ __forceinline__ T ThreadLoad(
    T const                 *ptr,
    Int2Type<MODIFIER>      /*modifier*/,
    Int2Type<true>          /*is_pointer*/)
{
    typedef typename UnitWord<T>::DeviceWord DeviceWord;

    const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);

    DeviceWord words[DEVICE_MULTIPLE];

    IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load<CacheLoadModifier(MODIFIER)>(
        reinterpret_cast<DeviceWord*>(const_cast<T*>(ptr)),
        words);

    return *reinterpret_cast<T*>(words);
}
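
// Example (illustrative): for an 8-byte struct whose DeviceWord is unsigned
// long long, DEVICE_MULTIPLE == 1 and the iteration above collapses to the
// single ld.<modifier>.u64 specialization generated by _CUB_LOAD_8.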


/**
 * ThreadLoad definition for generic modifiers
 */
template <
    CacheLoadModifier MODIFIER,
    typename InputIteratorT>
__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr)
{
    // Apply tags for partial-specialization
    return ThreadLoad(
        itr,
        Int2Type<MODIFIER>(),
        Int2Type<IsPointer<InputIteratorT>::VALUE>());
}
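
// Example (illustrative sketch): streaming a read-only input through the
// sm_35+ read-only path.  CopyKernel, d_in, and d_out are hypothetical names,
// not part of CUB:
//
//     __global__ void CopyKernel(int const *d_in, int *d_out)
//     {
//         int idx = blockIdx.x * blockDim.x + threadIdx.x;
//         d_out[idx] = cub::ThreadLoad<cub::LOAD_LDG>(d_in + idx);
//     }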



#endif // DOXYGEN_SHOULD_SKIP_THIS


/** @} */       // end group UtilIo


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)