1 /******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
29 /**
30 * \file
31 * Thread utilities for writing memory using PTX cache modifiers.
32 */
33
34 #pragma once
35
36 #include <cuda.h>
37
38 #include "../util_ptx.cuh"
39 #include "../util_type.cuh"
40 #include "../util_namespace.cuh"
41
42 /// Optional outer namespace(s)
43 CUB_NS_PREFIX
44
45 /// CUB namespace
46 namespace cub {
47
48 /**
49 * \addtogroup UtilIo
50 * @{
51 */
52
53
54 //-----------------------------------------------------------------------------
55 // Tags and constants
56 //-----------------------------------------------------------------------------
57
58 /**
59 * \brief Enumeration of cache modifiers for memory store operations.
60 */
61 enum CacheStoreModifier
62 {
63 STORE_DEFAULT, ///< Default (no modifier)
64 STORE_WB, ///< Cache write-back all coherent levels
65 STORE_CG, ///< Cache at global level
66 STORE_CS, ///< Cache streaming (likely to be accessed once)
67 STORE_WT, ///< Cache write-through (to system memory)
68 STORE_VOLATILE, ///< Volatile shared (any memory space)
69 };
70
71
72 /**
73 * \name Thread I/O (cache modified)
74 * @{
75 */
76
77 /**
78 * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type.
79 *
80 * \par Example
81 * \code
82 * #include <cub/cub.cuh> // or equivalently <cub/thread/thread_store.cuh>
83 *
84 * // 32-bit store using cache-global modifier:
85 * int *d_out;
86 * int val;
87 * cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);
88 *
89 * // 16-bit store using default modifier
90 * short *d_out;
91 * short val;
92 * cub::ThreadStore<cub::STORE_DEFAULT>(d_out + threadIdx.x, val);
93 *
94 * // 256-bit store using write-through modifier
95 * double4 *d_out;
96 * double4 val;
97 * cub::ThreadStore<cub::STORE_WT>(d_out + threadIdx.x, val);
98 *
99 * // 96-bit store using cache-streaming cache modifier
100 * struct TestFoo { bool a; short b; };
101 * TestFoo *d_struct;
102 * TestFoo val;
103 * cub::ThreadStore<cub::STORE_CS>(d_out + threadIdx.x, val);
104 * \endcode
105 *
106 * \tparam MODIFIER <b>[inferred]</b> CacheStoreModifier enumeration
107 * \tparam InputIteratorT <b>[inferred]</b> Output iterator type \iterator
108 * \tparam T <b>[inferred]</b> Data type of output value
109 */
110 template <
111 CacheStoreModifier MODIFIER,
112 typename OutputIteratorT,
113 typename T>
114 __device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val);
115
116
117 //@} end member group
118
119
120 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
121
122
/// Helper structure for templated store iteration (inductive case).
/// Recursively unrolls a compile-time loop over indices [COUNT, MAX),
/// storing vals[i] to ptr[i] at each step; terminated by the
/// IterateThreadStore<MAX, MAX> specialization below.
template <int COUNT, int MAX>
struct IterateThreadStore
{
    /// Store vals[COUNT] through ptr+COUNT using the given cache modifier,
    /// then recurse on the next index.
    template <CacheStoreModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Store(T *ptr, T *vals)
    {
        ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);
        IterateThreadStore<COUNT + 1, MAX>::template Store<MODIFIER>(ptr, vals);
    }

    /// Store vals[COUNT] via plain iterator dereference (no cache modifier),
    /// then recurse on the next index.
    template <typename OutputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals)
    {
        ptr[COUNT] = vals[COUNT];
        IterateThreadStore<COUNT + 1, MAX>::Dereference(ptr, vals);
    }

};
142
/// Helper structure for templated store iteration (termination case).
/// Reached when COUNT == MAX; both operations are no-ops, ending the
/// compile-time recursion of the primary template.
template <int MAX>
struct IterateThreadStore<MAX, MAX>
{
    /// No-op termination of the modified-store recursion
    template <CacheStoreModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {}

    /// No-op termination of the dereference-store recursion
    template <typename OutputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {}
};
153
154
155 /**
156 * Define a uint4 (16B) ThreadStore specialization for the given Cache load modifier
157 */
158 #define _CUB_STORE_16(cub_modifier, ptx_modifier) \
159 template<> \
160 __device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val) \
161 { \
162 asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : : \
163 _CUB_ASM_PTR_(ptr), \
164 "r"(val.x), \
165 "r"(val.y), \
166 "r"(val.z), \
167 "r"(val.w)); \
168 } \
169 template<> \
170 __device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val) \
171 { \
172 asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : : \
173 _CUB_ASM_PTR_(ptr), \
174 "l"(val.x), \
175 "l"(val.y)); \
176 }
177
178
179 /**
180 * Define a uint2 (8B) ThreadStore specialization for the given Cache load modifier
181 */
182 #define _CUB_STORE_8(cub_modifier, ptx_modifier) \
183 template<> \
184 __device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val) \
185 { \
186 asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : : \
187 _CUB_ASM_PTR_(ptr), \
188 "h"(val.x), \
189 "h"(val.y), \
190 "h"(val.z), \
191 "h"(val.w)); \
192 } \
193 template<> \
194 __device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val) \
195 { \
196 asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : : \
197 _CUB_ASM_PTR_(ptr), \
198 "r"(val.x), \
199 "r"(val.y)); \
200 } \
201 template<> \
202 __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val) \
203 { \
204 asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : : \
205 _CUB_ASM_PTR_(ptr), \
206 "l"(val)); \
207 }
208
209 /**
210 * Define a unsigned int (4B) ThreadStore specialization for the given Cache load modifier
211 */
212 #define _CUB_STORE_4(cub_modifier, ptx_modifier) \
213 template<> \
214 __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val) \
215 { \
216 asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : : \
217 _CUB_ASM_PTR_(ptr), \
218 "r"(val)); \
219 }
220
221
222 /**
223 * Define a unsigned short (2B) ThreadStore specialization for the given Cache load modifier
224 */
225 #define _CUB_STORE_2(cub_modifier, ptx_modifier) \
226 template<> \
227 __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val) \
228 { \
229 asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : : \
230 _CUB_ASM_PTR_(ptr), \
231 "h"(val)); \
232 }
233
234
235 /**
236 * Define a unsigned char (1B) ThreadStore specialization for the given Cache load modifier
237 */
238 #define _CUB_STORE_1(cub_modifier, ptx_modifier) \
239 template<> \
240 __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val) \
241 { \
242 asm volatile ( \
243 "{" \
244 " .reg .u8 datum;" \
245 " cvt.u8.u16 datum, %1;" \
246 " st."#ptx_modifier".u8 [%0], datum;" \
247 "}" : : \
248 _CUB_ASM_PTR_(ptr), \
249 "h"((unsigned short) val)); \
250 }
251
252 /**
253 * Define powers-of-two ThreadStore specializations for the given Cache load modifier
254 */
255 #define _CUB_STORE_ALL(cub_modifier, ptx_modifier) \
256 _CUB_STORE_16(cub_modifier, ptx_modifier) \
257 _CUB_STORE_8(cub_modifier, ptx_modifier) \
258 _CUB_STORE_4(cub_modifier, ptx_modifier) \
259 _CUB_STORE_2(cub_modifier, ptx_modifier) \
260 _CUB_STORE_1(cub_modifier, ptx_modifier) \
261
262
263 /**
264 * Define ThreadStore specializations for the various Cache load modifiers
265 */
266 #if CUB_PTX_ARCH >= 200
_CUB_STORE_ALL(STORE_WB,wb)267 _CUB_STORE_ALL(STORE_WB, wb)
268 _CUB_STORE_ALL(STORE_CG, cg)
269 _CUB_STORE_ALL(STORE_CS, cs)
270 _CUB_STORE_ALL(STORE_WT, wt)
271 #else
272 _CUB_STORE_ALL(STORE_WB, global)
273 _CUB_STORE_ALL(STORE_CG, global)
274 _CUB_STORE_ALL(STORE_CS, global)
275 _CUB_STORE_ALL(STORE_WT, volatile.global)
276 #endif
277
278
279 // Macro cleanup
280 #undef _CUB_STORE_ALL
281 #undef _CUB_STORE_1
282 #undef _CUB_STORE_2
283 #undef _CUB_STORE_4
284 #undef _CUB_STORE_8
285 #undef _CUB_STORE_16
286
287
288 /**
289 * ThreadStore definition for STORE_DEFAULT modifier on iterator types
290 */
291 template <typename OutputIteratorT, typename T>
292 __device__ __forceinline__ void ThreadStore(
293 OutputIteratorT itr,
294 T val,
295 Int2Type<STORE_DEFAULT> /*modifier*/,
296 Int2Type<false> /*is_pointer*/)
297 {
298 *itr = val;
299 }
300
301
302 /**
303 * ThreadStore definition for STORE_DEFAULT modifier on pointer types
304 */
305 template <typename T>
ThreadStore(T * ptr,T val,Int2Type<STORE_DEFAULT>,Int2Type<true>)306 __device__ __forceinline__ void ThreadStore(
307 T *ptr,
308 T val,
309 Int2Type<STORE_DEFAULT> /*modifier*/,
310 Int2Type<true> /*is_pointer*/)
311 {
312 *ptr = val;
313 }
314
315
316 /**
317 * ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types
318 */
319 template <typename T>
ThreadStoreVolatilePtr(T * ptr,T val,Int2Type<true>)320 __device__ __forceinline__ void ThreadStoreVolatilePtr(
321 T *ptr,
322 T val,
323 Int2Type<true> /*is_primitive*/)
324 {
325 *reinterpret_cast<volatile T*>(ptr) = val;
326 }
327
328
329 /**
330 * ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types
331 */
332 template <typename T>
ThreadStoreVolatilePtr(T * ptr,T val,Int2Type<false>)333 __device__ __forceinline__ void ThreadStoreVolatilePtr(
334 T *ptr,
335 T val,
336 Int2Type<false> /*is_primitive*/)
337 {
338 // Create a temporary using shuffle-words, then store using volatile-words
339 typedef typename UnitWord<T>::VolatileWord VolatileWord;
340 typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
341
342 const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
343 const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord);
344
345 VolatileWord words[VOLATILE_MULTIPLE];
346
347 #pragma unroll
348 for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
349 reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];
350
351 IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference(
352 reinterpret_cast<volatile VolatileWord*>(ptr),
353 words);
354 }
355
356
357 /**
358 * ThreadStore definition for STORE_VOLATILE modifier on pointer types
359 */
360 template <typename T>
ThreadStore(T * ptr,T val,Int2Type<STORE_VOLATILE>,Int2Type<true>)361 __device__ __forceinline__ void ThreadStore(
362 T *ptr,
363 T val,
364 Int2Type<STORE_VOLATILE> /*modifier*/,
365 Int2Type<true> /*is_pointer*/)
366 {
367 ThreadStoreVolatilePtr(ptr, val, Int2Type<Traits<T>::PRIMITIVE>());
368 }
369
370
371 /**
372 * ThreadStore definition for generic modifiers on pointer types
373 */
374 template <typename T, int MODIFIER>
ThreadStore(T * ptr,T val,Int2Type<MODIFIER>,Int2Type<true>)375 __device__ __forceinline__ void ThreadStore(
376 T *ptr,
377 T val,
378 Int2Type<MODIFIER> /*modifier*/,
379 Int2Type<true> /*is_pointer*/)
380 {
381 // Create a temporary using shuffle-words, then store using device-words
382 typedef typename UnitWord<T>::DeviceWord DeviceWord;
383 typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
384
385 const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);
386 const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord);
387
388 DeviceWord words[DEVICE_MULTIPLE];
389
390 #pragma unroll
391 for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
392 reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];
393
394 IterateThreadStore<0, DEVICE_MULTIPLE>::template Store<CacheStoreModifier(MODIFIER)>(
395 reinterpret_cast<DeviceWord*>(ptr),
396 words);
397 }
398
399
400 /**
401 * ThreadStore definition for generic modifiers
402 */
403 template <CacheStoreModifier MODIFIER, typename OutputIteratorT, typename T>
ThreadStore(OutputIteratorT itr,T val)404 __device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val)
405 {
406 ThreadStore(
407 itr,
408 val,
409 Int2Type<MODIFIER>(),
410 Int2Type<IsPointer<OutputIteratorT>::VALUE>());
411 }
412
413
414
415 #endif // DOXYGEN_SHOULD_SKIP_THIS
416
417
418 /** @} */ // end group UtilIo
419
420
421 } // CUB namespace
422 CUB_NS_POSTFIX // Optional outer namespace(s)
423