1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of the NVIDIA CORPORATION nor the
13  *       names of its contributors may be used to endorse or promote products
14  *       derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
29 /**
30  * \file
31  * Thread utilities for writing memory using PTX cache modifiers.
32  */
33 
34 #pragma once
35 
36 #include <cuda.h>
37 
38 #include "../util_ptx.cuh"
39 #include "../util_type.cuh"
40 #include "../util_namespace.cuh"
41 
42 /// Optional outer namespace(s)
43 CUB_NS_PREFIX
44 
45 /// CUB namespace
46 namespace cub {
47 
48 /**
49  * \addtogroup UtilIo
50  * @{
51  */
52 
53 
54 //-----------------------------------------------------------------------------
55 // Tags and constants
56 //-----------------------------------------------------------------------------
57 
58 /**
59  * \brief Enumeration of cache modifiers for memory store operations.
60  */
61 enum CacheStoreModifier
62 {
63     STORE_DEFAULT,              ///< Default (no modifier)
64     STORE_WB,                   ///< Cache write-back all coherent levels
65     STORE_CG,                   ///< Cache at global level
66     STORE_CS,                   ///< Cache streaming (likely to be accessed once)
67     STORE_WT,                   ///< Cache write-through (to system memory)
68     STORE_VOLATILE,             ///< Volatile shared (any memory space)
69 };
70 
71 
72 /**
73  * \name Thread I/O (cache modified)
74  * @{
75  */
76 
77 /**
78  * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers.  Can be used to store any data type.
79  *
80  * \par Example
81  * \code
82  * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_store.cuh>
83  *
84  * // 32-bit store using cache-global modifier:
85  * int *d_out;
86  * int val;
87  * cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);
88  *
89  * // 16-bit store using default modifier
90  * short *d_out;
91  * short val;
92  * cub::ThreadStore<cub::STORE_DEFAULT>(d_out + threadIdx.x, val);
93  *
94  * // 256-bit store using write-through modifier
95  * double4 *d_out;
96  * double4 val;
97  * cub::ThreadStore<cub::STORE_WT>(d_out + threadIdx.x, val);
98  *
99  * // 96-bit store using cache-streaming cache modifier
100  * struct TestFoo { bool a; short b; };
101  * TestFoo *d_struct;
102  * TestFoo val;
103  * cub::ThreadStore<cub::STORE_CS>(d_out + threadIdx.x, val);
104  * \endcode
105  *
106  * \tparam MODIFIER             <b>[inferred]</b> CacheStoreModifier enumeration
107  * \tparam InputIteratorT       <b>[inferred]</b> Output iterator type \iterator
108  * \tparam T                    <b>[inferred]</b> Data type of output value
109  */
110 template <
111     CacheStoreModifier  MODIFIER,
112     typename            OutputIteratorT,
113     typename            T>
114 __device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val);
115 
116 
117 //@}  end member group
118 
119 
120 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
121 
122 
/// Helper structure for templated store iteration (inductive case): handles
/// the element at index COUNT, then recurses until COUNT reaches MAX.
template <int COUNT, int MAX>
struct IterateThreadStore
{
    /// Store src[COUNT] to dst + COUNT using the given cache modifier,
    /// then recurse over the remaining elements.
    template <CacheStoreModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Store(T *dst, T *src)
    {
        ThreadStore<MODIFIER>(dst + COUNT, src[COUNT]);
        IterateThreadStore<COUNT + 1, MAX>::template Store<MODIFIER>(dst, src);
    }

    /// Assign src[COUNT] through the iterator at offset COUNT, then recurse.
    template <typename OutputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(OutputIteratorT dst, T *src)
    {
        dst[COUNT] = src[COUNT];
        IterateThreadStore<COUNT + 1, MAX>::Dereference(dst, src);
    }
};
142 
/// Helper structure for templated store iteration (termination case):
/// no elements remain, so both operations are no-ops.
template <int MAX>
struct IterateThreadStore<MAX, MAX>
{
    /// Recursion terminator: nothing left to store.
    template <CacheStoreModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Store(T * /*dst*/, T * /*src*/) {}

    /// Recursion terminator: nothing left to assign.
    template <typename OutputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(OutputIteratorT /*dst*/, T * /*src*/) {}
};
153 
154 
155 /**
156  * Define a uint4 (16B) ThreadStore specialization for the given Cache load modifier
157  */
158 #define _CUB_STORE_16(cub_modifier, ptx_modifier)                                            \
159     template<>                                                                              \
160     __device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val)                         \
161     {                                                                                       \
162         asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : :               \
163             _CUB_ASM_PTR_(ptr),                                                             \
164             "r"(val.x),                                                                     \
165             "r"(val.y),                                                                     \
166             "r"(val.z),                                                                     \
167             "r"(val.w));                                                                    \
168     }                                                                                       \
169     template<>                                                                              \
170     __device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val)     \
171     {                                                                                       \
172         asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : :                       \
173             _CUB_ASM_PTR_(ptr),                                                             \
174             "l"(val.x),                                                                     \
175             "l"(val.y));                                                                    \
176     }
177 
178 
179 /**
180  * Define a uint2 (8B) ThreadStore specialization for the given Cache load modifier
181  */
182 #define _CUB_STORE_8(cub_modifier, ptx_modifier)                                             \
183     template<>                                                                              \
184     __device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val)                 \
185     {                                                                                       \
186         asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : :               \
187             _CUB_ASM_PTR_(ptr),                                                             \
188             "h"(val.x),                                                                     \
189             "h"(val.y),                                                                     \
190             "h"(val.z),                                                                     \
191             "h"(val.w));                                                                    \
192     }                                                                                       \
193     template<>                                                                              \
194     __device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val)                         \
195     {                                                                                       \
196         asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : :                       \
197             _CUB_ASM_PTR_(ptr),                                                             \
198             "r"(val.x),                                                                     \
199             "r"(val.y));                                                                    \
200     }                                                                                       \
201     template<>                                                                              \
202     __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val)     \
203     {                                                                                       \
204         asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : :                                \
205             _CUB_ASM_PTR_(ptr),                                                             \
206             "l"(val));                                                                      \
207     }
208 
209 /**
210  * Define a unsigned int (4B) ThreadStore specialization for the given Cache load modifier
211  */
212 #define _CUB_STORE_4(cub_modifier, ptx_modifier)                                             \
213     template<>                                                                              \
214     __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val)                             \
215     {                                                                                       \
216         asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : :                                \
217             _CUB_ASM_PTR_(ptr),                                                             \
218             "r"(val));                                                                      \
219     }
220 
221 
222 /**
223  * Define a unsigned short (2B) ThreadStore specialization for the given Cache load modifier
224  */
225 #define _CUB_STORE_2(cub_modifier, ptx_modifier)                                             \
226     template<>                                                                              \
227     __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val)                     \
228     {                                                                                       \
229         asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : :                                \
230             _CUB_ASM_PTR_(ptr),                                                             \
231             "h"(val));                                                                      \
232     }
233 
234 
235 /**
236  * Define a unsigned char (1B) ThreadStore specialization for the given Cache load modifier
237  */
238 #define _CUB_STORE_1(cub_modifier, ptx_modifier)                                             \
239     template<>                                                                              \
240     __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val)                         \
241     {                                                                                       \
242         asm volatile (                                                                      \
243         "{"                                                                                 \
244         "   .reg .u8 datum;"                                                                \
245         "   cvt.u8.u16 datum, %1;"                                                          \
246         "   st."#ptx_modifier".u8 [%0], datum;"                                             \
247         "}" : :                                                                             \
248             _CUB_ASM_PTR_(ptr),                                                             \
249             "h"((unsigned short) val));                                                               \
250     }
251 
252 /**
253  * Define powers-of-two ThreadStore specializations for the given Cache load modifier
254  */
255 #define _CUB_STORE_ALL(cub_modifier, ptx_modifier)                                           \
256     _CUB_STORE_16(cub_modifier, ptx_modifier)                                                \
257     _CUB_STORE_8(cub_modifier, ptx_modifier)                                                 \
258     _CUB_STORE_4(cub_modifier, ptx_modifier)                                                 \
259     _CUB_STORE_2(cub_modifier, ptx_modifier)                                                 \
260     _CUB_STORE_1(cub_modifier, ptx_modifier)                                                 \
261 
262 
263 /**
264  * Define ThreadStore specializations for the various Cache load modifiers
265  */
266 #if CUB_PTX_ARCH >= 200
_CUB_STORE_ALL(STORE_WB,wb)267     _CUB_STORE_ALL(STORE_WB, wb)
268     _CUB_STORE_ALL(STORE_CG, cg)
269     _CUB_STORE_ALL(STORE_CS, cs)
270     _CUB_STORE_ALL(STORE_WT, wt)
271 #else
272     _CUB_STORE_ALL(STORE_WB, global)
273     _CUB_STORE_ALL(STORE_CG, global)
274     _CUB_STORE_ALL(STORE_CS, global)
275     _CUB_STORE_ALL(STORE_WT, volatile.global)
276 #endif
277 
278 
279 // Macro cleanup
280 #undef _CUB_STORE_ALL
281 #undef _CUB_STORE_1
282 #undef _CUB_STORE_2
283 #undef _CUB_STORE_4
284 #undef _CUB_STORE_8
285 #undef _CUB_STORE_16
286 
287 
288 /**
289  * ThreadStore definition for STORE_DEFAULT modifier on iterator types
290  */
291 template <typename OutputIteratorT, typename T>
292 __device__ __forceinline__ void ThreadStore(
293     OutputIteratorT             itr,
294     T                           val,
295     Int2Type<STORE_DEFAULT>     /*modifier*/,
296     Int2Type<false>             /*is_pointer*/)
297 {
298     *itr = val;
299 }
300 
301 
302 /**
303  * ThreadStore definition for STORE_DEFAULT modifier on pointer types
304  */
305 template <typename T>
ThreadStore(T * ptr,T val,Int2Type<STORE_DEFAULT>,Int2Type<true>)306 __device__ __forceinline__ void ThreadStore(
307     T                           *ptr,
308     T                           val,
309     Int2Type<STORE_DEFAULT>     /*modifier*/,
310     Int2Type<true>              /*is_pointer*/)
311 {
312     *ptr = val;
313 }
314 
315 
316 /**
317  * ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types
318  */
319 template <typename T>
ThreadStoreVolatilePtr(T * ptr,T val,Int2Type<true>)320 __device__ __forceinline__ void ThreadStoreVolatilePtr(
321     T                           *ptr,
322     T                           val,
323     Int2Type<true>              /*is_primitive*/)
324 {
325     *reinterpret_cast<volatile T*>(ptr) = val;
326 }
327 
328 
329 /**
330  * ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types
331  */
332 template <typename T>
ThreadStoreVolatilePtr(T * ptr,T val,Int2Type<false>)333 __device__ __forceinline__ void ThreadStoreVolatilePtr(
334     T                           *ptr,
335     T                           val,
336     Int2Type<false>             /*is_primitive*/)
337 {
338     // Create a temporary using shuffle-words, then store using volatile-words
339     typedef typename UnitWord<T>::VolatileWord  VolatileWord;
340     typedef typename UnitWord<T>::ShuffleWord   ShuffleWord;
341 
342     const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
343     const int SHUFFLE_MULTIPLE  = sizeof(T) / sizeof(ShuffleWord);
344 
345     VolatileWord words[VOLATILE_MULTIPLE];
346 
347     #pragma unroll
348     for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
349         reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];
350 
351     IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference(
352         reinterpret_cast<volatile VolatileWord*>(ptr),
353         words);
354 }
355 
356 
357 /**
358  * ThreadStore definition for STORE_VOLATILE modifier on pointer types
359  */
360 template <typename T>
ThreadStore(T * ptr,T val,Int2Type<STORE_VOLATILE>,Int2Type<true>)361 __device__ __forceinline__ void ThreadStore(
362     T                           *ptr,
363     T                           val,
364     Int2Type<STORE_VOLATILE>    /*modifier*/,
365     Int2Type<true>              /*is_pointer*/)
366 {
367     ThreadStoreVolatilePtr(ptr, val, Int2Type<Traits<T>::PRIMITIVE>());
368 }
369 
370 
371 /**
372  * ThreadStore definition for generic modifiers on pointer types
373  */
374 template <typename T, int MODIFIER>
ThreadStore(T * ptr,T val,Int2Type<MODIFIER>,Int2Type<true>)375 __device__ __forceinline__ void ThreadStore(
376     T                           *ptr,
377     T                           val,
378     Int2Type<MODIFIER>          /*modifier*/,
379     Int2Type<true>              /*is_pointer*/)
380 {
381     // Create a temporary using shuffle-words, then store using device-words
382     typedef typename UnitWord<T>::DeviceWord    DeviceWord;
383     typedef typename UnitWord<T>::ShuffleWord   ShuffleWord;
384 
385     const int DEVICE_MULTIPLE   = sizeof(T) / sizeof(DeviceWord);
386     const int SHUFFLE_MULTIPLE  = sizeof(T) / sizeof(ShuffleWord);
387 
388     DeviceWord words[DEVICE_MULTIPLE];
389 
390     #pragma unroll
391     for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
392         reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];
393 
394     IterateThreadStore<0, DEVICE_MULTIPLE>::template Store<CacheStoreModifier(MODIFIER)>(
395         reinterpret_cast<DeviceWord*>(ptr),
396         words);
397 }
398 
399 
400 /**
401  * ThreadStore definition for generic modifiers
402  */
403 template <CacheStoreModifier MODIFIER, typename OutputIteratorT, typename T>
ThreadStore(OutputIteratorT itr,T val)404 __device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val)
405 {
406     ThreadStore(
407         itr,
408         val,
409         Int2Type<MODIFIER>(),
410         Int2Type<IsPointer<OutputIteratorT>::VALUE>());
411 }
412 
413 
414 
415 #endif // DOXYGEN_SHOULD_SKIP_THIS
416 
417 
418 /** @} */       // end group UtilIo
419 
420 
421 }               // CUB namespace
422 CUB_NS_POSTFIX  // Optional outer namespace(s)
423