1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of the NVIDIA CORPORATION nor the
13  *       names of its contributors may be used to endorse or promote products
14  *       derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
29 /**
30  * \file
31  * Common type manipulation (metaprogramming) utilities
32  */
33 
34 #pragma once
35 
36 #include <iostream>
37 #include <limits>
38 #include <cfloat>
39 
40 #if (__CUDACC_VER_MAJOR__ >= 9)
41     #include <cuda_fp16.h>
42 #endif
43 
44 #include "util_macro.cuh"
45 #include "util_arch.cuh"
46 #include "util_namespace.cuh"
47 
48 
49 
50 /// Optional outer namespace(s)
51 CUB_NS_PREFIX
52 
53 /// CUB namespace
54 namespace cub {
55 
56 
57 /**
58  * \addtogroup UtilModule
59  * @{
60  */
61 
62 
63 
64 /******************************************************************************
65  * Type equality
66  ******************************************************************************/
67 
68 /**
69  * \brief Type selection (<tt>IF ? ThenType : ElseType</tt>)
70  */
71 template <bool IF, typename ThenType, typename ElseType>
72 struct If
73 {
74     /// Conditional type result
75     typedef ThenType Type;      // true
76 };
77 
78 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
79 
80 template <typename ThenType, typename ElseType>
81 struct If<false, ThenType, ElseType>
82 {
83     typedef ElseType Type;      // false
84 };
85 
86 #endif // DOXYGEN_SHOULD_SKIP_THIS
87 
88 
89 
90 /******************************************************************************
91  * Conditional types
92  ******************************************************************************/
93 
94 /**
95  * \brief Type equality test
96  */
97 template <typename A, typename B>
98 struct Equals
99 {
100     enum {
101         VALUE = 0,
102         NEGATE = 1
103     };
104 };
105 
106 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
107 
108 template <typename A>
109 struct Equals <A, A>
110 {
111     enum {
112         VALUE = 1,
113         NEGATE = 0
114     };
115 };
116 
117 #endif // DOXYGEN_SHOULD_SKIP_THIS
118 
119 
120 /******************************************************************************
121  * Static math
122  ******************************************************************************/
123 
124 /**
125  * \brief Statically determine log2(N), rounded up.
126  *
127  * For example:
128  *     Log2<8>::VALUE   // 3
129  *     Log2<3>::VALUE   // 2
130  */
131 template <int N, int CURRENT_VAL = N, int COUNT = 0>
132 struct Log2
133 {
134     /// Static logarithm value
135     enum { VALUE = Log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE };         // Inductive case
136 };
137 
138 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
139 
140 template <int N, int COUNT>
141 struct Log2<N, 0, COUNT>
142 {
143     enum {VALUE = (1 << (COUNT - 1) < N) ?                                  // Base case
144         COUNT :
145         COUNT - 1 };
146 };
147 
148 #endif // DOXYGEN_SHOULD_SKIP_THIS
149 
150 
151 /**
152  * \brief Statically determine if N is a power-of-two
153  */
154 template <int N>
155 struct PowerOfTwo
156 {
157     enum { VALUE = ((N & (N - 1)) == 0) };
158 };
159 
160 
161 
162 /******************************************************************************
163  * Pointer vs. iterator detection
164  ******************************************************************************/
165 
166 /**
167  * \brief Pointer vs. iterator
168  */
169 template <typename Tp>
170 struct IsPointer
171 {
172     enum { VALUE = 0 };
173 };
174 
175 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
176 
177 template <typename Tp>
178 struct IsPointer<Tp*>
179 {
180     enum { VALUE = 1 };
181 };
182 
183 #endif // DOXYGEN_SHOULD_SKIP_THIS
184 
185 
186 
187 /******************************************************************************
188  * Qualifier detection
189  ******************************************************************************/
190 
191 /**
192  * \brief Volatile modifier test
193  */
194 template <typename Tp>
195 struct IsVolatile
196 {
197     enum { VALUE = 0 };
198 };
199 
200 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
201 
202 template <typename Tp>
203 struct IsVolatile<Tp volatile>
204 {
205     enum { VALUE = 1 };
206 };
207 
208 #endif // DOXYGEN_SHOULD_SKIP_THIS
209 
210 
211 /******************************************************************************
212  * Qualifier removal
213  ******************************************************************************/
214 
215 /**
216  * \brief Removes \p const and \p volatile qualifiers from type \p Tp.
217  *
218  * For example:
219  *     <tt>typename RemoveQualifiers<volatile int>::Type         // int;</tt>
220  */
221 template <typename Tp, typename Up = Tp>
222 struct RemoveQualifiers
223 {
224     /// Type without \p const and \p volatile qualifiers
225     typedef Up Type;
226 };
227 
228 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
229 
230 template <typename Tp, typename Up>
231 struct RemoveQualifiers<Tp, volatile Up>
232 {
233     typedef Up Type;
234 };
235 
236 template <typename Tp, typename Up>
237 struct RemoveQualifiers<Tp, const Up>
238 {
239     typedef Up Type;
240 };
241 
242 template <typename Tp, typename Up>
243 struct RemoveQualifiers<Tp, const volatile Up>
244 {
245     typedef Up Type;
246 };
247 
248 
249 /******************************************************************************
250  * Marker types
251  ******************************************************************************/
252 
253 /**
254  * \brief A simple "NULL" marker type
255  */
256 struct NullType
257 {
258 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
259 
260     template <typename T>
operator =cub::NullType261     __host__ __device__ __forceinline__ NullType& operator =(const T&) { return *this; }
262 
operator ==cub::NullType263     __host__ __device__ __forceinline__ bool operator ==(const NullType&) { return true; }
264 
operator !=cub::NullType265     __host__ __device__ __forceinline__ bool operator !=(const NullType&) { return false; }
266 
267 #endif // DOXYGEN_SHOULD_SKIP_THIS
268 };
269 
270 
271 /**
272  * \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values)
273  */
274 template <int A>
275 struct Int2Type
276 {
277    enum {VALUE = A};
278 };
279 
280 
281 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
282 
283 
284 /******************************************************************************
285  * Size and alignment
286  ******************************************************************************/
287 
288 /// Structure alignment
289 template <typename T>
290 struct AlignBytes
291 {
292     struct Pad
293     {
294         T       val;
295         char    byte;
296     };
297 
298     enum
299     {
300         /// The "true CUDA" alignment of T in bytes
301         ALIGN_BYTES = sizeof(Pad) - sizeof(T)
302     };
303 
304     /// The "truly aligned" type
305     typedef T Type;
306 };
307 
308 // Specializations where host C++ compilers (e.g., 32-bit Windows) may disagree
309 // with device C++ compilers (EDG) on types passed as template parameters through
310 // kernel functions
311 
312 #define __CUB_ALIGN_BYTES(t, b)         \
313     template <> struct AlignBytes<t>    \
314     { enum { ALIGN_BYTES = b }; typedef __align__(b) t Type; };
315 
316 __CUB_ALIGN_BYTES(short4, 8)
317 __CUB_ALIGN_BYTES(ushort4, 8)
318 __CUB_ALIGN_BYTES(int2, 8)
319 __CUB_ALIGN_BYTES(uint2, 8)
320 __CUB_ALIGN_BYTES(long long, 8)
321 __CUB_ALIGN_BYTES(unsigned long long, 8)
322 __CUB_ALIGN_BYTES(float2, 8)
323 __CUB_ALIGN_BYTES(double, 8)
324 #ifdef _WIN32
325     __CUB_ALIGN_BYTES(long2, 8)
326     __CUB_ALIGN_BYTES(ulong2, 8)
327 #else
328     __CUB_ALIGN_BYTES(long2, 16)
329     __CUB_ALIGN_BYTES(ulong2, 16)
330 #endif
331 __CUB_ALIGN_BYTES(int4, 16)
332 __CUB_ALIGN_BYTES(uint4, 16)
333 __CUB_ALIGN_BYTES(float4, 16)
334 __CUB_ALIGN_BYTES(long4, 16)
335 __CUB_ALIGN_BYTES(ulong4, 16)
336 __CUB_ALIGN_BYTES(longlong2, 16)
337 __CUB_ALIGN_BYTES(ulonglong2, 16)
338 __CUB_ALIGN_BYTES(double2, 16)
339 __CUB_ALIGN_BYTES(longlong4, 16)
340 __CUB_ALIGN_BYTES(ulonglong4, 16)
341 __CUB_ALIGN_BYTES(double4, 16)
342 
343 template <typename T> struct AlignBytes<volatile T> : AlignBytes<T> {};
344 template <typename T> struct AlignBytes<const T> : AlignBytes<T> {};
345 template <typename T> struct AlignBytes<const volatile T> : AlignBytes<T> {};
346 
347 
348 /// Unit-words of data movement
349 template <typename T>
350 struct UnitWord
351 {
352     enum {
353         ALIGN_BYTES = AlignBytes<T>::ALIGN_BYTES
354     };
355 
356     template <typename Unit>
357     struct IsMultiple
358     {
359         enum {
360             UNIT_ALIGN_BYTES    = AlignBytes<Unit>::ALIGN_BYTES,
361             IS_MULTIPLE         = (sizeof(T) % sizeof(Unit) == 0) && (ALIGN_BYTES % UNIT_ALIGN_BYTES == 0)
362         };
363     };
364 
365     /// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T
366     typedef typename If<IsMultiple<int>::IS_MULTIPLE,
367         unsigned int,
368         typename If<IsMultiple<short>::IS_MULTIPLE,
369             unsigned short,
370             unsigned char>::Type>::Type         ShuffleWord;
371 
372     /// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T
373     typedef typename If<IsMultiple<long long>::IS_MULTIPLE,
374         unsigned long long,
375         ShuffleWord>::Type                      VolatileWord;
376 
377     /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T
378     typedef typename If<IsMultiple<longlong2>::IS_MULTIPLE,
379         ulonglong2,
380         VolatileWord>::Type                     DeviceWord;
381 
382     /// Biggest texture reference word that T is a whole multiple of and is not larger than the alignment of T
383     typedef typename If<IsMultiple<int4>::IS_MULTIPLE,
384         uint4,
385         typename If<IsMultiple<int2>::IS_MULTIPLE,
386             uint2,
387             ShuffleWord>::Type>::Type           TextureWord;
388 };
389 
390 
391 // float2 specialization workaround (for SM10-SM13)
392 template <>
393 struct UnitWord <float2>
394 {
395     typedef int         ShuffleWord;
396 #if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
397     typedef float       VolatileWord;
398     typedef uint2       DeviceWord;
399 #else
400     typedef unsigned long long   VolatileWord;
401     typedef unsigned long long   DeviceWord;
402 #endif
403     typedef float2      TextureWord;
404 };
405 
406 // float4 specialization workaround (for SM10-SM13)
407 template <>
408 struct UnitWord <float4>
409 {
410     typedef int         ShuffleWord;
411 #if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
412     typedef float               VolatileWord;
413     typedef uint4               DeviceWord;
414 #else
415     typedef unsigned long long  VolatileWord;
416     typedef ulonglong2          DeviceWord;
417 #endif
418     typedef float4              TextureWord;
419 };
420 
421 
422 // char2 specialization workaround (for SM10-SM13)
423 template <>
424 struct UnitWord <char2>
425 {
426     typedef unsigned short      ShuffleWord;
427 #if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
428     typedef unsigned short      VolatileWord;
429     typedef short               DeviceWord;
430 #else
431     typedef unsigned short      VolatileWord;
432     typedef unsigned short      DeviceWord;
433 #endif
434     typedef unsigned short      TextureWord;
435 };
436 
437 
438 template <typename T> struct UnitWord<volatile T> : UnitWord<T> {};
439 template <typename T> struct UnitWord<const T> : UnitWord<T> {};
440 template <typename T> struct UnitWord<const volatile T> : UnitWord<T> {};
441 
442 
443 #endif // DOXYGEN_SHOULD_SKIP_THIS
444 
445 
446 
447 /******************************************************************************
448  * Vector type inference utilities.
449  ******************************************************************************/
450 
451 /**
452  * \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists.  Otherwise \p Type refers to the CubVector structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields.
453  */
454 template <typename T, int vec_elements> struct CubVector;
455 
456 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
457 
458 enum
459 {
460     /// The maximum number of elements in CUDA vector types
461     MAX_VEC_ELEMENTS = 4,
462 };
463 
464 
465 /**
466  * Generic vector-1 type
467  */
468 template <typename T>
469 struct CubVector<T, 1>
470 {
471     T x;
472 
473     typedef T BaseType;
474     typedef CubVector<T, 1> Type;
475 };
476 
477 /**
478  * Generic vector-2 type
479  */
480 template <typename T>
481 struct CubVector<T, 2>
482 {
483     T x;
484     T y;
485 
486     typedef T BaseType;
487     typedef CubVector<T, 2> Type;
488 };
489 
490 /**
491  * Generic vector-3 type
492  */
493 template <typename T>
494 struct CubVector<T, 3>
495 {
496     T x;
497     T y;
498     T z;
499 
500     typedef T BaseType;
501     typedef CubVector<T, 3> Type;
502 };
503 
504 /**
505  * Generic vector-4 type
506  */
507 template <typename T>
508 struct CubVector<T, 4>
509 {
510     T x;
511     T y;
512     T z;
513     T w;
514 
515     typedef T BaseType;
516     typedef CubVector<T, 4> Type;
517 };
518 
519 
520 /**
521  * Macro for expanding partially-specialized built-in vector types
522  */
523 #define CUB_DEFINE_VECTOR_TYPE(base_type,short_type)                                                    \
524                                                                                                         \
525     template<> struct CubVector<base_type, 1> : short_type##1                                           \
526     {                                                                                                   \
527       typedef base_type       BaseType;                                                                 \
528       typedef short_type##1   Type;                                                                     \
529       __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {           \
530           CubVector retval;                                                                             \
531           retval.x = x + other.x;                                                                       \
532           return retval;                                                                                \
533       }                                                                                                 \
534       __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {           \
535           CubVector retval;                                                                             \
536           retval.x = x - other.x;                                                                       \
537           return retval;                                                                                \
538       }                                                                                                 \
539     };                                                                                                  \
540                                                                                                         \
541     template<> struct CubVector<base_type, 2> : short_type##2                                           \
542     {                                                                                                   \
543         typedef base_type       BaseType;                                                               \
544         typedef short_type##2   Type;                                                                   \
545         __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
546             CubVector retval;                                                                           \
547             retval.x = x + other.x;                                                                     \
548             retval.y = y + other.y;                                                                     \
549             return retval;                                                                              \
550         }                                                                                               \
551         __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
552             CubVector retval;                                                                           \
553             retval.x = x - other.x;                                                                     \
554             retval.y = y - other.y;                                                                     \
555             return retval;                                                                              \
556         }                                                                                               \
557     };                                                                                                  \
558                                                                                                         \
559     template<> struct CubVector<base_type, 3> : short_type##3                                           \
560     {                                                                                                   \
561         typedef base_type       BaseType;                                                               \
562         typedef short_type##3   Type;                                                                   \
563         __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
564             CubVector retval;                                                                           \
565             retval.x = x + other.x;                                                                     \
566             retval.y = y + other.y;                                                                     \
567             retval.z = z + other.z;                                                                     \
568             return retval;                                                                              \
569         }                                                                                               \
570         __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
571             CubVector retval;                                                                           \
572             retval.x = x - other.x;                                                                     \
573             retval.y = y - other.y;                                                                     \
574             retval.z = z - other.z;                                                                     \
575             return retval;                                                                              \
576         }                                                                                               \
577     };                                                                                                  \
578                                                                                                         \
579     template<> struct CubVector<base_type, 4> : short_type##4                                           \
580     {                                                                                                   \
581         typedef base_type       BaseType;                                                               \
582         typedef short_type##4   Type;                                                                   \
583         __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
584             CubVector retval;                                                                           \
585             retval.x = x + other.x;                                                                     \
586             retval.y = y + other.y;                                                                     \
587             retval.z = z + other.z;                                                                     \
588             retval.w = w + other.w;                                                                     \
589             return retval;                                                                              \
590         }                                                                                               \
591         __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
592             CubVector retval;                                                                           \
593             retval.x = x - other.x;                                                                     \
594             retval.y = y - other.y;                                                                     \
595             retval.z = z - other.z;                                                                     \
596             retval.w = w - other.w;                                                                     \
597             return retval;                                                                              \
598         }                                                                                               \
599     };
600 
601 
602 
603 // Expand CUDA vector types for built-in primitives
604 CUB_DEFINE_VECTOR_TYPE(char,               char)
605 CUB_DEFINE_VECTOR_TYPE(signed char,        char)
606 CUB_DEFINE_VECTOR_TYPE(short,              short)
607 CUB_DEFINE_VECTOR_TYPE(int,                int)
608 CUB_DEFINE_VECTOR_TYPE(long,               long)
609 CUB_DEFINE_VECTOR_TYPE(long long,          longlong)
610 CUB_DEFINE_VECTOR_TYPE(unsigned char,      uchar)
611 CUB_DEFINE_VECTOR_TYPE(unsigned short,     ushort)
612 CUB_DEFINE_VECTOR_TYPE(unsigned int,       uint)
613 CUB_DEFINE_VECTOR_TYPE(unsigned long,      ulong)
614 CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong)
615 CUB_DEFINE_VECTOR_TYPE(float,              float)
616 CUB_DEFINE_VECTOR_TYPE(double,             double)
617 CUB_DEFINE_VECTOR_TYPE(bool,               uchar)
618 
619 // Undefine macros
620 #undef CUB_DEFINE_VECTOR_TYPE
621 
622 #endif // DOXYGEN_SHOULD_SKIP_THIS
623 
624 
625 
626 /******************************************************************************
627  * Wrapper types
628  ******************************************************************************/
629 
630 /**
631  * \brief A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions
632  */
633 template <typename T>
634 struct Uninitialized
635 {
636     /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T
637     typedef typename UnitWord<T>::DeviceWord DeviceWord;
638 
639     enum
640     {
641         WORDS = sizeof(T) / sizeof(DeviceWord)
642     };
643 
644     /// Backing storage
645     DeviceWord storage[WORDS];
646 
647     /// Alias
Aliascub::Uninitialized648     __host__ __device__ __forceinline__ T& Alias()
649     {
650         return reinterpret_cast<T&>(*this);
651     }
652 };
653 
654 
655 /**
656  * \brief A key identifier paired with a corresponding value
657  */
658 template <
659     typename    _Key,
660     typename    _Value
661 #if defined(_WIN32) && !defined(_WIN64)
662     , bool KeyIsLT = (AlignBytes<_Key>::ALIGN_BYTES < AlignBytes<_Value>::ALIGN_BYTES)
663     , bool ValIsLT = (AlignBytes<_Value>::ALIGN_BYTES < AlignBytes<_Key>::ALIGN_BYTES)
664 #endif // #if defined(_WIN32) && !defined(_WIN64)
665     >
666 struct KeyValuePair
667 {
668     typedef _Key    Key;                ///< Key data type
669     typedef _Value  Value;              ///< Value data type
670 
671     Key     key;                        ///< Item key
672     Value   value;                      ///< Item value
673 
674     /// Constructor
675     __host__ __device__ __forceinline__
KeyValuePaircub::KeyValuePair676     KeyValuePair() {}
677 
678     /// Constructor
679     __host__ __device__ __forceinline__
KeyValuePaircub::KeyValuePair680     KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}
681 
682     /// Inequality operator
operator !=cub::KeyValuePair683     __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
684     {
685         return (value != b.value) || (key != b.key);
686     }
687 };
688 
689 #if defined(_WIN32) && !defined(_WIN64)
690 
691 /**
692  * Win32 won't do 16B alignment.  This can present two problems for
693  * should-be-16B-aligned (but actually 8B aligned) built-in and intrinsics members:
694  * 1) If a smaller-aligned item were to be listed first, the host compiler places the
695  *    should-be-16B item at too early an offset (and disagrees with device compiler)
696  * 2) Or, if a smaller-aligned item lists second, the host compiler gets the size
697  *    of the struct wrong (and disagrees with device compiler)
698  *
699  * So we put the larger-should-be-aligned item first, and explicitly pad the
700  * end of the struct
701  */
702 
703 /// Smaller key specialization
704 template <typename K, typename V>
705 struct KeyValuePair<K, V, true, false>
706 {
707     typedef K Key;
708     typedef V Value;
709 
710     typedef char Pad[AlignBytes<V>::ALIGN_BYTES - AlignBytes<K>::ALIGN_BYTES];
711 
712     Value   value;  // Value has larger would-be alignment and goes first
713     Key     key;
714     Pad     pad;
715 
716     /// Constructor
717     __host__ __device__ __forceinline__
KeyValuePaircub::KeyValuePair718     KeyValuePair() {}
719 
720     /// Constructor
721     __host__ __device__ __forceinline__
KeyValuePaircub::KeyValuePair722     KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}
723 
724     /// Inequality operator
operator !=cub::KeyValuePair725     __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
726     {
727         return (value != b.value) || (key != b.key);
728     }
729 };
730 
731 
732 /// Smaller value specialization
733 template <typename K, typename V>
734 struct KeyValuePair<K, V, false, true>
735 {
736     typedef K Key;
737     typedef V Value;
738 
739     typedef char Pad[AlignBytes<K>::ALIGN_BYTES - AlignBytes<V>::ALIGN_BYTES];
740 
741     Key     key;    // Key has larger would-be alignment and goes first
742     Value   value;
743     Pad     pad;
744 
745     /// Constructor
746     __host__ __device__ __forceinline__
KeyValuePaircub::KeyValuePair747     KeyValuePair() {}
748 
749     /// Constructor
750     __host__ __device__ __forceinline__
KeyValuePaircub::KeyValuePair751     KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}
752 
753     /// Inequality operator
operator !=cub::KeyValuePair754     __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
755     {
756         return (value != b.value) || (key != b.key);
757     }
758 };
759 
760 #endif // #if defined(_WIN32) && !defined(_WIN64)
761 
762 
763 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
764 
765 
766 /**
767  * \brief A wrapper for passing simple static arrays as kernel parameters
768  */
769 template <typename T, int COUNT>
770 struct ArrayWrapper
771 {
772 
773     /// Statically-sized array of type \p T
774     T array[COUNT];
775 
776     /// Constructor
ArrayWrappercub::ArrayWrapper777     __host__ __device__ __forceinline__ ArrayWrapper() {}
778 };
779 
780 #endif // DOXYGEN_SHOULD_SKIP_THIS
781 
782 /**
783  * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth.
784  *
785  * Many multi-pass computations require a pair of "ping-pong" storage
786  * buffers (e.g., one for reading from and the other for writing to, and then
787  * vice-versa for the subsequent pass).  This structure wraps a set of device
788  * buffers and a "selector" member to track which is "current".
789  */
790 template <typename T>
791 struct DoubleBuffer
792 {
793     /// Pair of device buffer pointers
794     T *d_buffers[2];
795 
796     ///  Selector into \p d_buffers (i.e., the active/valid buffer)
797     int selector;
798 
799     /// \brief Constructor
DoubleBuffercub::DoubleBuffer800     __host__ __device__ __forceinline__ DoubleBuffer()
801     {
802         selector = 0;
803         d_buffers[0] = NULL;
804         d_buffers[1] = NULL;
805     }
806 
807     /// \brief Constructor
DoubleBuffercub::DoubleBuffer808     __host__ __device__ __forceinline__ DoubleBuffer(
809         T *d_current,         ///< The currently valid buffer
810         T *d_alternate)       ///< Alternate storage buffer of the same size as \p d_current
811     {
812         selector = 0;
813         d_buffers[0] = d_current;
814         d_buffers[1] = d_alternate;
815     }
816 
817     /// \brief Return pointer to the currently valid buffer
Currentcub::DoubleBuffer818     __host__ __device__ __forceinline__ T* Current() { return d_buffers[selector]; }
819 
820     /// \brief Return pointer to the currently invalid buffer
Alternatecub::DoubleBuffer821     __host__ __device__ __forceinline__ T* Alternate() { return d_buffers[selector ^ 1]; }
822 
823 };
824 
825 
826 
827 /******************************************************************************
828  * Typedef-detection
829  ******************************************************************************/
830 
831 
832 /**
833  * \brief Defines a structure \p detector_name that is templated on type \p T.  The \p detector_name struct exposes a constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name
834  */
835 #define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name)  \
836     template <typename T>                                               \
837     struct detector_name                                                \
838     {                                                                   \
839         template <typename C>                                           \
840         static char& test(typename C::nested_type_name*);               \
841         template <typename>                                             \
842         static int& test(...);                                          \
843         enum                                                            \
844         {                                                               \
845             VALUE = sizeof(test<T>(0)) < sizeof(int)                    \
846         };                                                              \
847     };
848 
849 
850 
851 /******************************************************************************
852  * Simple enable-if (similar to Boost)
853  ******************************************************************************/
854 
855 /**
856  * \brief Simple enable-if (similar to Boost)
857  */
858 template <bool Condition, class T = void>
859 struct EnableIf
860 {
861     /// Enable-if type for SFINAE dummy variables
862     typedef T Type;
863 };
864 
865 
866 template <class T>
867 struct EnableIf<false, T> {};
868 
869 
870 
871 /******************************************************************************
872  * Typedef-detection
873  ******************************************************************************/
874 
875 /**
876  * \brief Determine whether or not BinaryOp's functor is of the form <tt>bool operator()(const T& a, const T&b)</tt> or <tt>bool operator()(const T& a, const T&b, unsigned int idx)</tt>
877  */
878 template <typename T, typename BinaryOp>
879 struct BinaryOpHasIdxParam
880 {
881 private:
882 /*
883     template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx) const>  struct SFINAE1 {};
884     template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx)>        struct SFINAE2 {};
885     template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx) const>                struct SFINAE3 {};
886     template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx)>                      struct SFINAE4 {};
887 */
888     template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx) const>           struct SFINAE5 {};
889     template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx)>                 struct SFINAE6 {};
890     template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx) const>                         struct SFINAE7 {};
891     template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx)>                               struct SFINAE8 {};
892 /*
893     template <typename BinaryOpT> static char Test(SFINAE1<BinaryOpT, &BinaryOpT::operator()> *);
894     template <typename BinaryOpT> static char Test(SFINAE2<BinaryOpT, &BinaryOpT::operator()> *);
895     template <typename BinaryOpT> static char Test(SFINAE3<BinaryOpT, &BinaryOpT::operator()> *);
896     template <typename BinaryOpT> static char Test(SFINAE4<BinaryOpT, &BinaryOpT::operator()> *);
897 */
898     template <typename BinaryOpT> __host__ __device__ static char Test(SFINAE5<BinaryOpT, &BinaryOpT::operator()> *);
899     template <typename BinaryOpT> __host__ __device__ static char Test(SFINAE6<BinaryOpT, &BinaryOpT::operator()> *);
900     template <typename BinaryOpT> __host__ __device__ static char Test(SFINAE7<BinaryOpT, &BinaryOpT::operator()> *);
901     template <typename BinaryOpT> __host__ __device__ static char Test(SFINAE8<BinaryOpT, &BinaryOpT::operator()> *);
902 
903     template <typename BinaryOpT> static int Test(...);
904 
905 public:
906 
907     /// Whether the functor BinaryOp has a third <tt>unsigned int</tt> index param
908     static const bool HAS_PARAM = sizeof(Test<BinaryOp>(NULL)) == sizeof(char);
909 };
910 
911 
912 
913 
914 /******************************************************************************
915  * Simple type traits utilities.
916  *
917  * For example:
918  *     Traits<int>::CATEGORY             // SIGNED_INTEGER
919  *     Traits<NullType>::NULL_TYPE       // true
920  *     Traits<uint4>::CATEGORY           // NOT_A_NUMBER
921  *     Traits<uint4>::PRIMITIVE;         // false
922  *
923  ******************************************************************************/
924 
925 /**
926  * \brief Basic type traits categories
927  */
928 enum Category
929 {
930     NOT_A_NUMBER,
931     SIGNED_INTEGER,
932     UNSIGNED_INTEGER,
933     FLOATING_POINT
934 };
935 
936 
937 /**
938  * \brief Basic type traits
939  */
940 template <Category _CATEGORY, bool _PRIMITIVE, bool _NULL_TYPE, typename _UnsignedBits, typename T>
941 struct BaseTraits
942 {
943     /// Category
944     static const Category CATEGORY      = _CATEGORY;
945     enum
946     {
947         PRIMITIVE       = _PRIMITIVE,
948         NULL_TYPE       = _NULL_TYPE,
949     };
950 };
951 
952 
953 /**
954  * Basic type traits (unsigned primitive specialization)
955  */
956 template <typename _UnsignedBits, typename T>
957 struct BaseTraits<UNSIGNED_INTEGER, true, false, _UnsignedBits, T>
958 {
959     typedef _UnsignedBits       UnsignedBits;
960 
961     static const Category       CATEGORY    = UNSIGNED_INTEGER;
962     static const UnsignedBits   LOWEST_KEY  = UnsignedBits(0);
963     static const UnsignedBits   MAX_KEY     = UnsignedBits(-1);
964 
965     enum
966     {
967         PRIMITIVE       = true,
968         NULL_TYPE       = false,
969     };
970 
971 
TwiddleIncub::BaseTraits972     static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
973     {
974         return key;
975     }
976 
TwiddleOutcub::BaseTraits977     static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
978     {
979         return key;
980     }
981 
Maxcub::BaseTraits982     static __host__ __device__ __forceinline__ T Max()
983     {
984         UnsignedBits retval = MAX_KEY;
985         return reinterpret_cast<T&>(retval);
986     }
987 
Lowestcub::BaseTraits988     static __host__ __device__ __forceinline__ T Lowest()
989     {
990         UnsignedBits retval = LOWEST_KEY;
991         return reinterpret_cast<T&>(retval);
992     }
993 };
994 
995 
996 /**
997  * Basic type traits (signed primitive specialization)
998  */
999 template <typename _UnsignedBits, typename T>
1000 struct BaseTraits<SIGNED_INTEGER, true, false, _UnsignedBits, T>
1001 {
1002     typedef _UnsignedBits       UnsignedBits;
1003 
1004     static const Category       CATEGORY    = SIGNED_INTEGER;
1005     static const UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
1006     static const UnsignedBits   LOWEST_KEY  = HIGH_BIT;
1007     static const UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;
1008 
1009     enum
1010     {
1011         PRIMITIVE       = true,
1012         NULL_TYPE       = false,
1013     };
1014 
TwiddleIncub::BaseTraits1015     static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
1016     {
1017         return key ^ HIGH_BIT;
1018     };
1019 
TwiddleOutcub::BaseTraits1020     static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
1021     {
1022         return key ^ HIGH_BIT;
1023     };
1024 
Maxcub::BaseTraits1025     static __host__ __device__ __forceinline__ T Max()
1026     {
1027         UnsignedBits retval = MAX_KEY;
1028         return reinterpret_cast<T&>(retval);
1029     }
1030 
Lowestcub::BaseTraits1031     static __host__ __device__ __forceinline__ T Lowest()
1032     {
1033         UnsignedBits retval = LOWEST_KEY;
1034         return reinterpret_cast<T&>(retval);
1035     }
1036 };
1037 
1038 template <typename _T>
1039 struct FpLimits;
1040 
1041 template <>
1042 struct FpLimits<float>
1043 {
Maxcub::FpLimits1044     static __host__ __device__ __forceinline__ float Max() {
1045         return FLT_MAX;
1046     }
1047 
Lowestcub::FpLimits1048     static __host__ __device__ __forceinline__ float Lowest() {
1049         return FLT_MAX * float(-1);
1050     }
1051 };
1052 
1053 template <>
1054 struct FpLimits<double>
1055 {
Maxcub::FpLimits1056     static __host__ __device__ __forceinline__ double Max() {
1057         return DBL_MAX;
1058     }
1059 
Lowestcub::FpLimits1060     static __host__ __device__ __forceinline__ double Lowest() {
1061         return DBL_MAX  * double(-1);
1062     }
1063 };
1064 
1065 
1066 #if (__CUDACC_VER_MAJOR__ >= 9)
1067 template <>
1068 struct FpLimits<__half>
1069 {
Maxcub::FpLimits1070     static __host__ __device__ __forceinline__ __half Max() {
1071         unsigned short max_word = 0x7BFF;
1072         return reinterpret_cast<__half&>(max_word);
1073     }
1074 
Lowestcub::FpLimits1075     static __host__ __device__ __forceinline__ __half Lowest() {
1076         unsigned short lowest_word = 0xFBFF;
1077         return reinterpret_cast<__half&>(lowest_word);
1078     }
1079 };
1080 #endif
1081 
1082 
1083 /**
1084  * Basic type traits (fp primitive specialization)
1085  */
1086 template <typename _UnsignedBits, typename T>
1087 struct BaseTraits<FLOATING_POINT, true, false, _UnsignedBits, T>
1088 {
1089     typedef _UnsignedBits       UnsignedBits;
1090 
1091     static const Category       CATEGORY    = FLOATING_POINT;
1092     static const UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
1093     static const UnsignedBits   LOWEST_KEY  = UnsignedBits(-1);
1094     static const UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;
1095 
1096     enum
1097     {
1098         PRIMITIVE       = true,
1099         NULL_TYPE       = false,
1100     };
1101 
TwiddleIncub::BaseTraits1102     static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
1103     {
1104         UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT;
1105         return key ^ mask;
1106     };
1107 
TwiddleOutcub::BaseTraits1108     static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
1109     {
1110         UnsignedBits mask = (key & HIGH_BIT) ? HIGH_BIT : UnsignedBits(-1);
1111         return key ^ mask;
1112     };
1113 
Maxcub::BaseTraits1114     static __host__ __device__ __forceinline__ T Max() {
1115         return FpLimits<T>::Max();
1116     }
1117 
Lowestcub::BaseTraits1118     static __host__ __device__ __forceinline__ T Lowest() {
1119         return FpLimits<T>::Lowest();
1120     }
1121 };
1122 
1123 
1124 /**
1125  * \brief Numeric type traits
1126  */
1127 template <typename T> struct NumericTraits :            BaseTraits<NOT_A_NUMBER, false, false, T, T> {};
1128 
1129 template <> struct NumericTraits<NullType> :            BaseTraits<NOT_A_NUMBER, false, true, NullType, NullType> {};
1130 
1131 template <> struct NumericTraits<char> :                BaseTraits<(std::numeric_limits<char>::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char, char> {};
1132 template <> struct NumericTraits<signed char> :         BaseTraits<SIGNED_INTEGER, true, false, unsigned char, signed char> {};
1133 template <> struct NumericTraits<short> :               BaseTraits<SIGNED_INTEGER, true, false, unsigned short, short> {};
1134 template <> struct NumericTraits<int> :                 BaseTraits<SIGNED_INTEGER, true, false, unsigned int, int> {};
1135 template <> struct NumericTraits<long> :                BaseTraits<SIGNED_INTEGER, true, false, unsigned long, long> {};
1136 template <> struct NumericTraits<long long> :           BaseTraits<SIGNED_INTEGER, true, false, unsigned long long, long long> {};
1137 
1138 template <> struct NumericTraits<unsigned char> :       BaseTraits<UNSIGNED_INTEGER, true, false, unsigned char, unsigned char> {};
1139 template <> struct NumericTraits<unsigned short> :      BaseTraits<UNSIGNED_INTEGER, true, false, unsigned short, unsigned short> {};
1140 template <> struct NumericTraits<unsigned int> :        BaseTraits<UNSIGNED_INTEGER, true, false, unsigned int, unsigned int> {};
1141 template <> struct NumericTraits<unsigned long> :       BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long, unsigned long> {};
1142 template <> struct NumericTraits<unsigned long long> :  BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long long, unsigned long long> {};
1143 
1144 template <> struct NumericTraits<float> :               BaseTraits<FLOATING_POINT, true, false, unsigned int, float> {};
1145 template <> struct NumericTraits<double> :              BaseTraits<FLOATING_POINT, true, false, unsigned long long, double> {};
1146 #if (__CUDACC_VER_MAJOR__ >= 9)
1147     template <> struct NumericTraits<__half> :          BaseTraits<FLOATING_POINT, true, false, unsigned short, __half> {};
1148 #endif
1149 
1150 template <> struct NumericTraits<bool> :                BaseTraits<UNSIGNED_INTEGER, true, false, typename UnitWord<bool>::VolatileWord, bool> {};
1151 
1152 
1153 
1154 /**
1155  * \brief Type traits
1156  */
1157 template <typename T>
1158 struct Traits : NumericTraits<typename RemoveQualifiers<T>::Type> {};
1159 
1160 
1161 #endif // DOXYGEN_SHOULD_SKIP_THIS
1162 
1163 
1164 /** @} */       // end group UtilModule
1165 
1166 }               // CUB namespace
1167 CUB_NS_POSTFIX  // Optional outer namespace(s)
1168