1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of the NVIDIA CORPORATION nor the
13  *       names of its contributors may be used to endorse or promote products
14  *       derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
29 /**
30  * \file
31  * Static architectural properties by SM version.
32  */
33 
34 #pragma once
35 
36 #include "util_cpp_dialect.cuh"
37 #include "util_namespace.cuh"
38 #include "util_macro.cuh"
39 
40 /// Optional outer namespace(s)
41 CUB_NS_PREFIX
42 
43 /// CUB namespace
44 namespace cub {
45 
46 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
47 
48 #if ((__CUDACC_VER_MAJOR__ >= 9) || defined(__NVCOMPILER_CUDA__)) && \
49         !defined(CUB_USE_COOPERATIVE_GROUPS)
50     #define CUB_USE_COOPERATIVE_GROUPS
51 #endif
52 
53 /// In device code, CUB_PTX_ARCH expands to the PTX version for which we are
54 /// compiling. In host code, CUB_PTX_ARCH's value is implementation defined.
55 #ifndef CUB_PTX_ARCH
56     #if defined(__NVCOMPILER_CUDA__)
57         // __NVCOMPILER_CUDA_ARCH__ is the target PTX version, and is defined
58         // when compiling both host code and device code. Currently, only one
59         // PTX version can be targeted.
60         #define CUB_PTX_ARCH __NVCOMPILER_CUDA_ARCH__
61     #elif !defined(__CUDA_ARCH__)
62         #define CUB_PTX_ARCH 0
63     #else
64         #define CUB_PTX_ARCH __CUDA_ARCH__
65     #endif
66 #endif
67 
68 #ifndef CUB_IS_DEVICE_CODE
69     #if defined(__NVCOMPILER_CUDA__)
70         #define CUB_IS_DEVICE_CODE __builtin_is_device_code()
71         #define CUB_IS_HOST_CODE (!__builtin_is_device_code())
72         #define CUB_INCLUDE_DEVICE_CODE 1
73         #define CUB_INCLUDE_HOST_CODE 1
74     #elif CUB_PTX_ARCH > 0
75         #define CUB_IS_DEVICE_CODE 1
76         #define CUB_IS_HOST_CODE 0
77         #define CUB_INCLUDE_DEVICE_CODE 1
78         #define CUB_INCLUDE_HOST_CODE 0
79     #else
80         #define CUB_IS_DEVICE_CODE 0
81         #define CUB_IS_HOST_CODE 1
82         #define CUB_INCLUDE_DEVICE_CODE 0
83         #define CUB_INCLUDE_HOST_CODE 1
84     #endif
85 #endif
86 
87 /// Maximum number of devices supported.
88 #ifndef CUB_MAX_DEVICES
89     #define CUB_MAX_DEVICES 128
90 #endif
91 
92 #if CUB_CPP_DIALECT >= 2011
93     static_assert(CUB_MAX_DEVICES > 0, "CUB_MAX_DEVICES must be greater than 0.");
94 #endif
95 
96 /// Whether or not the source targeted by the active compiler pass is allowed to  invoke device kernels or methods from the CUDA runtime API.
97 #ifndef CUB_RUNTIME_FUNCTION
98     #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__))
99         #define CUB_RUNTIME_ENABLED
100         #define CUB_RUNTIME_FUNCTION __host__ __device__
101     #else
102         #define CUB_RUNTIME_FUNCTION __host__
103     #endif
104 #endif
105 
106 
107 /// Number of threads per warp
108 #ifndef CUB_LOG_WARP_THREADS
109     #define CUB_LOG_WARP_THREADS(arch)                      \
110         (5)
111     #define CUB_WARP_THREADS(arch)                          \
112         (1 << CUB_LOG_WARP_THREADS(arch))
113 
114     #define CUB_PTX_WARP_THREADS        CUB_WARP_THREADS(CUB_PTX_ARCH)
115     #define CUB_PTX_LOG_WARP_THREADS    CUB_LOG_WARP_THREADS(CUB_PTX_ARCH)
116 #endif
117 
118 
119 /// Number of smem banks
120 #ifndef CUB_LOG_SMEM_BANKS
121     #define CUB_LOG_SMEM_BANKS(arch)                        \
122         ((arch >= 200) ?                                    \
123             (5) :                                           \
124             (4))
125     #define CUB_SMEM_BANKS(arch)                            \
126         (1 << CUB_LOG_SMEM_BANKS(arch))
127 
128     #define CUB_PTX_LOG_SMEM_BANKS      CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH)
129     #define CUB_PTX_SMEM_BANKS          CUB_SMEM_BANKS(CUB_PTX_ARCH)
130 #endif
131 
132 
133 /// Oversubscription factor
134 #ifndef CUB_SUBSCRIPTION_FACTOR
135     #define CUB_SUBSCRIPTION_FACTOR(arch)                   \
136         ((arch >= 300) ?                                    \
137             (5) :                                           \
138             ((arch >= 200) ?                                \
139                 (3) :                                       \
140                 (10)))
141     #define CUB_PTX_SUBSCRIPTION_FACTOR             CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH)
142 #endif
143 
144 
145 /// Prefer padding overhead vs X-way conflicts greater than this threshold
146 #ifndef CUB_PREFER_CONFLICT_OVER_PADDING
147     #define CUB_PREFER_CONFLICT_OVER_PADDING(arch)          \
148         ((arch >= 300) ?                                    \
149             (1) :                                           \
150             (4))
151     #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING    CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH)
152 #endif
153 
154 
155 template <
156     int NOMINAL_4B_BLOCK_THREADS,
157     int NOMINAL_4B_ITEMS_PER_THREAD,
158     typename T>
159 struct RegBoundScaling
160 {
161     enum {
162         ITEMS_PER_THREAD    = CUB_MAX(1, NOMINAL_4B_ITEMS_PER_THREAD * 4 / CUB_MAX(4, sizeof(T))),
163         BLOCK_THREADS       = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32),
164     };
165 };
166 
167 
168 template <
169     int NOMINAL_4B_BLOCK_THREADS,
170     int NOMINAL_4B_ITEMS_PER_THREAD,
171     typename T>
172 struct MemBoundScaling
173 {
174     enum {
175         ITEMS_PER_THREAD    = CUB_MAX(1, CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T), NOMINAL_4B_ITEMS_PER_THREAD * 2)),
176         BLOCK_THREADS       = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32),
177     };
178 };
179 
180 
181 
182 
183 #endif  // Do not document
184 
185 }               // CUB namespace
186 CUB_NS_POSTFIX  // Optional outer namespace(s)
187