1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos v. 3.0
6 // Copyright (2020) National Technology & Engineering
7 // Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44
45 #ifndef KOKKOS_CUDA_INTERNAL_HPP
46 #define KOKKOS_CUDA_INTERNAL_HPP
47
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA

#include <Cuda/Kokkos_Cuda_Error.hpp>

#include <algorithm>
#include <climits>
52
53 namespace Kokkos {
54 namespace Impl {
55
cuda_max_active_blocks_per_sm(cudaDeviceProp const & properties,cudaFuncAttributes const & attributes,int block_size,size_t dynamic_shmem)56 inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
57 cudaFuncAttributes const& attributes,
58 int block_size, size_t dynamic_shmem) {
59 // Limits due do registers/SM
60 int const regs_per_sm = properties.regsPerMultiprocessor;
61 int const regs_per_thread = attributes.numRegs;
62 int const max_blocks_regs = regs_per_sm / (regs_per_thread * block_size);
63
64 // Limits due to shared memory/SM
65 size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
66 size_t const shmem_per_block = properties.sharedMemPerBlock;
67 size_t const static_shmem = attributes.sharedSizeBytes;
68 size_t const dynamic_shmem_per_block = attributes.maxDynamicSharedSizeBytes;
69 size_t const total_shmem = static_shmem + dynamic_shmem;
70
71 int const max_blocks_shmem =
72 total_shmem > shmem_per_block || dynamic_shmem > dynamic_shmem_per_block
73 ? 0
74 : (total_shmem > 0 ? (int)shmem_per_sm / total_shmem
75 : max_blocks_regs);
76
77 // Limits due to blocks/SM
78 #if CUDA_VERSION >= 11000
79 int const max_blocks_per_sm = properties.maxBlocksPerMultiProcessor;
80 #else
81 int const max_blocks_per_sm = [&properties]() {
82 switch (properties.major) {
83 case 3: return 16;
84 case 5:
85 case 6: return 32;
86 case 7: {
87 int isTuring = properties.minor == 5;
88 return (isTuring) ? 16 : 32;
89 }
90 default:
91 throw_runtime_exception("Unknown device in cuda block size deduction");
92 return 0;
93 }
94 }();
95 #endif
96
97 // Overall occupancy in blocks
98 return std::min({max_blocks_regs, max_blocks_shmem, max_blocks_per_sm});
99 }
100
101 template <typename UnaryFunction, typename LaunchBounds>
cuda_deduce_block_size(bool early_termination,cudaDeviceProp const & properties,cudaFuncAttributes const & attributes,UnaryFunction block_size_to_dynamic_shmem,LaunchBounds)102 inline int cuda_deduce_block_size(bool early_termination,
103 cudaDeviceProp const& properties,
104 cudaFuncAttributes const& attributes,
105 UnaryFunction block_size_to_dynamic_shmem,
106 LaunchBounds) {
107 // Limits
108 int const max_threads_per_sm = properties.maxThreadsPerMultiProcessor;
109 // unsure if I need to do that or if this is already accounted for in the
110 // functor attributes
111 int const max_threads_per_block =
112 std::min(LaunchBounds::maxTperB == 0 ? (int)properties.maxThreadsPerBlock
113 : (int)LaunchBounds::maxTperB,
114 attributes.maxThreadsPerBlock);
115 int const min_blocks_per_sm =
116 LaunchBounds::minBperSM == 0 ? 1 : LaunchBounds::minBperSM;
117
118 // Recorded maximum
119 int opt_block_size = 0;
120 int opt_threads_per_sm = 0;
121
122 for (int block_size = max_threads_per_block; block_size > 0;
123 block_size -= 32) {
124 size_t const dynamic_shmem = block_size_to_dynamic_shmem(block_size);
125
126 int blocks_per_sm = cuda_max_active_blocks_per_sm(
127 properties, attributes, block_size, dynamic_shmem);
128
129 int threads_per_sm = blocks_per_sm * block_size;
130
131 if (threads_per_sm > max_threads_per_sm) {
132 blocks_per_sm = max_threads_per_sm / block_size;
133 threads_per_sm = blocks_per_sm * block_size;
134 }
135
136 if (blocks_per_sm >= min_blocks_per_sm) {
137 if (threads_per_sm >= opt_threads_per_sm) {
138 opt_block_size = block_size;
139 opt_threads_per_sm = threads_per_sm;
140 }
141 }
142
143 if (early_termination && opt_block_size != 0) break;
144 }
145
146 return opt_block_size;
147 }
148
149 template <class FunctorType, class LaunchBounds>
cuda_get_max_block_size(const CudaInternal * cuda_instance,const cudaFuncAttributes & attr,const FunctorType & f,const size_t vector_length,const size_t shmem_block,const size_t shmem_thread)150 int cuda_get_max_block_size(const CudaInternal* cuda_instance,
151 const cudaFuncAttributes& attr,
152 const FunctorType& f, const size_t vector_length,
153 const size_t shmem_block,
154 const size_t shmem_thread) {
155 (void)cuda_instance;
156
157 auto const& prop = Kokkos::Cuda().cuda_device_prop();
158
159 auto const block_size_to_dynamic_shmem = [&f, vector_length, shmem_block,
160 shmem_thread](int block_size) {
161 size_t const functor_shmem =
162 Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
163 f, block_size / vector_length);
164
165 size_t const dynamic_shmem = shmem_block +
166 shmem_thread * (block_size / vector_length) +
167 functor_shmem;
168 return dynamic_shmem;
169 };
170
171 return cuda_deduce_block_size(true, prop, attr, block_size_to_dynamic_shmem,
172 LaunchBounds{});
173 }
174
175 template <class FunctorType, class LaunchBounds>
cuda_get_opt_block_size(const CudaInternal * cuda_instance,const cudaFuncAttributes & attr,const FunctorType & f,const size_t vector_length,const size_t shmem_block,const size_t shmem_thread)176 int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
177 const cudaFuncAttributes& attr,
178 const FunctorType& f, const size_t vector_length,
179 const size_t shmem_block,
180 const size_t shmem_thread) {
181 (void)cuda_instance;
182
183 auto const& prop = Kokkos::Cuda().cuda_device_prop();
184
185 auto const block_size_to_dynamic_shmem = [&f, vector_length, shmem_block,
186 shmem_thread](int block_size) {
187 size_t const functor_shmem =
188 Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
189 f, block_size / vector_length);
190
191 size_t const dynamic_shmem = shmem_block +
192 shmem_thread * (block_size / vector_length) +
193 functor_shmem;
194 return dynamic_shmem;
195 };
196
197 return cuda_deduce_block_size(false, prop, attr, block_size_to_dynamic_shmem,
198 LaunchBounds{});
199 }
200
201 // Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1)
202 // NOTE these number can be obtained several ways:
203 // * One option is to download the CUDA Occupancy Calculator spreadsheet, select
204 // "Compute Capability" first and check what is the smallest "Shared Memory
205 // Size Config" that is available. The "Shared Memory Per Multiprocessor" in
206 // bytes is then to be found below in the summary.
207 // * Another option would be to look for the information in the "Tuning
208 // Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in
209 // the "Shared Memory" section (more tedious)
get_shmem_per_sm_prefer_l1(cudaDeviceProp const & properties)210 inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
211 int const compute_capability = properties.major * 10 + properties.minor;
212 return [compute_capability]() {
213 switch (compute_capability) {
214 case 30:
215 case 32:
216 case 35: return 16;
217 case 37: return 80;
218 case 50:
219 case 53:
220 case 60:
221 case 62: return 64;
222 case 52:
223 case 61: return 96;
224 case 70:
225 case 80:
226 case 86: return 8;
227 case 75: return 32;
228 default:
229 Kokkos::Impl::throw_runtime_exception(
230 "Unknown device in cuda block size deduction");
231 }
232 return 0;
233 }() * 1024;
234 }
235 } // namespace Impl
236 } // namespace Kokkos
237
238 #endif // KOKKOS_ENABLE_CUDA
239 #endif /* #ifndef KOKKOS_CUDA_INTERNAL_HPP */
240