1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 //                        Kokkos v. 3.0
6 //       Copyright (2020) National Technology & Engineering
7 //               Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44 
45 #ifndef KOKKOS_CUDA_INTERNAL_HPP
46 #define KOKKOS_CUDA_INTERNAL_HPP
47 
48 #include <Kokkos_Macros.hpp>
49 #ifdef KOKKOS_ENABLE_CUDA
50 
51 #include <Cuda/Kokkos_Cuda_Error.hpp>
52 
53 namespace Kokkos {
54 namespace Impl {
55 
cuda_max_active_blocks_per_sm(cudaDeviceProp const & properties,cudaFuncAttributes const & attributes,int block_size,size_t dynamic_shmem)56 inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
57                                          cudaFuncAttributes const& attributes,
58                                          int block_size, size_t dynamic_shmem) {
59   // Limits due do registers/SM
60   int const regs_per_sm     = properties.regsPerMultiprocessor;
61   int const regs_per_thread = attributes.numRegs;
62   int const max_blocks_regs = regs_per_sm / (regs_per_thread * block_size);
63 
64   // Limits due to shared memory/SM
65   size_t const shmem_per_sm            = properties.sharedMemPerMultiprocessor;
66   size_t const shmem_per_block         = properties.sharedMemPerBlock;
67   size_t const static_shmem            = attributes.sharedSizeBytes;
68   size_t const dynamic_shmem_per_block = attributes.maxDynamicSharedSizeBytes;
69   size_t const total_shmem             = static_shmem + dynamic_shmem;
70 
71   int const max_blocks_shmem =
72       total_shmem > shmem_per_block || dynamic_shmem > dynamic_shmem_per_block
73           ? 0
74           : (total_shmem > 0 ? (int)shmem_per_sm / total_shmem
75                              : max_blocks_regs);
76 
77   // Limits due to blocks/SM
78 #if CUDA_VERSION >= 11000
79   int const max_blocks_per_sm = properties.maxBlocksPerMultiProcessor;
80 #else
81   int const max_blocks_per_sm = [&properties]() {
82     switch (properties.major) {
83       case 3: return 16;
84       case 5:
85       case 6: return 32;
86       case 7: {
87         int isTuring = properties.minor == 5;
88         return (isTuring) ? 16 : 32;
89       }
90       default:
91         throw_runtime_exception("Unknown device in cuda block size deduction");
92         return 0;
93     }
94   }();
95 #endif
96 
97   // Overall occupancy in blocks
98   return std::min({max_blocks_regs, max_blocks_shmem, max_blocks_per_sm});
99 }
100 
101 template <typename UnaryFunction, typename LaunchBounds>
cuda_deduce_block_size(bool early_termination,cudaDeviceProp const & properties,cudaFuncAttributes const & attributes,UnaryFunction block_size_to_dynamic_shmem,LaunchBounds)102 inline int cuda_deduce_block_size(bool early_termination,
103                                   cudaDeviceProp const& properties,
104                                   cudaFuncAttributes const& attributes,
105                                   UnaryFunction block_size_to_dynamic_shmem,
106                                   LaunchBounds) {
107   // Limits
108   int const max_threads_per_sm = properties.maxThreadsPerMultiProcessor;
109   // unsure if I need to do that or if this is already accounted for in the
110   // functor attributes
111   int const max_threads_per_block =
112       std::min(LaunchBounds::maxTperB == 0 ? (int)properties.maxThreadsPerBlock
113                                            : (int)LaunchBounds::maxTperB,
114                attributes.maxThreadsPerBlock);
115   int const min_blocks_per_sm =
116       LaunchBounds::minBperSM == 0 ? 1 : LaunchBounds::minBperSM;
117 
118   // Recorded maximum
119   int opt_block_size     = 0;
120   int opt_threads_per_sm = 0;
121 
122   for (int block_size = max_threads_per_block; block_size > 0;
123        block_size -= 32) {
124     size_t const dynamic_shmem = block_size_to_dynamic_shmem(block_size);
125 
126     int blocks_per_sm = cuda_max_active_blocks_per_sm(
127         properties, attributes, block_size, dynamic_shmem);
128 
129     int threads_per_sm = blocks_per_sm * block_size;
130 
131     if (threads_per_sm > max_threads_per_sm) {
132       blocks_per_sm  = max_threads_per_sm / block_size;
133       threads_per_sm = blocks_per_sm * block_size;
134     }
135 
136     if (blocks_per_sm >= min_blocks_per_sm) {
137       if (threads_per_sm >= opt_threads_per_sm) {
138         opt_block_size     = block_size;
139         opt_threads_per_sm = threads_per_sm;
140       }
141     }
142 
143     if (early_termination && opt_block_size != 0) break;
144   }
145 
146   return opt_block_size;
147 }
148 
149 template <class FunctorType, class LaunchBounds>
cuda_get_max_block_size(const CudaInternal * cuda_instance,const cudaFuncAttributes & attr,const FunctorType & f,const size_t vector_length,const size_t shmem_block,const size_t shmem_thread)150 int cuda_get_max_block_size(const CudaInternal* cuda_instance,
151                             const cudaFuncAttributes& attr,
152                             const FunctorType& f, const size_t vector_length,
153                             const size_t shmem_block,
154                             const size_t shmem_thread) {
155   (void)cuda_instance;
156 
157   auto const& prop = Kokkos::Cuda().cuda_device_prop();
158 
159   auto const block_size_to_dynamic_shmem = [&f, vector_length, shmem_block,
160                                             shmem_thread](int block_size) {
161     size_t const functor_shmem =
162         Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
163             f, block_size / vector_length);
164 
165     size_t const dynamic_shmem = shmem_block +
166                                  shmem_thread * (block_size / vector_length) +
167                                  functor_shmem;
168     return dynamic_shmem;
169   };
170 
171   return cuda_deduce_block_size(true, prop, attr, block_size_to_dynamic_shmem,
172                                 LaunchBounds{});
173 }
174 
175 template <class FunctorType, class LaunchBounds>
cuda_get_opt_block_size(const CudaInternal * cuda_instance,const cudaFuncAttributes & attr,const FunctorType & f,const size_t vector_length,const size_t shmem_block,const size_t shmem_thread)176 int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
177                             const cudaFuncAttributes& attr,
178                             const FunctorType& f, const size_t vector_length,
179                             const size_t shmem_block,
180                             const size_t shmem_thread) {
181   (void)cuda_instance;
182 
183   auto const& prop = Kokkos::Cuda().cuda_device_prop();
184 
185   auto const block_size_to_dynamic_shmem = [&f, vector_length, shmem_block,
186                                             shmem_thread](int block_size) {
187     size_t const functor_shmem =
188         Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
189             f, block_size / vector_length);
190 
191     size_t const dynamic_shmem = shmem_block +
192                                  shmem_thread * (block_size / vector_length) +
193                                  functor_shmem;
194     return dynamic_shmem;
195   };
196 
197   return cuda_deduce_block_size(false, prop, attr, block_size_to_dynamic_shmem,
198                                 LaunchBounds{});
199 }
200 
201 // Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1)
202 // NOTE these number can be obtained several ways:
203 // * One option is to download the CUDA Occupancy Calculator spreadsheet, select
204 // "Compute Capability" first and check what is the smallest "Shared Memory
205 // Size Config" that is available.  The "Shared Memory Per Multiprocessor" in
206 // bytes is then to be found below in the summary.
207 // * Another option would be to look for the information in the "Tuning
208 // Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in
209 // the "Shared Memory" section (more tedious)
get_shmem_per_sm_prefer_l1(cudaDeviceProp const & properties)210 inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
211   int const compute_capability = properties.major * 10 + properties.minor;
212   return [compute_capability]() {
213     switch (compute_capability) {
214       case 30:
215       case 32:
216       case 35: return 16;
217       case 37: return 80;
218       case 50:
219       case 53:
220       case 60:
221       case 62: return 64;
222       case 52:
223       case 61: return 96;
224       case 70:
225       case 80:
226       case 86: return 8;
227       case 75: return 32;
228       default:
229         Kokkos::Impl::throw_runtime_exception(
230             "Unknown device in cuda block size deduction");
231     }
232     return 0;
233   }() * 1024;
234 }
235 }  // namespace Impl
236 }  // namespace Kokkos
237 
238 #endif  // KOKKOS_ENABLE_CUDA
239 #endif  /* #ifndef KOKKOS_CUDA_INTERNAL_HPP */
240