1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 //                        Kokkos v. 3.0
6 //       Copyright (2020) National Technology & Engineering
7 //               Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44 
45 #ifndef KOKKOS_CUDAEXEC_HPP
46 #define KOKKOS_CUDAEXEC_HPP
47 
48 #include <Kokkos_Macros.hpp>
49 #ifdef KOKKOS_ENABLE_CUDA
50 
51 #include <mutex>
52 #include <string>
53 #include <cstdint>
54 #include <cmath>
55 #include <Kokkos_Parallel.hpp>
56 #include <impl/Kokkos_Error.hpp>
57 #include <Cuda/Kokkos_Cuda_abort.hpp>
58 #include <Cuda/Kokkos_Cuda_Error.hpp>
59 #include <Cuda/Kokkos_Cuda_Locks.hpp>
60 #include <Cuda/Kokkos_Cuda_Instance.hpp>
61 #include <impl/Kokkos_GraphImpl_fwd.hpp>
62 #include <Cuda/Kokkos_Cuda_GraphNodeKernel.hpp>
63 #include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
64 
65 //----------------------------------------------------------------------------
66 //----------------------------------------------------------------------------
67 
68 /** \brief  Access to constant memory on the device */
69 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
70 
71 __device__ __constant__ extern unsigned long
72     kokkos_impl_cuda_constant_memory_buffer[];
73 
74 #else
75 
76 __device__ __constant__ unsigned long kokkos_impl_cuda_constant_memory_buffer
77     [Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long)];
78 
79 #endif
80 
template <typename T>
inline __device__ T* kokkos_impl_cuda_shared_memory() {
  // Expose the kernel's dynamic shared-memory allocation as an array of T.
  extern __shared__ Kokkos::CudaSpace::size_type sh[];
  return reinterpret_cast<T*>(sh);
}
86 
87 namespace Kokkos {
88 namespace Impl {
89 
90 //----------------------------------------------------------------------------
91 // See section B.17 of Cuda C Programming Guide Version 3.2
92 // for discussion of
93 //   __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
94 // function qualifier which could be used to improve performance.
95 //----------------------------------------------------------------------------
96 // Maximize L1 cache and minimize shared memory:
97 //   cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
98 // For 2.0 capability: 48 KB L1 and 16 KB shared
99 //----------------------------------------------------------------------------
100 
template <class DriverType>
__global__ static void cuda_parallel_launch_constant_memory() {
  // The host-side launcher staged the functor into the device __constant__
  // buffer; reinterpret that buffer as the driver and invoke it.
  (*reinterpret_cast<const DriverType*>(
      kokkos_impl_cuda_constant_memory_buffer))();
}
108 
template <class DriverType, unsigned int maxTperB, unsigned int minBperSM>
__global__ __launch_bounds__(
    maxTperB, minBperSM) static void cuda_parallel_launch_constant_memory() {
  // Launch-bounds variant of the constant-memory launch path: identical body,
  // but the explicit bounds let the compiler budget registers for occupancy.
  (*reinterpret_cast<const DriverType*>(
      kokkos_impl_cuda_constant_memory_buffer))();
}
117 
template <class DriverType>
__global__ static void cuda_parallel_launch_local_memory(
    const DriverType functor) {
  // The functor arrives by value as a kernel argument ("local memory" path),
  // so it must fit within the CUDA kernel-argument size limit.
  functor();
}
123 
template <class DriverType, unsigned int maxTperB, unsigned int minBperSM>
__global__ __launch_bounds__(
    maxTperB,
    minBperSM) static void cuda_parallel_launch_local_memory(const DriverType
                                                                 functor) {
  // Launch-bounds variant of the by-value (kernel argument) launch path.
  functor();
}
131 
template <class DriverType>
__global__ static void cuda_parallel_launch_global_memory(
    const DriverType* functor) {
  // Only a pointer is passed; the functor itself lives in device global
  // memory (copied there by the host-side launcher).
  (*functor)();
}
137 
template <class DriverType, unsigned int maxTperB, unsigned int minBperSM>
__global__ __launch_bounds__(
    maxTperB,
    minBperSM) static void cuda_parallel_launch_global_memory(const DriverType*
                                                                  functor) {
  // Launch-bounds variant of the global-memory (pointer) launch path.
  (*functor)();
}
145 
146 //==============================================================================
147 // <editor-fold desc="Some helper functions for launch code readability"> {{{1
148 
// A launch is a no-op when the grid has no blocks or the block has no threads.
inline bool is_empty_launch(dim3 const& grid, dim3 const& block) {
  if (grid.x == 0) return true;
  return block.x * block.y * block.z == 0;
}
152 
// Reject shared-memory requests that exceed the per-block capacity of the
// device behind `cuda_instance`; throws a Kokkos runtime exception on failure.
inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
  if (shmem > cuda_instance->m_maxShmemPerBlock) {
    std::string const msg(
        "CudaParallelLaunch (or graph node creation) FAILED: shared"
        " memory request is too large");
    Kokkos::Impl::throw_runtime_exception(msg);
  }
}
160 
161 // This function needs to be template on DriverType and LaunchBounds
162 // so that the static bool is unique for each type combo
163 // KernelFuncPtr does not necessarily contain that type information.
// Set the L1-cache vs. shared-memory split for `func`: prefer shared memory
// when `prefer_shmem` is true, L1 otherwise.  The last-applied preference is
// remembered in a function-local static (unique per DriverType/LaunchBounds
// instantiation, which is why those template parameters exist) so the
// cudaFuncSetCacheConfig call is only reissued when the preference changes.
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
inline void configure_shmem_preference(KernelFuncPtr const& func,
                                       bool prefer_shmem) {
#ifndef KOKKOS_ARCH_KEPLER
  // On Kepler the L1 has no benefit since it doesn't cache reads
  auto set_cache_config = [&] {
    CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
        func,
        (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
    return prefer_shmem;
  };
  // Static initialization applies the configuration exactly once on the first
  // call; subsequent calls only re-apply it when the requested value flips.
  static bool cache_config_preference_cached = set_cache_config();
  if (cache_config_preference_cached != prefer_shmem) {
    cache_config_preference_cached = set_cache_config();
  }
#else
  // Use the parameters so we don't get a warning
  (void)func;
  (void)prefer_shmem;
#endif
}
185 
// Enabled only when the execution policy carries a desired-occupancy hint.
// Computes how many blocks should be resident per SM to hit the requested
// occupancy percentage, then inflates the dynamic shared-memory request so
// that exactly that many blocks fit (assuming the cudaFuncCachePreferL1
// carve-out).  When the inflated request exceeds the kernel's own `shmem`,
// `shmem` is overwritten and `prefer_shmem` cleared so the launch keeps the
// L1-preferring cache configuration this computation assumed.
template <class Policy>
std::enable_if_t<Policy::experimental_contains_desired_occupancy>
modify_launch_configuration_if_desired_occupancy_is_specified(
    Policy const& policy, cudaDeviceProp const& properties,
    cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
    bool& prefer_shmem) {
  int const block_size        = block.x * block.y * block.z;
  int const desired_occupancy = policy.impl_get_desired_occupancy().value();

  size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
  // Shared memory the kernel already uses statically (from its attributes).
  size_t const static_shmem           = attributes.sharedSizeBytes;

  // round to nearest integer and avoid division by zero
  int active_blocks = std::max(
      1, static_cast<int>(std::round(
             static_cast<double>(properties.maxThreadsPerMultiProcessor) /
             block_size * desired_occupancy / 100)));
  // Dynamic shared memory per block that leaves room for exactly
  // `active_blocks` resident blocks on one SM.
  int const dynamic_shmem =
      shmem_per_sm_prefer_l1 / active_blocks - static_shmem;

  if (dynamic_shmem > shmem) {
    shmem        = dynamic_shmem;
    prefer_shmem = false;
  }
}
211 
// No-op overload selected when the policy does not carry a desired-occupancy
// hint; the launch configuration is left untouched.
template <class Policy>
std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
modify_launch_configuration_if_desired_occupancy_is_specified(
    Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
    dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}
217 
218 // </editor-fold> end Some helper functions for launch code readability }}}1
219 //==============================================================================
220 
221 //==============================================================================
222 // <editor-fold desc="DeduceCudaLaunchMechanism"> {{{2
223 
224 // Use local memory up to ConstantMemoryUseThreshold
225 // Use global memory above ConstantMemoryUsage
226 // In between use ConstantMemory
227 
// Compile-time selection of how the functor (DriverType) reaches the device:
//  - LocalMemory:    by value as a kernel argument
//                    (requires sizeof < CudaTraits::KernelArgumentLimit)
//  - ConstantMemory: staged through the __constant__ buffer
//                    (requires sizeof < CudaTraits::ConstantMemoryUsage)
//  - GlobalMemory:   copied into device global memory (always valid)
// The policy's WorkItemProperty hint biases the choice: light-weight work
// prefers LocalMemory, heavy-weight work prefers ConstantMemory.
template <class DriverType>
struct DeduceCudaLaunchMechanism {
  constexpr static const Kokkos::Experimental::WorkItemProperty::
      HintLightWeight_t light_weight =
          Kokkos::Experimental::WorkItemProperty::HintLightWeight;
  constexpr static const Kokkos::Experimental::WorkItemProperty::
      HintHeavyWeight_t heavy_weight =
          Kokkos::Experimental::WorkItemProperty::HintHeavyWeight;
  // Work-item property attached to this driver's execution policy.
  constexpr static const typename DriverType::Policy::work_item_property
      property = typename DriverType::Policy::work_item_property();

  // Mask of mechanisms whose size constraints the functor satisfies.
  static constexpr const Experimental::CudaLaunchMechanism
      valid_launch_mechanism =
          // BuildValidMask
      (sizeof(DriverType) < CudaTraits::KernelArgumentLimit
           ? Experimental::CudaLaunchMechanism::LocalMemory
           : Experimental::CudaLaunchMechanism::Default) |
      (sizeof(DriverType) < CudaTraits::ConstantMemoryUsage
           ? Experimental::CudaLaunchMechanism::ConstantMemory
           : Experimental::CudaLaunchMechanism::Default) |
      Experimental::CudaLaunchMechanism::GlobalMemory;

  // Mechanisms the policy's weight hint asks for (ignoring size limits).
  static constexpr const Experimental::CudaLaunchMechanism
      requested_launch_mechanism =
          (((property & light_weight) == light_weight)
               ? Experimental::CudaLaunchMechanism::LocalMemory
               : Experimental::CudaLaunchMechanism::ConstantMemory) |
          Experimental::CudaLaunchMechanism::GlobalMemory;

  // Size-based fallback used when no weight hint is given.
  static constexpr const Experimental::CudaLaunchMechanism
      default_launch_mechanism =
          // BuildValidMask
      (sizeof(DriverType) < CudaTraits::ConstantMemoryUseThreshold)
          ? Experimental::CudaLaunchMechanism::LocalMemory
          : ((sizeof(DriverType) < CudaTraits::ConstantMemoryUsage)
                 ? Experimental::CudaLaunchMechanism::ConstantMemory
                 : Experimental::CudaLaunchMechanism::GlobalMemory);

  //              None                LightWeight    HeavyWeight
  // F<UseT       LCG LCG L  L        LCG  LG L  L    LCG  CG L  C
  // UseT<F<KAL   LCG LCG C  C        LCG  LG C  L    LCG  CG C  C
  // Kal<F<CMU     CG LCG C  C         CG  LG C  G     CG  CG C  C
  // CMU<F          G LCG G  G          G  LG G  G      G  CG G  G
  // Final choice: weight hint first, then functor size (table above).
  static constexpr const Experimental::CudaLaunchMechanism launch_mechanism =
      ((property & light_weight) == light_weight)
          ? (sizeof(DriverType) < CudaTraits::KernelArgumentLimit
                 ? Experimental::CudaLaunchMechanism::LocalMemory
                 : Experimental::CudaLaunchMechanism::GlobalMemory)
          : (((property & heavy_weight) == heavy_weight)
                 ? (sizeof(DriverType) < CudaTraits::ConstantMemoryUsage
                        ? Experimental::CudaLaunchMechanism::ConstantMemory
                        : Experimental::CudaLaunchMechanism::GlobalMemory)
                 : (default_launch_mechanism));
};
282 
283 // </editor-fold> end DeduceCudaLaunchMechanism }}}2
284 //==============================================================================
285 
286 //==============================================================================
287 // <editor-fold desc="CudaParallelLaunchKernelInvoker"> {{{1
288 
289 // Base classes that summarize the differences between the different launch
290 // mechanisms
291 
292 template <class DriverType, class LaunchBounds,
293           Experimental::CudaLaunchMechanism LaunchMechanism>
294 struct CudaParallelLaunchKernelFunc;
295 
296 template <class DriverType, class LaunchBounds,
297           Experimental::CudaLaunchMechanism LaunchMechanism>
298 struct CudaParallelLaunchKernelInvoker;
299 
300 //------------------------------------------------------------------------------
301 // <editor-fold desc="Local memory"> {{{2
302 
// LocalMemory + explicit launch bounds: hands out the launch-bounds-qualified
// kernel that receives the functor by value as a kernel argument.
template <class DriverType, unsigned int MaxThreadsPerBlock,
          unsigned int MinBlocksPerSM>
struct CudaParallelLaunchKernelFunc<
    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
    Experimental::CudaLaunchMechanism::LocalMemory> {
  // Typed function pointer, usable directly in a <<<...>>> launch.
  static std::decay_t<decltype(cuda_parallel_launch_local_memory<
                               DriverType, MaxThreadsPerBlock, MinBlocksPerSM>)>
  get_kernel_func() {
    return cuda_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
                                             MinBlocksPerSM>;
  }
};
315 
// LocalMemory without launch bounds (LaunchBounds<0, 0>): hands out the
// unbounded by-value kernel.
template <class DriverType>
struct CudaParallelLaunchKernelFunc<
    DriverType, Kokkos::LaunchBounds<0, 0>,
    Experimental::CudaLaunchMechanism::LocalMemory> {
  static std::decay_t<decltype(cuda_parallel_launch_local_memory<DriverType>)>
  get_kernel_func() {
    return cuda_parallel_launch_local_memory<DriverType>;
  }
};
325 
326 //------------------------------------------------------------------------------
327 
// Invoker for the LocalMemory mechanism: the functor travels by value as a
// kernel argument, so no device-side staging is needed.
template <class DriverType, class LaunchBounds>
struct CudaParallelLaunchKernelInvoker<
    DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::LocalMemory>
    : CudaParallelLaunchKernelFunc<
          DriverType, LaunchBounds,
          Experimental::CudaLaunchMechanism::LocalMemory> {
  using base_t = CudaParallelLaunchKernelFunc<
      DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::LocalMemory>;
  static_assert(sizeof(DriverType) < CudaTraits::KernelArgumentLimit,
                "Kokkos Error: Requested CudaLaunchLocalMemory with a Functor "
                "larger than 4096 bytes.");

  // Launch the kernel on the instance's stream, passing the functor by value.
  static void invoke_kernel(DriverType const& driver, dim3 const& grid,
                            dim3 const& block, int shmem,
                            CudaInternal const* cuda_instance) {
    (base_t::
         get_kernel_func())<<<grid, block, shmem, cuda_instance->m_stream>>>(
        driver);
  }

#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
  // Add this launch as a kernel node to the CUDA graph attached to the
  // driver's policy (or an empty node when the launch would do no work).
  // NOTE(review): `params.kernelParams` captures the address of `driver`;
  // this assumes the functor instance outlives graph instantiation — confirm
  // against the GraphNodeImpl ownership described in the GlobalMemory invoker.
  inline static void create_parallel_launch_graph_node(
      DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
      CudaInternal const* cuda_instance, bool prefer_shmem) {
    //----------------------------------------
    auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
    KOKKOS_EXPECTS(bool(graph));
    auto& graph_node = Impl::get_cuda_graph_node_from_kernel(driver);
    // Expect node not yet initialized
    KOKKOS_EXPECTS(!bool(graph_node));

    if (!Impl::is_empty_launch(grid, block)) {
      Impl::check_shmem_request(cuda_instance, shmem);
      Impl::configure_shmem_preference<DriverType, LaunchBounds>(
          base_t::get_kernel_func(), prefer_shmem);

      void const* args[] = {&driver};

      cudaKernelNodeParams params = {};

      params.blockDim       = block;
      params.gridDim        = grid;
      params.sharedMemBytes = shmem;
      params.func           = (void*)base_t::get_kernel_func();
      params.kernelParams   = (void**)args;
      params.extra          = nullptr;

      CUDA_SAFE_CALL(cudaGraphAddKernelNode(
          &graph_node, graph, /* dependencies = */ nullptr,
          /* numDependencies = */ 0, &params));
    } else {
      // We still need an empty node for the dependency structure
      CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph,
                                           /* dependencies = */ nullptr,
                                           /* numDependencies = */ 0));
    }
    KOKKOS_ENSURES(bool(graph_node))
  }
#endif
};
388 
389 // </editor-fold> end local memory }}}2
390 //------------------------------------------------------------------------------
391 
392 //------------------------------------------------------------------------------
393 // <editor-fold desc="Global Memory"> {{{2
394 
// GlobalMemory + explicit launch bounds: hands out the launch-bounds-qualified
// kernel that reads the functor through a pointer to device global memory.
template <class DriverType, unsigned int MaxThreadsPerBlock,
          unsigned int MinBlocksPerSM>
struct CudaParallelLaunchKernelFunc<
    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
    Experimental::CudaLaunchMechanism::GlobalMemory> {
  // Return a typed function pointer — not void* — so the result is directly
  // usable in the <<<...>>> launch performed by the shared invoker, matching
  // every other CudaParallelLaunchKernelFunc specialization.  The graph-node
  // path casts it to void* where cudaKernelNodeParams requires it.
  static std::decay_t<decltype(cuda_parallel_launch_global_memory<
                               DriverType, MaxThreadsPerBlock, MinBlocksPerSM>)>
  get_kernel_func() {
    return cuda_parallel_launch_global_memory<DriverType, MaxThreadsPerBlock,
                                              MinBlocksPerSM>;
  }
};
405 
// GlobalMemory without launch bounds (LaunchBounds<0, 0>): hands out the
// unbounded kernel that reads the functor through a global-memory pointer.
template <class DriverType>
struct CudaParallelLaunchKernelFunc<
    DriverType, Kokkos::LaunchBounds<0, 0>,
    Experimental::CudaLaunchMechanism::GlobalMemory> {
  static std::decay_t<decltype(cuda_parallel_launch_global_memory<DriverType>)>
  get_kernel_func() {
    return cuda_parallel_launch_global_memory<DriverType>;
  }
};
415 
416 //------------------------------------------------------------------------------
417 
// Invoker for the GlobalMemory mechanism: the functor is copied into device
// global memory and the kernel receives only a pointer, so there is no size
// limit on DriverType.
template <class DriverType, class LaunchBounds>
struct CudaParallelLaunchKernelInvoker<
    DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::GlobalMemory>
    : CudaParallelLaunchKernelFunc<
          DriverType, LaunchBounds,
          Experimental::CudaLaunchMechanism::GlobalMemory> {
  using base_t = CudaParallelLaunchKernelFunc<
      DriverType, LaunchBounds,
      Experimental::CudaLaunchMechanism::GlobalMemory>;

  // Copy the functor into the instance's scratch space on the instance's
  // stream, then launch with a pointer to that copy.  Copy and launch are
  // issued on the same stream, so the kernel observes the completed copy.
  static void invoke_kernel(DriverType const& driver, dim3 const& grid,
                            dim3 const& block, int shmem,
                            CudaInternal const* cuda_instance) {
    DriverType* driver_ptr = reinterpret_cast<DriverType*>(
        cuda_instance->scratch_functor(sizeof(DriverType)));

    // Check the copy's result like every other CUDA call in this file; a
    // silently failed copy would launch the kernel on garbage functor data.
    CUDA_SAFE_CALL(cudaMemcpyAsync(driver_ptr, &driver, sizeof(DriverType),
                                   cudaMemcpyDefault,
                                   cuda_instance->m_stream));
    (base_t::
         get_kernel_func())<<<grid, block, shmem, cuda_instance->m_stream>>>(
        driver_ptr);
  }

#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
  // Add this launch as a kernel node to the CUDA graph attached to the
  // driver's policy (or an empty node when the launch would do no work).
  inline static void create_parallel_launch_graph_node(
      DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
      CudaInternal const* cuda_instance, bool prefer_shmem) {
    //----------------------------------------
    auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
    KOKKOS_EXPECTS(bool(graph));
    auto& graph_node = Impl::get_cuda_graph_node_from_kernel(driver);
    // Expect node not yet initialized
    KOKKOS_EXPECTS(!bool(graph_node));

    if (!Impl::is_empty_launch(grid, block)) {
      Impl::check_shmem_request(cuda_instance, shmem);
      Impl::configure_shmem_preference<DriverType, LaunchBounds>(
          base_t::get_kernel_func(), prefer_shmem);

      auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);

      // Unlike in the non-graph case, we can get away with doing an async copy
      // here because the `DriverType` instance is held in the GraphNodeImpl
      // which is guaranteed to be alive until the graph instance itself is
      // destroyed, where there should be a fence ensuring that the allocation
      // associated with this kernel on the device side isn't deleted.
      // (Error-checked like every other CUDA call in this file.)
      CUDA_SAFE_CALL(cudaMemcpyAsync(driver_ptr, &driver, sizeof(DriverType),
                                     cudaMemcpyDefault,
                                     cuda_instance->m_stream));

      void const* args[] = {&driver_ptr};

      cudaKernelNodeParams params = {};

      params.blockDim       = block;
      params.gridDim        = grid;
      params.sharedMemBytes = shmem;
      params.func           = (void*)base_t::get_kernel_func();
      params.kernelParams   = (void**)args;
      params.extra          = nullptr;

      CUDA_SAFE_CALL(cudaGraphAddKernelNode(
          &graph_node, graph, /* dependencies = */ nullptr,
          /* numDependencies = */ 0, &params));
    } else {
      // We still need an empty node for the dependency structure
      CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph,
                                           /* dependencies = */ nullptr,
                                           /* numDependencies = */ 0));
    }
    KOKKOS_ENSURES(bool(graph_node))
  }
#endif
};
491 
492 // </editor-fold> end Global Memory }}}2
493 //------------------------------------------------------------------------------
494 
495 //------------------------------------------------------------------------------
496 // <editor-fold desc="Constant Memory"> {{{2
497 
// ConstantMemory + explicit launch bounds: hands out the launch-bounds-
// qualified kernel that reads the functor from the __constant__ buffer.
template <class DriverType, unsigned int MaxThreadsPerBlock,
          unsigned int MinBlocksPerSM>
struct CudaParallelLaunchKernelFunc<
    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
    Experimental::CudaLaunchMechanism::ConstantMemory> {
  // Typed function pointer, usable directly in a <<<...>>> launch.
  static std::decay_t<decltype(cuda_parallel_launch_constant_memory<
                               DriverType, MaxThreadsPerBlock, MinBlocksPerSM>)>
  get_kernel_func() {
    return cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
                                                MinBlocksPerSM>;
  }
};
510 
// ConstantMemory without launch bounds (LaunchBounds<0, 0>): hands out the
// unbounded constant-memory kernel.
template <class DriverType>
struct CudaParallelLaunchKernelFunc<
    DriverType, Kokkos::LaunchBounds<0, 0>,
    Experimental::CudaLaunchMechanism::ConstantMemory> {
  static std::decay_t<
      decltype(cuda_parallel_launch_constant_memory<DriverType>)>
  get_kernel_func() {
    return cuda_parallel_launch_constant_memory<DriverType>;
  }
};
521 
522 //------------------------------------------------------------------------------
523 
// Invoker for the ConstantMemory mechanism: the functor is staged through a
// pinned host buffer into the device __constant__ buffer.  An event guards
// reuse of the (single, shared) constant buffer between launches.
template <class DriverType, class LaunchBounds>
struct CudaParallelLaunchKernelInvoker<
    DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::ConstantMemory>
    : CudaParallelLaunchKernelFunc<
          DriverType, LaunchBounds,
          Experimental::CudaLaunchMechanism::ConstantMemory> {
  using base_t = CudaParallelLaunchKernelFunc<
      DriverType, LaunchBounds,
      Experimental::CudaLaunchMechanism::ConstantMemory>;
  static_assert(sizeof(DriverType) < CudaTraits::ConstantMemoryUsage,
                "Kokkos Error: Requested CudaLaunchConstantMemory with a "
                "Functor larger than 32kB.");

  // Stage the functor into constant memory, launch, and record an event that
  // marks when the constant buffer may be overwritten again.
  static void invoke_kernel(DriverType const& driver, dim3 const& grid,
                            dim3 const& block, int shmem,
                            CudaInternal const* cuda_instance) {
    // Wait until the previous kernel that uses the constant buffer is done
    CUDA_SAFE_CALL(cudaEventSynchronize(cuda_instance->constantMemReusable));

    // Copy functor (synchronously) to staging buffer in pinned host memory
    unsigned long* staging = cuda_instance->constantMemHostStaging;
    memcpy(staging, &driver, sizeof(DriverType));

    // Copy functor asynchronously from there to constant memory on the device
    // (error-checked like every other CUDA call in this file)
    CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(
        kokkos_impl_cuda_constant_memory_buffer, staging, sizeof(DriverType),
        0, cudaMemcpyHostToDevice, cudaStream_t(cuda_instance->m_stream)));

    // Invoke the driver function on the device
    (base_t::
         get_kernel_func())<<<grid, block, shmem, cuda_instance->m_stream>>>();

    // Record an event that says when the constant buffer can be reused
    CUDA_SAFE_CALL(cudaEventRecord(cuda_instance->constantMemReusable,
                                   cudaStream_t(cuda_instance->m_stream)));
  }

#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
  inline static void create_parallel_launch_graph_node(
      DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
      CudaInternal const* cuda_instance, bool prefer_shmem) {
    // Just use global memory; coordinating through events to share constant
    // memory with the non-graph interface is not really reasonable since
    // events don't work with Graphs directly, and this would anyway require
    // a much more complicated structure that finds previous nodes in the
    // dependency structure of the graph and creates an implicit dependence
    // based on the need for constant memory (which we would then have to
    // somehow go and prove was not creating a dependency cycle, and I don't
    // even know if there's an efficient way to do that, let alone in the
    // structure we currently have).
    using global_launch_impl_t = CudaParallelLaunchKernelInvoker<
        DriverType, LaunchBounds,
        Experimental::CudaLaunchMechanism::GlobalMemory>;
    global_launch_impl_t::create_parallel_launch_graph_node(
        driver, grid, block, shmem, cuda_instance, prefer_shmem);
  }
#endif
};
582 
583 // </editor-fold> end Constant Memory }}}2
584 //------------------------------------------------------------------------------
585 
586 // </editor-fold> end CudaParallelLaunchKernelInvoker }}}1
587 //==============================================================================
588 
589 //==============================================================================
590 // <editor-fold desc="CudaParallelLaunchImpl"> {{{1
591 
592 template <class DriverType, class LaunchBounds,
593           Experimental::CudaLaunchMechanism LaunchMechanism>
594 struct CudaParallelLaunchImpl;
595 
// Orchestrates a launch for one DriverType/LaunchBounds/mechanism combo:
// validates the shared-memory request, applies occupancy and cache-config
// adjustments, then delegates the actual invocation to the mechanism-specific
// CudaParallelLaunchKernelInvoker base.
template <class DriverType, unsigned int MaxThreadsPerBlock,
          unsigned int MinBlocksPerSM,
          Experimental::CudaLaunchMechanism LaunchMechanism>
struct CudaParallelLaunchImpl<
    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
    LaunchMechanism>
    : CudaParallelLaunchKernelInvoker<
          DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
          LaunchMechanism> {
  using base_t = CudaParallelLaunchKernelInvoker<
      DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
      LaunchMechanism>;

  // Validate, configure, and launch; a no-op when grid or block is empty.
  inline static void launch_kernel(const DriverType& driver, const dim3& grid,
                                   const dim3& block, int shmem,
                                   const CudaInternal* cuda_instance,
                                   bool prefer_shmem) {
    if (!Impl::is_empty_launch(grid, block)) {
      // Prevent multiple threads to simultaneously set the cache configuration
      // preference and launch the same kernel
      static std::mutex mutex;
      std::lock_guard<std::mutex> lock(mutex);

      Impl::check_shmem_request(cuda_instance, shmem);

      // If a desired occupancy is specified, we compute how much shared memory
      // to ask for to achieve that occupancy, assuming that the cache
      // configuration is `cudaFuncCachePreferL1`.  If the amount of dynamic
      // shared memory computed is actually smaller than `shmem` we overwrite
      // `shmem` and set `prefer_shmem` to `false`.
      modify_launch_configuration_if_desired_occupancy_is_specified(
          driver.get_policy(), cuda_instance->m_deviceProp,
          get_cuda_func_attributes(), block, shmem, prefer_shmem);

      Impl::configure_shmem_preference<
          DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
          base_t::get_kernel_func(), prefer_shmem);

      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

      // Invoke the driver function on the device
      base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);

#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
      // Debug builds: surface launch-configuration errors immediately and
      // fence so device-side failures are reported at this call site.
      CUDA_SAFE_CALL(cudaGetLastError());
      cuda_instance->fence();
#endif
    }
  }

  // Returns the (cached) cudaFuncAttributes of this combo's kernel.
  static cudaFuncAttributes get_cuda_func_attributes() {
    // Race condition inside of cudaFuncGetAttributes if the same address is
    // given requires using a local variable as input instead of a static Rely
    // on static variable initialization to make sure only one thread executes
    // the code and the result is visible.
    auto wrap_get_attributes = []() -> cudaFuncAttributes {
      cudaFuncAttributes attr_tmp;
      CUDA_SAFE_CALL(
          cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
      return attr_tmp;
    };
    static cudaFuncAttributes attr = wrap_get_attributes();
    return attr;
  }
};
661 
662 // </editor-fold> end CudaParallelLaunchImpl }}}1
663 //==============================================================================
664 
665 //==============================================================================
666 // <editor-fold desc="CudaParallelLaunch"> {{{1
667 
668 template <class DriverType, class LaunchBounds = Kokkos::LaunchBounds<>,
669           Experimental::CudaLaunchMechanism LaunchMechanism =
670               DeduceCudaLaunchMechanism<DriverType>::launch_mechanism,
671           bool DoGraph = DriverType::Policy::is_graph_kernel::value
672 #ifndef KOKKOS_CUDA_ENABLE_GRAPHS
673                          && false
674 #endif
675           >
676 struct CudaParallelLaunch;
677 
678 // General launch mechanism
template <class DriverType, class LaunchBounds,
          Experimental::CudaLaunchMechanism LaunchMechanism>
struct CudaParallelLaunch<DriverType, LaunchBounds, LaunchMechanism,
                          /* DoGraph = */ false>
    : CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism> {
  using base_t =
      CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism>;
  // Constructing this object performs the launch: all arguments are
  // perfect-forwarded to base_t::launch_kernel.
  template <class... Args>
  CudaParallelLaunch(Args&&... args) {
    base_t::launch_kernel(static_cast<Args&&>(args)...);
  }
};
691 
692 #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
693 // Launch mechanism for creating graph nodes
template <class DriverType, class LaunchBounds,
          Experimental::CudaLaunchMechanism LaunchMechanism>
struct CudaParallelLaunch<DriverType, LaunchBounds, LaunchMechanism,
                          /* DoGraph = */ true>
    : CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism> {
  using base_t =
      CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism>;
  // Constructing this object records a graph node instead of launching:
  // all arguments are perfect-forwarded to the graph-node factory.
  template <class... Args>
  CudaParallelLaunch(Args&&... args) {
    base_t::create_parallel_launch_graph_node(static_cast<Args&&>(args)...);
  }
};
706 #endif
707 
708 // </editor-fold> end CudaParallelLaunch }}}1
709 //==============================================================================
710 
711 }  // namespace Impl
712 }  // namespace Kokkos
713 
714 //----------------------------------------------------------------------------
715 //----------------------------------------------------------------------------
716 
717 #endif /* defined( KOKKOS_ENABLE_CUDA ) */
718 #endif /* #ifndef KOKKOS_CUDAEXEC_HPP */
719