1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos v. 3.0
6 // Copyright (2020) National Technology & Engineering
7 // Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44
45 #ifndef KOKKOS_CUDAEXEC_HPP
46 #define KOKKOS_CUDAEXEC_HPP
47
48 #include <Kokkos_Macros.hpp>
49 #ifdef KOKKOS_ENABLE_CUDA
50
51 #include <mutex>
52 #include <string>
53 #include <cstdint>
54 #include <cmath>
55 #include <Kokkos_Parallel.hpp>
56 #include <impl/Kokkos_Error.hpp>
57 #include <Cuda/Kokkos_Cuda_abort.hpp>
58 #include <Cuda/Kokkos_Cuda_Error.hpp>
59 #include <Cuda/Kokkos_Cuda_Locks.hpp>
60 #include <Cuda/Kokkos_Cuda_Instance.hpp>
61 #include <impl/Kokkos_GraphImpl_fwd.hpp>
62 #include <Cuda/Kokkos_Cuda_GraphNodeKernel.hpp>
63 #include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
64
65 //----------------------------------------------------------------------------
66 //----------------------------------------------------------------------------
67
68 /** \brief Access to constant memory on the device */
69 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
70
71 __device__ __constant__ extern unsigned long
72 kokkos_impl_cuda_constant_memory_buffer[];
73
74 #else
75
76 __device__ __constant__ unsigned long kokkos_impl_cuda_constant_memory_buffer
77 [Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long)];
78
79 #endif
80
81 template <typename T>
kokkos_impl_cuda_shared_memory()82 inline __device__ T* kokkos_impl_cuda_shared_memory() {
83 extern __shared__ Kokkos::CudaSpace::size_type sh[];
84 return (T*)sh;
85 }
86
87 namespace Kokkos {
88 namespace Impl {
89
90 //----------------------------------------------------------------------------
91 // See section B.17 of Cuda C Programming Guide Version 3.2
92 // for discussion of
93 // __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
94 // function qualifier which could be used to improve performance.
95 //----------------------------------------------------------------------------
96 // Maximize L1 cache and minimize shared memory:
97 // cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
98 // For 2.0 capability: 48 KB L1 and 16 KB shared
99 //----------------------------------------------------------------------------
100
101 template <class DriverType>
cuda_parallel_launch_constant_memory()102 __global__ static void cuda_parallel_launch_constant_memory() {
103 const DriverType& driver =
104 *((const DriverType*)kokkos_impl_cuda_constant_memory_buffer);
105
106 driver();
107 }
108
109 template <class DriverType, unsigned int maxTperB, unsigned int minBperSM>
__launch_bounds__(maxTperB,minBperSM)110 __global__ __launch_bounds__(
111 maxTperB, minBperSM) static void cuda_parallel_launch_constant_memory() {
112 const DriverType& driver =
113 *((const DriverType*)kokkos_impl_cuda_constant_memory_buffer);
114
115 driver();
116 }
117
118 template <class DriverType>
cuda_parallel_launch_local_memory(const DriverType driver)119 __global__ static void cuda_parallel_launch_local_memory(
120 const DriverType driver) {
121 driver();
122 }
123
124 template <class DriverType, unsigned int maxTperB, unsigned int minBperSM>
__launch_bounds__(maxTperB,minBperSM)125 __global__ __launch_bounds__(
126 maxTperB,
127 minBperSM) static void cuda_parallel_launch_local_memory(const DriverType
128 driver) {
129 driver();
130 }
131
132 template <class DriverType>
cuda_parallel_launch_global_memory(const DriverType * driver)133 __global__ static void cuda_parallel_launch_global_memory(
134 const DriverType* driver) {
135 driver->operator()();
136 }
137
138 template <class DriverType, unsigned int maxTperB, unsigned int minBperSM>
__launch_bounds__(maxTperB,minBperSM)139 __global__ __launch_bounds__(
140 maxTperB,
141 minBperSM) static void cuda_parallel_launch_global_memory(const DriverType*
142 driver) {
143 driver->operator()();
144 }
145
146 //==============================================================================
147 // <editor-fold desc="Some helper functions for launch code readability"> {{{1
148
is_empty_launch(dim3 const & grid,dim3 const & block)149 inline bool is_empty_launch(dim3 const& grid, dim3 const& block) {
150 return (grid.x == 0) || ((block.x * block.y * block.z) == 0);
151 }
152
check_shmem_request(CudaInternal const * cuda_instance,int shmem)153 inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
154 if (cuda_instance->m_maxShmemPerBlock < shmem) {
155 Kokkos::Impl::throw_runtime_exception(
156 std::string("CudaParallelLaunch (or graph node creation) FAILED: shared"
157 " memory request is too large"));
158 }
159 }
160
161 // This function needs to be template on DriverType and LaunchBounds
162 // so that the static bool is unique for each type combo
163 // KernelFuncPtr does not necessarily contain that type information.
164 template <class DriverType, class LaunchBounds, class KernelFuncPtr>
configure_shmem_preference(KernelFuncPtr const & func,bool prefer_shmem)165 inline void configure_shmem_preference(KernelFuncPtr const& func,
166 bool prefer_shmem) {
167 #ifndef KOKKOS_ARCH_KEPLER
168 // On Kepler the L1 has no benefit since it doesn't cache reads
169 auto set_cache_config = [&] {
170 CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
171 func,
172 (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
173 return prefer_shmem;
174 };
175 static bool cache_config_preference_cached = set_cache_config();
176 if (cache_config_preference_cached != prefer_shmem) {
177 cache_config_preference_cached = set_cache_config();
178 }
179 #else
180 // Use the parameters so we don't get a warning
181 (void)func;
182 (void)prefer_shmem;
183 #endif
184 }
185
186 template <class Policy>
187 std::enable_if_t<Policy::experimental_contains_desired_occupancy>
modify_launch_configuration_if_desired_occupancy_is_specified(Policy const & policy,cudaDeviceProp const & properties,cudaFuncAttributes const & attributes,dim3 const & block,int & shmem,bool & prefer_shmem)188 modify_launch_configuration_if_desired_occupancy_is_specified(
189 Policy const& policy, cudaDeviceProp const& properties,
190 cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
191 bool& prefer_shmem) {
192 int const block_size = block.x * block.y * block.z;
193 int const desired_occupancy = policy.impl_get_desired_occupancy().value();
194
195 size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
196 size_t const static_shmem = attributes.sharedSizeBytes;
197
198 // round to nearest integer and avoid division by zero
199 int active_blocks = std::max(
200 1, static_cast<int>(std::round(
201 static_cast<double>(properties.maxThreadsPerMultiProcessor) /
202 block_size * desired_occupancy / 100)));
203 int const dynamic_shmem =
204 shmem_per_sm_prefer_l1 / active_blocks - static_shmem;
205
206 if (dynamic_shmem > shmem) {
207 shmem = dynamic_shmem;
208 prefer_shmem = false;
209 }
210 }
211
212 template <class Policy>
213 std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
modify_launch_configuration_if_desired_occupancy_is_specified(Policy const &,cudaDeviceProp const &,cudaFuncAttributes const &,dim3 const &,int &,bool &)214 modify_launch_configuration_if_desired_occupancy_is_specified(
215 Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
216 dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}
217
218 // </editor-fold> end Some helper functions for launch code readability }}}1
219 //==============================================================================
220
221 //==============================================================================
222 // <editor-fold desc="DeduceCudaLaunchMechanism"> {{{2
223
224 // Use local memory up to ConstantMemoryUseThreshold
225 // Use global memory above ConstantMemoryUsage
226 // In between use ConstantMemory
227
228 template <class DriverType>
229 struct DeduceCudaLaunchMechanism {
230 constexpr static const Kokkos::Experimental::WorkItemProperty::
231 HintLightWeight_t light_weight =
232 Kokkos::Experimental::WorkItemProperty::HintLightWeight;
233 constexpr static const Kokkos::Experimental::WorkItemProperty::
234 HintHeavyWeight_t heavy_weight =
235 Kokkos::Experimental::WorkItemProperty::HintHeavyWeight;
236 constexpr static const typename DriverType::Policy::work_item_property
237 property = typename DriverType::Policy::work_item_property();
238
239 static constexpr const Experimental::CudaLaunchMechanism
240 valid_launch_mechanism =
241 // BuildValidMask
242 (sizeof(DriverType) < CudaTraits::KernelArgumentLimit
243 ? Experimental::CudaLaunchMechanism::LocalMemory
244 : Experimental::CudaLaunchMechanism::Default) |
245 (sizeof(DriverType) < CudaTraits::ConstantMemoryUsage
246 ? Experimental::CudaLaunchMechanism::ConstantMemory
247 : Experimental::CudaLaunchMechanism::Default) |
248 Experimental::CudaLaunchMechanism::GlobalMemory;
249
250 static constexpr const Experimental::CudaLaunchMechanism
251 requested_launch_mechanism =
252 (((property & light_weight) == light_weight)
253 ? Experimental::CudaLaunchMechanism::LocalMemory
254 : Experimental::CudaLaunchMechanism::ConstantMemory) |
255 Experimental::CudaLaunchMechanism::GlobalMemory;
256
257 static constexpr const Experimental::CudaLaunchMechanism
258 default_launch_mechanism =
259 // BuildValidMask
260 (sizeof(DriverType) < CudaTraits::ConstantMemoryUseThreshold)
261 ? Experimental::CudaLaunchMechanism::LocalMemory
262 : ((sizeof(DriverType) < CudaTraits::ConstantMemoryUsage)
263 ? Experimental::CudaLaunchMechanism::ConstantMemory
264 : Experimental::CudaLaunchMechanism::GlobalMemory);
265
266 // None LightWeight HeavyWeight
267 // F<UseT LCG LCG L L LCG LG L L LCG CG L C
268 // UseT<F<KAL LCG LCG C C LCG LG C L LCG CG C C
269 // Kal<F<CMU CG LCG C C CG LG C G CG CG C C
270 // CMU<F G LCG G G G LG G G G CG G G
271 static constexpr const Experimental::CudaLaunchMechanism launch_mechanism =
272 ((property & light_weight) == light_weight)
273 ? (sizeof(DriverType) < CudaTraits::KernelArgumentLimit
274 ? Experimental::CudaLaunchMechanism::LocalMemory
275 : Experimental::CudaLaunchMechanism::GlobalMemory)
276 : (((property & heavy_weight) == heavy_weight)
277 ? (sizeof(DriverType) < CudaTraits::ConstantMemoryUsage
278 ? Experimental::CudaLaunchMechanism::ConstantMemory
279 : Experimental::CudaLaunchMechanism::GlobalMemory)
280 : (default_launch_mechanism));
281 };
282
283 // </editor-fold> end DeduceCudaLaunchMechanism }}}2
284 //==============================================================================
285
286 //==============================================================================
287 // <editor-fold desc="CudaParallelLaunchKernelInvoker"> {{{1
288
289 // Base classes that summarize the differences between the different launch
290 // mechanisms
291
292 template <class DriverType, class LaunchBounds,
293 Experimental::CudaLaunchMechanism LaunchMechanism>
294 struct CudaParallelLaunchKernelFunc;
295
296 template <class DriverType, class LaunchBounds,
297 Experimental::CudaLaunchMechanism LaunchMechanism>
298 struct CudaParallelLaunchKernelInvoker;
299
300 //------------------------------------------------------------------------------
301 // <editor-fold desc="Local memory"> {{{2
302
303 template <class DriverType, unsigned int MaxThreadsPerBlock,
304 unsigned int MinBlocksPerSM>
305 struct CudaParallelLaunchKernelFunc<
306 DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
307 Experimental::CudaLaunchMechanism::LocalMemory> {
308 static std::decay_t<decltype(cuda_parallel_launch_local_memory<
309 DriverType, MaxThreadsPerBlock, MinBlocksPerSM>)>
get_kernel_funcKokkos::Impl::CudaParallelLaunchKernelFunc310 get_kernel_func() {
311 return cuda_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
312 MinBlocksPerSM>;
313 }
314 };
315
316 template <class DriverType>
317 struct CudaParallelLaunchKernelFunc<
318 DriverType, Kokkos::LaunchBounds<0, 0>,
319 Experimental::CudaLaunchMechanism::LocalMemory> {
320 static std::decay_t<decltype(cuda_parallel_launch_local_memory<DriverType>)>
get_kernel_funcKokkos::Impl::CudaParallelLaunchKernelFunc321 get_kernel_func() {
322 return cuda_parallel_launch_local_memory<DriverType>;
323 }
324 };
325
326 //------------------------------------------------------------------------------
327
328 template <class DriverType, class LaunchBounds>
329 struct CudaParallelLaunchKernelInvoker<
330 DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::LocalMemory>
331 : CudaParallelLaunchKernelFunc<
332 DriverType, LaunchBounds,
333 Experimental::CudaLaunchMechanism::LocalMemory> {
334 using base_t = CudaParallelLaunchKernelFunc<
335 DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::LocalMemory>;
336 static_assert(sizeof(DriverType) < CudaTraits::KernelArgumentLimit,
337 "Kokkos Error: Requested CudaLaunchLocalMemory with a Functor "
338 "larger than 4096 bytes.");
339
invoke_kernelKokkos::Impl::CudaParallelLaunchKernelInvoker340 static void invoke_kernel(DriverType const& driver, dim3 const& grid,
341 dim3 const& block, int shmem,
342 CudaInternal const* cuda_instance) {
343 (base_t::
344 get_kernel_func())<<<grid, block, shmem, cuda_instance->m_stream>>>(
345 driver);
346 }
347
348 #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
create_parallel_launch_graph_nodeKokkos::Impl::CudaParallelLaunchKernelInvoker349 inline static void create_parallel_launch_graph_node(
350 DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
351 CudaInternal const* cuda_instance, bool prefer_shmem) {
352 //----------------------------------------
353 auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
354 KOKKOS_EXPECTS(bool(graph));
355 auto& graph_node = Impl::get_cuda_graph_node_from_kernel(driver);
356 // Expect node not yet initialized
357 KOKKOS_EXPECTS(!bool(graph_node));
358
359 if (!Impl::is_empty_launch(grid, block)) {
360 Impl::check_shmem_request(cuda_instance, shmem);
361 Impl::configure_shmem_preference<DriverType, LaunchBounds>(
362 base_t::get_kernel_func(), prefer_shmem);
363
364 void const* args[] = {&driver};
365
366 cudaKernelNodeParams params = {};
367
368 params.blockDim = block;
369 params.gridDim = grid;
370 params.sharedMemBytes = shmem;
371 params.func = (void*)base_t::get_kernel_func();
372 params.kernelParams = (void**)args;
373 params.extra = nullptr;
374
375 CUDA_SAFE_CALL(cudaGraphAddKernelNode(
376 &graph_node, graph, /* dependencies = */ nullptr,
377 /* numDependencies = */ 0, ¶ms));
378 } else {
379 // We still need an empty node for the dependency structure
380 CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph,
381 /* dependencies = */ nullptr,
382 /* numDependencies = */ 0));
383 }
384 KOKKOS_ENSURES(bool(graph_node))
385 }
386 #endif
387 };
388
389 // </editor-fold> end local memory }}}2
390 //------------------------------------------------------------------------------
391
392 //------------------------------------------------------------------------------
393 // <editor-fold desc="Global Memory"> {{{2
394
395 template <class DriverType, unsigned int MaxThreadsPerBlock,
396 unsigned int MinBlocksPerSM>
397 struct CudaParallelLaunchKernelFunc<
398 DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
399 Experimental::CudaLaunchMechanism::GlobalMemory> {
get_kernel_funcKokkos::Impl::CudaParallelLaunchKernelFunc400 static void* get_kernel_func() {
401 return cuda_parallel_launch_global_memory<DriverType, MaxThreadsPerBlock,
402 MinBlocksPerSM>;
403 }
404 };
405
406 template <class DriverType>
407 struct CudaParallelLaunchKernelFunc<
408 DriverType, Kokkos::LaunchBounds<0, 0>,
409 Experimental::CudaLaunchMechanism::GlobalMemory> {
410 static std::decay_t<decltype(cuda_parallel_launch_global_memory<DriverType>)>
get_kernel_funcKokkos::Impl::CudaParallelLaunchKernelFunc411 get_kernel_func() {
412 return cuda_parallel_launch_global_memory<DriverType>;
413 }
414 };
415
416 //------------------------------------------------------------------------------
417
418 template <class DriverType, class LaunchBounds>
419 struct CudaParallelLaunchKernelInvoker<
420 DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::GlobalMemory>
421 : CudaParallelLaunchKernelFunc<
422 DriverType, LaunchBounds,
423 Experimental::CudaLaunchMechanism::GlobalMemory> {
424 using base_t = CudaParallelLaunchKernelFunc<
425 DriverType, LaunchBounds,
426 Experimental::CudaLaunchMechanism::GlobalMemory>;
427
invoke_kernelKokkos::Impl::CudaParallelLaunchKernelInvoker428 static void invoke_kernel(DriverType const& driver, dim3 const& grid,
429 dim3 const& block, int shmem,
430 CudaInternal const* cuda_instance) {
431 DriverType* driver_ptr = reinterpret_cast<DriverType*>(
432 cuda_instance->scratch_functor(sizeof(DriverType)));
433
434 cudaMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), cudaMemcpyDefault,
435 cuda_instance->m_stream);
436 (base_t::
437 get_kernel_func())<<<grid, block, shmem, cuda_instance->m_stream>>>(
438 driver_ptr);
439 }
440
441 #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
create_parallel_launch_graph_nodeKokkos::Impl::CudaParallelLaunchKernelInvoker442 inline static void create_parallel_launch_graph_node(
443 DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
444 CudaInternal const* cuda_instance, bool prefer_shmem) {
445 //----------------------------------------
446 auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
447 KOKKOS_EXPECTS(bool(graph));
448 auto& graph_node = Impl::get_cuda_graph_node_from_kernel(driver);
449 // Expect node not yet initialized
450 KOKKOS_EXPECTS(!bool(graph_node));
451
452 if (!Impl::is_empty_launch(grid, block)) {
453 Impl::check_shmem_request(cuda_instance, shmem);
454 Impl::configure_shmem_preference<DriverType, LaunchBounds>(
455 base_t::get_kernel_func(), prefer_shmem);
456
457 auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
458
459 // Unlike in the non-graph case, we can get away with doing an async copy
460 // here because the `DriverType` instance is held in the GraphNodeImpl
461 // which is guaranteed to be alive until the graph instance itself is
462 // destroyed, where there should be a fence ensuring that the allocation
463 // associated with this kernel on the device side isn't deleted.
464 cudaMemcpyAsync(driver_ptr, &driver, sizeof(DriverType),
465 cudaMemcpyDefault, cuda_instance->m_stream);
466
467 void const* args[] = {&driver_ptr};
468
469 cudaKernelNodeParams params = {};
470
471 params.blockDim = block;
472 params.gridDim = grid;
473 params.sharedMemBytes = shmem;
474 params.func = (void*)base_t::get_kernel_func();
475 params.kernelParams = (void**)args;
476 params.extra = nullptr;
477
478 CUDA_SAFE_CALL(cudaGraphAddKernelNode(
479 &graph_node, graph, /* dependencies = */ nullptr,
480 /* numDependencies = */ 0, ¶ms));
481 } else {
482 // We still need an empty node for the dependency structure
483 CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph,
484 /* dependencies = */ nullptr,
485 /* numDependencies = */ 0));
486 }
487 KOKKOS_ENSURES(bool(graph_node))
488 }
489 #endif
490 };
491
492 // </editor-fold> end Global Memory }}}2
493 //------------------------------------------------------------------------------
494
495 //------------------------------------------------------------------------------
496 // <editor-fold desc="Constant Memory"> {{{2
497
498 template <class DriverType, unsigned int MaxThreadsPerBlock,
499 unsigned int MinBlocksPerSM>
500 struct CudaParallelLaunchKernelFunc<
501 DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
502 Experimental::CudaLaunchMechanism::ConstantMemory> {
503 static std::decay_t<decltype(cuda_parallel_launch_constant_memory<
504 DriverType, MaxThreadsPerBlock, MinBlocksPerSM>)>
get_kernel_funcKokkos::Impl::CudaParallelLaunchKernelFunc505 get_kernel_func() {
506 return cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
507 MinBlocksPerSM>;
508 }
509 };
510
511 template <class DriverType>
512 struct CudaParallelLaunchKernelFunc<
513 DriverType, Kokkos::LaunchBounds<0, 0>,
514 Experimental::CudaLaunchMechanism::ConstantMemory> {
515 static std::decay_t<
516 decltype(cuda_parallel_launch_constant_memory<DriverType>)>
get_kernel_funcKokkos::Impl::CudaParallelLaunchKernelFunc517 get_kernel_func() {
518 return cuda_parallel_launch_constant_memory<DriverType>;
519 }
520 };
521
522 //------------------------------------------------------------------------------
523
524 template <class DriverType, class LaunchBounds>
525 struct CudaParallelLaunchKernelInvoker<
526 DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::ConstantMemory>
527 : CudaParallelLaunchKernelFunc<
528 DriverType, LaunchBounds,
529 Experimental::CudaLaunchMechanism::ConstantMemory> {
530 using base_t = CudaParallelLaunchKernelFunc<
531 DriverType, LaunchBounds,
532 Experimental::CudaLaunchMechanism::ConstantMemory>;
533 static_assert(sizeof(DriverType) < CudaTraits::ConstantMemoryUsage,
534 "Kokkos Error: Requested CudaLaunchConstantMemory with a "
535 "Functor larger than 32kB.");
536
invoke_kernelKokkos::Impl::CudaParallelLaunchKernelInvoker537 static void invoke_kernel(DriverType const& driver, dim3 const& grid,
538 dim3 const& block, int shmem,
539 CudaInternal const* cuda_instance) {
540 // Wait until the previous kernel that uses the constant buffer is done
541 CUDA_SAFE_CALL(cudaEventSynchronize(cuda_instance->constantMemReusable));
542
543 // Copy functor (synchronously) to staging buffer in pinned host memory
544 unsigned long* staging = cuda_instance->constantMemHostStaging;
545 memcpy(staging, &driver, sizeof(DriverType));
546
547 // Copy functor asynchronously from there to constant memory on the device
548 cudaMemcpyToSymbolAsync(kokkos_impl_cuda_constant_memory_buffer, staging,
549 sizeof(DriverType), 0, cudaMemcpyHostToDevice,
550 cudaStream_t(cuda_instance->m_stream));
551
552 // Invoke the driver function on the device
553 (base_t::
554 get_kernel_func())<<<grid, block, shmem, cuda_instance->m_stream>>>();
555
556 // Record an event that says when the constant buffer can be reused
557 CUDA_SAFE_CALL(cudaEventRecord(cuda_instance->constantMemReusable,
558 cudaStream_t(cuda_instance->m_stream)));
559 }
560
561 #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
create_parallel_launch_graph_nodeKokkos::Impl::CudaParallelLaunchKernelInvoker562 inline static void create_parallel_launch_graph_node(
563 DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
564 CudaInternal const* cuda_instance, bool prefer_shmem) {
565 // Just use global memory; coordinating through events to share constant
566 // memory with the non-graph interface is not really reasonable since
567 // events don't work with Graphs directly, and this would anyway require
568 // a much more complicated structure that finds previous nodes in the
569 // dependency structure of the graph and creates an implicit dependence
570 // based on the need for constant memory (which we would then have to
571 // somehow go and prove was not creating a dependency cycle, and I don't
572 // even know if there's an efficient way to do that, let alone in the
573 // structure we currenty have).
574 using global_launch_impl_t = CudaParallelLaunchKernelInvoker<
575 DriverType, LaunchBounds,
576 Experimental::CudaLaunchMechanism::GlobalMemory>;
577 global_launch_impl_t::create_parallel_launch_graph_node(
578 driver, grid, block, shmem, cuda_instance, prefer_shmem);
579 }
580 #endif
581 };
582
583 // </editor-fold> end Constant Memory }}}2
584 //------------------------------------------------------------------------------
585
586 // </editor-fold> end CudaParallelLaunchKernelInvoker }}}1
587 //==============================================================================
588
589 //==============================================================================
590 // <editor-fold desc="CudaParallelLaunchImpl"> {{{1
591
592 template <class DriverType, class LaunchBounds,
593 Experimental::CudaLaunchMechanism LaunchMechanism>
594 struct CudaParallelLaunchImpl;
595
596 template <class DriverType, unsigned int MaxThreadsPerBlock,
597 unsigned int MinBlocksPerSM,
598 Experimental::CudaLaunchMechanism LaunchMechanism>
599 struct CudaParallelLaunchImpl<
600 DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
601 LaunchMechanism>
602 : CudaParallelLaunchKernelInvoker<
603 DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
604 LaunchMechanism> {
605 using base_t = CudaParallelLaunchKernelInvoker<
606 DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
607 LaunchMechanism>;
608
launch_kernelKokkos::Impl::CudaParallelLaunchImpl609 inline static void launch_kernel(const DriverType& driver, const dim3& grid,
610 const dim3& block, int shmem,
611 const CudaInternal* cuda_instance,
612 bool prefer_shmem) {
613 if (!Impl::is_empty_launch(grid, block)) {
614 // Prevent multiple threads to simultaneously set the cache configuration
615 // preference and launch the same kernel
616 static std::mutex mutex;
617 std::lock_guard<std::mutex> lock(mutex);
618
619 Impl::check_shmem_request(cuda_instance, shmem);
620
621 // If a desired occupancy is specified, we compute how much shared memory
622 // to ask for to achieve that occupancy, assuming that the cache
623 // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic
624 // shared memory computed is actually smaller than `shmem` we overwrite
625 // `shmem` and set `prefer_shmem` to `false`.
626 modify_launch_configuration_if_desired_occupancy_is_specified(
627 driver.get_policy(), cuda_instance->m_deviceProp,
628 get_cuda_func_attributes(), block, shmem, prefer_shmem);
629
630 Impl::configure_shmem_preference<
631 DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
632 base_t::get_kernel_func(), prefer_shmem);
633
634 KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
635
636 // Invoke the driver function on the device
637 base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);
638
639 #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
640 CUDA_SAFE_CALL(cudaGetLastError());
641 cuda_instance->fence();
642 #endif
643 }
644 }
645
get_cuda_func_attributesKokkos::Impl::CudaParallelLaunchImpl646 static cudaFuncAttributes get_cuda_func_attributes() {
647 // Race condition inside of cudaFuncGetAttributes if the same address is
648 // given requires using a local variable as input instead of a static Rely
649 // on static variable initialization to make sure only one thread executes
650 // the code and the result is visible.
651 auto wrap_get_attributes = []() -> cudaFuncAttributes {
652 cudaFuncAttributes attr_tmp;
653 CUDA_SAFE_CALL(
654 cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
655 return attr_tmp;
656 };
657 static cudaFuncAttributes attr = wrap_get_attributes();
658 return attr;
659 }
660 };
661
662 // </editor-fold> end CudaParallelLaunchImpl }}}1
663 //==============================================================================
664
665 //==============================================================================
666 // <editor-fold desc="CudaParallelLaunch"> {{{1
667
668 template <class DriverType, class LaunchBounds = Kokkos::LaunchBounds<>,
669 Experimental::CudaLaunchMechanism LaunchMechanism =
670 DeduceCudaLaunchMechanism<DriverType>::launch_mechanism,
671 bool DoGraph = DriverType::Policy::is_graph_kernel::value
672 #ifndef KOKKOS_CUDA_ENABLE_GRAPHS
673 && false
674 #endif
675 >
676 struct CudaParallelLaunch;
677
678 // General launch mechanism
679 template <class DriverType, class LaunchBounds,
680 Experimental::CudaLaunchMechanism LaunchMechanism>
681 struct CudaParallelLaunch<DriverType, LaunchBounds, LaunchMechanism,
682 /* DoGraph = */ false>
683 : CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism> {
684 using base_t =
685 CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism>;
686 template <class... Args>
CudaParallelLaunchKokkos::Impl::CudaParallelLaunch687 CudaParallelLaunch(Args&&... args) {
688 base_t::launch_kernel((Args &&) args...);
689 }
690 };
691
692 #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
693 // Launch mechanism for creating graph nodes
694 template <class DriverType, class LaunchBounds,
695 Experimental::CudaLaunchMechanism LaunchMechanism>
696 struct CudaParallelLaunch<DriverType, LaunchBounds, LaunchMechanism,
697 /* DoGraph = */ true>
698 : CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism> {
699 using base_t =
700 CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism>;
701 template <class... Args>
CudaParallelLaunchKokkos::Impl::CudaParallelLaunch702 CudaParallelLaunch(Args&&... args) {
703 base_t::create_parallel_launch_graph_node((Args &&) args...);
704 }
705 };
706 #endif
707
708 // </editor-fold> end CudaParallelLaunch }}}1
709 //==============================================================================
710
711 } // namespace Impl
712 } // namespace Kokkos
713
714 //----------------------------------------------------------------------------
715 //----------------------------------------------------------------------------
716
717 #endif /* defined( KOKKOS_ENABLE_CUDA ) */
718 #endif /* #ifndef KOKKOS_CUDAEXEC_HPP */
719