1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 //                        Kokkos v. 3.0
6 //       Copyright (2020) National Technology & Engineering
7 //               Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44 
45 #ifndef KOKKOS_HIP_KERNEL_LAUNCH_HPP
46 #define KOKKOS_HIP_KERNEL_LAUNCH_HPP
47 
48 #include <Kokkos_Macros.hpp>
49 
50 #if defined(__HIPCC__)
51 
52 #include <HIP/Kokkos_HIP_Error.hpp>
53 #include <HIP/Kokkos_HIP_Instance.hpp>
54 #include <Kokkos_HIP_Space.hpp>
55 
56 // Must use global variable on the device with HIP-Clang
57 #ifdef __HIP__
58 __device__ __constant__ unsigned long kokkos_impl_hip_constant_memory_buffer
59     [Kokkos::Experimental::Impl::HIPTraits::ConstantMemoryUsage /
60      sizeof(unsigned long)];
61 #endif
62 
63 namespace Kokkos {
64 namespace Experimental {
65 template <typename T>
kokkos_impl_hip_shared_memory()66 inline __device__ T *kokkos_impl_hip_shared_memory() {
67   HIP_DYNAMIC_SHARED(HIPSpace::size_type, sh);
68   return (T *)sh;
69 }
70 }  // namespace Experimental
71 }  // namespace Kokkos
72 
73 namespace Kokkos {
74 namespace Experimental {
75 namespace Impl {
76 
77 template <typename DriverType>
hip_parallel_launch_constant_memory()78 __global__ static void hip_parallel_launch_constant_memory() {
79   const DriverType &driver = *(reinterpret_cast<const DriverType *>(
80       kokkos_impl_hip_constant_memory_buffer));
81   driver();
82 }
83 
84 template <typename DriverType, unsigned int maxTperB, unsigned int minBperSM>
__launch_bounds__(maxTperB,minBperSM)85 __global__ __launch_bounds__(
86     maxTperB, minBperSM) static void hip_parallel_launch_constant_memory() {
87   const DriverType &driver = *(reinterpret_cast<const DriverType *>(
88       kokkos_impl_hip_constant_memory_buffer));
89 
90   driver->operator()();
91 }
92 
93 template <class DriverType>
hip_parallel_launch_local_memory(const DriverType * driver)94 __global__ static void hip_parallel_launch_local_memory(
95     const DriverType *driver) {
96   driver->operator()();
97 }
98 
99 template <class DriverType, unsigned int maxTperB, unsigned int minBperSM>
__launch_bounds__(maxTperB,minBperSM)100 __global__ __launch_bounds__(
101     maxTperB,
102     minBperSM) static void hip_parallel_launch_local_memory(const DriverType
103                                                                 *driver) {
104   driver->operator()();
105 }
106 
107 enum class HIPLaunchMechanism : unsigned {
108   Default        = 0,
109   ConstantMemory = 1,
110   GlobalMemory   = 2,
111   LocalMemory    = 4
112 };
113 
operator |(HIPLaunchMechanism p1,HIPLaunchMechanism p2)114 constexpr inline HIPLaunchMechanism operator|(HIPLaunchMechanism p1,
115                                               HIPLaunchMechanism p2) {
116   return static_cast<HIPLaunchMechanism>(static_cast<unsigned>(p1) |
117                                          static_cast<unsigned>(p2));
118 }
operator &(HIPLaunchMechanism p1,HIPLaunchMechanism p2)119 constexpr inline HIPLaunchMechanism operator&(HIPLaunchMechanism p1,
120                                               HIPLaunchMechanism p2) {
121   return static_cast<HIPLaunchMechanism>(static_cast<unsigned>(p1) &
122                                          static_cast<unsigned>(p2));
123 }
124 
125 template <HIPLaunchMechanism l>
126 struct HIPDispatchProperties {
127   HIPLaunchMechanism launch_mechanism = l;
128 };
129 
130 template <typename DriverType, typename LaunchBounds,
131           HIPLaunchMechanism LaunchMechanism>
132 struct HIPParallelLaunchKernelFunc;
133 
134 template <typename DriverType, unsigned int MaxThreadsPerBlock,
135           unsigned int MinBlocksPerSM>
136 struct HIPParallelLaunchKernelFunc<
137     DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
138     HIPLaunchMechanism::LocalMemory> {
get_kernel_funcKokkos::Experimental::Impl::HIPParallelLaunchKernelFunc139   static auto get_kernel_func() {
140     return hip_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
141                                             MinBlocksPerSM>;
142   }
143 };
144 
145 template <typename DriverType>
146 struct HIPParallelLaunchKernelFunc<DriverType, Kokkos::LaunchBounds<0, 0>,
147                                    HIPLaunchMechanism::LocalMemory> {
get_kernel_funcKokkos::Experimental::Impl::HIPParallelLaunchKernelFunc148   static auto get_kernel_func() {
149     return hip_parallel_launch_local_memory<DriverType, 1024, 1>;
150   }
151 };
152 
153 template <typename DriverType, typename LaunchBounds,
154           HIPLaunchMechanism LaunchMechanism>
155 struct HIPParallelLaunchKernelInvoker;
156 
157 template <typename DriverType, typename LaunchBounds>
158 struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
159                                       HIPLaunchMechanism::LocalMemory>
160     : HIPParallelLaunchKernelFunc<DriverType, LaunchBounds,
161                                   HIPLaunchMechanism::LocalMemory> {
162   using base_t = HIPParallelLaunchKernelFunc<DriverType, LaunchBounds,
163                                              HIPLaunchMechanism::LocalMemory>;
164 
invoke_kernelKokkos::Experimental::Impl::HIPParallelLaunchKernelInvoker165   static void invoke_kernel(DriverType const *driver, dim3 const &grid,
166                             dim3 const &block, int shmem,
167                             HIPInternal const *hip_instance) {
168     (base_t::get_kernel_func())<<<grid, block, shmem, hip_instance->m_stream>>>(
169         driver);
170   }
171 };
172 
173 template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
174           HIPLaunchMechanism LaunchMechanism = HIPLaunchMechanism::LocalMemory>
175 struct HIPParallelLaunch;
176 
177 template <typename DriverType, unsigned int MaxThreadsPerBlock,
178           unsigned int MinBlocksPerSM>
179 struct HIPParallelLaunch<
180     DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
181     HIPLaunchMechanism::LocalMemory>
182     : HIPParallelLaunchKernelInvoker<
183           DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
184           HIPLaunchMechanism::LocalMemory> {
185   using base_t = HIPParallelLaunchKernelInvoker<
186       DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
187       HIPLaunchMechanism::LocalMemory>;
188 
HIPParallelLaunchKokkos::Experimental::Impl::HIPParallelLaunch189   HIPParallelLaunch(const DriverType &driver, const dim3 &grid,
190                     const dim3 &block, const int shmem,
191                     const HIPInternal *hip_instance,
192                     const bool /*prefer_shmem*/) {
193     if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) {
194       if (hip_instance->m_maxShmemPerBlock < shmem) {
195         Kokkos::Impl::throw_runtime_exception(
196             "HIPParallelLaunch FAILED: shared memory request is too large");
197       }
198 
199       KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE();
200 
201       // Invoke the driver function on the device
202       DriverType *d_driver = reinterpret_cast<DriverType *>(
203           hip_instance->get_next_driver(sizeof(DriverType)));
204       std::memcpy((void *)d_driver, (void *)&driver, sizeof(DriverType));
205       base_t::invoke_kernel(d_driver, grid, block, shmem, hip_instance);
206 
207 #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
208       HIP_SAFE_CALL(hipGetLastError());
209       hip_instance->fence();
210 #endif
211     }
212   }
213 
get_hip_func_attributesKokkos::Experimental::Impl::HIPParallelLaunch214   static hipFuncAttributes get_hip_func_attributes() {
215     static hipFuncAttributes attr = []() {
216       hipFuncAttributes attr;
217       HIP_SAFE_CALL(hipFuncGetAttributes(
218           &attr, reinterpret_cast<void const *>(base_t::get_kernel_func())));
219       return attr;
220     }();
221     return attr;
222   }
223 };
224 }  // namespace Impl
225 }  // namespace Experimental
226 }  // namespace Kokkos
227 
228 #endif
229 
230 #endif
231