1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 //                        Kokkos v. 3.0
6 //       Copyright (2020) National Technology & Engineering
7 //               Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44 
45 /*--------------------------------------------------------------------------*/
46 /* Kokkos interfaces */
47 
48 #include <Kokkos_Core.hpp>
49 
50 #include <HIP/Kokkos_HIP_Instance.hpp>
51 #include <Kokkos_HIP.hpp>
52 #include <Kokkos_HIP_Space.hpp>
53 #include <impl/Kokkos_Error.hpp>
54 
55 /*--------------------------------------------------------------------------*/
56 /* Standard 'C' libraries */
57 #include <stdlib.h>
58 
59 /* Standard 'C++' libraries */
60 #include <iostream>
61 #include <sstream>
62 #include <string>
63 #include <vector>
64 
65 namespace Kokkos {
66 namespace Experimental {
67 namespace {
68 class HIPInternalDevices {
69  public:
70   enum { MAXIMUM_DEVICE_COUNT = 64 };
71   struct hipDeviceProp_t m_hipProp[MAXIMUM_DEVICE_COUNT];
72   int m_hipDevCount;
73 
74   HIPInternalDevices();
75 
76   static HIPInternalDevices const &singleton();
77 };
78 
HIPInternalDevices()79 HIPInternalDevices::HIPInternalDevices() {
80   HIP_SAFE_CALL(hipGetDeviceCount(&m_hipDevCount));
81 
82   if (m_hipDevCount > MAXIMUM_DEVICE_COUNT) {
83     Kokkos::abort(
84         "Sorry, you have more GPUs per node than we thought anybody would ever "
85         "have. Please report this to github.com/kokkos/kokkos.");
86   }
87   for (int i = 0; i < m_hipDevCount; ++i) {
88     HIP_SAFE_CALL(hipGetDeviceProperties(m_hipProp + i, i));
89   }
90 }
91 
singleton()92 const HIPInternalDevices &HIPInternalDevices::singleton() {
93   static HIPInternalDevices self;
94   return self;
95 }
96 }  // namespace
97 
98 namespace Impl {
99 
100 //----------------------------------------------------------------------------
101 
print_configuration(std::ostream & s) const102 void HIPInternal::print_configuration(std::ostream &s) const {
103   const HIPInternalDevices &dev_info = HIPInternalDevices::singleton();
104 
105   s << "macro  KOKKOS_ENABLE_HIP : defined" << '\n';
106 #if defined(HIP_VERSION)
107   s << "macro  HIP_VERSION = " << HIP_VERSION << " = version "
108     << HIP_VERSION / 100 << "." << HIP_VERSION % 100 << '\n';
109 #endif
110 
111   for (int i = 0; i < dev_info.m_hipDevCount; ++i) {
112     s << "Kokkos::Experimental::HIP[ " << i << " ] "
113       << dev_info.m_hipProp[i].name << " version "
114       << (dev_info.m_hipProp[i].major) << "." << dev_info.m_hipProp[i].minor
115       << ", Total Global Memory: "
116       << ::Kokkos::Impl::human_memory_size(dev_info.m_hipProp[i].totalGlobalMem)
117       << ", Shared Memory per Block: "
118       << ::Kokkos::Impl::human_memory_size(
119              dev_info.m_hipProp[i].sharedMemPerBlock);
120     if (m_hipDev == i) s << " : Selected";
121     s << '\n';
122   }
123 }
124 
125 //----------------------------------------------------------------------------
126 
~HIPInternal()127 HIPInternal::~HIPInternal() {
128   if (m_scratchSpace || m_scratchFlags || m_scratchConcurrentBitset) {
129     std::cerr << "Kokkos::Experimental::HIP ERROR: Failed to call "
130                  "Kokkos::Experimental::HIP::finalize()"
131               << std::endl;
132     std::cerr.flush();
133   }
134 
135   m_hipDev                  = -1;
136   m_hipArch                 = -1;
137   m_multiProcCount          = 0;
138   m_maxWarpCount            = 0;
139   m_maxSharedWords          = 0;
140   m_maxShmemPerBlock        = 0;
141   m_scratchSpaceCount       = 0;
142   m_scratchFlagsCount       = 0;
143   m_scratchSpace            = nullptr;
144   m_scratchFlags            = nullptr;
145   m_scratchConcurrentBitset = nullptr;
146   m_stream                  = nullptr;
147 }
148 
verify_is_initialized(const char * const label) const149 int HIPInternal::verify_is_initialized(const char *const label) const {
150   if (m_hipDev < 0) {
151     std::cerr << "Kokkos::Experimental::HIP::" << label
152               << " : ERROR device not initialized" << std::endl;
153   }
154   return 0 <= m_hipDev;
155 }
156 
singleton()157 HIPInternal &HIPInternal::singleton() {
158   static HIPInternal *self = nullptr;
159   if (!self) {
160     self = new HIPInternal();
161   }
162   return *self;
163 }
164 
fence() const165 void HIPInternal::fence() const {
166   HIP_SAFE_CALL(hipStreamSynchronize(m_stream));
167   // can reset our cycle id now as well
168   m_cycleId = 0;
169 }
170 
initialize(int hip_device_id,hipStream_t stream)171 void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
172   if (was_finalized)
173     Kokkos::abort("Calling HIP::initialize after HIP::finalize is illegal\n");
174 
175   if (is_initialized()) return;
176 
177   int constexpr WordSize = sizeof(size_type);
178 
179   if (!HostSpace::execution_space::impl_is_initialized()) {
180     const std::string msg(
181         "HIP::initialize ERROR : HostSpace::execution_space "
182         "is not initialized");
183     Kokkos::Impl::throw_runtime_exception(msg);
184   }
185 
186   const HIPInternalDevices &dev_info = HIPInternalDevices::singleton();
187 
188   const bool ok_init = nullptr == m_scratchSpace || nullptr == m_scratchFlags;
189 
190   // Need at least a GPU device
191   const bool ok_id =
192       0 <= hip_device_id && hip_device_id < dev_info.m_hipDevCount;
193 
194   if (ok_init && ok_id) {
195     const struct hipDeviceProp_t &hipProp = dev_info.m_hipProp[hip_device_id];
196 
197     m_hipDev     = hip_device_id;
198     m_deviceProp = hipProp;
199 
200     HIP_SAFE_CALL(hipSetDevice(m_hipDev));
201 
202     m_stream                    = stream;
203     m_team_scratch_current_size = 0;
204     m_team_scratch_ptr          = nullptr;
205 
206     // number of multiprocessors
207     m_multiProcCount = hipProp.multiProcessorCount;
208 
209     //----------------------------------
210     // Maximum number of warps,
211     // at most one warp per thread in a warp for reduction.
212     m_maxWarpCount = hipProp.maxThreadsPerBlock / Impl::HIPTraits::WarpSize;
213     if (HIPTraits::WarpSize < m_maxWarpCount) {
214       m_maxWarpCount = Impl::HIPTraits::WarpSize;
215     }
216     m_maxSharedWords = hipProp.sharedMemPerBlock / WordSize;
217 
218     //----------------------------------
219     // Maximum number of blocks
220     m_maxBlock = hipProp.maxGridSize[0];
221 
222     // theoretically, we can get 40 WF's / CU, but only can sustain 32
223     // see
224     // https://github.com/ROCm-Developer-Tools/HIP/blob/a0b5dfd625d99af7e288629747b40dd057183173/vdi/hip_platform.cpp#L742
225     m_maxBlocksPerSM = 32;
226     // FIXME_HIP - Nick to implement this upstream
227     //             Register count comes from Sec. 2.2. "Data Sharing" of the
228     //             Vega 7nm ISA document (see the diagram)
229     //             https://developer.amd.com/wp-content/resources/Vega_7nm_Shader_ISA.pdf
230     //             VGPRS = 4 (SIMD/CU) * 256 VGPR/SIMD * 64 registers / VGPR =
231     //             65536 VGPR/CU
232     m_regsPerSM        = 65536;
233     m_shmemPerSM       = hipProp.maxSharedMemoryPerMultiProcessor;
234     m_maxShmemPerBlock = hipProp.sharedMemPerBlock;
235     m_maxThreadsPerSM  = m_maxBlocksPerSM * HIPTraits::WarpSize;
236     //----------------------------------
237     // Multiblock reduction uses scratch flags for counters
238     // and scratch space for partial reduction values.
239     // Allocate some initial space.  This will grow as needed.
240     {
241       const unsigned reduce_block_count =
242           m_maxWarpCount * Impl::HIPTraits::WarpSize;
243 
244       (void)scratch_flags(reduce_block_count * 2 * sizeof(size_type));
245       (void)scratch_space(reduce_block_count * 16 * sizeof(size_type));
246     }
247     //----------------------------------
248     // Concurrent bitset for obtaining unique tokens from within
249     // an executing kernel.
250     {
251       const int32_t buffer_bound =
252           Kokkos::Impl::concurrent_bitset::buffer_bound(HIP::concurrency());
253 
254       // Allocate and initialize uint32_t[ buffer_bound ]
255 
256       using Record =
257           Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::HIPSpace,
258                                                void>;
259 
260       Record *const r = Record::allocate(Kokkos::Experimental::HIPSpace(),
261                                          "Kokkos::InternalScratchBitset",
262                                          sizeof(uint32_t) * buffer_bound);
263 
264       Record::increment(r);
265 
266       m_scratchConcurrentBitset = reinterpret_cast<uint32_t *>(r->data());
267 
268       HIP_SAFE_CALL(hipMemset(m_scratchConcurrentBitset, 0,
269                               sizeof(uint32_t) * buffer_bound));
270     }
271     //----------------------------------
272 
273   } else {
274     std::ostringstream msg;
275     msg << "Kokkos::Experimental::HIP::initialize(" << hip_device_id
276         << ") FAILED";
277 
278     if (!ok_init) {
279       msg << " : Already initialized";
280     }
281     if (!ok_id) {
282       msg << " : Device identifier out of range "
283           << "[0.." << dev_info.m_hipDevCount - 1 << "]";
284     }
285     Kokkos::Impl::throw_runtime_exception(msg.str());
286   }
287 
288   // Init the array for used for arbitrarily sized atomics
289   if (m_stream == nullptr) ::Kokkos::Impl::initialize_host_hip_lock_arrays();
290 }
291 
292 //----------------------------------------------------------------------------
293 
294 using ScratchGrain =
295     Kokkos::Experimental::HIP::size_type[Impl::HIPTraits::WarpSize];
296 enum { sizeScratchGrain = sizeof(ScratchGrain) };
297 
// Returns the scratch-space buffer, first replacing it with a larger one when
// the current capacity is below `size` bytes. The buffer is left untouched
// (and an error is logged) when the instance is not initialized.
Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_space(
    const Kokkos::Experimental::HIP::size_type size) {
  // The initialization check runs first; the capacity comparison is only
  // evaluated for an initialized instance.
  const bool needs_growth = verify_is_initialized("scratch_space") &&
                            m_scratchSpaceCount * sizeScratchGrain < size;
  if (!needs_growth) {
    return m_scratchSpace;
  }

  using Record =
      Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::HIPSpace,
                                           void>;

  // Round the request up to a whole number of grains.
  m_scratchSpaceCount = (size + sizeScratchGrain - 1) / sizeScratchGrain;

  // Give up our reference to the previous buffer before allocating anew.
  if (nullptr != m_scratchSpace) {
    Record::decrement(Record::get_record(m_scratchSpace));
  }

  Record *const record = Record::allocate(
      Kokkos::Experimental::HIPSpace(), "Kokkos::InternalScratchSpace",
      (sizeScratchGrain * m_scratchSpaceCount));
  Record::increment(record);

  m_scratchSpace = reinterpret_cast<size_type *>(record->data());
  return m_scratchSpace;
}
321 
// Returns the scratch-flags buffer, first replacing it with a larger,
// zero-filled one when the current capacity is below `size` bytes. The buffer
// is left untouched (and an error is logged) when the instance is not
// initialized.
Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_flags(
    const Kokkos::Experimental::HIP::size_type size) {
  // The initialization check runs first; the capacity comparison is only
  // evaluated for an initialized instance.
  const bool needs_growth = verify_is_initialized("scratch_flags") &&
                            m_scratchFlagsCount * sizeScratchGrain < size;
  if (!needs_growth) {
    return m_scratchFlags;
  }

  using Record =
      Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::HIPSpace,
                                           void>;

  // Round the request up to a whole number of grains.
  m_scratchFlagsCount = (size + sizeScratchGrain - 1) / sizeScratchGrain;

  // Give up our reference to the previous buffer before allocating anew.
  if (nullptr != m_scratchFlags) {
    Record::decrement(Record::get_record(m_scratchFlags));
  }

  Record *const record = Record::allocate(
      Kokkos::Experimental::HIPSpace(), "Kokkos::InternalScratchFlags",
      (sizeScratchGrain * m_scratchFlagsCount));
  Record::increment(record);

  m_scratchFlags = reinterpret_cast<size_type *>(record->data());

  // A freshly allocated flags buffer starts out all zero.
  HIP_SAFE_CALL(
      hipMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain));

  return m_scratchFlags;
}
348 
resize_team_scratch_space(std::int64_t bytes,bool force_shrink)349 void *HIPInternal::resize_team_scratch_space(std::int64_t bytes,
350                                              bool force_shrink) {
351   if (m_team_scratch_current_size == 0) {
352     m_team_scratch_current_size = bytes;
353     m_team_scratch_ptr = Kokkos::kokkos_malloc<Kokkos::Experimental::HIPSpace>(
354         "Kokkos::HIPSpace::TeamScratchMemory", m_team_scratch_current_size);
355   }
356   if ((bytes > m_team_scratch_current_size) ||
357       ((bytes < m_team_scratch_current_size) && (force_shrink))) {
358     m_team_scratch_current_size = bytes;
359     m_team_scratch_ptr = Kokkos::kokkos_realloc<Kokkos::Experimental::HIPSpace>(
360         m_team_scratch_ptr, m_team_scratch_current_size);
361   }
362   return m_team_scratch_ptr;
363 }
364 
365 //----------------------------------------------------------------------------
366 
finalize()367 void HIPInternal::finalize() {
368   this->fence();
369   was_finalized = true;
370   if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) {
371     using RecordHIP =
372         Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::HIPSpace>;
373 
374     RecordHIP::decrement(RecordHIP::get_record(m_scratchFlags));
375     RecordHIP::decrement(RecordHIP::get_record(m_scratchSpace));
376     RecordHIP::decrement(RecordHIP::get_record(m_scratchConcurrentBitset));
377 
378     if (m_team_scratch_current_size > 0)
379       Kokkos::kokkos_free<Kokkos::Experimental::HIPSpace>(m_team_scratch_ptr);
380 
381     m_hipDev                    = -1;
382     m_hipArch                   = -1;
383     m_multiProcCount            = 0;
384     m_maxWarpCount              = 0;
385     m_maxBlock                  = 0;
386     m_maxSharedWords            = 0;
387     m_maxShmemPerBlock          = 0;
388     m_scratchSpaceCount         = 0;
389     m_scratchFlagsCount         = 0;
390     m_scratchSpace              = nullptr;
391     m_scratchFlags              = nullptr;
392     m_scratchConcurrentBitset   = nullptr;
393     m_stream                    = nullptr;
394     m_team_scratch_current_size = 0;
395     m_team_scratch_ptr          = nullptr;
396   }
397   if (nullptr != d_driverWorkArray) {
398     HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
399     d_driverWorkArray = nullptr;
400   }
401 }
402 
get_next_driver(size_t driverTypeSize) const403 char *HIPInternal::get_next_driver(size_t driverTypeSize) const {
404   std::lock_guard<std::mutex> const lock(m_mutexWorkArray);
405   if (d_driverWorkArray == nullptr) {
406     HIP_SAFE_CALL(
407         hipHostMalloc(&d_driverWorkArray,
408                       m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char),
409                       hipHostMallocNonCoherent));
410   }
411   if (driverTypeSize > m_maxDriverTypeSize) {
412     // fence handles the cycle id reset for us
413     fence();
414     HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
415     m_maxDriverTypeSize = driverTypeSize;
416     if (m_maxDriverTypeSize % 128 != 0)
417       m_maxDriverTypeSize =
418           m_maxDriverTypeSize + 128 - m_maxDriverTypeSize % 128;
419     HIP_SAFE_CALL(
420         hipHostMalloc(&d_driverWorkArray,
421                       m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char),
422                       hipHostMallocNonCoherent));
423   } else {
424     m_cycleId = (m_cycleId + 1) % m_maxDriverCycles;
425     if (m_cycleId == 0) {
426       // ensure any outstanding kernels are completed before we wrap around
427       fence();
428     }
429   }
430   return &d_driverWorkArray[m_maxDriverTypeSize * m_cycleId];
431 }
432 
433 //----------------------------------------------------------------------------
434 
hip_internal_multiprocessor_count()435 Kokkos::Experimental::HIP::size_type hip_internal_multiprocessor_count() {
436   return HIPInternal::singleton().m_multiProcCount;
437 }
438 
hip_internal_maximum_warp_count()439 Kokkos::Experimental::HIP::size_type hip_internal_maximum_warp_count() {
440   return HIPInternal::singleton().m_maxWarpCount;
441 }
442 
hip_internal_maximum_grid_count()443 Kokkos::Experimental::HIP::size_type hip_internal_maximum_grid_count() {
444   return HIPInternal::singleton().m_maxBlock;
445 }
446 
// Forwards to HIPInternal::scratch_space() on the singleton instance.
Kokkos::Experimental::HIP::size_type *hip_internal_scratch_space(
    const Kokkos::Experimental::HIP::size_type size) {
  HIPInternal &instance = HIPInternal::singleton();
  return instance.scratch_space(size);
}
451 
// Forwards to HIPInternal::scratch_flags() on the singleton instance.
Kokkos::Experimental::HIP::size_type *hip_internal_scratch_flags(
    const Kokkos::Experimental::HIP::size_type size) {
  HIPInternal &instance = HIPInternal::singleton();
  return instance.scratch_flags(size);
}
456 
457 }  // namespace Impl
458 }  // namespace Experimental
459 }  // namespace Kokkos
460 
461 //----------------------------------------------------------------------------
462 
463 namespace Kokkos {
464 namespace Impl {
hip_device_synchronize()465 void hip_device_synchronize() { HIP_SAFE_CALL(hipDeviceSynchronize()); }
466 
hip_internal_error_throw(hipError_t e,const char * name,const char * file,const int line)467 void hip_internal_error_throw(hipError_t e, const char *name, const char *file,
468                               const int line) {
469   std::ostringstream out;
470   out << name << " error( " << hipGetErrorName(e)
471       << "): " << hipGetErrorString(e);
472   if (file) {
473     out << " " << file << ":" << line;
474   }
475   throw_runtime_exception(out.str());
476 }
477 }  // namespace Impl
478 }  // namespace Kokkos
479 
480 //----------------------------------------------------------------------------
481 
482 namespace Kokkos {
483 namespace Experimental {
detect_device_count()484 HIP::size_type HIP::detect_device_count() {
485   return HIPInternalDevices::singleton().m_hipDevCount;
486 }
487 }  // namespace Experimental
488 }  // namespace Kokkos
489