1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos v. 3.0
6 // Copyright (2020) National Technology & Engineering
7 // Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44
45 /*--------------------------------------------------------------------------*/
46 /* Kokkos interfaces */
47
48 #include <Kokkos_Core.hpp>
49
50 #include <HIP/Kokkos_HIP_Instance.hpp>
51 #include <Kokkos_HIP.hpp>
52 #include <Kokkos_HIP_Space.hpp>
53 #include <impl/Kokkos_Error.hpp>
54
55 /*--------------------------------------------------------------------------*/
56 /* Standard 'C' libraries */
57 #include <stdlib.h>
58
59 /* Standard 'C++' libraries */
60 #include <iostream>
61 #include <sstream>
62 #include <string>
63 #include <vector>
64
65 namespace Kokkos {
66 namespace Experimental {
67 namespace {
68 class HIPInternalDevices {
69 public:
70 enum { MAXIMUM_DEVICE_COUNT = 64 };
71 struct hipDeviceProp_t m_hipProp[MAXIMUM_DEVICE_COUNT];
72 int m_hipDevCount;
73
74 HIPInternalDevices();
75
76 static HIPInternalDevices const &singleton();
77 };
78
HIPInternalDevices()79 HIPInternalDevices::HIPInternalDevices() {
80 HIP_SAFE_CALL(hipGetDeviceCount(&m_hipDevCount));
81
82 if (m_hipDevCount > MAXIMUM_DEVICE_COUNT) {
83 Kokkos::abort(
84 "Sorry, you have more GPUs per node than we thought anybody would ever "
85 "have. Please report this to github.com/kokkos/kokkos.");
86 }
87 for (int i = 0; i < m_hipDevCount; ++i) {
88 HIP_SAFE_CALL(hipGetDeviceProperties(m_hipProp + i, i));
89 }
90 }
91
singleton()92 const HIPInternalDevices &HIPInternalDevices::singleton() {
93 static HIPInternalDevices self;
94 return self;
95 }
96 } // namespace
97
98 namespace Impl {
99
100 //----------------------------------------------------------------------------
101
print_configuration(std::ostream & s) const102 void HIPInternal::print_configuration(std::ostream &s) const {
103 const HIPInternalDevices &dev_info = HIPInternalDevices::singleton();
104
105 s << "macro KOKKOS_ENABLE_HIP : defined" << '\n';
106 #if defined(HIP_VERSION)
107 s << "macro HIP_VERSION = " << HIP_VERSION << " = version "
108 << HIP_VERSION / 100 << "." << HIP_VERSION % 100 << '\n';
109 #endif
110
111 for (int i = 0; i < dev_info.m_hipDevCount; ++i) {
112 s << "Kokkos::Experimental::HIP[ " << i << " ] "
113 << dev_info.m_hipProp[i].name << " version "
114 << (dev_info.m_hipProp[i].major) << "." << dev_info.m_hipProp[i].minor
115 << ", Total Global Memory: "
116 << ::Kokkos::Impl::human_memory_size(dev_info.m_hipProp[i].totalGlobalMem)
117 << ", Shared Memory per Block: "
118 << ::Kokkos::Impl::human_memory_size(
119 dev_info.m_hipProp[i].sharedMemPerBlock);
120 if (m_hipDev == i) s << " : Selected";
121 s << '\n';
122 }
123 }
124
125 //----------------------------------------------------------------------------
126
~HIPInternal()127 HIPInternal::~HIPInternal() {
128 if (m_scratchSpace || m_scratchFlags || m_scratchConcurrentBitset) {
129 std::cerr << "Kokkos::Experimental::HIP ERROR: Failed to call "
130 "Kokkos::Experimental::HIP::finalize()"
131 << std::endl;
132 std::cerr.flush();
133 }
134
135 m_hipDev = -1;
136 m_hipArch = -1;
137 m_multiProcCount = 0;
138 m_maxWarpCount = 0;
139 m_maxSharedWords = 0;
140 m_maxShmemPerBlock = 0;
141 m_scratchSpaceCount = 0;
142 m_scratchFlagsCount = 0;
143 m_scratchSpace = nullptr;
144 m_scratchFlags = nullptr;
145 m_scratchConcurrentBitset = nullptr;
146 m_stream = nullptr;
147 }
148
verify_is_initialized(const char * const label) const149 int HIPInternal::verify_is_initialized(const char *const label) const {
150 if (m_hipDev < 0) {
151 std::cerr << "Kokkos::Experimental::HIP::" << label
152 << " : ERROR device not initialized" << std::endl;
153 }
154 return 0 <= m_hipDev;
155 }
156
singleton()157 HIPInternal &HIPInternal::singleton() {
158 static HIPInternal *self = nullptr;
159 if (!self) {
160 self = new HIPInternal();
161 }
162 return *self;
163 }
164
fence() const165 void HIPInternal::fence() const {
166 HIP_SAFE_CALL(hipStreamSynchronize(m_stream));
167 // can reset our cycle id now as well
168 m_cycleId = 0;
169 }
170
initialize(int hip_device_id,hipStream_t stream)171 void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
172 if (was_finalized)
173 Kokkos::abort("Calling HIP::initialize after HIP::finalize is illegal\n");
174
175 if (is_initialized()) return;
176
177 int constexpr WordSize = sizeof(size_type);
178
179 if (!HostSpace::execution_space::impl_is_initialized()) {
180 const std::string msg(
181 "HIP::initialize ERROR : HostSpace::execution_space "
182 "is not initialized");
183 Kokkos::Impl::throw_runtime_exception(msg);
184 }
185
186 const HIPInternalDevices &dev_info = HIPInternalDevices::singleton();
187
188 const bool ok_init = nullptr == m_scratchSpace || nullptr == m_scratchFlags;
189
190 // Need at least a GPU device
191 const bool ok_id =
192 0 <= hip_device_id && hip_device_id < dev_info.m_hipDevCount;
193
194 if (ok_init && ok_id) {
195 const struct hipDeviceProp_t &hipProp = dev_info.m_hipProp[hip_device_id];
196
197 m_hipDev = hip_device_id;
198 m_deviceProp = hipProp;
199
200 HIP_SAFE_CALL(hipSetDevice(m_hipDev));
201
202 m_stream = stream;
203 m_team_scratch_current_size = 0;
204 m_team_scratch_ptr = nullptr;
205
206 // number of multiprocessors
207 m_multiProcCount = hipProp.multiProcessorCount;
208
209 //----------------------------------
210 // Maximum number of warps,
211 // at most one warp per thread in a warp for reduction.
212 m_maxWarpCount = hipProp.maxThreadsPerBlock / Impl::HIPTraits::WarpSize;
213 if (HIPTraits::WarpSize < m_maxWarpCount) {
214 m_maxWarpCount = Impl::HIPTraits::WarpSize;
215 }
216 m_maxSharedWords = hipProp.sharedMemPerBlock / WordSize;
217
218 //----------------------------------
219 // Maximum number of blocks
220 m_maxBlock = hipProp.maxGridSize[0];
221
222 // theoretically, we can get 40 WF's / CU, but only can sustain 32
223 // see
224 // https://github.com/ROCm-Developer-Tools/HIP/blob/a0b5dfd625d99af7e288629747b40dd057183173/vdi/hip_platform.cpp#L742
225 m_maxBlocksPerSM = 32;
226 // FIXME_HIP - Nick to implement this upstream
227 // Register count comes from Sec. 2.2. "Data Sharing" of the
228 // Vega 7nm ISA document (see the diagram)
229 // https://developer.amd.com/wp-content/resources/Vega_7nm_Shader_ISA.pdf
230 // VGPRS = 4 (SIMD/CU) * 256 VGPR/SIMD * 64 registers / VGPR =
231 // 65536 VGPR/CU
232 m_regsPerSM = 65536;
233 m_shmemPerSM = hipProp.maxSharedMemoryPerMultiProcessor;
234 m_maxShmemPerBlock = hipProp.sharedMemPerBlock;
235 m_maxThreadsPerSM = m_maxBlocksPerSM * HIPTraits::WarpSize;
236 //----------------------------------
237 // Multiblock reduction uses scratch flags for counters
238 // and scratch space for partial reduction values.
239 // Allocate some initial space. This will grow as needed.
240 {
241 const unsigned reduce_block_count =
242 m_maxWarpCount * Impl::HIPTraits::WarpSize;
243
244 (void)scratch_flags(reduce_block_count * 2 * sizeof(size_type));
245 (void)scratch_space(reduce_block_count * 16 * sizeof(size_type));
246 }
247 //----------------------------------
248 // Concurrent bitset for obtaining unique tokens from within
249 // an executing kernel.
250 {
251 const int32_t buffer_bound =
252 Kokkos::Impl::concurrent_bitset::buffer_bound(HIP::concurrency());
253
254 // Allocate and initialize uint32_t[ buffer_bound ]
255
256 using Record =
257 Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::HIPSpace,
258 void>;
259
260 Record *const r = Record::allocate(Kokkos::Experimental::HIPSpace(),
261 "Kokkos::InternalScratchBitset",
262 sizeof(uint32_t) * buffer_bound);
263
264 Record::increment(r);
265
266 m_scratchConcurrentBitset = reinterpret_cast<uint32_t *>(r->data());
267
268 HIP_SAFE_CALL(hipMemset(m_scratchConcurrentBitset, 0,
269 sizeof(uint32_t) * buffer_bound));
270 }
271 //----------------------------------
272
273 } else {
274 std::ostringstream msg;
275 msg << "Kokkos::Experimental::HIP::initialize(" << hip_device_id
276 << ") FAILED";
277
278 if (!ok_init) {
279 msg << " : Already initialized";
280 }
281 if (!ok_id) {
282 msg << " : Device identifier out of range "
283 << "[0.." << dev_info.m_hipDevCount - 1 << "]";
284 }
285 Kokkos::Impl::throw_runtime_exception(msg.str());
286 }
287
288 // Init the array for used for arbitrarily sized atomics
289 if (m_stream == nullptr) ::Kokkos::Impl::initialize_host_hip_lock_arrays();
290 }
291
292 //----------------------------------------------------------------------------
293
294 using ScratchGrain =
295 Kokkos::Experimental::HIP::size_type[Impl::HIPTraits::WarpSize];
296 enum { sizeScratchGrain = sizeof(ScratchGrain) };
297
/// Returns the scratch-space buffer, first replacing it with a larger
/// allocation when the current one holds fewer than @p size bytes.
///
/// The previous contents are not copied into the new allocation. When the
/// instance is not initialized the current pointer is returned unchanged.
///
/// @param size requested capacity in bytes.
Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_space(
    const Kokkos::Experimental::HIP::size_type size) {
  const bool must_grow = verify_is_initialized("scratch_space") &&
                         m_scratchSpaceCount * sizeScratchGrain < size;
  if (!must_grow) {
    return m_scratchSpace;
  }

  using SpaceRecord =
      Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::HIPSpace,
                                           void>;

  // Round the request up to a whole number of grains.
  m_scratchSpaceCount = (size + sizeScratchGrain - 1) / sizeScratchGrain;

  // Drop our reference to the old buffer before allocating its replacement.
  if (m_scratchSpace) {
    SpaceRecord::decrement(SpaceRecord::get_record(m_scratchSpace));
  }

  SpaceRecord *const record = SpaceRecord::allocate(
      Kokkos::Experimental::HIPSpace(), "Kokkos::InternalScratchSpace",
      (sizeScratchGrain * m_scratchSpaceCount));

  SpaceRecord::increment(record);

  m_scratchSpace = reinterpret_cast<size_type *>(record->data());

  return m_scratchSpace;
}
321
/// Returns the scratch-flags buffer, first replacing it with a larger,
/// zero-filled allocation when the current one holds fewer than @p size
/// bytes.
///
/// When the instance is not initialized the current pointer is returned
/// unchanged.
///
/// @param size requested capacity in bytes.
Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_flags(
    const Kokkos::Experimental::HIP::size_type size) {
  const bool must_grow = verify_is_initialized("scratch_flags") &&
                         m_scratchFlagsCount * sizeScratchGrain < size;
  if (!must_grow) {
    return m_scratchFlags;
  }

  using FlagsRecord =
      Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::HIPSpace,
                                           void>;

  // Round the request up to a whole number of grains.
  m_scratchFlagsCount = (size + sizeScratchGrain - 1) / sizeScratchGrain;

  // Drop our reference to the old buffer before allocating its replacement.
  if (m_scratchFlags) {
    FlagsRecord::decrement(FlagsRecord::get_record(m_scratchFlags));
  }

  FlagsRecord *const record = FlagsRecord::allocate(
      Kokkos::Experimental::HIPSpace(), "Kokkos::InternalScratchFlags",
      (sizeScratchGrain * m_scratchFlagsCount));

  FlagsRecord::increment(record);

  m_scratchFlags = reinterpret_cast<size_type *>(record->data());

  // The new buffer is cleared to zero before it is handed out.
  HIP_SAFE_CALL(
      hipMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain));

  return m_scratchFlags;
}
348
resize_team_scratch_space(std::int64_t bytes,bool force_shrink)349 void *HIPInternal::resize_team_scratch_space(std::int64_t bytes,
350 bool force_shrink) {
351 if (m_team_scratch_current_size == 0) {
352 m_team_scratch_current_size = bytes;
353 m_team_scratch_ptr = Kokkos::kokkos_malloc<Kokkos::Experimental::HIPSpace>(
354 "Kokkos::HIPSpace::TeamScratchMemory", m_team_scratch_current_size);
355 }
356 if ((bytes > m_team_scratch_current_size) ||
357 ((bytes < m_team_scratch_current_size) && (force_shrink))) {
358 m_team_scratch_current_size = bytes;
359 m_team_scratch_ptr = Kokkos::kokkos_realloc<Kokkos::Experimental::HIPSpace>(
360 m_team_scratch_ptr, m_team_scratch_current_size);
361 }
362 return m_team_scratch_ptr;
363 }
364
365 //----------------------------------------------------------------------------
366
finalize()367 void HIPInternal::finalize() {
368 this->fence();
369 was_finalized = true;
370 if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) {
371 using RecordHIP =
372 Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::HIPSpace>;
373
374 RecordHIP::decrement(RecordHIP::get_record(m_scratchFlags));
375 RecordHIP::decrement(RecordHIP::get_record(m_scratchSpace));
376 RecordHIP::decrement(RecordHIP::get_record(m_scratchConcurrentBitset));
377
378 if (m_team_scratch_current_size > 0)
379 Kokkos::kokkos_free<Kokkos::Experimental::HIPSpace>(m_team_scratch_ptr);
380
381 m_hipDev = -1;
382 m_hipArch = -1;
383 m_multiProcCount = 0;
384 m_maxWarpCount = 0;
385 m_maxBlock = 0;
386 m_maxSharedWords = 0;
387 m_maxShmemPerBlock = 0;
388 m_scratchSpaceCount = 0;
389 m_scratchFlagsCount = 0;
390 m_scratchSpace = nullptr;
391 m_scratchFlags = nullptr;
392 m_scratchConcurrentBitset = nullptr;
393 m_stream = nullptr;
394 m_team_scratch_current_size = 0;
395 m_team_scratch_ptr = nullptr;
396 }
397 if (nullptr != d_driverWorkArray) {
398 HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
399 d_driverWorkArray = nullptr;
400 }
401 }
402
get_next_driver(size_t driverTypeSize) const403 char *HIPInternal::get_next_driver(size_t driverTypeSize) const {
404 std::lock_guard<std::mutex> const lock(m_mutexWorkArray);
405 if (d_driverWorkArray == nullptr) {
406 HIP_SAFE_CALL(
407 hipHostMalloc(&d_driverWorkArray,
408 m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char),
409 hipHostMallocNonCoherent));
410 }
411 if (driverTypeSize > m_maxDriverTypeSize) {
412 // fence handles the cycle id reset for us
413 fence();
414 HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
415 m_maxDriverTypeSize = driverTypeSize;
416 if (m_maxDriverTypeSize % 128 != 0)
417 m_maxDriverTypeSize =
418 m_maxDriverTypeSize + 128 - m_maxDriverTypeSize % 128;
419 HIP_SAFE_CALL(
420 hipHostMalloc(&d_driverWorkArray,
421 m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char),
422 hipHostMallocNonCoherent));
423 } else {
424 m_cycleId = (m_cycleId + 1) % m_maxDriverCycles;
425 if (m_cycleId == 0) {
426 // ensure any outstanding kernels are completed before we wrap around
427 fence();
428 }
429 }
430 return &d_driverWorkArray[m_maxDriverTypeSize * m_cycleId];
431 }
432
433 //----------------------------------------------------------------------------
434
hip_internal_multiprocessor_count()435 Kokkos::Experimental::HIP::size_type hip_internal_multiprocessor_count() {
436 return HIPInternal::singleton().m_multiProcCount;
437 }
438
hip_internal_maximum_warp_count()439 Kokkos::Experimental::HIP::size_type hip_internal_maximum_warp_count() {
440 return HIPInternal::singleton().m_maxWarpCount;
441 }
442
hip_internal_maximum_grid_count()443 Kokkos::Experimental::HIP::size_type hip_internal_maximum_grid_count() {
444 return HIPInternal::singleton().m_maxBlock;
445 }
446
/// Forwards to HIPInternal::scratch_space() on the singleton instance.
///
/// @param size requested scratch-space capacity in bytes.
Kokkos::Experimental::HIP::size_type *hip_internal_scratch_space(
    const Kokkos::Experimental::HIP::size_type size) {
  HIPInternal &instance = HIPInternal::singleton();
  return instance.scratch_space(size);
}
451
/// Forwards to HIPInternal::scratch_flags() on the singleton instance.
///
/// @param size requested scratch-flags capacity in bytes.
Kokkos::Experimental::HIP::size_type *hip_internal_scratch_flags(
    const Kokkos::Experimental::HIP::size_type size) {
  HIPInternal &instance = HIPInternal::singleton();
  return instance.scratch_flags(size);
}
456
457 } // namespace Impl
458 } // namespace Experimental
459 } // namespace Kokkos
460
461 //----------------------------------------------------------------------------
462
463 namespace Kokkos {
464 namespace Impl {
hip_device_synchronize()465 void hip_device_synchronize() { HIP_SAFE_CALL(hipDeviceSynchronize()); }
466
hip_internal_error_throw(hipError_t e,const char * name,const char * file,const int line)467 void hip_internal_error_throw(hipError_t e, const char *name, const char *file,
468 const int line) {
469 std::ostringstream out;
470 out << name << " error( " << hipGetErrorName(e)
471 << "): " << hipGetErrorString(e);
472 if (file) {
473 out << " " << file << ":" << line;
474 }
475 throw_runtime_exception(out.str());
476 }
477 } // namespace Impl
478 } // namespace Kokkos
479
480 //----------------------------------------------------------------------------
481
482 namespace Kokkos {
483 namespace Experimental {
detect_device_count()484 HIP::size_type HIP::detect_device_count() {
485 return HIPInternalDevices::singleton().m_hipDevCount;
486 }
487 } // namespace Experimental
488 } // namespace Kokkos
489