////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
//                 AMD Research and AMD HSA Software Development
//
//                 Advanced Micro Devices, Inc.
//
//                 www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
//  - Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimers.
//  - Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimers in
//    the documentation and/or other materials provided with the distribution.
//  - Neither the names of Advanced Micro Devices, Inc,
//    nor the names of its contributors may be used to endorse or promote
//    products derived from this Software without specific prior written
//    permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////

#include "core/inc/runtime.h"

#include <algorithm>
#include <atomic>
#include <cstring>
#include <string>
#include <thread>
#include <vector>

#include "core/common/shared.h"
#include "core/inc/hsa_ext_interface.h"
#include "core/inc/amd_cpu_agent.h"
#include "core/inc/amd_gpu_agent.h"
#include "core/inc/amd_memory_region.h"
#include "core/inc/amd_topology.h"
#include "core/inc/signal.h"
#include "core/inc/interrupt_signal.h"
#include "core/inc/hsa_ext_amd_impl.h"
#include "core/inc/hsa_api_trace_int.h"
#include "core/util/os.h"
#include "inc/hsa_ven_amd_aqlprofile.h"

#define HSA_VERSION_MAJOR 1
#define HSA_VERSION_MINOR 1

const char rocrbuildid[] __attribute__((used)) = "ROCR BUILD ID: " STRING(ROCR_BUILD_ID);

namespace core {
bool g_use_interrupt_wait = true;

Runtime* Runtime::runtime_singleton_ = NULL;

KernelMutex Runtime::bootstrap_lock_;

static bool loaded = true;
class RuntimeCleanup {
 public:
  ~RuntimeCleanup() {
    if (!Runtime::IsOpen()) {
      delete Runtime::runtime_singleton_;
    }

    loaded = false;
  }
};

static RuntimeCleanup cleanup_at_unload_;

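// Acquire and Release implement the runtime's reference count: the first
// acquire constructs and loads the runtime, the last release unloads it.
// The HSA entry points hsa_init() and hsa_shut_down() are the expected
// callers, one Acquire()/Release() pair per init/shutdown.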
hsa_status_t Runtime::Acquire() {
  // Check to see if HSA has been cleaned up (process exit)
  if (!loaded) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;

  // Handle initialization races
  ScopedAcquire<KernelMutex> boot(&bootstrap_lock_);

  if (runtime_singleton_ == NULL) {
    runtime_singleton_ = new Runtime();
  }

  // Serialize with release
  ScopedAcquire<KernelMutex> lock(&runtime_singleton_->kernel_lock_);

  if (runtime_singleton_->ref_count_ == INT32_MAX) {
    return HSA_STATUS_ERROR_REFCOUNT_OVERFLOW;
  }

  runtime_singleton_->ref_count_++;
  MAKE_NAMED_SCOPE_GUARD(refGuard, [&]() { runtime_singleton_->ref_count_--; });

  if (runtime_singleton_->ref_count_ == 1) {
    hsa_status_t status = runtime_singleton_->Load();

    if (status != HSA_STATUS_SUCCESS) {
      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
    }
  }

  refGuard.Dismiss();
  return HSA_STATUS_SUCCESS;
}

hsa_status_t Runtime::Release() {
  ScopedAcquire<KernelMutex> lock(&kernel_lock_);
  if (ref_count_ == 0) {
    return HSA_STATUS_ERROR_NOT_INITIALIZED;
  }

  if (ref_count_ == 1) {
    // Release all registered memory, then unload backends
    Unload();
  }

  ref_count_--;

  return HSA_STATUS_SUCCESS;
}

bool Runtime::IsOpen() {
  return (Runtime::runtime_singleton_ != NULL) &&
         (Runtime::runtime_singleton_->ref_count_ != 0);
}

void Runtime::RegisterAgent(Agent* agent) {
  // Record the agent in the node-to-agent reverse lookup table.
  agents_by_node_[agent->node_id()].push_back(agent);

  // Process the agent as a CPU or GPU device.
  if (agent->device_type() == Agent::DeviceType::kAmdCpuDevice) {
    cpu_agents_.push_back(agent);

    // Add CPU regions to the system region list.
    for (const core::MemoryRegion* region : agent->regions()) {
      if (region->fine_grain()) {
        system_regions_fine_.push_back(region);
      } else {
        system_regions_coarse_.push_back(region);
      }
    }

    assert(system_regions_fine_.size() > 0);

    // Initialize the default fine-grain system region allocator using the
    // fine-grain system region of the first discovered CPU agent.
    if (cpu_agents_.size() == 1) {
      // Might need memory pooling to cover allocations that
      // require less than 4096 bytes.
      system_allocator_ =
          [&](size_t size, size_t alignment,
              MemoryRegion::AllocateFlags alloc_flags) -> void* {
        assert(alignment <= 4096);
        void* ptr = NULL;
        return (HSA_STATUS_SUCCESS ==
                core::Runtime::runtime_singleton_->AllocateMemory(
                    system_regions_fine_[0], size, alloc_flags, &ptr))
            ? ptr
            : NULL;
      };

      system_deallocator_ =
          [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); };

      BaseShared::SetAllocateAndFree(system_allocator_, system_deallocator_);
    }

    // Set up the system clock frequency the first time.
    if (sys_clock_freq_ == 0) {
      // Cache the system clock frequency.
      HsaClockCounters clocks;
      hsaKmtGetClockCounters(0, &clocks);
      sys_clock_freq_ = clocks.SystemClockFrequencyHz;
    }
  } else if (agent->device_type() == Agent::DeviceType::kAmdGpuDevice) {
    gpu_agents_.push_back(agent);

    gpu_ids_.push_back(agent->node_id());

    // Assign the first discovered GPU agent as the blit agent, which will
    // provide DMA operations for hsa_memory_copy.
    if (blit_agent_ == NULL) {
      blit_agent_ = agent;

      // Query the start and end address of the SVM address space on this
      // platform.
      if (reinterpret_cast<amd::GpuAgentInt*>(blit_agent_)->profile() ==
          HSA_PROFILE_BASE) {
        std::vector<const core::MemoryRegion*>::const_iterator it =
            std::find_if(blit_agent_->regions().begin(),
                         blit_agent_->regions().end(),
                         [](const core::MemoryRegion* region) {
                           return (reinterpret_cast<const amd::MemoryRegion*>(
                                       region)->IsSvm());
                         });

        assert(it != blit_agent_->regions().end());

        const amd::MemoryRegion* svm_region =
            reinterpret_cast<const amd::MemoryRegion*>(*it);

        start_svm_address_ =
            static_cast<uintptr_t>(svm_region->GetBaseAddress());
        end_svm_address_ = start_svm_address_ + svm_region->GetPhysicalSize();

        // Bind the VM fault handler when we detect the first GPU agent.
        // TODO: validate if it works on APU.
        BindVmFaultHandler();
      } else {
        start_svm_address_ = 0;
        end_svm_address_ = os::GetUserModeVirtualMemoryBase() +
                           os::GetUserModeVirtualMemorySize();
      }
    }
  }
}

void Runtime::DestroyAgents() {
  agents_by_node_.clear();

  std::for_each(gpu_agents_.begin(), gpu_agents_.end(), DeleteObject());
  gpu_agents_.clear();

  gpu_ids_.clear();

  std::for_each(cpu_agents_.begin(), cpu_agents_.end(), DeleteObject());
  cpu_agents_.clear();

  blit_agent_ = NULL;

  system_regions_fine_.clear();
  system_regions_coarse_.clear();
}

void Runtime::SetLinkCount(size_t num_link) {
  const size_t last_index = GetIndexLinkInfo(0, num_link);
  link_matrix_.resize(last_index);

  memset(&link_matrix_[0], 0,
         link_matrix_.size() * sizeof(hsa_amd_memory_pool_link_info_t));
}

void Runtime::RegisterLinkInfo(uint32_t node_id_from, uint32_t node_id_to,
                               uint32_t num_hop,
                               hsa_amd_memory_pool_link_info_t& link_info) {
  const uint32_t idx = GetIndexLinkInfo(node_id_from, node_id_to);
  link_matrix_[idx].num_hop = num_hop;
  link_matrix_[idx].info = link_info;

  // Limit the number of hops to 1 since the runtime does not have enough
  // information to share with the user about each hop.
  link_matrix_[idx].num_hop = std::min(link_matrix_[idx].num_hop, 1U);
}

const Runtime::LinkInfo Runtime::GetLinkInfo(uint32_t node_id_from,
                                             uint32_t node_id_to) {
  return (node_id_from != node_id_to)
      ? link_matrix_[GetIndexLinkInfo(node_id_from, node_id_to)]
      : LinkInfo();  // No link.
}

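// GetIndexLinkInfo packs the symmetric node-to-node link table as a lower
// triangle without the diagonal: index = (max-1) * max / 2 + min.
// Worked example: nodes (2, 5) give node_id_max = 4, node_id_min = 2, so the
// index is 4 * 5 / 2 + 2 = 12; nodes (5, 2) map to the same slot.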
uint32_t Runtime::GetIndexLinkInfo(uint32_t node_id_from, uint32_t node_id_to) {
  const uint32_t node_id_max = std::max(node_id_from, node_id_to) - 1;
  const uint32_t node_id_min = std::min(node_id_from, node_id_to);
  return ((node_id_max * (node_id_max + 1) / 2) + node_id_min);
}

hsa_status_t Runtime::IterateAgent(hsa_status_t (*callback)(hsa_agent_t agent,
                                                            void* data),
                                   void* data) {
  AMD::callback_t<decltype(callback)> call(callback);

  std::vector<core::Agent*>* agent_lists[2] = {&cpu_agents_, &gpu_agents_};
  for (std::vector<core::Agent*>* agent_list : agent_lists) {
    for (size_t i = 0; i < agent_list->size(); ++i) {
      hsa_agent_t agent = Agent::Convert(agent_list->at(i));
      hsa_status_t status = call(agent, data);

      if (status != HSA_STATUS_SUCCESS) {
        return status;
      }
    }
  }

  return HSA_STATUS_SUCCESS;
}

hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size,
                                     MemoryRegion::AllocateFlags alloc_flags,
                                     void** address) {
  ScopedAcquire<KernelMutex> lock(&memory_lock_);
  hsa_status_t status = region->Allocate(size, alloc_flags, address);

  // Track the allocation so that it can be freed properly.
  if (status == HSA_STATUS_SUCCESS) {
    allocation_map_[*address] = AllocationRegion(region, size);
  }

  return status;
}

hsa_status_t Runtime::FreeMemory(void* ptr) {
  if (ptr == nullptr) {
    return HSA_STATUS_SUCCESS;
  }

  const MemoryRegion* region = nullptr;
  size_t size = 0;
  ScopedAcquire<KernelMutex> lock(&memory_lock_);

  std::map<const void*, AllocationRegion>::const_iterator it =
      allocation_map_.find(ptr);

  if (it == allocation_map_.end()) {
    assert(false && "Can't find address in allocation map");
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }
  region = it->second.region;
  size = it->second.size;

  // Imported fragments can't be released with FreeMemory.
  if (region == nullptr) {
    assert(false && "Can't release imported memory with free.");
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  allocation_map_.erase(it);

  return region->Free(ptr, size);
}

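// CopyMemory classifies src and dst as system or device memory from the SVM
// aperture bounds recorded in RegisterAgent: on a full-profile (APU) platform
// everything below end_svm_address_ is host-accessible, while on a
// base-profile platform the aperture [start_svm_address_, end_svm_address_)
// holds GPU-side allocations and addresses outside it are ordinary host
// memory.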
hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) {
  assert(dst != NULL && src != NULL && size != 0);

  bool is_src_system = false;
  bool is_dst_system = false;
  const uintptr_t src_uptr = reinterpret_cast<uintptr_t>(src);
  const uintptr_t dst_uptr = reinterpret_cast<uintptr_t>(dst);

  if ((reinterpret_cast<amd::GpuAgentInt*>(blit_agent_)->profile() ==
       HSA_PROFILE_FULL)) {
    is_src_system = (src_uptr < end_svm_address_);
    is_dst_system = (dst_uptr < end_svm_address_);
  } else {
    is_src_system =
        ((src_uptr < start_svm_address_) || (src_uptr >= end_svm_address_));
    is_dst_system =
        ((dst_uptr < start_svm_address_) || (dst_uptr >= end_svm_address_));

    if ((is_src_system && !is_dst_system) ||
        (!is_src_system && is_dst_system)) {
      // Use a staging buffer or pinning if either src or dst is in GPUVM and
      // the other is system memory allocated via the OS or C/C++ allocator.
      return CopyMemoryHostAlloc(dst, src, size, is_dst_system);
    }
  }

  if (is_src_system && is_dst_system) {
    memmove(dst, src, size);
    return HSA_STATUS_SUCCESS;
  }

  return blit_agent_->DmaCopy(dst, src, size);
}

hsa_status_t Runtime::CopyMemoryHostAlloc(void* dst, const void* src,
                                          size_t size, bool dst_malloc) {
  void* usrptr = (dst_malloc) ? dst : const_cast<void*>(src);
  void* agent_ptr = NULL;

  hsa_agent_t blit_agent = core::Agent::Convert(blit_agent_);

  const amd::MemoryRegion* system_region =
      reinterpret_cast<const amd::MemoryRegion*>(system_regions_fine_[0]);
  hsa_status_t stat =
      system_region->Lock(1, &blit_agent, usrptr, size, &agent_ptr);

  if (stat != HSA_STATUS_SUCCESS) {
    return stat;
  }

  stat = blit_agent_->DmaCopy((dst_malloc) ? agent_ptr : dst,
                              (dst_malloc) ? src : agent_ptr, size);

  system_region->Unlock(usrptr);

  return stat;
}

hsa_status_t Runtime::CopyMemory(void* dst, core::Agent& dst_agent,
                                 const void* src, core::Agent& src_agent,
                                 size_t size,
                                 std::vector<core::Signal*>& dep_signals,
                                 core::Signal& completion_signal) {
  const bool dst_gpu =
      (dst_agent.device_type() == core::Agent::DeviceType::kAmdGpuDevice);
  const bool src_gpu =
      (src_agent.device_type() == core::Agent::DeviceType::kAmdGpuDevice);
  if (dst_gpu || src_gpu) {
    core::Agent& copy_agent = (src_gpu) ? src_agent : dst_agent;
    return copy_agent.DmaCopy(dst, dst_agent, src, src_agent, size, dep_signals,
                              completion_signal);
  }

  // For CPU-to-CPU copies, fire and forget a copy thread.
  const bool profiling_enabled =
      (dst_agent.profiling_enabled() || src_agent.profiling_enabled());
  std::thread(
      [](void* dst, const void* src, size_t size,
         std::vector<core::Signal*> dep_signals,
         core::Signal* completion_signal, bool profiling_enabled) {
        for (core::Signal* dep : dep_signals) {
          dep->WaitRelaxed(HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                           HSA_WAIT_STATE_BLOCKED);
        }

        if (profiling_enabled) {
          HsaClockCounters clocks = {0};
          core::Runtime::runtime_singleton_->GetSystemInfo(
              HSA_SYSTEM_INFO_TIMESTAMP, reinterpret_cast<void*>(&clocks));
          completion_signal->signal_.start_ts = clocks.SystemClockCounter;
        }

        memcpy(dst, src, size);

        if (profiling_enabled) {
          HsaClockCounters clocks = {0};
          core::Runtime::runtime_singleton_->GetSystemInfo(
              HSA_SYSTEM_INFO_TIMESTAMP, reinterpret_cast<void*>(&clocks));
          completion_signal->signal_.end_ts = clocks.SystemClockCounter;
        }

        completion_signal->SubRelease(1);
      },
      dst, src, size, dep_signals, &completion_signal,
      profiling_enabled).detach();

  return HSA_STATUS_SUCCESS;
}

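// FillMemory fills 'count' uint32_t elements (not bytes) starting at 'ptr'
// with 'value', using a GPU blit when the range is GPU-mapped and a host
// fill otherwise.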
hsa_status_t Runtime::FillMemory(void* ptr, uint32_t value, size_t count) {
  // Choose a blit agent from pointer info.
  hsa_amd_pointer_info_t info;
  uint32_t agent_count;
  hsa_agent_t* accessible = nullptr;
  info.size = sizeof(info);
  MAKE_SCOPE_GUARD([&]() { free(accessible); });
  hsa_status_t err = PtrInfo(ptr, &info, malloc, &agent_count, &accessible);
  if (err != HSA_STATUS_SUCCESS) return err;

  ptrdiff_t endPtr = (ptrdiff_t)ptr + count * sizeof(uint32_t);

  // Check for GPU fill.
  // Selects GPU fill for SVM and Locked allocations if a GPU address is given
  // and is mapped.
  if (info.agentBaseAddress <= ptr &&
      endPtr <= (ptrdiff_t)info.agentBaseAddress + info.sizeInBytes) {
    core::Agent* blit_agent = core::Agent::Convert(info.agentOwner);
    if (blit_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice) {
      blit_agent = nullptr;
      for (uint32_t i = 0; i < agent_count; i++) {
        if (core::Agent::Convert(accessible[i])->device_type() ==
            core::Agent::DeviceType::kAmdGpuDevice) {
          blit_agent = core::Agent::Convert(accessible[i]);
          break;
        }
      }
    }
    if (blit_agent) return blit_agent->DmaFill(ptr, value, count);
  }

  // Host and unmapped SVM addresses are filled via the host. The pattern is
  // a 32-bit value, so memset (which replicates a single byte) would be
  // wrong here; fill element-wise instead.
  if (info.hostBaseAddress <= ptr &&
      endPtr <= (ptrdiff_t)info.hostBaseAddress + info.sizeInBytes) {
    std::fill_n(reinterpret_cast<uint32_t*>(ptr), count, value);
    return HSA_STATUS_SUCCESS;
  }

  return HSA_STATUS_ERROR_INVALID_ALLOCATION;
}

hsa_status_t Runtime::AllowAccess(uint32_t num_agents,
                                  const hsa_agent_t* agents, const void* ptr) {
  const amd::MemoryRegion* amd_region = NULL;
  size_t alloc_size = 0;

  {
    ScopedAcquire<KernelMutex> lock(&memory_lock_);

    std::map<const void*, AllocationRegion>::const_iterator it =
        allocation_map_.find(ptr);

    if (it == allocation_map_.end()) {
      return HSA_STATUS_ERROR;
    }

    amd_region = reinterpret_cast<const amd::MemoryRegion*>(it->second.region);
    alloc_size = it->second.size;
  }

  return amd_region->AllowAccess(num_agents, agents, ptr, alloc_size);
}

hsa_status_t Runtime::GetSystemInfo(hsa_system_info_t attribute, void* value) {
  switch (attribute) {
    case HSA_SYSTEM_INFO_VERSION_MAJOR:
      *((uint16_t*)value) = HSA_VERSION_MAJOR;
      break;
    case HSA_SYSTEM_INFO_VERSION_MINOR:
      *((uint16_t*)value) = HSA_VERSION_MINOR;
      break;
    case HSA_SYSTEM_INFO_TIMESTAMP: {
      HsaClockCounters clocks;
      hsaKmtGetClockCounters(0, &clocks);
      *((uint64_t*)value) = clocks.SystemClockCounter;
      break;
    }
    case HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY: {
      assert(sys_clock_freq_ != 0 &&
             "Use of HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY before HSA "
             "initialization completes.");
      *(uint64_t*)value = sys_clock_freq_;
      break;
    }
    case HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT:
      *((uint64_t*)value) = 0xFFFFFFFFFFFFFFFF;
      break;
    case HSA_SYSTEM_INFO_ENDIANNESS:
#if defined(HSA_LITTLE_ENDIAN)
      *((hsa_endianness_t*)value) = HSA_ENDIANNESS_LITTLE;
#else
      *((hsa_endianness_t*)value) = HSA_ENDIANNESS_BIG;
#endif
      break;
    case HSA_SYSTEM_INFO_MACHINE_MODEL:
#if defined(HSA_LARGE_MODEL)
      *((hsa_machine_model_t*)value) = HSA_MACHINE_MODEL_LARGE;
#else
      *((hsa_machine_model_t*)value) = HSA_MACHINE_MODEL_SMALL;
#endif
      break;
    case HSA_SYSTEM_INFO_EXTENSIONS: {
      memset(value, 0, sizeof(uint8_t) * 128);

      auto setFlag = [&](uint32_t bit) {
        assert(bit < 128 * 8 && "Extension value exceeds extension bitmask");
        uint index = bit / 8;
        uint subBit = bit % 8;
        ((uint8_t*)value)[index] |= 1 << subBit;
      };

      if (hsa_internal_api_table_.finalizer_api.hsa_ext_program_finalize_fn !=
          NULL) {
        setFlag(HSA_EXTENSION_FINALIZER);
      }

      if (hsa_internal_api_table_.image_api.hsa_ext_image_create_fn != NULL) {
        setFlag(HSA_EXTENSION_IMAGES);
      }

      if (os::LibHandle lib = os::LoadLib(kAqlProfileLib)) {
        os::CloseLib(lib);
        setFlag(HSA_EXTENSION_AMD_AQLPROFILE);
      }

      setFlag(HSA_EXTENSION_AMD_PROFILER);

      break;
    }
    default:
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }
  return HSA_STATUS_SUCCESS;
}

hsa_status_t Runtime::SetAsyncSignalHandler(hsa_signal_t signal,
                                            hsa_signal_condition_t cond,
                                            hsa_signal_value_t value,
                                            hsa_amd_signal_handler handler,
                                            void* arg) {
  // Indicate that this signal is in use.
  if (signal.handle != 0) hsa_signal_handle(signal)->Retain();

  ScopedAcquire<KernelMutex> scope_lock(&async_events_control_.lock);

  // Lazy initialization.
  if (async_events_control_.async_events_thread_ == NULL) {
    // Create the monitoring thread control signal.
    auto err = HSA::hsa_signal_create(0, 0, NULL, &async_events_control_.wake);
    if (err != HSA_STATUS_SUCCESS) {
      assert(false && "Asynchronous events control signal creation error.");
      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
    }
    async_events_.PushBack(async_events_control_.wake, HSA_SIGNAL_CONDITION_NE,
                           0, NULL, NULL);

    // Start the event monitoring thread.
    async_events_control_.exit = false;
    async_events_control_.async_events_thread_ =
        os::CreateThread(AsyncEventsLoop, NULL);
    if (async_events_control_.async_events_thread_ == NULL) {
      assert(false && "Asynchronous events thread creation error.");
      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
    }
  }

  new_async_events_.PushBack(signal, cond, value, handler, arg);

  hsa_signal_handle(async_events_control_.wake)->StoreRelease(1);

  return HSA_STATUS_SUCCESS;
}

hsa_status_t Runtime::InteropMap(uint32_t num_agents, Agent** agents,
                                 int interop_handle, uint32_t flags,
                                 size_t* size, void** ptr,
                                 size_t* metadata_size, const void** metadata) {
  static const int tinyArraySize = 8;
  HsaGraphicsResourceInfo info;

  HSAuint32 short_nodes[tinyArraySize];
  HSAuint32* nodes = short_nodes;
  if (num_agents > tinyArraySize) {
    nodes = new HSAuint32[num_agents];
    if (nodes == NULL) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }
  MAKE_SCOPE_GUARD([&]() {
    if (num_agents > tinyArraySize) delete[] nodes;
  });

  for (uint32_t i = 0; i < num_agents; i++)
    agents[i]->GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_DRIVER_NODE_ID,
                       &nodes[i]);

  if (hsaKmtRegisterGraphicsHandleToNodes(interop_handle, &info, num_agents,
                                          nodes) != HSAKMT_STATUS_SUCCESS)
    return HSA_STATUS_ERROR;

  HSAuint64 altAddress;
  HsaMemMapFlags map_flags;
  map_flags.Value = 0;
  map_flags.ui32.PageSize = HSA_PAGE_SIZE_64KB;
  if (hsaKmtMapMemoryToGPUNodes(info.MemoryAddress, info.SizeInBytes,
                                &altAddress, map_flags, num_agents,
                                nodes) != HSAKMT_STATUS_SUCCESS) {
    map_flags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
    if (hsaKmtMapMemoryToGPUNodes(info.MemoryAddress, info.SizeInBytes,
                                  &altAddress, map_flags, num_agents,
                                  nodes) != HSAKMT_STATUS_SUCCESS) {
      hsaKmtDeregisterMemory(info.MemoryAddress);
      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
    }
  }

  if (metadata_size != NULL) *metadata_size = info.MetadataSizeInBytes;
  if (metadata != NULL) *metadata = info.Metadata;

  *size = info.SizeInBytes;
  *ptr = info.MemoryAddress;

  return HSA_STATUS_SUCCESS;
}

hsa_status_t Runtime::InteropUnmap(void* ptr) {
  if (hsaKmtUnmapMemoryToGPU(ptr) != HSAKMT_STATUS_SUCCESS)
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  if (hsaKmtDeregisterMemory(ptr) != HSAKMT_STATUS_SUCCESS)
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  return HSA_STATUS_SUCCESS;
}

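// PtrInfo answers hsa_amd_pointer_info queries: it forwards to the thunk's
// hsaKmtQueryPointerInfo and then, for runtime-allocated ("HSA") pointers,
// refines the result to the enclosing fragment tracked in allocation_map_.
// The optional accessible-agent list is returned only when the caller
// supplies an allocator plus output pointers for the list and its length.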
hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info,
                              void* (*alloc)(size_t),
                              uint32_t* num_agents_accessible,
                              hsa_agent_t** accessible,
                              PtrInfoBlockData* block_info) {
  static_assert(static_cast<int>(HSA_POINTER_UNKNOWN) ==
                    static_cast<int>(HSA_EXT_POINTER_TYPE_UNKNOWN),
                "Thunk pointer info mismatch");
  static_assert(static_cast<int>(HSA_POINTER_ALLOCATED) ==
                    static_cast<int>(HSA_EXT_POINTER_TYPE_HSA),
                "Thunk pointer info mismatch");
  static_assert(static_cast<int>(HSA_POINTER_REGISTERED_USER) ==
                    static_cast<int>(HSA_EXT_POINTER_TYPE_LOCKED),
                "Thunk pointer info mismatch");
  static_assert(static_cast<int>(HSA_POINTER_REGISTERED_GRAPHICS) ==
                    static_cast<int>(HSA_EXT_POINTER_TYPE_GRAPHICS),
                "Thunk pointer info mismatch");

  HsaPointerInfo thunkInfo;
  uint32_t* mappedNodes;

  hsa_amd_pointer_info_t retInfo;

  // Check that the output struct has an initialized size.
  if (info->size == 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  bool returnListData =
      ((alloc != nullptr) && (num_agents_accessible != nullptr) &&
       (accessible != nullptr));

  {  // memory_lock protects access to the NMappedNodes array and fragment
     // user data since these may change with calls to memory APIs.
    ScopedAcquire<KernelMutex> lock(&memory_lock_);
    hsaKmtQueryPointerInfo(ptr, &thunkInfo);
    if (returnListData) {
      assert(thunkInfo.NMappedNodes <= agents_by_node_.size() &&
             "PointerInfo: Thunk returned more than all agents in NMappedNodes.");
      mappedNodes = (uint32_t*)alloca(thunkInfo.NMappedNodes * sizeof(uint32_t));
      memcpy(mappedNodes, thunkInfo.MappedNodes,
             thunkInfo.NMappedNodes * sizeof(uint32_t));
    }
    retInfo.type = (hsa_amd_pointer_type_t)thunkInfo.Type;
    retInfo.agentBaseAddress = reinterpret_cast<void*>(thunkInfo.GPUAddress);
    retInfo.hostBaseAddress = thunkInfo.CPUAddress;
    retInfo.sizeInBytes = thunkInfo.SizeInBytes;
    retInfo.userData = thunkInfo.UserData;
    if (block_info != nullptr) {
      // The only time the host and agent pointers may differ is when the
      // memory is locked memory (malloc memory pinned for GPU access). In
      // that case there cannot be any suballocation, so block_info is
      // redundant and unused. The host address is returned since the host
      // address is used to manipulate locked memory. This protects future
      // use of block_info with locked memory.
      block_info->base = retInfo.hostBaseAddress;
      block_info->length = retInfo.sizeInBytes;
    }
    if (retInfo.type == HSA_EXT_POINTER_TYPE_HSA) {
      auto fragment = allocation_map_.upper_bound(ptr);
      if (fragment != allocation_map_.begin()) {
        fragment--;
        if ((fragment->first <= ptr) &&
            (ptr < reinterpret_cast<const uint8_t*>(fragment->first) +
                       fragment->second.size)) {
          // Agent and host address must match here. Only locked memory is
          // allowed to have differing addresses, but locked memory has type
          // HSA_EXT_POINTER_TYPE_LOCKED and cannot be suballocated.
          retInfo.agentBaseAddress = const_cast<void*>(fragment->first);
          retInfo.hostBaseAddress = retInfo.agentBaseAddress;
          retInfo.sizeInBytes = fragment->second.size;
          retInfo.userData = fragment->second.user_ptr;
        }
      }
    }
  }  // end lock scope

  retInfo.size = Min(info->size, sizeof(hsa_amd_pointer_info_t));

  // Temp: workaround thunk bug, IPC memory has garbage in Node.
  // retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle();
  auto it = agents_by_node_.find(thunkInfo.Node);
  if (it != agents_by_node_.end())
    retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle();
  else
    retInfo.agentOwner.handle = 0;

  memcpy(info, &retInfo, retInfo.size);

  if (returnListData) {
    uint32_t count = 0;
    for (HSAuint32 i = 0; i < thunkInfo.NMappedNodes; i++) {
      assert(mappedNodes[i] < agents_by_node_.size() &&
             "PointerInfo: Invalid node ID returned from thunk.");
      count += agents_by_node_[mappedNodes[i]].size();
    }

    AMD::callback_t<decltype(alloc)> Alloc(alloc);
    *accessible = (hsa_agent_t*)Alloc(sizeof(hsa_agent_t) * count);
    if ((*accessible) == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
    *num_agents_accessible = count;

    uint32_t index = 0;
    for (HSAuint32 i = 0; i < thunkInfo.NMappedNodes; i++) {
      auto& list = agents_by_node_[mappedNodes[i]];
      for (auto agent : list) {
        (*accessible)[index] = agent->public_handle();
        index++;
      }
    }
  }

  return HSA_STATUS_SUCCESS;
}

hsa_status_t Runtime::SetPtrInfoData(void* ptr, void* userptr) {
  {  // Use the allocation map if possible to handle fragments.
    ScopedAcquire<KernelMutex> lock(&memory_lock_);
    const auto& it = allocation_map_.find(ptr);
    if (it != allocation_map_.end()) {
      it->second.user_ptr = userptr;
      return HSA_STATUS_SUCCESS;
    }
  }
  // Cover entries not in the allocation map (graphics, lock, ...).
  if (hsaKmtSetMemoryUserData(ptr, userptr) == HSAKMT_STATUS_SUCCESS)
    return HSA_STATUS_SUCCESS;
  return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}

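// IPC fragment handles piggyback on the thunk's shared-memory handle:
// fragments are suballocations of a 2MB-aligned block, so a fragment's
// offset within its block, expressed in 4KB pages, is at most 511 and fits
// in the low 9 bits (0x1FF) of handle word 6, with bit 0x80000000 marking
// the handle as a fragment. Worked example: a fragment starting 24KB into
// its block stores offset 24576 / 4096 = 6, and IPCAttach recovers the
// address as (imported block base) + 6 * 4096.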
hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* handle) {
  static_assert(sizeof(hsa_amd_ipc_memory_t) == sizeof(HsaSharedMemoryHandle),
                "Thunk IPC mismatch.");
  // Reject sharing allocations larger than ~8TB due to thunk limitations.
  if (len > 0x7FFFFFFF000ull) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  // Check for fragment sharing.
  PtrInfoBlockData block;
  hsa_amd_pointer_info_t info;
  info.size = sizeof(info);
  if (PtrInfo(ptr, &info, nullptr, nullptr, nullptr, &block) != HSA_STATUS_SUCCESS)
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  if ((block.base != ptr) || (block.length != len)) {
    if (!IsMultipleOf(block.base, 2 * 1024 * 1024)) {
      assert(false && "Fragment's block not aligned to 2MB!");
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    }
    if (hsaKmtShareMemory(block.base, block.length,
                          reinterpret_cast<HsaSharedMemoryHandle*>(handle)) !=
        HSAKMT_STATUS_SUCCESS)
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    uint32_t offset =
        (reinterpret_cast<uint8_t*>(ptr) - reinterpret_cast<uint8_t*>(block.base)) /
        4096;
    // Holds size in (4K?) pages in thunk handle: Mark as a fragment and
    // denote offset.
    handle->handle[6] |= 0x80000000 | offset;
  } else {
    if (hsaKmtShareMemory(ptr, len,
                          reinterpret_cast<HsaSharedMemoryHandle*>(handle)) !=
        HSAKMT_STATUS_SUCCESS)
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }
  return HSA_STATUS_SUCCESS;
}

hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len,
                                uint32_t num_agents, Agent** agents,
                                void** mapped_ptr) {
  static const int tinyArraySize = 8;
  void* importAddress;
  HSAuint64 importSize;
  HSAuint64 altAddress;

  hsa_amd_ipc_memory_t importHandle;
  importHandle = *handle;

  // Extract fragment info.
  bool isFragment = false;
  uint32_t fragOffset = 0;
  auto fixFragment = [&]() {
    if (!isFragment) return;
    importAddress = reinterpret_cast<uint8_t*>(importAddress) + fragOffset;
    len = Min(len, importSize - fragOffset);
    ScopedAcquire<KernelMutex> lock(&memory_lock_);
    allocation_map_[importAddress] = AllocationRegion(nullptr, len);
  };

  if ((importHandle.handle[6] & 0x80000000) != 0) {
    isFragment = true;
    fragOffset = (importHandle.handle[6] & 0x1FF) * 4096;
    importHandle.handle[6] &= ~(0x80000000 | 0x1FF);
  }

  if (num_agents == 0) {
    if (hsaKmtRegisterSharedHandle(
            reinterpret_cast<const HsaSharedMemoryHandle*>(&importHandle),
            &importAddress, &importSize) != HSAKMT_STATUS_SUCCESS)
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    if (hsaKmtMapMemoryToGPU(importAddress, importSize, &altAddress) !=
        HSAKMT_STATUS_SUCCESS) {
      hsaKmtDeregisterMemory(importAddress);
      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
    }
    fixFragment();
    *mapped_ptr = importAddress;
    return HSA_STATUS_SUCCESS;
  }

  HSAuint32* nodes = nullptr;
  if (num_agents > tinyArraySize)
    nodes = new HSAuint32[num_agents];
  else
    nodes = (HSAuint32*)alloca(sizeof(HSAuint32) * num_agents);
  if (nodes == NULL) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;

  MAKE_SCOPE_GUARD([&]() {
    if (num_agents > tinyArraySize) delete[] nodes;
  });

  for (uint32_t i = 0; i < num_agents; i++)
    agents[i]->GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_DRIVER_NODE_ID,
                       &nodes[i]);

  if (hsaKmtRegisterSharedHandleToNodes(
          reinterpret_cast<const HsaSharedMemoryHandle*>(&importHandle),
          &importAddress, &importSize, num_agents,
          nodes) != HSAKMT_STATUS_SUCCESS)
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  HsaMemMapFlags map_flags;
  map_flags.Value = 0;
  map_flags.ui32.PageSize = HSA_PAGE_SIZE_64KB;
  if (hsaKmtMapMemoryToGPUNodes(importAddress, importSize, &altAddress,
                                map_flags, num_agents,
                                nodes) != HSAKMT_STATUS_SUCCESS) {
    map_flags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
    if (hsaKmtMapMemoryToGPUNodes(importAddress, importSize, &altAddress,
                                  map_flags, num_agents,
                                  nodes) != HSAKMT_STATUS_SUCCESS) {
      hsaKmtDeregisterMemory(importAddress);
      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
    }
  }

  fixFragment();
  *mapped_ptr = importAddress;
  return HSA_STATUS_SUCCESS;
}

hsa_status_t Runtime::IPCDetach(void* ptr) {
  {  // Handle imported fragments.
    ScopedAcquire<KernelMutex> lock(&memory_lock_);
    const auto& it = allocation_map_.find(ptr);
    if (it != allocation_map_.end()) {
      if (it->second.region != nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
      allocation_map_.erase(it);
      lock.Release();  // Can't hold the memory lock when using pointer info.

      PtrInfoBlockData block;
      hsa_amd_pointer_info_t info;
      info.size = sizeof(info);
      if (PtrInfo(ptr, &info, nullptr, nullptr, nullptr, &block) !=
          HSA_STATUS_SUCCESS)
        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
      ptr = block.base;
    }
  }
  if (hsaKmtUnmapMemoryToGPU(ptr) != HSAKMT_STATUS_SUCCESS)
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  if (hsaKmtDeregisterMemory(ptr) != HSAKMT_STATUS_SUCCESS)
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  return HSA_STATUS_SUCCESS;
}

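// AsyncEventsLoop runs on the monitoring thread created by
// SetAsyncSignalHandler. The event list is a structure-of-arrays
// (signal_/cond_/value_/handler_/arg_) indexed together; slot 0 always holds
// the control signal 'wake', which is raised to re-scan the list or to exit.
// A handler returning false removes its entry by swapping in the last slot.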
void Runtime::AsyncEventsLoop(void*) {
  auto& async_events_control_ = runtime_singleton_->async_events_control_;
  auto& async_events_ = runtime_singleton_->async_events_;
  auto& new_async_events_ = runtime_singleton_->new_async_events_;

  while (!async_events_control_.exit) {
    // Wait for a signal.
    hsa_signal_value_t value;
    uint32_t index = AMD::hsa_amd_signal_wait_any(
        uint32_t(async_events_.Size()), &async_events_.signal_[0],
        &async_events_.cond_[0], &async_events_.value_[0], uint64_t(-1),
        HSA_WAIT_STATE_BLOCKED, &value);

    // Reset the control signal.
    if (index == 0) {
      hsa_signal_handle(async_events_control_.wake)->StoreRelaxed(0);
    } else if (index != uint32_t(-1)) {
      // No error or timeout occurred; process the handler.
      assert(async_events_.handler_[index] != NULL);
      bool keep =
          async_events_.handler_[index](value, async_events_.arg_[index]);
      if (!keep) {
        hsa_signal_handle(async_events_.signal_[index])->Release();
        async_events_.CopyIndex(index, async_events_.Size() - 1);
        async_events_.PopBack();
      }
    }

    // Check for dead signals.
    index = 0;
    while (index != async_events_.Size()) {
      if (!hsa_signal_handle(async_events_.signal_[index])->IsValid()) {
        hsa_signal_handle(async_events_.signal_[index])->Release();
        async_events_.CopyIndex(index, async_events_.Size() - 1);
        async_events_.PopBack();
        continue;
      }
      index++;
    }

    // Insert new signals and find plain functions.
    typedef std::pair<void (*)(void*), void*> func_arg_t;
    std::vector<func_arg_t> functions;
    {
      ScopedAcquire<KernelMutex> scope_lock(&async_events_control_.lock);
      for (size_t i = 0; i < new_async_events_.Size(); i++) {
        if (new_async_events_.signal_[i].handle == 0) {
          functions.push_back(
              func_arg_t((void (*)(void*))new_async_events_.handler_[i],
                         new_async_events_.arg_[i]));
          continue;
        }
        async_events_.PushBack(
            new_async_events_.signal_[i], new_async_events_.cond_[i],
            new_async_events_.value_[i], new_async_events_.handler_[i],
            new_async_events_.arg_[i]);
      }
      new_async_events_.Clear();
    }

    // Call plain functions.
    for (size_t i = 0; i < functions.size(); i++)
      functions[i].first(functions[i].second);
    functions.clear();
  }

  // Release the wait count of all pending signals.
  for (size_t i = 1; i < async_events_.Size(); i++)
    hsa_signal_handle(async_events_.signal_[i])->Release();
  async_events_.Clear();

  for (size_t i = 0; i < new_async_events_.Size(); i++)
    hsa_signal_handle(new_async_events_.signal_[i])->Release();
  new_async_events_.Clear();
}

void Runtime::BindVmFaultHandler() {
  if (core::g_use_interrupt_wait) {
    // Create a memory event with manual reset to avoid a race condition
    // with the driver in case of multiple concurrent VM faults.
    vm_fault_event_ =
        core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_MEMORY, true);

    // Create an interrupt signal object to contain the memory event.
    // This signal object will be registered with the async handler global
    // thread.
    vm_fault_signal_ = new core::InterruptSignal(0, vm_fault_event_);

    if (!vm_fault_signal_->IsValid() || vm_fault_signal_->EopEvent() == NULL) {
      assert(false && "Failed on creating VM fault signal");
      return;
    }

    SetAsyncSignalHandler(core::Signal::Convert(vm_fault_signal_),
                          HSA_SIGNAL_CONDITION_NE, 0, VMFaultHandler,
                          reinterpret_cast<void*>(vm_fault_signal_));
  }
}

bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
  core::InterruptSignal* vm_fault_signal =
      reinterpret_cast<core::InterruptSignal*>(arg);

  assert(vm_fault_signal != NULL);

  if (vm_fault_signal == NULL) {
    return false;
  }

  HsaEvent* vm_fault_event = vm_fault_signal->EopEvent();

  HsaMemoryAccessFault& fault =
      vm_fault_event->EventData.EventData.MemoryAccessFault;

  hsa_status_t custom_handler_status = HSA_STATUS_ERROR;
  // If a custom handler is registered, pack the fault info and call the
  // handler.
  if (runtime_singleton_->GetCustomSystemEventHandler()) {
    hsa_amd_gpu_memory_fault_info_t fault_info;

    // Find the faulting agent.
    auto it = runtime_singleton_->agents_by_node_.find(fault.NodeId);
    assert(it != runtime_singleton_->agents_by_node_.end() &&
           "Can't find faulting agent.");
    Agent* faulty_agent = it->second.front();
    fault_info.agent = Agent::Convert(faulty_agent);

    fault_info.virtual_address = fault.VirtualAddress;
    fault_info.fault_reason_mask = 0x00000000;
    if (fault.Failure.NotPresent == 1) {
      fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00000001;
    }
    if (fault.Failure.ReadOnly == 1) {
      fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00000010;
    }
    if (fault.Failure.NoExecute == 1) {
      fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00000100;
    }
    if (fault.Failure.GpuAccess == 1) {
      fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00001000;
    }
    if (fault.Failure.ECC == 1) {
      fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00010000;
    }
    if (fault.Failure.Imprecise == 1) {
      fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00100000;
    }
    hsa_amd_event_t memory_fault_event;
    memory_fault_event.event_type = GPU_MEMORY_FAULT_EVENT;
    memory_fault_event.memory_fault = fault_info;
    custom_handler_status = runtime_singleton_->GetCustomSystemEventHandler()(
        &memory_fault_event, runtime_singleton_->GetCustomSystemEventData());
  }

  // No custom VM fault handler registered, or it failed.
  if (custom_handler_status != HSA_STATUS_SUCCESS) {
    if (runtime_singleton_->flag().enable_vm_fault_message()) {
      std::string reason = "";
      if (fault.Failure.NotPresent == 1) {
        reason += "Page not present or supervisor privilege";
      } else if (fault.Failure.ReadOnly == 1) {
        reason += "Write access to a read-only page";
      } else if (fault.Failure.NoExecute == 1) {
        reason += "Execute access to a page marked NX";
      } else if (fault.Failure.GpuAccess == 1) {
        reason += "Host access only";
      } else if (fault.Failure.ECC == 1) {
        reason += "ECC failure (if supported by HW)";
      } else {
        reason += "Unknown";
      }

      core::Agent* faultingAgent =
          runtime_singleton_->agents_by_node_[fault.NodeId][0];

      fprintf(
          stderr,
          "Memory access fault by GPU node-%u (Agent handle: %p) on address "
          "%p%s. Reason: %s.\n",
          fault.NodeId,
          reinterpret_cast<void*>(faultingAgent->public_handle().handle),
          reinterpret_cast<const void*>(fault.VirtualAddress),
          (fault.Failure.Imprecise == 1) ? "(may not be exact address)" : "",
          reason.c_str());

#ifndef NDEBUG
      runtime_singleton_->memory_lock_.Acquire();
      auto it = runtime_singleton_->allocation_map_.upper_bound(
          reinterpret_cast<void*>(fault.VirtualAddress));
      for (int i = 0; i < 2; i++) {
        if (it != runtime_singleton_->allocation_map_.begin()) it--;
      }
      fprintf(stderr, "Nearby memory map:\n");
      auto start = it;
      for (int i = 0; i < 3; i++) {
        if (it == runtime_singleton_->allocation_map_.end()) break;
        std::string kind = "Non-HSA";
        if (it->second.region != nullptr) {
          const amd::MemoryRegion* region =
              static_cast<const amd::MemoryRegion*>(it->second.region);
          if (region->IsSystem())
            kind = "System";
          else if (region->IsLocalMemory())
            kind = "VRAM";
          else if (region->IsScratch())
            kind = "Scratch";
          else if (region->IsLDS())
            kind = "LDS";
        }
        fprintf(stderr, "%p, 0x%lx, %s\n", it->first, it->second.size,
                kind.c_str());
        it++;
      }
      fprintf(stderr, "\n");
      it = start;
      runtime_singleton_->memory_lock_.Release();
      hsa_amd_pointer_info_t info;
      PtrInfoBlockData block;
      uint32_t count;
      hsa_agent_t* canAccess;
      info.size = sizeof(info);
      for (int i = 0; i < 3; i++) {
        if (it == runtime_singleton_->allocation_map_.end()) break;
        runtime_singleton_->PtrInfo(const_cast<void*>(it->first), &info, malloc,
                                    &count, &canAccess, &block);
        fprintf(stderr,
                "PtrInfo:\n\tAddress: %p-%p/%p-%p\n\tSize: 0x%lx\n\tType: "
                "%u\n\tOwner: %p\n",
                info.agentBaseAddress,
                (char*)info.agentBaseAddress + info.sizeInBytes,
                info.hostBaseAddress,
                (char*)info.hostBaseAddress + info.sizeInBytes,
                info.sizeInBytes, info.type,
                reinterpret_cast<void*>(info.agentOwner.handle));
        fprintf(stderr, "\tCanAccess: %u\n", count);
        for (uint32_t t = 0; t < count; t++)
          fprintf(stderr, "\t\t%p\n",
                  reinterpret_cast<void*>(canAccess[t].handle));
        fprintf(stderr, "\tIn block: %p, 0x%lx\n", block.base, block.length);
        free(canAccess);
        it++;
      }
#endif  // !NDEBUG
    }
    assert(false && "GPU memory access fault.");
    std::abort();
  }
  // No need to keep the signal because we are done.
  return false;
}

Runtime::Runtime()
    : blit_agent_(NULL),
      sys_clock_freq_(0),
      vm_fault_event_(nullptr),
      vm_fault_signal_(nullptr),
      system_event_handler_user_data_(nullptr),
      ref_count_(0) {
  start_svm_address_ = 0;
#if defined(HSA_LARGE_MODEL)
  end_svm_address_ = UINT64_MAX;
#else
  end_svm_address_ = UINT32_MAX;
#endif
}

hsa_status_t Runtime::Load() {
  flag_.Refresh();

  g_use_interrupt_wait = flag_.enable_interrupt();

  if (!amd::Load()) {
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }

  loader_ = amd::hsa::loader::Loader::Create(&loader_context_);

  // Load extensions
  LoadExtensions();

  // Load tools libraries
  LoadTools();

  for (core::Agent* agent : gpu_agents_) {
    hsa_status_t status =
        reinterpret_cast<amd::GpuAgentInt*>(agent)->PostToolsInit();

    if (status != HSA_STATUS_SUCCESS) {
      return status;
    }
  }

  return HSA_STATUS_SUCCESS;
}

void Runtime::Unload() {
  UnloadTools();
  UnloadExtensions();

  amd::hsa::loader::Loader::Destroy(loader_);
  loader_ = nullptr;

  std::for_each(gpu_agents_.begin(), gpu_agents_.end(), DeleteObject());
  gpu_agents_.clear();

  async_events_control_.Shutdown();

  if (vm_fault_signal_ != nullptr) {
    vm_fault_signal_->DestroySignal();
    vm_fault_signal_ = nullptr;
  }
  core::InterruptSignal::DestroyEvent(vm_fault_event_);
  vm_fault_event_ = nullptr;

  DestroyAgents();

  CloseTools();

  amd::Unload();
}

void Runtime::LoadExtensions() {
// Load the finalizer and image extension libraries.
#ifdef HSA_LARGE_MODEL
  static const std::string kFinalizerLib[] = {"hsa-ext-finalize64.dll",
                                              "libhsa-ext-finalize64.so.1",
                                              "libhsa-ext-finalize64.so.1"};
  static const std::string kImageLib[] = {"hsa-ext-image64.dll",
                                          "libhsa-ext-image64.so.1",
                                          "libhsa-ext-image64.so.1"};
#else
  static const std::string kFinalizerLib[] = {"hsa-ext-finalize.dll",
                                              "libhsa-ext-finalize.so.1",
                                              "libhsa-ext-finalize.so.1"};
  static const std::string kImageLib[] = {"hsa-ext-image.dll",
                                          "libhsa-ext-image.so.1",
                                          "libhsa-ext-image.so.1"};
#endif

  // Update the HSA API table with handles to the Finalizer extension APIs.
  extensions_.LoadFinalizer(kFinalizerLib[os_index(os::current_os)]);
  hsa_api_table_.LinkExts(&extensions_.finalizer_api,
                          core::HsaApiTable::HSA_EXT_FINALIZER_API_TABLE_ID);

  // Update the HSA API table with handles to the Image extension APIs.
  extensions_.LoadImage(kImageLib[os_index(os::current_os)]);
  hsa_api_table_.LinkExts(&extensions_.image_api,
                          core::HsaApiTable::HSA_EXT_IMAGE_API_TABLE_ID);
}

void Runtime::UnloadExtensions() { extensions_.Unload(); }

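// parse_tool_names splits the tool library list (flag_.tools_lib_names())
// on spaces, honoring double quotes and backslash escapes. For example,
//   libtool1.so "lib with spaces.so" esc\ aped.so
// parses to {"libtool1.so", "lib with spaces.so", "esc aped.so"}.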
static std::vector<std::string> parse_tool_names(std::string tool_names) {
  std::vector<std::string> names;
  std::string name = "";
  bool quoted = false;
  while (tool_names.size() != 0) {
    auto index = tool_names.find_first_of(" \"\\");
    if (index == std::string::npos) {
      name += tool_names;
      break;
    }
    switch (tool_names[index]) {
      case ' ': {
        if (!quoted) {
          name += tool_names.substr(0, index);
          tool_names.erase(0, index + 1);
          names.push_back(name);
          name = "";
        } else {
          name += tool_names.substr(0, index + 1);
          tool_names.erase(0, index + 1);
        }
        break;
      }
      case '\"': {
        if (quoted) {
          quoted = false;
          name += tool_names.substr(0, index);
          tool_names.erase(0, index + 1);
          names.push_back(name);
          name = "";
        } else {
          quoted = true;
          tool_names.erase(0, index + 1);
        }
        break;
      }
      case '\\': {
        if (tool_names.size() > index + 1) {
          name += tool_names.substr(0, index) + tool_names[index + 1];
          tool_names.erase(0, index + 2);
        }
        break;
      }
    }  // end switch
  }    // end while

  if (name != "") names.push_back(name);
  return names;
}

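// A tool library exposes up to four optional exports, looked up by name in
// LoadTools below. A minimal sketch of a conforming tool (illustrative only;
// what each hook does is up to the tool):
//
//   extern "C" bool OnLoad(::HsaApiTable* table, uint64_t runtime_version,
//                          uint64_t failed_tool_count,
//                          const char* const* failed_tool_names) {
//     // Intercept APIs by overwriting entries in 'table'.
//     return true;  // Returning false marks this tool as failed to load.
//   }
//   extern "C" core::Agent* WrapAgent(core::Agent* agent);  // May return NULL.
//   extern "C" void AddAgent(core::Runtime* runtime);
//   extern "C" void OnUnload();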
void Runtime::LoadTools() {
  typedef bool (*tool_init_t)(::HsaApiTable*, uint64_t, uint64_t,
                              const char* const*);
  typedef Agent* (*tool_wrap_t)(Agent*);
  typedef void (*tool_add_t)(Runtime*);

  // Load tool libs
  std::string tool_names = flag_.tools_lib_names();
  if (tool_names != "") {
    std::vector<std::string> names = parse_tool_names(tool_names);
    std::vector<const char*> failed;
    for (auto& name : names) {
      os::LibHandle tool = os::LoadLib(name);

      if (tool != NULL) {
        tool_libs_.push_back(tool);

        tool_init_t ld;
        ld = (tool_init_t)os::GetExportAddress(tool, "OnLoad");
        if (ld) {
          if (!ld(&hsa_api_table_.hsa_api,
                  hsa_api_table_.hsa_api.version.major_id, failed.size(),
                  failed.data())) {
            failed.push_back(name.c_str());
            os::CloseLib(tool);
            continue;
          }
        }

        tool_wrap_t wrap;
        wrap = (tool_wrap_t)os::GetExportAddress(tool, "WrapAgent");
        if (wrap) {
          std::vector<core::Agent*>* agent_lists[2] = {&cpu_agents_,
                                                       &gpu_agents_};
          for (std::vector<core::Agent*>* agent_list : agent_lists) {
            for (size_t agent_idx = 0; agent_idx < agent_list->size();
                 ++agent_idx) {
              Agent* agent = wrap(agent_list->at(agent_idx));
              if (agent != NULL) {
                assert(agent->IsValid() &&
                       "Agent returned from WrapAgent is not valid");
                agent_list->at(agent_idx) = agent;
              }
            }
          }
        }

        tool_add_t add;
        add = (tool_add_t)os::GetExportAddress(tool, "AddAgent");
        if (add) add(this);
      } else {
        if (flag().report_tool_load_failures())
          fprintf(stderr, "Tool lib \"%s\" failed to load.\n", name.c_str());
      }
    }
  }
}

void Runtime::UnloadTools() {
  typedef void (*tool_unload_t)();
  for (size_t i = tool_libs_.size(); i != 0; i--) {
    tool_unload_t unld;
    unld = (tool_unload_t)os::GetExportAddress(tool_libs_[i - 1], "OnUnload");
    if (unld) unld();
  }

  // Reset the API table in case some tool doesn't clean up properly.
  hsa_api_table_.Reset();
}

void Runtime::CloseTools() {
  // Due to a valgrind bug, the runtime cannot dlclose tool libraries; see:
  // http://valgrind.org/docs/manual/faq.html#faq.unhelpful
  if (!flag_.running_valgrind()) {
    for (auto& lib : tool_libs_) os::CloseLib(lib);
  }
  tool_libs_.clear();
}

void Runtime::AsyncEventsControl::Shutdown() {
  if (async_events_thread_ != NULL) {
    exit = true;
    hsa_signal_handle(wake)->StoreRelaxed(1);
    os::WaitForThread(async_events_thread_);
    os::CloseThread(async_events_thread_);
    async_events_thread_ = NULL;
    HSA::hsa_signal_destroy(wake);
  }
}

void Runtime::AsyncEvents::PushBack(hsa_signal_t signal,
                                    hsa_signal_condition_t cond,
                                    hsa_signal_value_t value,
                                    hsa_amd_signal_handler handler, void* arg) {
  signal_.push_back(signal);
  cond_.push_back(cond);
  value_.push_back(value);
  handler_.push_back(handler);
  arg_.push_back(arg);
}

void Runtime::AsyncEvents::CopyIndex(size_t dst, size_t src) {
  signal_[dst] = signal_[src];
  cond_[dst] = cond_[src];
  value_[dst] = value_[src];
  handler_[dst] = handler_[src];
  arg_[dst] = arg_[src];
}

size_t Runtime::AsyncEvents::Size() { return signal_.size(); }

void Runtime::AsyncEvents::PopBack() {
  signal_.pop_back();
  cond_.pop_back();
  value_.pop_back();
  handler_.pop_back();
  arg_.pop_back();
}

void Runtime::AsyncEvents::Clear() {
  signal_.clear();
  cond_.clear();
  value_.clear();
  handler_.clear();
  arg_.clear();
}

hsa_status_t Runtime::SetCustomSystemEventHandler(
    hsa_amd_system_event_callback_t callback, void* data) {
  if (system_event_handler_) {
    return HSA_STATUS_ERROR;
  } else {
    system_event_handler_ = callback;
    system_event_handler_user_data_ = data;
    return HSA_STATUS_SUCCESS;
  }
}

}  // namespace core