////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
//                 AMD Research and AMD HSA Software Development
//
//                 Advanced Micro Devices, Inc.
//
//                 www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
//  - Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimers.
//  - Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimers in
//    the documentation and/or other materials provided with the distribution.
//  - Neither the names of Advanced Micro Devices, Inc,
//    nor the names of its contributors may be used to endorse or promote
//    products derived from this Software without specific prior written
//    permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////

#include "core/inc/runtime.h"

#include <algorithm>
#include <atomic>
#include <cstring>
#include <string>
#include <thread>
#include <vector>

#include "core/common/shared.h"
#include "core/inc/hsa_ext_interface.h"
#include "core/inc/amd_cpu_agent.h"
#include "core/inc/amd_gpu_agent.h"
#include "core/inc/amd_memory_region.h"
#include "core/inc/amd_topology.h"
#include "core/inc/signal.h"
#include "core/inc/interrupt_signal.h"
#include "core/inc/hsa_ext_amd_impl.h"
#include "core/inc/hsa_api_trace_int.h"
#include "core/util/os.h"
#include "inc/hsa_ven_amd_aqlprofile.h"

#define HSA_VERSION_MAJOR 1
#define HSA_VERSION_MINOR 1

const char rocrbuildid[] __attribute__((used)) = "ROCR BUILD ID: " STRING(ROCR_BUILD_ID);

namespace core {
bool g_use_interrupt_wait = true;

Runtime* Runtime::runtime_singleton_ = NULL;

KernelMutex Runtime::bootstrap_lock_;

static bool loaded = true;

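// Note: cleanup_at_unload_ (below) is a file-scope static whose destructor runs
// during static destruction at process exit. It deletes the singleton if the
// runtime is no longer open and clears `loaded`, so any HSA call made after
// that point fails gracefully instead of touching torn-down state.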
class RuntimeCleanup {
 public:
  ~RuntimeCleanup() {
    if (!Runtime::IsOpen()) {
      delete Runtime::runtime_singleton_;
    }

    loaded = false;
  }
};

static RuntimeCleanup cleanup_at_unload_;

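// Acquire/Release reference-count the runtime singleton: the first Acquire
// constructs the singleton and calls Load(), and the last Release calls
// Unload(). Both are serialized against each other through kernel_lock_.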
hsa_status_t Runtime::Acquire() {
  // Check to see if HSA has been cleaned up (process exit)
  if (!loaded) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;

  // Handle initialization races
  ScopedAcquire<KernelMutex> boot(&bootstrap_lock_);

  if (runtime_singleton_ == NULL) {
    runtime_singleton_ = new Runtime();
  }

  // Serialize with release
  ScopedAcquire<KernelMutex> lock(&runtime_singleton_->kernel_lock_);

  if (runtime_singleton_->ref_count_ == INT32_MAX) {
    return HSA_STATUS_ERROR_REFCOUNT_OVERFLOW;
  }

  runtime_singleton_->ref_count_++;
  MAKE_NAMED_SCOPE_GUARD(refGuard, [&]() { runtime_singleton_->ref_count_--; });

  if (runtime_singleton_->ref_count_ == 1) {
    hsa_status_t status = runtime_singleton_->Load();

    if (status != HSA_STATUS_SUCCESS) {
      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
    }
  }

  refGuard.Dismiss();
  return HSA_STATUS_SUCCESS;
}

hsa_status_t Runtime::Release() {
  ScopedAcquire<KernelMutex> lock(&kernel_lock_);
  if (ref_count_ == 0) {
    return HSA_STATUS_ERROR_NOT_INITIALIZED;
  }

  if (ref_count_ == 1) {
    // Release all registered memory, then unload backends
    Unload();
  }

  ref_count_--;

  return HSA_STATUS_SUCCESS;
}

bool Runtime::IsOpen() {
  return (Runtime::runtime_singleton_ != NULL) &&
         (Runtime::runtime_singleton_->ref_count_ != 0);
}

void Runtime::RegisterAgent(Agent* agent) {
  // Record the agent in the node-to-agent reverse lookup table.
  agents_by_node_[agent->node_id()].push_back(agent);

  // Process agent as a cpu or gpu device.
  if (agent->device_type() == Agent::DeviceType::kAmdCpuDevice) {
    cpu_agents_.push_back(agent);

    // Add cpu regions to the system region list.
    for (const core::MemoryRegion* region : agent->regions()) {
      if (region->fine_grain()) {
        system_regions_fine_.push_back(region);
      } else {
        system_regions_coarse_.push_back(region);
      }
    }

    assert(system_regions_fine_.size() > 0);

    // Initialize the default fine-grain system region allocator using the
    // fine-grain system region of the first discovered CPU agent.
    if (cpu_agents_.size() == 1) {
      // Might need memory pooling to cover allocations that
      // require less than 4096 bytes.
      system_allocator_ =
          [&](size_t size, size_t alignment,
              MemoryRegion::AllocateFlags alloc_flags) -> void* {
            assert(alignment <= 4096);
            void* ptr = NULL;
            return (HSA_STATUS_SUCCESS ==
                    core::Runtime::runtime_singleton_->AllocateMemory(
                        system_regions_fine_[0], size, alloc_flags, &ptr))
                       ? ptr
                       : NULL;
          };

      system_deallocator_ =
          [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); };

      BaseShared::SetAllocateAndFree(system_allocator_, system_deallocator_);
    }

    // Set up the system clock frequency the first time.
    if (sys_clock_freq_ == 0) {
      // Cache system clock frequency
      HsaClockCounters clocks;
      hsaKmtGetClockCounters(0, &clocks);
      sys_clock_freq_ = clocks.SystemClockFrequencyHz;
    }
  } else if (agent->device_type() == Agent::DeviceType::kAmdGpuDevice) {
    gpu_agents_.push_back(agent);

    gpu_ids_.push_back(agent->node_id());

    // Assign the first discovered gpu agent as the blit agent that will
    // provide DMA operations for hsa_memory_copy.
    if (blit_agent_ == NULL) {
      blit_agent_ = agent;

      // Query the start and end address of the SVM address space in this
      // platform.
      if (reinterpret_cast<amd::GpuAgentInt*>(blit_agent_)->profile() ==
          HSA_PROFILE_BASE) {
        std::vector<const core::MemoryRegion*>::const_iterator it =
            std::find_if(blit_agent_->regions().begin(),
                         blit_agent_->regions().end(),
                         [](const core::MemoryRegion* region) {
              return (
                  reinterpret_cast<const amd::MemoryRegion*>(region)->IsSvm());
            });

        assert(it != blit_agent_->regions().end());

        const amd::MemoryRegion* svm_region =
            reinterpret_cast<const amd::MemoryRegion*>(*it);

        start_svm_address_ =
            static_cast<uintptr_t>(svm_region->GetBaseAddress());
        end_svm_address_ = start_svm_address_ + svm_region->GetPhysicalSize();

        // Bind VM fault handler when we detect the first GPU agent.
        // TODO: validate if it works on APU.
        BindVmFaultHandler();
      } else {
        start_svm_address_ = 0;
        end_svm_address_ = os::GetUserModeVirtualMemoryBase() +
                           os::GetUserModeVirtualMemorySize();
      }
    }
  }
}

void Runtime::DestroyAgents() {
  agents_by_node_.clear();

  std::for_each(gpu_agents_.begin(), gpu_agents_.end(), DeleteObject());
  gpu_agents_.clear();

  gpu_ids_.clear();

  std::for_each(cpu_agents_.begin(), cpu_agents_.end(), DeleteObject());
  cpu_agents_.clear();

  blit_agent_ = NULL;

  system_regions_fine_.clear();
  system_regions_coarse_.clear();
}

void Runtime::SetLinkCount(size_t num_link) {
  const size_t last_index = GetIndexLinkInfo(0, num_link);
  link_matrix_.resize(last_index);

  memset(&link_matrix_[0], 0,
         link_matrix_.size() * sizeof(hsa_amd_memory_pool_link_info_t));
}

void Runtime::RegisterLinkInfo(uint32_t node_id_from, uint32_t node_id_to,
                               uint32_t num_hop,
                               hsa_amd_memory_pool_link_info_t& link_info) {
  const uint32_t idx = GetIndexLinkInfo(node_id_from, node_id_to);
  link_matrix_[idx].num_hop = num_hop;
  link_matrix_[idx].info = link_info;

  // Limit the number of hops to 1 since the runtime does not have enough
  // information to share with the user about each hop.
  link_matrix_[idx].num_hop = std::min(link_matrix_[idx].num_hop, 1U);
}

const Runtime::LinkInfo Runtime::GetLinkInfo(uint32_t node_id_from,
                                             uint32_t node_id_to) {
  return (node_id_from != node_id_to)
             ? link_matrix_[GetIndexLinkInfo(node_id_from, node_id_to)]
             : LinkInfo();  // No link.
}

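// GetIndexLinkInfo maps an unordered node pair onto a packed lower-triangular
// array: with M = max(from, to) - 1 and m = min(from, to), the entry lives at
// index M*(M+1)/2 + m. For example, pairs (0,1), (0,2) and (1,2) map to
// indices 0, 1 and 2, so SetLinkCount(n) sizes the matrix to n*(n-1)/2.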
uint32_t Runtime::GetIndexLinkInfo(uint32_t node_id_from, uint32_t node_id_to) {
  const uint32_t node_id_max = std::max(node_id_from, node_id_to) - 1;
  const uint32_t node_id_min = std::min(node_id_from, node_id_to);
  return ((node_id_max * (node_id_max + 1) / 2) + node_id_min);
}

hsa_status_t Runtime::IterateAgent(hsa_status_t (*callback)(hsa_agent_t agent,
                                                            void* data),
                                   void* data) {
  AMD::callback_t<decltype(callback)> call(callback);

  std::vector<core::Agent*>* agent_lists[2] = {&cpu_agents_, &gpu_agents_};
  for (std::vector<core::Agent*>* agent_list : agent_lists) {
    for (size_t i = 0; i < agent_list->size(); ++i) {
      hsa_agent_t agent = Agent::Convert(agent_list->at(i));
      hsa_status_t status = call(agent, data);

      if (status != HSA_STATUS_SUCCESS) {
        return status;
      }
    }
  }

  return HSA_STATUS_SUCCESS;
}

hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size,
                                     MemoryRegion::AllocateFlags alloc_flags,
                                     void** address) {
  ScopedAcquire<KernelMutex> lock(&memory_lock_);
  hsa_status_t status = region->Allocate(size, alloc_flags, address);

  // Track the allocation result so that it can be freed properly.
  if (status == HSA_STATUS_SUCCESS) {
    allocation_map_[*address] = AllocationRegion(region, size);
  }

  return status;
}

hsa_status_t Runtime::FreeMemory(void* ptr) {
  if (ptr == nullptr) {
    return HSA_STATUS_SUCCESS;
  }

  const MemoryRegion* region = nullptr;
  size_t size = 0;
  ScopedAcquire<KernelMutex> lock(&memory_lock_);

  std::map<const void*, AllocationRegion>::const_iterator it = allocation_map_.find(ptr);

  if (it == allocation_map_.end()) {
    assert(false && "Can't find address in allocation map");
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }
  region = it->second.region;
  size = it->second.size;

  // Imported fragments can't be released with FreeMemory.
  if (region == nullptr) {
    assert(false && "Can't release imported memory with free.");
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  allocation_map_.erase(it);

  return region->Free(ptr, size);
}

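// CopyMemory picks a path from the pointer values alone: on full-profile
// systems everything below end_svm_address_ is treated as host-accessible and
// copied with memmove; on base profile, addresses outside the GPU SVM aperture
// are host memory, and a host/gpuvm mix goes through CopyMemoryHostAlloc,
// which pins the host side before issuing the blit agent's DMA copy.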
hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) {
  assert(dst != NULL && src != NULL && size != 0);

  bool is_src_system = false;
  bool is_dst_system = false;
  const uintptr_t src_uptr = reinterpret_cast<uintptr_t>(src);
  const uintptr_t dst_uptr = reinterpret_cast<uintptr_t>(dst);

  if ((reinterpret_cast<amd::GpuAgentInt*>(blit_agent_)->profile() ==
       HSA_PROFILE_FULL)) {
    is_src_system = (src_uptr < end_svm_address_);
    is_dst_system = (dst_uptr < end_svm_address_);
  } else {
    is_src_system =
        ((src_uptr < start_svm_address_) || (src_uptr >= end_svm_address_));
    is_dst_system =
        ((dst_uptr < start_svm_address_) || (dst_uptr >= end_svm_address_));

    if ((is_src_system && !is_dst_system) ||
        (!is_src_system && is_dst_system)) {
      // Use staging buffer or pin if either src or dst is gpuvm and the other
      // is system memory allocated via OS or C/C++ allocator.
      return CopyMemoryHostAlloc(dst, src, size, is_dst_system);
    }
  }

  if (is_src_system && is_dst_system) {
    memmove(dst, src, size);
    return HSA_STATUS_SUCCESS;
  }

  return blit_agent_->DmaCopy(dst, src, size);
}

hsa_status_t Runtime::CopyMemoryHostAlloc(void* dst, const void* src,
                                          size_t size, bool dst_malloc) {
  void* usrptr = (dst_malloc) ? dst : const_cast<void*>(src);
  void* agent_ptr = NULL;

  hsa_agent_t blit_agent = core::Agent::Convert(blit_agent_);

  const amd::MemoryRegion* system_region =
      reinterpret_cast<const amd::MemoryRegion*>(system_regions_fine_[0]);
  hsa_status_t stat =
      system_region->Lock(1, &blit_agent, usrptr, size, &agent_ptr);

  if (stat != HSA_STATUS_SUCCESS) {
    return stat;
  }

  stat = blit_agent_->DmaCopy((dst_malloc) ? agent_ptr : dst,
                              (dst_malloc) ? src : agent_ptr, size);

  system_region->Unlock(usrptr);

  return stat;
}

hsa_status_t Runtime::CopyMemory(void* dst, core::Agent& dst_agent,
                                 const void* src, core::Agent& src_agent,
                                 size_t size,
                                 std::vector<core::Signal*>& dep_signals,
                                 core::Signal& completion_signal) {
  const bool dst_gpu =
      (dst_agent.device_type() == core::Agent::DeviceType::kAmdGpuDevice);
  const bool src_gpu =
      (src_agent.device_type() == core::Agent::DeviceType::kAmdGpuDevice);
  if (dst_gpu || src_gpu) {
    core::Agent& copy_agent = (src_gpu) ? src_agent : dst_agent;
    return copy_agent.DmaCopy(dst, dst_agent, src, src_agent, size, dep_signals,
                              completion_signal);
  }

  // For cpu to cpu, fire and forget a copy thread.
  const bool profiling_enabled =
      (dst_agent.profiling_enabled() || src_agent.profiling_enabled());
  std::thread(
      [](void* dst, const void* src, size_t size,
         std::vector<core::Signal*> dep_signals,
         core::Signal* completion_signal, bool profiling_enabled) {

        for (core::Signal* dep : dep_signals) {
          dep->WaitRelaxed(HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                           HSA_WAIT_STATE_BLOCKED);
        }

        if (profiling_enabled) {
          HsaClockCounters clocks = {0};
          core::Runtime::runtime_singleton_->GetSystemInfo(
              HSA_SYSTEM_INFO_TIMESTAMP, reinterpret_cast<void*>(&clocks));
          completion_signal->signal_.start_ts = clocks.SystemClockCounter;
        }

        memcpy(dst, src, size);

        if (profiling_enabled) {
          HsaClockCounters clocks = {0};
          core::Runtime::runtime_singleton_->GetSystemInfo(
              HSA_SYSTEM_INFO_TIMESTAMP, reinterpret_cast<void*>(&clocks));
          completion_signal->signal_.end_ts = clocks.SystemClockCounter;
        }

        completion_signal->SubRelease(1);
      },
      dst, src, size, dep_signals, &completion_signal,
      profiling_enabled).detach();

  return HSA_STATUS_SUCCESS;
}

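// FillMemory resolves the pointer with PtrInfo and prefers a GPU DmaFill when
// the whole [ptr, ptr + count*4) range is covered by a GPU-mapped agent
// address; otherwise, if the range is host-accessible, it falls back to a host
// memset. Note that count is in 32-bit elements, not bytes.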
hsa_status_t Runtime::FillMemory(void* ptr, uint32_t value, size_t count) {
  // Choose blit agent from pointer info
  hsa_amd_pointer_info_t info;
  uint32_t agent_count;
  hsa_agent_t* accessible = nullptr;
  info.size = sizeof(info);
  MAKE_SCOPE_GUARD([&]() { free(accessible); });
  hsa_status_t err = PtrInfo(ptr, &info, malloc, &agent_count, &accessible);
  if (err != HSA_STATUS_SUCCESS) return err;

  ptrdiff_t endPtr = (ptrdiff_t)ptr + count * sizeof(uint32_t);

  // Check for GPU fill
  // Selects GPU fill for SVM and Locked allocations if a GPU address is given and is mapped.
  if (info.agentBaseAddress <= ptr &&
      endPtr <= (ptrdiff_t)info.agentBaseAddress + info.sizeInBytes) {
    core::Agent* blit_agent = core::Agent::Convert(info.agentOwner);
    if (blit_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice) {
      blit_agent = nullptr;
      for (uint32_t i = 0; i < agent_count; i++) {
        if (core::Agent::Convert(accessible[i])->device_type() ==
            core::Agent::DeviceType::kAmdGpuDevice) {
          blit_agent = core::Agent::Convert(accessible[i]);
          break;
        }
      }
    }
    if (blit_agent) return blit_agent->DmaFill(ptr, value, count);
  }

  // Host and unmapped SVM addresses copy via host.
  if (info.hostBaseAddress <= ptr && endPtr <= (ptrdiff_t)info.hostBaseAddress + info.sizeInBytes) {
    memset(ptr, value, count * sizeof(uint32_t));
    return HSA_STATUS_SUCCESS;
  }

  return HSA_STATUS_ERROR_INVALID_ALLOCATION;
}

hsa_status_t Runtime::AllowAccess(uint32_t num_agents,
                                  const hsa_agent_t* agents, const void* ptr) {
  const amd::MemoryRegion* amd_region = NULL;
  size_t alloc_size = 0;

  {
    ScopedAcquire<KernelMutex> lock(&memory_lock_);

    std::map<const void*, AllocationRegion>::const_iterator it = allocation_map_.find(ptr);

    if (it == allocation_map_.end()) {
      return HSA_STATUS_ERROR;
    }

    amd_region = reinterpret_cast<const amd::MemoryRegion*>(it->second.region);
    alloc_size = it->second.size;
  }

  return amd_region->AllowAccess(num_agents, agents, ptr, alloc_size);
}

hsa_status_t Runtime::GetSystemInfo(hsa_system_info_t attribute, void* value) {
  switch (attribute) {
    case HSA_SYSTEM_INFO_VERSION_MAJOR:
      *((uint16_t*)value) = HSA_VERSION_MAJOR;
      break;
    case HSA_SYSTEM_INFO_VERSION_MINOR:
      *((uint16_t*)value) = HSA_VERSION_MINOR;
      break;
    case HSA_SYSTEM_INFO_TIMESTAMP: {
      HsaClockCounters clocks;
      hsaKmtGetClockCounters(0, &clocks);
      *((uint64_t*)value) = clocks.SystemClockCounter;
      break;
    }
    case HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY: {
      assert(sys_clock_freq_ != 0 &&
             "Use of HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY before HSA "
             "initialization completes.");
      *(uint64_t*)value = sys_clock_freq_;
      break;
    }
    case HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT:
      *((uint64_t*)value) = 0xFFFFFFFFFFFFFFFF;
      break;
    case HSA_SYSTEM_INFO_ENDIANNESS:
#if defined(HSA_LITTLE_ENDIAN)
      *((hsa_endianness_t*)value) = HSA_ENDIANNESS_LITTLE;
#else
      *((hsa_endianness_t*)value) = HSA_ENDIANNESS_BIG;
#endif
      break;
    case HSA_SYSTEM_INFO_MACHINE_MODEL:
#if defined(HSA_LARGE_MODEL)
      *((hsa_machine_model_t*)value) = HSA_MACHINE_MODEL_LARGE;
#else
      *((hsa_machine_model_t*)value) = HSA_MACHINE_MODEL_SMALL;
#endif
      break;
    case HSA_SYSTEM_INFO_EXTENSIONS: {
      memset(value, 0, sizeof(uint8_t) * 128);

      auto setFlag = [&](uint32_t bit) {
        assert(bit < 128 * 8 && "Extension value exceeds extension bitmask");
        uint index = bit / 8;
        uint subBit = bit % 8;
        ((uint8_t*)value)[index] |= 1 << subBit;
      };

      if (hsa_internal_api_table_.finalizer_api.hsa_ext_program_finalize_fn != NULL) {
        setFlag(HSA_EXTENSION_FINALIZER);
      }

      if (hsa_internal_api_table_.image_api.hsa_ext_image_create_fn != NULL) {
        setFlag(HSA_EXTENSION_IMAGES);
      }

      if (os::LibHandle lib = os::LoadLib(kAqlProfileLib)) {
        os::CloseLib(lib);
        setFlag(HSA_EXTENSION_AMD_AQLPROFILE);
      }

      setFlag(HSA_EXTENSION_AMD_PROFILER);

      break;
    }
    default:
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }
  return HSA_STATUS_SUCCESS;
}

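// Async signal handlers all run on a single, lazily created event thread
// (AsyncEventsLoop below). Slot 0 of async_events_ is reserved for the wake
// control signal; new registrations are queued on new_async_events_ and the
// wake signal is raised so the waiting thread can pick them up.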
hsa_status_t Runtime::SetAsyncSignalHandler(hsa_signal_t signal,
                                            hsa_signal_condition_t cond,
                                            hsa_signal_value_t value,
                                            hsa_amd_signal_handler handler,
                                            void* arg) {
  // Indicate that this signal is in use.
  if (signal.handle != 0) hsa_signal_handle(signal)->Retain();

  ScopedAcquire<KernelMutex> scope_lock(&async_events_control_.lock);

  // Lazy initializer
  if (async_events_control_.async_events_thread_ == NULL) {
    // Create monitoring thread control signal
    auto err = HSA::hsa_signal_create(0, 0, NULL, &async_events_control_.wake);
    if (err != HSA_STATUS_SUCCESS) {
      assert(false && "Asynchronous events control signal creation error.");
      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
    }
    async_events_.PushBack(async_events_control_.wake, HSA_SIGNAL_CONDITION_NE,
                           0, NULL, NULL);

    // Start event monitoring thread
    async_events_control_.exit = false;
    async_events_control_.async_events_thread_ =
        os::CreateThread(AsyncEventsLoop, NULL);
    if (async_events_control_.async_events_thread_ == NULL) {
      assert(false && "Asynchronous events thread creation error.");
      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
    }
  }

  new_async_events_.PushBack(signal, cond, value, handler, arg);

  hsa_signal_handle(async_events_control_.wake)->StoreRelease(1);

  return HSA_STATUS_SUCCESS;
}

hsa_status_t Runtime::InteropMap(uint32_t num_agents, Agent** agents,
                                 int interop_handle, uint32_t flags,
                                 size_t* size, void** ptr,
                                 size_t* metadata_size, const void** metadata) {
  static const int tinyArraySize = 8;
  HsaGraphicsResourceInfo info;

  HSAuint32 short_nodes[tinyArraySize];
  HSAuint32* nodes = short_nodes;
  if (num_agents > tinyArraySize) {
    nodes = new HSAuint32[num_agents];
    if (nodes == NULL) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }
  MAKE_SCOPE_GUARD([&]() {
    if (num_agents > tinyArraySize) delete[] nodes;
  });

  for (uint32_t i = 0; i < num_agents; i++)
    agents[i]->GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_DRIVER_NODE_ID,
                       &nodes[i]);

  if (hsaKmtRegisterGraphicsHandleToNodes(interop_handle, &info, num_agents,
                                          nodes) != HSAKMT_STATUS_SUCCESS)
    return HSA_STATUS_ERROR;

  HSAuint64 altAddress;
  HsaMemMapFlags map_flags;
  map_flags.Value = 0;
  map_flags.ui32.PageSize = HSA_PAGE_SIZE_64KB;
  if (hsaKmtMapMemoryToGPUNodes(info.MemoryAddress, info.SizeInBytes,
                                &altAddress, map_flags, num_agents,
                                nodes) != HSAKMT_STATUS_SUCCESS) {
    map_flags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
    if (hsaKmtMapMemoryToGPUNodes(info.MemoryAddress, info.SizeInBytes, &altAddress, map_flags,
                                  num_agents, nodes) != HSAKMT_STATUS_SUCCESS) {
      hsaKmtDeregisterMemory(info.MemoryAddress);
      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
    }
  }

  if (metadata_size != NULL) *metadata_size = info.MetadataSizeInBytes;
  if (metadata != NULL) *metadata = info.Metadata;

  *size = info.SizeInBytes;
  *ptr = info.MemoryAddress;

  return HSA_STATUS_SUCCESS;
}

hsa_status_t Runtime::InteropUnmap(void* ptr) {
  if (hsaKmtUnmapMemoryToGPU(ptr) != HSAKMT_STATUS_SUCCESS)
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  if (hsaKmtDeregisterMemory(ptr) != HSAKMT_STATUS_SUCCESS)
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  return HSA_STATUS_SUCCESS;
}

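// PtrInfo combines two sources of truth: hsaKmtQueryPointerInfo for the
// thunk's view of the allocation, and allocation_map_ for runtime-managed
// fragments sub-allocated out of a larger block. For fragments, the returned
// base/size describe the fragment while block_info reports the enclosing
// block.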
hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*alloc)(size_t),
                              uint32_t* num_agents_accessible, hsa_agent_t** accessible,
                              PtrInfoBlockData* block_info) {
  static_assert(static_cast<int>(HSA_POINTER_UNKNOWN) == static_cast<int>(HSA_EXT_POINTER_TYPE_UNKNOWN),
                "Thunk pointer info mismatch");
  static_assert(static_cast<int>(HSA_POINTER_ALLOCATED) == static_cast<int>(HSA_EXT_POINTER_TYPE_HSA),
                "Thunk pointer info mismatch");
  static_assert(static_cast<int>(HSA_POINTER_REGISTERED_USER) == static_cast<int>(HSA_EXT_POINTER_TYPE_LOCKED),
                "Thunk pointer info mismatch");
  static_assert(static_cast<int>(HSA_POINTER_REGISTERED_GRAPHICS) == static_cast<int>(HSA_EXT_POINTER_TYPE_GRAPHICS),
                "Thunk pointer info mismatch");

  HsaPointerInfo thunkInfo;
  uint32_t* mappedNodes;

  hsa_amd_pointer_info_t retInfo;

  // Check that the output struct has an initialized size.
  if (info->size == 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  bool returnListData =
      ((alloc != nullptr) && (num_agents_accessible != nullptr) && (accessible != nullptr));

  {  // memory_lock protects access to the NMappedNodes array and fragment user data since these may
     // change with calls to memory APIs.
    ScopedAcquire<KernelMutex> lock(&memory_lock_);
    hsaKmtQueryPointerInfo(ptr, &thunkInfo);
    if (returnListData) {
      assert(thunkInfo.NMappedNodes <= agents_by_node_.size() &&
             "PointerInfo: Thunk returned more than all agents in NMappedNodes.");
      mappedNodes = (uint32_t*)alloca(thunkInfo.NMappedNodes * sizeof(uint32_t));
      memcpy(mappedNodes, thunkInfo.MappedNodes, thunkInfo.NMappedNodes * sizeof(uint32_t));
    }
    retInfo.type = (hsa_amd_pointer_type_t)thunkInfo.Type;
    retInfo.agentBaseAddress = reinterpret_cast<void*>(thunkInfo.GPUAddress);
    retInfo.hostBaseAddress = thunkInfo.CPUAddress;
    retInfo.sizeInBytes = thunkInfo.SizeInBytes;
    retInfo.userData = thunkInfo.UserData;
    if (block_info != nullptr) {
      // The only time host and agent ptr may be different is when the memory is lock memory (malloc
      // memory pinned for GPU access).  In this case there cannot be any suballocation so
      // block_info is redundant and unused.  Host address is returned since host address is used to
      // manipulate lock memory.  This protects future use of block_info with lock memory.
      block_info->base = retInfo.hostBaseAddress;
      block_info->length = retInfo.sizeInBytes;
    }
    if (retInfo.type == HSA_EXT_POINTER_TYPE_HSA) {
      auto fragment = allocation_map_.upper_bound(ptr);
      if (fragment != allocation_map_.begin()) {
        fragment--;
        if ((fragment->first <= ptr) &&
            (ptr < reinterpret_cast<const uint8_t*>(fragment->first) + fragment->second.size)) {
          // agent and host address must match here.  Only lock memory is allowed to have differing
          // addresses but lock memory has type HSA_EXT_POINTER_TYPE_LOCKED and cannot be
          // suballocated.
          retInfo.agentBaseAddress = const_cast<void*>(fragment->first);
          retInfo.hostBaseAddress = retInfo.agentBaseAddress;
          retInfo.sizeInBytes = fragment->second.size;
          retInfo.userData = fragment->second.user_ptr;
        }
      }
    }
  }  // end lock scope

  retInfo.size = Min(info->size, sizeof(hsa_amd_pointer_info_t));

  // Temp: work around thunk bug, IPC memory has garbage in Node.
  // retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle();
  auto it = agents_by_node_.find(thunkInfo.Node);
  if (it != agents_by_node_.end())
    retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle();
  else
    retInfo.agentOwner.handle = 0;

  memcpy(info, &retInfo, retInfo.size);

  if (returnListData) {
    uint32_t count = 0;
    for (HSAuint32 i = 0; i < thunkInfo.NMappedNodes; i++) {
      assert(mappedNodes[i] < agents_by_node_.size() &&
             "PointerInfo: Invalid node ID returned from thunk.");
      count += agents_by_node_[mappedNodes[i]].size();
    }

    AMD::callback_t<decltype(alloc)> Alloc(alloc);
    *accessible = (hsa_agent_t*)Alloc(sizeof(hsa_agent_t) * count);
    if ((*accessible) == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
    *num_agents_accessible = count;

    uint32_t index = 0;
    for (HSAuint32 i = 0; i < thunkInfo.NMappedNodes; i++) {
      auto& list = agents_by_node_[mappedNodes[i]];
      for (auto agent : list) {
        (*accessible)[index] = agent->public_handle();
        index++;
      }
    }
  }

  return HSA_STATUS_SUCCESS;
}

hsa_status_t Runtime::SetPtrInfoData(void* ptr, void* userptr) {
  {  // Use allocation map if possible to handle fragments.
    ScopedAcquire<KernelMutex> lock(&memory_lock_);
    const auto& it = allocation_map_.find(ptr);
    if (it != allocation_map_.end()) {
      it->second.user_ptr = userptr;
      return HSA_STATUS_SUCCESS;
    }
  }
  // Cover entries not in the allocation map (graphics, lock,...)
  if (hsaKmtSetMemoryUserData(ptr, userptr) == HSAKMT_STATUS_SUCCESS)
    return HSA_STATUS_SUCCESS;
  return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}

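// IPC fragment sharing: when the shared pointer is a sub-allocation of a
// larger 2MB-aligned block, the whole block is shared and the fragment's
// offset (in 4KB pages) is folded into handle->handle[6] together with a
// marker bit (0x80000000). IPCAttach strips those bits back out, maps the
// block, and records the fragment in allocation_map_ with a null region so
// IPCDetach can recognize it later.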
hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* handle) {
  static_assert(sizeof(hsa_amd_ipc_memory_t) == sizeof(HsaSharedMemoryHandle),
                "Thunk IPC mismatch.");
  // Reject sharing allocations larger than ~8TB due to thunk limitations.
  if (len > 0x7FFFFFFF000ull) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  // Check for fragment sharing.
  PtrInfoBlockData block;
  hsa_amd_pointer_info_t info;
  info.size = sizeof(info);
  if (PtrInfo(ptr, &info, nullptr, nullptr, nullptr, &block) != HSA_STATUS_SUCCESS)
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  if ((block.base != ptr) || (block.length != len)) {
    if (!IsMultipleOf(block.base, 2 * 1024 * 1024)) {
      assert(false && "Fragment's block not aligned to 2MB!");
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    }
    if (hsaKmtShareMemory(block.base, block.length, reinterpret_cast<HsaSharedMemoryHandle*>(
                                                        handle)) != HSAKMT_STATUS_SUCCESS)
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    uint32_t offset =
        (reinterpret_cast<uint8_t*>(ptr) - reinterpret_cast<uint8_t*>(block.base)) / 4096;
    // Holds size in (4K?) pages in thunk handle: Mark as a fragment and denote offset.
    handle->handle[6] |= 0x80000000 | offset;
  } else {
    if (hsaKmtShareMemory(ptr, len, reinterpret_cast<HsaSharedMemoryHandle*>(handle)) !=
        HSAKMT_STATUS_SUCCESS)
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }
  return HSA_STATUS_SUCCESS;
}

hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len, uint32_t num_agents,
                                Agent** agents, void** mapped_ptr) {
  static const int tinyArraySize = 8;
  void* importAddress;
  HSAuint64 importSize;
  HSAuint64 altAddress;

  hsa_amd_ipc_memory_t importHandle;
  importHandle = *handle;

  // Extract fragment info
  bool isFragment = false;
  uint32_t fragOffset = 0;
  auto fixFragment = [&]() {
    if (!isFragment) return;
    importAddress = reinterpret_cast<uint8_t*>(importAddress) + fragOffset;
    len = Min(len, importSize - fragOffset);
    ScopedAcquire<KernelMutex> lock(&memory_lock_);
    allocation_map_[importAddress] = AllocationRegion(nullptr, len);
  };

  if ((importHandle.handle[6] & 0x80000000) != 0) {
    isFragment = true;
    fragOffset = (importHandle.handle[6] & 0x1FF) * 4096;
    importHandle.handle[6] &= ~(0x80000000 | 0x1FF);
  }

  if (num_agents == 0) {
    if (hsaKmtRegisterSharedHandle(reinterpret_cast<const HsaSharedMemoryHandle*>(&importHandle),
                                   &importAddress, &importSize) != HSAKMT_STATUS_SUCCESS)
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    if (hsaKmtMapMemoryToGPU(importAddress, importSize, &altAddress) != HSAKMT_STATUS_SUCCESS) {
      hsaKmtDeregisterMemory(importAddress);
      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
    }
    fixFragment();
    *mapped_ptr = importAddress;
    return HSA_STATUS_SUCCESS;
  }

  HSAuint32* nodes = nullptr;
  if (num_agents > tinyArraySize)
    nodes = new HSAuint32[num_agents];
  else
    nodes = (HSAuint32*)alloca(sizeof(HSAuint32) * num_agents);
  if (nodes == NULL) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;

  MAKE_SCOPE_GUARD([&]() {
    if (num_agents > tinyArraySize) delete[] nodes;
  });

  for (uint32_t i = 0; i < num_agents; i++)
    agents[i]->GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_DRIVER_NODE_ID, &nodes[i]);

  if (hsaKmtRegisterSharedHandleToNodes(
          reinterpret_cast<const HsaSharedMemoryHandle*>(&importHandle), &importAddress,
          &importSize, num_agents, nodes) != HSAKMT_STATUS_SUCCESS)
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  HsaMemMapFlags map_flags;
  map_flags.Value = 0;
  map_flags.ui32.PageSize = HSA_PAGE_SIZE_64KB;
  if (hsaKmtMapMemoryToGPUNodes(importAddress, importSize, &altAddress, map_flags, num_agents,
                                nodes) != HSAKMT_STATUS_SUCCESS) {
    map_flags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
    if (hsaKmtMapMemoryToGPUNodes(importAddress, importSize, &altAddress, map_flags, num_agents,
                                  nodes) != HSAKMT_STATUS_SUCCESS) {
      hsaKmtDeregisterMemory(importAddress);
      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
    }
  }

  fixFragment();
  *mapped_ptr = importAddress;
  return HSA_STATUS_SUCCESS;
}

hsa_status_t Runtime::IPCDetach(void* ptr) {
  {  // Handle imported fragments.
    ScopedAcquire<KernelMutex> lock(&memory_lock_);
    const auto& it = allocation_map_.find(ptr);
    if (it != allocation_map_.end()) {
      if (it->second.region != nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
      allocation_map_.erase(it);
      lock.Release();  // Can't hold memory lock when using pointer info.

      PtrInfoBlockData block;
      hsa_amd_pointer_info_t info;
      info.size = sizeof(info);
      if (PtrInfo(ptr, &info, nullptr, nullptr, nullptr, &block) != HSA_STATUS_SUCCESS)
        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
      ptr = block.base;
    }
  }
  if (hsaKmtUnmapMemoryToGPU(ptr) != HSAKMT_STATUS_SUCCESS)
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  if (hsaKmtDeregisterMemory(ptr) != HSAKMT_STATUS_SUCCESS)
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  return HSA_STATUS_SUCCESS;
}

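// Event thread body: blocks in hsa_amd_signal_wait_any over every registered
// signal, dispatches the matching handler, drops entries whose handler returns
// false or whose signal has been destroyed, then merges in registrations that
// arrived on new_async_events_ while it was waiting. A null signal handle in
// the new list is treated as a plain function to invoke once.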
void Runtime::AsyncEventsLoop(void*) {
  auto& async_events_control_ = runtime_singleton_->async_events_control_;
  auto& async_events_ = runtime_singleton_->async_events_;
  auto& new_async_events_ = runtime_singleton_->new_async_events_;

  while (!async_events_control_.exit) {
    // Wait for a signal
    hsa_signal_value_t value;
    uint32_t index = AMD::hsa_amd_signal_wait_any(
        uint32_t(async_events_.Size()), &async_events_.signal_[0],
        &async_events_.cond_[0], &async_events_.value_[0], uint64_t(-1),
        HSA_WAIT_STATE_BLOCKED, &value);

    // Reset the control signal
    if (index == 0) {
      hsa_signal_handle(async_events_control_.wake)->StoreRelaxed(0);
    } else if (index != -1) {
      // No error or timeout occurred; process the handler
      assert(async_events_.handler_[index] != NULL);
      bool keep =
          async_events_.handler_[index](value, async_events_.arg_[index]);
      if (!keep) {
        hsa_signal_handle(async_events_.signal_[index])->Release();
        async_events_.CopyIndex(index, async_events_.Size() - 1);
        async_events_.PopBack();
      }
    }

    // Check for dead signals
    index = 0;
    while (index != async_events_.Size()) {
      if (!hsa_signal_handle(async_events_.signal_[index])->IsValid()) {
        hsa_signal_handle(async_events_.signal_[index])->Release();
        async_events_.CopyIndex(index, async_events_.Size() - 1);
        async_events_.PopBack();
        continue;
      }
      index++;
    }

    // Insert new signals and find plain functions
    typedef std::pair<void (*)(void*), void*> func_arg_t;
    std::vector<func_arg_t> functions;
    {
      ScopedAcquire<KernelMutex> scope_lock(&async_events_control_.lock);
      for (size_t i = 0; i < new_async_events_.Size(); i++) {
        if (new_async_events_.signal_[i].handle == 0) {
          functions.push_back(
              func_arg_t((void (*)(void*))new_async_events_.handler_[i],
                         new_async_events_.arg_[i]));
          continue;
        }
        async_events_.PushBack(
            new_async_events_.signal_[i], new_async_events_.cond_[i],
            new_async_events_.value_[i], new_async_events_.handler_[i],
            new_async_events_.arg_[i]);
      }
      new_async_events_.Clear();
    }

    // Call plain functions
    for (size_t i = 0; i < functions.size(); i++)
      functions[i].first(functions[i].second);
    functions.clear();
  }

  // Release wait count of all pending signals
  for (size_t i = 1; i < async_events_.Size(); i++)
    hsa_signal_handle(async_events_.signal_[i])->Release();
  async_events_.Clear();

  for (size_t i = 0; i < new_async_events_.Size(); i++)
    hsa_signal_handle(new_async_events_.signal_[i])->Release();
  new_async_events_.Clear();
}

void Runtime::BindVmFaultHandler() {
  if (core::g_use_interrupt_wait) {
    // Create memory event with manual reset to avoid a race condition
    // with the driver in case of multiple concurrent VM faults.
    vm_fault_event_ =
        core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_MEMORY, true);

    // Create an interrupt signal object to contain the memory event.
    // This signal object will be registered with the async handler global
    // thread.
    vm_fault_signal_ = new core::InterruptSignal(0, vm_fault_event_);

    if (!vm_fault_signal_->IsValid() || vm_fault_signal_->EopEvent() == NULL) {
      assert(false && "Failed on creating VM fault signal");
      return;
    }

    SetAsyncSignalHandler(core::Signal::Convert(vm_fault_signal_),
                          HSA_SIGNAL_CONDITION_NE, 0, VMFaultHandler,
                          reinterpret_cast<void*>(vm_fault_signal_));
  }
}

bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
  core::InterruptSignal* vm_fault_signal =
      reinterpret_cast<core::InterruptSignal*>(arg);

  assert(vm_fault_signal != NULL);

  if (vm_fault_signal == NULL) {
    return false;
  }

  HsaEvent* vm_fault_event = vm_fault_signal->EopEvent();

  HsaMemoryAccessFault& fault =
      vm_fault_event->EventData.EventData.MemoryAccessFault;

  hsa_status_t custom_handler_status = HSA_STATUS_ERROR;
  // If a custom handler is registered, pack the fault info and call the handler.
  if (runtime_singleton_->GetCustomSystemEventHandler()) {
    hsa_amd_gpu_memory_fault_info_t fault_info;

    // Find the faulting agent
    auto it = runtime_singleton_->agents_by_node_.find(fault.NodeId);
    assert(it != runtime_singleton_->agents_by_node_.end() && "Can't find faulty agent.");
    Agent* faulty_agent = it->second.front();
    fault_info.agent = Agent::Convert(faulty_agent);

    fault_info.virtual_address = fault.VirtualAddress;
    fault_info.fault_reason_mask = 0x00000000;
    if (fault.Failure.NotPresent == 1) {
      fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00000001;
    }
    if (fault.Failure.ReadOnly == 1) {
      fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00000010;
    }
    if (fault.Failure.NoExecute == 1) {
      fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00000100;
    }
    if (fault.Failure.GpuAccess == 1) {
      fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00001000;
    }
    if (fault.Failure.ECC == 1) {
      fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00010000;
    }
    if (fault.Failure.Imprecise == 1) {
      fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00100000;
    }
    hsa_amd_event_t memory_fault_event;
    memory_fault_event.event_type = GPU_MEMORY_FAULT_EVENT;
    memory_fault_event.memory_fault = fault_info;
    custom_handler_status = runtime_singleton_->GetCustomSystemEventHandler()(
        &memory_fault_event, runtime_singleton_->GetCustomSystemEventData());
  }

  // No custom VM fault handler registered or it failed.
  if (custom_handler_status != HSA_STATUS_SUCCESS) {
    if (runtime_singleton_->flag().enable_vm_fault_message()) {
      std::string reason = "";
      if (fault.Failure.NotPresent == 1) {
        reason += "Page not present or supervisor privilege";
      } else if (fault.Failure.ReadOnly == 1) {
        reason += "Write access to a read-only page";
      } else if (fault.Failure.NoExecute == 1) {
        reason += "Execute access to a page marked NX";
      } else if (fault.Failure.GpuAccess == 1) {
        reason += "Host access only";
      } else if (fault.Failure.ECC == 1) {
        reason += "ECC failure (if supported by HW)";
      } else {
        reason += "Unknown";
      }

      core::Agent* faultingAgent = runtime_singleton_->agents_by_node_[fault.NodeId][0];

      fprintf(
          stderr,
          "Memory access fault by GPU node-%u (Agent handle: %p) on address %p%s. Reason: %s.\n",
          fault.NodeId, reinterpret_cast<void*>(faultingAgent->public_handle().handle),
          reinterpret_cast<const void*>(fault.VirtualAddress),
          (fault.Failure.Imprecise == 1) ? "(may not be exact address)" : "", reason.c_str());

#ifndef NDEBUG
      runtime_singleton_->memory_lock_.Acquire();
      auto it = runtime_singleton_->allocation_map_.upper_bound(
          reinterpret_cast<void*>(fault.VirtualAddress));
      for (int i = 0; i < 2; i++) {
        if (it != runtime_singleton_->allocation_map_.begin()) it--;
      }
      fprintf(stderr, "Nearby memory map:\n");
      auto start = it;
      for (int i = 0; i < 3; i++) {
        if (it == runtime_singleton_->allocation_map_.end()) break;
        std::string kind = "Non-HSA";
        if (it->second.region != nullptr) {
          const amd::MemoryRegion* region =
              static_cast<const amd::MemoryRegion*>(it->second.region);
          if (region->IsSystem())
            kind = "System";
          else if (region->IsLocalMemory())
            kind = "VRAM";
          else if (region->IsScratch())
            kind = "Scratch";
          else if (region->IsLDS())
            kind = "LDS";
        }
        fprintf(stderr, "%p, 0x%lx, %s\n", it->first, it->second.size, kind.c_str());
        it++;
      }
      fprintf(stderr, "\n");
      it = start;
      runtime_singleton_->memory_lock_.Release();
      hsa_amd_pointer_info_t info;
      PtrInfoBlockData block;
      uint32_t count;
      hsa_agent_t* canAccess;
      info.size = sizeof(info);
      for (int i = 0; i < 3; i++) {
        if (it == runtime_singleton_->allocation_map_.end()) break;
        runtime_singleton_->PtrInfo(const_cast<void*>(it->first), &info, malloc, &count, &canAccess,
                                    &block);
        fprintf(stderr,
                "PtrInfo:\n\tAddress: %p-%p/%p-%p\n\tSize: 0x%lx\n\tType: %u\n\tOwner: %p\n",
                info.agentBaseAddress, (char*)info.agentBaseAddress + info.sizeInBytes,
                info.hostBaseAddress, (char*)info.hostBaseAddress + info.sizeInBytes,
                info.sizeInBytes, info.type, reinterpret_cast<void*>(info.agentOwner.handle));
        fprintf(stderr, "\tCanAccess: %u\n", count);
        for (int t = 0; t < count; t++)
          fprintf(stderr, "\t\t%p\n", reinterpret_cast<void*>(canAccess[t].handle));
        fprintf(stderr, "\tIn block: %p, 0x%lx\n", block.base, block.length);
        free(canAccess);
        it++;
      }
#endif  //! NDEBUG
    }
    assert(false && "GPU memory access fault.");
    std::abort();
  }
  // No need to keep the signal because we are done.
  return false;
}

Runtime::Runtime()
    : blit_agent_(NULL),
      sys_clock_freq_(0),
      vm_fault_event_(nullptr),
      vm_fault_signal_(nullptr),
      system_event_handler_user_data_(nullptr),
      ref_count_(0) {
  start_svm_address_ = 0;
#if defined(HSA_LARGE_MODEL)
  end_svm_address_ = UINT64_MAX;
#else
  end_svm_address_ = UINT32_MAX;
#endif
}

hsa_status_t Runtime::Load() {
  flag_.Refresh();

  g_use_interrupt_wait = flag_.enable_interrupt();

  if (!amd::Load()) {
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }

  loader_ = amd::hsa::loader::Loader::Create(&loader_context_);

  // Load extensions
  LoadExtensions();

  // Load tools libraries
  LoadTools();

  for (core::Agent* agent : gpu_agents_) {
    hsa_status_t status =
        reinterpret_cast<amd::GpuAgentInt*>(agent)->PostToolsInit();

    if (status != HSA_STATUS_SUCCESS) {
      return status;
    }
  }

  return HSA_STATUS_SUCCESS;
}

void Runtime::Unload() {
  UnloadTools();
  UnloadExtensions();

  amd::hsa::loader::Loader::Destroy(loader_);
  loader_ = nullptr;

  std::for_each(gpu_agents_.begin(), gpu_agents_.end(), DeleteObject());
  gpu_agents_.clear();

  async_events_control_.Shutdown();

  if (vm_fault_signal_ != nullptr) {
    vm_fault_signal_->DestroySignal();
    vm_fault_signal_ = nullptr;
  }
  core::InterruptSignal::DestroyEvent(vm_fault_event_);
  vm_fault_event_ = nullptr;

  DestroyAgents();

  CloseTools();

  amd::Unload();
}

void Runtime::LoadExtensions() {
// Load finalizer and image extension libraries
#ifdef HSA_LARGE_MODEL
  static const std::string kFinalizerLib[] = {"hsa-ext-finalize64.dll",
                                              "libhsa-ext-finalize64.so.1",
                                              "libhsa-ext-finalize64.so.1"};
  static const std::string kImageLib[] = {"hsa-ext-image64.dll",
                                          "libhsa-ext-image64.so.1",
                                          "libhsa-ext-image64.so.1"};
#else
  static const std::string kFinalizerLib[] = {"hsa-ext-finalize.dll",
                                              "libhsa-ext-finalize.so.1",
                                              "libhsa-ext-finalize.so.1"};
  static const std::string kImageLib[] = {"hsa-ext-image.dll",
                                          "libhsa-ext-image.so.1",
                                          "libhsa-ext-image.so.1"};
#endif

  // Update the HSA API table with handles to the Finalizer extension APIs.
  extensions_.LoadFinalizer(kFinalizerLib[os_index(os::current_os)]);
  hsa_api_table_.LinkExts(&extensions_.finalizer_api,
                          core::HsaApiTable::HSA_EXT_FINALIZER_API_TABLE_ID);

  // Update the HSA API table with handles to the Image extension APIs.
  extensions_.LoadImage(kImageLib[os_index(os::current_os)]);
  hsa_api_table_.LinkExts(&extensions_.image_api,
                          core::HsaApiTable::HSA_EXT_IMAGE_API_TABLE_ID);
}

void Runtime::UnloadExtensions() { extensions_.Unload(); }

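// Splits the tools library name string (see flag_.tools_lib_names()) into
// individual names: names are separated by spaces, double quotes group a name
// that contains spaces, and a backslash escapes the next character.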
static std::vector<std::string> parse_tool_names(std::string tool_names) {
  std::vector<std::string> names;
  std::string name = "";
  bool quoted = false;
  while (tool_names.size() != 0) {
    auto index = tool_names.find_first_of(" \"\\");
    if (index == std::string::npos) {
      name += tool_names;
      break;
    }
    switch (tool_names[index]) {
      case ' ': {
        if (!quoted) {
          name += tool_names.substr(0, index);
          tool_names.erase(0, index + 1);
          names.push_back(name);
          name = "";
        } else {
          name += tool_names.substr(0, index + 1);
          tool_names.erase(0, index + 1);
        }
        break;
      }
      case '\"': {
        if (quoted) {
          quoted = false;
          name += tool_names.substr(0, index);
          tool_names.erase(0, index + 1);
          names.push_back(name);
          name = "";
        } else {
          quoted = true;
          tool_names.erase(0, index + 1);
        }
        break;
      }
      case '\\': {
        if (tool_names.size() > index + 1) {
          name += tool_names.substr(0, index) + tool_names[index + 1];
          tool_names.erase(0, index + 2);
        }
        break;
      }
    }  // end switch
  }    // end while

  if (name != "") names.push_back(name);
  return names;
}

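// Tool libraries may export up to three entry points: OnLoad (receives the API
// table, its major version, and the list of previously failed tool names, and
// may veto loading by returning false), WrapAgent (may return a wrapper agent
// to substitute for each discovered agent), and AddAgent (called with the
// Runtime instance). OnUnload, if present, is called from UnloadTools.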
void Runtime::LoadTools() {
  typedef bool (*tool_init_t)(::HsaApiTable*, uint64_t, uint64_t,
                              const char* const*);
  typedef Agent* (*tool_wrap_t)(Agent*);
  typedef void (*tool_add_t)(Runtime*);

  // Load tool libs
  std::string tool_names = flag_.tools_lib_names();
  if (tool_names != "") {
    std::vector<std::string> names = parse_tool_names(tool_names);
    std::vector<const char*> failed;
    for (auto& name : names) {
      os::LibHandle tool = os::LoadLib(name);

      if (tool != NULL) {
        tool_libs_.push_back(tool);

        tool_init_t ld;
        ld = (tool_init_t)os::GetExportAddress(tool, "OnLoad");
        if (ld) {
          if (!ld(&hsa_api_table_.hsa_api,
                  hsa_api_table_.hsa_api.version.major_id,
                  failed.size(), &failed[0])) {
            failed.push_back(name.c_str());
            os::CloseLib(tool);
            continue;
          }
        }

        tool_wrap_t wrap;
        wrap = (tool_wrap_t)os::GetExportAddress(tool, "WrapAgent");
        if (wrap) {
          std::vector<core::Agent*>* agent_lists[2] = {&cpu_agents_,
                                                       &gpu_agents_};
          for (std::vector<core::Agent*>* agent_list : agent_lists) {
            for (size_t agent_idx = 0; agent_idx < agent_list->size();
                 ++agent_idx) {
              Agent* agent = wrap(agent_list->at(agent_idx));
              if (agent != NULL) {
                assert(agent->IsValid() &&
                       "Agent returned from WrapAgent is not valid");
                agent_list->at(agent_idx) = agent;
              }
            }
          }
        }

        tool_add_t add;
        add = (tool_add_t)os::GetExportAddress(tool, "AddAgent");
        if (add) add(this);
      }
      else {
        if (flag().report_tool_load_failures())
          fprintf(stderr, "Tool lib \"%s\" failed to load.\n", name.c_str());
      }
    }
  }
}

void Runtime::UnloadTools() {
  typedef void (*tool_unload_t)();
  for (size_t i = tool_libs_.size(); i != 0; i--) {
    tool_unload_t unld;
    unld = (tool_unload_t)os::GetExportAddress(tool_libs_[i - 1], "OnUnload");
    if (unld) unld();
  }

  // Reset API table in case some tool doesn't clean up properly
  hsa_api_table_.Reset();
}

void Runtime::CloseTools() {
  // Due to a valgrind bug, the runtime cannot dlclose the tool libraries; see:
  // http://valgrind.org/docs/manual/faq.html#faq.unhelpful
  if (!flag_.running_valgrind()) {
    for (auto& lib : tool_libs_) os::CloseLib(lib);
  }
  tool_libs_.clear();
}

void Runtime::AsyncEventsControl::Shutdown() {
  if (async_events_thread_ != NULL) {
    exit = true;
    hsa_signal_handle(wake)->StoreRelaxed(1);
    os::WaitForThread(async_events_thread_);
    os::CloseThread(async_events_thread_);
    async_events_thread_ = NULL;
    HSA::hsa_signal_destroy(wake);
  }
}

void Runtime::AsyncEvents::PushBack(hsa_signal_t signal,
                                    hsa_signal_condition_t cond,
                                    hsa_signal_value_t value,
                                    hsa_amd_signal_handler handler, void* arg) {
  signal_.push_back(signal);
  cond_.push_back(cond);
  value_.push_back(value);
  handler_.push_back(handler);
  arg_.push_back(arg);
}

void Runtime::AsyncEvents::CopyIndex(size_t dst, size_t src) {
  signal_[dst] = signal_[src];
  cond_[dst] = cond_[src];
  value_[dst] = value_[src];
  handler_[dst] = handler_[src];
  arg_[dst] = arg_[src];
}

size_t Runtime::AsyncEvents::Size() { return signal_.size(); }

void Runtime::AsyncEvents::PopBack() {
  signal_.pop_back();
  cond_.pop_back();
  value_.pop_back();
  handler_.pop_back();
  arg_.pop_back();
}

void Runtime::AsyncEvents::Clear() {
  signal_.clear();
  cond_.clear();
  value_.clear();
  handler_.clear();
  arg_.clear();
}

hsa_status_t Runtime::SetCustomSystemEventHandler(hsa_amd_system_event_callback_t callback,
                                                  void* data) {
  if (system_event_handler_) {
    return HSA_STATUS_ERROR;
  } else {
    system_event_handler_ = callback;
    system_event_handler_user_data_ = data;
    return HSA_STATUS_SUCCESS;
  }
}

}  // namespace core