1 ////////////////////////////////////////////////////////////////////////////////
2 //
3 // The University of Illinois/NCSA
4 // Open Source License (NCSA)
5 //
6 // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
7 //
8 // Developed by:
9 //
10 //                 AMD Research and AMD HSA Software Development
11 //
12 //                 Advanced Micro Devices, Inc.
13 //
14 //                 www.amd.com
15 //
16 // Permission is hereby granted, free of charge, to any person obtaining a copy
17 // of this software and associated documentation files (the "Software"), to
18 // deal with the Software without restriction, including without limitation
19 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
20 // and/or sell copies of the Software, and to permit persons to whom the
21 // Software is furnished to do so, subject to the following conditions:
22 //
23 //  - Redistributions of source code must retain the above copyright notice,
24 //    this list of conditions and the following disclaimers.
25 //  - Redistributions in binary form must reproduce the above copyright
26 //    notice, this list of conditions and the following disclaimers in
27 //    the documentation and/or other materials provided with the distribution.
28 //  - Neither the names of Advanced Micro Devices, Inc,
29 //    nor the names of its contributors may be used to endorse or promote
30 //    products derived from this Software without specific prior written
31 //    permission.
32 //
33 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
36 // THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
37 // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
38 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
39 // DEALINGS WITH THE SOFTWARE.
40 //
41 ////////////////////////////////////////////////////////////////////////////////
42 
43 #include "core/inc/amd_gpu_agent.h"
44 
45 #include <algorithm>
46 #include <atomic>
47 #include <cstring>
48 #include <climits>
#include <cstdio>
#include <mutex>
#include <sstream>
49 #include <map>
50 #include <string>
51 #include <vector>
52 #include <memory>
53 #include <utility>
54 
55 #include "core/inc/amd_aql_queue.h"
56 #include "core/inc/amd_blit_kernel.h"
57 #include "core/inc/amd_blit_sdma.h"
58 #include "core/inc/amd_gpu_pm4.h"
59 #include "core/inc/amd_gpu_shaders.h"
60 #include "core/inc/amd_memory_region.h"
61 #include "core/inc/interrupt_signal.h"
62 #include "core/inc/isa.h"
63 #include "core/inc/runtime.h"
64 #include "core/util/os.h"
65 #include "hsa_ext_image.h"
66 #include "inc/hsa_ven_amd_aqlprofile.h"
67 
68 // Size of scratch (private) segment pre-allocated per thread, in bytes.
69 #define DEFAULT_SCRATCH_BYTES_PER_THREAD 2048
70 
71 extern core::HsaApiTable hsa_internal_api_table_;
72 
73 namespace amd {
74 GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
75     : GpuAgentInt(node),
76       properties_(node_props),
77       current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT),
78       blits_(),
79       queues_(),
80       local_region_(NULL),
81       is_kv_device_(false),
82       trap_code_buf_(NULL),
83       trap_code_buf_size_(0),
84       memory_bus_width_(0),
85       memory_max_frequency_(0),
86       ape1_base_(0),
87       ape1_size_(0),
88       end_ts_pool_size_(0),
89       end_ts_pool_counter_(0),
90       end_ts_base_addr_(NULL) {
91   const bool is_apu_node = (properties_.NumCPUCores > 0);
92   profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE;
93 
94   HSAKMT_STATUS err = hsaKmtGetClockCounters(node_id(), &t0_);
95   t1_ = t0_;
96   assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtGetClockCounters error");
97 
98   // Set the instruction set architecture from the node properties (GPU devices only).
99   isa_ = (core::Isa*)core::IsaRegistry::GetIsa(core::Isa::Version(
100       node_props.EngineId.ui32.Major, node_props.EngineId.ui32.Minor,
101       node_props.EngineId.ui32.Stepping), profile_ == HSA_PROFILE_FULL);
102 
103   // Check whether the device is Kaveri (gfx700).
104   if (isa_->GetMajorVersion() == 7 && isa_->GetMinorVersion() == 0 &&
105       isa_->GetStepping() == 0) {
106     is_kv_device_ = true;
107   }
108 
109   current_coherency_type((profile_ == HSA_PROFILE_FULL)
110                              ? HSA_AMD_COHERENCY_TYPE_COHERENT
111                              : HSA_AMD_COHERENCY_TYPE_NONCOHERENT);
112 
113   max_queues_ = core::Runtime::runtime_singleton_->flag().max_queues();
114 #if !defined(HSA_LARGE_MODEL) || !defined(__linux__)
115   if (max_queues_ == 0) {
116     max_queues_ = 10;
117   }
118   max_queues_ = std::min(10U, max_queues_);
119 #else
120   if (max_queues_ == 0) {
121     max_queues_ = 128;
122   }
123   max_queues_ = std::min(128U, max_queues_);
124 #endif
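  // Note that the max_queues flag only lowers the platform cap chosen above
  // (10 queues for 32-bit or non-Linux builds, 128 otherwise); larger values
  // are clamped by std::min().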
125 
126   // Populate region list.
127   InitRegionList();
128 
129   // Populate cache list.
130   InitCacheList();
131 }
132 
133 GpuAgent::~GpuAgent() {
134   for (int i = 0; i < BlitCount; ++i) {
135     if (blits_[i] != nullptr) {
136       hsa_status_t status = blits_[i]->Destroy(*this);
137       assert(status == HSA_STATUS_SUCCESS);
138     }
139   }
140 
141   if (end_ts_base_addr_ != NULL) {
142     core::Runtime::runtime_singleton_->FreeMemory(end_ts_base_addr_);
143   }
144 
145   if (ape1_base_ != 0) {
146     _aligned_free(reinterpret_cast<void*>(ape1_base_));
147   }
148 
149   if (scratch_pool_.base() != NULL) {
150     hsaKmtFreeMemory(scratch_pool_.base(), scratch_pool_.size());
151   }
152 
153   if (trap_code_buf_ != NULL) {
154     ReleaseShader(trap_code_buf_, trap_code_buf_size_);
155   }
156 
157   std::for_each(regions_.begin(), regions_.end(), DeleteObject());
158   regions_.clear();
159 }
160 
161 void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name,
162                               AssembleTarget assemble_target, void*& code_buf,
163                               size_t& code_buf_size) const {
164   // Select precompiled shader implementation from name/target.
165   struct ASICShader {
166     const void* code;
167     size_t size;
168     int num_sgprs;
169     int num_vgprs;
170   };
171 
172   struct CompiledShader {
173     ASICShader compute_7;
174     ASICShader compute_8;
175     ASICShader compute_9;
176   };
177 
178   std::map<std::string, CompiledShader> compiled_shaders = {
179       {"TrapHandler",
180        {
181            {NULL, 0, 0, 0},
182            {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4},
183            {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4},
184        }},
185       {"CopyAligned",
186        {
187            {kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12},
188            {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},
189            {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},
190        }},
191       {"CopyMisaligned",
192        {
193            {kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10},
194            {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},
195            {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},
196        }},
197       {"Fill",
198        {
199            {kCodeFill7, sizeof(kCodeFill7), 19, 8},
200            {kCodeFill8, sizeof(kCodeFill8), 19, 8},
201            {kCodeFill8, sizeof(kCodeFill8), 19, 8},
202        }}};
203 
204   auto compiled_shader_it = compiled_shaders.find(func_name);
205   assert(compiled_shader_it != compiled_shaders.end() &&
206          "Precompiled shader unavailable");
207 
208   ASICShader* asic_shader = NULL;
209 
210   switch (isa_->GetMajorVersion()) {
211     case 7:
212       asic_shader = &compiled_shader_it->second.compute_7;
213       break;
214     case 8:
215       asic_shader = &compiled_shader_it->second.compute_8;
216       break;
217     case 9:
218       asic_shader = &compiled_shader_it->second.compute_9;
219       break;
220     default:
221       assert(false && "Precompiled shader unavailable for target");
222   }
223 
224   // Allocate a GPU-visible buffer for the shader.
225   size_t header_size =
226       (assemble_target == AssembleTarget::AQL ? sizeof(amd_kernel_code_t) : 0);
227   code_buf_size = AlignUp(header_size + asic_shader->size, 0x1000);
228 
229   code_buf = core::Runtime::runtime_singleton_->system_allocator()(
230       code_buf_size, 0x1000, core::MemoryRegion::AllocateExecutable);
231   assert(code_buf != NULL && "Code buffer allocation failed");
232 
233   memset(code_buf, 0, code_buf_size);
234 
235   // Populate optional code object header.
236   if (assemble_target == AssembleTarget::AQL) {
237     amd_kernel_code_t* header = reinterpret_cast<amd_kernel_code_t*>(code_buf);
238 
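    // Register counts are encoded in hardware granules: SGPRs in blocks of 8
    // and VGPRs in blocks of 4, stored as (blocks - 1). For example, the
    // CopyAligned shader's 32 SGPRs / 12 VGPRs encode as gran_sgprs = 3 and
    // gran_vgprs = 2.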
239     int gran_sgprs = std::max(0, (int(asic_shader->num_sgprs) - 1) / 8);
240     int gran_vgprs = std::max(0, (int(asic_shader->num_vgprs) - 1) / 4);
241 
242     header->kernel_code_entry_byte_offset = sizeof(amd_kernel_code_t);
243     AMD_HSA_BITS_SET(header->kernel_code_properties,
244                      AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR,
245                      1);
246     AMD_HSA_BITS_SET(header->compute_pgm_rsrc1,
247                      AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT,
248                      gran_sgprs);
249     AMD_HSA_BITS_SET(header->compute_pgm_rsrc1,
250                      AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT,
251                      gran_vgprs);
252     AMD_HSA_BITS_SET(header->compute_pgm_rsrc1,
253                      AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64, 3);
254     AMD_HSA_BITS_SET(header->compute_pgm_rsrc1,
255                      AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE, 1);
256     AMD_HSA_BITS_SET(header->compute_pgm_rsrc2,
257                      AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 2);
258     AMD_HSA_BITS_SET(header->compute_pgm_rsrc2,
259                      AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 1);
260   }
261 
262   // Copy shader code into the GPU-visible buffer.
263   memcpy((void*)(uintptr_t(code_buf) + header_size), asic_shader->code,
264          asic_shader->size);
265 }
266 
267 void GpuAgent::ReleaseShader(void* code_buf, size_t code_buf_size) const {
268   core::Runtime::runtime_singleton_->system_deallocator()(code_buf);
269 }
270 
271 void GpuAgent::InitRegionList() {
272   const bool is_apu_node = (properties_.NumCPUCores > 0);
273 
274   std::vector<HsaMemoryProperties> mem_props(properties_.NumMemoryBanks);
275   if (HSAKMT_STATUS_SUCCESS ==
276       hsaKmtGetNodeMemoryProperties(node_id(), properties_.NumMemoryBanks,
277                                     &mem_props[0])) {
278     for (uint32_t mem_idx = 0; mem_idx < properties_.NumMemoryBanks;
279          ++mem_idx) {
280       // Ignore the one(s) with unknown size.
281       if (mem_props[mem_idx].SizeInBytes == 0) {
282         continue;
283       }
284 
285       switch (mem_props[mem_idx].HeapType) {
286         case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE:
287         case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC:
288           if (!is_apu_node) {
289             mem_props[mem_idx].VirtualBaseAddress = 0;
290           }
291 
292           memory_bus_width_ = mem_props[mem_idx].Width;
293           memory_max_frequency_ = mem_props[mem_idx].MemoryClockMax;
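          // Fall through: frame buffer heaps are also registered as memory
          // regions by the case below.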
294         case HSA_HEAPTYPE_GPU_LDS:
295         case HSA_HEAPTYPE_GPU_SCRATCH:
296         case HSA_HEAPTYPE_DEVICE_SVM: {
297           MemoryRegion* region =
298               new MemoryRegion(false, false, this, mem_props[mem_idx]);
299 
300           regions_.push_back(region);
301 
302           if (region->IsLocalMemory()) {
303             local_region_ = region;
304           }
305           break;
306         }
307         case HSA_HEAPTYPE_SYSTEM:
308           if (is_apu_node) {
309             memory_bus_width_ = mem_props[mem_idx].Width;
310             memory_max_frequency_ = mem_props[mem_idx].MemoryClockMax;
311           }
312           break;
313         default:
314           continue;
315       }
316     }
317   }
318 }
319 
320 void GpuAgent::InitScratchPool() {
321   HsaMemFlags flags;
322   flags.Value = 0;
323   flags.ui32.Scratch = 1;
324   flags.ui32.HostAccess = 1;
325 
326   scratch_per_thread_ =
327       core::Runtime::runtime_singleton_->flag().scratch_mem_size();
328   if (scratch_per_thread_ == 0)
329     scratch_per_thread_ = DEFAULT_SCRATCH_BYTES_PER_THREAD;
330 
331   // Per-queue scratch length is: waves/CU (32) * threads/wave (64) * #CUs *
332   // bytes/thread; the pool covers that length for each of max_queues_ queues.
333   const uint32_t num_cu =
334       properties_.NumFComputeCores / properties_.NumSIMDPerCU;
335   queue_scratch_len_ = AlignUp(32 * 64 * num_cu * scratch_per_thread_, 65536);
336   size_t max_scratch_len = queue_scratch_len_ * max_queues_;
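  // For example, with the default 2 KiB per thread on a 64-CU device this
  // reserves 32 * 64 * 64 * 2048 bytes = 256 MiB of scratch per queue.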
337 
338 #if defined(HSA_LARGE_MODEL) && defined(__linux__)
339   // On 64-bit Linux, cap the scratch pool at the 4 GB scratch aperture.
340   if ((max_scratch_len == 0) || (max_scratch_len > 4294967296)) {
341     max_scratch_len = 4294967296;  // 4 GB aperture max
342   }
343 #endif
344 
345   void* scratch_base;
346   HSAKMT_STATUS err =
347       hsaKmtAllocMemory(node_id(), max_scratch_len, flags, &scratch_base);
348   assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtAllocMemory(Scratch) failed");
349   assert(IsMultipleOf(scratch_base, 0x1000) &&
350          "Scratch base is not page aligned!");
351 
352   scratch_pool_.~SmallHeap();
353   if (HSAKMT_STATUS_SUCCESS == err) {
354     new (&scratch_pool_) SmallHeap(scratch_base, max_scratch_len);
355   } else {
356     new (&scratch_pool_) SmallHeap();
357   }
358 }
359 
360 void GpuAgent::InitCacheList() {
361   // Get GPU cache information.
362   // Similar to getting CPU cache but here we use FComputeIdLo.
363   cache_props_.resize(properties_.NumCaches);
364   if (HSAKMT_STATUS_SUCCESS !=
365       hsaKmtGetNodeCacheProperties(node_id(), properties_.FComputeIdLo,
366                                    properties_.NumCaches, &cache_props_[0])) {
367     cache_props_.clear();
368   } else {
369     // Only store GPU D-cache.
370     for (size_t cache_id = 0; cache_id < cache_props_.size(); ++cache_id) {
371       const HsaCacheType type = cache_props_[cache_id].CacheType;
372       if (type.ui32.HSACU != 1 || type.ui32.Instruction == 1) {
373         cache_props_.erase(cache_props_.begin() + cache_id);
374         --cache_id;
375       }
376     }
377   }
378 
379   // Update cache objects
380   caches_.clear();
381   caches_.resize(cache_props_.size());
382   char name[64];
383   GetInfo(HSA_AGENT_INFO_NAME, name);
384   std::string deviceName = name;
385   for (size_t i = 0; i < caches_.size(); i++)
386     caches_[i].reset(new core::Cache(deviceName + " L" + std::to_string(cache_props_[i].CacheLevel),
387                                      cache_props_[i].CacheLevel, cache_props_[i].CacheSize));
388 }
389 
390 bool GpuAgent::InitEndTsPool() {
391   if (HSA_PROFILE_FULL == profile_) {
392     return true;
393   }
394 
395   if (end_ts_base_addr_.load(std::memory_order_acquire) != NULL) {
396     return true;
397   }
398 
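  // Double-checked initialization: the unlocked acquire-load above is the fast
  // path; the pool is rechecked and built at most once under blit_lock_, then
  // published with a release-store so other threads observe a fully
  // initialized buffer.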
399   ScopedAcquire<KernelMutex> lock(&blit_lock_);
400 
401   if (end_ts_base_addr_.load(std::memory_order_relaxed) != NULL) {
402     return true;
403   }
404 
405   end_ts_pool_size_ =
406       static_cast<uint32_t>((BlitSdmaBase::kQueueSize + BlitSdmaBase::kCopyPacketSize - 1) /
407                             (BlitSdmaBase::kCopyPacketSize));
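  // One timestamp slot per copy packet that can be resident in the SDMA queue,
  // i.e. a ceiling division of the queue size by the copy packet size.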
408 
409   // Allocate end timestamp object for both h2d and d2h DMA.
410   const size_t alloc_size = 2 * end_ts_pool_size_ * kTsSize;
411 
412   core::Runtime* runtime = core::Runtime::runtime_singleton_;
413 
414   uint64_t* buff = NULL;
415   if (HSA_STATUS_SUCCESS !=
416       runtime->AllocateMemory(local_region_, alloc_size,
417                               MemoryRegion::AllocateRestrict,
418                               reinterpret_cast<void**>(&buff))) {
419     return false;
420   }
421 
422   end_ts_base_addr_.store(buff, std::memory_order_release);
423 
424   return true;
425 }
426 
427 uint64_t* GpuAgent::ObtainEndTsObject() {
428   if (end_ts_base_addr_ == NULL) {
429     return NULL;
430   }
431 
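  // Hand out timestamp slots round-robin: a monotonically increasing counter
  // taken modulo the pool size selects the next slot, recycling entries once
  // the pool wraps.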
432   const uint32_t end_ts_index =
433       end_ts_pool_counter_.fetch_add(1U, std::memory_order_acq_rel) %
434       end_ts_pool_size_;
435   const static size_t kNumU64 = kTsSize / sizeof(uint64_t);
436   uint64_t* end_ts_addr = &end_ts_base_addr_[end_ts_index * kNumU64];
437   assert(IsMultipleOf(end_ts_addr, kTsSize));
438 
439   return end_ts_addr;
440 }
441 
442 hsa_status_t GpuAgent::IterateRegion(
443     hsa_status_t (*callback)(hsa_region_t region, void* data),
444     void* data) const {
445   return VisitRegion(true, callback, data);
446 }
447 
448 hsa_status_t GpuAgent::IterateCache(hsa_status_t (*callback)(hsa_cache_t cache, void* data),
449                                     void* data) const {
450   AMD::callback_t<decltype(callback)> call(callback);
451   for (size_t i = 0; i < caches_.size(); i++) {
452     hsa_status_t stat = call(core::Cache::Convert(caches_[i].get()), data);
453     if (stat != HSA_STATUS_SUCCESS) return stat;
454   }
455   return HSA_STATUS_SUCCESS;
456 }
457 
458 hsa_status_t GpuAgent::VisitRegion(bool include_peer,
459                                    hsa_status_t (*callback)(hsa_region_t region,
460                                                             void* data),
461                                    void* data) const {
462   if (include_peer) {
463     // Only expose system, local, and LDS memory of the blit agent.
464     if (this->node_id() ==
465         core::Runtime::runtime_singleton_->blit_agent()->node_id()) {
466       hsa_status_t stat = VisitRegion(regions_, callback, data);
467       if (stat != HSA_STATUS_SUCCESS) {
468         return stat;
469       }
470     }
471 
472     // Also expose system regions accessible by this agent.
473     hsa_status_t stat =
474         VisitRegion(core::Runtime::runtime_singleton_->system_regions_fine(),
475                     callback, data);
476     if (stat != HSA_STATUS_SUCCESS) {
477       return stat;
478     }
479 
480     return VisitRegion(
481         core::Runtime::runtime_singleton_->system_regions_coarse(), callback,
482         data);
483   }
484 
485   // Only expose system, local, and LDS memory of this agent.
486   return VisitRegion(regions_, callback, data);
487 }
488 
489 hsa_status_t GpuAgent::VisitRegion(
490     const std::vector<const core::MemoryRegion*>& regions,
491     hsa_status_t (*callback)(hsa_region_t region, void* data),
492     void* data) const {
493   AMD::callback_t<decltype(callback)> call(callback);
494   for (const core::MemoryRegion* region : regions) {
495     const amd::MemoryRegion* amd_region =
496         reinterpret_cast<const amd::MemoryRegion*>(region);
497 
498     // Only expose system, local, and LDS memory.
499     if (amd_region->IsSystem() || amd_region->IsLocalMemory() ||
500         amd_region->IsLDS()) {
501       hsa_region_t region_handle = core::MemoryRegion::Convert(region);
502       hsa_status_t status = call(region_handle, data);
503       if (status != HSA_STATUS_SUCCESS) {
504         return status;
505       }
506     }
507   }
508 
509   return HSA_STATUS_SUCCESS;
510 }
511 
512 core::Queue* GpuAgent::CreateInterceptibleQueue() {
513   // Disabled intercept of internal queues pending tools updates.
514   core::Queue* queue = nullptr;
515   QueueCreate(minAqlSize_, HSA_QUEUE_TYPE_MULTI, NULL, NULL, 0, 0, &queue);
516   return queue;
517 }
518 
519 core::Blit* GpuAgent::CreateBlitSdma(bool h2d) {
520   core::Blit* sdma;
521 
522   if (isa_->GetMajorVersion() <= 8) {
523     sdma = new BlitSdmaV2V3(h2d);
524   } else {
525     sdma = new BlitSdmaV4(h2d);
526   }
527 
528   if (sdma->Initialize(*this) != HSA_STATUS_SUCCESS) {
529     sdma->Destroy(*this);
530     delete sdma;
531     sdma = NULL;
532   }
533 
534   return sdma;
535 }
536 
537 core::Blit* GpuAgent::CreateBlitKernel(core::Queue* queue) {
538   BlitKernel* kernl = new BlitKernel(queue);
539 
540   if (kernl->Initialize(*this) != HSA_STATUS_SUCCESS) {
541     kernl->Destroy(*this);
542     delete kernl;
543     kernl = NULL;
544   }
545 
546   return kernl;
547 }
548 
549 void GpuAgent::InitDma() {
550   // Setup lazy init pointers on queues and blits.
551   auto queue_lambda = [this]() {
552     auto ret = CreateInterceptibleQueue();
553     if (ret == nullptr)
554       throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
555                                "Internal queue creation failed.");
556     return ret;
557   };
558   // Dedicated compute queue for host-to-device blits.
559   queues_[QueueBlitOnly].reset(queue_lambda);
560   // Share utility queue with device-to-host blits.
561   queues_[QueueUtility].reset(queue_lambda);
562 
563   // Decide which engine to use for blits.
564   auto blit_lambda = [this](bool h2d, lazy_ptr<core::Queue>& queue) {
565     const std::string& sdma_override = core::Runtime::runtime_singleton_->flag().enable_sdma();
566 
567     // Per-ASIC disables for firmware stability.
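    // SDMA is avoided by default on gfx8 and gfx906; when the enable_sdma flag
    // is set it overrides this default ("1" forces SDMA on, any other value
    // forces it off).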
568     bool use_sdma = (isa_->GetMajorVersion() != 8) && (isa_->version() != core::Isa::Version(9, 0, 6));
569     if (sdma_override.size() != 0) use_sdma = (sdma_override == "1");
570 
571     if (use_sdma && (HSA_PROFILE_BASE == profile_)) {
572       auto ret = CreateBlitSdma(h2d);
573       if (ret != nullptr) return ret;
574     }
575 
576     auto ret = CreateBlitKernel((*queue).get());
577     if (ret == nullptr)
578       throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Blit creation failed.");
579     return ret;
580   };
581 
582   blits_[BlitHostToDev].reset([blit_lambda, this]() { return blit_lambda(true, queues_[QueueBlitOnly]); });
583   blits_[BlitDevToHost].reset([blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility]); });
584   blits_[BlitDevToDev].reset([this]() {
585     auto ret = CreateBlitKernel((*queues_[QueueUtility]).get());
586     if (ret == nullptr)
587       throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Blit creation failed.");
588     return ret;
589   });
590 }
591 
592 void GpuAgent::PreloadBlits() {
593   blits_[BlitHostToDev].touch();
594   blits_[BlitDevToHost].touch();
595   blits_[BlitDevToDev].touch();
596 }
597 
598 hsa_status_t GpuAgent::PostToolsInit() {
599   // Defer memory allocation until agents have been discovered.
600   InitScratchPool();
601   BindTrapHandler();
602   InitDma();
603 
604   return HSA_STATUS_SUCCESS;
605 }
606 
607 hsa_status_t GpuAgent::DmaCopy(void* dst, const void* src, size_t size) {
608   return blits_[BlitDevToDev]->SubmitLinearCopyCommand(dst, src, size);
609 }
610 
611 hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
612                                const void* src, core::Agent& src_agent,
613                                size_t size,
614                                std::vector<core::Signal*>& dep_signals,
615                                core::Signal& out_signal) {
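  // Select the blit engine by copy direction: host-to-device uses the dedicated
  // H2D blit, device-to-host uses the D2H blit, same-node device-to-device uses
  // the kernel blit, and cross-node device copies fall back to the D2H engine.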
616   lazy_ptr<core::Blit>& blit =
617     (src_agent.device_type() == core::Agent::kAmdCpuDevice &&
618      dst_agent.device_type() == core::Agent::kAmdGpuDevice)
619        ? blits_[BlitHostToDev]
620        : (src_agent.device_type() == core::Agent::kAmdGpuDevice &&
621           dst_agent.device_type() == core::Agent::kAmdCpuDevice)
622             ? blits_[BlitDevToHost]
623             : (src_agent.node_id() == dst_agent.node_id())
624               ? blits_[BlitDevToDev] : blits_[BlitDevToHost];
625 
626   if (profiling_enabled()) {
627     // Track the agent so the resulting timestamp can be translated to the
628     // system time domain correctly.
629     out_signal.async_copy_agent(core::Agent::Convert(this->public_handle()));
630   }
631 
632   hsa_status_t stat = blit->SubmitLinearCopyCommand(dst, src, size, dep_signals, out_signal);
633 
634   return stat;
635 }
636 
637 hsa_status_t GpuAgent::DmaFill(void* ptr, uint32_t value, size_t count) {
638   return blits_[BlitDevToDev]->SubmitLinearFillCommand(ptr, value, count);
639 }
640 
641 hsa_status_t GpuAgent::EnableDmaProfiling(bool enable) {
642   if (enable && !InitEndTsPool()) {
643     return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
644   }
645 
646   for (int i = 0; i < BlitCount; ++i) {
647     if (blits_[i] != NULL) {
648       const hsa_status_t stat = blits_[i]->EnableProfiling(enable);
649       if (stat != HSA_STATUS_SUCCESS) {
650         return stat;
651       }
652     }
653   }
654 
655   return HSA_STATUS_SUCCESS;
656 }
657 
658 hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const {
659 
660   // Agent and vendor name strings below are limited to HSA_PUBLIC_NAME_SIZE bytes.
661   const size_t attribute_u = static_cast<size_t>(attribute);
662 
663   switch (attribute_u) {
664 
665     // Build the agent name by concatenating the major, minor, and stepping IDs
666     // of the device's compute capability with a "gfx" prefix.
667     case HSA_AGENT_INFO_NAME: {
668       std::stringstream name;
669       std::memset(value, 0, HSA_PUBLIC_NAME_SIZE);
670       char* temp = reinterpret_cast<char*>(value);
671       name << "gfx" << isa_->GetMajorVersion() << isa_->GetMinorVersion() << isa_->GetStepping();
672       std::strcpy(temp, name.str().c_str());
673       break;
674     }
675     case HSA_AGENT_INFO_VENDOR_NAME:
676       std::memset(value, 0, HSA_PUBLIC_NAME_SIZE);
677       std::memcpy(value, "AMD", sizeof("AMD"));
678       break;
679     case HSA_AGENT_INFO_FEATURE:
680       *((hsa_agent_feature_t*)value) = HSA_AGENT_FEATURE_KERNEL_DISPATCH;
681       break;
682     case HSA_AGENT_INFO_MACHINE_MODEL:
683 #if defined(HSA_LARGE_MODEL)
684       *((hsa_machine_model_t*)value) = HSA_MACHINE_MODEL_LARGE;
685 #else
686       *((hsa_machine_model_t*)value) = HSA_MACHINE_MODEL_SMALL;
687 #endif
688       break;
689     case HSA_AGENT_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES:
690     case HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE:
691       *((hsa_default_float_rounding_mode_t*)value) =
692           HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR;
693       break;
694     case HSA_AGENT_INFO_FAST_F16_OPERATION:
695       *((bool*)value) = false;
696       break;
697     case HSA_AGENT_INFO_PROFILE:
698       *((hsa_profile_t*)value) = profile_;
699       break;
700     case HSA_AGENT_INFO_WAVEFRONT_SIZE:
701       *((uint32_t*)value) = properties_.WaveFrontSize;
702       break;
703     case HSA_AGENT_INFO_WORKGROUP_MAX_DIM: {
704       // TODO: must be per-device
705       const uint16_t group_size[3] = {1024, 1024, 1024};
706       std::memcpy(value, group_size, sizeof(group_size));
707     } break;
708     case HSA_AGENT_INFO_WORKGROUP_MAX_SIZE:
709       // TODO: must be per-device
710       *((uint32_t*)value) = 1024;
711       break;
712     case HSA_AGENT_INFO_GRID_MAX_DIM: {
713       const hsa_dim3_t grid_size = {UINT32_MAX, UINT32_MAX, UINT32_MAX};
714       std::memcpy(value, &grid_size, sizeof(hsa_dim3_t));
715     } break;
716     case HSA_AGENT_INFO_GRID_MAX_SIZE:
717       *((uint32_t*)value) = UINT32_MAX;
718       break;
719     case HSA_AGENT_INFO_FBARRIER_MAX_SIZE:
720       // TODO: to confirm
721       *((uint32_t*)value) = 32;
722       break;
723     case HSA_AGENT_INFO_QUEUES_MAX:
724       *((uint32_t*)value) = max_queues_;
725       break;
726     case HSA_AGENT_INFO_QUEUE_MIN_SIZE:
727       *((uint32_t*)value) = minAqlSize_;
728       break;
729     case HSA_AGENT_INFO_QUEUE_MAX_SIZE:
730       *((uint32_t*)value) = maxAqlSize_;
731       break;
732     case HSA_AGENT_INFO_QUEUE_TYPE:
733       *((hsa_queue_type32_t*)value) = HSA_QUEUE_TYPE_MULTI;
734       break;
735     case HSA_AGENT_INFO_NODE:
736       // TODO: associate with OS NUMA support (numactl / GetNumaProcessorNode).
737       *((uint32_t*)value) = node_id();
738       break;
739     case HSA_AGENT_INFO_DEVICE:
740       *((hsa_device_type_t*)value) = HSA_DEVICE_TYPE_GPU;
741       break;
742     case HSA_AGENT_INFO_CACHE_SIZE:
743       std::memset(value, 0, sizeof(uint32_t) * 4);
744       // TODO: no GPU cache info from KFD. Hardcode for now.
745       // GCN whitepaper: L1 data cache is 16KB.
746       ((uint32_t*)value)[0] = 16 * 1024;
747       break;
748     case HSA_AGENT_INFO_ISA:
749       *((hsa_isa_t*)value) = core::Isa::Handle(isa_);
750       break;
751     case HSA_AGENT_INFO_EXTENSIONS: {
752       memset(value, 0, sizeof(uint8_t) * 128);
753 
754       auto setFlag = [&](uint32_t bit) {
755         assert(bit < 128 * 8 && "Extension value exceeds extension bitmask");
756         uint32_t index = bit / 8;
757         uint32_t subBit = bit % 8;
758         ((uint8_t*)value)[index] |= 1 << subBit;
759       };
760 
761       if (core::hsa_internal_api_table_.finalizer_api.hsa_ext_program_finalize_fn != NULL) {
762         setFlag(HSA_EXTENSION_FINALIZER);
763       }
764 
765       if (core::hsa_internal_api_table_.image_api.hsa_ext_image_create_fn != NULL) {
766         setFlag(HSA_EXTENSION_IMAGES);
767       }
768 
769       if (os::LibHandle lib = os::LoadLib(kAqlProfileLib)) {
770         os::CloseLib(lib);
771         setFlag(HSA_EXTENSION_AMD_AQLPROFILE);
772       }
773 
774       setFlag(HSA_EXTENSION_AMD_PROFILER);
775 
776       break;
777     }
778     case HSA_AGENT_INFO_VERSION_MAJOR:
779       *((uint16_t*)value) = 1;
780       break;
781     case HSA_AGENT_INFO_VERSION_MINOR:
782       *((uint16_t*)value) = 1;
783       break;
784     case HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS:
785     case HSA_EXT_AGENT_INFO_IMAGE_1DA_MAX_ELEMENTS:
786     case HSA_EXT_AGENT_INFO_IMAGE_1DB_MAX_ELEMENTS:
787     case HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS:
788     case HSA_EXT_AGENT_INFO_IMAGE_2DA_MAX_ELEMENTS:
789     case HSA_EXT_AGENT_INFO_IMAGE_2DDEPTH_MAX_ELEMENTS:
790     case HSA_EXT_AGENT_INFO_IMAGE_2DADEPTH_MAX_ELEMENTS:
791     case HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS:
792     case HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS:
793       return hsa_amd_image_get_info_max_dim(public_handle(), attribute, value);
794     case HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES:
795       // TODO: hardcode based on OCL constants.
796       *((uint32_t*)value) = 128;
797       break;
798     case HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES:
799       // TODO: hardcode based on OCL constants.
800       *((uint32_t*)value) = 64;
801       break;
802     case HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS:
803       // TODO: hardcode based on OCL constants.
804       *((uint32_t*)value) = 16;
      break;
805     case HSA_AMD_AGENT_INFO_CHIP_ID:
806       *((uint32_t*)value) = properties_.DeviceId;
807       break;
808     case HSA_AMD_AGENT_INFO_CACHELINE_SIZE:
809       // TODO: hardcode for now.
810       // GCN whitepaper: cache line size is 64 byte long.
811       *((uint32_t*)value) = 64;
812       break;
813     case HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT:
814       *((uint32_t*)value) =
815           (properties_.NumFComputeCores / properties_.NumSIMDPerCU);
816       break;
817     case HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY:
818       *((uint32_t*)value) = properties_.MaxEngineClockMhzFCompute;
819       break;
820     case HSA_AMD_AGENT_INFO_DRIVER_NODE_ID:
821       *((uint32_t*)value) = node_id();
822       break;
823     case HSA_AMD_AGENT_INFO_MAX_ADDRESS_WATCH_POINTS:
824       *((uint32_t*)value) = static_cast<uint32_t>(
825           1 << properties_.Capability.ui32.WatchPointsTotalBits);
826       break;
827     case HSA_AMD_AGENT_INFO_BDFID:
828       *((uint32_t*)value) = static_cast<uint32_t>(properties_.LocationId);
829       break;
830     case HSA_AMD_AGENT_INFO_MEMORY_WIDTH:
831       *((uint32_t*)value) = memory_bus_width_;
832       break;
833     case HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY:
834       *((uint32_t*)value) = memory_max_frequency_;
835       break;
836 
837     // Copy HsaNodeProperties.MarketingName, a UTF-16 encoded string, into the
838     // output buffer as 7-bit ASCII.
839     case HSA_AMD_AGENT_INFO_PRODUCT_NAME: {
840       std::memset(value, 0, HSA_PUBLIC_NAME_SIZE);
841       char* temp = reinterpret_cast<char*>(value);
842       for (uint32_t idx = 0;
843            idx < HSA_PUBLIC_NAME_SIZE - 1 && properties_.MarketingName[idx] != 0; idx++) {
844         temp[idx] = (uint8_t)properties_.MarketingName[idx];
845       }
846       break;
847     }
848     case HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU:
849       *((uint32_t*)value) = static_cast<uint32_t>(
850           properties_.NumSIMDPerCU * properties_.MaxWavesPerSIMD);
851       break;
852     case HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU:
853       *((uint32_t*)value) = properties_.NumSIMDPerCU;
854       break;
855     case HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES:
856       *((uint32_t*)value) = properties_.NumShaderBanks;
857       break;
858     case HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE:
859       *((uint32_t*)value) = properties_.NumArrays;
860       break;
861     default:
862       return HSA_STATUS_ERROR_INVALID_ARGUMENT;
863       break;
864   }
865   return HSA_STATUS_SUCCESS;
866 }
867 
868 hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
869                                    core::HsaEventCallback event_callback,
870                                    void* data, uint32_t private_segment_size,
871                                    uint32_t group_segment_size,
872                                    core::Queue** queue) {
873   // AQL queues must be a power of two in length.
874   if (!IsPowerOfTwo(size)) {
875     return HSA_STATUS_ERROR_INVALID_ARGUMENT;
876   }
877 
878   // Enforce max size
879   if (size > maxAqlSize_) {
880     return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
881   }
882 
883   // Allocate scratch memory
884   ScratchInfo scratch;
885   if (private_segment_size == UINT_MAX) {
886     private_segment_size = (profile_ == HSA_PROFILE_BASE) ? 0 : scratch_per_thread_;
887   }
888 
889   if (private_segment_size > 262128) {
890     return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
891   }
892 
893   scratch.size_per_thread = AlignUp(private_segment_size, 16);
894   if (scratch.size_per_thread > 262128) {
895     return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
896   }
897 
898   const uint32_t num_cu = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
899   scratch.size = scratch.size_per_thread * 32 * 64 * num_cu;
900   scratch.queue_base = nullptr;
901   scratch.queue_process_offset = 0;
902 
903   MAKE_NAMED_SCOPE_GUARD(scratchGuard, [&]() { ReleaseQueueScratch(scratch); });
904 
905   if (scratch.size != 0) {
906     AcquireQueueScratch(scratch);
907     if (scratch.queue_base == nullptr) {
908       return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
909     }
910   }
911 
912   // Ensure utility queue has been created.
913   // Deferring any longer risks exhausting the queue count before the ability to
914   // upload ISA and invalidate code caches is guaranteed.
915   queues_[QueueUtility].touch();
916 
917   // Create an HW AQL queue
918   *queue = new AqlQueue(this, size, node_id(), scratch, event_callback, data, is_kv_device_);
919   scratchGuard.Dismiss();
920   return HSA_STATUS_SUCCESS;
921 }
922 
923 void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) {
924   bool need_queue_scratch_base = (isa_->GetMajorVersion() > 8);
925 
926   if (scratch.size == 0) {
927     scratch.size = queue_scratch_len_;
928     scratch.size_per_thread = scratch_per_thread_;
929   }
930 
931   scratch.retry = false;
932 
933   ScopedAcquire<KernelMutex> lock(&scratch_lock_);
934   // Limit to 1/8th of scratch pool for small scratch and 1/4 of that for a single queue.
935   size_t small_limit = scratch_pool_.size() >> 3;
936   size_t single_limit = small_limit >> 2;
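  // "Large" requests bypass the small-scratch budget: they are carved from the
  // high end of the pool (alloc_high) and tracked in scratch_used_large_ so
  // that their release can satisfy retried allocations later.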
937   bool large = (scratch.size > single_limit) ||
938       (scratch_pool_.size() - scratch_pool_.remaining() + scratch.size > small_limit);
939   large = (isa_->GetMajorVersion() < 8) ? false : large;
940   if (large)
941     scratch.queue_base = scratch_pool_.alloc_high(scratch.size);
942   else
943     scratch.queue_base = scratch_pool_.alloc(scratch.size);
944   large |= scratch.queue_base > scratch_pool_.high_split();
945   scratch.large = large;
946 
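  // gfx9+ consumes an absolute scratch base address; older ASICs expect a byte
  // offset from the start of the scratch aperture.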
947   scratch.queue_process_offset =
948       (need_queue_scratch_base)
949           ? uintptr_t(scratch.queue_base)
950           : uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base());
951 
952   if (scratch.queue_base != nullptr) {
953     if (profile_ == HSA_PROFILE_FULL) return;
954     if (profile_ == HSA_PROFILE_BASE) {
955       HSAuint64 alternate_va;
956       if (hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va) ==
957           HSAKMT_STATUS_SUCCESS) {
958         if (large) scratch_used_large_ += scratch.size;
959         return;
960       }
961     }
962   }
963 
964   // Scratch request failed allocation or mapping.
965   scratch_pool_.free(scratch.queue_base);
966   scratch.queue_base = nullptr;
967 
968   // Retry if large may yield needed space.
969   if (scratch_used_large_ != 0) {
970     scratch.retry = true;
971     return;
972   }
973 
974   // Attempt to trim the maximum number of concurrent waves to allow scratch to fit.
975   if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
976     debug_print("Failed to map requested scratch - reducing queue occupancy.\n");
977   uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
978   uint64_t size_per_wave = AlignUp(scratch.size_per_thread * properties_.WaveFrontSize, 1024);
979   uint64_t total_waves = scratch.size / size_per_wave;
980   uint64_t waves_per_cu = total_waves / num_cus;
981   while (waves_per_cu != 0) {
982     size_t size = waves_per_cu * num_cus * size_per_wave;
983     void* base = scratch_pool_.alloc(size);
984     HSAuint64 alternate_va;
985     if ((base != nullptr) &&
986         ((profile_ == HSA_PROFILE_FULL) ||
987          (hsaKmtMapMemoryToGPU(base, size, &alternate_va) == HSAKMT_STATUS_SUCCESS))) {
988       // Scratch allocated and either full profile or map succeeded.
989       scratch.queue_base = base;
990       scratch.size = size;
991       scratch.queue_process_offset =
992           (need_queue_scratch_base)
993               ? uintptr_t(scratch.queue_base)
994               : uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base());
995       scratch.large = true;
996       scratch_used_large_ += scratch.size;
997       return;
998     }
999     scratch_pool_.free(base);
1000     waves_per_cu--;
1001   }
1002 
1003   // Failed to allocate minimal scratch
1004   assert(scratch.queue_base == nullptr && "bad scratch data");
1005   if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
1006     debug_print("Could not allocate scratch for one wave per CU.\n");
1007 }
1008 
1009 void GpuAgent::ReleaseQueueScratch(ScratchInfo& scratch) {
1010   if (scratch.queue_base == nullptr) {
1011     return;
1012   }
1013 
1014   ScopedAcquire<KernelMutex> lock(&scratch_lock_);
1015   if (profile_ == HSA_PROFILE_BASE) {
1016     if (HSAKMT_STATUS_SUCCESS != hsaKmtUnmapMemoryToGPU(scratch.queue_base)) {
1017       assert(false && "Unmap scratch subrange failed!");
1018     }
1019   }
1020   scratch_pool_.free(scratch.queue_base);
1021 
1022   if (scratch.large) scratch_used_large_ -= scratch.size;
1023 
1024   // Notify waiters that additional scratch may be available.
1025   for (auto notifier : scratch_notifiers_)
1026     HSA::hsa_signal_or_relaxed(notifier.first, notifier.second);
1027 }
1028 
1029 void GpuAgent::TranslateTime(core::Signal* signal,
1030                              hsa_amd_profiling_dispatch_time_t& time) {
1031   // Refresh the cached clock sample so the conversion interpolates rather than extrapolates.
1032   ScopedAcquire<KernelMutex> lock(&t1_lock_);
1033   if (t1_.GPUClockCounter < signal->signal_.end_ts) {
1034     SyncClocks();
1035   }
1036 
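  // Linear mapping between clock domains using the cached sample pair (t0_, t1_):
  //   system = (dSystem / dGPU) * (gpu_tick - t1.GPU) + t1.System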
1037   time.start = uint64_t(
1038       (double(int64_t(t0_.SystemClockCounter - t1_.SystemClockCounter)) /
1039        double(int64_t(t0_.GPUClockCounter - t1_.GPUClockCounter))) *
1040           double(int64_t(signal->signal_.start_ts - t1_.GPUClockCounter)) +
1041       double(t1_.SystemClockCounter));
1042   time.end = uint64_t(
1043       (double(int64_t(t0_.SystemClockCounter - t1_.SystemClockCounter)) /
1044        double(int64_t(t0_.GPUClockCounter - t1_.GPUClockCounter))) *
1045           double(int64_t(signal->signal_.end_ts - t1_.GPUClockCounter)) +
1046       double(t1_.SystemClockCounter));
1047 }
1048 
1049 uint64_t GpuAgent::TranslateTime(uint64_t tick) {
1050   ScopedAcquire<KernelMutex> lock(&t1_lock_);
1051   SyncClocks();
1052 
1053   uint64_t system_tick = 0;
1054   system_tick = uint64_t(
1055       (double(int64_t(t0_.SystemClockCounter - t1_.SystemClockCounter)) /
1056        double(int64_t(t0_.GPUClockCounter - t1_.GPUClockCounter))) *
1057           double(int64_t(tick - t1_.GPUClockCounter)) +
1058       double(t1_.SystemClockCounter));
1059   return system_tick;
1060 }
1061 
1062 bool GpuAgent::current_coherency_type(hsa_amd_coherency_type_t type) {
1063   if (!is_kv_device_) {
1064     current_coherency_type_ = type;
1065     return true;
1066   }
1067 
1068   ScopedAcquire<KernelMutex> Lock(&coherency_lock_);
1069 
1070   if (ape1_base_ == 0 && ape1_size_ == 0) {
1071     static const size_t kApe1Alignment = 64 * 1024;
1072     ape1_size_ = kApe1Alignment;
1073     ape1_base_ = reinterpret_cast<uintptr_t>(
1074         _aligned_malloc(ape1_size_, kApe1Alignment));
1075     assert((ape1_base_ != 0) && ("APE1 allocation failed"));
1076   } else if (type == current_coherency_type_) {
1077     return true;
1078   }
1079 
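  // Coherency is applied through the memory policy: type0 selects the default
  // caching policy and type1 the alternate (APE1) aperture policy for the
  // 64 KiB region allocated above.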
1080   HSA_CACHING_TYPE type0, type1;
1081   if (type == HSA_AMD_COHERENCY_TYPE_COHERENT) {
1082     type0 = HSA_CACHING_CACHED;
1083     type1 = HSA_CACHING_NONCACHED;
1084   } else {
1085     type0 = HSA_CACHING_NONCACHED;
1086     type1 = HSA_CACHING_CACHED;
1087   }
1088 
1089   if (hsaKmtSetMemoryPolicy(node_id(), type0, type1,
1090                             reinterpret_cast<void*>(ape1_base_),
1091                             ape1_size_) != HSAKMT_STATUS_SUCCESS) {
1092     return false;
1093   }
1094   current_coherency_type_ = type;
1095   return true;
1096 }
1097 
1098 uint16_t GpuAgent::GetMicrocodeVersion() const {
1099   return (properties_.EngineId.ui32.uCode);
1100 }
1101 
1102 uint16_t GpuAgent::GetSdmaMicrocodeVersion() const {
1103   return (properties_.uCodeEngineVersions.uCodeSDMA);
1104 }
1105 
1106 void GpuAgent::SyncClocks() {
1107   HSAKMT_STATUS err = hsaKmtGetClockCounters(node_id(), &t1_);
1108   assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtGetClockCounters error");
1109 }
1110 
1111 void GpuAgent::BindTrapHandler() {
1112   const char* src_sp3 = R"(
1113     var s_trap_info_lo = ttmp0
1114     var s_trap_info_hi = ttmp1
1115     var s_tmp0         = ttmp2
1116     var s_tmp1         = ttmp3
1117     var s_tmp2         = ttmp4
1118     var s_tmp3         = ttmp5
1119 
1120     shader TrapHandler
1121       type(CS)
1122 
1123       // Retrieve the queue inactive signal.
1124       s_load_dwordx2       [s_tmp0, s_tmp1], s[0:1], 0xC0
1125       s_waitcnt            lgkmcnt(0)
1126 
1127       // Mask all but one lane of the wavefront.
1128       s_mov_b64            exec, 0x1
1129 
1130       // Set queue signal value to unhandled exception error.
1131       s_add_u32            s_tmp0, s_tmp0, 0x8
1132       s_addc_u32           s_tmp1, s_tmp1, 0x0
1133       v_mov_b32            v0, s_tmp0
1134       v_mov_b32            v1, s_tmp1
1135       v_mov_b32            v2, 0x80000000
1136       v_mov_b32            v3, 0x0
1137       flat_atomic_swap_x2  v[0:1], v[0:1], v[2:3]
1138       s_waitcnt            vmcnt(0)
1139 
1140       // Skip event if the signal was already set to unhandled exception.
1141       v_cmp_eq_u64         vcc, v[0:1], v[2:3]
1142       s_cbranch_vccnz      L_SIGNAL_DONE
1143 
1144       // Check for a non-NULL signal event mailbox.
1145       s_load_dwordx2       [s_tmp2, s_tmp3], [s_tmp0, s_tmp1], 0x8
1146       s_waitcnt            lgkmcnt(0)
1147       s_and_b64            [s_tmp2, s_tmp3], [s_tmp2, s_tmp3], [s_tmp2, s_tmp3]
1148       s_cbranch_scc0       L_SIGNAL_DONE
1149 
1150       // Load the signal event value.
1151       s_add_u32            s_tmp0, s_tmp0, 0x10
1152       s_addc_u32           s_tmp1, s_tmp1, 0x0
1153       s_load_dword         s_tmp0, [s_tmp0, s_tmp1], 0x0
1154       s_waitcnt            lgkmcnt(0)
1155 
1156       // Write the signal event value to the mailbox.
1157       v_mov_b32            v0, s_tmp2
1158       v_mov_b32            v1, s_tmp3
1159       v_mov_b32            v2, s_tmp0
1160       flat_store_dword     v[0:1], v2
1161       s_waitcnt            vmcnt(0)
1162 
1163       // Send an interrupt to trigger event notification.
1164       s_sendmsg            sendmsg(MSG_INTERRUPT)
1165 
1166     L_SIGNAL_DONE:
1167       // Halt wavefront and exit trap.
1168       s_sethalt            1
1169       s_rfe_b64            [s_trap_info_lo, s_trap_info_hi]
1170     end
1171   )";
1172 
1173   if (isa_->GetMajorVersion() == 7) {
1174     // No trap handler support on Gfx7, soft error.
1175     return;
1176   }
1177 
1178   // Disable the trap handler on full-profile (APU) devices, e.g. Carrizo, until KFD is fixed.
1179   if (profile_ == HSA_PROFILE_FULL) {
1180     return;
1181   }
1182 
1183   // Assemble the trap handler source code.
1184   AssembleShader(src_sp3, "TrapHandler", AssembleTarget::ISA, trap_code_buf_,
1185                  trap_code_buf_size_);
1186 
1187   // Bind the trap handler to this node.
1188   HSAKMT_STATUS err = hsaKmtSetTrapHandler(node_id(), trap_code_buf_,
1189                                            trap_code_buf_size_, NULL, 0);
1190   assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtSetTrapHandler() failed");
1191 }
1192 
1193 void GpuAgent::InvalidateCodeCaches() {
1194   // Check for microcode cache invalidation support.
1195   // This is deprecated in later microcode builds.
1196   if (isa_->GetMajorVersion() == 7) {
1197     if (properties_.EngineId.ui32.uCode < 420) {
1198       // Microcode is handling code cache invalidation.
1199       return;
1200     }
1201   } else if (isa_->GetMajorVersion() == 8 && isa_->GetMinorVersion() == 0) {
1202     if (properties_.EngineId.ui32.uCode < 685) {
1203       // Microcode is handling code cache invalidation.
1204       return;
1205     }
1206   } else if (isa_->GetMajorVersion() == 9) {
1207     if (properties_.EngineId.ui32.uCode < 334) {
1208       static std::once_flag once;
1209       std::call_once(
1210           once, []() { fprintf(stderr, "warning: code cache invalidation not implemented\n"); });
1211       return;
1212     }
1213   } else {
1214     assert(false && "Code cache invalidation not implemented for this agent");
1215   }
1216 
1217   // Invalidate caches which may hold lines of code object allocation.
1218   constexpr uint32_t cache_inv_size_dw = 7;
1219   uint32_t cache_inv[cache_inv_size_dw];
1220 
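  // The ACQUIRE_MEM packet invalidates the shader instruction cache (SH_ICACHE),
  // the scalar/constant cache (SH_KCACHE), and L2 (TC, with writeback) across
  // the full address range (COHER_SIZE set to all-ones).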
1221   cache_inv[0] = PM4_HDR(PM4_HDR_IT_OPCODE_ACQUIRE_MEM, cache_inv_size_dw,
1222                          isa_->GetMajorVersion());
1223   cache_inv[1] = PM4_ACQUIRE_MEM_DW1_COHER_CNTL(
1224       PM4_ACQUIRE_MEM_COHER_CNTL_SH_ICACHE_ACTION_ENA |
1225       PM4_ACQUIRE_MEM_COHER_CNTL_SH_KCACHE_ACTION_ENA |
1226       PM4_ACQUIRE_MEM_COHER_CNTL_TC_ACTION_ENA |
1227       PM4_ACQUIRE_MEM_COHER_CNTL_TC_WB_ACTION_ENA);
1228   cache_inv[2] = PM4_ACQUIRE_MEM_DW2_COHER_SIZE(0xFFFFFFFF);
1229   cache_inv[3] = PM4_ACQUIRE_MEM_DW3_COHER_SIZE_HI(0xFF);
1230   cache_inv[4] = 0;
1231   cache_inv[5] = 0;
1232   cache_inv[6] = 0;
1233 
1234   // Submit the command to the utility queue and wait for it to complete.
1235   queues_[QueueUtility]->ExecutePM4(cache_inv, sizeof(cache_inv));
1236 }
1237 
1238 }  // namespace amd
1239