1 ////////////////////////////////////////////////////////////////////////////////
2 //
3 // The University of Illinois/NCSA
4 // Open Source License (NCSA)
5 //
6 // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
7 //
8 // Developed by:
9 //
10 // AMD Research and AMD HSA Software Development
11 //
12 // Advanced Micro Devices, Inc.
13 //
14 // www.amd.com
15 //
16 // Permission is hereby granted, free of charge, to any person obtaining a copy
17 // of this software and associated documentation files (the "Software"), to
18 // deal with the Software without restriction, including without limitation
19 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
20 // and/or sell copies of the Software, and to permit persons to whom the
21 // Software is furnished to do so, subject to the following conditions:
22 //
23 // - Redistributions of source code must retain the above copyright notice,
24 // this list of conditions and the following disclaimers.
25 // - Redistributions in binary form must reproduce the above copyright
26 // notice, this list of conditions and the following disclaimers in
27 // the documentation and/or other materials provided with the distribution.
28 // - Neither the names of Advanced Micro Devices, Inc,
29 // nor the names of its contributors may be used to endorse or promote
30 // products derived from this Software without specific prior written
31 // permission.
32 //
33 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
36 // THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
37 // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
38 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
39 // DEALINGS WITH THE SOFTWARE.
40 //
41 ////////////////////////////////////////////////////////////////////////////////
42
43 #include "core/inc/amd_gpu_agent.h"
44
45 #include <algorithm>
46 #include <atomic>
47 #include <cstring>
48 #include <climits>
49 #include <map>
50 #include <string>
51 #include <vector>
52 #include <memory>
53 #include <utility>
54
55 #include "core/inc/amd_aql_queue.h"
56 #include "core/inc/amd_blit_kernel.h"
57 #include "core/inc/amd_blit_sdma.h"
58 #include "core/inc/amd_gpu_pm4.h"
59 #include "core/inc/amd_gpu_shaders.h"
60 #include "core/inc/amd_memory_region.h"
61 #include "core/inc/interrupt_signal.h"
62 #include "core/inc/isa.h"
63 #include "core/inc/runtime.h"
64 #include "core/util/os.h"
65 #include "hsa_ext_image.h"
66 #include "inc/hsa_ven_amd_aqlprofile.h"
67
68 // Size of scratch (private) segment pre-allocated per thread, in bytes.
69 #define DEFAULT_SCRATCH_BYTES_PER_THREAD 2048
70
71 extern core::HsaApiTable hsa_internal_api_table_;
72
73 namespace amd {
// Constructs the agent for one GPU topology node: selects profile and ISA,
// snapshots clocks for timestamp translation, clamps the queue limit, and
// populates the region and cache lists.
GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
    : GpuAgentInt(node),
      properties_(node_props),
      current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT),
      blits_(),
      queues_(),
      local_region_(NULL),
      is_kv_device_(false),
      trap_code_buf_(NULL),
      trap_code_buf_size_(0),
      memory_bus_width_(0),
      memory_max_frequency_(0),
      ape1_base_(0),
      ape1_size_(0),
      end_ts_pool_size_(0),
      end_ts_pool_counter_(0),
      end_ts_base_addr_(NULL) {
  // A node with CPU cores is an APU and supports the FULL profile;
  // discrete GPUs get the BASE profile.
  const bool is_apu_node = (properties_.NumCPUCores > 0);
  profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE;

  // Take an initial GPU/system clock snapshot; t1_ mirrors t0_ until the
  // next resync. NOTE: err is only consumed by the assert (unused in
  // release builds).
  HSAKMT_STATUS err = hsaKmtGetClockCounters(node_id(), &t0_);
  t1_ = t0_;
  assert(err == HSAKMT_STATUS_SUCCESS && "hsaGetClockCounters error");

  // Set instruction set architecture via node property, only on GPU device.
  isa_ = (core::Isa*)core::IsaRegistry::GetIsa(core::Isa::Version(
      node_props.EngineId.ui32.Major, node_props.EngineId.ui32.Minor,
      node_props.EngineId.ui32.Stepping), profile_ == HSA_PROFILE_FULL);

  // Check if the device is Kaveri (gfx700), only on GPU device.
  if (isa_->GetMajorVersion() == 7 && isa_->GetMinorVersion() == 0 &&
      isa_->GetStepping() == 0) {
    is_kv_device_ = true;
  }

  // FULL-profile (APU) devices default to coherent memory access.
  current_coherency_type((profile_ == HSA_PROFILE_FULL)
                             ? HSA_AMD_COHERENCY_TYPE_COHERENT
                             : HSA_AMD_COHERENCY_TYPE_NONCOHERENT);

  // Honor the user-configured queue limit but cap it: 10 queues on small
  // model or non-Linux builds, 128 on 64-bit Linux.
  max_queues_ = core::Runtime::runtime_singleton_->flag().max_queues();
#if !defined(HSA_LARGE_MODEL) || !defined(__linux__)
  if (max_queues_ == 0) {
    max_queues_ = 10;
  }
  max_queues_ = std::min(10U, max_queues_);
#else
  if (max_queues_ == 0) {
    max_queues_ = 128;
  }
  max_queues_ = std::min(128U, max_queues_);
#endif

  // Populate region list.
  InitRegionList();

  // Populate cache list.
  InitCacheList();
}
132
// Releases agent resources in dependency order: blit engines first, then
// the timestamp pool, APE1 buffer, scratch backing store, trap handler
// code, and finally the region objects.
GpuAgent::~GpuAgent() {
  for (int i = 0; i < BlitCount; ++i) {
    if (blits_[i] != nullptr) {
      // NOTE: status is only consumed by the assert (unused in release).
      hsa_status_t status = blits_[i]->Destroy(*this);
      assert(status == HSA_STATUS_SUCCESS);
    }
  }

  // Timestamp pool was allocated through the runtime (see InitEndTsPool).
  if (end_ts_base_addr_ != NULL) {
    core::Runtime::runtime_singleton_->FreeMemory(end_ts_base_addr_);
  }

  if (ape1_base_ != 0) {
    _aligned_free(reinterpret_cast<void*>(ape1_base_));
  }

  // Scratch backing store came straight from the KMT allocator.
  if (scratch_pool_.base() != NULL) {
    hsaKmtFreeMemory(scratch_pool_.base(), scratch_pool_.size());
  }

  if (trap_code_buf_ != NULL) {
    ReleaseShader(trap_code_buf_, trap_code_buf_size_);
  }

  std::for_each(regions_.begin(), regions_.end(), DeleteObject());
  regions_.clear();
}
160
// Installs a precompiled built-in shader (trap handler or blit kernel)
// into a fresh GPU-visible, executable buffer.
//
// src_sp3 is currently unused: shaders are selected from a precompiled
// table by func_name and the device's GFX major version rather than being
// assembled at runtime. For AQL targets an amd_kernel_code_t header is
// prepended and populated. On return, code_buf/code_buf_size describe the
// allocated buffer; the caller releases it with ReleaseShader().
void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name,
                              AssembleTarget assemble_target, void*& code_buf,
                              size_t& code_buf_size) const {
  // Select precompiled shader implementation from name/target.
  struct ASICShader {
    const void* code;
    size_t size;
    int num_sgprs;
    int num_vgprs;
  };

  // One variant per supported GFX major version (7, 8, 9).
  struct CompiledShader {
    ASICShader compute_7;
    ASICShader compute_8;
    ASICShader compute_9;
  };

  // Note: gfx9 reuses the gfx8 blit kernels; only the trap handler has a
  // dedicated gfx9 build, and there is no gfx7 trap handler.
  std::map<std::string, CompiledShader> compiled_shaders = {
      {"TrapHandler",
       {
           {NULL, 0, 0, 0},
           {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4},
           {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4},
       }},
      {"CopyAligned",
       {
           {kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12},
           {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},
           {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},
       }},
      {"CopyMisaligned",
       {
           {kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10},
           {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},
           {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},
       }},
      {"Fill",
       {
           {kCodeFill7, sizeof(kCodeFill7), 19, 8},
           {kCodeFill8, sizeof(kCodeFill8), 19, 8},
           {kCodeFill8, sizeof(kCodeFill8), 19, 8},
       }}};

  auto compiled_shader_it = compiled_shaders.find(func_name);
  assert(compiled_shader_it != compiled_shaders.end() &&
         "Precompiled shader unavailable");

  ASICShader* asic_shader = NULL;

  switch (isa_->GetMajorVersion()) {
    case 7:
      asic_shader = &compiled_shader_it->second.compute_7;
      break;
    case 8:
      asic_shader = &compiled_shader_it->second.compute_8;
      break;
    case 9:
      asic_shader = &compiled_shader_it->second.compute_9;
      break;
    default:
      assert(false && "Precompiled shader unavailable for target");
  }

  // Allocate a GPU-visible buffer for the shader, page-aligned and
  // page-granular, with room for the optional AQL code-object header.
  size_t header_size =
      (assemble_target == AssembleTarget::AQL ? sizeof(amd_kernel_code_t) : 0);
  code_buf_size = AlignUp(header_size + asic_shader->size, 0x1000);

  code_buf = core::Runtime::runtime_singleton_->system_allocator()(
      code_buf_size, 0x1000, core::MemoryRegion::AllocateExecutable);
  assert(code_buf != NULL && "Code buffer allocation failed");

  memset(code_buf, 0, code_buf_size);

  // Populate optional code object header.
  if (assemble_target == AssembleTarget::AQL) {
    amd_kernel_code_t* header = reinterpret_cast<amd_kernel_code_t*>(code_buf);

    // SGPRs are granted in blocks of 8, VGPRs in blocks of 4; the encoded
    // value is (blocks - 1), clamped at zero.
    int gran_sgprs = std::max(0, (int(asic_shader->num_sgprs) - 1) / 8);
    int gran_vgprs = std::max(0, (int(asic_shader->num_vgprs) - 1) / 4);

    // Shader code begins immediately after the header.
    header->kernel_code_entry_byte_offset = sizeof(amd_kernel_code_t);
    AMD_HSA_BITS_SET(header->kernel_code_properties,
                     AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR,
                     1);
    AMD_HSA_BITS_SET(header->compute_pgm_rsrc1,
                     AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT,
                     gran_sgprs);
    AMD_HSA_BITS_SET(header->compute_pgm_rsrc1,
                     AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT,
                     gran_vgprs);
    AMD_HSA_BITS_SET(header->compute_pgm_rsrc1,
                     AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64, 3);
    AMD_HSA_BITS_SET(header->compute_pgm_rsrc1,
                     AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE, 1);
    AMD_HSA_BITS_SET(header->compute_pgm_rsrc2,
                     AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 2);
    AMD_HSA_BITS_SET(header->compute_pgm_rsrc2,
                     AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 1);
  }

  // Copy shader code into the GPU-visible buffer.
  memcpy((void*)(uintptr_t(code_buf) + header_size), asic_shader->code,
         asic_shader->size);
}
266
ReleaseShader(void * code_buf,size_t code_buf_size) const267 void GpuAgent::ReleaseShader(void* code_buf, size_t code_buf_size) const {
268 core::Runtime::runtime_singleton_->system_deallocator()(code_buf);
269 }
270
// Queries the memory banks of this node from KMT and builds the exposed
// region list (frame buffer, LDS, scratch, SVM). Also records memory bus
// width and max frequency from the frame buffer (dGPU) or system memory
// (APU) bank.
void GpuAgent::InitRegionList() {
  const bool is_apu_node = (properties_.NumCPUCores > 0);

  std::vector<HsaMemoryProperties> mem_props(properties_.NumMemoryBanks);
  if (HSAKMT_STATUS_SUCCESS ==
      hsaKmtGetNodeMemoryProperties(node_id(), properties_.NumMemoryBanks,
                                    &mem_props[0])) {
    for (uint32_t mem_idx = 0; mem_idx < properties_.NumMemoryBanks;
         ++mem_idx) {
      // Ignore the one(s) with unknown size.
      if (mem_props[mem_idx].SizeInBytes == 0) {
        continue;
      }

      switch (mem_props[mem_idx].HeapType) {
        case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE:
        case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC:
          // On dGPUs the frame buffer has no fixed CPU VA; clear it.
          if (!is_apu_node) {
            mem_props[mem_idx].VirtualBaseAddress = 0;
          }

          memory_bus_width_ = mem_props[mem_idx].Width;
          memory_max_frequency_ = mem_props[mem_idx].MemoryClockMax;
          // Intentional fallthrough: frame buffer heaps are also wrapped
          // in a MemoryRegion like LDS/scratch/SVM below.
        case HSA_HEAPTYPE_GPU_LDS:
        case HSA_HEAPTYPE_GPU_SCRATCH:
        case HSA_HEAPTYPE_DEVICE_SVM: {
          MemoryRegion* region =
              new MemoryRegion(false, false, this, mem_props[mem_idx]);

          regions_.push_back(region);

          // Remember the (last) local memory region for blit/ts allocations.
          if (region->IsLocalMemory()) {
            local_region_ = region;
          }
          break;
        }
        case HSA_HEAPTYPE_SYSTEM:
          // On APUs, system memory is the GPU's memory: record its specs.
          if (is_apu_node) {
            memory_bus_width_ = mem_props[mem_idx].Width;
            memory_max_frequency_ = mem_props[mem_idx].MemoryClockMax;
          }
          break;
        default:
          continue;
      }
    }
  }
}
319
// Allocates the scratch (private segment) backing store for this agent and
// (re)initializes scratch_pool_ to manage it. Per-queue scratch length is
// derived from waves/CU * threads/wave * #CUs * bytes/thread.
void GpuAgent::InitScratchPool() {
  HsaMemFlags flags;
  flags.Value = 0;
  flags.ui32.Scratch = 1;
  flags.ui32.HostAccess = 1;

  // Bytes of scratch per thread: user flag, or the built-in default.
  scratch_per_thread_ =
      core::Runtime::runtime_singleton_->flag().scratch_mem_size();
  if (scratch_per_thread_ == 0)
    scratch_per_thread_ = DEFAULT_SCRATCH_BYTES_PER_THREAD;

  // Scratch length is: waves/CU * threads/wave * queues * #CUs *
  // scratch/thread
  const uint32_t num_cu =
      properties_.NumFComputeCores / properties_.NumSIMDPerCU;
  queue_scratch_len_ = AlignUp(32 * 64 * num_cu * scratch_per_thread_, 65536);
  size_t max_scratch_len = queue_scratch_len_ * max_queues_;

#if defined(HSA_LARGE_MODEL) && defined(__linux__)
  // For 64-bit linux use max queues unless otherwise specified
  if ((max_scratch_len == 0) || (max_scratch_len > 4294967296)) {
    max_scratch_len = 4294967296;  // 4GB aperture max
  }
#endif

  void* scratch_base;
  // NOTE: err is only consumed by the assert below and the success check;
  // on failure the pool is reconstructed empty.
  HSAKMT_STATUS err =
      hsaKmtAllocMemory(node_id(), max_scratch_len, flags, &scratch_base);
  assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtAllocMemory(Scratch) failed");
  assert(IsMultipleOf(scratch_base, 0x1000) &&
         "Scratch base is not page aligned!");

  // Re-initialize the pool in place: destroy the current SmallHeap, then
  // placement-new a fresh one over the same storage.
  scratch_pool_.~SmallHeap();
  if (HSAKMT_STATUS_SUCCESS == err) {
    new (&scratch_pool_) SmallHeap(scratch_base, max_scratch_len);
  } else {
    new (&scratch_pool_) SmallHeap();
  }
}
359
// Queries GPU cache properties from KMT, keeps only compute-unit data
// caches, and wraps them in core::Cache objects named after the device.
void GpuAgent::InitCacheList() {
  // Get GPU cache information.
  // Similar to getting CPU cache but here we use FComputeIdLo.
  cache_props_.resize(properties_.NumCaches);
  if (HSAKMT_STATUS_SUCCESS !=
      hsaKmtGetNodeCacheProperties(node_id(), properties_.FComputeIdLo,
                                   properties_.NumCaches, &cache_props_[0])) {
    cache_props_.clear();
  } else {
    // Only store GPU D-cache: drop non-CU caches and instruction caches.
    // The index is decremented after erase so no element is skipped.
    for (size_t cache_id = 0; cache_id < cache_props_.size(); ++cache_id) {
      const HsaCacheType type = cache_props_[cache_id].CacheType;
      if (type.ui32.HSACU != 1 || type.ui32.Instruction == 1) {
        cache_props_.erase(cache_props_.begin() + cache_id);
        --cache_id;
      }
    }
  }

  // Update cache objects: one core::Cache per remaining property, named
  // "<device> L<level>".
  caches_.clear();
  caches_.resize(cache_props_.size());
  char name[64];
  GetInfo(HSA_AGENT_INFO_NAME, name);
  std::string deviceName = name;
  for (size_t i = 0; i < caches_.size(); i++)
    caches_[i].reset(new core::Cache(deviceName + " L" + std::to_string(cache_props_[i].CacheLevel),
                                     cache_props_[i].CacheLevel, cache_props_[i].CacheSize));
}
389
// Lazily allocates the pool of end-timestamp slots used to profile SDMA
// copies. Uses double-checked locking: an acquire load outside the lock,
// then a relaxed re-check under blit_lock_, and a release store to publish
// the allocation. Returns false only if the local-memory allocation fails.
bool GpuAgent::InitEndTsPool() {
  // FULL-profile devices do not need the pool.
  if (HSA_PROFILE_FULL == profile_) {
    return true;
  }

  // Fast path: already initialized.
  if (end_ts_base_addr_.load(std::memory_order_acquire) != NULL) {
    return true;
  }

  ScopedAcquire<KernelMutex> lock(&blit_lock_);

  // Re-check under the lock in case another thread won the race.
  if (end_ts_base_addr_.load(std::memory_order_relaxed) != NULL) {
    return true;
  }

  // One slot per copy packet that can be in flight in an SDMA queue.
  end_ts_pool_size_ =
      static_cast<uint32_t>((BlitSdmaBase::kQueueSize + BlitSdmaBase::kCopyPacketSize - 1) /
                            (BlitSdmaBase::kCopyPacketSize));

  // Allocate end timestamp object for both h2d and d2h DMA.
  const size_t alloc_size = 2 * end_ts_pool_size_ * kTsSize;

  core::Runtime* runtime = core::Runtime::runtime_singleton_;

  uint64_t* buff = NULL;
  if (HSA_STATUS_SUCCESS !=
      runtime->AllocateMemory(local_region_, alloc_size,
                              MemoryRegion::AllocateRestrict,
                              reinterpret_cast<void**>(&buff))) {
    return false;
  }

  // Publish the pool to other threads.
  end_ts_base_addr_.store(buff, std::memory_order_release);

  return true;
}
426
// Hands out the next end-timestamp slot from the pool, round-robin.
// Returns NULL if the pool was never initialized (see InitEndTsPool).
// Slots may be reused while still in flight if more than end_ts_pool_size_
// copies are outstanding; the atomic counter only guarantees distribution.
uint64_t* GpuAgent::ObtainEndTsObject() {
  if (end_ts_base_addr_ == NULL) {
    return NULL;
  }

  // Round-robin index into the pool; fetch_add makes handout thread-safe.
  const uint32_t end_ts_index =
      end_ts_pool_counter_.fetch_add(1U, std::memory_order_acq_rel) %
      end_ts_pool_size_;
  const static size_t kNumU64 = kTsSize / sizeof(uint64_t);
  uint64_t* end_ts_addr = &end_ts_base_addr_[end_ts_index * kNumU64];
  assert(IsMultipleOf(end_ts_addr, kTsSize));

  return end_ts_addr;
}
441
IterateRegion(hsa_status_t (* callback)(hsa_region_t region,void * data),void * data) const442 hsa_status_t GpuAgent::IterateRegion(
443 hsa_status_t (*callback)(hsa_region_t region, void* data),
444 void* data) const {
445 return VisitRegion(true, callback, data);
446 }
447
IterateCache(hsa_status_t (* callback)(hsa_cache_t cache,void * data),void * data) const448 hsa_status_t GpuAgent::IterateCache(hsa_status_t (*callback)(hsa_cache_t cache, void* data),
449 void* data) const {
450 AMD::callback_t<decltype(callback)> call(callback);
451 for (size_t i = 0; i < caches_.size(); i++) {
452 hsa_status_t stat = call(core::Cache::Convert(caches_[i].get()), data);
453 if (stat != HSA_STATUS_SUCCESS) return stat;
454 }
455 return HSA_STATUS_SUCCESS;
456 }
457
VisitRegion(bool include_peer,hsa_status_t (* callback)(hsa_region_t region,void * data),void * data) const458 hsa_status_t GpuAgent::VisitRegion(bool include_peer,
459 hsa_status_t (*callback)(hsa_region_t region,
460 void* data),
461 void* data) const {
462 if (include_peer) {
463 // Only expose system, local, and LDS memory of the blit agent.
464 if (this->node_id() ==
465 core::Runtime::runtime_singleton_->blit_agent()->node_id()) {
466 hsa_status_t stat = VisitRegion(regions_, callback, data);
467 if (stat != HSA_STATUS_SUCCESS) {
468 return stat;
469 }
470 }
471
472 // Also expose system regions accessible by this agent.
473 hsa_status_t stat =
474 VisitRegion(core::Runtime::runtime_singleton_->system_regions_fine(),
475 callback, data);
476 if (stat != HSA_STATUS_SUCCESS) {
477 return stat;
478 }
479
480 return VisitRegion(
481 core::Runtime::runtime_singleton_->system_regions_coarse(), callback,
482 data);
483 }
484
485 // Only expose system, local, and LDS memory of this agent.
486 return VisitRegion(regions_, callback, data);
487 }
488
VisitRegion(const std::vector<const core::MemoryRegion * > & regions,hsa_status_t (* callback)(hsa_region_t region,void * data),void * data) const489 hsa_status_t GpuAgent::VisitRegion(
490 const std::vector<const core::MemoryRegion*>& regions,
491 hsa_status_t (*callback)(hsa_region_t region, void* data),
492 void* data) const {
493 AMD::callback_t<decltype(callback)> call(callback);
494 for (const core::MemoryRegion* region : regions) {
495 const amd::MemoryRegion* amd_region =
496 reinterpret_cast<const amd::MemoryRegion*>(region);
497
498 // Only expose system, local, and LDS memory.
499 if (amd_region->IsSystem() || amd_region->IsLocalMemory() ||
500 amd_region->IsLDS()) {
501 hsa_region_t region_handle = core::MemoryRegion::Convert(region);
502 hsa_status_t status = call(region_handle, data);
503 if (status != HSA_STATUS_SUCCESS) {
504 return status;
505 }
506 }
507 }
508
509 return HSA_STATUS_SUCCESS;
510 }
511
CreateInterceptibleQueue()512 core::Queue* GpuAgent::CreateInterceptibleQueue() {
513 // Disabled intercept of internal queues pending tools updates.
514 core::Queue* queue = nullptr;
515 QueueCreate(minAqlSize_, HSA_QUEUE_TYPE_MULTI, NULL, NULL, 0, 0, &queue);
516 return queue;
517 }
518
CreateBlitSdma(bool h2d)519 core::Blit* GpuAgent::CreateBlitSdma(bool h2d) {
520 core::Blit* sdma;
521
522 if (isa_->GetMajorVersion() <= 8) {
523 sdma = new BlitSdmaV2V3(h2d);
524 } else {
525 sdma = new BlitSdmaV4(h2d);
526 }
527
528 if (sdma->Initialize(*this) != HSA_STATUS_SUCCESS) {
529 sdma->Destroy(*this);
530 delete sdma;
531 sdma = NULL;
532 }
533
534 return sdma;
535 }
536
CreateBlitKernel(core::Queue * queue)537 core::Blit* GpuAgent::CreateBlitKernel(core::Queue* queue) {
538 BlitKernel* kernl = new BlitKernel(queue);
539
540 if (kernl->Initialize(*this) != HSA_STATUS_SUCCESS) {
541 kernl->Destroy(*this);
542 delete kernl;
543 kernl = NULL;
544 }
545
546 return kernl;
547 }
548
// Registers lazy constructors for the agent's internal queues and the
// three blit engines (host-to-device, device-to-host, device-to-device).
// Nothing is created here; construction happens on first use (or via
// PreloadBlits).
void GpuAgent::InitDma() {
  // Setup lazy init pointers on queues and blits.
  auto queue_lambda = [this]() {
    auto ret = CreateInterceptibleQueue();
    if (ret == nullptr)
      throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
                               "Internal queue creation failed.");
    return ret;
  };
  // Dedicated compute queue for host-to-device blits.
  queues_[QueueBlitOnly].reset(queue_lambda);
  // Share utility queue with device-to-host blits.
  queues_[QueueUtility].reset(queue_lambda);

  // Decide which engine to use for blits: SDMA when permitted, otherwise
  // fall back to the kernel-based blit on the given queue.
  auto blit_lambda = [this](bool h2d, lazy_ptr<core::Queue>& queue) {
    // HSA_ENABLE_SDMA flag; when set, "1" forces SDMA on, anything else off.
    const std::string& sdma_override = core::Runtime::runtime_singleton_->flag().enable_sdma();

    // Per-ASIC disables for firmware stability (gfx8 and gfx906 excluded).
    bool use_sdma = (isa_->GetMajorVersion() != 8) && (isa_->version() != core::Isa::Version(9, 0, 6));
    if (sdma_override.size() != 0) use_sdma = (sdma_override == "1");

    // SDMA is only used on BASE-profile (discrete) devices.
    if (use_sdma && (HSA_PROFILE_BASE == profile_)) {
      auto ret = CreateBlitSdma(h2d);
      if (ret != nullptr) return ret;
    }

    auto ret = CreateBlitKernel((*queue).get());
    if (ret == nullptr)
      throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Blit creation failed.");
    return ret;
  };

  // H2D uses the dedicated blit queue; D2H and D2D share the utility queue.
  blits_[BlitHostToDev].reset([blit_lambda, this]() { return blit_lambda(true, queues_[QueueBlitOnly]); });
  blits_[BlitDevToHost].reset([blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility]); });
  blits_[BlitDevToDev].reset([this]() {
    // Device-local copies always use the kernel blit.
    auto ret = CreateBlitKernel((*queues_[QueueUtility]).get());
    if (ret == nullptr)
      throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Blit creation failed.");
    return ret;
  });
}
591
PreloadBlits()592 void GpuAgent::PreloadBlits() {
593 blits_[BlitHostToDev].touch();
594 blits_[BlitDevToHost].touch();
595 blits_[BlitDevToDev].touch();
596 }
597
PostToolsInit()598 hsa_status_t GpuAgent::PostToolsInit() {
599 // Defer memory allocation until agents have been discovered.
600 InitScratchPool();
601 BindTrapHandler();
602 InitDma();
603
604 return HSA_STATUS_SUCCESS;
605 }
606
DmaCopy(void * dst,const void * src,size_t size)607 hsa_status_t GpuAgent::DmaCopy(void* dst, const void* src, size_t size) {
608 return blits_[BlitDevToDev]->SubmitLinearCopyCommand(dst, src, size);
609 }
610
DmaCopy(void * dst,core::Agent & dst_agent,const void * src,core::Agent & src_agent,size_t size,std::vector<core::Signal * > & dep_signals,core::Signal & out_signal)611 hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
612 const void* src, core::Agent& src_agent,
613 size_t size,
614 std::vector<core::Signal*>& dep_signals,
615 core::Signal& out_signal) {
616 lazy_ptr<core::Blit>& blit =
617 (src_agent.device_type() == core::Agent::kAmdCpuDevice &&
618 dst_agent.device_type() == core::Agent::kAmdGpuDevice)
619 ? blits_[BlitHostToDev]
620 : (src_agent.device_type() == core::Agent::kAmdGpuDevice &&
621 dst_agent.device_type() == core::Agent::kAmdCpuDevice)
622 ? blits_[BlitDevToHost]
623 : (src_agent.node_id() == dst_agent.node_id())
624 ? blits_[BlitDevToDev] : blits_[BlitDevToHost];
625
626 if (profiling_enabled()) {
627 // Track the agent so we could translate the resulting timestamp to system
628 // domain correctly.
629 out_signal.async_copy_agent(core::Agent::Convert(this->public_handle()));
630 }
631
632 hsa_status_t stat = blit->SubmitLinearCopyCommand(dst, src, size, dep_signals, out_signal);
633
634 return stat;
635 }
636
DmaFill(void * ptr,uint32_t value,size_t count)637 hsa_status_t GpuAgent::DmaFill(void* ptr, uint32_t value, size_t count) {
638 return blits_[BlitDevToDev]->SubmitLinearFillCommand(ptr, value, count);
639 }
640
EnableDmaProfiling(bool enable)641 hsa_status_t GpuAgent::EnableDmaProfiling(bool enable) {
642 if (enable && !InitEndTsPool()) {
643 return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
644 }
645
646 for (int i = 0; i < BlitCount; ++i) {
647 if (blits_[i] != NULL) {
648 const hsa_status_t stat = blits_[i]->EnableProfiling(enable);
649 if (stat != HSA_STATUS_SUCCESS) {
650 return stat;
651 }
652 }
653 }
654
655 return HSA_STATUS_SUCCESS;
656 }
657
GetInfo(hsa_agent_info_t attribute,void * value) const658 hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const {
659
660 // agent, and vendor name size limit
661 const size_t attribute_u = static_cast<size_t>(attribute);
662
663 switch (attribute_u) {
664
665 // Build agent name by concatenating the Major, Minor and Stepping Ids
666 // of devices compute capability with a prefix of "gfx"
667 case HSA_AGENT_INFO_NAME: {
668 std::stringstream name;
669 std::memset(value, 0, HSA_PUBLIC_NAME_SIZE);
670 char* temp = reinterpret_cast<char*>(value);
671 name << "gfx" << isa_->GetMajorVersion() << isa_->GetMinorVersion() << isa_->GetStepping();
672 std::strcpy(temp, name.str().c_str());
673 break;
674 }
675 case HSA_AGENT_INFO_VENDOR_NAME:
676 std::memset(value, 0, HSA_PUBLIC_NAME_SIZE);
677 std::memcpy(value, "AMD", sizeof("AMD"));
678 break;
679 case HSA_AGENT_INFO_FEATURE:
680 *((hsa_agent_feature_t*)value) = HSA_AGENT_FEATURE_KERNEL_DISPATCH;
681 break;
682 case HSA_AGENT_INFO_MACHINE_MODEL:
683 #if defined(HSA_LARGE_MODEL)
684 *((hsa_machine_model_t*)value) = HSA_MACHINE_MODEL_LARGE;
685 #else
686 *((hsa_machine_model_t*)value) = HSA_MACHINE_MODEL_SMALL;
687 #endif
688 break;
689 case HSA_AGENT_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES:
690 case HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE:
691 *((hsa_default_float_rounding_mode_t*)value) =
692 HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR;
693 break;
694 case HSA_AGENT_INFO_FAST_F16_OPERATION:
695 *((bool*)value) = false;
696 break;
697 case HSA_AGENT_INFO_PROFILE:
698 *((hsa_profile_t*)value) = profile_;
699 break;
700 case HSA_AGENT_INFO_WAVEFRONT_SIZE:
701 *((uint32_t*)value) = properties_.WaveFrontSize;
702 break;
703 case HSA_AGENT_INFO_WORKGROUP_MAX_DIM: {
704 // TODO: must be per-device
705 const uint16_t group_size[3] = {1024, 1024, 1024};
706 std::memcpy(value, group_size, sizeof(group_size));
707 } break;
708 case HSA_AGENT_INFO_WORKGROUP_MAX_SIZE:
709 // TODO: must be per-device
710 *((uint32_t*)value) = 1024;
711 break;
712 case HSA_AGENT_INFO_GRID_MAX_DIM: {
713 const hsa_dim3_t grid_size = {UINT32_MAX, UINT32_MAX, UINT32_MAX};
714 std::memcpy(value, &grid_size, sizeof(hsa_dim3_t));
715 } break;
716 case HSA_AGENT_INFO_GRID_MAX_SIZE:
717 *((uint32_t*)value) = UINT32_MAX;
718 break;
719 case HSA_AGENT_INFO_FBARRIER_MAX_SIZE:
720 // TODO: to confirm
721 *((uint32_t*)value) = 32;
722 break;
723 case HSA_AGENT_INFO_QUEUES_MAX:
724 *((uint32_t*)value) = max_queues_;
725 break;
726 case HSA_AGENT_INFO_QUEUE_MIN_SIZE:
727 *((uint32_t*)value) = minAqlSize_;
728 break;
729 case HSA_AGENT_INFO_QUEUE_MAX_SIZE:
730 *((uint32_t*)value) = maxAqlSize_;
731 break;
732 case HSA_AGENT_INFO_QUEUE_TYPE:
733 *((hsa_queue_type32_t*)value) = HSA_QUEUE_TYPE_MULTI;
734 break;
735 case HSA_AGENT_INFO_NODE:
736 // TODO: associate with OS NUMA support (numactl / GetNumaProcessorNode).
737 *((uint32_t*)value) = node_id();
738 break;
739 case HSA_AGENT_INFO_DEVICE:
740 *((hsa_device_type_t*)value) = HSA_DEVICE_TYPE_GPU;
741 break;
742 case HSA_AGENT_INFO_CACHE_SIZE:
743 std::memset(value, 0, sizeof(uint32_t) * 4);
744 // TODO: no GPU cache info from KFD. Hardcode for now.
745 // GCN whitepaper: L1 data cache is 16KB.
746 ((uint32_t*)value)[0] = 16 * 1024;
747 break;
748 case HSA_AGENT_INFO_ISA:
749 *((hsa_isa_t*)value) = core::Isa::Handle(isa_);
750 break;
751 case HSA_AGENT_INFO_EXTENSIONS: {
752 memset(value, 0, sizeof(uint8_t) * 128);
753
754 auto setFlag = [&](uint32_t bit) {
755 assert(bit < 128 * 8 && "Extension value exceeds extension bitmask");
756 uint index = bit / 8;
757 uint subBit = bit % 8;
758 ((uint8_t*)value)[index] |= 1 << subBit;
759 };
760
761 if (core::hsa_internal_api_table_.finalizer_api.hsa_ext_program_finalize_fn != NULL) {
762 setFlag(HSA_EXTENSION_FINALIZER);
763 }
764
765 if (core::hsa_internal_api_table_.image_api.hsa_ext_image_create_fn != NULL) {
766 setFlag(HSA_EXTENSION_IMAGES);
767 }
768
769 if (os::LibHandle lib = os::LoadLib(kAqlProfileLib)) {
770 os::CloseLib(lib);
771 setFlag(HSA_EXTENSION_AMD_AQLPROFILE);
772 }
773
774 setFlag(HSA_EXTENSION_AMD_PROFILER);
775
776 break;
777 }
778 case HSA_AGENT_INFO_VERSION_MAJOR:
779 *((uint16_t*)value) = 1;
780 break;
781 case HSA_AGENT_INFO_VERSION_MINOR:
782 *((uint16_t*)value) = 1;
783 break;
784 case HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS:
785 case HSA_EXT_AGENT_INFO_IMAGE_1DA_MAX_ELEMENTS:
786 case HSA_EXT_AGENT_INFO_IMAGE_1DB_MAX_ELEMENTS:
787 case HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS:
788 case HSA_EXT_AGENT_INFO_IMAGE_2DA_MAX_ELEMENTS:
789 case HSA_EXT_AGENT_INFO_IMAGE_2DDEPTH_MAX_ELEMENTS:
790 case HSA_EXT_AGENT_INFO_IMAGE_2DADEPTH_MAX_ELEMENTS:
791 case HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS:
792 case HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS:
793 return hsa_amd_image_get_info_max_dim(public_handle(), attribute, value);
794 case HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES:
795 // TODO: hardcode based on OCL constants.
796 *((uint32_t*)value) = 128;
797 break;
798 case HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES:
799 // TODO: hardcode based on OCL constants.
800 *((uint32_t*)value) = 64;
801 break;
802 case HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS:
803 // TODO: hardcode based on OCL constants.
804 *((uint32_t*)value) = 16;
805 case HSA_AMD_AGENT_INFO_CHIP_ID:
806 *((uint32_t*)value) = properties_.DeviceId;
807 break;
808 case HSA_AMD_AGENT_INFO_CACHELINE_SIZE:
809 // TODO: hardcode for now.
810 // GCN whitepaper: cache line size is 64 byte long.
811 *((uint32_t*)value) = 64;
812 break;
813 case HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT:
814 *((uint32_t*)value) =
815 (properties_.NumFComputeCores / properties_.NumSIMDPerCU);
816 break;
817 case HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY:
818 *((uint32_t*)value) = properties_.MaxEngineClockMhzFCompute;
819 break;
820 case HSA_AMD_AGENT_INFO_DRIVER_NODE_ID:
821 *((uint32_t*)value) = node_id();
822 break;
823 case HSA_AMD_AGENT_INFO_MAX_ADDRESS_WATCH_POINTS:
824 *((uint32_t*)value) = static_cast<uint32_t>(
825 1 << properties_.Capability.ui32.WatchPointsTotalBits);
826 break;
827 case HSA_AMD_AGENT_INFO_BDFID:
828 *((uint32_t*)value) = static_cast<uint32_t>(properties_.LocationId);
829 break;
830 case HSA_AMD_AGENT_INFO_MEMORY_WIDTH:
831 *((uint32_t*)value) = memory_bus_width_;
832 break;
833 case HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY:
834 *((uint32_t*)value) = memory_max_frequency_;
835 break;
836
837 // The code copies HsaNodeProperties.MarketingName a Unicode string
838 // which is encoded in UTF-16 as a 7-bit ASCII string
839 case HSA_AMD_AGENT_INFO_PRODUCT_NAME: {
840 std::memset(value, 0, HSA_PUBLIC_NAME_SIZE);
841 char* temp = reinterpret_cast<char*>(value);
842 for (uint32_t idx = 0;
843 properties_.MarketingName[idx] != 0 && idx < HSA_PUBLIC_NAME_SIZE - 1; idx++) {
844 temp[idx] = (uint8_t)properties_.MarketingName[idx];
845 }
846 break;
847 }
848 case HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU:
849 *((uint32_t*)value) = static_cast<uint32_t>(
850 properties_.NumSIMDPerCU * properties_.MaxWavesPerSIMD);
851 break;
852 case HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU:
853 *((uint32_t*)value) = properties_.NumSIMDPerCU;
854 break;
855 case HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES:
856 *((uint32_t*)value) = properties_.NumShaderBanks;
857 break;
858 case HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE:
859 *((uint32_t*)value) = properties_.NumArrays;
860 break;
861 default:
862 return HSA_STATUS_ERROR_INVALID_ARGUMENT;
863 break;
864 }
865 return HSA_STATUS_SUCCESS;
866 }
867
// Creates a hardware AQL queue of the given packet count, allocating
// per-queue scratch memory up-front. The scratch allocation is guarded by
// a scope guard so it is released on any early error return; ownership
// transfers to the AqlQueue on success (guard dismissed).
hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
                                   core::HsaEventCallback event_callback,
                                   void* data, uint32_t private_segment_size,
                                   uint32_t group_segment_size,
                                   core::Queue** queue) {
  // AQL queues must be a power of two in length.
  if (!IsPowerOfTwo(size)) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  // Enforce max size
  if (size > maxAqlSize_) {
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }

  // Allocate scratch memory. UINT_MAX means "use the default": none on
  // BASE profile, otherwise the configured per-thread scratch size.
  ScratchInfo scratch;
  if (private_segment_size == UINT_MAX) {
    private_segment_size = (profile_ == HSA_PROFILE_BASE) ? 0 : scratch_per_thread_;
  }

  // 262128 = 256 KiB - 16, the per-thread scratch hardware limit.
  if (private_segment_size > 262128) {
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }

  // Per-thread scratch must be 16-byte aligned; re-check the limit after
  // rounding up.
  scratch.size_per_thread = AlignUp(private_segment_size, 16);
  if (scratch.size_per_thread > 262128) {
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }

  // Total = bytes/thread * 32 waves/CU * 64 threads/wave * #CUs.
  const uint32_t num_cu = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
  scratch.size = scratch.size_per_thread * 32 * 64 * num_cu;
  scratch.queue_base = nullptr;
  scratch.queue_process_offset = 0;

  // Release scratch automatically on any failure path below.
  MAKE_NAMED_SCOPE_GUARD(scratchGuard, [&]() { ReleaseQueueScratch(scratch); });

  if (scratch.size != 0) {
    AcquireQueueScratch(scratch);
    if (scratch.queue_base == nullptr) {
      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
    }
  }

  // Ensure utility queue has been created.
  // Deferring longer risks exhausting queue count before ISA upload and invalidation capability is
  // ensured.
  queues_[QueueUtility].touch();

  // Create an HW AQL queue; it takes over the scratch allocation.
  *queue = new AqlQueue(this, size, node_id(), scratch, event_callback, data, is_kv_device_);
  scratchGuard.Dismiss();
  return HSA_STATUS_SUCCESS;
}
922
// Reserves scratch backing store for a queue from the agent's scratch pool,
// and on base profile also maps it onto the GPU.  On failure, either sets
// scratch.retry (caller should wait for large-scratch release) or trims the
// request to fewer concurrent waves per CU until it fits.  On total failure
// scratch.queue_base is left nullptr.
void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) {
  // Gfx9+ programs the scratch base directly; older parts use an offset
  // relative to the pool base.
  bool need_queue_scratch_base = (isa_->GetMajorVersion() > 8);

  // size == 0 means "use the agent defaults".
  if (scratch.size == 0) {
    scratch.size = queue_scratch_len_;
    scratch.size_per_thread = scratch_per_thread_;
  }

  scratch.retry = false;

  ScopedAcquire<KernelMutex> lock(&scratch_lock_);
  // Limit to 1/8th of scratch pool for small scratch and 1/4 of that for a single queue.
  size_t small_limit = scratch_pool_.size() >> 3;
  size_t single_limit = small_limit >> 2;
  // A request is "large" if it alone exceeds the single-queue budget, or if
  // granting it would push total pool usage past the small-scratch budget.
  bool large = (scratch.size > single_limit) ||
      (scratch_pool_.size() - scratch_pool_.remaining() + scratch.size > small_limit);
  // Pre-gfx8 parts do not use the large-scratch path.
  large = (isa_->GetMajorVersion() < 8) ? false : large;
  // Large requests come from the high end of the pool so small requests can
  // pack at the low end.
  if (large)
    scratch.queue_base = scratch_pool_.alloc_high(scratch.size);
  else
    scratch.queue_base = scratch_pool_.alloc(scratch.size);
  // A "small" request may still land above the split point; account it as
  // large in that case.
  large |= scratch.queue_base > scratch_pool_.high_split();
  scratch.large = large;

  scratch.queue_process_offset =
      (need_queue_scratch_base)
          ? uintptr_t(scratch.queue_base)
          : uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base());

  if (scratch.queue_base != nullptr) {
    // Full profile (APU) scratch is host-accessible as-is; done.
    if (profile_ == HSA_PROFILE_FULL) return;
    // Base profile must map the range onto the GPU before use.
    if (profile_ == HSA_PROFILE_BASE) {
      HSAuint64 alternate_va;
      if (hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va) ==
          HSAKMT_STATUS_SUCCESS) {
        if (large) scratch_used_large_ += scratch.size;
        return;
      }
    }
  }

  // Scratch request failed allocation or mapping.
  scratch_pool_.free(scratch.queue_base);
  scratch.queue_base = nullptr;

  // Retry if large may yield needed space.
  if (scratch_used_large_ != 0) {
    scratch.retry = true;
    return;
  }

  // Attempt to trim the maximum number of concurrent waves to allow scratch to fit.
  if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
    debug_print("Failed to map requested scratch - reducing queue occupancy.\n");
  uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
  // Per-wave footprint rounded up to 1KiB granularity.
  uint64_t size_per_wave = AlignUp(scratch.size_per_thread * properties_.WaveFrontSize, 1024);
  uint64_t total_waves = scratch.size / size_per_wave;
  uint64_t waves_per_cu = total_waves / num_cus;
  // Step occupancy down one wave/CU at a time until an allocation (and, on
  // base profile, a GPU mapping) succeeds.
  while (waves_per_cu != 0) {
    size_t size = waves_per_cu * num_cus * size_per_wave;
    void* base = scratch_pool_.alloc(size);
    HSAuint64 alternate_va;
    if ((base != nullptr) &&
        ((profile_ == HSA_PROFILE_FULL) ||
         (hsaKmtMapMemoryToGPU(base, size, &alternate_va) == HSAKMT_STATUS_SUCCESS))) {
      // Scratch allocated and either full profile or map succeeded.
      scratch.queue_base = base;
      scratch.size = size;
      scratch.queue_process_offset =
          (need_queue_scratch_base)
              ? uintptr_t(scratch.queue_base)
              : uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base());
      // Trimmed allocations are always accounted as large.
      scratch.large = true;
      scratch_used_large_ += scratch.size;
      return;
    }
    scratch_pool_.free(base);
    waves_per_cu--;
  }

  // Failed to allocate minimal scratch
  assert(scratch.queue_base == nullptr && "bad scratch data");
  if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
    debug_print("Could not allocate scratch for one wave per CU.\n");
}
1008
ReleaseQueueScratch(ScratchInfo & scratch)1009 void GpuAgent::ReleaseQueueScratch(ScratchInfo& scratch) {
1010 if (scratch.queue_base == nullptr) {
1011 return;
1012 }
1013
1014 ScopedAcquire<KernelMutex> lock(&scratch_lock_);
1015 if (profile_ == HSA_PROFILE_BASE) {
1016 if (HSAKMT_STATUS_SUCCESS != hsaKmtUnmapMemoryToGPU(scratch.queue_base)) {
1017 assert(false && "Unmap scratch subrange failed!");
1018 }
1019 }
1020 scratch_pool_.free(scratch.queue_base);
1021
1022 if (scratch.large) scratch_used_large_ -= scratch.size;
1023
1024 // Notify waiters that additional scratch may be available.
1025 for (auto notifier : scratch_notifiers_)
1026 HSA::hsa_signal_or_relaxed(notifier.first, notifier.second);
1027 }
1028
TranslateTime(core::Signal * signal,hsa_amd_profiling_dispatch_time_t & time)1029 void GpuAgent::TranslateTime(core::Signal* signal,
1030 hsa_amd_profiling_dispatch_time_t& time) {
1031 // Ensure interpolation
1032 ScopedAcquire<KernelMutex> lock(&t1_lock_);
1033 if (t1_.GPUClockCounter < signal->signal_.end_ts) {
1034 SyncClocks();
1035 }
1036
1037 time.start = uint64_t(
1038 (double(int64_t(t0_.SystemClockCounter - t1_.SystemClockCounter)) /
1039 double(int64_t(t0_.GPUClockCounter - t1_.GPUClockCounter))) *
1040 double(int64_t(signal->signal_.start_ts - t1_.GPUClockCounter)) +
1041 double(t1_.SystemClockCounter));
1042 time.end = uint64_t(
1043 (double(int64_t(t0_.SystemClockCounter - t1_.SystemClockCounter)) /
1044 double(int64_t(t0_.GPUClockCounter - t1_.GPUClockCounter))) *
1045 double(int64_t(signal->signal_.end_ts - t1_.GPUClockCounter)) +
1046 double(t1_.SystemClockCounter));
1047 }
1048
TranslateTime(uint64_t tick)1049 uint64_t GpuAgent::TranslateTime(uint64_t tick) {
1050 ScopedAcquire<KernelMutex> lock(&t1_lock_);
1051 SyncClocks();
1052
1053 uint64_t system_tick = 0;
1054 system_tick = uint64_t(
1055 (double(int64_t(t0_.SystemClockCounter - t1_.SystemClockCounter)) /
1056 double(int64_t(t0_.GPUClockCounter - t1_.GPUClockCounter))) *
1057 double(int64_t(tick - t1_.GPUClockCounter)) +
1058 double(t1_.SystemClockCounter));
1059 return system_tick;
1060 }
1061
// Sets the agent's coherency type.  Returns true on success.  On non-KV
// devices there is no APE1 region to reprogram, so the request is simply
// recorded.
bool GpuAgent::current_coherency_type(hsa_amd_coherency_type_t type) {
  if (!is_kv_device_) {
    current_coherency_type_ = type;
    return true;
  }

  ScopedAcquire<KernelMutex> Lock(&coherency_lock_);

  // Lazily allocate the APE1 region on first use (64KiB, 64KiB-aligned).
  // NOTE(review): allocation is never freed — it lives for the agent's
  // lifetime; _aligned_malloc implies a Windows CRT or shim — confirm.
  if (ape1_base_ == 0 && ape1_size_ == 0) {
    static const size_t kApe1Alignment = 64 * 1024;
    ape1_size_ = kApe1Alignment;
    ape1_base_ = reinterpret_cast<uintptr_t>(
        _aligned_malloc(ape1_size_, kApe1Alignment));
    assert((ape1_base_ != 0) && ("APE1 allocation failed"));
  } else if (type == current_coherency_type_) {
    // Region exists and already carries the requested type; nothing to do.
    return true;
  }

  // Pick the (type0, type1) caching-policy pair for the requested coherency.
  HSA_CACHING_TYPE type0, type1;
  if (type == HSA_AMD_COHERENCY_TYPE_COHERENT) {
    type0 = HSA_CACHING_CACHED;
    type1 = HSA_CACHING_NONCACHED;
  } else {
    type0 = HSA_CACHING_NONCACHED;
    type1 = HSA_CACHING_CACHED;
  }

  // Apply the policy to the APE1 region; on failure the cached coherency
  // state is left unchanged.
  if (hsaKmtSetMemoryPolicy(node_id(), type0, type1,
                            reinterpret_cast<void*>(ape1_base_),
                            ape1_size_) != HSAKMT_STATUS_SUCCESS) {
    return false;
  }
  current_coherency_type_ = type;
  return true;
}
1097
GetMicrocodeVersion() const1098 uint16_t GpuAgent::GetMicrocodeVersion() const {
1099 return (properties_.EngineId.ui32.uCode);
1100 }
1101
GetSdmaMicrocodeVersion() const1102 uint16_t GpuAgent::GetSdmaMicrocodeVersion() const {
1103 return (properties_.uCodeEngineVersions.uCodeSDMA);
1104 }
1105
SyncClocks()1106 void GpuAgent::SyncClocks() {
1107 HSAKMT_STATUS err = hsaKmtGetClockCounters(node_id(), &t1_);
1108 assert(err == HSAKMT_STATUS_SUCCESS && "hsaGetClockCounters error");
1109 }
1110
// Assembles and installs the wavefront trap handler for this node.  The
// handler marks the queue's inactive signal with an "unhandled exception"
// value, notifies any attached signal event, then halts the offending wave.
void GpuAgent::BindTrapHandler() {
  // SP3 source for the trap handler (assembled below); the literal must not
  // be altered — it is the shader program itself.
  const char* src_sp3 = R"(
    var s_trap_info_lo = ttmp0
    var s_trap_info_hi = ttmp1
    var s_tmp0 = ttmp2
    var s_tmp1 = ttmp3
    var s_tmp2 = ttmp4
    var s_tmp3 = ttmp5

    shader TrapHandler
      type(CS)

      // Retrieve the queue inactive signal.
      s_load_dwordx2       [s_tmp0, s_tmp1], s[0:1], 0xC0
      s_waitcnt            lgkmcnt(0)

      // Mask all but one lane of the wavefront.
      s_mov_b64            exec, 0x1

      // Set queue signal value to unhandled exception error.
      s_add_u32            s_tmp0, s_tmp0, 0x8
      s_addc_u32           s_tmp1, s_tmp1, 0x0
      v_mov_b32            v0, s_tmp0
      v_mov_b32            v1, s_tmp1
      v_mov_b32            v2, 0x80000000
      v_mov_b32            v3, 0x0
      flat_atomic_swap_x2  v[0:1], v[0:1], v[2:3]
      s_waitcnt            vmcnt(0)

      // Skip event if the signal was already set to unhandled exception.
      v_cmp_eq_u64         vcc, v[0:1], v[2:3]
      s_cbranch_vccnz      L_SIGNAL_DONE

      // Check for a non-NULL signal event mailbox.
      s_load_dwordx2       [s_tmp2, s_tmp3], [s_tmp0, s_tmp1], 0x8
      s_waitcnt            lgkmcnt(0)
      s_and_b64            [s_tmp2, s_tmp3], [s_tmp2, s_tmp3], [s_tmp2, s_tmp3]
      s_cbranch_scc0       L_SIGNAL_DONE

      // Load the signal event value.
      s_add_u32            s_tmp0, s_tmp0, 0x10
      s_addc_u32           s_tmp1, s_tmp1, 0x0
      s_load_dword         s_tmp0, [s_tmp0, s_tmp1], 0x0
      s_waitcnt            lgkmcnt(0)

      // Write the signal event value to the mailbox.
      v_mov_b32            v0, s_tmp2
      v_mov_b32            v1, s_tmp3
      v_mov_b32            v2, s_tmp0
      flat_store_dword     v[0:1], v2
      s_waitcnt            vmcnt(0)

      // Send an interrupt to trigger event notification.
      s_sendmsg            sendmsg(MSG_INTERRUPT)

    L_SIGNAL_DONE:
      // Halt wavefront and exit trap.
      s_sethalt            1
      s_rfe_b64            [s_trap_info_lo, s_trap_info_hi]
    end
  )";

  if (isa_->GetMajorVersion() == 7) {
    // No trap handler support on Gfx7, soft error.
    return;
  }

  // Disable trap handler on Carrizo until KFD is fixed.
  if (profile_ == HSA_PROFILE_FULL) {
    return;
  }

  // Assemble the trap handler source code.
  AssembleShader(src_sp3, "TrapHandler", AssembleTarget::ISA, trap_code_buf_,
                 trap_code_buf_size_);

  // Bind the trap handler to this node.
  HSAKMT_STATUS err = hsaKmtSetTrapHandler(node_id(), trap_code_buf_,
                                           trap_code_buf_size_, NULL, 0);
  assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtSetTrapHandler() failed");
}
1192
// Invalidates instruction/scalar/TC caches that may hold stale lines from a
// code object allocation, by submitting a PM4 ACQUIRE_MEM packet on the
// utility queue.  Skipped on microcode builds that perform the invalidation
// themselves.
void GpuAgent::InvalidateCodeCaches() {
  // Check for microcode cache invalidation support.
  // This is deprecated in later microcode builds.
  if (isa_->GetMajorVersion() == 7) {
    if (properties_.EngineId.ui32.uCode < 420) {
      // Microcode is handling code cache invalidation.
      return;
    }
  } else if (isa_->GetMajorVersion() == 8 && isa_->GetMinorVersion() == 0) {
    if (properties_.EngineId.ui32.uCode < 685) {
      // Microcode is handling code cache invalidation.
      return;
    }
  } else if (isa_->GetMajorVersion() == 9) {
    if (properties_.EngineId.ui32.uCode < 334) {
      // Warn exactly once per process, then soft-fail.
      static std::once_flag once;
      std::call_once(
          once, []() { fprintf(stderr, "warning: code cache invalidation not implemented\n"); });
      return;
    }
  } else {
    assert(false && "Code cache invalidation not implemented for this agent");
  }

  // Invalidate caches which may hold lines of code object allocation.
  constexpr uint32_t cache_inv_size_dw = 7;
  uint32_t cache_inv[cache_inv_size_dw];

  // Build the ACQUIRE_MEM packet: header, cache-action mask, then a full
  // 40-bit coherency size (0xFF:FFFFFFFF) with zeroed base/poll fields.
  cache_inv[0] = PM4_HDR(PM4_HDR_IT_OPCODE_ACQUIRE_MEM, cache_inv_size_dw,
                         isa_->GetMajorVersion());
  cache_inv[1] = PM4_ACQUIRE_MEM_DW1_COHER_CNTL(
      PM4_ACQUIRE_MEM_COHER_CNTL_SH_ICACHE_ACTION_ENA |
      PM4_ACQUIRE_MEM_COHER_CNTL_SH_KCACHE_ACTION_ENA |
      PM4_ACQUIRE_MEM_COHER_CNTL_TC_ACTION_ENA |
      PM4_ACQUIRE_MEM_COHER_CNTL_TC_WB_ACTION_ENA);
  cache_inv[2] = PM4_ACQUIRE_MEM_DW2_COHER_SIZE(0xFFFFFFFF);
  cache_inv[3] = PM4_ACQUIRE_MEM_DW3_COHER_SIZE_HI(0xFF);
  cache_inv[4] = 0;
  cache_inv[5] = 0;
  cache_inv[6] = 0;

  // Submit the command to the utility queue and wait for it to complete.
  queues_[QueueUtility]->ExecutePM4(cache_inv, sizeof(cache_inv));
}
1237
1238 } // namespace
1239