//===----RTLs/hsa/src/rtl.cpp - Target RTLs Implementation -------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for hsa machine
//
//===----------------------------------------------------------------------===//

#include <algorithm>
#include <assert.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <dlfcn.h>
#include <elf.h>
#include <ffi.h>
#include <fstream>
#include <iostream>
#include <libelf.h>
#include <list>
#include <memory>
#include <mutex>
#include <queue>
#include <shared_mutex>
#include <thread>
#include <unordered_map>
#include <vector>

// Header from ATMI interface
#include "atmi_interop_hsa.h"
#include "atmi_runtime.h"

#include "internal.h"

#include "Debug.h"
#include "get_elf_mach_gfx_name.h"
#include "omptargetplugin.h"

#include "llvm/Frontend/OpenMP/OMPGridValues.h"

#ifndef TARGET_NAME
#define TARGET_NAME AMDHSA
#endif
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
// hostrpc interface, FIXME: consider moving to its own include. These are
// statically linked into amdgpu/plugin if present from hostrpc_services.a,
// linked as --whole-archive to override the weak symbols that are used to
// implement a fallback for toolchains that do not yet have a hostrpc library.
extern "C" {
unsigned long hostrpc_assign_buffer(hsa_agent_t agent, hsa_queue_t *this_Q,
                                    uint32_t device_id);
hsa_status_t hostrpc_init();
hsa_status_t hostrpc_terminate();

__attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; }
__attribute__((weak)) hsa_status_t hostrpc_terminate() {
  return HSA_STATUS_SUCCESS;
}
__attribute__((weak)) unsigned long
hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *, uint32_t device_id) {
  DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "
     "missing\n",
     device_id);
  return 0;
}
}

int print_kernel_trace;

// Size of the target call stack structure
uint32_t TgtStackItemSize = 0;

#undef check // Drop definition from internal.h
#ifdef OMPTARGET_DEBUG
#define check(msg, status)                                                     \
  if (status != ATMI_STATUS_SUCCESS) {                                         \
    /* fprintf(stderr, "[%s:%d] %s failed.\n", __FILE__, __LINE__, #msg);*/    \
    DP(#msg " failed\n");                                                      \
    /*assert(0);*/                                                             \
  } else {                                                                     \
    /* fprintf(stderr, "[%s:%d] %s succeeded.\n", __FILE__, __LINE__, #msg);   \
     */                                                                        \
    DP(#msg " succeeded\n");                                                   \
  }
#else
#define check(msg, status)                                                     \
  {}
#endif

#include "elf_common.h"

/// Keep entries table per device
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD,    // constructors, destructors,
           // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

struct KernelArgPool {
private:
  static pthread_mutex_t mutex;

public:
  uint32_t kernarg_segment_size;
  void *kernarg_region = nullptr;
  std::queue<int> free_kernarg_segments;

  uint32_t kernarg_size_including_implicit() {
    return kernarg_segment_size + sizeof(atmi_implicit_args_t);
  }

  ~KernelArgPool() {
    if (kernarg_region) {
      auto r = hsa_amd_memory_pool_free(kernarg_region);
      assert(r == HSA_STATUS_SUCCESS);
      ErrorCheck(Memory pool free, r);
    }
  }

  // Can't really copy or move a mutex
  KernelArgPool() = default;
  KernelArgPool(const KernelArgPool &) = delete;
  KernelArgPool(KernelArgPool &&) = delete;

  KernelArgPool(uint32_t kernarg_segment_size)
      : kernarg_segment_size(kernarg_segment_size) {

    // atmi uses one pool per kernel for all gpus, with a fixed upper size
    // preserving that exact scheme here, including the queue<int>
    {
      hsa_status_t err = hsa_amd_memory_pool_allocate(
          atl_gpu_kernarg_pools[0],
          kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0,
          &kernarg_region);
      ErrorCheck(Allocating memory for the executable-kernel, err);
      core::allow_access_to_all_gpu_agents(kernarg_region);

      for (int i = 0; i < MAX_NUM_KERNELS; i++) {
        free_kernarg_segments.push(i);
      }
    }
  }

  void *allocate(uint64_t arg_num) {
    assert((arg_num * sizeof(void *)) == kernarg_segment_size);
    lock l(&mutex);
    void *res = nullptr;
    if (!free_kernarg_segments.empty()) {

      int free_idx = free_kernarg_segments.front();
      res = static_cast<void *>(static_cast<char *>(kernarg_region) +
                                (free_idx * kernarg_size_including_implicit()));
      assert(free_idx == pointer_to_index(res));
      free_kernarg_segments.pop();
    }
    return res;
  }

  void deallocate(void *ptr) {
    lock l(&mutex);
    int idx = pointer_to_index(ptr);
    free_kernarg_segments.push(idx);
  }

private:
  int pointer_to_index(void *ptr) {
    ptrdiff_t bytes =
        static_cast<char *>(ptr) - static_cast<char *>(kernarg_region);
    assert(bytes >= 0);
    assert(bytes % kernarg_size_including_implicit() == 0);
    return bytes / kernarg_size_including_implicit();
  }
  struct lock {
    lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); }
    ~lock() { pthread_mutex_unlock(m); }
    pthread_mutex_t *m;
  };
};
pthread_mutex_t KernelArgPool::mutex = PTHREAD_MUTEX_INITIALIZER;

std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
    KernelArgPoolMap;
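
// Illustrative use of the pool (a sketch, not a verbatim call site from this
// file; `kernelName` and `arg_num` are assumed to come from the dispatch
// path):
//
//   KernelArgPool &pool = *KernelArgPoolMap[kernelName];
//   void *kernarg = pool.allocate(arg_num); // nullptr when all segments busy
//   if (kernarg) {
//     // ... write kernel arguments, enqueue the dispatch, wait for it ...
//     pool.deallocate(kernarg);
//   }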

/// Use a single entity to encode a kernel and a set of flags
struct KernelTy {
  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;
  int16_t ConstWGSize;
  int32_t device_id;
  void *CallStackAddr = nullptr;
  const char *Name;

  KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int32_t _device_id,
           void *_CallStackAddr, const char *_Name,
           uint32_t _kernarg_segment_size)
      : ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize),
        device_id(_device_id), CallStackAddr(_CallStackAddr), Name(_Name) {
    DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);

    std::string N(_Name);
    if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
      KernelArgPoolMap.insert(
          std::make_pair(N, std::unique_ptr<KernelArgPool>(
                                new KernelArgPool(_kernarg_segment_size))));
    }
  }
};

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;

// ATMI API to get gpu and gpu memory place
static atmi_place_t get_gpu_place(int device_id) {
  return ATMI_PLACE_GPU(0, device_id);
}
static atmi_mem_place_t get_gpu_mem_place(int device_id) {
  return ATMI_MEM_PLACE_GPU_MEM(0, device_id, 0);
}

static std::vector<hsa_agent_t> find_gpu_agents() {
  std::vector<hsa_agent_t> res;

  hsa_status_t err = hsa_iterate_agents(
      [](hsa_agent_t agent, void *data) -> hsa_status_t {
        std::vector<hsa_agent_t> *res =
            static_cast<std::vector<hsa_agent_t> *>(data);

        hsa_device_type_t device_type;
        // get_info fails iff HSA runtime not yet initialized
        hsa_status_t err =
            hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
        if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS)
          printf("rtl.cpp: err %d\n", err);
        assert(err == HSA_STATUS_SUCCESS);

        if (device_type == HSA_DEVICE_TYPE_GPU) {
          res->push_back(agent);
        }
        return HSA_STATUS_SUCCESS;
      },
      &res);

  // iterate_agents fails iff HSA runtime not yet initialized
  if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS)
    printf("rtl.cpp: err %d\n", err);
  assert(err == HSA_STATUS_SUCCESS);
  return res;
}

static void callbackQueue(hsa_status_t status, hsa_queue_t *source,
                          void *data) {
  if (status != HSA_STATUS_SUCCESS) {
    const char *status_string;
    if (hsa_status_string(status, &status_string) != HSA_STATUS_SUCCESS) {
      status_string = "unavailable";
    }
    fprintf(stderr, "[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__,
            __LINE__, source, status, status_string);
    abort();
  }
}

namespace core {
void packet_store_release(uint32_t *packet, uint16_t header, uint16_t rest) {
  __atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE);
}

uint16_t create_header(hsa_packet_type_t type, int barrier,
                       atmi_task_fence_scope_t acq_fence,
                       atmi_task_fence_scope_t rel_fence) {
  uint16_t header = type << HSA_PACKET_HEADER_TYPE;
  header |= barrier << HSA_PACKET_HEADER_BARRIER;
  header |= (hsa_fence_scope_t) static_cast<int>(
      acq_fence << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE);
  header |= (hsa_fence_scope_t) static_cast<int>(
      rel_fence << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
  return header;
}
} // namespace core
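
// Sketch of how the two helpers above combine when publishing a dispatch
// packet (illustrative only; `packet` and the ATMI fence-scope enumerators
// are assumed from the surrounding headers):
//
//   hsa_kernel_dispatch_packet_t *packet = ...; // slot in the HSA queue
//   uint16_t header = core::create_header(HSA_PACKET_TYPE_KERNEL_DISPATCH,
//                                         /*barrier=*/0,
//                                         ATMI_FENCE_SCOPE_SYSTEM,
//                                         ATMI_FENCE_SCOPE_SYSTEM);
//   // The release store makes the packet visible to the GPU packet processor.
//   core::packet_store_release(reinterpret_cast<uint32_t *>(packet), header,
//                              packet->setup);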

/// Class containing all the device information
class RTLDeviceInfoTy {
  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;

public:
  // load binary populates symbol tables and mutates various global state
  // run uses those symbol tables
  std::shared_timed_mutex load_run_lock;

  int NumberOfDevices;

  // GPU devices
  std::vector<hsa_agent_t> HSAAgents;
  std::vector<hsa_queue_t *> HSAQueues; // one per gpu

  // Device properties
  std::vector<int> ComputeUnits;
  std::vector<int> GroupsPerDevice;
  std::vector<int> ThreadsPerGroup;
  std::vector<int> WarpSize;
  std::vector<std::string> GPUName;

  // OpenMP properties
  std::vector<int> NumTeams;
  std::vector<int> NumThreads;

  // OpenMP Environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  int EnvMaxTeamsDefault;

  // OpenMP Requires Flags
  int64_t RequiresFlags;

  // Resource pools
  SignalPoolT FreeSignalPool;

  struct atmiFreePtrDeletor {
    void operator()(void *p) {
      atmi_free(p); // ignore failure to free
    }
  };

  // device_State shared across loaded binaries, error if inconsistent size
  std::vector<std::pair<std::unique_ptr<void, atmiFreePtrDeletor>, uint64_t>>
      deviceStateStore;

  static const unsigned HardTeamLimit =
      (1 << 16) - 1; // 64K needed to fit in uint16
  static const int DefaultNumTeams = 128;
  static const int Max_Teams =
      llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_Teams];
  static const int Warp_Size =
      llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size];
  static const int Max_WG_Size =
      llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_WG_Size];
  static const int Default_WG_Size =
      llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size];

  using MemcpyFunc = atmi_status_t (*)(hsa_signal_t, void *, const void *,
                                       size_t size, hsa_agent_t);
  atmi_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size,
                                      MemcpyFunc Func, int32_t deviceId) {
    hsa_agent_t agent = HSAAgents[deviceId];
    hsa_signal_t s = FreeSignalPool.pop();
    if (s.handle == 0) {
      return ATMI_STATUS_ERROR;
    }
    atmi_status_t r = Func(s, dest, src, size, agent);
    FreeSignalPool.push(s);
    return r;
  }

  atmi_status_t freesignalpool_memcpy_d2h(void *dest, const void *src,
                                          size_t size, int32_t deviceId) {
    return freesignalpool_memcpy(dest, src, size, atmi_memcpy_d2h, deviceId);
  }

  atmi_status_t freesignalpool_memcpy_h2d(void *dest, const void *src,
                                          size_t size, int32_t deviceId) {
    return freesignalpool_memcpy(dest, src, size, atmi_memcpy_h2d, deviceId);
  }

  // Record entry point associated with device
  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with device
  bool findOffloadEntry(int32_t device_id, void *addr) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    for (auto &it : E.Entries) {
      if (it.addr == addr)
        return true;
    }

    return false;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    int32_t size = E.Entries.size();

    // Table is empty
    if (!size)
      return 0;

    __tgt_offload_entry *begin = &E.Entries[0];
    __tgt_offload_entry *end = &E.Entries[size - 1];

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = begin;
    E.Table.EntriesEnd = ++end;

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(int device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncGblEntries[device_id].emplace_back();
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
    // KernelArgPoolMap.clear();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
  }

  RTLDeviceInfoTy() {
    // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr
    // at any time; it does not require a debug build of the library.
    // 0 => no tracing
    // 1 => tracing dispatch only
    // >1 => verbosity increase
    if (char *envStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
      print_kernel_trace = atoi(envStr);
    else
      print_kernel_trace = 0;

    DP("Start initializing HSA-ATMI\n");
    atmi_status_t err = atmi_init();
    if (err != ATMI_STATUS_SUCCESS) {
      DP("Error when initializing HSA-ATMI\n");
      return;
    }
    // Init hostcall soon after initializing ATMI
    hostrpc_init();

    HSAAgents = find_gpu_agents();
    NumberOfDevices = (int)HSAAgents.size();

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting HSA.\n");
      return;
    } else {
      DP("There are %d devices supporting HSA.\n", NumberOfDevices);
    }

    // Init the device info
    HSAQueues.resize(NumberOfDevices);
    FuncGblEntries.resize(NumberOfDevices);
    ThreadsPerGroup.resize(NumberOfDevices);
    ComputeUnits.resize(NumberOfDevices);
    GPUName.resize(NumberOfDevices);
    GroupsPerDevice.resize(NumberOfDevices);
    WarpSize.resize(NumberOfDevices);
    NumTeams.resize(NumberOfDevices);
    NumThreads.resize(NumberOfDevices);
    deviceStateStore.resize(NumberOfDevices);

    for (int i = 0; i < NumberOfDevices; i++) {
      uint32_t queue_size = 0;
      {
        hsa_status_t err;
        err = hsa_agent_get_info(HSAAgents[i], HSA_AGENT_INFO_QUEUE_MAX_SIZE,
                                 &queue_size);
        ErrorCheck(Querying the agent maximum queue size, err);
        if (queue_size > core::Runtime::getInstance().getMaxQueueSize()) {
          queue_size = core::Runtime::getInstance().getMaxQueueSize();
        }
      }

      hsa_status_t rc = hsa_queue_create(
          HSAAgents[i], queue_size, HSA_QUEUE_TYPE_MULTI, callbackQueue, NULL,
          UINT32_MAX, UINT32_MAX, &HSAQueues[i]);
      if (rc != HSA_STATUS_SUCCESS) {
        DP("Failed to create HSA queues\n");
        return;
      }

      deviceStateStore[i] = {nullptr, 0};
    }

    for (int i = 0; i < NumberOfDevices; i++) {
      ThreadsPerGroup[i] = RTLDeviceInfoTy::Default_WG_Size;
      GroupsPerDevice[i] = RTLDeviceInfoTy::DefaultNumTeams;
      ComputeUnits[i] = 1;
      DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", i,
         GroupsPerDevice[i], ThreadsPerGroup[i]);
    }

    // Get environment variables regarding teams
    char *envStr = getenv("OMP_TEAM_LIMIT");
    if (envStr) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(envStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    } else {
      EnvTeamLimit = -1;
    }
    envStr = getenv("OMP_NUM_TEAMS");
    if (envStr) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(envStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    } else {
      EnvNumTeams = -1;
    }
    // Get environment variables regarding expMaxTeams
    envStr = getenv("OMP_MAX_TEAMS_DEFAULT");
    if (envStr) {
      EnvMaxTeamsDefault = std::stoi(envStr);
      DP("Parsed OMP_MAX_TEAMS_DEFAULT=%d\n", EnvMaxTeamsDefault);
    } else {
      EnvMaxTeamsDefault = -1;
    }

    // Default state.
    RequiresFlags = OMP_REQ_UNDEFINED;
  }

  ~RTLDeviceInfoTy() {
    DP("Finalizing the HSA-ATMI DeviceInfo.\n");
    // Run destructors on types that use HSA before
    // atmi_finalize removes access to it
    deviceStateStore.clear();
    KernelArgPoolMap.clear();
    // Terminate hostrpc before finalizing ATMI
    hostrpc_terminate();
    atmi_finalize();
  }
};

pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;

// TODO: May need to drop the trailing two fields until deviceRTL is updated
struct omptarget_device_environmentTy {
  int32_t debug_level; // gets value of envvar LIBOMPTARGET_DEVICE_RTL_DEBUG
                       // only useful for Debug build of deviceRTLs
  int32_t num_devices; // gets number of active offload devices
  int32_t device_num;  // gets a value 0 to num_devices-1
};

static RTLDeviceInfoTy DeviceInfo;

namespace {

int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
                     __tgt_async_info *AsyncInfoPtr) {
  assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
  // Return success if we are not copying back to host from target.
  if (!HstPtr)
    return OFFLOAD_SUCCESS;
  atmi_status_t err;
  DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
     (long long unsigned)(Elf64_Addr)TgtPtr,
     (long long unsigned)(Elf64_Addr)HstPtr);

  err = DeviceInfo.freesignalpool_memcpy_d2h(HstPtr, TgtPtr, (size_t)Size,
                                             DeviceId);

  if (err != ATMI_STATUS_SUCCESS) {
    DP("Error when copying data from device to host. Pointers: "
       "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
       (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
    return OFFLOAD_FAIL;
  }
  DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
     (long long unsigned)(Elf64_Addr)TgtPtr,
     (long long unsigned)(Elf64_Addr)HstPtr);
  return OFFLOAD_SUCCESS;
}

int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
                   __tgt_async_info *AsyncInfoPtr) {
  assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
  atmi_status_t err;
  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
  // Return success if we are not doing host to target.
  if (!HstPtr)
    return OFFLOAD_SUCCESS;

  DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
     (long long unsigned)(Elf64_Addr)HstPtr,
     (long long unsigned)(Elf64_Addr)TgtPtr);
  err = DeviceInfo.freesignalpool_memcpy_h2d(TgtPtr, HstPtr, (size_t)Size,
                                             DeviceId);
  if (err != ATMI_STATUS_SUCCESS) {
    DP("Error when copying data from host to device. Pointers: "
       "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
       (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

// Async.
// The implementation was written with cuda streams in mind. The semantics of
// that are to execute kernels on a queue in order of insertion. A synchronise
// call then makes writes visible between host and device. This means a series
// of N data_submit_async calls are expected to execute serially. HSA offers
// various options to run the data copies concurrently. This may require
// changes to libomptarget.

// __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate
// that there are no outstanding kernels that need to be synchronized. Any
// async call may be passed a Queue==0, at which point the cuda implementation
// will set it to non-null (see getStream). The cuda streams are per-device.
// Upstream may change this interface to explicitly initialize the
// async_info_pointer, but until then hsa lazily initializes it as well.
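
// A typical sequence under these semantics (illustrative only; the *_async
// entry points and __tgt_async_info are declared in omptargetplugin.h, and
// the argument lists are abbreviated here):
//
//   __tgt_async_info info;                               // Queue starts null
//   __tgt_rtl_data_submit_async(id, tgt, hst, n, &info); // sets Queue != 0
//   __tgt_rtl_run_target_region_async(id, ..., &info);   // reuses same info
//   __tgt_rtl_synchronize(id, &info);                    // waits, Queue = 0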

void initAsyncInfoPtr(__tgt_async_info *async_info_ptr) {
  // set non-null while using async calls, return to null to indicate
  // completion
  assert(async_info_ptr);
  if (!async_info_ptr->Queue) {
    async_info_ptr->Queue = reinterpret_cast<void *>(UINT64_MAX);
  }
}
void finiAsyncInfoPtr(__tgt_async_info *async_info_ptr) {
  assert(async_info_ptr);
  assert(async_info_ptr->Queue);
  async_info_ptr->Queue = 0;
}

bool elf_machine_id_is_amdgcn(__tgt_device_image *image) {
  const uint16_t amdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h
  int32_t r = elf_check_machine(image, amdgcnMachineID);
  if (!r) {
    DP("Supported machine ID not found\n");
  }
  return r;
}

uint32_t elf_e_flags(__tgt_device_image *image) {
  char *img_begin = (char *)image->ImageStart;
  size_t img_size = (char *)image->ImageEnd - img_begin;

  Elf *e = elf_memory(img_begin, img_size);
  if (!e) {
    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
    return 0;
  }

  Elf64_Ehdr *eh64 = elf64_getehdr(e);

  if (!eh64) {
    DP("Unable to get machine ID from ELF file!\n");
    elf_end(e);
    return 0;
  }

  uint32_t Flags = eh64->e_flags;

  elf_end(e);
  DP("ELF Flags: 0x%x\n", Flags);
  return Flags;
}
} // namespace

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_machine_id_is_amdgcn(image);
}

int __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %ld\n", RequiresFlags);
  DeviceInfo.RequiresFlags = RequiresFlags;
  return RequiresFlags;
}

int32_t __tgt_rtl_init_device(int device_id) {
  hsa_status_t err;

  // this is per device id init
  DP("Initialize the device id: %d\n", device_id);

  hsa_agent_t agent = DeviceInfo.HSAAgents[device_id];

  // Get number of Compute Unit
  uint32_t compute_units = 0;
  err = hsa_agent_get_info(
      agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
      &compute_units);
  if (err != HSA_STATUS_SUCCESS) {
    DeviceInfo.ComputeUnits[device_id] = 1;
    DP("Error getting compute units : setting to 1\n");
  } else {
    DeviceInfo.ComputeUnits[device_id] = compute_units;
    DP("Using %d compute units per grid\n", DeviceInfo.ComputeUnits[device_id]);
  }

  char GetInfoName[64]; // 64 max size returned by get info
  err = hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
                           (void *)GetInfoName);
  if (err)
    DeviceInfo.GPUName[device_id] = "--unknown gpu--";
  else {
    DeviceInfo.GPUName[device_id] = GetInfoName;
  }

  if (print_kernel_trace == 4)
    fprintf(stderr, "Device#%-2d CU's: %2d %s\n", device_id,
            DeviceInfo.ComputeUnits[device_id],
            DeviceInfo.GPUName[device_id].c_str());

  // Query attributes to determine number of threads/block and blocks/grid.
  uint16_t workgroup_max_dim[3];
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
                           &workgroup_max_dim);
  if (err != HSA_STATUS_SUCCESS) {
    DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DP("Error getting grid dims: num groups : %d\n",
       RTLDeviceInfoTy::DefaultNumTeams);
  } else if (workgroup_max_dim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
    DeviceInfo.GroupsPerDevice[device_id] = workgroup_max_dim[0];
    DP("Using %d ROCm blocks per grid\n",
       DeviceInfo.GroupsPerDevice[device_id]);
  } else {
    DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::HardTeamLimit;
    DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "
       "at the hard limit\n",
       workgroup_max_dim[0], RTLDeviceInfoTy::HardTeamLimit);
  }

  // Get thread limit
  hsa_dim3_t grid_max_dim;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_DIM, &grid_max_dim);
  if (err == HSA_STATUS_SUCCESS) {
    DeviceInfo.ThreadsPerGroup[device_id] =
        reinterpret_cast<uint32_t *>(&grid_max_dim)[0] /
        DeviceInfo.GroupsPerDevice[device_id];
    if ((DeviceInfo.ThreadsPerGroup[device_id] >
         RTLDeviceInfoTy::Max_WG_Size) ||
        DeviceInfo.ThreadsPerGroup[device_id] == 0) {
      DP("Capped thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size);
      DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size;
    } else {
      DP("Using ROCm Queried thread limit: %d\n",
         DeviceInfo.ThreadsPerGroup[device_id]);
    }
  } else {
    DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size;
    DP("Error getting max block dimension, use default: %d\n",
       RTLDeviceInfoTy::Max_WG_Size);
  }

  // Get wavefront size
  uint32_t wavefront_size = 0;
  err =
      hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size);
  if (err == HSA_STATUS_SUCCESS) {
    DP("Queried wavefront size: %d\n", wavefront_size);
    DeviceInfo.WarpSize[device_id] = wavefront_size;
  } else {
    DP("Default wavefront size: %d\n",
       llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]);
    DeviceInfo.WarpSize[device_id] =
        llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size];
  }

  // Adjust teams to the env variables
  if (DeviceInfo.EnvTeamLimit > 0 &&
      DeviceInfo.GroupsPerDevice[device_id] > DeviceInfo.EnvTeamLimit) {
    DeviceInfo.GroupsPerDevice[device_id] = DeviceInfo.EnvTeamLimit;
    DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",
       DeviceInfo.EnvTeamLimit);
  }

  // Set default number of teams
  if (DeviceInfo.EnvNumTeams > 0) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
    DP("Default number of teams set according to environment %d\n",
       DeviceInfo.EnvNumTeams);
  } else {
    char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC");
    int TeamsPerCU = 1; // default number of teams per CU is 1
    if (TeamsPerCUEnvStr) {
      TeamsPerCU = std::stoi(TeamsPerCUEnvStr);
    }

    DeviceInfo.NumTeams[device_id] =
        TeamsPerCU * DeviceInfo.ComputeUnits[device_id];
    DP("Default number of teams = %d * number of compute units %d\n",
       TeamsPerCU, DeviceInfo.ComputeUnits[device_id]);
  }

  if (DeviceInfo.NumTeams[device_id] > DeviceInfo.GroupsPerDevice[device_id]) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.GroupsPerDevice[device_id];
    DP("Default number of teams exceeds device limit, capping at %d\n",
       DeviceInfo.GroupsPerDevice[device_id]);
  }

  // Set default number of threads
  DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::Default_WG_Size;
  DP("Default number of threads set according to library's default %d\n",
     RTLDeviceInfoTy::Default_WG_Size);
  if (DeviceInfo.NumThreads[device_id] >
      DeviceInfo.ThreadsPerGroup[device_id]) {
    DeviceInfo.NumThreads[device_id] = DeviceInfo.ThreadsPerGroup[device_id];
    DP("Default number of threads exceeds device limit, capping at %d\n",
       DeviceInfo.ThreadsPerGroup[device_id]);
  }

  DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",
     device_id, DeviceInfo.GroupsPerDevice[device_id],
     DeviceInfo.ThreadsPerGroup[device_id]);

  DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", device_id,
     DeviceInfo.WarpSize[device_id], DeviceInfo.ThreadsPerGroup[device_id],
     DeviceInfo.GroupsPerDevice[device_id],
     DeviceInfo.GroupsPerDevice[device_id] *
         DeviceInfo.ThreadsPerGroup[device_id]);

  return OFFLOAD_SUCCESS;
}

namespace {
Elf64_Shdr *find_only_SHT_HASH(Elf *elf) {
  size_t N;
  int rc = elf_getshdrnum(elf, &N);
  if (rc != 0) {
    return nullptr;
  }

  Elf64_Shdr *result = nullptr;
  for (size_t i = 0; i < N; i++) {
    Elf_Scn *scn = elf_getscn(elf, i);
    if (scn) {
      Elf64_Shdr *shdr = elf64_getshdr(scn);
      if (shdr) {
        if (shdr->sh_type == SHT_HASH) {
          if (result == nullptr) {
            result = shdr;
          } else {
            // multiple SHT_HASH sections not handled
            return nullptr;
          }
        }
      }
    }
  }
  return result;
}

const Elf64_Sym *elf_lookup(Elf *elf, char *base, Elf64_Shdr *section_hash,
                            const char *symname) {

  assert(section_hash);
  size_t section_symtab_index = section_hash->sh_link;
  Elf64_Shdr *section_symtab =
      elf64_getshdr(elf_getscn(elf, section_symtab_index));
  size_t section_strtab_index = section_symtab->sh_link;

  const Elf64_Sym *symtab =
      reinterpret_cast<const Elf64_Sym *>(base + section_symtab->sh_offset);

  const uint32_t *hashtab =
      reinterpret_cast<const uint32_t *>(base + section_hash->sh_offset);

  // Layout:
  //   nbucket
  //   nchain
  //   bucket[nbucket]
  //   chain[nchain]
  uint32_t nbucket = hashtab[0];
  const uint32_t *bucket = &hashtab[2];
  const uint32_t *chain = &hashtab[nbucket + 2];

  const size_t max = strlen(symname) + 1;
  const uint32_t hash = elf_hash(symname);
  for (uint32_t i = bucket[hash % nbucket]; i != 0; i = chain[i]) {
    char *n = elf_strptr(elf, section_strtab_index, symtab[i].st_name);
    if (strncmp(symname, n, max) == 0) {
      return &symtab[i];
    }
  }

  return nullptr;
}
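
// For reference, elf_hash above is the classic System V ELF hash provided by
// libelf; an equivalent definition (shown only to document the lookup) is:
//
//   unsigned long elf_hash(const char *name) {
//     unsigned long h = 0, g;
//     while (*name) {
//       h = (h << 4) + (unsigned char)*name++;
//       if ((g = h & 0xf0000000))
//         h ^= g >> 24;
//       h &= ~g;
//     }
//     return h;
//   }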

typedef struct {
  void *addr = nullptr;
  uint32_t size = UINT32_MAX;
  uint32_t sh_type = SHT_NULL;
} symbol_info;

int get_symbol_info_without_loading(Elf *elf, char *base, const char *symname,
                                    symbol_info *res) {
  if (elf_kind(elf) != ELF_K_ELF) {
    return 1;
  }

  Elf64_Shdr *section_hash = find_only_SHT_HASH(elf);
  if (!section_hash) {
    return 1;
  }

  const Elf64_Sym *sym = elf_lookup(elf, base, section_hash, symname);
  if (!sym) {
    return 1;
  }

  if (sym->st_size > UINT32_MAX) {
    return 1;
  }

  if (sym->st_shndx == SHN_UNDEF) {
    return 1;
  }

  Elf_Scn *section = elf_getscn(elf, sym->st_shndx);
  if (!section) {
    return 1;
  }

  Elf64_Shdr *header = elf64_getshdr(section);
  if (!header) {
    return 1;
  }

  res->addr = sym->st_value + base;
  res->size = static_cast<uint32_t>(sym->st_size);
  res->sh_type = header->sh_type;
  return 0;
}

int get_symbol_info_without_loading(char *base, size_t img_size,
                                    const char *symname, symbol_info *res) {
  Elf *elf = elf_memory(base, img_size);
  if (elf) {
    int rc = get_symbol_info_without_loading(elf, base, symname, res);
    elf_end(elf);
    return rc;
  }
  return 1;
}

atmi_status_t interop_get_symbol_info(char *base, size_t img_size,
                                      const char *symname, void **var_addr,
                                      uint32_t *var_size) {
  symbol_info si;
  int rc = get_symbol_info_without_loading(base, img_size, symname, &si);
  if (rc == 0) {
    *var_addr = si.addr;
    *var_size = si.size;
    return ATMI_STATUS_SUCCESS;
  } else {
    return ATMI_STATUS_ERROR;
  }
}

template <typename C>
atmi_status_t module_register_from_memory_to_place(void *module_bytes,
                                                   size_t module_size,
                                                   atmi_place_t place, C cb) {
  auto L = [](void *data, size_t size, void *cb_state) -> atmi_status_t {
    C *unwrapped = static_cast<C *>(cb_state);
    return (*unwrapped)(data, size);
  };
  return atmi_module_register_from_memory_to_place(
      module_bytes, module_size, place, L, static_cast<void *>(&cb));
}
} // namespace
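
// Example of the wrapper above (a sketch mirroring the call made later in
// __tgt_rtl_load_binary_locked; `patch_image` is a hypothetical callback):
// the captureless lambda L converts to the C function pointer that ATMI
// expects, while the real, capturing callback travels through cb_state.
//
//   atmi_status_t rc = module_register_from_memory_to_place(
//       image_bytes, image_size, get_gpu_place(device_id),
//       [&](void *data, size_t size) { return patch_image(data, size); });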

static uint64_t get_device_State_bytes(char *ImageStart, size_t img_size) {
  uint64_t device_State_bytes = 0;
  {
    // If this is the deviceRTL, get the state variable size
    symbol_info size_si;
    int rc = get_symbol_info_without_loading(
        ImageStart, img_size, "omptarget_nvptx_device_State_size", &size_si);

    if (rc == 0) {
      if (size_si.size != sizeof(uint64_t)) {
        fprintf(stderr,
                "Found device_State_size variable with wrong size, aborting\n");
        exit(1);
      }

      // Read number of bytes directly from the elf
      memcpy(&device_State_bytes, size_si.addr, sizeof(uint64_t));
    }
  }
  return device_State_bytes;
}

static __tgt_target_table *
__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image);

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  DeviceInfo.load_run_lock.lock();
  __tgt_target_table *res = __tgt_rtl_load_binary_locked(device_id, image);
  DeviceInfo.load_run_lock.unlock();
  return res;
}

struct device_environment {
  // initialise an omptarget_device_environmentTy in the deviceRTL
  // patches around differences in the deviceRTL between trunk, aomp,
  // rocmcc. Over time these differences will tend to zero and this class
  // can be simplified.
  // Symbol may be in .data or .bss, and may be missing fields:
  //  - aomp has debug_level, num_devices, device_num
  //  - trunk has debug_level
  //  - under review in trunk is debug_level, device_num
  //  - rocmcc matches aomp, patch to swap num_devices and device_num

  // If the symbol is in .data (aomp, rocm) it can be written directly.
  // If it is in .bss, we must wait for it to be allocated space on the
  // gpu (trunk) and initialize after loading.
  const char *sym() { return "omptarget_device_environment"; }

  omptarget_device_environmentTy host_device_env;
  symbol_info si;
  bool valid = false;

  __tgt_device_image *image;
  const size_t img_size;

  device_environment(int device_id, int number_devices,
                     __tgt_device_image *image, const size_t img_size)
      : image(image), img_size(img_size) {

    host_device_env.num_devices = number_devices;
    host_device_env.device_num = device_id;
    host_device_env.debug_level = 0;
#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
      host_device_env.debug_level = std::stoi(envStr);
    }
#endif

    int rc = get_symbol_info_without_loading((char *)image->ImageStart,
                                             img_size, sym(), &si);
    if (rc != 0) {
      DP("Finding global device environment '%s' - symbol missing.\n", sym());
      return;
    }

    if (si.size > sizeof(host_device_env)) {
      DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), si.size,
         sizeof(host_device_env));
      return;
    }

    valid = true;
  }

  bool in_image() { return si.sh_type != SHT_NOBITS; }

  atmi_status_t before_loading(void *data, size_t size) {
    assert(valid);
    if (in_image()) {
      DP("Setting global device environment before load (%u bytes)\n",
         si.size);
      uint64_t offset = (char *)si.addr - (char *)image->ImageStart;
      void *pos = (char *)data + offset;
      memcpy(pos, &host_device_env, si.size);
    }
    return ATMI_STATUS_SUCCESS;
  }

  atmi_status_t after_loading() {
    assert(valid);
    if (!in_image()) {
      DP("Setting global device environment after load (%u bytes)\n",
         si.size);
      int device_id = host_device_env.device_num;

      void *state_ptr;
      uint32_t state_ptr_size;
      atmi_status_t err = atmi_interop_hsa_get_symbol_info(
          get_gpu_mem_place(device_id), sym(), &state_ptr, &state_ptr_size);
      if (err != ATMI_STATUS_SUCCESS) {
        DP("failed to find %s in loaded image\n", sym());
        return err;
      }

      if (state_ptr_size != si.size) {
        DP("Symbol had size %u before loading, %u after\n", state_ptr_size,
           si.size);
        return ATMI_STATUS_ERROR;
      }

      return DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &host_device_env,
                                                  state_ptr_size, device_id);
    }
    return ATMI_STATUS_SUCCESS;
  }
};
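
// Usage (see __tgt_rtl_load_binary_locked below): construct a
// device_environment for the image, pass before_loading as the module
// registration callback so a .data instance is patched in the raw bytes,
// then call after_loading once the executable exists so a .bss instance can
// be initialized with a device-side memcpy.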

static atmi_status_t atmi_calloc(void **ret_ptr, size_t size,
                                 atmi_mem_place_t place) {
  uint64_t rounded = 4 * ((size + 3) / 4);
  void *ptr;
  atmi_status_t err = atmi_malloc(&ptr, rounded, place);
  if (err != ATMI_STATUS_SUCCESS) {
    return err;
  }

  hsa_status_t rc = hsa_amd_memory_fill(ptr, 0, rounded / 4);
  if (rc != HSA_STATUS_SUCCESS) {
    fprintf(stderr, "zero fill device_state failed with %u\n", rc);
    atmi_free(ptr);
    return ATMI_STATUS_ERROR;
  }

  *ret_ptr = ptr;
  return ATMI_STATUS_SUCCESS;
}
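
// Worked example of the rounding above: atmi_calloc(&p, 10, place) allocates
// rounded = 4 * ((10 + 3) / 4) = 12 bytes and zero-fills them as
// rounded / 4 = 3 uint32_t words via hsa_amd_memory_fill, which operates on
// 32-bit values.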

__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
                                                 __tgt_device_image *image) {
  // This function loads the device image onto gpu[device_id] and does other
  // per-image initialization work. Specifically:
  //
  // - Initialize an omptarget_device_environmentTy instance embedded in the
  //   image at the symbol "omptarget_device_environment"
  //   Fields debug_level, device_num, num_devices. Used by the deviceRTL.
  //
  // - Allocate a large array per-gpu (could be moved to init_device)
  //   - Read a uint64_t at symbol omptarget_nvptx_device_State_size
  //   - Allocate at least that many bytes of gpu memory
  //   - Zero initialize it
  //   - Write the pointer to the symbol omptarget_nvptx_device_State
  //
  // - Pulls some per-kernel information together from various sources and
  //   records it in the KernelsList for quicker access later
  //
  // The initialization can be done before or after loading the image onto the
  // gpu. This function presently does a mixture. Using the hsa api to get/set
  // the information is simpler to implement, in exchange for more complicated
  // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes
  // back from the gpu vs a hashtable lookup on the host.

  const size_t img_size = (char *)image->ImageEnd - (char *)image->ImageStart;

  DeviceInfo.clearOffloadEntriesTable(device_id);

  // We do not need to set the ELF version because the caller of this function
  // had to do that to decide the right runtime to use

  if (!elf_machine_id_is_amdgcn(image)) {
    return NULL;
  }

  {
    auto env = device_environment(device_id, DeviceInfo.NumberOfDevices, image,
                                  img_size);
    if (!env.valid) {
      return NULL;
    }

    atmi_status_t err = module_register_from_memory_to_place(
        (void *)image->ImageStart, img_size, get_gpu_place(device_id),
        [&](void *data, size_t size) {
          return env.before_loading(data, size);
        });

    check("Module registering", err);
    if (err != ATMI_STATUS_SUCCESS) {
      fprintf(stderr,
              "Possible gpu arch mismatch: device:%s, image:%s please check"
              " compiler flag: -march=<gpu>\n",
              DeviceInfo.GPUName[device_id].c_str(),
              get_elf_mach_gfx_name(elf_e_flags(image)));
      return NULL;
    }

    err = env.after_loading();
    if (err != ATMI_STATUS_SUCCESS) {
      return NULL;
    }
  }

  DP("ATMI module successfully loaded!\n");

  {
    // the device_State array is either large value in bss or a void* that
    // needs to be assigned to a pointer to an array of size device_state_bytes

    void *state_ptr;
    uint32_t state_ptr_size;
    atmi_status_t err = atmi_interop_hsa_get_symbol_info(
        get_gpu_mem_place(device_id), "omptarget_nvptx_device_State",
        &state_ptr, &state_ptr_size);

    if (err != ATMI_STATUS_SUCCESS) {
      fprintf(stderr, "failed to find device_state symbol\n");
      return NULL;
    }

    if (state_ptr_size < sizeof(void *)) {
      fprintf(stderr, "unexpected size of state_ptr %u != %zu\n",
              state_ptr_size, sizeof(void *));
      return NULL;
    }

    // if it's larger than a void*, assume it's a bss array and no further
    // initialization is required. Only try to set up a pointer for
    // sizeof(void*)
    if (state_ptr_size == sizeof(void *)) {
      uint64_t device_State_bytes =
          get_device_State_bytes((char *)image->ImageStart, img_size);
      if (device_State_bytes == 0) {
        return NULL;
      }

      auto &dss = DeviceInfo.deviceStateStore[device_id];
      if (dss.first.get() == nullptr) {
        assert(dss.second == 0);
        void *ptr = NULL;
        atmi_status_t err =
            atmi_calloc(&ptr, device_State_bytes, get_gpu_mem_place(device_id));
        if (err != ATMI_STATUS_SUCCESS) {
          fprintf(stderr, "Failed to allocate device_state array\n");
          return NULL;
        }
        dss = {std::unique_ptr<void, RTLDeviceInfoTy::atmiFreePtrDeletor>{ptr},
               device_State_bytes};
      }

      void *ptr = dss.first.get();
      if (device_State_bytes != dss.second) {
        fprintf(stderr, "Inconsistent sizes of device_State unsupported\n");
        exit(1);
      }

      // write ptr to device memory so it can be used by later kernels
      err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr,
                                                 sizeof(void *), device_id);
      if (err != ATMI_STATUS_SUCCESS) {
        fprintf(stderr, "memcpy install of state_ptr failed\n");
        return NULL;
      }
    }
  }

  // Here, we take advantage of the data that is appended after img_end to get
  // the symbols' name we need to load. This data consists of the host entries
  // begin and end as well as the target name (see the offloading linker script
  // creation in clang compiler).

  // Find the symbols in the module by name. The name can be obtained by
  // concatenating the host entry name with the target name

  __tgt_offload_entry *HostBegin = image->EntriesBegin;
  __tgt_offload_entry *HostEnd = image->EntriesEnd;

  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {

    if (!e->addr) {
      // The host should have always something in the address to
      // uniquely identify the target region.
      fprintf(stderr, "Analyzing host entry '<null>' (size = %lld)...\n",
              (unsigned long long)e->size);
      return NULL;
    }

    if (e->size) {
      __tgt_offload_entry entry = *e;

      void *varptr;
      uint32_t varsize;

      atmi_status_t err = atmi_interop_hsa_get_symbol_info(
          get_gpu_mem_place(device_id), e->name, &varptr, &varsize);

      if (err != ATMI_STATUS_SUCCESS) {
        DP("Loading global '%s' (Failed)\n", e->name);
        // Inform the user what symbol prevented offloading
        fprintf(stderr, "Loading global '%s' (Failed)\n", e->name);
        return NULL;
      }

      if (varsize != e->size) {
        DP("Loading global '%s' - size mismatch (%u != %lu)\n", e->name,
           varsize, e->size);
        return NULL;
      }

      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
         DPxPTR(e - HostBegin), e->name, DPxPTR(varptr));
      entry.addr = (void *)varptr;

      DeviceInfo.addOffloadEntry(device_id, entry);

      if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
          e->flags & OMP_DECLARE_TARGET_LINK) {
        // If unified memory is present any target link variables
        // can access host addresses directly. There is no longer a
        // need for device copies.
        err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr,
                                                   sizeof(void *), device_id);
        if (err != ATMI_STATUS_SUCCESS)
          DP("Error when copying USM\n");
        DP("Copy linked variable host address (" DPxMOD ") "
           "to device address (" DPxMOD ")\n",
           DPxPTR(*((void **)e->addr)), DPxPTR(varptr));
      }

      continue;
    }

    DP("to find the kernel name: %s size: %lu\n", e->name, strlen(e->name));

    atmi_mem_place_t place = get_gpu_mem_place(device_id);
    uint32_t kernarg_segment_size;
    atmi_status_t err = atmi_interop_hsa_get_kernel_info(
        place, e->name, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
        &kernarg_segment_size);

    // each arg is a void * in this openmp implementation
    uint32_t arg_num = kernarg_segment_size / sizeof(void *);
    std::vector<size_t> arg_sizes(arg_num);
    for (std::vector<size_t>::iterator it = arg_sizes.begin();
         it != arg_sizes.end(); it++) {
      *it = sizeof(void *);
    }

    // default value GENERIC (in case symbol is missing from cubin file)
    int8_t ExecModeVal = ExecutionModeType::GENERIC;

    // get flat group size if present, else Default_WG_Size
    int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;

    // get Kernel Descriptor if present.
    // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
    struct KernDescValType {
      uint16_t Version;
      uint16_t TSize;
      uint16_t WG_Size;
      uint8_t Mode;
    };
    struct KernDescValType KernDescVal;
    std::string KernDescNameStr(e->name);
    KernDescNameStr += "_kern_desc";
    const char *KernDescName = KernDescNameStr.c_str();

    void *KernDescPtr;
    uint32_t KernDescSize;
    void *CallStackAddr = nullptr;
    err = interop_get_symbol_info((char *)image->ImageStart, img_size,
                                  KernDescName, &KernDescPtr, &KernDescSize);

    if (err == ATMI_STATUS_SUCCESS) {
      if ((size_t)KernDescSize != sizeof(KernDescVal))
        DP("Loading global computation properties '%s' - size mismatch (%u != "
           "%lu)\n",
           KernDescName, KernDescSize, sizeof(KernDescVal));

      memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);

      // Check structure size against recorded size.
      if ((size_t)KernDescSize != KernDescVal.TSize)
        DP("KernDescVal size %lu does not match advertised size %d for '%s'\n",
           sizeof(KernDescVal), KernDescVal.TSize, KernDescName);

      DP("After loading global for %s KernDesc \n", KernDescName);
      DP("KernDesc: Version: %d\n", KernDescVal.Version);
      DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
      DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size);
      DP("KernDesc: Mode: %d\n", KernDescVal.Mode);

      // Get ExecMode
      ExecModeVal = KernDescVal.Mode;
      DP("ExecModeVal %d\n", ExecModeVal);
      if (KernDescVal.WG_Size == 0) {
        KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size;
        DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size);
      }
      WGSizeVal = KernDescVal.WG_Size;
      DP("WGSizeVal %d\n", WGSizeVal);
      check("Loading KernDesc computation property", err);
    } else {
      DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName);

      // Generic
      std::string ExecModeNameStr(e->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      void *ExecModePtr;
      uint32_t varsize;
      err = interop_get_symbol_info((char *)image->ImageStart, img_size,
                                    ExecModeName, &ExecModePtr, &varsize);

      if (err == ATMI_STATUS_SUCCESS) {
        if ((size_t)varsize != sizeof(int8_t)) {
          DP("Loading global computation properties '%s' - size mismatch "
             "(%u != %lu)\n",
             ExecModeName, varsize, sizeof(int8_t));
          return NULL;
        }

        memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize);

        DP("After loading global for %s ExecMode = %d\n", ExecModeName,
           ExecModeVal);

        if (ExecModeVal < 0 || ExecModeVal > 1) {
          DP("Error wrong exec_mode value specified in HSA code object file: "
             "%d\n",
             ExecModeVal);
          return NULL;
        }
      } else {
        DP("Loading global exec_mode '%s' - symbol missing, using default "
           "value GENERIC (1)\n",
           ExecModeName);
      }
      check("Loading computation property", err);

      // Flat group size
      std::string WGSizeNameStr(e->name);
      WGSizeNameStr += "_wg_size";
      const char *WGSizeName = WGSizeNameStr.c_str();

      void *WGSizePtr;
      uint32_t WGSize;
      err = interop_get_symbol_info((char *)image->ImageStart, img_size,
                                    WGSizeName, &WGSizePtr, &WGSize);

      if (err == ATMI_STATUS_SUCCESS) {
        if ((size_t)WGSize != sizeof(int16_t)) {
          DP("Loading global computation properties '%s' - size mismatch "
             "(%u != %lu)\n",
             WGSizeName, WGSize, sizeof(int16_t));
          return NULL;
        }

        memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);

        DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal);

        if (WGSizeVal < RTLDeviceInfoTy::Default_WG_Size ||
            WGSizeVal > RTLDeviceInfoTy::Max_WG_Size) {
          DP("Error wrong WGSize value specified in HSA code object file: "
             "%d\n",
             WGSizeVal);
          WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;
        }
      } else {
        DP("Warning: Loading WGSize '%s' - symbol not found, "
           "using default value %d\n",
           WGSizeName, WGSizeVal);
      }

      check("Loading WGSize computation property", err);
    }

    KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id,
                                   CallStackAddr, e->name,
                                   kernarg_segment_size));
    __tgt_offload_entry entry = *e;
    entry.addr = (void *)&KernelsList.back();
    DeviceInfo.addOffloadEntry(device_id, entry);
    DP("Entry point %ld maps to %s\n", e - HostBegin, e->name);
  }

  return DeviceInfo.getOffloadEntriesTable(device_id);
}

void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *) {
  void *ptr = NULL;
  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
  atmi_status_t err = atmi_malloc(&ptr, size, get_gpu_mem_place(device_id));
  DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size,
     (long long unsigned)(Elf64_Addr)ptr);
  ptr = (err == ATMI_STATUS_SUCCESS) ? ptr : NULL;
  return ptr;
}

int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
  __tgt_async_info async_info;
  int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

int32_t __tgt_rtl_data_submit_async(int device_id, void *tgt_ptr, void *hst_ptr,
                                    int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
  if (async_info_ptr) {
    initAsyncInfoPtr(async_info_ptr);
    return dataSubmit(device_id, tgt_ptr, hst_ptr, size, async_info_ptr);
  } else {
    return __tgt_rtl_data_submit(device_id, tgt_ptr, hst_ptr, size);
  }
}

int32_t __tgt_rtl_data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
  __tgt_async_info async_info;
  int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

int32_t __tgt_rtl_data_retrieve_async(int device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(async_info_ptr && "async_info is nullptr");
  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
  initAsyncInfoPtr(async_info_ptr);
  return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, async_info_ptr);
}

int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) {
  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
  atmi_status_t err;
  DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)tgt_ptr);
  err = atmi_free(tgt_ptr);
  if (err != ATMI_STATUS_SUCCESS) {
    DP("Error when freeing device memory\n");
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

// Determine launch values for threadsPerGroup and num_groups.
// Outputs: threadsPerGroup, num_groups
// Inputs: Max_Teams, Max_WG_Size, Warp_Size, ExecutionMode,
//         EnvTeamLimit, EnvNumTeams, num_teams, thread_limit,
//         loop_tripcount.
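//
// Worked example (assuming Default_WG_Size is 256, as the AMDGPU grid values
// define): in SPMD mode with no thread_limit clause, threadsPerGroup stays at
// 256; with loop_tripcount = 10000 and no num_teams clause, the tripcount
// branch below yields num_groups = ((10000 - 1) / 256) + 1 = 40, subject to
// the Max_Teams and environment caps applied earlier.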
void getLaunchVals(int &threadsPerGroup, int &num_groups, int ConstWGSize,
                   int ExecutionMode, int EnvTeamLimit, int EnvNumTeams,
                   int num_teams, int thread_limit, uint64_t loop_tripcount,
                   int32_t device_id) {

  int Max_Teams = DeviceInfo.EnvMaxTeamsDefault > 0
                      ? DeviceInfo.EnvMaxTeamsDefault
                      : DeviceInfo.NumTeams[device_id];
  if (Max_Teams > DeviceInfo.HardTeamLimit)
    Max_Teams = DeviceInfo.HardTeamLimit;

  if (print_kernel_trace == 4) {
    fprintf(stderr, "RTLDeviceInfoTy::Max_Teams: %d\n",
            RTLDeviceInfoTy::Max_Teams);
    fprintf(stderr, "Max_Teams: %d\n", Max_Teams);
    fprintf(stderr, "RTLDeviceInfoTy::Warp_Size: %d\n",
            RTLDeviceInfoTy::Warp_Size);
    fprintf(stderr, "RTLDeviceInfoTy::Max_WG_Size: %d\n",
            RTLDeviceInfoTy::Max_WG_Size);
    fprintf(stderr, "RTLDeviceInfoTy::Default_WG_Size: %d\n",
            RTLDeviceInfoTy::Default_WG_Size);
    fprintf(stderr, "thread_limit: %d\n", thread_limit);
    fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
    fprintf(stderr, "ConstWGSize: %d\n", ConstWGSize);
  }
  // check for thread_limit() clause
  if (thread_limit > 0) {
    threadsPerGroup = thread_limit;
    DP("Setting threads per block to requested %d\n", thread_limit);
    if (ExecutionMode == GENERIC) { // Add master warp for GENERIC
      threadsPerGroup += RTLDeviceInfoTy::Warp_Size;
      DP("Adding master wavefront: +%d threads\n", RTLDeviceInfoTy::Warp_Size);
    }
    if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max
      threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size;
      DP("Setting threads per block to maximum %d\n", threadsPerGroup);
    }
  }
  // check flat_max_work_group_size attr here
  if (threadsPerGroup > ConstWGSize) {
    threadsPerGroup = ConstWGSize;
    DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
       threadsPerGroup);
  }
  if (print_kernel_trace == 4)
    fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
  DP("Preparing %d threads\n", threadsPerGroup);

  // Set default num_groups (teams)
  if (DeviceInfo.EnvTeamLimit > 0)
    num_groups = (Max_Teams < DeviceInfo.EnvTeamLimit)
                     ? Max_Teams
                     : DeviceInfo.EnvTeamLimit;
  else
    num_groups = Max_Teams;
  DP("Set default num of groups %d\n", num_groups);

  if (print_kernel_trace == 4) {
    fprintf(stderr, "num_groups: %d\n", num_groups);
    fprintf(stderr, "num_teams: %d\n", num_teams);
  }

  // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size.
  // This reduction is typical for the default case (no thread_limit clause),
  // or when the user requests an excessive value via the num_teams clause.
  // FIXME: We can't distinguish between a constant and a variable thread
  // limit, so we only handle constant thread_limits.
  if (threadsPerGroup >
      RTLDeviceInfoTy::Default_WG_Size) // 256 < threadsPerGroup <= 1024
    // Should we round threadsPerGroup up to nearest RTLDeviceInfoTy::Warp_Size
    // here?
    num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup;

  // check for num_teams() clause
  if (num_teams > 0) {
    num_groups = (num_teams < num_groups) ? num_teams : num_groups;
  }
  if (print_kernel_trace == 4) {
    fprintf(stderr, "num_groups: %d\n", num_groups);
    fprintf(stderr, "DeviceInfo.EnvNumTeams %d\n", DeviceInfo.EnvNumTeams);
    fprintf(stderr, "DeviceInfo.EnvTeamLimit %d\n", DeviceInfo.EnvTeamLimit);
  }

  if (DeviceInfo.EnvNumTeams > 0) {
    num_groups = (DeviceInfo.EnvNumTeams < num_groups) ? DeviceInfo.EnvNumTeams
                                                       : num_groups;
    DP("Modifying teams based on EnvNumTeams %d\n", DeviceInfo.EnvNumTeams);
  } else if (DeviceInfo.EnvTeamLimit > 0) {
    num_groups = (DeviceInfo.EnvTeamLimit < num_groups)
                     ? DeviceInfo.EnvTeamLimit
                     : num_groups;
    DP("Modifying teams based on EnvTeamLimit %d\n", DeviceInfo.EnvTeamLimit);
  } else {
    if (num_teams <= 0) {
      if (loop_tripcount > 0) {
        if (ExecutionMode == SPMD) {
          // round up to the nearest integer
          num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1;
        } else {
          num_groups = loop_tripcount;
        }
        DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
           "threads per block %d\n",
           num_groups, loop_tripcount, threadsPerGroup);
      }
    } else {
      num_groups = num_teams;
    }
    if (num_groups > Max_Teams) {
      num_groups = Max_Teams;
      if (print_kernel_trace == 4)
        fprintf(stderr, "Limiting num_groups %d to Max_Teams %d \n", num_groups,
                Max_Teams);
    }
    if (num_groups > num_teams && num_teams > 0) {
      num_groups = num_teams;
      if (print_kernel_trace == 4)
        fprintf(stderr, "Limiting num_groups %d to clause num_teams %d \n",
                num_groups, num_teams);
    }
  }

  // A num_teams clause is always honored, subject only to the
  // EnvMaxTeamsDefault cap below.
  if (num_teams > 0) {
    num_groups = num_teams;
    // Cap num_groups to EnvMaxTeamsDefault if set.
    if (DeviceInfo.EnvMaxTeamsDefault > 0 &&
        num_groups > DeviceInfo.EnvMaxTeamsDefault)
      num_groups = DeviceInfo.EnvMaxTeamsDefault;
  }
  if (print_kernel_trace == 4) {
    fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
    fprintf(stderr, "num_groups: %d\n", num_groups);
    fprintf(stderr, "loop_tripcount: %" PRIu64 "\n", loop_tripcount);
  }
  DP("Final %d num_groups and %d threadsPerGroup\n", num_groups,
     threadsPerGroup);
}

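// Worked example for getLaunchVals (illustrative numbers): with no
// thread_limit or num_teams clause and no environment overrides,
// threadsPerGroup keeps the caller's default of
// RTLDeviceInfoTy::Default_WG_Size (256). For an SPMD kernel with
// loop_tripcount == 10000, the round-up division above gives
//   num_groups = ((10000 - 1) / 256) + 1 = 40
// which is then clamped against Max_Teams and any environment limits.
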
static uint64_t acquire_available_packet_id(hsa_queue_t *queue) {
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
  bool full = true;
  while (full) {
    // Spin until the ring buffer has a slot for this packet, i.e. until the
    // read index has advanced to within queue->size packets of packet_id.
    full =
        packet_id >= (queue->size + hsa_queue_load_read_index_scacquire(queue));
  }
  return packet_id;
}

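// Worked example for acquire_available_packet_id (illustrative numbers): with
// queue->size == 1024 and a read index of 4096, write indices 4096..5119 have
// free ring slots, so a packet_id of 5200 spins until the packet processor
// consumes 81 more packets. The caller maps the returned id to a slot with
// packet_id & (queue->size - 1).
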
extern bool g_atmi_hostcall_required; // declared without header by atmi

static int32_t __tgt_rtl_run_target_team_region_locked(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
    int32_t thread_limit, uint64_t loop_tripcount);

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
                                         void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t num_teams,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {

  DeviceInfo.load_run_lock.lock_shared();
  int32_t res = __tgt_rtl_run_target_team_region_locked(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, num_teams,
      thread_limit, loop_tripcount);

  DeviceInfo.load_run_lock.unlock_shared();
  return res;
}

int32_t __tgt_rtl_run_target_team_region_locked(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
    int32_t thread_limit, uint64_t loop_tripcount) {
  // Set the context we are using.
  // Update the thread limit in gpu memory if uninitialized or specified
  // from the host.

  DP("Run target team region thread_limit %d\n", thread_limit);

  // All args are references.
  std::vector<void *> args(arg_num);
  std::vector<void *> ptrs(arg_num);

  DP("Arg_num: %d\n", arg_num);
  for (int32_t i = 0; i < arg_num; ++i) {
    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
    args[i] = &ptrs[i];
    DP("Offset base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i]));
  }

  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;

  /*
   * Set limit based on ThreadsPerGroup and GroupsPerDevice
   */
  int num_groups = 0;

  int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size;

  getLaunchVals(threadsPerGroup, num_groups, KernelInfo->ConstWGSize,
                KernelInfo->ExecutionMode, DeviceInfo.EnvTeamLimit,
                DeviceInfo.EnvNumTeams,
                num_teams,      // From run_region arg
                thread_limit,   // From run_region arg
                loop_tripcount, // From run_region arg
                KernelInfo->device_id);

  if (print_kernel_trace >= 1)
    // enum modes are SPMD, GENERIC, NONE 0,1,2
    fprintf(stderr,
            "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
            "reqd:(%4dX%4d) n:%s\n",
            device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
            arg_num, num_groups, threadsPerGroup, num_teams, thread_limit,
            KernelInfo->Name);

  // Run on the device.
  {
    hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id];
    uint64_t packet_id = acquire_available_packet_id(queue);

    const uint32_t mask = queue->size - 1; // size is a power of 2
    hsa_kernel_dispatch_packet_t *packet =
        (hsa_kernel_dispatch_packet_t *)queue->base_address +
        (packet_id & mask);
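    // Since queue->size is a power of two, (packet_id & mask) is equivalent
    // to packet_id % queue->size: with size 1024, packet_id 1500 maps to
    // ring slot 1500 & 1023 == 476 (illustrative numbers).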

    // packet->header is written last
    packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
    packet->workgroup_size_x = threadsPerGroup;
    packet->workgroup_size_y = 1;
    packet->workgroup_size_z = 1;
    packet->reserved0 = 0;
    packet->grid_size_x = num_groups * threadsPerGroup;
    packet->grid_size_y = 1;
    packet->grid_size_z = 1;
    packet->private_segment_size = 0;
    packet->group_segment_size = 0;
    packet->kernel_object = 0;
    packet->kernarg_address = 0;     // use the block allocator
    packet->reserved2 = 0;           // atmi writes id_ here
    packet->completion_signal = {0}; // may want a pool of signals

    std::string kernel_name = std::string(KernelInfo->Name);
    {
      assert(KernelInfoTable[device_id].find(kernel_name) !=
             KernelInfoTable[device_id].end());
      auto it = KernelInfoTable[device_id][kernel_name];
      packet->kernel_object = it.kernel_object;
      packet->private_segment_size = it.private_segment_size;
      packet->group_segment_size = it.group_segment_size;
      assert(arg_num == (int)it.num_args);
    }

    KernelArgPool *ArgPool = nullptr;
    {
      auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name));
      if (it != KernelArgPoolMap.end()) {
        ArgPool = (it->second).get();
      }
    }
    if (!ArgPool) {
      fprintf(stderr, "Warning: No ArgPool for %s on device %d\n",
              KernelInfo->Name, device_id);
    }
    {
      void *kernarg = nullptr;
      if (ArgPool) {
        assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *)));
        kernarg = ArgPool->allocate(arg_num);
      }
      if (!kernarg) {
        printf("Allocate kernarg failed\n");
        exit(1);
      }

      // Copy explicit arguments
      for (int i = 0; i < arg_num; i++) {
        memcpy((char *)kernarg + sizeof(void *) * i, args[i], sizeof(void *));
      }

      // Initialize implicit arguments. ATMI seems to leave most fields
      // uninitialized
      atmi_implicit_args_t *impl_args =
          reinterpret_cast<atmi_implicit_args_t *>(
              static_cast<char *>(kernarg) + ArgPool->kernarg_segment_size);
      memset(impl_args, 0,
             sizeof(atmi_implicit_args_t)); // may not be necessary
      impl_args->offset_x = 0;
      impl_args->offset_y = 0;
      impl_args->offset_z = 0;

      // assign a hostcall buffer for the selected queue
      if (g_atmi_hostcall_required) {
        // hostrpc_assign_buffer is not thread safe, and this function is
        // under a multiple reader lock, not a writer lock.
        static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_lock(&hostcall_init_lock);
        impl_args->hostcall_ptr = hostrpc_assign_buffer(
            DeviceInfo.HSAAgents[device_id], queue, device_id);
        pthread_mutex_unlock(&hostcall_init_lock);
        if (!impl_args->hostcall_ptr) {
          DP("hostrpc_assign_buffer failed; the GPU would dereference a null "
             "pointer and fault\n");
          return OFFLOAD_FAIL;
        }
      }

      packet->kernarg_address = kernarg;
    }

    {
      hsa_signal_t s = DeviceInfo.FreeSignalPool.pop();
      if (s.handle == 0) {
        printf("Failed to get signal instance\n");
        exit(1);
      }
      packet->completion_signal = s;
      hsa_signal_store_relaxed(packet->completion_signal, 1);
    }

    // Publish the packet: write the header with release semantics so the
    // packet processor observes a fully initialized packet, then ring the
    // doorbell.
    core::packet_store_release(
        reinterpret_cast<uint32_t *>(packet),
        core::create_header(HSA_PACKET_TYPE_KERNEL_DISPATCH, 0,
                            ATMI_FENCE_SCOPE_SYSTEM, ATMI_FENCE_SCOPE_SYSTEM),
        packet->setup);

    hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);

    // Block until the kernel decrements the completion signal to zero.
    while (hsa_signal_wait_scacquire(packet->completion_signal,
                                     HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                                     HSA_WAIT_STATE_BLOCKED) != 0)
      ;

    assert(ArgPool);
    ArgPool->deallocate(packet->kernarg_address);
    DeviceInfo.FreeSignalPool.push(packet->completion_signal);
  }

  DP("Kernel completed\n");
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  // use one team and one thread
  // fix thread num
  int32_t team_num = 1;
  int32_t thread_limit = 0; // use default
  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
                                          tgt_offsets, arg_num, team_num,
                                          thread_limit, 0);
}

int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(async_info_ptr && "async_info is nullptr");
  initAsyncInfoPtr(async_info_ptr);

  // use one team and one thread
  // fix thread num
  int32_t team_num = 1;
  int32_t thread_limit = 0; // use default
  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
                                          tgt_offsets, arg_num, team_num,
                                          thread_limit, 0);
}

int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(async_info_ptr && "async_info is nullptr");

  // The CUDA plugin asserts that async_info_ptr->Queue is non-null, but this
  // invariant is not ensured by devices.cpp for amdgcn
  // assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
  if (async_info_ptr->Queue) {
    finiAsyncInfoPtr(async_info_ptr);
  }
  return OFFLOAD_SUCCESS;
}
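
// End-to-end, libomptarget drives the entry points in this file in roughly
// the following order. This is a minimal sketch, assuming the three-argument
// __tgt_rtl_data_alloc of this plugin interface and an `entry` address taken
// from the table returned by __tgt_rtl_load_binary; dev/hst/size are
// illustrative:
//
//   void *tgt = __tgt_rtl_data_alloc(dev, size, /*hst_ptr=*/nullptr);
//   __tgt_rtl_data_submit(dev, tgt, hst, size);
//   void *args[] = {tgt};
//   ptrdiff_t offsets[] = {0};
//   __tgt_rtl_run_target_region(dev, entry, args, offsets, /*arg_num=*/1);
//   __tgt_rtl_data_retrieve(dev, hst, tgt, size);
//   __tgt_rtl_data_delete(dev, tgt);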