//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

#include "Debug.h"
#include "DeviceEnvironment.h"
#include "omptargetplugin.h"

#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

#include "MemoryManager.h"

#include "llvm/Frontend/OpenMP/OMPConstants.h"

// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        REPORT("CUDA error is: %s\n", errStr);                                 \
      else {                                                                   \
        REPORT("Unresolved CUDA error code: %d\n", err);                       \
        REPORT("Unsuccessful cuGetErrorString return status: %d\n",           \
               errStr_status);                                                 \
      }                                                                        \
    } else {                                                                   \
      const char *errStr = nullptr;                                           \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                \
      if (errStr_status == CUDA_SUCCESS)                                      \
        REPORT("%s\n", errStr);                                               \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr = nullptr;                                              \
    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
    if (errStr_status == CUDA_SUCCESS)                                         \
      REPORT("%s\n", errStr);                                                  \
  } while (false)
#endif // OMPTARGET_DEBUG
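//
// A minimal usage sketch (illustrative only; cuCtxSynchronize stands in for
// any CUDA Driver API call):
//   CUresult Err = cuCtxSynchronize();
//   if (Err != CUDA_SUCCESS)
//     CUDA_ERR_STRING(Err);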

#define BOOL2TEXT(b) ((b) ? "Yes" : "No")

#include "elf_common.h"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
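  // Note (editorial): Table's EntriesBegin/EntriesEnd are set to point into
  // the Entries vector by getOffloadEntriesTable(), so they must be refreshed
  // whenever Entries may have reallocated.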
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // Execution mode of the kernel.
  llvm::omp::OMPTgtExecModeFlags ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

namespace {
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  REPORT("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int createEvent(void **P) {
  CUevent Event = nullptr;

  CUresult Err = cuEventCreate(&Event, CU_EVENT_DEFAULT);
  if (Err != CUDA_SUCCESS) {
    DP("Error when creating event. event = " DPxMOD "\n", DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  *P = Event;

  return OFFLOAD_SUCCESS;
}

int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
  CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
  CUevent Event = reinterpret_cast<CUevent>(EventPtr);

  CUresult Err = cuEventRecord(Event, Stream);
  if (Err != CUDA_SUCCESS) {
    DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
       DPxPTR(Stream), DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int syncEvent(void *EventPtr) {
  CUevent Event = reinterpret_cast<CUevent>(EventPtr);

  CUresult Err = cuEventSynchronize(Event);
  if (Err != CUDA_SUCCESS) {
    DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int destroyEvent(void *EventPtr) {
  CUevent Event = reinterpret_cast<CUevent>(EventPtr);

  CUresult Err = cuEventDestroy(Event);
  if (Err != CUDA_SUCCESS) {
    DP("Error when destroying event = " DPxMOD "\n", DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

// Structure that contains per-device data.
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  std::list<FuncOrGblEntryTy> FuncGblEntries;

  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

class StreamManagerTy {
  int NumberOfDevices;
  // The initial size of stream pool
  int EnvNumInitialStreams;
  // Per-device stream mutex
  std::vector<std::unique_ptr<std::mutex>> StreamMtx;
  // Per-device stream Id indicates the next available stream in the pool
  std::vector<int> NextStreamId;
  // Per-device stream pool
  std::vector<std::vector<CUstream>> StreamPool;
  // Reference to per-device data
  std::vector<DeviceDataTy> &DeviceData;

  // If there is no CUstream left in the pool, resize the pool to allocate more
  // CUstreams. This function must be called with the device mutex held, and it
  // never shrinks the pool.
  void resizeStreamPool(const int DeviceId, const size_t NewSize) {
    std::vector<CUstream> &Pool = StreamPool[DeviceId];
    const size_t CurrentSize = Pool.size();
    assert(NewSize > CurrentSize && "new size is not larger than current size");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // Return early if we cannot switch to the right context, so that we do
      // not create a batch of streams that belong to the wrong device. The
      // offloading will fail later because the selected CUstream is nullptr.
      return;
    }

    Pool.resize(NewSize, nullptr);

    for (size_t I = CurrentSize; I < NewSize; ++I) {
      checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
                  "Error returned from cuStreamCreate\n");
    }
  }

public:
  StreamManagerTy(const int NumberOfDevices,
                  std::vector<DeviceDataTy> &DeviceData)
      : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
        DeviceData(DeviceData) {
    StreamPool.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    StreamMtx.resize(NumberOfDevices);

    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
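      // Note: std::stoi throws on non-numeric input, so a malformed value of
      // this environment variable terminates the program rather than falling
      // back to the default of 32.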
      EnvNumInitialStreams = std::stoi(EnvStr);

    // Initialize the next stream id
    std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

    // Initialize stream mutex
    for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
      Ptr = std::make_unique<std::mutex>();
  }

  ~StreamManagerTy() {
    // Destroy streams
    for (int I = 0; I < NumberOfDevices; ++I) {
      checkResult(cuCtxSetCurrent(DeviceData[I].Context),
                  "Error returned from cuCtxSetCurrent\n");

      for (CUstream &S : StreamPool[I]) {
        if (S)
          checkResult(cuStreamDestroy(S),
                      "Error returned from cuStreamDestroy\n");
      }
    }
  }

  // Get a CUstream from the pool. The per-device next stream id always points
  // to the next available CUstream. That means CUstreams [0, id-1] have been
  // assigned, and [id, ...] are still available. If there is no CUstream left,
  // we request more CUstreams from the CUDA RT. Each time a CUstream is
  // assigned, the id increases by one.
  // xxxxxs+++++++++
  //      ^
  //      id
  // After assignment, the pool becomes the following, and s is assigned.
  // xxxxxs+++++++++
  //       ^
  //       id
  CUstream getStream(const int DeviceId) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // No CUstream left in the pool, we need to request from CUDA RT
    if (Id == static_cast<int>(StreamPool[DeviceId].size())) {
      // By default we double the stream pool every time
      resizeStreamPool(DeviceId, Id * 2);
    }
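    // Note (editorial): doubling assumes the pool is non-empty here, which
    // holds as long as initializeDeviceStreamPool() seeded it with at least
    // one stream; otherwise Id * 2 would be 0 and the resize assert would
    // fire.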
    return StreamPool[DeviceId][Id++];
  }

  // Return a CUstream back to the pool. As mentioned above, the per-device
  // next stream id always points to the next available CUstream, so when we
  // return a CUstream, we first decrease the id and then copy the CUstream
  // back.
  // It is worth noting that the order in which streams are returned might
  // differ from the order in which they were assigned; that is, at some point
  // there might be two identical CUstreams:
  // xxax+a+++++
  //     ^
  //     id
  // However, it doesn't matter, because they're always on the two sides of
  // id. The one on the left will eventually be overwritten by another
  // CUstream. Therefore, after several executions, the order of the pool
  // might differ from its initial state.
  void returnStream(const int DeviceId, CUstream Stream) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    assert(Id > 0 && "Wrong stream ID");
    StreamPool[DeviceId][--Id] = Stream;
  }

  bool initializeDeviceStreamPool(const int DeviceId) {
    assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");

    resizeStreamPool(DeviceId, EnvNumInitialStreams);

    // Check the size of stream pool
    if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams)
      return false;

    // Check whether each stream is valid
    for (CUstream &S : StreamPool[DeviceId])
      if (!S)
        return false;

    return true;
  }
};

class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  int EnvTeamThreadLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;
  // Amount of dynamic shared memory to use at launch.
  uint64_t DynamicMemorySize;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  std::unique_ptr<StreamManagerTy> StreamManager;
  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  /// A class responsible for interacting with the device native runtime
  /// library to allocate and free memory.
  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
    const int DeviceId;
    const std::vector<DeviceDataTy> &DeviceData;
    std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;

  public:
    CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
        : DeviceId(DeviceId), DeviceData(DeviceData) {}

    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
      if (Size == 0)
        return nullptr;

      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return nullptr;

      void *MemAlloc = nullptr;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
        CUdeviceptr DevicePtr;
        Err = cuMemAlloc(&DevicePtr, Size);
        MemAlloc = (void *)DevicePtr;
        if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
          return nullptr;
        break;
      case TARGET_ALLOC_HOST:
        void *HostPtr;
        Err = cuMemAllocHost(&HostPtr, Size);
        MemAlloc = HostPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
          return nullptr;
        HostPinnedAllocs[MemAlloc] = Kind;
        break;
      case TARGET_ALLOC_SHARED:
        CUdeviceptr SharedPtr;
        Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
        MemAlloc = (void *)SharedPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
          return nullptr;
        break;
      }

      return MemAlloc;
    }

    int free(void *TgtPtr) override {
      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return OFFLOAD_FAIL;

      // Host pinned memory must be freed differently.
      TargetAllocTy Kind =
          (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
              ? TARGET_ALLOC_DEFAULT
              : TARGET_ALLOC_HOST;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
      case TARGET_ALLOC_SHARED:
        Err = cuMemFree((CUdeviceptr)TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFree\n"))
          return OFFLOAD_FAIL;
        break;
      case TARGET_ALLOC_HOST:
        Err = cuMemFreeHost(TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
          return OFFLOAD_FAIL;
        break;
      }

      return OFFLOAD_SUCCESS;
    }
  };

  /// A vector of device allocators
  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;

  /// A vector of memory managers. Since the memory manager is non-copyable and
  /// non-removable, we wrap them into std::unique_ptr.
  std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;

  /// Whether to use the memory manager
  bool UseMemoryManager = true;

  // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return a pointer to the entry associated with the pointer
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
                                             const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return &Itr;

    return nullptr;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

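  /// Return the CUstream associated with \p AsyncInfo, lazily grabbing one
  /// from the stream pool on first use. (Editorial note: the AsyncInfo owns
  /// the queue until synchronize() returns it to the pool.)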
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    if (!AsyncInfo->Queue)
      AsyncInfo->Queue = StreamManager->getStream(DeviceId);

    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
  }

public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
        DynamicMemorySize(0) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (Err == CUDA_ERROR_INVALID_HANDLE) {
      // Can't call cuGetErrorString if dlsym failed
      DP("Failed to load CUDA shared library\n");
      return;
    }
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
      // OMP_TEAMS_THREAD_LIMIT has been set
      EnvTeamThreadLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
      // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
      DynamicMemorySize = std::stoi(EnvStr);
      DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
         DynamicMemorySize);
    }

    StreamManager =
        std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);

    for (int I = 0; I < NumberOfDevices; ++I)
      DeviceAllocators.emplace_back(I, DeviceData);

    // Get the size threshold from environment variable
    std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
    UseMemoryManager = Res.second;
    size_t MemoryManagerThreshold = Res.first;

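    // Note (editorial assumption): the memory manager caches allocations up
    // to the threshold so repeated small device allocations can be served
    // without calling into the CUDA runtime each time.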
    if (UseMemoryManager)
      for (int I = 0; I < NumberOfDevices; ++I)
        MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
            DeviceAllocators[I], MemoryManagerThreshold));
  }

  ~DeviceRTLTy() {
    // Release the memory managers first, in case their dependent data would
    // otherwise be destroyed before them.
    for (auto &M : MemoryManagers)
      M.release();

    StreamManager = nullptr;

    for (CUmodule &M : Modules)
      // Close module
      if (M)
        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context) {
        checkResult(cuCtxSetCurrent(D.Context),
                    "Error returned from cuCtxSetCurrent\n");
        CUdevice Device;
        checkResult(cuCtxGetDevice(&Device),
                    "Error returned from cuCtxGetDevice\n");
        checkResult(cuDevicePrimaryCtxRelease(Device),
                    "Error returned from cuDevicePrimaryCtxRelease\n");
      }
    }
  }

  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    // Query the current flags of the primary context and set its flags if
    // it is inactive
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
      return OFFLOAD_FAIL;

    if (FormerPrimaryCtxIsActive) {
      DP("The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      DP("The primary context is inactive, set its flags to "
         "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
        return OFFLOAD_FAIL;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected.
    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize stream pool
    if (!StreamManager->initializeDeviceStreamPool(DeviceId))
      return OFFLOAD_FAIL;

    // Query attributes to determine number of threads/block and blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    } else {
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n",
         MaxGridDimX, DeviceRTLTy::HardTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;

      if (EnvTeamThreadLimit > 0 &&
          DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
           "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
      }
      if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
           "capping at the hard limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
      }
    }

    // Get and set warp size
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assume default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the env variables
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    size_t StackLimit;
    size_t HeapLimit;
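    // Apply user-requested stack/heap limits, or read back the defaults so
    // they can be reported below. (Editorial note: cuCtxSetLimit acts on the
    // current context, which was made current above.)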
    if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
      StackLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
      HeapLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device supports up to %d CUDA blocks and %d threads with a "
         "warp size of %d\n",
         DeviceData[DeviceId].BlocksPerGrid,
         DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device heap size is %d Bytes, device stack size is %d Bytes per "
         "thread\n",
         (int)HeapLimit, (int)StackLimit);

    // Set default number of teams
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
      DP("Default number of teams set according to library's default %d\n",
         DeviceRTLTy::DefaultNumTeams);
    }

    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Default number of teams exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
    }

    // Set default number of threads
    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
    DP("Default number of threads set according to library's default %d\n",
       DeviceRTLTy::DefaultNumThreads);
    if (DeviceData[DeviceId].NumThreads >
        DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Default number of threads exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
    }

    return OFFLOAD_SUCCESS;
  }

  __tgt_target_table *loadBinary(const int DeviceId,
                                 const __tgt_device_image *Image) {
    // Set the context we are using
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return nullptr;

    // Clear the offload table as we are going to create a new one.
    clearOffloadEntriesTable(DeviceId);

    // Create the module and extract the function pointers.
    CUmodule Module;
    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
    Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
      return nullptr;

    DP("CUDA module successfully loaded!\n");

    Modules.push_back(Module);

    // Find the symbols in the module by name.
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // We return nullptr when something like this happens; the host should
        // always have something in the address to uniquely identify the
        // target region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name
        if (Err != CUDA_SUCCESS) {
          REPORT("Loading global '%s' Failed\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variable types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variable types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name
      if (Err != CUDA_SUCCESS) {
        REPORT("Loading '%s' Failed\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

      // Default value GENERIC (in case the symbol is missing from the cubin
      // file). Initialize explicitly so the fallback path below does not read
      // an indeterminate value.
      llvm::omp::OMPTgtExecModeFlags ExecModeVal =
          llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from device to host. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
                 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }
      } else {
        DP("Loading global exec_mode '%s' - symbol missing, using default "
           "value GENERIC (1)\n",
           ExecModeName);
      }

      KernelsList.emplace_back(Func, ExecModeVal);

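      // Replace the host entry address with a pointer to the per-device
      // KernelTy record; runTargetTeamRegion() later casts this address back
      // to KernelTy* to retrieve the CUfunction and execution mode.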
      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }

    // Send device environment data to the device
    {
      // TODO: The device ID used here is not the real device ID used by
      // OpenMP.
      DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
                                    static_cast<uint32_t>(DeviceId),
                                    static_cast<uint32_t>(DynamicMemorySize)};

#ifdef OMPTARGET_DEBUG
      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
        DeviceEnv.DebugKind = std::stoi(EnvStr);
#endif

      const char *DeviceEnvName = "omptarget_device_environment";
      CUdeviceptr DeviceEnvPtr;
      size_t CUSize;

      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
          return nullptr;
        }

        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from host to device. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continue, considering this is a device RTL which does not accept "
           "environment setting.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }

  void *dataAlloc(const int DeviceId, const int64_t Size,
                  const TargetAllocTy Kind) {
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
      if (UseMemoryManager)
        return MemoryManagers[DeviceId]->allocate(Size, nullptr);
      else
        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    case TARGET_ALLOC_HOST:
    case TARGET_ALLOC_SHARED:
      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    }

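    // Unreachable for the allocation kinds handled above; kept as a guard in
    // case new TargetAllocTy kinds are introduced.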
    REPORT("Invalid target data allocation kind or requested allocator not "
           "implemented yet\n");

    return nullptr;
  }

  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from host to device. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from device to host. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }
dataExchange(int SrcDevId,const void * SrcPtr,int DstDevId,void * DstPtr,int64_t Size,__tgt_async_info * AsyncInfo) const999 int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
1000 int64_t Size, __tgt_async_info *AsyncInfo) const {
1001 assert(AsyncInfo && "AsyncInfo is nullptr");
1002
1003 CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
1004 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
1005 return OFFLOAD_FAIL;
1006
1007 CUstream Stream = getStream(SrcDevId, AsyncInfo);
1008
1009 // If they are two devices, we try peer to peer copy first
1010 if (SrcDevId != DstDevId) {
1011 int CanAccessPeer = 0;
1012 Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
1013 if (Err != CUDA_SUCCESS) {
1014 REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
1015 ", dst = %" PRId32 "\n",
1016 SrcDevId, DstDevId);
1017 CUDA_ERR_STRING(Err);
1018 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1019 }
1020
1021 if (!CanAccessPeer) {
1022 DP("P2P memcpy not supported so fall back to D2D memcpy");
1023 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1024 }
1025
1026 Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
1027 if (Err != CUDA_SUCCESS) {
1028 REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
1029 ", dst = %" PRId32 "\n",
1030 SrcDevId, DstDevId);
1031 CUDA_ERR_STRING(Err);
1032 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1033 }
1034
1035 Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
1036 (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
1037 Size, Stream);
1038 if (Err == CUDA_SUCCESS)
1039 return OFFLOAD_SUCCESS;
1040
1041 DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
1042 ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
1043 DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
1044 CUDA_ERR_STRING(Err);
1045 }
1046
1047 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1048 }
1049
dataDelete(const int DeviceId,void * TgtPtr)1050 int dataDelete(const int DeviceId, void *TgtPtr) {
1051 if (UseMemoryManager)
1052 return MemoryManagers[DeviceId]->free(TgtPtr);
1053
1054 return DeviceAllocators[DeviceId].free(TgtPtr);
1055 }

  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are references.
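    // cuLaunchKernel expects an array of pointers to the kernel parameter
    // values, so Args[I] holds the address of the (offset-adjusted) pointer
    // stored in Ptrs[I] rather than the pointer itself.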
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    const bool IsSPMDGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
    const bool IsSPMDMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
    const bool IsGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;

    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (IsGenericMode) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (IsSPMDGenericMode) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to
          // the `teams distribute` region and would create far too few blocks
          // using the regular SPMD-mode method.
          CudaBlocksPerGrid = LoopTripCount;
        } else if (IsSPMDMode) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch as many teams as needed so that
          // each thread executes one iteration of the loop. Round up to the
          // nearest integer.
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else if (IsGenericMode) {
          // If we reach this point, then we have a non-combined construct,
          // i.e. `teams distribute` with a nested `parallel for`, and each
          // team is assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the
          // `parallel` loop.
          CudaBlocksPerGrid = LoopTripCount;
        } else {
          REPORT("Unknown execution mode: %d\n",
                 static_cast<int8_t>(KernelInfo->ExecutionMode));
          return OFFLOAD_FAIL;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Launching kernel %s with %d blocks and %d threads in %s mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
         CudaBlocksPerGrid, CudaThreadsPerBlock,
         (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));

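    // Launch over the x-dimension only; grid/block y and z are fixed to 1.
    // DynamicMemorySize (from LIBOMPTARGET_SHARED_MEMORY_SIZE) is passed as
    // the dynamic shared-memory byte count for the kernel.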
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         DynamicMemorySize, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }

  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
    CUresult Err = cuStreamSynchronize(Stream);

    // Once the stream is synchronized, return it to the stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
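    // (Editorial note: the stream is returned to the pool even when
    // synchronization failed; this is assumed safe because a failure here
    // also fails the enclosing offload operation.)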
    StreamManager->returnStream(DeviceId,
                                reinterpret_cast<CUstream>(AsyncInfo->Queue));
    AsyncInfo->Queue = nullptr;

    if (Err != CUDA_SUCCESS) {
      DP("Error when synchronizing stream. stream = " DPxMOD
         ", async info ptr = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(AsyncInfo));
      CUDA_ERR_STRING(Err);
    }
    return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
  }

  void printDeviceInfo(int32_t device_id) {
    char TmpChar[1000];
    std::string TmpStr;
    size_t TmpSt;
    int TmpInt, TmpInt2, TmpInt3;

    CUdevice Device;
    checkResult(cuDeviceGet(&Device, device_id),
                "Error returned from cuDeviceGet\n");

    cuDriverGetVersion(&TmpInt);
    printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
    printf("    CUDA Device Number: \t\t%d \n", device_id);
    checkResult(cuDeviceGetName(TmpChar, 1000, Device),
                "Error returned from cuDeviceGetName\n");
    printf("    Device Name: \t\t\t%s \n", TmpChar);
    checkResult(cuDeviceTotalMem(&TmpSt, Device),
                "Error returned from cuDeviceTotalMem\n");
    printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Registers per Block: \t\t%d \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    if (TmpInt == CU_COMPUTEMODE_DEFAULT)
      TmpStr = "DEFAULT";
    else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
      TmpStr = "PROHIBITED";
    else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
      TmpStr = "EXCLUSIVE PROCESS";
    else
      TmpStr = "unknown";
    printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
    checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
                                     Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                    Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Compute Capabilities: \t\t%d.%d \n", TmpInt, TmpInt2);
  }

  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
                void *EventPtr) const {
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from a
    // specific CUDA version onwards, and it is defined as 0x0. In earlier
    // versions, per the CUDA API documentation, that argument has to be 0x0.
    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
    if (Err != CUDA_SUCCESS) {
      DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(Event));
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }
};

DeviceRTLTy DeviceRTL;
} // namespace

// Exposed library API function
#ifdef __cplusplus
extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
      DeviceRTL.isValidDeviceId(dst_dev_id))
    return 1;

  return 0;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
                           int32_t kind) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
}

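// The synchronous data-transfer entry points below are thin wrappers: each
// issues the corresponding _async operation on a fresh __tgt_async_info and
// then blocks in __tgt_rtl_synchronize.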
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
                                                 size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}

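/// Synchronous device-to-host copy, implemented like the submit case above.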
int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
                                                   size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}

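/// Asynchronous device-to-device copy; both device IDs must be valid.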
int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int32_t dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(AsyncInfo && "AsyncInfo is nullptr");

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                AsyncInfo);
}

int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_exchange_async(
      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
}

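/// Free device memory previously returned by __tgt_rtl_data_alloc.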
int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}

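/// Synchronous kernel launch with explicit team count, thread limit, and loop
/// trip count: enqueue the asynchronous variant and wait for completion.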
int32_t __tgt_rtl_run_target_team_region(int32_t device_id,
                                         void *tgt_entry_ptr, void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}

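/// Synchronous kernel launch without explicit launch bounds; wraps the
/// asynchronous variant below.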
int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

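/// Asynchronous launch without explicit bounds: delegate to the team-region
/// variant with one team, a thread limit of 1, and no loop trip count.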
int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team_num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}

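/// Block until all operations enqueued on the queue held by
/// \p async_info_ptr have completed.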
int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}

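/// Atomically update the runtime's information verbosity level.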
void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

void __tgt_rtl_print_device_info(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  DeviceRTL.printDeviceInfo(device_id);
}

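/// Event API: an event is created once, recorded on a stream, and can then be
/// waited on from another stream (__tgt_rtl_wait_event) or from the host
/// (__tgt_rtl_sync_event) before being destroyed.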
int32_t __tgt_rtl_create_event(int32_t device_id, void **event) {
  assert(event && "event is nullptr");
  return createEvent(event);
}

int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr,
                               __tgt_async_info *async_info_ptr) {
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
  assert(event_ptr && "event_ptr is nullptr");

  return recordEvent(event_ptr, async_info_ptr);
}

int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr,
                             __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(event_ptr && "event is nullptr");

  return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr);
}

int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return syncEvent(event_ptr);
}

int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return destroyEvent(event_ptr);
}

#ifdef __cplusplus
}
#endif