1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <cassert>
14 #include <cstddef>
15 #include <cuda.h>
16 #include <list>
17 #include <memory>
18 #include <mutex>
19 #include <string>
20 #include <unordered_map>
21 #include <vector>
22 
23 #include "Debug.h"
24 #include "DeviceEnvironment.h"
25 #include "omptargetplugin.h"
26 
27 #define TARGET_NAME CUDA
28 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
29 
30 #include "MemoryManager.h"
31 
32 #include "llvm/Frontend/OpenMP/OMPConstants.h"
33 
34 // Utility for retrieving and printing CUDA error string.
35 #ifdef OMPTARGET_DEBUG
36 #define CUDA_ERR_STRING(err)                                                   \
37   do {                                                                         \
38     if (getDebugLevel() > 0) {                                                 \
39       const char *errStr = nullptr;                                            \
40       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
41       if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
42         REPORT("Unrecognized CUDA error code: %d\n", err);                     \
43       else if (errStr_status == CUDA_SUCCESS)                                  \
44         REPORT("CUDA error is: %s\n", errStr);                                 \
45       else {                                                                   \
46         REPORT("Unresolved CUDA error code: %d\n", err);                       \
47         REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
48                errStr_status);                                                 \
49       }                                                                        \
50     } else {                                                                   \
51       const char *errStr = nullptr;                                            \
52       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
53       if (errStr_status == CUDA_SUCCESS)                                       \
54         REPORT("%s \n", errStr);                                               \
55     }                                                                          \
56   } while (false)
57 #else // OMPTARGET_DEBUG
58 #define CUDA_ERR_STRING(err)                                                   \
59   do {                                                                         \
60     const char *errStr = nullptr;                                              \
61     CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
62     if (errStr_status == CUDA_SUCCESS)                                         \
63       REPORT("%s \n", errStr);                                                 \
64   } while (false)
65 #endif // OMPTARGET_DEBUG
66 
67 #define BOOL2TEXT(b) ((b) ? "Yes" : "No")
68 
69 #include "elf_common.h"
70 
71 /// Keep entries table per device.
72 struct FuncOrGblEntryTy {
73   __tgt_target_table Table;
74   std::vector<__tgt_offload_entry> Entries;
75 };
76 
77 /// Use a single entity to encode a kernel and a set of flags.
78 struct KernelTy {
79   CUfunction Func;
80 
81   /// Execution mode of the kernel.
82   llvm::omp::OMPTgtExecModeFlags ExecutionMode;
83 
84   /// Maximal number of threads per block for this kernel.
85   int MaxThreadsPerBlock = 0;
86 
87   KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode)
88       : Func(_Func), ExecutionMode(_ExecutionMode) {}
89 };
90 
91 namespace {
92 bool checkResult(CUresult Err, const char *ErrMsg) {
93   if (Err == CUDA_SUCCESS)
94     return true;
95 
96   REPORT("%s", ErrMsg);
97   CUDA_ERR_STRING(Err);
98   return false;
99 }
100 
101 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
102                CUstream Stream) {
103   CUresult Err =
104       cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
105 
106   if (Err != CUDA_SUCCESS) {
107     DP("Error when copying data from device to device. Pointers: src "
108        "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
109        DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
110     CUDA_ERR_STRING(Err);
111     return OFFLOAD_FAIL;
112   }
113 
114   return OFFLOAD_SUCCESS;
115 }
116 
117 int createEvent(void **P) {
118   CUevent Event = nullptr;
119 
120   CUresult Err = cuEventCreate(&Event, CU_EVENT_DEFAULT);
121   if (Err != CUDA_SUCCESS) {
122     DP("Error when creating event. Event = " DPxMOD "\n", DPxPTR(Event));
123     CUDA_ERR_STRING(Err);
124     return OFFLOAD_FAIL;
125   }
126 
127   *P = Event;
128 
129   return OFFLOAD_SUCCESS;
130 }
131 
132 int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
133   CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
134   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
135 
136   CUresult Err = cuEventRecord(Event, Stream);
137   if (Err != CUDA_SUCCESS) {
138     DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
139        DPxPTR(Stream), DPxPTR(Event));
140     CUDA_ERR_STRING(Err);
141     return OFFLOAD_FAIL;
142   }
143 
144   return OFFLOAD_SUCCESS;
145 }
146 
147 int syncEvent(void *EventPtr) {
148   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
149 
150   CUresult Err = cuEventSynchronize(Event);
151   if (Err != CUDA_SUCCESS) {
152     DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
153     CUDA_ERR_STRING(Err);
154     return OFFLOAD_FAIL;
155   }
156 
157   return OFFLOAD_SUCCESS;
158 }
159 
160 int destroyEvent(void *EventPtr) {
161   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
162 
163   CUresult Err = cuEventDestroy(Event);
164   if (Err != CUDA_SUCCESS) {
165     DP("Error when destroying event = " DPxMOD "\n", DPxPTR(Event));
166     CUDA_ERR_STRING(Err);
167     return OFFLOAD_FAIL;
168   }
169 
170   return OFFLOAD_SUCCESS;
171 }
172 
173 // Structure that contains per-device data.
174 struct DeviceDataTy {
175   /// List that contains all the kernels.
176   std::list<KernelTy> KernelsList;
177 
178   std::list<FuncOrGblEntryTy> FuncGblEntries;
179 
180   CUcontext Context = nullptr;
181   // Device properties
182   int ThreadsPerBlock = 0;
183   int BlocksPerGrid = 0;
184   int WarpSize = 0;
185   // OpenMP properties
186   int NumTeams = 0;
187   int NumThreads = 0;
188 };
189 
190 class StreamManagerTy {
191   int NumberOfDevices;
192   // The initial size of stream pool
193   int EnvNumInitialStreams;
194   // Per-device stream mutex
195   std::vector<std::unique_ptr<std::mutex>> StreamMtx;
196   // Per-device stream Id indicates the next available stream in the pool
197   std::vector<int> NextStreamId;
198   // Per-device stream pool
199   std::vector<std::vector<CUstream>> StreamPool;
200   // Reference to per-device data
201   std::vector<DeviceDataTy> &DeviceData;
202 
203   // If there is no CUstream left in the pool, we will resize the pool to
204   // allocate more CUstreams. This function should be called with the device
205   // mutex held, and we never shrink the pool.
206   void resizeStreamPool(const int DeviceId, const size_t NewSize) {
207     std::vector<CUstream> &Pool = StreamPool[DeviceId];
208     const size_t CurrentSize = Pool.size();
209     assert(NewSize > CurrentSize && "new size is not larger than current size");
210 
211     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
212     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
213       // Return if we cannot switch to the right context; otherwise we would
214       // create a bunch of streams that do not belong to the right device.
215       // The offloading will fail later because the selected CUstream is
216       // nullptr.
217       return;
218     }
219 
220     Pool.resize(NewSize, nullptr);
221 
222     for (size_t I = CurrentSize; I < NewSize; ++I) {
223       checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
224                   "Error returned from cuStreamCreate\n");
225     }
226   }
227 
228 public:
229   StreamManagerTy(const int NumberOfDevices,
230                   std::vector<DeviceDataTy> &DeviceData)
231       : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
232         DeviceData(DeviceData) {
233     StreamPool.resize(NumberOfDevices);
234     NextStreamId.resize(NumberOfDevices);
235     StreamMtx.resize(NumberOfDevices);
236 
237     if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
238       EnvNumInitialStreams = std::stoi(EnvStr);
239 
240     // Initialize the next stream id
241     std::fill(NextStreamId.begin(), NextStreamId.end(), 0);
242 
243     // Initialize stream mutex
244     for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
245       Ptr = std::make_unique<std::mutex>();
246   }
247 
248   ~StreamManagerTy() {
249     // Destroy streams
250     for (int I = 0; I < NumberOfDevices; ++I) {
251       checkResult(cuCtxSetCurrent(DeviceData[I].Context),
252                   "Error returned from cuCtxSetCurrent\n");
253 
254       for (CUstream &S : StreamPool[I]) {
255         if (S)
256           checkResult(cuStreamDestroy(S),
257                       "Error returned from cuStreamDestroy\n");
258       }
259     }
260   }
261 
262   // Get a CUstream from the pool. The per-device next stream id always points
263   // to the next available CUstream. That means CUstreams [0, id-1] have been
264   // assigned and [id, ...) are still available. If there is no CUstream left,
265   // we ask CUDA RT for more. Each time a CUstream is assigned, the id
266   // increases by one.
267   // xxxxxs+++++++++
268   //      ^
269   //      id
270   // After assignment, the pool becomes the following and s is assigned.
271   // xxxxxs+++++++++
272   //       ^
273   //       id
274   CUstream getStream(const int DeviceId) {
275     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
276     int &Id = NextStreamId[DeviceId];
277     // No CUstream left in the pool, we need to request from CUDA RT
278     if (Id == static_cast<int>(StreamPool[DeviceId].size())) {
279       // By default we double the stream pool every time
280       resizeStreamPool(DeviceId, Id * 2);
281     }
282     return StreamPool[DeviceId][Id++];
283   }
284 
285   // Return a CUstream back to the pool. As mentioned above, the per-device
286   // next stream id always points to the next available CUstream, so when we
287   // return a CUstream, we first decrease the id and then copy the CUstream
288   // back.
289   // It is worth noting that the order in which streams are returned might
290   // differ from the order in which they were assigned; that is, at some point
291   // there might be two identical CUstreams in the pool.
292   // xxax+a+++++
293   //     ^
294   //     id
295   // However, this does not matter because they are always on the two sides of
296   // the id. The left one will eventually be overwritten by another CUstream.
297   // Therefore, after several executions, the order of the pool might differ
298   // from its initial state.
299   void returnStream(const int DeviceId, CUstream Stream) {
300     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
301     int &Id = NextStreamId[DeviceId];
302     assert(Id > 0 && "Wrong stream ID");
303     StreamPool[DeviceId][--Id] = Stream;
304   }
305 
306   bool initializeDeviceStreamPool(const int DeviceId) {
307     assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");
308 
309     resizeStreamPool(DeviceId, EnvNumInitialStreams);
310 
311     // Check the size of stream pool
312     if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams)
313       return false;
314 
315     // Check whether each stream is valid
316     for (CUstream &S : StreamPool[DeviceId])
317       if (!S)
318         return false;
319 
320     return true;
321   }
322 };
323 
324 class DeviceRTLTy {
325   int NumberOfDevices;
326   // OpenMP environment properties
327   int EnvNumTeams;
328   int EnvTeamLimit;
329   int EnvTeamThreadLimit;
330   // OpenMP requires flags
331   int64_t RequiresFlags;
332   // Amount of dynamic shared memory to use at launch.
333   uint64_t DynamicMemorySize;
334 
335   static constexpr const int HardTeamLimit = 1U << 16U; // 64k
336   static constexpr const int HardThreadLimit = 1024;
337   static constexpr const int DefaultNumTeams = 128;
338   static constexpr const int DefaultNumThreads = 128;
339 
340   std::unique_ptr<StreamManagerTy> StreamManager;
341   std::vector<DeviceDataTy> DeviceData;
342   std::vector<CUmodule> Modules;
343 
344   /// A class responsible for interacting with the device's native runtime
345   /// library to allocate and free memory.
346   class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
347     const int DeviceId;
348     const std::vector<DeviceDataTy> &DeviceData;
349     std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;
350 
351   public:
352     CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
353         : DeviceId(DeviceId), DeviceData(DeviceData) {}
354 
355     void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
356       if (Size == 0)
357         return nullptr;
358 
359       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
360       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
361         return nullptr;
362 
363       void *MemAlloc = nullptr;
364       switch (Kind) {
365       case TARGET_ALLOC_DEFAULT:
366       case TARGET_ALLOC_DEVICE:
367         CUdeviceptr DevicePtr;
368         Err = cuMemAlloc(&DevicePtr, Size);
369         MemAlloc = (void *)DevicePtr;
370         if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
371           return nullptr;
372         break;
373       case TARGET_ALLOC_HOST:
374         void *HostPtr;
375         Err = cuMemAllocHost(&HostPtr, Size);
376         MemAlloc = HostPtr;
377         if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
378           return nullptr;
379         HostPinnedAllocs[MemAlloc] = Kind;
380         break;
381       case TARGET_ALLOC_SHARED:
382         CUdeviceptr SharedPtr;
383         Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
384         MemAlloc = (void *)SharedPtr;
385         if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
386           return nullptr;
387         break;
388       }
389 
390       return MemAlloc;
391     }
392 
393     int free(void *TgtPtr) override {
394       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
395       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
396         return OFFLOAD_FAIL;
397 
398       // Host pinned memory must be freed differently.
399       TargetAllocTy Kind =
400           (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
401               ? TARGET_ALLOC_DEFAULT
402               : TARGET_ALLOC_HOST;
403       switch (Kind) {
404       case TARGET_ALLOC_DEFAULT:
405       case TARGET_ALLOC_DEVICE:
406       case TARGET_ALLOC_SHARED:
407         Err = cuMemFree((CUdeviceptr)TgtPtr);
408         if (!checkResult(Err, "Error returned from cuMemFree\n"))
409           return OFFLOAD_FAIL;
410         break;
411       case TARGET_ALLOC_HOST:
412         Err = cuMemFreeHost(TgtPtr);
413         if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
414           return OFFLOAD_FAIL;
415         break;
416       }
417 
418       return OFFLOAD_SUCCESS;
419     }
420   };
421 
422   /// A vector of device allocators
423   std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
424 
425   /// A vector of memory managers. Since the memory manager is non-copyable and
426   /// non-removable, we wrap them into std::unique_ptr.
427   std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
428 
429   /// Whether to use the memory manager.
430   bool UseMemoryManager = true;
431 
432   // Record an entry point associated with the device.
433   void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
434     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
435     E.Entries.push_back(entry);
436   }
437 
438   // Return a pointer to the entry associated with the given address.
439   const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
440                                              const void *Addr) const {
441     for (const __tgt_offload_entry &Itr :
442          DeviceData[DeviceId].FuncGblEntries.back().Entries)
443       if (Itr.addr == Addr)
444         return &Itr;
445 
446     return nullptr;
447   }
448 
449   // Return the pointer to the target entries table
450   __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
451     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
452 
453     if (E.Entries.empty())
454       return nullptr;
455 
456     // Update table info according to the entries and return the pointer
457     E.Table.EntriesBegin = E.Entries.data();
458     E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
459 
460     return &E.Table;
461   }
462 
463   // Clear entries table for a device
464   void clearOffloadEntriesTable(const int DeviceId) {
465     DeviceData[DeviceId].FuncGblEntries.emplace_back();
466     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
467     E.Entries.clear();
468     E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
469   }
470 
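  // Return the CUstream cached in AsyncInfo->Queue, lazily acquiring one from
  // the stream manager on first use.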
471   CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
472     assert(AsyncInfo && "AsyncInfo is nullptr");
473 
474     if (!AsyncInfo->Queue)
475       AsyncInfo->Queue = StreamManager->getStream(DeviceId);
476 
477     return reinterpret_cast<CUstream>(AsyncInfo->Queue);
478   }
479 
480 public:
481   // This class should not be copied
482   DeviceRTLTy(const DeviceRTLTy &) = delete;
483   DeviceRTLTy(DeviceRTLTy &&) = delete;
484 
485   DeviceRTLTy()
486       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
487         EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
488         DynamicMemorySize(0) {
489 
490     DP("Start initializing CUDA\n");
491 
492     CUresult Err = cuInit(0);
493     if (Err == CUDA_ERROR_INVALID_HANDLE) {
494       // Can't call cuGetErrorString if dlsym failed
495       DP("Failed to load CUDA shared library\n");
496       return;
497     }
498     if (!checkResult(Err, "Error returned from cuInit\n")) {
499       return;
500     }
501 
502     Err = cuDeviceGetCount(&NumberOfDevices);
503     if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
504       return;
505 
506     if (NumberOfDevices == 0) {
507       DP("There are no devices supporting CUDA.\n");
508       return;
509     }
510 
511     DeviceData.resize(NumberOfDevices);
512 
513     // Get environment variables regarding teams
514     if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
515       // OMP_TEAM_LIMIT has been set
516       EnvTeamLimit = std::stoi(EnvStr);
517       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
518     }
519     if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
520       // OMP_TEAMS_THREAD_LIMIT has been set
521       EnvTeamThreadLimit = std::stoi(EnvStr);
522       DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
523     }
524     if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
525       // OMP_NUM_TEAMS has been set
526       EnvNumTeams = std::stoi(EnvStr);
527       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
528     }
529     if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
530       // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
531       DynamicMemorySize = std::stoi(EnvStr);
532       DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
533          DynamicMemorySize);
534     }
535 
536     StreamManager =
537         std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
538 
539     for (int I = 0; I < NumberOfDevices; ++I)
540       DeviceAllocators.emplace_back(I, DeviceData);
541 
542     // Get the size threshold from environment variable
543     std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
544     UseMemoryManager = Res.second;
545     size_t MemoryManagerThreshold = Res.first;
546 
547     if (UseMemoryManager)
548       for (int I = 0; I < NumberOfDevices; ++I)
549         MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
550             DeviceAllocators[I], MemoryManagerThreshold));
551   }
552 
553   ~DeviceRTLTy() {
554     // Destruct the memory managers first so that the data they depend on is
555     // not destroyed before them.
556     for (auto &M : MemoryManagers)
557       M.release();
558 
559     StreamManager = nullptr;
560 
561     for (CUmodule &M : Modules)
562       // Close module
563       if (M)
564         checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");
565 
566     for (DeviceDataTy &D : DeviceData) {
567       // Destroy context
568       if (D.Context) {
569         checkResult(cuCtxSetCurrent(D.Context),
570                     "Error returned from cuCtxSetCurrent\n");
571         CUdevice Device;
572         checkResult(cuCtxGetDevice(&Device),
573                     "Error returned from cuCtxGetDevice\n");
574         checkResult(cuDevicePrimaryCtxRelease(Device),
575                     "Error returned from cuDevicePrimaryCtxRelease\n");
576       }
577     }
578   }
579 
580   // Check whether a given DeviceId is valid
581   bool isValidDeviceId(const int DeviceId) const {
582     return DeviceId >= 0 && DeviceId < NumberOfDevices;
583   }
584 
585   int getNumOfDevices() const { return NumberOfDevices; }
586 
587   void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
588 
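  // Initialize a device: select and configure its primary context, create the
  // stream pool, and derive launch limits (blocks, threads, warp size) from
  // device attributes and environment variables.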
589   int initDevice(const int DeviceId) {
590     CUdevice Device;
591 
592     DP("Getting device %d\n", DeviceId);
593     CUresult Err = cuDeviceGet(&Device, DeviceId);
594     if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
595       return OFFLOAD_FAIL;
596 
597     // Query the current flags of the primary context and set its flags if
598     // it is inactive
599     unsigned int FormerPrimaryCtxFlags = 0;
600     int FormerPrimaryCtxIsActive = 0;
601     Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
602                                      &FormerPrimaryCtxIsActive);
603     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
604       return OFFLOAD_FAIL;
605 
606     if (FormerPrimaryCtxIsActive) {
607       DP("The primary context is active, no change to its flags\n");
608       if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
609           CU_CTX_SCHED_BLOCKING_SYNC)
610         DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
611     } else {
612       DP("The primary context is inactive, set its flags to "
613          "CU_CTX_SCHED_BLOCKING_SYNC\n");
614       Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
615       if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
616         return OFFLOAD_FAIL;
617     }
618 
619     // Retain the per device primary context and save it to use whenever this
620     // device is selected.
621     Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
622     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
623       return OFFLOAD_FAIL;
624 
625     Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
626     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
627       return OFFLOAD_FAIL;
628 
629     // Initialize stream pool
630     if (!StreamManager->initializeDeviceStreamPool(DeviceId))
631       return OFFLOAD_FAIL;
632 
633     // Query attributes to determine number of threads/block and blocks/grid.
634     int MaxGridDimX;
635     Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
636                                Device);
637     if (Err != CUDA_SUCCESS) {
638       DP("Error getting max grid dimension, using default value %d\n",
639          DeviceRTLTy::DefaultNumTeams);
640       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
641     } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
642       DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
643       DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
644     } else {
645       DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
646          "at the hard limit\n",
647          MaxGridDimX, DeviceRTLTy::HardTeamLimit);
648       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
649     }
650 
651     // We are only exploiting threads along the x axis.
652     int MaxBlockDimX;
653     Err = cuDeviceGetAttribute(&MaxBlockDimX,
654                                CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
655     if (Err != CUDA_SUCCESS) {
656       DP("Error getting max block dimension, using default value %d\n",
657          DeviceRTLTy::DefaultNumThreads);
658       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
659     } else {
660       DP("Using %d CUDA threads per block\n", MaxBlockDimX);
661       DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
662 
663       if (EnvTeamThreadLimit > 0 &&
664           DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
665         DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
666            "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
667            DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
668         DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
669       }
670       if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
671         DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
672            "capping at the hard limit\n",
673            DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
674         DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
675       }
676     }
677 
678     // Get and set warp size
679     int WarpSize;
680     Err =
681         cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
682     if (Err != CUDA_SUCCESS) {
683       DP("Error getting warp size, assuming default value 32\n");
684       DeviceData[DeviceId].WarpSize = 32;
685     } else {
686       DP("Using warp size %d\n", WarpSize);
687       DeviceData[DeviceId].WarpSize = WarpSize;
688     }
689 
690     // Adjust the number of teams according to the environment variables.
691     if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
692       DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
693          EnvTeamLimit);
694       DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
695     }
696 
697     size_t StackLimit;
698     size_t HeapLimit;
699     if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
700       StackLimit = std::stol(EnvStr);
701       if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
702         return OFFLOAD_FAIL;
703     } else {
704       if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
705         return OFFLOAD_FAIL;
706     }
707     if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
708       HeapLimit = std::stol(EnvStr);
709       if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
710         return OFFLOAD_FAIL;
711     } else {
712       if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
713         return OFFLOAD_FAIL;
714     }
715 
716     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
717          "Device supports up to %d CUDA blocks and %d threads with a "
718          "warp size of %d\n",
719          DeviceData[DeviceId].BlocksPerGrid,
720          DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
721     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
722          "Device heap size is %d Bytes, device stack size is %d Bytes per "
723          "thread\n",
724          (int)HeapLimit, (int)StackLimit);
725 
726     // Set default number of teams
727     if (EnvNumTeams > 0) {
728       DP("Default number of teams set according to environment %d\n",
729          EnvNumTeams);
730       DeviceData[DeviceId].NumTeams = EnvNumTeams;
731     } else {
732       DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
733       DP("Default number of teams set according to library's default %d\n",
734          DeviceRTLTy::DefaultNumTeams);
735     }
736 
737     if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
738       DP("Default number of teams exceeds device limit, capping at %d\n",
739          DeviceData[DeviceId].BlocksPerGrid);
740       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
741     }
742 
743     // Set default number of threads
744     DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
745     DP("Default number of threads set according to library's default %d\n",
746        DeviceRTLTy::DefaultNumThreads);
747     if (DeviceData[DeviceId].NumThreads >
748         DeviceData[DeviceId].ThreadsPerBlock) {
749       DP("Default number of threads exceeds device limit, capping at %d\n",
750          DeviceData[DeviceId].ThreadsPerBlock);
751       DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
752     }
753 
754     return OFFLOAD_SUCCESS;
755   }
756 
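  // Load a device image: create a CUDA module, resolve global variables and
  // kernel functions for every offload entry, send the device environment
  // structure, and return the resulting entries table.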
757   __tgt_target_table *loadBinary(const int DeviceId,
758                                  const __tgt_device_image *Image) {
759     // Set the context we are using
760     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
761     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
762       return nullptr;
763 
764     // Clear the offload table as we are going to create a new one.
765     clearOffloadEntriesTable(DeviceId);
766 
767     // Create the module and extract the function pointers.
768     CUmodule Module;
769     DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
770     Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
771     if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
772       return nullptr;
773 
774     DP("CUDA module successfully loaded!\n");
775 
776     Modules.push_back(Module);
777 
778     // Find the symbols in the module by name.
779     const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
780     const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
781 
782     std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
783     for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
784       if (!E->addr) {
785         // We return nullptr when something like this happens; the host should
786         // always have something in the address field to uniquely identify the
787         // target region.
788         DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
789         return nullptr;
790       }
791 
792       if (E->size) {
793         __tgt_offload_entry Entry = *E;
794         CUdeviceptr CUPtr;
795         size_t CUSize;
796         Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
797         // We keep this style here because we need the name
798         if (Err != CUDA_SUCCESS) {
799           REPORT("Loading global '%s' Failed\n", E->name);
800           CUDA_ERR_STRING(Err);
801           return nullptr;
802         }
803 
804         if (CUSize != E->size) {
805           DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
806              CUSize, E->size);
807           return nullptr;
808         }
809 
810         DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
811            DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
812 
813         Entry.addr = (void *)(CUPtr);
814 
815         // Note: In the current implementation declare target variables
816         // can either be link or to. This means that once unified
817         // memory is activated via the requires directive, the variable
818         // can be used directly from the host in both cases.
819         // TODO: when variables types other than to or link are added,
820         // the below condition should be changed to explicitly
821         // check for to and link variables types:
822         // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
823         // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
824         if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
825           // If unified memory is present any target link or to variables
826           // can access host addresses directly. There is no longer a
827           // need for device copies.
828           cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
829           DP("Copy linked variable host address (" DPxMOD
830              ") to device address (" DPxMOD ")\n",
831              DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
832         }
833 
834         addOffloadEntry(DeviceId, Entry);
835 
836         continue;
837       }
838 
839       CUfunction Func;
840       Err = cuModuleGetFunction(&Func, Module, E->name);
841       // We keep this style here because we need the name
842       if (Err != CUDA_SUCCESS) {
843         REPORT("Loading '%s' Failed\n", E->name);
844         CUDA_ERR_STRING(Err);
845         return nullptr;
846       }
847 
848       DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
849          DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
850 
851       // Default value GENERIC (in case the symbol is missing from the cubin file)
852       llvm::omp::OMPTgtExecModeFlags ExecModeVal = llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
853       std::string ExecModeNameStr(E->name);
854       ExecModeNameStr += "_exec_mode";
855       const char *ExecModeName = ExecModeNameStr.c_str();
856 
857       CUdeviceptr ExecModePtr;
858       size_t CUSize;
859       Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
860       if (Err == CUDA_SUCCESS) {
861         if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
862           DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
863              ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
864           return nullptr;
865         }
866 
867         Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
868         if (Err != CUDA_SUCCESS) {
869           REPORT("Error when copying data from device to host. Pointers: "
870                  "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
871                  DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
872           CUDA_ERR_STRING(Err);
873           return nullptr;
874         }
875       } else {
876         DP("Loading global exec_mode '%s' - symbol missing, using default "
877            "value GENERIC (1)\n",
878            ExecModeName);
879       }
880 
881       KernelsList.emplace_back(Func, ExecModeVal);
882 
883       __tgt_offload_entry Entry = *E;
884       Entry.addr = &KernelsList.back();
885       addOffloadEntry(DeviceId, Entry);
886     }
887 
888     // send device environment data to the device
889     {
890       // TODO: The device ID used here is not the real device ID used by OpenMP.
891       DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
892                                     static_cast<uint32_t>(DeviceId),
893                                     static_cast<uint32_t>(DynamicMemorySize)};
894 
895 #ifdef OMPTARGET_DEBUG
896       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
897         DeviceEnv.DebugKind = std::stoi(EnvStr);
898 #endif
899 
900       const char *DeviceEnvName = "omptarget_device_environment";
901       CUdeviceptr DeviceEnvPtr;
902       size_t CUSize;
903 
904       Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
905       if (Err == CUDA_SUCCESS) {
906         if (CUSize != sizeof(DeviceEnv)) {
907           REPORT(
908               "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
909               DeviceEnvName, CUSize, sizeof(DeviceEnv));
910           CUDA_ERR_STRING(Err);
911           return nullptr;
912         }
913 
914         Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
915         if (Err != CUDA_SUCCESS) {
916           REPORT("Error when copying data from host to device. Pointers: "
917                  "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
918                  DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
919           CUDA_ERR_STRING(Err);
920           return nullptr;
921         }
922 
923         DP("Sending global device environment data %zu bytes\n", CUSize);
924       } else {
925         DP("Finding global device environment '%s' - symbol missing.\n",
926            DeviceEnvName);
927         DP("Continuing, assuming this is a device RTL which does not accept "
928            "environment settings.\n");
929       }
930     }
931 
932     return getOffloadEntriesTable(DeviceId);
933   }
934 
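  // Allocate Size bytes of the requested kind; device allocations may be
  // served by the memory manager, while host-pinned and managed allocations
  // go straight to the device allocator.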
935   void *dataAlloc(const int DeviceId, const int64_t Size,
936                   const TargetAllocTy Kind) {
937     switch (Kind) {
938     case TARGET_ALLOC_DEFAULT:
939     case TARGET_ALLOC_DEVICE:
940       if (UseMemoryManager)
941         return MemoryManagers[DeviceId]->allocate(Size, nullptr);
942       else
943         return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
944     case TARGET_ALLOC_HOST:
945     case TARGET_ALLOC_SHARED:
946       return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
947     }
948 
949     REPORT("Invalid target data allocation kind or requested allocator not "
950            "implemented yet\n");
951 
952     return nullptr;
953   }
954 
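  // Asynchronously copy Size bytes from host memory (HstPtr) to device memory
  // (TgtPtr) on the stream associated with AsyncInfo.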
955   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
956                  const int64_t Size, __tgt_async_info *AsyncInfo) const {
957     assert(AsyncInfo && "AsyncInfo is nullptr");
958 
959     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
960     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
961       return OFFLOAD_FAIL;
962 
963     CUstream Stream = getStream(DeviceId, AsyncInfo);
964 
965     Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
966     if (Err != CUDA_SUCCESS) {
967       DP("Error when copying data from host to device. Pointers: host "
968          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
969          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
970       CUDA_ERR_STRING(Err);
971       return OFFLOAD_FAIL;
972     }
973 
974     return OFFLOAD_SUCCESS;
975   }
976 
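  // Asynchronously copy Size bytes from device memory (TgtPtr) back to host
  // memory (HstPtr) on the stream associated with AsyncInfo.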
977   int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
978                    const int64_t Size, __tgt_async_info *AsyncInfo) const {
979     assert(AsyncInfo && "AsyncInfo is nullptr");
980 
981     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
982     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
983       return OFFLOAD_FAIL;
984 
985     CUstream Stream = getStream(DeviceId, AsyncInfo);
986 
987     Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
988     if (Err != CUDA_SUCCESS) {
989       DP("Error when copying data from device to host. Pointers: host "
990          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
991          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
992       CUDA_ERR_STRING(Err);
993       return OFFLOAD_FAIL;
994     }
995 
996     return OFFLOAD_SUCCESS;
997   }
998 
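  // Copy data between two devices, preferring cuMemcpyPeerAsync when peer
  // access is available and falling back to an ordinary device-to-device copy
  // otherwise.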
999   int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
1000                    int64_t Size, __tgt_async_info *AsyncInfo) const {
1001     assert(AsyncInfo && "AsyncInfo is nullptr");
1002 
1003     CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
1004     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
1005       return OFFLOAD_FAIL;
1006 
1007     CUstream Stream = getStream(SrcDevId, AsyncInfo);
1008 
1009     // If these are two different devices, we try a peer-to-peer copy first.
1010     if (SrcDevId != DstDevId) {
1011       int CanAccessPeer = 0;
1012       Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
1013       if (Err != CUDA_SUCCESS) {
1014         REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
1015                ", dst = %" PRId32 "\n",
1016                SrcDevId, DstDevId);
1017         CUDA_ERR_STRING(Err);
1018         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1019       }
1020 
1021       if (!CanAccessPeer) {
1022         DP("P2P memcpy not supported, falling back to D2D memcpy\n");
1023         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1024       }
1025 
1026       Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
1027       if (Err != CUDA_SUCCESS) {
1028         REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
1029                ", dst = %" PRId32 "\n",
1030                SrcDevId, DstDevId);
1031         CUDA_ERR_STRING(Err);
1032         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1033       }
1034 
1035       Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
1036                               (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
1037                               Size, Stream);
1038       if (Err == CUDA_SUCCESS)
1039         return OFFLOAD_SUCCESS;
1040 
1041       DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
1042          ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
1043          DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
1044       CUDA_ERR_STRING(Err);
1045     }
1046 
1047     return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1048   }
1049 
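  // Free target memory, going through the memory manager when it is enabled.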
1050   int dataDelete(const int DeviceId, void *TgtPtr) {
1051     if (UseMemoryManager)
1052       return MemoryManagers[DeviceId]->free(TgtPtr);
1053 
1054     return DeviceAllocators[DeviceId].free(TgtPtr);
1055   }
1056 
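  // Launch the kernel for a target region: marshal the arguments, compute the
  // grid and block sizes from the execution mode, loop trip count, and device
  // limits, then call cuLaunchKernel on the selected stream.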
1057   int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
1058                           ptrdiff_t *TgtOffsets, const int ArgNum,
1059                           const int TeamNum, const int ThreadLimit,
1060                           const unsigned int LoopTripCount,
1061                           __tgt_async_info *AsyncInfo) const {
1062     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
1063     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
1064       return OFFLOAD_FAIL;
1065 
1066     // All args are references: build an array of pointers to the argument values for cuLaunchKernel.
1067     std::vector<void *> Args(ArgNum);
1068     std::vector<void *> Ptrs(ArgNum);
1069 
1070     for (int I = 0; I < ArgNum; ++I) {
1071       Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
1072       Args[I] = &Ptrs[I];
1073     }
1074 
1075     KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
1076 
1077     const bool IsSPMDGenericMode =
1078         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
1079     const bool IsSPMDMode =
1080         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
1081     const bool IsGenericMode =
1082         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
1083 
1084     int CudaThreadsPerBlock;
1085     if (ThreadLimit > 0) {
1086       DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
1087       CudaThreadsPerBlock = ThreadLimit;
1088       // Add master warp if necessary
1089       if (IsGenericMode) {
1090         DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
1091         CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
1092       }
1093     } else {
1094       DP("Setting CUDA threads per block to default %d\n",
1095          DeviceData[DeviceId].NumThreads);
1096       CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
1097     }
1098 
1099     if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
1100       DP("Threads per block capped at device limit %d\n",
1101          DeviceData[DeviceId].ThreadsPerBlock);
1102       CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
1103     }
1104 
1105     if (!KernelInfo->MaxThreadsPerBlock) {
1106       Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
1107                                CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
1108                                KernelInfo->Func);
1109       if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
1110         return OFFLOAD_FAIL;
1111     }
1112 
1113     if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
1114       DP("Threads per block capped at kernel limit %d\n",
1115          KernelInfo->MaxThreadsPerBlock);
1116       CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
1117     }
1118 
1119     unsigned int CudaBlocksPerGrid;
1120     if (TeamNum <= 0) {
1121       if (LoopTripCount > 0 && EnvNumTeams < 0) {
1122         if (IsSPMDGenericMode) {
1123           // If we reach this point, then we are executing a kernel that was
1124           // transformed from Generic-mode to SPMD-mode. This kernel has
1125           // SPMD-mode execution, but needs its blocks to be scheduled
1126           // differently because the current loop trip count only applies to the
1127           // `teams distribute` region and would create far too few blocks using
1128           // the regular SPMD-mode method.
1129           CudaBlocksPerGrid = LoopTripCount;
1130         } else if (IsSPMDMode) {
1131           // We have a combined construct, i.e. `target teams distribute
1132           // parallel for [simd]`. We launch enough teams so that each thread
1133           // will execute one iteration of the loop. Round up to the nearest
1134           // integer.
1135           CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
1136         } else if (IsGenericMode) {
1137           // If we reach this point, then we have a non-combined construct, i.e.
1138           // `teams distribute` with a nested `parallel for` and each team is
1139           // assigned one iteration of the `distribute` loop. E.g.:
1140           //
1141           // #pragma omp target teams distribute
1142           // for(...loop_tripcount...) {
1143           //   #pragma omp parallel for
1144           //   for(...) {}
1145           // }
1146           //
1147           // Threads within a team will execute the iterations of the `parallel`
1148           // loop.
1149           CudaBlocksPerGrid = LoopTripCount;
1150         } else {
1151           REPORT("Unknown execution mode: %d\n",
1152                  static_cast<int8_t>(KernelInfo->ExecutionMode));
1153           return OFFLOAD_FAIL;
1154         }
1155         DP("Using %d teams due to loop trip count %" PRIu32
1156            " and number of threads per block %d\n",
1157            CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
1158       } else {
1159         DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
1160         CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
1161       }
1162     } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
1163       DP("Capping number of teams to team limit %d\n",
1164          DeviceData[DeviceId].BlocksPerGrid);
1165       CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
1166     } else {
1167       DP("Using requested number of teams %d\n", TeamNum);
1168       CudaBlocksPerGrid = TeamNum;
1169     }
1170 
1171     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
1172          "Launching kernel %s with %d blocks and %d threads in %s mode\n",
1173          (getOffloadEntry(DeviceId, TgtEntryPtr))
1174              ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
1175              : "(null)",
1176          CudaBlocksPerGrid, CudaThreadsPerBlock,
1177          (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));
1178 
1179     CUstream Stream = getStream(DeviceId, AsyncInfo);
1180     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
1181                          /* gridDimZ */ 1, CudaThreadsPerBlock,
1182                          /* blockDimY */ 1, /* blockDimZ */ 1,
1183                          DynamicMemorySize, Stream, &Args[0], nullptr);
1184     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
1185       return OFFLOAD_FAIL;
1186 
1187     DP("Launch of entry point at " DPxMOD " successful!\n",
1188        DPxPTR(TgtEntryPtr));
1189 
1190     return OFFLOAD_SUCCESS;
1191   }
1192 
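  // Wait for all work queued on the stream in AsyncInfo, then return the
  // stream to the pool and clear AsyncInfo->Queue.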
1193   int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
1194     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
1195     CUresult Err = cuStreamSynchronize(Stream);
1196 
1197     // Once the stream is synchronized, return it to the stream pool and reset
1198     // AsyncInfo. This makes sure the synchronization only applies to its own
1199     // tasks.
1200     StreamManager->returnStream(DeviceId,
1201                                 reinterpret_cast<CUstream>(AsyncInfo->Queue));
1202     AsyncInfo->Queue = nullptr;
1203 
1204     if (Err != CUDA_SUCCESS) {
1205       DP("Error when synchronizing stream. stream = " DPxMOD
1206          ", async info ptr = " DPxMOD "\n",
1207          DPxPTR(Stream), DPxPTR(AsyncInfo));
1208       CUDA_ERR_STRING(Err);
1209     }
1210     return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
1211   }
1212 
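  // Print a summary of the device's driver version and hardware attributes
  // queried through the CUDA driver API.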
1213   void printDeviceInfo(int32_t device_id) {
1214     char TmpChar[1000];
1215     std::string TmpStr;
1216     size_t TmpSt;
1217     int TmpInt, TmpInt2, TmpInt3;
1218 
1219     CUdevice Device;
1220     checkResult(cuDeviceGet(&Device, device_id),
1221                 "Error returned from cuDeviceGet\n");
1222 
1223     cuDriverGetVersion(&TmpInt);
1224     printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
1225     printf("    CUDA Device Number: \t\t%d \n", device_id);
1226     checkResult(cuDeviceGetName(TmpChar, 1000, Device),
1227                 "Error returned from cuDeviceGetName\n");
1228     printf("    Device Name: \t\t\t%s \n", TmpChar);
1229     checkResult(cuDeviceTotalMem(&TmpSt, Device),
1230                 "Error returned from cuDeviceTotalMem\n");
1231     printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
1232     checkResult(cuDeviceGetAttribute(
1233                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
1234                 "Error returned from cuDeviceGetAttribute\n");
1235     printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
1236     checkResult(
1237         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Registers per Block: \t\t%d \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    if (TmpInt == CU_COMPUTEMODE_DEFAULT)
      TmpStr = "DEFAULT";
    else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
      TmpStr = "PROHIBITED";
    else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
      TmpStr = "EXCLUSIVE PROCESS";
    else
      TmpStr = "unknown";
    printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
    checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
                                     Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                    Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
  }

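  // Make the queue associated with AsyncInfo wait for the given event without
  // blocking the host thread.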
  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
                void *EventPtr) const {
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    // We don't use CU_EVENT_WAIT_DEFAULT here because it is only available
    // from a specific CUDA version onward and is defined as 0x0. Per the CUDA
    // API documentation, earlier versions require this argument to be 0x0.
    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
    if (Err != CUDA_SUCCESS) {
      DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(Event));
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }
};

DeviceRTLTy DeviceRTL;
} // namespace

// Exposed library API functions
#ifdef __cplusplus
extern "C" {
#endif

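// The device image is accepted if its ELF header reports the CUDA machine
// type (EM_CUDA, 190).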
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

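// Any pair of valid devices managed by this RTL is reported as capable of
// direct data exchange.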
int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
      DeviceRTL.isValidDeviceId(dst_dev_id))
    return 1;

  return 0;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
                           int32_t kind) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
}

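// Synchronous submit: issue the asynchronous copy on a temporary queue and
// wait for it to complete before returning.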
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
                                                 size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}

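// Synchronous retrieve: the blocking counterpart of
// __tgt_rtl_data_retrieve_async.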
int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
                                                   size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}

int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(AsyncInfo && "AsyncInfo is nullptr");

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                AsyncInfo);
}

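// Synchronous device-to-device exchange: wraps the asynchronous version and
// synchronizes on the source device's queue.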
int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_exchange_async(
      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}

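// Synchronous kernel launch: enqueue the team region on a temporary queue and
// block until it has finished.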
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
                                         void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

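// A non-teams target region is launched as a team region with a single team,
// a thread limit of 1, and no loop trip count.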
int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}

int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}

void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

void __tgt_rtl_print_device_info(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  DeviceRTL.printDeviceInfo(device_id);
}

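// Event management entry points. An illustrative, hypothetical sequence a
// caller such as libomptarget might follow (names of the local variables are
// assumptions, not part of this plugin):
//
//   void *Event;
//   __tgt_rtl_create_event(DeviceId, &Event);        // allocate a CUDA event
//   __tgt_rtl_record_event(DeviceId, Event, AInfo);   // mark a point in a queue
//   __tgt_rtl_wait_event(DeviceId, Event, BInfo);     // order another queue after it
//   __tgt_rtl_sync_event(DeviceId, Event);            // block the host until done
//   __tgt_rtl_destroy_event(DeviceId, Event);         // release the event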
int32_t __tgt_rtl_create_event(int32_t device_id, void **event) {
  assert(event && "event is nullptr");
  return createEvent(event);
}

int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr,
                               __tgt_async_info *async_info_ptr) {
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
  assert(event_ptr && "event_ptr is nullptr");

  return recordEvent(event_ptr, async_info_ptr);
}

int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr,
                             __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(event_ptr && "event is nullptr");

  return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr);
}

int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return syncEvent(event_ptr);
}

int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return destroyEvent(event_ptr);
}

#ifdef __cplusplus
}
#endif