//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string.h>
#include <string>
#include <unordered_map>
#include <vector>

#include "Debug.h"
#include "omptargetplugin.h"

#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

#include "MemoryManager.h"

// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        REPORT("CUDA error is: %s\n", errStr);                                 \
      else {                                                                   \
        REPORT("Unresolved CUDA error code: %d\n", err);                       \
        REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
               errStr_status);                                                 \
      }                                                                        \
    } else {                                                                   \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_SUCCESS)                                       \
        REPORT("%s \n", errStr);                                               \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr = nullptr;                                              \
    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
    if (errStr_status == CUDA_SUCCESS)                                         \
      REPORT("%s \n", errStr);                                                 \
  } while (false)
#endif // OMPTARGET_DEBUG

#define BOOL2TEXT(b) ((b) ? "Yes" : "No")

#include "elf_common.h"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD,         // constructors, destructors,
                // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC,      // everything else
  SPMD_GENERIC, // Generic kernel with SPMD execution
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  // 2 - SPMD mode execution with Generic mode semantics.
  int8_t ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};

namespace {
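// Report \p ErrMsg plus the CUDA error string when \p Err is not CUDA_SUCCESS;
// returns true iff the call succeeded.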
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  REPORT("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

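// Enqueue an asynchronous device-to-device copy of Size bytes on the given
// stream; returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.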
int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

// Structure containing per-device data.
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  std::list<FuncOrGblEntryTy> FuncGblEntries;

  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

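/// Manager of a per-device pool of CUstreams. The pool grows on demand and is
/// guarded by a per-device mutex.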
class StreamManagerTy {
  int NumberOfDevices;
  // The initial size of stream pool
  int EnvNumInitialStreams;
  // Per-device stream mutex
  std::vector<std::unique_ptr<std::mutex>> StreamMtx;
  // Per-device stream Id indicates the next available stream in the pool
  std::vector<int> NextStreamId;
  // Per-device stream pool
  std::vector<std::vector<CUstream>> StreamPool;
  // Reference to per-device data
  std::vector<DeviceDataTy> &DeviceData;

  // If there are no CUstreams left in the pool, we resize the pool to allocate
  // more CUstreams. This function should be called with the device mutex held;
  // the pool is never shrunk.
  void resizeStreamPool(const int DeviceId, const size_t NewSize) {
    std::vector<CUstream> &Pool = StreamPool[DeviceId];
    const size_t CurrentSize = Pool.size();
    assert(NewSize > CurrentSize && "new size is not larger than current size");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // Return early if we cannot switch to the right context, to avoid
      // creating a bunch of streams that do not belong to the right device.
      // The offloading will fail later because the selected CUstream is
      // nullptr.
      return;
    }

    Pool.resize(NewSize, nullptr);

    for (size_t I = CurrentSize; I < NewSize; ++I) {
      checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
                  "Error returned from cuStreamCreate\n");
    }
  }

public:
  StreamManagerTy(const int NumberOfDevices,
                  std::vector<DeviceDataTy> &DeviceData)
      : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
        DeviceData(DeviceData) {
    StreamPool.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    StreamMtx.resize(NumberOfDevices);

    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
      EnvNumInitialStreams = std::stoi(EnvStr);

    // Initialize the next stream id
    std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

    // Initialize stream mutex
    for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
      Ptr = std::make_unique<std::mutex>();
  }

  ~StreamManagerTy() {
    // Destroy streams
    for (int I = 0; I < NumberOfDevices; ++I) {
      checkResult(cuCtxSetCurrent(DeviceData[I].Context),
                  "Error returned from cuCtxSetCurrent\n");

      for (CUstream &S : StreamPool[I]) {
        if (S)
          checkResult(cuStreamDestroy(S),
                      "Error returned from cuStreamDestroy\n");
      }
    }
  }

  // Get a CUstream from the pool. The per-device next stream id always points
  // to the next available CUstream. That means CUstreams [0, id-1] have been
  // assigned, and [id, ...) are still available. If there is no CUstream left,
  // we request more CUstreams from the CUDA RT. Each time a CUstream is
  // assigned, the id is incremented by one.
  // xxxxxs+++++++++
  //      ^
  //      id
  // After assignment, the pool becomes the following and s is assigned.
  // xxxxxs+++++++++
  //       ^
  //       id
  CUstream getStream(const int DeviceId) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // No CUstream left in the pool, we need to request from CUDA RT
    if (Id == static_cast<int>(StreamPool[DeviceId].size())) {
      // By default we double the stream pool every time
      resizeStreamPool(DeviceId, Id * 2);
    }
    return StreamPool[DeviceId][Id++];
  }

  // Return a CUstream to the pool. As mentioned above, the per-device next
  // stream id always points to the next available CUstream, so when we return
  // a CUstream, we first decrement the id and then copy the CUstream back into
  // the pool.
  // It is worth noting that the order in which streams are returned may differ
  // from the order in which they were assigned, so at some point the pool may
  // contain two identical CUstreams:
  // xxax+a+++++
  //     ^
  //     id
  // However, this does not matter because they are always on the two sides of
  // id. The one on the left will eventually be overwritten by another
  // CUstream. Therefore, after several executions, the order of the pool may
  // differ from its initial state.
  void returnStream(const int DeviceId, CUstream Stream) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    assert(Id > 0 && "Wrong stream ID");
    StreamPool[DeviceId][--Id] = Stream;
  }

  bool initializeDeviceStreamPool(const int DeviceId) {
    assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");

    resizeStreamPool(DeviceId, EnvNumInitialStreams);

    // Check the size of stream pool
    if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams)
      return false;

    // Check whether each stream is valid
    for (CUstream &S : StreamPool[DeviceId])
      if (!S)
        return false;

    return true;
  }
};

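/// Central per-process state of the CUDA plugin. It owns the per-device data,
/// allocators, memory managers, and the stream manager, and implements the
/// operations that the plugin interface functions below forward to.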
class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  int EnvTeamThreadLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  std::unique_ptr<StreamManagerTy> StreamManager;
  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  /// A class responsible for interacting with the device's native runtime
  /// library to allocate and free memory.
  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
    const int DeviceId;
    const std::vector<DeviceDataTy> &DeviceData;
    std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;

  public:
    CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
        : DeviceId(DeviceId), DeviceData(DeviceData) {}

    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
      if (Size == 0)
        return nullptr;

      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return nullptr;

      void *MemAlloc = nullptr;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
        CUdeviceptr DevicePtr;
        Err = cuMemAlloc(&DevicePtr, Size);
        MemAlloc = (void *)DevicePtr;
        if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
          return nullptr;
        break;
      case TARGET_ALLOC_HOST:
        void *HostPtr;
        Err = cuMemAllocHost(&HostPtr, Size);
        MemAlloc = HostPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
          return nullptr;
        HostPinnedAllocs[MemAlloc] = Kind;
        break;
      case TARGET_ALLOC_SHARED:
        CUdeviceptr SharedPtr;
        Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
        MemAlloc = (void *)SharedPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
          return nullptr;
        break;
      }

      return MemAlloc;
    }

    int free(void *TgtPtr) override {
      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return OFFLOAD_FAIL;

      // Host pinned memory must be freed differently.
      TargetAllocTy Kind =
          (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
              ? TARGET_ALLOC_DEFAULT
              : TARGET_ALLOC_HOST;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
      case TARGET_ALLOC_SHARED:
        Err = cuMemFree((CUdeviceptr)TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFree\n"))
          return OFFLOAD_FAIL;
        break;
      case TARGET_ALLOC_HOST:
        Err = cuMemFreeHost(TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
          return OFFLOAD_FAIL;
        break;
      }

      return OFFLOAD_SUCCESS;
    }
  };

  /// A vector of device allocators
  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;

  /// A vector of memory managers. Since the memory manager is non-copyable and
  /// non-movable, we wrap them into std::unique_ptr.
  std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;

  /// Whether to use the memory manager.
  bool UseMemoryManager = true;

  // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return a pointer to the entry associated with the pointer
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
                                             const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return &Itr;

    return nullptr;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

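  // Return the CUstream recorded in AsyncInfo, lazily taking a fresh one from
  // the stream manager the first time it is needed.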
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    if (!AsyncInfo->Queue)
      AsyncInfo->Queue = StreamManager->getStream(DeviceId);

    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
  }

public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (Err == CUDA_ERROR_INVALID_HANDLE) {
      // Can't call cuGetErrorString if dlsym failed
      DP("Failed to load CUDA shared library\n");
      return;
    }
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
      // OMP_TEAMS_THREAD_LIMIT has been set
      EnvTeamThreadLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }

    StreamManager =
        std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);

    for (int I = 0; I < NumberOfDevices; ++I)
      DeviceAllocators.emplace_back(I, DeviceData);

    // Get the size threshold from environment variable
    std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
    UseMemoryManager = Res.second;
    size_t MemoryManagerThreshold = Res.first;

    if (UseMemoryManager)
      for (int I = 0; I < NumberOfDevices; ++I)
        MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
            DeviceAllocators[I], MemoryManagerThreshold));
  }

  ~DeviceRTLTy() {
    // Destroy the memory managers first, in case data they depend on would
    // otherwise be destroyed before them.
    for (auto &M : MemoryManagers)
      M.release();

    StreamManager = nullptr;

    for (CUmodule &M : Modules)
      // Close module
      if (M)
        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context) {
        checkResult(cuCtxSetCurrent(D.Context),
                    "Error returned from cuCtxSetCurrent\n");
        CUdevice Device;
        checkResult(cuCtxGetDevice(&Device),
                    "Error returned from cuCtxGetDevice\n");
        checkResult(cuDevicePrimaryCtxRelease(Device),
                    "Error returned from cuDevicePrimaryCtxRelease\n");
      }
    }
  }

  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

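  // Initialize the device: set up its primary context and stream pool, and
  // query the device limits that are later used to pick launch parameters.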
  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    // Query the current flags of the primary context and set its flags if
    // it is inactive
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
      return OFFLOAD_FAIL;

    if (FormerPrimaryCtxIsActive) {
      DP("The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      DP("The primary context is inactive, set its flags to "
         "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
        return OFFLOAD_FAIL;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected.
    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize stream pool
    if (!StreamManager->initializeDeviceStreamPool(DeviceId))
      return OFFLOAD_FAIL;

    // Query attributes to determine number of threads/block and blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, using default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    } else {
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n",
         MaxGridDimX, DeviceRTLTy::HardTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, using default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;

      if (EnvTeamThreadLimit > 0 &&
          DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
           "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
      }
      if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
           "capping at the hard limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
      }
    }

    // Get and set warp size
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assuming default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the env variables
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    size_t StackLimit;
    size_t HeapLimit;
    if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
      StackLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
      HeapLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device supports up to %d CUDA blocks and %d threads with a "
         "warp size of %d\n",
         DeviceData[DeviceId].BlocksPerGrid,
         DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device heap size is %d Bytes, device stack size is %d Bytes per "
         "thread\n",
         (int)HeapLimit, (int)StackLimit);

    // Set default number of teams
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
      DP("Default number of teams set according to library's default %d\n",
         DeviceRTLTy::DefaultNumTeams);
    }

    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Default number of teams exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
    }

    // Set default number of threads
    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
    DP("Default number of threads set according to library's default %d\n",
       DeviceRTLTy::DefaultNumThreads);
    if (DeviceData[DeviceId].NumThreads >
        DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Default number of threads exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
    }

    return OFFLOAD_SUCCESS;
  }

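  // Load the device image for this device: create a CUDA module, resolve
  // global variables and kernel entry points, read each kernel's execution
  // mode, and send the device environment to the device.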
  __tgt_target_table *loadBinary(const int DeviceId,
                                 const __tgt_device_image *Image) {
    // Set the context we are using
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return nullptr;

    // Clear the offload table as we are going to create a new one.
    clearOffloadEntriesTable(DeviceId);

    // Create the module and extract the function pointers.
    CUmodule Module;
    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
    Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
      return nullptr;

    DP("CUDA module successfully loaded!\n");

    Modules.push_back(Module);

    // Find the symbols in the module by name.
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // We return nullptr when something like this happens; the host should
        // always have something in the address to uniquely identify the
        // target region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name
        if (Err != CUDA_SUCCESS) {
          REPORT("Loading global '%s' failed\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variable types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variable types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name
      if (Err != CUDA_SUCCESS) {
        REPORT("Loading '%s' failed\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

      // Default value GENERIC (in case the symbol is missing from the cubin
      // file).
      int8_t ExecModeVal = ExecutionModeType::GENERIC;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(int8_t)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(int8_t));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from device to host. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
                 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (ExecModeVal < 0 || ExecModeVal > 2) {
          DP("Error: wrong exec_mode value specified in cubin file: %d\n",
             ExecModeVal);
          return nullptr;
        }
      } else {
        REPORT("Loading global exec_mode '%s' - symbol missing, using default "
               "value GENERIC (1)\n",
               ExecModeName);
        CUDA_ERR_STRING(Err);
      }

      KernelsList.emplace_back(Func, ExecModeVal);

      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }

    // Send the device environment data to the device
    {
      omptarget_device_environmentTy DeviceEnv{0};

#ifdef OMPTARGET_DEBUG
      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
        DeviceEnv.debug_level = std::stoi(EnvStr);
#endif

      const char *DeviceEnvName = "omptarget_device_environment";
      CUdeviceptr DeviceEnvPtr;
      size_t CUSize;

      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from host to device. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continue, considering this is a device RTL which does not accept "
           "environment setting.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }

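  // Allocate Size bytes of the requested kind on the device. Device
  // allocations go through the memory manager when it is enabled.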
  void *dataAlloc(const int DeviceId, const int64_t Size,
                  const TargetAllocTy Kind) {
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
      if (UseMemoryManager)
        return MemoryManagers[DeviceId]->allocate(Size, nullptr);
      else
        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    case TARGET_ALLOC_HOST:
    case TARGET_ALLOC_SHARED:
      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    }

    REPORT("Invalid target data allocation kind or requested allocator not "
           "implemented yet\n");

    return nullptr;
  }

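  // Asynchronously copy Size bytes from host HstPtr to device TgtPtr on the
  // stream attached to AsyncInfo.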
  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from host to device. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

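  // Asynchronously copy Size bytes from device TgtPtr back to host HstPtr on
  // the stream attached to AsyncInfo.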
  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from device to host. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

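  // Copy data between two devices, preferring a peer-to-peer copy and falling
  // back to an ordinary device-to-device copy when that is not possible.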
  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(SrcDevId, AsyncInfo);

    // If these are two different devices, we try a peer-to-peer copy first
    if (SrcDevId != DstDevId) {
      int CanAccessPeer = 0;
      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      if (!CanAccessPeer) {
        DP("P2P memcpy not supported so fall back to D2D memcpy\n");
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
                              Size, Stream);
      if (Err == CUDA_SUCCESS)
        return OFFLOAD_SUCCESS;

      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
         ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
         DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
      CUDA_ERR_STRING(Err);
    }

    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }

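  // Free device memory, routing the request through the memory manager when
  // it is enabled.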
  int dataDelete(const int DeviceId, void *TgtPtr) {
    if (UseMemoryManager)
      return MemoryManagers[DeviceId]->free(TgtPtr);

    return DeviceAllocators[DeviceId].free(TgtPtr);
  }

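  // Launch a target region: derive the block and grid geometry from the
  // kernel's execution mode, the requested team/thread counts, and the loop
  // trip count, then launch the kernel on the stream attached to AsyncInfo.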
  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are references.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (KernelInfo->ExecutionMode == GENERIC) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (KernelInfo->ExecutionMode == SPMD) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch enough teams so that each thread
          // executes one iteration of the loop; round up to the nearest
          // integer.
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else if (KernelInfo->ExecutionMode == GENERIC) {
          // If we reach this point, then we have a non-combined construct, i.e.
          // `teams distribute` with a nested `parallel for` and each team is
          // assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the `parallel`
          // loop.
          CudaBlocksPerGrid = LoopTripCount;
        } else if (KernelInfo->ExecutionMode == SPMD_GENERIC) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to the
          // `teams distribute` region and will create far too few blocks using
          // the regular SPMD-mode method.
          CudaBlocksPerGrid = LoopTripCount;
        } else {
          REPORT("Unknown execution mode: %d\n", KernelInfo->ExecutionMode);
          return OFFLOAD_FAIL;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Launching kernel %s with %d blocks and %d threads in %s "
         "mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
         CudaBlocksPerGrid, CudaThreadsPerBlock,
         (KernelInfo->ExecutionMode != SPMD
              ? (KernelInfo->ExecutionMode == GENERIC ? "Generic"
                                                      : "SPMD-Generic")
              : "SPMD"));

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }

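  // Wait until all work on the stream attached to AsyncInfo has completed,
  // then return the stream to the pool and reset AsyncInfo.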
  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
    CUresult Err = cuStreamSynchronize(Stream);

    // Once the stream is synchronized, return it to the stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
    StreamManager->returnStream(DeviceId,
                                reinterpret_cast<CUstream>(AsyncInfo->Queue));
    AsyncInfo->Queue = nullptr;

    if (Err != CUDA_SUCCESS) {
      DP("Error when synchronizing stream. stream = " DPxMOD
         ", async info ptr = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(AsyncInfo));
      CUDA_ERR_STRING(Err);
    }
    return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
  }

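  // Print a summary of the device's properties (driver version, memory sizes,
  // launch limits, and feature flags) to stdout.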
  void printDeviceInfo(int32_t device_id) {
    char TmpChar[1000];
    std::string TmpStr;
    size_t TmpSt;
    int TmpInt, TmpInt2, TmpInt3;

    CUdevice Device;
    checkResult(cuDeviceGet(&Device, device_id),
                "Error returned from cuDeviceGet\n");

    cuDriverGetVersion(&TmpInt);
    printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
    printf("    CUDA Device Number: \t\t%d \n", device_id);
    checkResult(cuDeviceGetName(TmpChar, 1000, Device),
                "Error returned from cuDeviceGetName\n");
    printf("    Device Name: \t\t\t%s \n", TmpChar);
    checkResult(cuDeviceTotalMem(&TmpSt, Device),
                "Error returned from cuDeviceTotalMem\n");
    printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Registers per Block: \t\t%d \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                                    Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(&TmpInt2,
                                    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(&TmpInt3,
                                    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
          TmpInt3);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
        "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(&TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
                                    Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(&TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
                                    Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
          TmpInt3);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    if (TmpInt == CU_COMPUTEMODE_DEFAULT)
      TmpStr = "DEFAULT";
    else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
      TmpStr = "PROHIBITED";
    else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
      TmpStr = "EXCLUSIVE PROCESS";
    else
      TmpStr = "unknown";
    printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD,
                                    Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(&TmpInt,
                                     CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                                     Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(&TmpInt2,
                                     CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                                     Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
  }
};

DeviceRTLTy DeviceRTL;
} // namespace

// Exposed library API functions
#ifdef __cplusplus
extern "C" {
#endif

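// Return non-zero if the given device image can be handled by this plugin,
// i.e. if its ELF header names the CUDA machine (EM_CUDA).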
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

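// Record the OpenMP 'requires' clause flags so later device setup can honor
// them; the flags are returned unchanged to the caller.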
int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

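// Report whether data can be exchanged directly between two devices. Any pair
// of valid device IDs is accepted here.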
int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
      DeviceRTL.isValidDeviceId(dst_dev_id))
    return 1;

  return 0;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}

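// Load the device image onto the given device and return the table of offload
// entry points it provides.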
__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}

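// Allocate 'size' bytes on the device. The unnamed host-pointer argument is
// unused here; 'kind' selects the allocation type (see TargetAllocTy).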
void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
                           int32_t kind) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
}

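// Synchronous host-to-device copy: issue the asynchronous variant on a local
// queue and wait for it to finish. The same pattern is used by the other
// synchronous entry points below.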
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
                                                 size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
                                                   size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}

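// Asynchronous device-to-device copy between two devices managed by this
// plugin.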
int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(AsyncInfo && "AsyncInfo is nullptr");

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                AsyncInfo);
}

int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_exchange_async(
      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}

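// Synchronous kernel launch: run the team region asynchronously, then block
// until the underlying queue has drained.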
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
                                         void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

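// A non-teams target region is launched as a team region with a single team,
// a thread limit of one, and no loop trip count.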
int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team_num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}

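// Block until all operations previously enqueued on the stream carried by
// 'async_info_ptr' have completed.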
int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}

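// Update the runtime information level that controls how much diagnostic
// output the plugin emits.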
void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

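// Dump the properties of the given device (see DeviceRTLTy::printDeviceInfo
// above).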
void __tgt_rtl_print_device_info(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  DeviceRTL.printDeviceInfo(device_id);
}

#ifdef __cplusplus
}
#endif