1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <cassert>
14 #include <cstddef>
15 #include <cuda.h>
16 #include <list>
17 #include <memory>
18 #include <mutex>
19 #include <string>
20 #include <vector>
21 
22 #include "Debug.h"
23 #include "omptargetplugin.h"
24 
25 #define TARGET_NAME CUDA
26 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
27 
28 #include "MemoryManager.h"
29 
30 // Utility for retrieving and printing CUDA error string.
31 #ifdef OMPTARGET_DEBUG
32 #define CUDA_ERR_STRING(err)                                                   \
33   do {                                                                         \
34     if (getDebugLevel() > 0) {                                                 \
35       const char *errStr = nullptr;                                            \
36       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
37       if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
38         REPORT("Unrecognized CUDA error code: %d\n", err);                     \
39       else if (errStr_status == CUDA_SUCCESS)                                  \
40         REPORT("CUDA error is: %s\n", errStr);                                 \
41       else {                                                                   \
42         REPORT("Unresolved CUDA error code: %d\n", err);                       \
43         REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
44                errStr_status);                                                 \
45       }                                                                        \
46     } else {                                                                   \
47       const char *errStr = nullptr;                                            \
48       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
49       if (errStr_status == CUDA_SUCCESS)                                       \
50         REPORT("%s \n", errStr);                                               \
51     }                                                                          \
52   } while (false)
53 #else // OMPTARGET_DEBUG
54 #define CUDA_ERR_STRING(err)                                                   \
55   do {                                                                         \
56     const char *errStr = nullptr;                                              \
57     CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
58     if (errStr_status == CUDA_SUCCESS)                                         \
59       REPORT("%s \n", errStr);                                                 \
60   } while (false)
61 #endif // OMPTARGET_DEBUG
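
// Typical use of CUDA_ERR_STRING (a sketch mirroring checkResult() below, not a
// new requirement; `Ctx` is a placeholder):
//   CUresult Err = cuCtxSetCurrent(Ctx);
//   if (Err != CUDA_SUCCESS)
//     CUDA_ERR_STRING(Err);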
62 
63 #include "elf_common.h"
64 
65 /// Keep entries table per device.
66 struct FuncOrGblEntryTy {
67   __tgt_target_table Table;
68   std::vector<__tgt_offload_entry> Entries;
69 };
70 
71 enum ExecutionModeType {
72   SPMD, // constructors, destructors,
73   // combined constructs (`teams distribute parallel for [simd]`)
74   GENERIC, // everything else
75   NONE
76 };
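
// Illustrative mapping (an assumption drawn from the comments above, not an
// exhaustive rule): a combined construct such as
//   #pragma omp target teams distribute parallel for
//   for (int i = 0; i < n; ++i) { ... }
// is typically lowered to an SPMD-mode kernel, whereas a `target teams` region
// with a nested `parallel` is lowered to GENERIC mode and needs the extra
// master warp accounted for in runTargetTeamRegion() below.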
77 
78 /// Use a single entity to encode a kernel and a set of flags.
79 struct KernelTy {
80   CUfunction Func;
81 
82   // execution mode of kernel
83   // 0 - SPMD mode (without master warp)
84   // 1 - Generic mode (with master warp)
85   int8_t ExecutionMode;
86 
87   /// Maximal number of threads per block for this kernel.
88   int MaxThreadsPerBlock = 0;
89 
90   KernelTy(CUfunction _Func, int8_t _ExecutionMode)

91       : Func(_Func), ExecutionMode(_ExecutionMode) {}
92 };
93 
94 /// Device environment data
95 /// Manually sync with the deviceRTL side for now, move to a dedicated header
96 /// file later.
97 struct omptarget_device_environmentTy {
98   int32_t debug_level;
99 };
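
// For now the only field, debug_level, is filled from the
// LIBOMPTARGET_DEVICE_RTL_DEBUG environment variable in loadBinary() and copied
// into the device global `omptarget_device_environment` before any kernel is
// launched.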
100 
101 namespace {
102 bool checkResult(CUresult Err, const char *ErrMsg) {
103   if (Err == CUDA_SUCCESS)
104     return true;
105 
106   REPORT("%s", ErrMsg);
107   CUDA_ERR_STRING(Err);
108   return false;
109 }
110 
111 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
112                CUstream Stream) {
113   CUresult Err =
114       cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
115 
116   if (Err != CUDA_SUCCESS) {
117     REPORT("Error when copying data from device to device. Pointers: src "
118            "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
119            DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
120     CUDA_ERR_STRING(Err);
121     return OFFLOAD_FAIL;
122   }
123 
124   return OFFLOAD_SUCCESS;
125 }
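
// memcpyDtoD() is the device-to-device fallback used by dataExchange() below
// whenever a peer-to-peer copy is unavailable or fails; it is asynchronous with
// respect to the host and ordered only by the given stream.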
126 
127 // Structure contains per-device data
128 struct DeviceDataTy {
129   /// List that contains all the kernels.
130   std::list<KernelTy> KernelsList;
131 
132   std::list<FuncOrGblEntryTy> FuncGblEntries;
133 
134   CUcontext Context = nullptr;
135   // Device properties
136   int ThreadsPerBlock = 0;
137   int BlocksPerGrid = 0;
138   int WarpSize = 0;
139   // OpenMP properties
140   int NumTeams = 0;
141   int NumThreads = 0;
142 };
143 
144 class StreamManagerTy {
145   int NumberOfDevices;
146   // The initial size of stream pool
147   int EnvNumInitialStreams;
148   // Per-device stream mutex
149   std::vector<std::unique_ptr<std::mutex>> StreamMtx;
150   // Per-device stream Id indicates the next available stream in the pool
151   std::vector<int> NextStreamId;
152   // Per-device stream pool
153   std::vector<std::vector<CUstream>> StreamPool;
154   // Reference to per-device data
155   std::vector<DeviceDataTy> &DeviceData;
156 
157   // If there is no CUstream left in the pool, we will resize the pool to
158   // allocate more CUstreams. This function should be called while holding the
159   // per-device mutex, and the pool is never shrunk.
160   void resizeStreamPool(const int DeviceId, const size_t NewSize) {
161     std::vector<CUstream> &Pool = StreamPool[DeviceId];
162     const size_t CurrentSize = Pool.size();
163     assert(NewSize > CurrentSize && "new size is not larger than current size");
164 
165     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
166     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
167       // Return early if we cannot switch to the right context, to avoid
168       // creating a bunch of streams that do not belong to the right device.
169       // The offloading will fail later because the selected CUstream is
170       // nullptr.
171       return;
172     }
173 
174     Pool.resize(NewSize, nullptr);
175 
176     for (size_t I = CurrentSize; I < NewSize; ++I) {
177       checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
178                   "Error returned from cuStreamCreate\n");
179     }
180   }
181 
182 public:
183   StreamManagerTy(const int NumberOfDevices,
184                   std::vector<DeviceDataTy> &DeviceData)
185       : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
186         DeviceData(DeviceData) {
187     StreamPool.resize(NumberOfDevices);
188     NextStreamId.resize(NumberOfDevices);
189     StreamMtx.resize(NumberOfDevices);
190 
191     if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
192       EnvNumInitialStreams = std::stoi(EnvStr);
193 
194     // Initialize the next stream id
195     std::fill(NextStreamId.begin(), NextStreamId.end(), 0);
196 
197     // Initialize stream mutex
198     for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
199       Ptr = std::make_unique<std::mutex>();
200   }
201 
202   ~StreamManagerTy() {
203     // Destroy streams
204     for (int I = 0; I < NumberOfDevices; ++I) {
205       checkResult(cuCtxSetCurrent(DeviceData[I].Context),
206                   "Error returned from cuCtxSetCurrent\n");
207 
208       for (CUstream &S : StreamPool[I]) {
209         if (S)
210           checkResult(cuStreamDestroy(S),
211                       "Error returned from cuStreamDestroy\n");
212       }
213     }
214   }
215 
216   // Get a CUstream from the pool. The per-device next stream id always points
217   // to the next available CUstream. That means CUstreams [0, id-1] have been
218   // assigned, and [id, ...] are still available. If there is no CUstream left,
219   // we request more CUstreams from the CUDA RT. Each time a CUstream is
220   // assigned, the id is incremented by one.
221   // xxxxxs+++++++++
222   //      ^
223   //      id
224   // After assignment, the pool becomes the following and s is assigned.
225   // xxxxxs+++++++++
226   //       ^
227   //       id
228   CUstream getStream(const int DeviceId) {
229     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
230     int &Id = NextStreamId[DeviceId];
231     // No CUstream left in the pool, we need to request from CUDA RT
232     if (Id == StreamPool[DeviceId].size()) {
233       // By default we double the stream pool every time
234       resizeStreamPool(DeviceId, Id * 2);
235     }
236     return StreamPool[DeviceId][Id++];
237   }
238 
239   // Return a CUstream back to the pool. As mentioned above, the per-device
240   // next stream id always points to the next available CUstream, so when we
241   // return a CUstream, we first decrease the id and then copy the CUstream
242   // back.
243   // It is worth noting that the order in which streams are returned might
244   // differ from the order in which they were assigned; that is, at some point
245   // there might be two identical CUstreams in the pool.
246   // xxax+a+++++
247   //     ^
248   //     id
249   // However, this does not matter, because they are always on the two sides
250   // of id. The left one will eventually be overwritten by another CUstream.
251   // Therefore, after several executions, the order of the pool might differ
252   // from its initial state.
253   void returnStream(const int DeviceId, CUstream Stream) {
254     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
255     int &Id = NextStreamId[DeviceId];
256     assert(Id > 0 && "Wrong stream ID");
257     StreamPool[DeviceId][--Id] = Stream;
258   }
259 
260   bool initializeDeviceStreamPool(const int DeviceId) {
261     assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");
262 
263     resizeStreamPool(DeviceId, EnvNumInitialStreams);
264 
265     // Check the size of stream pool
266     if (StreamPool[DeviceId].size() != EnvNumInitialStreams)
267       return false;
268 
269     // Check whether each stream is valid
270     for (CUstream &S : StreamPool[DeviceId])
271       if (!S)
272         return false;
273 
274     return true;
275   }
276 };
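
// Typical lifecycle of a stream in this plugin (a sketch of how DeviceRTLTy
// below uses the manager, not an additional API):
//   CUstream S = StreamManager->getStream(DeviceId); // lazily, via AsyncInfo
//   ... enqueue async copies / kernel launches on S ...
//   cuStreamSynchronize(S);
//   StreamManager->returnStream(DeviceId, S);        // done in synchronize()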
277 
278 class DeviceRTLTy {
279   int NumberOfDevices;
280   // OpenMP environment properties
281   int EnvNumTeams;
282   int EnvTeamLimit;
283   // OpenMP requires flags
284   int64_t RequiresFlags;
285 
286   static constexpr const int HardTeamLimit = 1U << 16U; // 64k
287   static constexpr const int HardThreadLimit = 1024;
288   static constexpr const int DefaultNumTeams = 128;
289   static constexpr const int DefaultNumThreads = 128;
290 
291   std::unique_ptr<StreamManagerTy> StreamManager;
292   std::vector<DeviceDataTy> DeviceData;
293   std::vector<CUmodule> Modules;
294 
295   /// A class responsible for interacting with device native runtime library to
296   /// allocate and free memory.
297   class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
298     const int DeviceId;
299     const std::vector<DeviceDataTy> &DeviceData;
300 
301   public:
302     CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
303         : DeviceId(DeviceId), DeviceData(DeviceData) {}
304 
305     void *allocate(size_t Size, void *) override {
306       if (Size == 0)
307         return nullptr;
308 
309       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
310       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
311         return nullptr;
312 
313       CUdeviceptr DevicePtr;
314       Err = cuMemAlloc(&DevicePtr, Size);
315       if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
316         return nullptr;
317 
318       return (void *)DevicePtr;
319     }
320 
321     int free(void *TgtPtr) override {
322       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
323       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
324         return OFFLOAD_FAIL;
325 
326       Err = cuMemFree((CUdeviceptr)TgtPtr);
327       if (!checkResult(Err, "Error returned from cuMemFree\n"))
328         return OFFLOAD_FAIL;
329 
330       return OFFLOAD_SUCCESS;
331     }
332   };
333 
334   /// A vector of device allocators
335   std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
336 
337   /// A vector of memory managers. Since the memory manager is non-copyable and
338   /// non-movable, we wrap each one in a std::unique_ptr.
339   std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
340 
341   /// Whether to use the memory manager
342   bool UseMemoryManager = true;
343 
344   // Record entry point associated with device
345   void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
346     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
347     E.Entries.push_back(entry);
348   }
349 
350   // Return a pointer to the entry associated with the pointer
351   const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
352                                              const void *Addr) const {
353     for (const __tgt_offload_entry &Itr :
354          DeviceData[DeviceId].FuncGblEntries.back().Entries)
355       if (Itr.addr == Addr)
356         return &Itr;
357 
358     return nullptr;
359   }
360 
361   // Return the pointer to the target entries table
362   __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
363     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
364 
365     if (E.Entries.empty())
366       return nullptr;
367 
368     // Update table info according to the entries and return the pointer
369     E.Table.EntriesBegin = E.Entries.data();
370     E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
371 
372     return &E.Table;
373   }
374 
375   // Clear entries table for a device
376   void clearOffloadEntriesTable(const int DeviceId) {
377     DeviceData[DeviceId].FuncGblEntries.emplace_back();
378     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
379     E.Entries.clear();
380     E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
381   }
382 
383   CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
384     assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
385 
386     if (!AsyncInfoPtr->Queue)
387       AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId);
388 
389     return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
390   }
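
  // The stream attached to a __tgt_async_info is acquired lazily on first use
  // and handed back to the pool by synchronize(), so all operations that share
  // one async info object are serialized on the same CUstream.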
391 
392 public:
393   // This class should not be copied
394   DeviceRTLTy(const DeviceRTLTy &) = delete;
395   DeviceRTLTy(DeviceRTLTy &&) = delete;
396 
397   DeviceRTLTy()
398       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
399         RequiresFlags(OMP_REQ_UNDEFINED) {
400 
401     DP("Start initializing CUDA\n");
402 
403     CUresult Err = cuInit(0);
404     if (Err == CUDA_ERROR_INVALID_HANDLE) {
405       // Can't call cuGetErrorString if dlsym failed
406       DP("Failed to load CUDA shared library\n");
407       return;
408     }
409     if (!checkResult(Err, "Error returned from cuInit\n")) {
410       return;
411     }
412 
413     Err = cuDeviceGetCount(&NumberOfDevices);
414     if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
415       return;
416 
417     if (NumberOfDevices == 0) {
418       DP("There are no devices supporting CUDA.\n");
419       return;
420     }
421 
422     DeviceData.resize(NumberOfDevices);
423 
424     // Get environment variables regarding teams
425     if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
426       // OMP_TEAM_LIMIT has been set
427       EnvTeamLimit = std::stoi(EnvStr);
428       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
429     }
430     if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
431       // OMP_NUM_TEAMS has been set
432       EnvNumTeams = std::stoi(EnvStr);
433       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
434     }
435 
436     StreamManager =
437         std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
438 
439     for (int I = 0; I < NumberOfDevices; ++I)
440       DeviceAllocators.emplace_back(I, DeviceData);
441 
442     // Get the size threshold from environment variable
443     std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
444     UseMemoryManager = Res.second;
445     size_t MemoryManagerThreshold = Res.first;
446 
447     if (UseMemoryManager)
448       for (int I = 0; I < NumberOfDevices; ++I)
449         MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
450             DeviceAllocators[I], MemoryManagerThreshold));
451   }
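
  // Environment knobs read above (values are illustrative only):
  //   OMP_TEAM_LIMIT=4096                   caps blocks per grid in initDevice()
  //   OMP_NUM_TEAMS=256                     default team count per device
  //   LIBOMPTARGET_NUM_INITIAL_STREAMS=64   initial CUstream pool size
  // plus the memory-manager size threshold queried through
  // MemoryManagerTy::getSizeThresholdFromEnv().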
452 
453   ~DeviceRTLTy() {
454     // We destruct the memory managers first, in case the data they depend on
455     // is destroyed before they are.
456     for (auto &M : MemoryManagers)
457       M.release();
458 
459     StreamManager = nullptr;
460 
461     for (CUmodule &M : Modules)
462       // Close module
463       if (M)
464         checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");
465 
466     for (DeviceDataTy &D : DeviceData) {
467       // Destroy context
468       if (D.Context) {
469         checkResult(cuCtxSetCurrent(D.Context),
470                     "Error returned from cuCtxSetCurrent\n");
471         CUdevice Device;
472         checkResult(cuCtxGetDevice(&Device),
473                     "Error returned from cuCtxGetDevice\n");
474         checkResult(cuDevicePrimaryCtxRelease(Device),
475                     "Error returned from cuDevicePrimaryCtxRelease\n");
476       }
477     }
478   }
479 
480   // Check whether a given DeviceId is valid
481   bool isValidDeviceId(const int DeviceId) const {
482     return DeviceId >= 0 && DeviceId < NumberOfDevices;
483   }
484 
485   int getNumOfDevices() const { return NumberOfDevices; }
486 
487   void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
488 
489   int initDevice(const int DeviceId) {
490     CUdevice Device;
491 
492     DP("Getting device %d\n", DeviceId);
493     CUresult Err = cuDeviceGet(&Device, DeviceId);
494     if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
495       return OFFLOAD_FAIL;
496 
497     // Query the current flags of the primary context and set its flags if
498     // it is inactive
499     unsigned int FormerPrimaryCtxFlags = 0;
500     int FormerPrimaryCtxIsActive = 0;
501     Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
502                                      &FormerPrimaryCtxIsActive);
503     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
504       return OFFLOAD_FAIL;
505 
506     if (FormerPrimaryCtxIsActive) {
507       DP("The primary context is active, no change to its flags\n");
508       if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
509           CU_CTX_SCHED_BLOCKING_SYNC)
510         DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
511     } else {
512       DP("The primary context is inactive, set its flags to "
513          "CU_CTX_SCHED_BLOCKING_SYNC\n");
514       Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
515       if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
516         return OFFLOAD_FAIL;
517     }
518 
519     // Retain the per device primary context and save it to use whenever this
520     // device is selected.
521     Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
522     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
523       return OFFLOAD_FAIL;
524 
525     Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
526     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
527       return OFFLOAD_FAIL;
528 
529     // Initialize stream pool
530     if (!StreamManager->initializeDeviceStreamPool(DeviceId))
531       return OFFLOAD_FAIL;
532 
533     // Query attributes to determine number of threads/block and blocks/grid.
534     int MaxGridDimX;
535     Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
536                                Device);
537     if (Err != CUDA_SUCCESS) {
538       DP("Error getting max grid dimension, use default value %d\n",
539          DeviceRTLTy::DefaultNumTeams);
540       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
541     } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
542       DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
543       DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
544     } else {
545       DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
546          "at the hard limit\n",
547          MaxGridDimX, DeviceRTLTy::HardTeamLimit);
548       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
549     }
550 
551     // We are only exploiting threads along the x axis.
552     int MaxBlockDimX;
553     Err = cuDeviceGetAttribute(&MaxBlockDimX,
554                                CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
555     if (Err != CUDA_SUCCESS) {
556       DP("Error getting max block dimension, use default value %d\n",
557          DeviceRTLTy::DefaultNumThreads);
558       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
559     } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
560       DP("Using %d CUDA threads per block\n", MaxBlockDimX);
561       DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
562     } else {
563       DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
564          "capping at the hard limit\n",
565          MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
566       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
567     }
568 
569     // Get and set warp size
570     int WarpSize;
571     Err =
572         cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
573     if (Err != CUDA_SUCCESS) {
574       DP("Error getting warp size, assume default value 32\n");
575       DeviceData[DeviceId].WarpSize = 32;
576     } else {
577       DP("Using warp size %d\n", WarpSize);
578       DeviceData[DeviceId].WarpSize = WarpSize;
579     }
580 
581     // Adjust teams to the env variables
582     if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
583       DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
584          EnvTeamLimit);
585       DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
586     }
587 
588     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
589          "Device supports up to %d CUDA blocks and %d threads with a "
590          "warp size of %d\n",
591          DeviceData[DeviceId].BlocksPerGrid,
592          DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
593 
594     // Set default number of teams
595     if (EnvNumTeams > 0) {
596       DP("Default number of teams set according to environment %d\n",
597          EnvNumTeams);
598       DeviceData[DeviceId].NumTeams = EnvNumTeams;
599     } else {
600       DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
601       DP("Default number of teams set according to library's default %d\n",
602          DeviceRTLTy::DefaultNumTeams);
603     }
604 
605     if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
606       DP("Default number of teams exceeds device limit, capping at %d\n",
607          DeviceData[DeviceId].BlocksPerGrid);
608       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
609     }
610 
611     // Set default number of threads
612     DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
613     DP("Default number of threads set according to library's default %d\n",
614        DeviceRTLTy::DefaultNumThreads);
615     if (DeviceData[DeviceId].NumThreads >
616         DeviceData[DeviceId].ThreadsPerBlock) {
617       DP("Default number of threads exceeds device limit, capping at %d\n",
618          DeviceData[DeviceId].ThreadsPerBlock);
619       DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
620     }
621 
622     return OFFLOAD_SUCCESS;
623   }
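
  // The values computed above (BlocksPerGrid, ThreadsPerBlock, NumTeams and
  // NumThreads) are the per-device defaults and clamps that
  // runTargetTeamRegion() consults when it picks a launch geometry.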
624 
625   __tgt_target_table *loadBinary(const int DeviceId,
626                                  const __tgt_device_image *Image) {
627     // Set the context we are using
628     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
629     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
630       return nullptr;
631 
632     // Clear the offload table as we are going to create a new one.
633     clearOffloadEntriesTable(DeviceId);
634 
635     // Create the module and extract the function pointers.
636     CUmodule Module;
637     DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
638     Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
639     if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
640       return nullptr;
641 
642     DP("CUDA module successfully loaded!\n");
643 
644     Modules.push_back(Module);
645 
646     // Find the symbols in the module by name.
647     const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
648     const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
649 
650     std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
651     for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
652       if (!E->addr) {
653         // We return nullptr when something like this happens; the host should
654         // always have something in the address field to uniquely identify the
655         // target region.
656         DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
657         return nullptr;
658       }
659 
660       if (E->size) {
661         __tgt_offload_entry Entry = *E;
662         CUdeviceptr CUPtr;
663         size_t CUSize;
664         Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
665         // We keep this style here because we need the name
666         if (Err != CUDA_SUCCESS) {
667           REPORT("Loading global '%s' Failed\n", E->name);
668           CUDA_ERR_STRING(Err);
669           return nullptr;
670         }
671 
672         if (CUSize != E->size) {
673           DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
674              CUSize, E->size);
675           return nullptr;
676         }
677 
678         DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
679            DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
680 
681         Entry.addr = (void *)(CUPtr);
682 
683         // Note: In the current implementation declare target variables
684         // can either be link or to. This means that once unified
685         // memory is activated via the requires directive, the variable
686         // can be used directly from the host in both cases.
687         // TODO: when variables types other than to or link are added,
688         // the below condition should be changed to explicitly
689         // check for to and link variables types:
690         // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
691         // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
692         if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
693           // If unified memory is present any target link or to variables
694           // can access host addresses directly. There is no longer a
695           // need for device copies.
696           cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
697           DP("Copy linked variable host address (" DPxMOD
698              ") to device address (" DPxMOD ")\n",
699              DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
700         }
701 
702         addOffloadEntry(DeviceId, Entry);
703 
704         continue;
705       }
706 
707       CUfunction Func;
708       Err = cuModuleGetFunction(&Func, Module, E->name);
709       // We keep this style here because we need the name
710       if (Err != CUDA_SUCCESS) {
711         REPORT("Loading '%s' Failed\n", E->name);
712         CUDA_ERR_STRING(Err);
713         return nullptr;
714       }
715 
716       DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
717          DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
718 
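      // Convention assumed here: for a kernel entry named `foo`, the device
      // image may also carry an int8_t global named `foo_exec_mode` whose value
      // selects SPMD (0) or Generic (1) execution; if the symbol is missing we
      // fall back to GENERIC below.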
719       // default value GENERIC (in case symbol is missing from cubin file)
720       int8_t ExecModeVal = ExecutionModeType::GENERIC;
721       std::string ExecModeNameStr(E->name);
722       ExecModeNameStr += "_exec_mode";
723       const char *ExecModeName = ExecModeNameStr.c_str();
724 
725       CUdeviceptr ExecModePtr;
726       size_t CUSize;
727       Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
728       if (Err == CUDA_SUCCESS) {
729         if (CUSize != sizeof(int8_t)) {
730           DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
731              ExecModeName, CUSize, sizeof(int8_t));
732           return nullptr;
733         }
734 
735         Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
736         if (Err != CUDA_SUCCESS) {
737           REPORT("Error when copying data from device to host. Pointers: "
738                  "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
739                  DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
740           CUDA_ERR_STRING(Err);
741           return nullptr;
742         }
743 
744         if (ExecModeVal < 0 || ExecModeVal > 1) {
745           DP("Error wrong exec_mode value specified in cubin file: %d\n",
746              ExecModeVal);
747           return nullptr;
748         }
749       } else {
750         REPORT("Loading global exec_mode '%s' - symbol missing, using default "
751                "value GENERIC (1)\n",
752                ExecModeName);
753         CUDA_ERR_STRING(Err);
754       }
755 
756       KernelsList.emplace_back(Func, ExecModeVal);
757 
758       __tgt_offload_entry Entry = *E;
759       Entry.addr = &KernelsList.back();
760       addOffloadEntry(DeviceId, Entry);
761     }
762 
763     // send device environment data to the device
764     {
765       omptarget_device_environmentTy DeviceEnv{0};
766 
767 #ifdef OMPTARGET_DEBUG
768       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
769         DeviceEnv.debug_level = std::stoi(EnvStr);
770 #endif
771 
772       const char *DeviceEnvName = "omptarget_device_environment";
773       CUdeviceptr DeviceEnvPtr;
774       size_t CUSize;
775 
776       Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
777       if (Err == CUDA_SUCCESS) {
778         if (CUSize != sizeof(DeviceEnv)) {
779           REPORT(
780               "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
781               DeviceEnvName, CUSize, sizeof(DeviceEnv));
782           CUDA_ERR_STRING(Err);
783           return nullptr;
784         }
785 
786         Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
787         if (Err != CUDA_SUCCESS) {
788           REPORT("Error when copying data from host to device. Pointers: "
789                  "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
790                  DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
791           CUDA_ERR_STRING(Err);
792           return nullptr;
793         }
794 
795         DP("Sending global device environment data %zu bytes\n", CUSize);
796       } else {
797         DP("Finding global device environment '%s' - symbol missing.\n",
798            DeviceEnvName);
799         DP("Continue, considering this is a device RTL which does not accept "
800            "environment setting.\n");
801       }
802     }
803 
804     return getOffloadEntriesTable(DeviceId);
805   }
806 
807   void *dataAlloc(const int DeviceId, const int64_t Size) {
808     if (UseMemoryManager)
809       return MemoryManagers[DeviceId]->allocate(Size, nullptr);
810 
811     return DeviceAllocators[DeviceId].allocate(Size, nullptr);
812   }
813 
814   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
815                  const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
816     assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
817 
818     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
819     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
820       return OFFLOAD_FAIL;
821 
822     CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
823 
824     Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
825     if (Err != CUDA_SUCCESS) {
826       REPORT("Error when copying data from host to device. Pointers: host "
827              "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
828              DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
829       CUDA_ERR_STRING(Err);
830       return OFFLOAD_FAIL;
831     }
832 
833     return OFFLOAD_SUCCESS;
834   }
835 
836   int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
837                    const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
838     assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
839 
840     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
841     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
842       return OFFLOAD_FAIL;
843 
844     CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
845 
846     Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
847     if (Err != CUDA_SUCCESS) {
848       REPORT("Error when copying data from device to host. Pointers: host "
849              "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
850              DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
851       CUDA_ERR_STRING(Err);
852       return OFFLOAD_FAIL;
853     }
854 
855     return OFFLOAD_SUCCESS;
856   }
857 
858   int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
859                    int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
860     assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
861 
862     CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
863     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
864       return OFFLOAD_FAIL;
865 
866     CUstream Stream = getStream(SrcDevId, AsyncInfoPtr);
867 
868     // If these are two different devices, we try a peer-to-peer copy first
869     if (SrcDevId != DstDevId) {
870       int CanAccessPeer = 0;
871       Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
872       if (Err != CUDA_SUCCESS) {
873         REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
874                ", dst = %" PRId32 "\n",
875                SrcDevId, DstDevId);
876         CUDA_ERR_STRING(Err);
877         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
878       }
879 
880       if (!CanAccessPeer) {
881         DP("P2P memcpy not supported so falling back to D2D memcpy\n");
882         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
883       }
884 
885       Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
886       if (Err != CUDA_SUCCESS) {
887         REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
888                ", dst = %" PRId32 "\n",
889                SrcDevId, DstDevId);
890         CUDA_ERR_STRING(Err);
891         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
892       }
893 
894       Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
895                               (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
896                               Size, Stream);
897       if (Err == CUDA_SUCCESS)
898         return OFFLOAD_SUCCESS;
899 
900       REPORT("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
901              ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32
902              "\n",
903              DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
904       CUDA_ERR_STRING(Err);
905     }
906 
907     return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
908   }
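
  // dataExchange() thus degrades gracefully: peer-access query, then
  // cuCtxEnablePeerAccess(), then cuMemcpyPeerAsync(), and on any failure it
  // falls back to a plain device-to-device copy on the source context's stream.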
909 
910   int dataDelete(const int DeviceId, void *TgtPtr) {
911     if (UseMemoryManager)
912       return MemoryManagers[DeviceId]->free(TgtPtr);
913 
914     return DeviceAllocators[DeviceId].free(TgtPtr);
915   }
916 
917   int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
918                           ptrdiff_t *TgtOffsets, const int ArgNum,
919                           const int TeamNum, const int ThreadLimit,
920                           const unsigned int LoopTripCount,
921                           __tgt_async_info *AsyncInfo) const {
922     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
923     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
924       return OFFLOAD_FAIL;
925 
926     // All args are references.
927     std::vector<void *> Args(ArgNum);
928     std::vector<void *> Ptrs(ArgNum);
929 
930     for (int I = 0; I < ArgNum; ++I) {
931       Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
932       Args[I] = &Ptrs[I];
933     }
934 
935     KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
936 
937     int CudaThreadsPerBlock;
938     if (ThreadLimit > 0) {
939       DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
940       CudaThreadsPerBlock = ThreadLimit;
941       // Add master warp if necessary
942       if (KernelInfo->ExecutionMode == GENERIC) {
943         DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
944         CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
945       }
946     } else {
947       DP("Setting CUDA threads per block to default %d\n",
948          DeviceData[DeviceId].NumThreads);
949       CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
950     }
951 
952     if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
953       DP("Threads per block capped at device limit %d\n",
954          DeviceData[DeviceId].ThreadsPerBlock);
955       CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
956     }
957 
958     if (!KernelInfo->MaxThreadsPerBlock) {
959       Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
960                                CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
961                                KernelInfo->Func);
962       if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
963         return OFFLOAD_FAIL;
964     }
965 
966     if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
967       DP("Threads per block capped at kernel limit %d\n",
968          KernelInfo->MaxThreadsPerBlock);
969       CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
970     }
971 
972     unsigned int CudaBlocksPerGrid;
973     if (TeamNum <= 0) {
974       if (LoopTripCount > 0 && EnvNumTeams < 0) {
975         if (KernelInfo->ExecutionMode == SPMD) {
976           // We have a combined construct, i.e. `target teams distribute
977           // parallel for [simd]`. We launch as many teams as needed so that
978           // each thread will execute one iteration of the loop. Round up to
979           // the nearest integer.
980           CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
981         } else {
982           // If we reach this point, then we have a non-combined construct, i.e.
983           // `teams distribute` with a nested `parallel for` and each team is
984           // assigned one iteration of the `distribute` loop. E.g.:
985           //
986           // #pragma omp target teams distribute
987           // for(...loop_tripcount...) {
988           //   #pragma omp parallel for
989           //   for(...) {}
990           // }
991           //
992           // Threads within a team will execute the iterations of the `parallel`
993           // loop.
994           CudaBlocksPerGrid = LoopTripCount;
995         }
996         DP("Using %d teams due to loop trip count %" PRIu32
997            " and number of threads per block %d\n",
998            CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
999       } else {
1000         DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
1001         CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
1002       }
1003     } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
1004       DP("Capping number of teams to team limit %d\n",
1005          DeviceData[DeviceId].BlocksPerGrid);
1006       CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
1007     } else {
1008       DP("Using requested number of teams %d\n", TeamNum);
1009       CudaBlocksPerGrid = TeamNum;
1010     }
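
    // Worked example with illustrative numbers: in SPMD mode with
    // LoopTripCount = 10000 and CudaThreadsPerBlock = 128, the formula above
    // gives CudaBlocksPerGrid = (10000 - 1) / 128 + 1 = 79, i.e. just enough
    // blocks for every thread to own at most one loop iteration.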
1011 
1012     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
1013          "Launching kernel %s with %d blocks and %d threads in %s "
1014          "mode\n",
1015          (getOffloadEntry(DeviceId, TgtEntryPtr))
1016              ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
1017              : "(null)",
1018          CudaBlocksPerGrid, CudaThreadsPerBlock,
1019          (KernelInfo->ExecutionMode == SPMD) ? "SPMD" : "Generic");
1020 
1021     CUstream Stream = getStream(DeviceId, AsyncInfo);
1022     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
1023                          /* gridDimZ */ 1, CudaThreadsPerBlock,
1024                          /* blockDimY */ 1, /* blockDimZ */ 1,
1025                          /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
1026     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
1027       return OFFLOAD_FAIL;
1028 
1029     DP("Launch of entry point at " DPxMOD " successful!\n",
1030        DPxPTR(TgtEntryPtr));
1031 
1032     return OFFLOAD_SUCCESS;
1033   }
1034 
1035   int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
1036     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
1037     CUresult Err = cuStreamSynchronize(Stream);
1038     if (Err != CUDA_SUCCESS) {
1039       REPORT("Error when synchronizing stream. stream = " DPxMOD
1040              ", async info ptr = " DPxMOD "\n",
1041              DPxPTR(Stream), DPxPTR(AsyncInfoPtr));
1042       CUDA_ERR_STRING(Err);
1043       return OFFLOAD_FAIL;
1044     }
1045 
1046     // Once the stream is synchronized, return it to stream pool and reset
1047     // async_info. This is to make sure the synchronization only works for its
1048     // own tasks.
1049     StreamManager->returnStream(
1050         DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue));
1051     AsyncInfoPtr->Queue = nullptr;
1052 
1053     return OFFLOAD_SUCCESS;
1054   }
1055 };
1056 
1057 DeviceRTLTy DeviceRTL;
1058 } // namespace
1059 
1060 // Exposed library API function
1061 #ifdef __cplusplus
1062 extern "C" {
1063 #endif
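
// The entry points below implement the plugin interface declared in
// omptargetplugin.h; libomptarget is expected to locate this plugin as a shared
// library at runtime and resolve the __tgt_rtl_* symbols by name, which is why
// they must keep C linkage.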
1064 
1065 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
1066   return elf_check_machine(image, /* EM_CUDA */ 190);
1067 }
1068 
1069 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
1070 
1071 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
1072   DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
1073   DeviceRTL.setRequiresFlag(RequiresFlags);
1074   return RequiresFlags;
1075 }
1076 
1077 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
1078   if (DeviceRTL.isValidDeviceId(src_dev_id) &&
1079       DeviceRTL.isValidDeviceId(dst_dev_id))
1080     return 1;
1081 
1082   return 0;
1083 }
1084 
1085 int32_t __tgt_rtl_init_device(int32_t device_id) {
1086   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1087 
1088   return DeviceRTL.initDevice(device_id);
1089 }
1090 
1091 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
1092                                           __tgt_device_image *image) {
1093   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1094 
1095   return DeviceRTL.loadBinary(device_id, image);
1096 }
1097 
1098 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) {
1099   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1100 
1101   return DeviceRTL.dataAlloc(device_id, size);
1102 }
1103 
1104 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
1105                               int64_t size) {
1106   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1107 
1108   __tgt_async_info async_info;
1109   const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
1110                                                  size, &async_info);
1111   if (rc != OFFLOAD_SUCCESS)
1112     return OFFLOAD_FAIL;
1113 
1114   return __tgt_rtl_synchronize(device_id, &async_info);
1115 }
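
// Pattern used by the synchronous entry points here and below (submit, retrieve,
// exchange, run): create a local __tgt_async_info, call the *_async variant, and
// then block in __tgt_rtl_synchronize on the stream it lazily acquired.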
1116 
1117 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
1118                                     void *hst_ptr, int64_t size,
1119                                     __tgt_async_info *async_info_ptr) {
1120   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1121   assert(async_info_ptr && "async_info_ptr is nullptr");
1122 
1123   return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
1124                               async_info_ptr);
1125 }
1126 
1127 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
1128                                 int64_t size) {
1129   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1130 
1131   __tgt_async_info async_info;
1132   const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
1133                                                    size, &async_info);
1134   if (rc != OFFLOAD_SUCCESS)
1135     return OFFLOAD_FAIL;
1136 
1137   return __tgt_rtl_synchronize(device_id, &async_info);
1138 }
1139 
1140 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
1141                                       void *tgt_ptr, int64_t size,
1142                                       __tgt_async_info *async_info_ptr) {
1143   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1144   assert(async_info_ptr && "async_info_ptr is nullptr");
1145 
1146   return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
1147                                 async_info_ptr);
1148 }
1149 
1150 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
1151                                       int dst_dev_id, void *dst_ptr,
1152                                       int64_t size,
1153                                       __tgt_async_info *async_info_ptr) {
1154   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1155   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1156   assert(async_info_ptr && "async_info_ptr is nullptr");
1157 
1158   return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
1159                                 async_info_ptr);
1160 }
1161 
1162 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
1163                                 int32_t dst_dev_id, void *dst_ptr,
1164                                 int64_t size) {
1165   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1166   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1167 
1168   __tgt_async_info async_info;
1169   const int32_t rc = __tgt_rtl_data_exchange_async(
1170       src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &async_info);
1171   if (rc != OFFLOAD_SUCCESS)
1172     return OFFLOAD_FAIL;
1173 
1174   return __tgt_rtl_synchronize(src_dev_id, &async_info);
1175 }
1176 
1177 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
1178   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1179 
1180   return DeviceRTL.dataDelete(device_id, tgt_ptr);
1181 }
1182 
1183 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
1184                                          void **tgt_args,
1185                                          ptrdiff_t *tgt_offsets,
1186                                          int32_t arg_num, int32_t team_num,
1187                                          int32_t thread_limit,
1188                                          uint64_t loop_tripcount) {
1189   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1190 
1191   __tgt_async_info async_info;
1192   const int32_t rc = __tgt_rtl_run_target_team_region_async(
1193       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1194       thread_limit, loop_tripcount, &async_info);
1195   if (rc != OFFLOAD_SUCCESS)
1196     return OFFLOAD_FAIL;
1197 
1198   return __tgt_rtl_synchronize(device_id, &async_info);
1199 }
1200 
1201 int32_t __tgt_rtl_run_target_team_region_async(
1202     int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
1203     ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
1204     int32_t thread_limit, uint64_t loop_tripcount,
1205     __tgt_async_info *async_info_ptr) {
1206   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1207 
1208   return DeviceRTL.runTargetTeamRegion(
1209       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1210       thread_limit, loop_tripcount, async_info_ptr);
1211 }
1212 
1213 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
1214                                     void **tgt_args, ptrdiff_t *tgt_offsets,
1215                                     int32_t arg_num) {
1216   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1217 
1218   __tgt_async_info async_info;
1219   const int32_t rc = __tgt_rtl_run_target_region_async(
1220       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info);
1221   if (rc != OFFLOAD_SUCCESS)
1222     return OFFLOAD_FAIL;
1223 
1224   return __tgt_rtl_synchronize(device_id, &async_info);
1225 }
1226 
1227 int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
1228                                           void *tgt_entry_ptr, void **tgt_args,
1229                                           ptrdiff_t *tgt_offsets,
1230                                           int32_t arg_num,
1231                                           __tgt_async_info *async_info_ptr) {
1232   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1233 
1234   return __tgt_rtl_run_target_team_region_async(
1235       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
1236       /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
1237       async_info_ptr);
1238 }
1239 
1240 int32_t __tgt_rtl_synchronize(int32_t device_id,
1241                               __tgt_async_info *async_info_ptr) {
1242   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1243   assert(async_info_ptr && "async_info_ptr is nullptr");
1244   assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
1245 
1246   return DeviceRTL.synchronize(device_id, async_info_ptr);
1247 }
1248 
1249 #ifdef __cplusplus
1250 }
1251 #endif
1252