//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cstddef>
#include <cstdlib>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include "omptargetplugin.h"

#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif

#ifdef OMPTARGET_DEBUG
static int DebugLevel = 0;

#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
#define DP(...)                                                                \
  do {                                                                         \
    if (DebugLevel > 0) {                                                      \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__);              \
    }                                                                          \
  } while (false)

// Utility for retrieving and printing CUDA error string.
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (DebugLevel > 0) {                                                      \
      const char *errStr;                                                      \
      cuGetErrorString(err, &errStr);                                          \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", "CUDA error is: %s\n",     \
             errStr);                                                          \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define DP(...) {}
#define CUDA_ERR_STRING(err) {}
#endif // OMPTARGET_DEBUG
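
// A hedged usage sketch of the two macros above (illustrative only; DevPtr
// and Size are hypothetical locals, not names defined at this point):
//
//   CUresult Err = cuMemAlloc(&DevPtr, Size);
//   if (Err != CUDA_SUCCESS) {
//     DP("cuMemAlloc of %" PRId64 " bytes failed\n", Size);
//     CUDA_ERR_STRING(Err);
//   }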

#include "../../common/elf_common.c"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD,    // constructors, destructors,
           // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};
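
// A hedged sketch of the matching declaration expected on the deviceRTL side
// (the global's name mirrors DeviceEnvName used in loadBinary below; the
// storage qualifier is an assumption):
//
//   __device__ omptarget_device_environmentTy omptarget_device_environment;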

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;

namespace {
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  DP("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

// Structure contains per-device data
struct DeviceDataTy {
  std::list<FuncOrGblEntryTy> FuncGblEntries;
  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

class StreamManagerTy {
  int NumberOfDevices;
  // The initial size of stream pool
  int EnvNumInitialStreams;
  // Per-device stream mutex
  std::vector<std::unique_ptr<std::mutex>> StreamMtx;
  // Per-device stream Id indicates the next available stream in the pool
  std::vector<int> NextStreamId;
  // Per-device stream pool
  std::vector<std::vector<CUstream>> StreamPool;
  // Reference to per-device data
  std::vector<DeviceDataTy> &DeviceData;

  // If there is no CUstream left in the pool, resize the pool to allocate
  // more CUstreams. This function should be called with the device mutex
  // held; the pool is never shrunk.
  void resizeStreamPool(const int DeviceId, const size_t NewSize) {
    std::vector<CUstream> &Pool = StreamPool[DeviceId];
    const size_t CurrentSize = Pool.size();
    assert(NewSize > CurrentSize && "new size is not larger than current size");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // Return early if we cannot switch to the right context, so that we do
      // not create a batch of streams bound to the wrong device. Offloading
      // will fail later because the selected CUstream is nullptr.
      return;
    }

    Pool.resize(NewSize, nullptr);

    for (size_t I = CurrentSize; I < NewSize; ++I) {
      checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
                  "Error returned from cuStreamCreate\n");
    }
  }

public:
  StreamManagerTy(const int NumberOfDevices,
                  std::vector<DeviceDataTy> &DeviceData)
      : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
        DeviceData(DeviceData) {
    StreamPool.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    StreamMtx.resize(NumberOfDevices);

    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
      EnvNumInitialStreams = std::stoi(EnvStr);

    // Initialize the next stream id
    std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

    // Initialize stream mutex
    for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
      Ptr = std::make_unique<std::mutex>();
  }

  ~StreamManagerTy() {
    // Destroy streams
    for (int I = 0; I < NumberOfDevices; ++I) {
      checkResult(cuCtxSetCurrent(DeviceData[I].Context),
                  "Error returned from cuCtxSetCurrent\n");

      for (CUstream &S : StreamPool[I]) {
        if (S)
          checkResult(cuStreamDestroy(S),
                      "Error returned from cuStreamDestroy\n");
      }
    }
  }

  // Get a CUstream from the pool. The per-device next stream id always points
  // to the next available CUstream. That means CUstreams [0, id-1] have been
  // assigned, and [id, ...] are still available. If there is no CUstream left,
  // we request more from the CUDA RT. Each time a CUstream is assigned, the id
  // increases by one.
  // xxxxxs+++++++++
  //      ^
  //      id
  // After assignment, the pool becomes the following and s is assigned.
  // xxxxxs+++++++++
  //       ^
  //       id
  CUstream getStream(const int DeviceId) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // No CUstream left in the pool, we need to request from CUDA RT
    if (Id == StreamPool[DeviceId].size()) {
      // By default we double the stream pool every time
      resizeStreamPool(DeviceId, Id * 2);
    }
    return StreamPool[DeviceId][Id++];
  }

  // Return a CUstream back to the pool. As mentioned above, the per-device
  // next stream id always points to the next available CUstream, so when we
  // return a CUstream, we first decrease the id and then copy the CUstream
  // back.
  // It is worth noting that the order in which streams are returned may
  // differ from the order in which they were assigned; that is, at some point
  // there may be two identical CUstreams.
  // xxax+a+++++
  //     ^
  //     id
  // However, this does not matter, because they are always on the two sides
  // of id. The left one will eventually be overwritten by another CUstream.
  // Therefore, after several executions, the order of the pool may differ
  // from its initial state.
  void returnStream(const int DeviceId, CUstream Stream) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    assert(Id > 0 && "Wrong stream ID");
    StreamPool[DeviceId][--Id] = Stream;
  }

  bool initializeDeviceStreamPool(const int DeviceId) {
    assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");

    resizeStreamPool(DeviceId, EnvNumInitialStreams);

    // Check the size of stream pool
    if (StreamPool[DeviceId].size() != EnvNumInitialStreams)
      return false;

    // Check whether each stream is valid
    for (CUstream &S : StreamPool[DeviceId])
      if (!S)
        return false;

    return true;
  }
};
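
// A minimal usage sketch of the pool above (illustrative; `Manager`,
// `NumberOfDevices`, `DeviceData`, and `DeviceId` stand for whatever the
// caller has in scope):
//
//   StreamManagerTy Manager(NumberOfDevices, DeviceData);
//   Manager.initializeDeviceStreamPool(DeviceId);
//   CUstream S = Manager.getStream(DeviceId); // borrow a stream
//   // ... enqueue asynchronous work on S and synchronize it ...
//   Manager.returnStream(DeviceId, S);        // give it back afterwards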

class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  std::unique_ptr<StreamManagerTy> StreamManager;
  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with device
  bool findOffloadEntry(const int DeviceId, const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return true;

    return false;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    if (!AsyncInfoPtr->Queue)
      AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId);

    return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
  }

public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        RequiresFlags(OMP_REQ_UNDEFINED) {
#ifdef OMPTARGET_DEBUG
    if (const char *EnvStr = getenv("LIBOMPTARGET_DEBUG"))
      DebugLevel = std::stoi(EnvStr);
#endif // OMPTARGET_DEBUG

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }

    StreamManager =
        std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
  }

  ~DeviceRTLTy() {
    // Destroy the stream manager first, so that its streams are released
    // before the contexts they belong to are destroyed.
    StreamManager = nullptr;

    for (CUmodule &M : Modules)
      // Close module
      if (M)
        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context) {
        checkResult(cuCtxSetCurrent(D.Context),
                    "Error returned from cuCtxSetCurrent\n");
        CUdevice Device;
        checkResult(cuCtxGetDevice(&Device),
                    "Error returned from cuCtxGetDevice\n");
        checkResult(cuDevicePrimaryCtxRelease(Device),
                    "Error returned from cuDevicePrimaryCtxRelease\n");
      }
    }
  }

  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    // Query the current flags of the primary context and set its flags if
    // it is inactive
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
      return OFFLOAD_FAIL;

    if (FormerPrimaryCtxIsActive) {
      DP("The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      DP("The primary context is inactive, set its flags to "
         "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
        return OFFLOAD_FAIL;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected.
    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize stream pool
    if (!StreamManager->initializeDeviceStreamPool(DeviceId))
      return OFFLOAD_FAIL;

    // Query attributes to determine number of threads/block and blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    } else {
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n",
         MaxGridDimX, DeviceRTLTy::HardTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
    } else {
      DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
         "capping at the hard limit\n",
         MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
    }

    // Get and set warp size
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assume default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the env variables
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
       DeviceData[DeviceId].BlocksPerGrid, DeviceData[DeviceId].ThreadsPerBlock,
       DeviceData[DeviceId].WarpSize);

    // Set default number of teams
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
      DP("Default number of teams set according to library's default %d\n",
         DeviceRTLTy::DefaultNumTeams);
    }

    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Default number of teams exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
    }

    // Set default number of threads
    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
    DP("Default number of threads set according to library's default %d\n",
       DeviceRTLTy::DefaultNumThreads);
    if (DeviceData[DeviceId].NumThreads >
        DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Default number of threads exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
    }

    return OFFLOAD_SUCCESS;
  }

  __tgt_target_table *loadBinary(const int DeviceId,
                                 const __tgt_device_image *Image) {
    // Set the context we are using
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return nullptr;

    // Clear the offload table as we are going to create a new one.
    clearOffloadEntriesTable(DeviceId);

    // Create the module and extract the function pointers.
    CUmodule Module;
    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
    Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
      return nullptr;

    DP("CUDA module successfully loaded!\n");

    Modules.push_back(Module);

    // Find the symbols in the module by name.
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // We return nullptr when something like this happens; the host
        // should always have something at the address to uniquely identify
        // the target region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name
        if (Err != CUDA_SUCCESS) {
          DP("Loading global '%s' (Failed)\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variable types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variable types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }
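
        // A hedged illustration (not code from this file): host-side
        // declarations that produce such global entries look roughly like
        //
        //   int X;
        //   #pragma omp declare target link(X)
        //   // or: #pragma omp declare target to(X)
        //
        // and under unified shared memory both forms end up reading the
        // host copy directly.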

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name
      if (Err != CUDA_SUCCESS) {
        DP("Loading '%s' (Failed)\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

      // default value GENERIC (in case symbol is missing from cubin file)
      int8_t ExecModeVal = ExecutionModeType::GENERIC;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(int8_t)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(int8_t));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          DP("Error when copying data from device to host. Pointers: "
             "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
             DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (ExecModeVal < 0 || ExecModeVal > 1) {
          DP("Error: wrong exec_mode value specified in cubin file: %d\n",
             ExecModeVal);
          return nullptr;
        }
      } else {
        DP("Loading global exec_mode '%s' - symbol missing, using default "
           "value GENERIC (1)\n",
           ExecModeName);
        CUDA_ERR_STRING(Err);
      }

      KernelsList.emplace_back(Func, ExecModeVal);

      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }

    // send device environment data to the device
    {
      omptarget_device_environmentTy DeviceEnv{0};

#ifdef OMPTARGET_DEBUG
      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
        DeviceEnv.debug_level = std::stoi(EnvStr);
#endif

      const char *DeviceEnvName = "omptarget_device_environment";
      CUdeviceptr DeviceEnvPtr;
      size_t CUSize;

      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(DeviceEnv)) {
          DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
             DeviceEnvName, CUSize, sizeof(DeviceEnv));
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
        if (Err != CUDA_SUCCESS) {
          DP("Error when copying data from host to device. Pointers: "
             "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
             DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continuing; this may be a device RTL that does not accept "
           "environment settings.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }

  void *dataAlloc(const int DeviceId, const int64_t Size) const {
    if (Size == 0)
      return nullptr;

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return nullptr;

    CUdeviceptr DevicePtr;
    Err = cuMemAlloc(&DevicePtr, Size);
    if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
      return nullptr;

    return (void *)DevicePtr;
  }

  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfoPtr);

    Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from host to device. Pointers: host = " DPxMOD
         ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfoPtr);

    Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from device to host. Pointers: host = " DPxMOD
         ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

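  // Copy between two device pointers. The fallback ladder below is: query
  // peer access, enable it, try cuMemcpyPeerAsync, and on any failure (or
  // for a same-device copy) fall back to the plain device-to-device
  // memcpyDtoD defined above.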
  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(SrcDevId, AsyncInfoPtr);

    // If these are two distinct devices, try a peer-to-peer copy first
    if (SrcDevId != DstDevId) {
      int CanAccessPeer = 0;
      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
      if (Err != CUDA_SUCCESS) {
        DP("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
           ", dst = %" PRId32 "\n",
           SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      if (!CanAccessPeer) {
        DP("P2P memcpy not supported so fall back to D2D memcpy\n");
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
      if (Err != CUDA_SUCCESS) {
        DP("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
           ", dst = %" PRId32 "\n",
           SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
                              Size, Stream);
      if (Err == CUDA_SUCCESS)
        return OFFLOAD_SUCCESS;

      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
         ", src_id = %" PRId32 ", dst_ptr = " DPxMOD ", dst_id = %" PRId32 "\n",
         DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
      CUDA_ERR_STRING(Err);
    }

    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }

  int dataDelete(const int DeviceId, void *TgtPtr) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    Err = cuMemFree((CUdeviceptr)TgtPtr);
    if (!checkResult(Err, "Error returned from cuMemFree\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

  int runTargetTeamRegion(const int DeviceId, const void *TgtEntryPtr,
                          void **TgtArgs, ptrdiff_t *TgtOffsets,
                          const int ArgNum, const int TeamNum,
                          const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are references.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    const KernelTy *KernelInfo =
        reinterpret_cast<const KernelTy *>(TgtEntryPtr);

    unsigned int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (KernelInfo->ExecutionMode == GENERIC) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    int KernelLimit;
    Err = cuFuncGetAttribute(&KernelLimit,
                             CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                             KernelInfo->Func);
    if (Err == CUDA_SUCCESS && KernelLimit < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n", KernelLimit);
      CudaThreadsPerBlock = KernelLimit;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (KernelInfo->ExecutionMode == SPMD) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch enough teams that each thread
          // executes one iteration of the loop; round up to the nearest
          // integer.
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else {
          // If we reach this point, then we have a non-combined construct,
          // i.e. `teams distribute` with a nested `parallel for`, and each
          // team is assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the
          // `parallel` loop.
          CudaBlocksPerGrid = LoopTripCount;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }
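
    // Worked example (illustrative): for an SPMD kernel with LoopTripCount =
    // 6144 and CudaThreadsPerBlock = 128, the branch above computes
    // CudaBlocksPerGrid = ((6144 - 1) / 128) + 1 = 48, i.e. one loop
    // iteration per CUDA thread.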

    // Run on the device.
    DP("Launch kernel with %d blocks and %d threads\n", CudaBlocksPerGrid,
       CudaThreadsPerBlock);

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }

  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
    CUresult Err = cuStreamSynchronize(Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when synchronizing stream. stream = " DPxMOD
         ", async info ptr = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(AsyncInfoPtr));
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    // Once the stream is synchronized, return it to stream pool and reset
    // async_info. This is to make sure the synchronization only works for its
    // own tasks.
    StreamManager->returnStream(
        DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue));
    AsyncInfoPtr->Queue = nullptr;

    return OFFLOAD_SUCCESS;
  }
};

DeviceRTLTy DeviceRTL;
} // namespace

// Exposed library API functions
#ifdef __cplusplus
extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int32_t dst_dev_id) {
  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
      DeviceRTL.isValidDeviceId(dst_dev_id))
    return 1;

  return 0;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataAlloc(device_id, size);
}

int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
                                                 size, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
                                                   size, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}

int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int32_t dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                async_info_ptr);
}

int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_data_exchange_async(
      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(src_dev_id, &async_info);
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
                                         void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team_num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}

int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}

#ifdef __cplusplus
}
#endif
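
// A hedged sketch of how libomptarget typically drives the asynchronous entry
// points above (illustrative only; `DevId`, `Entry`, `HstBuf`, `Size`, and
// the kernel arguments are hypothetical):
//
//   __tgt_async_info AsyncInfo;
//   void *TgtBuf = __tgt_rtl_data_alloc(DevId, Size, nullptr);
//   __tgt_rtl_data_submit_async(DevId, TgtBuf, HstBuf, Size, &AsyncInfo);
//   __tgt_rtl_run_target_region_async(DevId, Entry, Args, Offsets, ArgNum,
//                                     &AsyncInfo);
//   __tgt_rtl_data_retrieve_async(DevId, HstBuf, TgtBuf, Size, &AsyncInfo);
//   __tgt_rtl_synchronize(DevId, &AsyncInfo); // returns the stream to pool
//   __tgt_rtl_data_delete(DevId, TgtBuf);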