//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include "Debug.h"
#include "omptargetplugin.h"

#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

#include "MemoryManager.h"

// Utility for retrieving and printing the CUDA error string.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        REPORT("CUDA error is: %s\n", errStr);                                 \
      else {                                                                   \
        REPORT("Unresolved CUDA error code: %d\n", err);                       \
        REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
               errStr_status);                                                 \
      }                                                                        \
    } else {                                                                   \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_SUCCESS)                                       \
        REPORT("%s\n", errStr);                                                \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr = nullptr;                                              \
    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
    if (errStr_status == CUDA_SUCCESS)                                         \
      REPORT("%s\n", errStr);                                                  \
  } while (false)
#endif // OMPTARGET_DEBUG
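
// Minimal usage sketch (for illustration only): wrap a CUDA driver API call
// and route any failure through the macro above.
//
//   CUresult Err = cuInit(0);
//   if (Err != CUDA_SUCCESS) {
//     REPORT("Error returned from cuInit\n");
//     CUDA_ERR_STRING(Err);
//   }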

#include "elf_common.h"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD,    // constructors, destructors,
           // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};
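
// The matching definition lives in the CUDA deviceRTL; a sketch of the
// assumed device-side counterpart (not defined in this file):
//
//   __device__ omptarget_device_environmentTy omptarget_device_environment;
//
// loadBinary() below looks this global up by name and overwrites it with the
// host-side settings before any kernel runs.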

namespace {
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  REPORT("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}
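
// checkResult() folds the "report and bail out" pattern into a single call
// site, e.g. (a sketch mirroring the uses below):
//
//   CUresult Err = cuCtxSetCurrent(Context);
//   if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
//     return OFFLOAD_FAIL;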

int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    REPORT("Error when copying data from device to device. Pointers: src "
           "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
           DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

// Structure containing per-device data
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  std::list<FuncOrGblEntryTy> FuncGblEntries;

  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

class StreamManagerTy {
  int NumberOfDevices;
  // The initial size of the stream pool
  int EnvNumInitialStreams;
  // Per-device stream mutex
  std::vector<std::unique_ptr<std::mutex>> StreamMtx;
  // Per-device stream id; indicates the next available stream in the pool
  std::vector<int> NextStreamId;
  // Per-device stream pool
  std::vector<std::vector<CUstream>> StreamPool;
  // Reference to per-device data
  std::vector<DeviceDataTy> &DeviceData;

  // If there is no CUstream left in the pool, we will resize the pool to
  // allocate more CUstreams. This function should be called with the device
  // mutex held, and it never shrinks the pool.
  void resizeStreamPool(const int DeviceId, const size_t NewSize) {
    std::vector<CUstream> &Pool = StreamPool[DeviceId];
    const size_t CurrentSize = Pool.size();
    assert(NewSize > CurrentSize && "new size is not larger than current size");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // Return early if we cannot switch to the right context, so that we do
      // not create a bunch of streams that belong to the wrong device. The
      // offloading will fail later because the selected CUstream is nullptr.
      return;
    }

    Pool.resize(NewSize, nullptr);

    for (size_t I = CurrentSize; I < NewSize; ++I) {
      checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
                  "Error returned from cuStreamCreate\n");
    }
  }

public:
  StreamManagerTy(const int NumberOfDevices,
                  std::vector<DeviceDataTy> &DeviceData)
      : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
        DeviceData(DeviceData) {
    StreamPool.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    StreamMtx.resize(NumberOfDevices);

    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
      EnvNumInitialStreams = std::stoi(EnvStr);

    // Initialize the next stream id
    std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

    // Initialize stream mutexes
    for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
      Ptr = std::make_unique<std::mutex>();
  }

  ~StreamManagerTy() {
    // Destroy streams
    for (int I = 0; I < NumberOfDevices; ++I) {
      checkResult(cuCtxSetCurrent(DeviceData[I].Context),
                  "Error returned from cuCtxSetCurrent\n");

      for (CUstream &S : StreamPool[I]) {
        if (S)
          checkResult(cuStreamDestroy(S),
                      "Error returned from cuStreamDestroy\n");
      }
    }
  }

  // Get a CUstream from the pool. The per-device next stream id always points
  // to the next available CUstream. That means CUstreams [0, id-1] have been
  // assigned, and [id, ...] are still available. If there is no CUstream left,
  // we will request more CUstreams from the CUDA RT. Each time a CUstream is
  // assigned, the id increases by one.
  // xxxxxs+++++++++
  //      ^
  //      id
  // After assignment, the pool becomes the following and s is assigned.
  // xxxxxs+++++++++
  //       ^
  //       id
  CUstream getStream(const int DeviceId) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // No CUstream left in the pool, we need to request more from the CUDA RT
    if (Id == static_cast<int>(StreamPool[DeviceId].size())) {
      // By default we double the stream pool every time
      resizeStreamPool(DeviceId, Id * 2);
    }
    return StreamPool[DeviceId][Id++];
  }

  // Return a CUstream back to the pool. As mentioned above, the per-device
  // next stream id always points to the next available CUstream, so when we
  // return a CUstream, we need to first decrease the id, and then copy the
  // CUstream back.
  // It is worth noting that the order in which streams are returned might
  // differ from the order in which they were assigned; that is to say, at
  // some point there might be two identical CUstreams:
  // xxax+a+++++
  //     ^
  //     id
  // However, it doesn't matter, because they're always on the two sides of
  // id. The left one will in the end be overwritten by another CUstream.
  // Therefore, after several executions, the order of the pool might differ
  // from its initial state.
  void returnStream(const int DeviceId, CUstream Stream) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    assert(Id > 0 && "Wrong stream ID");
    StreamPool[DeviceId][--Id] = Stream;
  }
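
  // Typical paired usage of getStream()/returnStream(), as done by
  // DeviceRTLTy below (a sketch, not part of the pool itself):
  //
  //   CUstream S = Manager.getStream(DeviceId);
  //   ... enqueue asynchronous work on S ...
  //   cuStreamSynchronize(S);
  //   Manager.returnStream(DeviceId, S);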

  bool initializeDeviceStreamPool(const int DeviceId) {
    assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");

    resizeStreamPool(DeviceId, EnvNumInitialStreams);

    // Check the size of the stream pool
    if (StreamPool[DeviceId].size() !=
        static_cast<size_t>(EnvNumInitialStreams))
      return false;

    // Check whether each stream is valid
    for (CUstream &S : StreamPool[DeviceId])
      if (!S)
        return false;

    return true;
  }
};

class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  std::unique_ptr<StreamManagerTy> StreamManager;
  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  /// A class responsible for interacting with the device native runtime
  /// library to allocate and free memory.
  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
    const int DeviceId;
    const std::vector<DeviceDataTy> &DeviceData;

  public:
    CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
        : DeviceId(DeviceId), DeviceData(DeviceData) {}

    void *allocate(size_t Size, void *) override {
      if (Size == 0)
        return nullptr;

      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return nullptr;

      CUdeviceptr DevicePtr;
      Err = cuMemAlloc(&DevicePtr, Size);
      if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
        return nullptr;

      return (void *)DevicePtr;
    }

    int free(void *TgtPtr) override {
      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return OFFLOAD_FAIL;

      Err = cuMemFree((CUdeviceptr)TgtPtr);
      if (!checkResult(Err, "Error returned from cuMemFree\n"))
        return OFFLOAD_FAIL;

      return OFFLOAD_SUCCESS;
    }
  };

  /// A vector of device allocators
  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;

  /// A vector of memory managers. Since the memory manager is non-copyable
  /// and non-movable, we wrap each one in a std::unique_ptr.
  std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;

  /// Whether to use the memory manager
  bool UseMemoryManager = true;

  // Record an entry point associated with a device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return a pointer to the entry associated with the given address
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
                                             const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return &Itr;

    return nullptr;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear the entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    if (!AsyncInfoPtr->Queue)
      AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId);

    return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
  }

public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        RequiresFlags(OMP_REQ_UNDEFINED) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (Err == CUDA_ERROR_INVALID_HANDLE) {
      // Can't call cuGetErrorString if dlsym failed
      DP("Failed to load CUDA shared library\n");
      return;
    }
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }

    StreamManager =
        std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);

    for (int I = 0; I < NumberOfDevices; ++I)
      DeviceAllocators.emplace_back(I, DeviceData);

    // Get the size threshold from the environment variable
    std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
    UseMemoryManager = Res.second;
    size_t MemoryManagerThreshold = Res.first;

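    // The threshold above is read from the environment by
    // MemoryManagerTy::getSizeThresholdFromEnv(); the variable name
    // (LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD, a size in bytes) is assumed
    // from MemoryManager.h. An illustrative invocation:
    //
    //   $ LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD=1048576 ./offload_app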
    if (UseMemoryManager)
      for (int I = 0; I < NumberOfDevices; ++I)
        MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
            DeviceAllocators[I], MemoryManagerThreshold));
  }

  ~DeviceRTLTy() {
    // Destroy the memory managers first, while the device data they depend on
    // (contexts, allocators) is still alive.
    for (auto &M : MemoryManagers)
      M.reset();

    StreamManager = nullptr;

    for (CUmodule &M : Modules)
      // Close module
      if (M)
        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context) {
        checkResult(cuCtxSetCurrent(D.Context),
                    "Error returned from cuCtxSetCurrent\n");
        CUdevice Device;
        checkResult(cuCtxGetDevice(&Device),
                    "Error returned from cuCtxGetDevice\n");
        checkResult(cuDevicePrimaryCtxRelease(Device),
                    "Error returned from cuDevicePrimaryCtxRelease\n");
      }
    }
  }

  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    // Query the current flags of the primary context and set its flags if
    // it is inactive
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
      return OFFLOAD_FAIL;

    if (FormerPrimaryCtxIsActive) {
      DP("The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      DP("The primary context is inactive, set its flags to "
         "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
        return OFFLOAD_FAIL;
    }

    // Retain the per-device primary context and save it to use whenever this
    // device is selected.
    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize the stream pool
    if (!StreamManager->initializeDeviceStreamPool(DeviceId))
      return OFFLOAD_FAIL;

    // Query attributes to determine the number of threads/block and
    // blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    } else {
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n",
         MaxGridDimX, DeviceRTLTy::HardTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
    } else {
      DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
         "capping at the hard limit\n",
         MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
    }

    // Get and set the warp size
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assume default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the env variables
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device supports up to %d CUDA blocks and %d threads with a "
         "warp size of %d\n",
         DeviceData[DeviceId].BlocksPerGrid,
         DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);

    // Set the default number of teams
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
      DP("Default number of teams set according to library's default %d\n",
         DeviceRTLTy::DefaultNumTeams);
    }

    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Default number of teams exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
    }

    // Set the default number of threads
    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
    DP("Default number of threads set according to library's default %d\n",
       DeviceRTLTy::DefaultNumThreads);
    if (DeviceData[DeviceId].NumThreads >
        DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Default number of threads exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
    }

    return OFFLOAD_SUCCESS;
  }

  __tgt_target_table *loadBinary(const int DeviceId,
                                 const __tgt_device_image *Image) {
    // Set the context we are using
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return nullptr;

    // Clear the offload table as we are going to create a new one.
    clearOffloadEntriesTable(DeviceId);

    // Create the module and extract the function pointers.
    CUmodule Module;
    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
    Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
      return nullptr;

    DP("CUDA module successfully loaded!\n");

    Modules.push_back(Module);

    // Find the symbols in the module by name.
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // We return nullptr when something like this happens; the host should
        // always have something in the address to uniquely identify the
        // target region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name
        if (Err != CUDA_SUCCESS) {
          REPORT("Loading global '%s' failed\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variable types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variable types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name
      if (Err != CUDA_SUCCESS) {
        REPORT("Loading '%s' failed\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

      // Default value is GENERIC (in case the symbol is missing from the
      // cubin file)
      int8_t ExecModeVal = ExecutionModeType::GENERIC;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

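      // Clang emits, alongside each target-region kernel, an int8_t global
      // named by appending '_exec_mode' to the kernel symbol and holding 0
      // (SPMD) or 1 (Generic). For a hypothetical kernel
      // '__omp_offloading_xy_main_l10', the code below would query the global
      // '__omp_offloading_xy_main_l10_exec_mode'.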
      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(int8_t)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(int8_t));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from device to host. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
                 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (ExecModeVal < 0 || ExecModeVal > 1) {
          DP("Error: wrong exec_mode value specified in cubin file: %d\n",
             ExecModeVal);
          return nullptr;
        }
      } else {
        REPORT("Loading global exec_mode '%s' - symbol missing, using default "
               "value GENERIC (1)\n",
               ExecModeName);
        CUDA_ERR_STRING(Err);
      }

      KernelsList.emplace_back(Func, ExecModeVal);

      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }

    // Send the device environment data to the device
    {
      omptarget_device_environmentTy DeviceEnv{0};

#ifdef OMPTARGET_DEBUG
      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
        DeviceEnv.debug_level = std::stoi(EnvStr);
#endif

      const char *DeviceEnvName = "omptarget_device_environment";
      CUdeviceptr DeviceEnvPtr;
      size_t CUSize;

      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from host to device. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continue, considering this is a device RTL which does not accept "
           "environment setting.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }

  void *dataAlloc(const int DeviceId, const int64_t Size) {
    if (UseMemoryManager)
      return MemoryManagers[DeviceId]->allocate(Size, nullptr);

    return DeviceAllocators[DeviceId].allocate(Size, nullptr);
  }

  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfoPtr);

    Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      REPORT("Error when copying data from host to device. Pointers: host "
             "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
             DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfoPtr);

    Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      REPORT("Error when copying data from device to host. Pointers: host "
             "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
             DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(SrcDevId, AsyncInfoPtr);

    // If the copy involves two distinct devices, we try a peer-to-peer copy
    // first
    if (SrcDevId != DstDevId) {
      int CanAccessPeer = 0;
      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      if (!CanAccessPeer) {
        DP("P2P memcpy not supported so fall back to D2D memcpy\n");
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
                              Size, Stream);
      if (Err == CUDA_SUCCESS)
        return OFFLOAD_SUCCESS;

      REPORT("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
             ", src_id = %" PRId32 ", dst_ptr = " DPxMOD ", dst_id = %" PRId32
             "\n",
             DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
      CUDA_ERR_STRING(Err);
    }

    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }

  int dataDelete(const int DeviceId, void *TgtPtr) {
    if (UseMemoryManager)
      return MemoryManagers[DeviceId]->free(TgtPtr);

    return DeviceAllocators[DeviceId].free(TgtPtr);
  }

  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are references.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }
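
    // cuLaunchKernel expects an array of pointers to the individual kernel
    // arguments: each target pointer is rebased by its offset into Ptrs[I],
    // and Args[I] points at that slot, so the driver reads the adjusted
    // pointer value out of Ptrs[I] into the kernel parameter space.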

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (KernelInfo->ExecutionMode == GENERIC) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (KernelInfo->ExecutionMode == SPMD) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch enough teams so that each thread
          // will execute one iteration of the loop; round up to the nearest
          // integer.
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else {
          // If we reach this point, then we have a non-combined construct,
          // i.e. `teams distribute` with a nested `parallel for`, and each
          // team is assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the
          // `parallel` loop.
          CudaBlocksPerGrid = LoopTripCount;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Launching kernel %s with %d blocks and %d threads in %s "
         "mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
         CudaBlocksPerGrid, CudaThreadsPerBlock,
         (KernelInfo->ExecutionMode == SPMD) ? "SPMD" : "Generic");

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }

  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
    CUresult Err = cuStreamSynchronize(Stream);
    if (Err != CUDA_SUCCESS) {
      REPORT("Error when synchronizing stream. stream = " DPxMOD
             ", async info ptr = " DPxMOD "\n",
             DPxPTR(Stream), DPxPTR(AsyncInfoPtr));
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    // Once the stream is synchronized, return it to the stream pool and reset
    // async_info. This is to make sure the synchronization only works for its
    // own tasks.
    StreamManager->returnStream(
        DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue));
    AsyncInfoPtr->Queue = nullptr;

    return OFFLOAD_SUCCESS;
  }
};

DeviceRTLTy DeviceRTL;
} // namespace

// Exposed library API functions
#ifdef __cplusplus
extern "C" {
#endif
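
// libomptarget loads this plugin as a shared library (e.g.
// libomptarget.rtl.cuda.so) and resolves the entry points below by name via
// dlsym, so they must keep C linkage and the exact signatures declared in
// omptargetplugin.h.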

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int32_t dst_dev_id) {
  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
      DeviceRTL.isValidDeviceId(dst_dev_id))
    return 1;

  return 0;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataAlloc(device_id, size);
}

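// The synchronous entry points below are thin wrappers: they run the _async
// variant on a fresh __tgt_async_info and then block in
// __tgt_rtl_synchronize(), which also returns the stream to the pool.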
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
                                                 size, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
                                                   size, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}

int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int32_t dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                async_info_ptr);
}

int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_data_exchange_async(
      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(src_dev_id, &async_info);
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id,
                                         void *tgt_entry_ptr, void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}

int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}

#ifdef __cplusplus
}
#endif