//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string.h>
#include <string>
#include <unordered_map>
#include <vector>

#include "Debug.h"
#include "omptargetplugin.h"

#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

#include "MemoryManager.h"
// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        REPORT("CUDA error is: %s\n", errStr);                                 \
      else {                                                                   \
        REPORT("Unresolved CUDA error code: %d\n", err);                       \
        REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
               errStr_status);                                                 \
      }                                                                        \
    } else {                                                                   \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_SUCCESS)                                       \
        REPORT("%s\n", errStr);                                                \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr = nullptr;                                              \
    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
    if (errStr_status == CUDA_SUCCESS)                                         \
      REPORT("%s\n", errStr);                                                  \
  } while (false)
#endif // OMPTARGET_DEBUG

#define BOOL2TEXT(b) ((b) ? "Yes" : "No")

#include "elf_common.h"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD,         // constructors, destructors,
                // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC,      // everything else
  SPMD_GENERIC, // Generic kernel with SPMD execution
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // Execution mode of the kernel:
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  // 2 - SPMD mode execution with Generic mode semantics.
  int8_t ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};
namespace {
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  REPORT("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

// Structure containing per-device data.
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  std::list<FuncOrGblEntryTy> FuncGblEntries;

  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

class StreamManagerTy {
  int NumberOfDevices;
  // The initial size of the stream pool
  int EnvNumInitialStreams;
  // Per-device stream mutex
  std::vector<std::unique_ptr<std::mutex>> StreamMtx;
  // Per-device stream id indicating the next available stream in the pool
  std::vector<int> NextStreamId;
  // Per-device stream pool
  std::vector<std::vector<CUstream>> StreamPool;
  // Reference to per-device data
  std::vector<DeviceDataTy> &DeviceData;

  // If there is no CUstream left in the pool, we will resize the pool to
  // allocate more CUstreams. This function should be called while holding the
  // per-device mutex, and the pool is never shrunk.
  void resizeStreamPool(const int DeviceId, const size_t NewSize) {
    std::vector<CUstream> &Pool = StreamPool[DeviceId];
    const size_t CurrentSize = Pool.size();
    assert(NewSize > CurrentSize && "new size is not larger than current size");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // Return if we cannot switch to the right context, so that we do not
      // create a bunch of streams that do not belong to the right device.
      // Offloading will fail later because the selected CUstream is nullptr.
      return;
    }

    Pool.resize(NewSize, nullptr);

    for (size_t I = CurrentSize; I < NewSize; ++I) {
      checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
                  "Error returned from cuStreamCreate\n");
    }
  }

public:
  StreamManagerTy(const int NumberOfDevices,
                  std::vector<DeviceDataTy> &DeviceData)
      : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
        DeviceData(DeviceData) {
    StreamPool.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    StreamMtx.resize(NumberOfDevices);

    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
      EnvNumInitialStreams = std::stoi(EnvStr);

    // Initialize the next stream ids
    std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

    // Initialize stream mutexes
    for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
      Ptr = std::make_unique<std::mutex>();
  }

  ~StreamManagerTy() {
    // Destroy streams
    for (int I = 0; I < NumberOfDevices; ++I) {
      checkResult(cuCtxSetCurrent(DeviceData[I].Context),
                  "Error returned from cuCtxSetCurrent\n");

      for (CUstream &S : StreamPool[I]) {
        if (S)
          checkResult(cuStreamDestroy(S),
                      "Error returned from cuStreamDestroy\n");
      }
    }
  }

  // Get a CUstream from the pool. The per-device next stream id always points
  // to the next available CUstream; that is, CUstreams [0, id-1] have been
  // assigned and [id, ...) are still available. If there is no CUstream left,
  // we request more CUstreams from the CUDA RT. Each time a CUstream is
  // assigned, the id increases by one.
  // xxxxxs+++++++++
  //      ^
  //      id
  // After assignment, the pool becomes the following, and s is assigned.
  // xxxxxs+++++++++
  //       ^
  //       id
  CUstream getStream(const int DeviceId) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // No CUstream left in the pool, we need to request from the CUDA RT
    if (Id == static_cast<int>(StreamPool[DeviceId].size())) {
      // By default we double the stream pool every time
      resizeStreamPool(DeviceId, Id * 2);
    }
    return StreamPool[DeviceId][Id++];
  }

  // Return a CUstream back to the pool. As mentioned above, the per-device
  // next stream id always points to the next available CUstream, so when we
  // return a CUstream, we first decrease the id and then copy the CUstream
  // back.
  // It is worth noting that the order in which streams are returned might
  // differ from the order in which they were assigned; that is, at some point
  // there might be two identical CUstreams:
  // xxax+a+++++
  //     ^
  //     id
  // However, this does not matter because they are always on the two sides of
  // id: the left one will eventually be overwritten by another CUstream.
  // Therefore, after several executions, the order of the pool might differ
  // from its initial state.
  void returnStream(const int DeviceId, CUstream Stream) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    assert(Id > 0 && "Wrong stream ID");
    StreamPool[DeviceId][--Id] = Stream;
  }

  bool initializeDeviceStreamPool(const int DeviceId) {
    assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");

    resizeStreamPool(DeviceId, EnvNumInitialStreams);

    // Check the size of the stream pool
    if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams)
      return false;

    // Check whether each stream is valid
    for (CUstream &S : StreamPool[DeviceId])
      if (!S)
        return false;

    return true;
  }
};
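
// A minimal usage sketch for StreamManagerTy (illustrative only; `DeviceId`,
// `DevPtr`, `HstPtr`, and `Size` are hypothetical values). Streams are
// borrowed from the pool and returned once the work queued on them has been
// synchronized:
//
//   CUstream S = StreamManager->getStream(DeviceId);
//   cuMemcpyHtoDAsync(DevPtr, HstPtr, Size, S); // queue some async work
//   cuStreamSynchronize(S);
//   StreamManager->returnStream(DeviceId, S);   // stream is reusable again
//
// In this plugin, the borrow happens lazily in
// DeviceRTLTy::getStream(DeviceId, AsyncInfo) and the return happens in
// DeviceRTLTy::synchronize().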

class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  int EnvTeamThreadLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  std::unique_ptr<StreamManagerTy> StreamManager;
  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  /// A class responsible for interacting with the device native runtime
  /// library to allocate and free memory.
  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
    const int DeviceId;
    const std::vector<DeviceDataTy> &DeviceData;
    std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;

  public:
    CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
        : DeviceId(DeviceId), DeviceData(DeviceData) {}

    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
      if (Size == 0)
        return nullptr;

      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return nullptr;

      void *MemAlloc = nullptr;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
        CUdeviceptr DevicePtr;
        Err = cuMemAlloc(&DevicePtr, Size);
        MemAlloc = (void *)DevicePtr;
        if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
          return nullptr;
        break;
      case TARGET_ALLOC_HOST:
        void *HostPtr;
        Err = cuMemAllocHost(&HostPtr, Size);
        MemAlloc = HostPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
          return nullptr;
        HostPinnedAllocs[MemAlloc] = Kind;
        break;
      case TARGET_ALLOC_SHARED:
        CUdeviceptr SharedPtr;
        Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
        MemAlloc = (void *)SharedPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
          return nullptr;
        break;
      }

      return MemAlloc;
    }

    int free(void *TgtPtr) override {
      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return OFFLOAD_FAIL;

      // Host pinned memory must be freed differently.
      TargetAllocTy Kind =
          (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
              ? TARGET_ALLOC_DEFAULT
              : TARGET_ALLOC_HOST;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
      case TARGET_ALLOC_SHARED:
        Err = cuMemFree((CUdeviceptr)TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFree\n"))
          return OFFLOAD_FAIL;
        break;
      case TARGET_ALLOC_HOST:
        Err = cuMemFreeHost(TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
          return OFFLOAD_FAIL;
        break;
      }

      return OFFLOAD_SUCCESS;
    }
  };

  /// A vector of device allocators
  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;

  /// A vector of memory managers. Since the memory manager is non-copyable and
  /// non-movable, we wrap them into std::unique_ptr.
  std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;

  /// Whether to use the memory manager
  bool UseMemoryManager = true;

  // Record an entry point associated with a device.
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return a pointer to the entry associated with the pointer.
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
                                             const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return &Itr;

    return nullptr;
  }

  // Return the pointer to the target entries table.
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear the entries table for a device.
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    if (!AsyncInfo->Queue)
      AsyncInfo->Queue = StreamManager->getStream(DeviceId);

    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
  }
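
  // Note: the CUstream borrowed above stays attached to AsyncInfo->Queue
  // until synchronize() hands it back to the pool and resets the queue to
  // nullptr, so a later operation on the same AsyncInfo grabs a fresh stream.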

public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (Err == CUDA_ERROR_INVALID_HANDLE) {
      // Can't call cuGetErrorString if dlsym failed
      DP("Failed to load CUDA shared library\n");
      return;
    }
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
      // OMP_TEAMS_THREAD_LIMIT has been set
      EnvTeamThreadLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }

    StreamManager =
        std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);

    for (int I = 0; I < NumberOfDevices; ++I)
      DeviceAllocators.emplace_back(I, DeviceData);

    // Get the size threshold from the environment variable
    std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
    UseMemoryManager = Res.second;
    size_t MemoryManagerThreshold = Res.first;

    if (UseMemoryManager)
      for (int I = 0; I < NumberOfDevices; ++I)
        MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
            DeviceAllocators[I], MemoryManagerThreshold));
  }

  ~DeviceRTLTy() {
    // Release the memory managers first in case their dependent data would
    // otherwise be destroyed before them.
    for (auto &M : MemoryManagers)
      M.release();

    StreamManager = nullptr;

    for (CUmodule &M : Modules)
      // Close module
      if (M)
        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context) {
        checkResult(cuCtxSetCurrent(D.Context),
                    "Error returned from cuCtxSetCurrent\n");
        CUdevice Device;
        checkResult(cuCtxGetDevice(&Device),
                    "Error returned from cuCtxGetDevice\n");
        checkResult(cuDevicePrimaryCtxRelease(Device),
                    "Error returned from cuDevicePrimaryCtxRelease\n");
      }
    }
  }

  // Check whether a given DeviceId is valid.
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    // Query the current flags of the primary context and set its flags if
    // it is inactive.
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
      return OFFLOAD_FAIL;

    if (FormerPrimaryCtxIsActive) {
      DP("The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      DP("The primary context is inactive, set its flags to "
         "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
        return OFFLOAD_FAIL;
    }

    // Retain the per-device primary context and save it to use whenever this
    // device is selected.
    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize the stream pool.
    if (!StreamManager->initializeDeviceStreamPool(DeviceId))
      return OFFLOAD_FAIL;

    // Query attributes to determine the number of threads/block and
    // blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, using default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    } else {
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n",
         MaxGridDimX, DeviceRTLTy::HardTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, using default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;

      if (EnvTeamThreadLimit > 0 &&
          DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
           "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
      }
      if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
           "capping at the hard limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
      }
    }

    // Get and set the warp size.
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assuming default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the environment variables.
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    size_t StackLimit;
    size_t HeapLimit;
    if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
      StackLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
      HeapLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }
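
    // Example usage (illustrative values; both limits are byte counts, which
    // is what cuCtxSetLimit expects):
    //   LIBOMPTARGET_STACK_SIZE=8192    -> 8 KiB device stack per thread
    //   LIBOMPTARGET_HEAP_SIZE=8388608  -> 8 MiB device malloc heap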

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device supports up to %d CUDA blocks and %d threads with a "
         "warp size of %d\n",
         DeviceData[DeviceId].BlocksPerGrid,
         DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device heap size is %d Bytes, device stack size is %d Bytes per "
         "thread\n",
         (int)HeapLimit, (int)StackLimit);

    // Set the default number of teams.
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
      DP("Default number of teams set according to library's default %d\n",
         DeviceRTLTy::DefaultNumTeams);
    }

    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Default number of teams exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
    }

    // Set the default number of threads.
    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
    DP("Default number of threads set according to library's default %d\n",
       DeviceRTLTy::DefaultNumThreads);
    if (DeviceData[DeviceId].NumThreads >
        DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Default number of threads exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
    }

    return OFFLOAD_SUCCESS;
  }

  __tgt_target_table *loadBinary(const int DeviceId,
                                 const __tgt_device_image *Image) {
    // Set the context we are using.
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return nullptr;

    // Clear the offload table as we are going to create a new one.
    clearOffloadEntriesTable(DeviceId);

    // Create the module and extract the function pointers.
    CUmodule Module;
    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
    Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
      return nullptr;

    DP("CUDA module successfully loaded!\n");

    Modules.push_back(Module);

    // Find the symbols in the module by name.
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // We return nullptr when something like this happens: the host should
        // always have something in the address field to uniquely identify the
        // target region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name.
        if (Err != CUDA_SUCCESS) {
          REPORT("Loading global '%s' failed\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variable types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variable types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present, any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name.
      if (Err != CUDA_SUCCESS) {
        REPORT("Loading '%s' failed\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

      // Default value is GENERIC (in case the symbol is missing from the
      // cubin file).
      int8_t ExecModeVal = ExecutionModeType::GENERIC;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(int8_t)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(int8_t));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from device to host. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
                 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (ExecModeVal < 0 || ExecModeVal > 2) {
          DP("Error: wrong exec_mode value specified in cubin file: %d\n",
             ExecModeVal);
          return nullptr;
        }
      } else {
        REPORT("Loading global exec_mode '%s' - symbol missing, using default "
               "value GENERIC (1)\n",
               ExecModeName);
        CUDA_ERR_STRING(Err);
      }

      KernelsList.emplace_back(Func, ExecModeVal);

      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }

    // Send device environment data to the device.
    {
      omptarget_device_environmentTy DeviceEnv{0};

#ifdef OMPTARGET_DEBUG
      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
        DeviceEnv.debug_level = std::stoi(EnvStr);
#endif

      const char *DeviceEnvName = "omptarget_device_environment";
      CUdeviceptr DeviceEnvPtr;
      size_t CUSize;

      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from host to device. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continuing, assuming this is a device RTL that does not accept "
           "an environment setting.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }

  void *dataAlloc(const int DeviceId, const int64_t Size,
                  const TargetAllocTy Kind) {
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
      if (UseMemoryManager)
        return MemoryManagers[DeviceId]->allocate(Size, nullptr);
      else
        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    case TARGET_ALLOC_HOST:
    case TARGET_ALLOC_SHARED:
      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    }

    REPORT("Invalid target data allocation kind or requested allocator not "
           "implemented yet\n");

    return nullptr;
  }
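
  // Whether the pooled MemoryManagerTy path above is taken is decided once at
  // startup via MemoryManagerTy::getSizeThresholdFromEnv() (see the
  // constructor); the expectation, per MemoryManager.h, is that allocations
  // larger than the size threshold bypass the pool and go straight to the
  // underlying device allocator.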

  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from host to device. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from device to host. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId,
                   void *DstPtr, int64_t Size,
                   __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(SrcDevId, AsyncInfo);

    // If the copy spans two distinct devices, try a peer-to-peer copy first.
    if (SrcDevId != DstDevId) {
      int CanAccessPeer = 0;
      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      if (!CanAccessPeer) {
        DP("P2P memcpy not supported, falling back to D2D memcpy\n");
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
                              Size, Stream);
      if (Err == CUDA_SUCCESS)
        return OFFLOAD_SUCCESS;

      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
         ", src_id = %" PRId32 ", dst_ptr = " DPxMOD ", dst_id = %" PRId32 "\n",
         DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
      CUDA_ERR_STRING(Err);
    }

    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }

  int dataDelete(const int DeviceId, void *TgtPtr) {
    if (UseMemoryManager)
      return MemoryManagers[DeviceId]->free(TgtPtr);

    return DeviceAllocators[DeviceId].free(TgtPtr);
  }

  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr,
                          void **TgtArgs, ptrdiff_t *TgtOffsets,
                          const int ArgNum, const int TeamNum,
                          const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are references.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }
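
    // cuLaunchKernel takes an array of pointers to the individual argument
    // values (its kernelParams parameter). Every target argument is itself a
    // pointer, so Ptrs[I] holds the offset-adjusted device pointer and
    // Args[I] points at it. With illustrative values TgtArgs[0] == 0x1000 and
    // TgtOffsets[0] == 8, Ptrs[0] becomes (void *)0x1008 and
    // *(void **)Args[0] == (void *)0x1008.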

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add the master warp if necessary
      if (KernelInfo->ExecutionMode == GENERIC) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (KernelInfo->ExecutionMode == SPMD) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch enough teams that each thread
          // executes one iteration of the loop. Round up to the nearest
          // integer.
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else if (KernelInfo->ExecutionMode == GENERIC) {
          // If we reach this point, then we have a non-combined construct,
          // i.e. `teams distribute` with a nested `parallel for`, and each
          // team is assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the
          // `parallel` loop.
          CudaBlocksPerGrid = LoopTripCount;
        } else if (KernelInfo->ExecutionMode == SPMD_GENERIC) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to
          // the `teams distribute` region and would create far too few blocks
          // using the regular SPMD-mode method.
          CudaBlocksPerGrid = LoopTripCount;
        } else {
          REPORT("Unknown execution mode: %d\n", KernelInfo->ExecutionMode);
          return OFFLOAD_FAIL;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }
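
    // Worked example (illustrative numbers): with LoopTripCount = 10000 and
    // CudaThreadsPerBlock = 128, an SPMD kernel gets ((10000 - 1) / 128) + 1
    // = 79 blocks (79 * 128 = 10112 threads cover all 10000 iterations),
    // while a Generic kernel gets 10000 blocks, one team per `distribute`
    // iteration.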

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Launching kernel %s with %d blocks and %d threads in %s "
         "mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
         CudaBlocksPerGrid, CudaThreadsPerBlock,
         (KernelInfo->ExecutionMode != SPMD
              ? (KernelInfo->ExecutionMode == GENERIC ? "Generic"
                                                      : "SPMD-Generic")
              : "SPMD"));

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }

  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
    CUresult Err = cuStreamSynchronize(Stream);

    // Once the stream is synchronized, return it to the stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
    StreamManager->returnStream(DeviceId,
                                reinterpret_cast<CUstream>(AsyncInfo->Queue));
    AsyncInfo->Queue = nullptr;

    if (Err != CUDA_SUCCESS) {
      DP("Error when synchronizing stream. stream = " DPxMOD
         ", async info ptr = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(AsyncInfo));
      CUDA_ERR_STRING(Err);
    }
    return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
  }

  void printDeviceInfo(int32_t device_id) {
    char TmpChar[1000];
    std::string TmpStr;
    size_t TmpSt;
    int TmpInt, TmpInt2, TmpInt3;

    CUdevice Device;
    checkResult(cuDeviceGet(&Device, device_id),
                "Error returned from cuDeviceGet\n");

    cuDriverGetVersion(&TmpInt);
    printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
    printf("    CUDA Device Number: \t\t%d \n", device_id);
    checkResult(cuDeviceGetName(TmpChar, 1000, Device),
                "Error returned from cuDeviceGetName\n");
    printf("    Device Name: \t\t\t%s \n", TmpChar);
    checkResult(cuDeviceTotalMem(&TmpSt, Device),
                "Error returned from cuDeviceTotalMem\n");
    printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Registers per Block: \t\t%d \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    if (TmpInt == CU_COMPUTEMODE_DEFAULT)
      TmpStr = "DEFAULT";
    else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
      TmpStr = "PROHIBITED";
    else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
      TmpStr = "EXCLUSIVE PROCESS";
    else
      TmpStr = "unknown";
    printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
                             Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
            Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY,
                             Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Multi-Device Board: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Compute Capabilities: \t\t%d.%d \n", TmpInt, TmpInt2);
  }
};

DeviceRTLTy DeviceRTL;
} // namespace

// Exposed library API functions
#ifdef __cplusplus
extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
      DeviceRTL.isValidDeviceId(dst_dev_id))
    return 1;

  return 0;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
                           int32_t kind) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
}

int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
                                                 size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
                                                   size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}

int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(AsyncInfo && "AsyncInfo is nullptr");

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                AsyncInfo);
}

int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_exchange_async(
      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id,
                                         void *tgt_entry_ptr, void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}

int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}

void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

void __tgt_rtl_print_device_info(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  DeviceRTL.printDeviceInfo(device_id);
}

#ifdef __cplusplus
}
#endif