1 //===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Implementation of the interface to be used by Clang during the codegen of a
10 // target region.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "omptarget.h"
15 #include "device.h"
16 #include "private.h"
17 #include "rtl.h"
18
19 #include <cassert>
20 #include <vector>
21
synchronize()22 int AsyncInfoTy::synchronize() {
23 int Result = OFFLOAD_SUCCESS;
24 if (AsyncInfo.Queue) {
25 // If we have a queue we need to synchronize it now.
26 Result = Device.synchronize(*this);
27 assert(AsyncInfo.Queue == nullptr &&
28 "The device plugin should have nulled the queue to indicate there "
29 "are no outstanding actions!");
30 }
31 return Result;
32 }
33
getVoidPtrLocation()34 void *&AsyncInfoTy::getVoidPtrLocation() {
35 BufferLocations.push_back(nullptr);
36 return BufferLocations.back();
37 }
38
39 /* All begin addresses for partially mapped structs must be 8-aligned in order
40 * to ensure proper alignment of members. E.g.
41 *
42 * struct S {
43 * int a; // 4-aligned
44 * int b; // 4-aligned
45 * int *p; // 8-aligned
46 * } s1;
47 * ...
48 * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
49 * {
50 * s1.b = 5;
51 * for (int i...) s1.p[i] = ...;
52 * }
53 *
54 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
55 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
56 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
57 * requirements for its type. Now, when we allocate memory on the device, in
58 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
59 * This means that the chunk of the struct on the device will start at a
60 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
61 * address of p will be a misaligned 0x204 (on the host there was no need to add
62 * padding between b and p, so p comes exactly 4 bytes after b). If the device
63 * kernel tries to access s1.p, a misaligned address error occurs (as reported
 * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
 * extending the size of the allocated chunk accordingly, the chunk on the
 * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
67 * &s1.p=0x208, as they should be to satisfy the alignment requirements.
68 */
/// Minimum alignment (in bytes) for the begin address of a partially mapped
/// struct; see the explanatory comment above. constexpr so it is usable in
/// constant expressions and guaranteed compile-time initialized.
static constexpr int64_t Alignment = 8;
70
71 /// Map global data and execute pending ctors
InitLibrary(DeviceTy & Device)72 static int InitLibrary(DeviceTy &Device) {
73 /*
74 * Map global data
75 */
76 int32_t device_id = Device.DeviceID;
77 int rc = OFFLOAD_SUCCESS;
78 bool supportsEmptyImages = Device.RTL->supports_empty_images &&
79 Device.RTL->supports_empty_images() > 0;
80
81 Device.PendingGlobalsMtx.lock();
82 PM->TrlTblMtx.lock();
83 for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
84 TranslationTable *TransTable =
85 &PM->HostEntriesBeginToTransTable[HostEntriesBegin];
86 if (TransTable->HostTable.EntriesBegin ==
87 TransTable->HostTable.EntriesEnd &&
88 !supportsEmptyImages) {
89 // No host entry so no need to proceed
90 continue;
91 }
92
93 if (TransTable->TargetsTable[device_id] != 0) {
94 // Library entries have already been processed
95 continue;
96 }
97
98 // 1) get image.
99 assert(TransTable->TargetsImages.size() > (size_t)device_id &&
100 "Not expecting a device ID outside the table's bounds!");
101 __tgt_device_image *img = TransTable->TargetsImages[device_id];
102 if (!img) {
103 REPORT("No image loaded for device id %d.\n", device_id);
104 rc = OFFLOAD_FAIL;
105 break;
106 }
107 // 2) load image into the target table.
108 __tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] =
109 Device.load_binary(img);
110 // Unable to get table for this image: invalidate image and fail.
111 if (!TargetTable) {
112 REPORT("Unable to generate entries table for device id %d.\n", device_id);
113 TransTable->TargetsImages[device_id] = 0;
114 rc = OFFLOAD_FAIL;
115 break;
116 }
117
118 // Verify whether the two table sizes match.
119 size_t hsize =
120 TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
121 size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;
122
123 // Invalid image for these host entries!
124 if (hsize != tsize) {
125 REPORT("Host and Target tables mismatch for device id %d [%zx != %zx].\n",
126 device_id, hsize, tsize);
127 TransTable->TargetsImages[device_id] = 0;
128 TransTable->TargetsTable[device_id] = 0;
129 rc = OFFLOAD_FAIL;
130 break;
131 }
132
133 // process global data that needs to be mapped.
134 Device.DataMapMtx.lock();
135 __tgt_target_table *HostTable = &TransTable->HostTable;
136 for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
137 *CurrHostEntry = HostTable->EntriesBegin,
138 *EntryDeviceEnd = TargetTable->EntriesEnd;
139 CurrDeviceEntry != EntryDeviceEnd;
140 CurrDeviceEntry++, CurrHostEntry++) {
141 if (CurrDeviceEntry->size != 0) {
142 // has data.
143 assert(CurrDeviceEntry->size == CurrHostEntry->size &&
144 "data size mismatch");
145
146 // Fortran may use multiple weak declarations for the same symbol,
147 // therefore we must allow for multiple weak symbols to be loaded from
148 // the fat binary. Treat these mappings as any other "regular" mapping.
149 // Add entry to map.
150 if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size))
151 continue;
152 DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
153 "\n",
154 DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
155 CurrDeviceEntry->size);
156 Device.HostDataToTargetMap.emplace(
157 (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
158 (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
159 (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
160 (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/, nullptr,
161 true /*IsRefCountINF*/);
162 }
163 }
164 Device.DataMapMtx.unlock();
165 }
166 PM->TrlTblMtx.unlock();
167
168 if (rc != OFFLOAD_SUCCESS) {
169 Device.PendingGlobalsMtx.unlock();
170 return rc;
171 }
172
173 /*
174 * Run ctors for static objects
175 */
176 if (!Device.PendingCtorsDtors.empty()) {
177 AsyncInfoTy AsyncInfo(Device);
178 // Call all ctors for all libraries registered so far
179 for (auto &lib : Device.PendingCtorsDtors) {
180 if (!lib.second.PendingCtors.empty()) {
181 DP("Has pending ctors... call now\n");
182 for (auto &entry : lib.second.PendingCtors) {
183 void *ctor = entry;
184 int rc =
185 target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr,
186 nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo);
187 if (rc != OFFLOAD_SUCCESS) {
188 REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
189 Device.PendingGlobalsMtx.unlock();
190 return OFFLOAD_FAIL;
191 }
192 }
193 // Clear the list to indicate that this device has been used
194 lib.second.PendingCtors.clear();
195 DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
196 }
197 }
198 // All constructors have been issued, wait for them now.
199 if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
200 return OFFLOAD_FAIL;
201 }
202 Device.HasPendingGlobals = false;
203 Device.PendingGlobalsMtx.unlock();
204
205 return OFFLOAD_SUCCESS;
206 }
207
/// React to the outcome of a target operation according to the current
/// offload policy: abort (via FATAL_MESSAGE0) on policy violations, and dump
/// host-target pointer mappings when the info level requests it.
void handleTargetOutcome(bool Success, ident_t *Loc) {
  switch (PM->TargetOffloadPolicy) {
  case tgt_disabled:
    // Offload is disabled; succeeding at offload means something went wrong.
    if (Success) {
      FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
    }
    break;
  case tgt_default:
    // The default policy must have been resolved (see
    // handleDefaultTargetOffload) before any outcome is reported.
    FATAL_MESSAGE0(1, "default offloading policy must be switched to "
                      "mandatory or disabled");
    break;
  case tgt_mandatory:
    if (!Success) {
      // Either dump the mapping tables (if info dumping is enabled) or tell
      // the user how to enable the dump.
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, Device);
      else
        FAILURE_MESSAGE("Run with LIBOMPTARGET_INFO=%d to dump host-target "
                        "pointer mappings.\n",
                        OMP_INFOTYPE_DUMP_TABLE);

      // Prefix the fatal diagnostic with the source location when available.
      // ("isAvailible" is the spelling used by the SourceInfo API.)
      SourceInfo info(Loc);
      if (info.isAvailible())
        fprintf(stderr, "%s:%d:%d: ", info.getFilename(), info.getLine(),
                info.getColumn());
      else
        FAILURE_MESSAGE("Source location information not present. Compile with "
                        "-g or -gline-tables-only.\n");
      FATAL_MESSAGE0(
          1, "failure of target construct while offloading is mandatory");
    } else {
      // Successful mandatory offload: optionally dump mappings for debugging.
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, Device);
    }
    break;
  }
}
246
handleDefaultTargetOffload()247 static void handleDefaultTargetOffload() {
248 PM->TargetOffloadMtx.lock();
249 if (PM->TargetOffloadPolicy == tgt_default) {
250 if (omp_get_num_devices() > 0) {
251 DP("Default TARGET OFFLOAD policy is now mandatory "
252 "(devices were found)\n");
253 PM->TargetOffloadPolicy = tgt_mandatory;
254 } else {
255 DP("Default TARGET OFFLOAD policy is now disabled "
256 "(no devices were found)\n");
257 PM->TargetOffloadPolicy = tgt_disabled;
258 }
259 }
260 PM->TargetOffloadMtx.unlock();
261 }
262
isOffloadDisabled()263 static bool isOffloadDisabled() {
264 if (PM->TargetOffloadPolicy == tgt_default)
265 handleDefaultTargetOffload();
266 return PM->TargetOffloadPolicy == tgt_disabled;
267 }
268
269 // If offload is enabled, ensure that device DeviceID has been initialized,
270 // global ctors have been executed, and global data has been mapped.
271 //
272 // There are three possible results:
273 // - Return OFFLOAD_SUCCESS if the device is ready for offload.
274 // - Return OFFLOAD_FAIL without reporting a runtime error if offload is
275 // disabled, perhaps because the initial device was specified.
276 // - Report a runtime error and return OFFLOAD_FAIL.
277 //
278 // If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
279 // This step might be skipped if offload is disabled.
checkDeviceAndCtors(int64_t & DeviceID,ident_t * Loc)280 int checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) {
281 if (isOffloadDisabled()) {
282 DP("Offload is disabled\n");
283 return OFFLOAD_FAIL;
284 }
285
286 if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
287 DeviceID = omp_get_default_device();
288 DP("Use default device id %" PRId64 "\n", DeviceID);
289 }
290
291 // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
292 if (omp_get_num_devices() == 0) {
293 DP("omp_get_num_devices() == 0 but offload is manadatory\n");
294 handleTargetOutcome(false, Loc);
295 return OFFLOAD_FAIL;
296 }
297
298 if (DeviceID == omp_get_initial_device()) {
299 DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
300 DeviceID);
301 return OFFLOAD_FAIL;
302 }
303
304 // Is device ready?
305 if (!device_is_ready(DeviceID)) {
306 REPORT("Device %" PRId64 " is not ready.\n", DeviceID);
307 handleTargetOutcome(false, Loc);
308 return OFFLOAD_FAIL;
309 }
310
311 // Get device info.
312 DeviceTy &Device = PM->Devices[DeviceID];
313
314 // Check whether global data has been mapped for this device
315 Device.PendingGlobalsMtx.lock();
316 bool hasPendingGlobals = Device.HasPendingGlobals;
317 Device.PendingGlobalsMtx.unlock();
318 if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
319 REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID);
320 handleTargetOutcome(false, Loc);
321 return OFFLOAD_FAIL;
322 }
323
324 return OFFLOAD_SUCCESS;
325 }
326
getParentIndex(int64_t type)327 static int32_t getParentIndex(int64_t type) {
328 return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
329 }
330
targetAllocExplicit(size_t size,int device_num,int kind,const char * name)331 void *targetAllocExplicit(size_t size, int device_num, int kind,
332 const char *name) {
333 TIMESCOPE();
334 DP("Call to %s for device %d requesting %zu bytes\n", name, device_num, size);
335
336 if (size <= 0) {
337 DP("Call to %s with non-positive length\n", name);
338 return NULL;
339 }
340
341 void *rc = NULL;
342
343 if (device_num == omp_get_initial_device()) {
344 rc = malloc(size);
345 DP("%s returns host ptr " DPxMOD "\n", name, DPxPTR(rc));
346 return rc;
347 }
348
349 if (!device_is_ready(device_num)) {
350 DP("%s returns NULL ptr\n", name);
351 return NULL;
352 }
353
354 DeviceTy &Device = PM->Devices[device_num];
355 rc = Device.allocData(size, nullptr, kind);
356 DP("%s returns device ptr " DPxMOD "\n", name, DPxPTR(rc));
357 return rc;
358 }
359
360 /// Call the user-defined mapper function followed by the appropriate
361 // targetData* function (targetData{Begin,End,Update}).
targetDataMapper(ident_t * loc,DeviceTy & Device,void * arg_base,void * arg,int64_t arg_size,int64_t arg_type,map_var_info_t arg_names,void * arg_mapper,AsyncInfoTy & AsyncInfo,TargetDataFuncPtrTy target_data_function)362 int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
363 int64_t arg_size, int64_t arg_type,
364 map_var_info_t arg_names, void *arg_mapper,
365 AsyncInfoTy &AsyncInfo,
366 TargetDataFuncPtrTy target_data_function) {
367 TIMESCOPE_WITH_IDENT(loc);
368 DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper));
369
370 // The mapper function fills up Components.
371 MapperComponentsTy MapperComponents;
372 MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(arg_mapper);
373 (*MapperFuncPtr)((void *)&MapperComponents, arg_base, arg, arg_size, arg_type,
374 arg_names);
375
376 // Construct new arrays for args_base, args, arg_sizes and arg_types
377 // using the information in MapperComponents and call the corresponding
378 // targetData* function using these new arrays.
379 std::vector<void *> MapperArgsBase(MapperComponents.Components.size());
380 std::vector<void *> MapperArgs(MapperComponents.Components.size());
381 std::vector<int64_t> MapperArgSizes(MapperComponents.Components.size());
382 std::vector<int64_t> MapperArgTypes(MapperComponents.Components.size());
383 std::vector<void *> MapperArgNames(MapperComponents.Components.size());
384
385 for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
386 auto &C = MapperComponents.Components[I];
387 MapperArgsBase[I] = C.Base;
388 MapperArgs[I] = C.Begin;
389 MapperArgSizes[I] = C.Size;
390 MapperArgTypes[I] = C.Type;
391 MapperArgNames[I] = C.Name;
392 }
393
394 int rc = target_data_function(loc, Device, MapperComponents.Components.size(),
395 MapperArgsBase.data(), MapperArgs.data(),
396 MapperArgSizes.data(), MapperArgTypes.data(),
397 MapperArgNames.data(), /*arg_mappers*/ nullptr,
398 AsyncInfo, /*FromMapper=*/true);
399
400 return rc;
401 }
402
403 /// Internal function to do the mapping and transfer the data to the device
targetDataBegin(ident_t * loc,DeviceTy & Device,int32_t arg_num,void ** args_base,void ** args,int64_t * arg_sizes,int64_t * arg_types,map_var_info_t * arg_names,void ** arg_mappers,AsyncInfoTy & AsyncInfo,bool FromMapper)404 int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
405 void **args_base, void **args, int64_t *arg_sizes,
406 int64_t *arg_types, map_var_info_t *arg_names,
407 void **arg_mappers, AsyncInfoTy &AsyncInfo,
408 bool FromMapper) {
409 // process each input.
410 for (int32_t i = 0; i < arg_num; ++i) {
411 // Ignore private variables and arrays - there is no mapping for them.
412 if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
413 (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
414 continue;
415
416 if (arg_mappers && arg_mappers[i]) {
417 // Instead of executing the regular path of targetDataBegin, call the
418 // targetDataMapper variant which will call targetDataBegin again
419 // with new arguments.
420 DP("Calling targetDataMapper for the %dth argument\n", i);
421
422 map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i];
423 int rc = targetDataMapper(loc, Device, args_base[i], args[i],
424 arg_sizes[i], arg_types[i], arg_name,
425 arg_mappers[i], AsyncInfo, targetDataBegin);
426
427 if (rc != OFFLOAD_SUCCESS) {
428 REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
429 " failed.\n");
430 return OFFLOAD_FAIL;
431 }
432
433 // Skip the rest of this function, continue to the next argument.
434 continue;
435 }
436
437 void *HstPtrBegin = args[i];
438 void *HstPtrBase = args_base[i];
439 int64_t data_size = arg_sizes[i];
440 map_var_info_t HstPtrName = (!arg_names) ? nullptr : arg_names[i];
441
442 // Adjust for proper alignment if this is a combined entry (for structs).
443 // Look at the next argument - if that is MEMBER_OF this one, then this one
444 // is a combined entry.
445 int64_t padding = 0;
446 const int next_i = i + 1;
447 if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num &&
448 getParentIndex(arg_types[next_i]) == i) {
449 padding = (int64_t)HstPtrBegin % Alignment;
450 if (padding) {
451 DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
452 "\n",
453 padding, DPxPTR(HstPtrBegin));
454 HstPtrBegin = (char *)HstPtrBegin - padding;
455 data_size += padding;
456 }
457 }
458
459 // Address of pointer on the host and device, respectively.
460 void *Pointer_HstPtrBegin, *PointerTgtPtrBegin;
461 TargetPointerResultTy Pointer_TPR;
462 bool IsHostPtr = false;
463 bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
464 // Force the creation of a device side copy of the data when:
465 // a close map modifier was associated with a map that contained a to.
466 bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE;
467 bool HasPresentModifier = arg_types[i] & OMP_TGT_MAPTYPE_PRESENT;
468 // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
469 // have reached this point via __tgt_target_data_begin and not __tgt_target
470 // then no argument is marked as TARGET_PARAM ("omp target data map" is not
471 // associated with a target region, so there are no target parameters). This
472 // may be considered a hack, we could revise the scheme in the future.
473 bool UpdateRef =
474 !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && i == 0);
475 if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
476 DP("Has a pointer entry: \n");
477 // Base is address of pointer.
478 //
479 // Usually, the pointer is already allocated by this time. For example:
480 //
481 // #pragma omp target map(s.p[0:N])
482 //
483 // The map entry for s comes first, and the PTR_AND_OBJ entry comes
484 // afterward, so the pointer is already allocated by the time the
485 // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
486 // non-null. However, "declare target link" can produce a PTR_AND_OBJ
487 // entry for a global that might not already be allocated by the time the
488 // PTR_AND_OBJ entry is handled below, and so the allocation might fail
489 // when HasPresentModifier.
490 Pointer_TPR = Device.getTargetPointer(
491 HstPtrBase, HstPtrBase, sizeof(void *), nullptr,
492 MoveDataStateTy::NONE, IsImplicit, UpdateRef, HasCloseModifier,
493 HasPresentModifier, AsyncInfo);
494 PointerTgtPtrBegin = Pointer_TPR.TargetPointer;
495 IsHostPtr = Pointer_TPR.Flags.IsHostPointer;
496 if (!PointerTgtPtrBegin) {
497 REPORT("Call to getOrAllocTgtPtr returned null pointer (%s).\n",
498 HasPresentModifier ? "'present' map type modifier"
499 : "device failure or illegal mapping");
500 return OFFLOAD_FAIL;
501 }
502 DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
503 "\n",
504 sizeof(void *), DPxPTR(PointerTgtPtrBegin),
505 (Pointer_TPR.Flags.IsNewEntry ? "" : " not"));
506 Pointer_HstPtrBegin = HstPtrBase;
507 // modify current entry.
508 HstPtrBase = *(void **)HstPtrBase;
509 // No need to update pointee ref count for the first element of the
510 // subelement that comes from mapper.
511 UpdateRef =
512 (!FromMapper || i != 0); // subsequently update ref count of pointee
513 }
514
515 MoveDataStateTy MoveData = MoveDataStateTy::NONE;
516 const bool UseUSM = PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
517 const bool HasFlagTo = arg_types[i] & OMP_TGT_MAPTYPE_TO;
518 const bool HasFlagAlways = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
519 if (HasFlagTo && (!UseUSM || HasCloseModifier))
520 MoveData = HasFlagAlways ? MoveDataStateTy::REQUIRED
521 : MoveData = MoveDataStateTy::UNKNOWN;
522
523 auto TPR = Device.getTargetPointer(
524 HstPtrBegin, HstPtrBase, data_size, HstPtrName, MoveData, IsImplicit,
525 UpdateRef, HasCloseModifier, HasPresentModifier, AsyncInfo);
526 void *TgtPtrBegin = TPR.TargetPointer;
527 IsHostPtr = TPR.Flags.IsHostPointer;
528 // If data_size==0, then the argument could be a zero-length pointer to
529 // NULL, so getOrAlloc() returning NULL is not an error.
530 if (!TgtPtrBegin && (data_size || HasPresentModifier)) {
531 REPORT("Call to getOrAllocTgtPtr returned null pointer (%s).\n",
532 HasPresentModifier ? "'present' map type modifier"
533 : "device failure or illegal mapping");
534 return OFFLOAD_FAIL;
535 }
536 DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
537 " - is%s new\n",
538 data_size, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));
539
540 if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
541 uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
542 void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
543 DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
544 args_base[i] = TgtPtrBase;
545 }
546
547 if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
548 // Check whether we need to update the pointer on the device
549 bool UpdateDevPtr = false;
550
551 uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
552 void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
553
554 Device.ShadowMtx.lock();
555 auto Entry = Device.ShadowPtrMap.find(Pointer_HstPtrBegin);
556 // If this pointer is not in the map we need to insert it. If the map
557 // contains a stale entry, we need to update it (e.g. if the pointee was
558 // deallocated and later on is reallocated at another device address). The
559 // latter scenario is the subject of LIT test env/base_ptr_ref_count.c. An
560 // entry is removed from ShadowPtrMap only when the PTR of a PTR_AND_OBJ
561 // pair is deallocated, not when the OBJ is deallocated. In
562 // env/base_ptr_ref_count.c the PTR is a global "declare target" pointer,
563 // so it stays in the map for the lifetime of the application. When the
564 // OBJ is deallocated and later on allocated again (at a different device
565 // address), ShadowPtrMap still contains an entry for Pointer_HstPtrBegin
566 // which is stale, pointing to the old ExpectedTgtPtrBase of the OBJ.
567 if (Entry == Device.ShadowPtrMap.end() ||
568 Entry->second.TgtPtrVal != ExpectedTgtPtrBase) {
569 // create or update shadow pointers for this entry
570 Device.ShadowPtrMap[Pointer_HstPtrBegin] = {
571 HstPtrBase, PointerTgtPtrBegin, ExpectedTgtPtrBase};
572 UpdateDevPtr = true;
573 }
574
575 if (UpdateDevPtr) {
576 Pointer_TPR.MapTableEntry->lock();
577 Device.ShadowMtx.unlock();
578
579 DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
580 DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
581
582 void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
583 TgtPtrBase = ExpectedTgtPtrBase;
584
585 int rt = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
586 sizeof(void *), AsyncInfo);
587 Pointer_TPR.MapTableEntry->unlock();
588
589 if (rt != OFFLOAD_SUCCESS) {
590 REPORT("Copying data to device failed.\n");
591 return OFFLOAD_FAIL;
592 }
593 } else
594 Device.ShadowMtx.unlock();
595 }
596 }
597
598 return OFFLOAD_SUCCESS;
599 }
600
namespace {
/// Bookkeeping required to later release one target allocation through
/// \p DeviceTy::deallocTgtPtr.
struct DeallocTgtPtrInfo {
  /// Host pointer used to look up into the map table
  void *HstPtrBegin;
  /// Size of the data
  int64_t DataSize;
  /// Whether it has \p close modifier
  bool HasCloseModifier;

  DeallocTgtPtrInfo(void *Ptr, int64_t Bytes, bool Close)
      : HstPtrBegin(Ptr), DataSize(Bytes), HasCloseModifier(Close) {}
};
} // namespace
617
618 /// Internal function to undo the mapping and retrieve the data from the device.
/// Internal function to undo the mapping and retrieve the data from the device.
///
/// Walks the arguments in REVERSE order (so struct members are processed
/// before their parent entry), decrements reference counts, schedules
/// device-to-host copies for FROM mappings, restores shadow host pointers,
/// and finally deallocates target memory after synchronizing \p AsyncInfo.
/// Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
                  void **ArgBases, void **Args, int64_t *ArgSizes,
                  int64_t *ArgTypes, map_var_info_t *ArgNames,
                  void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
  int Ret;
  std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs;
  void *FromMapperBase = nullptr;
  // process each input.
  for (int32_t I = ArgNum - 1; I >= 0; --I) {
    // Ignore private variables and arrays - there is no mapping for them.
    // Also, ignore the use_device_ptr directive, it has no effect here.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataEnd, call the
      // targetDataMapper variant which will call targetDataEnd again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
                             ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                             targetDataEnd);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    int64_t DataSize = ArgSizes[I];
    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
      // Mirror the padding applied in targetDataBegin so the lookup matches.
      int64_t Padding = (int64_t)HstPtrBegin % Alignment;
      if (Padding) {
        DP("Using a Padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           Padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - Padding;
        DataSize += Padding;
      }
    }

    bool IsLast, IsHostPtr;
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
                      (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
                     !(FromMapper && I == 0);
    bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
    bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;

    // If PTR_AND_OBJ, HstPtrBegin is address of pointee
    void *TgtPtrBegin =
        Device.getTgtPtrBegin(HstPtrBegin, DataSize, IsLast, UpdateRef,
                              IsHostPtr, !IsImplicit, ForceDelete);
    if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
      DP("Mapping does not exist (%s)\n",
         (HasPresentModifier ? "'present' map type modifier" : "ignored"));
      if (HasPresentModifier) {
        // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
        // "If a map clause appears on a target, target data, target enter data
        // or target exit data construct with a present map-type-modifier then
        // on entry to the region if the corresponding list item does not appear
        // in the device data environment then an error occurs and the program
        // terminates."
        //
        // This should be an error upon entering an "omp target exit data". It
        // should not be an error upon exiting an "omp target data" or "omp
        // target". For "omp target data", Clang thus doesn't include present
        // modifiers for end calls. For "omp target", we have not found a valid
        // OpenMP program for which the error matters: it appears that, if a
        // program can guarantee that data is present at the beginning of an
        // "omp target" region so that there's no error there, that data is also
        // guaranteed to be present at the end.
        MESSAGE("device mapping required by 'present' map type modifier does "
                "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
                DPxPTR(HstPtrBegin), DataSize);
        return OFFLOAD_FAIL;
      }
    } else {
      DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
         " - is%s last\n",
         DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not"));
    }

    // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
    // "If the map clause appears on a target, target data, or target exit data
    // construct and a corresponding list item of the original list item is not
    // present in the device data environment on exit from the region then the
    // list item is ignored."
    if (!TgtPtrBegin)
      continue;

    bool DelEntry = IsLast;

    // If the last element from the mapper (for end transfer args comes in
    // reverse order), do not remove the partial entry, the parent struct still
    // exists.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // protect parent struct from being deallocated
    }

    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
      // Move data back to the host
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
        bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
        bool CopyMember = false;
        if (!(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) ||
            HasCloseModifier) {
          if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
              !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ))
            CopyMember = IsLast;
        }

        // Skip the copy when unified shared memory already aliases the host
        // address; otherwise copy when the entry dies, ALWAYS is set, or a
        // struct member reached its last reference.
        if ((DelEntry || Always || CopyMember) &&
            !(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
              TgtPtrBegin == HstPtrBegin)) {
          DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
             DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
          Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
                                    AsyncInfo);
          if (Ret != OFFLOAD_SUCCESS) {
            REPORT("Copying data from device failed.\n");
            return OFFLOAD_FAIL;
          }
        }
      }
      if (DelEntry && FromMapper && I == 0) {
        // The mapper's first (outermost) entry is deallocated last, below.
        DelEntry = false;
        FromMapperBase = HstPtrBegin;
      }

      // If we copied back to the host a struct/array containing pointers, we
      // need to restore the original host pointer values from their shadow
      // copies. If the struct is going to be deallocated, remove any remaining
      // shadow pointer entries for this struct.
      uintptr_t LB = (uintptr_t)HstPtrBegin;
      uintptr_t UB = (uintptr_t)HstPtrBegin + DataSize;
      Device.ShadowMtx.lock();
      for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
           Itr != Device.ShadowPtrMap.end();) {
        void **ShadowHstPtrAddr = (void **)Itr->first;

        // An STL map is sorted on its keys; use this property
        // to quickly determine when to break out of the loop.
        if ((uintptr_t)ShadowHstPtrAddr < LB) {
          ++Itr;
          continue;
        }
        if ((uintptr_t)ShadowHstPtrAddr >= UB)
          break;

        // If we copied the struct to the host, we need to restore the pointer.
        if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
          DP("Restoring original host pointer value " DPxMOD " for host "
             "pointer " DPxMOD "\n",
             DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
          *ShadowHstPtrAddr = Itr->second.HstPtrVal;
        }
        // If the struct is to be deallocated, remove the shadow entry.
        if (DelEntry) {
          DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr));
          Itr = Device.ShadowPtrMap.erase(Itr);
        } else {
          ++Itr;
        }
      }
      Device.ShadowMtx.unlock();

      // Add pointer to the buffer for later deallocation
      if (DelEntry)
        DeallocTgtPtrs.emplace_back(HstPtrBegin, DataSize, HasCloseModifier);
    }
  }

  // TODO: We should not synchronize here but pass the AsyncInfo object to the
  // allocate/deallocate device APIs.
  //
  // We need to synchronize before deallocating data.
  Ret = AsyncInfo.synchronize();
  if (Ret != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  // Deallocate target pointer
  for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) {
    if (FromMapperBase && FromMapperBase == Info.HstPtrBegin)
      continue;
    Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize,
                               Info.HasCloseModifier);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Deallocating data from device failed.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}
829
/// Transfer one contiguous chunk of data to and/or from the device, as
/// directed by the motion bits in \p ArgType, and fix up any shadow pointers
/// that fall inside the transferred host range.
///
/// \param loc         Source location used for time tracing.
/// \param Device      Device the data is mapped on.
/// \param ArgsBase    Base address of the mapped entry (kept for signature
///                    parity with the non-contiguous path; unused here).
/// \param HstPtrBegin Host start address of the chunk.
/// \param ArgSize     Chunk size in bytes.
/// \param ArgType     OMP_TGT_MAPTYPE_* bits (TO / FROM / PRESENT / ...).
/// \param AsyncInfo   Queue used for potentially asynchronous transfers.
/// \return OFFLOAD_SUCCESS or OFFLOAD_FAIL.
static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
                                void *HstPtrBegin, int64_t ArgSize,
                                int64_t ArgType, AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_IDENT(loc);
  bool IsLast, IsHostPtr;
  // Look up the device counterpart; MustContain requires an existing mapping
  // that fully covers [HstPtrBegin, HstPtrBegin + ArgSize).
  void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSize, IsLast, false,
                                            IsHostPtr, /*MustContain=*/true);
  if (!TgtPtrBegin) {
    // Not mapped: this update is a no-op, unless the 'present' motion
    // modifier demands that a mapping exists.
    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
      MESSAGE("device mapping required by 'present' motion modifier does not "
              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
              DPxPTR(HstPtrBegin), ArgSize);
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  // Under unified shared memory the host and device addresses coincide, so
  // no copy is needed.
  if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
      TgtPtrBegin == HstPtrBegin) {
    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
       DPxPTR(HstPtrBegin));
    return OFFLOAD_SUCCESS;
  }

  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
    // Device -> host copy of the whole chunk.
    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data from device failed.\n");
      return OFFLOAD_FAIL;
    }

    // The copy overwrites host pointer fields with device addresses; restore
    // the original host pointer values from their shadow copies.
    // NOTE(review): the restore runs right after issuing retrieveData, which
    // may be asynchronous — presumably the write is expected to land after
    // the copy completes; confirm the ordering guarantees with the plugins.
    uintptr_t LB = (uintptr_t)HstPtrBegin;
    uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
    Device.ShadowMtx.lock();
    // ShadowPtrMap is sorted by host address: skip entries below LB and stop
    // at the first entry at or past UB.
    for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
         IT != Device.ShadowPtrMap.end(); ++IT) {
      void **ShadowHstPtrAddr = (void **)IT->first;
      if ((uintptr_t)ShadowHstPtrAddr < LB)
        continue;
      if ((uintptr_t)ShadowHstPtrAddr >= UB)
        break;
      DP("Restoring original host pointer value " DPxMOD
         " for host pointer " DPxMOD "\n",
         DPxPTR(IT->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
      *ShadowHstPtrAddr = IT->second.HstPtrVal;
    }
    Device.ShadowMtx.unlock();
  }

  if (ArgType & OMP_TGT_MAPTYPE_TO) {
    // Host -> device copy of the whole chunk.
    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data to device failed.\n");
      return OFFLOAD_FAIL;
    }

    // The submitted host bytes contain host pointer values; overwrite the
    // device copies of any pointers in the range with the translated device
    // addresses kept in the shadow map.
    uintptr_t LB = (uintptr_t)HstPtrBegin;
    uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
    Device.ShadowMtx.lock();
    for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
         IT != Device.ShadowPtrMap.end(); ++IT) {
      void **ShadowHstPtrAddr = (void **)IT->first;
      if ((uintptr_t)ShadowHstPtrAddr < LB)
        continue;
      if ((uintptr_t)ShadowHstPtrAddr >= UB)
        break;
      DP("Restoring original target pointer value " DPxMOD " for target "
         "pointer " DPxMOD "\n",
         DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr));
      Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal,
                              sizeof(void *), AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS) {
        // Release the mutex before bailing out so it is not leaked.
        REPORT("Copying data to device failed.\n");
        Device.ShadowMtx.unlock();
        return OFFLOAD_FAIL;
      }
    }
    Device.ShadowMtx.unlock();
  }
  return OFFLOAD_SUCCESS;
}
916
targetDataNonContiguous(ident_t * loc,DeviceTy & Device,void * ArgsBase,__tgt_target_non_contig * NonContig,uint64_t Size,int64_t ArgType,int CurrentDim,int DimSize,uint64_t Offset,AsyncInfoTy & AsyncInfo)917 static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
918 void *ArgsBase,
919 __tgt_target_non_contig *NonContig,
920 uint64_t Size, int64_t ArgType,
921 int CurrentDim, int DimSize, uint64_t Offset,
922 AsyncInfoTy &AsyncInfo) {
923 TIMESCOPE_WITH_IDENT(loc);
924 int Ret = OFFLOAD_SUCCESS;
925 if (CurrentDim < DimSize) {
926 for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
927 uint64_t CurOffset =
928 (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
929 // we only need to transfer the first element for the last dimension
930 // since we've already got a contiguous piece.
931 if (CurrentDim != DimSize - 1 || I == 0) {
932 Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size,
933 ArgType, CurrentDim + 1, DimSize,
934 Offset + CurOffset, AsyncInfo);
935 // Stop the whole process if any contiguous piece returns anything
936 // other than OFFLOAD_SUCCESS.
937 if (Ret != OFFLOAD_SUCCESS)
938 return Ret;
939 }
940 }
941 } else {
942 char *Ptr = (char *)ArgsBase + Offset;
943 DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
944 " len %" PRIu64 "\n",
945 DPxPTR(Ptr), Offset, Size);
946 Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType,
947 AsyncInfo);
948 }
949 return Ret;
950 }
951
getNonContigMergedDimension(__tgt_target_non_contig * NonContig,int32_t DimSize)952 static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
953 int32_t DimSize) {
954 int RemovedDim = 0;
955 for (int I = DimSize - 1; I > 0; --I) {
956 if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
957 RemovedDim++;
958 }
959 return RemovedDim;
960 }
961
962 /// Internal function to pass data to/from the target.
targetDataUpdate(ident_t * loc,DeviceTy & Device,int32_t ArgNum,void ** ArgsBase,void ** Args,int64_t * ArgSizes,int64_t * ArgTypes,map_var_info_t * ArgNames,void ** ArgMappers,AsyncInfoTy & AsyncInfo,bool)963 int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
964 void **ArgsBase, void **Args, int64_t *ArgSizes,
965 int64_t *ArgTypes, map_var_info_t *ArgNames,
966 void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
967 // process each input.
968 for (int32_t I = 0; I < ArgNum; ++I) {
969 if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
970 (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
971 continue;
972
973 if (ArgMappers && ArgMappers[I]) {
974 // Instead of executing the regular path of targetDataUpdate, call the
975 // targetDataMapper variant which will call targetDataUpdate again
976 // with new arguments.
977 DP("Calling targetDataMapper for the %dth argument\n", I);
978
979 map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
980 int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
981 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
982 targetDataUpdate);
983
984 if (Ret != OFFLOAD_SUCCESS) {
985 REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
986 " failed.\n");
987 return OFFLOAD_FAIL;
988 }
989
990 // Skip the rest of this function, continue to the next argument.
991 continue;
992 }
993
994 int Ret = OFFLOAD_SUCCESS;
995
996 if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
997 __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
998 int32_t DimSize = ArgSizes[I];
999 uint64_t Size =
1000 NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
1001 int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
1002 Ret = targetDataNonContiguous(
1003 loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
1004 /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
1005 } else {
1006 Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
1007 ArgTypes[I], AsyncInfo);
1008 }
1009 if (Ret == OFFLOAD_FAIL)
1010 return OFFLOAD_FAIL;
1011 }
1012 return OFFLOAD_SUCCESS;
1013 }
1014
1015 static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
1016 OMP_TGT_MAPTYPE_LITERAL |
1017 OMP_TGT_MAPTYPE_IMPLICIT;
isLambdaMapping(int64_t Mapping)1018 static bool isLambdaMapping(int64_t Mapping) {
1019 return (Mapping & LambdaMapping) == LambdaMapping;
1020 }
1021
1022 namespace {
1023 /// Find the table information in the map or look it up in the translation
1024 /// tables.
getTableMap(void * HostPtr)1025 TableMap *getTableMap(void *HostPtr) {
1026 std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
1027 HostPtrToTableMapTy::iterator TableMapIt =
1028 PM->HostPtrToTableMap.find(HostPtr);
1029
1030 if (TableMapIt != PM->HostPtrToTableMap.end())
1031 return &TableMapIt->second;
1032
1033 // We don't have a map. So search all the registered libraries.
1034 TableMap *TM = nullptr;
1035 std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
1036 for (HostEntriesBeginToTransTableTy::iterator Itr =
1037 PM->HostEntriesBeginToTransTable.begin();
1038 Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
1039 // get the translation table (which contains all the good info).
1040 TranslationTable *TransTable = &Itr->second;
1041 // iterate over all the host table entries to see if we can locate the
1042 // host_ptr.
1043 __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
1044 for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
1045 if (Cur->addr != HostPtr)
1046 continue;
1047 // we got a match, now fill the HostPtrToTableMap so that we
1048 // may avoid this search next time.
1049 TM = &(PM->HostPtrToTableMap)[HostPtr];
1050 TM->Table = TransTable;
1051 TM->Index = I;
1052 return TM;
1053 }
1054 }
1055
1056 return nullptr;
1057 }
1058
1059 /// Get loop trip count
1060 /// FIXME: This function will not work right if calling
1061 /// __kmpc_push_target_tripcount_mapper in one thread but doing offloading in
1062 /// another thread, which might occur when we call task yield.
getLoopTripCount(int64_t DeviceId)1063 uint64_t getLoopTripCount(int64_t DeviceId) {
1064 DeviceTy &Device = PM->Devices[DeviceId];
1065 uint64_t LoopTripCount = 0;
1066
1067 {
1068 std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
1069 auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
1070 if (I != Device.LoopTripCnt.end()) {
1071 LoopTripCount = I->second;
1072 Device.LoopTripCnt.erase(I);
1073 DP("loop trip count is %" PRIu64 ".\n", LoopTripCount);
1074 }
1075 }
1076
1077 return LoopTripCount;
1078 }
1079
/// A class manages private arguments in a target region: large or
/// non-first-private arguments are allocated and transferred immediately,
/// while small first-private arguments are collected and later packed into a
/// single buffer/transfer by packAndTransfer().
class PrivateArgumentManagerTy {
  /// A data structure for the information of first-private arguments. We can
  /// use this information to optimize data transfer by packing all
  /// first-private arguments and transfer them all at once.
  struct FirstPrivateArgInfoTy {
    /// The index of the element in \p TgtArgs corresponding to the argument
    const int Index;
    /// Host pointer begin
    const char *HstPtrBegin;
    /// Host pointer end
    const char *HstPtrEnd;
    /// Aligned size
    /// NOTE(review): `Size + Size % Alignment` is not a round-up to a
    /// multiple of `Alignment` for all sizes — confirm this padding scheme
    /// is the intended one (it is used consistently for packing offsets).
    const int64_t AlignedSize;
    /// Host pointer name
    const map_var_info_t HstPtrName = nullptr;

    /// \param Index      Position in TgtArgs to patch after packing.
    /// \param HstPtr     Host address of the argument data.
    /// \param Size       Argument size in bytes.
    /// \param HstPtrName Optional name for debugging/reporting.
    FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size,
                          const map_var_info_t HstPtrName = nullptr)
        : Index(Index), HstPtrBegin(reinterpret_cast<const char *>(HstPtr)),
          HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment),
          HstPtrName(HstPtrName) {}
  };

  /// A vector of target pointers for all private arguments
  std::vector<void *> TgtPtrs;

  /// A vector of information of all first-private arguments to be packed
  std::vector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
  /// Host buffer for all arguments to be packed
  std::vector<char> FirstPrivateArgBuffer;
  /// The total size of all arguments to be packed
  int64_t FirstPrivateArgSize = 0;

  /// A reference to the \p DeviceTy object
  DeviceTy &Device;
  /// A pointer to a \p AsyncInfoTy object
  AsyncInfoTy &AsyncInfo;

  // TODO: What would be the best value here? Should we make it configurable?
  // If the size is larger than this threshold, we will allocate and transfer it
  // immediately instead of packing it.
  static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;

public:
  /// Constructor
  PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
      : Device(Dev), AsyncInfo(AsyncInfo) {}

  /// Add a private argument.
  ///
  /// \param HstPtr          Host address of the argument data.
  /// \param ArgSize         Size of the argument in bytes.
  /// \param ArgOffset       Offset added to the target pointer (debug only
  ///                        in this path).
  /// \param IsFirstPrivate  Whether the host value must be copied to device.
  /// \param TgtPtr          [out] Device pointer; nullptr placeholder when
  ///                        the argument is deferred for packing.
  /// \param TgtArgsIndex    Position in TgtArgs to patch after packing.
  /// \param HstPtrName      Optional name for debugging/reporting.
  /// \param AllocImmediately Force immediate allocation (e.g. when the next
  ///                        argument refers to this one via MEMBER_OF).
  /// \return OFFLOAD_SUCCESS or OFFLOAD_FAIL.
  int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
             bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
             const map_var_info_t HstPtrName = nullptr,
             const bool AllocImmediately = false) {
    // If the argument is not first-private, or its size is greater than a
    // predefined threshold, we will allocate memory and issue the transfer
    // immediately.
    if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
        AllocImmediately) {
      TgtPtr = Device.allocData(ArgSize, HstPtr);
      if (!TgtPtr) {
        DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
           (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
        return OFFLOAD_FAIL;
      }
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
         " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
         "\n",
         ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
         DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
#endif
      // If first-private, copy data from host
      if (IsFirstPrivate) {
        DP("Submitting firstprivate data to the device.\n");
        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Copying data to device failed, failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      // Remember the allocation so free() can release it later.
      TgtPtrs.push_back(TgtPtr);
    } else {
      DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
         DPxPTR(HstPtr), ArgSize);
      // When reach this point, the argument must meet all following
      // requirements:
      // 1. Its size does not exceed the threshold (see the comment for
      // FirstPrivateArgSizeThreshold);
      // 2. It must be first-private (needs to be mapped to target device).
      // We will pack all this kind of arguments to transfer them all at once
      // to reduce the number of data transfer. We will not take
      // non-first-private arguments, aka. private arguments that doesn't need
      // to be mapped to target device, into account because data allocation
      // can be very efficient with memory manager.

      // Placeholder value; packAndTransfer() patches TgtArgs[TgtArgsIndex]
      // with the real device address later.
      TgtPtr = nullptr;
      FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
                                       HstPtrName);
      FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize;
    }

    return OFFLOAD_SUCCESS;
  }

  /// Pack first-private arguments, replace place holder pointers in \p TgtArgs,
  /// and start the transfer.
  ///
  /// \return OFFLOAD_SUCCESS or OFFLOAD_FAIL.
  int packAndTransfer(std::vector<void *> &TgtArgs) {
    if (!FirstPrivateArgInfo.empty()) {
      assert(FirstPrivateArgSize != 0 &&
             "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
      FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
      auto Itr = FirstPrivateArgBuffer.begin();
      // Copy all host data to this buffer, advancing by the padded size so
      // each argument keeps the offset recorded at addArg() time.
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
        Itr = std::next(Itr, Info.AlignedSize);
      }
      // Allocate target memory
      void *TgtPtr =
          Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
      if (TgtPtr == nullptr) {
        DP("Failed to allocate target memory for private arguments.\n");
        return OFFLOAD_FAIL;
      }
      TgtPtrs.push_back(TgtPtr);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
         FirstPrivateArgSize, DPxPTR(TgtPtr));
      // Transfer data to target device
      int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
                                  FirstPrivateArgSize, AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Failed to submit data of private arguments.\n");
        return OFFLOAD_FAIL;
      }
      // Fill in all placeholder pointers with the address of the packed copy
      // of the corresponding argument inside the device buffer.
      auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        void *&Ptr = TgtArgs[Info.Index];
        assert(Ptr == nullptr && "Target pointer is already set by mistaken");
        Ptr = reinterpret_cast<void *>(TP);
        TP += Info.AlignedSize;
        DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
           "\n",
           DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
           DPxPTR(Ptr));
      }
    }

    return OFFLOAD_SUCCESS;
  }

  /// Free all target memory allocated for private arguments
  ///
  /// \return OFFLOAD_SUCCESS, or OFFLOAD_FAIL on the first failed delete.
  int free() {
    for (void *P : TgtPtrs) {
      int Ret = Device.deleteData(P);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Deallocation of (first-)private arrays failed.\n");
        return OFFLOAD_FAIL;
      }
    }

    TgtPtrs.clear();

    return OFFLOAD_SUCCESS;
  }
};
1249
/// Process data before launching the kernel, including calling targetDataBegin
/// to map and transfer data to target device, transferring (first-)private
/// variables.
///
/// On success, \p TgtArgs and \p TgtOffsets hold one entry per target
/// parameter and \p PrivateArgumentManager owns any device memory allocated
/// for (first-)private arguments.
static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
                             int32_t ArgNum, void **ArgBases, void **Args,
                             int64_t *ArgSizes, int64_t *ArgTypes,
                             map_var_info_t *ArgNames, void **ArgMappers,
                             std::vector<void *> &TgtArgs,
                             std::vector<ptrdiff_t> &TgtOffsets,
                             PrivateArgumentManagerTy &PrivateArgumentManager,
                             AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc);
  DeviceTy &Device = PM->Devices[DeviceId];
  // Map and transfer all mapped arguments to the device first.
  int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
                            ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataBegin failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // List of (first-)private arrays allocated for this target region
  // TgtArgsPositions[I] records argument I's position in TgtArgs so that
  // later MEMBER_OF/lambda entries can locate their parent's target address;
  // -1 means "not a target parameter".
  std::vector<int> TgtArgsPositions(ArgNum, -1);

  for (int32_t I = 0; I < ArgNum; ++I) {
    if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
      // This is not a target parameter, do not push it into TgtArgs.
      // Check for lambda mapping.
      if (isLambdaMapping(ArgTypes[I])) {
        assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
               "PTR_AND_OBJ must be also MEMBER_OF.");
        unsigned Idx = getParentIndex(ArgTypes[I]);
        int TgtIdx = TgtArgsPositions[Idx];
        assert(TgtIdx != -1 && "Base address must be translated already.");
        // The parent lambda must be processed already and it must be the last
        // in TgtArgs and TgtOffsets arrays.
        void *HstPtrVal = Args[I];
        void *HstPtrBegin = ArgBases[I];
        void *HstPtrBase = Args[Idx];
        bool IsLast, IsHostPtr; // unused.
        // Reconstruct the parent's device base, then apply the capture's
        // offset within the lambda object to get its device location.
        void *TgtPtrBase =
            (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
        DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
        uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
        void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
        // The translated pointer value must outlive this scope in case the
        // submit below is asynchronous, so store it in AsyncInfo's buffer.
        void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
        PointerTgtPtrBegin = Device.getTgtPtrBegin(HstPtrVal, ArgSizes[I],
                                                   IsLast, false, IsHostPtr);
        if (!PointerTgtPtrBegin) {
          DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
            TgtPtrBegin == HstPtrBegin) {
          DP("Unified memory is active, no need to map lambda captured"
             "variable (" DPxMOD ")\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
        // Patch the device copy of the captured pointer with the translated
        // device address.
        Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
                                sizeof(void *), AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      continue;
    }
    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgBases[I];
    void *TgtPtrBegin;
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
    ptrdiff_t TgtBaseOffset;
    bool IsLast, IsHostPtr; // unused.
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
      // Literal arguments are passed by value: the "pointer" is the value.
      DP("Forwarding first-private value " DPxMOD " to the target construct\n",
         DPxPTR(HstPtrBase));
      TgtPtrBegin = HstPtrBase;
      TgtBaseOffset = 0;
    } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
      const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
      // If there is a next argument and it depends on the current one, we need
      // to allocate the private memory immediately. If this is not the case,
      // then the argument can be marked for optimization and packed with the
      // other privates.
      const bool AllocImmediately =
          (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
      Ret = PrivateArgumentManager.addArg(
          HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
          TgtArgs.size(), HstPtrName, AllocImmediately);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Failed to process %sprivate argument " DPxMOD "\n",
               (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
        return OFFLOAD_FAIL;
      }
    } else {
      // Regular mapped argument: look up the device address established by
      // targetDataBegin above.
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
        HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
      TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast,
                                          false, IsHostPtr);
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
         DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
#endif
    }
    TgtArgsPositions[I] = TgtArgs.size();
    TgtArgs.push_back(TgtPtrBegin);
    TgtOffsets.push_back(TgtBaseOffset);
  }

  assert(TgtArgs.size() == TgtOffsets.size() &&
         "Size mismatch in arguments and offsets");

  // Pack and transfer first-private arguments
  Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
  if (Ret != OFFLOAD_SUCCESS) {
    DP("Failed to pack and transfer first private arguments\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
1377
1378 /// Process data after launching the kernel, including transferring data back to
1379 /// host if needed and deallocating target memory of (first-)private variables.
processDataAfter(ident_t * loc,int64_t DeviceId,void * HostPtr,int32_t ArgNum,void ** ArgBases,void ** Args,int64_t * ArgSizes,int64_t * ArgTypes,map_var_info_t * ArgNames,void ** ArgMappers,PrivateArgumentManagerTy & PrivateArgumentManager,AsyncInfoTy & AsyncInfo)1380 static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr,
1381 int32_t ArgNum, void **ArgBases, void **Args,
1382 int64_t *ArgSizes, int64_t *ArgTypes,
1383 map_var_info_t *ArgNames, void **ArgMappers,
1384 PrivateArgumentManagerTy &PrivateArgumentManager,
1385 AsyncInfoTy &AsyncInfo) {
1386 TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc);
1387 DeviceTy &Device = PM->Devices[DeviceId];
1388
1389 // Move data from device.
1390 int Ret = targetDataEnd(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
1391 ArgTypes, ArgNames, ArgMappers, AsyncInfo);
1392 if (Ret != OFFLOAD_SUCCESS) {
1393 REPORT("Call to targetDataEnd failed, abort target.\n");
1394 return OFFLOAD_FAIL;
1395 }
1396
1397 // Free target memory for private arguments
1398 Ret = PrivateArgumentManager.free();
1399 if (Ret != OFFLOAD_SUCCESS) {
1400 REPORT("Failed to deallocate target memory for private args\n");
1401 return OFFLOAD_FAIL;
1402 }
1403
1404 return OFFLOAD_SUCCESS;
1405 }
1406 } // namespace
1407
1408 /// performs the same actions as data_begin in case arg_num is
1409 /// non-zero and initiates run of the offloaded region on the target platform;
1410 /// if arg_num is non-zero after the region execution is done it also
1411 /// performs the same action as data_update and data_end above. This function
1412 /// returns 0 if it was able to transfer the execution to a target and an
1413 /// integer different from zero otherwise.
target(ident_t * loc,DeviceTy & Device,void * HostPtr,int32_t ArgNum,void ** ArgBases,void ** Args,int64_t * ArgSizes,int64_t * ArgTypes,map_var_info_t * ArgNames,void ** ArgMappers,int32_t TeamNum,int32_t ThreadLimit,int IsTeamConstruct,AsyncInfoTy & AsyncInfo)1414 int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
1415 void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
1416 map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
1417 int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) {
1418 int32_t DeviceId = Device.DeviceID;
1419
1420 TableMap *TM = getTableMap(HostPtr);
1421 // No map for this host pointer found!
1422 if (!TM) {
1423 REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
1424 DPxPTR(HostPtr));
1425 return OFFLOAD_FAIL;
1426 }
1427
1428 // get target table.
1429 __tgt_target_table *TargetTable = nullptr;
1430 {
1431 std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
1432 assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
1433 "Not expecting a device ID outside the table's bounds!");
1434 TargetTable = TM->Table->TargetsTable[DeviceId];
1435 }
1436 assert(TargetTable && "Global data has not been mapped\n");
1437
1438 std::vector<void *> TgtArgs;
1439 std::vector<ptrdiff_t> TgtOffsets;
1440
1441 PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);
1442
1443 int Ret;
1444 if (ArgNum) {
1445 // Process data, such as data mapping, before launching the kernel
1446 Ret = processDataBefore(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
1447 ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs,
1448 TgtOffsets, PrivateArgumentManager, AsyncInfo);
1449 if (Ret != OFFLOAD_SUCCESS) {
1450 REPORT("Failed to process data before launching the kernel.\n");
1451 return OFFLOAD_FAIL;
1452 }
1453 }
1454
1455 // Get loop trip count
1456 uint64_t LoopTripCount = getLoopTripCount(DeviceId);
1457
1458 // Launch device execution.
1459 void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
1460 DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
1461 TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
1462
1463 {
1464 TIMESCOPE_WITH_NAME_AND_IDENT(
1465 IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc);
1466 if (IsTeamConstruct)
1467 Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
1468 TgtArgs.size(), TeamNum, ThreadLimit,
1469 LoopTripCount, AsyncInfo);
1470 else
1471 Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
1472 TgtArgs.size(), AsyncInfo);
1473 }
1474
1475 if (Ret != OFFLOAD_SUCCESS) {
1476 REPORT("Executing target region abort target.\n");
1477 return OFFLOAD_FAIL;
1478 }
1479
1480 if (ArgNum) {
1481 // Transfer data back and deallocate target memory for (first-)private
1482 // variables
1483 Ret = processDataAfter(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
1484 ArgSizes, ArgTypes, ArgNames, ArgMappers,
1485 PrivateArgumentManager, AsyncInfo);
1486 if (Ret != OFFLOAD_SUCCESS) {
1487 REPORT("Failed to process data after launching the kernel.\n");
1488 return OFFLOAD_FAIL;
1489 }
1490 }
1491
1492 return OFFLOAD_SUCCESS;
1493 }
1494