1 //===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Implementation of the interface to be used by Clang during the codegen of a
10 // target region.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "omptarget.h"
15 #include "device.h"
16 #include "private.h"
17 #include "rtl.h"
18 
19 #include <cassert>
20 #include <vector>
21 
synchronize()22 int AsyncInfoTy::synchronize() {
23   int Result = OFFLOAD_SUCCESS;
24   if (AsyncInfo.Queue) {
25     // If we have a queue we need to synchronize it now.
26     Result = Device.synchronize(*this);
27     assert(AsyncInfo.Queue == nullptr &&
28            "The device plugin should have nulled the queue to indicate there "
29            "are no outstanding actions!");
30   }
31   return Result;
32 }
33 
getVoidPtrLocation()34 void *&AsyncInfoTy::getVoidPtrLocation() {
35   BufferLocations.push_back(nullptr);
36   return BufferLocations.back();
37 }
38 
39 /* All begin addresses for partially mapped structs must be 8-aligned in order
40  * to ensure proper alignment of members. E.g.
41  *
42  * struct S {
43  *   int a;   // 4-aligned
44  *   int b;   // 4-aligned
45  *   int *p;  // 8-aligned
46  * } s1;
47  * ...
48  * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
49  * {
50  *   s1.b = 5;
51  *   for (int i...) s1.p[i] = ...;
52  * }
53  *
54  * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
55  * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
56  * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
57  * requirements for its type. Now, when we allocate memory on the device, in
58  * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
59  * This means that the chunk of the struct on the device will start at a
60  * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
61  * address of p will be a misaligned 0x204 (on the host there was no need to add
62  * padding between b and p, so p comes exactly 4 bytes after b). If the device
63  * kernel tries to access s1.p, a misaligned address error occurs (as reported
64  * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
65  * extending the size of the allocated chuck accordingly, the chuck on the
66  * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
67  * &s1.p=0x208, as they should be to satisfy the alignment requirements.
68  */
69 static const int64_t Alignment = 8;
70 
71 /// Map global data and execute pending ctors
InitLibrary(DeviceTy & Device)72 static int InitLibrary(DeviceTy &Device) {
73   /*
74    * Map global data
75    */
76   int32_t device_id = Device.DeviceID;
77   int rc = OFFLOAD_SUCCESS;
78   bool supportsEmptyImages = Device.RTL->supports_empty_images &&
79                              Device.RTL->supports_empty_images() > 0;
80 
81   Device.PendingGlobalsMtx.lock();
82   PM->TrlTblMtx.lock();
83   for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
84     TranslationTable *TransTable =
85         &PM->HostEntriesBeginToTransTable[HostEntriesBegin];
86     if (TransTable->HostTable.EntriesBegin ==
87             TransTable->HostTable.EntriesEnd &&
88         !supportsEmptyImages) {
89       // No host entry so no need to proceed
90       continue;
91     }
92 
93     if (TransTable->TargetsTable[device_id] != 0) {
94       // Library entries have already been processed
95       continue;
96     }
97 
98     // 1) get image.
99     assert(TransTable->TargetsImages.size() > (size_t)device_id &&
100            "Not expecting a device ID outside the table's bounds!");
101     __tgt_device_image *img = TransTable->TargetsImages[device_id];
102     if (!img) {
103       REPORT("No image loaded for device id %d.\n", device_id);
104       rc = OFFLOAD_FAIL;
105       break;
106     }
107     // 2) load image into the target table.
108     __tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] =
109         Device.load_binary(img);
110     // Unable to get table for this image: invalidate image and fail.
111     if (!TargetTable) {
112       REPORT("Unable to generate entries table for device id %d.\n", device_id);
113       TransTable->TargetsImages[device_id] = 0;
114       rc = OFFLOAD_FAIL;
115       break;
116     }
117 
118     // Verify whether the two table sizes match.
119     size_t hsize =
120         TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
121     size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;
122 
123     // Invalid image for these host entries!
124     if (hsize != tsize) {
125       REPORT("Host and Target tables mismatch for device id %d [%zx != %zx].\n",
126              device_id, hsize, tsize);
127       TransTable->TargetsImages[device_id] = 0;
128       TransTable->TargetsTable[device_id] = 0;
129       rc = OFFLOAD_FAIL;
130       break;
131     }
132 
133     // process global data that needs to be mapped.
134     Device.DataMapMtx.lock();
135     __tgt_target_table *HostTable = &TransTable->HostTable;
136     for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
137                              *CurrHostEntry = HostTable->EntriesBegin,
138                              *EntryDeviceEnd = TargetTable->EntriesEnd;
139          CurrDeviceEntry != EntryDeviceEnd;
140          CurrDeviceEntry++, CurrHostEntry++) {
141       if (CurrDeviceEntry->size != 0) {
142         // has data.
143         assert(CurrDeviceEntry->size == CurrHostEntry->size &&
144                "data size mismatch");
145 
146         // Fortran may use multiple weak declarations for the same symbol,
147         // therefore we must allow for multiple weak symbols to be loaded from
148         // the fat binary. Treat these mappings as any other "regular" mapping.
149         // Add entry to map.
150         if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size))
151           continue;
152         DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
153            "\n",
154            DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
155            CurrDeviceEntry->size);
156         Device.HostDataToTargetMap.emplace(
157             (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
158             (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
159             (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
160             (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/, nullptr,
161             true /*IsRefCountINF*/);
162       }
163     }
164     Device.DataMapMtx.unlock();
165   }
166   PM->TrlTblMtx.unlock();
167 
168   if (rc != OFFLOAD_SUCCESS) {
169     Device.PendingGlobalsMtx.unlock();
170     return rc;
171   }
172 
173   /*
174    * Run ctors for static objects
175    */
176   if (!Device.PendingCtorsDtors.empty()) {
177     AsyncInfoTy AsyncInfo(Device);
178     // Call all ctors for all libraries registered so far
179     for (auto &lib : Device.PendingCtorsDtors) {
180       if (!lib.second.PendingCtors.empty()) {
181         DP("Has pending ctors... call now\n");
182         for (auto &entry : lib.second.PendingCtors) {
183           void *ctor = entry;
184           int rc =
185               target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr,
186                      nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo);
187           if (rc != OFFLOAD_SUCCESS) {
188             REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
189             Device.PendingGlobalsMtx.unlock();
190             return OFFLOAD_FAIL;
191           }
192         }
193         // Clear the list to indicate that this device has been used
194         lib.second.PendingCtors.clear();
195         DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
196       }
197     }
198     // All constructors have been issued, wait for them now.
199     if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
200       return OFFLOAD_FAIL;
201   }
202   Device.HasPendingGlobals = false;
203   Device.PendingGlobalsMtx.unlock();
204 
205   return OFFLOAD_SUCCESS;
206 }
207 
handleTargetOutcome(bool Success,ident_t * Loc)208 void handleTargetOutcome(bool Success, ident_t *Loc) {
209   switch (PM->TargetOffloadPolicy) {
210   case tgt_disabled:
211     if (Success) {
212       FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
213     }
214     break;
215   case tgt_default:
216     FATAL_MESSAGE0(1, "default offloading policy must be switched to "
217                       "mandatory or disabled");
218     break;
219   case tgt_mandatory:
220     if (!Success) {
221       if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
222         for (auto &Device : PM->Devices)
223           dumpTargetPointerMappings(Loc, Device);
224       else
225         FAILURE_MESSAGE("Run with LIBOMPTARGET_INFO=%d to dump host-target "
226                         "pointer mappings.\n",
227                         OMP_INFOTYPE_DUMP_TABLE);
228 
229       SourceInfo info(Loc);
230       if (info.isAvailible())
231         fprintf(stderr, "%s:%d:%d: ", info.getFilename(), info.getLine(),
232                 info.getColumn());
233       else
234         FAILURE_MESSAGE("Source location information not present. Compile with "
235                         "-g or -gline-tables-only.\n");
236       FATAL_MESSAGE0(
237           1, "failure of target construct while offloading is mandatory");
238     } else {
239       if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
240         for (auto &Device : PM->Devices)
241           dumpTargetPointerMappings(Loc, Device);
242     }
243     break;
244   }
245 }
246 
handleDefaultTargetOffload()247 static void handleDefaultTargetOffload() {
248   PM->TargetOffloadMtx.lock();
249   if (PM->TargetOffloadPolicy == tgt_default) {
250     if (omp_get_num_devices() > 0) {
251       DP("Default TARGET OFFLOAD policy is now mandatory "
252          "(devices were found)\n");
253       PM->TargetOffloadPolicy = tgt_mandatory;
254     } else {
255       DP("Default TARGET OFFLOAD policy is now disabled "
256          "(no devices were found)\n");
257       PM->TargetOffloadPolicy = tgt_disabled;
258     }
259   }
260   PM->TargetOffloadMtx.unlock();
261 }
262 
isOffloadDisabled()263 static bool isOffloadDisabled() {
264   if (PM->TargetOffloadPolicy == tgt_default)
265     handleDefaultTargetOffload();
266   return PM->TargetOffloadPolicy == tgt_disabled;
267 }
268 
269 // If offload is enabled, ensure that device DeviceID has been initialized,
270 // global ctors have been executed, and global data has been mapped.
271 //
272 // There are three possible results:
273 // - Return OFFLOAD_SUCCESS if the device is ready for offload.
274 // - Return OFFLOAD_FAIL without reporting a runtime error if offload is
275 //   disabled, perhaps because the initial device was specified.
276 // - Report a runtime error and return OFFLOAD_FAIL.
277 //
278 // If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
279 // This step might be skipped if offload is disabled.
checkDeviceAndCtors(int64_t & DeviceID,ident_t * Loc)280 int checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) {
281   if (isOffloadDisabled()) {
282     DP("Offload is disabled\n");
283     return OFFLOAD_FAIL;
284   }
285 
286   if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
287     DeviceID = omp_get_default_device();
288     DP("Use default device id %" PRId64 "\n", DeviceID);
289   }
290 
291   // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
292   if (omp_get_num_devices() == 0) {
293     DP("omp_get_num_devices() == 0 but offload is manadatory\n");
294     handleTargetOutcome(false, Loc);
295     return OFFLOAD_FAIL;
296   }
297 
298   if (DeviceID == omp_get_initial_device()) {
299     DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
300        DeviceID);
301     return OFFLOAD_FAIL;
302   }
303 
304   // Is device ready?
305   if (!device_is_ready(DeviceID)) {
306     REPORT("Device %" PRId64 " is not ready.\n", DeviceID);
307     handleTargetOutcome(false, Loc);
308     return OFFLOAD_FAIL;
309   }
310 
311   // Get device info.
312   DeviceTy &Device = PM->Devices[DeviceID];
313 
314   // Check whether global data has been mapped for this device
315   Device.PendingGlobalsMtx.lock();
316   bool hasPendingGlobals = Device.HasPendingGlobals;
317   Device.PendingGlobalsMtx.unlock();
318   if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
319     REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID);
320     handleTargetOutcome(false, Loc);
321     return OFFLOAD_FAIL;
322   }
323 
324   return OFFLOAD_SUCCESS;
325 }
326 
getParentIndex(int64_t type)327 static int32_t getParentIndex(int64_t type) {
328   return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
329 }
330 
targetAllocExplicit(size_t size,int device_num,int kind,const char * name)331 void *targetAllocExplicit(size_t size, int device_num, int kind,
332                           const char *name) {
333   TIMESCOPE();
334   DP("Call to %s for device %d requesting %zu bytes\n", name, device_num, size);
335 
336   if (size <= 0) {
337     DP("Call to %s with non-positive length\n", name);
338     return NULL;
339   }
340 
341   void *rc = NULL;
342 
343   if (device_num == omp_get_initial_device()) {
344     rc = malloc(size);
345     DP("%s returns host ptr " DPxMOD "\n", name, DPxPTR(rc));
346     return rc;
347   }
348 
349   if (!device_is_ready(device_num)) {
350     DP("%s returns NULL ptr\n", name);
351     return NULL;
352   }
353 
354   DeviceTy &Device = PM->Devices[device_num];
355   rc = Device.allocData(size, nullptr, kind);
356   DP("%s returns device ptr " DPxMOD "\n", name, DPxPTR(rc));
357   return rc;
358 }
359 
360 /// Call the user-defined mapper function followed by the appropriate
361 // targetData* function (targetData{Begin,End,Update}).
targetDataMapper(ident_t * loc,DeviceTy & Device,void * arg_base,void * arg,int64_t arg_size,int64_t arg_type,map_var_info_t arg_names,void * arg_mapper,AsyncInfoTy & AsyncInfo,TargetDataFuncPtrTy target_data_function)362 int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
363                      int64_t arg_size, int64_t arg_type,
364                      map_var_info_t arg_names, void *arg_mapper,
365                      AsyncInfoTy &AsyncInfo,
366                      TargetDataFuncPtrTy target_data_function) {
367   TIMESCOPE_WITH_IDENT(loc);
368   DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper));
369 
370   // The mapper function fills up Components.
371   MapperComponentsTy MapperComponents;
372   MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(arg_mapper);
373   (*MapperFuncPtr)((void *)&MapperComponents, arg_base, arg, arg_size, arg_type,
374                    arg_names);
375 
376   // Construct new arrays for args_base, args, arg_sizes and arg_types
377   // using the information in MapperComponents and call the corresponding
378   // targetData* function using these new arrays.
379   std::vector<void *> MapperArgsBase(MapperComponents.Components.size());
380   std::vector<void *> MapperArgs(MapperComponents.Components.size());
381   std::vector<int64_t> MapperArgSizes(MapperComponents.Components.size());
382   std::vector<int64_t> MapperArgTypes(MapperComponents.Components.size());
383   std::vector<void *> MapperArgNames(MapperComponents.Components.size());
384 
385   for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
386     auto &C = MapperComponents.Components[I];
387     MapperArgsBase[I] = C.Base;
388     MapperArgs[I] = C.Begin;
389     MapperArgSizes[I] = C.Size;
390     MapperArgTypes[I] = C.Type;
391     MapperArgNames[I] = C.Name;
392   }
393 
394   int rc = target_data_function(loc, Device, MapperComponents.Components.size(),
395                                 MapperArgsBase.data(), MapperArgs.data(),
396                                 MapperArgSizes.data(), MapperArgTypes.data(),
397                                 MapperArgNames.data(), /*arg_mappers*/ nullptr,
398                                 AsyncInfo, /*FromMapper=*/true);
399 
400   return rc;
401 }
402 
403 /// Internal function to do the mapping and transfer the data to the device
targetDataBegin(ident_t * loc,DeviceTy & Device,int32_t arg_num,void ** args_base,void ** args,int64_t * arg_sizes,int64_t * arg_types,map_var_info_t * arg_names,void ** arg_mappers,AsyncInfoTy & AsyncInfo,bool FromMapper)404 int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
405                     void **args_base, void **args, int64_t *arg_sizes,
406                     int64_t *arg_types, map_var_info_t *arg_names,
407                     void **arg_mappers, AsyncInfoTy &AsyncInfo,
408                     bool FromMapper) {
409   // process each input.
410   for (int32_t i = 0; i < arg_num; ++i) {
411     // Ignore private variables and arrays - there is no mapping for them.
412     if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
413         (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
414       continue;
415 
416     if (arg_mappers && arg_mappers[i]) {
417       // Instead of executing the regular path of targetDataBegin, call the
418       // targetDataMapper variant which will call targetDataBegin again
419       // with new arguments.
420       DP("Calling targetDataMapper for the %dth argument\n", i);
421 
422       map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i];
423       int rc = targetDataMapper(loc, Device, args_base[i], args[i],
424                                 arg_sizes[i], arg_types[i], arg_name,
425                                 arg_mappers[i], AsyncInfo, targetDataBegin);
426 
427       if (rc != OFFLOAD_SUCCESS) {
428         REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
429                " failed.\n");
430         return OFFLOAD_FAIL;
431       }
432 
433       // Skip the rest of this function, continue to the next argument.
434       continue;
435     }
436 
437     void *HstPtrBegin = args[i];
438     void *HstPtrBase = args_base[i];
439     int64_t data_size = arg_sizes[i];
440     map_var_info_t HstPtrName = (!arg_names) ? nullptr : arg_names[i];
441 
442     // Adjust for proper alignment if this is a combined entry (for structs).
443     // Look at the next argument - if that is MEMBER_OF this one, then this one
444     // is a combined entry.
445     int64_t padding = 0;
446     const int next_i = i + 1;
447     if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num &&
448         getParentIndex(arg_types[next_i]) == i) {
449       padding = (int64_t)HstPtrBegin % Alignment;
450       if (padding) {
451         DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
452            "\n",
453            padding, DPxPTR(HstPtrBegin));
454         HstPtrBegin = (char *)HstPtrBegin - padding;
455         data_size += padding;
456       }
457     }
458 
459     // Address of pointer on the host and device, respectively.
460     void *Pointer_HstPtrBegin, *PointerTgtPtrBegin;
461     TargetPointerResultTy Pointer_TPR;
462     bool IsHostPtr = false;
463     bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
464     // Force the creation of a device side copy of the data when:
465     // a close map modifier was associated with a map that contained a to.
466     bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE;
467     bool HasPresentModifier = arg_types[i] & OMP_TGT_MAPTYPE_PRESENT;
468     // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
469     // have reached this point via __tgt_target_data_begin and not __tgt_target
470     // then no argument is marked as TARGET_PARAM ("omp target data map" is not
471     // associated with a target region, so there are no target parameters). This
472     // may be considered a hack, we could revise the scheme in the future.
473     bool UpdateRef =
474         !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && i == 0);
475     if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
476       DP("Has a pointer entry: \n");
477       // Base is address of pointer.
478       //
479       // Usually, the pointer is already allocated by this time.  For example:
480       //
481       //   #pragma omp target map(s.p[0:N])
482       //
483       // The map entry for s comes first, and the PTR_AND_OBJ entry comes
484       // afterward, so the pointer is already allocated by the time the
485       // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
486       // non-null.  However, "declare target link" can produce a PTR_AND_OBJ
487       // entry for a global that might not already be allocated by the time the
488       // PTR_AND_OBJ entry is handled below, and so the allocation might fail
489       // when HasPresentModifier.
490       Pointer_TPR = Device.getTargetPointer(
491           HstPtrBase, HstPtrBase, sizeof(void *), nullptr,
492           MoveDataStateTy::NONE, IsImplicit, UpdateRef, HasCloseModifier,
493           HasPresentModifier, AsyncInfo);
494       PointerTgtPtrBegin = Pointer_TPR.TargetPointer;
495       IsHostPtr = Pointer_TPR.Flags.IsHostPointer;
496       if (!PointerTgtPtrBegin) {
497         REPORT("Call to getOrAllocTgtPtr returned null pointer (%s).\n",
498                HasPresentModifier ? "'present' map type modifier"
499                                   : "device failure or illegal mapping");
500         return OFFLOAD_FAIL;
501       }
502       DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
503          "\n",
504          sizeof(void *), DPxPTR(PointerTgtPtrBegin),
505          (Pointer_TPR.Flags.IsNewEntry ? "" : " not"));
506       Pointer_HstPtrBegin = HstPtrBase;
507       // modify current entry.
508       HstPtrBase = *(void **)HstPtrBase;
509       // No need to update pointee ref count for the first element of the
510       // subelement that comes from mapper.
511       UpdateRef =
512           (!FromMapper || i != 0); // subsequently update ref count of pointee
513     }
514 
515     MoveDataStateTy MoveData = MoveDataStateTy::NONE;
516     const bool UseUSM = PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
517     const bool HasFlagTo = arg_types[i] & OMP_TGT_MAPTYPE_TO;
518     const bool HasFlagAlways = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
519     if (HasFlagTo && (!UseUSM || HasCloseModifier))
520       MoveData = HasFlagAlways ? MoveDataStateTy::REQUIRED
521                                : MoveData = MoveDataStateTy::UNKNOWN;
522 
523     auto TPR = Device.getTargetPointer(
524         HstPtrBegin, HstPtrBase, data_size, HstPtrName, MoveData, IsImplicit,
525         UpdateRef, HasCloseModifier, HasPresentModifier, AsyncInfo);
526     void *TgtPtrBegin = TPR.TargetPointer;
527     IsHostPtr = TPR.Flags.IsHostPointer;
528     // If data_size==0, then the argument could be a zero-length pointer to
529     // NULL, so getOrAlloc() returning NULL is not an error.
530     if (!TgtPtrBegin && (data_size || HasPresentModifier)) {
531       REPORT("Call to getOrAllocTgtPtr returned null pointer (%s).\n",
532              HasPresentModifier ? "'present' map type modifier"
533                                 : "device failure or illegal mapping");
534       return OFFLOAD_FAIL;
535     }
536     DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
537        " - is%s new\n",
538        data_size, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));
539 
540     if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
541       uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
542       void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
543       DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
544       args_base[i] = TgtPtrBase;
545     }
546 
547     if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
548       // Check whether we need to update the pointer on the device
549       bool UpdateDevPtr = false;
550 
551       uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
552       void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
553 
554       Device.ShadowMtx.lock();
555       auto Entry = Device.ShadowPtrMap.find(Pointer_HstPtrBegin);
556       // If this pointer is not in the map we need to insert it. If the map
557       // contains a stale entry, we need to update it (e.g. if the pointee was
558       // deallocated and later on is reallocated at another device address). The
559       // latter scenario is the subject of LIT test env/base_ptr_ref_count.c. An
560       // entry is removed from ShadowPtrMap only when the PTR of a PTR_AND_OBJ
561       // pair is deallocated, not when the OBJ is deallocated. In
562       // env/base_ptr_ref_count.c the PTR is a global "declare target" pointer,
563       // so it stays in the map for the lifetime of the application. When the
564       // OBJ is deallocated and later on allocated again (at a different device
565       // address), ShadowPtrMap still contains an entry for Pointer_HstPtrBegin
566       // which is stale, pointing to the old ExpectedTgtPtrBase of the OBJ.
567       if (Entry == Device.ShadowPtrMap.end() ||
568           Entry->second.TgtPtrVal != ExpectedTgtPtrBase) {
569         // create or update shadow pointers for this entry
570         Device.ShadowPtrMap[Pointer_HstPtrBegin] = {
571             HstPtrBase, PointerTgtPtrBegin, ExpectedTgtPtrBase};
572         UpdateDevPtr = true;
573       }
574 
575       if (UpdateDevPtr) {
576         Pointer_TPR.MapTableEntry->lock();
577         Device.ShadowMtx.unlock();
578 
579         DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
580            DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
581 
582         void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
583         TgtPtrBase = ExpectedTgtPtrBase;
584 
585         int rt = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
586                                    sizeof(void *), AsyncInfo);
587         Pointer_TPR.MapTableEntry->unlock();
588 
589         if (rt != OFFLOAD_SUCCESS) {
590           REPORT("Copying data to device failed.\n");
591           return OFFLOAD_FAIL;
592         }
593       } else
594         Device.ShadowMtx.unlock();
595     }
596   }
597 
598   return OFFLOAD_SUCCESS;
599 }
600 
601 namespace {
602 /// This structure contains information to deallocate a target pointer, aka.
603 /// used to call the function \p DeviceTy::deallocTgtPtr.
604 struct DeallocTgtPtrInfo {
605   /// Host pointer used to look up into the map table
606   void *HstPtrBegin;
607   /// Size of the data
608   int64_t DataSize;
609   /// Whether it has \p close modifier
610   bool HasCloseModifier;
611 
DeallocTgtPtrInfo__anon0cffc9bb0111::DeallocTgtPtrInfo612   DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool HasCloseModifier)
613       : HstPtrBegin(HstPtr), DataSize(Size),
614         HasCloseModifier(HasCloseModifier) {}
615 };
616 } // namespace
617 
618 /// Internal function to undo the mapping and retrieve the data from the device.
targetDataEnd(ident_t * loc,DeviceTy & Device,int32_t ArgNum,void ** ArgBases,void ** Args,int64_t * ArgSizes,int64_t * ArgTypes,map_var_info_t * ArgNames,void ** ArgMappers,AsyncInfoTy & AsyncInfo,bool FromMapper)619 int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
620                   void **ArgBases, void **Args, int64_t *ArgSizes,
621                   int64_t *ArgTypes, map_var_info_t *ArgNames,
622                   void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
623   int Ret;
624   std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs;
625   void *FromMapperBase = nullptr;
626   // process each input.
627   for (int32_t I = ArgNum - 1; I >= 0; --I) {
628     // Ignore private variables and arrays - there is no mapping for them.
629     // Also, ignore the use_device_ptr directive, it has no effect here.
630     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
631         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
632       continue;
633 
634     if (ArgMappers && ArgMappers[I]) {
635       // Instead of executing the regular path of targetDataEnd, call the
636       // targetDataMapper variant which will call targetDataEnd again
637       // with new arguments.
638       DP("Calling targetDataMapper for the %dth argument\n", I);
639 
640       map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
641       Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
642                              ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
643                              targetDataEnd);
644 
645       if (Ret != OFFLOAD_SUCCESS) {
646         REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
647                " failed.\n");
648         return OFFLOAD_FAIL;
649       }
650 
651       // Skip the rest of this function, continue to the next argument.
652       continue;
653     }
654 
655     void *HstPtrBegin = Args[I];
656     int64_t DataSize = ArgSizes[I];
657     // Adjust for proper alignment if this is a combined entry (for structs).
658     // Look at the next argument - if that is MEMBER_OF this one, then this one
659     // is a combined entry.
660     const int NextI = I + 1;
661     if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
662         getParentIndex(ArgTypes[NextI]) == I) {
663       int64_t Padding = (int64_t)HstPtrBegin % Alignment;
664       if (Padding) {
665         DP("Using a Padding of %" PRId64 " bytes for begin address " DPxMOD
666            "\n",
667            Padding, DPxPTR(HstPtrBegin));
668         HstPtrBegin = (char *)HstPtrBegin - Padding;
669         DataSize += Padding;
670       }
671     }
672 
673     bool IsLast, IsHostPtr;
674     bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
675     bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
676                       (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
677                      !(FromMapper && I == 0);
678     bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
679     bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE;
680     bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
681 
682     // If PTR_AND_OBJ, HstPtrBegin is address of pointee
683     void *TgtPtrBegin =
684         Device.getTgtPtrBegin(HstPtrBegin, DataSize, IsLast, UpdateRef,
685                               IsHostPtr, !IsImplicit, ForceDelete);
686     if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
687       DP("Mapping does not exist (%s)\n",
688          (HasPresentModifier ? "'present' map type modifier" : "ignored"));
689       if (HasPresentModifier) {
690         // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
691         // "If a map clause appears on a target, target data, target enter data
692         // or target exit data construct with a present map-type-modifier then
693         // on entry to the region if the corresponding list item does not appear
694         // in the device data environment then an error occurs and the program
695         // terminates."
696         //
697         // This should be an error upon entering an "omp target exit data".  It
698         // should not be an error upon exiting an "omp target data" or "omp
699         // target".  For "omp target data", Clang thus doesn't include present
700         // modifiers for end calls.  For "omp target", we have not found a valid
701         // OpenMP program for which the error matters: it appears that, if a
702         // program can guarantee that data is present at the beginning of an
703         // "omp target" region so that there's no error there, that data is also
704         // guaranteed to be present at the end.
705         MESSAGE("device mapping required by 'present' map type modifier does "
706                 "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
707                 DPxPTR(HstPtrBegin), DataSize);
708         return OFFLOAD_FAIL;
709       }
710     } else {
711       DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
712          " - is%s last\n",
713          DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not"));
714     }
715 
716     // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
717     // "If the map clause appears on a target, target data, or target exit data
718     // construct and a corresponding list item of the original list item is not
719     // present in the device data environment on exit from the region then the
720     // list item is ignored."
721     if (!TgtPtrBegin)
722       continue;
723 
724     bool DelEntry = IsLast;
725 
726     // If the last element from the mapper (for end transfer args comes in
727     // reverse order), do not remove the partial entry, the parent struct still
728     // exists.
729     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
730         !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
731       DelEntry = false; // protect parent struct from being deallocated
732     }
733 
734     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
735       // Move data back to the host
736       if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
737         bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
738         bool CopyMember = false;
739         if (!(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) ||
740             HasCloseModifier) {
741           if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
742               !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ))
743             CopyMember = IsLast;
744         }
745 
746         if ((DelEntry || Always || CopyMember) &&
747             !(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
748               TgtPtrBegin == HstPtrBegin)) {
749           DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
750              DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
751           Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
752                                     AsyncInfo);
753           if (Ret != OFFLOAD_SUCCESS) {
754             REPORT("Copying data from device failed.\n");
755             return OFFLOAD_FAIL;
756           }
757         }
758       }
759       if (DelEntry && FromMapper && I == 0) {
760         DelEntry = false;
761         FromMapperBase = HstPtrBegin;
762       }
763 
764       // If we copied back to the host a struct/array containing pointers, we
765       // need to restore the original host pointer values from their shadow
766       // copies. If the struct is going to be deallocated, remove any remaining
767       // shadow pointer entries for this struct.
768       uintptr_t LB = (uintptr_t)HstPtrBegin;
769       uintptr_t UB = (uintptr_t)HstPtrBegin + DataSize;
770       Device.ShadowMtx.lock();
771       for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
772            Itr != Device.ShadowPtrMap.end();) {
773         void **ShadowHstPtrAddr = (void **)Itr->first;
774 
775         // An STL map is sorted on its keys; use this property
776         // to quickly determine when to break out of the loop.
777         if ((uintptr_t)ShadowHstPtrAddr < LB) {
778           ++Itr;
779           continue;
780         }
781         if ((uintptr_t)ShadowHstPtrAddr >= UB)
782           break;
783 
784         // If we copied the struct to the host, we need to restore the pointer.
785         if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
786           DP("Restoring original host pointer value " DPxMOD " for host "
787              "pointer " DPxMOD "\n",
788              DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
789           *ShadowHstPtrAddr = Itr->second.HstPtrVal;
790         }
791         // If the struct is to be deallocated, remove the shadow entry.
792         if (DelEntry) {
793           DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr));
794           Itr = Device.ShadowPtrMap.erase(Itr);
795         } else {
796           ++Itr;
797         }
798       }
799       Device.ShadowMtx.unlock();
800 
801       // Add pointer to the buffer for later deallocation
802       if (DelEntry)
803         DeallocTgtPtrs.emplace_back(HstPtrBegin, DataSize, HasCloseModifier);
804     }
805   }
806 
807   // TODO: We should not synchronize here but pass the AsyncInfo object to the
808   //       allocate/deallocate device APIs.
809   //
810   // We need to synchronize before deallocating data.
811   Ret = AsyncInfo.synchronize();
812   if (Ret != OFFLOAD_SUCCESS)
813     return OFFLOAD_FAIL;
814 
815   // Deallocate target pointer
816   for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) {
817     if (FromMapperBase && FromMapperBase == Info.HstPtrBegin)
818       continue;
819     Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize,
820                                Info.HasCloseModifier);
821     if (Ret != OFFLOAD_SUCCESS) {
822       REPORT("Deallocating data from device failed.\n");
823       return OFFLOAD_FAIL;
824     }
825   }
826 
827   return OFFLOAD_SUCCESS;
828 }
829 
targetDataContiguous(ident_t * loc,DeviceTy & Device,void * ArgsBase,void * HstPtrBegin,int64_t ArgSize,int64_t ArgType,AsyncInfoTy & AsyncInfo)830 static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
831                                 void *HstPtrBegin, int64_t ArgSize,
832                                 int64_t ArgType, AsyncInfoTy &AsyncInfo) {
833   TIMESCOPE_WITH_IDENT(loc);
834   bool IsLast, IsHostPtr;
835   void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSize, IsLast, false,
836                                             IsHostPtr, /*MustContain=*/true);
837   if (!TgtPtrBegin) {
838     DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
839     if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
840       MESSAGE("device mapping required by 'present' motion modifier does not "
841               "exist for host address " DPxMOD " (%" PRId64 " bytes)",
842               DPxPTR(HstPtrBegin), ArgSize);
843       return OFFLOAD_FAIL;
844     }
845     return OFFLOAD_SUCCESS;
846   }
847 
848   if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
849       TgtPtrBegin == HstPtrBegin) {
850     DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
851        DPxPTR(HstPtrBegin));
852     return OFFLOAD_SUCCESS;
853   }
854 
855   if (ArgType & OMP_TGT_MAPTYPE_FROM) {
856     DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
857        ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
858     int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo);
859     if (Ret != OFFLOAD_SUCCESS) {
860       REPORT("Copying data from device failed.\n");
861       return OFFLOAD_FAIL;
862     }
863 
864     uintptr_t LB = (uintptr_t)HstPtrBegin;
865     uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
866     Device.ShadowMtx.lock();
867     for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
868          IT != Device.ShadowPtrMap.end(); ++IT) {
869       void **ShadowHstPtrAddr = (void **)IT->first;
870       if ((uintptr_t)ShadowHstPtrAddr < LB)
871         continue;
872       if ((uintptr_t)ShadowHstPtrAddr >= UB)
873         break;
874       DP("Restoring original host pointer value " DPxMOD
875          " for host pointer " DPxMOD "\n",
876          DPxPTR(IT->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
877       *ShadowHstPtrAddr = IT->second.HstPtrVal;
878     }
879     Device.ShadowMtx.unlock();
880   }
881 
882   if (ArgType & OMP_TGT_MAPTYPE_TO) {
883     DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
884        ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
885     int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo);
886     if (Ret != OFFLOAD_SUCCESS) {
887       REPORT("Copying data to device failed.\n");
888       return OFFLOAD_FAIL;
889     }
890 
891     uintptr_t LB = (uintptr_t)HstPtrBegin;
892     uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
893     Device.ShadowMtx.lock();
894     for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
895          IT != Device.ShadowPtrMap.end(); ++IT) {
896       void **ShadowHstPtrAddr = (void **)IT->first;
897       if ((uintptr_t)ShadowHstPtrAddr < LB)
898         continue;
899       if ((uintptr_t)ShadowHstPtrAddr >= UB)
900         break;
901       DP("Restoring original target pointer value " DPxMOD " for target "
902          "pointer " DPxMOD "\n",
903          DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr));
904       Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal,
905                               sizeof(void *), AsyncInfo);
906       if (Ret != OFFLOAD_SUCCESS) {
907         REPORT("Copying data to device failed.\n");
908         Device.ShadowMtx.unlock();
909         return OFFLOAD_FAIL;
910       }
911     }
912     Device.ShadowMtx.unlock();
913   }
914   return OFFLOAD_SUCCESS;
915 }
916 
targetDataNonContiguous(ident_t * loc,DeviceTy & Device,void * ArgsBase,__tgt_target_non_contig * NonContig,uint64_t Size,int64_t ArgType,int CurrentDim,int DimSize,uint64_t Offset,AsyncInfoTy & AsyncInfo)917 static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
918                                    void *ArgsBase,
919                                    __tgt_target_non_contig *NonContig,
920                                    uint64_t Size, int64_t ArgType,
921                                    int CurrentDim, int DimSize, uint64_t Offset,
922                                    AsyncInfoTy &AsyncInfo) {
923   TIMESCOPE_WITH_IDENT(loc);
924   int Ret = OFFLOAD_SUCCESS;
925   if (CurrentDim < DimSize) {
926     for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
927       uint64_t CurOffset =
928           (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
929       // we only need to transfer the first element for the last dimension
930       // since we've already got a contiguous piece.
931       if (CurrentDim != DimSize - 1 || I == 0) {
932         Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size,
933                                       ArgType, CurrentDim + 1, DimSize,
934                                       Offset + CurOffset, AsyncInfo);
935         // Stop the whole process if any contiguous piece returns anything
936         // other than OFFLOAD_SUCCESS.
937         if (Ret != OFFLOAD_SUCCESS)
938           return Ret;
939       }
940     }
941   } else {
942     char *Ptr = (char *)ArgsBase + Offset;
943     DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
944        " len %" PRIu64 "\n",
945        DPxPTR(Ptr), Offset, Size);
946     Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType,
947                                AsyncInfo);
948   }
949   return Ret;
950 }
951 
getNonContigMergedDimension(__tgt_target_non_contig * NonContig,int32_t DimSize)952 static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
953                                        int32_t DimSize) {
954   int RemovedDim = 0;
955   for (int I = DimSize - 1; I > 0; --I) {
956     if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
957       RemovedDim++;
958   }
959   return RemovedDim;
960 }
961 
962 /// Internal function to pass data to/from the target.
targetDataUpdate(ident_t * loc,DeviceTy & Device,int32_t ArgNum,void ** ArgsBase,void ** Args,int64_t * ArgSizes,int64_t * ArgTypes,map_var_info_t * ArgNames,void ** ArgMappers,AsyncInfoTy & AsyncInfo,bool)963 int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
964                      void **ArgsBase, void **Args, int64_t *ArgSizes,
965                      int64_t *ArgTypes, map_var_info_t *ArgNames,
966                      void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
967   // process each input.
968   for (int32_t I = 0; I < ArgNum; ++I) {
969     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
970         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
971       continue;
972 
973     if (ArgMappers && ArgMappers[I]) {
974       // Instead of executing the regular path of targetDataUpdate, call the
975       // targetDataMapper variant which will call targetDataUpdate again
976       // with new arguments.
977       DP("Calling targetDataMapper for the %dth argument\n", I);
978 
979       map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
980       int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
981                                  ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
982                                  targetDataUpdate);
983 
984       if (Ret != OFFLOAD_SUCCESS) {
985         REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
986                " failed.\n");
987         return OFFLOAD_FAIL;
988       }
989 
990       // Skip the rest of this function, continue to the next argument.
991       continue;
992     }
993 
994     int Ret = OFFLOAD_SUCCESS;
995 
996     if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
997       __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
998       int32_t DimSize = ArgSizes[I];
999       uint64_t Size =
1000           NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
1001       int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
1002       Ret = targetDataNonContiguous(
1003           loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
1004           /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
1005     } else {
1006       Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
1007                                  ArgTypes[I], AsyncInfo);
1008     }
1009     if (Ret == OFFLOAD_FAIL)
1010       return OFFLOAD_FAIL;
1011   }
1012   return OFFLOAD_SUCCESS;
1013 }
1014 
1015 static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
1016                                       OMP_TGT_MAPTYPE_LITERAL |
1017                                       OMP_TGT_MAPTYPE_IMPLICIT;
isLambdaMapping(int64_t Mapping)1018 static bool isLambdaMapping(int64_t Mapping) {
1019   return (Mapping & LambdaMapping) == LambdaMapping;
1020 }
1021 
1022 namespace {
1023 /// Find the table information in the map or look it up in the translation
1024 /// tables.
getTableMap(void * HostPtr)1025 TableMap *getTableMap(void *HostPtr) {
1026   std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
1027   HostPtrToTableMapTy::iterator TableMapIt =
1028       PM->HostPtrToTableMap.find(HostPtr);
1029 
1030   if (TableMapIt != PM->HostPtrToTableMap.end())
1031     return &TableMapIt->second;
1032 
1033   // We don't have a map. So search all the registered libraries.
1034   TableMap *TM = nullptr;
1035   std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
1036   for (HostEntriesBeginToTransTableTy::iterator Itr =
1037            PM->HostEntriesBeginToTransTable.begin();
1038        Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
1039     // get the translation table (which contains all the good info).
1040     TranslationTable *TransTable = &Itr->second;
1041     // iterate over all the host table entries to see if we can locate the
1042     // host_ptr.
1043     __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
1044     for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
1045       if (Cur->addr != HostPtr)
1046         continue;
1047       // we got a match, now fill the HostPtrToTableMap so that we
1048       // may avoid this search next time.
1049       TM = &(PM->HostPtrToTableMap)[HostPtr];
1050       TM->Table = TransTable;
1051       TM->Index = I;
1052       return TM;
1053     }
1054   }
1055 
1056   return nullptr;
1057 }
1058 
1059 /// Get loop trip count
1060 /// FIXME: This function will not work right if calling
1061 /// __kmpc_push_target_tripcount_mapper in one thread but doing offloading in
1062 /// another thread, which might occur when we call task yield.
getLoopTripCount(int64_t DeviceId)1063 uint64_t getLoopTripCount(int64_t DeviceId) {
1064   DeviceTy &Device = PM->Devices[DeviceId];
1065   uint64_t LoopTripCount = 0;
1066 
1067   {
1068     std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
1069     auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
1070     if (I != Device.LoopTripCnt.end()) {
1071       LoopTripCount = I->second;
1072       Device.LoopTripCnt.erase(I);
1073       DP("loop trip count is %" PRIu64 ".\n", LoopTripCount);
1074     }
1075   }
1076 
1077   return LoopTripCount;
1078 }
1079 
1080 /// A class manages private arguments in a target region.
1081 class PrivateArgumentManagerTy {
1082   /// A data structure for the information of first-private arguments. We can
1083   /// use this information to optimize data transfer by packing all
1084   /// first-private arguments and transfer them all at once.
1085   struct FirstPrivateArgInfoTy {
1086     /// The index of the element in \p TgtArgs corresponding to the argument
1087     const int Index;
1088     /// Host pointer begin
1089     const char *HstPtrBegin;
1090     /// Host pointer end
1091     const char *HstPtrEnd;
1092     /// Aligned size
1093     const int64_t AlignedSize;
1094     /// Host pointer name
1095     const map_var_info_t HstPtrName = nullptr;
1096 
FirstPrivateArgInfoTy__anon0cffc9bb0211::PrivateArgumentManagerTy::FirstPrivateArgInfoTy1097     FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size,
1098                           const map_var_info_t HstPtrName = nullptr)
1099         : Index(Index), HstPtrBegin(reinterpret_cast<const char *>(HstPtr)),
1100           HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment),
1101           HstPtrName(HstPtrName) {}
1102   };
1103 
1104   /// A vector of target pointers for all private arguments
1105   std::vector<void *> TgtPtrs;
1106 
1107   /// A vector of information of all first-private arguments to be packed
1108   std::vector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
1109   /// Host buffer for all arguments to be packed
1110   std::vector<char> FirstPrivateArgBuffer;
1111   /// The total size of all arguments to be packed
1112   int64_t FirstPrivateArgSize = 0;
1113 
1114   /// A reference to the \p DeviceTy object
1115   DeviceTy &Device;
1116   /// A pointer to a \p AsyncInfoTy object
1117   AsyncInfoTy &AsyncInfo;
1118 
1119   // TODO: What would be the best value here? Should we make it configurable?
1120   // If the size is larger than this threshold, we will allocate and transfer it
1121   // immediately instead of packing it.
1122   static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;
1123 
1124 public:
1125   /// Constructor
PrivateArgumentManagerTy(DeviceTy & Dev,AsyncInfoTy & AsyncInfo)1126   PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
1127       : Device(Dev), AsyncInfo(AsyncInfo) {}
1128 
1129   /// Add a private argument
addArg(void * HstPtr,int64_t ArgSize,int64_t ArgOffset,bool IsFirstPrivate,void * & TgtPtr,int TgtArgsIndex,const map_var_info_t HstPtrName=nullptr,const bool AllocImmediately=false)1130   int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
1131              bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
1132              const map_var_info_t HstPtrName = nullptr,
1133              const bool AllocImmediately = false) {
1134     // If the argument is not first-private, or its size is greater than a
1135     // predefined threshold, we will allocate memory and issue the transfer
1136     // immediately.
1137     if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
1138         AllocImmediately) {
1139       TgtPtr = Device.allocData(ArgSize, HstPtr);
1140       if (!TgtPtr) {
1141         DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
1142            (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
1143         return OFFLOAD_FAIL;
1144       }
1145 #ifdef OMPTARGET_DEBUG
1146       void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
1147       DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
1148          " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
1149          "\n",
1150          ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
1151          DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
1152 #endif
1153       // If first-private, copy data from host
1154       if (IsFirstPrivate) {
1155         DP("Submitting firstprivate data to the device.\n");
1156         int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
1157         if (Ret != OFFLOAD_SUCCESS) {
1158           DP("Copying data to device failed, failed.\n");
1159           return OFFLOAD_FAIL;
1160         }
1161       }
1162       TgtPtrs.push_back(TgtPtr);
1163     } else {
1164       DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
1165          DPxPTR(HstPtr), ArgSize);
1166       // When reach this point, the argument must meet all following
1167       // requirements:
1168       // 1. Its size does not exceed the threshold (see the comment for
1169       // FirstPrivateArgSizeThreshold);
1170       // 2. It must be first-private (needs to be mapped to target device).
1171       // We will pack all this kind of arguments to transfer them all at once
1172       // to reduce the number of data transfer. We will not take
1173       // non-first-private arguments, aka. private arguments that doesn't need
1174       // to be mapped to target device, into account because data allocation
1175       // can be very efficient with memory manager.
1176 
1177       // Placeholder value
1178       TgtPtr = nullptr;
1179       FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
1180                                        HstPtrName);
1181       FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize;
1182     }
1183 
1184     return OFFLOAD_SUCCESS;
1185   }
1186 
1187   /// Pack first-private arguments, replace place holder pointers in \p TgtArgs,
1188   /// and start the transfer.
packAndTransfer(std::vector<void * > & TgtArgs)1189   int packAndTransfer(std::vector<void *> &TgtArgs) {
1190     if (!FirstPrivateArgInfo.empty()) {
1191       assert(FirstPrivateArgSize != 0 &&
1192              "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
1193       FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
1194       auto Itr = FirstPrivateArgBuffer.begin();
1195       // Copy all host data to this buffer
1196       for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
1197         std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
1198         Itr = std::next(Itr, Info.AlignedSize);
1199       }
1200       // Allocate target memory
1201       void *TgtPtr =
1202           Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
1203       if (TgtPtr == nullptr) {
1204         DP("Failed to allocate target memory for private arguments.\n");
1205         return OFFLOAD_FAIL;
1206       }
1207       TgtPtrs.push_back(TgtPtr);
1208       DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
1209          FirstPrivateArgSize, DPxPTR(TgtPtr));
1210       // Transfer data to target device
1211       int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
1212                                   FirstPrivateArgSize, AsyncInfo);
1213       if (Ret != OFFLOAD_SUCCESS) {
1214         DP("Failed to submit data of private arguments.\n");
1215         return OFFLOAD_FAIL;
1216       }
1217       // Fill in all placeholder pointers
1218       auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
1219       for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
1220         void *&Ptr = TgtArgs[Info.Index];
1221         assert(Ptr == nullptr && "Target pointer is already set by mistaken");
1222         Ptr = reinterpret_cast<void *>(TP);
1223         TP += Info.AlignedSize;
1224         DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
1225            "\n",
1226            DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
1227            DPxPTR(Ptr));
1228       }
1229     }
1230 
1231     return OFFLOAD_SUCCESS;
1232   }
1233 
1234   /// Free all target memory allocated for private arguments
free()1235   int free() {
1236     for (void *P : TgtPtrs) {
1237       int Ret = Device.deleteData(P);
1238       if (Ret != OFFLOAD_SUCCESS) {
1239         DP("Deallocation of (first-)private arrays failed.\n");
1240         return OFFLOAD_FAIL;
1241       }
1242     }
1243 
1244     TgtPtrs.clear();
1245 
1246     return OFFLOAD_SUCCESS;
1247   }
1248 };
1249 
1250 /// Process data before launching the kernel, including calling targetDataBegin
1251 /// to map and transfer data to target device, transferring (first-)private
1252 /// variables.
processDataBefore(ident_t * loc,int64_t DeviceId,void * HostPtr,int32_t ArgNum,void ** ArgBases,void ** Args,int64_t * ArgSizes,int64_t * ArgTypes,map_var_info_t * ArgNames,void ** ArgMappers,std::vector<void * > & TgtArgs,std::vector<ptrdiff_t> & TgtOffsets,PrivateArgumentManagerTy & PrivateArgumentManager,AsyncInfoTy & AsyncInfo)1253 static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
1254                              int32_t ArgNum, void **ArgBases, void **Args,
1255                              int64_t *ArgSizes, int64_t *ArgTypes,
1256                              map_var_info_t *ArgNames, void **ArgMappers,
1257                              std::vector<void *> &TgtArgs,
1258                              std::vector<ptrdiff_t> &TgtOffsets,
1259                              PrivateArgumentManagerTy &PrivateArgumentManager,
1260                              AsyncInfoTy &AsyncInfo) {
1261   TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc);
1262   DeviceTy &Device = PM->Devices[DeviceId];
1263   int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
1264                             ArgTypes, ArgNames, ArgMappers, AsyncInfo);
1265   if (Ret != OFFLOAD_SUCCESS) {
1266     REPORT("Call to targetDataBegin failed, abort target.\n");
1267     return OFFLOAD_FAIL;
1268   }
1269 
1270   // List of (first-)private arrays allocated for this target region
1271   std::vector<int> TgtArgsPositions(ArgNum, -1);
1272 
1273   for (int32_t I = 0; I < ArgNum; ++I) {
1274     if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
1275       // This is not a target parameter, do not push it into TgtArgs.
1276       // Check for lambda mapping.
1277       if (isLambdaMapping(ArgTypes[I])) {
1278         assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
1279                "PTR_AND_OBJ must be also MEMBER_OF.");
1280         unsigned Idx = getParentIndex(ArgTypes[I]);
1281         int TgtIdx = TgtArgsPositions[Idx];
1282         assert(TgtIdx != -1 && "Base address must be translated already.");
1283         // The parent lambda must be processed already and it must be the last
1284         // in TgtArgs and TgtOffsets arrays.
1285         void *HstPtrVal = Args[I];
1286         void *HstPtrBegin = ArgBases[I];
1287         void *HstPtrBase = Args[Idx];
1288         bool IsLast, IsHostPtr; // unused.
1289         void *TgtPtrBase =
1290             (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
1291         DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
1292         uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
1293         void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
1294         void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
1295         PointerTgtPtrBegin = Device.getTgtPtrBegin(HstPtrVal, ArgSizes[I],
1296                                                    IsLast, false, IsHostPtr);
1297         if (!PointerTgtPtrBegin) {
1298           DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
1299              DPxPTR(HstPtrVal));
1300           continue;
1301         }
1302         if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
1303             TgtPtrBegin == HstPtrBegin) {
1304           DP("Unified memory is active, no need to map lambda captured"
1305              "variable (" DPxMOD ")\n",
1306              DPxPTR(HstPtrVal));
1307           continue;
1308         }
1309         DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
1310            DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
1311         Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
1312                                 sizeof(void *), AsyncInfo);
1313         if (Ret != OFFLOAD_SUCCESS) {
1314           REPORT("Copying data to device failed.\n");
1315           return OFFLOAD_FAIL;
1316         }
1317       }
1318       continue;
1319     }
1320     void *HstPtrBegin = Args[I];
1321     void *HstPtrBase = ArgBases[I];
1322     void *TgtPtrBegin;
1323     map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
1324     ptrdiff_t TgtBaseOffset;
1325     bool IsLast, IsHostPtr; // unused.
1326     if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
1327       DP("Forwarding first-private value " DPxMOD " to the target construct\n",
1328          DPxPTR(HstPtrBase));
1329       TgtPtrBegin = HstPtrBase;
1330       TgtBaseOffset = 0;
1331     } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
1332       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
1333       const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
1334       // If there is a next argument and it depends on the current one, we need
1335       // to allocate the private memory immediately. If this is not the case,
1336       // then the argument can be marked for optimization and packed with the
1337       // other privates.
1338       const bool AllocImmediately =
1339           (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
1340       Ret = PrivateArgumentManager.addArg(
1341           HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
1342           TgtArgs.size(), HstPtrName, AllocImmediately);
1343       if (Ret != OFFLOAD_SUCCESS) {
1344         REPORT("Failed to process %sprivate argument " DPxMOD "\n",
1345                (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
1346         return OFFLOAD_FAIL;
1347       }
1348     } else {
1349       if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
1350         HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
1351       TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast,
1352                                           false, IsHostPtr);
1353       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
1354 #ifdef OMPTARGET_DEBUG
1355       void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
1356       DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
1357          DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
1358 #endif
1359     }
1360     TgtArgsPositions[I] = TgtArgs.size();
1361     TgtArgs.push_back(TgtPtrBegin);
1362     TgtOffsets.push_back(TgtBaseOffset);
1363   }
1364 
1365   assert(TgtArgs.size() == TgtOffsets.size() &&
1366          "Size mismatch in arguments and offsets");
1367 
1368   // Pack and transfer first-private arguments
1369   Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
1370   if (Ret != OFFLOAD_SUCCESS) {
1371     DP("Failed to pack and transfer first private arguments\n");
1372     return OFFLOAD_FAIL;
1373   }
1374 
1375   return OFFLOAD_SUCCESS;
1376 }
1377 
1378 /// Process data after launching the kernel, including transferring data back to
1379 /// host if needed and deallocating target memory of (first-)private variables.
processDataAfter(ident_t * loc,int64_t DeviceId,void * HostPtr,int32_t ArgNum,void ** ArgBases,void ** Args,int64_t * ArgSizes,int64_t * ArgTypes,map_var_info_t * ArgNames,void ** ArgMappers,PrivateArgumentManagerTy & PrivateArgumentManager,AsyncInfoTy & AsyncInfo)1380 static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr,
1381                             int32_t ArgNum, void **ArgBases, void **Args,
1382                             int64_t *ArgSizes, int64_t *ArgTypes,
1383                             map_var_info_t *ArgNames, void **ArgMappers,
1384                             PrivateArgumentManagerTy &PrivateArgumentManager,
1385                             AsyncInfoTy &AsyncInfo) {
1386   TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc);
1387   DeviceTy &Device = PM->Devices[DeviceId];
1388 
1389   // Move data from device.
1390   int Ret = targetDataEnd(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
1391                           ArgTypes, ArgNames, ArgMappers, AsyncInfo);
1392   if (Ret != OFFLOAD_SUCCESS) {
1393     REPORT("Call to targetDataEnd failed, abort target.\n");
1394     return OFFLOAD_FAIL;
1395   }
1396 
1397   // Free target memory for private arguments
1398   Ret = PrivateArgumentManager.free();
1399   if (Ret != OFFLOAD_SUCCESS) {
1400     REPORT("Failed to deallocate target memory for private args\n");
1401     return OFFLOAD_FAIL;
1402   }
1403 
1404   return OFFLOAD_SUCCESS;
1405 }
1406 } // namespace
1407 
1408 /// performs the same actions as data_begin in case arg_num is
1409 /// non-zero and initiates run of the offloaded region on the target platform;
1410 /// if arg_num is non-zero after the region execution is done it also
1411 /// performs the same action as data_update and data_end above. This function
1412 /// returns 0 if it was able to transfer the execution to a target and an
1413 /// integer different from zero otherwise.
target(ident_t * loc,DeviceTy & Device,void * HostPtr,int32_t ArgNum,void ** ArgBases,void ** Args,int64_t * ArgSizes,int64_t * ArgTypes,map_var_info_t * ArgNames,void ** ArgMappers,int32_t TeamNum,int32_t ThreadLimit,int IsTeamConstruct,AsyncInfoTy & AsyncInfo)1414 int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
1415            void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
1416            map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
1417            int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) {
1418   int32_t DeviceId = Device.DeviceID;
1419 
1420   TableMap *TM = getTableMap(HostPtr);
1421   // No map for this host pointer found!
1422   if (!TM) {
1423     REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
1424            DPxPTR(HostPtr));
1425     return OFFLOAD_FAIL;
1426   }
1427 
1428   // get target table.
1429   __tgt_target_table *TargetTable = nullptr;
1430   {
1431     std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
1432     assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
1433            "Not expecting a device ID outside the table's bounds!");
1434     TargetTable = TM->Table->TargetsTable[DeviceId];
1435   }
1436   assert(TargetTable && "Global data has not been mapped\n");
1437 
1438   std::vector<void *> TgtArgs;
1439   std::vector<ptrdiff_t> TgtOffsets;
1440 
1441   PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);
1442 
1443   int Ret;
1444   if (ArgNum) {
1445     // Process data, such as data mapping, before launching the kernel
1446     Ret = processDataBefore(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
1447                             ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs,
1448                             TgtOffsets, PrivateArgumentManager, AsyncInfo);
1449     if (Ret != OFFLOAD_SUCCESS) {
1450       REPORT("Failed to process data before launching the kernel.\n");
1451       return OFFLOAD_FAIL;
1452     }
1453   }
1454 
1455   // Get loop trip count
1456   uint64_t LoopTripCount = getLoopTripCount(DeviceId);
1457 
1458   // Launch device execution.
1459   void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
1460   DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
1461      TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
1462 
1463   {
1464     TIMESCOPE_WITH_NAME_AND_IDENT(
1465         IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc);
1466     if (IsTeamConstruct)
1467       Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
1468                                  TgtArgs.size(), TeamNum, ThreadLimit,
1469                                  LoopTripCount, AsyncInfo);
1470     else
1471       Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
1472                              TgtArgs.size(), AsyncInfo);
1473   }
1474 
1475   if (Ret != OFFLOAD_SUCCESS) {
1476     REPORT("Executing target region abort target.\n");
1477     return OFFLOAD_FAIL;
1478   }
1479 
1480   if (ArgNum) {
1481     // Transfer data back and deallocate target memory for (first-)private
1482     // variables
1483     Ret = processDataAfter(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
1484                            ArgSizes, ArgTypes, ArgNames, ArgMappers,
1485                            PrivateArgumentManager, AsyncInfo);
1486     if (Ret != OFFLOAD_SUCCESS) {
1487       REPORT("Failed to process data after launching the kernel.\n");
1488       return OFFLOAD_FAIL;
1489     }
1490   }
1491 
1492   return OFFLOAD_SUCCESS;
1493 }
1494