1 //===-RTLs/nec-aurora/src/rtl.cpp - Target RTLs Implementation - C++ -*-======//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is dual licensed under the MIT and the University of Illinois Open
6 // Source Licenses. See LICENSE.txt for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // RTL for NEC Aurora TSUBASA machines
11 //
12 //===----------------------------------------------------------------------===//
13 
#include <algorithm>
#include <cassert>
#include <cerrno>
#include <cstring>
#include <list>
#include <stdlib.h>
#include <string>
#include <sys/stat.h>
#include <unistd.h>
#include <ve_offload.h>
#include <vector>
#include <veosinfo/veosinfo.h>
25 
26 #include "Debug.h"
27 #include "omptargetplugin.h"
28 
29 #ifndef TARGET_NAME
30 #define TARGET_NAME VE
31 #endif
32 
33 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
34 
35 #ifndef TARGET_ELF_ID
36 #define TARGET_ELF_ID 0
37 #endif
38 
39 #include "elf_common.h"
40 
/// Record of one target image that was written to a temporary file.
struct DynLibTy {
  // Path of the temporary file holding the image. This is a plain pointer:
  // the struct neither copies nor frees the string, so whoever fills it in
  // must keep the pointee alive for as long as the record is stored.
  char *FileName;
  // Library handle associated with the image (0 when none was obtained).
  uint64_t VeoLibHandle;
};
45 
/// Keep entries table per device.
///
/// One instance describes the offload table built for a single loaded image.
struct FuncOrGblEntryTy {
  // Begin/end view of the table; RTLDeviceInfoTy::buildOffloadTableFromHost
  // points these into `Entries` below.
  __tgt_target_table Table;
  // Backing storage for the translated offload entries.
  std::vector<__tgt_offload_entry> Entries;
};
51 
52 class RTLDeviceInfoTy {
53   std::vector<std::list<FuncOrGblEntryTy>> FuncOrGblEntry;
54 
55 public:
56   std::vector<struct veo_proc_handle *> ProcHandles;
57   std::vector<struct veo_thr_ctxt *> Contexts;
58   std::vector<uint64_t> LibraryHandles;
59   std::list<DynLibTy> DynLibs;
60   // Maps OpenMP device Ids to Ve nodeids
61   std::vector<int> NodeIds;
62 
buildOffloadTableFromHost(int32_t device_id,uint64_t VeoLibHandle,__tgt_offload_entry * HostBegin,__tgt_offload_entry * HostEnd)63   void buildOffloadTableFromHost(int32_t device_id, uint64_t VeoLibHandle,
64                                  __tgt_offload_entry *HostBegin,
65                                  __tgt_offload_entry *HostEnd) {
66     FuncOrGblEntry[device_id].emplace_back();
67     std::vector<__tgt_offload_entry> &T =
68         FuncOrGblEntry[device_id].back().Entries;
69     T.clear();
70     for (__tgt_offload_entry *i = HostBegin; i != HostEnd; ++i) {
71       char *SymbolName = i->name;
72       // we have not enough access to the target memory to conveniently parse
73       // the offload table there so we need to lookup every symbol with the host
74       // table
75       DP("Looking up symbol: %s\n", SymbolName);
76       uint64_t SymbolTargetAddr =
77           veo_get_sym(ProcHandles[device_id], VeoLibHandle, SymbolName);
78       __tgt_offload_entry Entry;
79 
80       if (!SymbolTargetAddr) {
81         DP("Symbol %s not found in target image\n", SymbolName);
82         Entry = {NULL, NULL, 0, 0, 0};
83       } else {
84         DP("Found symbol %s successfully in target image (addr: %p)\n",
85            SymbolName, reinterpret_cast<void *>(SymbolTargetAddr));
86         Entry = {reinterpret_cast<void *>(SymbolTargetAddr), i->name, i->size,
87                  i->flags, 0};
88       }
89 
90       T.push_back(Entry);
91     }
92 
93     FuncOrGblEntry[device_id].back().Table.EntriesBegin = &T.front();
94     FuncOrGblEntry[device_id].back().Table.EntriesEnd = &T.back() + 1;
95   }
96 
getOffloadTable(int32_t device_id)97   __tgt_target_table *getOffloadTable(int32_t device_id) {
98     return &FuncOrGblEntry[device_id].back().Table;
99   }
100 
RTLDeviceInfoTy()101   RTLDeviceInfoTy() {
102 
103     struct ve_nodeinfo node_info;
104     ve_node_info(&node_info);
105 
106     // Build a predictable mapping between VE node ids and OpenMP device ids.
107     // This is necessary, because nodes can be missing or offline and (active)
108     // node ids are thus not consecutive. The entries in ve_nodeinfo may also
109     // not be in the order of their node ids.
110     for (int i = 0; i < node_info.total_node_count; ++i) {
111       if (node_info.status[i] == 0) {
112         NodeIds.push_back(node_info.nodeid[i]);
113       }
114     }
115 
116     // Because the entries in ve_nodeinfo may not be in the order of their node
117     // ids, we sort NodeIds to get a predictable mapping.
118     std::sort(NodeIds.begin(), NodeIds.end());
119 
120     int NumDevices = NodeIds.size();
121     DP("Found %i VE devices\n", NumDevices);
122     ProcHandles.resize(NumDevices, NULL);
123     Contexts.resize(NumDevices, NULL);
124     FuncOrGblEntry.resize(NumDevices);
125     LibraryHandles.resize(NumDevices);
126   }
127 
~RTLDeviceInfoTy()128   ~RTLDeviceInfoTy() {
129     for (auto &ctx : Contexts) {
130       if (ctx != NULL) {
131         if (veo_context_close(ctx) != 0) {
132           DP("Failed to close VEO context.\n");
133         }
134       }
135     }
136 
137     for (auto &hdl : ProcHandles) {
138       if (hdl != NULL) {
139         veo_proc_destroy(hdl);
140       }
141     }
142 
143     for (auto &lib : DynLibs) {
144       if (lib.FileName) {
145         remove(lib.FileName);
146       }
147     }
148   }
149 };
150 
// The single, file-local instance of the device bookkeeping. Having static
// storage duration, its constructor (VE node discovery) and destructor
// (context/process/temp-file cleanup) run outside of any plugin entry point.
static RTLDeviceInfoTy DeviceInfo;
152 
target_run_function_wait(uint32_t DeviceID,uint64_t FuncAddr,struct veo_args * args,uint64_t * RetVal)153 static int target_run_function_wait(uint32_t DeviceID, uint64_t FuncAddr,
154                                     struct veo_args *args, uint64_t *RetVal) {
155   DP("Running function with entry point %p\n",
156      reinterpret_cast<void *>(FuncAddr));
157   uint64_t RequestHandle =
158       veo_call_async(DeviceInfo.Contexts[DeviceID], FuncAddr, args);
159   if (RequestHandle == VEO_REQUEST_ID_INVALID) {
160     DP("Execution of entry point %p failed\n",
161        reinterpret_cast<void *>(FuncAddr));
162     return OFFLOAD_FAIL;
163   }
164 
165   DP("Function at address %p called (VEO request ID: %" PRIu64 ")\n",
166      reinterpret_cast<void *>(FuncAddr), RequestHandle);
167 
168   int ret = veo_call_wait_result(DeviceInfo.Contexts[DeviceID], RequestHandle,
169                                  RetVal);
170   if (ret != 0) {
171     DP("Waiting for entry point %p failed (Error code %d)\n",
172        reinterpret_cast<void *>(FuncAddr), ret);
173     return OFFLOAD_FAIL;
174   }
175   return OFFLOAD_SUCCESS;
176 }
177 
178 // Return the number of available devices of the type supported by the
179 // target RTL.
__tgt_rtl_number_of_devices(void)180 int32_t __tgt_rtl_number_of_devices(void) { return DeviceInfo.NodeIds.size(); }
181 
182 // Return an integer different from zero if the provided device image can be
183 // supported by the runtime. The functionality is similar to comparing the
184 // result of __tgt__rtl__load__binary to NULL. However, this is meant to be a
185 // lightweight query to determine if the RTL is suitable for an image without
186 // having to load the library, which can be expensive.
__tgt_rtl_is_valid_binary(__tgt_device_image * Image)187 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
188 #if TARGET_ELF_ID < 1
189   return 0;
190 #else
191   return elf_check_machine(Image, TARGET_ELF_ID);
192 #endif
193 }
194 
195 // Initialize the specified device. In case of success return 0; otherwise
196 // return an error code.
__tgt_rtl_init_device(int32_t ID)197 int32_t __tgt_rtl_init_device(int32_t ID) {
198   DP("Available VEO version: %i\n", veo_api_version());
199 
200   // At the moment we do not really initialize (i.e. create a process or
201   // context on) the device here, but in "__tgt_rtl_load_binary".
202   // The reason for this is, that, when we create a process for a statically
203   // linked binary, the VEO api needs us to already supply the binary (but we
204   // can load a dynamically linked binary later, after we create the process).
205   // At this stage, we cannot check if we have a dynamically or statically
206   // linked binary so we defer process creation until we know.
207   return OFFLOAD_SUCCESS;
208 }
209 
210 // Pass an executable image section described by image to the specified
211 // device and prepare an address table of target entities. In case of error,
212 // return NULL. Otherwise, return a pointer to the built address table.
213 // Individual entries in the table may also be NULL, when the corresponding
214 // offload region is not supported on the target device.
__tgt_rtl_load_binary(int32_t ID,__tgt_device_image * Image)215 __tgt_target_table *__tgt_rtl_load_binary(int32_t ID,
216                                           __tgt_device_image *Image) {
217   DP("Dev %d: load binary from " DPxMOD " image\n", ID,
218      DPxPTR(Image->ImageStart));
219 
220   assert(ID >= 0 && "bad dev id");
221 
222   size_t ImageSize = (size_t)Image->ImageEnd - (size_t)Image->ImageStart;
223   size_t NumEntries = (size_t)(Image->EntriesEnd - Image->EntriesBegin);
224   DP("Expecting to have %zd entries defined.\n", NumEntries);
225 
226   // load dynamic library and get the entry points. We use the dl library
227   // to do the loading of the library, but we could do it directly to avoid the
228   // dump to the temporary file.
229   //
230   // 1) Create tmp file with the library contents.
231   // 2) Use dlopen to load the file and dlsym to retrieve the symbols.
232   char tmp_name[] = "/tmp/tmpfile_XXXXXX";
233   int tmp_fd = mkstemp(tmp_name);
234 
235   if (tmp_fd == -1) {
236     return NULL;
237   }
238 
239   FILE *ftmp = fdopen(tmp_fd, "wb");
240 
241   if (!ftmp) {
242     DP("fdopen() for %s failed. Could not write target image\n", tmp_name);
243     return NULL;
244   }
245 
246   fwrite(Image->ImageStart, ImageSize, 1, ftmp);
247 
248   // at least for the static case we need to change the permissions
249   chmod(tmp_name, 0700);
250 
251   DP("Wrote target image to %s. ImageSize=%zu\n", tmp_name, ImageSize);
252 
253   fclose(ftmp);
254 
255   // See comment in "__tgt_rtl_init_device"
256   bool is_dyn = true;
257   if (DeviceInfo.ProcHandles[ID] == NULL) {
258     struct veo_proc_handle *proc_handle;
259     is_dyn = elf_is_dynamic(Image);
260     // If we have a dynamically linked image, we create the process handle, then
261     // the thread, and then load the image.
262     // If we have a statically linked image, we need to create the process
263     // handle and load the image at the same time with veo_proc_create_static().
264     if (is_dyn) {
265       proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]);
266       if (!proc_handle) {
267         DP("veo_proc_create() failed for device %d\n", ID);
268         return NULL;
269       }
270     } else {
271       proc_handle = veo_proc_create_static(DeviceInfo.NodeIds[ID], tmp_name);
272       if (!proc_handle) {
273         DP("veo_proc_create_static() failed for device %d, image=%s\n", ID,
274            tmp_name);
275         return NULL;
276       }
277     }
278     DeviceInfo.ProcHandles[ID] = proc_handle;
279   }
280 
281   if (DeviceInfo.Contexts[ID] == NULL) {
282     struct veo_thr_ctxt *ctx = veo_context_open(DeviceInfo.ProcHandles[ID]);
283 
284     if (!ctx) {
285       DP("veo_context_open() failed: %s\n", std::strerror(errno));
286       return NULL;
287     }
288 
289     DeviceInfo.Contexts[ID] = ctx;
290   }
291 
292   DP("Aurora device successfully initialized with loaded binary: "
293      "proc_handle=%p, ctx=%p\n",
294      DeviceInfo.ProcHandles[ID], DeviceInfo.Contexts[ID]);
295 
296   uint64_t LibHandle = 0UL;
297   if (is_dyn) {
298     LibHandle = veo_load_library(DeviceInfo.ProcHandles[ID], tmp_name);
299 
300     if (!LibHandle) {
301       DP("veo_load_library() failed: LibHandle=%" PRIu64
302          " Name=%s. Set env VEORUN_BIN for static linked target code.\n",
303          LibHandle, tmp_name);
304       return NULL;
305     }
306 
307     DP("Successfully loaded library dynamically\n");
308   } else {
309     DP("Symbol table is expected to have been created by "
310        "veo_create_proc_static()\n");
311   }
312 
313   DynLibTy Lib = {tmp_name, LibHandle};
314   DeviceInfo.DynLibs.push_back(Lib);
315   DeviceInfo.LibraryHandles[ID] = LibHandle;
316 
317   DeviceInfo.buildOffloadTableFromHost(ID, LibHandle, Image->EntriesBegin,
318                                        Image->EntriesEnd);
319 
320   return DeviceInfo.getOffloadTable(ID);
321 }
322 
323 // Allocate data on the particular target device, of the specified size.
324 // HostPtr is a address of the host data the allocated target data
325 // will be associated with (HostPtr may be NULL if it is not known at
326 // allocation time, like for example it would be for target data that
327 // is allocated by omp_target_alloc() API). Return address of the
328 // allocated data on the target that will be used by libomptarget.so to
329 // initialize the target data mapping structures. These addresses are
330 // used to generate a table of target variables to pass to
331 // __tgt_rtl_run_region(). The __tgt_rtl_data_alloc() returns NULL in
332 // case an error occurred on the target device.
__tgt_rtl_data_alloc(int32_t ID,int64_t Size,void * HostPtr,int32_t kind)333 void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr,
334                            int32_t kind) {
335   int ret;
336   uint64_t addr;
337 
338   if (kind != TARGET_ALLOC_DEFAULT) {
339     REPORT("Invalid target data allocation kind or requested allocator not "
340            "implemented yet\n");
341     return NULL;
342   }
343 
344   if (DeviceInfo.ProcHandles[ID] == NULL) {
345     struct veo_proc_handle *proc_handle;
346     proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]);
347     if (!proc_handle) {
348       DP("veo_proc_create() failed for device %d\n", ID);
349       return NULL;
350     }
351     DeviceInfo.ProcHandles[ID] = proc_handle;
352     DP("Aurora device successfully initialized: proc_handle=%p", proc_handle);
353   }
354 
355   ret = veo_alloc_mem(DeviceInfo.ProcHandles[ID], &addr, Size);
356   DP("Allocate target memory: device=%d, target addr=%p, size=%" PRIu64 "\n",
357      ID, reinterpret_cast<void *>(addr), Size);
358   if (ret != 0) {
359     DP("veo_alloc_mem(%d, %p, %" PRIu64 ") failed with error code %d\n", ID,
360        reinterpret_cast<void *>(addr), Size, ret);
361     return NULL;
362   }
363 
364   return reinterpret_cast<void *>(addr);
365 }
366 
367 // Pass the data content to the target device using the target address.
368 // In case of success, return zero. Otherwise, return an error code.
__tgt_rtl_data_submit(int32_t ID,void * TargetPtr,void * HostPtr,int64_t Size)369 int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr,
370                               int64_t Size) {
371   int ret = veo_write_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr,
372                           HostPtr, (size_t)Size);
373   if (ret != 0) {
374     DP("veo_write_mem() failed with error code %d\n", ret);
375     return OFFLOAD_FAIL;
376   }
377   return OFFLOAD_SUCCESS;
378 }
379 
380 // Retrieve the data content from the target device using its address.
381 // In case of success, return zero. Otherwise, return an error code.
__tgt_rtl_data_retrieve(int32_t ID,void * HostPtr,void * TargetPtr,int64_t Size)382 int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr,
383                                 int64_t Size) {
384   int ret = veo_read_mem(DeviceInfo.ProcHandles[ID], HostPtr,
385                          (uint64_t)TargetPtr, Size);
386   if (ret != 0) {
387     DP("veo_read_mem() failed with error code %d\n", ret);
388     return OFFLOAD_FAIL;
389   }
390   return OFFLOAD_SUCCESS;
391 }
392 
393 // De-allocate the data referenced by target ptr on the device. In case of
394 // success, return zero. Otherwise, return an error code.
__tgt_rtl_data_delete(int32_t ID,void * TargetPtr)395 int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr) {
396   int ret = veo_free_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr);
397 
398   if (ret != 0) {
399     DP("veo_free_mem() failed with error code %d\n", ret);
400     return OFFLOAD_FAIL;
401   }
402   return OFFLOAD_SUCCESS;
403 }
404 
405 // Similar to __tgt_rtl_run_target_region, but additionally specify the
406 // number of teams to be created and a number of threads in each team.
__tgt_rtl_run_target_team_region(int32_t ID,void * Entry,void ** Args,ptrdiff_t * Offsets,int32_t NumArgs,int32_t NumTeams,int32_t ThreadLimit,uint64_t loop_tripcount)407 int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args,
408                                          ptrdiff_t *Offsets, int32_t NumArgs,
409                                          int32_t NumTeams, int32_t ThreadLimit,
410                                          uint64_t loop_tripcount) {
411   int ret;
412 
413   // ignore team num and thread limit.
414   std::vector<void *> ptrs(NumArgs);
415 
416   struct veo_args *TargetArgs;
417   TargetArgs = veo_args_alloc();
418 
419   if (TargetArgs == NULL) {
420     DP("Could not allocate VEO args\n");
421     return OFFLOAD_FAIL;
422   }
423 
424   for (int i = 0; i < NumArgs; ++i) {
425     ret = veo_args_set_u64(TargetArgs, i, (intptr_t)Args[i]);
426 
427     if (ret != 0) {
428       DP("veo_args_set_u64() has returned %d for argnum=%d and value %p\n", ret,
429          i, Args[i]);
430       return OFFLOAD_FAIL;
431     }
432   }
433 
434   uint64_t RetVal;
435   if (target_run_function_wait(ID, reinterpret_cast<uint64_t>(Entry),
436                                TargetArgs, &RetVal) != OFFLOAD_SUCCESS) {
437     veo_args_free(TargetArgs);
438     return OFFLOAD_FAIL;
439   }
440   veo_args_free(TargetArgs);
441   return OFFLOAD_SUCCESS;
442 }
443 
444 // Transfer control to the offloaded entry Entry on the target device.
445 // Args and Offsets are arrays of NumArgs size of target addresses and
446 // offsets. An offset should be added to the target address before passing it
447 // to the outlined function on device side. In case of success, return zero.
448 // Otherwise, return an error code.
__tgt_rtl_run_target_region(int32_t ID,void * Entry,void ** Args,ptrdiff_t * Offsets,int32_t NumArgs)449 int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args,
450                                     ptrdiff_t *Offsets, int32_t NumArgs) {
451   return __tgt_rtl_run_target_team_region(ID, Entry, Args, Offsets, NumArgs, 1,
452                                           1, 0);
453 }
454 
// Tell the caller that this plugin accepts empty images (always returns 1).
int32_t __tgt_rtl_supports_empty_images() {
  const int32_t Supported = 1;
  return Supported;
}
456 
// VE plugin's internal InfoLevel.
// NOTE(review): not referenced elsewhere in this file; presumably consumed
// by the reporting macros from Debug.h -- confirm.
std::atomic<uint32_t> InfoLevel;
459