1 //===-RTLs/nec-aurora/src/rtl.cpp - Target RTLs Implementation - C++ -*-======//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is dual licensed under the MIT and the University of Illinois Open
6 // Source Licenses. See LICENSE.txt for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // RTL for NEC Aurora TSUBASA machines
11 //
12 //===----------------------------------------------------------------------===//
13 
#include "omptargetplugin.h"

#include <algorithm>
#include <cassert>
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <list>
#include <stdlib.h>
#include <string>
#include <sys/stat.h>
#include <unistd.h>
#include <ve_offload.h>
#include <vector>
#include <veosinfo/veosinfo.h>
27 
// ELF machine id of images this plugin accepts. The build system is expected
// to define it; with the fallback value 0 __tgt_rtl_is_valid_binary() rejects
// every image.
#ifndef TARGET_ELF_ID
#define TARGET_ELF_ID 0
#endif

// DP(...) prints a debug message through DEBUGP when the plugin is built with
// OMPTARGET_DEBUG and DebugLevel is positive; otherwise it expands to nothing.
// Both variants expand to a single `do { } while (false)` statement so that
// `DP(...);` behaves like an ordinary statement everywhere, including as the
// unbraced body of an if/else.
#ifdef OMPTARGET_DEBUG
// Verbosity of debug output; set from LIBOMPTARGET_DEBUG at start-up.
static int DebugLevel = 0;

#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
#define DP(...)                                                                \
  do {                                                                         \
    if (DebugLevel > 0) {                                                      \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__);              \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define DP(...)                                                                \
  do {                                                                         \
  } while (false)
#endif // OMPTARGET_DEBUG
47 
48 #include "../../common/elf_common.c"
49 
/// Bookkeeping record for one device image that was written to disk.
struct DynLibTy {
  // Path of the temporary file holding the image; ~RTLDeviceInfoTy() passes
  // it to remove(). The struct does not own the string, so whoever fills in
  // this field must keep the pointed-to characters alive for as long as the
  // record exists.
  char *FileName;
  // Handle returned by veo_load_library() for this image, or 0 when the image
  // was not loaded as a library (statically linked case).
  uint64_t VeoLibHandle;
};
54 
/// Keep entries table per device.
/// One instance describes the offload entries of a single loaded image:
/// Entries stores the resolved entries and Table holds begin/end pointers
/// into that storage.
struct FuncOrGblEntryTy {
  // Begin/end view that is handed back to libomptarget.
  __tgt_target_table Table;
  // Backing storage the Table pointers refer to.
  std::vector<__tgt_offload_entry> Entries;
};
60 
61 class RTLDeviceInfoTy {
62   std::vector<std::list<FuncOrGblEntryTy>> FuncOrGblEntry;
63 
64 public:
65   std::vector<struct veo_proc_handle *> ProcHandles;
66   std::vector<struct veo_thr_ctxt *> Contexts;
67   std::vector<uint64_t> LibraryHandles;
68   std::list<DynLibTy> DynLibs;
69   // Maps OpenMP device Ids to Ve nodeids
70   std::vector<int> NodeIds;
71 
buildOffloadTableFromHost(int32_t device_id,uint64_t VeoLibHandle,__tgt_offload_entry * HostBegin,__tgt_offload_entry * HostEnd)72   void buildOffloadTableFromHost(int32_t device_id, uint64_t VeoLibHandle,
73                                  __tgt_offload_entry *HostBegin,
74                                  __tgt_offload_entry *HostEnd) {
75     FuncOrGblEntry[device_id].emplace_back();
76     std::vector<__tgt_offload_entry> &T =
77         FuncOrGblEntry[device_id].back().Entries;
78     T.clear();
79     for (__tgt_offload_entry *i = HostBegin; i != HostEnd; ++i) {
80       char *SymbolName = i->name;
81       // we have not enough access to the target memory to conveniently parse
82       // the offload table there so we need to lookup every symbol with the host
83       // table
84       DP("Looking up symbol: %s\n", SymbolName);
85       uint64_t SymbolTargetAddr =
86           veo_get_sym(ProcHandles[device_id], VeoLibHandle, SymbolName);
87       __tgt_offload_entry Entry;
88 
89       if (!SymbolTargetAddr) {
90         DP("Symbol %s not found in target image\n", SymbolName);
91         Entry = {NULL, NULL, 0, 0, 0};
92       } else {
93         DP("Found symbol %s successfully in target image (addr: %p)\n",
94            SymbolName, reinterpret_cast<void *>(SymbolTargetAddr));
95         Entry = { reinterpret_cast<void *>(SymbolTargetAddr),
96                   i->name,
97                   i->size,
98                   i->flags,
99                   0 };
100       }
101 
102       T.push_back(Entry);
103     }
104 
105     FuncOrGblEntry[device_id].back().Table.EntriesBegin = &T.front();
106     FuncOrGblEntry[device_id].back().Table.EntriesEnd = &T.back() + 1;
107   }
108 
getOffloadTable(int32_t device_id)109   __tgt_target_table *getOffloadTable(int32_t device_id) {
110     return &FuncOrGblEntry[device_id].back().Table;
111   }
112 
RTLDeviceInfoTy()113   RTLDeviceInfoTy() {
114 #ifdef OMPTARGET_DEBUG
115     if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
116       DebugLevel = std::stoi(envStr);
117     }
118 #endif // OMPTARGET_DEBUG
119 
120     struct ve_nodeinfo node_info;
121     ve_node_info(&node_info);
122 
123     // Build a predictable mapping between VE node ids and OpenMP device ids.
124     // This is necessary, because nodes can be missing or offline and (active)
125     // node ids are thus not consecutive. The entries in ve_nodeinfo may also
126     // not be in the order of their node ids.
127     for (int i = 0; i < node_info.total_node_count; ++i) {
128       if (node_info.status[i] == 0) {
129         NodeIds.push_back(node_info.nodeid[i]);
130       }
131     }
132 
133     // Because the entries in ve_nodeinfo may not be in the order of their node
134     // ids, we sort NodeIds to get a predictable mapping.
135     std::sort(NodeIds.begin(), NodeIds.end());
136 
137     int NumDevices = NodeIds.size();
138     DP("Found %i VE devices\n", NumDevices);
139     ProcHandles.resize(NumDevices, NULL);
140     Contexts.resize(NumDevices, NULL);
141     FuncOrGblEntry.resize(NumDevices);
142     LibraryHandles.resize(NumDevices);
143   }
144 
~RTLDeviceInfoTy()145   ~RTLDeviceInfoTy() {
146     for (auto &ctx : Contexts) {
147       if (ctx != NULL) {
148         if (veo_context_close(ctx) != 0) {
149           DP("Failed to close VEO context.\n");
150         }
151       }
152     }
153 
154     for (auto &hdl : ProcHandles) {
155       if (hdl != NULL) {
156         veo_proc_destroy(hdl);
157       }
158     }
159 
160     for (auto &lib : DynLibs) {
161       if (lib.FileName) {
162         remove(lib.FileName);
163       }
164     }
165   }
166 };
167 
// The single plugin-wide device registry. Its constructor, which enumerates
// the VE nodes, runs during static initialization of this translation unit;
// its destructor releases contexts, processes and temporary image files.
static RTLDeviceInfoTy DeviceInfo;
169 
target_run_function_wait(uint32_t DeviceID,uint64_t FuncAddr,struct veo_args * args,uint64_t * RetVal)170 static int target_run_function_wait(uint32_t DeviceID, uint64_t FuncAddr,
171                                     struct veo_args *args, uint64_t *RetVal) {
172   DP("Running function with entry point %p\n",
173      reinterpret_cast<void *>(FuncAddr));
174   uint64_t RequestHandle =
175       veo_call_async(DeviceInfo.Contexts[DeviceID], FuncAddr, args);
176   if (RequestHandle == VEO_REQUEST_ID_INVALID) {
177     DP("Execution of entry point %p failed\n",
178        reinterpret_cast<void *>(FuncAddr));
179     return OFFLOAD_FAIL;
180   }
181 
182   DP("Function at address %p called (VEO request ID: %" PRIu64 ")\n",
183      reinterpret_cast<void *>(FuncAddr), RequestHandle);
184 
185   int ret = veo_call_wait_result(DeviceInfo.Contexts[DeviceID], RequestHandle,
186                                  RetVal);
187   if (ret != 0) {
188     DP("Waiting for entry point %p failed (Error code %d)\n",
189        reinterpret_cast<void *>(FuncAddr), ret);
190     return OFFLOAD_FAIL;
191   }
192   return OFFLOAD_SUCCESS;
193 }
194 
195 
196 // Return the number of available devices of the type supported by the
197 // target RTL.
__tgt_rtl_number_of_devices(void)198 int32_t __tgt_rtl_number_of_devices(void) { return DeviceInfo.NodeIds.size(); }
199 
200 // Return an integer different from zero if the provided device image can be
201 // supported by the runtime. The functionality is similar to comparing the
202 // result of __tgt__rtl__load__binary to NULL. However, this is meant to be a
203 // lightweight query to determine if the RTL is suitable for an image without
204 // having to load the library, which can be expensive.
__tgt_rtl_is_valid_binary(__tgt_device_image * Image)205 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
206 #if TARGET_ELF_ID < 1
207   return 0;
208 #else
209   return elf_check_machine(Image, TARGET_ELF_ID);
210 #endif
211 }
212 
213 // Initialize the specified device. In case of success return 0; otherwise
214 // return an error code.
__tgt_rtl_init_device(int32_t ID)215 int32_t __tgt_rtl_init_device(int32_t ID) {
216   DP("Available VEO version: %i\n", veo_api_version());
217 
218   // At the moment we do not really initialize (i.e. create a process or
219   // context on) the device here, but in "__tgt_rtl_load_binary".
220   // The reason for this is, that, when we create a process for a statically
221   // linked binary, the VEO api needs us to already supply the binary (but we
222   // can load a dynamically linked binary later, after we create the process).
223   // At this stage, we cannot check if we have a dynamically or statically
224   // linked binary so we defer process creation until we know.
225   return OFFLOAD_SUCCESS;
226 }
227 
228 // Pass an executable image section described by image to the specified
229 // device and prepare an address table of target entities. In case of error,
230 // return NULL. Otherwise, return a pointer to the built address table.
231 // Individual entries in the table may also be NULL, when the corresponding
232 // offload region is not supported on the target device.
__tgt_rtl_load_binary(int32_t ID,__tgt_device_image * Image)233 __tgt_target_table *__tgt_rtl_load_binary(int32_t ID,
234                                           __tgt_device_image *Image) {
235   DP("Dev %d: load binary from " DPxMOD " image\n", ID,
236      DPxPTR(Image->ImageStart));
237 
238   assert(ID >= 0 && "bad dev id");
239 
240   size_t ImageSize = (size_t)Image->ImageEnd - (size_t)Image->ImageStart;
241   size_t NumEntries = (size_t)(Image->EntriesEnd - Image->EntriesBegin);
242   DP("Expecting to have %zd entries defined.\n", NumEntries);
243 
244   // load dynamic library and get the entry points. We use the dl library
245   // to do the loading of the library, but we could do it directly to avoid the
246   // dump to the temporary file.
247   //
248   // 1) Create tmp file with the library contents.
249   // 2) Use dlopen to load the file and dlsym to retrieve the symbols.
250   char tmp_name[] = "/tmp/tmpfile_XXXXXX";
251   int tmp_fd = mkstemp(tmp_name);
252 
253   if (tmp_fd == -1) {
254     return NULL;
255   }
256 
257   FILE *ftmp = fdopen(tmp_fd, "wb");
258 
259   if (!ftmp) {
260     DP("fdopen() for %s failed. Could not write target image\n", tmp_name);
261     return NULL;
262   }
263 
264   fwrite(Image->ImageStart, ImageSize, 1, ftmp);
265 
266   // at least for the static case we need to change the permissions
267   chmod(tmp_name, 0700);
268 
269   DP("Wrote target image to %s. ImageSize=%zu\n", tmp_name, ImageSize);
270 
271   fclose(ftmp);
272 
273   // See comment in "__tgt_rtl_init_device"
274   bool is_dyn = true;
275   if (DeviceInfo.ProcHandles[ID] == NULL) {
276     struct veo_proc_handle *proc_handle;
277     is_dyn = elf_is_dynamic(Image);
278     // If we have a dynamically linked image, we create the process handle, then
279     // the thread, and then load the image.
280     // If we have a statically linked image, we need to create the process
281     // handle and load the image at the same time with veo_proc_create_static().
282     if (is_dyn) {
283       proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]);
284       if (!proc_handle) {
285         DP("veo_proc_create() failed for device %d\n", ID);
286         return NULL;
287       }
288     } else {
289       proc_handle = veo_proc_create_static(DeviceInfo.NodeIds[ID], tmp_name);
290       if (!proc_handle) {
291         DP("veo_proc_create_static() failed for device %d, image=%s\n", ID,
292            tmp_name);
293         return NULL;
294       }
295     }
296     DeviceInfo.ProcHandles[ID] = proc_handle;
297   }
298 
299   if (DeviceInfo.Contexts[ID] == NULL) {
300     struct veo_thr_ctxt *ctx = veo_context_open(DeviceInfo.ProcHandles[ID]);
301 
302     if (!ctx) {
303       DP("veo_context_open() failed: %s\n", std::strerror(errno));
304       return NULL;
305     }
306 
307     DeviceInfo.Contexts[ID] = ctx;
308   }
309 
310   DP("Aurora device successfully initialized with loaded binary: "
311      "proc_handle=%p, ctx=%p\n",
312      DeviceInfo.ProcHandles[ID], DeviceInfo.Contexts[ID]);
313 
314   uint64_t LibHandle = 0UL;
315   if (is_dyn) {
316     LibHandle = veo_load_library(DeviceInfo.ProcHandles[ID], tmp_name);
317 
318     if (!LibHandle) {
319       DP("veo_load_library() failed: LibHandle=%" PRIu64
320          " Name=%s. Set env VEORUN_BIN for static linked target code.\n",
321          LibHandle, tmp_name);
322       return NULL;
323     }
324 
325     DP("Successfully loaded library dynamically\n");
326   } else {
327     DP("Symbol table is expected to have been created by "
328        "veo_create_proc_static()\n");
329   }
330 
331   DynLibTy Lib = {tmp_name, LibHandle};
332   DeviceInfo.DynLibs.push_back(Lib);
333   DeviceInfo.LibraryHandles[ID] = LibHandle;
334 
335   DeviceInfo.buildOffloadTableFromHost(ID, LibHandle, Image->EntriesBegin,
336                                        Image->EntriesEnd);
337 
338   return DeviceInfo.getOffloadTable(ID);
339 }
340 
341 // Allocate data on the particular target device, of the specified size.
342 // HostPtr is a address of the host data the allocated target data
343 // will be associated with (HostPtr may be NULL if it is not known at
344 // allocation time, like for example it would be for target data that
345 // is allocated by omp_target_alloc() API). Return address of the
346 // allocated data on the target that will be used by libomptarget.so to
347 // initialize the target data mapping structures. These addresses are
348 // used to generate a table of target variables to pass to
349 // __tgt_rtl_run_region(). The __tgt_rtl_data_alloc() returns NULL in
350 // case an error occurred on the target device.
__tgt_rtl_data_alloc(int32_t ID,int64_t Size,void * HostPtr)351 void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr) {
352   int ret;
353   uint64_t addr;
354 
355   if (DeviceInfo.ProcHandles[ID] == NULL) {
356     struct veo_proc_handle *proc_handle;
357     proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]);
358     if (!proc_handle) {
359       DP("veo_proc_create() failed for device %d\n", ID);
360       return NULL;
361     }
362     DeviceInfo.ProcHandles[ID] = proc_handle;
363     DP("Aurora device successfully initialized: proc_handle=%p", proc_handle);
364   }
365 
366   ret = veo_alloc_mem(DeviceInfo.ProcHandles[ID], &addr, Size);
367   DP("Allocate target memory: device=%d, target addr=%p, size=%" PRIu64 "\n",
368      ID, reinterpret_cast<void *>(addr), Size);
369   if (ret != 0) {
370     DP("veo_alloc_mem(%d, %p, %" PRIu64 ") failed with error code %d\n",
371        ID, reinterpret_cast<void *>(addr), Size, ret);
372     return NULL;
373   }
374 
375   return reinterpret_cast<void *>(addr);
376 }
377 
378 // Pass the data content to the target device using the target address.
379 // In case of success, return zero. Otherwise, return an error code.
__tgt_rtl_data_submit(int32_t ID,void * TargetPtr,void * HostPtr,int64_t Size)380 int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr,
381                               int64_t Size) {
382   int ret = veo_write_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr,
383                           HostPtr, (size_t)Size);
384   if (ret != 0) {
385     DP("veo_write_mem() failed with error code %d\n", ret);
386     return OFFLOAD_FAIL;
387   }
388   return OFFLOAD_SUCCESS;
389 }
390 
391 // Retrieve the data content from the target device using its address.
392 // In case of success, return zero. Otherwise, return an error code.
__tgt_rtl_data_retrieve(int32_t ID,void * HostPtr,void * TargetPtr,int64_t Size)393 int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr,
394                                 int64_t Size) {
395   int ret = veo_read_mem(DeviceInfo.ProcHandles[ID], HostPtr,
396                          (uint64_t)TargetPtr, Size);
397   if (ret != 0) {
398     DP("veo_read_mem() failed with error code %d\n", ret);
399     return OFFLOAD_FAIL;
400   }
401   return OFFLOAD_SUCCESS;
402 }
403 
404 // De-allocate the data referenced by target ptr on the device. In case of
405 // success, return zero. Otherwise, return an error code.
__tgt_rtl_data_delete(int32_t ID,void * TargetPtr)406 int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr) {
407   int ret =  veo_free_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr);
408 
409   if (ret != 0) {
410     DP("veo_free_mem() failed with error code %d\n", ret);
411     return OFFLOAD_FAIL;
412   }
413   return OFFLOAD_SUCCESS;
414 }
415 
416 // Similar to __tgt_rtl_run_target_region, but additionally specify the
417 // number of teams to be created and a number of threads in each team.
__tgt_rtl_run_target_team_region(int32_t ID,void * Entry,void ** Args,ptrdiff_t * Offsets,int32_t NumArgs,int32_t NumTeams,int32_t ThreadLimit,uint64_t loop_tripcount)418 int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args,
419                                          ptrdiff_t *Offsets, int32_t NumArgs,
420                                          int32_t NumTeams, int32_t ThreadLimit,
421                                          uint64_t loop_tripcount) {
422   int ret;
423 
424   // ignore team num and thread limit.
425   std::vector<void *> ptrs(NumArgs);
426 
427   struct veo_args *TargetArgs;
428   TargetArgs = veo_args_alloc();
429 
430   if (TargetArgs == NULL) {
431     DP("Could not allocate VEO args\n");
432     return OFFLOAD_FAIL;
433   }
434 
435   for (int i = 0; i < NumArgs; ++i) {
436     ret = veo_args_set_u64(TargetArgs, i, (intptr_t)Args[i]);
437 
438     if (ret != 0) {
439       DP("veo_args_set_u64() has returned %d for argnum=%d and value %p\n",
440          ret, i, Args[i]);
441       return OFFLOAD_FAIL;
442     }
443   }
444 
445   uint64_t RetVal;
446   if (target_run_function_wait(ID, reinterpret_cast<uint64_t>(Entry),
447                                TargetArgs, &RetVal) != OFFLOAD_SUCCESS) {
448     veo_args_free(TargetArgs);
449     return OFFLOAD_FAIL;
450   }
451   veo_args_free(TargetArgs);
452   return OFFLOAD_SUCCESS;
453 }
454 
455 // Transfer control to the offloaded entry Entry on the target device.
456 // Args and Offsets are arrays of NumArgs size of target addresses and
457 // offsets. An offset should be added to the target address before passing it
458 // to the outlined function on device side. In case of success, return zero.
459 // Otherwise, return an error code.
__tgt_rtl_run_target_region(int32_t ID,void * Entry,void ** Args,ptrdiff_t * Offsets,int32_t NumArgs)460 int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args,
461                                     ptrdiff_t *Offsets, int32_t NumArgs) {
462   return __tgt_rtl_run_target_team_region(ID, Entry, Args, Offsets, NumArgs, 1,
463                                           1, 0);
464 }
465