1 /******************** GPUJIT.c - GPUJIT Execution Engine **********************/
2 /*                                                                            */
3 /* Part of the LLVM Project, under the Apache License v2.0 with LLVM          */
4 /* Exceptions.                                                                */
5 /* See https://llvm.org/LICENSE.txt for license information.                  */
6 /* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception                    */
7 /*                                                                            */
8 /******************************************************************************/
9 /*                                                                            */
10 /*  This file implements GPUJIT, a ptx string execution engine for GPU.       */
11 /*                                                                            */
12 /******************************************************************************/
13 
14 #include "GPUJIT.h"
15 
16 #ifdef HAS_LIBCUDART
17 #include <cuda.h>
18 #include <cuda_runtime.h>
19 #endif /* HAS_LIBCUDART */
20 
21 #ifdef HAS_LIBOPENCL
22 #ifdef __APPLE__
23 #include <OpenCL/opencl.h>
24 #else
25 #include <CL/cl.h>
26 #endif /* __APPLE__ */
27 #endif /* HAS_LIBOPENCL */
28 
29 #include <assert.h>
30 #include <dlfcn.h>
31 #include <stdarg.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #include <unistd.h>
36 
37 static int DebugMode;
38 static int CacheMode;
39 #define max(x, y) ((x) > (y) ? (x) : (y))
40 
41 static PollyGPURuntime Runtime = RUNTIME_NONE;
42 
debug_print(const char * format,...)43 static void debug_print(const char *format, ...) {
44   if (!DebugMode)
45     return;
46 
47   va_list args;
48   va_start(args, format);
49   vfprintf(stderr, format, args);
50   va_end(args);
51 }
52 #define dump_function() debug_print("-> %s\n", __func__)
53 
54 #define KERNEL_CACHE_SIZE 10
55 
56 static void err_runtime() __attribute__((noreturn));
err_runtime()57 static void err_runtime() {
58   fprintf(stderr, "Runtime not correctly initialized.\n");
59   exit(-1);
60 }
61 
62 struct PollyGPUContextT {
63   void *Context;
64 };
65 
66 struct PollyGPUFunctionT {
67   void *Kernel;
68 };
69 
70 struct PollyGPUDevicePtrT {
71   void *DevicePtr;
72 };
73 
74 /******************************************************************************/
75 /*                                  OpenCL                                    */
76 /******************************************************************************/
77 #ifdef HAS_LIBOPENCL
78 
79 struct OpenCLContextT {
80   cl_context Context;
81   cl_command_queue CommandQueue;
82 };
83 
84 struct OpenCLKernelT {
85   cl_kernel Kernel;
86   cl_program Program;
87   const char *BinaryString;
88 };
89 
90 struct OpenCLDevicePtrT {
91   cl_mem MemObj;
92 };
93 
94 /* Dynamic library handles for the OpenCL runtime library. */
95 static void *HandleOpenCL;
96 static void *HandleOpenCLBeignet;
97 
98 /* Type-defines of function pointer to OpenCL Runtime API. */
99 typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries,
100                                      cl_platform_id *Platforms,
101                                      cl_uint *NumPlatforms);
102 static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr;
103 
104 typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform,
105                                    cl_device_type DeviceType,
106                                    cl_uint NumEntries, cl_device_id *Devices,
107                                    cl_uint *NumDevices);
108 static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr;
109 
110 typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device,
111                                     cl_device_info ParamName,
112                                     size_t ParamValueSize, void *ParamValue,
113                                     size_t *ParamValueSizeRet);
114 static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr;
115 
116 typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info ParamName,
117                                     size_t ParamValueSize, void *ParamValue,
118                                     size_t *ParamValueSizeRet);
119 static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr;
120 
121 typedef cl_context clCreateContextFcnTy(
122     const cl_context_properties *Properties, cl_uint NumDevices,
123     const cl_device_id *Devices,
124     void CL_CALLBACK *pfn_notify(const char *Errinfo, const void *PrivateInfo,
125                                  size_t CB, void *UserData),
126     void *UserData, cl_int *ErrcodeRet);
127 static clCreateContextFcnTy *clCreateContextFcnPtr;
128 
129 typedef cl_command_queue
130 clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device,
131                           cl_command_queue_properties Properties,
132                           cl_int *ErrcodeRet);
133 static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr;
134 
135 typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags Flags,
136                                    size_t Size, void *HostPtr,
137                                    cl_int *ErrcodeRet);
138 static clCreateBufferFcnTy *clCreateBufferFcnPtr;
139 
140 typedef cl_int
141 clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer,
142                           cl_bool BlockingWrite, size_t Offset, size_t Size,
143                           const void *Ptr, cl_uint NumEventsInWaitList,
144                           const cl_event *EventWaitList, cl_event *Event);
145 static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr;
146 
147 typedef cl_program
148 clCreateProgramWithLLVMIntelFcnTy(cl_context Context, cl_uint NumDevices,
149                                   const cl_device_id *DeviceList,
150                                   const char *Filename, cl_int *ErrcodeRet);
151 static clCreateProgramWithLLVMIntelFcnTy *clCreateProgramWithLLVMIntelFcnPtr;
152 
153 typedef cl_program clCreateProgramWithBinaryFcnTy(
154     cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList,
155     const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus,
156     cl_int *ErrcodeRet);
157 static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr;
158 
159 typedef cl_int clBuildProgramFcnTy(
160     cl_program Program, cl_uint NumDevices, const cl_device_id *DeviceList,
161     const char *Options,
162     void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData),
163     void *UserData);
164 static clBuildProgramFcnTy *clBuildProgramFcnPtr;
165 
166 typedef cl_kernel clCreateKernelFcnTy(cl_program Program,
167                                       const char *KernelName,
168                                       cl_int *ErrcodeRet);
169 static clCreateKernelFcnTy *clCreateKernelFcnPtr;
170 
171 typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex,
172                                    size_t ArgSize, const void *ArgValue);
173 static clSetKernelArgFcnTy *clSetKernelArgFcnPtr;
174 
175 typedef cl_int clEnqueueNDRangeKernelFcnTy(
176     cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim,
177     const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
178     const size_t *LocalWorkSize, cl_uint NumEventsInWaitList,
179     const cl_event *EventWaitList, cl_event *Event);
180 static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr;
181 
182 typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue,
183                                         cl_mem Buffer, cl_bool BlockingRead,
184                                         size_t Offset, size_t Size, void *Ptr,
185                                         cl_uint NumEventsInWaitList,
186                                         const cl_event *EventWaitList,
187                                         cl_event *Event);
188 static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr;
189 
190 typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue);
191 static clFlushFcnTy *clFlushFcnPtr;
192 
193 typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue);
194 static clFinishFcnTy *clFinishFcnPtr;
195 
196 typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel);
197 static clReleaseKernelFcnTy *clReleaseKernelFcnPtr;
198 
199 typedef cl_int clReleaseProgramFcnTy(cl_program Program);
200 static clReleaseProgramFcnTy *clReleaseProgramFcnPtr;
201 
202 typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject);
203 static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr;
204 
205 typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue);
206 static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr;
207 
208 typedef cl_int clReleaseContextFcnTy(cl_context Context);
209 static clReleaseContextFcnTy *clReleaseContextFcnPtr;
210 
getAPIHandleCL(void * Handle,const char * FuncName)211 static void *getAPIHandleCL(void *Handle, const char *FuncName) {
212   char *Err;
213   void *FuncPtr;
214   dlerror();
215   FuncPtr = dlsym(Handle, FuncName);
216   if ((Err = dlerror()) != 0) {
217     fprintf(stderr, "Load OpenCL Runtime API failed: %s. \n", Err);
218     return 0;
219   }
220   return FuncPtr;
221 }
222 
initialDeviceAPILibrariesCL()223 static int initialDeviceAPILibrariesCL() {
224   HandleOpenCLBeignet = dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY);
225   HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY);
226   if (!HandleOpenCL) {
227     fprintf(stderr, "Cannot open library: %s. \n", dlerror());
228     return 0;
229   }
230   return 1;
231 }
232 
233 /* Get function pointer to OpenCL Runtime API.
234  *
235  * Note that compilers conforming to the ISO C standard are required to
236  * generate a warning if a conversion from a void * pointer to a function
237  * pointer is attempted as in the following statements. The warning
238  * of this kind of cast may not be emitted by clang and new versions of gcc
239  * as it is valid on POSIX 2008. For compilers required to generate a warning,
240  * we temporarily disable -Wpedantic, to avoid bloating the output with
241  * unnecessary warnings.
242  *
243  * Reference:
244  * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
245  */
246 #pragma GCC diagnostic push
247 #pragma GCC diagnostic ignored "-Wpedantic"
initialDeviceAPIsCL()248 static int initialDeviceAPIsCL() {
249   if (initialDeviceAPILibrariesCL() == 0)
250     return 0;
251 
252   // FIXME: We are now always selecting the Intel Beignet driver if it is
253   // available on the system, instead of a possible NVIDIA or AMD OpenCL
254   // API. This selection should occurr based on the target architecture
255   // chosen when compiling.
256   void *Handle =
257       (HandleOpenCLBeignet != NULL ? HandleOpenCLBeignet : HandleOpenCL);
258 
259   clGetPlatformIDsFcnPtr =
260       (clGetPlatformIDsFcnTy *)getAPIHandleCL(Handle, "clGetPlatformIDs");
261 
262   clGetDeviceIDsFcnPtr =
263       (clGetDeviceIDsFcnTy *)getAPIHandleCL(Handle, "clGetDeviceIDs");
264 
265   clGetDeviceInfoFcnPtr =
266       (clGetDeviceInfoFcnTy *)getAPIHandleCL(Handle, "clGetDeviceInfo");
267 
268   clGetKernelInfoFcnPtr =
269       (clGetKernelInfoFcnTy *)getAPIHandleCL(Handle, "clGetKernelInfo");
270 
271   clCreateContextFcnPtr =
272       (clCreateContextFcnTy *)getAPIHandleCL(Handle, "clCreateContext");
273 
274   clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL(
275       Handle, "clCreateCommandQueue");
276 
277   clCreateBufferFcnPtr =
278       (clCreateBufferFcnTy *)getAPIHandleCL(Handle, "clCreateBuffer");
279 
280   clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL(
281       Handle, "clEnqueueWriteBuffer");
282 
283   if (HandleOpenCLBeignet)
284     clCreateProgramWithLLVMIntelFcnPtr =
285         (clCreateProgramWithLLVMIntelFcnTy *)getAPIHandleCL(
286             Handle, "clCreateProgramWithLLVMIntel");
287 
288   clCreateProgramWithBinaryFcnPtr =
289       (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL(
290           Handle, "clCreateProgramWithBinary");
291 
292   clBuildProgramFcnPtr =
293       (clBuildProgramFcnTy *)getAPIHandleCL(Handle, "clBuildProgram");
294 
295   clCreateKernelFcnPtr =
296       (clCreateKernelFcnTy *)getAPIHandleCL(Handle, "clCreateKernel");
297 
298   clSetKernelArgFcnPtr =
299       (clSetKernelArgFcnTy *)getAPIHandleCL(Handle, "clSetKernelArg");
300 
301   clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL(
302       Handle, "clEnqueueNDRangeKernel");
303 
304   clEnqueueReadBufferFcnPtr =
305       (clEnqueueReadBufferFcnTy *)getAPIHandleCL(Handle, "clEnqueueReadBuffer");
306 
307   clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(Handle, "clFlush");
308 
309   clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(Handle, "clFinish");
310 
311   clReleaseKernelFcnPtr =
312       (clReleaseKernelFcnTy *)getAPIHandleCL(Handle, "clReleaseKernel");
313 
314   clReleaseProgramFcnPtr =
315       (clReleaseProgramFcnTy *)getAPIHandleCL(Handle, "clReleaseProgram");
316 
317   clReleaseMemObjectFcnPtr =
318       (clReleaseMemObjectFcnTy *)getAPIHandleCL(Handle, "clReleaseMemObject");
319 
320   clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL(
321       Handle, "clReleaseCommandQueue");
322 
323   clReleaseContextFcnPtr =
324       (clReleaseContextFcnTy *)getAPIHandleCL(Handle, "clReleaseContext");
325 
326   return 1;
327 }
328 #pragma GCC diagnostic pop
329 
330 /* Context and Device. */
331 static PollyGPUContext *GlobalContext = NULL;
332 static cl_device_id GlobalDeviceID = NULL;
333 
334 /* Fd-Decl: Print out OpenCL Error codes to human readable strings. */
335 static void printOpenCLError(int Error);
336 
checkOpenCLError(int Ret,const char * format,...)337 static void checkOpenCLError(int Ret, const char *format, ...) {
338   if (Ret == CL_SUCCESS)
339     return;
340 
341   printOpenCLError(Ret);
342   va_list args;
343   va_start(args, format);
344   vfprintf(stderr, format, args);
345   va_end(args);
346   exit(-1);
347 }
348 
initContextCL()349 static PollyGPUContext *initContextCL() {
350   dump_function();
351 
352   PollyGPUContext *Context;
353 
354   cl_platform_id PlatformID = NULL;
355   cl_device_id DeviceID = NULL;
356   cl_uint NumDevicesRet;
357   cl_int Ret;
358 
359   char DeviceRevision[256];
360   char DeviceName[256];
361   size_t DeviceRevisionRetSize, DeviceNameRetSize;
362 
363   static __thread PollyGPUContext *CurrentContext = NULL;
364 
365   if (CurrentContext)
366     return CurrentContext;
367 
368   /* Get API handles. */
369   if (initialDeviceAPIsCL() == 0) {
370     fprintf(stderr, "Getting the \"handle\" for the OpenCL Runtime failed.\n");
371     exit(-1);
372   }
373 
374   /* Get number of devices that support OpenCL. */
375   static const int NumberOfPlatforms = 1;
376   Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL);
377   checkOpenCLError(Ret, "Failed to get platform IDs.\n");
378   // TODO: Extend to CL_DEVICE_TYPE_ALL?
379   static const int NumberOfDevices = 1;
380   Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU, NumberOfDevices,
381                              &DeviceID, &NumDevicesRet);
382   checkOpenCLError(Ret, "Failed to get device IDs.\n");
383 
384   GlobalDeviceID = DeviceID;
385   if (NumDevicesRet == 0) {
386     fprintf(stderr, "There is no device supporting OpenCL.\n");
387     exit(-1);
388   }
389 
390   /* Get device revision. */
391   Ret =
392       clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION, sizeof(DeviceRevision),
393                             DeviceRevision, &DeviceRevisionRetSize);
394   checkOpenCLError(Ret, "Failed to fetch device revision.\n");
395 
396   /* Get device name. */
397   Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME, sizeof(DeviceName),
398                               DeviceName, &DeviceNameRetSize);
399   checkOpenCLError(Ret, "Failed to fetch device name.\n");
400 
401   debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
402 
403   /* Create context on the device. */
404   Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
405   if (Context == 0) {
406     fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
407     exit(-1);
408   }
409   Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext));
410   if (Context->Context == 0) {
411     fprintf(stderr, "Allocate memory for Polly OpenCL context failed.\n");
412     exit(-1);
413   }
414   ((OpenCLContext *)Context->Context)->Context =
415       clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL, &Ret);
416   checkOpenCLError(Ret, "Failed to create context.\n");
417 
418   static const int ExtraProperties = 0;
419   ((OpenCLContext *)Context->Context)->CommandQueue =
420       clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->Context,
421                                  DeviceID, ExtraProperties, &Ret);
422   checkOpenCLError(Ret, "Failed to create command queue.\n");
423 
424   if (CacheMode)
425     CurrentContext = Context;
426 
427   GlobalContext = Context;
428   return Context;
429 }
430 
freeKernelCL(PollyGPUFunction * Kernel)431 static void freeKernelCL(PollyGPUFunction *Kernel) {
432   dump_function();
433 
434   if (CacheMode)
435     return;
436 
437   if (!GlobalContext) {
438     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
439     exit(-1);
440   }
441 
442   cl_int Ret;
443   Ret = clFlushFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
444   checkOpenCLError(Ret, "Failed to flush command queue.\n");
445   Ret = clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
446   checkOpenCLError(Ret, "Failed to finish command queue.\n");
447 
448   if (((OpenCLKernel *)Kernel->Kernel)->Kernel) {
449     cl_int Ret =
450         clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Kernel);
451     checkOpenCLError(Ret, "Failed to release kernel.\n");
452   }
453 
454   if (((OpenCLKernel *)Kernel->Kernel)->Program) {
455     cl_int Ret =
456         clReleaseProgramFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Program);
457     checkOpenCLError(Ret, "Failed to release program.\n");
458   }
459 
460   if (Kernel->Kernel)
461     free((OpenCLKernel *)Kernel->Kernel);
462 
463   if (Kernel)
464     free(Kernel);
465 }
466 
getKernelCL(const char * BinaryBuffer,const char * KernelName)467 static PollyGPUFunction *getKernelCL(const char *BinaryBuffer,
468                                      const char *KernelName) {
469   dump_function();
470 
471   if (!GlobalContext) {
472     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
473     exit(-1);
474   }
475 
476   static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
477   static __thread int NextCacheItem = 0;
478 
479   for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
480     // We exploit here the property that all Polly-ACC kernels are allocated
481     // as global constants, hence a pointer comparision is sufficient to
482     // determin equality.
483     if (KernelCache[i] &&
484         ((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString ==
485             BinaryBuffer) {
486       debug_print("  -> using cached kernel\n");
487       return KernelCache[i];
488     }
489   }
490 
491   PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
492   if (Function == 0) {
493     fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
494     exit(-1);
495   }
496   Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel));
497   if (Function->Kernel == 0) {
498     fprintf(stderr, "Allocate memory for Polly OpenCL kernel failed.\n");
499     exit(-1);
500   }
501 
502   if (!GlobalDeviceID) {
503     fprintf(stderr, "GPGPU-code generation not initialized correctly.\n");
504     exit(-1);
505   }
506 
507   cl_int Ret;
508 
509   if (HandleOpenCLBeignet) {
510     // This is a workaround, since clCreateProgramWithLLVMIntel only
511     // accepts a filename to a valid llvm-ir file as an argument, instead
512     // of accepting the BinaryBuffer directly.
513     char FileName[] = "/tmp/polly_kernelXXXXXX";
514     int File = mkstemp(FileName);
515     write(File, BinaryBuffer, strlen(BinaryBuffer));
516 
517     ((OpenCLKernel *)Function->Kernel)->Program =
518         clCreateProgramWithLLVMIntelFcnPtr(
519             ((OpenCLContext *)GlobalContext->Context)->Context, 1,
520             &GlobalDeviceID, FileName, &Ret);
521     checkOpenCLError(Ret, "Failed to create program from llvm.\n");
522     close(File);
523     unlink(FileName);
524   } else {
525     size_t BinarySize = strlen(BinaryBuffer);
526     ((OpenCLKernel *)Function->Kernel)->Program =
527         clCreateProgramWithBinaryFcnPtr(
528             ((OpenCLContext *)GlobalContext->Context)->Context, 1,
529             &GlobalDeviceID, (const size_t *)&BinarySize,
530             (const unsigned char **)&BinaryBuffer, NULL, &Ret);
531     checkOpenCLError(Ret, "Failed to create program from binary.\n");
532   }
533 
534   Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1,
535                              &GlobalDeviceID, NULL, NULL, NULL);
536   checkOpenCLError(Ret, "Failed to build program.\n");
537 
538   ((OpenCLKernel *)Function->Kernel)->Kernel = clCreateKernelFcnPtr(
539       ((OpenCLKernel *)Function->Kernel)->Program, KernelName, &Ret);
540   checkOpenCLError(Ret, "Failed to create kernel.\n");
541 
542   ((OpenCLKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
543 
544   if (CacheMode) {
545     if (KernelCache[NextCacheItem])
546       freeKernelCL(KernelCache[NextCacheItem]);
547 
548     KernelCache[NextCacheItem] = Function;
549 
550     NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
551   }
552 
553   return Function;
554 }
555 
copyFromHostToDeviceCL(void * HostData,PollyGPUDevicePtr * DevData,long MemSize)556 static void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr *DevData,
557                                    long MemSize) {
558   dump_function();
559 
560   if (!GlobalContext) {
561     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
562     exit(-1);
563   }
564 
565   cl_int Ret;
566   Ret = clEnqueueWriteBufferFcnPtr(
567       ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
568       ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
569       HostData, 0, NULL, NULL);
570   checkOpenCLError(Ret, "Copying data from host memory to device failed.\n");
571 }
572 
copyFromDeviceToHostCL(PollyGPUDevicePtr * DevData,void * HostData,long MemSize)573 static void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void *HostData,
574                                    long MemSize) {
575   dump_function();
576 
577   if (!GlobalContext) {
578     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
579     exit(-1);
580   }
581 
582   cl_int Ret;
583   Ret = clEnqueueReadBufferFcnPtr(
584       ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
585       ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
586       HostData, 0, NULL, NULL);
587   checkOpenCLError(Ret, "Copying results from device to host memory failed.\n");
588 }
589 
launchKernelCL(PollyGPUFunction * Kernel,unsigned int GridDimX,unsigned int GridDimY,unsigned int BlockDimX,unsigned int BlockDimY,unsigned int BlockDimZ,void ** Parameters)590 static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX,
591                            unsigned int GridDimY, unsigned int BlockDimX,
592                            unsigned int BlockDimY, unsigned int BlockDimZ,
593                            void **Parameters) {
594   dump_function();
595 
596   cl_int Ret;
597   cl_uint NumArgs;
598 
599   if (!GlobalContext) {
600     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
601     exit(-1);
602   }
603 
604   OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel;
605   Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS,
606                               sizeof(cl_uint), &NumArgs, NULL);
607   checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n");
608 
609   /* Argument sizes are stored at the end of the Parameters array. */
610   for (cl_uint i = 0; i < NumArgs; i++) {
611     Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i,
612                                *((int *)Parameters[NumArgs + i]),
613                                (void *)Parameters[i]);
614     checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i);
615   }
616 
617   unsigned int GridDimZ = 1;
618   size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY,
619                               BlockDimZ * GridDimZ};
620   size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ};
621 
622   static const int WorkDim = 3;
623   OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context;
624   Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel,
625                                      WorkDim, NULL, GlobalWorkSize,
626                                      LocalWorkSize, 0, NULL, NULL);
627   checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n");
628 }
629 
freeDeviceMemoryCL(PollyGPUDevicePtr * Allocation)630 static void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) {
631   dump_function();
632 
633   OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
634   cl_int Ret = clReleaseMemObjectFcnPtr((cl_mem)DevPtr->MemObj);
635   checkOpenCLError(Ret, "Failed to free device memory.\n");
636 
637   free(DevPtr);
638   free(Allocation);
639 }
640 
allocateMemoryForDeviceCL(long MemSize)641 static PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) {
642   dump_function();
643 
644   if (!GlobalContext) {
645     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
646     exit(-1);
647   }
648 
649   PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
650   if (DevData == 0) {
651     fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
652     exit(-1);
653   }
654   DevData->DevicePtr = (OpenCLDevicePtr *)malloc(sizeof(OpenCLDevicePtr));
655   if (DevData->DevicePtr == 0) {
656     fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
657     exit(-1);
658   }
659 
660   cl_int Ret;
661   ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj =
662       clCreateBufferFcnPtr(((OpenCLContext *)GlobalContext->Context)->Context,
663                            CL_MEM_READ_WRITE, MemSize, NULL, &Ret);
664   checkOpenCLError(Ret,
665                    "Allocate memory for GPU device memory pointer failed.\n");
666 
667   return DevData;
668 }
669 
getDevicePtrCL(PollyGPUDevicePtr * Allocation)670 static void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) {
671   dump_function();
672 
673   OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
674   return (void *)DevPtr->MemObj;
675 }
676 
synchronizeDeviceCL()677 static void synchronizeDeviceCL() {
678   dump_function();
679 
680   if (!GlobalContext) {
681     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
682     exit(-1);
683   }
684 
685   if (clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue) !=
686       CL_SUCCESS) {
687     fprintf(stderr, "Synchronizing device and host memory failed.\n");
688     exit(-1);
689   }
690 }
691 
freeContextCL(PollyGPUContext * Context)692 static void freeContextCL(PollyGPUContext *Context) {
693   dump_function();
694 
695   cl_int Ret;
696 
697   GlobalContext = NULL;
698 
699   OpenCLContext *Ctx = (OpenCLContext *)Context->Context;
700   if (Ctx->CommandQueue) {
701     Ret = clReleaseCommandQueueFcnPtr(Ctx->CommandQueue);
702     checkOpenCLError(Ret, "Could not release command queue.\n");
703   }
704 
705   if (Ctx->Context) {
706     Ret = clReleaseContextFcnPtr(Ctx->Context);
707     checkOpenCLError(Ret, "Could not release context.\n");
708   }
709 
710   free(Ctx);
711   free(Context);
712 }
713 
printOpenCLError(int Error)714 static void printOpenCLError(int Error) {
715 
716   switch (Error) {
717   case CL_SUCCESS:
718     // Success, don't print an error.
719     break;
720 
721   // JIT/Runtime errors.
722   case CL_DEVICE_NOT_FOUND:
723     fprintf(stderr, "Device not found.\n");
724     break;
725   case CL_DEVICE_NOT_AVAILABLE:
726     fprintf(stderr, "Device not available.\n");
727     break;
728   case CL_COMPILER_NOT_AVAILABLE:
729     fprintf(stderr, "Compiler not available.\n");
730     break;
731   case CL_MEM_OBJECT_ALLOCATION_FAILURE:
732     fprintf(stderr, "Mem object allocation failure.\n");
733     break;
734   case CL_OUT_OF_RESOURCES:
735     fprintf(stderr, "Out of resources.\n");
736     break;
737   case CL_OUT_OF_HOST_MEMORY:
738     fprintf(stderr, "Out of host memory.\n");
739     break;
740   case CL_PROFILING_INFO_NOT_AVAILABLE:
741     fprintf(stderr, "Profiling info not available.\n");
742     break;
743   case CL_MEM_COPY_OVERLAP:
744     fprintf(stderr, "Mem copy overlap.\n");
745     break;
746   case CL_IMAGE_FORMAT_MISMATCH:
747     fprintf(stderr, "Image format mismatch.\n");
748     break;
749   case CL_IMAGE_FORMAT_NOT_SUPPORTED:
750     fprintf(stderr, "Image format not supported.\n");
751     break;
752   case CL_BUILD_PROGRAM_FAILURE:
753     fprintf(stderr, "Build program failure.\n");
754     break;
755   case CL_MAP_FAILURE:
756     fprintf(stderr, "Map failure.\n");
757     break;
758   case CL_MISALIGNED_SUB_BUFFER_OFFSET:
759     fprintf(stderr, "Misaligned sub buffer offset.\n");
760     break;
761   case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
762     fprintf(stderr, "Exec status error for events in wait list.\n");
763     break;
764   case CL_COMPILE_PROGRAM_FAILURE:
765     fprintf(stderr, "Compile program failure.\n");
766     break;
767   case CL_LINKER_NOT_AVAILABLE:
768     fprintf(stderr, "Linker not available.\n");
769     break;
770   case CL_LINK_PROGRAM_FAILURE:
771     fprintf(stderr, "Link program failure.\n");
772     break;
773   case CL_DEVICE_PARTITION_FAILED:
774     fprintf(stderr, "Device partition failed.\n");
775     break;
776   case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
777     fprintf(stderr, "Kernel arg info not available.\n");
778     break;
779 
780   // Compiler errors.
781   case CL_INVALID_VALUE:
782     fprintf(stderr, "Invalid value.\n");
783     break;
784   case CL_INVALID_DEVICE_TYPE:
785     fprintf(stderr, "Invalid device type.\n");
786     break;
787   case CL_INVALID_PLATFORM:
788     fprintf(stderr, "Invalid platform.\n");
789     break;
790   case CL_INVALID_DEVICE:
791     fprintf(stderr, "Invalid device.\n");
792     break;
793   case CL_INVALID_CONTEXT:
794     fprintf(stderr, "Invalid context.\n");
795     break;
796   case CL_INVALID_QUEUE_PROPERTIES:
797     fprintf(stderr, "Invalid queue properties.\n");
798     break;
799   case CL_INVALID_COMMAND_QUEUE:
800     fprintf(stderr, "Invalid command queue.\n");
801     break;
802   case CL_INVALID_HOST_PTR:
803     fprintf(stderr, "Invalid host pointer.\n");
804     break;
805   case CL_INVALID_MEM_OBJECT:
806     fprintf(stderr, "Invalid memory object.\n");
807     break;
808   case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
809     fprintf(stderr, "Invalid image format descriptor.\n");
810     break;
811   case CL_INVALID_IMAGE_SIZE:
812     fprintf(stderr, "Invalid image size.\n");
813     break;
814   case CL_INVALID_SAMPLER:
815     fprintf(stderr, "Invalid sampler.\n");
816     break;
817   case CL_INVALID_BINARY:
818     fprintf(stderr, "Invalid binary.\n");
819     break;
820   case CL_INVALID_BUILD_OPTIONS:
821     fprintf(stderr, "Invalid build options.\n");
822     break;
823   case CL_INVALID_PROGRAM:
824     fprintf(stderr, "Invalid program.\n");
825     break;
826   case CL_INVALID_PROGRAM_EXECUTABLE:
827     fprintf(stderr, "Invalid program executable.\n");
828     break;
829   case CL_INVALID_KERNEL_NAME:
830     fprintf(stderr, "Invalid kernel name.\n");
831     break;
832   case CL_INVALID_KERNEL_DEFINITION:
833     fprintf(stderr, "Invalid kernel definition.\n");
834     break;
835   case CL_INVALID_KERNEL:
836     fprintf(stderr, "Invalid kernel.\n");
837     break;
838   case CL_INVALID_ARG_INDEX:
839     fprintf(stderr, "Invalid arg index.\n");
840     break;
841   case CL_INVALID_ARG_VALUE:
842     fprintf(stderr, "Invalid arg value.\n");
843     break;
844   case CL_INVALID_ARG_SIZE:
845     fprintf(stderr, "Invalid arg size.\n");
846     break;
847   case CL_INVALID_KERNEL_ARGS:
848     fprintf(stderr, "Invalid kernel args.\n");
849     break;
850   case CL_INVALID_WORK_DIMENSION:
851     fprintf(stderr, "Invalid work dimension.\n");
852     break;
853   case CL_INVALID_WORK_GROUP_SIZE:
854     fprintf(stderr, "Invalid work group size.\n");
855     break;
856   case CL_INVALID_WORK_ITEM_SIZE:
857     fprintf(stderr, "Invalid work item size.\n");
858     break;
859   case CL_INVALID_GLOBAL_OFFSET:
860     fprintf(stderr, "Invalid global offset.\n");
861     break;
862   case CL_INVALID_EVENT_WAIT_LIST:
863     fprintf(stderr, "Invalid event wait list.\n");
864     break;
865   case CL_INVALID_EVENT:
866     fprintf(stderr, "Invalid event.\n");
867     break;
868   case CL_INVALID_OPERATION:
869     fprintf(stderr, "Invalid operation.\n");
870     break;
871   case CL_INVALID_GL_OBJECT:
872     fprintf(stderr, "Invalid GL object.\n");
873     break;
874   case CL_INVALID_BUFFER_SIZE:
875     fprintf(stderr, "Invalid buffer size.\n");
876     break;
877   case CL_INVALID_MIP_LEVEL:
878     fprintf(stderr, "Invalid mip level.\n");
879     break;
880   case CL_INVALID_GLOBAL_WORK_SIZE:
881     fprintf(stderr, "Invalid global work size.\n");
882     break;
883   case CL_INVALID_PROPERTY:
884     fprintf(stderr, "Invalid property.\n");
885     break;
886   case CL_INVALID_IMAGE_DESCRIPTOR:
887     fprintf(stderr, "Invalid image descriptor.\n");
888     break;
889   case CL_INVALID_COMPILER_OPTIONS:
890     fprintf(stderr, "Invalid compiler options.\n");
891     break;
892   case CL_INVALID_LINKER_OPTIONS:
893     fprintf(stderr, "Invalid linker options.\n");
894     break;
895   case CL_INVALID_DEVICE_PARTITION_COUNT:
896     fprintf(stderr, "Invalid device partition count.\n");
897     break;
898   case -69: // OpenCL 2.0 Code for CL_INVALID_PIPE_SIZE
899     fprintf(stderr, "Invalid pipe size.\n");
900     break;
901   case -70: // OpenCL 2.0 Code for CL_INVALID_DEVICE_QUEUE
902     fprintf(stderr, "Invalid device queue.\n");
903     break;
904 
905   // NVIDIA specific error.
906   case -9999:
907     fprintf(stderr, "NVIDIA invalid read or write buffer.\n");
908     break;
909 
910   default:
911     fprintf(stderr, "Unknown error code!\n");
912     break;
913   }
914 }
915 
916 #endif /* HAS_LIBOPENCL */
917 /******************************************************************************/
918 /*                                   CUDA                                     */
919 /******************************************************************************/
920 #ifdef HAS_LIBCUDART
921 
922 struct CUDAContextT {
923   CUcontext Cuda;
924 };
925 
926 struct CUDAKernelT {
927   CUfunction Cuda;
928   CUmodule CudaModule;
929   const char *BinaryString;
930 };
931 
932 struct CUDADevicePtrT {
933   CUdeviceptr Cuda;
934 };
935 
936 /* Dynamic library handles for the CUDA and CUDA runtime library. */
937 static void *HandleCuda;
938 static void *HandleCudaRT;
939 
940 /* Type-defines of function pointer to CUDA driver APIs. */
941 typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t);
942 static CuMemAllocFcnTy *CuMemAllocFcnPtr;
943 
944 typedef CUresult CUDAAPI CuMemAllocManagedFcnTy(CUdeviceptr *, size_t,
945                                                 unsigned int);
946 static CuMemAllocManagedFcnTy *CuMemAllocManagedFcnPtr;
947 
948 typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
949     CUfunction F, unsigned int GridDimX, unsigned int GridDimY,
950     unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY,
951     unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream,
952     void **KernelParams, void **Extra);
953 static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr;
954 
955 typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t);
956 static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr;
957 
958 typedef CUresult CUDAAPI CuMemcpyHtoDFcnTy(CUdeviceptr, const void *, size_t);
959 static CuMemcpyHtoDFcnTy *CuMemcpyHtoDFcnPtr;
960 
961 typedef CUresult CUDAAPI CuMemFreeFcnTy(CUdeviceptr);
962 static CuMemFreeFcnTy *CuMemFreeFcnPtr;
963 
964 typedef CUresult CUDAAPI CuModuleUnloadFcnTy(CUmodule);
965 static CuModuleUnloadFcnTy *CuModuleUnloadFcnPtr;
966 
967 typedef CUresult CUDAAPI CuProfilerStopFcnTy();
968 static CuProfilerStopFcnTy *CuProfilerStopFcnPtr;
969 
970 typedef CUresult CUDAAPI CuCtxDestroyFcnTy(CUcontext);
971 static CuCtxDestroyFcnTy *CuCtxDestroyFcnPtr;
972 
973 typedef CUresult CUDAAPI CuInitFcnTy(unsigned int);
974 static CuInitFcnTy *CuInitFcnPtr;
975 
976 typedef CUresult CUDAAPI CuDeviceGetCountFcnTy(int *);
977 static CuDeviceGetCountFcnTy *CuDeviceGetCountFcnPtr;
978 
979 typedef CUresult CUDAAPI CuCtxCreateFcnTy(CUcontext *, unsigned int, CUdevice);
980 static CuCtxCreateFcnTy *CuCtxCreateFcnPtr;
981 
982 typedef CUresult CUDAAPI CuCtxGetCurrentFcnTy(CUcontext *);
983 static CuCtxGetCurrentFcnTy *CuCtxGetCurrentFcnPtr;
984 
985 typedef CUresult CUDAAPI CuDeviceGetFcnTy(CUdevice *, int);
986 static CuDeviceGetFcnTy *CuDeviceGetFcnPtr;
987 
988 typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *,
989                                                  unsigned int, CUjit_option *,
990                                                  void **);
991 static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;
992 
993 typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *Module,
994                                                const void *Image);
995 static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;
996 
997 typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
998                                                   const char *);
999 static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr;
1000 
1001 typedef CUresult CUDAAPI CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice);
1002 static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr;
1003 
1004 typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
1005 static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;
1006 
1007 typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState State,
1008                                             CUjitInputType Type, void *Data,
1009                                             size_t Size, const char *Name,
1010                                             unsigned int NumOptions,
1011                                             CUjit_option *Options,
1012                                             void **OptionValues);
1013 static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;
1014 
1015 typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int NumOptions,
1016                                            CUjit_option *Options,
1017                                            void **OptionValues,
1018                                            CUlinkState *StateOut);
1019 static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;
1020 
1021 typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState State, void **CubinOut,
1022                                              size_t *SizeOut);
1023 static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;
1024 
1025 typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState State);
1026 static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
1027 
1028 typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy();
1029 static CuCtxSynchronizeFcnTy *CuCtxSynchronizeFcnPtr;
1030 
1031 /* Type-defines of function pointer ot CUDA runtime APIs. */
1032 typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
1033 static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
1034 
getAPIHandleCUDA(void * Handle,const char * FuncName)1035 static void *getAPIHandleCUDA(void *Handle, const char *FuncName) {
1036   char *Err;
1037   void *FuncPtr;
1038   dlerror();
1039   FuncPtr = dlsym(Handle, FuncName);
1040   if ((Err = dlerror()) != 0) {
1041     fprintf(stderr, "Load CUDA driver API failed: %s. \n", Err);
1042     return 0;
1043   }
1044   return FuncPtr;
1045 }
1046 
initialDeviceAPILibrariesCUDA()1047 static int initialDeviceAPILibrariesCUDA() {
1048   HandleCuda = dlopen("libcuda.so", RTLD_LAZY);
1049   if (!HandleCuda) {
1050     fprintf(stderr, "Cannot open library: %s. \n", dlerror());
1051     return 0;
1052   }
1053 
1054   HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY);
1055   if (!HandleCudaRT) {
1056     fprintf(stderr, "Cannot open library: %s. \n", dlerror());
1057     return 0;
1058   }
1059 
1060   return 1;
1061 }
1062 
1063 /* Get function pointer to CUDA Driver APIs.
1064  *
1065  * Note that compilers conforming to the ISO C standard are required to
1066  * generate a warning if a conversion from a void * pointer to a function
1067  * pointer is attempted as in the following statements. The warning
1068  * of this kind of cast may not be emitted by clang and new versions of gcc
1069  * as it is valid on POSIX 2008. For compilers required to generate a warning,
1070  * we temporarily disable -Wpedantic, to avoid bloating the output with
1071  * unnecessary warnings.
1072  *
1073  * Reference:
1074  * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
1075  */
1076 #pragma GCC diagnostic push
1077 #pragma GCC diagnostic ignored "-Wpedantic"
initialDeviceAPIsCUDA()1078 static int initialDeviceAPIsCUDA() {
1079   if (initialDeviceAPILibrariesCUDA() == 0)
1080     return 0;
1081 
1082   CuLaunchKernelFcnPtr =
1083       (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel");
1084 
1085   CuMemAllocFcnPtr =
1086       (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2");
1087 
1088   CuMemAllocManagedFcnPtr = (CuMemAllocManagedFcnTy *)getAPIHandleCUDA(
1089       HandleCuda, "cuMemAllocManaged");
1090 
1091   CuMemFreeFcnPtr =
1092       (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2");
1093 
1094   CuMemcpyDtoHFcnPtr =
1095       (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2");
1096 
1097   CuMemcpyHtoDFcnPtr =
1098       (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2");
1099 
1100   CuModuleUnloadFcnPtr =
1101       (CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleUnload");
1102 
1103   CuProfilerStopFcnPtr =
1104       (CuProfilerStopFcnTy *)getAPIHandleCUDA(HandleCuda, "cuProfilerStop");
1105 
1106   CuCtxDestroyFcnPtr =
1107       (CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy");
1108 
1109   CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit");
1110 
1111   CuDeviceGetCountFcnPtr =
1112       (CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetCount");
1113 
1114   CuDeviceGetFcnPtr =
1115       (CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet");
1116 
1117   CuCtxCreateFcnPtr =
1118       (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2");
1119 
1120   CuCtxGetCurrentFcnPtr =
1121       (CuCtxGetCurrentFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxGetCurrent");
1122 
1123   CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA(
1124       HandleCuda, "cuModuleLoadDataEx");
1125 
1126   CuModuleLoadDataFcnPtr =
1127       (CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleLoadData");
1128 
1129   CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandleCUDA(
1130       HandleCuda, "cuModuleGetFunction");
1131 
1132   CuDeviceComputeCapabilityFcnPtr =
1133       (CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA(
1134           HandleCuda, "cuDeviceComputeCapability");
1135 
1136   CuDeviceGetNameFcnPtr =
1137       (CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetName");
1138 
1139   CuLinkAddDataFcnPtr =
1140       (CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkAddData");
1141 
1142   CuLinkCreateFcnPtr =
1143       (CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate");
1144 
1145   CuLinkCompleteFcnPtr =
1146       (CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkComplete");
1147 
1148   CuLinkDestroyFcnPtr =
1149       (CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkDestroy");
1150 
1151   CuCtxSynchronizeFcnPtr =
1152       (CuCtxSynchronizeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxSynchronize");
1153 
1154   /* Get function pointer to CUDA Runtime APIs. */
1155   CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandleCUDA(
1156       HandleCudaRT, "cudaThreadSynchronize");
1157 
1158   return 1;
1159 }
1160 #pragma GCC diagnostic pop
1161 
initContextCUDA()1162 static PollyGPUContext *initContextCUDA() {
1163   dump_function();
1164   PollyGPUContext *Context;
1165   CUdevice Device;
1166 
1167   int Major = 0, Minor = 0, DeviceID = 0;
1168   char DeviceName[256];
1169   int DeviceCount = 0;
1170 
1171   static __thread PollyGPUContext *CurrentContext = NULL;
1172 
1173   if (CurrentContext)
1174     return CurrentContext;
1175 
1176   /* Get API handles. */
1177   if (initialDeviceAPIsCUDA() == 0) {
1178     fprintf(stderr, "Getting the \"handle\" for the CUDA driver API failed.\n");
1179     exit(-1);
1180   }
1181 
1182   if (CuInitFcnPtr(0) != CUDA_SUCCESS) {
1183     fprintf(stderr, "Initializing the CUDA driver API failed.\n");
1184     exit(-1);
1185   }
1186 
1187   /* Get number of devices that supports CUDA. */
1188   CuDeviceGetCountFcnPtr(&DeviceCount);
1189   if (DeviceCount == 0) {
1190     fprintf(stderr, "There is no device supporting CUDA.\n");
1191     exit(-1);
1192   }
1193 
1194   CuDeviceGetFcnPtr(&Device, 0);
1195 
1196   /* Get compute capabilities and the device name. */
1197   CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, Device);
1198   CuDeviceGetNameFcnPtr(DeviceName, 256, Device);
1199   debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
1200 
1201   /* Create context on the device. */
1202   Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
1203   if (Context == 0) {
1204     fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
1205     exit(-1);
1206   }
1207   Context->Context = malloc(sizeof(CUDAContext));
1208   if (Context->Context == 0) {
1209     fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n");
1210     exit(-1);
1211   }
1212 
1213   // In cases where managed memory is used, it is quite likely that
1214   // `cudaMallocManaged` / `polly_mallocManaged` was called before
1215   // `polly_initContext` was called.
1216   //
1217   // If `polly_initContext` calls `CuCtxCreate` when there already was a
1218   // pre-existing context created by the runtime API, this causes code running
1219   // on P100 to hang. So, we query for a pre-existing context to try and use.
1220   // If there is no pre-existing context, we create a new context
1221 
1222   // The possible pre-existing context from previous runtime API calls.
1223   CUcontext MaybeRuntimeAPIContext;
1224   if (CuCtxGetCurrentFcnPtr(&MaybeRuntimeAPIContext) != CUDA_SUCCESS) {
1225     fprintf(stderr, "cuCtxGetCurrent failed.\n");
1226     exit(-1);
1227   }
1228 
1229   // There was no previous context, initialise it.
1230   if (MaybeRuntimeAPIContext == NULL) {
1231     if (CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0,
1232                           Device) != CUDA_SUCCESS) {
1233       fprintf(stderr, "cuCtxCreateFcnPtr failed.\n");
1234       exit(-1);
1235     }
1236   } else {
1237     ((CUDAContext *)Context->Context)->Cuda = MaybeRuntimeAPIContext;
1238   }
1239 
1240   if (CacheMode)
1241     CurrentContext = Context;
1242 
1243   return Context;
1244 }
1245 
freeKernelCUDA(PollyGPUFunction * Kernel)1246 static void freeKernelCUDA(PollyGPUFunction *Kernel) {
1247   dump_function();
1248 
1249   if (CacheMode)
1250     return;
1251 
1252   if (((CUDAKernel *)Kernel->Kernel)->CudaModule)
1253     CuModuleUnloadFcnPtr(((CUDAKernel *)Kernel->Kernel)->CudaModule);
1254 
1255   if (Kernel->Kernel)
1256     free((CUDAKernel *)Kernel->Kernel);
1257 
1258   if (Kernel)
1259     free(Kernel);
1260 }
1261 
getKernelCUDA(const char * BinaryBuffer,const char * KernelName)1262 static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer,
1263                                        const char *KernelName) {
1264   dump_function();
1265 
1266   static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
1267   static __thread int NextCacheItem = 0;
1268 
1269   for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
1270     // We exploit here the property that all Polly-ACC kernels are allocated
1271     // as global constants, hence a pointer comparision is sufficient to
1272     // determin equality.
1273     if (KernelCache[i] &&
1274         ((CUDAKernel *)KernelCache[i]->Kernel)->BinaryString == BinaryBuffer) {
1275       debug_print("  -> using cached kernel\n");
1276       return KernelCache[i];
1277     }
1278   }
1279 
1280   PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
1281   if (Function == 0) {
1282     fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
1283     exit(-1);
1284   }
1285   Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel));
1286   if (Function->Kernel == 0) {
1287     fprintf(stderr, "Allocate memory for Polly CUDA function failed.\n");
1288     exit(-1);
1289   }
1290 
1291   CUresult Res;
1292   CUlinkState LState;
1293   CUjit_option Options[6];
1294   void *OptionVals[6];
1295   float Walltime = 0;
1296   unsigned long LogSize = 8192;
1297   char ErrorLog[8192], InfoLog[8192];
1298   void *CuOut;
1299   size_t OutSize;
1300 
1301   // Setup linker options
1302   // Return walltime from JIT compilation
1303   Options[0] = CU_JIT_WALL_TIME;
1304   OptionVals[0] = (void *)&Walltime;
1305   // Pass a buffer for info messages
1306   Options[1] = CU_JIT_INFO_LOG_BUFFER;
1307   OptionVals[1] = (void *)InfoLog;
1308   // Pass the size of the info buffer
1309   Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
1310   OptionVals[2] = (void *)LogSize;
1311   // Pass a buffer for error message
1312   Options[3] = CU_JIT_ERROR_LOG_BUFFER;
1313   OptionVals[3] = (void *)ErrorLog;
1314   // Pass the size of the error buffer
1315   Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
1316   OptionVals[4] = (void *)LogSize;
1317   // Make the linker verbose
1318   Options[5] = CU_JIT_LOG_VERBOSE;
1319   OptionVals[5] = (void *)1;
1320 
1321   memset(ErrorLog, 0, sizeof(ErrorLog));
1322 
1323   CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
1324   Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)BinaryBuffer,
1325                             strlen(BinaryBuffer) + 1, 0, 0, 0, 0);
1326   if (Res != CUDA_SUCCESS) {
1327     fprintf(stderr, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
1328     exit(-1);
1329   }
1330 
1331   Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
1332   if (Res != CUDA_SUCCESS) {
1333     fprintf(stderr, "Complete ptx linker step failed.\n");
1334     fprintf(stderr, "\n%s\n", ErrorLog);
1335     exit(-1);
1336   }
1337 
1338   debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
1339               InfoLog);
1340 
1341   Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel *)Function->Kernel)->CudaModule),
1342                                CuOut);
1343   if (Res != CUDA_SUCCESS) {
1344     fprintf(stderr, "Loading ptx assembly text failed.\n");
1345     exit(-1);
1346   }
1347 
1348   Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel *)Function->Kernel)->Cuda),
1349                                   ((CUDAKernel *)Function->Kernel)->CudaModule,
1350                                   KernelName);
1351   if (Res != CUDA_SUCCESS) {
1352     fprintf(stderr, "Loading kernel function failed.\n");
1353     exit(-1);
1354   }
1355 
1356   CuLinkDestroyFcnPtr(LState);
1357 
1358   ((CUDAKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
1359 
1360   if (CacheMode) {
1361     if (KernelCache[NextCacheItem])
1362       freeKernelCUDA(KernelCache[NextCacheItem]);
1363 
1364     KernelCache[NextCacheItem] = Function;
1365 
1366     NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
1367   }
1368 
1369   return Function;
1370 }
1371 
synchronizeDeviceCUDA()1372 static void synchronizeDeviceCUDA() {
1373   dump_function();
1374   if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
1375     fprintf(stderr, "Synchronizing device and host memory failed.\n");
1376     exit(-1);
1377   }
1378 }
1379 
copyFromHostToDeviceCUDA(void * HostData,PollyGPUDevicePtr * DevData,long MemSize)1380 static void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr *DevData,
1381                                      long MemSize) {
1382   dump_function();
1383 
1384   CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda;
1385   CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize);
1386 }
1387 
copyFromDeviceToHostCUDA(PollyGPUDevicePtr * DevData,void * HostData,long MemSize)1388 static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void *HostData,
1389                                      long MemSize) {
1390   dump_function();
1391 
1392   if (CuMemcpyDtoHFcnPtr(HostData, ((CUDADevicePtr *)DevData->DevicePtr)->Cuda,
1393                          MemSize) != CUDA_SUCCESS) {
1394     fprintf(stderr, "Copying results from device to host memory failed.\n");
1395     exit(-1);
1396   }
1397 }
1398 
launchKernelCUDA(PollyGPUFunction * Kernel,unsigned int GridDimX,unsigned int GridDimY,unsigned int BlockDimX,unsigned int BlockDimY,unsigned int BlockDimZ,void ** Parameters)1399 static void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int GridDimX,
1400                              unsigned int GridDimY, unsigned int BlockDimX,
1401                              unsigned int BlockDimY, unsigned int BlockDimZ,
1402                              void **Parameters) {
1403   dump_function();
1404 
1405   unsigned GridDimZ = 1;
1406   unsigned int SharedMemBytes = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE;
1407   CUstream Stream = 0;
1408   void **Extra = 0;
1409 
1410   CUresult Res;
1411   Res =
1412       CuLaunchKernelFcnPtr(((CUDAKernel *)Kernel->Kernel)->Cuda, GridDimX,
1413                            GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ,
1414                            SharedMemBytes, Stream, Parameters, Extra);
1415   if (Res != CUDA_SUCCESS) {
1416     fprintf(stderr, "Launching CUDA kernel failed.\n");
1417     exit(-1);
1418   }
1419 }
1420 
1421 // Maximum number of managed memory pointers.
1422 #define DEFAULT_MAX_POINTERS 4000
1423 // For the rationale behing a list of free pointers, see `polly_freeManaged`.
1424 void **g_managedptrs;
1425 unsigned long long g_nmanagedptrs = 0;
1426 unsigned long long g_maxmanagedptrs = 0;
1427 
initManagedPtrsBuffer()1428 __attribute__((constructor)) static void initManagedPtrsBuffer() {
1429   g_maxmanagedptrs = DEFAULT_MAX_POINTERS;
1430   const char *maxManagedPointersString = getenv("POLLY_MAX_MANAGED_POINTERS");
1431   if (maxManagedPointersString)
1432     g_maxmanagedptrs = atoll(maxManagedPointersString);
1433 
1434   g_managedptrs = (void **)malloc(sizeof(void *) * g_maxmanagedptrs);
1435 }
1436 
1437 // Add a pointer as being allocated by cuMallocManaged
addManagedPtr(void * mem)1438 void addManagedPtr(void *mem) {
1439   assert(g_maxmanagedptrs > 0 && "g_maxmanagedptrs was set to 0!");
1440   assert(g_nmanagedptrs < g_maxmanagedptrs &&
1441          "We have hit the maximum number of "
1442          "managed pointers allowed. Set the "
1443          "POLLY_MAX_MANAGED_POINTERS environment variable. ");
1444   g_managedptrs[g_nmanagedptrs++] = mem;
1445 }
1446 
isManagedPtr(void * mem)1447 int isManagedPtr(void *mem) {
1448   for (unsigned long long i = 0; i < g_nmanagedptrs; i++) {
1449     if (g_managedptrs[i] == mem)
1450       return 1;
1451   }
1452   return 0;
1453 }
1454 
freeManagedCUDA(void * mem)1455 void freeManagedCUDA(void *mem) {
1456   dump_function();
1457 
1458   // In a real-world program this was used (COSMO), there were more `free`
1459   // calls in the original source than `malloc` calls. Hence, replacing all
1460   // `free`s with `cudaFree` does not work, since we would try to free
1461   // 'illegal' memory.
1462   // As a quick fix, we keep a free list and check if `mem` is a managed memory
1463   // pointer. If it is, we call `cudaFree`.
1464   // If not, we pass it along to the underlying allocator.
1465   // This is a hack, and can be removed if the underlying issue is fixed.
1466   if (isManagedPtr(mem)) {
1467     if (CuMemFreeFcnPtr((size_t)mem) != CUDA_SUCCESS) {
1468       fprintf(stderr, "cudaFree failed.\n");
1469       exit(-1);
1470     }
1471     return;
1472   } else {
1473     free(mem);
1474   }
1475 }
1476 
mallocManagedCUDA(size_t size)1477 void *mallocManagedCUDA(size_t size) {
1478   // Note: [Size 0 allocations]
1479   // Sometimes, some runtime computation of size could create a size of 0
1480   // for an allocation. In these cases, we do not wish to fail.
1481   // The CUDA API fails on size 0 allocations.
1482   // So, we allocate size a minimum of size 1.
1483   if (!size && DebugMode)
1484     fprintf(stderr, "cudaMallocManaged called with size 0. "
1485                     "Promoting to size 1");
1486   size = max(size, 1);
1487   PollyGPUContext *_ = polly_initContextCUDA();
1488   assert(_ && "polly_initContextCUDA failed");
1489 
1490   void *newMemPtr;
1491   const CUresult Res = CuMemAllocManagedFcnPtr((CUdeviceptr *)&newMemPtr, size,
1492                                                CU_MEM_ATTACH_GLOBAL);
1493   if (Res != CUDA_SUCCESS) {
1494     fprintf(stderr, "cudaMallocManaged failed for size: %zu\n", size);
1495     exit(-1);
1496   }
1497   addManagedPtr(newMemPtr);
1498   return newMemPtr;
1499 }
1500 
freeDeviceMemoryCUDA(PollyGPUDevicePtr * Allocation)1501 static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) {
1502   dump_function();
1503   CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
1504   CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda);
1505   free(DevPtr);
1506   free(Allocation);
1507 }
1508 
allocateMemoryForDeviceCUDA(long MemSize)1509 static PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) {
1510   if (!MemSize && DebugMode)
1511     fprintf(stderr, "allocateMemoryForDeviceCUDA called with size 0. "
1512                     "Promoting to size 1");
1513   // see: [Size 0 allocations]
1514   MemSize = max(MemSize, 1);
1515   dump_function();
1516 
1517   PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
1518   if (DevData == 0) {
1519     fprintf(stderr,
1520             "Allocate memory for GPU device memory pointer failed."
1521             " Line: %d | Size: %ld\n",
1522             __LINE__, MemSize);
1523     exit(-1);
1524   }
1525   DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr));
1526   if (DevData->DevicePtr == 0) {
1527     fprintf(stderr,
1528             "Allocate memory for GPU device memory pointer failed."
1529             " Line: %d | Size: %ld\n",
1530             __LINE__, MemSize);
1531     exit(-1);
1532   }
1533 
1534   CUresult Res =
1535       CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda), MemSize);
1536 
1537   if (Res != CUDA_SUCCESS) {
1538     fprintf(stderr,
1539             "Allocate memory for GPU device memory pointer failed."
1540             " Line: %d | Size: %ld\n",
1541             __LINE__, MemSize);
1542     exit(-1);
1543   }
1544 
1545   return DevData;
1546 }
1547 
getDevicePtrCUDA(PollyGPUDevicePtr * Allocation)1548 static void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) {
1549   dump_function();
1550 
1551   CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
1552   return (void *)DevPtr->Cuda;
1553 }
1554 
freeContextCUDA(PollyGPUContext * Context)1555 static void freeContextCUDA(PollyGPUContext *Context) {
1556   dump_function();
1557 
1558   CUDAContext *Ctx = (CUDAContext *)Context->Context;
1559   if (Ctx->Cuda) {
1560     CuProfilerStopFcnPtr();
1561     CuCtxDestroyFcnPtr(Ctx->Cuda);
1562     free(Ctx);
1563     free(Context);
1564   }
1565 
1566   dlclose(HandleCuda);
1567   dlclose(HandleCudaRT);
1568 }
1569 
1570 #endif /* HAS_LIBCUDART */
1571 /******************************************************************************/
1572 /*                                    API                                     */
1573 /******************************************************************************/
1574 
polly_initContext()1575 PollyGPUContext *polly_initContext() {
1576   DebugMode = getenv("POLLY_DEBUG") != 0;
1577   CacheMode = getenv("POLLY_NOCACHE") == 0;
1578 
1579   dump_function();
1580 
1581   PollyGPUContext *Context;
1582 
1583   switch (Runtime) {
1584 #ifdef HAS_LIBCUDART
1585   case RUNTIME_CUDA:
1586     Context = initContextCUDA();
1587     break;
1588 #endif /* HAS_LIBCUDART */
1589 #ifdef HAS_LIBOPENCL
1590   case RUNTIME_CL:
1591     Context = initContextCL();
1592     break;
1593 #endif /* HAS_LIBOPENCL */
1594   default:
1595     err_runtime();
1596   }
1597 
1598   return Context;
1599 }
1600 
polly_freeKernel(PollyGPUFunction * Kernel)1601 void polly_freeKernel(PollyGPUFunction *Kernel) {
1602   dump_function();
1603 
1604   switch (Runtime) {
1605 #ifdef HAS_LIBCUDART
1606   case RUNTIME_CUDA:
1607     freeKernelCUDA(Kernel);
1608     break;
1609 #endif /* HAS_LIBCUDART */
1610 #ifdef HAS_LIBOPENCL
1611   case RUNTIME_CL:
1612     freeKernelCL(Kernel);
1613     break;
1614 #endif /* HAS_LIBOPENCL */
1615   default:
1616     err_runtime();
1617   }
1618 }
1619 
polly_getKernel(const char * BinaryBuffer,const char * KernelName)1620 PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
1621                                   const char *KernelName) {
1622   dump_function();
1623 
1624   PollyGPUFunction *Function;
1625 
1626   switch (Runtime) {
1627 #ifdef HAS_LIBCUDART
1628   case RUNTIME_CUDA:
1629     Function = getKernelCUDA(BinaryBuffer, KernelName);
1630     break;
1631 #endif /* HAS_LIBCUDART */
1632 #ifdef HAS_LIBOPENCL
1633   case RUNTIME_CL:
1634     Function = getKernelCL(BinaryBuffer, KernelName);
1635     break;
1636 #endif /* HAS_LIBOPENCL */
1637   default:
1638     err_runtime();
1639   }
1640 
1641   return Function;
1642 }
1643 
polly_copyFromHostToDevice(void * HostData,PollyGPUDevicePtr * DevData,long MemSize)1644 void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
1645                                 long MemSize) {
1646   dump_function();
1647 
1648   switch (Runtime) {
1649 #ifdef HAS_LIBCUDART
1650   case RUNTIME_CUDA:
1651     copyFromHostToDeviceCUDA(HostData, DevData, MemSize);
1652     break;
1653 #endif /* HAS_LIBCUDART */
1654 #ifdef HAS_LIBOPENCL
1655   case RUNTIME_CL:
1656     copyFromHostToDeviceCL(HostData, DevData, MemSize);
1657     break;
1658 #endif /* HAS_LIBOPENCL */
1659   default:
1660     err_runtime();
1661   }
1662 }
1663 
polly_copyFromDeviceToHost(PollyGPUDevicePtr * DevData,void * HostData,long MemSize)1664 void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
1665                                 long MemSize) {
1666   dump_function();
1667 
1668   switch (Runtime) {
1669 #ifdef HAS_LIBCUDART
1670   case RUNTIME_CUDA:
1671     copyFromDeviceToHostCUDA(DevData, HostData, MemSize);
1672     break;
1673 #endif /* HAS_LIBCUDART */
1674 #ifdef HAS_LIBOPENCL
1675   case RUNTIME_CL:
1676     copyFromDeviceToHostCL(DevData, HostData, MemSize);
1677     break;
1678 #endif /* HAS_LIBOPENCL */
1679   default:
1680     err_runtime();
1681   }
1682 }
1683 
polly_launchKernel(PollyGPUFunction * Kernel,unsigned int GridDimX,unsigned int GridDimY,unsigned int BlockDimX,unsigned int BlockDimY,unsigned int BlockDimZ,void ** Parameters)1684 void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
1685                         unsigned int GridDimY, unsigned int BlockDimX,
1686                         unsigned int BlockDimY, unsigned int BlockDimZ,
1687                         void **Parameters) {
1688   dump_function();
1689 
1690   switch (Runtime) {
1691 #ifdef HAS_LIBCUDART
1692   case RUNTIME_CUDA:
1693     launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
1694                      BlockDimZ, Parameters);
1695     break;
1696 #endif /* HAS_LIBCUDART */
1697 #ifdef HAS_LIBOPENCL
1698   case RUNTIME_CL:
1699     launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ,
1700                    Parameters);
1701     break;
1702 #endif /* HAS_LIBOPENCL */
1703   default:
1704     err_runtime();
1705   }
1706 }
1707 
polly_freeDeviceMemory(PollyGPUDevicePtr * Allocation)1708 void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
1709   dump_function();
1710 
1711   switch (Runtime) {
1712 #ifdef HAS_LIBCUDART
1713   case RUNTIME_CUDA:
1714     freeDeviceMemoryCUDA(Allocation);
1715     break;
1716 #endif /* HAS_LIBCUDART */
1717 #ifdef HAS_LIBOPENCL
1718   case RUNTIME_CL:
1719     freeDeviceMemoryCL(Allocation);
1720     break;
1721 #endif /* HAS_LIBOPENCL */
1722   default:
1723     err_runtime();
1724   }
1725 }
1726 
polly_allocateMemoryForDevice(long MemSize)1727 PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
1728   dump_function();
1729 
1730   PollyGPUDevicePtr *DevData;
1731 
1732   switch (Runtime) {
1733 #ifdef HAS_LIBCUDART
1734   case RUNTIME_CUDA:
1735     DevData = allocateMemoryForDeviceCUDA(MemSize);
1736     break;
1737 #endif /* HAS_LIBCUDART */
1738 #ifdef HAS_LIBOPENCL
1739   case RUNTIME_CL:
1740     DevData = allocateMemoryForDeviceCL(MemSize);
1741     break;
1742 #endif /* HAS_LIBOPENCL */
1743   default:
1744     err_runtime();
1745   }
1746 
1747   return DevData;
1748 }
1749 
polly_getDevicePtr(PollyGPUDevicePtr * Allocation)1750 void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) {
1751   dump_function();
1752 
1753   void *DevPtr;
1754 
1755   switch (Runtime) {
1756 #ifdef HAS_LIBCUDART
1757   case RUNTIME_CUDA:
1758     DevPtr = getDevicePtrCUDA(Allocation);
1759     break;
1760 #endif /* HAS_LIBCUDART */
1761 #ifdef HAS_LIBOPENCL
1762   case RUNTIME_CL:
1763     DevPtr = getDevicePtrCL(Allocation);
1764     break;
1765 #endif /* HAS_LIBOPENCL */
1766   default:
1767     err_runtime();
1768   }
1769 
1770   return DevPtr;
1771 }
1772 
polly_synchronizeDevice()1773 void polly_synchronizeDevice() {
1774   dump_function();
1775 
1776   switch (Runtime) {
1777 #ifdef HAS_LIBCUDART
1778   case RUNTIME_CUDA:
1779     synchronizeDeviceCUDA();
1780     break;
1781 #endif /* HAS_LIBCUDART */
1782 #ifdef HAS_LIBOPENCL
1783   case RUNTIME_CL:
1784     synchronizeDeviceCL();
1785     break;
1786 #endif /* HAS_LIBOPENCL */
1787   default:
1788     err_runtime();
1789   }
1790 }
1791 
polly_freeContext(PollyGPUContext * Context)1792 void polly_freeContext(PollyGPUContext *Context) {
1793   dump_function();
1794 
1795   if (CacheMode)
1796     return;
1797 
1798   switch (Runtime) {
1799 #ifdef HAS_LIBCUDART
1800   case RUNTIME_CUDA:
1801     freeContextCUDA(Context);
1802     break;
1803 #endif /* HAS_LIBCUDART */
1804 #ifdef HAS_LIBOPENCL
1805   case RUNTIME_CL:
1806     freeContextCL(Context);
1807     break;
1808 #endif /* HAS_LIBOPENCL */
1809   default:
1810     err_runtime();
1811   }
1812 }
1813 
polly_freeManaged(void * mem)1814 void polly_freeManaged(void *mem) {
1815   dump_function();
1816 
1817 #ifdef HAS_LIBCUDART
1818   freeManagedCUDA(mem);
1819 #else
1820   fprintf(stderr, "No CUDA Runtime. Managed memory only supported by CUDA\n");
1821   exit(-1);
1822 #endif
1823 }
1824 
polly_mallocManaged(size_t size)1825 void *polly_mallocManaged(size_t size) {
1826   dump_function();
1827 
1828 #ifdef HAS_LIBCUDART
1829   return mallocManagedCUDA(size);
1830 #else
1831   fprintf(stderr, "No CUDA Runtime. Managed memory only supported by CUDA\n");
1832   exit(-1);
1833 #endif
1834 }
1835 
1836 /* Initialize GPUJIT with CUDA as runtime library. */
polly_initContextCUDA()1837 PollyGPUContext *polly_initContextCUDA() {
1838 #ifdef HAS_LIBCUDART
1839   Runtime = RUNTIME_CUDA;
1840   return polly_initContext();
1841 #else
1842   fprintf(stderr, "GPU Runtime was built without CUDA support.\n");
1843   exit(-1);
1844 #endif /* HAS_LIBCUDART */
1845 }
1846 
1847 /* Initialize GPUJIT with OpenCL as runtime library. */
polly_initContextCL()1848 PollyGPUContext *polly_initContextCL() {
1849 #ifdef HAS_LIBOPENCL
1850   Runtime = RUNTIME_CL;
1851   return polly_initContext();
1852 #else
1853   fprintf(stderr, "GPU Runtime was built without OpenCL support.\n");
1854   exit(-1);
1855 #endif /* HAS_LIBOPENCL */
1856 }
1857