1 /******************************************************************************/
2 /*                                                                            */
3 /* Part of the LLVM Project, under the Apache License v2.0 with LLVM          */
4 /* Exceptions.                                                                */
5 /* See https://llvm.org/LICENSE.txt for license information.                  */
6 /* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception                    */
7 /*                                                                            */
8 /******************************************************************************/
9 /*                                                                            */
10 /*  This file defines GPUJIT.                                                 */
11 /*                                                                            */
12 /******************************************************************************/
13 
14 #ifndef GPUJIT_H_
15 #define GPUJIT_H_
16 #include "stddef.h"
17 
18 /*
19  * The following demonstrates how we can use the GPURuntime library to
20  * execute a GPU kernel.
21  *
22  * char KernelString[] = "\n\
23  *   .version 1.4\n\
24  *   .target sm_10, map_f64_to_f32\n\
25  *   .entry _Z8myKernelPi (\n\
26  *   .param .u64 __cudaparm__Z8myKernelPi_data)\n\
27  *   {\n\
28  *     .reg .u16 %rh<4>;\n\
29  *     .reg .u32 %r<5>;\n\
30  *     .reg .u64 %rd<6>;\n\
31  *     cvt.u32.u16     %r1, %tid.x;\n\
32  *     mov.u16         %rh1, %ctaid.x;\n\
33  *     mov.u16         %rh2, %ntid.x;\n\
34  *     mul.wide.u16    %r2, %rh1, %rh2;\n\
35  *     add.u32         %r3, %r1, %r2;\n\
36  *     ld.param.u64    %rd1, [__cudaparm__Z8myKernelPi_data];\n\
37  *     cvt.s64.s32     %rd2, %r3;\n\
38  *     mul.wide.s32    %rd3, %r3, 4;\n\
39  *     add.u64         %rd4, %rd1, %rd3;\n\
40  *     st.global.s32   [%rd4+0], %r3;\n\
41  *     exit;\n\
42  *   }\n\
43  * ";
44  *
45  * const char *Entry = "_Z8myKernelPi";
46  *
47  * int main() {
48  *   PollyGPUFunction *Kernel;
49  *   PollyGPUContext *Context;
50  *   PollyGPUDevicePtr *DevArray;
51  *   int *HostData;
52  *   int MemSize;
53  *
54  *   int GridX = 8;
55  *   int GridY = 8;
56  *
57  *   int BlockX = 16;
58  *   int BlockY = 16;
59  *   int BlockZ = 1;
60  *
61  *   MemSize = 256*64*sizeof(int);
62  *   Context = polly_initContextCUDA();
63  *   DevArray = polly_allocateMemoryForDevice(MemSize);
64  *   Kernel = polly_getKernel(KernelString, Entry);
65  *
66  *   void *Params[1];
67  *   void *DevPtr = polly_getDevicePtr(DevArray);
68  *   Params[0] = &DevPtr;
69  *
70  *   polly_launchKernel(Kernel, GridX, GridY, BlockX, BlockY, BlockZ, Params);
71  *
72  *   polly_copyFromDeviceToHost(DevArray, HostData, MemSize);
73  *   polly_freeKernel(Kernel);
74  *   polly_freeDeviceMemory(DevArray);
75  *   polly_freeContext(Context);
76  * }
77  *
78  */
79 
80 typedef enum PollyGPURuntimeT { /* Identifiers for the GPU runtimes; how the value is consumed is not visible in this header. */
81   RUNTIME_NONE, /* 0: no runtime (first enumerator, so implicitly zero) */
82   RUNTIME_CUDA, /* 1: CUDA (cf. polly_initContextCUDA and the CUDA* types below) */
83   RUNTIME_CL /* 2: OpenCL (cf. polly_initContextCL and the OpenCL* types below) */
84 } PollyGPURuntime;
85 
86 typedef struct PollyGPUContextT PollyGPUContext; /* Context handle used by the polly_initContext* / polly_freeContext prototypes; opaque here (no struct body in this header). */
87 typedef struct PollyGPUFunctionT PollyGPUFunction; /* Kernel handle returned by polly_getKernel and consumed by polly_launchKernel / polly_freeKernel; opaque here. */
88 typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr; /* Device-allocation handle used by the copy and polly_freeDeviceMemory prototypes; opaque here. */
89 
90 typedef struct OpenCLContextT OpenCLContext; /* Forward declaration only; no prototype in this header uses it. Presumably the OpenCL-side counterpart of PollyGPUContext - confirm in the implementation. */
91 typedef struct OpenCLKernelT OpenCLKernel; /* Forward declaration only; presumably the OpenCL-side counterpart of PollyGPUFunction - confirm. */
92 typedef struct OpenCLDevicePtrT OpenCLDevicePtr; /* Forward declaration only; presumably the OpenCL-side counterpart of PollyGPUDevicePtr - confirm. */
93 
94 typedef struct CUDAContextT CUDAContext; /* Forward declaration only; no prototype in this header uses it. Presumably the CUDA-side counterpart of PollyGPUContext - confirm in the implementation. */
95 typedef struct CUDAKernelT CUDAKernel; /* Forward declaration only; presumably the CUDA-side counterpart of PollyGPUFunction - confirm. */
96 typedef struct CUDADevicePtrT CUDADevicePtr; /* Forward declaration only; presumably the CUDA-side counterpart of PollyGPUDevicePtr - confirm. */
97 
98 PollyGPUContext *polly_initContextCUDA(void); /* CUDA-flavoured context initialisation (per its name); takes no arguments - (void) makes this a real prototype so stray arguments are diagnosed. Returns a PollyGPUContext pointer. */
99 PollyGPUContext *polly_initContextCL(void); /* OpenCL-flavoured counterpart (per its name); same shape. The usage example at the top of this file hands the returned context to polly_freeContext when done. */
100 PollyGPUFunction *polly_getKernel(const char *BinaryBuffer, /* kernel image as a C string; the usage example at the top of this file passes PTX text */
101                                   const char *KernelName); /* name of the kernel entry (the example passes the mangled entry name); returns a handle for polly_launchKernel / polly_freeKernel */
102 void polly_freeKernel(PollyGPUFunction *Kernel); /* Takes a handle obtained from polly_getKernel; presumably releases it (body not in this header). The usage example calls it once the kernel is no longer needed. */
103 void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, /* argument order is source first: host buffer, then device allocation */
104                                 long MemSize); /* amount to transfer; presumably in bytes, as the usage example computes MemSize with sizeof - confirm */
105 void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData, /* argument order is source first: device allocation, then host buffer - the reverse of polly_copyFromHostToDevice */
106                                 long MemSize); /* amount to transfer; presumably in bytes, as the usage example computes MemSize with sizeof - confirm */
107 void polly_synchronizeDevice(void); /* Takes no arguments and returns nothing; (void) makes this a real prototype so stray arguments are diagnosed. Presumably blocks until outstanding device work finishes (per its name; body not in this header). */
108 void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX, /* Kernel: handle from polly_getKernel; the grid is described by two dimensions (X, Y) */
109                         unsigned int GridDimY, unsigned int BlockSizeX, /* the block is described by three dimensions (X, Y, Z) */
110                         unsigned int BlockSizeY, unsigned int BlockSizeZ,
111                         void **Parameters); /* array of pointers to the kernel argument values, as in the usage example (Params[0] = &DevPtr) */
112 void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation); /* Takes a device-allocation handle; presumably releases it (body not in this header). The usage example calls it after copying results back to the host. */
113 void polly_freeContext(PollyGPUContext *Context); /* Takes a context handle such as one returned by polly_initContextCUDA / polly_initContextCL; presumably tears it down (body not in this header). Last call in the usage example. */
114 
115 // Note that polly_{malloc/free}Managed are currently not used by Polly.
116 // We use them in COSMO by replacing all malloc with polly_mallocManaged and all
117 // frees with cudaFree, so we can get managed memory "automatically".
118 // Needless to say, this is a hack.
119 // Please make sure that this code is not present in Polly when 2018 rolls in.
120 // If this is still present, ping Siddharth Bhat <siddu.druid@gmail.com>
121 void *polly_mallocManaged(size_t size); /* malloc-shaped signature (size in, void * out); size_t comes from the stddef.h include at the top. See the preceding note for why this exists. */
122 void polly_freeManaged(void *mem); /* free-shaped signature. NOTE(review): the preceding note pairs polly_mallocManaged with cudaFree rather than with this function - confirm the intended usage. */
123 #endif /* GPUJIT_H_ */
124