1 /*
2 Copyright (C) 2016 X. Andrade
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 02110-1301, USA.
18
19 */
20
21 #include <config.h>
22
#ifdef HAVE_CUDA
#include <cuda.h>
#include <nvrtc.h>
// Pool of non-blocking streams created by cuda_init. Kernels and transfers in
// this file are submitted to phStream[current_stream]
// -> allows operations from libraries to overlap with these streams
static int number_streams = 32; // number of streams in the pool
CUstream * phStream = 0;        // array holding the stream handles
int current_stream = 0;         // index of the stream currently in use
#else
// Stand-in handle types so that the entry points below compile without CUDA.
typedef int CUstream;
typedef int CUdeviceptr;
typedef int CUfunction;
typedef int CUmodule;
typedef int CUdevice;
typedef int CUcontext;
#endif
39
40 #include <stdlib.h> //we have to include this before cmath to workaround a bug in the PGI "compiler".
41 #include <cmath>
42
43 #include <iostream>
44
45 #include <fstream>
46
47 #include "string_f.h" /* fortran <-> c string compatibility issues */
48
49 #include <vector>
50 #include <sstream>
51 #include <iterator>
52 #include <cassert>
53 #include <cstring>
54 #include <map>
55 #include <stdbool.h>
56
57 #include <fortran_types.h>
58
/* Evaluates the NVRTC call x once. If it does not return NVRTC_SUCCESS, the
   text of the failing expression and the NVRTC error string are printed to
   stderr and the program exits with status 1. */
#define NVRTC_SAFE_CALL(x)                                        \
  do {                                                            \
    const nvrtcResult nvrtc_status_ = (x);                        \
    if (NVRTC_SUCCESS != nvrtc_status_) {                         \
      std::cerr << "\nerror: " #x " failed with error "           \
                << nvrtcGetErrorString(nvrtc_status_) << '\n';    \
      exit(1);                                                    \
    }                                                             \
  } while(0)
68
/* Evaluates the CUDA driver API call x once. If it does not return
   CUDA_SUCCESS, the text of the failing expression and the driver's name for
   the error code are printed to stderr and the program exits with status 1.
   cuGetErrorName leaves the name pointer NULL for codes it does not
   recognise, so a fallback text is used instead of streaming a null pointer. */
#define CUDA_SAFE_CALL(x)                                               \
  do {                                                                  \
    CUresult result = x;                                                \
    if (result != CUDA_SUCCESS) {                                       \
      const char *msg = NULL;                                           \
      if (cuGetErrorName(result, &msg) != CUDA_SUCCESS || msg == NULL)  \
        msg = "unknown CUDA error";                                     \
      std::cerr << "\nerror: " #x " failed with error "                 \
                << msg << '\n';                                         \
      exit(1);                                                          \
    }                                                                   \
  } while(0)
80
81 using namespace std;
82
FC_FUNC_(cuda_init,CUDA_INIT)83 extern "C" void FC_FUNC_(cuda_init, CUDA_INIT)(CUcontext ** context, CUdevice ** device, CUstream ** stream, fint * device_number, fint * rank){
84
85 #ifdef HAVE_CUDA
86 CUDA_SAFE_CALL(cuInit(0));
87
88 *context = new CUcontext;
89 *device = new CUdevice;
90
91 int ndevices;
92
93 CUDA_SAFE_CALL(cuDeviceGetCount(&ndevices));
94
95 if (ndevices == 0) {
96 cerr << "Error: no CUDA devices available." << std::endl;
97 exit(1);
98 }
99
100 *device_number = (*device_number + *rank) % ndevices;
101 CUDA_SAFE_CALL(cuDeviceGet(*device, *device_number));
102
103 CUDA_SAFE_CALL(cuCtxCreate(*context, 0, **device));
104
105 CUDA_SAFE_CALL(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_L1));
106
107 phStream = new CUstream[number_streams];
108 for(current_stream = 0; current_stream < number_streams; ++current_stream) {
109 CUDA_SAFE_CALL(cuStreamCreate(&phStream[current_stream], CU_STREAM_NON_BLOCKING));
110 }
111 current_stream = 0;
112 *stream = &phStream[current_stream];
113 #endif
114 }
115
FC_FUNC_(cuda_end,CUDA_END)116 extern "C" void FC_FUNC_(cuda_end, CUDA_END)(CUcontext ** context, CUdevice ** device){
117 #ifdef HAVE_CUDA
118
119 CUDA_SAFE_CALL(cuStreamDestroy(phStream[current_stream]));
120 CUDA_SAFE_CALL(cuCtxDestroy(**context));
121
122 delete *context;
123 delete *device;
124 #endif
125 }
126
FC_FUNC_(cuda_module_map_init,CUDA_MODULES_MAP_INIT)127 extern "C" void FC_FUNC_(cuda_module_map_init, CUDA_MODULES_MAP_INIT)(map<string, CUmodule *> ** module_map){
128 *module_map = new map<string, CUmodule *>;
129 }
130
FC_FUNC_(cuda_module_map_end,CUDA_MODULES_MAP_END)131 extern "C" void FC_FUNC_(cuda_module_map_end, CUDA_MODULES_MAP_END)(map<string, CUmodule *> ** module_map){
132
133 for(map<string, CUmodule *>::iterator map_it = (**module_map).begin(); map_it != (**module_map).end(); ++map_it){
134 CUmodule * module = map_it->second;
135 #ifdef HAVE_CUDA
136 CUDA_SAFE_CALL(cuModuleUnload(*module));
137 #endif
138 delete module;
139 }
140
141 delete *module_map;
142 }
143
FC_FUNC_(cuda_build_program,CUDA_BUILD_PROGRAM)144 extern "C" void FC_FUNC_(cuda_build_program, CUDA_BUILD_PROGRAM)(map<string, CUmodule *> ** module_map, CUmodule ** module, CUdevice ** device,
145 STR_F_TYPE const fname, STR_F_TYPE const flags STR_ARG2){
146 #ifdef HAVE_CUDA
147 char *fname_c;
148 char *flags_c;
149
150 TO_C_STR1(fname, fname_c);
151 TO_C_STR2(flags, flags_c);
152
153 string map_descriptor = string(fname_c) + string(flags_c);
154
155 map<string, CUmodule *>::iterator map_it = (**module_map).find(map_descriptor);
156 if(map_it != (**module_map).end()){
157 *module = map_it->second;
158 free(fname_c);
159 return;
160 }
161
162 // read the source
163 string source;
164
165 source = "#include \"" + string(fname_c) + "\"\n";
166
167 // cout << source << "|" << endl;
168
169 nvrtcProgram prog;
170 NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog, source.c_str(), "kernel_include.c", 0, NULL, NULL));
171
172 int major = 0, minor = 0;
173 CUDA_SAFE_CALL(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, **device));
174 CUDA_SAFE_CALL(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, **device));
175
176 char compute_version[3];
177 sprintf(compute_version, "%.1d%.1d", major, minor);
178
179 string all_flags = "--gpu-architecture=compute_" + string(compute_version)
180 + " --ftz=true --fmad=true -DCUDA -default-device " + string(flags_c);
181
182 stringstream flags_stream(all_flags);
183 istream_iterator<string> iter(flags_stream);
184 istream_iterator<string> end;
185 vector<string> tokens(iter, end);
186
187 const char ** opts = new const char*[tokens.size()];
188 for (unsigned ii = 0; ii < tokens.size(); ii++) opts[ii] = tokens[ii].c_str();
189
190
191
192 nvrtcResult err = nvrtcCompileProgram(prog, tokens.size(), opts);
193
194 free(flags_c);
195
196 size_t logSize;
197 NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
198 char *log = new char[logSize];
199 NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
200
201 if(logSize > 1){
202
203 cout << "Cuda compilation messages" << endl;
204
205 cout << "File : " << fname_c << endl;
206
207 cout << "Options : " << all_flags << endl;
208
209 cout << log << endl;
210
211 }
212
213 if(NVRTC_SUCCESS != err){
214 cerr << "Error in compiling" << endl;
215 exit(1);
216 }
217
218 delete [] log;
219 delete [] opts;
220
221 // Obtain PTX from the program.
222 size_t ptxSize;
223 NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptxSize));
224 char *ptx = new char[ptxSize];
225 NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx));
226
227 NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));
228
229 *module = new CUmodule;
230
231 const int num_options = 2;
232 CUjit_option options[num_options];
233 void * option_values[num_options];
234
235 unsigned log_size = 4096;
236 char log_buffer[log_size];
237
238 options[0] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
239 option_values[0] = (void *) (long)log_size;
240
241 options[1] = CU_JIT_ERROR_LOG_BUFFER;
242 option_values[1] = (void *) log_buffer;
243
244 CUresult result = cuModuleLoadDataEx(*module, ptx, num_options, options, option_values);
245
246 if(result != CUDA_SUCCESS){
247 std::cerr << log_buffer << std::endl;
248 const char *msg;
249 cuGetErrorName(result, &msg);
250 std::cerr << "\nerror: cuModuleLoadDataEx failed with error " << msg << '\n';
251 exit(1);
252 }
253
254 delete [] ptx;
255
256 (**module_map)[map_descriptor] = *module;
257
258 free(fname_c);
259 #endif
260 }
261
FC_FUNC_(cuda_create_kernel,CUDA_CREATE_KERNEL)262 extern "C" void FC_FUNC_(cuda_create_kernel, CUDA_CREATE_KERNEL)(CUfunction ** kernel, CUmodule ** module, STR_F_TYPE kernel_name STR_ARG1){
263 #ifdef HAVE_CUDA
264 char *kernel_name_c;
265
266 TO_C_STR1(kernel_name, kernel_name_c);
267
268 *kernel = new CUfunction;
269
270 CUDA_SAFE_CALL(cuModuleGetFunction(*kernel, **module, kernel_name_c));
271
272 free(kernel_name_c);
273 #endif
274 }
275
FC_FUNC_(cuda_release_module,CUDA_RELEASE_MODULE)276 extern "C" void FC_FUNC_(cuda_release_module, CUDA_RELEASE_MODULE)(CUmodule ** module){
277 #ifdef HAVE_CUDA
278 CUDA_SAFE_CALL(cuModuleUnload(**module));
279 delete *module;
280 #endif
281 }
282
FC_FUNC_(cuda_release_kernel,CUDA_RELEASE_KERNEL)283 extern "C" void FC_FUNC_(cuda_release_kernel, CUDA_RELEASE_KERNEL)(CUfunction ** kernel){
284 #ifdef HAVE_CUDA
285 delete *kernel;
286 #endif
287 }
288
FC_FUNC_(cuda_device_max_threads_per_block,CUDA_DEVICE_MAX_THREADS_PER_BLOCK)289 extern "C" void FC_FUNC_(cuda_device_max_threads_per_block, CUDA_DEVICE_MAX_THREADS_PER_BLOCK)(CUdevice ** device, fint * max_threads){
290 #ifdef HAVE_CUDA
291 int value;
292 CUDA_SAFE_CALL(cuDeviceGetAttribute (&value, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, **device));
293 *max_threads = value;
294 #endif
295 }
296
FC_FUNC_(cuda_device_total_memory,CUDA_DEVICE_TOTAL_MEMORY)297 extern "C" void FC_FUNC_(cuda_device_total_memory, CUDA_DEVICE_TOTAL_MEMORY)(CUdevice ** device, fint8 * total_memory){
298 #ifdef HAVE_CUDA
299 size_t mem;
300 CUDA_SAFE_CALL(cuDeviceTotalMem(&mem, **device));
301 *total_memory = mem;
302 #endif
303 }
304
FC_FUNC_(cuda_device_shared_memory,CUDA_DEVICE_SHARED_MEMORY)305 extern "C" void FC_FUNC_(cuda_device_shared_memory, CUDA_DEVICE_SHARED_MEMORY)(CUdevice ** device, fint8 * shared_memory){
306 #ifdef HAVE_CUDA
307 int mem;
308 CUDA_SAFE_CALL(cuDeviceGetAttribute(&mem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, **device));
309 *shared_memory = mem;
310 #endif
311 }
312
FC_FUNC_(cuda_mem_alloc,CUDA_MEM_ALLOC)313 extern "C" void FC_FUNC_(cuda_mem_alloc, CUDA_MEM_ALLOC)(CUdeviceptr ** cuda_ptr, const fint8 * size){
314 #ifdef HAVE_CUDA
315 *cuda_ptr = new CUdeviceptr;
316 CUDA_SAFE_CALL(cuMemAlloc(*cuda_ptr, *size));
317 #endif
318 }
319
FC_FUNC_(cuda_mem_free,CUDA_MEM_FREE)320 extern "C" void FC_FUNC_(cuda_mem_free, CUDA_MEM_FREE)(CUdeviceptr ** cuda_ptr){
321 #ifdef HAVE_CUDA
322 CUDA_SAFE_CALL(cuMemFree(**cuda_ptr));
323 delete *cuda_ptr;
324 #endif
325 }
326
FC_FUNC_(cuda_memcpy_htod,CUDA_MEMCPY_HTOD)327 extern "C" void FC_FUNC_(cuda_memcpy_htod, CUDA_MEMCPY_HTOD)(CUdeviceptr ** cuda_ptr, const void * data, fint8 * size, fint8 * offset, bool * async){
328 #ifdef HAVE_CUDA
329 CUDA_SAFE_CALL(cuMemcpyHtoDAsync(**cuda_ptr + *offset, data, *size, phStream[current_stream]));
330 if(!(*async)) CUDA_SAFE_CALL(cuStreamSynchronize(phStream[current_stream]));
331 #endif
332 }
333
FC_FUNC_(cuda_memcpy_dtoh,CUDA_MEMCPY_DTOH)334 extern "C" void FC_FUNC_(cuda_memcpy_dtoh, CUDA_MEMCPY_DTOH)(CUdeviceptr ** cuda_ptr, void * data, fint8 * size, fint8 * offset, bool * async){
335 #ifdef HAVE_CUDA
336 CUDA_SAFE_CALL(cuMemcpyDtoHAsync(data, **cuda_ptr + *offset, *size, phStream[current_stream]));
337 if(!(*async)) CUDA_SAFE_CALL(cuStreamSynchronize(phStream[current_stream]));
338 #endif
339 }
340
FC_FUNC_(cuda_alloc_arg_array,CUDA_ALLOC_ARG_ARRAY)341 extern "C" void FC_FUNC_(cuda_alloc_arg_array, CUDA_ALLOC_ARG_ARRAY)(vector<void *> ** arg_array){
342 *arg_array = new vector<void *>;
343 }
344
FC_FUNC_(cuda_free_arg_array,CUDA_FREE_ARG_ARRAY)345 extern "C" void FC_FUNC_(cuda_free_arg_array, CUDA_FREE_ARG_ARRAY)(vector<void *> ** arg_array){
346
347 for(unsigned ii = 0; ii < (**arg_array).size(); ii++) free((**arg_array)[ii]);
348 delete *arg_array;
349
350 }
351
FC_FUNC_(cuda_kernel_set_arg_buffer,CUDA_KERNEL_SET_ARG_BUFFER)352 extern "C" void FC_FUNC_(cuda_kernel_set_arg_buffer, CUDA_KERNEL_SET_ARG_BUFFER)
353 (vector<void *> ** arg_array, CUdeviceptr ** cuda_ptr, fint * arg_index){
354
355 if(unsigned(*arg_index) >= (**arg_array).size()) (**arg_array).resize(*arg_index + 1, NULL);
356
357 if((**arg_array)[*arg_index] == NULL) (**arg_array)[*arg_index] = malloc(sizeof(CUdeviceptr));
358
359 memcpy((**arg_array)[*arg_index], *cuda_ptr, sizeof(CUdeviceptr));
360 }
361
FC_FUNC_(cuda_kernel_set_arg_value,CUDA_KERNEL_SET_ARG_VALUE)362 extern "C" void FC_FUNC_(cuda_kernel_set_arg_value, CUDA_KERNEL_SET_ARG_VALUE)
363 (vector<void *> ** arg_array, void * arg, fint * arg_index, fint * size){
364
365 if(unsigned(*arg_index) >= (**arg_array).size()) (**arg_array).resize(*arg_index + 1, NULL);
366
367 if((**arg_array)[*arg_index] == NULL) (**arg_array)[*arg_index] = malloc(*size);
368
369 memcpy((**arg_array)[*arg_index], arg, *size);
370
371 }
372
FC_FUNC_(cuda_context_synchronize,CUDA_CONTEXT_SYNCHRONIZE)373 extern "C" void FC_FUNC_(cuda_context_synchronize, CUDA_CONTEXT_SYNCHRONIZE)(){
374 #ifdef HAVE_CUDA
375 CUDA_SAFE_CALL(cuStreamSynchronize(phStream[current_stream]));
376 #endif
377 }
378
FC_FUNC_(cuda_synchronize_all_streams,CUDA_SYNCHRONIZE_ALL_STREAMS)379 extern "C" void FC_FUNC_(cuda_synchronize_all_streams, CUDA_SYNCHRONIZE_ALL_STREAMS)(){
380 #ifdef HAVE_CUDA
381 for(int i = 0; i < number_streams; ++i)
382 CUDA_SAFE_CALL(cuStreamSynchronize(phStream[i]));
383 #endif
384 }
385
FC_FUNC_(cuda_launch_kernel,CUDA_LAUNCH_KERNEL)386 extern "C" void FC_FUNC_(cuda_launch_kernel, CUDA_LAUNCH_KERNEL)
387 (CUfunction ** kernel, fint8 * griddim, fint8 * blockdim, fint8 * shared_mem, vector<void *> ** arg_array){
388 #ifdef HAVE_CUDA
389 /*
390 cout << "Kernel call" << endl;
391
392 int nn;
393 CUDA_SAFE_CALL(cuFuncGetAttribute(&nn, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, **kernel));
394 cout << "SIZE " << nn << endl;
395 CUDA_SAFE_CALL(cuFuncGetAttribute(&nn, CU_FUNC_ATTRIBUTE_PTX_VERSION, **kernel));
396 cout << "PTX " << nn << endl;
397 CUDA_SAFE_CALL(cuFuncGetAttribute(&nn, CU_FUNC_ATTRIBUTE_BINARY_VERSION, **kernel));
398 cout << "BINARY " << nn << endl;
399
400 for(unsigned ii = 0; ii < (**arg_array).size(); ii++) cout << ii << " " << (**arg_array)[ii] << endl;
401
402 cout << "GRID " << griddim[0] << " " << griddim[1] << " " << griddim[2] << endl;
403 cout << "BLOCK " << blockdim[0] << " " << blockdim[1] << " " << blockdim[2] << endl;
404 */
405
406 assert((**arg_array).size() > 0);
407 for(unsigned ii = 0; ii < (**arg_array).size(); ii++) assert((**arg_array)[ii] != NULL);
408
409 CUDA_SAFE_CALL(cuLaunchKernel(**kernel, griddim[0], griddim[1], griddim[2],
410 blockdim[0], blockdim[1], blockdim[2], *shared_mem, phStream[current_stream], &(**arg_array)[0], NULL));
411
412 // release the stored argument, this is not necessary in principle,
413 // but it should help us to detect missing arguments.
414 for(unsigned ii = 0; ii < (**arg_array).size(); ii++) free((**arg_array)[ii]);
415 (**arg_array).resize(0);
416 #endif
417 }
418
FC_FUNC_(cuda_device_name,CUDA_DEVICE_NAME)419 extern "C" void FC_FUNC_(cuda_device_name, CUDA_DEVICE_NAME)(CUdevice ** device, STR_F_TYPE name STR_ARG1){
420 #ifdef HAVE_CUDA
421 char devicename[200];
422 CUDA_SAFE_CALL(cuDeviceGetName(devicename, sizeof(devicename), **device));
423 TO_F_STR1(devicename, name);
424
425 #endif
426
427 }
428
FC_FUNC_(cuda_device_capability,CUDA_DEVICE_CAPABILITY)429 extern "C" void FC_FUNC_(cuda_device_capability, CUDA_DEVICE_CAPABILITY)(CUdevice ** device, fint * major, fint * minor){
430 #ifdef HAVE_CUDA
431 int cmajor = 0, cminor = 0;
432 CUDA_SAFE_CALL(cuDeviceGetAttribute(&cmajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, **device));
433 CUDA_SAFE_CALL(cuDeviceGetAttribute(&cminor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, **device));
434 *major = cmajor;
435 *minor = cminor;
436 #endif
437 }
438
FC_FUNC_(cuda_driver_version,CUDA_DRIVER_VERSION)439 extern "C" void FC_FUNC_(cuda_driver_version, CUDA_DRIVER_VERSION)(fint * version){
440 #ifdef HAVE_CUDA
441 int driverversion;
442 CUDA_SAFE_CALL(cuDriverGetVersion(&driverversion));
443 *version = driverversion;
444 #endif
445 }
446
FC_FUNC_(cuda_device_get_warpsize,CUDA_DEVICE_GET_WARPSIZE)447 extern "C" void FC_FUNC_(cuda_device_get_warpsize, CUDA_DEVICE_GET_WARPSIZE)(CUdevice ** device, fint * warpSize){
448 #ifdef HAVE_CUDA
449 int cwarpSize=0;
450 CUDA_SAFE_CALL(cuDeviceGetAttribute(&cwarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, **device));
451 *warpSize = cwarpSize;
452 #endif
453 }
454
FC_FUNC_(cuda_deref,CUDA_DEREF)455 extern "C" void FC_FUNC_(cuda_deref, CUDA_DEREF)(CUdeviceptr ** cuda_ptr, void ** cuda_deref_ptr) {
456 *cuda_deref_ptr = (void *) **cuda_ptr;
457 }
458
FC_FUNC_(cuda_set_stream,CUDA_SET_STREAM)459 extern "C" void FC_FUNC_(cuda_set_stream, CUDA_SET_STREAM)(CUstream ** stream, fint * number) {
460 #ifdef HAVE_CUDA
461 current_stream = (*number - 1) % number_streams;
462 *stream = &phStream[current_stream];
463 #endif
464 }
465