1 /*
2  Copyright (C) 2016 X. Andrade
3 
4  This program is free software; you can redistribute it and/or modify
5  it under the terms of the GNU General Public License as published by
6  the Free Software Foundation; either version 2, or (at your option)
7  any later version.
8 
9  This program is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  GNU General Public License for more details.
13 
14  You should have received a copy of the GNU General Public License
15  along with this program; if not, write to the Free Software
16  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17  02110-1301, USA.
18 
19 */
20 
21 #include <config.h>
22 
23 #ifdef HAVE_CUDA
24 #include <cuda.h>
25 #include <nvrtc.h>
// all kernels and transfers are submitted to the currently selected
// non-blocking stream, phStream[current_stream]
// -> allows operations from libraries to overlap with this stream
28 CUstream * phStream;
29 int current_stream;
30 static int number_streams = 32;
31 #else
32 typedef int CUcontext;
33 typedef int CUdevice;
34 typedef int CUmodule;
35 typedef int CUfunction;
36 typedef int CUdeviceptr;
37 typedef int CUstream;
38 #endif
39 
40 #include <stdlib.h> //we have to include this before cmath to workaround a bug in the PGI "compiler".
41 #include <cmath>
42 
43 #include <iostream>
44 
45 #include <fstream>
46 
47 #include "string_f.h" /* fortran <-> c string compatibility issues */
48 
49 #include <vector>
50 #include <sstream>
51 #include <iterator>
52 #include <cassert>
53 #include <cstring>
54 #include <map>
55 #include <stdbool.h>
56 
57 #include <fortran_types.h>
58 
// Evaluates the NVRTC call `x` exactly once. If it does not return
// NVRTC_SUCCESS, prints the stringified call together with the NVRTC error
// string to stderr and terminates the process with exit status 1.
// The macro-local variable has a long, macro-specific name so that it cannot
// shadow a variable (for instance one called `result`) used inside `x`.
#define NVRTC_SAFE_CALL(x)                                        \
  do {                                                            \
    const nvrtcResult nvrtc_safe_call_status_ = (x);              \
    if (nvrtc_safe_call_status_ != NVRTC_SUCCESS) {               \
      std::cerr << "\nerror: " #x " failed with error "           \
                << nvrtcGetErrorString(nvrtc_safe_call_status_)   \
                << '\n';                                          \
      exit(1);                                                    \
    }                                                             \
  } while(0)
68 
// Evaluates the CUDA driver API call `x` exactly once. If it does not return
// CUDA_SUCCESS, prints the stringified call and the error name to stderr and
// terminates the process with exit status 1.
// cuGetErrorName may leave the message pointer NULL for codes it does not
// recognize, so the pointer is checked before it is streamed.
// The macro-local variables have macro-specific names so that they cannot
// shadow variables (for instance one called `result`) used inside `x`.
#define CUDA_SAFE_CALL(x)                                         \
  do {                                                            \
    const CUresult cuda_safe_call_status_ = (x);                  \
    if (cuda_safe_call_status_ != CUDA_SUCCESS) {                 \
      const char *cuda_safe_call_msg_ = NULL;                     \
      cuGetErrorName(cuda_safe_call_status_,                      \
                     &cuda_safe_call_msg_);                       \
      std::cerr << "\nerror: " #x " failed with error "           \
                << (cuda_safe_call_msg_ != NULL                   \
                    ? cuda_safe_call_msg_ : "unknown error code") \
                << '\n';                                          \
      exit(1);                                                    \
    }                                                             \
  } while(0)
80 
81 using namespace std;
82 
FC_FUNC_(cuda_init,CUDA_INIT)83 extern "C" void FC_FUNC_(cuda_init, CUDA_INIT)(CUcontext ** context, CUdevice ** device, CUstream ** stream, fint * device_number, fint * rank){
84 
85 #ifdef HAVE_CUDA
86   CUDA_SAFE_CALL(cuInit(0));
87 
88   *context = new CUcontext;
89   *device = new CUdevice;
90 
91   int ndevices;
92 
93   CUDA_SAFE_CALL(cuDeviceGetCount(&ndevices));
94 
95   if (ndevices == 0) {
96     cerr << "Error: no CUDA devices available." << std::endl;
97     exit(1);
98   }
99 
100   *device_number = (*device_number + *rank) % ndevices;
101   CUDA_SAFE_CALL(cuDeviceGet(*device, *device_number));
102 
103   CUDA_SAFE_CALL(cuCtxCreate(*context, 0, **device));
104 
105   CUDA_SAFE_CALL(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_L1));
106 
107   phStream = new CUstream[number_streams];
108   for(current_stream = 0; current_stream < number_streams; ++current_stream) {
109     CUDA_SAFE_CALL(cuStreamCreate(&phStream[current_stream], CU_STREAM_NON_BLOCKING));
110   }
111   current_stream = 0;
112   *stream = &phStream[current_stream];
113 #endif
114 }
115 
FC_FUNC_(cuda_end,CUDA_END)116 extern "C" void FC_FUNC_(cuda_end, CUDA_END)(CUcontext ** context, CUdevice ** device){
117 #ifdef HAVE_CUDA
118 
119   CUDA_SAFE_CALL(cuStreamDestroy(phStream[current_stream]));
120   CUDA_SAFE_CALL(cuCtxDestroy(**context));
121 
122   delete *context;
123   delete *device;
124 #endif
125 }
126 
FC_FUNC_(cuda_module_map_init,CUDA_MODULES_MAP_INIT)127 extern "C" void FC_FUNC_(cuda_module_map_init, CUDA_MODULES_MAP_INIT)(map<string, CUmodule *> ** module_map){
128   *module_map = new map<string, CUmodule *>;
129 }
130 
FC_FUNC_(cuda_module_map_end,CUDA_MODULES_MAP_END)131 extern "C" void FC_FUNC_(cuda_module_map_end, CUDA_MODULES_MAP_END)(map<string, CUmodule *> ** module_map){
132 
133   for(map<string, CUmodule *>::iterator map_it = (**module_map).begin(); map_it != (**module_map).end(); ++map_it){
134     CUmodule * module = map_it->second;
135 #ifdef HAVE_CUDA
136     CUDA_SAFE_CALL(cuModuleUnload(*module));
137 #endif
138     delete module;
139   }
140 
141   delete *module_map;
142 }
143 
FC_FUNC_(cuda_build_program,CUDA_BUILD_PROGRAM)144 extern "C" void FC_FUNC_(cuda_build_program, CUDA_BUILD_PROGRAM)(map<string, CUmodule *> ** module_map, CUmodule ** module, CUdevice ** device,
145 								 STR_F_TYPE const fname, STR_F_TYPE const flags STR_ARG2){
146 #ifdef HAVE_CUDA
147   char *fname_c;
148   char *flags_c;
149 
150   TO_C_STR1(fname, fname_c);
151   TO_C_STR2(flags, flags_c);
152 
153   string map_descriptor = string(fname_c) + string(flags_c);
154 
155   map<string, CUmodule *>::iterator map_it = (**module_map).find(map_descriptor);
156   if(map_it != (**module_map).end()){
157     *module = map_it->second;
158     free(fname_c);
159     return;
160   }
161 
162   // read the source
163   string source;
164 
165   source = "#include \"" + string(fname_c) + "\"\n";
166 
167   // cout << source << "|" << endl;
168 
169   nvrtcProgram prog;
170   NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog, source.c_str(), "kernel_include.c", 0, NULL, NULL));
171 
172   int major = 0, minor = 0;
173   CUDA_SAFE_CALL(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, **device));
174   CUDA_SAFE_CALL(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, **device));
175 
176   char compute_version[3];
177   sprintf(compute_version, "%.1d%.1d", major, minor);
178 
179   string all_flags = "--gpu-architecture=compute_" + string(compute_version)
180     + " --ftz=true --fmad=true -DCUDA -default-device " + string(flags_c);
181 
182   stringstream flags_stream(all_flags);
183   istream_iterator<string> iter(flags_stream);
184   istream_iterator<string> end;
185   vector<string> tokens(iter, end);
186 
187   const char ** opts = new const char*[tokens.size()];
188   for (unsigned ii = 0; ii < tokens.size(); ii++) opts[ii] = tokens[ii].c_str();
189 
190 
191 
192   nvrtcResult err = nvrtcCompileProgram(prog, tokens.size(), opts);
193 
194   free(flags_c);
195 
196   size_t logSize;
197   NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
198   char *log = new char[logSize];
199   NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
200 
201   if(logSize > 1){
202 
203     cout << "Cuda compilation messages" << endl;
204 
205     cout << "File    : " << fname_c << endl;
206 
207     cout << "Options : " << all_flags << endl;
208 
209     cout << log << endl;
210 
211   }
212 
213   if(NVRTC_SUCCESS != err){
214     cerr << "Error in compiling" << endl;
215     exit(1);
216   }
217 
218   delete [] log;
219   delete [] opts;
220 
221   // Obtain PTX from the program.
222   size_t ptxSize;
223   NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptxSize));
224   char *ptx = new char[ptxSize];
225   NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx));
226 
227   NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));
228 
229   *module = new CUmodule;
230 
231   const int num_options = 2;
232   CUjit_option options[num_options];
233   void * option_values[num_options];
234 
235   unsigned log_size = 4096;
236   char log_buffer[log_size];
237 
238   options[0] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
239   option_values[0] = (void *) (long)log_size;
240 
241   options[1] = CU_JIT_ERROR_LOG_BUFFER;
242   option_values[1] = (void *) log_buffer;
243 
244   CUresult result = cuModuleLoadDataEx(*module, ptx, num_options, options, option_values);
245 
246   if(result != CUDA_SUCCESS){
247     std::cerr << log_buffer << std::endl;
248     const char *msg;
249     cuGetErrorName(result, &msg);
250     std::cerr << "\nerror: cuModuleLoadDataEx failed with error " << msg << '\n';
251     exit(1);
252   }
253 
254   delete [] ptx;
255 
256   (**module_map)[map_descriptor] = *module;
257 
258   free(fname_c);
259 #endif
260 }
261 
FC_FUNC_(cuda_create_kernel,CUDA_CREATE_KERNEL)262 extern "C" void FC_FUNC_(cuda_create_kernel, CUDA_CREATE_KERNEL)(CUfunction ** kernel, CUmodule ** module, STR_F_TYPE kernel_name STR_ARG1){
263 #ifdef HAVE_CUDA
264   char *kernel_name_c;
265 
266   TO_C_STR1(kernel_name, kernel_name_c);
267 
268   *kernel = new CUfunction;
269 
270   CUDA_SAFE_CALL(cuModuleGetFunction(*kernel, **module, kernel_name_c));
271 
272   free(kernel_name_c);
273 #endif
274 }
275 
FC_FUNC_(cuda_release_module,CUDA_RELEASE_MODULE)276 extern "C" void FC_FUNC_(cuda_release_module, CUDA_RELEASE_MODULE)(CUmodule ** module){
277 #ifdef HAVE_CUDA
278   CUDA_SAFE_CALL(cuModuleUnload(**module));
279   delete *module;
280 #endif
281 }
282 
FC_FUNC_(cuda_release_kernel,CUDA_RELEASE_KERNEL)283 extern "C" void FC_FUNC_(cuda_release_kernel, CUDA_RELEASE_KERNEL)(CUfunction ** kernel){
284 #ifdef HAVE_CUDA
285   delete *kernel;
286 #endif
287 }
288 
FC_FUNC_(cuda_device_max_threads_per_block,CUDA_DEVICE_MAX_THREADS_PER_BLOCK)289 extern "C" void FC_FUNC_(cuda_device_max_threads_per_block, CUDA_DEVICE_MAX_THREADS_PER_BLOCK)(CUdevice ** device, fint * max_threads){
290 #ifdef HAVE_CUDA
291   int value;
292   CUDA_SAFE_CALL(cuDeviceGetAttribute (&value, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, **device));
293   *max_threads = value;
294 #endif
295 }
296 
FC_FUNC_(cuda_device_total_memory,CUDA_DEVICE_TOTAL_MEMORY)297 extern "C" void FC_FUNC_(cuda_device_total_memory, CUDA_DEVICE_TOTAL_MEMORY)(CUdevice ** device, fint8 * total_memory){
298 #ifdef HAVE_CUDA
299   size_t mem;
300   CUDA_SAFE_CALL(cuDeviceTotalMem(&mem, **device));
301   *total_memory = mem;
302 #endif
303 }
304 
FC_FUNC_(cuda_device_shared_memory,CUDA_DEVICE_SHARED_MEMORY)305 extern "C" void FC_FUNC_(cuda_device_shared_memory, CUDA_DEVICE_SHARED_MEMORY)(CUdevice ** device, fint8 * shared_memory){
306 #ifdef HAVE_CUDA
307   int mem;
308   CUDA_SAFE_CALL(cuDeviceGetAttribute(&mem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, **device));
309   *shared_memory = mem;
310 #endif
311 }
312 
FC_FUNC_(cuda_mem_alloc,CUDA_MEM_ALLOC)313 extern "C" void FC_FUNC_(cuda_mem_alloc, CUDA_MEM_ALLOC)(CUdeviceptr ** cuda_ptr, const fint8 * size){
314 #ifdef HAVE_CUDA
315   *cuda_ptr = new CUdeviceptr;
316   CUDA_SAFE_CALL(cuMemAlloc(*cuda_ptr, *size));
317 #endif
318 }
319 
FC_FUNC_(cuda_mem_free,CUDA_MEM_FREE)320 extern "C" void FC_FUNC_(cuda_mem_free, CUDA_MEM_FREE)(CUdeviceptr ** cuda_ptr){
321 #ifdef HAVE_CUDA
322   CUDA_SAFE_CALL(cuMemFree(**cuda_ptr));
323   delete *cuda_ptr;
324 #endif
325 }
326 
FC_FUNC_(cuda_memcpy_htod,CUDA_MEMCPY_HTOD)327 extern "C" void FC_FUNC_(cuda_memcpy_htod, CUDA_MEMCPY_HTOD)(CUdeviceptr ** cuda_ptr, const void * data, fint8 * size, fint8 * offset, bool * async){
328 #ifdef HAVE_CUDA
329   CUDA_SAFE_CALL(cuMemcpyHtoDAsync(**cuda_ptr + *offset, data, *size, phStream[current_stream]));
330   if(!(*async)) CUDA_SAFE_CALL(cuStreamSynchronize(phStream[current_stream]));
331 #endif
332 }
333 
FC_FUNC_(cuda_memcpy_dtoh,CUDA_MEMCPY_DTOH)334 extern "C" void FC_FUNC_(cuda_memcpy_dtoh, CUDA_MEMCPY_DTOH)(CUdeviceptr ** cuda_ptr, void * data, fint8 * size, fint8 * offset, bool * async){
335 #ifdef HAVE_CUDA
336   CUDA_SAFE_CALL(cuMemcpyDtoHAsync(data, **cuda_ptr + *offset, *size, phStream[current_stream]));
337   if(!(*async)) CUDA_SAFE_CALL(cuStreamSynchronize(phStream[current_stream]));
338 #endif
339 }
340 
FC_FUNC_(cuda_alloc_arg_array,CUDA_ALLOC_ARG_ARRAY)341 extern "C" void FC_FUNC_(cuda_alloc_arg_array, CUDA_ALLOC_ARG_ARRAY)(vector<void *> ** arg_array){
342   *arg_array = new vector<void *>;
343 }
344 
FC_FUNC_(cuda_free_arg_array,CUDA_FREE_ARG_ARRAY)345 extern "C" void FC_FUNC_(cuda_free_arg_array, CUDA_FREE_ARG_ARRAY)(vector<void *> ** arg_array){
346 
347   for(unsigned ii = 0; ii < (**arg_array).size(); ii++) free((**arg_array)[ii]);
348   delete *arg_array;
349 
350 }
351 
FC_FUNC_(cuda_kernel_set_arg_buffer,CUDA_KERNEL_SET_ARG_BUFFER)352 extern "C" void FC_FUNC_(cuda_kernel_set_arg_buffer, CUDA_KERNEL_SET_ARG_BUFFER)
353   (vector<void *> ** arg_array, CUdeviceptr ** cuda_ptr, fint * arg_index){
354 
355   if(unsigned(*arg_index) >= (**arg_array).size()) (**arg_array).resize(*arg_index + 1, NULL);
356 
357   if((**arg_array)[*arg_index] == NULL) (**arg_array)[*arg_index] = malloc(sizeof(CUdeviceptr));
358 
359   memcpy((**arg_array)[*arg_index], *cuda_ptr, sizeof(CUdeviceptr));
360 }
361 
FC_FUNC_(cuda_kernel_set_arg_value,CUDA_KERNEL_SET_ARG_VALUE)362 extern "C" void FC_FUNC_(cuda_kernel_set_arg_value, CUDA_KERNEL_SET_ARG_VALUE)
363   (vector<void *> ** arg_array, void * arg, fint * arg_index, fint * size){
364 
365   if(unsigned(*arg_index) >= (**arg_array).size()) (**arg_array).resize(*arg_index + 1, NULL);
366 
367   if((**arg_array)[*arg_index] == NULL) (**arg_array)[*arg_index] = malloc(*size);
368 
369   memcpy((**arg_array)[*arg_index], arg, *size);
370 
371 }
372 
FC_FUNC_(cuda_context_synchronize,CUDA_CONTEXT_SYNCHRONIZE)373 extern "C" void FC_FUNC_(cuda_context_synchronize, CUDA_CONTEXT_SYNCHRONIZE)(){
374 #ifdef HAVE_CUDA
375   CUDA_SAFE_CALL(cuStreamSynchronize(phStream[current_stream]));
376 #endif
377 }
378 
FC_FUNC_(cuda_synchronize_all_streams,CUDA_SYNCHRONIZE_ALL_STREAMS)379 extern "C" void FC_FUNC_(cuda_synchronize_all_streams, CUDA_SYNCHRONIZE_ALL_STREAMS)(){
380 #ifdef HAVE_CUDA
381   for(int i = 0; i < number_streams; ++i)
382     CUDA_SAFE_CALL(cuStreamSynchronize(phStream[i]));
383 #endif
384 }
385 
FC_FUNC_(cuda_launch_kernel,CUDA_LAUNCH_KERNEL)386 extern "C" void FC_FUNC_(cuda_launch_kernel, CUDA_LAUNCH_KERNEL)
387   (CUfunction ** kernel, fint8 * griddim, fint8 * blockdim, fint8 * shared_mem, vector<void *> ** arg_array){
388 #ifdef HAVE_CUDA
389   /*
390   cout << "Kernel call" << endl;
391 
392   int nn;
393   CUDA_SAFE_CALL(cuFuncGetAttribute(&nn, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, **kernel));
394   cout << "SIZE   " << nn << endl;
395   CUDA_SAFE_CALL(cuFuncGetAttribute(&nn, CU_FUNC_ATTRIBUTE_PTX_VERSION, **kernel));
396   cout << "PTX    " << nn << endl;
397   CUDA_SAFE_CALL(cuFuncGetAttribute(&nn, CU_FUNC_ATTRIBUTE_BINARY_VERSION, **kernel));
398   cout << "BINARY " << nn << endl;
399 
400   for(unsigned ii = 0; ii < (**arg_array).size(); ii++) cout << ii << " " << (**arg_array)[ii] << endl;
401 
402   cout << "GRID  " << griddim[0] << " " << griddim[1] << " " <<  griddim[2] << endl;
403   cout << "BLOCK " << blockdim[0] << " " << blockdim[1] << " " <<  blockdim[2] << endl;
404   */
405 
406   assert((**arg_array).size() > 0);
407   for(unsigned ii = 0; ii < (**arg_array).size(); ii++) assert((**arg_array)[ii] != NULL);
408 
409   CUDA_SAFE_CALL(cuLaunchKernel(**kernel, griddim[0], griddim[1], griddim[2],
410          blockdim[0], blockdim[1], blockdim[2], *shared_mem, phStream[current_stream], &(**arg_array)[0], NULL));
411 
412   // release the stored argument, this is not necessary in principle,
413   // but it should help us to detect missing arguments.
414   for(unsigned ii = 0; ii < (**arg_array).size(); ii++) free((**arg_array)[ii]);
415   (**arg_array).resize(0);
416 #endif
417 }
418 
FC_FUNC_(cuda_device_name,CUDA_DEVICE_NAME)419 extern "C" void FC_FUNC_(cuda_device_name, CUDA_DEVICE_NAME)(CUdevice ** device, STR_F_TYPE name STR_ARG1){
420 #ifdef HAVE_CUDA
421   char devicename[200];
422   CUDA_SAFE_CALL(cuDeviceGetName(devicename, sizeof(devicename), **device));
423   TO_F_STR1(devicename, name);
424 
425 #endif
426 
427 }
428 
FC_FUNC_(cuda_device_capability,CUDA_DEVICE_CAPABILITY)429 extern "C" void FC_FUNC_(cuda_device_capability, CUDA_DEVICE_CAPABILITY)(CUdevice ** device, fint * major, fint * minor){
430 #ifdef HAVE_CUDA
431   int cmajor = 0, cminor = 0;
432   CUDA_SAFE_CALL(cuDeviceGetAttribute(&cmajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, **device));
433   CUDA_SAFE_CALL(cuDeviceGetAttribute(&cminor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, **device));
434   *major = cmajor;
435   *minor = cminor;
436 #endif
437 }
438 
FC_FUNC_(cuda_driver_version,CUDA_DRIVER_VERSION)439 extern "C" void FC_FUNC_(cuda_driver_version, CUDA_DRIVER_VERSION)(fint * version){
440 #ifdef HAVE_CUDA
441   int driverversion;
442   CUDA_SAFE_CALL(cuDriverGetVersion(&driverversion));
443   *version = driverversion;
444 #endif
445 }
446 
FC_FUNC_(cuda_device_get_warpsize,CUDA_DEVICE_GET_WARPSIZE)447 extern "C" void FC_FUNC_(cuda_device_get_warpsize, CUDA_DEVICE_GET_WARPSIZE)(CUdevice ** device, fint * warpSize){
448 #ifdef HAVE_CUDA
449   int cwarpSize=0;
450   CUDA_SAFE_CALL(cuDeviceGetAttribute(&cwarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, **device));
451   *warpSize = cwarpSize;
452 #endif
453 }
454 
FC_FUNC_(cuda_deref,CUDA_DEREF)455 extern "C" void FC_FUNC_(cuda_deref, CUDA_DEREF)(CUdeviceptr ** cuda_ptr, void ** cuda_deref_ptr) {
456   *cuda_deref_ptr = (void *) **cuda_ptr;
457 }
458 
FC_FUNC_(cuda_set_stream,CUDA_SET_STREAM)459 extern "C" void FC_FUNC_(cuda_set_stream, CUDA_SET_STREAM)(CUstream ** stream, fint * number) {
460 #ifdef HAVE_CUDA
461   current_stream = (*number - 1) % number_streams;
462   *stream = &phStream[current_stream];
463 #endif
464 }
465