1 /*------------------------------------------------------------------------------------------------*
2  * Copyright (C) by the DBCSR developers group - All rights reserved                              *
3  * This file is part of the DBCSR library.                                                        *
4  *                                                                                                *
5  * For information on the license, see the LICENSE file.                                          *
6  * For further information please visit https://dbcsr.cp2k.org                                    *
7  * SPDX-License-Identifier: GPL-2.0+                                                              *
8  *------------------------------------------------------------------------------------------------*/
9 
10 #include "parameters.h"
11 #include "parameters_utils.h"
12 #include "libsmm_acc.h"
13 #include "libsmm_acc_benchmark.h"
14 #include "smm_acc_kernels.h"
15 
16 #include <sstream>
17 #include <fstream>
18 #include <string>
19 #include <cstring>
20 #include <algorithm>
21 #include <array>
22 #include <iostream>
23 
24 #if defined _OPENMP
25 #include <omp.h>
26 #endif
27 
28 #define dbcsr_type_real_4     1
29 #define dbcsr_type_real_8     3
30 #define dbcsr_type_complex_4  5
31 #define dbcsr_type_complex_8  7
32 
33 // MACRO HELPERS
34 #define STRINGIFY_NX(x) #x
35 #define STRINGIFY(x) STRINGIFY_NX(x)
36 #define CONCAT_NX(A, B) A ## B
37 #define CONCAT(A, B) CONCAT_NX(A, B)
38 
39 // The macro ARCH_OPTION, when expanded, is a string literal containing the
40 // jit compiler option specifying the target architecture
41 #if defined(__CUDA) || defined(__HIP_PLATFORM_NVCC__)
42 #define ARCH_OPTION_NAME --gpu-architecture=compute_
43 #else
44 #define ARCH_OPTION_NAME --amdgpu-target=
45 #endif
46 #define ARCH_OPTION STRINGIFY(CONCAT(ARCH_OPTION_NAME, ARCH_NUMBER))
47 
48 
49 //===========================================================================
launch_kernel_from_handle(ACC_DRV (function)const & kern_func,int nblks,int threads,ACC_DRV (stream)stream,void ** args)50 inline int launch_kernel_from_handle(ACC_DRV(function) const& kern_func, int nblks, int threads, ACC_DRV(stream) stream, void** args){
51 
52     ACC_DRV_CALL(
53         LaunchJITKernel, (kern_func,      // kernel function,
54                           nblks, 1, 1,    // grid dimension x, y, z
55                           threads, 1, 1,  // block dimension x, y, z
56                           0, stream,      // shared memory size and stream
57                           args, NULL));   // arguments
58     return 0;
59 
60 }
61 
62 
63 //===========================================================================
validate_kernel(ACC_DRV (function)& kern_func,ACC_DRV (stream)stream,int threads,int grouping,int m,int n,int k)64 inline void validate_kernel(ACC_DRV(function)& kern_func, ACC_DRV(stream) stream, int threads, int grouping, int m, int n, int k){
65 
66     libsmm_acc_benchmark_t* h;
67     libsmm_acc_benchmark_init(&h, test, m, n, k);
68 
69     // Run the matrix-matrix multiplication on the CPU
70     memset(h->mat_c, 0, h->n_c * m * n * sizeof(double));
71     matInit(h->mat_a, h->n_a, m, k, 42);
72     matInit(h->mat_b, h->n_b, k, n, 24);
73     stackInit(h->stack, h->n_stack, h->n_c, h->mat_c, h->n_a, h->mat_a, h->n_b, h->mat_b, m, n, k);
74 
75     stackCalc(h->stack, h->n_stack, h->mat_c, h->mat_a, h->mat_b, m, n, k);
76     double sumCPU = checkSum(h->mat_c, h->n_c, m, n);
77 
78     // Run the matrix-matrix multiplication kernel on the GPU
79     ACC_API_CALL(Memcpy, (h->d_mat_a, h->mat_a, h->n_a * m * k * sizeof(double), ACC(MemcpyHostToDevice)));
80     ACC_API_CALL(Memcpy, (h->d_mat_b, h->mat_b, h->n_b * k * n * sizeof(double), ACC(MemcpyHostToDevice)));
81     ACC_API_CALL(Memcpy, (h->d_stack, h->stack, h->n_stack * 3 * sizeof(int), ACC(MemcpyHostToDevice)));
82     ACC_API_CALL(Memset, (h->d_mat_c, 0, h->n_c * m * n * sizeof(double)));
83 
84     void *args[] = { &h->d_stack, &h->n_stack, &h->d_mat_a, &h->d_mat_b, &h->d_mat_c };
85     launch_kernel_from_handle(kern_func, ((h->n_stack + grouping - 1) / grouping), threads, stream, args);
86     ACC_API_CALL(Memcpy, (h->mat_c, h->d_mat_c, h->n_c * m * n * sizeof(double), ACC(MemcpyDeviceToHost)));
87 
88     // Validate the kernel based on results
89     double sumGPU =  checkSum(h->mat_c, h->n_c, m, n);
90     if(sumGPU != sumCPU){
91         printf("Kernel validation failed for kernel %ix%ix%i\nchecksum_diff: %g\nthreads: %i, grouping: %i\n", m, n, k, sumGPU-sumCPU, threads, grouping);
92         exit(1);
93     }
94     libsmm_acc_benchmark_finalize(h);
95 }
96 
97 
98 //===========================================================================
jit_kernel(ACC_DRV (function)& kern_func,libsmm_acc_algo algo,int tile_m,int tile_n,int w,int v,int threads,int grouping,int minblocks,int m,int n,int k)99 inline void jit_kernel(ACC_DRV(function)& kern_func, libsmm_acc_algo algo, int tile_m, int tile_n, int w, int v, int threads, int grouping, int minblocks, int m, int n, int k){
100 
101     // Get the code and the lowered name corresponding the kernel to launch
102     std::string kernel_code = smm_acc_common; // prepend include file content to code
103     std::string kernel_name;
104     switch(algo) {
105         case 1:
106             kernel_code += smm_acc_dnt_largeDB1;
107             kernel_name = "smm_acc_dnt_largeDB1<" +
108                           std::to_string(m) + ", " + std::to_string(n) + ", " + std::to_string(k) + ", " +
109                           std::to_string(tile_m) + ", " + std::to_string(tile_n) + ", " +
110                           std::to_string(w) + ", " + std::to_string(v) + ", " +
111                           std::to_string(threads) + ", " + std::to_string(grouping) + ", " + std::to_string(minblocks) + ">";
112             break;
113         case 2:
114             kernel_code += smm_acc_dnt_largeDB2;
115             kernel_name = "smm_acc_dnt_largeDB2<" +
116                           std::to_string(m) + ", " + std::to_string(n) + ", " + std::to_string(k) + ", " +
117                           std::to_string(tile_m) + ", " + std::to_string(tile_n) + ", " +
118                           std::to_string(w) + ", " + std::to_string(v) + ", " +
119                           std::to_string(threads) + ", " + std::to_string(grouping) + ", " + std::to_string(minblocks) + ">";
120             break;
121         case 3:
122             kernel_code += smm_acc_dnt_medium;
123             kernel_name = "smm_acc_dnt_medium<" +
124                           std::to_string(m) + ", " + std::to_string(n) + ", " + std::to_string(k) + ", " +
125                           std::to_string(tile_m) + ", " + std::to_string(tile_n) + ", " +
126                           std::to_string(threads) + ", " + std::to_string(grouping) + ", " + std::to_string(minblocks) + ">";
127             break;
128         case 4:
129             kernel_code += smm_acc_dnt_small;
130             kernel_name = "smm_acc_dnt_small<" +
131                           std::to_string(m) + ", " + std::to_string(n) + ", " + std::to_string(k) + ", " +
132                           std::to_string(tile_m) + ", " + std::to_string(tile_n) + ", " +
133                           std::to_string(threads) + ", " + std::to_string(grouping) + ", " + std::to_string(minblocks) + ">";
134             break;
135         case 5:
136             kernel_code += smm_acc_dnt_tiny;
137             kernel_name = "smm_acc_dnt_tiny<" +
138                           std::to_string(m) + ", " + std::to_string(n) + ", " + std::to_string(k) + ", " +
139                           std::to_string(threads) + ", " + std::to_string(grouping) + ", " + std::to_string(minblocks) + ">";
140             break;
141         default:
142             printf("\nerror: algorithm number %i is not encoded.", algo);
143             exit(1);
144     }
145 
146     // Create JIT program
147     ACC_RTC(Program) kernel_program;
148     ACC_RTC_CALL(CreateProgram, (&kernel_program, kernel_code.c_str(), "smm_acc_kernel.cpp", 0, NULL, NULL));
149 
150     // Add lowered name
151     ACC_RTC_CALL(AddNameExpression, (kernel_program, kernel_name.c_str()));
152 
153     // (JIT-)compile kernel program
154 #if defined(__CUDA) || defined(__HIP_PLATFORM_NVCC__)
155     const char *compileOptions[] = {"-D__CUDA", "-w", ARCH_OPTION};
156     size_t nOptions = 3;
157 #else
158     const char *compileOptions[] = {"-D__HIP"};
159     size_t nOptions = 1;
160 #endif
161     ACC_RTC_CALL(CompileProgram, (kernel_program, nOptions, compileOptions));
162 
163     // Obtain PTX from the program.
164     size_t codeSize;
165     ACC_RTC_CALL(GetLowLevelCodeSize, (kernel_program, &codeSize));
166     char *code = new char[codeSize];
167     ACC_RTC_CALL(GetLowLevelCode, (kernel_program, code));
168 
169     // Get lowered name
170     const char *lowered_kernel_name;
171     ACC_RTC_CALL(GetLoweredName, (kernel_program, kernel_name.c_str(), &lowered_kernel_name));
172 
173     // Get pointer to kernel from PTX
174     ACC_DRV(module) module;
175     ACC_DRV_CALL(ModuleLoadDataEx, (&module, code, 0, 0, 0));
176     delete[] code;
177     ACC_DRV_CALL(ModuleGetFunction, (&kern_func, module, lowered_kernel_name));
178 
179     // Set shared memory configuration
180 #if defined(__CUDA) || defined(__HIP_PLATFORM_NVCC__)
181     ACC_DRV_CALL(FuncSetSharedMemConfig, (kern_func, ACC_DRV(SharedMemBankSizeEightByte)));
182 #endif
183 
184     // Destroy program
185     ACC_RTC_CALL(DestroyProgram, (&kernel_program));
186 }
187 
188 
add_kernel_handle_to_jitted_kernels(ACC_DRV (function)kern_func,ACC_DRV (stream)stream,Triplet h_mnk,int & threads,int & grouping)189 kernel_map_iterator add_kernel_handle_to_jitted_kernels(ACC_DRV(function) kern_func, ACC_DRV(stream) stream, Triplet h_mnk, int& threads, int& grouping){
190 
191     kernel_map_iterator kernel_it = kernel_handles.end();
192 
193     // Check whether autotuned parameters are given for this kernel, and if so, retrieve them
194     if (ht.find(h_mnk) != ht.end()){
195 
196         // Retrieve launching parameters
197         const KernelParameters params = ht.at(h_mnk);
198         libsmm_acc_algo algo = libsmm_acc_algo(params[0]); // enum {largeDB1, largeDB2, medium, small, tiny}
199         int tile_m = params[1];
200         int tile_n = params[2];
201         int w = params[3];
202         int v = params[4];
203         threads = params[5];
204         grouping = params[6];
205         int minblocks =  params[7];
206 
207         // JIT and validate the kernel
208         jit_kernel(kern_func, algo, tile_m, tile_n, w, v, threads, grouping, minblocks, h_mnk[0], h_mnk[1], h_mnk[2]);
209         validate_kernel(kern_func, stream, threads, grouping, h_mnk[0], h_mnk[1], h_mnk[2]);
210 
211         // Store the handle to the JIT-ed kernel
212         auto kernel_it_emplaced = kernel_handles.emplace(h_mnk, kernel_launcher(kern_func, threads, grouping));
213         kernel_it = kernel_it_emplaced.first;
214 
215     }
216 
217     return kernel_it;
218 
219 }
220 
221 
222 //===========================================================================
libsmm_acc_process_d(const int * param_stack,int stack_size,ACC_DRV (stream)stream,int m,int n,int k,const double * a_data,const double * b_data,double * c_data)223 int libsmm_acc_process_d(const int *param_stack, int stack_size, ACC_DRV(stream) stream, int m, int n, int k, const double *a_data, const double *b_data, double *c_data){
224 
225     ACC_DRV(function) kern_func = NULL;
226     int threads, grouping;
227     Triplet h_mnk = { m, n, k };
228     kernel_map_iterator kernel_it;
229 
230 #pragma omp critical (jit_multiplication)
231 {
232 
233     // Look up the kernel in the table of already JITed kernels
234     kernel_it = kernel_handles.find(h_mnk);
235     if (kernel_it == kernel_handles.end()){  // the kernel has not been JIT-ed yet
236 
237         kernel_it = add_kernel_handle_to_jitted_kernels(kern_func, stream, h_mnk, threads, grouping);
238 
239     }  // if the kernel could be jited successfully, the kernel_it iterator now points to the kernel_launcher.
240        // if this wasn't possible, is set to kernel_handles.end()
241 
242 }
243 
244     if (kernel_it == kernel_handles.end()){  // the kernel could not be JIT-ed, so we should fall back to CPU
245 
246         return -2; // fall back to CPU
247 
248     } else {
249 
250         // Retrieve kernel launching parameters
251         kern_func = kernel_it->second.kernel_function;
252         threads = kernel_it->second.threads;
253         grouping = kernel_it->second.grouping;
254 
255         // Construct argument pointer list and launch kernel
256         void *args[] = { &param_stack, &stack_size, &a_data, &b_data, &c_data };
257         return launch_kernel_from_handle(kern_func, ((stack_size + grouping - 1) / grouping), threads, stream, args);
258 
259     }
260 
261 }
262 
263 
264 //===========================================================================
libsmm_acc_process(const libsmm_acc_stack_descriptor_type * param_stack,int stack_size,int nparams,acc_data_t datatype,const void * a_data,const void * b_data,void * c_data,int m,int n,int k,int def_mnk,acc_stream_t * stream)265 extern "C" int libsmm_acc_process (const libsmm_acc_stack_descriptor_type *param_stack, int stack_size, int nparams, acc_data_t datatype, const void *a_data, const void *b_data, void *c_data, int m, int n, int k, int def_mnk, acc_stream_t *stream){
266     if(def_mnk!=1)
267         return -1; // inhomogeneous stacks not supported
268     if(datatype==dbcsr_type_real_8) {
269       if(m>MAX_BLOCK_DIM || n>MAX_BLOCK_DIM || k>MAX_BLOCK_DIM)
270         return -1; // maximum size over any dimension
271       else
272         return (libsmm_acc_process_d ((const int *) param_stack, stack_size, *((ACC_DRV(stream) *) stream), m, n, k, (const double *) a_data, (const double *) b_data, (double *) c_data));
273     }
274     return -1; // datatype not supported
275 };
276 
277 
278 //===========================================================================
jit_transpose_handle(ACC_DRV (function)& kern_func,int m,int n)279 void jit_transpose_handle(ACC_DRV(function)& kern_func, int m, int n){
280 
281     // Create nvrtcProgram
282     ACC_RTC(Program) kernel_program;
283     std::string transpose_code = smm_acc_common + smm_acc_transpose;
284     ACC_RTC_CALL(CreateProgram, (&kernel_program, transpose_code.c_str(), "transpose_kernel.cpp", 0, NULL, NULL));
285 
286     // Add lowered name
287     std::string kernel_name = "transpose_d<" + std::to_string(m) + ", " + std::to_string(n) + ">";
288     ACC_RTC_CALL(AddNameExpression, (kernel_program, kernel_name.c_str()));
289 
290     // (JIT-)compile
291 #if defined(__CUDA) || defined(__HIP_PLATFORM_NVCC__)
292     const char *compileOptions[] = {"-D__CUDA", "-w", ARCH_OPTION};
293     size_t nOptions = 3;
294 #else
295     const char *compileOptions[] = {"-D__HIP"};
296     size_t nOptions = 1;
297 #endif
298     ACC_RTC_CALL(CompileProgram, (kernel_program, nOptions, compileOptions));
299 
300     // Obtain PTX from the program.
301     size_t codeSize;
302     ACC_RTC_CALL(GetLowLevelCodeSize, (kernel_program, &codeSize));
303     char *code = new char[codeSize];
304     ACC_RTC_CALL(GetLowLevelCode, (kernel_program, code));
305 
306     // Get lowered name
307     const char *lowered_kernel_name;
308     ACC_RTC_CALL(GetLoweredName, (kernel_program, kernel_name.c_str(), &lowered_kernel_name));
309 
310     // Get pointer to kernel from PTX
311     ACC_DRV(module) module;
312     ACC_DRV_CALL(ModuleLoadDataEx, (&module, code, 0, 0, 0));
313     delete[] code;
314     ACC_DRV_CALL(ModuleGetFunction, (&kern_func, module, lowered_kernel_name));
315 
316     // Set shared memory configuration
317 #if defined(__CUDA) || defined(__HIP_PLATFORM_NVCC__)
318     ACC_DRV_CALL(FuncSetSharedMemConfig, (kern_func, ACC_DRV(SharedMemBankSizeEightByte)));
319 #endif
320 
321     // Destroy program
322     ACC_RTC_CALL(DestroyProgram, (&kernel_program));
323 }
324 
325 
326 //===========================================================================
libsmm_acc_transpose_d(const int * trs_stack,int offset,int nblks,double * buffer,int m,int n,ACC_DRV (stream)stream)327 int libsmm_acc_transpose_d(const int *trs_stack, int offset, int nblks,
328                            double *buffer, int m, int n, ACC_DRV(stream) stream) {
329 
330     ACC_DRV(function) kern_func;
331 
332     // Look up the kernel in the table of already JITed kernels
333     Triplet h_mnk = { m, n, 0 };
334     std::unordered_map<std::array<int, 3>, ACC_DRV(function)>::iterator kernel_it;
335 
336 #pragma omp critical (jit_transpose)
337 {
338 
339     kernel_it = transpose_handles.find(h_mnk);
340     if(kernel_it == transpose_handles.end()){  // the kernel has not been JIT-ed yet
341 
342         // JIT and store a kernel for this transposition
343         jit_transpose_handle(kern_func, m, n);
344         transpose_handles.emplace(h_mnk, kern_func);
345         kernel_it = transpose_handles.find(h_mnk);
346 
347     }
348 
349 }
350 
351     // Construct argument pointer list and launch function
352     kern_func = kernel_it->second; // retrieve handle
353     const int* trs_stack_ = trs_stack + offset;
354     void *args[] = { &trs_stack_, &buffer};
355 
356     return launch_kernel_from_handle(kern_func, nblks, 128, stream, args);
357 
358 }
359 
360 
361 //===========================================================================
libsmm_acc_transpose(const int * trs_stack,int offset,int nblks,void * buffer,acc_data_t datatype,int m,int n,acc_stream_t * stream)362 extern "C" int libsmm_acc_transpose (const int *trs_stack, int offset, int nblks, void *buffer, acc_data_t datatype, int m, int n, acc_stream_t* stream) {
363     if(datatype != dbcsr_type_real_8)
364         return 0; // transpose not needed
365     if(m>MAX_BLOCK_DIM || n>MAX_BLOCK_DIM)
366       return 0; // maximum size over any dimension
367     return libsmm_acc_transpose_d(trs_stack, offset, nblks, (double *) buffer, m, n, *((ACC_DRV(stream) *) stream));
368 }
369