1 /*------------------------------------------------------------------------------------------------*
2 * Copyright (C) by the DBCSR developers group - All rights reserved *
3 * This file is part of the DBCSR library. *
4 * *
5 * For information on the license, see the LICENSE file. *
6 * For further information please visit https://dbcsr.cp2k.org *
7 * SPDX-License-Identifier: GPL-2.0+ *
8 *------------------------------------------------------------------------------------------------*/
9
10 #include "parameters.h"
11 #include "parameters_utils.h"
12 #include "libsmm_acc.h"
13 #include "libsmm_acc_benchmark.h"
14 #include "smm_acc_kernels.h"
15
16 #include <sstream>
17 #include <fstream>
18 #include <string>
19 #include <cstring>
20 #include <algorithm>
21 #include <array>
22 #include <iostream>
23
24 #if defined _OPENMP
25 #include <omp.h>
26 #endif
27
28 #define dbcsr_type_real_4 1
29 #define dbcsr_type_real_8 3
30 #define dbcsr_type_complex_4 5
31 #define dbcsr_type_complex_8 7
32
33 // MACRO HELPERS
34 #define STRINGIFY_NX(x) #x
35 #define STRINGIFY(x) STRINGIFY_NX(x)
36 #define CONCAT_NX(A, B) A ## B
37 #define CONCAT(A, B) CONCAT_NX(A, B)
38
39 // The macro ARCH_OPTION, when expanded, is a string literal containing the
40 // jit compiler option specifying the target architecture
41 #if defined(__CUDA) || defined(__HIP_PLATFORM_NVCC__)
42 #define ARCH_OPTION_NAME --gpu-architecture=compute_
43 #else
44 #define ARCH_OPTION_NAME --amdgpu-target=
45 #endif
46 #define ARCH_OPTION STRINGIFY(CONCAT(ARCH_OPTION_NAME, ARCH_NUMBER))
47
48
49 //===========================================================================
launch_kernel_from_handle(ACC_DRV (function)const & kern_func,int nblks,int threads,ACC_DRV (stream)stream,void ** args)50 inline int launch_kernel_from_handle(ACC_DRV(function) const& kern_func, int nblks, int threads, ACC_DRV(stream) stream, void** args){
51
52 ACC_DRV_CALL(
53 LaunchJITKernel, (kern_func, // kernel function,
54 nblks, 1, 1, // grid dimension x, y, z
55 threads, 1, 1, // block dimension x, y, z
56 0, stream, // shared memory size and stream
57 args, NULL)); // arguments
58 return 0;
59
60 }
61
62
63 //===========================================================================
validate_kernel(ACC_DRV (function)& kern_func,ACC_DRV (stream)stream,int threads,int grouping,int m,int n,int k)64 inline void validate_kernel(ACC_DRV(function)& kern_func, ACC_DRV(stream) stream, int threads, int grouping, int m, int n, int k){
65
66 libsmm_acc_benchmark_t* h;
67 libsmm_acc_benchmark_init(&h, test, m, n, k);
68
69 // Run the matrix-matrix multiplication on the CPU
70 memset(h->mat_c, 0, h->n_c * m * n * sizeof(double));
71 matInit(h->mat_a, h->n_a, m, k, 42);
72 matInit(h->mat_b, h->n_b, k, n, 24);
73 stackInit(h->stack, h->n_stack, h->n_c, h->mat_c, h->n_a, h->mat_a, h->n_b, h->mat_b, m, n, k);
74
75 stackCalc(h->stack, h->n_stack, h->mat_c, h->mat_a, h->mat_b, m, n, k);
76 double sumCPU = checkSum(h->mat_c, h->n_c, m, n);
77
78 // Run the matrix-matrix multiplication kernel on the GPU
79 ACC_API_CALL(Memcpy, (h->d_mat_a, h->mat_a, h->n_a * m * k * sizeof(double), ACC(MemcpyHostToDevice)));
80 ACC_API_CALL(Memcpy, (h->d_mat_b, h->mat_b, h->n_b * k * n * sizeof(double), ACC(MemcpyHostToDevice)));
81 ACC_API_CALL(Memcpy, (h->d_stack, h->stack, h->n_stack * 3 * sizeof(int), ACC(MemcpyHostToDevice)));
82 ACC_API_CALL(Memset, (h->d_mat_c, 0, h->n_c * m * n * sizeof(double)));
83
84 void *args[] = { &h->d_stack, &h->n_stack, &h->d_mat_a, &h->d_mat_b, &h->d_mat_c };
85 launch_kernel_from_handle(kern_func, ((h->n_stack + grouping - 1) / grouping), threads, stream, args);
86 ACC_API_CALL(Memcpy, (h->mat_c, h->d_mat_c, h->n_c * m * n * sizeof(double), ACC(MemcpyDeviceToHost)));
87
88 // Validate the kernel based on results
89 double sumGPU = checkSum(h->mat_c, h->n_c, m, n);
90 if(sumGPU != sumCPU){
91 printf("Kernel validation failed for kernel %ix%ix%i\nchecksum_diff: %g\nthreads: %i, grouping: %i\n", m, n, k, sumGPU-sumCPU, threads, grouping);
92 exit(1);
93 }
94 libsmm_acc_benchmark_finalize(h);
95 }
96
97
98 //===========================================================================
jit_kernel(ACC_DRV (function)& kern_func,libsmm_acc_algo algo,int tile_m,int tile_n,int w,int v,int threads,int grouping,int minblocks,int m,int n,int k)99 inline void jit_kernel(ACC_DRV(function)& kern_func, libsmm_acc_algo algo, int tile_m, int tile_n, int w, int v, int threads, int grouping, int minblocks, int m, int n, int k){
100
101 // Get the code and the lowered name corresponding the kernel to launch
102 std::string kernel_code = smm_acc_common; // prepend include file content to code
103 std::string kernel_name;
104 switch(algo) {
105 case 1:
106 kernel_code += smm_acc_dnt_largeDB1;
107 kernel_name = "smm_acc_dnt_largeDB1<" +
108 std::to_string(m) + ", " + std::to_string(n) + ", " + std::to_string(k) + ", " +
109 std::to_string(tile_m) + ", " + std::to_string(tile_n) + ", " +
110 std::to_string(w) + ", " + std::to_string(v) + ", " +
111 std::to_string(threads) + ", " + std::to_string(grouping) + ", " + std::to_string(minblocks) + ">";
112 break;
113 case 2:
114 kernel_code += smm_acc_dnt_largeDB2;
115 kernel_name = "smm_acc_dnt_largeDB2<" +
116 std::to_string(m) + ", " + std::to_string(n) + ", " + std::to_string(k) + ", " +
117 std::to_string(tile_m) + ", " + std::to_string(tile_n) + ", " +
118 std::to_string(w) + ", " + std::to_string(v) + ", " +
119 std::to_string(threads) + ", " + std::to_string(grouping) + ", " + std::to_string(minblocks) + ">";
120 break;
121 case 3:
122 kernel_code += smm_acc_dnt_medium;
123 kernel_name = "smm_acc_dnt_medium<" +
124 std::to_string(m) + ", " + std::to_string(n) + ", " + std::to_string(k) + ", " +
125 std::to_string(tile_m) + ", " + std::to_string(tile_n) + ", " +
126 std::to_string(threads) + ", " + std::to_string(grouping) + ", " + std::to_string(minblocks) + ">";
127 break;
128 case 4:
129 kernel_code += smm_acc_dnt_small;
130 kernel_name = "smm_acc_dnt_small<" +
131 std::to_string(m) + ", " + std::to_string(n) + ", " + std::to_string(k) + ", " +
132 std::to_string(tile_m) + ", " + std::to_string(tile_n) + ", " +
133 std::to_string(threads) + ", " + std::to_string(grouping) + ", " + std::to_string(minblocks) + ">";
134 break;
135 case 5:
136 kernel_code += smm_acc_dnt_tiny;
137 kernel_name = "smm_acc_dnt_tiny<" +
138 std::to_string(m) + ", " + std::to_string(n) + ", " + std::to_string(k) + ", " +
139 std::to_string(threads) + ", " + std::to_string(grouping) + ", " + std::to_string(minblocks) + ">";
140 break;
141 default:
142 printf("\nerror: algorithm number %i is not encoded.", algo);
143 exit(1);
144 }
145
146 // Create JIT program
147 ACC_RTC(Program) kernel_program;
148 ACC_RTC_CALL(CreateProgram, (&kernel_program, kernel_code.c_str(), "smm_acc_kernel.cpp", 0, NULL, NULL));
149
150 // Add lowered name
151 ACC_RTC_CALL(AddNameExpression, (kernel_program, kernel_name.c_str()));
152
153 // (JIT-)compile kernel program
154 #if defined(__CUDA) || defined(__HIP_PLATFORM_NVCC__)
155 const char *compileOptions[] = {"-D__CUDA", "-w", ARCH_OPTION};
156 size_t nOptions = 3;
157 #else
158 const char *compileOptions[] = {"-D__HIP"};
159 size_t nOptions = 1;
160 #endif
161 ACC_RTC_CALL(CompileProgram, (kernel_program, nOptions, compileOptions));
162
163 // Obtain PTX from the program.
164 size_t codeSize;
165 ACC_RTC_CALL(GetLowLevelCodeSize, (kernel_program, &codeSize));
166 char *code = new char[codeSize];
167 ACC_RTC_CALL(GetLowLevelCode, (kernel_program, code));
168
169 // Get lowered name
170 const char *lowered_kernel_name;
171 ACC_RTC_CALL(GetLoweredName, (kernel_program, kernel_name.c_str(), &lowered_kernel_name));
172
173 // Get pointer to kernel from PTX
174 ACC_DRV(module) module;
175 ACC_DRV_CALL(ModuleLoadDataEx, (&module, code, 0, 0, 0));
176 delete[] code;
177 ACC_DRV_CALL(ModuleGetFunction, (&kern_func, module, lowered_kernel_name));
178
179 // Set shared memory configuration
180 #if defined(__CUDA) || defined(__HIP_PLATFORM_NVCC__)
181 ACC_DRV_CALL(FuncSetSharedMemConfig, (kern_func, ACC_DRV(SharedMemBankSizeEightByte)));
182 #endif
183
184 // Destroy program
185 ACC_RTC_CALL(DestroyProgram, (&kernel_program));
186 }
187
188
add_kernel_handle_to_jitted_kernels(ACC_DRV (function)kern_func,ACC_DRV (stream)stream,Triplet h_mnk,int & threads,int & grouping)189 kernel_map_iterator add_kernel_handle_to_jitted_kernels(ACC_DRV(function) kern_func, ACC_DRV(stream) stream, Triplet h_mnk, int& threads, int& grouping){
190
191 kernel_map_iterator kernel_it = kernel_handles.end();
192
193 // Check whether autotuned parameters are given for this kernel, and if so, retrieve them
194 if (ht.find(h_mnk) != ht.end()){
195
196 // Retrieve launching parameters
197 const KernelParameters params = ht.at(h_mnk);
198 libsmm_acc_algo algo = libsmm_acc_algo(params[0]); // enum {largeDB1, largeDB2, medium, small, tiny}
199 int tile_m = params[1];
200 int tile_n = params[2];
201 int w = params[3];
202 int v = params[4];
203 threads = params[5];
204 grouping = params[6];
205 int minblocks = params[7];
206
207 // JIT and validate the kernel
208 jit_kernel(kern_func, algo, tile_m, tile_n, w, v, threads, grouping, minblocks, h_mnk[0], h_mnk[1], h_mnk[2]);
209 validate_kernel(kern_func, stream, threads, grouping, h_mnk[0], h_mnk[1], h_mnk[2]);
210
211 // Store the handle to the JIT-ed kernel
212 auto kernel_it_emplaced = kernel_handles.emplace(h_mnk, kernel_launcher(kern_func, threads, grouping));
213 kernel_it = kernel_it_emplaced.first;
214
215 }
216
217 return kernel_it;
218
219 }
220
221
222 //===========================================================================
libsmm_acc_process_d(const int * param_stack,int stack_size,ACC_DRV (stream)stream,int m,int n,int k,const double * a_data,const double * b_data,double * c_data)223 int libsmm_acc_process_d(const int *param_stack, int stack_size, ACC_DRV(stream) stream, int m, int n, int k, const double *a_data, const double *b_data, double *c_data){
224
225 ACC_DRV(function) kern_func = NULL;
226 int threads, grouping;
227 Triplet h_mnk = { m, n, k };
228 kernel_map_iterator kernel_it;
229
230 #pragma omp critical (jit_multiplication)
231 {
232
233 // Look up the kernel in the table of already JITed kernels
234 kernel_it = kernel_handles.find(h_mnk);
235 if (kernel_it == kernel_handles.end()){ // the kernel has not been JIT-ed yet
236
237 kernel_it = add_kernel_handle_to_jitted_kernels(kern_func, stream, h_mnk, threads, grouping);
238
239 } // if the kernel could be jited successfully, the kernel_it iterator now points to the kernel_launcher.
240 // if this wasn't possible, is set to kernel_handles.end()
241
242 }
243
244 if (kernel_it == kernel_handles.end()){ // the kernel could not be JIT-ed, so we should fall back to CPU
245
246 return -2; // fall back to CPU
247
248 } else {
249
250 // Retrieve kernel launching parameters
251 kern_func = kernel_it->second.kernel_function;
252 threads = kernel_it->second.threads;
253 grouping = kernel_it->second.grouping;
254
255 // Construct argument pointer list and launch kernel
256 void *args[] = { ¶m_stack, &stack_size, &a_data, &b_data, &c_data };
257 return launch_kernel_from_handle(kern_func, ((stack_size + grouping - 1) / grouping), threads, stream, args);
258
259 }
260
261 }
262
263
264 //===========================================================================
libsmm_acc_process(const libsmm_acc_stack_descriptor_type * param_stack,int stack_size,int nparams,acc_data_t datatype,const void * a_data,const void * b_data,void * c_data,int m,int n,int k,int def_mnk,acc_stream_t * stream)265 extern "C" int libsmm_acc_process (const libsmm_acc_stack_descriptor_type *param_stack, int stack_size, int nparams, acc_data_t datatype, const void *a_data, const void *b_data, void *c_data, int m, int n, int k, int def_mnk, acc_stream_t *stream){
266 if(def_mnk!=1)
267 return -1; // inhomogeneous stacks not supported
268 if(datatype==dbcsr_type_real_8) {
269 if(m>MAX_BLOCK_DIM || n>MAX_BLOCK_DIM || k>MAX_BLOCK_DIM)
270 return -1; // maximum size over any dimension
271 else
272 return (libsmm_acc_process_d ((const int *) param_stack, stack_size, *((ACC_DRV(stream) *) stream), m, n, k, (const double *) a_data, (const double *) b_data, (double *) c_data));
273 }
274 return -1; // datatype not supported
275 };
276
277
278 //===========================================================================
jit_transpose_handle(ACC_DRV (function)& kern_func,int m,int n)279 void jit_transpose_handle(ACC_DRV(function)& kern_func, int m, int n){
280
281 // Create nvrtcProgram
282 ACC_RTC(Program) kernel_program;
283 std::string transpose_code = smm_acc_common + smm_acc_transpose;
284 ACC_RTC_CALL(CreateProgram, (&kernel_program, transpose_code.c_str(), "transpose_kernel.cpp", 0, NULL, NULL));
285
286 // Add lowered name
287 std::string kernel_name = "transpose_d<" + std::to_string(m) + ", " + std::to_string(n) + ">";
288 ACC_RTC_CALL(AddNameExpression, (kernel_program, kernel_name.c_str()));
289
290 // (JIT-)compile
291 #if defined(__CUDA) || defined(__HIP_PLATFORM_NVCC__)
292 const char *compileOptions[] = {"-D__CUDA", "-w", ARCH_OPTION};
293 size_t nOptions = 3;
294 #else
295 const char *compileOptions[] = {"-D__HIP"};
296 size_t nOptions = 1;
297 #endif
298 ACC_RTC_CALL(CompileProgram, (kernel_program, nOptions, compileOptions));
299
300 // Obtain PTX from the program.
301 size_t codeSize;
302 ACC_RTC_CALL(GetLowLevelCodeSize, (kernel_program, &codeSize));
303 char *code = new char[codeSize];
304 ACC_RTC_CALL(GetLowLevelCode, (kernel_program, code));
305
306 // Get lowered name
307 const char *lowered_kernel_name;
308 ACC_RTC_CALL(GetLoweredName, (kernel_program, kernel_name.c_str(), &lowered_kernel_name));
309
310 // Get pointer to kernel from PTX
311 ACC_DRV(module) module;
312 ACC_DRV_CALL(ModuleLoadDataEx, (&module, code, 0, 0, 0));
313 delete[] code;
314 ACC_DRV_CALL(ModuleGetFunction, (&kern_func, module, lowered_kernel_name));
315
316 // Set shared memory configuration
317 #if defined(__CUDA) || defined(__HIP_PLATFORM_NVCC__)
318 ACC_DRV_CALL(FuncSetSharedMemConfig, (kern_func, ACC_DRV(SharedMemBankSizeEightByte)));
319 #endif
320
321 // Destroy program
322 ACC_RTC_CALL(DestroyProgram, (&kernel_program));
323 }
324
325
326 //===========================================================================
libsmm_acc_transpose_d(const int * trs_stack,int offset,int nblks,double * buffer,int m,int n,ACC_DRV (stream)stream)327 int libsmm_acc_transpose_d(const int *trs_stack, int offset, int nblks,
328 double *buffer, int m, int n, ACC_DRV(stream) stream) {
329
330 ACC_DRV(function) kern_func;
331
332 // Look up the kernel in the table of already JITed kernels
333 Triplet h_mnk = { m, n, 0 };
334 std::unordered_map<std::array<int, 3>, ACC_DRV(function)>::iterator kernel_it;
335
336 #pragma omp critical (jit_transpose)
337 {
338
339 kernel_it = transpose_handles.find(h_mnk);
340 if(kernel_it == transpose_handles.end()){ // the kernel has not been JIT-ed yet
341
342 // JIT and store a kernel for this transposition
343 jit_transpose_handle(kern_func, m, n);
344 transpose_handles.emplace(h_mnk, kern_func);
345 kernel_it = transpose_handles.find(h_mnk);
346
347 }
348
349 }
350
351 // Construct argument pointer list and launch function
352 kern_func = kernel_it->second; // retrieve handle
353 const int* trs_stack_ = trs_stack + offset;
354 void *args[] = { &trs_stack_, &buffer};
355
356 return launch_kernel_from_handle(kern_func, nblks, 128, stream, args);
357
358 }
359
360
361 //===========================================================================
libsmm_acc_transpose(const int * trs_stack,int offset,int nblks,void * buffer,acc_data_t datatype,int m,int n,acc_stream_t * stream)362 extern "C" int libsmm_acc_transpose (const int *trs_stack, int offset, int nblks, void *buffer, acc_data_t datatype, int m, int n, acc_stream_t* stream) {
363 if(datatype != dbcsr_type_real_8)
364 return 0; // transpose not needed
365 if(m>MAX_BLOCK_DIM || n>MAX_BLOCK_DIM)
366 return 0; // maximum size over any dimension
367 return libsmm_acc_transpose_d(trs_stack, offset, nblks, (double *) buffer, m, n, *((ACC_DRV(stream) *) stream));
368 }
369