1 /*****************************************************************************
2 * CP2K: A general program to perform molecular dynamics simulations *
3 * Copyright (C) 2000 - 2019 CP2K developers group *
4 *****************************************************************************/
5
6 /******************************************************************************
7 * Authors: Benjamin G Levine, Andreas Gloess
8 *
9 * 2012/05/18 Refacturing - original files:
10 * - cuda_tools/cufft.h
11 * - cuda_tools/cufft.cu
12 *
13 *****************************************************************************/
14 #if defined ( __PW_CUDA )
15
16 // global dependencies
17 #include <cuda_runtime.h>
18 #include <cufft.h>
19 #include <cublas.h>
20 #include <stdio.h>
21 #include <sys/types.h>
22 #include <unistd.h>
23
24 // local dependencies
25 #include "pw_cuda_utils.h"
26 #include "fft_cuda_utils.h"
27 #include "fft_cuda_internal.h"
28
29 // debug flag
30 #define CHECK 1
31 #define VERBOSE 0
32
33 // dimensions
34 static int n_plans = 0;
35 static cufftHandle saved_plans[max_plans];
36 static int iplandims[max_plans][5];
37
38 // --- CODE -------------------------------------------------------------------
39
40
41 /******************************************************************************
42 * \brief Sets up and save a double precision complex 3D-FFT plan on the GPU.
43 * Saved plans are reused if they fit the requirements.
44 * \author Andreas Gloess
45 * \date 2012-05-18
46 * \version 0.01
47 *****************************************************************************/
fftcu_plan3d_z(cufftHandle & plan,int & ioverflow,const int * n,const cudaStream_t cuda_stream)48 void fftcu_plan3d_z( cufftHandle &plan,
49 int &ioverflow,
50 const int *n,
51 const cudaStream_t cuda_stream) {
52
53 int i;
54 cufftResult_t cErr;
55
56 ioverflow = 0;
57 for (i = 0; i < n_plans; i++) {
58 if ( iplandims[i][0] == 3 &&
59 iplandims[i][1] == n[0] &&
60 iplandims[i][2] == n[1] &&
61 iplandims[i][3] == n[2] &&
62 iplandims[i][4] == 0 ) {
63 plan = saved_plans[i];
64 return;
65 }
66 }
67
68 if (VERBOSE) printf("FFT 3D (%d-%d-%d)\n", n[0], n[1], n[2]);
69 cErr = cufftPlan3d(&plan, n[2], n[1], n[0], CUFFT_Z2Z);
70 if (CHECK) cufft_error_check(cErr, __LINE__);
71 cErr = cufftSetStream(plan, cuda_stream);
72 if (CHECK) cufft_error_check(cErr, __LINE__);
73 #if (__CUDACC_VER_MAJOR__<8)
74 cErr = cufftSetCompatibilityMode(plan, FFT_ALIGNMENT);
75 if (CHECK) cufft_error_check(cErr, __LINE__);
76 #endif
77
78 if ( n_plans < max_3d_plans ) {
79 saved_plans[n_plans] = plan;
80 iplandims[n_plans][0] = 3;
81 iplandims[n_plans][1] = n[0];
82 iplandims[n_plans][2] = n[1];
83 iplandims[n_plans][3] = n[2];
84 iplandims[n_plans][4] = 0;
85 n_plans++;
86 return;
87 }
88 ioverflow=1;
89 }
90
91 /******************************************************************************
92 * \brief Sets up and save a double precision complex 2D-FFT plan on the GPU.
93 * Saved plans are reused if they fit the requirements.
94 * \author Andreas Gloess
95 * \date 2012-07-16
96 * \version 0.01
97 *****************************************************************************/
fftcu_plan2dm_z(cufftHandle & plan,int & ioverflow,const int * n,const int fsign,const cudaStream_t cuda_stream)98 void fftcu_plan2dm_z( cufftHandle &plan,
99 int &ioverflow,
100 const int *n,
101 const int fsign,
102 const cudaStream_t cuda_stream) {
103
104 int i, istride, idist, ostride, odist, batch;
105 int nsize[2], inembed[2], onembed[2];
106 cufftResult_t cErr;
107
108 ioverflow = 0;
109 for (i = 0; i < n_plans; i++) {
110 if ( iplandims[i][0] == 2 &&
111 iplandims[i][1] == n[0] &&
112 iplandims[i][2] == n[1] &&
113 iplandims[i][3] == n[2] &&
114 iplandims[i][4] == fsign ) {
115 plan = saved_plans[i];
116 return;
117 }
118 }
119
120 nsize[0] = n[2];
121 nsize[1] = n[1];
122 inembed[0] = n[2];
123 inembed[1] = n[1];
124 onembed[0] = n[2];
125 onembed[1] = n[1];
126 batch = n[0];
127 if ( fsign == +1 ) {
128 istride = n[0];
129 idist = 1;
130 ostride = 1;
131 odist = n[1]*n[2];
132 } else {
133 istride = 1;
134 idist = n[1]*n[2];
135 ostride = n[0];
136 odist = 1;
137 }
138
139 if (VERBOSE) printf("FFT 2D (%d) (%d-%d-%d) %d %d %d %d\n", fsign, n[0], n[1], n[2], istride, idist, ostride, odist);
140 cErr = cufftPlanMany(&plan, 2, nsize, inembed, istride, idist, onembed, ostride, odist, CUFFT_Z2Z, batch);
141 if (CHECK) cufft_error_check(cErr, __LINE__);
142 cErr = cufftSetStream(plan, cuda_stream);
143 if (CHECK) cufft_error_check(cErr, __LINE__);
144 #if (__CUDACC_VER_MAJOR__<8)
145 cErr = cufftSetCompatibilityMode(plan, FFT_ALIGNMENT);
146 if (CHECK) cufft_error_check(cErr, __LINE__);
147 #endif
148
149 if ( n_plans < max_2d_plans ) {
150 saved_plans[n_plans] = plan;
151 iplandims[n_plans][0] = 2;
152 iplandims[n_plans][1] = n[0];
153 iplandims[n_plans][2] = n[1];
154 iplandims[n_plans][3] = n[2];
155 iplandims[n_plans][4] = fsign;
156 n_plans++;
157 return;
158 }
159
160 ioverflow=1;
161 }
162
163 /******************************************************************************
164 * \brief Sets up and save a double precision complex 1D-FFT plan on the GPU.
165 * Saved plans are reused if they fit the requirements.
166 * \author Andreas Gloess
167 * \date 2012-07-04
168 * \version 0.01
169 *****************************************************************************/
fftcu_plan1dm_z(cufftHandle & plan,int & ioverflow,const int n,const int m,const int fsign,const cudaStream_t cuda_stream)170 void fftcu_plan1dm_z( cufftHandle &plan,
171 int &ioverflow,
172 const int n,
173 const int m,
174 const int fsign,
175 const cudaStream_t cuda_stream) {
176
177 int i, istride, idist, ostride, odist, batch;
178 int nsize[1], inembed[1], onembed[1];
179 cufftResult_t cErr;
180
181 ioverflow = 0;
182 for (i = 0; i<n_plans; i++) {
183 if ( iplandims[i][0] == 1 &&
184 iplandims[i][1] == n &&
185 iplandims[i][2] == m &&
186 iplandims[i][3] == 0 &&
187 iplandims[i][4] == fsign ) {
188 plan = saved_plans[i];
189 return;
190 }
191 }
192
193 nsize[0] = n;
194 inembed[0] = 0; // is ignored, but is not allowed to be NULL pointer (for adv. strided I/O)
195 onembed[0] = 0; // is ignored, but is not allowed to be NULL pointer (for adv. strided I/O)
196 batch = m;
197 if ( fsign == +1 ) {
198 istride = m;
199 idist = 1;
200 ostride = 1;
201 odist = n;
202 } else {
203 istride = 1;
204 idist = n;
205 ostride = m;
206 odist = 1;
207 }
208
209 if (VERBOSE) printf("FFT 1D (%d) (%d-%d) %d %d %d %d\n", fsign, n, m, istride, idist, ostride, odist);
210 cErr = cufftPlanMany(&plan, 1, nsize, inembed, istride, idist, onembed, ostride, odist, CUFFT_Z2Z, batch);
211 if (CHECK) cufft_error_check(cErr, __LINE__);
212 cErr = cufftSetStream(plan, cuda_stream);
213 if (CHECK) cufft_error_check(cErr, __LINE__);
214 #if (__CUDACC_VER_MAJOR__<8)
215 cErr = cufftSetCompatibilityMode(plan, FFT_ALIGNMENT);
216 if (CHECK) cufft_error_check(cErr, __LINE__);
217 #endif
218
219 if ( n_plans < max_1d_plans ) {
220 saved_plans[n_plans] = plan;
221 iplandims[n_plans][0] = 1;
222 iplandims[n_plans][1] = n;
223 iplandims[n_plans][2] = m;
224 iplandims[n_plans][3] = 0;
225 iplandims[n_plans][4] = fsign;
226 n_plans++;
227 return;
228 }
229
230 ioverflow=1;
231 }
232
233 /******************************************************************************
234 * \brief Performs a scaled double precision complex 3D-FFT on the GPU.
235 * Input/output is a DEVICE pointer (data).
236 * \author Andreas Gloess
237 * \date 2012-05-18
238 * \version 0.01
239 *****************************************************************************/
fftcu_run_3d_z_(const int fsign,const int * n,const double scale,cufftDoubleComplex * data,const cudaStream_t cuda_stream)240 extern "C" void fftcu_run_3d_z_(const int fsign,
241 const int *n,
242 const double scale,
243 cufftDoubleComplex *data,
244 const cudaStream_t cuda_stream) {
245
246 int ioverflow, lmem;
247 cufftHandle plan;
248 cufftResult_t cErr;
249 cudaError_t cuErr;
250
251 lmem = n[0] * n[1] * n[2];
252
253 fftcu_plan3d_z(plan, ioverflow, n, cuda_stream);
254 if ( fsign < 0 ) {
255 cErr = cufftExecZ2Z(plan, data, data, CUFFT_INVERSE);
256 if (CHECK) cufft_error_check(cErr, __LINE__);
257 }
258 else {
259 cErr = cufftExecZ2Z(plan, data, data, CUFFT_FORWARD);
260 if (CHECK) cufft_error_check(cErr, __LINE__);
261 }
262
263 if (scale != 1.0e0) {
264 cuErr = cudaStreamSynchronize(cuda_stream);
265 if (CHECK) pw_cuda_error_check(cuErr, __LINE__);
266 cublasDscal(2*lmem, scale, (double *) data, 1);
267 }
268
269 if (ioverflow) {
270 cuErr = cudaStreamSynchronize(cuda_stream);
271 if (CHECK) pw_cuda_error_check(cuErr, __LINE__);
272 cErr = cufftDestroy(plan);
273 if (CHECK) cufft_error_check(cErr, __LINE__);
274 }
275 }
276
277 /******************************************************************************
278 * \brief Performs a scaled double precision complex 2D-FFT many times on
279 * the GPU.
280 * Input/output are DEVICE pointers (data_in, date_out).
281 * \author Andreas Gloess
282 * \date 2012-07-16
283 * \version 0.01
284 *****************************************************************************/
fftcu_run_2dm_z_(const int fsign,const int * n,const double scale,cufftDoubleComplex * data_in,cufftDoubleComplex * data_out,const cudaStream_t cuda_stream)285 extern "C" void fftcu_run_2dm_z_(const int fsign,
286 const int *n,
287 const double scale,
288 cufftDoubleComplex *data_in,
289 cufftDoubleComplex *data_out,
290 const cudaStream_t cuda_stream) {
291
292 int ioverflow, lmem;
293 cufftHandle plan;
294 cufftResult_t cErr;
295 cudaError_t cuErr;
296
297 lmem = n[0] * n[1] * n[2];
298
299 fftcu_plan2dm_z(plan, ioverflow, n, fsign, cuda_stream);
300 if ( fsign < 0 ) {
301 cErr = cufftExecZ2Z(plan, data_in, data_out, CUFFT_INVERSE);
302 if (CHECK) cufft_error_check(cErr, __LINE__);
303 }
304 else {
305 cErr = cufftExecZ2Z(plan, data_in, data_out, CUFFT_FORWARD);
306 if (CHECK) cufft_error_check(cErr, __LINE__);
307 }
308
309 if (scale != 1.0e0) {
310 cuErr = cudaStreamSynchronize(cuda_stream);
311 if (CHECK) pw_cuda_error_check(cuErr, __LINE__);
312 cublasDscal(2 * lmem, scale, (double *) data_out, 1);
313 }
314
315 if (ioverflow) {
316 cuErr = cudaStreamSynchronize(cuda_stream);
317 if (CHECK) pw_cuda_error_check(cuErr, __LINE__);
318 cErr = cufftDestroy(plan);
319 if (CHECK) cufft_error_check(cErr, __LINE__);
320 }
321 }
322
323 /******************************************************************************
324 * \brief Performs a scaled double precision complex 1D-FFT many times on
325 * the GPU.
326 * Input/output are DEVICE pointers (data_in, date_out).
327 * \author Andreas Gloess
328 * \date 2012-05-18
329 * \version 0.01
330 *****************************************************************************/
fftcu_run_1dm_z_(const int fsign,const int n,const int m,const double scale,cufftDoubleComplex * data_in,cufftDoubleComplex * data_out,const cudaStream_t cuda_stream)331 extern "C" void fftcu_run_1dm_z_(const int fsign,
332 const int n,
333 const int m,
334 const double scale,
335 cufftDoubleComplex *data_in,
336 cufftDoubleComplex *data_out,
337 const cudaStream_t cuda_stream) {
338
339 int ioverflow, lmem;
340 cufftHandle plan;
341 cufftResult_t cErr;
342 cudaError_t cuErr;
343
344 lmem = n * m;
345
346 fftcu_plan1dm_z(plan, ioverflow, n, m, fsign, cuda_stream);
347 if ( fsign < 0 ) {
348 cErr = cufftExecZ2Z(plan, data_in, data_out, CUFFT_INVERSE);
349 if (CHECK) cufft_error_check(cErr, __LINE__);
350 }
351 else {
352 cErr = cufftExecZ2Z(plan, data_in, data_out, CUFFT_FORWARD);
353 if (CHECK) cufft_error_check(cErr, __LINE__);
354 }
355
356 if (scale != 1.0e0) {
357 cuErr = cudaStreamSynchronize(cuda_stream);
358 if (CHECK) pw_cuda_error_check(cuErr, __LINE__);
359 cublasDscal(2 * lmem, scale, (double *) data_out, 1);
360 }
361
362 if (ioverflow) {
363 cuErr = cudaStreamSynchronize(cuda_stream);
364 if (CHECK) pw_cuda_error_check(cuErr, __LINE__);
365 cErr = cufftDestroy(plan);
366 if (CHECK) cufft_error_check(cErr, __LINE__);
367 }
368 }
369
370 /******************************************************************************
371 * \brief Release all stored plans.
372 *
373 * \author Andreas Gloess
374 * \date 2013-04-23
375 * \version 0.01
376 *****************************************************************************/
fftcu_release_()377 extern "C" void fftcu_release_() {
378 int i;
379 cufftHandle plan;
380 cufftResult_t cErr;
381 cudaError_t cuErr;
382
383 for (i = 0; i < n_plans; i++) {
384 plan = saved_plans[i];
385 cuErr = cudaDeviceSynchronize();
386 if (CHECK) pw_cuda_error_check(cuErr, __LINE__);
387 cErr = cufftDestroy(plan);
388 if (CHECK) cufft_error_check(cErr, __LINE__);
389 }
390 n_plans = 0;
391 }
392
393 #endif
394