1 /*****************************************************************************
2  *  CP2K: A general program to perform molecular dynamics simulations        *
3  *  Copyright (C) 2000 - 2019  CP2K developers group                         *
4  *****************************************************************************/
5 
6 /******************************************************************************
7  *  Authors: Benjamin G Levine, Andreas Gloess
8  *
9  *  2012/05/18                 Refacturing - original files:
10  *                              - cuda_tools/cufft.h
11  *                              - cuda_tools/cufft.cu
12  *
13  *****************************************************************************/
14 #if defined ( __PW_CUDA )
15 
16 // global dependencies
17 #include <cuda_runtime.h>
18 #include <cufft.h>
19 #include <cublas.h>
20 #include <stdio.h>
21 #include <sys/types.h>
22 #include <unistd.h>
23 
24 // local dependencies
25 #include "pw_cuda_utils.h"
26 #include "fft_cuda_utils.h"
27 #include "fft_cuda_internal.h"
28 
29 // debug flag
30 #define CHECK 1
31 #define VERBOSE 0
32 
33 // dimensions
34 static int         n_plans = 0;
35 static cufftHandle saved_plans[max_plans];
36 static int         iplandims[max_plans][5];
37 
38 // --- CODE -------------------------------------------------------------------
39 
40 
41 /******************************************************************************
42  * \brief   Sets up and save a double precision complex 3D-FFT plan on the GPU.
43  *          Saved plans are reused if they fit the requirements.
44  * \author  Andreas Gloess
45  * \date    2012-05-18
46  * \version 0.01
47  *****************************************************************************/
fftcu_plan3d_z(cufftHandle & plan,int & ioverflow,const int * n,const cudaStream_t cuda_stream)48 void fftcu_plan3d_z(      cufftHandle  &plan,
49                           int          &ioverflow,
50                     const int          *n,
51                     const cudaStream_t  cuda_stream) {
52 
53   int i;
54   cufftResult_t cErr;
55 
56   ioverflow = 0;
57   for (i = 0; i < n_plans; i++) {
58     if ( iplandims[i][0] == 3 &&
59          iplandims[i][1] == n[0] &&
60          iplandims[i][2] == n[1] &&
61          iplandims[i][3] == n[2] &&
62          iplandims[i][4] == 0 ) {
63       plan = saved_plans[i];
64       return;
65     }
66   }
67 
68   if (VERBOSE) printf("FFT 3D (%d-%d-%d)\n", n[0], n[1], n[2]);
69   cErr = cufftPlan3d(&plan, n[2], n[1], n[0], CUFFT_Z2Z);
70   if (CHECK) cufft_error_check(cErr, __LINE__);
71   cErr = cufftSetStream(plan, cuda_stream);
72   if (CHECK) cufft_error_check(cErr, __LINE__);
73 #if (__CUDACC_VER_MAJOR__<8)
74   cErr = cufftSetCompatibilityMode(plan, FFT_ALIGNMENT);
75   if (CHECK) cufft_error_check(cErr, __LINE__);
76 #endif
77 
78   if ( n_plans < max_3d_plans ) {
79     saved_plans[n_plans] = plan;
80     iplandims[n_plans][0] = 3;
81     iplandims[n_plans][1] = n[0];
82     iplandims[n_plans][2] = n[1];
83     iplandims[n_plans][3] = n[2];
84     iplandims[n_plans][4] = 0;
85     n_plans++;
86     return;
87   }
88   ioverflow=1;
89 }
90 
91 /******************************************************************************
92  * \brief   Sets up and save a double precision complex 2D-FFT plan on the GPU.
93  *          Saved plans are reused if they fit the requirements.
94  * \author  Andreas Gloess
95  * \date    2012-07-16
96  * \version 0.01
97  *****************************************************************************/
fftcu_plan2dm_z(cufftHandle & plan,int & ioverflow,const int * n,const int fsign,const cudaStream_t cuda_stream)98 void fftcu_plan2dm_z(      cufftHandle  &plan,
99                            int          &ioverflow,
100                      const int          *n,
101                      const int           fsign,
102                      const cudaStream_t  cuda_stream) {
103 
104   int i, istride, idist, ostride, odist, batch;
105   int nsize[2], inembed[2], onembed[2];
106   cufftResult_t cErr;
107 
108   ioverflow = 0;
109   for (i = 0; i < n_plans; i++) {
110     if ( iplandims[i][0] == 2 &&
111          iplandims[i][1] == n[0] &&
112          iplandims[i][2] == n[1] &&
113          iplandims[i][3] == n[2] &&
114          iplandims[i][4] == fsign ) {
115       plan = saved_plans[i];
116       return;
117     }
118   }
119 
120   nsize[0] = n[2];
121   nsize[1] = n[1];
122   inembed[0] = n[2];
123   inembed[1] = n[1];
124   onembed[0] = n[2];
125   onembed[1] = n[1];
126   batch = n[0];
127   if ( fsign == +1 ) {
128     istride = n[0];
129     idist = 1;
130     ostride = 1;
131     odist = n[1]*n[2];
132   } else {
133     istride = 1;
134     idist = n[1]*n[2];
135     ostride = n[0];
136     odist = 1;
137   }
138 
139   if (VERBOSE) printf("FFT 2D (%d) (%d-%d-%d) %d %d %d %d\n", fsign, n[0], n[1], n[2], istride, idist, ostride, odist);
140   cErr = cufftPlanMany(&plan, 2, nsize, inembed, istride, idist, onembed, ostride, odist, CUFFT_Z2Z, batch);
141   if (CHECK) cufft_error_check(cErr, __LINE__);
142   cErr = cufftSetStream(plan, cuda_stream);
143   if (CHECK) cufft_error_check(cErr, __LINE__);
144 #if (__CUDACC_VER_MAJOR__<8)
145   cErr = cufftSetCompatibilityMode(plan, FFT_ALIGNMENT);
146   if (CHECK) cufft_error_check(cErr, __LINE__);
147 #endif
148 
149   if ( n_plans < max_2d_plans ) {
150     saved_plans[n_plans] = plan;
151     iplandims[n_plans][0] = 2;
152     iplandims[n_plans][1] = n[0];
153     iplandims[n_plans][2] = n[1];
154     iplandims[n_plans][3] = n[2];
155     iplandims[n_plans][4] = fsign;
156     n_plans++;
157     return;
158   }
159 
160   ioverflow=1;
161 }
162 
163 /******************************************************************************
164  * \brief   Sets up and save a double precision complex 1D-FFT plan on the GPU.
165  *          Saved plans are reused if they fit the requirements.
166  * \author  Andreas Gloess
167  * \date    2012-07-04
168  * \version 0.01
169  *****************************************************************************/
fftcu_plan1dm_z(cufftHandle & plan,int & ioverflow,const int n,const int m,const int fsign,const cudaStream_t cuda_stream)170 void fftcu_plan1dm_z(      cufftHandle  &plan,
171                            int          &ioverflow,
172                      const int           n,
173                      const int           m,
174                      const int           fsign,
175                      const cudaStream_t  cuda_stream) {
176 
177   int i, istride, idist, ostride, odist, batch;
178   int nsize[1], inembed[1], onembed[1];
179   cufftResult_t cErr;
180 
181   ioverflow = 0;
182   for (i = 0; i<n_plans; i++) {
183     if ( iplandims[i][0] == 1 &&
184          iplandims[i][1] == n &&
185          iplandims[i][2] == m &&
186          iplandims[i][3] == 0 &&
187          iplandims[i][4] == fsign ) {
188       plan = saved_plans[i];
189       return;
190     }
191   }
192 
193   nsize[0] = n;
194   inembed[0] = 0; // is ignored, but is not allowed to be NULL pointer (for adv. strided I/O)
195   onembed[0] = 0; // is ignored, but is not allowed to be NULL pointer (for adv. strided I/O)
196   batch = m;
197   if ( fsign == +1 ) {
198     istride = m;
199     idist = 1;
200     ostride = 1;
201     odist = n;
202   } else {
203     istride = 1;
204     idist = n;
205     ostride = m;
206     odist = 1;
207   }
208 
209   if (VERBOSE) printf("FFT 1D (%d) (%d-%d) %d %d %d %d\n", fsign, n, m, istride, idist, ostride, odist);
210   cErr = cufftPlanMany(&plan, 1, nsize, inembed, istride, idist, onembed, ostride, odist, CUFFT_Z2Z, batch);
211   if (CHECK) cufft_error_check(cErr, __LINE__);
212   cErr = cufftSetStream(plan, cuda_stream);
213   if (CHECK) cufft_error_check(cErr, __LINE__);
214 #if (__CUDACC_VER_MAJOR__<8)
215   cErr = cufftSetCompatibilityMode(plan, FFT_ALIGNMENT);
216   if (CHECK) cufft_error_check(cErr, __LINE__);
217 #endif
218 
219   if ( n_plans < max_1d_plans ) {
220     saved_plans[n_plans] = plan;
221     iplandims[n_plans][0] = 1;
222     iplandims[n_plans][1] = n;
223     iplandims[n_plans][2] = m;
224     iplandims[n_plans][3] = 0;
225     iplandims[n_plans][4] = fsign;
226     n_plans++;
227     return;
228   }
229 
230   ioverflow=1;
231 }
232 
233 /******************************************************************************
234  * \brief   Performs a scaled double precision complex 3D-FFT on the GPU.
235  *          Input/output is a DEVICE pointer (data).
236  * \author  Andreas Gloess
237  * \date    2012-05-18
238  * \version 0.01
239  *****************************************************************************/
fftcu_run_3d_z_(const int fsign,const int * n,const double scale,cufftDoubleComplex * data,const cudaStream_t cuda_stream)240 extern "C" void fftcu_run_3d_z_(const int                 fsign,
241                                 const int                *n,
242                                 const double              scale,
243                                       cufftDoubleComplex *data,
244                                 const cudaStream_t        cuda_stream) {
245 
246   int ioverflow, lmem;
247   cufftHandle   plan;
248   cufftResult_t cErr;
249   cudaError_t  cuErr;
250 
251   lmem = n[0] * n[1] * n[2];
252 
253   fftcu_plan3d_z(plan, ioverflow, n, cuda_stream);
254   if ( fsign < 0  ) {
255     cErr = cufftExecZ2Z(plan, data, data, CUFFT_INVERSE);
256     if (CHECK) cufft_error_check(cErr, __LINE__);
257   }
258   else {
259     cErr = cufftExecZ2Z(plan, data, data, CUFFT_FORWARD);
260     if (CHECK) cufft_error_check(cErr, __LINE__);
261   }
262 
263   if (scale != 1.0e0) {
264     cuErr = cudaStreamSynchronize(cuda_stream);
265     if (CHECK) pw_cuda_error_check(cuErr, __LINE__);
266     cublasDscal(2*lmem, scale, (double *) data, 1);
267   }
268 
269   if (ioverflow) {
270     cuErr = cudaStreamSynchronize(cuda_stream);
271     if (CHECK) pw_cuda_error_check(cuErr, __LINE__);
272     cErr = cufftDestroy(plan);
273     if (CHECK) cufft_error_check(cErr, __LINE__);
274   }
275 }
276 
277 /******************************************************************************
278  * \brief   Performs a scaled double precision complex 2D-FFT many times on
279  *          the GPU.
280  *          Input/output are DEVICE pointers (data_in, date_out).
281  * \author  Andreas Gloess
282  * \date    2012-07-16
283  * \version 0.01
284  *****************************************************************************/
fftcu_run_2dm_z_(const int fsign,const int * n,const double scale,cufftDoubleComplex * data_in,cufftDoubleComplex * data_out,const cudaStream_t cuda_stream)285 extern "C" void fftcu_run_2dm_z_(const int                 fsign,
286                                  const int                *n,
287                                  const double              scale,
288                                        cufftDoubleComplex *data_in,
289                                        cufftDoubleComplex *data_out,
290                                  const cudaStream_t        cuda_stream) {
291 
292   int ioverflow, lmem;
293   cufftHandle   plan;
294   cufftResult_t cErr;
295   cudaError_t  cuErr;
296 
297   lmem = n[0] * n[1] * n[2];
298 
299   fftcu_plan2dm_z(plan, ioverflow, n, fsign, cuda_stream);
300   if ( fsign < 0 ) {
301     cErr = cufftExecZ2Z(plan, data_in, data_out, CUFFT_INVERSE);
302     if (CHECK) cufft_error_check(cErr, __LINE__);
303   }
304   else {
305     cErr = cufftExecZ2Z(plan, data_in, data_out, CUFFT_FORWARD);
306     if (CHECK) cufft_error_check(cErr, __LINE__);
307   }
308 
309   if (scale != 1.0e0) {
310     cuErr = cudaStreamSynchronize(cuda_stream);
311     if (CHECK) pw_cuda_error_check(cuErr, __LINE__);
312     cublasDscal(2 * lmem, scale, (double *) data_out, 1);
313   }
314 
315   if (ioverflow) {
316     cuErr = cudaStreamSynchronize(cuda_stream);
317     if (CHECK) pw_cuda_error_check(cuErr, __LINE__);
318     cErr = cufftDestroy(plan);
319     if (CHECK) cufft_error_check(cErr, __LINE__);
320   }
321 }
322 
323 /******************************************************************************
324  * \brief   Performs a scaled double precision complex 1D-FFT many times on
325  *          the GPU.
326  *          Input/output are DEVICE pointers (data_in, date_out).
327  * \author  Andreas Gloess
328  * \date    2012-05-18
329  * \version 0.01
330  *****************************************************************************/
fftcu_run_1dm_z_(const int fsign,const int n,const int m,const double scale,cufftDoubleComplex * data_in,cufftDoubleComplex * data_out,const cudaStream_t cuda_stream)331 extern "C" void fftcu_run_1dm_z_(const int                 fsign,
332                                  const int                 n,
333                                  const int                 m,
334                                  const double              scale,
335                                        cufftDoubleComplex *data_in,
336                                        cufftDoubleComplex *data_out,
337                                  const cudaStream_t        cuda_stream) {
338 
339   int ioverflow, lmem;
340   cufftHandle   plan;
341   cufftResult_t cErr;
342   cudaError_t  cuErr;
343 
344   lmem = n * m;
345 
346   fftcu_plan1dm_z(plan, ioverflow, n, m, fsign, cuda_stream);
347   if ( fsign < 0 ) {
348     cErr = cufftExecZ2Z(plan, data_in, data_out, CUFFT_INVERSE);
349     if (CHECK) cufft_error_check(cErr, __LINE__);
350   }
351   else {
352     cErr = cufftExecZ2Z(plan, data_in, data_out, CUFFT_FORWARD);
353     if (CHECK) cufft_error_check(cErr, __LINE__);
354   }
355 
356   if (scale != 1.0e0) {
357     cuErr = cudaStreamSynchronize(cuda_stream);
358     if (CHECK) pw_cuda_error_check(cuErr, __LINE__);
359     cublasDscal(2 * lmem, scale, (double *) data_out, 1);
360   }
361 
362   if (ioverflow) {
363     cuErr = cudaStreamSynchronize(cuda_stream);
364     if (CHECK) pw_cuda_error_check(cuErr, __LINE__);
365     cErr = cufftDestroy(plan);
366     if (CHECK) cufft_error_check(cErr, __LINE__);
367   }
368 }
369 
370 /******************************************************************************
371  * \brief   Release all stored plans.
372  *
373  * \author  Andreas Gloess
374  * \date    2013-04-23
375  * \version 0.01
376  *****************************************************************************/
fftcu_release_()377 extern "C" void fftcu_release_() {
378   int           i;
379   cufftHandle   plan;
380   cufftResult_t cErr;
381   cudaError_t   cuErr;
382 
383   for (i = 0; i < n_plans; i++) {
384     plan = saved_plans[i];
385     cuErr = cudaDeviceSynchronize();
386     if (CHECK) pw_cuda_error_check(cuErr, __LINE__);
387     cErr = cufftDestroy(plan);
388     if (CHECK) cufft_error_check(cErr, __LINE__);
389   }
390   n_plans = 0;
391 }
392 
393 #endif
394