1!
2! Copyright (C) Quantum ESPRESSO group
3!
4! This file is distributed under the terms of the
5! GNU General Public License. See the file `License'
6! in the root directory of the present distribution,
7! or http://www.gnu.org/copyleft/gpl.txt .
8!
9!----------------------------------------------------------------------
10! FFT base Module.
11! Written by Carlo Cavazzoni, modified by Paolo Giannozzi
12! Rewritten by Stefano de Gironcoli, ported to GPU by Pietro Bonfa'
13!----------------------------------------------------------------------
14
15#if defined(__CUDA)
16
17#define __NON_BLOCKING_SCATTER
18
19!=----------------------------------------------------------------------=!
20   MODULE fft_scatter_gpu
21!=----------------------------------------------------------------------=!
22#if defined(__CUDA)
23        USE fft_types, ONLY: fft_type_descriptor
24        USE fft_param
25
26        IMPLICIT NONE
27        SAVE
28
29        PRIVATE
30        !
31        PUBLIC :: fft_type_descriptor
32        PUBLIC :: fft_scatter_xy_gpu, fft_scatter_yz_gpu, fft_scatter_tg_gpu, &
33                  fft_scatter_tg_opt_gpu, fft_scatter_many_yz_gpu
34
35!=----------------------------------------------------------------------=!
36      CONTAINS
37!=----------------------------------------------------------------------=!
38!
39!
40!-----------------------------------------------------------------------
41SUBROUTINE fft_scatter_xy_gpu ( desc, f_in_d, f_aux_d, nxx_, isgn, stream )
42  !-----------------------------------------------------------------------
43  !
44  ! transpose of the fft xy planes across the desc%comm2 communicator
45  !
46  ! a) From Y-oriented columns to X-oriented partial slices (isgn > 0)
47  !    Active columns along the Y direction corresponding to a subset of the
48  !    active X values and a range of Z values (in this order) are stored
49  !    consecutively for each processor and are such that the subgroup owns
50  !    all data for a range of Z values.
51  !
52  !    The Y pencil -> X-oriented partial slices transposition is performed
53  !    in the subgroup of processors (desc%comm2) owning this range of Z values.
54  !
55  !    The transpose takes place in two steps:
56  !    1) on each processor the columns are sliced into sections along Y
57  !       that are stored one after the other. On each processor, slices for
58  !       processor "iproc2" are desc%nr2p(iproc2)*desc%nr1p(me2)*desc%my_nr3p big.
59  !    2) all processors communicate to exchange slices (all sectin of columns with
60  !       Y in the slice belonging to "me" must be received, all the others
61  !       must be sent to "iproc2")
62  !
63  !    Finally one gets the "partial slice" representation: each processor has
64  !    all the X values of desc%my_nr2p Y and desc%my_nr3p Z values.
65  !    Data are organized with the X index running fastest, then Y, then Z.
66  !
67  !    f_in  contains the input Y columns, is destroyed on output
68  !    f_aux contains the output X-oriented partial slices.
69  !
70  !  b) From planes to columns (isgn < 0)
71  !
72  !    Quite the same in the opposite direction
73  !    f_aux contains the input X-oriented partial slices, is destroyed on output
74  !    f_in  contains the output Y columns.
75  !
76  USE cudafor
77  !USE nvtx_fft
78  USE fft_buffers, ONLY : check_buffers_size, f_in => pin_space_scatter_in, &
79                          f_aux=>pin_space_scatter_out
80  IMPLICIT NONE
81
82  TYPE (fft_type_descriptor), INTENT(in) :: desc
83  INTEGER, INTENT(in)                    :: nxx_, isgn
84  COMPLEX (DP), DEVICE, INTENT(inout)    :: f_in_d (nxx_), f_aux_d (nxx_)
85  INTEGER(kind=cuda_stream_kind), INTENT(IN) :: stream ! cuda stream for the execution
86
87#if defined(__MPI)
88  !
89  INTEGER :: ierr, me2, nproc2, iproc2, ncpx, nr1x, my_nr2p, nr2px, ip, ip0
90  INTEGER :: i, it, j, k, kfrom, kdest, offset, ioff, mc, m1, m3, i1, icompact, sendsize, aux
91  INTEGER, ALLOCATABLE :: ncp_(:)
92  INTEGER, POINTER, DEVICE :: nr1p__d(:), indx_d(:,:)
93  !
94#if defined(__NON_BLOCKING_SCATTER)
95  INTEGER :: sh(desc%nproc2), rh(desc%nproc2)
96#endif
97  !
98  !CALL nvtxStartRangeAsync("fft_scatter_xy_gpu", isgn + 5)
99  !
100  me2    = desc%mype2 + 1
101  nproc2 = desc%nproc2 ; if ( abs(isgn) == 3 ) nproc2 = 1
102
103  ! This mapping improves readability but, most importantly, it is needed
104  ! in the cuf kernels (as of PGI 18.10) since otherwise, when variables from
105  ! `desc` appear in the body of the do loops, the compiler generates incorrect GPU code.
106  nr1x   = desc%nr1x
107
108  ! allocate auxiliary array for columns distribution
109  ALLOCATE ( ncp_(nproc2))
110  if ( abs (isgn) == 1 ) then          ! It's a potential FFT
111     ncp_ = desc%nr1p * desc%my_nr3p
112     nr1p__d=> desc%nr1p_d
113     indx_d => desc%indp_d
114     my_nr2p=desc%my_nr2p
115  else if ( abs (isgn) == 2 ) then     ! It's a wavefunction FFT
116     ncp_ = desc%nr1w * desc%my_nr3p
117     nr1p__d=> desc%nr1w_d
118     indx_d => desc%indw_d
119     my_nr2p=desc%my_nr2p
120  else if ( abs (isgn) == 3 ) then     ! It's a wavefunction FFT with task group
121     ncp_ = desc%nr1w_tg * desc%my_nr3p!
122     nr1p__d=> desc%nr1w_tg_d               !
123     indx_d => desc%indw_tg_d         !
124     my_nr2p=desc%nr2x                 ! in task group FFTs whole Y colums are distributed
125  end if
126  !
127  CALL start_clock ('fft_scatt_xy')
128  !
129  ! calculate the message size
130  !
131  if ( abs (isgn) == 3 ) then
132     nr2px = desc%nr2x ! if it's a task group FFT whole planes are distributed
133  else
134     nr2px = MAXVAL ( desc%nr2p )  ! maximum number of Y values to be disributed
135  end if
136
137  ncpx  = MAXVAL ( ncp_ )       ! maximum number of Y columns to be disributed
138
139  sendsize = ncpx * nr2px       ! dimension of the scattered chunks (safe value)
140  !
141  ! check host copy allocation of f_in and f_aux
142  CALL check_buffers_size(desc)
143  !
144  ierr = 0
145  IF (isgn.gt.0) THEN
146
147     IF (nproc2==1) GO TO 10
148     !
149     ! "forward" scatter from columns to planes
150     !
151     ! step one: store contiguously the slices
152     !
153     offset = 0
154     DO iproc2 = 1, nproc2
155        kdest = ( iproc2 - 1 ) * sendsize
156        kfrom = offset
157        !DO k = 1, ncp_(me2)
158        !  !DO i = 1, desc%nr2p( iproc2 )
159        !      !f_aux ( kdest + i ) =  f_in ( kfrom + i )
160        !  !ENDDO
161        !
162        !  !kdest = kdest + nr2px
163        !  !kfrom = kfrom + desc%nr2x
164        !ENDDO
165
166        ierr = cudaMemcpy2DAsync( f_aux(kdest + 1), nr2px, f_in_d(kfrom + 1 ), desc%nr2x, desc%nr2p( iproc2 ), ncp_(me2), cudaMemcpyDeviceToHost, stream )
167        offset = offset + desc%nr2p( iproc2 )
168     ENDDO
169     !
170     ! step two: communication  across the    nproc3    group
171     !
172#if defined(__NON_BLOCKING_SCATTER)
173     ierr = cudaStreamSynchronize(stream)
174     DO iproc2 = 1, nproc2
175        kdest = ( iproc2 - 1 ) * sendsize
176        CALL mpi_irecv( f_in( kdest + 1 ), sendsize, &
177                           MPI_DOUBLE_COMPLEX, iproc2-1, MPI_ANY_TAG, &
178                           desc%comm2, rh( iproc2 ), ierr )
179        CALL mpi_isend( f_aux( kdest + 1 ), sendsize, &
180                        MPI_DOUBLE_COMPLEX, iproc2-1, me2, desc%comm2, &
181                        sh( iproc2 ), ierr )
182     ENDDO
183#else
184     ierr = cudaStreamSynchronize(stream)
185     CALL mpi_alltoall (f_aux(1), sendsize, MPI_DOUBLE_COMPLEX, f_in(1), &
186                        sendsize, MPI_DOUBLE_COMPLEX, desc%comm2, ierr)
187     !
188     IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
189
190     !f_in_d(1:nxx_) = f_in(1:nxx_)
191     ierr = cudaMemcpyAsync( f_in_d, f_in, nxx_, cudaMemcpyHostToDevice, stream )
192
193#endif
194     !
19510   CONTINUE
196     !
197     !f_aux = (0.0_DP, 0.0_DP)
198     !$cuf kernel do (1) <<<*,*,0,stream>>>
199     DO i = 1, nxx_
200       f_aux_d(i) = (0.d0, 0.d0)
201     END DO
202     !
203     DO iproc2 = 1, nproc2
204#if defined(__NON_BLOCKING_SCATTER)
205        IF (nproc2 > 1) THEN
206           kdest = (iproc2-1)*sendsize
207           call mpi_wait( rh(iproc2), MPI_STATUSES_IGNORE, ierr )
208           call mpi_wait( sh(iproc2), MPI_STATUSES_IGNORE, ierr )
209           ierr = cudaMemcpyAsync( f_in_d(kdest + 1), f_in(kdest + 1 ), sendsize, cudaMemcpyHostToDevice, stream )
210        END IF
211#endif
212        aux = ncp_( iproc2 )
213        !$cuf kernel do(2) <<<*,*,0,stream>>>
214        DO i = 1, aux
215           DO j = 1, my_nr2p
216              it = ( iproc2 - 1 ) * sendsize + (i-1)*nr2px
217              m3 = (i-1)/nr1p__d(iproc2)+1
218              i1 = mod(i-1,nr1p__d(iproc2))+1
219              m1 = indx_d(i1,iproc2)
220              icompact = m1 + (m3-1)*nr1x*my_nr2p + (j-1)*nr1x
221              ! old do loop started here
222              !f_aux( m1 + (j-1)*nr1x + (m3-1)*nr1x*my_nr2p ) = f_in( j + it )
223              f_aux_d( icompact ) = f_in_d( j + it )
224              !icompact = icompact + nr1x
225           ENDDO
226           !it = it + nr2px
227        ENDDO
228     ENDDO
229
230  ELSE
231     !
232     !  "backward" scatter from planes to columns
233     !
234     DO iproc2 = 1, nproc2
235        aux = ncp_( iproc2 )
236        !$cuf kernel do (2) <<<*,*,0,stream>>>
237        DO i = 1, aux
238           DO j = 1, my_nr2p
239              it = ( iproc2 - 1 ) * sendsize + (i-1)*nr2px
240              m3 = (i-1)/nr1p__d(iproc2)+1 ; i1  = mod(i-1,nr1p__d(iproc2))+1 ;  m1 = indx_d(i1,iproc2)
241              icompact = m1 + (m3-1)*nr1x*my_nr2p + (j-1)*nr1x
242           !DO j = 1, my_nr2p
243              !f_in( j + it ) = f_aux( m1 + (j-1)*nr1x + (m3-1)*nr1x*my_nr2p )
244              f_in_d( j + it ) = f_aux_d( icompact )
245           !   icompact = icompact + nr1x
246           ENDDO
247           !it = it + nr2px
248        ENDDO
249        !
250#if defined(__NON_BLOCKING_SCATTER)
251        IF( nproc2 > 1 ) THEN
252          kdest = ( iproc2 - 1 ) * sendsize
253          ierr = cudaMemcpyAsync( f_in(kdest + 1), f_in_d(kdest + 1 ), sendsize, cudaMemcpyDeviceToHost, stream )
254        END IF
255#endif
256     ENDDO
257     IF (nproc2==1) GO TO 20
258     !
259     !  step two: communication
260     !
261
262#if defined(__NON_BLOCKING_SCATTER)
263     DO iproc2 = 1, nproc2
264        kdest = (iproc2-1)*sendsize
265        CALL mpi_irecv( f_aux( ( iproc2 - 1 ) * sendsize + 1 ), sendsize, &
266                        MPI_DOUBLE_COMPLEX, iproc2-1, MPI_ANY_TAG, &
267                        desc%comm2, rh(iproc2), ierr )
268        ierr = cudaStreamSynchronize(stream)
269        CALL mpi_isend( f_in( kdest + 1 ), &
270                            sendsize, MPI_DOUBLE_COMPLEX, iproc2-1, me2, &
271                            desc%comm2, sh( iproc2 ), ierr )
272        ! IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' backward receive info<>0', abs(ierr) )
273     ENDDO
274#else
275     !f_in(1:nxx_) = f_in_d(1:nxx_)
276     ierr = cudaMemcpy( f_in, f_in_d, nxx_, cudaMemcpyDeviceToHost)
277     CALL mpi_alltoall (f_in(1), sendsize, MPI_DOUBLE_COMPLEX, f_aux(1), &
278                        sendsize, MPI_DOUBLE_COMPLEX, desc%comm2, ierr)
279
280     IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
281#endif
282     !
283     !  step one: store contiguously the columns
284     !
285     ! ensures that no garbage is present in the output
286     ! not useless ... clean the array to be returned from the garbage of previous A2A step
287     !f_in = (0.0_DP, 0.0_DP) !
288     !$cuf kernel do (1) <<<*,*,0,stream>>>
289     do i = 1, nxx_
290       f_in_d(i) = (0.d0, 0.d0)
291     end do
292     !
293     offset = 0
294     DO iproc2 = 1, nproc2
295        kdest = ( iproc2 - 1 ) * sendsize
296        kfrom = offset
297
298#if defined(__NON_BLOCKING_SCATTER)
299        call mpi_wait( rh(iproc2), MPI_STATUSES_IGNORE, ierr )
300        call mpi_wait( sh(iproc2), MPI_STATUSES_IGNORE, ierr )
301#endif
302        !DO k = 1, ncp_(me2)
303        !   DO i = 1, desc%nr2p( iproc2 )
304        !      f_in ( kfrom + i ) = f_aux ( kdest + i )
305        !   ENDDO
306        !   kdest = kdest + nr2px
307        !   kfrom = kfrom + desc%nr2x
308        !ENDDO
309        ierr = cudaMemcpy2DAsync( f_in_d(kfrom +1 ), desc%nr2x, f_aux(kdest + 1), nr2px, desc%nr2p( iproc2 ), ncp_(me2), cudaMemcpyHostToDevice, stream )
310        offset = offset + desc%nr2p( iproc2 )
311     ENDDO
312
31320   CONTINUE
314
315
316  ENDIF
317  !
318  !CALL nvtxEndRangeAsync()
319  DEALLOCATE ( ncp_ )
320  CALL stop_clock ('fft_scatt_xy')
321
322#endif
323
324  RETURN
32599 format ( 20 ('(',2f12.9,')') )
326
327END SUBROUTINE fft_scatter_xy_gpu
328!
329!-----------------------------------------------------------------------
330SUBROUTINE fft_scatter_yz_gpu ( desc, f_in_d, f_aux_d, nxx_, isgn )
331  !-----------------------------------------------------------------------
332  !
333  ! transpose of the fft yz planes across the desc%comm3 communicator
334  !
335  ! a) From Z-oriented columns to Y-oriented colums (isgn > 0)
336  !    Active columns (or sticks or pencils) along the Z direction for each
337  !    processor are stored consecutively and are such that they correspond
338  !    to a subset of the active X values.
339  !
340  !    The pencil -> slices transposition is performed in the subgroup
341  !    of processors (desc%comm3) owning these X values.
342  !
343  !    The transpose takes place in two steps:
344  !    1) on each processor the columns are sliced into sections along Z
345  !       that are stored one after the other. On each processor, slices for
346  !       processor "iproc3" are desc%nr3p(iproc3)*desc%nsw/nsp(me) big.
347  !    2) all processors communicate to exchange slices (all columns with
348  !       Z in the slice belonging to "me" must be received, all the others
349  !       must be sent to "iproc3")
350  !
351  !    Finally one gets the "slice" representation: each processor has
352  !    desc%nr3p(mype3) Z values of all the active pencils along Y for the
353  !    X values of the current group. Data are organized with the Y index
354  !    running fastest, then the reordered X values, then Z.
355  !
356  !    f_in  contains the input Z columns, is destroyed on output
357  !    f_aux contains the output Y colums.
358  !
359  !  b) From planes to columns (isgn < 0)
360  !
361  !    Quite the same in the opposite direction
362  !    f_aux contains the input Y columns, is destroyed on output
363  !    f_in  contains the output Z columns.
364  !
365  USE cudafor
366  !USE nvtx_fft
367  USE fft_buffers, ONLY : check_buffers_size, f_in => pin_space_scatter_in, &
368                          f_aux=>pin_space_scatter_out
369  IMPLICIT NONE
370
371  TYPE (fft_type_descriptor), INTENT(in) :: desc
372  INTEGER, INTENT(in)                    :: nxx_, isgn
373  COMPLEX (DP), DEVICE, INTENT(inout)    :: f_in_d (nxx_), f_aux_d (nxx_)
374!  INTEGER(kind=cuda_stream_kind), INTENT(IN) :: stream ! cuda stream for the execution
375
376#if defined(__MPI)
377  INTEGER, DEVICE, POINTER :: desc_ismap_d(:)
378  !
379  INTEGER :: ierr, me, me2, me2_start, me2_end, me3, nproc3, iproc3, ncpx, nr3px, ip, ip0
380  INTEGER :: nr1x, nr2x, nr3, nr3x
381  INTEGER :: i, it, it0, k, kfrom, kdest, offset, ioff, mc, m1, m2, i1,  sendsize, aux
382  INTEGER, ALLOCATABLE :: ncp_(:)
383  INTEGER, DEVICE, POINTER :: ir1p__d(:)
384  INTEGER :: my_nr1p_
385  !
386#if defined(__NON_BLOCKING_SCATTER)
387  INTEGER :: sh(desc%nproc3), rh(desc%nproc3)
388#endif
389  TYPE(cudaEvent) :: zero_event
390  !CALL nvtxStartRangeAsync("fft_scatter_yz_gpu", isgn + 5)
391  ierr = cudaEventCreate( zero_event )
392  !
393  me     = desc%mype  + 1
394  me2    = desc%mype2 + 1
395  me3    = desc%mype3 + 1
396  nproc3 = desc%nproc3
397
398  ! This mapping improves readability but, most importantly, it is needed
399  ! in the cuf kernels (as of PGI 18.10) since otherwise, when variables from
400  ! `desc` appear in the body of the do loops, the compiler generates incorrect GPU code.
401  nr1x   = desc%nr1x
402  nr2x   = desc%nr2x
403  nr3    = desc%nr3
404  nr3x   = desc%nr3x
405
406  ! allocate auxiliary array for columns distribution
407  ALLOCATE ( ncp_( desc%nproc) )
408
409  me2_start = me2 ; me2_end = me2
410  if ( abs (isgn) == 1 ) then      ! It's a potential FFT
411     ncp_ = desc%nsp
412     my_nr1p_ = count (desc%ir1p   > 0)
413     ir1p__d => desc%ir1p_d
414  else if ( abs (isgn) == 2 ) then ! It's a wavefunction FFT
415     ncp_ = desc%nsw
416     my_nr1p_ = count (desc%ir1w   > 0)
417     ir1p__d => desc%ir1w_d
418  else if ( abs (isgn) == 3 ) then ! It's a wavefunction FFT with task group
419     ncp_ = desc%nsw
420     my_nr1p_ = count (desc%ir1w_tg > 0)
421     ir1p__d => desc%ir1w_tg_d
422     me2_start = 1 ; me2_end = desc%nproc2
423  end if
424  !
425  CALL start_clock ('fft_scatt_yz')
426  !
427  ! calculate the message size
428  !
429  nr3px = MAXVAL ( desc%nr3p )  ! maximum number of Z values to be exchanged
430  ncpx  = MAXVAL ( ncp_ )       ! maximum number of Z columns to be exchanged
431  if (abs(isgn)==3) then
432     ncpx = ncpx * desc%nproc2  ! if it's a task group FFT groups of columns are exchanged
433  end if
434
435  sendsize = ncpx * nr3px       ! dimension of the scattered chunks
436
437  ! check host copy allocation of f_in and f_aux
438  CALL check_buffers_size(desc)
439  desc_ismap_d => desc%ismap_d
440
441  ierr = 0
442  IF (isgn.gt.0) THEN
443
444     IF (nproc3==1) GO TO 10
445     !
446     ! "forward" scatter from columns to planes
447     !
448     ! step one: store contiguously the slices
449     !
450     offset = 0
451     DO iproc3 = 1, nproc3
452        kdest = ( iproc3 - 1 ) * sendsize
453        kfrom = offset
454        DO me2 = me2_start, me2_end
455           ip = desc%iproc(me2,me3)
456        !   DO k = 1, ncp_ (ip)   ! was ncp_(me3)
457        !      DO i = 1, desc%nr3p( iproc3 )
458        !         f_aux ( kdest + i ) =  f_in ( kfrom + i )
459        !      ENDDO
460        !      kdest = kdest + nr3px
461        !      kfrom = kfrom + desc%nr3x
462        !   ENDDO
463
464           ierr = cudaMemcpy2DAsync( f_aux(kdest + 1), nr3px, f_in_d(kfrom + 1 ), nr3x, desc%nr3p( iproc3 ), ncp_(ip), cudaMemcpyDeviceToHost, desc%stream_scatter_yz(iproc3) )
465           kdest = kdest + nr3px*ncp_ (ip)
466           kfrom = kfrom + nr3x*ncp_ (ip)
467        ENDDO
468        offset = offset + desc%nr3p( iproc3 )
469     ENDDO
470     !
471     ! ensures that no garbage is present in the output
472     ! useless; the later accessed elements are overwritten by the A2A step
473     !
474     ! step two: communication  across the    nproc3    group
475     !
476#if defined(__NON_BLOCKING_SCATTER)
477     DO iproc3 = 1, nproc3
478        ierr = cudaStreamSynchronize(desc%stream_scatter_yz(iproc3))
479        CALL mpi_irecv( f_in(  ( iproc3 - 1 ) * sendsize + 1 ), sendsize, &
480                        MPI_DOUBLE_COMPLEX, iproc3-1, MPI_ANY_TAG, &
481                        desc%comm3, rh( iproc3 ), ierr )
482        CALL mpi_isend( f_aux( ( iproc3 - 1 ) * sendsize + 1 ), sendsize, &
483                        MPI_DOUBLE_COMPLEX, iproc3-1, me3, desc%comm3, &
484                        sh( iproc3 ), ierr )
485     ENDDO
486     !
487#else
488     ierr = cudaDeviceSynchronize()
489     CALL mpi_alltoall (f_aux(1), sendsize, MPI_DOUBLE_COMPLEX, f_in(1), &
490                        sendsize, MPI_DOUBLE_COMPLEX, desc%comm3, ierr)
491
492     IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
493     !
494     !f_in_d(1:nxx_) = f_in(1:nxx_)
495     ierr = cudaMemcpy( f_in_d, f_in, nxx_, cudaMemcpyHostToDevice )
496#endif
497     !
49810   CONTINUE
499     !
500     !f_aux = (0.0_DP, 0.0_DP) !
501!$cuf kernel do (1) <<<*,*,0,desc%stream_scatter_yz(1)>>>
502     DO i = 1, desc%my_nr3p*my_nr1p_*nr2x
503       f_aux_d(i) = (0.d0, 0.d0)
504     END DO
505     ierr = cudaEventRecord ( zero_event, desc%stream_scatter_yz(1) )
506     !
507     DO iproc3 = 1, desc%nproc3
508        it0 = ( iproc3 - 1 ) * sendsize
509#if defined(__NON_BLOCKING_SCATTER)
510        IF (nproc3 > 1) THEN
511           CALL mpi_wait( rh(iproc3), MPI_STATUSES_IGNORE, ierr )
512           CALL mpi_wait( sh(iproc3), MPI_STATUSES_IGNORE, ierr )
513           ierr = cudaMemcpyAsync( f_in_d(it0+1), f_in(it0+1), sendsize, cudaMemcpyHostToDevice, desc%stream_scatter_yz(iproc3)  )
514        END IF
515#endif
516        IF (iproc3 == 2) ierr = cudaEventSynchronize( zero_event )
517        DO me2 = me2_start, me2_end
518           ip = desc%iproc( me2, iproc3)
519           ioff = desc%iss(ip)
520           aux = ncp_( ip )
521!$cuf kernel do(2) <<<*,*,0,desc%stream_scatter_yz(iproc3)>>>
522           DO i = 1, aux ! was ncp_(iproc3)
523              DO k = 1, desc%my_nr3p
524                 it = it0 + (i-1)*nr3px
525                 mc = desc_ismap_d( i + ioff ) ! this is  m1+(m2-1)*nr1x  of the  current pencil
526                 m1 = mod (mc-1,nr1x) + 1 ; m2 = (mc-1)/nr1x + 1
527                 i1 = m2 + ( ir1p__d(m1) - 1 ) * nr2x + (k-1)*nr2x*my_nr1p_
528
529                 f_aux_d( i1 ) = f_in_d( k + it )
530                 !i1 = i1 + desc%nr2x*my_nr1p_
531              ENDDO
532           ENDDO
533           it0 = it0 + ncp_( ip )*nr3px
534        ENDDO
535     ENDDO
536
537  ELSE
538     !
539     !  "backward" scatter from planes to columns
540     !
541     DO iproc3 = 1, nproc3
542        it0 = ( iproc3 - 1 ) * sendsize
543        DO me2 = me2_start, me2_end
544           ip = desc%iproc(me2, iproc3)
545           ioff = desc%iss(ip)
546           aux = ncp_( ip )
547!$cuf kernel do(2) <<<*,*,0,desc%stream_scatter_yz(iproc3)>>>
548           DO i = 1, aux
549              DO k = 1, desc%my_nr3p
550                 it = it0 + (i-1)*nr3px
551                 mc = desc_ismap_d( i + ioff ) ! this is  m1+(m2-1)*nr1x  of the  current pencil
552                 m1 = mod (mc-1,nr1x) + 1 ; m2 = (mc-1)/nr1x + 1
553                 i1 = m2 + ( ir1p__d(m1) - 1 ) * nr2x + (k-1)*(nr2x * my_nr1p_)
554
555                 f_in_d( k + it ) = f_aux_d( i1 )
556                 !i1 = i1 + desc%nr2x * my_nr1p_
557              ENDDO
558              !it = it + nr3px
559           ENDDO
560           it0 = it0 + ncp_( ip )*nr3px
561        ENDDO
562#if defined(__NON_BLOCKING_SCATTER)
563        IF( nproc3 > 1 ) THEN
564           kdest = ( iproc3 - 1 ) * sendsize
565           ierr = cudaMemcpyAsync( f_in(kdest + 1), f_in_d(kdest + 1 ), sendsize, cudaMemcpyDeviceToHost, desc%stream_scatter_yz(iproc3) )
566        END IF
567#endif
568     ENDDO
569
570     IF( nproc3 == 1 ) GO TO 20
571     !
572     !
573     !  step two: communication
574     !
575#if defined(__NON_BLOCKING_SCATTER)
576     DO iproc3 = 1, nproc3
577           CALL mpi_irecv( f_aux( (iproc3 - 1) * sendsize + 1 ), sendsize, &
578                           MPI_DOUBLE_COMPLEX, iproc3-1, MPI_ANY_TAG, &
579                           desc%comm3, rh(iproc3), ierr )
580           ierr = cudaStreamSynchronize(desc%stream_scatter_yz(iproc3))
581           CALL mpi_isend( f_in( ( iproc3 - 1 ) * sendsize + 1 ), &
582                                        sendsize, MPI_DOUBLE_COMPLEX, iproc3-1, &
583                                        me3, desc%comm3, sh( iproc3 ), ierr )
584     END DO
585#else
586     !f_in(1:nxx_) = f_in_d(1:nxx_)
587     !ierr = cudaDeviceSynchronize()
588     ierr = cudaMemcpy( f_in, f_in_d, nxx_, cudaMemcpyDeviceToHost )
589
590     CALL mpi_alltoall (f_in(1), sendsize, MPI_DOUBLE_COMPLEX, f_aux(1), &
591                        sendsize, MPI_DOUBLE_COMPLEX, desc%comm3, ierr)
592
593     IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
594
595
596#endif
597     !
598     !  step one: store contiguously the columns
599     !
600     offset = 0
601     DO iproc3 = 1, nproc3
602        kdest = ( iproc3 - 1 ) * sendsize
603#if defined(__NON_BLOCKING_SCATTER)
604        IF( nproc3 > 1 ) THEN
605           call mpi_wait( rh(iproc3), MPI_STATUSES_IGNORE, ierr )
606           call mpi_wait( sh(iproc3), MPI_STATUSES_IGNORE, ierr )
607        END IF
608#endif
609        kfrom = offset
610        DO me2 = me2_start, me2_end
611           ip = desc%iproc(me2,me3)
612        !   DO k = 1, ncp_ (ip)
613        !      DO i = 1, desc%nr3p( iproc3 )
614        !         f_in ( kfrom + i ) = f_aux ( kdest + i )
615        !      ENDDO
616        !      kdest = kdest + nr3px
617        !      kfrom = kfrom + desc%nr3x
618        !   ENDDO
619          ierr = cudaMemcpy2DAsync( f_in_d(kfrom +1 ), nr3x, f_aux(kdest + 1), nr3px, desc%nr3p( iproc3 ), ncp_ (ip), cudaMemcpyHostToDevice, desc%stream_scatter_yz(iproc3) )
620          kdest = kdest + nr3px*ncp_ (ip)
621          kfrom = kfrom + nr3x*ncp_ (ip)
622        ENDDO
623        offset = offset + desc%nr3p( iproc3 )
624     ENDDO
625     !
626     DO iproc3 = 1, nproc3
627        ierr = cudaStreamSynchronize(desc%stream_scatter_yz(iproc3))
628     END DO
629
630
631     ! clean extra array elements in each stick
632
633     IF( nr3x /= nr3 ) THEN
634       aux = ncp_ ( desc%mype+1 )
635!$cuf kernel do(2) <<<*,*>>>
636       DO k = 1, aux
637          DO i = nr3, nr3x
638             f_in_d( (k-1)*nr3x + i ) = (0.d0, 0.d0)
639          END DO
640       END DO
641     END IF
642
643
64420   CONTINUE
645
646  ENDIF
647
648  DEALLOCATE ( ncp_ )
649  !CALL nvtxEndRangeAsync()
650  CALL stop_clock ('fft_scatt_yz')
651
652#endif
653
654  RETURN
65598 format ( 10 ('(',2f12.9,')') )
65699 format ( 20 ('(',2f12.9,')') )
657
658END SUBROUTINE fft_scatter_yz_gpu
659!
660!-----------------------------------------------------------------------
661SUBROUTINE fft_scatter_tg_gpu ( desc, f_in_d, f_aux_d, nxx_, isgn, stream )
662  !-----------------------------------------------------------------------
663  !
664  ! task group wavefunction redistribution
665  !
666  ! a) (isgn >0 ) From many-wfc partial-plane arrangement to single-wfc whole-plane one
667  !
668  ! b) (isgn <0 ) From single-wfc whole-plane arrangement to many-wfc partial-plane one
669  !
670  ! in both cases:
671  !    f_in  contains the input data, is overwritten with the desired output
672  !    f_aux is used as working array, may contain garbage in output
673  !
674  USE cudafor
675  USE fft_buffers, ONLY : check_buffers_size, f_in => pin_space_scatter_in, &
676                          f_aux=>pin_space_scatter_out
677  IMPLICIT NONE
678
679  TYPE (fft_type_descriptor), INTENT(in) :: desc
680  INTEGER, INTENT(in)                    :: nxx_, isgn
681  COMPLEX (DP), DEVICE, INTENT(inout)    :: f_in_d (nxx_), f_aux_d (nxx_)
682  INTEGER(kind=cuda_stream_kind), INTENT(IN) :: stream ! cuda stream for the execution
683
684  INTEGER :: ierr
685
686  CALL start_clock ('fft_scatt_tg')
687
688  if ( abs (isgn) /= 3 ) call fftx_error__ ('fft_scatter_tg', 'wrong call', 1 )
689  ! get pinned memory buffers for ALLTOALL, check allocation
690  CALL check_buffers_size(desc)
691#if defined(__MPI)
692  ! == OPTIMIZE, replace this with overlapped comunication on host and device,
693  ! or possibly use GPU MPI?
694
695  !f_aux(1:nxx_) = f_in_d(1:nxx_)
696  ierr = cudaMemcpyAsync( f_aux, f_in_d, nxx_, cudaMemcpyDeviceToHost, stream )
697  ierr = cudaStreamSynchronize(stream)
698  if ( isgn > 0 ) then
699
700     CALL MPI_ALLTOALLV( f_aux,  desc%tg_snd, desc%tg_sdsp, MPI_DOUBLE_COMPLEX, &
701                         f_in, desc%tg_rcv, desc%tg_rdsp, MPI_DOUBLE_COMPLEX, desc%comm2, ierr)
702     IF( ierr /= 0 ) CALL fftx_error__( 'fft_scatter_tg', ' alltoall error 1 ', abs(ierr) )
703
704  else
705
706     CALL MPI_ALLTOALLV( f_aux,  desc%tg_rcv, desc%tg_rdsp, MPI_DOUBLE_COMPLEX, &
707                         f_in, desc%tg_snd, desc%tg_sdsp, MPI_DOUBLE_COMPLEX, desc%comm2, ierr)
708     IF( ierr /= 0 ) CALL fftx_error__( 'fft_scatter_tg', ' alltoall error 2 ', abs(ierr) )
709  end if
710  !f_in_d(1:nxx_) = f_in(1:nxx_)
711  ierr = cudaMemcpyAsync( f_in_d, f_in, nxx_, cudaMemcpyHostToDevice, stream )
712  !ierr = cudaStreamSynchronize(stream)
713#endif
714  !
715  CALL stop_clock ('fft_scatt_tg')
716  RETURN
71799 format ( 20 ('(',2f12.9,')') )
718
719END SUBROUTINE fft_scatter_tg_gpu
720!
721!-----------------------------------------------------------------------
722SUBROUTINE fft_scatter_tg_opt_gpu ( desc, f_in_d, f_out_d, nxx_, isgn, stream )
723  !-----------------------------------------------------------------------
724  !
725  ! task group wavefunction redistribution
726  !
727  ! a) (isgn >0 ) From many-wfc partial-plane arrangement to single-wfc whole-plane one
728  !
729  ! b) (isgn <0 ) From single-wfc whole-plane arrangement to many-wfc partial-plane one
730  !
731  ! in both cases:
732  !    f_in  contains the input data
733  !    f_out contains the output data
734  !
735  USE cudafor
736  USE fft_buffers, ONLY : check_buffers_size, f_in => pin_space_scatter_in, &
737                          f_out=>pin_space_scatter_out
738  IMPLICIT NONE
739
740  TYPE (fft_type_descriptor), INTENT(in) :: desc
741  INTEGER, INTENT(in)                    :: nxx_, isgn
742  COMPLEX (DP), DEVICE, INTENT(inout)    :: f_in_d (nxx_), f_out_d (nxx_)
743  INTEGER(kind=cuda_stream_kind), INTENT(IN) :: stream ! cuda stream for the execution
744
745  INTEGER :: ierr
746
747  CALL start_clock ('fft_scatt_tg')
748
749  if ( abs (isgn) /= 3 ) call fftx_error__ ('fft_scatter_tg', 'wrong call', 1 )
750  !
751  ! check host copy allocation of f_in and f_aux
752  CALL check_buffers_size(desc)
753  ierr = cudaMemcpyAsync( f_in, f_in_d, nxx_, cudaMemcpyDeviceToHost, stream )
754  ierr = cudaStreamSynchronize(stream)
755#if defined(__MPI)
756  !
757  if ( isgn > 0 ) then
758
759     CALL MPI_ALLTOALLV( f_in,  desc%tg_snd, desc%tg_sdsp, MPI_DOUBLE_COMPLEX, &
760                         f_out, desc%tg_rcv, desc%tg_rdsp, MPI_DOUBLE_COMPLEX, desc%comm2, ierr)
761     IF( ierr /= 0 ) CALL fftx_error__( 'fft_scatter_tg', ' alltoall error 1 ', abs(ierr) )
762
763  else
764
765     CALL MPI_ALLTOALLV( f_in,  desc%tg_rcv, desc%tg_rdsp, MPI_DOUBLE_COMPLEX, &
766                         f_out, desc%tg_snd, desc%tg_sdsp, MPI_DOUBLE_COMPLEX, desc%comm2, ierr)
767     IF( ierr /= 0 ) CALL fftx_error__( 'fft_scatter_tg', ' alltoall error 2 ', abs(ierr) )
768  end if
769
770#endif
771  !f_out_d(1:nxx_) = f_out(1:nxx_)
772  ierr = cudaMemcpyAsync( f_out_d, f_out, nxx_, cudaMemcpyHostToDevice, stream )
773  !ierr = cudaStreamSynchronize(stream)
774  !
775  CALL stop_clock ('fft_scatt_tg')
776
777  RETURN
77899 format ( 20 ('(',2f12.9,')') )
779
780END SUBROUTINE fft_scatter_tg_opt_gpu
781
782#endif
783
784
785!-----------------------------------------------------------------------
786SUBROUTINE fft_scatter_many_yz_gpu ( desc, f_in_d, f_aux_d, nxx_, isgn, howmany )
787  !-----------------------------------------------------------------------
788  !
789  ! transpose of the fft yz planes across the desc%comm3 communicator
790  !
791  ! a) From Z-oriented columns to Y-oriented colums (isgn > 0)
792  !    Active columns (or sticks or pencils) along the Z direction for each
793  !    processor are stored consecutively and are such that they correspond
794  !    to a subset of the active X values.
795  !
796  !    The pencil -> slices transposition is performed in the subgroup
797  !    of processors (desc%comm3) owning these X values.
798  !
799  !    The transpose takes place in two steps:
800  !    1) on each processor the columns are sliced into sections along Z
801  !       that are stored one after the other. On each processor, slices for
802  !       processor "iproc3" are desc%nr3p(iproc3)*desc%nsw/nsp(me) big.
803  !    2) all processors communicate to exchange slices (all columns with
804  !       Z in the slice belonging to "me" must be received, all the others
805  !       must be sent to "iproc3")
806  !
807  !    Finally one gets the "slice" representation: each processor has
808  !    desc%nr3p(mype3) Z values of all the active pencils along Y for the
809  !    X values of the current group. Data are organized with the Y index
810  !    running fastest, then the reordered X values, then Z.
811  !
812  !    f_in  contains the input Z columns, is destroyed on output
813  !    f_aux contains the output Y colums.
814  !
815  !  b) From planes to columns (isgn < 0)
816  !
817  !    Quite the same in the opposite direction
818  !    f_aux contains the input Y columns, is destroyed on output
819  !    f_in  contains the output Z columns.
820  !
821  USE cudafor
822  !USE nvtx_fft
823  USE fft_buffers, ONLY : check_buffers_size, f_in => pin_space_scatter_in, &
824                          f_aux=>pin_space_scatter_out
825  IMPLICIT NONE
826
827  TYPE (fft_type_descriptor), INTENT(in) :: desc
828  INTEGER, INTENT(in)                    :: nxx_, isgn
829  COMPLEX (DP), DEVICE, INTENT(inout)    :: f_in_d (nxx_), f_aux_d (nxx_)
830  INTEGER, INTENT(IN)                    :: howmany
831
832#if defined(__MPI)
833  INTEGER, DEVICE, POINTER :: desc_ismap_d(:)
834  !
835  INTEGER :: ierr, me, me2, me3, nproc3, iproc3, ncpx, nr3px, ip, ip0, me2_start, me2_end
836  INTEGER :: nr1x, nr2x, nr3, nr3x, nnr
837  INTEGER :: i, j, it, it0, k, kfrom, kdest, offset, ioff, mc, m1, m2, i1,  sendsize, aux
838  INTEGER, ALLOCATABLE :: ncp_(:)
839  INTEGER, DEVICE, POINTER :: ir1p__d(:)
840  INTEGER :: my_nr1p_
841  !
842#if defined(__NON_BLOCKING_SCATTER)
843  INTEGER :: sh(desc%nproc3), rh(desc%nproc3)
844#endif
845  !CALL nvtxStartRangeAsync("fft_scatter_many_yz_gpu", isgn + 5)
846  !
847  me     = desc%mype  + 1
848  me2    = desc%mype2 + 1
849  me3    = desc%mype3 + 1
850  nproc3 = desc%nproc3
851
852  ! This mapping improves readability but, most importantly, it is needed
853  ! in the cuf kernels (as of PGI 18.10) since otherwise, when variables from
854  ! `desc` appear in the body of the do loops, the compiler generates incorrect GPU code.
855  nr1x   = desc%nr1x
856  nr2x   = desc%nr2x
857  nr3    = desc%nr3
858  nr3x   = desc%nr3x
859  nnr    = desc%nnr
860
861  ! allocate auxiliary array for columns distribution
862  ALLOCATE ( ncp_( desc%nproc) )
863  !
864  me2_start = me2 ; me2_end = me2
865  if ( abs (isgn) == 1 ) then      ! It's a potential FFT
866     ncp_ = desc%nsp
867     my_nr1p_ = count (desc%ir1p   > 0)
868     ir1p__d => desc%ir1p_d
869  else if ( abs (isgn) == 2 ) then ! It's a wavefunction FFT
870     ncp_ = desc%nsw
871     my_nr1p_ = count (desc%ir1w   > 0)
872     ir1p__d => desc%ir1w_d
873  else if ( abs (isgn) == 3 ) then ! It's a wavefunction FFT with task group
874     print *, "ERRORE, this should never happen!"
875  end if
876  !
877  CALL start_clock ('fft_scatt_many_yz')
878  !
879  ! calculate the message size
880  !
881  nr3px = MAXVAL ( desc%nr3p )  ! maximum number of Z values to be exchanged
882  ncpx  = MAXVAL ( ncp_ )       ! maximum number of Z columns to be exchanged
883  !
884  sendsize = howmany * ncpx * nr3px       ! dimension of the scattered chunks
885  !
886  ! check dimensions of f_in and f_aux
887  CALL check_buffers_size(desc, howmany)
888  desc_ismap_d => desc%ismap_d
889  !
890  ierr = 0
891  IF (isgn.gt.0) THEN
892
893     IF (nproc3==1) GO TO 10
894     !
895     ! "forward" scatter from columns to planes
896     !
897     ! step one: store contiguously the slices
898     !
899     offset = 0
900     DO iproc3 = 1, nproc3
901        kdest = ( iproc3 - 1 ) * sendsize
902        kfrom = offset
903        !
904        ierr = cudaMemcpy2D( f_aux(kdest + 1), nr3px, f_in_d(kfrom + 1 ), nr3x, desc%nr3p( iproc3 ), howmany*ncpx, cudaMemcpyDeviceToHost)
905        !
906        offset = offset + desc%nr3p( iproc3 )
907     ENDDO
908     !
909     ! ensures that no garbage is present in the output
910     ! useless; the later accessed elements are overwritten by the A2A step
911     !
912     ! step two: communication  across the    nproc3    group
913     !
914
915     CALL mpi_alltoall (f_aux(1), sendsize, MPI_DOUBLE_COMPLEX, f_in(1), &
916                        sendsize, MPI_DOUBLE_COMPLEX, desc%comm3, ierr)
917
918     IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
919     !
920     !f_in_d(1:nxx_) = f_in(1:nxx_)
921     ierr = cudaMemcpy( f_in_d, f_in, nxx_, cudaMemcpyHostToDevice)
922     !
92310   CONTINUE
924     !
925     !f_aux = (0.0_DP, 0.0_DP) !
926     !$cuf kernel do (1) <<<*,*>>>
927     do i = 1, nxx_
928       f_aux_d(i) = (0.d0, 0.d0)
929     end do
930     !
931     DO iproc3 = 1, desc%nproc3
932        it0 = ( iproc3 - 1 ) * sendsize
933
934        ioff = desc%iss(iproc3)
935        aux = ncp_( iproc3 )
936
937!$cuf kernel do(3) <<<*,*>>>
938        DO j=0, howmany-1
939           DO i = 1, aux ! was ncp_(iproc3)
940              DO k = 1, desc%my_nr3p
941                 it = it0 + (i-1)*nr3px + j*ncpx*nr3px !desc%nnr !aux*nr3px
942                 mc = desc_ismap_d( i + ioff ) ! this is  m1+(m2-1)*nr1x  of the  current pencil
943                 m1 = mod (mc-1,nr1x) + 1 ; m2 = (mc-1)/nr1x + 1
944                 i1 = m2 + ( ir1p__d(m1) - 1 ) * nr2x + (k-1)*nr2x*my_nr1p_
945
946                 f_aux_d( i1 + j*nnr ) = f_in_d( k + it )
947                 !i1 = i1 + nr2x*my_nr1p_
948              ENDDO
949           ENDDO
950           !it0 = it0 + ncp_( ip )*nr3px
951        ENDDO
952     ENDDO
953
954  ELSE
955     !
956     !  "backward" scatter from planes to columns
957     !
958     DO iproc3 = 1, nproc3
959        it0 = ( iproc3 - 1 ) * sendsize
960
961        ip = desc%iproc(me2, iproc3)
962        ioff = desc%iss(ip)
963        aux = ncp_( ip )
964!$cuf kernel do(3) <<<*,*>>>
965        DO j = 0, howmany - 1
966           DO i = 1, aux
967              DO k = 1, desc%my_nr3p
968                 it = it0 + (i-1)*nr3px + j*ncpx*nr3px
969                 mc = desc_ismap_d( i + ioff ) ! this is  m1+(m2-1)*nr1x  of the  current pencil
970                 m1 = mod (mc-1,nr1x) + 1 ; m2 = (mc-1)/nr1x + 1
971                 i1 = m2 + ( ir1p__d(m1) - 1 ) * nr2x + (k-1)*(nr2x * my_nr1p_)
972
973                 f_in_d( k + it ) = f_aux_d( i1  + j*nnr )
974                 !i1 = i1 + desc%nr2x * my_nr1p_
975              ENDDO
976              !it = it + nr3px
977           ENDDO
978           !it0 = it0 + ncp_( ip )*nr3px
979        ENDDO
980     ENDDO
981
982     IF( nproc3 == 1 ) GO TO 20
983     !
984     !
985     !  step two: communication
986     !
987     ierr = cudaMemcpy( f_in, f_in_d, nxx_, cudaMemcpyDeviceToHost )
988
989     CALL mpi_alltoall (f_in(1), sendsize, MPI_DOUBLE_COMPLEX, f_aux(1), &
990                        sendsize, MPI_DOUBLE_COMPLEX, desc%comm3, ierr)
991
992     IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
993
994
995     !
996     !  step one: store contiguously the columns
997     !
998     offset = 0
999     DO iproc3 = 1, nproc3
1000        kdest = ( iproc3 - 1 ) * sendsize
1001        kfrom = offset
1002        ierr = cudaMemcpy2D( f_in_d(kfrom +1 ), nr3x, f_aux(kdest + 1), nr3px, desc%nr3p( iproc3 ), howmany * ncpx, cudaMemcpyHostToDevice)
1003        !
1004        offset = offset + desc%nr3p( iproc3 )
1005     ENDDO
1006
1007     ! clean extra array elements in each stick
1008
1009     IF( nr3x /= nr3 ) THEN
1010       aux = ncp_ ( desc%mype+1 )
1011!$cuf kernel do(3) <<<*,*>>>
1012       DO j=0, howmany-1
1013          DO k = 1, aux
1014             DO i = nr3, nr3x
1015                f_in_d( j*ncpx*nr3x + (k-1)*nr3x + i) = 0.0d0
1016             END DO
1017          END DO
1018       END DO
1019     END IF
1020
102120   CONTINUE
1022
1023  ENDIF
1024
1025  DEALLOCATE ( ncp_ )
1026  !CALL nvtxEndRangeAsync()
1027  CALL stop_clock ('fft_scatt_many_yz')
1028
1029#endif
1030
1031  RETURN
103298 format ( 10 ('(',2f12.9,')') )
103399 format ( 20 ('(',2f12.9,')') )
1034
1035END SUBROUTINE fft_scatter_many_yz_gpu
1036
1037!=----------------------------------------------------------------------=!
1038END MODULE fft_scatter_gpu
1039!=----------------------------------------------------------------------=!
1040
1041! defined (__CUDA)
1042#endif
1043