1!
2! Copyright (C) Quantum ESPRESSO group
3!
4! This file is distributed under the terms of the
5! GNU General Public License. See the file `License'
6! in the root directory of the present distribution,
7! or http://www.gnu.org/copyleft/gpl.txt .
8!
9!----------------------------------------------------------------------
10! Basic scatter operations needed by parallel FFT
11! Written by Carlo Cavazzoni and Stefano de Gironcoli, modified by PG
12!
13!----------------------------------------------------------------------
14!
15!=----------------------------------------------------------------------=!
16   MODULE fft_scatter
17!=----------------------------------------------------------------------=!
18
19        USE fft_types, ONLY: fft_type_descriptor
20        USE fft_param
21
22        IMPLICIT NONE
23
24        SAVE
25
26        PRIVATE
27
28        PUBLIC :: fft_scatter_xy, fft_scatter_yz, fft_scatter_many_xy, fft_scatter_many_yz
29        PUBLIC :: fft_scatter_tg, fft_scatter_tg_opt
30
31!=----------------------------------------------------------------------=!
32      CONTAINS
33!=----------------------------------------------------------------------=!
34!
35!
36!-----------------------------------------------------------------------
37SUBROUTINE fft_scatter_xy ( desc, f_in, f_aux, nxx_, isgn, comm )
38  !-----------------------------------------------------------------------
39  !
40  ! Transpose of the fft xy planes across the comm communicator.
  ! If the optional communicator comm is not provided, the transpose is
  ! performed across the desc%comm2 communicator.
43  !
44  ! a) From Y-oriented columns to X-oriented partial slices (isgn > 0)
45  !    Active columns along the Y direction corresponding to a subset of the
46  !    active X values and a range of Z values (in this order) are stored
47  !    consecutively for each processor and are such that the subgroup owns
48  !    all data for a range of Z values.
49  !
50  !    The Y pencil -> X-oriented partial slices transposition is performed
51  !    in the subgroup of processors (desc%comm2) owning this range of Z values.
52  !
53  !    The transpose takes place in two steps:
54  !    1) on each processor the columns are sliced into sections along Y
55  !       that are stored one after the other. On each processor, slices for
56  !       processor "iproc2" are desc%nr2p(iproc2)*desc%nr1p(me2)*desc%my_nr3p big.
  !    2) all processors communicate to exchange slices (all sections of columns with
58  !       Y in the slice belonging to "me" must be received, all the others
59  !       must be sent to "iproc2")
60  !
61  !    Finally one gets the "partial slice" representation: each processor has
  !    all the X values for desc%my_nr2p Y values and desc%my_nr3p Z values.
63  !    Data are organized with the X index running fastest, then Y, then Z.
64  !
65  !    f_in  contains the input Y columns, is destroyed on output
66  !    f_aux contains the output X-oriented partial slices.
67  !
68  !  b) From planes to columns (isgn < 0)
69  !
70  !    Quite the same in the opposite direction
71  !    f_aux contains the input X-oriented partial slices, is destroyed on output
72  !    f_in  contains the output Y columns.
73  !
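  ! Illustrative index map (a descriptive sketch of the layout produced below,
  ! not additional functionality): after the forward transpose the Y column
  ! associated with active X index m1 and local Z plane m3 is stored as
  !    f_aux( m1 + (j-1)*desc%nr1x + (m3-1)*desc%nr1x*desc%my_nr2p ),  j = 1, ..., desc%my_nr2p
  ! i.e. X runs fastest, then Y, then Z, as stated above.
  !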
74  IMPLICIT NONE
75
76  TYPE (fft_type_descriptor), INTENT(in) :: desc
77  INTEGER, INTENT(in)           :: nxx_, isgn
78  COMPLEX (DP), INTENT(inout)   :: f_in (nxx_), f_aux (nxx_)
79  INTEGER, OPTIONAL, INTENT(in) :: comm
80  INTEGER :: nr1_temp(1), comm_
81
82#if defined(__MPI)
83!$omp master
84  CALL start_clock ('fft_scatt_xy')
85!$omp end master
86
87  IF (PRESENT(comm)) THEN
88     comm_ = comm
89  ELSE
90     comm_ = desc%comm2
91  ENDIF
92  !
93  if ( abs (isgn) == 1 ) then          ! It's a potential FFT
94     CALL impl_xy( MAXVAL ( desc%nr2p ), desc%nproc2, desc%my_nr2p, desc%nr1p, desc%indp, desc%iplp, comm_)
95  else if ( abs (isgn) == 2 ) then     ! It's a wavefunction FFT
96     CALL impl_xy( MAXVAL ( desc%nr2p ), desc%nproc2, desc%my_nr2p, desc%nr1w, desc%indw, desc%iplw, comm_)
97  else if ( abs (isgn) == 3 ) then     ! It's a wavefunction FFT with task group
     ! in task group FFTs whole Y columns are distributed
99     nr1_temp = desc%nr1w_tg
100     CALL impl_xy( desc%nr2x, 1, desc%nr2x, nr1_temp, desc%indw_tg, desc%iplw, comm_)
101  end if
102  !
103!$omp master
104  CALL stop_clock ('fft_scatt_xy')
105!$omp end master
106
107  RETURN
108
109  CONTAINS
110
111  SUBROUTINE impl_xy(nr2px, nproc2, my_nr2p, nr1p_, indx, iplx, mpi_comm)
112  IMPLICIT NONE
113  !
114  INTEGER, INTENT(in):: nr2px, nproc2, my_nr2p
115  INTEGER, INTENT(in):: nr1p_(nproc2), indx(desc%nr1x,nproc2), iplx(desc%nr1x), mpi_comm
116  !
117  INTEGER :: ierr, me2, iproc2, ncpx
118  INTEGER :: i, it, j, k, kfrom, kdest, mc, m1, m3, i1, icompact, sendsize
119  !
120#if defined(__NON_BLOCKING_SCATTER)
121  INTEGER :: sh(desc%nproc2), rh(desc%nproc2)
122#endif
123
124  me2    = desc%mype2 + 1
  ncpx = MAXVAL(nr1p_) * desc%my_nr3p       ! maximum number of Y columns to be distributed
126  ! calculate the message size
127  sendsize = ncpx * nr2px       ! dimension of the scattered chunks (safe value)
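  !
  ! illustrative example (hypothetical sizes, not taken from an actual run):
  ! with nproc2 = 2, nr1p_ = (/3,2/), desc%my_nr3p = 2 and desc%nr2p = (/4,3/)
  ! one gets ncpx = 3*2 = 6 and nr2px = 4, hence sendsize = 24: each chunk
  ! holds up to 6 Y sections, each padded to 4 complex values.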
128
129  ierr = 0
130  IF (isgn.gt.0) THEN
131     !
132     IF (nproc2==1) GO TO 10
133     !
134     ! "forward" scatter from columns to planes
135     !
136     ! step one: store contiguously the slices
137     !
138!$omp parallel do collapse(2) private(kdest,kfrom)
139     DO iproc2 = 1, nproc2
140        DO k = 0, nr1p_(me2)*desc%my_nr3p-1
141           kdest = ( iproc2 - 1 ) * sendsize + nr2px * k
142           kfrom = desc%nr2p_offset(iproc2) + desc%nr2x *k
143           DO i = 1, desc%nr2p( iproc2 )
144              f_aux ( kdest + i ) =  f_in ( kfrom + i )
145           ENDDO
146        ENDDO
147     ENDDO
148!$omp end parallel do
149#if defined(__NON_BLOCKING_SCATTER)
150     DO iproc2 = 1, nproc2
151        CALL mpi_isend( f_aux( (iproc2-1)*sendsize + 1 ), sendsize, &
152                        MPI_DOUBLE_COMPLEX, iproc2-1, me2, mpi_comm, &
153                        sh( iproc2 ), ierr )
154     ENDDO
155#endif
156     !
     ! step two: communication across the nproc2 group
158     !
159#if defined(__NON_BLOCKING_SCATTER)
160     DO iproc2 = 1, nproc2
161        !
162        ! now post the receive
163        !
164        CALL mpi_irecv( f_in( (iproc2-1)*sendsize + 1 ), sendsize, &
165                        MPI_DOUBLE_COMPLEX, iproc2-1, MPI_ANY_TAG, &
166                        mpi_comm, rh( iproc2 ), ierr )
167        !IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' forward receive info<>0', abs(ierr) )
168     ENDDO
169
170     call mpi_waitall( nproc2, sh, MPI_STATUSES_IGNORE, ierr )
171     !
172#else
173     CALL mpi_alltoall (f_aux(1), sendsize, MPI_DOUBLE_COMPLEX, f_in(1), &
174                        sendsize, MPI_DOUBLE_COMPLEX, mpi_comm, ierr)
175     !
176     IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
177#endif
178     !
10   CONTINUE
180     !
181#if defined(__NON_BLOCKING_SCATTER)
182     if (nproc2 > 1) call mpi_waitall( nproc2, rh, MPI_STATUSES_IGNORE, ierr )
183#endif
184     !
185!$omp parallel
186!$omp do collapse(2) private(it,m3,i1,m1,icompact)
187     DO iproc2 = 1, nproc2
188        DO i = 0, ncpx-1
189           IF(i>=nr1p_(iproc2)*desc%my_nr3p) CYCLE ! control i from 0 to nr1p_(iproc2)*desc%my_nr3p-1
190           it = ( iproc2 - 1 ) * sendsize + nr2px * i
191           m3 = i/nr1p_(iproc2)+1
192           i1 = mod(i,nr1p_(iproc2))+1
193           m1 = indx(i1,iproc2)
194           icompact = m1 + (m3-1)*desc%nr1x*my_nr2p
195           DO j = 1, my_nr2p
196              !f_aux( m1 + (j-1)*desc%nr1x + (m3-1)*desc%nr1x*my_nr2p ) = f_in( j + it )
197              f_aux( icompact ) = f_in( j + it )
198              icompact = icompact + desc%nr1x
199           ENDDO
200        ENDDO
201     ENDDO
202!$omp end do nowait
203     !
204     ! clean extra array elements in each stick
205     !
206!$omp do
207     DO k = 1, my_nr2p*desc%my_nr3p
208        DO i1 = 1, desc%nr1x
209           IF(iplx(i1)==0) f_aux(desc%nr1x*(k-1)+i1) = (0.0_DP, 0.0_DP)
210        ENDDO
211     ENDDO
212!$omp end do nowait
213!$omp end parallel
214     !
215  ELSE
216     !
217     !  "backward" scatter from planes to columns
218     !
219!$omp parallel do collapse(2) private(it,m3,i1,m1,icompact)
220     DO iproc2 = 1, nproc2
221        DO i = 0, ncpx-1
222           IF(i>=nr1p_(iproc2)*desc%my_nr3p) CYCLE ! control i from 0 to nr1p_(iproc2)*desc%my_nr3p-1
223           it = ( iproc2 - 1 ) * sendsize + nr2px * i
224           m3 = i/nr1p_(iproc2)+1
225           i1 = mod(i,nr1p_(iproc2))+1
226           m1 = indx(i1,iproc2)
227           icompact = m1 + (m3-1)*desc%nr1x*my_nr2p
228           DO j = 1, my_nr2p
229              !f_in( j + it ) = f_aux( m1 + (j-1)*desc%nr1x + (m3-1)*desc%nr1x*my_nr2p )
230              f_in( j + it ) = f_aux( icompact )
231              icompact = icompact + desc%nr1x
232           ENDDO
233        ENDDO
234     ENDDO
235!$omp end parallel do
236#if defined(__NON_BLOCKING_SCATTER)
237     DO iproc2 = 1, nproc2
238        IF( nproc2 > 1 ) CALL mpi_isend( f_in( ( iproc2 - 1 ) * sendsize + 1 ), &
239                              sendsize, MPI_DOUBLE_COMPLEX, iproc2-1, me2, &
240                              mpi_comm, sh( iproc2 ), ierr )
241     ENDDO
242#endif
243     IF (nproc2==1) GO TO 20
244
245     !
246     !  step two: communication
247     !
248#if ! defined(__NON_BLOCKING_SCATTER)
249     CALL mpi_alltoall (f_in(1), sendsize, MPI_DOUBLE_COMPLEX, f_aux(1), &
250                        sendsize, MPI_DOUBLE_COMPLEX, mpi_comm, ierr)
251
252     IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
253#else
254     DO iproc2 = 1, nproc2
255        CALL mpi_irecv( f_aux( (iproc2-1)*sendsize + 1 ), sendsize, &
256                        MPI_DOUBLE_COMPLEX, iproc2-1, MPI_ANY_TAG, &
257                        mpi_comm, rh(iproc2), ierr )
258        ! IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' backward receive info<>0', abs(ierr) )
259     ENDDO
260
261     call mpi_waitall( nproc2, sh, MPI_STATUSES_IGNORE, ierr )
262     call mpi_waitall( nproc2, rh, MPI_STATUSES_IGNORE, ierr )
263#endif
264     !
265     !  step one: store contiguously the columns
266     !
267!$omp parallel do collapse(2) private(kdest,kfrom)
268     DO iproc2 = 1, nproc2
269        DO k = 0, nr1p_(me2)*desc%my_nr3p-1
270           kdest = ( iproc2 - 1 ) * sendsize + nr2px * k
271           kfrom = desc%nr2p_offset(iproc2) + desc%nr2x * k
272           DO i = 1, desc%nr2p( iproc2 )
273              f_in ( kfrom + i ) = f_aux ( kdest + i )
274           ENDDO
275        ENDDO
276     ENDDO
277!$omp end parallel do
278     !
279     ! clean extra array elements in each stick
280     !
281     IF( desc%nr2x /= desc%nr2 ) THEN
282        DO k = 1, nr1p_(me2)*desc%my_nr3p
283           f_in(desc%nr2x*(k-1)+desc%nr2+1:desc%nr2x*k) = (0.0_DP, 0.0_DP)
284        ENDDO
285     ENDIF
286
20   CONTINUE
288
289  ENDIF
290
291  END SUBROUTINE impl_xy
292
293#endif
294
295END SUBROUTINE fft_scatter_xy
296!
297!-----------------------------------------------------------------------
298SUBROUTINE fft_scatter_many_xy ( desc, f_in, f_aux, isgn, howmany)
299  !-----------------------------------------------------------------------
300  !
301  ! transpose of the fft xy planes across the desc%comm2 communicator
302  !
303  ! a) From Y-oriented columns to X-oriented partial slices (isgn > 0)
304  !    Active columns along the Y direction corresponding to a subset of the
305  !    active X values and a range of Z values (in this order) are stored
306  !    consecutively for each processor and are such that the subgroup owns
307  !    all data for a range of Z values.
308  !
309  !    The Y pencil -> X-oriented partial slices transposition is performed
310  !    in the subgroup of processors (desc%comm2) owning this range of Z values.
311  !
312  !    The transpose takes place in two steps:
313  !    1) on each processor the columns are sliced into sections along Y
314  !       that are stored one after the other. On each processor, slices for
315  !       processor "iproc2" are desc%nr2p(iproc2)*desc%nr1p(me2)*desc%my_nr3p big.
  !    2) all processors communicate to exchange slices (all sections of columns with
317  !       Y in the slice belonging to "me" must be received, all the others
318  !       must be sent to "iproc2")
319  !
320  !    Finally one gets the "partial slice" representation: each processor has
  !    all the X values for desc%my_nr2p Y values and desc%my_nr3p Z values.
322  !    Data are organized with the X index running fastest, then Y, then Z.
323  !
324  !    f_in  contains the input Y columns, is destroyed on output
325  !    f_aux contains the output X-oriented partial slices.
326  !
327  !  b) From planes to columns (isgn < 0)
328  !
329  !    Quite the same in the opposite direction
330  !    f_aux contains the input X-oriented partial slices, is destroyed on output
331  !    f_in  contains the output Y columns.
332  !
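  !    Note on the batched layout used below (descriptive only): howmany FFTs are
  !    handled at once, and band k (k = 0, ..., howmany-1) is read from / written
  !    to offset k*desc%nnr of the plane-ordered array and offset k*ncpx*nr2px
  !    inside each chunk of the column-ordered array.
  !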
333  IMPLICIT NONE
334
335  TYPE (fft_type_descriptor), INTENT(in), TARGET :: desc
336  INTEGER, INTENT(in)                            :: isgn
337  INTEGER, INTENT(in)                            :: howmany
338
339  COMPLEX (DP), INTENT(inout)   :: f_in (:), f_aux (:)
340
341#if defined(__MPI)
342  !
343  INTEGER :: ierr, me2, nproc2, iproc2, ncpx, nr2px, ip, icompact
344  INTEGER :: i, j, it, it0, k, kfrom, kdest, offset, m1, m2, i1, sendsize
345  INTEGER, POINTER :: iplx(:), nr1p_(:), indx(:,:)
346  !
347  me2    = desc%mype2 + 1
348  nproc2 = desc%nproc2
349  !
350  if ( abs (isgn) == 1 ) then      ! It's a potential FFT
351     iplx     => desc%iplp
352     nr1p_    => desc%nr1p
353     indx     => desc%indp
354  else if ( abs (isgn) == 2 ) then ! It's a wavefunction FFT
355     iplx     => desc%iplw
356     nr1p_    => desc%nr1w
357     indx     => desc%indw
358  else if ( abs (isgn) == 3 ) then ! It's a wavefunction FFT with task group
     CALL fftx_error__ ('fft_scatter_many_xy', 'isgn = 3 (task groups): this should never happen', 1)
360  end if
361  !
362  CALL start_clock ('fft_scatt_many_xy')
363  !
364  ! calculate the message size
365  !
366  nr2px = MAXVAL ( desc%nr2p )                  ! maximum number of Y values to be exchanged
367  ncpx  = MAXVAL ( nr1p_ ) * MAXVAL(desc%nr3p)  ! maximum number of Y columns to be exchanged
368
369  sendsize = howmany * ncpx * nr2px             ! dimension of the scattered chunks
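  !
  ! e.g. (hypothetical sizes): nr2px = 4, ncpx = 6 and howmany = 4 give
  ! sendsize = 4*6*4 = 96 complex values per destination process.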
370  !
371  ierr = 0
372  !
373  IF (isgn.gt.0) THEN
374     !
375     ! "forward" scatter from columns to planes
376     !
377     ! step one: store contiguously the slices
378     !
379     offset = 0
380
381     IF (nproc2==1) THEN
382!$omp parallel default(none)                                   &
383!$omp          private(i, j, k, ip, it, m1, m2, i1, icompact)  &
384!$omp          shared(howmany, ncpx, nr2px, nr1p_, iplx)       &
385!$omp&         shared(desc, f_aux, f_in, indx)
386        ip = nr1p_(1) * desc%my_nr3p + 1
387!$omp do
388        DO k=0, howmany-1
389           DO j = 1, ncpx
390              IF ( j >= ip ) CYCLE
391              it = (j - 1) * nr2px + k * ncpx * nr2px
392              m2 = (j - 1) / nr1p_(1) + 1
393              i1 = mod((j - 1),nr1p_(1)) + 1
394              m1 = indx(i1,1)
395              icompact = m1 + (m2 - 1) * desc%nr1x * desc%my_nr2p
396              DO i = 1, desc%my_nr2p
397                 f_aux( icompact + k * desc%nnr ) = f_in( i + it )
398                 icompact = icompact + desc%nr1x
399              ENDDO
400           ENDDO
401        ENDDO
402!$omp end do
403!$omp do
404        DO k=0, howmany-1
405           DO j = 1, desc%my_nr2p*desc%my_nr3p
406              DO i1 = 1, desc%nr1x
407                 IF(iplx(i1)==0) f_aux(desc%nr1x*(j-1)+i1+k*desc%nnr) = (0.0_DP, 0.0_DP)
408              ENDDO
409           ENDDO
410        ENDDO
411!$omp end do
412!$omp end parallel
413     ELSE
414!$omp parallel default(none)                                  &
415!$omp          private(iproc2, i, j, k, kdest, kfrom)         &
416!$omp          shared(nproc2, howmany, ncpx, sendsize, nr1p_) &
417!$omp&         shared(nr2px, desc, f_aux, f_in, me2)
418!$omp do
419        DO k = 0, howmany-1
420           DO iproc2 = 1, nproc2
421              DO j = 0, nr1p_(me2) * desc%my_nr3p - 1
422                 kdest = ( iproc2 - 1 ) * sendsize + nr2px * (j + k*ncpx)
423                 kfrom = desc%nr2p_offset(iproc2)  + desc%nr2x * (j + k*ncpx)
                 DO i = 1, desc%nr2p( iproc2 )
                    f_aux ( kdest + i ) =  f_in ( kfrom + i )
                 ENDDO
427              ENDDO
428           ENDDO
429        ENDDO
430!$omp end do
431!$omp end parallel
432
433        CALL mpi_alltoall (f_aux(1), sendsize, MPI_DOUBLE_COMPLEX, f_in(1), &
434                           sendsize, MPI_DOUBLE_COMPLEX, desc%comm2, ierr)
435
436        IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
437
438!$omp parallel default(none)                                               &
439!$omp          private(iproc2, i, j, k, ip, it, m1, m2, icompact, it0, i1) &
440!$omp          shared(nproc2, howmany, ncpx, sendsize, nr2px, indx)        &
441!$omp&         shared(desc, f_aux, f_in, iplx, nr1p_)
442!$omp do
443        DO k=0, howmany-1
444           DO iproc2 = 1, nproc2
445              ip = nr1p_(iproc2) * desc%my_nr3p + 1
446              it0 = ( iproc2 - 1 ) * sendsize
447              DO j = 1, ncpx
448                 IF ( j >= ip ) CYCLE
449                 it = it0 + (j - 1) * nr2px + k * ncpx * nr2px
450                 m2 = (j - 1) / nr1p_(iproc2) + 1
451                 i1 = mod((j - 1),nr1p_(iproc2)) + 1
452                 m1 = indx(i1,iproc2)
453                 icompact = m1 + (m2 - 1) * desc%nr1x * desc%my_nr2p
454                 DO i = 1, desc%my_nr2p
455                    f_aux( icompact + k*desc%nnr ) = f_in( i + it )
456                    icompact = icompact + desc%nr1x
457                 ENDDO
458              ENDDO
459           ENDDO
460        ENDDO
461!$omp end do
462!$omp do
463        DO k=0, howmany-1
464           DO j = 1, desc%my_nr2p*desc%my_nr3p
465              DO i1 = 1, desc%nr1x
466                 IF(iplx(i1)==0) f_aux(desc%nr1x*(j-1)+i1+k*desc%nnr) = (0.0_DP, 0.0_DP)
467              ENDDO
468           ENDDO
469        ENDDO
470!$omp end do
471!$omp end parallel
472     ENDIF
473     !
474  ELSE
475     !
476     !  "backward" scatter from planes to columns
477     !
478     IF( nproc2 == 1 ) THEN
479!$omp parallel default(none)                                   &
480!$omp          private(i, j, k, ip, it, m1, m2, i1, icompact)  &
481!$omp          shared(howmany, ncpx, nr2px, nr1p_)             &
482!$omp&         shared(desc, f_aux, f_in, indx)
483        ip = nr1p_(1) * desc%my_nr3p + 1
484!$omp do
485        DO k = 0, howmany - 1
486           DO j = 1, ncpx
487              IF ( j >= ip ) CYCLE
488              it = (j - 1) * nr2px + k * ncpx * nr2px
489              m2 = (j - 1) / nr1p_(1) + 1
490              i1 = mod((j - 1),nr1p_(1)) + 1
491              m1 = indx(i1,1)
492              icompact = m1 + (m2 - 1) * desc%nr1x * desc%my_nr2p
493              DO i = 1, desc%my_nr2p
494                 f_in( i + it ) = f_aux( icompact + k * desc%nnr )
495                 icompact = icompact + desc%nr1x
496              ENDDO
497           ENDDO
498        ENDDO
499!$omp end do
500!$omp end parallel
501     ELSE
502!$omp parallel default(none)                                               &
503!$omp          private(iproc2, i, j, k, ip, it, m1, m2, icompact, it0, i1) &
504!$omp          shared(nproc2, howmany, ncpx, sendsize, nr2px, indx)        &
505!$omp&         shared(desc, f_aux, f_in, nr1p_)
506!$omp do
507        DO k=0, howmany-1
508           DO iproc2 = 1, nproc2
509              ip = nr1p_(iproc2) * desc%my_nr3p + 1
510              it0 = ( iproc2 - 1 ) * sendsize
511              DO j = 1, ncpx
512                 IF ( j >= ip ) CYCLE
513                 it = it0 + (j - 1) * nr2px + k * ncpx * nr2px
514                 m2 = (j - 1) / nr1p_(iproc2) + 1
515                 i1 = mod((j - 1),nr1p_(iproc2))+1
516                 m1 = indx(i1,iproc2)
517                 icompact = m1 + (m2 - 1) * desc%nr1x * desc%my_nr2p
518                 DO i = 1, desc%my_nr2p
519                    f_in( i + it ) = f_aux( icompact + k*desc%nnr )
520                    icompact = icompact + desc%nr1x
521                 ENDDO
522              ENDDO
523           ENDDO
524        ENDDO
525!$omp end do
526!$omp end parallel
527        !
528        !
529        !  step two: communication
530        !
531        CALL mpi_alltoall (f_in(1), sendsize, MPI_DOUBLE_COMPLEX, f_aux(1), &
532                           sendsize, MPI_DOUBLE_COMPLEX, desc%comm2, ierr)
533
534        IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
535        !
536        !  step one: store contiguously the columns
537        !
538!$omp parallel default(none)                                  &
539!$omp          private(iproc2, i, j, k, kdest, kfrom)         &
540!$omp          shared(nproc2, howmany, ncpx, sendsize, nr2px) &
541!$omp&         shared(desc, f_aux, f_in, nr1p_, me2)
542!$omp do
543        DO k = 0, howmany-1
544           DO iproc2 = 1, nproc2
545              DO j = 0, nr1p_(me2) * desc%my_nr3p - 1
546                 kdest = ( iproc2 - 1 ) * sendsize + nr2px * (j + k*ncpx)
547                 kfrom = desc%nr2p_offset(iproc2)  + desc%nr2x * (j + k*ncpx)
                 DO i = 1, desc%nr2p( iproc2 )
                    f_in ( kfrom + i ) =  f_aux ( kdest + i )
                 ENDDO
551              ENDDO
552           ENDDO
553        ENDDO
554!$omp end do
        ! clean extra array elements in each stick
556        IF( desc%nr2x /= desc%nr2 ) THEN
557!$omp do
558           DO k = 0, howmany-1
559              DO j = 1, nr1p_(me2)*desc%my_nr3p
560                 DO i = desc%nr2+1, desc%nr2x
                    f_in( k*ncpx*desc%nr2x + (j-1)*desc%nr2x + i) = (0.0_DP, 0.0_DP)
562                 END DO
563              END DO
564           ENDDO
565!$omp end do
566        ENDIF
567!$omp end parallel
568     ENDIF
569  ENDIF
570
571  CALL stop_clock ('fft_scatt_many_xy')
572
573#endif
574
575  RETURN
98 format ( 10 ('(',2f12.9,')') )
99 format ( 20 ('(',2f12.9,')') )
578
579END SUBROUTINE fft_scatter_many_xy
580!
581!-----------------------------------------------------------------------
582SUBROUTINE fft_scatter_yz ( desc, f_in, f_aux, nxx_, isgn )
583  !-----------------------------------------------------------------------
584  !
585  ! transpose of the fft yz planes across the desc%comm3 communicator
586  !
  ! a) From Z-oriented columns to Y-oriented columns (isgn > 0)
588  !    Active columns (or sticks or pencils) along the Z direction for each
589  !    processor are stored consecutively and are such that they correspond
590  !    to a subset of the active X values.
591  !
592  !    The pencil -> slices transposition is performed in the subgroup
593  !    of processors (desc%comm3) owning these X values.
594  !
595  !    The transpose takes place in two steps:
596  !    1) on each processor the columns are sliced into sections along Z
597  !       that are stored one after the other. On each processor, slices for
  !       processor "iproc3" are desc%nr3p(iproc3)*desc%nsw(me) (or desc%nsp(me)) big.
599  !    2) all processors communicate to exchange slices (all columns with
600  !       Z in the slice belonging to "me" must be received, all the others
601  !       must be sent to "iproc3")
602  !
603  !    Finally one gets the "slice" representation: each processor has
604  !    desc%nr3p(mype3) Z values of all the active pencils along Y for the
605  !    X values of the current group. Data are organized with the Y index
606  !    running fastest, then the reordered X values, then Z.
607  !
608  !    f_in  contains the input Z columns, is destroyed on output
  !    f_aux contains the output Y columns.
610  !
611  !  b) From planes to columns (isgn < 0)
612  !
613  !    Quite the same in the opposite direction
614  !    f_aux contains the input Y columns, is destroyed on output
615  !    f_in  contains the output Z columns.
616  !
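  ! Illustrative index map (a descriptive sketch of the layout produced below):
  ! the pencil whose stick-map entry mc gives m1 = mod(mc-1,nr1x)+1 and
  ! m2 = (mc-1)/nr1x+1 is stored, for the local Z planes k = 1, ..., desc%my_nr3p, at
  !    f_aux( m2 + (ir1p_(m1)-1)*desc%nr2x + (k-1)*desc%nr2x*my_nr1p_ )
  ! where ir1p_ re-orders the active X values and my_nr1p_ counts them.
  !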
617  IMPLICIT NONE
618
619  TYPE (fft_type_descriptor), INTENT(in) :: desc
620  INTEGER, INTENT(in)           :: nxx_, isgn
621  COMPLEX (DP), INTENT(inout)   :: f_in (nxx_), f_aux (nxx_)
622
623#if defined(__MPI)
624  !
625  CALL start_clock ('fft_scatt_yz')
626
627  if ( abs (isgn) == 1 ) then      ! It's a potential FFT
628     CALL impl_yz(desc%mype2+1, desc%mype2+1, desc%nsp, desc%ir1p, desc%nsp_offset)
629  else if ( abs (isgn) == 2 ) then ! It's a wavefunction FFT
630     CALL impl_yz(desc%mype2+1, desc%mype2+1, desc%nsw, desc%ir1w, desc%nsw_offset)
631  else if ( abs (isgn) == 3 ) then ! It's a wavefunction FFT with task group
632     CALL impl_yz(1, desc%nproc2, desc%nsw, desc%ir1w_tg, desc%nsw_offset)
633  end if
634
635  CALL stop_clock ('fft_scatt_yz')
636
637  RETURN
638
639  CONTAINS
640
641  SUBROUTINE impl_yz(me2_start, me2_end, ncp_, ir1p_, me2_iproc3_offset)
642  IMPLICIT NONE
643  !
644  INTEGER, INTENT(in) :: me2_start, me2_end
645  INTEGER, INTENT(in) :: ncp_(desc%nproc), ir1p_(desc%nr1x), me2_iproc3_offset(desc%nproc2, desc%nproc3)
646  !
647  INTEGER :: ierr, me2, me3, nproc3, iproc3, ncpx, nr3px, ip
648  INTEGER :: i, it, k, kfrom, kdest, mc, m1, m2, i1, sendsize
649  INTEGER :: my_nr1p_
650  !
651#if defined(__NON_BLOCKING_SCATTER)
652  INTEGER :: sh(desc%nproc3), rh(desc%nproc3)
653#endif
654  !
655  me3    = desc%mype3 + 1
656  nproc3 = desc%nproc3
657  !
658  my_nr1p_ = count (ir1p_ > 0)
659  !
660  ! calculate the message size
661  !
662  nr3px = MAXVAL ( desc%nr3p )  ! maximum number of Z values to be exchanged
663  ncpx  = MAXVAL ( ncp_ )       ! maximum number of Z columns to be exchanged
664  sendsize = ncpx * nr3px * ( me2_end - me2_start + 1 )  ! dimension of the scattered chunks
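  !
  ! e.g. (hypothetical sizes): nr3px = 5, ncpx = 7 and a single Y band
  ! (me2_end = me2_start) give sendsize = 7*5 = 35 complex values per chunk;
  ! in the task-group case the chunk spans the desc%nproc2 bands of the group.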
665  !
666  ierr = 0
667  IF (isgn.gt.0) THEN
668     !
669     IF (nproc3==1) GO TO 10
670     !
671     ! "forward" scatter from columns to planes
672     !
673     ! step one: store contiguously the slices
674     !
675!$omp parallel do collapse(3) private(kdest,kfrom)
676     DO iproc3 = 1, nproc3
677        DO me2 = me2_start, me2_end
678           DO k = 0, ncpx-1   ! was ncp_(me3)
679              IF (k>=ncp_(desc%iproc(me2,me3))) CYCLE ! control k from 0 to ncp_(desc%iproc(me2,me3))-1
680              kdest = ( iproc3 - 1 ) * sendsize + nr3px * ( me2_iproc3_offset( me2 - me2_start + 1, me3 ) + k )
681              kfrom = desc%nr3p_offset(iproc3) + desc%nr3x * ( me2_iproc3_offset( me2 - me2_start + 1, me3 ) + k )
682              DO i = 1, desc%nr3p( iproc3 )
683                 f_aux ( kdest + i ) =  f_in ( kfrom + i )
684              ENDDO
685           ENDDO
686        ENDDO
687     ENDDO
688!$omp end parallel do
689#if defined(__NON_BLOCKING_SCATTER)
690     DO iproc3 = 1, nproc3
691        CALL mpi_isend( f_aux( ( iproc3 - 1 ) * sendsize + 1 ), sendsize, &
692                        MPI_DOUBLE_COMPLEX, iproc3-1, me3, desc%comm3, &
693                        sh( iproc3 ), ierr )
694     ENDDO
695#endif
696     !
     ! step two: communication across the nproc3 group
698     !
699#if defined(__NON_BLOCKING_SCATTER)
700     DO iproc3 = 1, nproc3
701        !
702        ! now post the receive
703        !
704        CALL mpi_irecv( f_in( (iproc3-1)*sendsize + 1 ), sendsize, &
705                        MPI_DOUBLE_COMPLEX, iproc3-1, MPI_ANY_TAG, &
706                        desc%comm3, rh( iproc3 ), ierr )
707        !IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' forward receive info<>0', abs(ierr) )
708        !
709        !
710     ENDDO
711
712     call mpi_waitall( nproc3, sh, MPI_STATUSES_IGNORE, ierr )
713     !
714#else
715     CALL mpi_alltoall (f_aux(1), sendsize, MPI_DOUBLE_COMPLEX, f_in(1), &
716                        sendsize, MPI_DOUBLE_COMPLEX, desc%comm3, ierr)
717
718     IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
719#endif
720     !
10   CONTINUE
722     !
723#if defined(__NON_BLOCKING_SCATTER)
724     if (nproc3 > 1) call mpi_waitall( nproc3, rh, MPI_STATUSES_IGNORE, ierr )
725#endif
726     !
727!$omp parallel
728     ! ensures that no garbage is present in the output
729     !
730!$omp do
731     DO k = 1, desc%my_nr3p*my_nr1p_*desc%nr2x
732        f_aux(k) = (0.0_DP, 0.0_DP)
733     ENDDO
734!$omp end do
735!$omp do collapse(3) private(ip,it,mc,m1,m2,i1)
736     DO iproc3 = 1, desc%nproc3
737        DO me2 = me2_start, me2_end
738           DO i = 1, ncpx ! was ncp_(iproc3)
739              ip = desc%iproc( me2, iproc3)
740              IF ( i>ncp_(ip) ) CYCLE
741              it = ( iproc3 - 1 ) * sendsize + nr3px * ( me2_iproc3_offset( me2 - me2_start + 1, iproc3 ) + i - 1 )
742              mc = desc%ismap( i + desc%iss(ip) ) ! this is  m1+(m2-1)*nr1x  of the  current pencil
743              m1 = mod (mc-1,desc%nr1x) + 1
744              m2 = (mc-1)/desc%nr1x + 1
745              i1 = m2 + ( ir1p_(m1) - 1 ) * desc%nr2x
746              DO k = 1, desc%my_nr3p
747                 f_aux( i1 ) = f_in( k + it )
748                 i1 = i1 + desc%nr2x*my_nr1p_
749              ENDDO
750           ENDDO
751        ENDDO
752     ENDDO
753!$omp end do nowait
754!$omp end parallel
755     !
756  ELSE
757     !
758     !  "backward" scatter from planes to columns
759     !
760!$omp parallel do collapse(3) private(ip,it,mc,m1,m2,i1)
761     DO iproc3 = 1, desc%nproc3
762        DO me2 = me2_start, me2_end
763           DO i = 1, ncpx
764              ip = desc%iproc( me2, iproc3)
765              IF ( i>ncp_(ip) ) CYCLE
766              it = ( iproc3 - 1 ) * sendsize + nr3px * ( me2_iproc3_offset( me2 - me2_start + 1, iproc3 ) + i - 1 )
767              mc = desc%ismap( i + desc%iss(ip) ) ! this is  m1+(m2-1)*nr1x  of the  current pencil
768              m1 = mod (mc-1,desc%nr1x) + 1
769              m2 = (mc-1)/desc%nr1x + 1
770              i1 = m2 + ( ir1p_(m1) - 1 ) * desc%nr2x
771              DO k = 1, desc%my_nr3p
772                 f_in( k + it ) = f_aux( i1 )
773                 i1 = i1 + desc%nr2x * my_nr1p_
774              ENDDO
775           ENDDO
776        ENDDO
777     ENDDO
778!$omp end parallel do
779#if defined(__NON_BLOCKING_SCATTER)
780     DO iproc3 = 1, desc%nproc3
781        IF( nproc3 > 1 ) CALL mpi_isend( f_in( ( iproc3 - 1 ) * sendsize + 1 ), &
782                                        sendsize, MPI_DOUBLE_COMPLEX, iproc3-1, &
783                                        me3, desc%comm3, sh( iproc3 ), ierr )
784     ENDDO
785#endif
786
787     IF( nproc3 == 1 ) GO TO 20
788     !
789     !  step two: communication
790     !
791#if ! defined(__NON_BLOCKING_SCATTER)
792     CALL mpi_alltoall (f_in(1), sendsize, MPI_DOUBLE_COMPLEX, f_aux(1), &
793                        sendsize, MPI_DOUBLE_COMPLEX, desc%comm3, ierr)
794
795     IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
796#else
797     DO iproc3 = 1, desc%nproc3
798        CALL mpi_irecv( f_aux( (iproc3-1)*sendsize + 1 ), sendsize, &
799                        MPI_DOUBLE_COMPLEX, iproc3-1, MPI_ANY_TAG, &
800                        desc%comm3, rh(iproc3), ierr )
801        ! IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' backward receive info<>0', abs(ierr) )
802     ENDDO
803
804     call mpi_waitall( desc%nproc3, sh, MPI_STATUSES_IGNORE, ierr )
805     call mpi_waitall( desc%nproc3, rh, MPI_STATUSES_IGNORE, ierr )
806#endif
807     !
808     !  step one: store contiguously the columns
809     !
810!$omp parallel do collapse(3) private(kdest,kfrom)
811     DO iproc3 = 1, nproc3
812        DO me2 = me2_start, me2_end
813           DO k = 0, ncpx-1   ! was ncp_(me3)
814              IF (k>=ncp_(desc%iproc(me2,me3))) CYCLE ! control k from 0 to ncp_(desc%iproc(me2,me3))-1
815              kdest = ( iproc3 - 1 ) * sendsize + nr3px * ( me2_iproc3_offset( me2 - me2_start + 1, me3 ) + k )
816              kfrom = desc%nr3p_offset(iproc3) + desc%nr3x * ( me2_iproc3_offset( me2 - me2_start + 1, me3 ) + k )
817              DO i = 1, desc%nr3p( iproc3 )
818                 f_in ( kfrom + i ) = f_aux ( kdest + i )
819              ENDDO
820           ENDDO
821        ENDDO
822     ENDDO
823!$omp end parallel do
824
825     ! clean extra array elements in each stick
826
827     IF( desc%nr3x /= desc%nr3 ) THEN
828        DO k = 1, ncp_(desc%mype+1)
829           f_in(desc%nr3x*(k-1)+desc%nr3+1:desc%nr3x*k) = (0.0_DP, 0.0_DP)
830        END DO
831     END IF
832
20   CONTINUE
834
835  ENDIF
836
837  END SUBROUTINE impl_yz
838
839#endif
840
841END SUBROUTINE fft_scatter_yz
842!
843!-----------------------------------------------------------------------
844SUBROUTINE fft_scatter_many_yz ( desc, f_in, f_aux, isgn, howmany )
845  !-----------------------------------------------------------------------
846  !
847  ! transpose of the fft yz planes across the desc%comm3 communicator
848  !
  ! a) From Z-oriented columns to Y-oriented columns (isgn > 0)
850  !    Active columns (or sticks or pencils) along the Z direction for each
851  !    processor are stored consecutively and are such that they correspond
852  !    to a subset of the active X values.
853  !
854  !    The pencil -> slices transposition is performed in the subgroup
855  !    of processors (desc%comm3) owning these X values.
856  !
857  !    The transpose takes place in two steps:
858  !    1) on each processor the columns are sliced into sections along Z
859  !       that are stored one after the other. On each processor, slices for
  !       processor "iproc3" are desc%nr3p(iproc3)*desc%nsw(me) (or desc%nsp(me)) big.
861  !    2) all processors communicate to exchange slices (all columns with
862  !       Z in the slice belonging to "me" must be received, all the others
863  !       must be sent to "iproc3")
864  !
865  !    Finally one gets the "slice" representation: each processor has
866  !    desc%nr3p(mype3) Z values of all the active pencils along Y for the
867  !    X values of the current group. Data are organized with the Y index
868  !    running fastest, then the reordered X values, then Z.
869  !
870  !    f_in  contains the input Z columns, is destroyed on output
  !    f_aux contains the output Y columns.
872  !
873  !  b) From planes to columns (isgn < 0)
874  !
875  !    Quite the same in the opposite direction
876  !    f_aux contains the input Y columns, is destroyed on output
877  !    f_in  contains the output Z columns.
878  !
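  !    Note on the batched layout used below (descriptive only): as in
  !    fft_scatter_many_xy, howmany FFTs are handled at once, and band k
  !    (k = 0, ..., howmany-1) lives at offset k*desc%nnr of the Y-oriented array
  !    and at offset k*ncpx*nr3px inside each chunk of the Z-column array.
  !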
879  IMPLICIT NONE
880
881  TYPE (fft_type_descriptor), INTENT(in), TARGET :: desc
882  COMPLEX (DP), INTENT(inout)                    :: f_in(:), f_aux(:)
883  INTEGER, INTENT(in)                            :: isgn, howmany
884
885#if defined(__MPI)
886  !
887  INTEGER :: ierr, me, me2, me3, nproc3, iproc3, ncpx, nr3px, ip
888  INTEGER :: i, j, it, it0, k, kfrom, kdest, mc, m1, m2, i1, sendsize
889  INTEGER, POINTER :: ncp_(:), ir1p_(:)
890  INTEGER :: my_nr1p_
891  !
892  me     = desc%mype  + 1
893  me2    = desc%mype2 + 1
894  me3    = desc%mype3 + 1
895  nproc3 = desc%nproc3
896  !
897  if ( abs (isgn) == 1 ) then      ! It's a potential FFT
898     ncp_     => desc%nsp
899     ir1p_    => desc%ir1p
900     my_nr1p_ = count (desc%ir1p   > 0)
901  else if ( abs (isgn) == 2 ) then ! It's a wavefunction FFT
902     ncp_     => desc%nsw
903     ir1p_    => desc%ir1w
904     my_nr1p_ = count (desc%ir1w   > 0)
905  else if ( abs (isgn) == 3 ) then ! It's a wavefunction FFT with task group
     CALL fftx_error__ ('fft_scatter_many_yz', 'isgn = 3 (task groups): this should never happen', 1)
907  end if
908  !
909  CALL start_clock ('fft_scatt_many_yz')
910  !
911  ! calculate the message size
912  !
913  nr3px = MAXVAL ( desc%nr3p )  ! maximum number of Z values to be exchanged
914  ncpx  = MAXVAL ( ncp_ )       ! maximum number of Z columns to be exchanged
915
916  sendsize = howmany * ncpx * nr3px       ! dimension of the scattered chunks
917  !
918  ierr = 0
919  IF (isgn.gt.0) THEN
920     !
921     ! "forward" scatter from columns to planes
922     !
923     ! step one: store contiguously the slices
924     !
925     IF (nproc3==1) THEN
926!$omp parallel default(none)                                   &
927!$omp          private(i, j, k, it, mc, m1, m2, i1)            &
928!$omp          shared(howmany, ncpx, nr3px, desc, f_aux, f_in) &
929!$omp&         shared(ncp_, me2, ir1p_, my_nr1p_)
930!$omp do collapse(2)
931        DO k=0, howmany-1
932           DO j = 1, desc%my_nr3p*desc%nr2x*my_nr1p_
933              f_aux(j+k*desc%nnr) = (0.0_DP, 0.0_DP)
934           ENDDO
935        ENDDO
936!$omp end do
937!$omp do collapse(2)
938        DO k=0, howmany-1
939           DO j = 1, ncp_(desc%iproc( me2, 1))
940              it = (j-1)*nr3px + k*ncpx*nr3px
941              mc = desc%ismap( j + desc%iss(desc%iproc( me2, 1)) )
942              m1 = mod (mc-1,desc%nr1x) + 1
943              m2 = (mc-1)/desc%nr1x + 1
944              i1 = m2 + ( ir1p_(m1) - 1 ) * desc%nr2x
945              DO i = 1, desc%my_nr3p
946                 f_aux( i1 + k*desc%nnr ) = f_in( it + i)
947                 i1 = i1 + desc%nr2x * my_nr1p_
948              ENDDO
949           ENDDO
950        ENDDO
951!$omp end do
952!$omp end parallel
953     ELSE
954!$omp parallel default(none)                                   &
955!$omp          private(i, j, k, kdest, kfrom, iproc3)          &
956!$omp&         private(it, mc, m1, m2, i1)                     &
957!$omp&         shared(nproc3, howmany, ncpx, sendsize, nr3px)  &
958!$omp&         shared(ncp_, ir1p_, my_nr1p_, ierr)             &
959!$omp&         shared(desc, f_aux, f_in, me2, me3)
960!$omp do collapse(3)
961        DO k = 0, howmany-1
962           DO iproc3 = 1, nproc3
963              DO j = 0, ncp_(desc%iproc(me2,me3))-1
964                 kdest = ( iproc3 - 1 ) * sendsize + nr3px * (j + k*ncpx)
965                 kfrom = desc%nr3p_offset(iproc3)  + desc%nr3x * (j + k*ncpx)
966                 DO i = 1, desc%nr3p( iproc3 )
967                    f_aux ( kdest + i ) =  f_in ( kfrom + i )
968                 ENDDO
969              ENDDO
970           ENDDO
971        ENDDO
972!$omp end do
973        !
        ! no need to zero the output here: the elements accessed later are
        ! overwritten after the all-to-all step
        !
        ! step two: communication across the nproc3 group
978        !
979!$omp single
980        CALL mpi_alltoall (f_aux(1), sendsize, MPI_DOUBLE_COMPLEX, f_in(1), &
981                           sendsize, MPI_DOUBLE_COMPLEX, desc%comm3, ierr)
982
983        IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
984!$omp end single
985        !
986!$omp do collapse(2)
987        DO k=0, howmany-1
988           DO j = 1, desc%my_nr3p*desc%nr2x*my_nr1p_
989              f_aux(j+k*desc%nnr) = (0.0_DP, 0.0_DP)
990           ENDDO
991        ENDDO
992!$omp end do
993!$omp do collapse(3)
994        DO k=0, howmany-1
995           DO iproc3 = 1, nproc3
996              DO j = 1, ncpx
997                 IF ( j>ncp_(desc%iproc( me2, iproc3)) ) CYCLE
998                 it = (iproc3 - 1) * sendsize + (j-1)*nr3px + k*ncpx*nr3px
999                 mc = desc%ismap( j + desc%iss(desc%iproc( me2, iproc3)) )
1000                 m1 = mod (mc-1,desc%nr1x) + 1
1001                 m2 = (mc-1)/desc%nr1x + 1
1002                 i1 = m2 + ( ir1p_(m1) - 1 ) * desc%nr2x
1003                 DO i = 1, desc%my_nr3p
1004                    f_aux( i1 + k*desc%nnr ) = f_in( it + i)
1005                    i1 = i1 + desc%nr2x * my_nr1p_
1006                 ENDDO
1007              ENDDO
1008           ENDDO
1009        ENDDO
1010!$omp end do
1011!$omp end parallel
1012     ENDIF
1013     !
1014  ELSE
1015     !
1016     !  "backward" scatter from planes to columns
1017     !
1018     IF( nproc3 == 1 ) THEN
1019!$omp parallel default(none)                                   &
1020!$omp          private(i, j, k, it, mc, m1, m2, i1)            &
1021!$omp          shared(howmany, ncpx, nr3px, desc, f_aux, f_in) &
1022!$omp&         shared(ncp_, me2, ir1p_, my_nr1p_)
1023!$omp do collapse(2)
1024        DO k = 0, howmany - 1
1025           DO j = 1, ncp_(desc%iproc(me2,1))
1026              it = (j-1)*nr3px + k*ncpx*nr3px
1027              mc = desc%ismap( j + desc%iss(desc%iproc(me2,1)))
1028              m1 = mod (mc-1,desc%nr1x) + 1
1029              m2 = (mc-1)/desc%nr1x + 1
1030              i1 = m2 + ( ir1p_(m1) - 1 ) * desc%nr2x
1031              DO i = 1, desc%my_nr3p
1032                 f_in( i + it ) = f_aux( i1  + k*desc%nnr )
1033                 i1 = i1 + desc%nr2x*my_nr1p_
1034              ENDDO
1035           ENDDO
1036        ENDDO
1037!$omp end do
1038!$omp end parallel
1039     ELSE
1040     !
1041!$omp parallel default(none)                                             &
1042!$omp          private(iproc3, i, j, k, ip, it, mc, m1, m2, i1, it0,kdest,kfrom)     &
1043!$omp          shared(nproc3, howmany, ncpx, sendsize, nr3px, desc, me3) &
1044!$omp&         shared(f_aux, f_in, ncp_, me2, ir1p_, my_nr1p_, ierr)
1045!$omp do collapse(3)
1046        DO k = 0, howmany - 1
1047           DO iproc3 = 1, nproc3
1048              DO j = 1, ncpx
1049                 IF ( j>ncp_(desc%iproc( me2, iproc3)) ) CYCLE
1050                 it = (iproc3-1) * sendsize + (j-1)*nr3px + k*ncpx*nr3px
1051                 mc = desc%ismap( j + desc%iss(desc%iproc( me2, iproc3)) )
1052                 m1 = mod (mc-1,desc%nr1x) + 1
1053                 m2 = (mc-1)/desc%nr1x + 1
1054                 i1 = m2 + ( ir1p_(m1) - 1 ) * desc%nr2x
1055                 DO i = 1, desc%my_nr3p
1056                    f_in( i + it ) = f_aux( i1  + k*desc%nnr )
1057                    i1 = i1 + desc%nr2x*my_nr1p_
1058                 ENDDO
1059              ENDDO
1060           ENDDO
1061        ENDDO
1062!$omp end do
1063        !
1064!$omp single
1065        !
1066        !  step two: communication
1067        !
1068        CALL mpi_alltoall (f_in(1), sendsize, MPI_DOUBLE_COMPLEX, f_aux(1), &
1069                           sendsize, MPI_DOUBLE_COMPLEX, desc%comm3, ierr)
1070
1071        IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
1072!$omp end single
1073        !
1074        !  step one: store contiguously the columns
1075        !
1076!$omp do collapse(3)
1077        DO k = 0, howmany-1
1078           DO iproc3 = 1, nproc3
1079              DO j = 0, ncp_(desc%iproc(me2,me3))-1
1080                 kdest = ( iproc3 - 1 ) * sendsize + nr3px * (j + k*ncpx)
1081                 kfrom = desc%nr3p_offset(iproc3)  + desc%nr3x * (j + k*ncpx)
1082                 DO i = 1, desc%nr3p( iproc3 )
1083                    f_in( kfrom + i ) = f_aux( kdest + i )
1084                 ENDDO
1085              ENDDO
1086           ENDDO
1087        ENDDO
1088!$omp end do
1089        ! clean extra array elements in each stick
1090        IF( desc%nr3x /= desc%nr3 ) THEN
1091!$omp do collapse(2)
1092           DO k = 0, howmany-1
1093              DO j = 1, ncp_ ( desc%mype+1 )
1094                 DO i = desc%nr3+1, desc%nr3x
                    f_in( k*ncpx*desc%nr3x + (j-1)*desc%nr3x + i) = (0.0_DP, 0.0_DP)
1096                 END DO
1097              END DO
1098           END DO
1099!$omp end do
1100        ENDIF
1101!$omp end parallel
1102     ENDIF
1103     !
1104  ENDIF
1105
1106  CALL stop_clock ('fft_scatt_many_yz')
1107
1108#endif
1109
1110  RETURN
98 format ( 10 ('(',2f12.9,')') )
99 format ( 20 ('(',2f12.9,')') )
1113
1114END SUBROUTINE fft_scatter_many_yz
1115
1116!-----------------------------------------------------------------------
1117SUBROUTINE fft_scatter_tg ( desc, f_in, f_aux, nxx_, isgn )
1118  !-----------------------------------------------------------------------
1119  !
1120  ! task group wavefunction redistribution
1121  !
1122  ! a) (isgn >0 ) From many-wfc partial-plane arrangement to single-wfc whole-plane one
1123  !
1124  ! b) (isgn <0 ) From single-wfc whole-plane arrangement to many-wfc partial-plane one
1125  !
1126  ! in both cases:
1127  !    f_in  contains the input data, is overwritten with the desired output
1128  !    f_aux is used as working array, may contain garbage in output
1129  !
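  ! The redistribution is a single MPI_ALLTOALLV on desc%comm2 (descriptive note
  ! on the calls below): in the forward direction desc%tg_snd(ip) / desc%tg_sdsp(ip)
  ! are the count / displacement of the data sent to process ip of the task group
  ! and desc%tg_rcv(ip) / desc%tg_rdsp(ip) those of the data received from it;
  ! the backward direction simply swaps the two sets of counts and displacements.
  !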
1130  IMPLICIT NONE
1131
1132  TYPE (fft_type_descriptor), INTENT(in) :: desc
1133  INTEGER, INTENT(in)           :: nxx_, isgn
1134  COMPLEX (DP), INTENT(inout)   :: f_in (nxx_), f_aux (nxx_)
1135
1136  INTEGER :: ierr
1137
1138  CALL start_clock ('fft_scatt_tg')
1139
1140  if ( abs (isgn) /= 3 ) call fftx_error__ ('fft_scatter_tg', 'wrong call', 1 )
1141
1142#if defined(__MPI)
1143  !
1144  f_aux = f_in
1145  if ( isgn > 0 ) then
1146
1147     CALL MPI_ALLTOALLV( f_aux,  desc%tg_snd, desc%tg_sdsp, MPI_DOUBLE_COMPLEX, &
1148                         f_in, desc%tg_rcv, desc%tg_rdsp, MPI_DOUBLE_COMPLEX, desc%comm2, ierr)
1149     IF( ierr /= 0 ) CALL fftx_error__( 'fft_scatter_tg', ' alltoall error 1 ', abs(ierr) )
1150
1151  else
1152
1153     CALL MPI_ALLTOALLV( f_aux,  desc%tg_rcv, desc%tg_rdsp, MPI_DOUBLE_COMPLEX, &
1154                         f_in, desc%tg_snd, desc%tg_sdsp, MPI_DOUBLE_COMPLEX, desc%comm2, ierr)
1155     IF( ierr /= 0 ) CALL fftx_error__( 'fft_scatter_tg', ' alltoall error 2 ', abs(ierr) )
1156   end if
1157
1158#endif
1159  CALL stop_clock ('fft_scatt_tg')
1160
1161  RETURN
1162
1163END SUBROUTINE fft_scatter_tg
1164!
1165!-----------------------------------------------------------------------
1166SUBROUTINE fft_scatter_tg_opt ( desc, f_in, f_out, nxx_, isgn )
1167  !-----------------------------------------------------------------------
1168  !
1169  ! task group wavefunction redistribution
1170  !
1171  ! a) (isgn >0 ) From many-wfc partial-plane arrangement to single-wfc whole-plane one
1172  !
1173  ! b) (isgn <0 ) From single-wfc whole-plane arrangement to many-wfc partial-plane one
1174  !
1175  ! in both cases:
1176  !    f_in  contains the input data
1177  !    f_out contains the output data
1178  !
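  ! Unlike fft_scatter_tg, which works in place and must first copy f_in into the
  ! work array, this variant writes directly into a distinct output array and thus
  ! saves that copy. A usage sketch (psi_many and psi_single are hypothetical
  ! arrays, each dimensioned at least nxx_):
  !    CALL fft_scatter_tg_opt( desc, psi_many, psi_single, nxx_, 3 )
  !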
1179  IMPLICIT NONE
1180
1181  TYPE (fft_type_descriptor), INTENT(in) :: desc
1182  INTEGER, INTENT(in)           :: nxx_, isgn
1183  COMPLEX (DP), INTENT(inout)   :: f_in (nxx_), f_out (nxx_)
1184
1185  INTEGER :: ierr
1186
1187  CALL start_clock ('fft_scatt_tg')
1188
  if ( abs (isgn) /= 3 ) call fftx_error__ ('fft_scatter_tg_opt', 'wrong call', 1 )
1190
1191#if defined(__MPI)
1192  !
1193  if ( isgn > 0 ) then
1194
1195     CALL MPI_ALLTOALLV( f_in,  desc%tg_snd, desc%tg_sdsp, MPI_DOUBLE_COMPLEX, &
1196                         f_out, desc%tg_rcv, desc%tg_rdsp, MPI_DOUBLE_COMPLEX, desc%comm2, ierr)
     IF( ierr /= 0 ) CALL fftx_error__( 'fft_scatter_tg_opt', ' alltoall error 1 ', abs(ierr) )
1198
1199  else
1200
1201     CALL MPI_ALLTOALLV( f_in,  desc%tg_rcv, desc%tg_rdsp, MPI_DOUBLE_COMPLEX, &
1202                         f_out, desc%tg_snd, desc%tg_sdsp, MPI_DOUBLE_COMPLEX, desc%comm2, ierr)
     IF( ierr /= 0 ) CALL fftx_error__( 'fft_scatter_tg_opt', ' alltoall error 2 ', abs(ierr) )
1204   end if
1205
1206#endif
1207  CALL stop_clock ('fft_scatt_tg')
1208
1209  RETURN
1210
1211END SUBROUTINE fft_scatter_tg_opt
1212!
1213!=----------------------------------------------------------------------=!
1214   END MODULE fft_scatter
1215!=----------------------------------------------------------------------=!
1216