!
! Copyright (C) Quantum ESPRESSO group
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!----------------------------------------------------------------------
! Basic scatter operations needed by parallel FFT
! Written by Carlo Cavazzoni and Stefano de Gironcoli, modified by PG
!
!----------------------------------------------------------------------
!
!=----------------------------------------------------------------------=!
   MODULE fft_scatter
!=----------------------------------------------------------------------=!

   USE fft_types, ONLY: fft_type_descriptor
   USE fft_param

   IMPLICIT NONE

   SAVE

   PRIVATE

   PUBLIC :: fft_scatter_xy, fft_scatter_yz, fft_scatter_many_xy, fft_scatter_many_yz
   PUBLIC :: fft_scatter_tg, fft_scatter_tg_opt

!=----------------------------------------------------------------------=!
   CONTAINS
!=----------------------------------------------------------------------=!
!
!
!-----------------------------------------------------------------------
SUBROUTINE fft_scatter_xy ( desc, f_in, f_aux, nxx_, isgn, comm )
  !-----------------------------------------------------------------------
  !
  ! Transpose of the fft xy planes across the comm communicator.
  ! If the optional comm is not provided as input, the transpose is done
  ! across the desc%comm2 communicator.
  !
  ! a) From Y-oriented columns to X-oriented partial slices (isgn > 0)
  !    Active columns along the Y direction corresponding to a subset of the
  !    active X values and a range of Z values (in this order) are stored
  !    consecutively for each processor and are such that the subgroup owns
  !    all data for a range of Z values.
  !
  !    The Y pencil -> X-oriented partial slices transposition is performed
  !    in the subgroup of processors (desc%comm2) owning this range of Z values.
  !
  !    The transpose takes place in two steps:
  !    1) on each processor the columns are sliced into sections along Y
  !       that are stored one after the other. On each processor, slices for
  !       processor "iproc2" are desc%nr2p(iproc2)*desc%nr1p(me2)*desc%my_nr3p big.
  !    2) all processors communicate to exchange slices (all sections of columns with
  !       Y in the slice belonging to "me" must be received, all the others
  !       must be sent to "iproc2")
  !
  !    Finally one gets the "partial slice" representation: each processor has
  !    all the X values of desc%my_nr2p Y and desc%my_nr3p Z values.
  !    Data are organized with the X index running fastest, then Y, then Z.
  !
  !    f_in  contains the input Y columns, is destroyed on output
  !    f_aux contains the output X-oriented partial slices.
  !
  ! b) From planes to columns (isgn < 0)
  !
  !    Quite the same in the opposite direction
  !    f_aux contains the input X-oriented partial slices, is destroyed on output
  !    f_in  contains the output Y columns.
  !
  IMPLICIT NONE

  TYPE (fft_type_descriptor), INTENT(in) :: desc
  INTEGER, INTENT(in)           :: nxx_, isgn
  COMPLEX (DP), INTENT(inout)   :: f_in (nxx_), f_aux (nxx_)
  INTEGER, OPTIONAL, INTENT(in) :: comm
  INTEGER :: nr1_temp(1), comm_

#if defined(__MPI)
!$omp master
  CALL start_clock ('fft_scatt_xy')
!$omp end master

  IF (PRESENT(comm)) THEN
    comm_ = comm
  ELSE
    comm_ = desc%comm2
  ENDIF
  !
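  ! The absolute value of isgn selects the set of active Y columns on which
  ! the transpose operates (see the dispatch just below): |isgn| = 1 for the
  ! potential/density grid (desc%nr1p, desc%indp, desc%iplp), |isgn| = 2 for
  ! the wavefunction grid (desc%nr1w, desc%indw, desc%iplw), and |isgn| = 3
  ! for the wavefunction grid with task groups, where whole Y columns stay on
  ! one process and the exchange degenerates to a local reshuffle (nproc2 = 1
  ! is passed to impl_xy).
  !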
  if ( abs (isgn) == 1 ) then          ! It's a potential FFT
     CALL impl_xy( MAXVAL ( desc%nr2p ), desc%nproc2, desc%my_nr2p, desc%nr1p, desc%indp, desc%iplp, comm_)
  else if ( abs (isgn) == 2 ) then     ! It's a wavefunction FFT
     CALL impl_xy( MAXVAL ( desc%nr2p ), desc%nproc2, desc%my_nr2p, desc%nr1w, desc%indw, desc%iplw, comm_)
  else if ( abs (isgn) == 3 ) then     ! It's a wavefunction FFT with task groups
     ! in task group FFTs whole Y columns are distributed
     nr1_temp = desc%nr1w_tg
     CALL impl_xy( desc%nr2x, 1, desc%nr2x, nr1_temp, desc%indw_tg, desc%iplw, comm_)
  end if
  !
!$omp master
  CALL stop_clock ('fft_scatt_xy')
!$omp end master

  RETURN

  CONTAINS

  SUBROUTINE impl_xy(nr2px, nproc2, my_nr2p, nr1p_, indx, iplx, mpi_comm)
  IMPLICIT NONE
  !
  INTEGER, INTENT(in) :: nr2px, nproc2, my_nr2p
  INTEGER, INTENT(in) :: nr1p_(nproc2), indx(desc%nr1x,nproc2), iplx(desc%nr1x), mpi_comm
  !
  INTEGER :: ierr, me2, iproc2, ncpx
  INTEGER :: i, it, j, k, kfrom, kdest, mc, m1, m3, i1, icompact, sendsize
  !
#if defined(__NON_BLOCKING_SCATTER)
  INTEGER :: sh(desc%nproc2), rh(desc%nproc2)
#endif

  me2  = desc%mype2 + 1
  ncpx = MAXVAL(nr1p_) * desc%my_nr3p  ! maximum number of Y columns to be distributed
  ! calculate the message size
  sendsize = ncpx * nr2px              ! dimension of the scattered chunks (safe value)

  ierr = 0
  IF (isgn.gt.0) THEN
     !
     IF (nproc2==1) GO TO 10
     !
     ! "forward" scatter from columns to planes
     !
     ! step one: store contiguously the slices
     !
!$omp parallel do collapse(2) private(kdest,kfrom)
     DO iproc2 = 1, nproc2
        DO k = 0, nr1p_(me2)*desc%my_nr3p-1
           kdest = ( iproc2 - 1 ) * sendsize + nr2px * k
           kfrom = desc%nr2p_offset(iproc2) + desc%nr2x * k
           DO i = 1, desc%nr2p( iproc2 )
              f_aux ( kdest + i ) = f_in ( kfrom + i )
           ENDDO
        ENDDO
     ENDDO
!$omp end parallel do
#if defined(__NON_BLOCKING_SCATTER)
     DO iproc2 = 1, nproc2
        CALL mpi_isend( f_aux( (iproc2-1)*sendsize + 1 ), sendsize, &
                        MPI_DOUBLE_COMPLEX, iproc2-1, me2, mpi_comm, &
                        sh( iproc2 ), ierr )
     ENDDO
#endif
     !
     ! step two: communication across the nproc2 group
     !
#if defined(__NON_BLOCKING_SCATTER)
     DO iproc2 = 1, nproc2
        !
        ! now post the receive
        !
        CALL mpi_irecv( f_in( (iproc2-1)*sendsize + 1 ), sendsize, &
                        MPI_DOUBLE_COMPLEX, iproc2-1, MPI_ANY_TAG, &
                        mpi_comm, rh( iproc2 ), ierr )
        !IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' forward receive info<>0', abs(ierr) )
     ENDDO

     call mpi_waitall( nproc2, sh, MPI_STATUSES_IGNORE, ierr )
     !
#else
     CALL mpi_alltoall (f_aux(1), sendsize, MPI_DOUBLE_COMPLEX, f_in(1), &
                        sendsize, MPI_DOUBLE_COMPLEX, mpi_comm, ierr)
     !
     IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
#endif
     !
10   CONTINUE
     !
#if defined(__NON_BLOCKING_SCATTER)
     if (nproc2 > 1) call mpi_waitall( nproc2, rh, MPI_STATUSES_IGNORE, ierr )
#endif
     !
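     ! At this point the chunk received from processor "iproc2" contains, for
     ! each of its Y columns, the my_nr2p Y values that belong to this process.
     ! Column i of that chunk corresponds to X index m1 = indx(i1,iproc2) and
     ! local Z plane m3, and is scattered into the X-fastest layout below:
     ! successive Y values (index j) of the same column end up desc%nr1x apart.
     ! Illustrative example (hypothetical sizes): with desc%nr1x = 18 and
     ! m1 = 5, m3 = 1, the element for j = 1 goes to f_aux(5) and the element
     ! for j = 2 goes to f_aux(23).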
!$omp parallel
!$omp do collapse(2) private(it,m3,i1,m1,icompact)
     DO iproc2 = 1, nproc2
        DO i = 0, ncpx-1
           IF(i>=nr1p_(iproc2)*desc%my_nr3p) CYCLE ! control i from 0 to nr1p_(iproc2)*desc%my_nr3p-1
           it = ( iproc2 - 1 ) * sendsize + nr2px * i
           m3 = i/nr1p_(iproc2)+1
           i1 = mod(i,nr1p_(iproc2))+1
           m1 = indx(i1,iproc2)
           icompact = m1 + (m3-1)*desc%nr1x*my_nr2p
           DO j = 1, my_nr2p
              !f_aux( m1 + (j-1)*desc%nr1x + (m3-1)*desc%nr1x*my_nr2p ) = f_in( j + it )
              f_aux( icompact ) = f_in( j + it )
              icompact = icompact + desc%nr1x
           ENDDO
        ENDDO
     ENDDO
!$omp end do nowait
     !
     ! clean extra array elements in each stick
     !
!$omp do
     DO k = 1, my_nr2p*desc%my_nr3p
        DO i1 = 1, desc%nr1x
           IF(iplx(i1)==0) f_aux(desc%nr1x*(k-1)+i1) = (0.0_DP, 0.0_DP)
        ENDDO
     ENDDO
!$omp end do nowait
!$omp end parallel
     !
  ELSE
     !
     ! "backward" scatter from planes to columns
     !
!$omp parallel do collapse(2) private(it,m3,i1,m1,icompact)
     DO iproc2 = 1, nproc2
        DO i = 0, ncpx-1
           IF(i>=nr1p_(iproc2)*desc%my_nr3p) CYCLE ! control i from 0 to nr1p_(iproc2)*desc%my_nr3p-1
           it = ( iproc2 - 1 ) * sendsize + nr2px * i
           m3 = i/nr1p_(iproc2)+1
           i1 = mod(i,nr1p_(iproc2))+1
           m1 = indx(i1,iproc2)
           icompact = m1 + (m3-1)*desc%nr1x*my_nr2p
           DO j = 1, my_nr2p
              !f_in( j + it ) = f_aux( m1 + (j-1)*desc%nr1x + (m3-1)*desc%nr1x*my_nr2p )
              f_in( j + it ) = f_aux( icompact )
              icompact = icompact + desc%nr1x
           ENDDO
        ENDDO
     ENDDO
!$omp end parallel do
#if defined(__NON_BLOCKING_SCATTER)
     DO iproc2 = 1, nproc2
        IF( nproc2 > 1 ) CALL mpi_isend( f_in( ( iproc2 - 1 ) * sendsize + 1 ), &
                                         sendsize, MPI_DOUBLE_COMPLEX, iproc2-1, me2, &
                                         mpi_comm, sh( iproc2 ), ierr )
     ENDDO
#endif
     IF (nproc2==1) GO TO 20

     !
     ! step two: communication
     !
#if ! defined(__NON_BLOCKING_SCATTER)
     CALL mpi_alltoall (f_in(1), sendsize, MPI_DOUBLE_COMPLEX, f_aux(1), &
                        sendsize, MPI_DOUBLE_COMPLEX, mpi_comm, ierr)

     IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
#else
     DO iproc2 = 1, nproc2
        CALL mpi_irecv( f_aux( (iproc2-1)*sendsize + 1 ), sendsize, &
                        MPI_DOUBLE_COMPLEX, iproc2-1, MPI_ANY_TAG, &
                        mpi_comm, rh(iproc2), ierr )
        ! IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' backward receive info<>0', abs(ierr) )
     ENDDO

     call mpi_waitall( nproc2, sh, MPI_STATUSES_IGNORE, ierr )
     call mpi_waitall( nproc2, rh, MPI_STATUSES_IGNORE, ierr )
#endif
     !
     ! step one: store contiguously the columns
     !
!$omp parallel do collapse(2) private(kdest,kfrom)
     DO iproc2 = 1, nproc2
        DO k = 0, nr1p_(me2)*desc%my_nr3p-1
           kdest = ( iproc2 - 1 ) * sendsize + nr2px * k
           kfrom = desc%nr2p_offset(iproc2) + desc%nr2x * k
           DO i = 1, desc%nr2p( iproc2 )
              f_in ( kfrom + i ) = f_aux ( kdest + i )
           ENDDO
        ENDDO
     ENDDO
!$omp end parallel do
     !
     ! clean extra array elements in each stick
     !
     IF( desc%nr2x /= desc%nr2 ) THEN
        DO k = 1, nr1p_(me2)*desc%my_nr3p
           f_in(desc%nr2x*(k-1)+desc%nr2+1:desc%nr2x*k) = (0.0_DP, 0.0_DP)
        ENDDO
     ENDIF

20   CONTINUE

  ENDIF

  END SUBROUTINE impl_xy

#endif

END SUBROUTINE fft_scatter_xy
!
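!
! Illustrative usage sketch (not part of this module): in a parallel forward
! 3D FFT driver the two scatter routines are typically chained as below.
! "dfft" stands for an initialized fft_type_descriptor; the 1D FFT steps are
! placeholders for whatever low-level backend the driver uses.
!
!    ! 1D FFTs along Z on the Z-sticks stored in f
!    CALL fft_scatter_yz ( dfft, f, aux, dfft%nnr, 2 )   ! Z-sticks -> Y-sticks (in aux)
!    ! 1D FFTs along Y on aux
!    CALL fft_scatter_xy ( dfft, aux, f, dfft%nnr, 2 )   ! Y-sticks -> X-slices (in f)
!    ! 1D FFTs along X on f
!
! The backward transform makes the same calls with isgn = -2 in reverse order.
!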
!-----------------------------------------------------------------------
SUBROUTINE fft_scatter_many_xy ( desc, f_in, f_aux, isgn, howmany )
  !-----------------------------------------------------------------------
  !
  ! transpose of the fft xy planes across the desc%comm2 communicator
  !
  ! a) From Y-oriented columns to X-oriented partial slices (isgn > 0)
  !    Active columns along the Y direction corresponding to a subset of the
  !    active X values and a range of Z values (in this order) are stored
  !    consecutively for each processor and are such that the subgroup owns
  !    all data for a range of Z values.
  !
  !    The Y pencil -> X-oriented partial slices transposition is performed
  !    in the subgroup of processors (desc%comm2) owning this range of Z values.
  !
  !    The transpose takes place in two steps:
  !    1) on each processor the columns are sliced into sections along Y
  !       that are stored one after the other. On each processor, slices for
  !       processor "iproc2" are desc%nr2p(iproc2)*desc%nr1p(me2)*desc%my_nr3p big.
  !    2) all processors communicate to exchange slices (all sections of columns with
  !       Y in the slice belonging to "me" must be received, all the others
  !       must be sent to "iproc2")
  !
  !    Finally one gets the "partial slice" representation: each processor has
  !    all the X values of desc%my_nr2p Y and desc%my_nr3p Z values.
  !    Data are organized with the X index running fastest, then Y, then Z.
  !
  !    f_in  contains the input Y columns, is destroyed on output
  !    f_aux contains the output X-oriented partial slices.
  !
  ! b) From planes to columns (isgn < 0)
  !
  !    Quite the same in the opposite direction
  !    f_aux contains the input X-oriented partial slices, is destroyed on output
  !    f_in  contains the output Y columns.
  !
  IMPLICIT NONE

  TYPE (fft_type_descriptor), INTENT(in), TARGET :: desc
  INTEGER, INTENT(in) :: isgn
  INTEGER, INTENT(in) :: howmany

  COMPLEX (DP), INTENT(inout) :: f_in (:), f_aux (:)

#if defined(__MPI)
  !
  INTEGER :: ierr, me2, nproc2, iproc2, ncpx, nr2px, ip, icompact
  INTEGER :: i, j, it, it0, k, kfrom, kdest, offset, m1, m2, i1, sendsize
  INTEGER, POINTER :: iplx(:), nr1p_(:), indx(:,:)
  !
  me2    = desc%mype2 + 1
  nproc2 = desc%nproc2
  !
  if ( abs (isgn) == 1 ) then          ! It's a potential FFT
     iplx  => desc%iplp
     nr1p_ => desc%nr1p
     indx  => desc%indp
  else if ( abs (isgn) == 2 ) then     ! It's a wavefunction FFT
     iplx  => desc%iplw
     nr1p_ => desc%nr1w
     indx  => desc%indw
  else if ( abs (isgn) == 3 ) then     ! It's a wavefunction FFT with task groups
     print *, "ERROR, this should never happen!"
  end if
  !
  CALL start_clock ('fft_scatt_many_xy')
  !
  ! calculate the message size
  !
  nr2px = MAXVAL ( desc%nr2p )                  ! maximum number of Y values to be exchanged
  ncpx  = MAXVAL ( nr1p_ ) * MAXVAL(desc%nr3p)  ! maximum number of Y columns to be exchanged

  sendsize = howmany * ncpx * nr2px             ! dimension of the scattered chunks
  !
  ierr = 0
  !
  IF (isgn.gt.0) THEN
     !
     ! "forward" scatter from columns to planes
     !
     ! step one: store contiguously the slices
     !
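     ! Batched layout: band k (k = 0, ..., howmany-1) occupies the slot
     ! k*ncpx*nr2px+1 : (k+1)*ncpx*nr2px inside each per-process block of size
     ! sendsize = howmany*ncpx*nr2px, so a single alltoall moves all bands at
     ! once; after unpacking, band k starts at offset k*desc%nnr in f_aux.
     !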
     offset = 0

     IF (nproc2==1) THEN
!$omp parallel default(none) &
!$omp          private(i, j, k, ip, it, m1, m2, i1, icompact) &
!$omp          shared(howmany, ncpx, nr2px, nr1p_, iplx) &
!$omp&         shared(desc, f_aux, f_in, indx)
        ip = nr1p_(1) * desc%my_nr3p + 1
!$omp do
        DO k=0, howmany-1
           DO j = 1, ncpx
              IF ( j >= ip ) CYCLE
              it = (j - 1) * nr2px + k * ncpx * nr2px
              m2 = (j - 1) / nr1p_(1) + 1
              i1 = mod((j - 1),nr1p_(1)) + 1
              m1 = indx(i1,1)
              icompact = m1 + (m2 - 1) * desc%nr1x * desc%my_nr2p
              DO i = 1, desc%my_nr2p
                 f_aux( icompact + k * desc%nnr ) = f_in( i + it )
                 icompact = icompact + desc%nr1x
              ENDDO
           ENDDO
        ENDDO
!$omp end do
!$omp do
        DO k=0, howmany-1
           DO j = 1, desc%my_nr2p*desc%my_nr3p
              DO i1 = 1, desc%nr1x
                 IF(iplx(i1)==0) f_aux(desc%nr1x*(j-1)+i1+k*desc%nnr) = (0.0_DP, 0.0_DP)
              ENDDO
           ENDDO
        ENDDO
!$omp end do
!$omp end parallel
     ELSE
!$omp parallel default(none) &
!$omp          private(iproc2, i, j, k, kdest, kfrom) &
!$omp          shared(nproc2, howmany, ncpx, sendsize, nr1p_) &
!$omp&         shared(nr2px, desc, f_aux, f_in, me2)
!$omp do
        DO k = 0, howmany-1
           DO iproc2 = 1, nproc2
              DO j = 0, nr1p_(me2) * desc%my_nr3p - 1
                 kdest = ( iproc2 - 1 ) * sendsize + nr2px * (j + k*ncpx)
                 kfrom = desc%nr2p_offset(iproc2) + desc%nr2x * (j + k*ncpx)
                 DO i = 1, desc%nr2p( iproc2 )
                    f_aux ( kdest + i ) = f_in ( kfrom + i )
                 ENDDO
              ENDDO
           ENDDO
        ENDDO
!$omp end do
!$omp end parallel

        CALL mpi_alltoall (f_aux(1), sendsize, MPI_DOUBLE_COMPLEX, f_in(1), &
                           sendsize, MPI_DOUBLE_COMPLEX, desc%comm2, ierr)

        IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )

!$omp parallel default(none) &
!$omp          private(iproc2, i, j, k, ip, it, m1, m2, icompact, it0, i1) &
!$omp          shared(nproc2, howmany, ncpx, sendsize, nr2px, indx) &
!$omp&         shared(desc, f_aux, f_in, iplx, nr1p_)
!$omp do
        DO k=0, howmany-1
           DO iproc2 = 1, nproc2
              ip  = nr1p_(iproc2) * desc%my_nr3p + 1
              it0 = ( iproc2 - 1 ) * sendsize
              DO j = 1, ncpx
                 IF ( j >= ip ) CYCLE
                 it = it0 + (j - 1) * nr2px + k * ncpx * nr2px
                 m2 = (j - 1) / nr1p_(iproc2) + 1
                 i1 = mod((j - 1),nr1p_(iproc2)) + 1
                 m1 = indx(i1,iproc2)
                 icompact = m1 + (m2 - 1) * desc%nr1x * desc%my_nr2p
                 DO i = 1, desc%my_nr2p
                    f_aux( icompact + k*desc%nnr ) = f_in( i + it )
                    icompact = icompact + desc%nr1x
                 ENDDO
              ENDDO
           ENDDO
        ENDDO
!$omp end do
!$omp do
        DO k=0, howmany-1
           DO j = 1, desc%my_nr2p*desc%my_nr3p
              DO i1 = 1, desc%nr1x
                 IF(iplx(i1)==0) f_aux(desc%nr1x*(j-1)+i1+k*desc%nnr) = (0.0_DP, 0.0_DP)
              ENDDO
           ENDDO
        ENDDO
!$omp end do
!$omp end parallel
     ENDIF
     !
  ELSE
     !
     ! "backward" scatter from planes to columns
     !
     IF( nproc2 == 1 ) THEN
!$omp parallel default(none) &
!$omp          private(i, j, k, ip, it, m1, m2, i1, icompact) &
!$omp          shared(howmany, ncpx, nr2px, nr1p_) &
!$omp&         shared(desc, f_aux, f_in, indx)
        ip = nr1p_(1) * desc%my_nr3p + 1
!$omp do
        DO k = 0, howmany - 1
           DO j = 1, ncpx
              IF ( j >= ip ) CYCLE
              it = (j - 1) * nr2px + k * ncpx * nr2px
              m2 = (j - 1) / nr1p_(1) + 1
              i1 = mod((j - 1),nr1p_(1)) + 1
              m1 = indx(i1,1)
              icompact = m1 + (m2 - 1) * desc%nr1x * desc%my_nr2p
              DO i = 1, desc%my_nr2p
                 f_in( i + it ) = f_aux( icompact + k * desc%nnr )
                 icompact = icompact + desc%nr1x
              ENDDO
           ENDDO
        ENDDO
!$omp end do
!$omp end parallel
     ELSE
!$omp parallel default(none) &
!$omp          private(iproc2, i, j, k, ip, it, m1, m2, icompact, it0, i1) &
!$omp          shared(nproc2, howmany, ncpx, sendsize, nr2px, indx) &
!$omp&         shared(desc, f_aux, f_in, nr1p_)
!$omp do
        DO k=0, howmany-1
           DO iproc2 = 1, nproc2
              ip  = nr1p_(iproc2) * desc%my_nr3p + 1
              it0 = ( iproc2 - 1 ) * sendsize
              DO j = 1, ncpx
                 IF ( j >= ip ) CYCLE
                 it = it0 + (j - 1) * nr2px + k * ncpx * nr2px
                 m2 = (j - 1) / nr1p_(iproc2) + 1
                 i1 = mod((j - 1),nr1p_(iproc2)) + 1
                 m1 = indx(i1,iproc2)
                 icompact = m1 + (m2 - 1) * desc%nr1x * desc%my_nr2p
                 DO i = 1, desc%my_nr2p
                    f_in( i + it ) = f_aux( icompact + k*desc%nnr )
                    icompact = icompact + desc%nr1x
                 ENDDO
              ENDDO
           ENDDO
        ENDDO
!$omp end do
!$omp end parallel
        !
        !
        ! step two: communication
        !
        CALL mpi_alltoall (f_in(1), sendsize, MPI_DOUBLE_COMPLEX, f_aux(1), &
                           sendsize, MPI_DOUBLE_COMPLEX, desc%comm2, ierr)

        IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
        !
        ! step one: store contiguously the columns
        !
!$omp parallel default(none) &
!$omp          private(iproc2, i, j, k, kdest, kfrom) &
!$omp          shared(nproc2, howmany, ncpx, sendsize, nr2px) &
!$omp&         shared(desc, f_aux, f_in, nr1p_, me2)
!$omp do
        DO k = 0, howmany-1
           DO iproc2 = 1, nproc2
              DO j = 0, nr1p_(me2) * desc%my_nr3p - 1
                 kdest = ( iproc2 - 1 ) * sendsize + nr2px * (j + k*ncpx)
                 kfrom = desc%nr2p_offset(iproc2) + desc%nr2x * (j + k*ncpx)
                 DO i = 1, desc%nr2p( iproc2 )
                    f_in ( kfrom + i ) = f_aux ( kdest + i )
                 ENDDO
              ENDDO
           ENDDO
        ENDDO
!$omp end do
        ! clean extra array elements in each stick
        IF( desc%nr2x /= desc%nr2 ) THEN
!$omp do
           DO k = 0, howmany-1
              DO j = 1, nr1p_(me2)*desc%my_nr3p
                 DO i = desc%nr2+1, desc%nr2x
                    f_in( k*ncpx*desc%nr2x + (j-1)*desc%nr2x + i) = 0.0d0
                 END DO
              END DO
           ENDDO
!$omp end do
        ENDIF
!$omp end parallel
     ENDIF
  ENDIF

  CALL stop_clock ('fft_scatt_many_xy')

#endif

  RETURN
98 format ( 10 ('(',2f12.9,')') )
99 format ( 20 ('(',2f12.9,')') )

END SUBROUTINE fft_scatter_many_xy
!
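!
! Illustrative sketch of how the batched ("many") variants are typically
! driven (not part of this module; array names are placeholders). Both f and
! aux must be large enough to hold howmany bands (typically howmany*dfft%nnr
! elements); in the unpacked output, band k starts at offset k*dfft%nnr.
!
!    CALL fft_scatter_many_yz ( dfft, f, aux, 2, howmany )  ! Z-sticks -> Y-sticks, all bands
!    ! batched 1D FFTs along Y on aux
!    CALL fft_scatter_many_xy ( dfft, aux, f, 2, howmany )  ! Y-sticks -> X-slices, all bands
!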
!-----------------------------------------------------------------------
SUBROUTINE fft_scatter_yz ( desc, f_in, f_aux, nxx_, isgn )
  !-----------------------------------------------------------------------
  !
  ! transpose of the fft yz planes across the desc%comm3 communicator
  !
  ! a) From Z-oriented columns to Y-oriented columns (isgn > 0)
  !    Active columns (or sticks or pencils) along the Z direction for each
  !    processor are stored consecutively and are such that they correspond
  !    to a subset of the active X values.
  !
  !    The pencil -> slices transposition is performed in the subgroup
  !    of processors (desc%comm3) owning these X values.
  !
  !    The transpose takes place in two steps:
  !    1) on each processor the columns are sliced into sections along Z
  !       that are stored one after the other. On each processor, slices for
  !       processor "iproc3" are desc%nr3p(iproc3)*desc%nsw(me) (or desc%nsp(me)) big.
  !    2) all processors communicate to exchange slices (all columns with
  !       Z in the slice belonging to "me" must be received, all the others
  !       must be sent to "iproc3")
  !
  !    Finally one gets the "slice" representation: each processor has
  !    desc%nr3p(mype3) Z values of all the active pencils along Y for the
  !    X values of the current group. Data are organized with the Y index
  !    running fastest, then the reordered X values, then Z.
  !
  !    f_in  contains the input Z columns, is destroyed on output
  !    f_aux contains the output Y columns.
  !
  ! b) From planes to columns (isgn < 0)
  !
  !    Quite the same in the opposite direction
  !    f_aux contains the input Y columns, is destroyed on output
  !    f_in  contains the output Z columns.
  !
  IMPLICIT NONE

  TYPE (fft_type_descriptor), INTENT(in) :: desc
  INTEGER, INTENT(in)         :: nxx_, isgn
  COMPLEX (DP), INTENT(inout) :: f_in (nxx_), f_aux (nxx_)

#if defined(__MPI)
  !
  CALL start_clock ('fft_scatt_yz')

  if ( abs (isgn) == 1 ) then          ! It's a potential FFT
     CALL impl_yz(desc%mype2+1, desc%mype2+1, desc%nsp, desc%ir1p, desc%nsp_offset)
  else if ( abs (isgn) == 2 ) then     ! It's a wavefunction FFT
     CALL impl_yz(desc%mype2+1, desc%mype2+1, desc%nsw, desc%ir1w, desc%nsw_offset)
  else if ( abs (isgn) == 3 ) then     ! It's a wavefunction FFT with task groups
     CALL impl_yz(1, desc%nproc2, desc%nsw, desc%ir1w_tg, desc%nsw_offset)
  end if

  CALL stop_clock ('fft_scatt_yz')

  RETURN

  CONTAINS

  SUBROUTINE impl_yz(me2_start, me2_end, ncp_, ir1p_, me2_iproc3_offset)
  IMPLICIT NONE
  !
  INTEGER, INTENT(in) :: me2_start, me2_end
  INTEGER, INTENT(in) :: ncp_(desc%nproc), ir1p_(desc%nr1x), me2_iproc3_offset(desc%nproc2, desc%nproc3)
  !
  INTEGER :: ierr, me2, me3, nproc3, iproc3, ncpx, nr3px, ip
  INTEGER :: i, it, k, kfrom, kdest, mc, m1, m2, i1, sendsize
  INTEGER :: my_nr1p_
  !
#if defined(__NON_BLOCKING_SCATTER)
  INTEGER :: sh(desc%nproc3), rh(desc%nproc3)
#endif
  !
  me3    = desc%mype3 + 1
  nproc3 = desc%nproc3
  !
  my_nr1p_ = count (ir1p_ > 0)
  !
  ! calculate the message size
  !
  nr3px = MAXVAL ( desc%nr3p )  ! maximum number of Z values to be exchanged
  ncpx  = MAXVAL ( ncp_ )       ! maximum number of Z columns to be exchanged
  sendsize = ncpx * nr3px * ( me2_end - me2_start + 1 )  ! dimension of the scattered chunks
  !
  ierr = 0
  IF (isgn.gt.0) THEN
     !
     IF (nproc3==1) GO TO 10
     !
     ! "forward" scatter from columns to planes
     !
     ! step one: store contiguously the slices
     !
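     ! Packing: for each destination process "iproc3", copy the desc%nr3p(iproc3)
     ! Z values of every local column into a contiguous chunk. The source runs
     ! through full columns with stride desc%nr3x starting at desc%nr3p_offset(iproc3);
     ! the destination packs the column sections back to back with stride nr3px
     ! inside the chunk of size sendsize reserved for that process.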
!$omp parallel do collapse(3) private(kdest,kfrom)
     DO iproc3 = 1, nproc3
        DO me2 = me2_start, me2_end
           DO k = 0, ncpx-1   ! was ncp_(me3)
              IF (k>=ncp_(desc%iproc(me2,me3))) CYCLE ! control k from 0 to ncp_(desc%iproc(me2,me3))-1
              kdest = ( iproc3 - 1 ) * sendsize + nr3px * ( me2_iproc3_offset( me2 - me2_start + 1, me3 ) + k )
              kfrom = desc%nr3p_offset(iproc3) + desc%nr3x * ( me2_iproc3_offset( me2 - me2_start + 1, me3 ) + k )
              DO i = 1, desc%nr3p( iproc3 )
                 f_aux ( kdest + i ) = f_in ( kfrom + i )
              ENDDO
           ENDDO
        ENDDO
     ENDDO
!$omp end parallel do
#if defined(__NON_BLOCKING_SCATTER)
     DO iproc3 = 1, nproc3
        CALL mpi_isend( f_aux( ( iproc3 - 1 ) * sendsize + 1 ), sendsize, &
                        MPI_DOUBLE_COMPLEX, iproc3-1, me3, desc%comm3, &
                        sh( iproc3 ), ierr )
     ENDDO
#endif
     !
     ! step two: communication across the nproc3 group
     !
#if defined(__NON_BLOCKING_SCATTER)
     DO iproc3 = 1, nproc3
        !
        ! now post the receive
        !
        CALL mpi_irecv( f_in( (iproc3-1)*sendsize + 1 ), sendsize, &
                        MPI_DOUBLE_COMPLEX, iproc3-1, MPI_ANY_TAG, &
                        desc%comm3, rh( iproc3 ), ierr )
        !IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' forward receive info<>0', abs(ierr) )
        !
        !
     ENDDO

     call mpi_waitall( nproc3, sh, MPI_STATUSES_IGNORE, ierr )
     !
#else
     CALL mpi_alltoall (f_aux(1), sendsize, MPI_DOUBLE_COMPLEX, f_in(1), &
                        sendsize, MPI_DOUBLE_COMPLEX, desc%comm3, ierr)

     IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
#endif
     !
10   CONTINUE
     !
#if defined(__NON_BLOCKING_SCATTER)
     if (nproc3 > 1) call mpi_waitall( nproc3, rh, MPI_STATUSES_IGNORE, ierr )
#endif
     !
!$omp parallel
     ! ensures that no garbage is present in the output
     !
!$omp do
     DO k = 1, desc%my_nr3p*my_nr1p_*desc%nr2x
        f_aux(k) = (0.0_DP, 0.0_DP)
     ENDDO
!$omp end do
!$omp do collapse(3) private(ip,it,mc,m1,m2,i1)
     DO iproc3 = 1, desc%nproc3
        DO me2 = me2_start, me2_end
           DO i = 1, ncpx   ! was ncp_(iproc3)
              ip = desc%iproc( me2, iproc3)
              IF ( i>ncp_(ip) ) CYCLE
              it = ( iproc3 - 1 ) * sendsize + nr3px * ( me2_iproc3_offset( me2 - me2_start + 1, iproc3 ) + i - 1 )
              mc = desc%ismap( i + desc%iss(ip) ) ! this is m1+(m2-1)*nr1x of the current pencil
              m1 = mod (mc-1,desc%nr1x) + 1
              m2 = (mc-1)/desc%nr1x + 1
              i1 = m2 + ( ir1p_(m1) - 1 ) * desc%nr2x
              DO k = 1, desc%my_nr3p
                 f_aux( i1 ) = f_in( k + it )
                 i1 = i1 + desc%nr2x*my_nr1p_
              ENDDO
           ENDDO
        ENDDO
     ENDDO
!$omp end do nowait
!$omp end parallel
     !
  ELSE
     !
     ! "backward" scatter from planes to columns
     !
!$omp parallel do collapse(3) private(ip,it,mc,m1,m2,i1)
     DO iproc3 = 1, desc%nproc3
        DO me2 = me2_start, me2_end
           DO i = 1, ncpx
              ip = desc%iproc( me2, iproc3)
              IF ( i>ncp_(ip) ) CYCLE
              it = ( iproc3 - 1 ) * sendsize + nr3px * ( me2_iproc3_offset( me2 - me2_start + 1, iproc3 ) + i - 1 )
              mc = desc%ismap( i + desc%iss(ip) ) ! this is m1+(m2-1)*nr1x of the current pencil
              m1 = mod (mc-1,desc%nr1x) + 1
              m2 = (mc-1)/desc%nr1x + 1
              i1 = m2 + ( ir1p_(m1) - 1 ) * desc%nr2x
              DO k = 1, desc%my_nr3p
                 f_in( k + it ) = f_aux( i1 )
                 i1 = i1 + desc%nr2x * my_nr1p_
              ENDDO
           ENDDO
        ENDDO
     ENDDO
!$omp end parallel do
#if defined(__NON_BLOCKING_SCATTER)
     DO iproc3 = 1, desc%nproc3
        IF( nproc3 > 1 ) CALL mpi_isend( f_in( ( iproc3 - 1 ) * sendsize + 1 ), &
                                         sendsize, MPI_DOUBLE_COMPLEX, iproc3-1, &
                                         me3, desc%comm3, sh( iproc3 ), ierr )
     ENDDO
#endif

     IF( nproc3 == 1 ) GO TO 20
     !
     ! step two: communication
     !
#if ! defined(__NON_BLOCKING_SCATTER)
     CALL mpi_alltoall (f_in(1), sendsize, MPI_DOUBLE_COMPLEX, f_aux(1), &
                        sendsize, MPI_DOUBLE_COMPLEX, desc%comm3, ierr)

     IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
#else
     DO iproc3 = 1, desc%nproc3
        CALL mpi_irecv( f_aux( (iproc3-1)*sendsize + 1 ), sendsize, &
                        MPI_DOUBLE_COMPLEX, iproc3-1, MPI_ANY_TAG, &
                        desc%comm3, rh(iproc3), ierr )
        ! IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' backward receive info<>0', abs(ierr) )
     ENDDO

     call mpi_waitall( desc%nproc3, sh, MPI_STATUSES_IGNORE, ierr )
     call mpi_waitall( desc%nproc3, rh, MPI_STATUSES_IGNORE, ierr )
#endif
     !
     ! step one: store contiguously the columns
     !
!$omp parallel do collapse(3) private(kdest,kfrom)
     DO iproc3 = 1, nproc3
        DO me2 = me2_start, me2_end
           DO k = 0, ncpx-1   ! was ncp_(me3)
              IF (k>=ncp_(desc%iproc(me2,me3))) CYCLE ! control k from 0 to ncp_(desc%iproc(me2,me3))-1
              kdest = ( iproc3 - 1 ) * sendsize + nr3px * ( me2_iproc3_offset( me2 - me2_start + 1, me3 ) + k )
              kfrom = desc%nr3p_offset(iproc3) + desc%nr3x * ( me2_iproc3_offset( me2 - me2_start + 1, me3 ) + k )
              DO i = 1, desc%nr3p( iproc3 )
                 f_in ( kfrom + i ) = f_aux ( kdest + i )
              ENDDO
           ENDDO
        ENDDO
     ENDDO
!$omp end parallel do

     ! clean extra array elements in each stick

     IF( desc%nr3x /= desc%nr3 ) THEN
        DO k = 1, ncp_(desc%mype+1)
           f_in(desc%nr3x*(k-1)+desc%nr3+1:desc%nr3x*k) = (0.0_DP, 0.0_DP)
        END DO
     END IF

20   CONTINUE

  ENDIF

  END SUBROUTINE impl_yz

#endif

END SUBROUTINE fft_scatter_yz
!
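!
! Note on the task-group case above (|isgn| = 3): impl_yz is called with
! me2_start = 1 and me2_end = desc%nproc2, so each process packs and exchanges
! the Z-sticks of its entire comm2 row, obtaining the Y-oriented data of the
! whole row for its own range of Z values. A backward call simply flips the
! sign of isgn, e.g. (illustrative only, with placeholder array names):
!
!    CALL fft_scatter_yz ( dfft, f, aux, dfft%nnr, -2 )   ! Y-sticks (in aux) -> Z-sticks (in f)
!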
!-----------------------------------------------------------------------
SUBROUTINE fft_scatter_many_yz ( desc, f_in, f_aux, isgn, howmany )
  !-----------------------------------------------------------------------
  !
  ! transpose of the fft yz planes across the desc%comm3 communicator
  !
  ! a) From Z-oriented columns to Y-oriented columns (isgn > 0)
  !    Active columns (or sticks or pencils) along the Z direction for each
  !    processor are stored consecutively and are such that they correspond
  !    to a subset of the active X values.
  !
  !    The pencil -> slices transposition is performed in the subgroup
  !    of processors (desc%comm3) owning these X values.
  !
  !    The transpose takes place in two steps:
  !    1) on each processor the columns are sliced into sections along Z
  !       that are stored one after the other. On each processor, slices for
  !       processor "iproc3" are desc%nr3p(iproc3)*desc%nsw(me) (or desc%nsp(me)) big.
  !    2) all processors communicate to exchange slices (all columns with
  !       Z in the slice belonging to "me" must be received, all the others
  !       must be sent to "iproc3")
  !
  !    Finally one gets the "slice" representation: each processor has
  !    desc%nr3p(mype3) Z values of all the active pencils along Y for the
  !    X values of the current group. Data are organized with the Y index
  !    running fastest, then the reordered X values, then Z.
  !
  !    f_in  contains the input Z columns, is destroyed on output
  !    f_aux contains the output Y columns.
  !
  ! b) From planes to columns (isgn < 0)
  !
  !    Quite the same in the opposite direction
  !    f_aux contains the input Y columns, is destroyed on output
  !    f_in  contains the output Z columns.
  !
  IMPLICIT NONE

  TYPE (fft_type_descriptor), INTENT(in), TARGET :: desc
  COMPLEX (DP), INTENT(inout) :: f_in(:), f_aux(:)
  INTEGER, INTENT(in)         :: isgn, howmany

#if defined(__MPI)
  !
  INTEGER :: ierr, me, me2, me3, nproc3, iproc3, ncpx, nr3px, ip
  INTEGER :: i, j, it, it0, k, kfrom, kdest, mc, m1, m2, i1, sendsize
  INTEGER, POINTER :: ncp_(:), ir1p_(:)
  INTEGER :: my_nr1p_
  !
  me     = desc%mype  + 1
  me2    = desc%mype2 + 1
  me3    = desc%mype3 + 1
  nproc3 = desc%nproc3
  !
  if ( abs (isgn) == 1 ) then          ! It's a potential FFT
     ncp_  => desc%nsp
     ir1p_ => desc%ir1p
     my_nr1p_ = count (desc%ir1p > 0)
  else if ( abs (isgn) == 2 ) then     ! It's a wavefunction FFT
     ncp_  => desc%nsw
     ir1p_ => desc%ir1w
     my_nr1p_ = count (desc%ir1w > 0)
  else if ( abs (isgn) == 3 ) then     ! It's a wavefunction FFT with task groups
     print *, "ERROR, this should never happen!"
  end if
  !
  CALL start_clock ('fft_scatt_many_yz')
  !
  ! calculate the message size
  !
  nr3px = MAXVAL ( desc%nr3p )  ! maximum number of Z values to be exchanged
  ncpx  = MAXVAL ( ncp_ )       ! maximum number of Z columns to be exchanged

  sendsize = howmany * ncpx * nr3px  ! dimension of the scattered chunks
  !
  ierr = 0
  IF (isgn.gt.0) THEN
     !
     ! "forward" scatter from columns to planes
     !
     ! step one: store contiguously the slices
     !
     IF (nproc3==1) THEN
!$omp parallel default(none) &
!$omp          private(i, j, k, it, mc, m1, m2, i1) &
!$omp          shared(howmany, ncpx, nr3px, desc, f_aux, f_in) &
!$omp&         shared(ncp_, me2, ir1p_, my_nr1p_)
!$omp do collapse(2)
        DO k=0, howmany-1
           DO j = 1, desc%my_nr3p*desc%nr2x*my_nr1p_
              f_aux(j+k*desc%nnr) = (0.0_DP, 0.0_DP)
           ENDDO
        ENDDO
!$omp end do
!$omp do collapse(2)
        DO k=0, howmany-1
           DO j = 1, ncp_(desc%iproc( me2, 1))
              it = (j-1)*nr3px + k*ncpx*nr3px
              mc = desc%ismap( j + desc%iss(desc%iproc( me2, 1)) )
              m1 = mod (mc-1,desc%nr1x) + 1
              m2 = (mc-1)/desc%nr1x + 1
              i1 = m2 + ( ir1p_(m1) - 1 ) * desc%nr2x
              DO i = 1, desc%my_nr3p
                 f_aux( i1 + k*desc%nnr ) = f_in( it + i)
                 i1 = i1 + desc%nr2x * my_nr1p_
              ENDDO
           ENDDO
        ENDDO
!$omp end do
!$omp end parallel
     ELSE
!$omp parallel default(none) &
!$omp          private(i, j, k, kdest, kfrom, iproc3) &
!$omp&         private(it, mc, m1, m2, i1) &
!$omp&         shared(nproc3, howmany, ncpx, sendsize, nr3px) &
!$omp&         shared(ncp_, ir1p_, my_nr1p_, ierr) &
!$omp&         shared(desc, f_aux, f_in, me2, me3)
!$omp do collapse(3)
        DO k = 0, howmany-1
           DO iproc3 = 1, nproc3
              DO j = 0, ncp_(desc%iproc(me2,me3))-1
                 kdest = ( iproc3 - 1 ) * sendsize + nr3px * (j + k*ncpx)
                 kfrom = desc%nr3p_offset(iproc3) + desc%nr3x * (j + k*ncpx)
                 DO i = 1, desc%nr3p( iproc3 )
                    f_aux ( kdest + i ) = f_in ( kfrom + i )
                 ENDDO
              ENDDO
           ENDDO
        ENDDO
!$omp end do
        !
        ! pre-clearing the output is not needed here: the elements accessed
        ! later are overwritten by the alltoall step
        !
        ! step two: communication across the nproc3 group
        !
!$omp single
        CALL mpi_alltoall (f_aux(1), sendsize, MPI_DOUBLE_COMPLEX, f_in(1), &
                           sendsize, MPI_DOUBLE_COMPLEX, desc%comm3, ierr)

        IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
!$omp end single
        !
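        ! Unpacking: the output planes are first cleared, then each received
        ! Z-stick is scattered into the Y-major layout. desc%ismap recovers the
        ! (m1,m2) position of stick j of process (me2,iproc3); only X values
        ! active on this process (ir1p_ > 0) are stored, so the X dimension is
        ! compacted to my_nr1p_ values and consecutive Z planes are
        ! desc%nr2x*my_nr1p_ elements apart.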
!$omp do collapse(2)
        DO k=0, howmany-1
           DO j = 1, desc%my_nr3p*desc%nr2x*my_nr1p_
              f_aux(j+k*desc%nnr) = (0.0_DP, 0.0_DP)
           ENDDO
        ENDDO
!$omp end do
!$omp do collapse(3)
        DO k=0, howmany-1
           DO iproc3 = 1, nproc3
              DO j = 1, ncpx
                 IF ( j>ncp_(desc%iproc( me2, iproc3)) ) CYCLE
                 it = (iproc3 - 1) * sendsize + (j-1)*nr3px + k*ncpx*nr3px
                 mc = desc%ismap( j + desc%iss(desc%iproc( me2, iproc3)) )
                 m1 = mod (mc-1,desc%nr1x) + 1
                 m2 = (mc-1)/desc%nr1x + 1
                 i1 = m2 + ( ir1p_(m1) - 1 ) * desc%nr2x
                 DO i = 1, desc%my_nr3p
                    f_aux( i1 + k*desc%nnr ) = f_in( it + i)
                    i1 = i1 + desc%nr2x * my_nr1p_
                 ENDDO
              ENDDO
           ENDDO
        ENDDO
!$omp end do
!$omp end parallel
     ENDIF
     !
  ELSE
     !
     ! "backward" scatter from planes to columns
     !
     IF( nproc3 == 1 ) THEN
!$omp parallel default(none) &
!$omp          private(i, j, k, it, mc, m1, m2, i1) &
!$omp          shared(howmany, ncpx, nr3px, desc, f_aux, f_in) &
!$omp&         shared(ncp_, me2, ir1p_, my_nr1p_)
!$omp do collapse(2)
        DO k = 0, howmany - 1
           DO j = 1, ncp_(desc%iproc(me2,1))
              it = (j-1)*nr3px + k*ncpx*nr3px
              mc = desc%ismap( j + desc%iss(desc%iproc(me2,1)))
              m1 = mod (mc-1,desc%nr1x) + 1
              m2 = (mc-1)/desc%nr1x + 1
              i1 = m2 + ( ir1p_(m1) - 1 ) * desc%nr2x
              DO i = 1, desc%my_nr3p
                 f_in( i + it ) = f_aux( i1 + k*desc%nnr )
                 i1 = i1 + desc%nr2x*my_nr1p_
              ENDDO
           ENDDO
        ENDDO
!$omp end do
!$omp end parallel
     ELSE
        !
!$omp parallel default(none) &
!$omp          private(iproc3, i, j, k, ip, it, mc, m1, m2, i1, it0, kdest, kfrom) &
!$omp          shared(nproc3, howmany, ncpx, sendsize, nr3px, desc, me3) &
!$omp&         shared(f_aux, f_in, ncp_, me2, ir1p_, my_nr1p_, ierr)
!$omp do collapse(3)
        DO k = 0, howmany - 1
           DO iproc3 = 1, nproc3
              DO j = 1, ncpx
                 IF ( j>ncp_(desc%iproc( me2, iproc3)) ) CYCLE
                 it = (iproc3-1) * sendsize + (j-1)*nr3px + k*ncpx*nr3px
                 mc = desc%ismap( j + desc%iss(desc%iproc( me2, iproc3)) )
                 m1 = mod (mc-1,desc%nr1x) + 1
                 m2 = (mc-1)/desc%nr1x + 1
                 i1 = m2 + ( ir1p_(m1) - 1 ) * desc%nr2x
                 DO i = 1, desc%my_nr3p
                    f_in( i + it ) = f_aux( i1 + k*desc%nnr )
                    i1 = i1 + desc%nr2x*my_nr1p_
                 ENDDO
              ENDDO
           ENDDO
        ENDDO
!$omp end do
        !
!$omp single
        !
        ! step two: communication
        !
        CALL mpi_alltoall (f_in(1), sendsize, MPI_DOUBLE_COMPLEX, f_aux(1), &
                           sendsize, MPI_DOUBLE_COMPLEX, desc%comm3, ierr)

        IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
!$omp end single
        !
        ! step one: store contiguously the columns
        !
!$omp do collapse(3)
        DO k = 0, howmany-1
           DO iproc3 = 1, nproc3
              DO j = 0, ncp_(desc%iproc(me2,me3))-1
                 kdest = ( iproc3 - 1 ) * sendsize + nr3px * (j + k*ncpx)
                 kfrom = desc%nr3p_offset(iproc3) + desc%nr3x * (j + k*ncpx)
                 DO i = 1, desc%nr3p( iproc3 )
                    f_in( kfrom + i ) = f_aux( kdest + i )
                 ENDDO
              ENDDO
           ENDDO
        ENDDO
!$omp end do
        ! clean extra array elements in each stick
        IF( desc%nr3x /= desc%nr3 ) THEN
!$omp do collapse(2)
           DO k = 0, howmany-1
              DO j = 1, ncp_ ( desc%mype+1 )
                 DO i = desc%nr3+1, desc%nr3x
                    f_in( k*ncpx*desc%nr3x + (j-1)*desc%nr3x + i) = 0.0d0
                 END DO
              END DO
           END DO
!$omp end do
        ENDIF
!$omp end parallel
     ENDIF
     !
  ENDIF

  CALL stop_clock ('fft_scatt_many_yz')

#endif

  RETURN
98 format ( 10 ('(',2f12.9,')') )
99 format ( 20 ('(',2f12.9,')') )

END SUBROUTINE fft_scatter_many_yz

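!
! Illustrative sketch of the task-group redistribution routines that follow
! (not part of this module; psi, psi_out and work are placeholder arrays sized
! for the task-group layout): fft_scatter_tg works "in place", overwriting
! f_in and using f_aux as scratch, while fft_scatter_tg_opt writes to a
! separate output array.
!
!    CALL fft_scatter_tg     ( dfft, psi, work,    SIZE(psi), 3 )  ! many-wfc partial planes -> single-wfc whole planes
!    CALL fft_scatter_tg_opt ( dfft, psi, psi_out, SIZE(psi), 3 )
!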
!-----------------------------------------------------------------------
SUBROUTINE fft_scatter_tg ( desc, f_in, f_aux, nxx_, isgn )
  !-----------------------------------------------------------------------
  !
  ! task group wavefunction redistribution
  !
  ! a) (isgn > 0) From many-wfc partial-plane arrangement to single-wfc whole-plane one
  !
  ! b) (isgn < 0) From single-wfc whole-plane arrangement to many-wfc partial-plane one
  !
  ! in both cases:
  !    f_in  contains the input data, is overwritten with the desired output
  !    f_aux is used as working array, may contain garbage in output
  !
  IMPLICIT NONE

  TYPE (fft_type_descriptor), INTENT(in) :: desc
  INTEGER, INTENT(in)         :: nxx_, isgn
  COMPLEX (DP), INTENT(inout) :: f_in (nxx_), f_aux (nxx_)

  INTEGER :: ierr

  CALL start_clock ('fft_scatt_tg')

  if ( abs (isgn) /= 3 ) call fftx_error__ ('fft_scatter_tg', 'wrong call', 1 )

#if defined(__MPI)
  !
  f_aux = f_in
  if ( isgn > 0 ) then

     CALL MPI_ALLTOALLV( f_aux, desc%tg_snd, desc%tg_sdsp, MPI_DOUBLE_COMPLEX, &
                         f_in,  desc%tg_rcv, desc%tg_rdsp, MPI_DOUBLE_COMPLEX, desc%comm2, ierr)
     IF( ierr /= 0 ) CALL fftx_error__( 'fft_scatter_tg', ' alltoall error 1 ', abs(ierr) )

  else

     CALL MPI_ALLTOALLV( f_aux, desc%tg_rcv, desc%tg_rdsp, MPI_DOUBLE_COMPLEX, &
                         f_in,  desc%tg_snd, desc%tg_sdsp, MPI_DOUBLE_COMPLEX, desc%comm2, ierr)
     IF( ierr /= 0 ) CALL fftx_error__( 'fft_scatter_tg', ' alltoall error 2 ', abs(ierr) )
  end if

#endif
  CALL stop_clock ('fft_scatt_tg')

  RETURN

END SUBROUTINE fft_scatter_tg
!
!-----------------------------------------------------------------------
SUBROUTINE fft_scatter_tg_opt ( desc, f_in, f_out, nxx_, isgn )
  !-----------------------------------------------------------------------
  !
  ! task group wavefunction redistribution
  !
  ! a) (isgn > 0) From many-wfc partial-plane arrangement to single-wfc whole-plane one
  !
  ! b) (isgn < 0) From single-wfc whole-plane arrangement to many-wfc partial-plane one
  !
  ! in both cases:
  !    f_in  contains the input data
  !    f_out contains the output data
  !
  IMPLICIT NONE

  TYPE (fft_type_descriptor), INTENT(in) :: desc
  INTEGER, INTENT(in)         :: nxx_, isgn
  COMPLEX (DP), INTENT(inout) :: f_in (nxx_), f_out (nxx_)

  INTEGER :: ierr

  CALL start_clock ('fft_scatt_tg')

  if ( abs (isgn) /= 3 ) call fftx_error__ ('fft_scatter_tg', 'wrong call', 1 )

#if defined(__MPI)
  !
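  ! In the forward direction each process sends desc%tg_snd(ip) elements,
  ! starting at displacement desc%tg_sdsp(ip), to process ip-1 of desc%comm2,
  ! and receives desc%tg_rcv(ip) elements at displacement desc%tg_rdsp(ip)
  ! (standard MPI_ALLTOALLV semantics); the backward direction simply swaps
  ! the send and receive counts and displacements.
  !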
  if ( isgn > 0 ) then

     CALL MPI_ALLTOALLV( f_in,  desc%tg_snd, desc%tg_sdsp, MPI_DOUBLE_COMPLEX, &
                         f_out, desc%tg_rcv, desc%tg_rdsp, MPI_DOUBLE_COMPLEX, desc%comm2, ierr)
     IF( ierr /= 0 ) CALL fftx_error__( 'fft_scatter_tg', ' alltoall error 1 ', abs(ierr) )

  else

     CALL MPI_ALLTOALLV( f_in,  desc%tg_rcv, desc%tg_rdsp, MPI_DOUBLE_COMPLEX, &
                         f_out, desc%tg_snd, desc%tg_sdsp, MPI_DOUBLE_COMPLEX, desc%comm2, ierr)
     IF( ierr /= 0 ) CALL fftx_error__( 'fft_scatter_tg', ' alltoall error 2 ', abs(ierr) )
  end if

#endif
  CALL stop_clock ('fft_scatt_tg')

  RETURN

END SUBROUTINE fft_scatter_tg_opt
!
!=----------------------------------------------------------------------=!
   END MODULE fft_scatter
!=----------------------------------------------------------------------=!