1! 2! Copyright (C) Quantum ESPRESSO group 3! 4! This file is distributed under the terms of the 5! GNU General Public License. See the file `License' 6! in the root directory of the present distribution, 7! or http://www.gnu.org/copyleft/gpl.txt . 8! 9!---------------------------------------------------------------------- 10! FFT base Module. 11! Written by Carlo Cavazzoni, modified by Paolo Giannozzi 12! Rewritten by Stefano de Gironcoli, ported to GPU by Pietro Bonfa' 13!---------------------------------------------------------------------- 14 15#if defined(__CUDA) 16 17#define __NON_BLOCKING_SCATTER 18 19!=----------------------------------------------------------------------=! 20 MODULE fft_scatter_gpu 21!=----------------------------------------------------------------------=! 22#if defined(__CUDA) 23 USE fft_types, ONLY: fft_type_descriptor 24 USE fft_param 25 26 IMPLICIT NONE 27 SAVE 28 29 PRIVATE 30 ! 31 PUBLIC :: fft_type_descriptor 32 PUBLIC :: fft_scatter_xy_gpu, fft_scatter_yz_gpu, fft_scatter_tg_gpu, & 33 fft_scatter_tg_opt_gpu, fft_scatter_many_yz_gpu 34 35!=----------------------------------------------------------------------=! 36 CONTAINS 37!=----------------------------------------------------------------------=! 38! 39! 40!----------------------------------------------------------------------- 41SUBROUTINE fft_scatter_xy_gpu ( desc, f_in_d, f_aux_d, nxx_, isgn, stream ) 42 !----------------------------------------------------------------------- 43 ! 44 ! transpose of the fft xy planes across the desc%comm2 communicator 45 ! 46 ! a) From Y-oriented columns to X-oriented partial slices (isgn > 0) 47 ! Active columns along the Y direction corresponding to a subset of the 48 ! active X values and a range of Z values (in this order) are stored 49 ! consecutively for each processor and are such that the subgroup owns 50 ! all data for a range of Z values. 51 ! 52 ! The Y pencil -> X-oriented partial slices transposition is performed 53 ! in the subgroup of processors (desc%comm2) owning this range of Z values. 54 ! 55 ! The transpose takes place in two steps: 56 ! 1) on each processor the columns are sliced into sections along Y 57 ! that are stored one after the other. On each processor, slices for 58 ! processor "iproc2" are desc%nr2p(iproc2)*desc%nr1p(me2)*desc%my_nr3p big. 59 ! 2) all processors communicate to exchange slices (all sectin of columns with 60 ! Y in the slice belonging to "me" must be received, all the others 61 ! must be sent to "iproc2") 62 ! 63 ! Finally one gets the "partial slice" representation: each processor has 64 ! all the X values of desc%my_nr2p Y and desc%my_nr3p Z values. 65 ! Data are organized with the X index running fastest, then Y, then Z. 66 ! 67 ! f_in contains the input Y columns, is destroyed on output 68 ! f_aux contains the output X-oriented partial slices. 69 ! 70 ! b) From planes to columns (isgn < 0) 71 ! 72 ! Quite the same in the opposite direction 73 ! f_aux contains the input X-oriented partial slices, is destroyed on output 74 ! f_in contains the output Y columns. 75 ! 76 USE cudafor 77 !USE nvtx_fft 78 USE fft_buffers, ONLY : check_buffers_size, f_in => pin_space_scatter_in, & 79 f_aux=>pin_space_scatter_out 80 IMPLICIT NONE 81 82 TYPE (fft_type_descriptor), INTENT(in) :: desc 83 INTEGER, INTENT(in) :: nxx_, isgn 84 COMPLEX (DP), DEVICE, INTENT(inout) :: f_in_d (nxx_), f_aux_d (nxx_) 85 INTEGER(kind=cuda_stream_kind), INTENT(IN) :: stream ! cuda stream for the execution 86 87#if defined(__MPI) 88 ! 89 INTEGER :: ierr, me2, nproc2, iproc2, ncpx, nr1x, my_nr2p, nr2px, ip, ip0 90 INTEGER :: i, it, j, k, kfrom, kdest, offset, ioff, mc, m1, m3, i1, icompact, sendsize, aux 91 INTEGER, ALLOCATABLE :: ncp_(:) 92 INTEGER, POINTER, DEVICE :: nr1p__d(:), indx_d(:,:) 93 ! 94#if defined(__NON_BLOCKING_SCATTER) 95 INTEGER :: sh(desc%nproc2), rh(desc%nproc2) 96#endif 97 ! 98 !CALL nvtxStartRangeAsync("fft_scatter_xy_gpu", isgn + 5) 99 ! 100 me2 = desc%mype2 + 1 101 nproc2 = desc%nproc2 ; if ( abs(isgn) == 3 ) nproc2 = 1 102 103 ! This mapping improves readability but, most importantly, it is needed 104 ! in the cuf kernels (as of PGI 18.10) since otherwise, when variables from 105 ! `desc` appear in the body of the do loops, the compiler generates incorrect GPU code. 106 nr1x = desc%nr1x 107 108 ! allocate auxiliary array for columns distribution 109 ALLOCATE ( ncp_(nproc2)) 110 if ( abs (isgn) == 1 ) then ! It's a potential FFT 111 ncp_ = desc%nr1p * desc%my_nr3p 112 nr1p__d=> desc%nr1p_d 113 indx_d => desc%indp_d 114 my_nr2p=desc%my_nr2p 115 else if ( abs (isgn) == 2 ) then ! It's a wavefunction FFT 116 ncp_ = desc%nr1w * desc%my_nr3p 117 nr1p__d=> desc%nr1w_d 118 indx_d => desc%indw_d 119 my_nr2p=desc%my_nr2p 120 else if ( abs (isgn) == 3 ) then ! It's a wavefunction FFT with task group 121 ncp_ = desc%nr1w_tg * desc%my_nr3p! 122 nr1p__d=> desc%nr1w_tg_d ! 123 indx_d => desc%indw_tg_d ! 124 my_nr2p=desc%nr2x ! in task group FFTs whole Y colums are distributed 125 end if 126 ! 127 CALL start_clock ('fft_scatt_xy') 128 ! 129 ! calculate the message size 130 ! 131 if ( abs (isgn) == 3 ) then 132 nr2px = desc%nr2x ! if it's a task group FFT whole planes are distributed 133 else 134 nr2px = MAXVAL ( desc%nr2p ) ! maximum number of Y values to be disributed 135 end if 136 137 ncpx = MAXVAL ( ncp_ ) ! maximum number of Y columns to be disributed 138 139 sendsize = ncpx * nr2px ! dimension of the scattered chunks (safe value) 140 ! 141 ! check host copy allocation of f_in and f_aux 142 CALL check_buffers_size(desc) 143 ! 144 ierr = 0 145 IF (isgn.gt.0) THEN 146 147 IF (nproc2==1) GO TO 10 148 ! 149 ! "forward" scatter from columns to planes 150 ! 151 ! step one: store contiguously the slices 152 ! 153 offset = 0 154 DO iproc2 = 1, nproc2 155 kdest = ( iproc2 - 1 ) * sendsize 156 kfrom = offset 157 !DO k = 1, ncp_(me2) 158 ! !DO i = 1, desc%nr2p( iproc2 ) 159 ! !f_aux ( kdest + i ) = f_in ( kfrom + i ) 160 ! !ENDDO 161 ! 162 ! !kdest = kdest + nr2px 163 ! !kfrom = kfrom + desc%nr2x 164 !ENDDO 165 166 ierr = cudaMemcpy2DAsync( f_aux(kdest + 1), nr2px, f_in_d(kfrom + 1 ), desc%nr2x, desc%nr2p( iproc2 ), ncp_(me2), cudaMemcpyDeviceToHost, stream ) 167 offset = offset + desc%nr2p( iproc2 ) 168 ENDDO 169 ! 170 ! step two: communication across the nproc3 group 171 ! 172#if defined(__NON_BLOCKING_SCATTER) 173 ierr = cudaStreamSynchronize(stream) 174 DO iproc2 = 1, nproc2 175 kdest = ( iproc2 - 1 ) * sendsize 176 CALL mpi_irecv( f_in( kdest + 1 ), sendsize, & 177 MPI_DOUBLE_COMPLEX, iproc2-1, MPI_ANY_TAG, & 178 desc%comm2, rh( iproc2 ), ierr ) 179 CALL mpi_isend( f_aux( kdest + 1 ), sendsize, & 180 MPI_DOUBLE_COMPLEX, iproc2-1, me2, desc%comm2, & 181 sh( iproc2 ), ierr ) 182 ENDDO 183#else 184 ierr = cudaStreamSynchronize(stream) 185 CALL mpi_alltoall (f_aux(1), sendsize, MPI_DOUBLE_COMPLEX, f_in(1), & 186 sendsize, MPI_DOUBLE_COMPLEX, desc%comm2, ierr) 187 ! 188 IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) ) 189 190 !f_in_d(1:nxx_) = f_in(1:nxx_) 191 ierr = cudaMemcpyAsync( f_in_d, f_in, nxx_, cudaMemcpyHostToDevice, stream ) 192 193#endif 194 ! 19510 CONTINUE 196 ! 197 !f_aux = (0.0_DP, 0.0_DP) 198 !$cuf kernel do (1) <<<*,*,0,stream>>> 199 DO i = 1, nxx_ 200 f_aux_d(i) = (0.d0, 0.d0) 201 END DO 202 ! 203 DO iproc2 = 1, nproc2 204#if defined(__NON_BLOCKING_SCATTER) 205 IF (nproc2 > 1) THEN 206 kdest = (iproc2-1)*sendsize 207 call mpi_wait( rh(iproc2), MPI_STATUSES_IGNORE, ierr ) 208 call mpi_wait( sh(iproc2), MPI_STATUSES_IGNORE, ierr ) 209 ierr = cudaMemcpyAsync( f_in_d(kdest + 1), f_in(kdest + 1 ), sendsize, cudaMemcpyHostToDevice, stream ) 210 END IF 211#endif 212 aux = ncp_( iproc2 ) 213 !$cuf kernel do(2) <<<*,*,0,stream>>> 214 DO i = 1, aux 215 DO j = 1, my_nr2p 216 it = ( iproc2 - 1 ) * sendsize + (i-1)*nr2px 217 m3 = (i-1)/nr1p__d(iproc2)+1 218 i1 = mod(i-1,nr1p__d(iproc2))+1 219 m1 = indx_d(i1,iproc2) 220 icompact = m1 + (m3-1)*nr1x*my_nr2p + (j-1)*nr1x 221 ! old do loop started here 222 !f_aux( m1 + (j-1)*nr1x + (m3-1)*nr1x*my_nr2p ) = f_in( j + it ) 223 f_aux_d( icompact ) = f_in_d( j + it ) 224 !icompact = icompact + nr1x 225 ENDDO 226 !it = it + nr2px 227 ENDDO 228 ENDDO 229 230 ELSE 231 ! 232 ! "backward" scatter from planes to columns 233 ! 234 DO iproc2 = 1, nproc2 235 aux = ncp_( iproc2 ) 236 !$cuf kernel do (2) <<<*,*,0,stream>>> 237 DO i = 1, aux 238 DO j = 1, my_nr2p 239 it = ( iproc2 - 1 ) * sendsize + (i-1)*nr2px 240 m3 = (i-1)/nr1p__d(iproc2)+1 ; i1 = mod(i-1,nr1p__d(iproc2))+1 ; m1 = indx_d(i1,iproc2) 241 icompact = m1 + (m3-1)*nr1x*my_nr2p + (j-1)*nr1x 242 !DO j = 1, my_nr2p 243 !f_in( j + it ) = f_aux( m1 + (j-1)*nr1x + (m3-1)*nr1x*my_nr2p ) 244 f_in_d( j + it ) = f_aux_d( icompact ) 245 ! icompact = icompact + nr1x 246 ENDDO 247 !it = it + nr2px 248 ENDDO 249 ! 250#if defined(__NON_BLOCKING_SCATTER) 251 IF( nproc2 > 1 ) THEN 252 kdest = ( iproc2 - 1 ) * sendsize 253 ierr = cudaMemcpyAsync( f_in(kdest + 1), f_in_d(kdest + 1 ), sendsize, cudaMemcpyDeviceToHost, stream ) 254 END IF 255#endif 256 ENDDO 257 IF (nproc2==1) GO TO 20 258 ! 259 ! step two: communication 260 ! 261 262#if defined(__NON_BLOCKING_SCATTER) 263 DO iproc2 = 1, nproc2 264 kdest = (iproc2-1)*sendsize 265 CALL mpi_irecv( f_aux( ( iproc2 - 1 ) * sendsize + 1 ), sendsize, & 266 MPI_DOUBLE_COMPLEX, iproc2-1, MPI_ANY_TAG, & 267 desc%comm2, rh(iproc2), ierr ) 268 ierr = cudaStreamSynchronize(stream) 269 CALL mpi_isend( f_in( kdest + 1 ), & 270 sendsize, MPI_DOUBLE_COMPLEX, iproc2-1, me2, & 271 desc%comm2, sh( iproc2 ), ierr ) 272 ! IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' backward receive info<>0', abs(ierr) ) 273 ENDDO 274#else 275 !f_in(1:nxx_) = f_in_d(1:nxx_) 276 ierr = cudaMemcpy( f_in, f_in_d, nxx_, cudaMemcpyDeviceToHost) 277 CALL mpi_alltoall (f_in(1), sendsize, MPI_DOUBLE_COMPLEX, f_aux(1), & 278 sendsize, MPI_DOUBLE_COMPLEX, desc%comm2, ierr) 279 280 IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) ) 281#endif 282 ! 283 ! step one: store contiguously the columns 284 ! 285 ! ensures that no garbage is present in the output 286 ! not useless ... clean the array to be returned from the garbage of previous A2A step 287 !f_in = (0.0_DP, 0.0_DP) ! 288 !$cuf kernel do (1) <<<*,*,0,stream>>> 289 do i = 1, nxx_ 290 f_in_d(i) = (0.d0, 0.d0) 291 end do 292 ! 293 offset = 0 294 DO iproc2 = 1, nproc2 295 kdest = ( iproc2 - 1 ) * sendsize 296 kfrom = offset 297 298#if defined(__NON_BLOCKING_SCATTER) 299 call mpi_wait( rh(iproc2), MPI_STATUSES_IGNORE, ierr ) 300 call mpi_wait( sh(iproc2), MPI_STATUSES_IGNORE, ierr ) 301#endif 302 !DO k = 1, ncp_(me2) 303 ! DO i = 1, desc%nr2p( iproc2 ) 304 ! f_in ( kfrom + i ) = f_aux ( kdest + i ) 305 ! ENDDO 306 ! kdest = kdest + nr2px 307 ! kfrom = kfrom + desc%nr2x 308 !ENDDO 309 ierr = cudaMemcpy2DAsync( f_in_d(kfrom +1 ), desc%nr2x, f_aux(kdest + 1), nr2px, desc%nr2p( iproc2 ), ncp_(me2), cudaMemcpyHostToDevice, stream ) 310 offset = offset + desc%nr2p( iproc2 ) 311 ENDDO 312 31320 CONTINUE 314 315 316 ENDIF 317 ! 318 !CALL nvtxEndRangeAsync() 319 DEALLOCATE ( ncp_ ) 320 CALL stop_clock ('fft_scatt_xy') 321 322#endif 323 324 RETURN 32599 format ( 20 ('(',2f12.9,')') ) 326 327END SUBROUTINE fft_scatter_xy_gpu 328! 329!----------------------------------------------------------------------- 330SUBROUTINE fft_scatter_yz_gpu ( desc, f_in_d, f_aux_d, nxx_, isgn ) 331 !----------------------------------------------------------------------- 332 ! 333 ! transpose of the fft yz planes across the desc%comm3 communicator 334 ! 335 ! a) From Z-oriented columns to Y-oriented colums (isgn > 0) 336 ! Active columns (or sticks or pencils) along the Z direction for each 337 ! processor are stored consecutively and are such that they correspond 338 ! to a subset of the active X values. 339 ! 340 ! The pencil -> slices transposition is performed in the subgroup 341 ! of processors (desc%comm3) owning these X values. 342 ! 343 ! The transpose takes place in two steps: 344 ! 1) on each processor the columns are sliced into sections along Z 345 ! that are stored one after the other. On each processor, slices for 346 ! processor "iproc3" are desc%nr3p(iproc3)*desc%nsw/nsp(me) big. 347 ! 2) all processors communicate to exchange slices (all columns with 348 ! Z in the slice belonging to "me" must be received, all the others 349 ! must be sent to "iproc3") 350 ! 351 ! Finally one gets the "slice" representation: each processor has 352 ! desc%nr3p(mype3) Z values of all the active pencils along Y for the 353 ! X values of the current group. Data are organized with the Y index 354 ! running fastest, then the reordered X values, then Z. 355 ! 356 ! f_in contains the input Z columns, is destroyed on output 357 ! f_aux contains the output Y colums. 358 ! 359 ! b) From planes to columns (isgn < 0) 360 ! 361 ! Quite the same in the opposite direction 362 ! f_aux contains the input Y columns, is destroyed on output 363 ! f_in contains the output Z columns. 364 ! 365 USE cudafor 366 !USE nvtx_fft 367 USE fft_buffers, ONLY : check_buffers_size, f_in => pin_space_scatter_in, & 368 f_aux=>pin_space_scatter_out 369 IMPLICIT NONE 370 371 TYPE (fft_type_descriptor), INTENT(in) :: desc 372 INTEGER, INTENT(in) :: nxx_, isgn 373 COMPLEX (DP), DEVICE, INTENT(inout) :: f_in_d (nxx_), f_aux_d (nxx_) 374! INTEGER(kind=cuda_stream_kind), INTENT(IN) :: stream ! cuda stream for the execution 375 376#if defined(__MPI) 377 INTEGER, DEVICE, POINTER :: desc_ismap_d(:) 378 ! 379 INTEGER :: ierr, me, me2, me2_start, me2_end, me3, nproc3, iproc3, ncpx, nr3px, ip, ip0 380 INTEGER :: nr1x, nr2x, nr3, nr3x 381 INTEGER :: i, it, it0, k, kfrom, kdest, offset, ioff, mc, m1, m2, i1, sendsize, aux 382 INTEGER, ALLOCATABLE :: ncp_(:) 383 INTEGER, DEVICE, POINTER :: ir1p__d(:) 384 INTEGER :: my_nr1p_ 385 ! 386#if defined(__NON_BLOCKING_SCATTER) 387 INTEGER :: sh(desc%nproc3), rh(desc%nproc3) 388#endif 389 TYPE(cudaEvent) :: zero_event 390 !CALL nvtxStartRangeAsync("fft_scatter_yz_gpu", isgn + 5) 391 ierr = cudaEventCreate( zero_event ) 392 ! 393 me = desc%mype + 1 394 me2 = desc%mype2 + 1 395 me3 = desc%mype3 + 1 396 nproc3 = desc%nproc3 397 398 ! This mapping improves readability but, most importantly, it is needed 399 ! in the cuf kernels (as of PGI 18.10) since otherwise, when variables from 400 ! `desc` appear in the body of the do loops, the compiler generates incorrect GPU code. 401 nr1x = desc%nr1x 402 nr2x = desc%nr2x 403 nr3 = desc%nr3 404 nr3x = desc%nr3x 405 406 ! allocate auxiliary array for columns distribution 407 ALLOCATE ( ncp_( desc%nproc) ) 408 409 me2_start = me2 ; me2_end = me2 410 if ( abs (isgn) == 1 ) then ! It's a potential FFT 411 ncp_ = desc%nsp 412 my_nr1p_ = count (desc%ir1p > 0) 413 ir1p__d => desc%ir1p_d 414 else if ( abs (isgn) == 2 ) then ! It's a wavefunction FFT 415 ncp_ = desc%nsw 416 my_nr1p_ = count (desc%ir1w > 0) 417 ir1p__d => desc%ir1w_d 418 else if ( abs (isgn) == 3 ) then ! It's a wavefunction FFT with task group 419 ncp_ = desc%nsw 420 my_nr1p_ = count (desc%ir1w_tg > 0) 421 ir1p__d => desc%ir1w_tg_d 422 me2_start = 1 ; me2_end = desc%nproc2 423 end if 424 ! 425 CALL start_clock ('fft_scatt_yz') 426 ! 427 ! calculate the message size 428 ! 429 nr3px = MAXVAL ( desc%nr3p ) ! maximum number of Z values to be exchanged 430 ncpx = MAXVAL ( ncp_ ) ! maximum number of Z columns to be exchanged 431 if (abs(isgn)==3) then 432 ncpx = ncpx * desc%nproc2 ! if it's a task group FFT groups of columns are exchanged 433 end if 434 435 sendsize = ncpx * nr3px ! dimension of the scattered chunks 436 437 ! check host copy allocation of f_in and f_aux 438 CALL check_buffers_size(desc) 439 desc_ismap_d => desc%ismap_d 440 441 ierr = 0 442 IF (isgn.gt.0) THEN 443 444 IF (nproc3==1) GO TO 10 445 ! 446 ! "forward" scatter from columns to planes 447 ! 448 ! step one: store contiguously the slices 449 ! 450 offset = 0 451 DO iproc3 = 1, nproc3 452 kdest = ( iproc3 - 1 ) * sendsize 453 kfrom = offset 454 DO me2 = me2_start, me2_end 455 ip = desc%iproc(me2,me3) 456 ! DO k = 1, ncp_ (ip) ! was ncp_(me3) 457 ! DO i = 1, desc%nr3p( iproc3 ) 458 ! f_aux ( kdest + i ) = f_in ( kfrom + i ) 459 ! ENDDO 460 ! kdest = kdest + nr3px 461 ! kfrom = kfrom + desc%nr3x 462 ! ENDDO 463 464 ierr = cudaMemcpy2DAsync( f_aux(kdest + 1), nr3px, f_in_d(kfrom + 1 ), nr3x, desc%nr3p( iproc3 ), ncp_(ip), cudaMemcpyDeviceToHost, desc%stream_scatter_yz(iproc3) ) 465 kdest = kdest + nr3px*ncp_ (ip) 466 kfrom = kfrom + nr3x*ncp_ (ip) 467 ENDDO 468 offset = offset + desc%nr3p( iproc3 ) 469 ENDDO 470 ! 471 ! ensures that no garbage is present in the output 472 ! useless; the later accessed elements are overwritten by the A2A step 473 ! 474 ! step two: communication across the nproc3 group 475 ! 476#if defined(__NON_BLOCKING_SCATTER) 477 DO iproc3 = 1, nproc3 478 ierr = cudaStreamSynchronize(desc%stream_scatter_yz(iproc3)) 479 CALL mpi_irecv( f_in( ( iproc3 - 1 ) * sendsize + 1 ), sendsize, & 480 MPI_DOUBLE_COMPLEX, iproc3-1, MPI_ANY_TAG, & 481 desc%comm3, rh( iproc3 ), ierr ) 482 CALL mpi_isend( f_aux( ( iproc3 - 1 ) * sendsize + 1 ), sendsize, & 483 MPI_DOUBLE_COMPLEX, iproc3-1, me3, desc%comm3, & 484 sh( iproc3 ), ierr ) 485 ENDDO 486 ! 487#else 488 ierr = cudaDeviceSynchronize() 489 CALL mpi_alltoall (f_aux(1), sendsize, MPI_DOUBLE_COMPLEX, f_in(1), & 490 sendsize, MPI_DOUBLE_COMPLEX, desc%comm3, ierr) 491 492 IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) ) 493 ! 494 !f_in_d(1:nxx_) = f_in(1:nxx_) 495 ierr = cudaMemcpy( f_in_d, f_in, nxx_, cudaMemcpyHostToDevice ) 496#endif 497 ! 49810 CONTINUE 499 ! 500 !f_aux = (0.0_DP, 0.0_DP) ! 501!$cuf kernel do (1) <<<*,*,0,desc%stream_scatter_yz(1)>>> 502 DO i = 1, desc%my_nr3p*my_nr1p_*nr2x 503 f_aux_d(i) = (0.d0, 0.d0) 504 END DO 505 ierr = cudaEventRecord ( zero_event, desc%stream_scatter_yz(1) ) 506 ! 507 DO iproc3 = 1, desc%nproc3 508 it0 = ( iproc3 - 1 ) * sendsize 509#if defined(__NON_BLOCKING_SCATTER) 510 IF (nproc3 > 1) THEN 511 CALL mpi_wait( rh(iproc3), MPI_STATUSES_IGNORE, ierr ) 512 CALL mpi_wait( sh(iproc3), MPI_STATUSES_IGNORE, ierr ) 513 ierr = cudaMemcpyAsync( f_in_d(it0+1), f_in(it0+1), sendsize, cudaMemcpyHostToDevice, desc%stream_scatter_yz(iproc3) ) 514 END IF 515#endif 516 IF (iproc3 == 2) ierr = cudaEventSynchronize( zero_event ) 517 DO me2 = me2_start, me2_end 518 ip = desc%iproc( me2, iproc3) 519 ioff = desc%iss(ip) 520 aux = ncp_( ip ) 521!$cuf kernel do(2) <<<*,*,0,desc%stream_scatter_yz(iproc3)>>> 522 DO i = 1, aux ! was ncp_(iproc3) 523 DO k = 1, desc%my_nr3p 524 it = it0 + (i-1)*nr3px 525 mc = desc_ismap_d( i + ioff ) ! this is m1+(m2-1)*nr1x of the current pencil 526 m1 = mod (mc-1,nr1x) + 1 ; m2 = (mc-1)/nr1x + 1 527 i1 = m2 + ( ir1p__d(m1) - 1 ) * nr2x + (k-1)*nr2x*my_nr1p_ 528 529 f_aux_d( i1 ) = f_in_d( k + it ) 530 !i1 = i1 + desc%nr2x*my_nr1p_ 531 ENDDO 532 ENDDO 533 it0 = it0 + ncp_( ip )*nr3px 534 ENDDO 535 ENDDO 536 537 ELSE 538 ! 539 ! "backward" scatter from planes to columns 540 ! 541 DO iproc3 = 1, nproc3 542 it0 = ( iproc3 - 1 ) * sendsize 543 DO me2 = me2_start, me2_end 544 ip = desc%iproc(me2, iproc3) 545 ioff = desc%iss(ip) 546 aux = ncp_( ip ) 547!$cuf kernel do(2) <<<*,*,0,desc%stream_scatter_yz(iproc3)>>> 548 DO i = 1, aux 549 DO k = 1, desc%my_nr3p 550 it = it0 + (i-1)*nr3px 551 mc = desc_ismap_d( i + ioff ) ! this is m1+(m2-1)*nr1x of the current pencil 552 m1 = mod (mc-1,nr1x) + 1 ; m2 = (mc-1)/nr1x + 1 553 i1 = m2 + ( ir1p__d(m1) - 1 ) * nr2x + (k-1)*(nr2x * my_nr1p_) 554 555 f_in_d( k + it ) = f_aux_d( i1 ) 556 !i1 = i1 + desc%nr2x * my_nr1p_ 557 ENDDO 558 !it = it + nr3px 559 ENDDO 560 it0 = it0 + ncp_( ip )*nr3px 561 ENDDO 562#if defined(__NON_BLOCKING_SCATTER) 563 IF( nproc3 > 1 ) THEN 564 kdest = ( iproc3 - 1 ) * sendsize 565 ierr = cudaMemcpyAsync( f_in(kdest + 1), f_in_d(kdest + 1 ), sendsize, cudaMemcpyDeviceToHost, desc%stream_scatter_yz(iproc3) ) 566 END IF 567#endif 568 ENDDO 569 570 IF( nproc3 == 1 ) GO TO 20 571 ! 572 ! 573 ! step two: communication 574 ! 575#if defined(__NON_BLOCKING_SCATTER) 576 DO iproc3 = 1, nproc3 577 CALL mpi_irecv( f_aux( (iproc3 - 1) * sendsize + 1 ), sendsize, & 578 MPI_DOUBLE_COMPLEX, iproc3-1, MPI_ANY_TAG, & 579 desc%comm3, rh(iproc3), ierr ) 580 ierr = cudaStreamSynchronize(desc%stream_scatter_yz(iproc3)) 581 CALL mpi_isend( f_in( ( iproc3 - 1 ) * sendsize + 1 ), & 582 sendsize, MPI_DOUBLE_COMPLEX, iproc3-1, & 583 me3, desc%comm3, sh( iproc3 ), ierr ) 584 END DO 585#else 586 !f_in(1:nxx_) = f_in_d(1:nxx_) 587 !ierr = cudaDeviceSynchronize() 588 ierr = cudaMemcpy( f_in, f_in_d, nxx_, cudaMemcpyDeviceToHost ) 589 590 CALL mpi_alltoall (f_in(1), sendsize, MPI_DOUBLE_COMPLEX, f_aux(1), & 591 sendsize, MPI_DOUBLE_COMPLEX, desc%comm3, ierr) 592 593 IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) ) 594 595 596#endif 597 ! 598 ! step one: store contiguously the columns 599 ! 600 offset = 0 601 DO iproc3 = 1, nproc3 602 kdest = ( iproc3 - 1 ) * sendsize 603#if defined(__NON_BLOCKING_SCATTER) 604 IF( nproc3 > 1 ) THEN 605 call mpi_wait( rh(iproc3), MPI_STATUSES_IGNORE, ierr ) 606 call mpi_wait( sh(iproc3), MPI_STATUSES_IGNORE, ierr ) 607 END IF 608#endif 609 kfrom = offset 610 DO me2 = me2_start, me2_end 611 ip = desc%iproc(me2,me3) 612 ! DO k = 1, ncp_ (ip) 613 ! DO i = 1, desc%nr3p( iproc3 ) 614 ! f_in ( kfrom + i ) = f_aux ( kdest + i ) 615 ! ENDDO 616 ! kdest = kdest + nr3px 617 ! kfrom = kfrom + desc%nr3x 618 ! ENDDO 619 ierr = cudaMemcpy2DAsync( f_in_d(kfrom +1 ), nr3x, f_aux(kdest + 1), nr3px, desc%nr3p( iproc3 ), ncp_ (ip), cudaMemcpyHostToDevice, desc%stream_scatter_yz(iproc3) ) 620 kdest = kdest + nr3px*ncp_ (ip) 621 kfrom = kfrom + nr3x*ncp_ (ip) 622 ENDDO 623 offset = offset + desc%nr3p( iproc3 ) 624 ENDDO 625 ! 626 DO iproc3 = 1, nproc3 627 ierr = cudaStreamSynchronize(desc%stream_scatter_yz(iproc3)) 628 END DO 629 630 631 ! clean extra array elements in each stick 632 633 IF( nr3x /= nr3 ) THEN 634 aux = ncp_ ( desc%mype+1 ) 635!$cuf kernel do(2) <<<*,*>>> 636 DO k = 1, aux 637 DO i = nr3, nr3x 638 f_in_d( (k-1)*nr3x + i ) = (0.d0, 0.d0) 639 END DO 640 END DO 641 END IF 642 643 64420 CONTINUE 645 646 ENDIF 647 648 DEALLOCATE ( ncp_ ) 649 !CALL nvtxEndRangeAsync() 650 CALL stop_clock ('fft_scatt_yz') 651 652#endif 653 654 RETURN 65598 format ( 10 ('(',2f12.9,')') ) 65699 format ( 20 ('(',2f12.9,')') ) 657 658END SUBROUTINE fft_scatter_yz_gpu 659! 660!----------------------------------------------------------------------- 661SUBROUTINE fft_scatter_tg_gpu ( desc, f_in_d, f_aux_d, nxx_, isgn, stream ) 662 !----------------------------------------------------------------------- 663 ! 664 ! task group wavefunction redistribution 665 ! 666 ! a) (isgn >0 ) From many-wfc partial-plane arrangement to single-wfc whole-plane one 667 ! 668 ! b) (isgn <0 ) From single-wfc whole-plane arrangement to many-wfc partial-plane one 669 ! 670 ! in both cases: 671 ! f_in contains the input data, is overwritten with the desired output 672 ! f_aux is used as working array, may contain garbage in output 673 ! 674 USE cudafor 675 USE fft_buffers, ONLY : check_buffers_size, f_in => pin_space_scatter_in, & 676 f_aux=>pin_space_scatter_out 677 IMPLICIT NONE 678 679 TYPE (fft_type_descriptor), INTENT(in) :: desc 680 INTEGER, INTENT(in) :: nxx_, isgn 681 COMPLEX (DP), DEVICE, INTENT(inout) :: f_in_d (nxx_), f_aux_d (nxx_) 682 INTEGER(kind=cuda_stream_kind), INTENT(IN) :: stream ! cuda stream for the execution 683 684 INTEGER :: ierr 685 686 CALL start_clock ('fft_scatt_tg') 687 688 if ( abs (isgn) /= 3 ) call fftx_error__ ('fft_scatter_tg', 'wrong call', 1 ) 689 ! get pinned memory buffers for ALLTOALL, check allocation 690 CALL check_buffers_size(desc) 691#if defined(__MPI) 692 ! == OPTIMIZE, replace this with overlapped comunication on host and device, 693 ! or possibly use GPU MPI? 694 695 !f_aux(1:nxx_) = f_in_d(1:nxx_) 696 ierr = cudaMemcpyAsync( f_aux, f_in_d, nxx_, cudaMemcpyDeviceToHost, stream ) 697 ierr = cudaStreamSynchronize(stream) 698 if ( isgn > 0 ) then 699 700 CALL MPI_ALLTOALLV( f_aux, desc%tg_snd, desc%tg_sdsp, MPI_DOUBLE_COMPLEX, & 701 f_in, desc%tg_rcv, desc%tg_rdsp, MPI_DOUBLE_COMPLEX, desc%comm2, ierr) 702 IF( ierr /= 0 ) CALL fftx_error__( 'fft_scatter_tg', ' alltoall error 1 ', abs(ierr) ) 703 704 else 705 706 CALL MPI_ALLTOALLV( f_aux, desc%tg_rcv, desc%tg_rdsp, MPI_DOUBLE_COMPLEX, & 707 f_in, desc%tg_snd, desc%tg_sdsp, MPI_DOUBLE_COMPLEX, desc%comm2, ierr) 708 IF( ierr /= 0 ) CALL fftx_error__( 'fft_scatter_tg', ' alltoall error 2 ', abs(ierr) ) 709 end if 710 !f_in_d(1:nxx_) = f_in(1:nxx_) 711 ierr = cudaMemcpyAsync( f_in_d, f_in, nxx_, cudaMemcpyHostToDevice, stream ) 712 !ierr = cudaStreamSynchronize(stream) 713#endif 714 ! 715 CALL stop_clock ('fft_scatt_tg') 716 RETURN 71799 format ( 20 ('(',2f12.9,')') ) 718 719END SUBROUTINE fft_scatter_tg_gpu 720! 721!----------------------------------------------------------------------- 722SUBROUTINE fft_scatter_tg_opt_gpu ( desc, f_in_d, f_out_d, nxx_, isgn, stream ) 723 !----------------------------------------------------------------------- 724 ! 725 ! task group wavefunction redistribution 726 ! 727 ! a) (isgn >0 ) From many-wfc partial-plane arrangement to single-wfc whole-plane one 728 ! 729 ! b) (isgn <0 ) From single-wfc whole-plane arrangement to many-wfc partial-plane one 730 ! 731 ! in both cases: 732 ! f_in contains the input data 733 ! f_out contains the output data 734 ! 735 USE cudafor 736 USE fft_buffers, ONLY : check_buffers_size, f_in => pin_space_scatter_in, & 737 f_out=>pin_space_scatter_out 738 IMPLICIT NONE 739 740 TYPE (fft_type_descriptor), INTENT(in) :: desc 741 INTEGER, INTENT(in) :: nxx_, isgn 742 COMPLEX (DP), DEVICE, INTENT(inout) :: f_in_d (nxx_), f_out_d (nxx_) 743 INTEGER(kind=cuda_stream_kind), INTENT(IN) :: stream ! cuda stream for the execution 744 745 INTEGER :: ierr 746 747 CALL start_clock ('fft_scatt_tg') 748 749 if ( abs (isgn) /= 3 ) call fftx_error__ ('fft_scatter_tg', 'wrong call', 1 ) 750 ! 751 ! check host copy allocation of f_in and f_aux 752 CALL check_buffers_size(desc) 753 ierr = cudaMemcpyAsync( f_in, f_in_d, nxx_, cudaMemcpyDeviceToHost, stream ) 754 ierr = cudaStreamSynchronize(stream) 755#if defined(__MPI) 756 ! 757 if ( isgn > 0 ) then 758 759 CALL MPI_ALLTOALLV( f_in, desc%tg_snd, desc%tg_sdsp, MPI_DOUBLE_COMPLEX, & 760 f_out, desc%tg_rcv, desc%tg_rdsp, MPI_DOUBLE_COMPLEX, desc%comm2, ierr) 761 IF( ierr /= 0 ) CALL fftx_error__( 'fft_scatter_tg', ' alltoall error 1 ', abs(ierr) ) 762 763 else 764 765 CALL MPI_ALLTOALLV( f_in, desc%tg_rcv, desc%tg_rdsp, MPI_DOUBLE_COMPLEX, & 766 f_out, desc%tg_snd, desc%tg_sdsp, MPI_DOUBLE_COMPLEX, desc%comm2, ierr) 767 IF( ierr /= 0 ) CALL fftx_error__( 'fft_scatter_tg', ' alltoall error 2 ', abs(ierr) ) 768 end if 769 770#endif 771 !f_out_d(1:nxx_) = f_out(1:nxx_) 772 ierr = cudaMemcpyAsync( f_out_d, f_out, nxx_, cudaMemcpyHostToDevice, stream ) 773 !ierr = cudaStreamSynchronize(stream) 774 ! 775 CALL stop_clock ('fft_scatt_tg') 776 777 RETURN 77899 format ( 20 ('(',2f12.9,')') ) 779 780END SUBROUTINE fft_scatter_tg_opt_gpu 781 782#endif 783 784 785!----------------------------------------------------------------------- 786SUBROUTINE fft_scatter_many_yz_gpu ( desc, f_in_d, f_aux_d, nxx_, isgn, howmany ) 787 !----------------------------------------------------------------------- 788 ! 789 ! transpose of the fft yz planes across the desc%comm3 communicator 790 ! 791 ! a) From Z-oriented columns to Y-oriented colums (isgn > 0) 792 ! Active columns (or sticks or pencils) along the Z direction for each 793 ! processor are stored consecutively and are such that they correspond 794 ! to a subset of the active X values. 795 ! 796 ! The pencil -> slices transposition is performed in the subgroup 797 ! of processors (desc%comm3) owning these X values. 798 ! 799 ! The transpose takes place in two steps: 800 ! 1) on each processor the columns are sliced into sections along Z 801 ! that are stored one after the other. On each processor, slices for 802 ! processor "iproc3" are desc%nr3p(iproc3)*desc%nsw/nsp(me) big. 803 ! 2) all processors communicate to exchange slices (all columns with 804 ! Z in the slice belonging to "me" must be received, all the others 805 ! must be sent to "iproc3") 806 ! 807 ! Finally one gets the "slice" representation: each processor has 808 ! desc%nr3p(mype3) Z values of all the active pencils along Y for the 809 ! X values of the current group. Data are organized with the Y index 810 ! running fastest, then the reordered X values, then Z. 811 ! 812 ! f_in contains the input Z columns, is destroyed on output 813 ! f_aux contains the output Y colums. 814 ! 815 ! b) From planes to columns (isgn < 0) 816 ! 817 ! Quite the same in the opposite direction 818 ! f_aux contains the input Y columns, is destroyed on output 819 ! f_in contains the output Z columns. 820 ! 821 USE cudafor 822 !USE nvtx_fft 823 USE fft_buffers, ONLY : check_buffers_size, f_in => pin_space_scatter_in, & 824 f_aux=>pin_space_scatter_out 825 IMPLICIT NONE 826 827 TYPE (fft_type_descriptor), INTENT(in) :: desc 828 INTEGER, INTENT(in) :: nxx_, isgn 829 COMPLEX (DP), DEVICE, INTENT(inout) :: f_in_d (nxx_), f_aux_d (nxx_) 830 INTEGER, INTENT(IN) :: howmany 831 832#if defined(__MPI) 833 INTEGER, DEVICE, POINTER :: desc_ismap_d(:) 834 ! 835 INTEGER :: ierr, me, me2, me3, nproc3, iproc3, ncpx, nr3px, ip, ip0, me2_start, me2_end 836 INTEGER :: nr1x, nr2x, nr3, nr3x, nnr 837 INTEGER :: i, j, it, it0, k, kfrom, kdest, offset, ioff, mc, m1, m2, i1, sendsize, aux 838 INTEGER, ALLOCATABLE :: ncp_(:) 839 INTEGER, DEVICE, POINTER :: ir1p__d(:) 840 INTEGER :: my_nr1p_ 841 ! 842#if defined(__NON_BLOCKING_SCATTER) 843 INTEGER :: sh(desc%nproc3), rh(desc%nproc3) 844#endif 845 !CALL nvtxStartRangeAsync("fft_scatter_many_yz_gpu", isgn + 5) 846 ! 847 me = desc%mype + 1 848 me2 = desc%mype2 + 1 849 me3 = desc%mype3 + 1 850 nproc3 = desc%nproc3 851 852 ! This mapping improves readability but, most importantly, it is needed 853 ! in the cuf kernels (as of PGI 18.10) since otherwise, when variables from 854 ! `desc` appear in the body of the do loops, the compiler generates incorrect GPU code. 855 nr1x = desc%nr1x 856 nr2x = desc%nr2x 857 nr3 = desc%nr3 858 nr3x = desc%nr3x 859 nnr = desc%nnr 860 861 ! allocate auxiliary array for columns distribution 862 ALLOCATE ( ncp_( desc%nproc) ) 863 ! 864 me2_start = me2 ; me2_end = me2 865 if ( abs (isgn) == 1 ) then ! It's a potential FFT 866 ncp_ = desc%nsp 867 my_nr1p_ = count (desc%ir1p > 0) 868 ir1p__d => desc%ir1p_d 869 else if ( abs (isgn) == 2 ) then ! It's a wavefunction FFT 870 ncp_ = desc%nsw 871 my_nr1p_ = count (desc%ir1w > 0) 872 ir1p__d => desc%ir1w_d 873 else if ( abs (isgn) == 3 ) then ! It's a wavefunction FFT with task group 874 print *, "ERRORE, this should never happen!" 875 end if 876 ! 877 CALL start_clock ('fft_scatt_many_yz') 878 ! 879 ! calculate the message size 880 ! 881 nr3px = MAXVAL ( desc%nr3p ) ! maximum number of Z values to be exchanged 882 ncpx = MAXVAL ( ncp_ ) ! maximum number of Z columns to be exchanged 883 ! 884 sendsize = howmany * ncpx * nr3px ! dimension of the scattered chunks 885 ! 886 ! check dimensions of f_in and f_aux 887 CALL check_buffers_size(desc, howmany) 888 desc_ismap_d => desc%ismap_d 889 ! 890 ierr = 0 891 IF (isgn.gt.0) THEN 892 893 IF (nproc3==1) GO TO 10 894 ! 895 ! "forward" scatter from columns to planes 896 ! 897 ! step one: store contiguously the slices 898 ! 899 offset = 0 900 DO iproc3 = 1, nproc3 901 kdest = ( iproc3 - 1 ) * sendsize 902 kfrom = offset 903 ! 904 ierr = cudaMemcpy2D( f_aux(kdest + 1), nr3px, f_in_d(kfrom + 1 ), nr3x, desc%nr3p( iproc3 ), howmany*ncpx, cudaMemcpyDeviceToHost) 905 ! 906 offset = offset + desc%nr3p( iproc3 ) 907 ENDDO 908 ! 909 ! ensures that no garbage is present in the output 910 ! useless; the later accessed elements are overwritten by the A2A step 911 ! 912 ! step two: communication across the nproc3 group 913 ! 914 915 CALL mpi_alltoall (f_aux(1), sendsize, MPI_DOUBLE_COMPLEX, f_in(1), & 916 sendsize, MPI_DOUBLE_COMPLEX, desc%comm3, ierr) 917 918 IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) ) 919 ! 920 !f_in_d(1:nxx_) = f_in(1:nxx_) 921 ierr = cudaMemcpy( f_in_d, f_in, nxx_, cudaMemcpyHostToDevice) 922 ! 92310 CONTINUE 924 ! 925 !f_aux = (0.0_DP, 0.0_DP) ! 926 !$cuf kernel do (1) <<<*,*>>> 927 do i = 1, nxx_ 928 f_aux_d(i) = (0.d0, 0.d0) 929 end do 930 ! 931 DO iproc3 = 1, desc%nproc3 932 it0 = ( iproc3 - 1 ) * sendsize 933 934 ioff = desc%iss(iproc3) 935 aux = ncp_( iproc3 ) 936 937!$cuf kernel do(3) <<<*,*>>> 938 DO j=0, howmany-1 939 DO i = 1, aux ! was ncp_(iproc3) 940 DO k = 1, desc%my_nr3p 941 it = it0 + (i-1)*nr3px + j*ncpx*nr3px !desc%nnr !aux*nr3px 942 mc = desc_ismap_d( i + ioff ) ! this is m1+(m2-1)*nr1x of the current pencil 943 m1 = mod (mc-1,nr1x) + 1 ; m2 = (mc-1)/nr1x + 1 944 i1 = m2 + ( ir1p__d(m1) - 1 ) * nr2x + (k-1)*nr2x*my_nr1p_ 945 946 f_aux_d( i1 + j*nnr ) = f_in_d( k + it ) 947 !i1 = i1 + nr2x*my_nr1p_ 948 ENDDO 949 ENDDO 950 !it0 = it0 + ncp_( ip )*nr3px 951 ENDDO 952 ENDDO 953 954 ELSE 955 ! 956 ! "backward" scatter from planes to columns 957 ! 958 DO iproc3 = 1, nproc3 959 it0 = ( iproc3 - 1 ) * sendsize 960 961 ip = desc%iproc(me2, iproc3) 962 ioff = desc%iss(ip) 963 aux = ncp_( ip ) 964!$cuf kernel do(3) <<<*,*>>> 965 DO j = 0, howmany - 1 966 DO i = 1, aux 967 DO k = 1, desc%my_nr3p 968 it = it0 + (i-1)*nr3px + j*ncpx*nr3px 969 mc = desc_ismap_d( i + ioff ) ! this is m1+(m2-1)*nr1x of the current pencil 970 m1 = mod (mc-1,nr1x) + 1 ; m2 = (mc-1)/nr1x + 1 971 i1 = m2 + ( ir1p__d(m1) - 1 ) * nr2x + (k-1)*(nr2x * my_nr1p_) 972 973 f_in_d( k + it ) = f_aux_d( i1 + j*nnr ) 974 !i1 = i1 + desc%nr2x * my_nr1p_ 975 ENDDO 976 !it = it + nr3px 977 ENDDO 978 !it0 = it0 + ncp_( ip )*nr3px 979 ENDDO 980 ENDDO 981 982 IF( nproc3 == 1 ) GO TO 20 983 ! 984 ! 985 ! step two: communication 986 ! 987 ierr = cudaMemcpy( f_in, f_in_d, nxx_, cudaMemcpyDeviceToHost ) 988 989 CALL mpi_alltoall (f_in(1), sendsize, MPI_DOUBLE_COMPLEX, f_aux(1), & 990 sendsize, MPI_DOUBLE_COMPLEX, desc%comm3, ierr) 991 992 IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) ) 993 994 995 ! 996 ! step one: store contiguously the columns 997 ! 998 offset = 0 999 DO iproc3 = 1, nproc3 1000 kdest = ( iproc3 - 1 ) * sendsize 1001 kfrom = offset 1002 ierr = cudaMemcpy2D( f_in_d(kfrom +1 ), nr3x, f_aux(kdest + 1), nr3px, desc%nr3p( iproc3 ), howmany * ncpx, cudaMemcpyHostToDevice) 1003 ! 1004 offset = offset + desc%nr3p( iproc3 ) 1005 ENDDO 1006 1007 ! clean extra array elements in each stick 1008 1009 IF( nr3x /= nr3 ) THEN 1010 aux = ncp_ ( desc%mype+1 ) 1011!$cuf kernel do(3) <<<*,*>>> 1012 DO j=0, howmany-1 1013 DO k = 1, aux 1014 DO i = nr3, nr3x 1015 f_in_d( j*ncpx*nr3x + (k-1)*nr3x + i) = 0.0d0 1016 END DO 1017 END DO 1018 END DO 1019 END IF 1020 102120 CONTINUE 1022 1023 ENDIF 1024 1025 DEALLOCATE ( ncp_ ) 1026 !CALL nvtxEndRangeAsync() 1027 CALL stop_clock ('fft_scatt_many_yz') 1028 1029#endif 1030 1031 RETURN 103298 format ( 10 ('(',2f12.9,')') ) 103399 format ( 20 ('(',2f12.9,')') ) 1034 1035END SUBROUTINE fft_scatter_many_yz_gpu 1036 1037!=----------------------------------------------------------------------=! 1038END MODULE fft_scatter_gpu 1039!=----------------------------------------------------------------------=! 1040 1041! defined (__CUDA) 1042#endif 1043