1! 2! Copyright (C) 2002-2013 Quantum ESPRESSO group 3! This file is distributed under the terms of the 4! GNU General Public License. See the file `License' 5! in the root directory of the present distribution, 6! or http://www.gnu.org/copyleft/gpl.txt . 7! 8! This module contains interfaces to most low-level MPI operations: 9! initialization and stopping, broadcast, parallel sum, etc. 10! 11!------------------------------------------------------------------------------! 12MODULE mp 13!------------------------------------------------------------------------------! 14 USE util_param, ONLY : DP, stdout, i8b 15 USE parallel_include 16#if defined(__CUDA) 17 USE cudafor, ONLY : cudamemcpy, cudamemcpy2d, & 18 & cudaMemcpyDeviceToDevice, & 19 & cudaDeviceSynchronize 20#endif 21 ! 22 IMPLICIT NONE 23 PRIVATE 24 ! 25 PUBLIC :: mp_start, mp_abort, mp_stop, mp_end, & 26 mp_bcast, mp_sum, mp_max, mp_min, mp_rank, mp_size, & 27 mp_gather, mp_alltoall, mp_get, mp_put, & 28 mp_barrier, mp_report, mp_group_free, & 29 mp_root_sum, mp_comm_free, mp_comm_create, mp_comm_group, & 30 mp_group_create, mp_comm_split, mp_set_displs, & 31 mp_circular_shift_left, mp_circular_shift_left_start, & 32 mp_get_comm_null, mp_get_comm_self, mp_count_nodes, & 33 mp_type_create_column_section, mp_type_create_row_section, mp_type_free, & 34 mp_allgather, mp_waitall, mp_testall 35 ! 36 INTERFACE mp_bcast 37 MODULE PROCEDURE mp_bcast_i1, mp_bcast_r1, mp_bcast_c1, & 38 mp_bcast_z, mp_bcast_zv, & 39 mp_bcast_iv, mp_bcast_i8v, mp_bcast_rv, mp_bcast_cv, mp_bcast_l, mp_bcast_rm, & 40 mp_bcast_cm, mp_bcast_im, mp_bcast_it, mp_bcast_i4d, mp_bcast_rt, mp_bcast_lv, & 41 mp_bcast_lm, mp_bcast_r4d, mp_bcast_r5d, mp_bcast_ct, mp_bcast_c4d,& 42 mp_bcast_c5d, mp_bcast_c6d 43#if defined(__CUDA) 44 MODULE PROCEDURE mp_bcast_i1_gpu, mp_bcast_r1_gpu, mp_bcast_c1_gpu, & 45 !mp_bcast_z_gpu, mp_bcast_zv_gpu, & 46 mp_bcast_iv_gpu, mp_bcast_rv_gpu, mp_bcast_cv_gpu, mp_bcast_l_gpu, mp_bcast_rm_gpu, & 47 mp_bcast_cm_gpu, mp_bcast_im_gpu, mp_bcast_it_gpu, mp_bcast_i4d_gpu, mp_bcast_rt_gpu, mp_bcast_lv_gpu, & 48 mp_bcast_lm_gpu, mp_bcast_r4d_gpu, mp_bcast_r5d_gpu, mp_bcast_ct_gpu, mp_bcast_c4d_gpu,& 49 mp_bcast_c5d_gpu, mp_bcast_c6d_gpu 50#endif 51 END INTERFACE 52 ! 53 INTERFACE mp_sum 54 MODULE PROCEDURE mp_sum_i1, mp_sum_iv, mp_sum_i8v, mp_sum_im, mp_sum_it, mp_sum_i4, mp_sum_i5, & 55 mp_sum_r1, mp_sum_rv, mp_sum_rm, mp_sum_rt, mp_sum_r4d, & 56 mp_sum_c1, mp_sum_cv, mp_sum_cm, mp_sum_ct, mp_sum_c4d, & 57 mp_sum_c5d, mp_sum_c6d, mp_sum_rmm, mp_sum_cmm, mp_sum_r5d, & 58 mp_sum_r6d 59#if defined(__CUDA) 60 MODULE PROCEDURE mp_sum_i1_gpu, mp_sum_iv_gpu, mp_sum_im_gpu, mp_sum_it_gpu, & 61 mp_sum_r1_gpu, mp_sum_rv_gpu, mp_sum_rm_gpu, mp_sum_rt_gpu, mp_sum_r4d_gpu, & 62 mp_sum_c1_gpu, mp_sum_cv_gpu, mp_sum_cm_gpu, mp_sum_ct_gpu, mp_sum_c4d_gpu, & 63 mp_sum_c5d_gpu, mp_sum_c6d_gpu, mp_sum_rmm_gpu, mp_sum_cmm_gpu, mp_sum_r5d_gpu, & 64 mp_sum_r6d_gpu 65#endif 66 END INTERFACE 67 ! 68 INTERFACE mp_root_sum 69 MODULE PROCEDURE mp_root_sum_rm, mp_root_sum_cm 70#if defined(__CUDA) 71 MODULE PROCEDURE mp_root_sum_rm_gpu, mp_root_sum_cm_gpu 72#endif 73 END INTERFACE 74 ! 75 INTERFACE mp_get 76 MODULE PROCEDURE mp_get_r1, mp_get_rv, mp_get_cv, mp_get_i1, mp_get_iv, mp_get_rm, mp_get_cm 77#if defined(__CUDA) 78 MODULE PROCEDURE mp_get_r1_gpu, mp_get_rv_gpu, mp_get_cv_gpu, mp_get_i1_gpu, mp_get_iv_gpu, & 79 mp_get_rm_gpu, mp_get_cm_gpu 80#endif 81 END INTERFACE 82 ! 83 INTERFACE mp_put 84 MODULE PROCEDURE mp_put_rv, mp_put_cv, mp_put_i1, mp_put_iv, & 85 mp_put_rm 86#if defined(__CUDA) 87 MODULE PROCEDURE mp_put_rv_gpu, mp_put_cv_gpu, mp_put_i1_gpu, mp_put_iv_gpu, & 88 mp_put_rm_gpu 89#endif 90 END INTERFACE 91 ! 92 INTERFACE mp_max 93 MODULE PROCEDURE mp_max_i, mp_max_r, mp_max_rv, mp_max_iv 94#if defined(__CUDA) 95 MODULE PROCEDURE mp_max_i_gpu, mp_max_r_gpu, mp_max_rv_gpu, mp_max_iv_gpu 96#endif 97 END INTERFACE 98 ! 99 INTERFACE mp_min 100 MODULE PROCEDURE mp_min_i, mp_min_r, mp_min_rv, mp_min_iv 101#if defined(__CUDA) 102 MODULE PROCEDURE mp_min_i_gpu, mp_min_r_gpu, mp_min_rv_gpu, mp_min_iv_gpu 103#endif 104 END INTERFACE 105 ! 106 INTERFACE mp_gather 107 MODULE PROCEDURE mp_gather_i1, mp_gather_iv, mp_gatherv_rv, mp_gatherv_iv, & 108 mp_gatherv_rm, mp_gatherv_im, mp_gatherv_cv, & 109 mp_gatherv_inplace_cplx_array 110#if defined(__CUDA) 111 MODULE PROCEDURE mp_gather_i1_gpu, mp_gather_iv_gpu, mp_gatherv_rv_gpu, mp_gatherv_iv_gpu, & 112 mp_gatherv_rm_gpu, mp_gatherv_im_gpu, mp_gatherv_cv_gpu, mp_gatherv_inplace_cplx_array_gpu 113#endif 114 END INTERFACE 115 ! 116 INTERFACE mp_allgather 117 MODULE PROCEDURE mp_allgatherv_inplace_cplx_array 118 MODULE PROCEDURE mp_allgatherv_inplace_real_array 119#if defined(__CUDA) 120 MODULE PROCEDURE mp_allgatherv_inplace_cplx_array_gpu 121#endif 122 END INTERFACE 123 ! 124 INTERFACE mp_alltoall 125 MODULE PROCEDURE mp_alltoall_c3d, mp_alltoall_i3d 126#if defined(__CUDA) 127 MODULE PROCEDURE mp_alltoall_c3d_gpu, mp_alltoall_i3d_gpu 128#endif 129 END INTERFACE 130 ! 131 INTERFACE mp_circular_shift_left 132 MODULE PROCEDURE mp_circular_shift_left_i0, & 133 mp_circular_shift_left_i1, & 134 mp_circular_shift_left_i2, & 135 mp_circular_shift_left_r2d, & 136 mp_circular_shift_left_c2d 137#if defined(__CUDA) 138 MODULE PROCEDURE mp_circular_shift_left_i0_gpu, & 139 mp_circular_shift_left_i1_gpu, & 140 mp_circular_shift_left_i2_gpu, & 141 mp_circular_shift_left_r2d_gpu, & 142 mp_circular_shift_left_c2d_gpu 143#endif 144 END INTERFACE 145 ! 146 INTERFACE mp_circular_shift_left_start 147 MODULE PROCEDURE mp_circular_shift_left_start_i0, & 148 mp_circular_shift_left_start_i1, & 149 mp_circular_shift_left_start_i2, & 150 mp_circular_shift_left_start_r2d, & 151 mp_circular_shift_left_start_c2d 152 END INTERFACE 153 ! 154 INTERFACE mp_type_create_column_section 155 MODULE PROCEDURE mp_type_create_cplx_column_section 156 MODULE PROCEDURE mp_type_create_real_column_section 157#if defined(__CUDA) 158 MODULE PROCEDURE mp_type_create_cplx_column_section_gpu 159 MODULE PROCEDURE mp_type_create_real_column_section_gpu 160#endif 161 END INTERFACE 162 163 INTERFACE mp_type_create_row_section 164 MODULE PROCEDURE mp_type_create_cplx_row_section 165 MODULE PROCEDURE mp_type_create_real_row_section 166#if defined(__CUDA) 167 MODULE PROCEDURE mp_type_create_cplx_row_section_gpu 168 MODULE PROCEDURE mp_type_create_real_row_section_gpu 169#endif 170 END INTERFACE 171!------------------------------------------------------------------------------! 172! 173 CONTAINS 174! 175!------------------------------------------------------------------------------! 176! 177!------------------------------------------------------------------------------! 178!..mp_gather_i1 179 SUBROUTINE mp_gather_i1(mydata, alldata, root, gid) 180 IMPLICIT NONE 181 INTEGER, INTENT(IN) :: mydata, root 182 INTEGER, INTENT(IN) :: gid 183 INTEGER :: group 184 INTEGER, INTENT(OUT) :: alldata(:) 185 INTEGER :: ierr 186 187 188#if defined (__MPI) 189 group = gid 190 CALL MPI_GATHER(mydata, 1, MPI_INTEGER, alldata, 1, MPI_INTEGER, root, group, IERR) 191 IF (ierr/=0) CALL mp_stop( 8013 ) 192#else 193 alldata(1) = mydata 194#endif 195 RETURN 196 END SUBROUTINE mp_gather_i1 197 198!------------------------------------------------------------------------------! 199!..mp_gather_iv 200!..Carlo Cavazzoni 201 SUBROUTINE mp_gather_iv(mydata, alldata, root, gid) 202 IMPLICIT NONE 203 INTEGER, INTENT(IN) :: mydata(:), root 204 INTEGER, INTENT(IN) :: gid 205 INTEGER :: group 206 INTEGER, INTENT(OUT) :: alldata(:,:) 207 INTEGER :: msglen, ierr 208 209 210#if defined (__MPI) 211 msglen = SIZE(mydata) 212 IF( msglen .NE. SIZE(alldata, 1) ) CALL mp_stop( 8014 ) 213 group = gid 214 CALL MPI_GATHER(mydata, msglen, MPI_INTEGER, alldata, msglen, MPI_INTEGER, root, group, IERR) 215 IF (ierr/=0) CALL mp_stop( 8014 ) 216#else 217 msglen = SIZE(mydata) 218 IF( msglen .NE. SIZE(alldata, 1) ) CALL mp_stop( 8014 ) 219 alldata(:,1) = mydata(:) 220#endif 221 RETURN 222 END SUBROUTINE mp_gather_iv 223 224! 225!------------------------------------------------------------------------------! 226!..mp_start 227 SUBROUTINE mp_start(numtask, taskid, group) 228 229! ... 230 IMPLICIT NONE 231 INTEGER, INTENT (OUT) :: numtask, taskid 232 INTEGER, INTENT (IN) :: group 233 INTEGER :: ierr 234! ... 235 ierr = 0 236 numtask = 1 237 taskid = 0 238 239# if defined(__MPI) 240 IF (ierr/=0) CALL mp_stop( 8004 ) 241 CALL mpi_comm_rank(group,taskid,ierr) 242 IF (ierr/=0) CALL mp_stop( 8005 ) 243 CALL mpi_comm_size(group,numtask,ierr) 244 IF (ierr/=0) CALL mp_stop( 8006 ) 245! ... 246 CALL allocate_buffers() 247#if defined(__CUDA) 248 CALL allocate_buffers_gpu() 249#endif 250#endif 251 RETURN 252 END SUBROUTINE mp_start 253! 254!------------------------------------------------------------------------------! 255!..mp_abort 256 257 SUBROUTINE mp_abort(errorcode,gid) 258 IMPLICIT NONE 259 INTEGER :: ierr 260 INTEGER, INTENT(IN):: errorcode, gid 261#if defined(__MPI) 262 CALL deallocate_buffers() 263#if defined(__CUDA) 264 CALL deallocate_buffers_gpu() 265#endif 266 CALL mpi_abort(gid, errorcode, ierr) 267#endif 268 END SUBROUTINE mp_abort 269! 270!------------------------------------------------------------------------------! 271!..mp_end 272 273 SUBROUTINE mp_end(groupid, cleanup) 274 IMPLICIT NONE 275 INTEGER, INTENT(IN) :: groupid 276 LOGICAL, OPTIONAL, INTENT(IN) :: cleanup 277 INTEGER :: ierr, taskid 278 LOGICAL :: cleanup_ 279 280 ierr = 0 281 taskid = 0 282 283#if defined(__MPI) 284 CALL mpi_comm_rank( groupid, taskid, ierr) 285 cleanup_ = .FALSE. 286 IF (PRESENT(cleanup)) cleanup_ = cleanup 287 IF(cleanup_) THEN 288 CALL deallocate_buffers() 289#if defined(__CUDA) 290 CALL deallocate_buffers_gpu() 291#endif 292 END IF 293#endif 294 RETURN 295 END SUBROUTINE mp_end 296 297!------------------------------------------------------------------------------! 298!..mp_group 299 300 SUBROUTINE mp_comm_group( comm, group ) 301 IMPLICIT NONE 302 INTEGER, INTENT (IN) :: comm 303 INTEGER, INTENT (OUT) :: group 304 INTEGER :: ierr 305 ierr = 0 306#if defined(__MPI) 307 CALL mpi_comm_group( comm, group, ierr ) 308 IF (ierr/=0) CALL mp_stop( 8007 ) 309#else 310 group = 0 311#endif 312 END SUBROUTINE mp_comm_group 313 314 SUBROUTINE mp_comm_split( old_comm, color, key, new_comm ) 315 IMPLICIT NONE 316 INTEGER, INTENT (IN) :: old_comm 317 INTEGER, INTENT (IN) :: color, key 318 INTEGER, INTENT (OUT) :: new_comm 319 INTEGER :: ierr 320 ierr = 0 321#if defined(__MPI) 322 CALL MPI_COMM_SPLIT( old_comm, color, key, new_comm, ierr ) 323 IF (ierr/=0) CALL mp_stop( 8008 ) 324#else 325 new_comm = old_comm 326#endif 327 END SUBROUTINE mp_comm_split 328 329 330 SUBROUTINE mp_group_create( group_list, group_size, old_grp, new_grp ) 331 IMPLICIT NONE 332 INTEGER, INTENT (IN) :: group_list(:), group_size, old_grp 333 INTEGER, INTENT (OUT) :: new_grp 334 INTEGER :: ierr 335 336 ierr = 0 337 new_grp = old_grp 338#if defined(__MPI) 339 CALL mpi_group_incl( old_grp, group_size, group_list, new_grp, ierr ) 340 IF (ierr/=0) CALL mp_stop( 8009 ) 341#endif 342 END SUBROUTINE mp_group_create 343 344!------------------------------------------------------------------------------! 345 SUBROUTINE mp_comm_create( old_comm, new_grp, new_comm ) 346 IMPLICIT NONE 347 INTEGER, INTENT (IN) :: old_comm 348 INTEGER, INTENT (IN) :: new_grp 349 INTEGER, INTENT (OUT) :: new_comm 350 INTEGER :: ierr 351 352 ierr = 0 353 new_comm = old_comm 354#if defined(__MPI) 355 CALL mpi_comm_create( old_comm, new_grp, new_comm, ierr ) 356 IF (ierr/=0) CALL mp_stop( 8010 ) 357#endif 358 END SUBROUTINE mp_comm_create 359 360!------------------------------------------------------------------------------! 361!..mp_group_free 362 SUBROUTINE mp_group_free( group ) 363 IMPLICIT NONE 364 INTEGER, INTENT (INOUT) :: group 365 INTEGER :: ierr 366 ierr = 0 367#if defined(__MPI) 368 CALL mpi_group_free( group, ierr ) 369 IF (ierr/=0) CALL mp_stop( 8011 ) 370#endif 371 END SUBROUTINE mp_group_free 372!------------------------------------------------------------------------------! 373 374 SUBROUTINE mp_comm_free( comm ) 375 IMPLICIT NONE 376 INTEGER, INTENT (INOUT) :: comm 377 INTEGER :: ierr 378 ierr = 0 379#if defined(__MPI) 380 IF( comm /= MPI_COMM_NULL ) THEN 381 CALL mpi_comm_free( comm, ierr ) 382 IF (ierr/=0) CALL mp_stop( 8012 ) 383 END IF 384#endif 385 RETURN 386 END SUBROUTINE mp_comm_free 387 388!------------------------------------------------------------------------------! 389! non-blocking helpers 390! waits till all request are completed 391 SUBROUTINE mp_waitall(requests) 392! ... 393 IMPLICIT NONE 394 INTEGER, INTENT (INOUT) :: requests(:) 395 INTEGER :: ierr 396#if defined(__MPI) 397 INTEGER :: istatus(MPI_STATUS_SIZE, size(requests)) 398#endif 399 ierr = 0 400#if defined(__MPI) 401 call MPI_Waitall(size(requests), requests, istatus, ierr) 402 IF (ierr/=0) CALL mp_stop( 8004 ) 403#endif 404 RETURN 405 END SUBROUTINE mp_waitall 406 407!tests all requests 408 SUBROUTINE mp_testall(requests, flag) 409 ! ... 410 IMPLICIT NONE 411 INTEGER, INTENT (INOUT) :: requests(:) 412 INTEGER :: ierr 413#if defined(__MPI) 414 INTEGER :: istatus(MPI_STATUS_SIZE, size(requests)) 415#endif 416 LOGICAL, INTENT(OUT):: flag 417 ! 418 ierr = 0 419 flag = .FALSE. 420#if defined(__MPI) 421 call MPI_Testall(size(requests), requests, flag, istatus, ierr) 422 IF (ierr/=0) CALL mp_stop( 8004 ) 423#else 424 flag = .TRUE. 425#endif 426 RETURN 427 END SUBROUTINE mp_testall 428 429!------------------------------------------------------------------------------! 430!..mp_bcast 431 SUBROUTINE mp_bcast_i1(msg,source,gid) 432 IMPLICIT NONE 433 INTEGER :: msg 434 INTEGER :: source 435 INTEGER, INTENT(IN) :: gid 436 INTEGER :: group 437 INTEGER :: msglen 438 439#if defined(__MPI) 440 msglen = 1 441 group = gid 442 CALL bcast_integer( msg, msglen, source, group ) 443#endif 444 END SUBROUTINE mp_bcast_i1 445 ! 446 !------------------------------------------------------------------------------! 447 SUBROUTINE mp_bcast_iv(msg, source, gid) 448 !------------------------------------------------------------------------------! 449 !! 450 !! Bcast an integer vector 451 !! 452 IMPLICIT NONE 453 ! 454 INTEGER :: msg(:) 455 INTEGER, INTENT(in) :: source 456 INTEGER, INTENT(in) :: gid 457#if defined(__MPI) 458 INTEGER :: msglen 459 msglen = SIZE(msg) 460 CALL bcast_integer(msg, msglen, source, gid) 461#endif 462 !------------------------------------------------------------------------------! 463 END SUBROUTINE mp_bcast_iv 464 !------------------------------------------------------------------------------! 465 ! 466 !------------------------------------------------------------------------------! 467 SUBROUTINE mp_bcast_i8v(msg, source, gid) 468 !------------------------------------------------------------------------------! 469 !! 470 !! Bcast an integer vector of kind i8b. 471 !! 472 IMPLICIT NONE 473 ! 474 INTEGER(KIND = i8b) :: msg(:) 475 INTEGER, INTENT(in) :: source 476 INTEGER, INTENT(in) :: gid 477#if defined(__MPI) 478 INTEGER :: msglen 479 msglen = SIZE(msg) 480 CALL bcast_integer8(msg, msglen, source, gid) 481#endif 482 !------------------------------------------------------------------------------! 483 END SUBROUTINE mp_bcast_i8v 484 !------------------------------------------------------------------------------! 485 ! 486 !------------------------------------------------------------------------------! 487 SUBROUTINE mp_bcast_im(msg, source, gid) 488 !------------------------------------------------------------------------------! 489 IMPLICIT NONE 490 INTEGER :: msg(:,:) 491 INTEGER, INTENT(IN) :: source 492 INTEGER, INTENT(IN) :: gid 493#if defined(__MPI) 494 INTEGER :: msglen 495 msglen = size(msg) 496 CALL bcast_integer(msg, msglen, source, gid) 497#endif 498 END SUBROUTINE mp_bcast_im 499! 500!------------------------------------------------------------------------------! 501! 502! Carlo Cavazzoni 503! 504 SUBROUTINE mp_bcast_it( msg, source, gid ) 505 IMPLICIT NONE 506 INTEGER :: msg(:,:,:) 507 INTEGER, INTENT(IN) :: source 508 INTEGER, INTENT(IN) :: gid 509#if defined(__MPI) 510 INTEGER :: msglen 511 msglen = size(msg) 512 CALL bcast_integer( msg, msglen, source, gid ) 513#endif 514 END SUBROUTINE mp_bcast_it 515! 516!------------------------------------------------------------------------------! 517! 518! Samuel Ponce 519! 520 SUBROUTINE mp_bcast_i4d(msg, source, gid) 521 IMPLICIT NONE 522 INTEGER :: msg(:,:,:,:) 523 INTEGER, INTENT(IN) :: source 524 INTEGER, INTENT(IN) :: gid 525#if defined(__MPI) 526 INTEGER :: msglen 527 msglen = size(msg) 528 CALL bcast_integer( msg, msglen, source, gid ) 529#endif 530 END SUBROUTINE mp_bcast_i4d 531! 532!------------------------------------------------------------------------------! 533! 534 SUBROUTINE mp_bcast_r1( msg, source, gid ) 535 IMPLICIT NONE 536 REAL (DP) :: msg 537 INTEGER, INTENT(IN) :: source 538 INTEGER, INTENT(IN) :: gid 539#if defined(__MPI) 540 INTEGER :: msglen 541 msglen = 1 542 CALL bcast_real( msg, msglen, source, gid ) 543#endif 544 END SUBROUTINE mp_bcast_r1 545! 546!------------------------------------------------------------------------------! 547! 548 SUBROUTINE mp_bcast_rv(msg,source,gid) 549 IMPLICIT NONE 550 REAL (DP) :: msg(:) 551 INTEGER, INTENT(IN) :: source 552 INTEGER, INTENT(IN) :: gid 553#if defined(__MPI) 554 INTEGER :: msglen 555 msglen = size(msg) 556 CALL bcast_real( msg, msglen, source, gid ) 557#endif 558 END SUBROUTINE mp_bcast_rv 559! 560!------------------------------------------------------------------------------! 561! 562 SUBROUTINE mp_bcast_rm(msg,source,gid) 563 IMPLICIT NONE 564 REAL (DP) :: msg(:,:) 565 INTEGER, INTENT(IN) :: source 566 INTEGER, INTENT(IN) :: gid 567#if defined(__MPI) 568 INTEGER :: msglen 569 msglen = size(msg) 570 CALL bcast_real( msg, msglen, source, gid ) 571#endif 572 END SUBROUTINE mp_bcast_rm 573! 574!------------------------------------------------------------------------------! 575! 576! Carlo Cavazzoni 577! 578 SUBROUTINE mp_bcast_rt(msg,source,gid) 579 IMPLICIT NONE 580 REAL (DP) :: msg(:,:,:) 581 INTEGER, INTENT(IN) :: source 582 INTEGER, INTENT(IN) :: gid 583#if defined(__MPI) 584 INTEGER :: msglen 585 msglen = size(msg) 586 CALL bcast_real( msg, msglen, source, gid ) 587#endif 588 END SUBROUTINE mp_bcast_rt 589! 590!------------------------------------------------------------------------------! 591! 592! Carlo Cavazzoni 593! 594 SUBROUTINE mp_bcast_r4d(msg, source, gid) 595 IMPLICIT NONE 596 REAL (DP) :: msg(:,:,:,:) 597 INTEGER, INTENT(IN) :: source 598 INTEGER, INTENT(IN) :: gid 599#if defined(__MPI) 600 INTEGER :: msglen 601 msglen = size(msg) 602 CALL bcast_real( msg, msglen, source, gid ) 603#endif 604 END SUBROUTINE mp_bcast_r4d 605 606! 607!------------------------------------------------------------------------------! 608! 609! Carlo Cavazzoni 610! 611 SUBROUTINE mp_bcast_r5d(msg, source, gid) 612 IMPLICIT NONE 613 REAL (DP) :: msg(:,:,:,:,:) 614 INTEGER, INTENT(IN) :: source 615 INTEGER, INTENT(IN) :: gid 616#if defined(__MPI) 617 INTEGER :: msglen 618 msglen = size(msg) 619 CALL bcast_real( msg, msglen, source, gid ) 620#endif 621 END SUBROUTINE mp_bcast_r5d 622 623!------------------------------------------------------------------------------! 624! 625 SUBROUTINE mp_bcast_c1(msg,source,gid) 626 IMPLICIT NONE 627 COMPLEX (DP) :: msg 628 INTEGER, INTENT(IN) :: source 629 INTEGER, INTENT(IN) :: gid 630#if defined(__MPI) 631 INTEGER :: msglen 632 msglen = 1 633 CALL bcast_real( msg, 2 * msglen, source, gid ) 634#endif 635 END SUBROUTINE mp_bcast_c1 636! 637!------------------------------------------------------------------------------! 638 SUBROUTINE mp_bcast_cv(msg,source,gid) 639 IMPLICIT NONE 640 COMPLEX (DP) :: msg(:) 641 INTEGER, INTENT(IN) :: source 642 INTEGER, INTENT(IN) :: gid 643#if defined(__MPI) 644 INTEGER :: msglen 645 msglen = size(msg) 646 CALL bcast_real( msg, 2 * msglen, source, gid ) 647#endif 648 END SUBROUTINE mp_bcast_cv 649! 650!------------------------------------------------------------------------------! 651 SUBROUTINE mp_bcast_cm(msg,source,gid) 652 IMPLICIT NONE 653 COMPLEX (DP) :: msg(:,:) 654 INTEGER, INTENT(IN) :: source 655 INTEGER, INTENT(IN) :: gid 656#if defined(__MPI) 657 INTEGER :: msglen 658 msglen = size(msg) 659 CALL bcast_real( msg, 2 * msglen, source, gid ) 660#endif 661 END SUBROUTINE mp_bcast_cm 662! 663!------------------------------------------------------------------------------! 664 SUBROUTINE mp_bcast_ct(msg,source,gid) 665 IMPLICIT NONE 666 COMPLEX (DP) :: msg(:,:,:) 667 INTEGER, INTENT(IN) :: source 668 INTEGER, INTENT(IN) :: gid 669#if defined(__MPI) 670 INTEGER :: msglen 671 msglen = size(msg) 672 CALL bcast_real( msg, 2 * msglen, source, gid ) 673#endif 674 END SUBROUTINE mp_bcast_ct 675 676! 677!------------------------------------------------------------------------------! 678 SUBROUTINE mp_bcast_c4d(msg,source,gid) 679 IMPLICIT NONE 680 COMPLEX (DP) :: msg(:,:,:,:) 681 INTEGER, INTENT(IN) :: source 682 INTEGER, INTENT(IN) :: gid 683#if defined(__MPI) 684 INTEGER :: msglen 685 msglen = size(msg) 686 CALL bcast_real( msg, 2 * msglen, source, gid ) 687#endif 688 END SUBROUTINE mp_bcast_c4d 689 690 SUBROUTINE mp_bcast_c5d(msg,source,gid) 691 IMPLICIT NONE 692 COMPLEX (DP) :: msg(:,:,:,:,:) 693 INTEGER, INTENT(IN) :: source 694 INTEGER, INTENT(IN) :: gid 695#if defined(__MPI) 696 INTEGER :: msglen 697 msglen = size(msg) 698 CALL bcast_real( msg, 2 * msglen, source, gid ) 699#endif 700 END SUBROUTINE mp_bcast_c5d 701 702 SUBROUTINE mp_bcast_c6d(msg,source,gid) 703 IMPLICIT NONE 704 COMPLEX (DP) :: msg(:,:,:,:,:,:) 705 INTEGER, INTENT(IN) :: source 706 INTEGER, INTENT(IN) :: gid 707#if defined(__MPI) 708 INTEGER :: msglen 709 msglen = size(msg) 710 CALL bcast_real( msg, 2 * msglen, source, gid ) 711#endif 712 END SUBROUTINE mp_bcast_c6d 713 714! 715!------------------------------------------------------------------------------! 716 717 SUBROUTINE mp_bcast_l(msg,source,gid) 718 IMPLICIT NONE 719 LOGICAL :: msg 720 INTEGER, INTENT(IN) :: source 721 INTEGER, INTENT(IN) :: gid 722#if defined(__MPI) 723 INTEGER :: msglen 724 msglen = 1 725 CALL bcast_logical( msg, msglen, source, gid ) 726#endif 727 END SUBROUTINE mp_bcast_l 728! 729!------------------------------------------------------------------------------! 730! 731! Carlo Cavazzoni 732! 733 SUBROUTINE mp_bcast_lv(msg,source,gid) 734 IMPLICIT NONE 735 LOGICAL :: msg(:) 736 INTEGER, INTENT(IN) :: source 737 INTEGER, INTENT(IN) :: gid 738#if defined(__MPI) 739 INTEGER :: msglen 740 msglen = size(msg) 741 CALL bcast_logical( msg, msglen, source, gid ) 742#endif 743 END SUBROUTINE mp_bcast_lv 744 745!------------------------------------------------------------------------------! 746! 747! Carlo Cavazzoni 748! 749 SUBROUTINE mp_bcast_lm(msg,source,gid) 750 IMPLICIT NONE 751 LOGICAL :: msg(:,:) 752 INTEGER, INTENT(IN) :: source 753 INTEGER, INTENT(IN) :: gid 754#if defined(__MPI) 755 INTEGER :: msglen 756 msglen = size(msg) 757 CALL bcast_logical( msg, msglen, source, gid ) 758#endif 759 END SUBROUTINE mp_bcast_lm 760 761 762! 763!------------------------------------------------------------------------------! 764! 765 SUBROUTINE mp_bcast_z(msg,source,gid) 766 IMPLICIT NONE 767 CHARACTER (len=*) :: msg 768 INTEGER, INTENT(IN) :: source 769 INTEGER, INTENT(IN) :: gid 770 INTEGER :: group 771 INTEGER :: msglen, ierr, i 772 INTEGER, ALLOCATABLE :: imsg(:) 773#if defined(__MPI) 774 ierr = 0 775 msglen = len(msg) 776 group = gid 777 ALLOCATE (imsg(1:msglen), STAT=ierr) 778 IF (ierr/=0) CALL mp_stop( 8015 ) 779 DO i = 1, msglen 780 imsg(i) = ichar(msg(i:i)) 781 END DO 782 CALL bcast_integer( imsg, msglen, source, group ) 783 DO i = 1, msglen 784 msg(i:i) = char(imsg(i)) 785 END DO 786 DEALLOCATE (imsg, STAT=ierr) 787 IF (ierr/=0) CALL mp_stop( 8016 ) 788#endif 789 END SUBROUTINE mp_bcast_z 790! 791!------------------------------------------------------------------------------! 792! 793!------------------------------------------------------------------------------! 794! 795 SUBROUTINE mp_bcast_zv(msg,source,gid) 796 IMPLICIT NONE 797 CHARACTER (len=*) :: msg(:) 798 INTEGER, INTENT(IN) :: source 799 INTEGER, INTENT(IN) :: gid 800 INTEGER :: group 801 INTEGER :: msglen, m1, m2, ierr, i, j 802 INTEGER, ALLOCATABLE :: imsg(:,:) 803#if defined(__MPI) 804 ierr = 0 805 m1 = LEN(msg) 806 m2 = SIZE(msg) 807 msglen = LEN(msg)*SIZE(msg) 808 group = gid 809 ALLOCATE (imsg(1:m1,1:m2), STAT=ierr) 810 IF (ierr/=0) CALL mp_stop( 8017 ) 811 DO j = 1, m2 812 DO i = 1, m1 813 imsg(i,j) = ichar(msg(j)(i:i)) 814 END DO 815 END DO 816 CALL bcast_integer( imsg, msglen, source, group ) 817 DO j = 1, m2 818 DO i = 1, m1 819 msg(j)(i:i) = char(imsg(i,j)) 820 END DO 821 END DO 822 DEALLOCATE (imsg, STAT=ierr) 823 IF (ierr/=0) CALL mp_stop( 8018 ) 824#endif 825 END SUBROUTINE mp_bcast_zv 826! 827!------------------------------------------------------------------------------! 828! 829! Carlo Cavazzoni 830! 831 SUBROUTINE mp_get_i1(msg_dest, msg_sour, mpime, dest, sour, ip, gid) 832 INTEGER :: msg_dest 833 INTEGER, INTENT(IN) :: msg_sour 834 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 835 INTEGER, INTENT(IN) :: gid 836 INTEGER :: group 837#if defined(__MPI) 838 INTEGER :: istatus(MPI_STATUS_SIZE) 839#endif 840 INTEGER :: ierr, nrcv 841 INTEGER :: msglen = 1 842 843#if defined(__MPI) 844 group = gid 845#endif 846 847 ! processors not taking part in the communication have 0 length message 848 849 msglen = 0 850 851 IF(dest .NE. sour) THEN 852#if defined(__MPI) 853 IF(mpime .EQ. sour) THEN 854 msglen=1 855 CALL MPI_SEND( msg_sour, msglen, MPI_INTEGER, dest, ip, group, ierr) 856 IF (ierr/=0) CALL mp_stop( 8019 ) 857 ELSE IF(mpime .EQ. dest) THEN 858 msglen=1 859 CALL MPI_RECV( msg_dest, msglen, MPI_INTEGER, sour, ip, group, istatus, IERR ) 860 IF (ierr/=0) CALL mp_stop( 8020 ) 861 CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr) 862 IF (ierr/=0) CALL mp_stop( 8021 ) 863 msglen = nrcv 864 END IF 865#endif 866 ELSEIF(mpime .EQ. sour)THEN 867 msg_dest = msg_sour 868 msglen = 1 869 END IF 870 871#if defined(__MPI) 872 CALL MPI_BARRIER(group, IERR) 873 IF (ierr/=0) CALL mp_stop( 8022 ) 874#endif 875 876 877 RETURN 878 END SUBROUTINE mp_get_i1 879 880!------------------------------------------------------------------------------! 881! 882! Carlo Cavazzoni 883! 884 SUBROUTINE mp_get_iv(msg_dest, msg_sour, mpime, dest, sour, ip, gid) 885 INTEGER :: msg_dest(:) 886 INTEGER, INTENT(IN) :: msg_sour(:) 887 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 888 INTEGER, INTENT(IN) :: gid 889 INTEGER :: group 890#if defined(__MPI) 891 INTEGER :: istatus(MPI_STATUS_SIZE) 892#endif 893 INTEGER :: ierr, nrcv 894 INTEGER :: msglen 895 896#if defined(__MPI) 897 group = gid 898#endif 899 900 ! processors not taking part in the communication have 0 length message 901 902 msglen = 0 903 904 IF(sour .NE. dest) THEN 905#if defined(__MPI) 906 IF(mpime .EQ. sour) THEN 907 msglen = SIZE(msg_sour) 908 CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_INTEGER, dest, ip, group, ierr) 909 IF (ierr/=0) CALL mp_stop( 8023 ) 910 ELSE IF(mpime .EQ. dest) THEN 911 CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_INTEGER, sour, ip, group, istatus, IERR ) 912 IF (ierr/=0) CALL mp_stop( 8024 ) 913 CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr) 914 IF (ierr/=0) CALL mp_stop( 8025 ) 915 msglen = nrcv 916 END IF 917#endif 918 ELSEIF(mpime .EQ. sour)THEN 919 msg_dest(1:SIZE(msg_sour)) = msg_sour(:) 920 msglen = SIZE(msg_sour) 921 END IF 922#if defined(__MPI) 923 CALL MPI_BARRIER(group, IERR) 924 IF (ierr/=0) CALL mp_stop( 8026 ) 925#endif 926 RETURN 927 END SUBROUTINE mp_get_iv 928 929!------------------------------------------------------------------------------! 930 931 SUBROUTINE mp_get_r1(msg_dest, msg_sour, mpime, dest, sour, ip, gid) 932 REAL (DP) :: msg_dest 933 REAL (DP), INTENT(IN) :: msg_sour 934 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 935 INTEGER, INTENT(IN) :: gid 936 INTEGER :: group 937#if defined(__MPI) 938 INTEGER :: istatus(MPI_STATUS_SIZE) 939#endif 940 INTEGER :: ierr, nrcv 941 INTEGER :: msglen 942 943#if defined(__MPI) 944 group = gid 945#endif 946 947 ! processors not taking part in the communication have 0 length message 948 949 msglen = 0 950 951 IF(sour .NE. dest) THEN 952#if defined(__MPI) 953 IF(mpime .EQ. sour) THEN 954 msglen = 1 955 CALL MPI_SEND( msg_sour, msglen, MPI_DOUBLE_PRECISION, dest, ip, group, ierr) 956 IF (ierr/=0) CALL mp_stop( 8027 ) 957 ELSE IF(mpime .EQ. dest) THEN 958 CALL MPI_RECV( msg_dest, 1, MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR ) 959 IF (ierr/=0) CALL mp_stop( 8028 ) 960 CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr) 961 IF (ierr/=0) CALL mp_stop( 8029 ) 962 msglen = nrcv 963 END IF 964#endif 965 ELSEIF(mpime .EQ. sour)THEN 966 msg_dest = msg_sour 967 msglen = 1 968 END IF 969#if defined(__MPI) 970 CALL MPI_BARRIER(group, IERR) 971 IF (ierr/=0) CALL mp_stop( 8030 ) 972#endif 973 RETURN 974 END SUBROUTINE mp_get_r1 975 976!------------------------------------------------------------------------------! 977! 978! Carlo Cavazzoni 979! 980 SUBROUTINE mp_get_rv(msg_dest, msg_sour, mpime, dest, sour, ip, gid) 981 REAL (DP) :: msg_dest(:) 982 REAL (DP), INTENT(IN) :: msg_sour(:) 983 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 984 INTEGER, INTENT(IN) :: gid 985 INTEGER :: group 986#if defined(__MPI) 987 INTEGER :: istatus(MPI_STATUS_SIZE) 988#endif 989 INTEGER :: ierr, nrcv 990 INTEGER :: msglen 991 992#if defined(__MPI) 993 group = gid 994#endif 995 996 ! processors not taking part in the communication have 0 length message 997 998 msglen = 0 999 1000 IF(sour .NE. dest) THEN 1001#if defined(__MPI) 1002 IF(mpime .EQ. sour) THEN 1003 msglen = SIZE(msg_sour) 1004 CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_PRECISION, dest, ip, group, ierr) 1005 IF (ierr/=0) CALL mp_stop( 8027 ) 1006 ELSE IF(mpime .EQ. dest) THEN 1007 CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR ) 1008 IF (ierr/=0) CALL mp_stop( 8028 ) 1009 CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr) 1010 IF (ierr/=0) CALL mp_stop( 8029 ) 1011 msglen = nrcv 1012 END IF 1013#endif 1014 ELSEIF(mpime .EQ. sour)THEN 1015 msg_dest(1:SIZE(msg_sour)) = msg_sour(:) 1016 msglen = SIZE(msg_sour) 1017 END IF 1018#if defined(__MPI) 1019 CALL MPI_BARRIER(group, IERR) 1020 IF (ierr/=0) CALL mp_stop( 8030 ) 1021#endif 1022 RETURN 1023 END SUBROUTINE mp_get_rv 1024 1025!------------------------------------------------------------------------------! 1026! 1027! Carlo Cavazzoni 1028! 1029 SUBROUTINE mp_get_rm(msg_dest, msg_sour, mpime, dest, sour, ip, gid) 1030 REAL (DP) :: msg_dest(:,:) 1031 REAL (DP), INTENT(IN) :: msg_sour(:,:) 1032 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 1033 INTEGER, INTENT(IN) :: gid 1034 INTEGER :: group 1035#if defined(__MPI) 1036 INTEGER :: istatus(MPI_STATUS_SIZE) 1037#endif 1038 INTEGER :: ierr, nrcv 1039 INTEGER :: msglen 1040 1041#if defined(__MPI) 1042 group = gid 1043#endif 1044 1045 ! processors not taking part in the communication have 0 length message 1046 1047 msglen = 0 1048 1049 IF(sour .NE. dest) THEN 1050#if defined(__MPI) 1051 IF(mpime .EQ. sour) THEN 1052 CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_PRECISION, dest, ip, group, ierr) 1053 IF (ierr/=0) CALL mp_stop( 8031 ) 1054 msglen = SIZE(msg_sour) 1055 ELSE IF(mpime .EQ. dest) THEN 1056 CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR ) 1057 IF (ierr/=0) CALL mp_stop( 8032 ) 1058 CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr) 1059 IF (ierr/=0) CALL mp_stop( 8033 ) 1060 msglen = nrcv 1061 END IF 1062#endif 1063 ELSEIF(mpime .EQ. sour)THEN 1064 msg_dest(1:SIZE(msg_sour,1), 1:SIZE(msg_sour,2)) = msg_sour(:,:) 1065 msglen = SIZE( msg_sour ) 1066 END IF 1067#if defined(__MPI) 1068 CALL MPI_BARRIER(group, IERR) 1069 IF (ierr/=0) CALL mp_stop( 8034 ) 1070#endif 1071 RETURN 1072 END SUBROUTINE mp_get_rm 1073 1074 1075!------------------------------------------------------------------------------! 1076! 1077! Carlo Cavazzoni 1078! 1079 SUBROUTINE mp_get_cv(msg_dest, msg_sour, mpime, dest, sour, ip, gid) 1080 COMPLEX (DP) :: msg_dest(:) 1081 COMPLEX (DP), INTENT(IN) :: msg_sour(:) 1082 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 1083 INTEGER, INTENT(IN) :: gid 1084 INTEGER :: group 1085#if defined(__MPI) 1086 INTEGER :: istatus(MPI_STATUS_SIZE) 1087#endif 1088 INTEGER :: ierr, nrcv 1089 INTEGER :: msglen 1090 1091#if defined(__MPI) 1092 group = gid 1093#endif 1094 1095 ! processors not taking part in the communication have 0 length message 1096 1097 msglen = 0 1098 1099 IF( dest .NE. sour ) THEN 1100#if defined(__MPI) 1101 IF(mpime .EQ. sour) THEN 1102 CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr) 1103 IF (ierr/=0) CALL mp_stop( 8035 ) 1104 msglen = SIZE(msg_sour) 1105 ELSE IF(mpime .EQ. dest) THEN 1106 CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR ) 1107 IF (ierr/=0) CALL mp_stop( 8036 ) 1108 CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr) 1109 IF (ierr/=0) CALL mp_stop( 8037 ) 1110 msglen = nrcv 1111 END IF 1112#endif 1113 ELSEIF(mpime .EQ. sour)THEN 1114 msg_dest(1:SIZE(msg_sour)) = msg_sour(:) 1115 msglen = SIZE(msg_sour) 1116 END IF 1117#if defined(__MPI) 1118 CALL MPI_BARRIER(group, IERR) 1119 IF (ierr/=0) CALL mp_stop( 8038 ) 1120#endif 1121 RETURN 1122 END SUBROUTINE mp_get_cv 1123 1124 1125 1126!------------------------------------------------------------------------------! 1127! 1128! Marco Govoni 1129! 1130 SUBROUTINE mp_get_cm(msg_dest, msg_sour, mpime, dest, sour, ip, gid) 1131 COMPLEX (DP) :: msg_dest(:,:) 1132 COMPLEX (DP), INTENT(IN) :: msg_sour(:,:) 1133 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 1134 INTEGER, INTENT(IN) :: gid 1135 INTEGER :: group 1136#if defined(__MPI) 1137 INTEGER :: istatus(MPI_STATUS_SIZE) 1138#endif 1139 INTEGER :: ierr, nrcv 1140 INTEGER :: msglen 1141 1142#if defined(__MPI) 1143 group = gid 1144#endif 1145 1146 ! processors not taking part in the communication have 0 length message 1147 1148 msglen = 0 1149 1150 IF(sour .NE. dest) THEN 1151#if defined(__MPI) 1152 IF(mpime .EQ. sour) THEN 1153 CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr) 1154 IF (ierr/=0) CALL mp_stop( 8031 ) 1155 msglen = SIZE(msg_sour) 1156 ELSE IF(mpime .EQ. dest) THEN 1157 CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR ) 1158 IF (ierr/=0) CALL mp_stop( 8032 ) 1159 CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr) 1160 IF (ierr/=0) CALL mp_stop( 8033 ) 1161 msglen = nrcv 1162 END IF 1163#endif 1164 ELSEIF(mpime .EQ. sour)THEN 1165 msg_dest(1:SIZE(msg_sour,1), 1:SIZE(msg_sour,2)) = msg_sour(:,:) 1166 msglen = SIZE( msg_sour ) 1167 END IF 1168#if defined(__MPI) 1169 CALL MPI_BARRIER(group, IERR) 1170 IF (ierr/=0) CALL mp_stop( 8034 ) 1171#endif 1172 RETURN 1173 END SUBROUTINE mp_get_cm 1174!------------------------------------------------------------------------------! 1175! 1176! 1177!------------------------------------------------------------------------------! 1178 1179 1180 SUBROUTINE mp_put_i1(msg_dest, msg_sour, mpime, sour, dest, ip, gid) 1181 INTEGER :: msg_dest 1182 INTEGER, INTENT(IN) :: msg_sour 1183 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 1184 INTEGER, INTENT(IN) :: gid 1185 INTEGER :: group 1186#if defined(__MPI) 1187 INTEGER :: istatus(MPI_STATUS_SIZE) 1188#endif 1189 INTEGER :: ierr, nrcv 1190 INTEGER :: msglen 1191 1192#if defined(__MPI) 1193 group = gid 1194#endif 1195 1196 ! processors not taking part in the communication have 0 length message 1197 1198 msglen = 0 1199 1200 IF(dest .NE. sour) THEN 1201#if defined(__MPI) 1202 IF(mpime .EQ. sour) THEN 1203 CALL MPI_SEND( msg_sour, 1, MPI_INTEGER, dest, ip, group, ierr) 1204 IF (ierr/=0) CALL mp_stop( 8039 ) 1205 msglen = 1 1206 ELSE IF(mpime .EQ. dest) THEN 1207 CALL MPI_RECV( msg_dest, 1, MPI_INTEGER, sour, ip, group, istatus, IERR ) 1208 IF (ierr/=0) CALL mp_stop( 8040 ) 1209 CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr) 1210 IF (ierr/=0) CALL mp_stop( 8041 ) 1211 msglen = 1 1212 END IF 1213#endif 1214 ELSEIF(mpime .EQ. sour)THEN 1215 msg_dest = msg_sour 1216 msglen = 1 1217 END IF 1218#if defined(__MPI) 1219 CALL MPI_BARRIER(group, IERR) 1220 IF (ierr/=0) CALL mp_stop( 8042 ) 1221#endif 1222 RETURN 1223 END SUBROUTINE mp_put_i1 1224 1225!------------------------------------------------------------------------------! 1226! 1227! 1228 SUBROUTINE mp_put_iv(msg_dest, msg_sour, mpime, sour, dest, ip, gid) 1229 INTEGER :: msg_dest(:) 1230 INTEGER, INTENT(IN) :: msg_sour(:) 1231 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 1232 INTEGER, INTENT(IN) :: gid 1233 INTEGER :: group 1234#if defined(__MPI) 1235 INTEGER :: istatus(MPI_STATUS_SIZE) 1236#endif 1237 INTEGER :: ierr, nrcv 1238 INTEGER :: msglen 1239#if defined(__MPI) 1240 group = gid 1241#endif 1242 ! processors not taking part in the communication have 0 length message 1243 1244 msglen = 0 1245 1246 IF(sour .NE. dest) THEN 1247#if defined(__MPI) 1248 IF(mpime .EQ. sour) THEN 1249 CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_INTEGER, dest, ip, group, ierr) 1250 IF (ierr/=0) CALL mp_stop( 8043 ) 1251 msglen = SIZE(msg_sour) 1252 ELSE IF(mpime .EQ. dest) THEN 1253 CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_INTEGER, sour, ip, group, istatus, IERR ) 1254 IF (ierr/=0) CALL mp_stop( 8044 ) 1255 CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr) 1256 IF (ierr/=0) CALL mp_stop( 8045 ) 1257 msglen = nrcv 1258 END IF 1259#endif 1260 ELSEIF(mpime .EQ. sour)THEN 1261 msg_dest(1:SIZE(msg_sour)) = msg_sour(:) 1262 msglen = SIZE(msg_sour) 1263 END IF 1264#if defined(__MPI) 1265 CALL MPI_BARRIER(group, IERR) 1266 IF (ierr/=0) CALL mp_stop( 8046 ) 1267#endif 1268 RETURN 1269 END SUBROUTINE mp_put_iv 1270 1271!------------------------------------------------------------------------------! 1272! 1273! 1274 SUBROUTINE mp_put_rv(msg_dest, msg_sour, mpime, sour, dest, ip, gid) 1275 REAL (DP) :: msg_dest(:) 1276 REAL (DP), INTENT(IN) :: msg_sour(:) 1277 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 1278 INTEGER, INTENT(IN) :: gid 1279 INTEGER :: group 1280#if defined(__MPI) 1281 INTEGER :: istatus(MPI_STATUS_SIZE) 1282#endif 1283 INTEGER :: ierr, nrcv 1284 INTEGER :: msglen 1285#if defined(__MPI) 1286 group = gid 1287#endif 1288 ! processors not taking part in the communication have 0 length message 1289 1290 msglen = 0 1291 1292 IF(sour .NE. dest) THEN 1293#if defined(__MPI) 1294 IF(mpime .EQ. sour) THEN 1295 CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_PRECISION, dest, ip, group, ierr) 1296 IF (ierr/=0) CALL mp_stop( 8047 ) 1297 msglen = SIZE(msg_sour) 1298 ELSE IF(mpime .EQ. dest) THEN 1299 CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR ) 1300 IF (ierr/=0) CALL mp_stop( 8048 ) 1301 CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr) 1302 IF (ierr/=0) CALL mp_stop( 8049 ) 1303 msglen = nrcv 1304 END IF 1305#endif 1306 ELSEIF(mpime .EQ. sour)THEN 1307 msg_dest(1:SIZE(msg_sour)) = msg_sour(:) 1308 msglen = SIZE(msg_sour) 1309 END IF 1310#if defined(__MPI) 1311 CALL MPI_BARRIER(group, IERR) 1312 IF (ierr/=0) CALL mp_stop( 8050 ) 1313#endif 1314 RETURN 1315 END SUBROUTINE mp_put_rv 1316 1317!------------------------------------------------------------------------------! 1318! 1319! 1320 SUBROUTINE mp_put_rm(msg_dest, msg_sour, mpime, sour, dest, ip, gid) 1321 REAL (DP) :: msg_dest(:,:) 1322 REAL (DP), INTENT(IN) :: msg_sour(:,:) 1323 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 1324 INTEGER, INTENT(IN) :: gid 1325 INTEGER :: group 1326#if defined(__MPI) 1327 INTEGER :: istatus(MPI_STATUS_SIZE) 1328#endif 1329 INTEGER :: ierr, nrcv 1330 INTEGER :: msglen 1331#if defined(__MPI) 1332 group = gid 1333#endif 1334 ! processors not taking part in the communication have 0 length message 1335 1336 msglen = 0 1337 1338 IF(sour .NE. dest) THEN 1339#if defined(__MPI) 1340 IF(mpime .EQ. sour) THEN 1341 CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_PRECISION, dest, ip, group, ierr) 1342 IF (ierr/=0) CALL mp_stop( 8051 ) 1343 msglen = SIZE(msg_sour) 1344 ELSE IF(mpime .EQ. dest) THEN 1345 CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR ) 1346 IF (ierr/=0) CALL mp_stop( 8052 ) 1347 CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr) 1348 IF (ierr/=0) CALL mp_stop( 8053 ) 1349 msglen = nrcv 1350 END IF 1351#endif 1352 ELSEIF(mpime .EQ. sour)THEN 1353 msg_dest(1:SIZE(msg_sour,1),1:SIZE(msg_sour,2)) = msg_sour(:,:) 1354 msglen = SIZE(msg_sour) 1355 END IF 1356#if defined(__MPI) 1357 CALL MPI_BARRIER(group, IERR) 1358 IF (ierr/=0) CALL mp_stop( 8054 ) 1359#endif 1360 RETURN 1361 END SUBROUTINE mp_put_rm 1362 1363 1364!------------------------------------------------------------------------------! 1365! 1366! 1367 SUBROUTINE mp_put_cv(msg_dest, msg_sour, mpime, sour, dest, ip, gid) 1368 COMPLEX (DP) :: msg_dest(:) 1369 COMPLEX (DP), INTENT(IN) :: msg_sour(:) 1370 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 1371 INTEGER, INTENT(IN) :: gid 1372 INTEGER :: group 1373#if defined(__MPI) 1374 INTEGER :: istatus(MPI_STATUS_SIZE) 1375#endif 1376 INTEGER :: ierr, nrcv 1377 INTEGER :: msglen 1378#if defined(__MPI) 1379 group = gid 1380#endif 1381 ! processors not taking part in the communication have 0 length message 1382 1383 msglen = 0 1384 1385 IF( dest .NE. sour ) THEN 1386#if defined(__MPI) 1387 IF(mpime .EQ. sour) THEN 1388 CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr) 1389 IF (ierr/=0) CALL mp_stop( 8055 ) 1390 msglen = SIZE(msg_sour) 1391 ELSE IF(mpime .EQ. dest) THEN 1392 CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR ) 1393 IF (ierr/=0) CALL mp_stop( 8056 ) 1394 CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr) 1395 IF (ierr/=0) CALL mp_stop( 8057 ) 1396 msglen = nrcv 1397 END IF 1398#endif 1399 ELSEIF(mpime .EQ. sour)THEN 1400 msg_dest(1:SIZE(msg_sour)) = msg_sour(:) 1401 msglen = SIZE(msg_sour) 1402 END IF 1403#if defined(__MPI) 1404 CALL MPI_BARRIER(group, IERR) 1405 IF (ierr/=0) CALL mp_stop( 8058 ) 1406#endif 1407 RETURN 1408 END SUBROUTINE mp_put_cv 1409 1410! 1411!------------------------------------------------------------------------------! 1412! 1413!..mp_stop 1414! 1415 SUBROUTINE mp_stop(code) 1416 IMPLICIT NONE 1417 INTEGER, INTENT (IN) :: code 1418 INTEGER :: ierr 1419 WRITE( stdout, fmt='( "*** error in Message Passing (mp) module ***")' ) 1420 WRITE( stdout, fmt='( "*** error code: ",I5)' ) code 1421#if defined(__MPI) 1422 ! abort with extreme prejudice across the entire MPI set of tasks 1423 CALL mpi_abort(MPI_COMM_WORLD,code,ierr) 1424#endif 1425 STOP 1426 END SUBROUTINE mp_stop 1427!------------------------------------------------------------------------------! 1428! 1429!..mp_sum 1430 SUBROUTINE mp_sum_i1(msg,gid) 1431 IMPLICIT NONE 1432 INTEGER, INTENT (INOUT) :: msg 1433 INTEGER, INTENT(IN) :: gid 1434#if defined(__MPI) 1435 INTEGER :: msglen 1436 msglen = 1 1437 CALL reduce_base_integer( msglen, msg, gid, -1 ) 1438#endif 1439 END SUBROUTINE mp_sum_i1 1440 ! 1441 !------------------------------------------------------------------------------! 1442 SUBROUTINE mp_sum_iv(msg, gid) 1443 !------------------------------------------------------------------------------! 1444 !! 1445 !! MPI sum an integer vector from all cores and bcast the result to all. 1446 !! 1447 IMPLICIT NONE 1448 ! 1449 INTEGER, INTENT(inout) :: msg(:) 1450 INTEGER, INTENT(in) :: gid 1451#if defined(__MPI) 1452 INTEGER :: msglen 1453 msglen = SIZE(msg) 1454 CALL reduce_base_integer(msglen, msg, gid, -1) 1455#endif 1456 !------------------------------------------------------------------------------! 1457 END SUBROUTINE mp_sum_iv 1458 !------------------------------------------------------------------------------! 1459 ! 1460 !------------------------------------------------------------------------------! 1461 SUBROUTINE mp_sum_i8v(msg, gid) 1462 !------------------------------------------------------------------------------! 1463 !! 1464 !! MPI sum an integer vector from all cores and bcast the result to all. 1465 !! 1466 IMPLICIT NONE 1467 ! 1468 INTEGER(KIND = i8b), INTENT(inout) :: msg(:) 1469 INTEGER, INTENT(in) :: gid 1470#if defined(__MPI) 1471 INTEGER :: msglen 1472 msglen = SIZE(msg) 1473 CALL reduce_base_integer8(msglen, msg, gid, -1) 1474#endif 1475 !------------------------------------------------------------------------------! 1476 END SUBROUTINE mp_sum_i8v 1477 !------------------------------------------------------------------------------! 1478 ! 1479 !------------------------------------------------------------------------------! 1480 SUBROUTINE mp_sum_im(msg,gid) 1481 !------------------------------------------------------------------------------! 1482 IMPLICIT NONE 1483 INTEGER, INTENT (INOUT) :: msg(:,:) 1484 INTEGER, INTENT(IN) :: gid 1485#if defined(__MPI) 1486 INTEGER :: msglen 1487 msglen = size(msg) 1488 CALL reduce_base_integer( msglen, msg, gid, -1 ) 1489#endif 1490 END SUBROUTINE mp_sum_im 1491! 1492!------------------------------------------------------------------------------! 1493 1494 SUBROUTINE mp_sum_it(msg,gid) 1495 IMPLICIT NONE 1496 INTEGER, INTENT (INOUT) :: msg(:,:,:) 1497 INTEGER, INTENT (IN) :: gid 1498#if defined(__MPI) 1499 INTEGER :: msglen 1500 msglen = size(msg) 1501 CALL reduce_base_integer( msglen, msg, gid, -1 ) 1502#endif 1503 END SUBROUTINE mp_sum_it 1504 1505!------------------------------------------------------------------------------! 1506 1507 SUBROUTINE mp_sum_i4(msg,gid) 1508 IMPLICIT NONE 1509 INTEGER, INTENT (INOUT) :: msg(:,:,:,:) 1510 INTEGER, INTENT (IN) :: gid 1511#if defined(__MPI) 1512 INTEGER :: msglen 1513 msglen = size(msg) 1514 CALL reduce_base_integer( msglen, msg, gid, -1 ) 1515#endif 1516 END SUBROUTINE mp_sum_i4 1517 1518!------------------------------------------------------------------------------! 1519 1520 SUBROUTINE mp_sum_i5(msg,gid) 1521 IMPLICIT NONE 1522 INTEGER, INTENT (INOUT) :: msg(:,:,:,:,:) 1523 INTEGER, INTENT (IN) :: gid 1524#if defined(__MPI) 1525 INTEGER :: msglen 1526 msglen = size(msg) 1527 CALL reduce_base_integer( msglen, msg, gid, -1 ) 1528#endif 1529 END SUBROUTINE mp_sum_i5 1530 1531 1532!------------------------------------------------------------------------------! 1533 1534 SUBROUTINE mp_sum_r1(msg,gid) 1535 IMPLICIT NONE 1536 REAL (DP), INTENT (INOUT) :: msg 1537 INTEGER, INTENT (IN) :: gid 1538#if defined(__MPI) 1539 INTEGER :: msglen 1540 msglen = 1 1541 CALL reduce_base_real( msglen, msg, gid, -1 ) 1542#endif 1543 END SUBROUTINE mp_sum_r1 1544 1545! 1546!------------------------------------------------------------------------------! 1547 1548 SUBROUTINE mp_sum_rv(msg,gid) 1549 IMPLICIT NONE 1550 REAL (DP), INTENT (INOUT) :: msg(:) 1551 INTEGER, INTENT (IN) :: gid 1552#if defined(__MPI) 1553 INTEGER :: msglen 1554 msglen = size(msg) 1555 CALL reduce_base_real( msglen, msg, gid, -1 ) 1556#endif 1557 END SUBROUTINE mp_sum_rv 1558! 1559!------------------------------------------------------------------------------! 1560 1561 1562 SUBROUTINE mp_sum_rm(msg, gid) 1563 IMPLICIT NONE 1564 REAL (DP), INTENT (INOUT) :: msg(:,:) 1565 INTEGER, INTENT (IN) :: gid 1566#if defined(__MPI) 1567 INTEGER :: msglen 1568 msglen = size(msg) 1569 CALL reduce_base_real( msglen, msg, gid, -1 ) 1570#endif 1571 END SUBROUTINE mp_sum_rm 1572 1573 1574 SUBROUTINE mp_root_sum_rm( msg, res, root, gid ) 1575 IMPLICIT NONE 1576 REAL (DP), INTENT (IN) :: msg(:,:) 1577 REAL (DP), INTENT (OUT) :: res(:,:) 1578 INTEGER, INTENT (IN) :: root 1579 INTEGER, INTENT (IN) :: gid 1580#if defined(__MPI) 1581 INTEGER :: msglen, ierr, taskid 1582 1583 msglen = size(msg) 1584 1585 CALL mpi_comm_rank( gid, taskid, ierr) 1586 IF( ierr /= 0 ) CALL mp_stop( 8059 ) 1587 ! 1588 IF( taskid == root ) THEN 1589 IF( msglen > size(res) ) CALL mp_stop( 8060 ) 1590 END IF 1591 1592 CALL reduce_base_real_to( msglen, msg, res, gid, root ) 1593 1594#else 1595 1596 res = msg 1597 1598#endif 1599 1600 END SUBROUTINE mp_root_sum_rm 1601 1602 1603 SUBROUTINE mp_root_sum_cm( msg, res, root, gid ) 1604 IMPLICIT NONE 1605 COMPLEX (DP), INTENT (IN) :: msg(:,:) 1606 COMPLEX (DP), INTENT (OUT) :: res(:,:) 1607 INTEGER, INTENT (IN) :: root 1608 INTEGER, INTENT (IN) :: gid 1609#if defined(__MPI) 1610 INTEGER :: msglen, ierr, taskid 1611 1612 msglen = size(msg) 1613 1614 CALL mpi_comm_rank( gid, taskid, ierr) 1615 IF( ierr /= 0 ) CALL mp_stop( 8061 ) 1616 1617 IF( taskid == root ) THEN 1618 IF( msglen > size(res) ) CALL mp_stop( 8062 ) 1619 END IF 1620 1621 CALL reduce_base_real_to( 2 * msglen, msg, res, gid, root ) 1622 1623#else 1624 1625 res = msg 1626 1627#endif 1628 1629 END SUBROUTINE mp_root_sum_cm 1630 1631! 1632!------------------------------------------------------------------------------! 1633 1634 1635!------------------------------------------------------------------------------! 1636! 1637 1638 SUBROUTINE mp_sum_rmm( msg, res, root, gid ) 1639 IMPLICIT NONE 1640 REAL (DP), INTENT (IN) :: msg(:,:) 1641 REAL (DP), INTENT (OUT) :: res(:,:) 1642 INTEGER, INTENT (IN) :: root 1643 INTEGER, INTENT (IN) :: gid 1644 INTEGER :: group 1645 INTEGER :: msglen 1646 INTEGER :: taskid, ierr 1647 1648 msglen = size(msg) 1649 1650#if defined(__MPI) 1651 1652 group = gid 1653 ! 1654 CALL mpi_comm_rank( group, taskid, ierr) 1655 IF( ierr /= 0 ) CALL mp_stop( 8063 ) 1656 1657 IF( taskid == root ) THEN 1658 IF( msglen > size(res) ) CALL mp_stop( 8064 ) 1659 END IF 1660 ! 1661 CALL reduce_base_real_to( msglen, msg, res, group, root ) 1662 ! 1663 1664#else 1665 res = msg 1666#endif 1667 1668 END SUBROUTINE mp_sum_rmm 1669 1670 1671! 1672!------------------------------------------------------------------------------! 1673 1674 1675 SUBROUTINE mp_sum_rt( msg, gid ) 1676 IMPLICIT NONE 1677 REAL (DP), INTENT (INOUT) :: msg(:,:,:) 1678 INTEGER, INTENT(IN) :: gid 1679#if defined(__MPI) 1680 INTEGER :: msglen 1681 msglen = size(msg) 1682 CALL reduce_base_real( msglen, msg, gid, -1 ) 1683#endif 1684 END SUBROUTINE mp_sum_rt 1685 1686! 1687!------------------------------------------------------------------------------! 1688! 1689! Carlo Cavazzoni 1690! 1691 SUBROUTINE mp_sum_r4d(msg,gid) 1692 IMPLICIT NONE 1693 REAL (DP), INTENT (INOUT) :: msg(:,:,:,:) 1694 INTEGER, INTENT(IN) :: gid 1695#if defined(__MPI) 1696 INTEGER :: msglen 1697 msglen = size(msg) 1698 CALL reduce_base_real( msglen, msg, gid, -1 ) 1699#endif 1700 END SUBROUTINE mp_sum_r4d 1701 1702 1703 1704!------------------------------------------------------------------------------! 1705 1706 SUBROUTINE mp_sum_c1(msg,gid) 1707 IMPLICIT NONE 1708 COMPLEX (DP), INTENT (INOUT) :: msg 1709 INTEGER, INTENT(IN) :: gid 1710#if defined(__MPI) 1711 INTEGER :: msglen 1712 msglen = 1 1713 CALL reduce_base_real( 2 * msglen, msg, gid, -1 ) 1714#endif 1715 END SUBROUTINE mp_sum_c1 1716! 1717!------------------------------------------------------------------------------! 1718 1719 SUBROUTINE mp_sum_cv(msg,gid) 1720 IMPLICIT NONE 1721 COMPLEX (DP), INTENT (INOUT) :: msg(:) 1722 INTEGER, INTENT(IN) :: gid 1723#if defined(__MPI) 1724 INTEGER :: msglen 1725 msglen = size(msg) 1726 CALL reduce_base_real( 2 * msglen, msg, gid, -1 ) 1727#endif 1728 END SUBROUTINE mp_sum_cv 1729! 1730!------------------------------------------------------------------------------! 1731 1732 SUBROUTINE mp_sum_cm(msg, gid) 1733 IMPLICIT NONE 1734 COMPLEX (DP), INTENT (INOUT) :: msg(:,:) 1735 INTEGER, INTENT (IN) :: gid 1736#if defined(__MPI) 1737 INTEGER :: msglen 1738 msglen = size(msg) 1739 CALL reduce_base_real( 2 * msglen, msg, gid, -1 ) 1740#endif 1741 END SUBROUTINE mp_sum_cm 1742! 1743!------------------------------------------------------------------------------! 1744 1745 1746 SUBROUTINE mp_sum_cmm(msg, res, gid) 1747 IMPLICIT NONE 1748 COMPLEX (DP), INTENT (IN) :: msg(:,:) 1749 COMPLEX (DP), INTENT (OUT) :: res(:,:) 1750 INTEGER, INTENT (IN) :: gid 1751#if defined(__MPI) 1752 INTEGER :: msglen 1753 msglen = size(msg) 1754 CALL reduce_base_real_to( 2 * msglen, msg, res, gid, -1 ) 1755#else 1756 res = msg 1757#endif 1758 END SUBROUTINE mp_sum_cmm 1759 1760 1761! 1762!------------------------------------------------------------------------------! 1763! 1764! Carlo Cavazzoni 1765! 1766 SUBROUTINE mp_sum_ct(msg,gid) 1767 IMPLICIT NONE 1768 COMPLEX (DP), INTENT (INOUT) :: msg(:,:,:) 1769 INTEGER, INTENT(IN) :: gid 1770#if defined(__MPI) 1771 INTEGER :: msglen 1772 msglen = SIZE(msg) 1773 CALL reduce_base_real( 2 * msglen, msg, gid, -1 ) 1774#endif 1775 END SUBROUTINE mp_sum_ct 1776 1777! 1778!------------------------------------------------------------------------------! 1779! 1780! Carlo Cavazzoni 1781! 1782 SUBROUTINE mp_sum_c4d(msg,gid) 1783 IMPLICIT NONE 1784 COMPLEX (DP), INTENT (INOUT) :: msg(:,:,:,:) 1785 INTEGER, INTENT(IN) :: gid 1786#if defined(__MPI) 1787 INTEGER :: msglen 1788 msglen = size(msg) 1789 CALL reduce_base_real( 2 * msglen, msg, gid, -1 ) 1790#endif 1791 END SUBROUTINE mp_sum_c4d 1792! 1793!------------------------------------------------------------------------------! 1794! 1795! Carlo Cavazzoni 1796! 1797 SUBROUTINE mp_sum_c5d(msg,gid) 1798 IMPLICIT NONE 1799 COMPLEX (DP), INTENT (INOUT) :: msg(:,:,:,:,:) 1800 INTEGER, INTENT(IN) :: gid 1801#if defined(__MPI) 1802 INTEGER :: msglen 1803 msglen = size(msg) 1804 CALL reduce_base_real( 2 * msglen, msg, gid, -1 ) 1805#endif 1806 END SUBROUTINE mp_sum_c5d 1807 1808!------------------------------------------------------------------------------! 1809! 1810! Carlo Cavazzoni 1811! 1812 SUBROUTINE mp_sum_r5d(msg,gid) 1813 IMPLICIT NONE 1814 REAL (DP), INTENT (INOUT) :: msg(:,:,:,:,:) 1815 INTEGER, INTENT(IN) :: gid 1816#if defined(__MPI) 1817 INTEGER :: msglen 1818 msglen = size(msg) 1819 CALL reduce_base_real( msglen, msg, gid, -1 ) 1820#endif 1821 END SUBROUTINE mp_sum_r5d 1822 1823 1824 SUBROUTINE mp_sum_r6d(msg,gid) 1825 IMPLICIT NONE 1826 REAL (DP), INTENT (INOUT) :: msg(:,:,:,:,:,:) 1827 INTEGER, INTENT(IN) :: gid 1828#if defined(__MPI) 1829 INTEGER :: msglen 1830 msglen = size(msg) 1831 CALL reduce_base_real( msglen, msg, gid, -1 ) 1832#endif 1833 END SUBROUTINE mp_sum_r6d 1834 1835! 1836!------------------------------------------------------------------------------! 1837! 1838! Carlo Cavazzoni 1839! 1840 SUBROUTINE mp_sum_c6d(msg,gid) 1841 IMPLICIT NONE 1842 COMPLEX (DP), INTENT (INOUT) :: msg(:,:,:,:,:,:) 1843 INTEGER, INTENT(IN) :: gid 1844#if defined(__MPI) 1845 INTEGER :: msglen 1846 msglen = size(msg) 1847 CALL reduce_base_real( 2 * msglen, msg, gid, -1 ) 1848#endif 1849 END SUBROUTINE mp_sum_c6d 1850 1851 1852 1853!------------------------------------------------------------------------------! 1854 SUBROUTINE mp_max_i(msg,gid) 1855 IMPLICIT NONE 1856 INTEGER, INTENT (INOUT) :: msg 1857 INTEGER, INTENT(IN) :: gid 1858#if defined(__MPI) 1859 INTEGER :: msglen 1860 msglen = 1 1861 CALL parallel_max_integer( msglen, msg, gid, -1 ) 1862#endif 1863 END SUBROUTINE mp_max_i 1864! 1865!------------------------------------------------------------------------------! 1866! 1867!..mp_max_iv 1868!..Carlo Cavazzoni 1869! 1870 SUBROUTINE mp_max_iv(msg,gid) 1871 IMPLICIT NONE 1872 INTEGER, INTENT (INOUT) :: msg(:) 1873 INTEGER, INTENT(IN) :: gid 1874#if defined(__MPI) 1875 INTEGER :: msglen 1876 msglen = size(msg) 1877 CALL parallel_max_integer( msglen, msg, gid, -1 ) 1878#endif 1879 END SUBROUTINE mp_max_iv 1880! 1881!---------------------------------------------------------------------- 1882 1883 SUBROUTINE mp_max_r(msg,gid) 1884 IMPLICIT NONE 1885 REAL (DP), INTENT (INOUT) :: msg 1886 INTEGER, INTENT(IN) :: gid 1887#if defined(__MPI) 1888 INTEGER :: msglen 1889 msglen = 1 1890 CALL parallel_max_real( msglen, msg, gid, -1 ) 1891#endif 1892 END SUBROUTINE mp_max_r 1893! 1894!------------------------------------------------------------------------------! 1895 SUBROUTINE mp_max_rv(msg,gid) 1896 IMPLICIT NONE 1897 REAL (DP), INTENT (INOUT) :: msg(:) 1898 INTEGER, INTENT(IN) :: gid 1899#if defined(__MPI) 1900 INTEGER :: msglen 1901 msglen = size(msg) 1902 CALL parallel_max_real( msglen, msg, gid, -1 ) 1903#endif 1904 END SUBROUTINE mp_max_rv 1905!------------------------------------------------------------------------------! 1906 SUBROUTINE mp_min_i(msg,gid) 1907 IMPLICIT NONE 1908 INTEGER, INTENT (INOUT) :: msg 1909 INTEGER, INTENT(IN) :: gid 1910#if defined(__MPI) 1911 INTEGER :: msglen 1912 msglen = 1 1913 CALL parallel_min_integer( msglen, msg, gid, -1 ) 1914#endif 1915 END SUBROUTINE mp_min_i 1916!------------------------------------------------------------------------------! 1917 SUBROUTINE mp_min_iv(msg,gid) 1918 IMPLICIT NONE 1919 INTEGER, INTENT (INOUT) :: msg(:) 1920 INTEGER, INTENT(IN) :: gid 1921#if defined(__MPI) 1922 INTEGER :: msglen 1923 msglen = SIZE(msg) 1924 CALL parallel_min_integer( msglen, msg, gid, -1 ) 1925#endif 1926 END SUBROUTINE mp_min_iv 1927!------------------------------------------------------------------------------! 1928 SUBROUTINE mp_min_r(msg,gid) 1929 IMPLICIT NONE 1930 REAL (DP), INTENT (INOUT) :: msg 1931 INTEGER, INTENT(IN) :: gid 1932#if defined(__MPI) 1933 INTEGER :: msglen 1934 msglen = 1 1935 CALL parallel_min_real( msglen, msg, gid, -1 ) 1936#endif 1937 END SUBROUTINE mp_min_r 1938! 1939!------------------------------------------------------------------------------! 1940 SUBROUTINE mp_min_rv(msg,gid) 1941 IMPLICIT NONE 1942 REAL (DP), INTENT (INOUT) :: msg(:) 1943 INTEGER, INTENT(IN) :: gid 1944#if defined(__MPI) 1945 INTEGER :: msglen 1946 msglen = size(msg) 1947 CALL parallel_min_real( msglen, msg, gid, -1 ) 1948#endif 1949 END SUBROUTINE mp_min_rv 1950 1951!------------------------------------------------------------------------------! 1952 1953 SUBROUTINE mp_barrier(gid) 1954 IMPLICIT NONE 1955 INTEGER, INTENT(IN) :: gid 1956 INTEGER :: ierr 1957#if defined(__MPI) 1958 CALL MPI_BARRIER(gid,IERR) 1959 IF (ierr/=0) CALL mp_stop( 8066 ) 1960#endif 1961 END SUBROUTINE mp_barrier 1962 1963!------------------------------------------------------------------------------! 1964!.. Carlo Cavazzoni 1965!..mp_rank 1966 FUNCTION mp_rank( comm ) 1967 IMPLICIT NONE 1968 INTEGER :: mp_rank 1969 INTEGER, INTENT(IN) :: comm 1970 INTEGER :: ierr, taskid 1971 1972 ierr = 0 1973 taskid = 0 1974#if defined(__MPI) 1975 CALL mpi_comm_rank(comm,taskid,ierr) 1976 IF (ierr/=0) CALL mp_stop( 8067 ) 1977#endif 1978 mp_rank = taskid 1979 END FUNCTION mp_rank 1980 1981!------------------------------------------------------------------------------! 1982!.. Carlo Cavazzoni 1983!..mp_size 1984 FUNCTION mp_size( comm ) 1985 IMPLICIT NONE 1986 INTEGER :: mp_size 1987 INTEGER, INTENT(IN) :: comm 1988 INTEGER :: ierr, numtask 1989 1990 ierr = 0 1991 numtask = 1 1992#if defined(__MPI) 1993 CALL mpi_comm_size(comm,numtask,ierr) 1994 IF (ierr/=0) CALL mp_stop( 8068 ) 1995#endif 1996 mp_size = numtask 1997 END FUNCTION mp_size 1998 1999 SUBROUTINE mp_report 2000 INTEGER :: i 2001 WRITE( stdout, *) 2002#if defined(__MPI) 2003# if defined (__MP_STAT) 2004 WRITE( stdout, 20 ) 200520 FORMAT(3X,'please use an MPI profiler to analyze communications ') 2006# endif 2007#else 2008 WRITE( stdout, *) 2009#endif 2010 RETURN 2011 END SUBROUTINE mp_report 2012 2013 2014!------------------------------------------------------------------------------! 2015!..mp_gatherv_rv 2016!..Carlo Cavazzoni 2017 2018 SUBROUTINE mp_gatherv_rv( mydata, alldata, recvcount, displs, root, gid) 2019 IMPLICIT NONE 2020 REAL(DP) :: mydata(:) 2021 REAL(DP) :: alldata(:) 2022 INTEGER, INTENT(IN) :: recvcount(:), displs(:), root 2023 INTEGER, INTENT(IN) :: gid 2024 INTEGER :: group 2025 INTEGER :: ierr, npe, myid 2026 2027#if defined (__MPI) 2028 group = gid 2029 CALL mpi_comm_size( group, npe, ierr ) 2030 IF (ierr/=0) CALL mp_stop( 8069 ) 2031 CALL mpi_comm_rank( group, myid, ierr ) 2032 IF (ierr/=0) CALL mp_stop( 8070 ) 2033 ! 2034 IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 ) 2035 IF ( myid == root ) THEN 2036 IF ( SIZE( alldata ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 8072 ) 2037 END IF 2038 IF ( SIZE( mydata ) < recvcount( myid + 1 ) ) CALL mp_stop( 8073 ) 2039 ! 2040 CALL MPI_GATHERV( mydata, recvcount( myid + 1 ), MPI_DOUBLE_PRECISION, & 2041 alldata, recvcount, displs, MPI_DOUBLE_PRECISION, root, group, ierr ) 2042 IF (ierr/=0) CALL mp_stop( 8074 ) 2043#else 2044 IF ( SIZE( alldata ) < recvcount( 1 ) ) CALL mp_stop( 8075 ) 2045 IF ( SIZE( mydata ) < recvcount( 1 ) ) CALL mp_stop( 8076 ) 2046 ! 2047 alldata( 1:recvcount( 1 ) ) = mydata( 1:recvcount( 1 ) ) 2048#endif 2049 RETURN 2050 END SUBROUTINE mp_gatherv_rv 2051 2052!------------------------------------------------------------------------------! 2053!..mp_gatherv_cv 2054!..Carlo Cavazzoni 2055 2056 SUBROUTINE mp_gatherv_cv( mydata, alldata, recvcount, displs, root, gid) 2057 IMPLICIT NONE 2058 COMPLEX(DP) :: mydata(:) 2059 COMPLEX(DP) :: alldata(:) 2060 INTEGER, INTENT(IN) :: recvcount(:), displs(:), root 2061 INTEGER, INTENT(IN) :: gid 2062 INTEGER :: group 2063 INTEGER :: ierr, npe, myid 2064 2065#if defined (__MPI) 2066 group = gid 2067 CALL mpi_comm_size( group, npe, ierr ) 2068 IF (ierr/=0) CALL mp_stop( 8069 ) 2069 CALL mpi_comm_rank( group, myid, ierr ) 2070 IF (ierr/=0) CALL mp_stop( 8070 ) 2071 ! 2072 IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 ) 2073 IF ( myid == root ) THEN 2074 IF ( SIZE( alldata ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 8072 ) 2075 END IF 2076 IF ( SIZE( mydata ) < recvcount( myid + 1 ) ) CALL mp_stop( 8073 ) 2077 ! 2078 CALL MPI_GATHERV( mydata, recvcount( myid + 1 ), MPI_DOUBLE_COMPLEX, & 2079 alldata, recvcount, displs, MPI_DOUBLE_COMPLEX, root, group, ierr ) 2080 IF (ierr/=0) CALL mp_stop( 8074 ) 2081#else 2082 IF ( SIZE( alldata ) < recvcount( 1 ) ) CALL mp_stop( 8075 ) 2083 IF ( SIZE( mydata ) < recvcount( 1 ) ) CALL mp_stop( 8076 ) 2084 ! 2085 alldata( 1:recvcount( 1 ) ) = mydata( 1:recvcount( 1 ) ) 2086#endif 2087 RETURN 2088 END SUBROUTINE mp_gatherv_cv 2089 2090!------------------------------------------------------------------------------! 2091!..mp_gatherv_rv 2092!..Carlo Cavazzoni 2093 2094 SUBROUTINE mp_gatherv_iv( mydata, alldata, recvcount, displs, root, gid) 2095 IMPLICIT NONE 2096 INTEGER :: mydata(:) 2097 INTEGER :: alldata(:) 2098 INTEGER, INTENT(IN) :: recvcount(:), displs(:), root 2099 INTEGER, INTENT(IN) :: gid 2100 INTEGER :: group 2101 INTEGER :: ierr, npe, myid 2102 2103#if defined (__MPI) 2104 group = gid 2105 CALL mpi_comm_size( group, npe, ierr ) 2106 IF (ierr/=0) CALL mp_stop( 8069 ) 2107 CALL mpi_comm_rank( group, myid, ierr ) 2108 IF (ierr/=0) CALL mp_stop( 8070 ) 2109 ! 2110 IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 ) 2111 IF ( myid == root ) THEN 2112 IF ( SIZE( alldata ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 8072 ) 2113 END IF 2114 IF ( SIZE( mydata ) < recvcount( myid + 1 ) ) CALL mp_stop( 8073 ) 2115 ! 2116 CALL MPI_GATHERV( mydata, recvcount( myid + 1 ), MPI_INTEGER, & 2117 alldata, recvcount, displs, MPI_INTEGER, root, group, ierr ) 2118 IF (ierr/=0) CALL mp_stop( 8074 ) 2119#else 2120 IF ( SIZE( alldata ) < recvcount( 1 ) ) CALL mp_stop( 8075 ) 2121 IF ( SIZE( mydata ) < recvcount( 1 ) ) CALL mp_stop( 8076 ) 2122 ! 2123 alldata( 1:recvcount( 1 ) ) = mydata( 1:recvcount( 1 ) ) 2124#endif 2125 RETURN 2126 END SUBROUTINE mp_gatherv_iv 2127 2128 2129!------------------------------------------------------------------------------! 2130!..mp_gatherv_rm 2131!..Carlo Cavazzoni 2132 2133 SUBROUTINE mp_gatherv_rm( mydata, alldata, recvcount, displs, root, gid) 2134 IMPLICIT NONE 2135 REAL(DP) :: mydata(:,:) ! Warning first dimension is supposed constant! 2136 REAL(DP) :: alldata(:,:) 2137 INTEGER, INTENT(IN) :: recvcount(:), displs(:), root 2138 INTEGER, INTENT(IN) :: gid 2139 INTEGER :: group 2140 INTEGER :: ierr, npe, myid, nsiz 2141 INTEGER, ALLOCATABLE :: nrecv(:), ndisp(:) 2142 2143 2144#if defined (__MPI) 2145 group = gid 2146 CALL mpi_comm_size( group, npe, ierr ) 2147 IF (ierr/=0) CALL mp_stop( 8069 ) 2148 CALL mpi_comm_rank( group, myid, ierr ) 2149 IF (ierr/=0) CALL mp_stop( 8070 ) 2150 ! 2151 IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 ) 2152 IF ( myid == root ) THEN 2153 IF ( SIZE( alldata, 2 ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 8072 ) 2154 IF ( SIZE( alldata, 1 ) /= SIZE( mydata, 1 ) ) CALL mp_stop( 8072 ) 2155 END IF 2156 IF ( SIZE( mydata, 2 ) < recvcount( myid + 1 ) ) CALL mp_stop( 8073 ) 2157 ! 2158 ALLOCATE( nrecv( npe ), ndisp( npe ) ) 2159 ! 2160 nrecv( 1:npe ) = recvcount( 1:npe ) * SIZE( mydata, 1 ) 2161 ndisp( 1:npe ) = displs( 1:npe ) * SIZE( mydata, 1 ) 2162 ! 2163 CALL MPI_GATHERV( mydata, nrecv( myid + 1 ), MPI_DOUBLE_PRECISION, & 2164 alldata, nrecv, ndisp, MPI_DOUBLE_PRECISION, root, group, ierr ) 2165 IF (ierr/=0) CALL mp_stop( 8074 ) 2166 ! 2167 DEALLOCATE( nrecv, ndisp ) 2168 ! 2169#else 2170 IF ( SIZE( alldata, 1 ) /= SIZE( mydata, 1 ) ) CALL mp_stop( 8075 ) 2171 IF ( SIZE( alldata, 2 ) < recvcount( 1 ) ) CALL mp_stop( 8075 ) 2172 IF ( SIZE( mydata, 2 ) < recvcount( 1 ) ) CALL mp_stop( 8076 ) 2173 ! 2174 alldata( :, 1:recvcount( 1 ) ) = mydata( :, 1:recvcount( 1 ) ) 2175#endif 2176 RETURN 2177 END SUBROUTINE mp_gatherv_rm 2178 2179!------------------------------------------------------------------------------! 2180!..mp_gatherv_im 2181!..Carlo Cavazzoni 2182 2183 SUBROUTINE mp_gatherv_im( mydata, alldata, recvcount, displs, root, gid) 2184 IMPLICIT NONE 2185 INTEGER :: mydata(:,:) ! Warning first dimension is supposed constant! 2186 INTEGER :: alldata(:,:) 2187 INTEGER, INTENT(IN) :: recvcount(:), displs(:), root 2188 INTEGER, INTENT(IN) :: gid 2189 INTEGER :: group 2190 INTEGER :: ierr, npe, myid, nsiz 2191 INTEGER, ALLOCATABLE :: nrecv(:), ndisp(:) 2192 2193 2194#if defined (__MPI) 2195 group = gid 2196 CALL mpi_comm_size( group, npe, ierr ) 2197 IF (ierr/=0) CALL mp_stop( 8069 ) 2198 CALL mpi_comm_rank( group, myid, ierr ) 2199 IF (ierr/=0) CALL mp_stop( 8070 ) 2200 ! 2201 IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 ) 2202 IF ( myid == root ) THEN 2203 IF ( SIZE( alldata, 2 ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 8072 ) 2204 IF ( SIZE( alldata, 1 ) /= SIZE( mydata, 1 ) ) CALL mp_stop( 8072 ) 2205 END IF 2206 IF ( SIZE( mydata, 2 ) < recvcount( myid + 1 ) ) CALL mp_stop( 8073 ) 2207 ! 2208 ALLOCATE( nrecv( npe ), ndisp( npe ) ) 2209 ! 2210 nrecv( 1:npe ) = recvcount( 1:npe ) * SIZE( mydata, 1 ) 2211 ndisp( 1:npe ) = displs( 1:npe ) * SIZE( mydata, 1 ) 2212 ! 2213 CALL MPI_GATHERV( mydata, nrecv( myid + 1 ), MPI_INTEGER, & 2214 alldata, nrecv, ndisp, MPI_INTEGER, root, group, ierr ) 2215 IF (ierr/=0) CALL mp_stop( 8074 ) 2216 ! 2217 DEALLOCATE( nrecv, ndisp ) 2218 ! 2219#else 2220 IF ( SIZE( alldata, 1 ) /= SIZE( mydata, 1 ) ) CALL mp_stop( 8075 ) 2221 IF ( SIZE( alldata, 2 ) < recvcount( 1 ) ) CALL mp_stop( 8075 ) 2222 IF ( SIZE( mydata, 2 ) < recvcount( 1 ) ) CALL mp_stop( 8076 ) 2223 ! 2224 alldata( :, 1:recvcount( 1 ) ) = mydata( :, 1:recvcount( 1 ) ) 2225#endif 2226 RETURN 2227 END SUBROUTINE mp_gatherv_im 2228 2229 2230!------------------------------------------------------------------------------! 2231!..mp_gatherv_inplace_cplx_array 2232!..Ye Luo 2233 2234 SUBROUTINE mp_gatherv_inplace_cplx_array(alldata, my_column_type, recvcount, displs, root, gid) 2235 IMPLICIT NONE 2236 COMPLEX(DP) :: alldata(:,:) 2237 INTEGER, INTENT(IN) :: my_column_type 2238 INTEGER, INTENT(IN) :: recvcount(:), displs(:) 2239 INTEGER, INTENT(IN) :: root, gid 2240 INTEGER :: ierr, npe, myid 2241 2242#if defined (__MPI) 2243 CALL mpi_comm_size( gid, npe, ierr ) 2244 IF (ierr/=0) CALL mp_stop( 8069 ) 2245 CALL mpi_comm_rank( gid, myid, ierr ) 2246 IF (ierr/=0) CALL mp_stop( 8070 ) 2247 ! 2248 IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 ) 2249 ! 2250 IF (myid==root) THEN 2251 CALL MPI_GATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, & 2252 alldata, recvcount, displs, my_column_type, root, gid, ierr ) 2253 ELSE 2254 CALL MPI_GATHERV( alldata(1,displs(myid+1)+1), recvcount(myid+1), my_column_type, & 2255 MPI_IN_PLACE, recvcount, displs, MPI_DATATYPE_NULL, root, gid, ierr ) 2256 ENDIF 2257 IF (ierr/=0) CALL mp_stop( 8074 ) 2258#endif 2259 RETURN 2260 END SUBROUTINE mp_gatherv_inplace_cplx_array 2261 2262!------------------------------------------------------------------------------! 2263!..mp_allgatherv_inplace_cplx_array 2264!..Ye Luo 2265 2266 SUBROUTINE mp_allgatherv_inplace_cplx_array(alldata, my_element_type, recvcount, displs, gid) 2267 IMPLICIT NONE 2268 COMPLEX(DP) :: alldata(:,:) 2269 INTEGER, INTENT(IN) :: my_element_type 2270 INTEGER, INTENT(IN) :: recvcount(:), displs(:) 2271 INTEGER, INTENT(IN) :: gid 2272 INTEGER :: ierr, npe, myid 2273 2274#if defined (__MPI) 2275 CALL mpi_comm_size( gid, npe, ierr ) 2276 IF (ierr/=0) CALL mp_stop( 8069 ) 2277 CALL mpi_comm_rank( gid, myid, ierr ) 2278 IF (ierr/=0) CALL mp_stop( 8070 ) 2279 ! 2280 IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 ) 2281 ! 2282 CALL MPI_ALLGATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, & 2283 alldata, recvcount, displs, my_element_type, gid, ierr ) 2284 IF (ierr/=0) CALL mp_stop( 8074 ) 2285#endif 2286 RETURN 2287 END SUBROUTINE mp_allgatherv_inplace_cplx_array 2288 2289!.. SdG added 16/08/19 2290 SUBROUTINE mp_allgatherv_inplace_real_array(alldata, my_element_type, recvcount, displs, gid) 2291 IMPLICIT NONE 2292 REAL(DP) :: alldata(:,:) 2293 INTEGER, INTENT(IN) :: my_element_type 2294 INTEGER, INTENT(IN) :: recvcount(:), displs(:) 2295 INTEGER, INTENT(IN) :: gid 2296 INTEGER :: ierr, npe, myid 2297 2298#if defined (__MPI) 2299 CALL mpi_comm_size( gid, npe, ierr ) 2300 IF (ierr/=0) CALL mp_stop( 8069 ) 2301 CALL mpi_comm_rank( gid, myid, ierr ) 2302 IF (ierr/=0) CALL mp_stop( 8070 ) 2303 ! 2304 IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 ) 2305 ! 2306 CALL MPI_ALLGATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, & 2307 alldata, recvcount, displs, my_element_type, gid, ierr ) 2308 IF (ierr/=0) CALL mp_stop( 8074 ) 2309#endif 2310 RETURN 2311 END SUBROUTINE mp_allgatherv_inplace_real_array 2312 2313!------------------------------------------------------------------------------! 2314 2315 SUBROUTINE mp_set_displs( recvcount, displs, ntot, nproc ) 2316 ! Given the number of elements on each processor (recvcount), this subroutine 2317 ! sets the correct offsets (displs) to collect them on a single 2318 ! array with contiguous elemets 2319 IMPLICIT NONE 2320 INTEGER, INTENT(IN) :: recvcount(:) ! number of elements on each processor 2321 INTEGER, INTENT(OUT) :: displs(:) ! offsets/displacements 2322 INTEGER, INTENT(OUT) :: ntot 2323 INTEGER, INTENT(IN) :: nproc 2324 INTEGER :: i 2325 2326 displs( 1 ) = 0 2327 ! 2328#if defined (__MPI) 2329 IF( nproc < 1 ) CALL mp_stop( 8090 ) 2330 DO i = 2, nproc 2331 displs( i ) = displs( i - 1 ) + recvcount( i - 1 ) 2332 END DO 2333 ntot = displs( nproc ) + recvcount( nproc ) 2334#else 2335 ntot = recvcount( 1 ) 2336#endif 2337 RETURN 2338 END SUBROUTINE mp_set_displs 2339 2340!------------------------------------------------------------------------------! 2341 2342 2343SUBROUTINE mp_alltoall_c3d( sndbuf, rcvbuf, gid ) 2344 IMPLICIT NONE 2345 COMPLEX(DP) :: sndbuf( :, :, : ) 2346 COMPLEX(DP) :: rcvbuf( :, :, : ) 2347 INTEGER, INTENT(IN) :: gid 2348 INTEGER :: nsiz, group, ierr, npe 2349 2350#if defined (__MPI) 2351 2352 group = gid 2353 2354 CALL mpi_comm_size( group, npe, ierr ) 2355 IF (ierr/=0) CALL mp_stop( 8069 ) 2356 2357 IF ( SIZE( sndbuf, 3 ) < npe ) CALL mp_stop( 8069 ) 2358 IF ( SIZE( rcvbuf, 3 ) < npe ) CALL mp_stop( 8069 ) 2359 2360 nsiz = SIZE( sndbuf, 1 ) * SIZE( sndbuf, 2 ) 2361 2362 CALL MPI_ALLTOALL( sndbuf, nsiz, MPI_DOUBLE_COMPLEX, & 2363 rcvbuf, nsiz, MPI_DOUBLE_COMPLEX, group, ierr ) 2364 2365 IF (ierr/=0) CALL mp_stop( 8074 ) 2366 2367#else 2368 2369 rcvbuf = sndbuf 2370 2371#endif 2372 2373 RETURN 2374END SUBROUTINE mp_alltoall_c3d 2375 2376 2377!------------------------------------------------------------------------------! 2378 2379SUBROUTINE mp_alltoall_i3d( sndbuf, rcvbuf, gid ) 2380 IMPLICIT NONE 2381 INTEGER :: sndbuf( :, :, : ) 2382 INTEGER :: rcvbuf( :, :, : ) 2383 INTEGER, INTENT(IN) :: gid 2384 INTEGER :: nsiz, group, ierr, npe 2385 2386#if defined (__MPI) 2387 2388 group = gid 2389 2390 CALL mpi_comm_size( group, npe, ierr ) 2391 IF (ierr/=0) CALL mp_stop( 8069 ) 2392 2393 IF ( SIZE( sndbuf, 3 ) < npe ) CALL mp_stop( 8069 ) 2394 IF ( SIZE( rcvbuf, 3 ) < npe ) CALL mp_stop( 8069 ) 2395 2396 nsiz = SIZE( sndbuf, 1 ) * SIZE( sndbuf, 2 ) 2397 2398 CALL MPI_ALLTOALL( sndbuf, nsiz, MPI_INTEGER, & 2399 rcvbuf, nsiz, MPI_INTEGER, group, ierr ) 2400 2401 IF (ierr/=0) CALL mp_stop( 8074 ) 2402 2403#else 2404 2405 rcvbuf = sndbuf 2406 2407#endif 2408 2409 RETURN 2410END SUBROUTINE mp_alltoall_i3d 2411 2412SUBROUTINE mp_circular_shift_left_i0( buf, itag, gid ) 2413 IMPLICIT NONE 2414 INTEGER :: buf 2415 INTEGER, INTENT(IN) :: itag 2416 INTEGER, INTENT(IN) :: gid 2417 INTEGER :: nsiz, group, ierr, npe, sour, dest, mype 2418 2419#if defined (__MPI) 2420 2421 INTEGER :: istatus( mpi_status_size ) 2422 ! 2423 group = gid 2424 ! 2425 CALL mpi_comm_size( group, npe, ierr ) 2426 IF (ierr/=0) CALL mp_stop( 8100 ) 2427 CALL mpi_comm_rank( group, mype, ierr ) 2428 IF (ierr/=0) CALL mp_stop( 8101 ) 2429 ! 2430 sour = mype + 1 2431 IF( sour == npe ) sour = 0 2432 dest = mype - 1 2433 IF( dest == -1 ) dest = npe - 1 2434 ! 2435 CALL MPI_Sendrecv_replace( buf, 1, MPI_INTEGER, & 2436 dest, itag, sour, itag, group, istatus, ierr) 2437 ! 2438 IF (ierr/=0) CALL mp_stop( 8102 ) 2439 ! 2440#else 2441 ! do nothing 2442#endif 2443 RETURN 2444END SUBROUTINE mp_circular_shift_left_i0 2445 2446 2447SUBROUTINE mp_circular_shift_left_i1( buf, itag, gid ) 2448 IMPLICIT NONE 2449 INTEGER :: buf(:) 2450 INTEGER, INTENT(IN) :: itag 2451 INTEGER, INTENT(IN) :: gid 2452 INTEGER :: nsiz, group, ierr, npe, sour, dest, mype 2453 2454#if defined (__MPI) 2455 2456 INTEGER :: istatus( mpi_status_size ) 2457 ! 2458 group = gid 2459 ! 2460 CALL mpi_comm_size( group, npe, ierr ) 2461 IF (ierr/=0) CALL mp_stop( 8100 ) 2462 CALL mpi_comm_rank( group, mype, ierr ) 2463 IF (ierr/=0) CALL mp_stop( 8101 ) 2464 ! 2465 sour = mype + 1 2466 IF( sour == npe ) sour = 0 2467 dest = mype - 1 2468 IF( dest == -1 ) dest = npe - 1 2469 ! 2470 CALL MPI_Sendrecv_replace( buf, SIZE(buf), MPI_INTEGER, & 2471 dest, itag, sour, itag, group, istatus, ierr) 2472 ! 2473 IF (ierr/=0) CALL mp_stop( 8102 ) 2474 ! 2475#else 2476 ! do nothing 2477#endif 2478 RETURN 2479END SUBROUTINE mp_circular_shift_left_i1 2480 2481 2482SUBROUTINE mp_circular_shift_left_i2( buf, itag, gid ) 2483 IMPLICIT NONE 2484 INTEGER :: buf(:,:) 2485 INTEGER, INTENT(IN) :: itag 2486 INTEGER, INTENT(IN) :: gid 2487 INTEGER :: nsiz, group, ierr, npe, sour, dest, mype 2488 2489#if defined (__MPI) 2490 2491 INTEGER :: istatus( mpi_status_size ) 2492 ! 2493 group = gid 2494 ! 2495 CALL mpi_comm_size( group, npe, ierr ) 2496 IF (ierr/=0) CALL mp_stop( 8100 ) 2497 CALL mpi_comm_rank( group, mype, ierr ) 2498 IF (ierr/=0) CALL mp_stop( 8101 ) 2499 ! 2500 sour = mype + 1 2501 IF( sour == npe ) sour = 0 2502 dest = mype - 1 2503 IF( dest == -1 ) dest = npe - 1 2504 ! 2505 CALL MPI_Sendrecv_replace( buf, SIZE(buf), MPI_INTEGER, & 2506 dest, itag, sour, itag, group, istatus, ierr) 2507 ! 2508 IF (ierr/=0) CALL mp_stop( 8102 ) 2509 ! 2510#else 2511 ! do nothing 2512#endif 2513 RETURN 2514END SUBROUTINE mp_circular_shift_left_i2 2515 2516 2517SUBROUTINE mp_circular_shift_left_r2d( buf, itag, gid ) 2518 IMPLICIT NONE 2519 REAL(DP) :: buf( :, : ) 2520 INTEGER, INTENT(IN) :: itag 2521 INTEGER, INTENT(IN) :: gid 2522 INTEGER :: nsiz, group, ierr, npe, sour, dest, mype 2523 2524#if defined (__MPI) 2525 2526 INTEGER :: istatus( mpi_status_size ) 2527 ! 2528 group = gid 2529 ! 2530 CALL mpi_comm_size( group, npe, ierr ) 2531 IF (ierr/=0) CALL mp_stop( 8100 ) 2532 CALL mpi_comm_rank( group, mype, ierr ) 2533 IF (ierr/=0) CALL mp_stop( 8101 ) 2534 ! 2535 sour = mype + 1 2536 IF( sour == npe ) sour = 0 2537 dest = mype - 1 2538 IF( dest == -1 ) dest = npe - 1 2539 ! 2540 CALL MPI_Sendrecv_replace( buf, SIZE(buf), MPI_DOUBLE_PRECISION, & 2541 dest, itag, sour, itag, group, istatus, ierr) 2542 ! 2543 IF (ierr/=0) CALL mp_stop( 8102 ) 2544 ! 2545#else 2546 ! do nothing 2547#endif 2548 RETURN 2549END SUBROUTINE mp_circular_shift_left_r2d 2550 2551SUBROUTINE mp_circular_shift_left_c2d( buf, itag, gid ) 2552 IMPLICIT NONE 2553 COMPLEX(DP) :: buf( :, : ) 2554 INTEGER, INTENT(IN) :: itag 2555 INTEGER, INTENT(IN) :: gid 2556 INTEGER :: nsiz, group, ierr, npe, sour, dest, mype 2557 2558#if defined (__MPI) 2559 2560 INTEGER :: istatus( mpi_status_size ) 2561 ! 2562 group = gid 2563 ! 2564 CALL mpi_comm_size( group, npe, ierr ) 2565 IF (ierr/=0) CALL mp_stop( 8100 ) 2566 CALL mpi_comm_rank( group, mype, ierr ) 2567 IF (ierr/=0) CALL mp_stop( 8101 ) 2568 ! 2569 sour = mype + 1 2570 IF( sour == npe ) sour = 0 2571 dest = mype - 1 2572 IF( dest == -1 ) dest = npe - 1 2573 ! 2574 CALL MPI_Sendrecv_replace( buf, SIZE(buf), MPI_DOUBLE_COMPLEX, & 2575 dest, itag, sour, itag, group, istatus, ierr) 2576 ! 2577 IF (ierr/=0) CALL mp_stop( 8102 ) 2578 ! 2579#else 2580 ! do nothing 2581#endif 2582 RETURN 2583END SUBROUTINE mp_circular_shift_left_c2d 2584 2585 2586!------------------------------------------------------------------------------! 2587!..mp_circular_shift_left_start 2588SUBROUTINE mp_circular_shift_left_start_i0( sendbuf, recvbuf, itag, gid, requests) 2589 IMPLICIT NONE 2590 INTEGER :: sendbuf, recvbuf 2591 INTEGER, INTENT(IN) :: itag 2592 INTEGER, INTENT(IN) :: gid 2593 INTEGER, INTENT(INOUT) :: requests(2) 2594 INTEGER :: nsiz, group, ierr, npe, sour, dest, mype 2595 2596#if defined (__MPI) 2597 2598 !set null requests 2599 requests = mpi_request_null 2600 2601 !communicator 2602 group = gid 2603 ! 2604 CALL mpi_comm_size( group, npe, ierr ) 2605 IF (ierr/=0) CALL mp_stop( 8100 ) 2606 CALL mpi_comm_rank( group, mype, ierr ) 2607 IF (ierr/=0) CALL mp_stop( 8101 ) 2608 ! 2609 sour = modulo(mype + 1, npe) 2610 dest = modulo(mype + npe - 1, npe) 2611 ! 2612 CALL MPI_Irecv( recvbuf, 1, MPI_INTEGER, & 2613 sour, itag, group, requests(1), ierr) 2614 ! 2615 IF (ierr/=0) CALL mp_stop( 8102 ) 2616 ! 2617 CALL MPI_Isend( sendbuf, 1, MPI_INTEGER, & 2618 dest, itag, group, requests(2), ierr) 2619 ! 2620 IF (ierr/=0) CALL mp_stop( 8103 ) 2621 ! 2622#else 2623 2624 recvbuf = sendbuf 2625 2626#endif 2627 RETURN 2628END SUBROUTINE mp_circular_shift_left_start_i0 2629 2630 2631SUBROUTINE mp_circular_shift_left_start_i1( sendbuf, recvbuf, itag, gid, requests) 2632 IMPLICIT NONE 2633 INTEGER :: sendbuf( : ), recvbuf( : ) 2634 INTEGER, INTENT(IN) :: itag 2635 INTEGER, INTENT(IN) :: gid 2636 INTEGER, INTENT(INOUT) :: requests(2) 2637 INTEGER :: nsiz, group, ierr, npe, sour, dest, mype 2638 2639#if defined (__MPI) 2640 2641 !set null requests 2642 requests = mpi_request_null 2643 2644 !communicator 2645 group = gid 2646 ! 2647 IF( size(sendbuf)/=size(recvbuf) ) CALL mp_stop(8099) 2648 CALL mpi_comm_size( group, npe, ierr ) 2649 IF (ierr/=0) CALL mp_stop( 8100 ) 2650 CALL mpi_comm_rank( group, mype, ierr ) 2651 IF (ierr/=0) CALL mp_stop( 8101 ) 2652 ! 2653 sour = modulo(mype + 1, npe) 2654 dest = modulo(mype + npe - 1, npe) 2655 ! 2656 CALL MPI_Irecv( recvbuf, size(recvbuf), MPI_INTEGER, & 2657 sour, itag, group, requests(1), ierr) 2658 ! 2659 IF (ierr/=0) CALL mp_stop( 8102 ) 2660 ! 2661 CALL MPI_Isend( sendbuf, size(sendbuf), MPI_INTEGER, & 2662 dest, itag, group, requests(2), ierr) 2663 ! 2664 IF (ierr/=0) CALL mp_stop( 8103 ) 2665 ! 2666#else 2667 2668 recvbuf = sendbuf 2669 2670#endif 2671 RETURN 2672END SUBROUTINE mp_circular_shift_left_start_i1 2673 2674 2675SUBROUTINE mp_circular_shift_left_start_i2( sendbuf, recvbuf, itag, gid, requests) 2676 IMPLICIT NONE 2677 INTEGER :: sendbuf( :, : ), recvbuf( :, : ) 2678 INTEGER, INTENT(IN) :: itag 2679 INTEGER, INTENT(IN) :: gid 2680 INTEGER, INTENT(INOUT) :: requests(2) 2681 INTEGER :: nsiz, group, ierr, npe, sour, dest, mype 2682 2683#if defined (__MPI) 2684 2685 !set null requests 2686 requests = mpi_request_null 2687 2688 !communicator 2689 group = gid 2690 ! 2691 IF( size(sendbuf)/=size(recvbuf) ) CALL mp_stop(8099) 2692 CALL mpi_comm_size( group, npe, ierr ) 2693 IF (ierr/=0) CALL mp_stop( 8100 ) 2694 CALL mpi_comm_rank( group, mype, ierr ) 2695 IF (ierr/=0) CALL mp_stop( 8101 ) 2696 ! 2697 sour = modulo(mype + 1, npe) 2698 dest = modulo(mype + npe - 1, npe) 2699 ! 2700 CALL MPI_Irecv( recvbuf, size(recvbuf), MPI_INTEGER, & 2701 sour, itag, group, requests(1), ierr) 2702 ! 2703 IF (ierr/=0) CALL mp_stop( 8102 ) 2704 ! 2705 CALL MPI_Isend( sendbuf, size(sendbuf), MPI_INTEGER, & 2706 dest, itag, group, requests(2), ierr) 2707 ! 2708 IF (ierr/=0) CALL mp_stop( 8103 ) 2709 ! 2710#else 2711 2712 recvbuf = sendbuf 2713 2714#endif 2715 RETURN 2716END SUBROUTINE mp_circular_shift_left_start_i2 2717 2718 2719SUBROUTINE mp_circular_shift_left_start_r2d( sendbuf, recvbuf, itag, gid, requests) 2720 IMPLICIT NONE 2721 REAL(DP) :: sendbuf( :, : ), recvbuf( :, : ) 2722 INTEGER, INTENT(IN) :: itag 2723 INTEGER, INTENT(IN) :: gid 2724 INTEGER, INTENT(INOUT) :: requests(2) 2725 INTEGER :: nsiz, group, ierr, npe, sour, dest, mype 2726 2727#if defined (__MPI) 2728 2729 !set null requests 2730 requests = mpi_request_null 2731 2732 !communicator 2733 group = gid 2734 ! 2735 IF( size(sendbuf)/=size(recvbuf) ) CALL mp_stop(8099) 2736 CALL mpi_comm_size( group, npe, ierr ) 2737 IF (ierr/=0) CALL mp_stop( 8100 ) 2738 CALL mpi_comm_rank( group, mype, ierr ) 2739 IF (ierr/=0) CALL mp_stop( 8101 ) 2740 ! 2741 sour = modulo(mype + 1, npe) 2742 dest = modulo(mype + npe - 1, npe) 2743 ! 2744 CALL MPI_Irecv( recvbuf, size(recvbuf), MPI_DOUBLE_PRECISION, & 2745 sour, itag, group, requests(1), ierr) 2746 ! 2747 IF (ierr/=0) CALL mp_stop( 8102 ) 2748 ! 2749 CALL MPI_Isend( sendbuf, size(sendbuf), MPI_DOUBLE_PRECISION, & 2750 dest, itag, group, requests(2), ierr) 2751 ! 2752 IF (ierr/=0) CALL mp_stop( 8103 ) 2753 ! 2754#else 2755 2756 recvbuf = sendbuf 2757 2758#endif 2759 RETURN 2760END SUBROUTINE mp_circular_shift_left_start_r2d 2761 2762 2763SUBROUTINE mp_circular_shift_left_start_c2d( sendbuf, recvbuf, itag, gid, requests) 2764 IMPLICIT NONE 2765 COMPLEX(DP) :: sendbuf( :, : ), recvbuf( :, : ) 2766 INTEGER, INTENT(IN) :: itag 2767 INTEGER, INTENT(IN) :: gid 2768 INTEGER, INTENT(INOUT) :: requests(2) 2769 INTEGER :: nsiz, group, ierr, npe, sour, dest, mype 2770 2771#if defined (__MPI) 2772 2773 !set null requests 2774 requests = mpi_request_null 2775 2776 !communicator 2777 group = gid 2778 ! 2779 IF( size(sendbuf)/=size(recvbuf) ) CALL mp_stop(8099) 2780 CALL mpi_comm_size( group, npe, ierr ) 2781 IF (ierr/=0) CALL mp_stop( 8100 ) 2782 CALL mpi_comm_rank( group, mype, ierr ) 2783 IF (ierr/=0) CALL mp_stop( 8101 ) 2784 ! 2785 sour = modulo(mype + 1, npe) 2786 dest = modulo(mype + npe - 1, npe) 2787 ! 2788 CALL MPI_Irecv( recvbuf, size(recvbuf), MPI_DOUBLE_COMPLEX, & 2789 sour, itag, group, requests(1), ierr) 2790 ! 2791 IF (ierr/=0) CALL mp_stop( 8102 ) 2792 ! 2793 CALL MPI_Isend( sendbuf, size(sendbuf), MPI_DOUBLE_COMPLEX, & 2794 dest, itag, group, requests(2), ierr) 2795 ! 2796 IF (ierr/=0) CALL mp_stop( 8103 ) 2797 ! 2798#else 2799 2800 recvbuf = sendbuf 2801 2802#endif 2803 RETURN 2804END SUBROUTINE mp_circular_shift_left_start_c2d 2805! 2806! 2807!------------------------------------------------------------------------------! 2808!..mp_count_nodes 2809SUBROUTINE mp_count_nodes(num_nodes, color, key, group) 2810 ! 2811 ! ... This routine counts the number of nodes using 2812 ! ... MPI_GET_PROCESSOR_NAME in the group specified by `group`. 2813 ! ... It returns colors and keys to be used in MPI_COMM_SPLIT. 2814 ! ... When running in parallel, the evaluation of color and key 2815 ! ... is done by all processors. 2816 ! ... 2817 ! ... 2818 ! ... input: 2819 ! ... group Communicator used to count nodes. 2820 ! 2821 ! ... output: 2822 ! ... num_nodes Number of unique nodes in the communicator 2823 ! ... color Integer (positive), same for all processes residing on a node. 2824 ! ... key Integer, unique number identifying each process on the same node. 2825 ! ... 2826 IMPLICIT NONE 2827 INTEGER, INTENT (OUT) :: num_nodes 2828 INTEGER, INTENT (OUT) :: color 2829 INTEGER, INTENT (OUT) :: key 2830 INTEGER, INTENT (IN) :: group 2831#if defined (__MPI) 2832 CHARACTER(len=MPI_MAX_PROCESSOR_NAME) :: hostname 2833 CHARACTER(len=MPI_MAX_PROCESSOR_NAME), ALLOCATABLE :: host_list(:) 2834#endif 2835 2836 LOGICAL, ALLOCATABLE :: found_list(:) 2837 INTEGER, ALLOCATABLE :: color_list(:) 2838 INTEGER, ALLOCATABLE :: key_list(:) 2839 ! 2840 INTEGER :: hostname_len, max_hostname_len, numtask, me, ierr 2841 ! 2842 ! Loops variables 2843 INTEGER :: i, j, e, s, c, k 2844 ! ... 2845 ierr = 0 2846 num_nodes = 1 2847 color = 1 2848 key = 0 2849 ! 2850#if defined(__MPI) 2851 ! 2852 CALL MPI_GET_PROCESSOR_NAME(hostname, hostname_len, ierr) 2853 IF (ierr/=0) CALL mp_stop( 8103 ) 2854 2855 ! find total number of ranks and my rank in communicator 2856 CALL MPI_COMM_SIZE(group, numtask, ierr) 2857 IF (ierr/=0) CALL mp_stop( 8104 ) 2858 ! 2859 CALL MPI_COMM_RANK(group, me, ierr) 2860 IF (ierr/=0) CALL mp_stop( 8105 ) 2861 ! 2862 ALLOCATE(host_list(0:numtask-1)) 2863 ! 2864 host_list(me) = hostname(1:hostname_len) 2865 ! 2866 ! Each process broadcast its name to the others 2867 DO i=0,numtask-1 2868 CALL MPI_BCAST(host_list(i), MPI_MAX_PROCESSOR_NAME, MPI_CHARACTER,& 2869 i, group, ierr) 2870 IF (ierr/=0) CALL mp_stop( 8106 ) 2871 END DO 2872 ! 2873 ! Simple algorithm to count unique entries. 2874 ! 2875 ALLOCATE(found_list(0:numtask-1),color_list(0:numtask-1)) 2876 ALLOCATE(key_list(0:numtask-1)) 2877 found_list(:) = .false. 2878 color_list(:) = -1 2879 key_list(:) = -1 2880 ! 2881 ! c is the counter for colors 2882 ! k is the counter for keys 2883 ! 2884 c = 0 2885 DO i=0,numtask-1 2886 ! if node_counter == .true., this element has already been found, 2887 ! so skip it. 2888 IF (found_list(i)) CYCLE 2889 ! else increment color counter and reset key counter 2890 c = c + 1; k = 0 2891 color_list(i) = c 2892 key_list(i) = k 2893 ! 2894 DO j=i+1,numtask-1 2895 ! 2896 IF ( LLE(host_list(i),host_list(j)) .and. & 2897 LGE(host_list(i),host_list(j)) ) THEN 2898 ! increment the key, key=0 is the one we are comparing to 2899 k = k + 1 2900 ! element should not be already found 2901 IF ( found_list(j) ) CALL mp_stop( 8107 ) 2902 found_list(j) = .true. 2903 color_list(j) = c 2904 key_list(j) = k 2905 END IF 2906 END DO 2907 END DO 2908 ! Sanity checks 2909 IF ( MINVAL(color_list) < 0 ) CALL mp_stop( 8108 ) 2910 IF ( MINVAL(key_list) < 0 ) CALL mp_stop( 8109 ) 2911 ! 2912 color = color_list(me) 2913 key = key_list(me) 2914 num_nodes = MAXVAL(color_list) 2915 DEALLOCATE(host_list,found_list,color_list,key_list) 2916! 2917#endif 2918 RETURN 2919END SUBROUTINE mp_count_nodes 2920! 2921FUNCTION mp_get_comm_null( ) 2922 IMPLICIT NONE 2923 INTEGER :: mp_get_comm_null 2924 mp_get_comm_null = MPI_COMM_NULL 2925END FUNCTION mp_get_comm_null 2926 2927FUNCTION mp_get_comm_self( ) 2928 IMPLICIT NONE 2929 INTEGER :: mp_get_comm_self 2930 mp_get_comm_self = MPI_COMM_SELF 2931END FUNCTION mp_get_comm_self 2932 2933SUBROUTINE mp_type_create_cplx_column_section(dummy, start, length, stride, mytype) 2934 IMPLICIT NONE 2935 ! 2936 COMPLEX (DP), INTENT(IN) :: dummy 2937 INTEGER, INTENT(IN) :: start, length, stride 2938 INTEGER, INTENT(OUT) :: mytype 2939 ! 2940#if defined(__MPI) 2941 INTEGER :: ierr 2942 ! 2943 CALL MPI_TYPE_CREATE_SUBARRAY(1, stride, length, start, MPI_ORDER_FORTRAN,& 2944 MPI_DOUBLE_COMPLEX, mytype, ierr) 2945 IF (ierr/=0) CALL mp_stop( 8081 ) 2946 CALL MPI_Type_commit(mytype, ierr) 2947 IF (ierr/=0) CALL mp_stop( 8082 ) 2948#else 2949 mytype = 0; 2950#endif 2951 ! 2952 RETURN 2953END SUBROUTINE mp_type_create_cplx_column_section 2954 2955SUBROUTINE mp_type_create_real_column_section(dummy, start, length, stride, mytype) 2956 IMPLICIT NONE 2957 ! 2958 REAL (DP), INTENT(IN) :: dummy 2959 INTEGER, INTENT(IN) :: start, length, stride 2960 INTEGER, INTENT(OUT) :: mytype 2961 ! 2962#if defined(__MPI) 2963 INTEGER :: ierr 2964 ! 2965 CALL MPI_TYPE_CREATE_SUBARRAY(1, stride, length, start, MPI_ORDER_FORTRAN,& 2966 MPI_DOUBLE_PRECISION, mytype, ierr) 2967 IF (ierr/=0) CALL mp_stop( 8083 ) 2968 CALL MPI_Type_commit(mytype, ierr) 2969 IF (ierr/=0) CALL mp_stop( 8084 ) 2970#else 2971 mytype = 0; 2972#endif 2973 ! 2974 RETURN 2975END SUBROUTINE mp_type_create_real_column_section 2976 2977SUBROUTINE mp_type_create_cplx_row_section(dummy, column_start, column_stride, row_length, mytype) 2978 IMPLICIT NONE 2979 ! 2980 COMPLEX (DP), INTENT(IN) :: dummy 2981 INTEGER, INTENT(IN) :: column_start, column_stride, row_length 2982 INTEGER, INTENT(OUT) :: mytype 2983 ! 2984#if defined(__MPI) 2985 INTEGER :: ierr, temporary 2986 INTEGER :: strides(2), lengths(2), starts(2) 2987 INTEGER(KIND=MPI_ADDRESS_KIND) :: lb, extent 2988 ! 2989 strides(1) = column_stride ; strides(2) = row_length 2990 lengths(1) = 1 ; lengths(2) = row_length 2991 starts(1) = column_start ; starts(2) = 0 2992 CALL MPI_TYPE_CREATE_SUBARRAY(2, strides, lengths, starts, MPI_ORDER_FORTRAN,& 2993 MPI_DOUBLE_COMPLEX, temporary, ierr) 2994 IF (ierr/=0) CALL mp_stop( 8085 ) 2995 CALL MPI_TYPE_GET_EXTENT(MPI_DOUBLE_COMPLEX, lb, extent, ierr) 2996 IF (ierr/=0) CALL mp_stop( 8085 ) 2997 CALL MPI_TYPE_COMMIT(temporary, ierr) 2998 IF (ierr/=0) CALL mp_stop( 8085 ) 2999 CALL MPI_TYPE_CREATE_RESIZED(temporary, lb, extent, mytype, ierr) 3000 IF (ierr/=0) CALL mp_stop( 8086 ) 3001 CALL MPI_Type_commit(mytype, ierr) 3002 IF (ierr/=0) CALL mp_stop( 8086 ) 3003#else 3004 mytype = 0; 3005#endif 3006 ! 3007 RETURN 3008END SUBROUTINE mp_type_create_cplx_row_section 3009 3010SUBROUTINE mp_type_create_real_row_section(dummy, column_start, column_stride, row_length, mytype) 3011 IMPLICIT NONE 3012 ! 3013 REAL (DP), INTENT(IN) :: dummy 3014 INTEGER, INTENT(IN) :: column_start, column_stride, row_length 3015 INTEGER, INTENT(OUT) :: mytype 3016 ! 3017#if defined(__MPI) 3018 INTEGER :: ierr, temporary 3019 INTEGER :: strides(2), lengths(2), starts(2) 3020 INTEGER(KIND=MPI_ADDRESS_KIND) :: lb, extent 3021 ! 3022 strides(1) = column_stride ; strides(2) = row_length 3023 lengths(1) = 1 ; lengths(2) = row_length 3024 starts(1) = column_start ; starts(2) = 0 3025 CALL MPI_TYPE_CREATE_SUBARRAY(2, strides, lengths, starts, MPI_ORDER_FORTRAN,& 3026 MPI_DOUBLE_PRECISION, temporary, ierr) 3027 IF (ierr/=0) CALL mp_stop( 8087 ) 3028 CALL MPI_TYPE_GET_EXTENT(MPI_DOUBLE_PRECISION, lb, extent, ierr) 3029 IF (ierr/=0) CALL mp_stop( 8087 ) 3030 CALL MPI_TYPE_COMMIT(temporary, ierr) 3031 IF (ierr/=0) CALL mp_stop( 8087 ) 3032 CALL MPI_TYPE_CREATE_RESIZED(temporary, lb, extent, mytype, ierr) 3033 IF (ierr/=0) CALL mp_stop( 8088 ) 3034 CALL MPI_Type_commit(mytype, ierr) 3035 IF (ierr/=0) CALL mp_stop( 8088 ) 3036#else 3037 mytype = 0; 3038#endif 3039 ! 3040 RETURN 3041END SUBROUTINE mp_type_create_real_row_section 3042 3043SUBROUTINE mp_type_free(mytype) 3044 IMPLICIT NONE 3045 INTEGER :: mytype, ierr 3046 ! 3047#if defined(__MPI) 3048 CALL MPI_TYPE_FREE(mytype, ierr) 3049 IF (ierr/=0) CALL mp_stop( 8089 ) 3050#endif 3051 ! 3052 RETURN 3053END SUBROUTINE mp_type_free 3054!------------------------------------------------------------------------------! 3055! GPU specific subroutines (Pietro Bonfa') 3056!------------------------------------------------------------------------------! 3057! Before hacking on the CUDA part remember that: 3058! 3059! 1. all mp_* interface should be blocking with respect to both MPI and CUDA. 3060! MPI will only wait for completion on the default stream therefore device 3061! synchronization must be enforced. 3062! 2. Host -> device memory copies of a memory block of 64 KB or less are 3063! asynchronous in the sense that they may return before the data is actually 3064! available on the GPU. However, the user is still free to change the buffer 3065! as soon as those calls return with no ill effects. 3066! (https://devtalk.nvidia.com/default/topic/471866/cuda-programming-and-performance/host-device-memory-copies-up-to-64-kb-are-asynchronous/) 3067! 3. For transfers from device to either pageable or pinned host memory, 3068! the function returns only once the copy has completed. 3069! 4. GPU synchronization is always enforced even if no communication takes place. 3070!------------------------------------------------------------------------------! 3071#ifdef __CUDA 3072 3073!------------------------------------------------------------------------------! 3074!..mp_bcast 3075 3076 SUBROUTINE mp_bcast_i1_gpu(msg_d,source,gid) 3077 IMPLICIT NONE 3078 INTEGER, DEVICE :: msg_d 3079 INTEGER :: msg_h 3080 INTEGER :: source 3081 INTEGER, INTENT(IN) :: gid 3082 INTEGER :: group 3083 INTEGER :: msglen, ierr 3084 ! 3085#if defined(__MPI) 3086 msglen = 1 3087 group = gid 3088#if defined(__GPU_MPI) 3089 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3090 CALL bcast_integer_gpu( msg_d, msglen, source, group ) 3091 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3092#else 3093 msg_h = msg_d ! This syncs __MPI case 3094 CALL bcast_integer( msg_h, msglen, source, group ) 3095 msg_d = msg_h 3096#endif 3097#endif 3098 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3099 END SUBROUTINE mp_bcast_i1_gpu 3100! 3101!------------------------------------------------------------------------------! 3102! 3103 SUBROUTINE mp_bcast_iv_gpu(msg_d,source,gid) 3104 IMPLICIT NONE 3105 INTEGER, DEVICE :: msg_d(:) 3106 INTEGER, ALLOCATABLE :: msg_h(:) 3107 INTEGER, INTENT(IN) :: source 3108 INTEGER, INTENT(IN) :: gid 3109 INTEGER :: msglen, ierr 3110 ! 3111#if defined(__MPI) 3112#if defined(__GPU_MPI) 3113 msglen = size(msg_d) 3114 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3115 CALL bcast_integer_gpu( msg_d, msglen, source, gid ) 3116 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3117#else 3118 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 3119 msglen = size(msg_h) 3120 CALL bcast_integer( msg_h, msglen, source, gid ) 3121 msg_d = msg_h; DEALLOCATE( msg_h ) 3122#endif 3123#endif 3124 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3125 END SUBROUTINE mp_bcast_iv_gpu 3126! 3127!------------------------------------------------------------------------------! 3128! 3129 SUBROUTINE mp_bcast_im_gpu( msg_d, source, gid ) 3130 IMPLICIT NONE 3131 INTEGER, DEVICE :: msg_d(:,:) 3132 INTEGER, ALLOCATABLE :: msg_h(:,:) 3133 INTEGER, INTENT(IN) :: source 3134 INTEGER, INTENT(IN) :: gid 3135 INTEGER :: msglen, ierr 3136#if defined(__MPI) 3137#if defined(__GPU_MPI) 3138 msglen = size(msg_d) 3139 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3140 CALL bcast_integer_gpu( msg_d, msglen, source, gid ) 3141 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3142#else 3143 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 3144 msglen = size(msg_h) 3145 CALL bcast_integer( msg_h, msglen, source, gid ) 3146 msg_d = msg_h; DEALLOCATE( msg_h ) 3147#endif 3148#endif 3149 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3150 END SUBROUTINE mp_bcast_im_gpu 3151! 3152!------------------------------------------------------------------------------! 3153! 3154 SUBROUTINE mp_bcast_it_gpu( msg_d, source, gid ) 3155 IMPLICIT NONE 3156 INTEGER, DEVICE :: msg_d(:,:,:) 3157 3158 INTEGER, INTENT(IN) :: source 3159 INTEGER, INTENT(IN) :: gid 3160 INTEGER :: msglen, ierr 3161#if defined(__MPI) 3162#if defined(__GPU_MPI) 3163 msglen = size(msg_d) 3164 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3165 CALL bcast_integer_gpu( msg_d, msglen, source, gid ) 3166 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3167#else 3168 INTEGER, ALLOCATABLE :: msg_h(:,:,:) 3169 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 3170 msglen = size(msg_h) 3171 CALL bcast_integer( msg_h, msglen, source, gid ) 3172 msg_d = msg_h; DEALLOCATE( msg_h ) 3173#endif 3174#endif 3175 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3176 END SUBROUTINE mp_bcast_it_gpu 3177! 3178!------------------------------------------------------------------------------! 3179! 3180 SUBROUTINE mp_bcast_i4d_gpu(msg_d, source, gid) 3181 IMPLICIT NONE 3182 INTEGER, DEVICE :: msg_d(:,:,:,:) 3183 INTEGER, ALLOCATABLE :: msg_h(:,:,:,:) 3184 INTEGER, INTENT(IN) :: source 3185 INTEGER, INTENT(IN) :: gid 3186 INTEGER :: msglen, ierr 3187#if defined(__MPI) 3188#if defined(__GPU_MPI) 3189 msglen = size(msg_d) 3190 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3191 CALL bcast_integer_gpu( msg_d, msglen, source, gid ) 3192 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3193#else 3194 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 3195 msglen = size(msg_h) 3196 CALL bcast_integer( msg_h, msglen, source, gid ) 3197 msg_d = msg_h; DEALLOCATE( msg_h ) 3198#endif 3199#endif 3200 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3201 END SUBROUTINE mp_bcast_i4d_gpu 3202! 3203!------------------------------------------------------------------------------! 3204! 3205 SUBROUTINE mp_bcast_r1_gpu( msg_d, source, gid ) 3206 IMPLICIT NONE 3207 REAL (DP), DEVICE :: msg_d 3208 REAL (DP) :: msg_h 3209 INTEGER, INTENT(IN) :: source 3210 INTEGER, INTENT(IN) :: gid 3211 INTEGER :: msglen, ierr 3212#if defined(__MPI) 3213 msglen = 1 3214#if defined(__GPU_MPI) 3215 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3216 CALL bcast_real_gpu( msg_d, msglen, source, gid ) 3217 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3218#else 3219 msg_h=msg_d ! This syncs __MPI case 3220 CALL bcast_real( msg_h, msglen, source, gid ) 3221 msg_d = msg_h 3222#endif 3223#endif 3224 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3225 END SUBROUTINE mp_bcast_r1_gpu 3226! 3227!------------------------------------------------------------------------------! 3228! 3229 SUBROUTINE mp_bcast_rv_gpu(msg_d,source,gid) 3230 IMPLICIT NONE 3231 REAL (DP), DEVICE :: msg_d(:) 3232 REAL (DP), ALLOCATABLE :: msg_h(:) 3233 INTEGER, INTENT(IN) :: source 3234 INTEGER, INTENT(IN) :: gid 3235 INTEGER :: msglen, ierr 3236#if defined(__MPI) 3237#if defined(__GPU_MPI) 3238 msglen = size(msg_d) 3239 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3240 CALL bcast_real_gpu( msg_d, msglen, source, gid ) 3241 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3242#else 3243 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 3244 msglen = size(msg_h) 3245 CALL bcast_real( msg_h, msglen, source, gid ) 3246 msg_d = msg_h ; DEALLOCATE(msg_h) 3247#endif 3248#endif 3249 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3250 END SUBROUTINE mp_bcast_rv_gpu 3251! 3252!------------------------------------------------------------------------------! 3253! 3254 SUBROUTINE mp_bcast_rm_gpu(msg_d,source,gid) 3255 IMPLICIT NONE 3256 REAL (DP), DEVICE :: msg_d(:,:) 3257 INTEGER, INTENT(IN) :: source 3258 INTEGER, INTENT(IN) :: gid 3259 INTEGER :: msglen, ierr 3260#if defined(__MPI) 3261#if defined(__GPU_MPI) 3262 msglen = size(msg_d) 3263 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3264 CALL bcast_real_gpu( msg_d, msglen, source, gid ) 3265 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3266#else 3267 REAL (DP), ALLOCATABLE :: msg_h(:,:) 3268 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 3269 msglen = size(msg_h) 3270 CALL bcast_real( msg_h, msglen, source, gid ) 3271 msg_d = msg_h ; DEALLOCATE(msg_h) 3272#endif 3273#endif 3274 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3275 END SUBROUTINE mp_bcast_rm_gpu 3276! 3277!------------------------------------------------------------------------------! 3278! 3279 SUBROUTINE mp_bcast_rt_gpu(msg_d,source,gid) 3280 IMPLICIT NONE 3281 REAL (DP), DEVICE :: msg_d(:,:,:) 3282 REAL (DP), ALLOCATABLE :: msg_h(:,:,:) 3283 INTEGER, INTENT(IN) :: source 3284 INTEGER, INTENT(IN) :: gid 3285 INTEGER :: msglen, ierr 3286#if defined(__MPI) 3287#if defined(__GPU_MPI) 3288 msglen = size(msg_d) 3289 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3290 CALL bcast_real_gpu( msg_d, msglen, source, gid ) 3291 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3292#else 3293 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 3294 msglen = size(msg_h) 3295 CALL bcast_real( msg_h, msglen, source, gid ) 3296 msg_d = msg_h ; DEALLOCATE(msg_h) 3297#endif 3298#endif 3299 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3300 END SUBROUTINE mp_bcast_rt_gpu 3301! 3302!------------------------------------------------------------------------------! 3303! 3304 SUBROUTINE mp_bcast_r4d_gpu(msg_d, source, gid) 3305 IMPLICIT NONE 3306 REAL (DP), DEVICE :: msg_d(:,:,:,:) 3307 REAL (DP), ALLOCATABLE :: msg_h(:,:,:,:) 3308 INTEGER, INTENT(IN) :: source 3309 INTEGER, INTENT(IN) :: gid 3310 INTEGER :: msglen, ierr 3311#if defined(__MPI) 3312#if defined(__GPU_MPI) 3313 msglen = size(msg_d) 3314 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3315 CALL bcast_real_gpu( msg_d, msglen, source, gid ) 3316 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3317#else 3318 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 3319 msglen = size(msg_h) 3320 CALL bcast_real( msg_h, msglen, source, gid ) 3321 msg_d = msg_h ; DEALLOCATE(msg_h) 3322#endif 3323#endif 3324 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3325 END SUBROUTINE mp_bcast_r4d_gpu 3326! 3327!------------------------------------------------------------------------------! 3328! 3329 SUBROUTINE mp_bcast_r5d_gpu(msg_d, source, gid) 3330 IMPLICIT NONE 3331 REAL (DP), DEVICE :: msg_d(:,:,:,:,:) 3332 REAL (DP), ALLOCATABLE :: msg_h(:,:,:,:,:) 3333 INTEGER, INTENT(IN) :: source 3334 INTEGER, INTENT(IN) :: gid 3335 INTEGER :: msglen, ierr 3336#if defined(__MPI) 3337#if defined(__GPU_MPI) 3338 msglen = size(msg_d) 3339 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3340 CALL bcast_real_gpu( msg_d, msglen, source, gid ) 3341 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3342#else 3343 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 3344 msglen = size(msg_h) 3345 CALL bcast_real( msg_h, msglen, source, gid ) 3346 msg_d = msg_h ; DEALLOCATE(msg_h) 3347#endif 3348#endif 3349 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3350 END SUBROUTINE mp_bcast_r5d_gpu 3351! 3352!------------------------------------------------------------------------------! 3353! 3354 SUBROUTINE mp_bcast_c1_gpu(msg_d,source,gid) 3355 IMPLICIT NONE 3356 COMPLEX (DP), DEVICE :: msg_d 3357 COMPLEX (DP) :: msg_h 3358 INTEGER, INTENT(IN) :: source 3359 INTEGER, INTENT(IN) :: gid 3360 INTEGER :: msglen, ierr 3361#if defined(__MPI) 3362 msglen = 1 3363#if defined(__GPU_MPI) 3364 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3365 CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid ) 3366 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3367#else 3368 msg_h=msg_d ! This syncs __MPI case 3369 CALL bcast_real( msg_h, 2 * msglen, source, gid ) 3370 msg_d = msg_h 3371#endif 3372#endif 3373 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3374 END SUBROUTINE mp_bcast_c1_gpu 3375! 3376!------------------------------------------------------------------------------! 3377! 3378 SUBROUTINE mp_bcast_cv_gpu(msg_d,source,gid) 3379 IMPLICIT NONE 3380 COMPLEX (DP), DEVICE :: msg_d(:) 3381 COMPLEX (DP), ALLOCATABLE :: msg_h(:) 3382 INTEGER, INTENT(IN) :: source 3383 INTEGER, INTENT(IN) :: gid 3384 INTEGER :: msglen, ierr 3385#if defined(__MPI) 3386#if defined(__GPU_MPI) 3387 msglen = size(msg_d) 3388 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3389 CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid ) 3390 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3391#else 3392 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 3393 msglen = size(msg_h) 3394 CALL bcast_real( msg_h, 2 * msglen, source, gid ) 3395 msg_d = msg_h ; DEALLOCATE(msg_h) 3396#endif 3397#endif 3398 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3399 END SUBROUTINE mp_bcast_cv_gpu 3400! 3401!------------------------------------------------------------------------------! 3402! 3403 SUBROUTINE mp_bcast_cm_gpu(msg_d,source,gid) 3404 IMPLICIT NONE 3405 COMPLEX (DP), DEVICE :: msg_d(:,:) 3406 COMPLEX (DP), ALLOCATABLE :: msg_h(:,:) 3407 INTEGER, INTENT(IN) :: source 3408 INTEGER, INTENT(IN) :: gid 3409 INTEGER :: msglen, ierr 3410#if defined(__MPI) 3411#if defined(__GPU_MPI) 3412 msglen = size(msg_d) 3413 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3414 CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid ) 3415 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3416#else 3417 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 3418 msglen = size(msg_h) 3419 CALL bcast_real( msg_h, 2 * msglen, source, gid ) 3420 msg_d = msg_h ; DEALLOCATE(msg_h) 3421#endif 3422#endif 3423 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3424 END SUBROUTINE mp_bcast_cm_gpu 3425! 3426!------------------------------------------------------------------------------! 3427! 3428 SUBROUTINE mp_bcast_ct_gpu(msg_d,source,gid) 3429 IMPLICIT NONE 3430 COMPLEX (DP), DEVICE :: msg_d(:,:,:) 3431 COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:) 3432 INTEGER, INTENT(IN) :: source 3433 INTEGER, INTENT(IN) :: gid 3434 INTEGER :: msglen, ierr 3435#if defined(__MPI) 3436#if defined(__GPU_MPI) 3437 msglen = size(msg_d) 3438 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3439 CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid ) 3440 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3441#else 3442 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 3443 msglen = size(msg_h) 3444 CALL bcast_real( msg_h, 2 * msglen, source, gid ) 3445 msg_d = msg_h ; DEALLOCATE(msg_h) 3446#endif 3447#endif 3448 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3449 END SUBROUTINE mp_bcast_ct_gpu 3450! 3451!------------------------------------------------------------------------------! 3452! 3453 SUBROUTINE mp_bcast_c4d_gpu(msg_d,source,gid) 3454 IMPLICIT NONE 3455 COMPLEX (DP), DEVICE :: msg_d(:,:,:,:) 3456 COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:) 3457 INTEGER, INTENT(IN) :: source 3458 INTEGER, INTENT(IN) :: gid 3459 INTEGER :: msglen, ierr 3460#if defined(__MPI) 3461#if defined(__GPU_MPI) 3462 msglen = size(msg_d) 3463 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3464 CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid ) 3465 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3466#else 3467 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 3468 msglen = size(msg_h) 3469 CALL bcast_real( msg_h, 2 * msglen, source, gid ) 3470 msg_d = msg_h ; DEALLOCATE(msg_h) 3471#endif 3472#endif 3473 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3474 END SUBROUTINE mp_bcast_c4d_gpu 3475! 3476!------------------------------------------------------------------------------! 3477! 3478 SUBROUTINE mp_bcast_c5d_gpu(msg_d,source,gid) 3479 IMPLICIT NONE 3480 COMPLEX (DP), DEVICE :: msg_d(:,:,:,:,:) 3481 COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:,:) 3482 INTEGER, INTENT(IN) :: source 3483 INTEGER, INTENT(IN) :: gid 3484 INTEGER :: msglen, ierr 3485#if defined(__MPI) 3486#if defined(__GPU_MPI) 3487 msglen = size(msg_d) 3488 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3489 CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid ) 3490 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3491#else 3492 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 3493 msglen = size(msg_h) 3494 CALL bcast_real( msg_h, 2 * msglen, source, gid ) 3495 msg_d = msg_h ; DEALLOCATE(msg_h) 3496#endif 3497#endif 3498 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3499 END SUBROUTINE mp_bcast_c5d_gpu 3500! 3501!------------------------------------------------------------------------------! 3502! 3503 SUBROUTINE mp_bcast_c6d_gpu(msg_d,source,gid) 3504 IMPLICIT NONE 3505 COMPLEX (DP), DEVICE :: msg_d(:,:,:,:,:,:) 3506 COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:,:,:) 3507 INTEGER, INTENT(IN) :: source 3508 INTEGER, INTENT(IN) :: gid 3509 INTEGER :: msglen, ierr 3510#if defined(__MPI) 3511#if defined(__GPU_MPI) 3512 msglen = size(msg_d) 3513 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3514 CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid ) 3515 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3516#else 3517 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 3518 msglen = size(msg_h) 3519 CALL bcast_real( msg_h, 2 * msglen, source, gid ) 3520 msg_d = msg_h ; DEALLOCATE(msg_h) 3521#endif 3522#endif 3523 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3524 END SUBROUTINE mp_bcast_c6d_gpu 3525! 3526!------------------------------------------------------------------------------! 3527! 3528 SUBROUTINE mp_bcast_l_gpu(msg_d,source,gid) 3529 IMPLICIT NONE 3530 LOGICAL, DEVICE :: msg_d 3531 LOGICAL :: msg_h 3532 INTEGER, INTENT(IN) :: source 3533 INTEGER, INTENT(IN) :: gid 3534 INTEGER :: msglen, ierr 3535#if defined(__MPI) 3536 msglen = 1 3537#if defined(__GPU_MPI) 3538 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3539 CALL bcast_logical_gpu( msg_d, msglen, source, gid ) 3540 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3541#else 3542 msg_h = msg_d ! This syncs __MPI case 3543 CALL bcast_logical( msg_h, msglen, source, gid ) 3544 msg_d = msg_h 3545#endif 3546#endif 3547 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3548 END SUBROUTINE mp_bcast_l_gpu 3549! 3550!------------------------------------------------------------------------------! 3551! 3552 SUBROUTINE mp_bcast_lv_gpu(msg_d,source,gid) 3553 IMPLICIT NONE 3554 LOGICAL, DEVICE :: msg_d(:) 3555 INTEGER, INTENT(IN) :: source 3556 INTEGER, INTENT(IN) :: gid 3557 INTEGER :: msglen, ierr 3558#if defined(__MPI) 3559#if defined(__GPU_MPI) 3560 msglen = size(msg_d) 3561 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3562 CALL bcast_logical_gpu( msg_d, msglen, source, gid ) 3563 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3564#else 3565 LOGICAL, ALLOCATABLE :: msg_h(:) 3566 ALLOCATE(msg_h, source=msg_d) ! This syncs __MPI case 3567 msglen = size(msg_h) 3568 CALL bcast_logical( msg_h, msglen, source, gid ) 3569 msg_d = msg_h; DEALLOCATE(msg_h) 3570#endif 3571#endif 3572 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3573 END SUBROUTINE mp_bcast_lv_gpu 3574! 3575!------------------------------------------------------------------------------! 3576! 3577 SUBROUTINE mp_bcast_lm_gpu(msg_d,source,gid) 3578 IMPLICIT NONE 3579 LOGICAL, DEVICE :: msg_d(:,:) 3580 INTEGER, INTENT(IN) :: source 3581 INTEGER, INTENT(IN) :: gid 3582 INTEGER :: msglen, ierr 3583#if defined(__MPI) 3584#if defined(__GPU_MPI) 3585 msglen = size(msg_d) 3586 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI case 3587 CALL bcast_logical_gpu( msg_d, msglen, source, gid ) 3588 RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu) 3589#else 3590 LOGICAL, ALLOCATABLE :: msg_h(:,:) 3591 ALLOCATE(msg_h, source=msg_d) ! This syncs __MPI case 3592 msglen = size(msg_h) 3593 CALL bcast_logical( msg_h, msglen, source, gid ) 3594 msg_d = msg_h; DEALLOCATE(msg_h) 3595#endif 3596#endif 3597 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 3598 END SUBROUTINE mp_bcast_lm_gpu 3599! 3600!------------------------------------------------------------------------------! 3601! 3602 SUBROUTINE mp_get_i1_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid) 3603 INTEGER, DEVICE :: msg_dest_d 3604 INTEGER, INTENT(IN), DEVICE :: msg_sour_d 3605 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 3606 INTEGER, INTENT(IN) :: gid 3607 INTEGER :: group, ierr 3608#if defined(__MPI) 3609 INTEGER :: istatus(MPI_STATUS_SIZE) 3610#endif 3611 INTEGER :: nrcv 3612 INTEGER :: msglen = 1 3613 3614#if ! defined(__GPU_MPI) 3615 ! Call CPU implementation 3616 INTEGER :: msg_dest_h, msg_sour_h 3617 ! 3618 msg_dest_h = msg_dest_d; msg_sour_h = msg_sour_d ! This syncs __MPI case 3619 CALL mp_get_i1(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid) 3620 msg_dest_d = msg_dest_h 3621#else 3622 3623#if defined(__MPI) 3624 group = gid 3625#endif 3626 3627 ! processors not taking part in the communication have 0 length message 3628 3629 msglen = 0 3630 ! 3631 ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI 3632 ! 3633 IF(dest .NE. sour) THEN 3634#if defined(__MPI) 3635 IF(mpime .EQ. sour) THEN 3636 msglen=1 3637 CALL MPI_SEND( msg_sour_d, msglen, MPI_INTEGER, dest, ip, group, ierr) 3638 IF (ierr/=0) CALL mp_stop( -8001 ) 3639 ELSE IF(mpime .EQ. dest) THEN 3640 msglen=1 3641 CALL MPI_RECV( msg_dest_d, msglen, MPI_INTEGER, sour, ip, group, istatus, IERR ) 3642 IF (ierr/=0) CALL mp_stop( -8002 ) 3643 CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr) 3644 IF (ierr/=0) CALL mp_stop( -8003 ) 3645 END IF 3646#endif 3647 ELSEIF(mpime .EQ. sour)THEN 3648 msg_dest_d = msg_sour_d 3649 msglen = 1 3650 END IF 3651 3652#if defined(__MPI) 3653 CALL MPI_BARRIER(group, IERR) 3654 IF (ierr/=0) CALL mp_stop( -8004 ) 3655#endif 3656 3657#endif 3658 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI 3659 RETURN 3660 END SUBROUTINE mp_get_i1_gpu 3661! 3662!------------------------------------------------------------------------------! 3663! 3664 SUBROUTINE mp_get_iv_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid) 3665 INTEGER, DEVICE :: msg_dest_d(:) 3666 INTEGER, INTENT(IN), DEVICE :: msg_sour_d(:) 3667 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 3668 INTEGER, INTENT(IN) :: gid 3669 INTEGER :: group 3670#if defined(__MPI) 3671 INTEGER :: istatus(MPI_STATUS_SIZE) 3672#endif 3673 INTEGER :: ierr, nrcv 3674 INTEGER :: msglen 3675 ! 3676#if ! defined(__GPU_MPI) 3677 INTEGER, ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:) 3678 ! 3679 ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case 3680 CALL mp_get_iv(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid) 3681 msg_dest_d = msg_dest_h 3682 DEALLOCATE(msg_dest_h, msg_sour_h) 3683#else 3684 3685#if defined(__MPI) 3686 group = gid 3687#endif 3688 3689 ! processors not taking part in the communication have 0 length message 3690 3691 msglen = 0 3692 ! 3693 ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI 3694 ! 3695 IF(sour .NE. dest) THEN 3696#if defined(__MPI) 3697 IF(mpime .EQ. sour) THEN 3698 msglen = SIZE(msg_sour_d) 3699 CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_INTEGER, dest, ip, group, ierr) 3700 IF (ierr/=0) CALL mp_stop( 9001 ) 3701 ELSE IF(mpime .EQ. dest) THEN 3702 CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_INTEGER, sour, ip, group, istatus, IERR ) 3703 IF (ierr/=0) CALL mp_stop( 9002 ) 3704 CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr) 3705 IF (ierr/=0) CALL mp_stop( 9003 ) 3706 msglen = nrcv 3707 END IF 3708#endif 3709 ELSEIF(mpime .EQ. sour)THEN 3710 !msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:) 3711 ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice ) 3712 msglen = SIZE(msg_sour_d) 3713 END IF 3714#if defined(__MPI) 3715 CALL MPI_BARRIER(group, IERR) 3716 IF (ierr/=0) CALL mp_stop( 9004 ) 3717#endif 3718#endif 3719 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI 3720 RETURN 3721 END SUBROUTINE mp_get_iv_gpu 3722! 3723!------------------------------------------------------------------------------! 3724! 3725 SUBROUTINE mp_get_r1_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid) 3726 REAL (DP), DEVICE :: msg_dest_d 3727 REAL (DP), INTENT(IN), DEVICE :: msg_sour_d 3728 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 3729 INTEGER, INTENT(IN) :: gid 3730 INTEGER :: group 3731#if defined(__MPI) 3732 INTEGER :: istatus(MPI_STATUS_SIZE) 3733#endif 3734 INTEGER :: ierr, nrcv 3735 INTEGER :: msglen 3736#if ! defined(__GPU_MPI) 3737 REAL(DP) :: msg_dest_h, msg_sour_h 3738 ! 3739 msg_dest_h=msg_dest_d; msg_sour_h=msg_sour_d ! This syncs __MPI case 3740 CALL mp_get_r1(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid) 3741 msg_dest_d = msg_dest_h 3742#else 3743#if defined(__MPI) 3744 group = gid 3745#endif 3746 3747 ! processors not taking part in the communication have 0 length message 3748 3749 msglen = 0 3750 ! 3751 ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI 3752 ! 3753 IF(sour .NE. dest) THEN 3754#if defined(__MPI) 3755 IF(mpime .EQ. sour) THEN 3756 msglen = 1 3757 CALL MPI_SEND( msg_sour_d, msglen, MPI_DOUBLE_PRECISION, dest, ip, group, ierr) 3758 IF (ierr/=0) CALL mp_stop( 9005 ) 3759 ELSE IF(mpime .EQ. dest) THEN 3760 msglen = 1 3761 CALL MPI_RECV( msg_dest_d, msglen, MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR ) 3762 IF (ierr/=0) CALL mp_stop( 9006 ) 3763 CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr) 3764 IF (ierr/=0) CALL mp_stop( 9007 ) 3765 msglen = nrcv 3766 END IF 3767#endif 3768 ELSEIF(mpime .EQ. sour)THEN 3769 msg_dest_d = msg_sour_d 3770 msglen = 1 3771 END IF 3772#if defined(__MPI) 3773 CALL MPI_BARRIER(group, IERR) 3774 IF (ierr/=0) CALL mp_stop( 9008 ) 3775#endif 3776#endif 3777 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI 3778 RETURN 3779 END SUBROUTINE mp_get_r1_gpu 3780! 3781!------------------------------------------------------------------------------! 3782! 3783 SUBROUTINE mp_get_rv_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid) 3784 REAL (DP), DEVICE :: msg_dest_d(:) 3785 REAL (DP), INTENT(IN), DEVICE :: msg_sour_d(:) 3786 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 3787 INTEGER, INTENT(IN) :: gid 3788 INTEGER :: group 3789#if defined(__MPI) 3790 INTEGER :: istatus(MPI_STATUS_SIZE) 3791#endif 3792 INTEGER :: ierr, nrcv 3793 INTEGER :: msglen 3794 ! 3795#if ! defined(__GPU_MPI) 3796 REAL (DP), ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:) 3797 ! 3798 ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case 3799 CALL mp_get_rv(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid) 3800 msg_dest_d = msg_dest_h 3801 DEALLOCATE(msg_dest_h, msg_sour_h) 3802#else 3803 ! 3804#if defined(__MPI) 3805 group = gid 3806#endif 3807 3808 ! processors not taking part in the communication have 0 length message 3809 3810 msglen = 0 3811 ! 3812 ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI 3813 ! 3814 IF(sour .NE. dest) THEN 3815#if defined(__MPI) 3816 IF(mpime .EQ. sour) THEN 3817 msglen = SIZE(msg_sour_d) 3818 CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_PRECISION, dest, ip, group, ierr) 3819 IF (ierr/=0) CALL mp_stop( 9009 ) 3820 ELSE IF(mpime .EQ. dest) THEN 3821 CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR ) 3822 IF (ierr/=0) CALL mp_stop( 9010 ) 3823 CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr) 3824 IF (ierr/=0) CALL mp_stop( 9011 ) 3825 msglen = nrcv 3826 END IF 3827#endif 3828 ELSEIF(mpime .EQ. sour)THEN 3829 !msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:) 3830 ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice ) 3831 msglen = SIZE(msg_sour_d) 3832 END IF 3833#if defined(__MPI) 3834 CALL MPI_BARRIER(group, IERR) 3835 IF (ierr/=0) CALL mp_stop( 9012 ) 3836#endif 3837#endif 3838 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI 3839 RETURN 3840 END SUBROUTINE mp_get_rv_gpu 3841! 3842!------------------------------------------------------------------------------! 3843! 3844 SUBROUTINE mp_get_rm_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid) 3845 REAL (DP), DEVICE :: msg_dest_d(:,:) 3846 REAL (DP), INTENT(IN), DEVICE :: msg_sour_d(:,:) 3847 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 3848 INTEGER, INTENT(IN) :: gid 3849 INTEGER :: group 3850#if defined(__MPI) 3851 INTEGER :: istatus(MPI_STATUS_SIZE) 3852#endif 3853 INTEGER :: ierr, nrcv 3854 INTEGER :: msglen 3855 ! 3856#if ! defined(__GPU_MPI) 3857 REAL (DP), ALLOCATABLE :: msg_dest_h(:,:), msg_sour_h(:,:) 3858 ! 3859 ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case 3860 CALL mp_get_rm(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid) 3861 msg_dest_d = msg_dest_h 3862 DEALLOCATE(msg_dest_h, msg_sour_h) 3863#else 3864 3865#if defined(__MPI) 3866 group = gid 3867#endif 3868 3869 ! processors not taking part in the communication have 0 length message 3870 3871 msglen = 0 3872 ! 3873 ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI 3874 ! 3875 IF(sour .NE. dest) THEN 3876#if defined(__MPI) 3877 IF(mpime .EQ. sour) THEN 3878 CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_PRECISION, dest, ip, group, ierr) 3879 IF (ierr/=0) CALL mp_stop( 9013 ) 3880 msglen = SIZE(msg_sour_d) 3881 ELSE IF(mpime .EQ. dest) THEN 3882 CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR ) 3883 IF (ierr/=0) CALL mp_stop( 9014 ) 3884 CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr) 3885 IF (ierr/=0) CALL mp_stop( 9015 ) 3886 msglen = nrcv 3887 END IF 3888#endif 3889 ELSEIF(mpime .EQ. sour)THEN 3890 !msg_dest_d(1:SIZE(msg_sour_d,1), 1:SIZE(msg_sour_d,2)) = msg_sour_d(:,:) 3891 ! function cudaMemcpy2D(dst, dpitch, src, spitch, width, height, kdir) 3892 ierr = cudaMemcpy2D(msg_dest_d, SIZE(msg_dest_d,1),& 3893 msg_sour_d, SIZE(msg_sour_d,1),& 3894 SIZE(msg_sour_d,1), SIZE(msg_sour_d,2), & 3895 cudaMemcpyDeviceToDevice ) 3896 msglen = SIZE( msg_sour_d ) 3897 END IF 3898#if defined(__MPI) 3899 CALL MPI_BARRIER(group, IERR) 3900 IF (ierr/=0) CALL mp_stop( 9016 ) 3901#endif 3902#endif 3903 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI 3904 RETURN 3905 END SUBROUTINE mp_get_rm_gpu 3906! 3907!------------------------------------------------------------------------------! 3908! 3909 SUBROUTINE mp_get_cv_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid) 3910 COMPLEX (DP), DEVICE :: msg_dest_d(:) 3911 COMPLEX (DP), INTENT(IN), DEVICE :: msg_sour_d(:) 3912 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 3913 INTEGER, INTENT(IN) :: gid 3914 INTEGER :: group 3915#if defined(__MPI) 3916 INTEGER :: istatus(MPI_STATUS_SIZE) 3917#endif 3918 INTEGER :: ierr, nrcv 3919 INTEGER :: msglen 3920 ! 3921#if ! defined(__GPU_MPI) 3922 COMPLEX (DP), ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:) 3923 ! 3924 ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case 3925 CALL mp_get_cv(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid) 3926 msg_dest_d = msg_dest_h; 3927 DEALLOCATE(msg_dest_h, msg_sour_h) 3928#else 3929 ! 3930#if defined(__MPI) 3931 group = gid 3932#endif 3933 3934 ! processors not taking part in the communication have 0 length message 3935 3936 msglen = 0 3937 ! 3938 ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI 3939 ! 3940 IF( dest .NE. sour ) THEN 3941#if defined(__MPI) 3942 IF(mpime .EQ. sour) THEN 3943 CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr) 3944 IF (ierr/=0) CALL mp_stop( 9017 ) 3945 msglen = SIZE(msg_sour_d) 3946 ELSE IF(mpime .EQ. dest) THEN 3947 CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR ) 3948 IF (ierr/=0) CALL mp_stop( 9018 ) 3949 CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr) 3950 IF (ierr/=0) CALL mp_stop( 9019 ) 3951 msglen = nrcv 3952 END IF 3953#endif 3954 ELSEIF(mpime .EQ. sour)THEN 3955 !msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:) 3956 ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice ) 3957 msglen = SIZE(msg_sour_d) 3958 END IF 3959#if defined(__MPI) 3960 CALL MPI_BARRIER(group, IERR) 3961 IF (ierr/=0) CALL mp_stop( 9020 ) 3962#endif 3963#endif 3964 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI 3965 RETURN 3966 END SUBROUTINE mp_get_cv_gpu 3967! 3968!------------------------------------------------------------------------------! 3969! 3970 SUBROUTINE mp_get_cm_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid) 3971 COMPLEX (DP), INTENT(IN), DEVICE :: msg_sour_d(:,:) 3972 COMPLEX (DP), DEVICE :: msg_dest_d(:,:) 3973 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 3974 INTEGER, INTENT(IN) :: gid 3975 INTEGER :: group 3976#if defined(__MPI) 3977 INTEGER :: istatus(MPI_STATUS_SIZE) 3978#endif 3979 INTEGER :: ierr, nrcv 3980 INTEGER :: msglen 3981 ! 3982#if ! defined(__GPU_MPI) 3983 COMPLEX (DP), ALLOCATABLE :: msg_dest_h(:,:), msg_sour_h(:,:) 3984 ! 3985 ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case 3986 CALL mp_get_cm(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid) 3987 msg_dest_d = msg_dest_h; 3988 DEALLOCATE(msg_dest_h, msg_sour_h) 3989#else 3990 ! 3991#if defined(__MPI) 3992 group = gid 3993#endif 3994 3995 ! processors not taking part in the communication have 0 length message 3996 3997 msglen = 0 3998 ! 3999 ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI 4000 ! 4001 IF(sour .NE. dest) THEN 4002#if defined(__MPI) 4003 IF(mpime .EQ. sour) THEN 4004 CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr) 4005 IF (ierr/=0) CALL mp_stop( 9021 ) 4006 msglen = SIZE(msg_sour_d) 4007 ELSE IF(mpime .EQ. dest) THEN 4008 CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR ) 4009 IF (ierr/=0) CALL mp_stop( 9022 ) 4010 CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr) 4011 IF (ierr/=0) CALL mp_stop( 9023 ) 4012 msglen = nrcv 4013 END IF 4014#endif 4015 ELSEIF(mpime .EQ. sour)THEN 4016 !msg_dest_d(1:SIZE(msg_sour_d,1), 1:SIZE(msg_sour_d,2)) = msg_sour_d(:,:) 4017 ierr = cudaMemcpy2D(msg_dest_d, SIZE(msg_dest_d,1),& 4018 msg_sour_d, SIZE(msg_sour_d,1),& 4019 SIZE(msg_sour_d,1), SIZE(msg_sour_d,2), & 4020 cudaMemcpyDeviceToDevice ) 4021 msglen = SIZE( msg_sour_d ) 4022 END IF 4023#if defined(__MPI) 4024 CALL MPI_BARRIER(group, IERR) 4025 IF (ierr/=0) CALL mp_stop( 9024 ) 4026#endif 4027#endif 4028 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI 4029 RETURN 4030 END SUBROUTINE mp_get_cm_gpu 4031! 4032!------------------------------------------------------------------------------! 4033! 4034 SUBROUTINE mp_put_i1_gpu(msg_dest_d, msg_sour_d, mpime, sour, dest, ip, gid) 4035 INTEGER, DEVICE :: msg_dest_d 4036 INTEGER, INTENT(IN), DEVICE :: msg_sour_d 4037 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 4038 INTEGER, INTENT(IN) :: gid 4039 INTEGER :: group 4040#if defined(__MPI) 4041 INTEGER :: istatus(MPI_STATUS_SIZE) 4042#endif 4043 INTEGER :: ierr, nrcv 4044 INTEGER :: msglen 4045 ! 4046#if ! defined(__GPU_MPI) 4047 INTEGER :: msg_dest_h, msg_sour_h 4048 ! 4049 msg_dest_h=msg_dest_d ; msg_sour_h=msg_sour_d ! This syncs __MPI case 4050 CALL mp_put_i1(msg_dest_h, msg_sour_h, mpime, sour, dest, ip, gid) 4051 msg_dest_d = msg_dest_h 4052#else 4053 4054#if defined(__MPI) 4055 group = gid 4056#endif 4057 4058 ! processors not taking part in the communication have 0 length message 4059 4060 msglen = 0 4061 ! 4062 ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI 4063 ! 4064 IF(dest .NE. sour) THEN 4065#if defined(__MPI) 4066 IF(mpime .EQ. sour) THEN 4067 CALL MPI_SEND( msg_sour_d, 1, MPI_INTEGER, dest, ip, group, ierr) 4068 IF (ierr/=0) CALL mp_stop( 9025 ) 4069 msglen = 1 4070 ELSE IF(mpime .EQ. dest) THEN 4071 CALL MPI_RECV( msg_dest_d, 1, MPI_INTEGER, sour, ip, group, istatus, IERR ) 4072 IF (ierr/=0) CALL mp_stop( 9026 ) 4073 CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr) 4074 IF (ierr/=0) CALL mp_stop( 9027 ) 4075 msglen = 1 4076 END IF 4077#endif 4078 ELSEIF(mpime .EQ. sour)THEN 4079 msg_dest_d = msg_sour_d 4080 msglen = 1 4081 END IF 4082#if defined(__MPI) 4083 CALL MPI_BARRIER(group, IERR) 4084 IF (ierr/=0) CALL mp_stop( 9028 ) 4085#endif 4086#endif 4087 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI 4088 RETURN 4089 END SUBROUTINE mp_put_i1_gpu 4090! 4091!------------------------------------------------------------------------------! 4092! 4093 SUBROUTINE mp_put_iv_gpu(msg_dest_d, msg_sour_d, mpime, sour, dest, ip, gid) 4094 INTEGER, DEVICE :: msg_dest_d(:) 4095 INTEGER, INTENT(IN), DEVICE :: msg_sour_d(:) 4096 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 4097 INTEGER, INTENT(IN) :: gid 4098 INTEGER :: group 4099#if defined(__MPI) 4100 INTEGER :: istatus(MPI_STATUS_SIZE) 4101#endif 4102 INTEGER :: ierr, nrcv 4103 INTEGER :: msglen 4104 ! 4105#if ! defined(__GPU_MPI) 4106 INTEGER, ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:) 4107 ! 4108 ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case 4109 CALL mp_put_iv(msg_dest_h, msg_sour_h, mpime, sour, dest, ip, gid) 4110 msg_dest_d = msg_dest_h 4111 DEALLOCATE(msg_dest_h, msg_sour_h) 4112#else 4113 ! 4114#if defined(__MPI) 4115 group = gid 4116#endif 4117 ! processors not taking part in the communication have 0 length message 4118 4119 msglen = 0 4120 ! 4121 ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI 4122 ! 4123 IF(sour .NE. dest) THEN 4124#if defined(__MPI) 4125 IF(mpime .EQ. sour) THEN 4126 CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_INTEGER, dest, ip, group, ierr) 4127 IF (ierr/=0) CALL mp_stop( 9029 ) 4128 msglen = SIZE(msg_sour_d) 4129 ELSE IF(mpime .EQ. dest) THEN 4130 CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_INTEGER, sour, ip, group, istatus, IERR ) 4131 IF (ierr/=0) CALL mp_stop( 9030 ) 4132 CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr) 4133 IF (ierr/=0) CALL mp_stop( 9031 ) 4134 msglen = nrcv 4135 END IF 4136#endif 4137 ELSEIF(mpime .EQ. sour)THEN 4138 !msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:) 4139 ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice ) 4140 msglen = SIZE(msg_sour_d) 4141 END IF 4142#if defined(__MPI) 4143 CALL MPI_BARRIER(group, IERR) 4144 IF (ierr/=0) CALL mp_stop( 9032 ) 4145#endif 4146#endif 4147 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI 4148 RETURN 4149 END SUBROUTINE mp_put_iv_gpu 4150! 4151!------------------------------------------------------------------------------! 4152! 4153 SUBROUTINE mp_put_rv_gpu(msg_dest_d, msg_sour_d, mpime, sour, dest, ip, gid) 4154 REAL (DP), DEVICE :: msg_dest_d(:) 4155 REAL (DP), INTENT(IN), DEVICE :: msg_sour_d(:) 4156 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 4157 INTEGER, INTENT(IN) :: gid 4158 INTEGER :: group 4159#if defined(__MPI) 4160 INTEGER :: istatus(MPI_STATUS_SIZE) 4161#endif 4162 INTEGER :: ierr, nrcv 4163 INTEGER :: msglen 4164 ! 4165#if ! defined(__GPU_MPI) 4166 REAL (DP), ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:) 4167 ! 4168 ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case 4169 CALL mp_put_rv(msg_dest_h, msg_sour_h, mpime, sour, dest, ip, gid) 4170 msg_dest_d = msg_dest_h 4171 DEALLOCATE(msg_dest_h, msg_sour_h) 4172#else 4173 ! 4174#if defined(__MPI) 4175 group = gid 4176#endif 4177 ! processors not taking part in the communication have 0 length message 4178 4179 msglen = 0 4180 ! 4181 ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI 4182 ! 4183 IF(sour .NE. dest) THEN 4184#if defined(__MPI) 4185 IF(mpime .EQ. sour) THEN 4186 CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_PRECISION, dest, ip, group, ierr) 4187 IF (ierr/=0) CALL mp_stop( 9033 ) 4188 msglen = SIZE(msg_sour_d) 4189 ELSE IF(mpime .EQ. dest) THEN 4190 CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR ) 4191 IF (ierr/=0) CALL mp_stop( 9034 ) 4192 CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr) 4193 IF (ierr/=0) CALL mp_stop( 9035 ) 4194 msglen = nrcv 4195 END IF 4196#endif 4197 ELSEIF(mpime .EQ. sour)THEN 4198 !msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:) 4199 ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice ) 4200 msglen = SIZE(msg_sour_d) 4201 END IF 4202#if defined(__MPI) 4203 CALL MPI_BARRIER(group, IERR) 4204 IF (ierr/=0) CALL mp_stop( 9036 ) 4205#endif 4206#endif 4207 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI 4208 RETURN 4209 END SUBROUTINE mp_put_rv_gpu 4210! 4211!------------------------------------------------------------------------------! 4212! 4213 SUBROUTINE mp_put_rm_gpu(msg_dest_d, msg_sour_d, mpime, sour, dest, ip, gid) 4214 REAL (DP), DEVICE :: msg_dest_d(:,:) 4215 REAL (DP), INTENT(IN), DEVICE :: msg_sour_d(:,:) 4216 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 4217 INTEGER, INTENT(IN) :: gid 4218 INTEGER :: group 4219#if defined(__MPI) 4220 INTEGER :: istatus(MPI_STATUS_SIZE) 4221#endif 4222 INTEGER :: ierr, nrcv 4223 INTEGER :: msglen 4224 ! 4225#if ! defined(__GPU_MPI) 4226 REAL (DP), ALLOCATABLE :: msg_dest_h(:,:), msg_sour_h(:,:) 4227 ! 4228 ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case 4229 CALL mp_put_rm(msg_dest_h, msg_sour_h, mpime, sour, dest, ip, gid) 4230 msg_dest_d = msg_dest_h 4231 DEALLOCATE(msg_dest_h, msg_sour_h) 4232#else 4233 ! 4234#if defined(__MPI) 4235 group = gid 4236#endif 4237 ! processors not taking part in the communication have 0 length message 4238 4239 msglen = 0 4240 ! 4241 ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI 4242 ! 4243 IF(sour .NE. dest) THEN 4244#if defined(__MPI) 4245 IF(mpime .EQ. sour) THEN 4246 CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_PRECISION, dest, ip, group, ierr) 4247 IF (ierr/=0) CALL mp_stop( 9037 ) 4248 msglen = SIZE(msg_sour_d) 4249 ELSE IF(mpime .EQ. dest) THEN 4250 CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR ) 4251 IF (ierr/=0) CALL mp_stop( 9038 ) 4252 CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr) 4253 IF (ierr/=0) CALL mp_stop( 9039 ) 4254 msglen = nrcv 4255 END IF 4256#endif 4257 ELSEIF(mpime .EQ. sour)THEN 4258 !msg_dest_d(1:SIZE(msg_sour_d,1),1:SIZE(msg_sour_d,2)) = msg_sour_d(:,:) 4259 ierr = cudaMemcpy2D(msg_dest_d, SIZE(msg_dest_d,1),& 4260 msg_sour_d, SIZE(msg_sour_d,1),& 4261 SIZE(msg_sour_d,1), SIZE(msg_sour_d,2), & 4262 cudaMemcpyDeviceToDevice ) 4263 msglen = SIZE(msg_sour_d) 4264 END IF 4265#if defined(__MPI) 4266 CALL MPI_BARRIER(group, IERR) 4267 IF (ierr/=0) CALL mp_stop( 9040 ) 4268#endif 4269#endif 4270 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI 4271 RETURN 4272 END SUBROUTINE mp_put_rm_gpu 4273! 4274!------------------------------------------------------------------------------! 4275! 4276 SUBROUTINE mp_put_cv_gpu(msg_dest_d, msg_sour_d, mpime, sour, dest, ip, gid) 4277 COMPLEX (DP), DEVICE :: msg_dest_d(:) 4278 COMPLEX (DP), INTENT(IN), DEVICE :: msg_sour_d(:) 4279 INTEGER, INTENT(IN) :: dest, sour, ip, mpime 4280 INTEGER, INTENT(IN) :: gid 4281 INTEGER :: group 4282#if defined(__MPI) 4283 INTEGER :: istatus(MPI_STATUS_SIZE) 4284#endif 4285 INTEGER :: ierr, nrcv 4286 INTEGER :: msglen 4287 ! 4288#if ! defined(__GPU_MPI) 4289 COMPLEX (DP), ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:) 4290 ! 4291 ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case 4292 CALL mp_put_cv(msg_dest_h, msg_sour_h, mpime, sour, dest, ip, gid) 4293 msg_dest_d = msg_dest_h 4294 DEALLOCATE(msg_dest_h, msg_sour_h) 4295#else 4296 ! 4297#if defined(__MPI) 4298 group = gid 4299#endif 4300 ! processors not taking part in the communication have 0 length message 4301 4302 msglen = 0 4303 ! 4304 ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI 4305 ! 4306 IF( dest .NE. sour ) THEN 4307#if defined(__MPI) 4308 IF(mpime .EQ. sour) THEN 4309 CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr) 4310 IF (ierr/=0) CALL mp_stop( 9041 ) 4311 msglen = SIZE(msg_sour_d) 4312 ELSE IF(mpime .EQ. dest) THEN 4313 CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR ) 4314 IF (ierr/=0) CALL mp_stop( 9042 ) 4315 CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr) 4316 IF (ierr/=0) CALL mp_stop( 9043 ) 4317 msglen = nrcv 4318 END IF 4319#endif 4320 ELSEIF(mpime .EQ. sour)THEN 4321 !msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:) 4322 ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice ) 4323 msglen = SIZE(msg_sour_d) 4324 END IF 4325#if defined(__MPI) 4326 CALL MPI_BARRIER(group, IERR) 4327 IF (ierr/=0) CALL mp_stop( 9044 ) 4328#endif 4329#endif 4330 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI 4331 RETURN 4332 END SUBROUTINE mp_put_cv_gpu 4333! 4334!------------------------------------------------------------------------------! 4335! 4336!..mp_sum 4337 SUBROUTINE mp_sum_i1_gpu(msg_d,gid) 4338 IMPLICIT NONE 4339 INTEGER, INTENT (INOUT), DEVICE :: msg_d 4340 INTEGER, msg_h 4341 INTEGER, INTENT(IN) :: gid 4342 INTEGER :: msglen, ierr 4343 ! Avoid unnecessary communications on __MPI and syncs SERIAL 4344 IF ( mp_size(gid) == 1 ) THEN 4345 ierr = cudaDeviceSynchronize() 4346 RETURN 4347 END IF 4348 ! 4349#if defined(__MPI) 4350 msglen = 1 4351#if defined(__GPU_MPI) 4352 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4353 CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 ) 4354 ! No need for final syncronization 4355#else 4356 ! 4357 msg_h = msg_d ! This syncs __MPI case 4358 CALL reduce_base_integer( msglen, msg_h, gid, -1 ) 4359 msg_d = msg_h 4360 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 4361#endif 4362#endif 4363 END SUBROUTINE mp_sum_i1_gpu 4364! 4365!------------------------------------------------------------------------------! 4366! 4367 SUBROUTINE mp_sum_iv_gpu(msg_d,gid) 4368 IMPLICIT NONE 4369 INTEGER, INTENT (INOUT), DEVICE :: msg_d(:) 4370 INTEGER, ALLOCATABLE :: msg_h(:) 4371 INTEGER, INTENT(IN) :: gid 4372 ! 4373 INTEGER :: msglen, ierr 4374 ! Avoid unnecessary communications on __MPI and syncs SERIAL 4375 IF ( mp_size(gid) == 1 ) THEN 4376 ierr = cudaDeviceSynchronize() 4377 RETURN 4378 END IF 4379 ! 4380#if defined(__MPI) 4381#if defined(__GPU_MPI) 4382 msglen = size(msg_d) 4383 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4384 CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 ) 4385 ! No need for final syncronization 4386#else 4387 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 4388 msglen = size(msg_h) 4389 CALL reduce_base_integer( msglen, msg_h, gid, -1 ) 4390 msg_d = msg_h; DEALLOCATE(msg_h) 4391 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 4392#endif 4393#endif 4394 END SUBROUTINE mp_sum_iv_gpu 4395! 4396!------------------------------------------------------------------------------! 4397! 4398 SUBROUTINE mp_sum_im_gpu(msg_d,gid) 4399 IMPLICIT NONE 4400 INTEGER, INTENT (INOUT), DEVICE :: msg_d(:,:) 4401 INTEGER, ALLOCATABLE :: msg_h(:,:) 4402 INTEGER, INTENT(IN) :: gid 4403 ! 4404 INTEGER :: msglen, ierr 4405 ! Avoid unnecessary communications on __MPI and syncs SERIAL 4406 IF ( mp_size(gid) == 1 ) THEN 4407 ierr = cudaDeviceSynchronize() 4408 RETURN 4409 END IF 4410 ! 4411#if defined(__MPI) 4412#if defined(__GPU_MPI) 4413 msglen = size(msg_d) 4414 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4415 CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 ) 4416 ! No need for final syncronization 4417#else 4418 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 4419 msglen = size(msg_h) 4420 CALL reduce_base_integer( msglen, msg_h, gid, -1 ) 4421 msg_d = msg_h; DEALLOCATE(msg_h) 4422 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 4423#endif 4424#endif 4425 END SUBROUTINE mp_sum_im_gpu 4426! 4427!------------------------------------------------------------------------------! 4428! 4429 SUBROUTINE mp_sum_it_gpu(msg_d,gid) 4430 IMPLICIT NONE 4431 INTEGER, INTENT (INOUT), DEVICE :: msg_d(:,:,:) 4432 INTEGER, ALLOCATABLE :: msg_h(:,:,:) 4433 INTEGER, INTENT (IN) :: gid 4434 ! 4435 INTEGER :: msglen, ierr 4436 ! Avoid unnecessary communications on __MPI and syncs SERIAL 4437 IF ( mp_size(gid) == 1 ) THEN 4438 ierr = cudaDeviceSynchronize() 4439 RETURN 4440 END IF 4441 ! 4442#if defined(__MPI) 4443#if defined(__GPU_MPI) 4444 msglen = size(msg_d) 4445 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4446 CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 ) 4447 ! No need for final syncronization 4448#else 4449 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 4450 msglen = size(msg_h) 4451 CALL reduce_base_integer( msglen, msg_h, gid, -1 ) 4452 msg_d = msg_h; DEALLOCATE(msg_h) 4453 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 4454#endif 4455#endif 4456 END SUBROUTINE mp_sum_it_gpu 4457! 4458!------------------------------------------------------------------------------! 4459! 4460 SUBROUTINE mp_sum_r1_gpu(msg_d,gid) 4461 IMPLICIT NONE 4462 REAL (DP), INTENT (INOUT), DEVICE :: msg_d 4463 REAL(DP) :: msg_h 4464 INTEGER, INTENT (IN) :: gid 4465 ! 4466 INTEGER :: msglen, ierr 4467 ! Avoid unnecessary communications on __MPI and syncs SERIAL 4468 IF ( mp_size(gid) == 1 ) THEN 4469 ierr = cudaDeviceSynchronize() 4470 RETURN 4471 END IF 4472 ! 4473#if defined(__MPI) 4474 msglen = 1 4475#if defined(__GPU_MPI) 4476 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4477 CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 ) 4478 ! No need for final syncronization 4479#else 4480 msg_h=msg_d ! This syncs __MPI case 4481 CALL reduce_base_real( msglen, msg_h, gid, -1 ) 4482 msg_d = msg_h 4483 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 4484#endif 4485#endif 4486 END SUBROUTINE mp_sum_r1_gpu 4487! 4488!------------------------------------------------------------------------------! 4489! 4490 SUBROUTINE mp_sum_rv_gpu(msg_d,gid) 4491 IMPLICIT NONE 4492 REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:) 4493 REAL(DP), ALLOCATABLE :: msg_h(:) 4494 INTEGER, INTENT (IN) :: gid 4495 ! 4496 INTEGER :: msglen, ierr 4497 ! Avoid unnecessary communications on __MPI and syncs SERIAL 4498 IF ( mp_size(gid) == 1 ) THEN 4499 ierr = cudaDeviceSynchronize() 4500 RETURN 4501 END IF 4502 ! 4503#if defined(__MPI) 4504#if defined(__GPU_MPI) 4505 msglen = size(msg_d) 4506 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4507 CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 ) 4508 ! No need for final syncronization 4509#else 4510 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 4511 msglen = size(msg_h) 4512 CALL reduce_base_real( msglen, msg_h, gid, -1 ) 4513 msg_d = msg_h; DEALLOCATE(msg_h) 4514 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 4515#endif 4516#endif 4517 END SUBROUTINE mp_sum_rv_gpu 4518! 4519!------------------------------------------------------------------------------! 4520! 4521 SUBROUTINE mp_sum_rm_gpu(msg_d, gid) 4522 IMPLICIT NONE 4523 REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:) 4524 REAL (DP), ALLOCATABLE :: msg_h(:,:) 4525 INTEGER, INTENT (IN) :: gid 4526 ! 4527 INTEGER :: msglen, ierr 4528 ! Avoid unnecessary communications on __MPI and syncs SERIAL 4529 IF ( mp_size(gid) == 1 ) THEN 4530 ierr = cudaDeviceSynchronize() 4531 RETURN 4532 END IF 4533 ! 4534#if defined(__MPI) 4535#if defined(__GPU_MPI) 4536 msglen = size(msg_d) 4537 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4538 CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 ) 4539 ! No need for final syncronization 4540#else 4541 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 4542 msglen = size(msg_h) 4543 CALL reduce_base_real( msglen, msg_h, gid, -1 ) 4544 msg_d = msg_h; DEALLOCATE(msg_h) 4545 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 4546#endif 4547#endif 4548 END SUBROUTINE mp_sum_rm_gpu 4549! 4550!------------------------------------------------------------------------------! 4551! 4552 SUBROUTINE mp_root_sum_rm_gpu( msg_d, res_d, root, gid ) 4553 IMPLICIT NONE 4554 REAL (DP), INTENT (IN) , DEVICE :: msg_d(:,:) 4555 REAL (DP), INTENT (OUT), DEVICE :: res_d(:,:) 4556 REAL (DP), ALLOCATABLE :: res_h(:,:), msg_h(:,:) 4557 INTEGER, INTENT (IN) :: root 4558 INTEGER, INTENT (IN) :: gid 4559 ! 4560 INTEGER :: msglen, ierr, taskid 4561#if defined(__MPI) 4562 ! 4563 CALL mpi_comm_rank( gid, taskid, ierr) 4564 IF( ierr /= 0 ) CALL mp_stop( 9045 ) 4565 ! 4566 msglen = size(msg_d) 4567 IF( taskid == root ) THEN 4568 IF( msglen > size(res_d) ) CALL mp_stop( 9046 ) 4569 END IF 4570#if defined(__GPU_MPI) 4571 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4572 CALL reduce_base_real_to_gpu( msglen, msg_d, res_d, gid, root ) 4573 RETURN ! Sync not needed in this case 4574#else 4575 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 4576 IF( taskid == root ) ALLOCATE( res_h(lbound(res_d,1):ubound(res_d,1), lbound(res_d,2):ubound(res_d,2))); 4577 CALL reduce_base_real_to( msglen, msg_h, res_h, gid, root ) 4578 IF( taskid == root ) res_d = res_h; 4579 IF( taskid == root ) DEALLOCATE(res_h) 4580 DEALLOCATE(msg_h) 4581#endif 4582 4583#else 4584 res_d = msg_d 4585#endif 4586 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 4587 END SUBROUTINE mp_root_sum_rm_gpu 4588! 4589!------------------------------------------------------------------------------! 4590! 4591 SUBROUTINE mp_root_sum_cm_gpu( msg_d, res_d, root, gid ) 4592 IMPLICIT NONE 4593 COMPLEX (DP), INTENT (IN) , DEVICE :: msg_d(:,:) 4594 COMPLEX (DP), INTENT (OUT), DEVICE :: res_d(:,:) 4595 COMPLEX (DP), ALLOCATABLE :: res_h(:,:), msg_h(:,:) 4596 INTEGER, INTENT (IN) :: root 4597 INTEGER, INTENT (IN) :: gid 4598 ! 4599 INTEGER :: msglen, ierr, taskid 4600#if defined(__MPI) 4601 msglen = size(msg_d) 4602 4603 CALL mpi_comm_rank( gid, taskid, ierr) 4604 IF( ierr /= 0 ) CALL mp_stop( 9047 ) 4605 4606 IF( taskid == root ) THEN 4607 IF( msglen > size(res_d) ) CALL mp_stop( 9048 ) 4608 END IF 4609#if defined(__GPU_MPI) 4610 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4611 CALL reduce_base_real_to_gpu( 2 * msglen, msg_d, res_d, gid, root ) 4612 RETURN ! Sync not needed in this case 4613#else 4614 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 4615 IF( taskid == root ) ALLOCATE( res_h(lbound(res_d,1):ubound(res_d,1), lbound(res_d,2):ubound(res_d,2))); 4616 CALL reduce_base_real_to( 2 * msglen, msg_h, res_h, gid, root ) 4617 IF( taskid == root ) res_d = res_h; 4618 IF( taskid == root ) DEALLOCATE(res_h) 4619 DEALLOCATE(msg_h) 4620#endif 4621#else 4622 res_d = msg_d 4623#endif 4624 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 4625 END SUBROUTINE mp_root_sum_cm_gpu 4626! 4627!------------------------------------------------------------------------------! 4628! 4629 SUBROUTINE mp_sum_rmm_gpu( msg_d, res_d, root, gid ) 4630 IMPLICIT NONE 4631 REAL (DP), INTENT (IN), DEVICE :: msg_d(:,:) 4632 REAL (DP), INTENT (OUT),DEVICE :: res_d(:,:) 4633 REAL (DP), ALLOCATABLE :: res_h(:,:), msg_h(:,:) 4634 INTEGER, INTENT (IN) :: root 4635 INTEGER, INTENT (IN) :: gid 4636 INTEGER :: group 4637 INTEGER :: msglen 4638 INTEGER :: taskid, ierr 4639 4640 4641 4642#if defined(__MPI) 4643 4644 msglen = size(msg_d) 4645 ! 4646 group = gid 4647 ! 4648 CALL mpi_comm_rank( group, taskid, ierr) 4649 IF( ierr /= 0 ) CALL mp_stop( 9049 ) 4650 4651 IF( taskid == root ) THEN 4652 IF( msglen > size(res_d) ) CALL mp_stop( 9050 ) 4653 END IF 4654 ! 4655#if defined(__GPU_MPI) 4656 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4657 CALL reduce_base_real_to_gpu( msglen, msg_d, res_d, group, root ) 4658 RETURN ! Sync not needed in this case 4659#else 4660 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 4661 IF( taskid == root ) ALLOCATE( res_h(lbound(res_d,1):ubound(res_d,1), lbound(res_d,2):ubound(res_d,2))); 4662 CALL reduce_base_real_to( msglen, msg_h, res_h, gid, root ) 4663 IF( taskid == root ) res_d = res_h; 4664 IF( taskid == root ) DEALLOCATE(res_h) 4665 DEALLOCATE(msg_h) 4666#endif 4667 ! 4668#else 4669 res_d = msg_d 4670#endif 4671 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 4672 END SUBROUTINE mp_sum_rmm_gpu 4673! 4674!------------------------------------------------------------------------------! 4675! 4676 SUBROUTINE mp_sum_rt_gpu( msg_d, gid ) 4677 IMPLICIT NONE 4678 REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:) 4679 REAL (DP), ALLOCATABLE :: msg_h(:,:,:) 4680 INTEGER, INTENT(IN) :: gid 4681 INTEGER :: msglen, ierr 4682 ! Avoid unnecessary communications on __MPI and syncs SERIAL 4683 IF ( mp_size(gid) == 1 ) THEN 4684 ierr = cudaDeviceSynchronize() 4685 RETURN 4686 END IF 4687 ! 4688#if defined(__MPI) 4689#if defined(__GPU_MPI) 4690 msglen = size(msg_d) 4691 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4692 CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 ) 4693 ! Sync not needed after MPI call 4694#else 4695 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 4696 msglen = size(msg_h) 4697 CALL reduce_base_real( msglen, msg_h, gid, -1 ) 4698 msg_d = msg_h; DEALLOCATE(msg_h) 4699 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 4700#endif 4701#endif 4702 END SUBROUTINE mp_sum_rt_gpu 4703! 4704!------------------------------------------------------------------------------! 4705! 4706 SUBROUTINE mp_sum_r4d_gpu(msg_d,gid) 4707 IMPLICIT NONE 4708 REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:) 4709 REAL (DP), ALLOCATABLE :: msg_h(:,:,:,:) 4710 INTEGER, INTENT(IN) :: gid 4711 ! 4712 INTEGER :: msglen, ierr 4713 ! Avoid unnecessary communications on __MPI and syncs SERIAL 4714 IF ( mp_size(gid) == 1 ) THEN 4715 ierr = cudaDeviceSynchronize() 4716 RETURN 4717 END IF 4718 ! 4719#if defined(__MPI) 4720#if defined(__GPU_MPI) 4721 msglen = size(msg_d) 4722 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4723 CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 ) 4724 ! Sync not needed after MPI call 4725#else 4726 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 4727 msglen = size(msg_h) 4728 CALL reduce_base_real( msglen, msg_h, gid, -1 ) 4729 msg_d = msg_h; DEALLOCATE(msg_h) 4730 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 4731#endif 4732#endif 4733 END SUBROUTINE mp_sum_r4d_gpu 4734! 4735!------------------------------------------------------------------------------! 4736! 4737 SUBROUTINE mp_sum_c1_gpu(msg_d,gid) 4738 IMPLICIT NONE 4739 COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d 4740 COMPLEX (DP) :: msg_h 4741 INTEGER, INTENT(IN) :: gid 4742 ! 4743 INTEGER :: msglen, ierr 4744 ! Avoid unnecessary communications on __MPI and syncs SERIAL 4745 IF ( mp_size(gid) == 1 ) THEN 4746 ierr = cudaDeviceSynchronize() 4747 RETURN 4748 END IF 4749 ! 4750#if defined(__MPI) 4751 msglen = 1 4752#if defined(__GPU_MPI) 4753 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4754 CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 ) 4755 ! Sync not needed after MPI call 4756#else 4757 msg_h=msg_d ! This syncs __MPI case 4758 CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 ) 4759 msg_d = msg_h 4760 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 4761#endif 4762#endif 4763 END SUBROUTINE mp_sum_c1_gpu 4764! 4765!------------------------------------------------------------------------------! 4766! 4767 SUBROUTINE mp_sum_cv_gpu(msg_d,gid) 4768 IMPLICIT NONE 4769 COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:) 4770 COMPLEX (DP), ALLOCATABLE :: msg_h(:) 4771 INTEGER, INTENT(IN) :: gid 4772 ! 4773 INTEGER :: msglen, ierr 4774 ! 4775 ! Avoid unnecessary communications on __MPI and syncs SERIAL 4776 IF ( mp_size(gid) == 1 ) THEN 4777 ierr = cudaDeviceSynchronize() 4778 RETURN 4779 END IF 4780 ! 4781#if defined(__MPI) 4782#if defined(__GPU_MPI) 4783 msglen = size(msg_d) 4784 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4785 CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 ) 4786 ! Sync not needed after MPI call 4787#else 4788 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 4789 msglen = size(msg_h) 4790 CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 ) 4791 msg_d = msg_h; DEALLOCATE(msg_h) 4792 ierr = cudaDeviceSynchronize() ! This syncs the device after small message copies 4793#endif 4794#endif 4795 END SUBROUTINE mp_sum_cv_gpu 4796! 4797!------------------------------------------------------------------------------! 4798! 4799 SUBROUTINE mp_sum_cm_gpu(msg_d, gid) 4800 IMPLICIT NONE 4801 COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:) 4802 COMPLEX (DP), ALLOCATABLE :: msg_h(:,:) 4803 INTEGER, INTENT (IN) :: gid 4804 INTEGER :: msglen, ierr 4805 ! Avoid unnecessary communications on __MPI and syncs SERIAL 4806 IF ( mp_size(gid) == 1 ) THEN 4807 ierr = cudaDeviceSynchronize() 4808 RETURN 4809 END IF 4810 ! 4811#if defined(__MPI) 4812#if defined(__GPU_MPI) 4813 msglen = size(msg_d) 4814 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4815 CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 ) 4816 ! Sync not needed after MPI call 4817#else 4818 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 4819 msglen = size(msg_h) 4820 CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 ) 4821 msg_d = msg_h; DEALLOCATE(msg_h) 4822 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 4823#endif 4824#endif 4825 END SUBROUTINE mp_sum_cm_gpu 4826! 4827!------------------------------------------------------------------------------! 4828! 4829 SUBROUTINE mp_sum_cmm_gpu(msg_d, res_d, gid) 4830 IMPLICIT NONE 4831 COMPLEX (DP), INTENT (IN), DEVICE :: msg_d(:,:) 4832 COMPLEX (DP), INTENT (OUT), DEVICE :: res_d(:,:) 4833 COMPLEX (DP), ALLOCATABLE :: msg_h(:,:), res_h(:,:) 4834 INTEGER, INTENT (IN) :: gid 4835 ! 4836 INTEGER :: msglen, ierr 4837#if defined(__MPI) 4838#if defined(__GPU_MPI) 4839 msglen = size(msg_d) 4840 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4841 CALL reduce_base_real_to_gpu( 2 * msglen, msg_d, res_h, gid, -1 ) 4842 RETURN ! Sync not needed after MPI call 4843#else 4844 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 4845 msglen = size(msg_h) 4846 ALLOCATE( res_h(lbound(msg_h,1):ubound(msg_h,1), lbound(msg_h,2):ubound(msg_h,2))); 4847 CALL reduce_base_real_to( 2 * msglen, msg_h, res_h, gid, -1 ) 4848 res_d = res_h; DEALLOCATE(msg_h, res_h) 4849#endif 4850#else 4851 res_d = msg_d 4852#endif 4853 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 4854 END SUBROUTINE mp_sum_cmm_gpu 4855! 4856!------------------------------------------------------------------------------! 4857! 4858 SUBROUTINE mp_sum_ct_gpu(msg_d,gid) 4859 IMPLICIT NONE 4860 COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:) 4861 COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:) 4862 INTEGER, INTENT(IN) :: gid 4863 ! 4864 INTEGER :: msglen, ierr 4865 ! Avoid unnecessary communications on __MPI and syncs SERIAL 4866 IF ( mp_size(gid) == 1 ) THEN 4867 ierr = cudaDeviceSynchronize() 4868 RETURN 4869 END IF 4870 ! 4871#if defined(__MPI) 4872#if defined(__GPU_MPI) 4873 msglen = SIZE(msg_d) 4874 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4875 CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 ) 4876 ! Sync not needed after MPI call 4877#else 4878 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 4879 msglen = size(msg_h) 4880 CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 ) 4881 msg_d = msg_h; DEALLOCATE(msg_h) 4882 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 4883#endif 4884#endif 4885 END SUBROUTINE mp_sum_ct_gpu 4886! 4887!------------------------------------------------------------------------------! 4888! 4889 SUBROUTINE mp_sum_c4d_gpu(msg_d,gid) 4890 IMPLICIT NONE 4891 COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:) 4892 COMPLEX (DP),ALLOCATABLE :: msg_h(:,:,:,:) 4893 INTEGER, INTENT(IN) :: gid 4894 ! 4895 INTEGER :: msglen, ierr 4896 ! Avoid unnecessary communications on __MPI and syncs SERIAL 4897 IF ( mp_size(gid) == 1 ) THEN 4898 ierr = cudaDeviceSynchronize() 4899 RETURN 4900 END IF 4901 ! 4902#if defined(__MPI) 4903#if defined(__GPU_MPI) 4904 msglen = size(msg_d) 4905 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4906 CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 ) 4907 ! Sync not needed after MPI call 4908#else 4909 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 4910 msglen = size(msg_h) 4911 CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 ) 4912 msg_d = msg_h; DEALLOCATE(msg_h) 4913 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 4914#endif 4915#endif 4916 END SUBROUTINE mp_sum_c4d_gpu 4917! 4918!------------------------------------------------------------------------------! 4919! 4920 SUBROUTINE mp_sum_c5d_gpu(msg_d,gid) 4921 IMPLICIT NONE 4922 COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:,:) 4923 COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:,:) 4924 INTEGER, INTENT(IN) :: gid 4925 ! 4926 INTEGER :: msglen, ierr 4927 ! Avoid unnecessary communications on __MPI and syncs SERIAL 4928 IF ( mp_size(gid) == 1 ) THEN 4929 ierr = cudaDeviceSynchronize() 4930 RETURN 4931 END IF 4932 ! 4933#if defined(__MPI) 4934#if defined(__GPU_MPI) 4935 msglen = size(msg_d) 4936 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4937 CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 ) 4938 ! Sync not needed after MPI call 4939#else 4940 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 4941 msglen = size(msg_h) 4942 CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 ) 4943 msg_d = msg_h; DEALLOCATE(msg_h) 4944 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 4945#endif 4946#endif 4947 END SUBROUTINE mp_sum_c5d_gpu 4948! 4949!------------------------------------------------------------------------------! 4950! 4951 SUBROUTINE mp_sum_r5d_gpu(msg_d,gid) 4952 IMPLICIT NONE 4953 REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:,:) 4954 REAL (DP), ALLOCATABLE :: msg_h(:,:,:,:,:) 4955 INTEGER, INTENT(IN) :: gid 4956 ! 4957 INTEGER :: msglen, ierr 4958 ! Avoid unnecessary communications on __MPI and syncs SERIAL 4959 IF ( mp_size(gid) == 1 ) THEN 4960 ierr = cudaDeviceSynchronize() 4961 RETURN 4962 END IF 4963 ! 4964#if defined(__MPI) 4965#if defined(__GPU_MPI) 4966 msglen = size(msg_d) 4967 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4968 CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 ) 4969 ! Sync not needed after MPI call 4970#else 4971 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 4972 msglen = size(msg_h) 4973 CALL reduce_base_real( msglen, msg_h, gid, -1 ) 4974 msg_d = msg_h; DEALLOCATE(msg_h) 4975 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 4976#endif 4977#endif 4978 END SUBROUTINE mp_sum_r5d_gpu 4979! 4980!------------------------------------------------------------------------------! 4981! 4982 SUBROUTINE mp_sum_r6d_gpu(msg_d,gid) 4983 IMPLICIT NONE 4984 REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:,:,:) 4985 REAL (DP), ALLOCATABLE :: msg_h(:,:,:,:,:,:) 4986 INTEGER, INTENT(IN) :: gid 4987 ! 4988 INTEGER :: msglen, ierr 4989 ! Avoid unnecessary communications on __MPI and syncs SERIAL 4990 IF ( mp_size(gid) == 1 ) THEN 4991 ierr = cudaDeviceSynchronize() 4992 RETURN 4993 END IF 4994 ! 4995#if defined(__MPI) 4996#if defined(__GPU_MPI) 4997 msglen = size(msg_d) 4998 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 4999 CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 ) 5000 ! Sync not needed after MPI call 5001#else 5002 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 5003 msglen = size(msg_h) 5004 CALL reduce_base_real( msglen, msg_h, gid, -1 ) 5005 msg_d = msg_h; DEALLOCATE(msg_h) 5006 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 5007#endif 5008#endif 5009 END SUBROUTINE mp_sum_r6d_gpu 5010! 5011!------------------------------------------------------------------------------! 5012! 5013 SUBROUTINE mp_sum_c6d_gpu(msg_d,gid) 5014 IMPLICIT NONE 5015 COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:,:,:) 5016 COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:,:,:) 5017 INTEGER, INTENT(IN) :: gid 5018 ! 5019 INTEGER :: msglen, ierr 5020 ! Avoid unnecessary communications on __MPI and syncs SERIAL 5021 IF ( mp_size(gid) == 1 ) THEN 5022 ierr = cudaDeviceSynchronize() 5023 RETURN 5024 END IF 5025 ! 5026#if defined(__MPI) 5027#if defined(__GPU_MPI) 5028 msglen = size(msg_d) 5029 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5030 CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 ) 5031 ! Sync not needed after MPI call 5032#else 5033 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 5034 msglen = size(msg_h) 5035 CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 ) 5036 msg_d = msg_h; DEALLOCATE(msg_h) 5037 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 5038#endif 5039#endif 5040 END SUBROUTINE mp_sum_c6d_gpu 5041! 5042!------------------------------------------------------------------------------! 5043! 5044 SUBROUTINE mp_max_i_gpu(msg_d,gid) 5045 IMPLICIT NONE 5046 INTEGER, INTENT (INOUT), DEVICE :: msg_d 5047 INTEGER :: msg_h 5048 INTEGER, INTENT(IN) :: gid 5049 ! 5050 INTEGER :: msglen, ierr 5051 ! Avoid unnecessary communications on __MPI and syncs SERIAL 5052 IF ( mp_size(gid) == 1 ) THEN 5053 ierr = cudaDeviceSynchronize() 5054 RETURN 5055 END IF 5056 ! 5057#if defined(__MPI) 5058 msglen = 1 5059#if defined(__GPU_MPI) 5060 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5061 CALL parallel_max_integer_gpu( msglen, msg_d, gid, -1 ) 5062 ! Sync not needed after MPI call 5063#else 5064 msg_h = msg_d ! This syncs __MPI case 5065 CALL parallel_max_integer( msglen, msg_h, gid, -1 ) 5066 msg_d = msg_h 5067 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 5068#endif 5069#endif 5070 END SUBROUTINE mp_max_i_gpu 5071! 5072!------------------------------------------------------------------------------! 5073! 5074 SUBROUTINE mp_max_iv_gpu(msg_d,gid) 5075 IMPLICIT NONE 5076 INTEGER, INTENT (INOUT), DEVICE :: msg_d(:) 5077 INTEGER, ALLOCATABLE :: msg_h(:) 5078 INTEGER, INTENT(IN) :: gid 5079 ! 5080 INTEGER :: msglen, ierr 5081 ! Avoid unnecessary communications on __MPI and syncs SERIAL 5082 IF ( mp_size(gid) == 1 ) THEN 5083 ierr = cudaDeviceSynchronize() 5084 RETURN 5085 END IF 5086 ! 5087#if defined(__MPI) 5088#if defined(__GPU_MPI) 5089 msglen = size(msg_d) 5090 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5091 CALL parallel_max_integer_gpu( msglen, msg_d, gid, -1 ) 5092 ! Sync not needed after MPI call 5093#else 5094 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 5095 msglen = size(msg_h) 5096 CALL parallel_max_integer( msglen, msg_h, gid, -1 ) 5097 msg_d = msg_h; DEALLOCATE(msg_h) 5098 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 5099#endif 5100#endif 5101 END SUBROUTINE mp_max_iv_gpu 5102! 5103!---------------------------------------------------------------------- 5104! 5105 SUBROUTINE mp_max_r_gpu(msg_d,gid) 5106 IMPLICIT NONE 5107 REAL (DP), INTENT (INOUT), DEVICE :: msg_d 5108 REAL (DP) :: msg_h 5109 INTEGER, INTENT(IN) :: gid 5110 ! 5111 INTEGER :: msglen, ierr 5112 ! Avoid unnecessary communications on __MPI and syncs SERIAL 5113 IF ( mp_size(gid) == 1 ) THEN 5114 ierr = cudaDeviceSynchronize() 5115 RETURN 5116 END IF 5117 ! 5118#if defined(__MPI) 5119 msglen = 1 5120#if defined(__GPU_MPI) 5121 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5122 CALL parallel_max_real_gpu( msglen, msg_d, gid, -1 ) 5123 ! Sync not needed after MPI call 5124#else 5125 msg_h = msg_d ! This syncs __MPI case 5126 CALL parallel_max_real( msglen, msg_h, gid, -1 ) 5127 msg_d = msg_h 5128 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 5129#endif 5130#endif 5131 END SUBROUTINE mp_max_r_gpu 5132! 5133!------------------------------------------------------------------------------! 5134! 5135 SUBROUTINE mp_max_rv_gpu(msg_d,gid) 5136 IMPLICIT NONE 5137 REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:) 5138 REAL (DP), ALLOCATABLE :: msg_h(:) 5139 INTEGER, INTENT(IN) :: gid 5140 ! 5141 INTEGER :: msglen, ierr 5142 ! Avoid unnecessary communications on __MPI and syncs SERIAL 5143 IF ( mp_size(gid) == 1 ) THEN 5144 ierr = cudaDeviceSynchronize() 5145 RETURN 5146 END IF 5147 ! 5148#if defined(__MPI) 5149#if defined(__GPU_MPI) 5150 msglen = size(msg_d) 5151 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5152 CALL parallel_max_real_gpu( msglen, msg_d, gid, -1 ) 5153 ! Sync not needed after MPI call 5154#else 5155 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 5156 msglen = size(msg_h) 5157 CALL parallel_max_real( msglen, msg_h, gid, -1 ) 5158 msg_d = msg_h; DEALLOCATE(msg_h) 5159 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 5160#endif 5161#endif 5162 END SUBROUTINE mp_max_rv_gpu 5163! 5164!------------------------------------------------------------------------------! 5165! 5166 SUBROUTINE mp_min_i_gpu(msg_d,gid) 5167 IMPLICIT NONE 5168 INTEGER, INTENT (INOUT), DEVICE :: msg_d 5169 INTEGER :: msg_h 5170 INTEGER, INTENT(IN) :: gid 5171 ! 5172 INTEGER :: msglen, ierr 5173 ! Avoid unnecessary communications on __MPI and syncs SERIAL 5174 IF ( mp_size(gid) == 1 ) THEN 5175 ierr = cudaDeviceSynchronize() 5176 RETURN 5177 END IF 5178 ! 5179#if defined(__MPI) 5180 msglen = 1 5181#if defined(__GPU_MPI) 5182 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5183 CALL parallel_min_integer_gpu( msglen, msg_d, gid, -1 ) 5184 ! Sync not needed after MPI call 5185#else 5186 msg_h = msg_d ! This syncs __MPI case 5187 CALL parallel_min_integer( msglen, msg_h, gid, -1 ) 5188 msg_d = msg_h 5189 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 5190#endif 5191#endif 5192 END SUBROUTINE mp_min_i_gpu 5193! 5194!------------------------------------------------------------------------------! 5195! 5196 SUBROUTINE mp_min_iv_gpu(msg_d,gid) 5197 IMPLICIT NONE 5198 INTEGER, INTENT (INOUT), DEVICE :: msg_d(:) 5199 INTEGER, ALLOCATABLE :: msg_h(:) 5200 INTEGER, INTENT(IN) :: gid 5201 ! 5202 INTEGER :: msglen, ierr 5203 ! Avoid unnecessary communications on __MPI and syncs SERIAL 5204 IF ( mp_size(gid) == 1 ) THEN 5205 ierr = cudaDeviceSynchronize() 5206 RETURN 5207 END IF 5208 ! 5209#if defined(__MPI) 5210#if defined(__GPU_MPI) 5211 msglen = SIZE(msg_d) 5212 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5213 CALL parallel_min_integer_gpu( msglen, msg_d, gid, -1 ) 5214 ! Sync not needed after MPI call 5215#else 5216 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 5217 msglen = size(msg_h) 5218 CALL parallel_min_integer( msglen, msg_h, gid, -1 ) 5219 msg_d = msg_h; DEALLOCATE(msg_h) 5220 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 5221#endif 5222#endif 5223 END SUBROUTINE mp_min_iv_gpu 5224! 5225!------------------------------------------------------------------------------! 5226! 5227 SUBROUTINE mp_min_r_gpu(msg_d,gid) 5228 IMPLICIT NONE 5229 REAL (DP), INTENT (INOUT), DEVICE :: msg_d 5230 REAL (DP) :: msg_h 5231 INTEGER, INTENT(IN) :: gid 5232 ! 5233 INTEGER :: msglen, ierr 5234 ! Avoid unnecessary communications on __MPI and syncs SERIAL 5235 IF ( mp_size(gid) == 1 ) THEN 5236 ierr = cudaDeviceSynchronize() 5237 RETURN 5238 END IF 5239 ! 5240#if defined(__MPI) 5241 msglen = 1 5242#if defined(__GPU_MPI) 5243 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5244 CALL parallel_min_real_gpu( msglen, msg_d, gid, -1 ) 5245 ! Sync not needed after MPI call 5246#else 5247 msg_h = msg_d ! This syncs __MPI case 5248 CALL parallel_min_real( msglen, msg_h, gid, -1 ) 5249 msg_d = msg_h 5250 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 5251#endif 5252#endif 5253 END SUBROUTINE mp_min_r_gpu 5254! 5255!------------------------------------------------------------------------------! 5256! 5257 SUBROUTINE mp_min_rv_gpu(msg_d,gid) 5258 IMPLICIT NONE 5259 REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:) 5260 REAL (DP), ALLOCATABLE :: msg_h(:) 5261 INTEGER, INTENT(IN) :: gid 5262 ! 5263 INTEGER :: msglen, ierr 5264 ! Avoid unnecessary communications on __MPI and syncs SERIAL 5265 IF ( mp_size(gid) == 1 ) THEN 5266 ierr = cudaDeviceSynchronize() 5267 RETURN 5268 END IF 5269 ! 5270#if defined(__MPI) 5271#if defined(__GPU_MPI) 5272 msglen = size(msg_d) 5273 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5274 CALL parallel_min_real_gpu( msglen, msg_d, gid, -1 ) 5275 ! Sync not needed after MPI call 5276#else 5277 ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case 5278 msglen = size(msg_h) 5279 CALL parallel_min_real( msglen, msg_h, gid, -1 ) 5280 msg_d = msg_h; DEALLOCATE(msg_h) 5281 ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies 5282#endif 5283#endif 5284 END SUBROUTINE mp_min_rv_gpu 5285! 5286!------------------------------------------------------------------------------! 5287!..mp_gather 5288 5289 SUBROUTINE mp_gather_i1_gpu(mydata_d, alldata_d, root, gid) 5290 IMPLICIT NONE 5291 INTEGER, DEVICE :: mydata_d 5292 INTEGER, INTENT(IN) :: gid, root 5293 INTEGER :: group 5294 INTEGER, INTENT(OUT), DEVICE :: alldata_d(:) 5295 INTEGER :: ierr 5296 5297 5298#if defined (__MPI) 5299#if ! defined(__GPU_MPI) 5300 INTEGER :: mydata_h 5301 INTEGER, ALLOCATABLE :: alldata_h(:) 5302 ALLOCATE( alldata_h, source=alldata_d ) ! This syncs __MPI 5303 mydata_h = mydata_d 5304 CALL mp_gather_i1(mydata_h, alldata_h, root, gid) 5305 mydata_d = mydata_h; alldata_d = alldata_h 5306 DEALLOCATE(alldata_h) 5307#else 5308 group = gid 5309 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5310 CALL MPI_GATHER(mydata_d, 1, MPI_INTEGER, alldata_d, 1, MPI_INTEGER, root, group, IERR) 5311 IF (ierr/=0) CALL mp_stop( 9051 ) 5312 RETURN ! Sync not needed after MPI call 5313#endif 5314#else 5315 !alldata_d(1) = mydata_d 5316 ierr = cudaMemcpy( alldata_d(1), mydata_d, 1, & 5317 & cudaMemcpyDeviceToDevice ) 5318 IF (ierr/=0) CALL mp_stop( 9052 ) 5319#endif 5320 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 5321 END SUBROUTINE mp_gather_i1_gpu 5322! 5323!------------------------------------------------------------------------------! 5324! 5325 SUBROUTINE mp_gather_iv_gpu(mydata_d, alldata_d, root, gid) 5326 IMPLICIT NONE 5327 INTEGER, DEVICE :: mydata_d(:) 5328 INTEGER, INTENT(IN) :: gid, root 5329 INTEGER :: group 5330 INTEGER, INTENT(OUT), DEVICE :: alldata_d(:,:) 5331 INTEGER :: msglen, ierr, i 5332 5333 5334#if defined (__MPI) 5335#if ! defined(__GPU_MPI) 5336 INTEGER, ALLOCATABLE :: mydata_h(:) 5337 INTEGER, ALLOCATABLE :: alldata_h(:,:) 5338 ALLOCATE( mydata_h, source=mydata_d ) ! This syncs __MPI 5339 ALLOCATE( alldata_h, source=alldata_d ) 5340 5341 CALL mp_gather_iv(mydata_h, alldata_h, root, gid) 5342 mydata_d = mydata_h; alldata_d = alldata_h 5343 DEALLOCATE(alldata_h, mydata_h) 5344#else 5345 msglen = SIZE(mydata_d) 5346 IF( msglen .NE. SIZE(alldata_d, 1) ) CALL mp_stop( 9053 ) 5347 group = gid 5348 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5349 CALL MPI_GATHER(mydata_d, msglen, MPI_INTEGER, alldata_d, msglen, MPI_INTEGER, root, group, IERR) 5350 IF (ierr/=0) CALL mp_stop( 9054 ) 5351 RETURN ! Sync not needed after MPI call 5352#endif 5353#else 5354 msglen = SIZE(mydata_d) 5355 IF( msglen .NE. SIZE(alldata_d, 1) ) CALL mp_stop( 9055 ) 5356 !alldata_d(:,1) = mydata_d(:) 5357 ierr = cudaMemcpy(alldata_d(:,1) , mydata_d(1), msglen, cudaMemcpyDeviceToDevice ) 5358 IF (ierr/=0) CALL mp_stop( 9056 ) 5359#endif 5360 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 5361 END SUBROUTINE mp_gather_iv_gpu 5362! 5363!------------------------------------------------------------------------------! 5364!..mp_gatherv_rv 5365! 5366 SUBROUTINE mp_gatherv_rv_gpu( mydata_d, alldata_d, recvcount, displs, root, gid) 5367 IMPLICIT NONE 5368 REAL(DP), DEVICE :: mydata_d(:) 5369 REAL(DP), DEVICE :: alldata_d(:) 5370 INTEGER, INTENT(IN) :: recvcount(:), displs(:), root 5371 INTEGER, INTENT(IN) :: gid 5372 INTEGER :: group 5373 INTEGER :: ierr, npe, myid 5374#if defined (__MPI) 5375#if ! defined(__GPU_MPI) 5376 REAL(DP), ALLOCATABLE :: mydata_h(:) 5377 REAL(DP), ALLOCATABLE :: alldata_h(:) 5378 5379 ALLOCATE(mydata_h, source=mydata_d) ! This syncs __MPI 5380 ALLOCATE(alldata_h, source=alldata_d) 5381 CALL mp_gatherv_rv( mydata_h, alldata_h, recvcount, displs, root, gid) 5382 alldata_d = alldata_h ; mydata_d = mydata_h 5383 DEALLOCATE(alldata_h , mydata_h) 5384#else 5385 5386 group = gid 5387 CALL mpi_comm_size( group, npe, ierr ) 5388 IF (ierr/=0) CALL mp_stop( 9057 ) 5389 CALL mpi_comm_rank( group, myid, ierr ) 5390 IF (ierr/=0) CALL mp_stop( 9058 ) 5391 ! 5392 IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9059 ) 5393 IF ( myid == root ) THEN 5394 IF ( SIZE( alldata_d ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 9060 ) 5395 END IF 5396 IF ( SIZE( mydata_d ) < recvcount( myid + 1 ) ) CALL mp_stop( 9061 ) 5397 ! 5398 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5399 CALL MPI_GATHERV( mydata_d, recvcount( myid + 1 ), MPI_DOUBLE_PRECISION, & 5400 alldata_d, recvcount, displs, MPI_DOUBLE_PRECISION, root, group, ierr ) 5401 IF (ierr/=0) CALL mp_stop( 9062 ) 5402 RETURN ! Sync not needed after MPI call 5403#endif 5404#else 5405 IF ( SIZE( alldata_d ) < recvcount( 1 ) ) CALL mp_stop( 9063 ) 5406 IF ( SIZE( mydata_d ) < recvcount( 1 ) ) CALL mp_stop( 9064 ) 5407 ! 5408 !alldata_d( 1:recvcount( 1 ) ) = mydata_d( 1:recvcount( 1 ) ) 5409 ierr = cudaMemcpy(alldata_d(1) , mydata_d(1), recvcount( 1 ), cudaMemcpyDeviceToDevice ) 5410 IF (ierr/=0) CALL mp_stop( 9065 ) 5411#endif 5412 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 5413 END SUBROUTINE mp_gatherv_rv_gpu 5414! 5415!------------------------------------------------------------------------------! 5416!..mp_gatherv_cv 5417! 5418 SUBROUTINE mp_gatherv_cv_gpu( mydata_d, alldata_d, recvcount, displs, root, gid) 5419 IMPLICIT NONE 5420 COMPLEX(DP), DEVICE :: mydata_d(:) 5421 COMPLEX(DP), DEVICE :: alldata_d(:) 5422 INTEGER, INTENT(IN) :: recvcount(:), displs(:), root 5423 INTEGER, INTENT(IN) :: gid 5424 INTEGER :: group 5425 INTEGER :: ierr, npe, myid 5426 5427#if defined (__MPI) 5428#if ! defined(__GPU_MPI) 5429 COMPLEX(DP), ALLOCATABLE :: mydata_h(:) 5430 COMPLEX(DP), ALLOCATABLE :: alldata_h(:) 5431 5432 ALLOCATE(mydata_h, source=mydata_d) ! This syncs __MPI 5433 ALLOCATE(alldata_h, source=alldata_d) 5434 CALL mp_gatherv_cv( mydata_h, alldata_h, recvcount, displs, root, gid) 5435 alldata_d = alldata_h ; mydata_d = mydata_h 5436 DEALLOCATE(alldata_h , mydata_h) 5437#else 5438 group = gid 5439 CALL mpi_comm_size( group, npe, ierr ) 5440 IF (ierr/=0) CALL mp_stop( 9066 ) 5441 CALL mpi_comm_rank( group, myid, ierr ) 5442 IF (ierr/=0) CALL mp_stop( 9067 ) 5443 ! 5444 IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9068 ) 5445 IF ( myid == root ) THEN 5446 IF ( SIZE( alldata_d ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 9069 ) 5447 END IF 5448 IF ( SIZE( mydata_d ) < recvcount( myid + 1 ) ) CALL mp_stop( 9070 ) 5449 ! 5450 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5451 CALL MPI_GATHERV( mydata_d, recvcount( myid + 1 ), MPI_DOUBLE_COMPLEX, & 5452 alldata_d, recvcount, displs, MPI_DOUBLE_COMPLEX, root, group, ierr ) 5453 IF (ierr/=0) CALL mp_stop( 9071 ) 5454 RETURN ! Sync not needed after MPI call 5455#endif 5456#else 5457 IF ( SIZE( alldata_d ) < recvcount( 1 ) ) CALL mp_stop( 9072 ) 5458 IF ( SIZE( mydata_d ) < recvcount( 1 ) ) CALL mp_stop( 9073 ) 5459 ! 5460 !alldata( 1:recvcount( 1 ) ) = mydata( 1:recvcount( 1 ) ) 5461 ierr = cudaMemcpy(alldata_d(1) , mydata_d(1), recvcount( 1 ), cudaMemcpyDeviceToDevice ) 5462#endif 5463 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 5464 END SUBROUTINE mp_gatherv_cv_gpu 5465! 5466!------------------------------------------------------------------------------! 5467!..mp_gatherv_rv_gpu 5468! 5469 5470 SUBROUTINE mp_gatherv_iv_gpu( mydata_d, alldata_d, recvcount, displs, root, gid) 5471 IMPLICIT NONE 5472 INTEGER, DEVICE :: mydata_d(:) 5473 INTEGER, DEVICE :: alldata_d(:) 5474 INTEGER, INTENT(IN) :: recvcount(:), displs(:), root 5475 INTEGER, INTENT(IN) :: gid 5476 INTEGER :: group 5477 INTEGER :: ierr, npe, myid 5478 5479#if defined (__MPI) 5480#if ! defined(__GPU_MPI) 5481 INTEGER, ALLOCATABLE :: mydata_h(:) 5482 INTEGER, ALLOCATABLE :: alldata_h(:) 5483 5484 ALLOCATE(mydata_h, source=mydata_d) ! This syncs __MPI 5485 ALLOCATE(alldata_h, source=alldata_d) 5486 CALL mp_gatherv_iv( mydata_h, alldata_h, recvcount, displs, root, gid) 5487 alldata_d = alldata_h ; mydata_d = mydata_h 5488 DEALLOCATE(alldata_h , mydata_h) 5489#else 5490 group = gid 5491 CALL mpi_comm_size( group, npe, ierr ) 5492 IF (ierr/=0) CALL mp_stop( 9074 ) 5493 CALL mpi_comm_rank( group, myid, ierr ) 5494 IF (ierr/=0) CALL mp_stop( 9075 ) 5495 ! 5496 IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9076 ) 5497 IF ( myid == root ) THEN 5498 IF ( SIZE( alldata_d ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 9077 ) 5499 END IF 5500 IF ( SIZE( mydata_d ) < recvcount( myid + 1 ) ) CALL mp_stop( 9078 ) 5501 ! 5502 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5503 CALL MPI_GATHERV( mydata_d, recvcount( myid + 1 ), MPI_INTEGER, & 5504 alldata_d, recvcount, displs, MPI_INTEGER, root, group, ierr ) 5505 IF (ierr/=0) CALL mp_stop( 9079 ) 5506 RETURN ! Sync not needed after MPI call 5507#endif 5508#else 5509 IF ( SIZE( alldata_d ) < recvcount( 1 ) ) CALL mp_stop( 9080 ) 5510 IF ( SIZE( mydata_d ) < recvcount( 1 ) ) CALL mp_stop( 9081 ) 5511 ! 5512 !alldata( 1:recvcount( 1 ) ) = mydata( 1:recvcount( 1 ) ) 5513 ierr = cudaMemcpy(alldata_d(1) , mydata_d(1), recvcount( 1 ), cudaMemcpyDeviceToDevice ) 5514#endif 5515 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 5516 END SUBROUTINE mp_gatherv_iv_gpu 5517! 5518!------------------------------------------------------------------------------! 5519!..mp_gatherv_rm 5520! 5521 5522 SUBROUTINE mp_gatherv_rm_gpu( mydata_d, alldata_d, recvcount, displs, root, gid) 5523 IMPLICIT NONE 5524 REAL(DP), DEVICE :: mydata_d(:,:) ! Warning first dimension is supposed constant! 5525 REAL(DP), DEVICE :: alldata_d(:,:) 5526 INTEGER, INTENT(IN) :: recvcount(:), displs(:), root 5527 INTEGER, INTENT(IN) :: gid 5528 INTEGER :: group 5529 INTEGER :: ierr, npe, myid, nsiz 5530 INTEGER, ALLOCATABLE :: nrecv(:), ndisp(:) 5531 5532 5533#if defined (__MPI) 5534#if ! defined(__GPU_MPI) 5535 REAL(DP), ALLOCATABLE :: mydata_h(:,:) 5536 REAL(DP), ALLOCATABLE :: alldata_h(:,:) 5537 5538 ALLOCATE(mydata_h, source=mydata_d) ! This syncs __MPI 5539 ALLOCATE(alldata_h, source=alldata_d) 5540 CALL mp_gatherv_rm( mydata_h, alldata_h, recvcount, displs, root, gid) 5541 alldata_d = alldata_h ; mydata_d = mydata_h 5542 DEALLOCATE(alldata_h , mydata_h) 5543#else 5544 group = gid 5545 CALL mpi_comm_size( group, npe, ierr ) 5546 IF (ierr/=0) CALL mp_stop( 9082 ) 5547 CALL mpi_comm_rank( group, myid, ierr ) 5548 IF (ierr/=0) CALL mp_stop( 9083 ) 5549 ! 5550 IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9084 ) 5551 IF ( myid == root ) THEN 5552 IF ( SIZE( alldata_d, 2 ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 9085 ) 5553 IF ( SIZE( alldata_d, 1 ) /= SIZE( mydata_d, 1 ) ) CALL mp_stop( 9086 ) 5554 END IF 5555 IF ( SIZE( mydata_d, 2 ) < recvcount( myid + 1 ) ) CALL mp_stop( 9087 ) 5556 ! 5557 ALLOCATE( nrecv( npe ), ndisp( npe ) ) 5558 ! 5559 nrecv( 1:npe ) = recvcount( 1:npe ) * SIZE( mydata_d, 1 ) 5560 ndisp( 1:npe ) = displs( 1:npe ) * SIZE( mydata_d, 1 ) 5561 ! 5562 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5563 CALL MPI_GATHERV( mydata_d, nrecv( myid + 1 ), MPI_DOUBLE_PRECISION, & 5564 alldata_d, nrecv, ndisp, MPI_DOUBLE_PRECISION, root, group, ierr ) 5565 IF (ierr/=0) CALL mp_stop( 9088 ) 5566 ! 5567 DEALLOCATE( nrecv, ndisp ) 5568 ! 5569 RETURN ! Sync not needed after MPI call 5570#endif 5571#else 5572 IF ( SIZE( alldata_d, 1 ) /= SIZE( mydata_d, 1 ) ) CALL mp_stop( 9089 ) 5573 IF ( SIZE( alldata_d, 2 ) < recvcount( 1 ) ) CALL mp_stop( 9090 ) 5574 IF ( SIZE( mydata_d, 2 ) < recvcount( 1 ) ) CALL mp_stop( 9091 ) 5575 ! 5576 !alldata( :, 1:recvcount( 1 ) ) = mydata( :, 1:recvcount( 1 ) ) 5577 5578 ierr = cudaMemcpy2D(alldata_d, SIZE(alldata_d,1),& 5579 mydata_d, SIZE(mydata_d,1),& 5580 SIZE(mydata_d,1), recvcount( 1 ), & 5581 cudaMemcpyDeviceToDevice ) 5582 5583 IF (ierr/=0) CALL mp_stop( 9092 ) 5584#endif 5585 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 5586 END SUBROUTINE mp_gatherv_rm_gpu 5587! 5588!------------------------------------------------------------------------------! 5589!..mp_gatherv_im 5590! 5591 SUBROUTINE mp_gatherv_im_gpu( mydata_d, alldata_d, recvcount, displs, root, gid) 5592 IMPLICIT NONE 5593 INTEGER, DEVICE :: mydata_d(:,:) ! Warning first dimension is supposed constant! 5594 INTEGER, DEVICE :: alldata_d(:,:) 5595 INTEGER, INTENT(IN) :: recvcount(:), displs(:), root 5596 INTEGER, INTENT(IN) :: gid 5597 INTEGER :: group 5598 INTEGER :: ierr, npe, myid, nsiz 5599 INTEGER, ALLOCATABLE :: nrecv(:), ndisp(:) 5600 5601 5602#if defined (__MPI) 5603#if ! defined(__GPU_MPI) 5604 INTEGER, ALLOCATABLE :: mydata_h(:,:) 5605 INTEGER, ALLOCATABLE :: alldata_h(:,:) 5606 5607 ALLOCATE(mydata_h, source=mydata_d) ! This syncs __MPI 5608 ALLOCATE(alldata_h, source=alldata_d) 5609 CALL mp_gatherv_im( mydata_h, alldata_h, recvcount, displs, root, gid) 5610 alldata_d = alldata_h ; mydata_d = mydata_h 5611 DEALLOCATE(alldata_h , mydata_h) 5612#else 5613 group = gid 5614 CALL mpi_comm_size( group, npe, ierr ) 5615 IF (ierr/=0) CALL mp_stop( 9093 ) 5616 CALL mpi_comm_rank( group, myid, ierr ) 5617 IF (ierr/=0) CALL mp_stop( 9094 ) 5618 ! 5619 IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9095 ) 5620 IF ( myid == root ) THEN 5621 IF ( SIZE( alldata_d, 2 ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 9096 ) 5622 IF ( SIZE( alldata_d, 1 ) /= SIZE( mydata_d, 1 ) ) CALL mp_stop( 9097 ) 5623 END IF 5624 IF ( SIZE( mydata_d, 2 ) < recvcount( myid + 1 ) ) CALL mp_stop( 9098 ) 5625 ! 5626 ALLOCATE( nrecv( npe ), ndisp( npe ) ) 5627 ! 5628 nrecv( 1:npe ) = recvcount( 1:npe ) * SIZE( mydata_d, 1 ) 5629 ndisp( 1:npe ) = displs( 1:npe ) * SIZE( mydata_d, 1 ) 5630 ! 5631 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5632 CALL MPI_GATHERV( mydata_d, nrecv( myid + 1 ), MPI_INTEGER, & 5633 alldata_d, nrecv, ndisp, MPI_INTEGER, root, group, ierr ) 5634 IF (ierr/=0) CALL mp_stop( 9099 ) 5635 ! 5636 DEALLOCATE( nrecv, ndisp ) 5637 ! 5638 RETURN ! Sync not needed after MPI call 5639#endif 5640#else 5641 IF ( SIZE( alldata_d, 1 ) /= SIZE( mydata_d, 1 ) ) CALL mp_stop( 9100 ) 5642 IF ( SIZE( alldata_d, 2 ) < recvcount( 1 ) ) CALL mp_stop( 9101 ) 5643 IF ( SIZE( mydata_d, 2 ) < recvcount( 1 ) ) CALL mp_stop( 9102 ) 5644 ! 5645 !alldata( :, 1:recvcount( 1 ) ) = mydata( :, 1:recvcount( 1 ) ) 5646 5647 ierr = cudaMemcpy2D(alldata_d, SIZE(alldata_d,1),& 5648 mydata_d, SIZE(mydata_d,1),& 5649 SIZE(mydata_d,1), recvcount( 1 ), & 5650 cudaMemcpyDeviceToDevice ) 5651 5652 IF (ierr/=0) CALL mp_stop( 9103 ) 5653#endif 5654 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 5655 END SUBROUTINE mp_gatherv_im_gpu 5656! 5657!------------------------------------------------------------------------------! 5658! 5659 SUBROUTINE mp_alltoall_c3d_gpu( sndbuf_d, rcvbuf_d, gid ) 5660 IMPLICIT NONE 5661 COMPLEX(DP), DEVICE :: sndbuf_d( :, :, : ) 5662 COMPLEX(DP), DEVICE :: rcvbuf_d( :, :, : ) 5663 INTEGER, INTENT(IN) :: gid 5664 INTEGER :: nsiz, group, ierr, npe 5665 5666#if defined (__MPI) 5667#if ! defined(__GPU_MPI) 5668 COMPLEX(DP), ALLOCATABLE :: sndbuf_h(:,:,:) 5669 COMPLEX(DP), ALLOCATABLE :: rcvbuf_h(:,:,:) 5670 5671 ALLOCATE(sndbuf_h, source=sndbuf_d) ! This syncs __MPI 5672 ALLOCATE(rcvbuf_h, source=rcvbuf_d) 5673 CALL mp_alltoall_c3d( sndbuf_h, rcvbuf_h, gid ) 5674 sndbuf_d = sndbuf_h ; rcvbuf_d = rcvbuf_h 5675 DEALLOCATE(sndbuf_h , rcvbuf_h) 5676#else 5677 group = gid 5678 5679 CALL mpi_comm_size( group, npe, ierr ) 5680 IF (ierr/=0) CALL mp_stop( 9104 ) 5681 5682 IF ( SIZE( sndbuf_d, 3 ) < npe ) CALL mp_stop( 9105 ) 5683 IF ( SIZE( rcvbuf_d, 3 ) < npe ) CALL mp_stop( 9106 ) 5684 5685 nsiz = SIZE( sndbuf_d, 1 ) * SIZE( sndbuf_d, 2 ) 5686 ! 5687 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5688 ! 5689 CALL MPI_ALLTOALL( sndbuf_d, nsiz, MPI_DOUBLE_COMPLEX, & 5690 rcvbuf_d, nsiz, MPI_DOUBLE_COMPLEX, group, ierr ) 5691 5692 IF (ierr/=0) CALL mp_stop( 9107 ) 5693 RETURN ! Sync not needed after MPI call 5694#endif 5695#else 5696 rcvbuf_d = sndbuf_d 5697#endif 5698 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 5699 END SUBROUTINE mp_alltoall_c3d_gpu 5700! 5701!------------------------------------------------------------------------------! 5702! 5703 SUBROUTINE mp_alltoall_i3d_gpu( sndbuf_d, rcvbuf_d, gid ) 5704 IMPLICIT NONE 5705 INTEGER, DEVICE :: sndbuf_d( :, :, : ) 5706 INTEGER, DEVICE :: rcvbuf_d( :, :, : ) 5707 INTEGER, INTENT(IN) :: gid 5708 INTEGER :: nsiz, group, ierr, npe 5709 5710#if defined (__MPI) 5711#if ! defined(__GPU_MPI) 5712 INTEGER, ALLOCATABLE :: sndbuf_h(:,:,:) 5713 INTEGER, ALLOCATABLE :: rcvbuf_h(:,:,:) 5714 5715 ALLOCATE(sndbuf_h, source=sndbuf_d) ! This syncs __MPI 5716 ALLOCATE(rcvbuf_h, source=rcvbuf_d) 5717 CALL mp_alltoall_i3d( sndbuf_h, rcvbuf_h, gid ) 5718 sndbuf_d = sndbuf_h ; rcvbuf_d = rcvbuf_h 5719 DEALLOCATE(sndbuf_h , rcvbuf_h) 5720#else 5721 group = gid 5722 5723 CALL mpi_comm_size( group, npe, ierr ) 5724 IF (ierr/=0) CALL mp_stop( 9108 ) 5725 5726 IF ( SIZE( sndbuf_d, 3 ) < npe ) CALL mp_stop( 9109 ) 5727 IF ( SIZE( rcvbuf_d, 3 ) < npe ) CALL mp_stop( 9110 ) 5728 5729 nsiz = SIZE( sndbuf_d, 1 ) * SIZE( sndbuf_d, 2 ) 5730 ! 5731 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5732 ! 5733 CALL MPI_ALLTOALL( sndbuf_d, nsiz, MPI_INTEGER, & 5734 rcvbuf_d, nsiz, MPI_INTEGER, group, ierr ) 5735 5736 IF (ierr/=0) CALL mp_stop( 9111 ) 5737 RETURN ! Sync not needed after MPI call 5738#endif 5739#else 5740 5741 rcvbuf_d = sndbuf_d 5742 5743#endif 5744 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 5745 END SUBROUTINE mp_alltoall_i3d_gpu 5746! 5747!------------------------------------------------------------------------------! 5748! 5749 SUBROUTINE mp_circular_shift_left_i0_gpu( buf_d, itag, gid ) 5750 IMPLICIT NONE 5751 INTEGER, DEVICE :: buf_d 5752 INTEGER, INTENT(IN) :: itag 5753 INTEGER, INTENT(IN) :: gid 5754 INTEGER :: nsiz, group, ierr, npe, sour, dest, mype 5755 5756#if defined (__MPI) 5757#if ! defined(__GPU_MPI) 5758 INTEGER :: buf_h 5759 buf_h = buf_d ! This syncs __MPI 5760 CALL mp_circular_shift_left_i0( buf_h, itag, gid ) 5761 buf_d = buf_h 5762#else 5763 INTEGER :: istatus( mpi_status_size ) 5764 ! 5765 group = gid 5766 ! 5767 CALL mpi_comm_size( group, npe, ierr ) 5768 IF (ierr/=0) CALL mp_stop( 9112 ) 5769 CALL mpi_comm_rank( group, mype, ierr ) 5770 IF (ierr/=0) CALL mp_stop( 9113 ) 5771 ! 5772 sour = mype + 1 5773 IF( sour == npe ) sour = 0 5774 dest = mype - 1 5775 IF( dest == -1 ) dest = npe - 1 5776 ! 5777 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5778 CALL MPI_Sendrecv_replace( buf_d, 1, MPI_INTEGER, & 5779 dest, itag, sour, itag, group, istatus, ierr) 5780 ! 5781 IF (ierr/=0) CALL mp_stop( 9114 ) 5782 ! 5783 RETURN ! Sync not needed after MPI call 5784#endif 5785#else 5786 ! do nothing 5787#endif 5788 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 5789 END SUBROUTINE mp_circular_shift_left_i0_gpu 5790! 5791!------------------------------------------------------------------------------! 5792! 5793 SUBROUTINE mp_circular_shift_left_i1_gpu( buf_d, itag, gid ) 5794 IMPLICIT NONE 5795 INTEGER, DEVICE :: buf_d(:) 5796 INTEGER, INTENT(IN) :: itag 5797 INTEGER, INTENT(IN) :: gid 5798 INTEGER :: nsiz, group, ierr, npe, sour, dest, mype 5799 5800#if defined (__MPI) 5801#if ! defined(__GPU_MPI) 5802 INTEGER, ALLOCATABLE :: buf_h(:) 5803 ALLOCATE(buf_h, source=buf_d) ! This syncs __MPI 5804 CALL mp_circular_shift_left_i1( buf_h, itag, gid ) 5805 buf_d = buf_h; DEALLOCATE(buf_h) 5806#else 5807 INTEGER :: istatus( mpi_status_size ) 5808 ! 5809 group = gid 5810 ! 5811 CALL mpi_comm_size( group, npe, ierr ) 5812 IF (ierr/=0) CALL mp_stop( 9115 ) 5813 CALL mpi_comm_rank( group, mype, ierr ) 5814 IF (ierr/=0) CALL mp_stop( 9116 ) 5815 ! 5816 sour = mype + 1 5817 IF( sour == npe ) sour = 0 5818 dest = mype - 1 5819 IF( dest == -1 ) dest = npe - 1 5820 ! 5821 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5822 CALL MPI_Sendrecv_replace( buf_d, SIZE(buf_d), MPI_INTEGER, & 5823 dest, itag, sour, itag, group, istatus, ierr) 5824 ! 5825 IF (ierr/=0) CALL mp_stop( 9117 ) 5826 ! 5827 RETURN ! Sync not needed after MPI call 5828#endif 5829#else 5830 ! do nothing 5831#endif 5832 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 5833 END SUBROUTINE mp_circular_shift_left_i1_gpu 5834! 5835!------------------------------------------------------------------------------! 5836! 5837 SUBROUTINE mp_circular_shift_left_i2_gpu( buf_d, itag, gid ) 5838 IMPLICIT NONE 5839 INTEGER, DEVICE :: buf_d(:,:) 5840 INTEGER, INTENT(IN) :: itag 5841 INTEGER, INTENT(IN) :: gid 5842 INTEGER :: nsiz, group, ierr, npe, sour, dest, mype 5843 5844#if defined (__MPI) 5845#if ! defined(__GPU_MPI) 5846 INTEGER, ALLOCATABLE :: buf_h(:,:) 5847 ALLOCATE(buf_h, source=buf_d) ! This syncs __MPI 5848 CALL mp_circular_shift_left_i2( buf_h, itag, gid ) 5849 buf_d = buf_h; DEALLOCATE(buf_h) 5850#else 5851 INTEGER :: istatus( mpi_status_size ) 5852 ! 5853 group = gid 5854 ! 5855 CALL mpi_comm_size( group, npe, ierr ) 5856 IF (ierr/=0) CALL mp_stop( 9118 ) 5857 CALL mpi_comm_rank( group, mype, ierr ) 5858 IF (ierr/=0) CALL mp_stop( 9119 ) 5859 ! 5860 sour = mype + 1 5861 IF( sour == npe ) sour = 0 5862 dest = mype - 1 5863 IF( dest == -1 ) dest = npe - 1 5864 ! 5865 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5866 CALL MPI_Sendrecv_replace( buf_d, SIZE(buf_d), MPI_INTEGER, & 5867 dest, itag, sour, itag, group, istatus, ierr) 5868 ! 5869 IF (ierr/=0) CALL mp_stop( 9120 ) 5870 ! 5871 RETURN ! Sync not needed after MPI call 5872#endif 5873#else 5874 ! do nothing 5875#endif 5876 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 5877 END SUBROUTINE mp_circular_shift_left_i2_gpu 5878! 5879!------------------------------------------------------------------------------! 5880! 5881 SUBROUTINE mp_circular_shift_left_r2d_gpu( buf_d, itag, gid ) 5882 IMPLICIT NONE 5883 REAL(DP), DEVICE :: buf_d( :, : ) 5884 INTEGER, INTENT(IN) :: itag 5885 INTEGER, INTENT(IN) :: gid 5886 INTEGER :: nsiz, group, ierr, npe, sour, dest, mype 5887 5888#if defined (__MPI) 5889#if ! defined(__GPU_MPI) 5890 REAL(DP), ALLOCATABLE :: buf_h(:, :) 5891 ALLOCATE(buf_h, source=buf_d) ! This syncs __MPI 5892 CALL mp_circular_shift_left_r2d( buf_h, itag, gid ) 5893 buf_d = buf_h; DEALLOCATE(buf_h) 5894#else 5895 INTEGER :: istatus( mpi_status_size ) 5896 ! 5897 group = gid 5898 ! 5899 CALL mpi_comm_size( group, npe, ierr ) 5900 IF (ierr/=0) CALL mp_stop( 9121 ) 5901 CALL mpi_comm_rank( group, mype, ierr ) 5902 IF (ierr/=0) CALL mp_stop( 9122 ) 5903 ! 5904 sour = mype + 1 5905 IF( sour == npe ) sour = 0 5906 dest = mype - 1 5907 IF( dest == -1 ) dest = npe - 1 5908 ! 5909 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5910 CALL MPI_Sendrecv_replace( buf_d, SIZE(buf_d), MPI_DOUBLE_PRECISION, & 5911 dest, itag, sour, itag, group, istatus, ierr) 5912 ! 5913 IF (ierr/=0) CALL mp_stop( 9123 ) 5914 ! 5915 RETURN ! Sync not needed after MPI call 5916#endif 5917#else 5918 ! do nothing 5919#endif 5920 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 5921 END SUBROUTINE mp_circular_shift_left_r2d_gpu 5922! 5923!------------------------------------------------------------------------------! 5924! 5925 SUBROUTINE mp_circular_shift_left_c2d_gpu( buf_d, itag, gid ) 5926 IMPLICIT NONE 5927 COMPLEX(DP), DEVICE :: buf_d( :, : ) 5928 INTEGER, INTENT(IN) :: itag 5929 INTEGER, INTENT(IN) :: gid 5930 INTEGER :: nsiz, group, ierr, npe, sour, dest, mype 5931 5932#if defined (__MPI) 5933#if ! defined(__GPU_MPI) 5934 COMPLEX(DP), ALLOCATABLE :: buf_h(:, :) 5935 ALLOCATE(buf_h, source=buf_d) ! This syncs __MPI 5936 CALL mp_circular_shift_left_c2d( buf_h, itag, gid ) 5937 buf_d = buf_h; DEALLOCATE(buf_h) 5938#else 5939 INTEGER :: istatus( mpi_status_size ) 5940 ! 5941 group = gid 5942 ! 5943 CALL mpi_comm_size( group, npe, ierr ) 5944 IF (ierr/=0) CALL mp_stop( 9124 ) 5945 CALL mpi_comm_rank( group, mype, ierr ) 5946 IF (ierr/=0) CALL mp_stop( 9125 ) 5947 ! 5948 sour = mype + 1 5949 IF( sour == npe ) sour = 0 5950 dest = mype - 1 5951 IF( dest == -1 ) dest = npe - 1 5952 ! 5953 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 5954 CALL MPI_Sendrecv_replace( buf_d, SIZE(buf_d), MPI_DOUBLE_COMPLEX, & 5955 dest, itag, sour, itag, group, istatus, ierr) 5956 ! 5957 IF (ierr/=0) CALL mp_stop( 9126 ) 5958 ! 5959 RETURN ! Sync not needed after MPI call 5960#endif 5961#else 5962 ! do nothing 5963#endif 5964 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 5965 END SUBROUTINE mp_circular_shift_left_c2d_gpu 5966 5967!------------------------------------------------------------------------------! 5968!..mp_gatherv_inplace_cplx_array 5969! 5970 5971 SUBROUTINE mp_gatherv_inplace_cplx_array_gpu(alldata_d, my_column_type, recvcount, displs, root, gid) 5972 IMPLICIT NONE 5973 COMPLEX(DP), DEVICE :: alldata_d(:,:) 5974 INTEGER, INTENT(IN) :: my_column_type 5975 INTEGER, INTENT(IN) :: recvcount(:), displs(:) 5976 INTEGER, INTENT(IN) :: root, gid 5977 INTEGER :: ierr, npe, myid 5978 5979#if defined (__MPI) 5980#if ! defined(__GPU_MPI) 5981 COMPLEX(DP), ALLOCATABLE :: alldata_h(:, :) 5982 ! 5983 ! Avoid unnecessary communications on __MPI 5984 IF ( mp_size(gid) == 1 ) THEN 5985 ierr = cudaDeviceSynchronize() 5986 RETURN 5987 END IF 5988 ! 5989 ALLOCATE(alldata_h, source=alldata_d) ! This syncs __MPI 5990 CALL mp_gatherv_inplace_cplx_array(alldata_h, my_column_type, recvcount, displs, root, gid) 5991 alldata_d = alldata_h; DEALLOCATE(alldata_h) 5992 ierr = cudaDeviceSynchronize() ! This syncs in case of small data chunks 5993 RETURN 5994#else 5995 CALL mpi_comm_size( gid, npe, ierr ) 5996 IF (ierr/=0) CALL mp_stop( 9127 ) 5997 CALL mpi_comm_rank( gid, myid, ierr ) 5998 IF (ierr/=0) CALL mp_stop( 9128 ) 5999 ! 6000 IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9129 ) 6001 ! 6002 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 6003 IF (myid==root) THEN 6004 CALL MPI_GATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, & 6005 alldata_d, recvcount, displs, my_column_type, root, gid, ierr ) 6006 ELSE 6007 CALL MPI_GATHERV( alldata_d(1,displs(myid+1)+1), recvcount(myid+1), my_column_type, & 6008 MPI_IN_PLACE, recvcount, displs, MPI_DATATYPE_NULL, root, gid, ierr ) 6009 ENDIF 6010 ! 6011 IF (ierr/=0) CALL mp_stop( 9130 ) 6012 ! 6013 RETURN ! Sync not needed after MPI call 6014#endif 6015#endif 6016 ierr = cudaDeviceSynchronize() ! This syncs SERIAL 6017 END SUBROUTINE mp_gatherv_inplace_cplx_array_gpu 6018 6019!------------------------------------------------------------------------------! 6020!..mp_allgatherv_inplace_cplx_array 6021! 6022 6023 SUBROUTINE mp_allgatherv_inplace_cplx_array_gpu(alldata_d, my_element_type, recvcount, displs, gid) 6024 IMPLICIT NONE 6025 COMPLEX(DP), DEVICE :: alldata_d(:,:) 6026 INTEGER, INTENT(IN) :: my_element_type 6027 INTEGER, INTENT(IN) :: recvcount(:), displs(:) 6028 INTEGER, INTENT(IN) :: gid 6029 INTEGER :: ierr, npe, myid 6030 6031#if defined (__MPI) 6032#if ! defined(__GPU_MPI) 6033 COMPLEX(DP), ALLOCATABLE :: alldata_h(:, :) 6034 ! 6035 ! Avoid unnecessary communications on __MPI 6036 IF ( mp_size(gid) == 1 ) THEN 6037 ierr = cudaDeviceSynchronize() 6038 RETURN 6039 END IF 6040 ! 6041 ALLOCATE(alldata_h, source=alldata_d)! This syncs __MPI 6042 CALL mp_allgatherv_inplace_cplx_array(alldata_h, my_element_type, recvcount, displs, gid) 6043 alldata_d = alldata_h; DEALLOCATE(alldata_h) 6044 ierr = cudaDeviceSynchronize() ! This syncs in case of small data chunks 6045 RETURN 6046#else 6047 CALL mpi_comm_size( gid, npe, ierr ) 6048 IF (ierr/=0) CALL mp_stop( 9131 ) 6049 CALL mpi_comm_rank( gid, myid, ierr ) 6050 IF (ierr/=0) CALL mp_stop( 9132 ) 6051 ! 6052 IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9133 ) 6053 ! 6054 ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI 6055 CALL MPI_ALLGATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, & 6056 alldata_d, recvcount, displs, my_element_type, gid, ierr ) 6057 IF (ierr/=0) CALL mp_stop( 9134 ) 6058 RETURN ! Sync not needed after MPI call 6059#endif 6060#endif 6061 ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI 6062 END SUBROUTINE mp_allgatherv_inplace_cplx_array_gpu 6063 6064 SUBROUTINE mp_type_create_cplx_column_section_gpu(dummy, start, length, stride, mytype) 6065 IMPLICIT NONE 6066 ! 6067 COMPLEX (DP), DEVICE, INTENT(IN) :: dummy 6068 INTEGER, INTENT(IN) :: start, length, stride 6069 INTEGER, INTENT(OUT) :: mytype 6070 ! 6071#if defined(__MPI) 6072 INTEGER :: ierr 6073 ! 6074 CALL MPI_TYPE_CREATE_SUBARRAY(1, stride, length, start, MPI_ORDER_FORTRAN,& 6075 MPI_DOUBLE_COMPLEX, mytype, ierr) 6076 IF (ierr/=0) CALL mp_stop( 8081 ) 6077 CALL MPI_Type_commit(mytype, ierr) 6078 IF (ierr/=0) CALL mp_stop( 8082 ) 6079#else 6080 mytype = 0; 6081#endif 6082 ! 6083 RETURN 6084 END SUBROUTINE mp_type_create_cplx_column_section_gpu 6085 6086 SUBROUTINE mp_type_create_real_column_section_gpu(dummy, start, length, stride, mytype) 6087 IMPLICIT NONE 6088 ! 6089 REAL (DP), DEVICE, INTENT(IN) :: dummy 6090 INTEGER, INTENT(IN) :: start, length, stride 6091 INTEGER, INTENT(OUT) :: mytype 6092 ! 6093#if defined(__MPI) 6094 INTEGER :: ierr 6095 ! 6096 CALL MPI_TYPE_CREATE_SUBARRAY(1, stride, length, start, MPI_ORDER_FORTRAN,& 6097 MPI_DOUBLE_PRECISION, mytype, ierr) 6098 IF (ierr/=0) CALL mp_stop( 8083 ) 6099 CALL MPI_Type_commit(mytype, ierr) 6100 IF (ierr/=0) CALL mp_stop( 8084 ) 6101#else 6102 mytype = 0; 6103#endif 6104 ! 6105 RETURN 6106 END SUBROUTINE mp_type_create_real_column_section_gpu 6107 6108 SUBROUTINE mp_type_create_cplx_row_section_gpu(dummy, column_start, column_stride, row_length, mytype) 6109 IMPLICIT NONE 6110 ! 6111 COMPLEX (DP), DEVICE, INTENT(IN) :: dummy 6112 INTEGER, INTENT(IN) :: column_start, column_stride, row_length 6113 INTEGER, INTENT(OUT) :: mytype 6114 ! 6115#if defined(__MPI) 6116 INTEGER :: ierr, temporary 6117 INTEGER :: strides(2), lengths(2), starts(2) 6118 INTEGER(KIND=MPI_ADDRESS_KIND) :: lb, extent 6119 ! 6120 strides(1) = column_stride ; strides(2) = row_length 6121 lengths(1) = 1 ; lengths(2) = row_length 6122 starts(1) = column_start ; starts(2) = 0 6123 CALL MPI_TYPE_CREATE_SUBARRAY(2, strides, lengths, starts, MPI_ORDER_FORTRAN,& 6124 MPI_DOUBLE_COMPLEX, temporary, ierr) 6125 IF (ierr/=0) CALL mp_stop( 8085 ) 6126 CALL MPI_TYPE_GET_EXTENT(MPI_DOUBLE_COMPLEX, lb, extent, ierr) 6127 IF (ierr/=0) CALL mp_stop( 8085 ) 6128 CALL MPI_TYPE_COMMIT(temporary, ierr) 6129 IF (ierr/=0) CALL mp_stop( 8085 ) 6130 CALL MPI_TYPE_CREATE_RESIZED(temporary, lb, extent, mytype, ierr) 6131 IF (ierr/=0) CALL mp_stop( 8086 ) 6132 CALL MPI_Type_commit(mytype, ierr) 6133 IF (ierr/=0) CALL mp_stop( 8086 ) 6134#else 6135 mytype = 0; 6136#endif 6137 ! 6138 RETURN 6139 END SUBROUTINE mp_type_create_cplx_row_section_gpu 6140 6141 SUBROUTINE mp_type_create_real_row_section_gpu(dummy, column_start, column_stride, row_length, mytype) 6142 IMPLICIT NONE 6143 ! 6144 REAL (DP), DEVICE, INTENT(IN) :: dummy 6145 INTEGER, INTENT(IN) :: column_start, column_stride, row_length 6146 INTEGER, INTENT(OUT) :: mytype 6147 ! 6148#if defined(__MPI) 6149 INTEGER :: ierr, temporary 6150 INTEGER :: strides(2), lengths(2), starts(2) 6151 INTEGER(KIND=MPI_ADDRESS_KIND) :: lb, extent 6152 ! 6153 strides(1) = column_stride ; strides(2) = row_length 6154 lengths(1) = 1 ; lengths(2) = row_length 6155 starts(1) = column_start ; starts(2) = 0 6156 CALL MPI_TYPE_CREATE_SUBARRAY(2, strides, lengths, starts, MPI_ORDER_FORTRAN,& 6157 MPI_DOUBLE_PRECISION, temporary, ierr) 6158 IF (ierr/=0) CALL mp_stop( 8087 ) 6159 CALL MPI_TYPE_GET_EXTENT(MPI_DOUBLE_PRECISION, lb, extent, ierr) 6160 IF (ierr/=0) CALL mp_stop( 8087 ) 6161 CALL MPI_TYPE_COMMIT(temporary, ierr) 6162 IF (ierr/=0) CALL mp_stop( 8087 ) 6163 CALL MPI_TYPE_CREATE_RESIZED(temporary, lb, extent, mytype, ierr) 6164 IF (ierr/=0) CALL mp_stop( 8088 ) 6165 CALL MPI_Type_commit(mytype, ierr) 6166 IF (ierr/=0) CALL mp_stop( 8088 ) 6167#else 6168 mytype = 0; 6169#endif 6170 ! 6171 RETURN 6172 END SUBROUTINE mp_type_create_real_row_section_gpu 6173 6174!------------------------------------------------------------------------------! 6175 6176#endif 6177!------------------------------------------------------------------------------! 6178END MODULE mp 6179!------------------------------------------------------------------------------! 6180! 6181! Script to generate stop messages: 6182! # coding: utf-8 6183! import re 6184! import sys 6185! i = 8000 6186! def replace(match): 6187! global i 6188! i += 1 6189! return 'mp_stop( {0} )'.format(i) 6190! 6191! with open(sys.argv[1],'r') as f: 6192! data = re.sub(r"mp_stop\(\s?\d+\s?\)", replace, f.read()) 6193! with open(sys.argv[1]+'.new','w') as fo: 6194! fo.write(data) 6195