1 2* 3* $Id$ 4* 5 6* *********************************************************** 7* * * 8* * CMatrix library * 9* * * 10* * Author - Eric Bylaska * 11* * date - 5/19/06 * 12* * * 13* *********************************************************** 14c 15c 16c 17c 18 19* *********************************** 20* * * 21* * CMatrix_zgemm1_rot2 * 22* * * 23* *********************************** 24 25 subroutine CMatrix_zgemm1_rot2(m,n,k, 26 > alpha, 27 > A,lda,ma,na, 28 > B,ldb,mb,nb, 29 > beta, 30 > C,ldc,mc,nc, 31 > taskid_i,taskid_j, 32 > np_i,np_j, 33 > comm_i, comm_j, 34 > Bcol,Bwork,work1,work2) 35 implicit none 36 integer m,n,k 37 complex*16 alpha 38 39 integer lda,ma(*),na(*) 40 complex*16 A(lda,*) 41 42 integer ldb,mb(*),nb(*) 43c real*8 B(ldb,*) 44 complex*16 B(*) 45 46 complex*16 beta 47 48 integer ldc,mc(*),nc(*) 49 complex*16 C(ldc,*) 50 51 integer taskid_i,taskid_j 52 integer np_i,np_j 53 integer comm_i,comm_j 54 55 complex*16 Bcol(*),Bwork(*) 56 complex*16 work1(*),work2(*) 57 58#include "mpif.h" 59#ifdef MPI4 60#include "stupid_mpi4.fh" 61#endif 62 63 64* **** local variables **** 65 logical jeven 66 integer i,j,j1,ii,jj,ne0 67 integer iwrk,jcur,ierr,bshift,iwrk2 68 integer request1(4),request2(4) 69 integer bshift2(0:np_j) 70 71 bshift = 1 72 bshift2(0) = 1 73 do jj=1,np_j 74 bshift = bshift + na(jj)*nb(taskid_j+1) 75 bshift2(jj) = bshift 76 end do 77 78* *** collect B into columns *** 79 ne0 = 0 80 do i=1,np_i 81 ne0 = ne0+mb(i) 82 end do 83 j1 = 0 84 do jj=1,taskid_j 85 j1 = j1 + nb(jj) 86 end do 87 88 bshift = 0 89 iwrk2 = nb(taskid_j+1) 90 ii = 0 91 do jcur=0,np_j-1 92 iwrk = na(jcur+1) 93 do j=1,iwrk2 94 do i=1,iwrk 95 Bwork(bshift+i+(j-1)*iwrk) = B(ii+i+(j+j1-1)*ne0) 96 end do 97 end do 98 bshift = bshift + iwrk*iwrk2 99 ii = ii + iwrk 100 end do 101 102 103* *** C = beta*C *** 104c call dscal(ldc*nc(taskid_j+1),beta,C,1) 105 do j=1,nc(taskid_j+1) 106 do i=1,mc(taskid_i+1) 107 C(i,j) = beta*C(i,j) 108 end do 109 end do 110 111 call Cmatrix_start_rot(1, 112 > taskid_j,np_j,comm_j, 113 > A,work1,lda,na, 114 > request1) 115 jcur = taskid_j 116 iwrk = na(jcur+1) 117 118 if ((mc(taskid_i+1).gt.0).and. 119 > (nc(taskid_j+1).gt.0).and. 120 > (iwrk.gt.0)) 121 > call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk, 122 > alpha, 123 > A, ma(taskid_i+1), 124 > Bwork(bshift2(jcur)), iwrk, 125 > dcmplx(1.0d0,0.0d0), 126 > C, ldc) 127 128 jeven = .true. 129 do j=2,np_j-1 130 if (jeven) then 131 jeven = .false. 132 jcur = mod(jcur-1+np_j,np_j) 133 iwrk = na(jcur+1) 134 call Cmatrix_end_rot(request1) 135 call Cmatrix_start_rot(j, 136 > taskid_j,np_j,comm_j, 137 > A,work2,lda,na, 138 > request2) 139 if ((mc(taskid_i+1).gt.0).and. 140 > (nc(taskid_j+1).gt.0).and. 141 > (iwrk.gt.0)) 142 > call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk, 143 > alpha, 144 > work1, ma(taskid_i+1), 145 > Bwork(bshift2(jcur)), iwrk, 146 > dcmplx(1.0d0,0.0d0), 147 > C, ldc) 148 149 else 150 jeven = .true. 151 jcur = mod(jcur-1+np_j,np_j) 152 iwrk = na(jcur+1) 153 call Cmatrix_end_rot(request2) 154 call Cmatrix_start_rot(j, 155 > taskid_j,np_j,comm_j, 156 > A,work1,lda,na, 157 > request1) 158 if ((mc(taskid_i+1).gt.0).and. 159 > (nc(taskid_j+1).gt.0).and. 160 > (iwrk.gt.0)) 161 > call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk, 162 > alpha, 163 > work2, ma(taskid_i+1), 164 > Bwork(bshift2(jcur)), iwrk, 165 > dcmplx(1.0d0,0.0d0), 166 > C, ldc) 167 168 end if 169 end do 170 if (jeven) then 171 jcur = mod(jcur-1+np_j,np_j) 172 iwrk = na(jcur+1) 173 call Cmatrix_end_rot(request1) 174 if ((mc(taskid_i+1).gt.0).and. 175 > (nc(taskid_j+1).gt.0).and. 176 > (iwrk.gt.0)) 177 > call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk, 178 > alpha, 179 > work1, ma(taskid_i+1), 180 > Bwork(bshift2(jcur)), iwrk, 181 > dcmplx(1.0d0,0.0d0), 182 > C, ldc) 183 184 else 185 jcur = mod(jcur-1+np_j,np_j) 186 iwrk = na(jcur+1) 187 call Cmatrix_end_rot(request2) 188 if ((mc(taskid_i+1).gt.0).and. 189 > (nc(taskid_j+1).gt.0).and. 190 > (iwrk.gt.0)) 191 > call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk, 192 > alpha, 193 > work2, ma(taskid_i+1), 194 > Bwork(bshift2(jcur)), iwrk, 195 > dcmplx(1.0d0,0.0d0), 196 > C, ldc) 197 end if 198 return 199 end 200 201 202 203 204 205* *********************************** 206* * * 207* * CMatrix_zgemm1_rot * 208* * * 209* *********************************** 210 211 subroutine CMatrix_zgemm1_rot(m,n,k, 212 > alpha, 213 > A,lda,ma,na, 214 > B,ldb,mb,nb, 215 > beta, 216 > C,ldc,mc,nc, 217 > taskid_i,taskid_j, 218 > np_i,np_j, 219 > comm_i, comm_j, 220 > Bcol,Bwork,work1,work2) 221 implicit none 222 integer m,n,k 223 complex*16 alpha 224 225 integer lda,ma(*),na(*) 226 complex*16 A(lda,*) 227 228 integer ldb,mb(*),nb(*) 229c real*8 B(ldb,*) 230 complex*16 B(*) 231 232 complex*16 beta 233 234 integer ldc,mc(*),nc(*) 235 complex*16 C(ldc,*) 236 237 integer taskid_i,taskid_j 238 integer np_i,np_j 239 integer comm_i,comm_j 240 241 complex*16 Bcol(*),Bwork(*) 242 complex*16 work1(*),work2(*) 243 244#include "mpif.h" 245#ifdef MPI4 246#include "stupid_mpi4.fh" 247#endif 248 249 250* **** local variables **** 251 logical jeven 252 integer i,j,ii,jj,mbmax,nbmax,ne0 253 integer iwrk,jcur,ierr,bshift,iwrk2 254 integer request1(4),request2(4) 255 integer bshift2(0:np_j) 256 257 bshift = 1 258 bshift2(0) = 1 259 do jj=1,np_j 260 bshift = bshift + na(jj)*nb(taskid_j+1) 261 bshift2(jj) = bshift 262 end do 263 264* *** collect B into columns *** 265 ne0 = 0 266 mbmax = 0 267 do i=1,np_i 268 if (mb(i).gt.mbmax) mbmax = mb(i) 269 ne0 = ne0+mb(i) 270 end do 271 nbmax = 0 272 do j=1,np_j 273 if (nb(j).gt.nbmax) nbmax = nb(j) 274 end do 275 if (np_i.gt.1) then 276 277 278 do j=1,nb(taskid_j+1) 279 280 281 !*** Allgather is flaky on my mac laptop *** 282 call dcopy(ne0,0.0,0,Bcol(1+(j-1)*ne0),1) 283 bshift = 0 284 do ii=1,taskid_i 285 bshift = bshift + mb(ii) 286 end do 287 do i=1,mb(taskid_i+1) 288 Bcol(bshift+i+(j-1)*ne0) = B(i+(j-1)*mb(taskid_i+1)) 289 end do 290 call C3dB_Vector_SumAll(2*ne0,Bcol(1+(j-1)*ne0)) 291 end do 292 293 bshift = 0 294 iwrk2 = nb(taskid_j+1) 295 ii = 0 296 do jcur=0,np_j-1 297 iwrk = na(jcur+1) 298 do j=1,iwrk2 299 do i=1,iwrk 300 Bwork(bshift+i+(j-1)*iwrk) = Bcol(ii+i+(j-1)*ne0) 301 302 end do 303 end do 304 bshift = bshift + iwrk*iwrk2 305 ii = ii + iwrk 306 end do 307 else 308 bshift = 0 309 iwrk2 = nb(taskid_j+1) 310 ii = 0 311 do jcur=0,np_j-1 312 iwrk = na(jcur+1) 313 do j=1,iwrk2 314 do i=1,iwrk 315 Bwork(bshift+i+(j-1)*iwrk) = B(ii+i+(j-1)*ne0) 316 end do 317 end do 318 bshift = bshift + iwrk*iwrk2 319 ii = ii + iwrk 320 end do 321 end if 322 323 324 325* *** C = beta*C *** 326c call dscal(ldc*nc(taskid_j+1),beta,C,1) 327 do j=1,nc(taskid_j+1) 328 do i=1,mc(taskid_i+1) 329 C(i,j) = beta*C(i,j) 330 end do 331 end do 332 333 call Cmatrix_start_rot(1, 334 > taskid_j,np_j,comm_j, 335 > A,work1,lda,na, 336 > request1) 337 jcur = taskid_j 338 iwrk = na(jcur+1) 339 340 if ((mc(taskid_i+1).gt.0).and. 341 > (nc(taskid_j+1).gt.0).and. 342 > (iwrk.gt.0)) 343 > call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk, 344 > alpha, 345 > A, ma(taskid_i+1), 346 > Bwork(bshift2(jcur)), iwrk, 347 > dcmplx(1.0d0,0.0d0), 348 > C, ldc) 349 350 jeven = .true. 351 do j=2,np_j-1 352 if (jeven) then 353 jeven = .false. 354 jcur = mod(jcur-1+np_j,np_j) 355 iwrk = na(jcur+1) 356 call Cmatrix_end_rot(request1) 357 call Cmatrix_start_rot(j, 358 > taskid_j,np_j,comm_j, 359 > A,work2,lda,na, 360 > request2) 361 if ((mc(taskid_i+1).gt.0).and. 362 > (nc(taskid_j+1).gt.0).and. 363 > (iwrk.gt.0)) 364 > call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk, 365 > alpha, 366 > work1, ma(taskid_i+1), 367 > Bwork(bshift2(jcur)), iwrk, 368 > dcmplx(1.0d0,0.0d0), 369 > C, ldc) 370 371 else 372 jeven = .true. 373 jcur = mod(jcur-1+np_j,np_j) 374 iwrk = na(jcur+1) 375 call Cmatrix_end_rot(request2) 376 call Cmatrix_start_rot(j, 377 > taskid_j,np_j,comm_j, 378 > A,work1,lda,na, 379 > request1) 380 if ((mc(taskid_i+1).gt.0).and. 381 > (nc(taskid_j+1).gt.0).and. 382 > (iwrk.gt.0)) 383 > call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk, 384 > alpha, 385 > work2, ma(taskid_i+1), 386 > Bwork(bshift2(jcur)), iwrk, 387 > dcmplx(1.0d0,0.0d0), 388 > C, ldc) 389 390 end if 391 end do 392 if (jeven) then 393 jcur = mod(jcur-1+np_j,np_j) 394 iwrk = na(jcur+1) 395 call Cmatrix_end_rot(request1) 396 if ((mc(taskid_i+1).gt.0).and. 397 > (nc(taskid_j+1).gt.0).and. 398 > (iwrk.gt.0)) 399 > call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk, 400 > alpha, 401 > work1, ma(taskid_i+1), 402 > Bwork(bshift2(jcur)), iwrk, 403 > dcmplx(1.0d0,0.0d0), 404 > C, ldc) 405 406 else 407 jcur = mod(jcur-1+np_j,np_j) 408 iwrk = na(jcur+1) 409 call Cmatrix_end_rot(request2) 410 if ((mc(taskid_i+1).gt.0).and. 411 > (nc(taskid_j+1).gt.0).and. 412 > (iwrk.gt.0)) 413 > call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk, 414 > alpha, 415 > work2, ma(taskid_i+1), 416 > Bwork(bshift2(jcur)), iwrk, 417 > dcmplx(1.0d0,0.0d0), 418 > C, ldc) 419 end if 420 return 421 end 422 423 424 subroutine CMatrix_Bwork_copy(m,n,A,lda,ii,B,ldb) 425 implicit none 426 integer n,m 427 integer lda,ii,ldb 428 complex*16 A(LDA,*) 429 complex*16 B(LDB,*) 430 431* *** local variables *** 432 integer i,j 433 434 do j=1,n 435 do i=1,m 436 B(i,j) = A(ii+i,j) 437 end do 438 end do 439 return 440 end 441 442* *********************************** 443* * * 444* * CMatrix_start_rot * 445* * * 446* *********************************** 447 448 subroutine CMatrix_start_rot(j, 449 > taskid_j,np_j,comm_j, 450 > A,W,lda,na, 451 > request) 452 implicit none 453 integer j 454 integer taskid_j,np_j,comm_j 455 complex*16 A(*),W(*) 456 integer lda,na(*) 457 integer request(*) 458 459#include "mpif.h" 460#ifdef MPI4 461#include "stupid_mpi4.fh" 462#endif 463 464* **** local variables **** 465 integer proc_to,proc_from,msgtype,amsglen,wmsglen,mpierr 466 467 proc_to = mod(taskid_j+j,np_j) 468 proc_from = mod(taskid_j-j+np_j,np_j) 469 msgtype = j 470 amsglen = lda*na(taskid_j+1) 471 wmsglen = lda*na(proc_from+1) 472 473#ifdef MPI4 474 if (wmsglen.gt.0) then 475 stupid_msglen = wmsglen 476 stupid_type = msgtype 477 stupid_taskid = proc_from 478 call MPI_IRECV(W, 479 > stupid_msglen,stupid_complex, 480 > stupid_taskid, 481 > stupid_type,stupid_comm_j, 482 > stupid_request,stupid_ierr) 483 request(1) = stupid_request 484 request(3) = 1 485 else 486 request(3) = 0 487 end if 488 489 if (amsglen.gt.0) then 490 stupid_msglen = amsglen 491 stupid_type = msgtype 492 stupid_taskid = proc_to 493 call MPI_ISEND(A, 494 > stupid_msglen,stupid_complex, 495 > stupid_taskid, 496 > stupid_type,stupid_comm_j, 497 > stupid_request,stupid_ierr) 498 request(2) = stupid_request 499 request(4) = 1 500 else 501 request(4) = 0 502 end if 503#else 504 if (wmsglen.gt.0) then 505 call MPI_IRECV(W,wmsglen,MPI_DOUBLE_COMPLEX, 506 > proc_from, 507 > msgtype,comm_j, 508 > request(1),mpierr) 509 request(3) = 1 510 else 511 request(3) = 0 512 end if 513 if (amsglen.gt.0) then 514 call MPI_ISEND(A,amsglen,MPI_DOUBLE_COMPLEX, 515 > proc_to, 516 > msgtype,comm_j, 517 > request(2),mpierr) 518 request(4) = 1 519 else 520 request(4) = 0 521 end if 522#endif 523 524 if ((request(3).eq.1).and.(request(4).eq.1)) then 525 request(3) = 1 526 else if (request(3).eq.1) then 527 request(3) = 2 528 else if (request(4).eq.1) then 529 request(3) = 3 530 else 531 request(3) = 4 532 end if 533 534 return 535 end 536 537* *********************************** 538* * * 539* * CMatrix_end_rot * 540* * * 541* *********************************** 542 543 subroutine CMatrix_end_rot(request) 544 implicit none 545 integer request(*) 546 547* **** wait for completion of mp_send, also do a sync **** 548 if (request(3).eq.1) then 549 call Parallel_mpiWaitAll(2,request) 550 else if (request(3).eq.2) then 551 call Parallel_mpiWaitAll(1,request) 552 else if (request(3).eq.3) then 553 call Parallel_mpiWaitAll(1,request(2)) 554 endif 555 556 return 557 end 558 559 560 561 562 563* *********************************** 564* * * 565* * CMatrix_zgemm1 * 566* * * 567* *********************************** 568 569 subroutine CMatrix_zgemm1(m,n,k,nblock, 570 > alpha, 571 > A,lda,ma,na, 572 > B,ldb,mb,nb, 573 > beta, 574 > C,ldc,mc,nc, 575 > taskid_i,taskid_j, 576 > np_i,np_j, 577 > comm_i, comm_j, 578 > work1,work2) 579 implicit none 580 integer m,n,k,nblock 581 complex*16 alpha 582 583 integer lda,ma(*),na(*) 584 complex*16 A(lda,*) 585 586 integer ldb,mb(*),nb(*) 587 complex*16 B(ldb,*) 588 589 complex*16 beta 590 591 integer ldc,mc(*),nc(*) 592 complex*16 C(ldc,*) 593 594 integer taskid_i,taskid_j 595 integer np_i,np_j 596 integer comm_i,comm_j 597 598 complex*16 work1(*),work2(*) 599 600 601#ifdef MPI4 602#include "stupid_mpi4.fh" 603#else 604#include "mpif.h" 605#endif 606 607 608* **** local variables **** 609 logical docalc1,docalc2 610 integer i,j,ii,jj 611 integer kk,iwrk,icur,jcur,ierr,shift 612 613 614 do j=1,nc(taskid_j+1) 615 do i=1,mc(taskid_i+1) 616 C(i,j) = beta*C(i,j) 617 end do 618 end do 619 620 ii = 0 621 jj = 0 622 kk = 0 623 icur = 0 624 jcur = 0 625c **** loop over all row pannels of C *** 626 do while (kk.lt.k) 627 iwrk = min(nblock, mb(icur+1)-ii) 628 iwrk = min(iwrk, na(jcur+1)-jj) 629 630 631* **** pack current iwrk columns of A into work1 *** 632 if (taskid_j.eq.jcur) then 633 call zlacpy("G", ma(taskid_i+1),iwrk, 634 > A(1,jj+1), lda, 635 > work1, ma(taskid_i+1)) 636 end if 637 638* **** pack current iwrk rows of B into work2 *** 639 if (taskid_i.eq.icur) then 640 call zlacpy("G", iwrk,nb(taskid_j+1), 641 > B(ii+1,1), ldb, 642 > work2, iwrk) 643 end if 644 645#ifdef MPI4 646c **** broadcast work1 within my row *** 647 stupid_msglen = iwrk*ma(taskid_i+1) 648 stupid_taskid = jcur 649 call MPI_Bcast(work1,stupid_msglen,stupid_complex, 650 > stupid_taskid,stupid_comm_j,stupid_ierr) 651 652c **** broadcast work2 within my column *** 653 stupid_msglen = iwrk*nb(taskid_j+1) 654 stupid_taskid = icur 655 call MPI_Bcast(work2,stupid_msglen,stupid_complex, 656 > stupid_taskid,stupid_comm_i,stupid_ierr) 657#else 658c **** broadcast work1 within my row *** 659 call MPI_Bcast(work1,iwrk*ma(taskid_i+1),MPI_DOUBLE_COMPLEX, 660 > jcur,comm_j,ierr) 661 662c **** broadcast work2 within my column *** 663 call MPI_Bcast(work2,iwrk*nb(taskid_j+1),MPI_DOUBLE_COMPLEX, 664 > icur,comm_i,ierr) 665#endif 666 667 668 if ((iwrk.gt.0) .and. 669 > (mc(taskid_i+1).gt.0).and. 670 > (nc(taskid_j+1).gt.0)) 671 > call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk, 672 > alpha, 673 > work1, ma(taskid_i+1), 674 > work2, iwrk, 675 > dcmplx(1.0d0,0.0d0), 676 > C, ldc) 677 678 679 ii = ii + iwrk 680 jj = jj + iwrk 681 kk = kk + iwrk 682 683 if (jj.ge.na(jcur+1)) then 684 jcur = jcur + 1 685 jj = 0 686 end if 687 if (ii.ge.mb(icur+1)) then 688 icur = icur + 1 689 ii = 0 690 end if 691 692 end do 693 694 return 695 end 696 697 698* *********************************** 699* * * 700* * CMatrix_zgemm2 * 701* * * 702* *********************************** 703 subroutine CMatrix_zgemm2(m,n,k,nblock, 704 > alpha, 705 > A,lda,ma,na, 706 > B,ldb,mb,nb, 707 > beta, 708 > C,ldc,mc,nc, 709 > taskid_i,taskid_j, 710 > np_i,np_j, 711 > comm_i, comm_j, 712 > work1,work2) 713 implicit none 714 integer m,n,k,nblock 715 complex*16 alpha 716 717 integer lda,ma(*),na(*) 718 complex*16 A(lda,*) 719 720 integer ldb,mb(*),nb(*) 721 complex*16 B(ldb,*) 722 723 complex*16 beta 724 725 integer ldc,mc(*),nc(*) 726 complex*16 C(ldc,*) 727 728 integer taskid_i,taskid_j 729 integer np_i,np_j 730 integer comm_i,comm_j 731 732 complex*16 work1(*),work2(*) 733 734 735#ifdef MPI4 736#include "stupid_mpi4.fh" 737#else 738#include "mpif.h" 739#endif 740 741 742* **** local variables **** 743 logical docalc1,docalc2 744 integer i,j,ii,jj 745 integer kk,iwrk,icur,jcur,ierr,shift 746 747 do j=1,nc(taskid_j+1) 748 do i=1,mc(taskid_i+1) 749 C(i,j) = beta*C(i,j) 750 end do 751 end do 752 753 ii = 0 754 jj = 0 755 kk = 0 756 icur = 0 757 jcur = 0 758c **** loop over all row pannels of C *** 759 do while (kk.lt.m) 760 iwrk = min(nblock, mc(icur+1)-ii) 761 iwrk = min(iwrk, na(jcur+1)-jj) 762 763 764* **** iwrk*nc(taskid_j+1) submatrix !=0 **** 765 if (ma(taskid_i+1).gt.0) then 766 767* **** pack current iwrk columns of A into work1 *** 768 if (taskid_j.eq.jcur) then 769 call zlacpy("G", ma(taskid_i+1),iwrk, 770 > A(1,jj+1), lda, 771 > work1, ma(taskid_i+1)) 772 end if 773 774c **** broadcast work1 within my row *** 775#ifdef MPI4 776 stupid_msglen = iwrk*ma(taskid_i+1) 777 stupid_taskid = jcur 778 call MPI_Bcast(work1,stupid_msglen, 779 > stupid_complex,stupid_taskid, 780 > stupid_comm_j,stupid_ierr) 781#else 782 call MPI_Bcast(work1,iwrk*ma(taskid_i+1), 783 > MPI_DOUBLE_COMPLEX,jcur,comm_j,ierr) 784#endif 785 786 787c if ((iwrk.gt.0) .and. 788c > (nb(taskid_j+1).gt.0).and. 789c > (ma(taskid_i+1).gt.0)) 790 if ((iwrk.gt.0) .and. 791 > (nb(taskid_j+1).gt.0)) 792 > call zgemm('C','N',iwrk,nb(taskid_j+1),ma(taskid_i+1), 793 > alpha, 794 > work1, ma(taskid_i+1), 795 > B, ldb, 796 > dcmplx(0.0d0,0.0d0), 797 > work2, iwrk) 798 799* **** iwrk*nc(taskid_j+1) submatrix ==0 **** 800 else 801 call dcopy(2*nc(taskid_j+1)*iwrk,0.0d0,0,work2,1) 802 end if 803 804 805c **** summ to node that holds current rows of C **** 806#ifdef MPI4 807 stupid_msglen = nc(taskid_j+1)*iwrk 808 stupid_taskid = icur 809 call MPI_Reduce(work2,work1,stupid_msglen, 810 > stupid_complex,stupid_sum, 811 > stupid_taskid,stupid_comm_i,stupid_ierr) 812#else 813 call MPI_Reduce(work2,work1,nc(taskid_j+1)*iwrk, 814 > MPI_DOUBLE_COMPLEX,MPI_SUM,icur,comm_i,ierr) 815#endif 816 817 818c **** add to current rows of C **** 819 if (taskid_i.eq.icur) then 820 shift = 1 821 do i=ii,(ii+iwrk-1) 822 call daxpy(2*nc(taskid_j+1),1.0d0,work1(shift),iwrk, 823 > C(i+1,1),mc(taskid_i+1)) 824 shift = shift + 1 825 end do 826 end if 827 828 ii = ii + iwrk 829 jj = jj + iwrk 830 kk = kk + iwrk 831 832 if (jj.ge.na(jcur+1)) then 833 jcur = jcur + 1 834 jj = 0 835 end if 836 if (ii.ge.mc(icur+1)) then 837 icur = icur + 1 838 ii = 0 839 end if 840 841 end do 842 843 844 return 845 end 846 847 848* *********************************** 849* * * 850* * CMatrix_zgemm3 * 851* * * 852* *********************************** 853 854 subroutine CMatrix_zgemm3(m,n,k,nblock, 855 > alpha, 856 > A,lda,ma,na, 857 > B,ldb,mb,nb, 858 > beta, 859 > C,ldc,mc,nc, 860 > taskid_i,taskid_j, 861 > np_i,np_j, 862 > comm_i, comm_j, 863 > work1,work2) 864 implicit none 865 integer m,n,k,nblock 866 complex*16 alpha 867 868 integer lda,ma(*),na(*) 869 complex*16 A(lda,*) 870 871 integer ldb,mb(*),nb(*) 872 complex*16 B(ldb,*) 873 874 complex*16 beta 875 876 integer ldc,mc(*),nc(*) 877 complex*16 C(ldc,*) 878 879 integer taskid_i,taskid_j 880 integer np_i,np_j 881 integer comm_i,comm_j 882 883 complex*16 work1(*),work2(*) 884 885#ifdef MPI4 886#include "stupid_mpi4.fh" 887#else 888#include "mpif.h" 889#endif 890 891 892* **** local variables **** 893 logical docalc1,docalc2 894 integer i,j,ii,jj 895 integer kk,iwrk,icur,jcur,ierr,shift 896 real*8 dum 897 898 899 do j=1,nc(taskid_j+1) 900 do i=1,mc(taskid_i+1) 901 C(i,j) = beta*C(i,j) 902 end do 903 end do 904 905 ii = 0 906 jj = 0 907 kk = 0 908 icur = 0 909 jcur = 0 910 do while (kk.lt.n) 911 iwrk = min(nblock, mb(icur+1)-ii) 912 iwrk = min(iwrk, nc(jcur+1)-jj) 913 914 915 if (taskid_i.eq.icur) then 916 call zlacpy("G", iwrk,nb(taskid_j+1), 917 > B(ii+1,1), ldb, 918 > work2, iwrk) 919 end if 920 921#ifdef MPI4 922 stupid_msglen = iwrk*nb(taskid_j+1) 923 stupid_taskid = icur 924 call MPI_Bcast(work2,stupid_msglen,stupid_complex, 925 > stupid_taskid,stupid_comm_i,stupid_ierr) 926#else 927 call MPI_Bcast(work2,iwrk*nb(taskid_j+1),MPI_DOUBLE_COMPLEX, 928 > icur,comm_i,ierr) 929#endif 930 931 if ((iwrk.gt.0) .and. 932 > (na(taskid_j+1).gt.0).and. 933 > (mc(taskid_i+1).gt.0)) 934 > call zgemm('N','C',mc(taskid_i+1),iwrk,na(taskid_j+1), 935 > alpha, 936 > A, lda, 937 > work2, iwrk, 938 > dcmplx(0.0d0,0.0d0), 939 > work1, mc(taskid_i+1)) 940 941#ifdef MPI4 942 stupid_msglen = mc(taskid_i+1)*iwrk 943 stupid_taskid = jcur 944 call MPI_Reduce(work1,work2,stupid_msglen,stupid_complex, 945 > stupid_sum,stupid_taskid, 946 > stupid_comm_j,stupid_ierr) 947#else 948 call MPI_Reduce(work1,work2,mc(taskid_i+1)*iwrk, 949 > MPI_DOUBLE_COMPLEX,MPI_SUM,jcur,comm_j,ierr) 950#endif 951 952 953 if (taskid_j.eq.jcur) then 954 shift = 1 955 do j=jj,(jj+iwrk-1) 956 call daxpy(2*mc(taskid_i+1), 957 > 1.0d0, 958 > work2(shift),1, 959 > C(1,j+1),1) 960 shift = shift + mc(taskid_i+1) 961 end do 962 end if 963 964 ii = ii + iwrk 965 jj = jj + iwrk 966 kk = kk + iwrk 967 968 if (jj.ge.nc(jcur+1)) then 969 jcur = jcur + 1 970 jj = 0 971 end if 972 if (ii.ge.mb(icur+1)) then 973 icur = icur + 1 974 ii = 0 975 end if 976 977 end do 978 979 980 return 981 end 982 983 984 985 986 987* *********************************** 988* * * 989* * CMatrix_tqliq * 990* * * 991* *********************************** 992 993 subroutine CMatrix_tqliq(n,eig,tu, 994 > Q,ldq,mq,nq, 995 > taskid_i,taskid_j, 996 > np_i,np_j, 997 > comm_i, comm_j, 998 > work1,work2) 999 implicit none 1000 integer n 1001 1002 integer ldq,mq(*),nq(*) 1003 complex*16 Q(ldq,*) 1004 real*8 eig(*),tu(*) 1005 1006 integer taskid_i,taskid_j 1007 integer np_i,np_j 1008 integer comm_i,comm_j 1009 complex*16 work1(*),work2(*) 1010 1011#ifdef MPI4 1012#include "stupid_mpi4.fh" 1013#else 1014#include "mpif.h" 1015#endif 1016 1017* **** local variables **** 1018 integer MAXITER 1019 parameter (MAXITER = 100) 1020 real*8 tole 1021 parameter (tole=1.0d-15) 1022 1023 logical notdone 1024 integer i,j,l,m,iter 1025 integer ii,jj0,jj1,jcur0,jcur1,ierr,istat 1026 real*8 b,c,f,g,p,r,s 1027 1028 1029 do l=1,n-1 1030 iter = 0 1031 1032 do m=l,n-1 1033 if (dabs(tu(m)).lt.tole) go to 2 1034 end do 1035 m = n 1036 2 continue 1037 if (m.eq.l) then 1038 notdone = .false. 1039 else 1040 notdone = .true. 1041 end if 1042 do while ((iter.lt.MAXITER).and.(notdone)) 1043 g = (eig(l+1)-eig(l))/(2.0d0*tu(l)) 1044 r = dsqrt(g**2+1.0d0) 1045ccccneed to fixccc g = eig(m)-eig(l)+tu(l)/(g+dsign(r,g)) 1046 s = 1.0d0 1047 c = 1.0d0 1048 p = 0.0d0 1049 do i = m-1,l,-1 1050 f = s*tu(i) 1051 b = c*tu(i) 1052 if (dabs(f).ge.dabs(g)) then 1053 c = g/f 1054 r = dsqrt(c**2+1.0d0) 1055 tu(i+1) = f*r 1056 s = 1/r 1057 c = c*s 1058 else 1059 s = f/g 1060 r = dsqrt(s**2+1.0d0) 1061 tu(i+1) = g*r 1062 c = 1/r 1063 s = s*c 1064 end if 1065 g = eig(i+1)-p 1066 r = (eig(i)-g)*s + 2.0d0*c*b 1067 p = s*r 1068 eig(i+1) = g+p 1069 g = c*r-b 1070 1071 1072* **** update eigenvectors **** 1073 jcur0 = 0 1074 jj0 = 1 1075 do j=1,i-1 1076 jj0 = jj0 + 1 1077 if (jj0.gt.nq(jcur0+1)) then 1078 jcur0 = jcur0 + 1 1079 jj0 = 1 1080 end if 1081 end do 1082 jcur1 = jcur0 1083 jj1 = jj0 + 1 1084 if (jj1.gt.nq(jcur1+1)) then 1085 jcur1 = jcur1 + 1 1086 jj1 = 1 1087 end if 1088 1089 if (jcur0.eq.taskid_j) 1090 > call zcopy(mq(taskid_i+1),Q(1,jj0),1,work1,1) 1091 if (jcur1.eq.taskid_j) 1092 > call zcopy(mq(taskid_i+1),Q(1,jj1),1,work2,1) 1093 1094#ifdef MPI4 1095 stupid_msglen = mq(taskid_i+1) 1096 stupid_taskid = jcur0 1097 stupid_type = jcur1 1098 call MPI_Bcast(work1,stupid_msglen,stupid_complex, 1099 > stupid_taskid,stupid_comm_j,stupid_ierr) 1100 call MPI_Bcast(work2,stupid_msglen,stupid_complex, 1101 > stupid_type,stupid_comm_j,stupid_ierr) 1102#else 1103 call MPI_Bcast(work1,mq(taskid_i+1),MPI_DOUBLE_COMPLEX, 1104 > jcur0,comm_j,ierr) 1105 call MPI_Bcast(work2,mq(taskid_i+1),MPI_DOUBLE_COMPLEX, 1106 > jcur1,comm_j,ierr) 1107#endif 1108 1109 if (jcur0.eq.taskid_j) then 1110 do ii=1,mq(taskid_i+1) 1111 Q(ii,jj0) = c*Q(ii,jj0) - s*work2(ii) 1112 end do 1113 end if 1114 1115 if (jcur1.eq.taskid_j) then 1116 do ii=1,mq(taskid_i+1) 1117 Q(ii,jj1) = c*Q(ii,jj1) + s*work1(ii) 1118 end do 1119 end if 1120 1121 end do 1122 eig(l) = eig(l) - p 1123 tu(l) = g 1124 tu(m) = 0.0d0 1125 1126 1127 do m=l,n-1 1128 if (dabs(tu(m)).lt.tole) go to 3 1129 end do 1130 m = n 1131 3 continue 1132 if (m.eq.l) then 1133 notdone = .false. 1134 else 1135 notdone = .true. 1136 end if 1137 1138 iter = iter + 1 1139 end do 1140 1141 end do 1142 1143 return 1144 end 1145 1146 1147 1148* *********************************** 1149* * * 1150* * CMatrix_houseq * 1151* * * 1152* *********************************** 1153 subroutine CMatrix_houseq(jcol, 1154 > n, 1155 > A,V,Q,lda,ma,na, 1156 > taskid_i,taskid_j, 1157 > np_i,np_j, 1158 > comm_i, comm_j, 1159 > work1,work2) 1160 implicit none 1161 integer jcol,n 1162 1163 integer lda,ma(*),na(*) 1164 complex*16 A(lda,*),V(lda,*),Q(lda,*) 1165 1166 integer taskid_i,taskid_j 1167 integer np_i,np_j 1168 integer comm_i,comm_j 1169 1170 complex*16 work1(*),work2(*) 1171 1172#ifdef MPI4 1173#include "stupid_mpi4.fh" 1174#else 1175#include "mpif.h" 1176#endif 1177 1178* **** local variables **** 1179 integer i,j,ii,jj 1180 integer kk,iwrk,icur,jcur,ierr,shift 1181 integer ii0,icur0,ii1,icur1,ii2,icur2 1182 integer jj0,jcur0,jj1,jcur1 1183 real*8 beta,mu0,mu,v20,v2 1184 1185 call dcopy(2*ma(taskid_i+1)*na(taskid_j+1),0.0d0,0,V,1) 1186 1187 jcur0 = 0 1188 jj0 = 1 1189 do j=1,jcol-1 1190 jj0 = jj0 + 1 1191 if (jj0.gt.na(jcur0+1)) then 1192 jcur0 = jcur0 + 1 1193 jj0 = 1 1194 end if 1195 end do 1196 jcur1 = jcur0 1197 jj1 = jj0 + 1 1198 if (jj1.gt.na(jcur1+1)) then 1199 jcur1 = jcur1 + 1 1200 jj1 = 1 1201 end if 1202 1203 icur0 = 0 1204 ii0 = 1 1205 do i=1,jcol-1 1206 ii0 = ii0 + 1 1207 if (ii0.gt.ma(icur0+1)) then 1208 icur0 = icur0 + 1 1209 ii0 = 1 1210 end if 1211 end do 1212 icur1 = icur0 1213 ii1 = ii0 + 1 1214 if (ii1.gt.ma(icur1+1)) then 1215 icur1 = icur1 + 1 1216 ii1 = 1 1217 end if 1218 icur2 = icur1 1219 ii2 = ii1 + 1 1220 if (ii2.gt.ma(icur2+1)) then 1221 icur2 = icur2 + 1 1222 ii2 = 1 1223 end if 1224 1225 if (jcur0.eq.taskid_j) then 1226 1227 icur = icur1 1228 ii = ii1 1229 do i=jcol+1,n 1230 if (icur.eq.taskid_i) V(ii,jj0) = A(ii,jj0) 1231 ii = ii + 1 1232 if (ii.gt.ma(icur+1)) then 1233 icur = icur + 1 1234 ii = 1 1235 end if 1236 end do 1237 1238 1239 mu0 = 0.0d0 1240 icur = icur1 1241 ii = ii1 1242 do i=jcol+1,n 1243 if (icur.eq.taskid_i) 1244 > mu0 = mu0 + dconjg(V(ii,jj0))*V(ii,jj0) 1245 ii = ii + 1 1246 if (ii.gt.ma(icur+1)) then 1247 icur = icur + 1 1248 ii = 1 1249 end if 1250 end do 1251#ifdef MPI4 1252 stupid_msglen = 1 1253 call MPI_AllReduce(mu0,mu,stupid_msglen, 1254 > stupid_double,stupid_sum, 1255 > stupid_comm_i,stupid_ierr) 1256#else 1257 call MPI_AllReduce(mu0,mu,1, 1258 > MPI_DOUBLE_PRECISION,MPI_SUM,comm_i,ierr) 1259#endif 1260 mu = dsqrt(mu) 1261 1262 1263 if (mu.ne.0.0d0) then 1264cccc need to fix if (icur1.eq.taskid_i) 1265cccc need to fix > beta = V(ii1,jj0) + dsign(mu,V(ii1,jj0)) 1266#ifdef MPI4 1267 stupid_msglen = 1 1268 stupid_taskid = icur1 1269 call MPI_Bcast(beta,stupid_msglen,stupid_double, 1270 > stupid_taskid,stupid_comm_i,stupid_ierr) 1271#else 1272 call MPI_Bcast(beta,1,MPI_DOUBLE_PRECISION, 1273 > icur1,comm_i,ierr) 1274#endif 1275 1276 icur = icur2 1277 ii = ii2 1278 do i=jcol+2,n 1279 if (icur.eq.taskid_i) V(ii,jj0) = V(ii,jj0)/beta 1280 ii = ii + 1 1281 if (ii.gt.ma(icur+1)) then 1282 icur = icur + 1 1283 ii = 1 1284 end if 1285 end do 1286 end if 1287 if (icur1.eq.taskid_i) V(ii1,jj0) = dcmplx(1.0d0,0.0d0) 1288 if (icur0.eq.taskid_i) V(ii0,jj0) = dcmplx(0.0d0,0.0d0) 1289 1290 v20 = 0.0d0 1291 icur = icur0 1292 ii = ii0 1293 do i=jcol,n 1294 if (icur.eq.taskid_i) 1295 > v20 = v20 + dcmplx(V(ii,jj0))*V(ii,jj0) 1296 ii = ii + 1 1297 if (ii.gt.ma(icur+1)) then 1298 icur = icur + 1 1299 ii = 1 1300 end if 1301 end do 1302#ifdef MPI4 1303 stupid_msglen = 1 1304 call MPI_AllReduce(v20,v2,stupid_msglen, 1305 > stupid_double,stupid_sum, 1306 > stupid_comm_i,stupid_ierr) 1307#else 1308 call MPI_AllReduce(v20,v2,1, 1309 > MPI_DOUBLE_PRECISION,MPI_SUM,comm_i,ierr) 1310#endif 1311 v2 = 2.0d0/v2 1312 end if 1313#ifdef MPI4 1314 stupid_msglen = 1 1315 stupid_taskid = jcur0 1316 call MPI_Bcast(v2,stupid_msglen,stupid_double, 1317 > stupid_taskid,stupid_comm_j,stupid_ierr) 1318#else 1319 call MPI_Bcast(v2,1,MPI_DOUBLE_PRECISION, 1320 > jcur0,comm_j,ierr) 1321#endif 1322 1323 call CMatrix_eye(n,n,dcmplx(1.0d0,0.0d0), 1324 > Q,lda,ma,na,taskid_i,taskid_j) 1325 call CMatrix_zgemm3(n,n,n,64, 1326 > dcmplx(-v2,0.0d0), 1327 > V,ma(taskid_i+1), ma,na, 1328 > V,ma(taskid_i+1), ma,na, 1329 > dcmplx(1.0d0,0.0d0), 1330 > Q,ma(taskid_i+1), ma,na, 1331 > taskid_i,taskid_j, 1332 > np_i,np_j, 1333 > comm_i, comm_j, 1334 > work1,work2) 1335 1336 1337 return 1338 end 1339 1340 1341* *********************************** 1342* * * 1343* * CMatrix_eigsrtq * 1344* * * 1345* *********************************** 1346 subroutine CMatrix_eigsrtq(n,eig, 1347 > Q,ldq,mq,nq, 1348 > taskid_i,taskid_j, 1349 > np_i,np_j, 1350 > comm_i, comm_j, 1351 > work1,work2) 1352 implicit none 1353 integer n 1354 1355 integer ldq,mq(*),nq(*) 1356 real*8 eig(*) 1357 complex*16 Q(ldq,*) 1358 1359 integer taskid_i,taskid_j 1360 integer np_i,np_j 1361 integer comm_i,comm_j 1362 complex*16 work1(*),work2(*) 1363 1364#ifdef MPI4 1365#include "stupid_mpi4.fh" 1366#else 1367#include "mpif.h" 1368#endif 1369 1370 1371* **** local variables **** 1372 logical notdone 1373 integer i,j,k,l,m,iter 1374 integer ii,jj0,jj1,jcur0,jcur1,ierr,istat 1375 real*8 b,c,f,g,p,r,s 1376 1377 1378 do i=1,n-1 1379 k = i 1380 p = eig(i) 1381 do j=i+1,n 1382 if (eig(j).ge.p) then 1383 k = j 1384 p = eig(j) 1385 end if 1386 end do 1387 if (k.ne.i) then 1388 eig(k) = eig(i) 1389 eig(i) = p 1390 1391 jcur0 = 0 1392 jj0 = 1 1393 do j=1,i-1 1394 jj0 = jj0 + 1 1395 if (jj0.gt.nq(jcur0+1)) then 1396 jcur0 = jcur0 + 1 1397 jj0 = 1 1398 end if 1399 end do 1400 jcur1 = 0 1401 jj1 = 1 1402 do j=1,k-1 1403 jj1 = jj1 + 1 1404 if (jj1.gt.nq(jcur1+1)) then 1405 jcur1 = jcur1 + 1 1406 jj1 = 1 1407 end if 1408 end do 1409 1410 1411 if (jcur0.eq.taskid_j) 1412 > call zcopy(mq(taskid_i+1),Q(1,jj0),1,work1,1) 1413 if (jcur1.eq.taskid_j) 1414 > call zcopy(mq(taskid_i+1),Q(1,jj1),1,work2,1) 1415 1416#ifdef MPI4 1417 stupid_msglen = mq(taskid_i+1) 1418 stupid_taskid = jcur0 1419 stupid_type = jcur1 1420 call MPI_Bcast(work1,stupid_msglen,stupid_complex, 1421 > stupid_taskid,stupid_comm_j,stupid_ierr) 1422 call MPI_Bcast(work2,stupid_msglen,stupid_complex, 1423 > stupid_type,stupid_comm_j,stupid_ierr) 1424#else 1425 call MPI_Bcast(work1,mq(taskid_i+1),MPI_DOUBLE_COMPLEX, 1426 > jcur0,comm_j,ierr) 1427 call MPI_Bcast(work2,mq(taskid_i+1),MPI_DOUBLE_COMPLEX, 1428 > jcur1,comm_j,ierr) 1429#endif 1430 1431 if (jcur0.eq.taskid_j) 1432 > call zcopy(mq(taskid_i+1),work2,1,Q(1,jj0),1) 1433 if (jcur1.eq.taskid_j) 1434 > call zcopy(mq(taskid_i+1),work1,1,Q(1,jj1),1) 1435 1436 end if 1437 1438 end do 1439 1440 return 1441 end 1442 1443 1444* *********************************** 1445* * * 1446* * CMatrix_getdiags * 1447* * * 1448* *********************************** 1449 subroutine CMatrix_getdiags(n,eig,tu, 1450 > A,lda,ma,na, 1451 > taskid_i,taskid_j, 1452 > np_i,np_j, 1453 > comm_i,comm_j, 1454 > work1) 1455 implicit none 1456 integer n 1457 1458 integer lda,ma(*),na(*) 1459 real*8 eig(*),tu(*) 1460 complex*16 A(lda,*) 1461 1462 integer taskid_i,taskid_j 1463 integer np_i,np_j 1464 integer comm_i,comm_j 1465 real*8 work1(*) 1466 1467#ifdef MPI4 1468#include "stupid_mpi4.fh" 1469#else 1470#include "mpif.h" 1471#endif 1472 1473 1474 1475* **** local variables **** 1476 integer i,j,ii,jj,is,ie,js,je 1477 integer icur,jcur,ierr 1478 1479* ************************** 1480* **** gather diagonals **** 1481* ************************** 1482 call dcopy(n,0.0d0,0,work1,1) 1483 call dcopy(n,0.0d0,0,eig,1) 1484 js = 1 1485 do jcur = 0,taskid_j-1 1486 js = js + na(jcur+1) 1487 end do 1488 jcur = taskid_j 1489 je = js-1 + na(jcur+1) 1490 jj = 1 1491 do j=js,je 1492 1493 icur=0 1494 ii = 1 1495 do i=1,j-1 1496 ii = ii + 1 1497 if (ii.gt.ma(icur+1)) then 1498 icur = icur + 1 1499 ii = 1 1500 end if 1501 end do 1502 work1(j) = dble(A(ii,jj)) 1503 1504#ifdef MPI4 1505 stupid_msglen = 1 1506 stupid_taskid = icur 1507 call MPI_Bcast(work1(j),stupid_msglen,stupid_double, 1508 > stupid_taskid,stupid_comm_i,stupid_ierr) 1509#else 1510 call MPI_Bcast(work1(j),1,MPI_DOUBLE_PRECISION, 1511 > icur,comm_i,ierr) 1512#endif 1513 jj = jj + 1 1514 if (jj.gt.na(jcur+1)) then 1515 jcur = jcur + 1 1516 jj = 1 1517 end if 1518 end do 1519 1520#ifdef MPI4 1521 stupid_msglen = n 1522 call MPI_AllReduce(work1,eig,stupid_msglen, 1523 > stupid_double,stupid_sum, 1524 > stupid_comm_j,stupid_ierr) 1525#else 1526 call MPI_AllReduce(work1,eig,n, 1527 > MPI_DOUBLE_PRECISION,MPI_SUM,comm_j,ierr) 1528#endif 1529 1530 1531* ****************************** 1532* **** gather off-diagonals **** 1533* ****************************** 1534 call dcopy(n,0.0d0,0,work1,1) 1535 call dcopy(n,0.0d0,0,tu,1) 1536 is = 1 1537 do icur = 0,taskid_i-1 1538 is = is + ma(icur+1) 1539 end do 1540 icur = taskid_i 1541 ie = is-1 + ma(icur+1) 1542 if (ie.ge.n) ie=ie-1 1543 ii = 1 1544 do i=is,ie 1545 1546 jcur=0 1547 jj = 1 1548 do j=1,i 1549 jj = jj + 1 1550 if (jj.gt.na(jcur+1)) then 1551 jcur = jcur + 1 1552 jj = 1 1553 end if 1554 end do 1555 work1(i) = dble(A(ii,jj)) 1556#ifdef MPI4 1557 stupid_msglen = 1 1558 stupid_taskid = jcur 1559 call MPI_Bcast(work1(i),stupid_msglen,stupid_double, 1560 > stupid_taskid,stupid_comm_j,stupid_ierr) 1561#else 1562 call MPI_Bcast(work1(i),1,MPI_DOUBLE_PRECISION, 1563 > jcur,comm_j,ierr) 1564#endif 1565 ii = ii + 1 1566 if (ii.gt.ma(icur+1)) then 1567 icur = icur + 1 1568 ii = 1 1569 end if 1570 end do 1571#ifdef MPI4 1572 stupid_msglen = n-1 1573 call MPI_AllReduce(work1,tu,stupid_msglen, 1574 > stupid_double,stupid_sum, 1575 > stupid_comm_i,stupid_ierr) 1576#else 1577 call MPI_AllReduce(work1,tu,n-1, 1578 > MPI_DOUBLE_PRECISION,MPI_SUM,comm_i,ierr) 1579#endif 1580 return 1581 end 1582 1583 1584 1585 1586* *********************************** 1587* * * 1588* * CMatrix_MaxAll * 1589* * * 1590* *********************************** 1591 subroutine CMatrix_MaxAll(sum) 1592c implicit none 1593 complex*16 sum 1594 1595#ifdef MPI4 1596#include "stupid_mpi4.fh" 1597#else 1598#include "mpif.h" 1599#endif 1600 1601 1602 integer msglen,mpierr,np 1603 complex*16 sumall 1604 1605* **** external functions **** 1606 1607 call Parallel_np(np) 1608 if (np.gt.1) then 1609 msglen = 1 1610#ifdef MPI4 1611 stupid_msglen = 1 1612 call MPI_Allreduce(sum,sumall,stupid_msglen,stupid_complex, 1613 > stupid_max,stupid_world,stupid_ierr) 1614#else 1615 call MPI_Allreduce(sum,sumall,msglen,MPI_DOUBLE_COMPLEX, 1616 > MPI_MAX,MPI_COMM_WORLD,mpierr) 1617#endif 1618 sum = sumall 1619 end if 1620 1621 return 1622 end 1623 1624 1625 1626 1627* *********************************** 1628* * * 1629* * CMatrix_SumAll * 1630* * * 1631* *********************************** 1632 subroutine CMatrix_SumAll(sum) 1633c implicit none 1634 complex*16 sum 1635 1636#ifdef MPI4 1637#include "stupid_mpi4.fh" 1638#else 1639#include "mpif.h" 1640#endif 1641 1642 1643 integer msglen,mpierr,np 1644 complex*16 sumall 1645 1646* **** external functions **** 1647 1648 call Parallel_np(np) 1649 if (np.gt.1) then 1650 msglen = 1 1651#ifdef MPI4 1652 stupid_msglen = msglen 1653 call MPI_Allreduce(sum,sumall,stupid_msglen,stupid_complex, 1654 > stupid_sum,stupid_world,stupid_ierr) 1655#else 1656 call MPI_Allreduce(sum,sumall,msglen,MPI_DOUBLE_COMPLEX, 1657 > MPI_SUM,MPI_COMM_WORLD,mpierr) 1658#endif 1659 sum = sumall 1660 end if 1661 1662 return 1663 end 1664 1665 1666 1667 1668 1669* *********************************** 1670* * * 1671* * CMatrix_mm_transpose * 1672* * * 1673* *********************************** 1674 1675 subroutine CMatrix_mm_transpose(n,A,B,ldq,mq,nq) 1676 implicit none 1677 integer n 1678 integer ldq,mq(*),nq(*) 1679 complex*16 A(ldq,*) 1680 complex*16 B(ldq,*) 1681 1682#include "bafdecls.fh" 1683#include "errquit.fh" 1684#include "mpif.h" 1685 1686#ifdef MPI4 1687#include "stupid_mpi4.fh" 1688#endif 1689 1690* **** local variables **** 1691 logical value 1692 integer taskid 1693 integer i,j 1694 integer ii,jj,rr,ss 1695 integer icur,jcur,rcur,scur 1696 integer psend,precv,msglen,msgtype,mpierr,status(2) 1697 1698* **** external functions **** 1699 integer Parallel2d_convert_taskid_ij 1700 external Parallel2d_convert_taskid_ij 1701 1702 1703 call Parallel_taskid(taskid) 1704 msglen = 1 1705 msgtype = 1 1706 1707* **** allocate memory **** 1708 value = BA_push_get(mt_int,MPI_STATUS_SIZE, 1709 > 'status',status(2),status(1)) 1710 if (.not. value) 1711 > call errquit(' CMatrix_m_transpose:out of stack',0,MA_ERR) 1712 1713 1714 jj = 1 1715 jcur = 0 1716 rr = 1 1717 rcur = 0 1718 do j=1,n 1719 ii = 1 1720 icur = 0 1721 ss = 1 1722 scur = 0 1723 do i=1,n 1724 1725 1726 psend = Parallel2d_convert_taskid_ij(icur,jcur) 1727 precv = Parallel2d_convert_taskid_ij(rcur,scur) 1728 1729 if (psend.eq.precv) then 1730 if (psend.eq.taskid) B(rr,ss) = A(ii,jj) 1731 else 1732#ifdef MPI4 1733 !**** send **** 1734 if (psend.eq.taskid) then 1735 stupid_msglen = 1 1736 stupid_type = msgtype 1737 stupid_taskid = precv 1738 call MPI_SEND(A(ii,jj), 1739 > stupid_msglen,stupid_complex, 1740 > stupid_taskid, 1741 > stupid_type,stupid_world,stupid_ierr) 1742 end if 1743 1744 !**** recv **** 1745 if (precv.eq.taskid) then 1746 stupid_msglen = msglen 1747 stupid_type = msgtype 1748 stupid_taskid = psend 1749 call MPI_RECV(B(rr,ss), 1750 > stupid_msglen,stupid_complex, 1751 > stupid_taskid, 1752 > stupid_type,stupid_world, 1753 > int_mb(status(1)),stupid_ierr) 1754 end if 1755 1756#else 1757 if (psend.eq.taskid) 1758 > call MPI_SEND(A(ii,jj),1, 1759 > MPI_DOUBLE_COMPLEX,precv, 1760 > msgtype,MPI_COMM_WORLD,mpierr) 1761 1762 !**** recv **** 1763 if (precv.eq.taskid) 1764 > call MPI_RECV(B(rr,ss),msglen, 1765 > MPI_DOUBLE_COMPLEX,psend, 1766 > msgtype,MPI_COMM_WORLD, 1767 > int_mb(status(1)),mpierr) 1768#endif 1769 end if 1770 1771 1772 ii = ii + 1 1773 if (ii.gt.mq(icur+1)) then 1774 icur = icur + 1 1775 ii = 1 1776 end if 1777 1778 ss = ss + 1 1779 if (ss.gt.nq(scur+1)) then 1780 scur = scur + 1 1781 ss = 1 1782 end if 1783 1784 end do 1785 1786 jj = jj + 1 1787 if (jj.gt.nq(jcur+1)) then 1788 jcur = jcur + 1 1789 jj = 1 1790 end if 1791 1792 rr = rr + 1 1793 if (rr.gt.mq(rcur+1)) then 1794 rcur = rcur + 1 1795 rr = 1 1796 end if 1797 1798 end do 1799 1800 !*** deallocate memory *** 1801 value = BA_pop_stack(status(2)) 1802 if (.not. value) 1803 > call errquit(' CMatrix_m_transpose:popping stack',0,MA_ERR) 1804 1805 return 1806 end 1807 1808 1809 1810 1811 1812 1813* *********************************** 1814* * * 1815* * CMatrix_combo_zgemm2 * 1816* * * 1817* *********************************** 1818 subroutine CMatrix_combo_zgemm2(m,n,k,nblock, 1819 > alpha, 1820 > A,B,lda,ma,na, 1821 > beta, 1822 > C,ldc,ldc2,mc,nc, 1823 > taskid_i,taskid_j, 1824 > np_i,np_j, 1825 > comm_i, comm_j, 1826 > work1,work2) 1827 implicit none 1828 integer m,n,k,nblock 1829 complex*16 alpha 1830 1831 integer lda,ma(*),na(*) 1832 complex*16 A(lda,*) 1833 complex*16 B(lda,*) 1834 1835 complex*16 beta 1836 1837 integer ldc,ldc2,mc(*),nc(*) 1838 complex*16 C(ldc,ldc2,3) 1839 1840 integer taskid_i,taskid_j 1841 integer np_i,np_j 1842 integer comm_i,comm_j 1843 1844 complex*16 work1(*),work2(*) 1845 1846 1847#ifdef MPI4 1848#include "stupid_mpi4.fh" 1849#else 1850#include "mpif.h" 1851#endif 1852 1853 1854* **** local variables **** 1855 logical docalc1,docalc2 1856 integer i,j,ii,jj 1857 integer kk,iwrk,icur,jcur,ierr,shift,shft,shft2,shft3 1858 1859 do kk=1,3 1860 do j=1,nc(taskid_j+1) 1861 do i=1,mc(taskid_i+1) 1862 C(i,j,kk) = beta*C(i,j,kk) 1863 end do 1864 end do 1865 end do 1866 1867 ii = 0 1868 jj = 0 1869 kk = 0 1870 icur = 0 1871 jcur = 0 1872c **** loop over all row pannels of C *** 1873 do while (kk.lt.m) 1874 iwrk = min(nblock, mc(icur+1)-ii) 1875 iwrk = min(iwrk, na(jcur+1)-jj) 1876 1877 1878* **** iwrk*nc(taskid_j+1) submatrix !=0 **** 1879 if (ma(taskid_i+1).gt.0) then 1880 1881 shft = iwrk*ma(taskid_i+1) 1882 shft2 = iwrk*nc(taskid_j+1) 1883 shft3 = shft2+shft2 1884 1885* **** pack current iwrk columns of A into work1 *** 1886 if (taskid_j.eq.jcur) then 1887 call zlacpy("G", ma(taskid_i+1),iwrk, 1888 > A(1,jj+1), lda, 1889 > work1, ma(taskid_i+1)) 1890 1891 call zlacpy("G", ma(taskid_i+1),iwrk, 1892 > B(1,jj+1), lda, 1893 > work1(1+shft), ma(taskid_i+1)) 1894 end if 1895 1896c **** broadcast work1 within my row *** 1897#ifdef MPI4 1898 stupid_msglen = 2*iwrk*ma(taskid_i+1) 1899 stupid_taskid = jcur 1900 call MPI_Bcast(work1,stupid_msglen, 1901 > stupid_complex,stupid_taskid, 1902 > stupid_comm_j,stupid_ierr) 1903#else 1904 call MPI_Bcast(work1,2*iwrk*ma(taskid_i+1), 1905 > MPI_DOUBLE_COMPLEX,jcur,comm_j,ierr) 1906#endif 1907 1908 1909 if ((iwrk.gt.0) .and. 1910 > (na(taskid_j+1).gt.0)) then 1911 1912 call zgemm('C','N',iwrk,na(taskid_j+1),ma(taskid_i+1), 1913 > alpha, 1914 > work1, ma(taskid_i+1), 1915 > A, lda, 1916 > dcmplx(0.0d0,0.0d0), 1917 > work2, iwrk) 1918 call zgemm('C','N',iwrk,na(taskid_j+1),ma(taskid_i+1), 1919 > alpha, 1920 > work1, ma(taskid_i+1), 1921 > B, lda, 1922 > dcmplx(0.0d0,0.0d0), 1923 > work2(1+shft2), iwrk) 1924 call zgemm('C','N',iwrk,na(taskid_j+1),ma(taskid_i+1), 1925 > alpha, 1926 > work1(1+shft), ma(taskid_i+1), 1927 > B, lda, 1928 > dcmplx(0.0d0,0.0d0), 1929 > work2(1+shft3), iwrk) 1930 end if 1931 1932* **** iwrk*nc(taskid_j+1) submatrix ==0 **** 1933 else 1934 call dcopy(3*nc(taskid_j+1)*iwrk,0.0d0,0,work2,1) 1935 end if 1936 1937 1938c **** summ to node that holds current rows of C **** 1939#ifdef MPI4 1940 stupid_msglen = 3*nc(taskid_j+1)*iwrk 1941 stupid_taskid = icur 1942 call MPI_Reduce(work2,work1,stupid_msglen, 1943 > stupid_complex,stupid_sum, 1944 > stupid_taskid,stupid_comm_i,stupid_ierr) 1945#else 1946 call MPI_Reduce(work2,work1,3*nc(taskid_j+1)*iwrk, 1947 > MPI_DOUBLE_COMPLEX,MPI_SUM,icur,comm_i,ierr) 1948#endif 1949 1950 1951c **** add to current rows of C **** 1952 if (taskid_i.eq.icur) then 1953 shift = 1 1954 do i=ii,(ii+iwrk-1) 1955 call daxpy(2*nc(taskid_j+1),1.0d0,work1(shift),iwrk, 1956 > C(i+1,1,1),mc(taskid_i+1)) 1957 call daxpy(2*nc(taskid_j+1),1.0d0,work1(shift+shft2),iwrk, 1958 > C(i+1,1,2),mc(taskid_i+1)) 1959 call daxpy(2*nc(taskid_j+1),1.0d0,work1(shift+shft3),iwrk, 1960 > C(i+1,1,3),mc(taskid_i+1)) 1961 shift = shift + 1 1962 end do 1963 end if 1964 1965 ii = ii + iwrk 1966 jj = jj + iwrk 1967 kk = kk + iwrk 1968 1969 if (jj.ge.na(jcur+1)) then 1970 jcur = jcur + 1 1971 jj = 0 1972 end if 1973 if (ii.ge.mc(icur+1)) then 1974 icur = icur + 1 1975 ii = 0 1976 end if 1977 1978 end do 1979 1980 return 1981 end 1982 1983