1C /* Deck so_eres */ 2 SUBROUTINE SO_ERES(MODEL, NOLDTR, NNEWTR, DENSIJ, LDENSIJ, 3 & DENSAB, LDENSAB, T2MP, LT2MP, FOCKD, 4 & LFOCKD, DENSAI, LDENSAI, NIT, ISYMTR, 5 & IDTYPE, 6#ifdef VAR_MPI 7 & AssignedIndices, maxnumjobs, 8#endif 9 & WORK, LWORK) 10C 11C This routine is part of the atomic integral direct SOPPA program. 12C 13C Keld Bak, October 1995 14C Stephan P. A. Sauer: 10.11.2003: merge with Dalton 2.0 15C Frederik Beyer & Stephan P. A. Sauer: 27.08.2013: call to ERI corrected 16C 17C PURPOSE: Driver routine for making a linear transformation of 18C a trialvector with the SOPPA hessian matricx E[2]. 19C The trial vector consists of four parts TR1E, TR1D, 20C TR2E, and TR2D. E refers to excitations and D to 21C de-excitations. 1 refer to the one-particle part and 22C 2 to the two-particle part. The linear transformed 23C trialvector is refered to as the resultvector and is 24C kept in four corresponding arrays. For the linear 25C transformation with E[2] the result vector is in RES1E, 26C RES1D, RES2E, and RES2D. 27C The linear transformation is driven over atomic orbitals, 28C and E[2] is not constructed explicitly. 29C 30 use so_info, only: so_singles_first, so_singles_second, 31 & so_has_doubles, so_needs_densai, 32 & so_double_correction, 33 & sop_mp2ai_done 34 35#ifdef VAR_MPI 36 use so_parutils, only: soppa_comm_active, soppa_num_active, 37 & soppa_nint, my_mpi_integer, sop_master 38#ifdef USE_MPI_MOD_F90 39 use mpi 40#endif 41#endif 42 43#include "implicit.h" 44 45#ifdef VAR_MPI 46#ifndef USE_MPI_MOD_F90 47#include "mpif.h" 48#endif 49#endif 50 51#include "priunit.h" 52#include "maxorb.h" 53#include "maxash.h" 54#include "mxcent.h" 55#include "aovec.h" 56#include "iratdef.h" 57C 58 PARAMETER (ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0, TWO = 2.0D0) 59 DIMENSION INDEXA(MXCORB) 60 DIMENSION DENSIJ(LDENSIJ), DENSAB(LDENSAB), T2MP(LT2MP) 61 DIMENSION FOCKD(LFOCKD) 62 DIMENSION DENSAI(LDENSAI) !intent(inout) 63 DIMENSION WORK(LWORK) 64 CHARACTER MODEL*5 65 integer :: inewtr, nnewtr, isymd1, ntosym 66 integer :: thisinewtr 67C idtype = 0 for dynamic (handle D part explicitly), 68C idtype = 1 for real, static (Tr1D = - Tr1E) 69C idtype = 2 for imaginary, static ( Tr1D = Tr1E ) 70 integer, intent(in) :: idtype 71 72#ifdef VAR_MPI 73 integer :: maxnumjobs, nloopidx 74 integer :: AssignedIndices(maxnumjobs) 75 integer(kind=MPI_INTEGER_KIND) :: ierr_mpi, count_mpi 76#endif 77C 78#include "ccorb.h" 79#include "infind.h" 80#include "blocks.h" 81#include "ccsdinp.h" 82#include "ccsdsym.h" 83#include "ccsdio.h" 84#include "distcl.h" 85#include "cbieri.h" 86#include "eritap.h" 87#include "soppinf.h" 88 89 90 logical :: singles_first, singles_second, doubles, calc_densai 91C 92C Logical variable which can be set to false if the dexcitation vector is 93C to avoid calculating with only zeroes. Useful for static 94C properties and first iteration of excitation energies. 95 logical :: do_dex 96 logical :: tr2_zero 97#ifdef VAR_MPI 98#include "iprtyp.h" 99#include "infpar.h" 100C integer, save :: numprocs 101 integer :: numprocs 102 logical :: loadbal_dyn 103 double precision :: timeini, timefin 104 integer(mpi_integer_kind) :: LTRTOT_mpi, lrestot, latot 105 integer(mpi_integer_kind) :: my_MPI_REAL8 = MPI_REAL8 106 numprocs = nodtot + 1 107C 108C When to do dynamic load_balancing..? 109C The sooner the better, but is the first iteration close to the 110C average? 111 loadbal_dyn = .false. 112 if (nit .eq. 1) loadbal_dyn = .true. 113#endif 114C 115 do_dex = idtype .eq. 0 116 117 calc_densai = SO_NEEDS_DENSAI(MODEL) .AND. .NOT. SOP_MP2AI_DONE 118C 119C------------------ 120C Add to trace. 121C------------------ 122C 123#ifdef VAR_MPI 124 if (mynum .eq. 0 ) then 125#endif 126 CALL QENTER('SO_ERES') 127#ifdef VAR_MPI 128C Slaves need to zero DENSAI if it is (re)calculated. 129 elseif ( calc_densai ) THEN 130 CALL DZERO(DENSAI,LDENSAI) 131 endif 132#endif 133C 134C------------------------------------------------------------- 135C Determine which terms to incude 136C------------------------------------------------------------- 137C 138 singles_first = so_singles_first(model) 139 singles_second = so_singles_second(model) 140 doubles = so_has_doubles(model) 141 tr2_zero = so_double_correction(model) 142C 143 LT2MPH = LT2MP 144 IT2MPH = 0 145C 146C------------------------------------------------------ 147C Write singlet and triplet T2 amplitudes to output 148C------------------------------------------------------ 149C 150 IF ( IPRSOP. GE. 10 ) THEN 151C 152 CALL AROUND('singlet T2AM in HR_ERES') 153 CALL OUTPUT(T2MP,1,LT2MPH,1,1,LT2MPH,1,1,LUPRI) 154 IF (TRIPLET) THEN 155 CALL AROUND('triplet T2AM in HR_ERES') 156 CALL OUTPUT(T2MP(LT2MPH+1),1,LT2MPH,1,1,LT2MPH,1,1,LUPRI) 157 END IF 158C 159 END IF 160C 161C 162C------------------------------------------------------------------ 163C Determine the symmetry of the result vector from the symmetry 164C of the trial vector ISYMTR, and the opperator symmtry ISYMOP. 165C------------------------------------------------------------------ 166C 167 ISYRES = MULD2H(ISYMOP,ISYMTR) 168C 169C--------------------------------- 170C Work space allocation no. 1. 171C--------------------------------- 172C 173 LCMO = NLAMDT 174C 175 KCMO = 1 176 KEND1 = KCMO + LCMO 177 LWORK1 = LWORK - KEND1 178C 179 CALL SO_MEMMAX ('SO_ERES.1',LWORK1) 180 IF (LWORK1 .LT. 0) CALL STOPIT('SO_ERES.1',' ',KEND1,LWORK) 181C 182C------------------------------------------------------- 183C Get the matrix which contains the MO coefficients. 184C------------------------------------------------------- 185C 186#ifdef VAR_MPI 187C Only master reads... 188 IF (MYNUM .EQ. 0) THEN 189#endif 190 DTIME = SECOND() 191 CALL SO_GETMO(WORK(KCMO),LCMO,WORK(KEND1),LWORK1) 192 DTIME = SECOND() - DTIME 193 SOTIME(1) = SOTIME(1) + DTIME 194#ifdef VAR_MPI 195 ENDIF 196C Should probably use non-blocking collectives, where implemented 197! IF (MPI_VERSION.GE.3) THEN 198! MPI_IBCAST(WORK(KCMO),LCMO, my_MPI_INTEGER, 0, 199! & MPI_COMM_WORLD, ierr_mpi) 200! ELSE 201 count_mpi = LCMO 202 CALL MPI_BCAST(WORK(KCMO), count_mpi, my_MPI_REAL8, SOP_MASTER, 203 & SOPPA_COMM_ACTIVE, ierr_mpi) 204 205! ENDIF 206#endif 207C 208C--------------------------------- 209C Work space allocation no. 2. 210C--------------------------------- 211C 212 LTR1E = NT1AM(ISYMTR) 213 LBTR1E = NT1AO(ISYMTR) 214 LTR1D = NT1AM(ISYMTR) 215 LRES1E = NT1AM(ISYMTR) 216 IF (DO_DEX) THEN 217 LRES1D = NT1AM(ISYMTR) 218 ELSE 219 LRES1D = 0 220 END IF 221 LFOCK = N2BST(ISYRES) 222 LDENS = N2BST(ISYMTR) 223 LBTR1D = NT1AO(ISYMTR) 224 LBTJ1E = NMATAV(ISYMTR) 225 LBTJ1D = NMATAV(ISYMTR) 226 227 IF(DOUBLES) THEN 228 IF (TRIPLET) THEN 229 LTR2E = NT2SQ(ISYMTR) 230 ELSE 231 LTR2E = N2P2HOP(ISYMTR) 232 ENDIF 233 LRES2E = N2P2HOP(ISYMTR) 234 IF (DO_DEX) THEN 235 LTR2D = LTR2E 236 LRES2D = N2P2HOP(ISYMTR) 237 ELSE 238 LTR2D = 0 239 LRES2D = 0 240 ENDIF 241 ELSE 242 LTR2E = 0 243 LTR2D = 0 244 LRES2E = 0 245 LRES2D = 0 246 ENDIF 247 248 IF (singles_second) THEN 249 LAIJ = NRHFT*NRHFT 250 LAAB = NVIRT*NVIRT 251 ELSE 252 LAIJ = 0 253 LAAB = 0 254 ENDIF 255C 256 KTR1E = KEND1 257 KTR1D = KTR1E + LTR1E 258 KTR2E = KTR1D + LTR1D 259 KTR2D = KTR2E + LTR2E 260 261 KRES1E = KTR2D + LTR2D 262 KRES1D = KRES1E + LRES1E 263 KRES2E = KRES1D + LRES1D 264 KRES2D = KRES2E + LRES2E 265 266 KFOCK = KRES2D + LRES2D 267 KDENS = KFOCK + LFOCK 268 KBTR1E = KDENS + LDENS 269 KBTR1D = KBTR1E + LBTR1E 270 KBTJ1E = KBTR1D + LBTR1D 271 KBTJ1D = KBTJ1E + LBTJ1E 272 273 KAIJ = KBTJ1D + LBTJ1D 274 KAAB = KAIJ + LAIJ 275 KEND2 = KAAB + LAAB 276#ifdef VAR_MPI 277C MPI -- Allocate timings array 278 if ( loadbal_dyn ) then 279 KTIMING = KEND2 280 KEND2 = KTIMING + SOPPA_NINT 281 CALL DZERO(WORK(KTIMING), SOPPA_NINT) 282 endif 283#endif 284 LWORK2 = LWORK - KEND2 285C 286 CALL SO_MEMMAX ('SO_ERES.2',LWORK2) 287 IF (LWORK2 .LT. 0) CALL STOPIT('SO_ERES.2',' ',KEND2,LWORK) 288C 289C---------------------------- 290C Initialize AIJ and AAB. 291C---------------------------- 292C 293 IF(SINGLES_SECOND) THEN 294 CALL DZERO(WORK(KAIJ),LAIJ) 295 CALL DZERO(WORK(KAAB),LAAB) 296 ENDIF 297#ifdef VAR_MPI 298C MPI -- ONLY MASTER DOES THE READING 299 IF ( MYNUM .EQ. 0 ) THEN 300#endif 301C 302C---------------------------------------------- 303C Open files with trial and result vectors. 304C---------------------------------------------- 305C 306 CALL SO_OPEN(LUTR1E,FNTR1E,LTR1E) 307 CALL SO_OPEN(LURS1E,FNRS1E,LRES1E) 308 CALL SO_OPEN(LURS1D,FNRS1D,LRES1E) 309 IF (DO_DEX) THEN 310 CALL SO_OPEN(LUTR1D,FNTR1D,LTR1D) 311 END IF 312C 313C Note: open trial-vectors also with length LRES2E 314C singlet: LRES2E = LTR2E anyway 315C triplet: We read trial-vectors into result vector memory 316C initially, then create intermediate in 317C TR2 memory 318 IF (DOUBLES) THEN 319 CALL SO_OPEN(LUTR2E,FNTR2E,LRES2E) 320 CALL SO_OPEN(LURS2E,FNRS2E,LRES2E) 321 CALL SO_OPEN(LURS2D,FNRS2D,LRES2E) 322 IF (DO_DEX) THEN 323 CALL SO_OPEN(LUTR2D,FNTR2D,LRES2D) 324 ENDIF 325 ENDIF 326#ifdef VAR_MPI 327 ENDIF 328#endif 329C 330 IF ( IPRSOP. GE. 7 331#ifdef VAR_MPI 332 & .AND. (MYNUM .EQ.0) 333#endif 334 & ) THEN ! Only printing related. 335C------------------------------------------ 336C Write new trial vectors to output. 337C------------------------------------------ 338 DO 50 INEWTR = 1,NNEWTR 339C---------------------------------------------------- 340C Determine pointer to INEWTR trial vector. 341C---------------------------------------------------- 342 INEW = NOLDTR + INEWTR 343C 344 CALL SO_READ(WORK(KTR1E),LTR1E,LUTR1E,FNTR1E,INEW) 345 IF(DO_DEX) 346 & CALL SO_READ(WORK(KTR1D),LTR1D,LUTR1D,FNTR1D,INEW) 347 IF (DOUBLES .AND. .NOT. SO_DOUBLE_CORRECTION(MODEL) ) THEN 348 CALL SO_READ(WORK(KRES2E),LRES2E,LUTR2E,FNTR2E,INEW) 349 IF(DO_DEX) 350 & CALL SO_READ(WORK(KRES2D),LRES2D,LUTR2D,FNTR2D,INEW) 351 ENDIF 352C 353 WRITE(LUPRI,'(/,I3,A)') INEWTR,'. new trial vector' 354 WRITE(LUPRI,'(I8,1X,F14.8,5X,F14.8)') 355 & (I,WORK(KTR1E+I-1),WORK(KTR1D+I-1),I=1,LTR1E) 356 IF (DOUBLES) THEN 357 WRITE(LUPRI,'(I8,1X,F14.8,5X,F14.8)') 358 & (I,WORK(KRES2E+I-1),WORK(KRES2D+I-1),I=1,LRES2E) 359 ENDIF 360 50 CONTINUE 361 END IF 362C 363C================================================ 364C Loop over number of excitations considered. 365C================================================ 366C 367 DO 100 INEWTR = 1,NNEWTR 368C 369C------------------------------------------------- 370C Determine pointer to INEWTR trial vector. 371C------------------------------------------------- 372C 373 INEW = NOLDTR + INEWTR 374C 375C-------------------------------------------------------------- 376C Initialize RES1E, RES1D, SIGAI1, SIGAI2, 377C SIGDA1, SIGDA2 and FOCK 378C-------------------------------------------------------------- 379C 380 CALL DZERO(WORK(KRES1E),LRES1E) 381 IF (DO_DEX) CALL DZERO(WORK(KRES1D),LRES1D) 382 383 CALL DZERO(WORK(KFOCK),LFOCK) 384C 385C-------------------------- 386C Read trial vector. 387C-------------------------- 388C 389#ifdef VAR_MPI 390 IF ( MYNUM .EQ. 0 ) THEN 391#endif 392 CALL SO_READ(WORK(KTR1E),LTR1E,LUTR1E,FNTR1E,INEW) 393 IF (DO_DEX) THEN 394 CALL SO_READ(WORK(KTR1D),LTR1D,LUTR1D,FNTR1D,INEW) 395 ELSE 396C Static case: Generate D-vector from E-vector 397 CALL DCOPY(LTR1E,WORK(KTR1E),1,WORK(KTR1D),1) 398 IF (IDTYPE.EQ.1) CALL DSCAL(LTR1E,-1.0D0,WORK(KTR1D),1) 399 END IF 400 IF (DOUBLES) THEN 401C Quick fix for RPA(D) -> Later use this variable to 402C speed up first iteration for excitation energies 403 IF (TR2_ZERO) THEN 404 CALL DZERO(WORK(KTR2E),LTR2E) 405 CALL DZERO(WORK(KTR2D),LTR2D) 406 ENDIF 407 408 ENDIF 409#ifdef VAR_MPI 410 ENDIF 411C 412#endif 413 IF (DOUBLES) THEN 414C-------------------------------------------------------------------- 415C RES2E, RES2D is initialized with the D^0*Tr2 contribution 416C --- For MPI only one process must do this 417C-------------------------------------------------------------------- 418 IF ( (.NOT.TR2_ZERO) 419#ifdef VAR_MPI 420 & .AND.(MYNUM .EQ. 0 ) 421#endif 422 & ) THEN 423C 424 IF (TRIPLET) THEN 425C----------------------------------------------------------- 426C For triplet we read the trial-vector into 427C solution vector memory. Then we use that 428C to create the non-symmetric intermediate on 429C KTR2* 430C---------------------------------------------------------- 431 CALL SO_READ(WORK(KRES2E),LRES2E,LUTR2E,FNTR2E,INEW) 432 CALL SO_TRANTRIP(WORK(KTR2E),WORK(KRES2E),ISYMTR) 433 IF(DO_DEX) THEN 434 CALL SO_READ(WORK(KRES2D),LRES2D,LUTR2D, 435 & FNTR2D,INEW) 436 CALL SO_TRANTRIP(WORK(KTR2D),WORK(KRES2D),ISYMTR) 437 END IF 438C 439 DTIME = SECOND() 440 CALL SO_RES_CDT(WORK(KRES2E),LRES2E, 441 & WORK(KRES2D),LRES2D, 442 & FOCKD,LFOCKD,ISYRES,DO_DEX, 443 & WORK(KEND2),LWORK2) 444 DTIME = SECOND() - DTIME 445 SOTIME(30) = SOTIME(30) + DTIME 446C 447 ELSE 448C 449 CALL SO_READ(WORK(KTR2E),LTR2E,LUTR2E,FNTR2E,INEW) 450 IF(DO_DEX) 451 & CALL SO_READ(WORK(KTR2D),LTR2D,LUTR2D,FNTR2D,INEW) 452C 453 DTIME = SECOND() 454 CALL SO_RES_CD(WORK(KRES2E),LRES2E, 455 & WORK(KRES2D),LRES2D, 456 & WORK(KTR2E),LTR2E,WORK(KTR2D),LTR2D, 457 & FOCKD,LFOCKD,ISYRES, 458 & DO_DEX,WORK(KEND2),LWORK2) 459 460 DTIME = SECOND() - DTIME 461 SOTIME(30) = SOTIME(30) + DTIME 462C 463C The trial-vectors are no longer needed in the original 464C basis, transform them. 465C 466 CALL CCSD_TCMEPKX(WORK(KTR2E),TWO,ISYMTR) 467 IF (DO_DEX) CALL CCSD_TCMEPKX(WORK(KTR2D),TWO,ISYMTR) 468 ENDIF 469C 470 ELSE 471 CALL DZERO(WORK(KRES2E),LRES2E) 472 IF (DO_DEX) CALL DZERO(WORK(KRES2D),LRES2D) 473 ENDIF 474 END IF 475C 476#ifdef VAR_MPI 477C--------------------------- 478C Communicate trial-vectors 479C--------------------------- 480C Rememember that the trial-vectors are contigous in memory 481 LTRTOT_mpi = LTR1E + LTR1D + LTR2E + LTR2D 482 CALL MPI_BCAST(WORK(KTR1E), LTRTOT_mpi, my_MPI_REAL8, 483 & sop_master, SOPPA_COMM_ACTIVE, ierr_mpi) 484C Note for future adjustment: 485C It may be worthwile using non-blocking communication here 486C since the next two calls only use the singles vectors. 487C 488C Also, depending on the communication overhead, it may be better 489C to transform on host only and then send it to the slaves..? 490#endif 491C 492C--------------------------------------------------- 493C Calculate RPA-density matrices in AO basis. 494C--------------------------------------------------- 495C 496 DTIME = SECOND() 497 CALL SO_AODENS(WORK(KDENS),LDENS,WORK(KCMO),LCMO, 498 & WORK(KTR1E),LTR1E,WORK(KTR1D),LTR1D,ISYMTR, 499 & WORK(KEND2),LWORK2) 500 DTIME = SECOND() - DTIME 501 SOTIME(6) = SOTIME(6) + DTIME 502C 503C-------------------------------------------- 504C Backtransformation of trial vectors. 505C-------------------------------------------- 506C 507 DTIME = SECOND() 508 CALL SO_BCKTR(WORK(KTR1E),LTR1E,WORK(KTR1D),LTR1D, 509 & WORK(KBTR1E),LBTR1E,WORK(KBTR1D),LBTR1D, 510 & WORK(KBTJ1E),LBTJ1E,WORK(KBTJ1D),LBTJ1D, 511 & WORK(KCMO),LCMO,ISYMTR) 512 DTIME = SECOND() - DTIME 513 SOTIME(7) = SOTIME(7) + DTIME 514C 515C======================================================= 516C Start the loop over distributions of integrals. 517C======================================================= 518C 519 IF (DIRECT) THEN 520 NTOSYM = 1 521 DTIME = SECOND() 522 IF (HERDIR) THEN 523 CALL HERDI1(WORK(KEND2),LWORK2,IPRINT) 524 KINDXB = KEND2 525 ELSEIF(INEWTR.EQ.1) THEN 526C Can we get away with doing this once? 527C Because we CAN'T move KEND2 forwards in each loop cycle, 528C since that'll be leaking memory 529 KCCFB1 = KEND2 530 KINDXB = KCCFB1 + MXPRIM*MXCONT 531 KEND2 = KINDXB + (8*MXSHEL*MXCONT)/IRAT 532 LWORK2 = LWORK - KEND2 533 534 CALL ERIDI1(KODCL1,KODCL2,KODBC1,KODBC2,KRDBC1,KRDBC2, 535 & KODPP1,KODPP2,KRDPP1,KRDPP2,KFREE,LFREE, 536 & KEND2,WORK(KCCFB1),WORK(KINDXB),WORK(KEND2), 537 & LWORK2,IPRINT) 538 539 KEND2 = KFREE 540 LWORK2 = LFREE 541 ENDIF 542 DTIME = SECOND() - DTIME 543 SOTIME(8) = SOTIME(8) + DTIME 544 ELSE 545 NTOSYM = NSYM 546 ENDIF 547C 548 ICDEL1 = 0 549C 550#ifdef VAR_MPI 551C----------------------------------------------------------------- 552C In MPI calculations, we need to distribute the indices. 553C On first pass, we set this using the pre-sorted approach. 554C----------------------------------------------------------------- 555C IF ( (nit .ne. 1 .or. inewtr .ne. 1) 556 IF ( .not. (nit .eq. 1 .and. inewtr .eq. 1) 557 & .or. numprocs .eq. 1 ) THEN 558C After first pass, load-balancing has been done 559 CONTINUE 560 ELSEIF( mynum .eq. 0 )THEN 561C Master does the balancing 562 call presortloadbal_parsoppa(AssignedIndices, maxnumjobs, 563 & work(kindxb), work(kend2), lwork2) 564 ELSE 565 count_mpi = maxnumjobs 566C Slaves receives the scatter, send arguments should be ignored 567 call mpi_scatter( AssignedIndices, count_mpi,my_mpi_integer, 568 & AssignedIndices, count_mpi,my_mpi_integer, 569 & sop_master, soppa_comm_active, ierr_mpi ) 570 ENDIF 571#endif 572 DO 210 ISYMD1 = 1,NTOSYM 573C 574 IF (DIRECT) THEN 575 IF (HERDIR) THEN 576 NTOT = MAXSHL 577 ELSE 578 NTOT = MXCALL 579 ENDIF 580 ELSE 581 NTOT = NBAS(ISYMD1) 582 ENDIF 583#ifdef VAR_MPI 584 IF(numprocs .gt. 1) THEN 585 NLOOPIDX = maxnumjobs 586 ELSE 587 NLOOPIDX = NTOT 588 loadbal_dyn = .false. 589 ENDIF 590C 591#endif 592C 593C------------------------------------------------- 594C Main loop over integral-distributions. 595C------------------------------------------------- 596 597#ifdef VAR_MPI 598C------------------------------------------------------------------ 599C For_parallel calculations, we have som stuff to set up. 600C------------------------------------------------------------------ 601 DO 220 ILLLDUMMY = 1, nloopidx 602 if ( numprocs .gt. 1 ) then 603 ILLL = assignedIndices(illldummy) 604C A zero indicates that we have no more work, exit loop 605 IF (ILLL .eq. 0) exit 606 if ( loadbal_dyn ) timeini = mpi_wtime() 607 else 608 ILLL = ILLLDUMMY 609 endif 610#else 611 DO 220 ILLL = 1,NTOT 612#endif 613C print *, ILLL 614C------------------------------------------------ 615C If direct calculate the integrals. 616C------------------------------------------------ 617 IF (DIRECT) THEN 618C 619 DTIME = SECOND() 620 IF (HERDIR) THEN 621C 622 CALL HERDI2(WORK(KEND2),LWORK2,INDEXA,ILLL,NUMDIS, 623 & IPRINT) 624C 625 ELSE 626C 627 CALL ERIDI2(ILLL,INDEXA,NUMDIS,0,0, 628 & WORK(KODCL1),WORK(KODCL2), 629 & WORK(KODBC1),WORK(KODBC2), 630 & WORK(KRDBC1),WORK(KRDBC2), 631 & WORK(KODPP1),WORK(KODPP2), 632 & WORK(KRDPP1),WORK(KRDPP2), 633 & WORK(KCCFB1),WORK(KINDXB), 634 & WORK(KEND2),LWORK2,IPRINT) 635C 636 ENDIF 637 DTIME = SECOND() - DTIME 638 SOTIME(9) = SOTIME(9) + DTIME 639C 640 LRECNR = ( (NBUFX(0) -1) / IRAT ) + 1 641 KRECNR = KEND2 642 KEND2B = KRECNR + LRECNR 643 LWORK2B = LWORK - KEND2B 644C 645 CALL SO_MEMMAX ('SO_ERES.2B',LWORK2B) 646 IF (LWORK2 .LT. 0) 647 & CALL STOPIT('SO_ERES.2B',' ',KEND2B,LWORK) 648C 649 ELSE 650 NUMDIS = 1 651 KEND2B = KEND2 652 ENDIF 653C 654C------------------------------------------------------------------- 655C Loop over number of distributions in disk. 656C In the case of ERI there are more than one distribution and IDEL2 657C loops over them and the actual index of the delta orbital IDEL is 658C then obtained from the array INDEXA. In the case of a not direct 659C calculation there is only one distribution on the disk, which 660C implies that IDEL2 is always 1 and that IDEL is systematically 661C incremented by one each time. 662C-------------------------------------------------------------------- 663C 664 DO 230 IDEL2 = 1,NUMDIS 665C 666 IF (DIRECT) THEN 667 IDEL = INDEXA(IDEL2) 668 ISYMD = ISAO(IDEL) !keeps track of current symmetry 669 ELSE 670 IDEL = IBAS(ISYMD1) + ILLL 671 ISYMD = ISYMD1 672 ENDIF 673C 674 ISYDIS = MULD2H(ISYMD,ISYMOP) 675C 676 IT2DEL(IDEL) = ICDEL1 677 ICDEL1 = ICDEL1 + NT2BCD(ISYDIS) 678C 679C--------------------------------------------- 680C Work space allocation no. 3. 681C--------------------------------------------- 682C 683 LXINT = NDISAO(ISYDIS) 684C 685 KXINT = KEND2B 686 KEND3 = KXINT + LXINT 687 LWORK3 = LWORK - KEND3 688C 689 CALL SO_MEMMAX ('SO_ERES.3',LWORK3) 690 IF (LWORK3 .LT. 0) 691 & CALL STOPIT('SO_ERES.3',' ',KEND3,LWORK) 692C 693C-------------------------------------------- 694C Read in batch of integrals. 695C-------------------------------------------- 696C 697 DTIME = SECOND() 698 CALL CCRDAO(WORK(KXINT),IDEL,IDEL2,WORK(KEND3),LWORK3, 699 & WORK(KRECNR),DIRECT) 700 DTIME = SECOND() - DTIME 701 SOTIME(10) = SOTIME(10) + DTIME 702C 703C 704C--------------------------------------------- 705C Work space allocation no. 4. 706C--------------------------------------------- 707C 708 ISAIJ = MULD2H(ISYMD,1) 709C 710 IF (DOUBLES.OR.SINGLES_SECOND) THEN 711C 712 LT2M1 = NT2BCD(ISAIJ) 713 LX2M1 = NT2BCD(MULD2H(ISYMD,ISYMTR)) 714 KT2M1 = KEND3 715 KX2EM1 = KT2M1 + LT2M1 716 KX2DM1 = KX2EM1 + LX2M1 717 KEND4 = KX2DM1 + LX2M1 718 LWORK4 = LWORK - KEND4 719C 720 CALL SO_MEMMAX ('SO_ERES.4',LWORK4) 721 IF (LWORK4 .LT. 0) 722 & CALL STOPIT('SO_ERES.4',' ',KEND4,LWORK) 723C 724C 725C------------------------------------------------------------ 726C Construct the partially back-transformed T2 727C MP-amplitudes. 728C------------------------------------------------------------ 729C 730 DTIME = SECOND() 731CPi-140316 Change LT2MP -> LT2MPH 732CPi I think now we have this condition two times 733 IF (NVIR(ISYMD).GT.0) THEN 734 CALL SO_T2M1(WORK(KT2M1),LT2M1,T2MP,LT2MPH, 735Cend-Pi 736 & WORK(KCMO),LCMO,IDEL,ISYMD,ISYDIS, 737 & WORK(KEND4),LWORK4) 738 ELSE 739 CALL DZERO(WORK(KT2M1),LT2M1) 740 END IF 741C 742C------------------------------------------------------------ 743C Construct the partially back-transformed 744C trial vectors. 745C These are scaled by 1/sqrt(2) 746C------------------------------------------------------------ 747C 748 IF(DOUBLES.AND..NOT.TRIPLET.AND. 749 & (NVIR(ISYMD).GT.0)) THEN 750 CALL SO_X2M1(WORK(KX2EM1),LX2M1, 751 & WORK(KTR2E),LTR2E, 752 & WORK(KCMO),LCMO,IDEL,ISYMD, 753 & ISYMTR,WORK(KEND4),LWORK4) 754 IF (DO_DEX) THEN 755 CALL SO_X2M1(WORK(KX2DM1),LX2M1, 756 & WORK(KTR2D),LTR2D, 757 & WORK(KCMO),LCMO,IDEL,ISYMD, 758 & ISYMTR,WORK(KEND4),LWORK4) 759 END IF 760 ELSE 761 CALL DZERO(WORK(KX2EM1),LX2M1) 762 IF (DO_DEX) CALL DZERO(WORK(KX2DM1),LX2M1) 763 END IF 764 765 IF (SINGLES_SECOND) THEN 766C 767C--------------------------------------------------------------- 768C Add T2^(ab)_(ij)*x1(b, delta) intermediates 769C to the backtransformed trialvectors. 770C--------------------------------------------------------------- 771C (This takes care of terms (1), and (5) of the B 772C matrix) 773 CALL SO_T2X1(WORK(KX2EM1),LX2M1, 774 & T2MP,LT2MPH, 775 & WORK(KBTJ1D),LBTJ1D,IDEL,ISYMD, 776 & ISYMTR,WORK(KEND4),LWORK4) 777 IF (DO_DEX) THEN 778 CALL SO_T2X1(WORK(KX2DM1),LX2M1, 779 & T2MP,LT2MPH, 780 & WORK(KBTJ1E),LBTJ1E,IDEL,ISYMD, 781 & ISYMTR,WORK(KEND4),LWORK4) 782 END IF 783C 784C For triplet we need to transform the intermediates 785C 786 IF (TRIPLET.AND. 787 & NVIR(MULD2H(ISYMD,ISYMTR)).GE.1) THEN 788 CALL SO_M1SHUF(WORK(KX2EM1),LX2M1, 789 & ISYMD,ISYMTR) 790 IF (DO_DEX) THEN 791 CALL SO_M1SHUF(WORK(KX2DM1),LX2M1, 792 & ISYMD,ISYMTR) 793 END IF 794 END IF 795 796 END IF 797C 798C In the triplet case we create the partially back-transformed 799C intermediate last. 800C 801 IF(DOUBLES.AND.TRIPLET.AND. 802 & (NVIR(ISYMD).GT.0)) THEN 803 CALL SO_X2SM1(WORK(KX2EM1),LX2M1, 804 & WORK(KTR2E),LTR2E, 805 & WORK(KCMO),LCMO,IDEL,ISYMD, 806 & ISYMTR,WORK(KEND4),LWORK4) 807 IF (DO_DEX) THEN 808 CALL SO_X2SM1(WORK(KX2DM1),LX2M1, 809 & WORK(KTR2D),LTR2D, 810 & WORK(KCMO),LCMO,IDEL,ISYMD, 811 & ISYMTR,WORK(KEND4),LWORK4) 812 END IF 813 END IF 814C 815 DTIME = SECOND() - DTIME 816 SOTIME(12) = SOTIME(12) + DTIME 817 818 ELSE 819 KT2M1 = HUGE(LWORK) ! Crash if this is accessed 820 KX2EM1 = HUGE(LWORK) 821 KX2DM1 = HUGE(LWORK) 822 LT2M1 = 0 823 LX2M1 = 0 824 KEND4 = KEND3 825 LWORK4 = LWORK3 826 ENDIF 827C 828C--------------------------------------------- 829C Work space allocation no. 5. 830C--------------------------------------------- 831C 832 IF (DOUBLES.OR.SINGLES_SECOND) THEN 833 LDSRHF = NDSRHF(ISYMD) 834C 835 KDSRHF = KEND4 836 KEND5 = KDSRHF + LDSRHF 837 LWORK5 = LWORK - KEND5 838C 839 CALL SO_MEMMAX ('SO_ERES.5',LWORK5) 840 IF (LWORK5 .LT. 0) 841 & CALL STOPIT('SO_ERES.5',' ',KEND5,LWORK) 842C 843C---------------------------------------------------------------- 844C Transform one index in the integral batch to an 845C occupied index. 846C---------------------------------------------------------------- 847C 848 DTIME = SECOND() 849 ISYMLP = 1 850 CALL CCTRBT(WORK(KXINT),WORK(KDSRHF),WORK(KCMO), 851 & ISYMLP,WORK(KEND5),LWORK5,ISYDIS) 852 DTIME = SECOND() - DTIME 853 SOTIME(13) = SOTIME(13) + DTIME 854 ELSE 855 LDSRHF = 0 856 KEND5 = KEND4 857 LWORK5 = LWORK4 858 KDSRHF = HUGE(LWORK) 859 END IF 860C 861C------------------------------------------------------------------- 862C Calculate part of the second order density matrix. 863C------------------------------------------------------------------- 864C 865 DTIME = SECOND() 866 IF ( calc_densai .AND. ( INEWTR.EQ.1 ) ) THEN 867 CALL SO_DENSAI1(DENSAI,LDENSAI,WORK(KDSRHF),LDSRHF, 868 & WORK(KCMO),LCMO,WORK(KT2M1),LT2M1, 869 & ISYMD,ISYDIS,WORK(KEND5), 870 & LWORK5) 871 872 END IF 873 DTIME = SECOND() - DTIME 874 SOTIME(41) = SOTIME(41) + DTIME 875C 876 IF (TRIPLET) THEN 877 CALL LOOP_BODY_TRIPLET 878 ELSE 879 CALL LOOP_BODY_SINGLET 880 ENDIF 881C 882 230 CONTINUE ! End of IDEL2 loop 883C 884#ifdef VAR_MPI 885 IF (loadbal_dyn) then 886 timefin = mpi_wtime() 887 itimeilll = ktiming + illl - 1 888 work(itimeilll) = work(itimeilll) + timefin - timeini 889 ENDIF 890#endif 891C 892 220 CONTINUE !End of ILLL loop 893C 894 210 CONTINUE !end of ISYMD1 loop 895C 896C==================================================== 897C End of loop over distributions of integrals. 898C==================================================== 899C 900#ifdef VAR_MPI 901C------------------------------------------------- 902C Communicate the result vectors to master. 903C------------------------------------------------- 904C Note for further development: 905C In the following, non-blocking reductions could be used, 906C since the one-particle result-vector is first needed in the 907C second call, sigai and sigda in the third and fourth call, 908C and the two-particle vector is just written to file. 909C The alternative approach, is to do all these calculations 910C on the slaves, and only reduce in the end, this would save 911C communication of sigai and sigda. 912C 913C Use the fact that memory are together 914 lrestot = lres1e + lres1d + lres2e + lres2d 915 latot = laij + laab 916C Master use inplace operations 917 if (mynum .eq. 0 ) then 918C 919C Fock-matrix 920 count_mpi = lfock 921 call mpi_reduce( mpi_in_place, work(kfock), count_mpi, 922 & my_MPI_REAL8, MPI_SUM, sop_master, 923 & soppa_comm_active, ierr_mpi) 924C 925C Result-vectors 926 call mpi_reduce( mpi_in_place, work(kres1e), lrestot, 927 & my_MPI_REAL8, MPI_SUM, sop_master, 928 & soppa_comm_active, ierr_mpi) 929C 930 if (singles_second) then 931C 932C Aij / Aab -- only calculated in first pass 933 if ( inewtr .eq. 1 ) then 934C write(lupri,*) 'Aij' 935 call mpi_reduce ( mpi_in_place, work(kaij), latot, 936 & my_MPI_REAL8, MPI_SUM, sop_master, 937 & soppa_comm_active, ierr_mpi) 938 end if 939 end if 940 else 941C Slaves pass the same buffer as the recieve-buffer... 942C 943C Fock-matrix 944 count_mpi = lfock 945 call mpi_reduce( work(kfock), work(kfock), count_mpi, 946 & my_MPI_REAL8, MPI_SUM, sop_master, 947 & soppa_comm_active, ierr_mpi) 948C 949C Result-vectors 950 call mpi_reduce( work(kres1e), work(kres1e), lrestot, 951 & my_MPI_REAL8, MPI_SUM, sop_master, 952 & soppa_comm_active, ierr_mpi) 953C 954 if (singles_second) then 955C 956C Aij / Aab 957 if ( inewtr .eq. 1 ) then 958 call mpi_reduce ( work(kaij), work(kaij), latot, 959 & my_MPI_REAL8, MPI_SUM, sop_master, 960 & soppa_comm_active, ierr_mpi) 961 endif 962 endif 963C 964C After the reductions, the slaves are done; cycle loop 965 goto 100 966 endif 967#endif 968C 969C--------------------------------------------- 970C Transform AO Fock matrix to MO basis. 971C--------------------------------------------- 972C 973 DTIME = SECOND() 974 CALL TRANS_FCK(WORK(KFOCK),ISYRES) 975 CALL CC_FCKMO(WORK(KFOCK),WORK(KCMO),WORK(KCMO), 976 & WORK(KEND2),LWORK2,ISYRES,1,1) 977 DTIME = SECOND() - DTIME 978 SOTIME(24) = SOTIME(24) + DTIME 979C 980C------------------------------------------------------------------ 981C Calculate and add the RPA two-particle parts to the result 982C vectors. 983C------------------------------------------------------------------ 984C 985 986 DTIME = SECOND() 987 CALL SO_TWOFOCK(WORK(KRES1E),LRES1E,WORK(KRES1D),LRES1D, 988 & WORK(KFOCK),LFOCK,ISYRES,DO_DEX) 989 DTIME = SECOND() - DTIME 990 SOTIME(25) = SOTIME(25) + DTIME 991C 992 if (singles_second) then 993C----------------------------------------------------------------- 994C Calculate and add the symmetry correcting term to A in 995C eq. (44). 996C----------------------------------------------------------------- 997C 998 DTIME = SECOND() 999 1000 CALL SO_RES_SYM(WORK(KRES1E),LRES1E,WORK(KRES1D),LRES1D, 1001 & WORK(KAIJ),LAIJ,WORK(KAAB),LAAB,WORK(KTR1E), 1002 & LTR1E,WORK(KTR1D),LTR1D,ISYRES,DO_DEX) 1003 DTIME = SECOND() - DTIME 1004 SOTIME(20) = SOTIME(20) + DTIME 1005C 1006C--------------------------------------------------------- 1007C Calculate and add the Fock-term to A in eq. (40). 1008C--------------------------------------------------------- 1009C 1010 DTIME = SECOND() 1011 CALL SO_RES_FCK(WORK(KRES1E),LRES1E,WORK(KRES1D),LRES1D, 1012 & WORK(KTR1E),LTR1E,WORK(KTR1D), 1013 & LTR1D,FOCKD,LFOCKD,DENSIJ,LDENSIJ,DENSAB, 1014 & LDENSAB,ISYRES,ISYMTR,DO_DEX) 1015 1016 DTIME = SECOND() - DTIME 1017 SOTIME(21) = SOTIME(21) + DTIME 1018C 1019 endif 1020C 1021C------------------------------------------------------------------ 1022C Calculate and add the RPA one-particle parts to the result 1023C vectors. 1024C------------------------------------------------------------------ 1025C 1026 DTIME = SECOND() 1027 1028 CALL SO_ONEFOCK(WORK(KRES1E),LRES1E,WORK(KRES1D),LRES1D,FOCKD, 1029 & LFOCKD,WORK(KTR1E),LTR1E,WORK(KTR1D),LTR1D, 1030 & ISYRES,ISYMTR,DO_DEX) 1031 DTIME = SECOND() - DTIME 1032 SOTIME(26) = SOTIME(26) + DTIME 1033C 1034C 1035#ifdef VAR_MPI 1036C Slaves are done 1037 IF (MYNUM .NE. 0) GOTO 100 1038#endif 1039C 1040C---------------------------------------- 1041C Write new result vectors to file. 1042C---------------------------------------- 1043C 1044 CALL SO_WRITE(WORK(KRES1E),LRES1E,LURS1E,FNRS1E,INEW) 1045 IF (DO_DEX) 1046 & CALL SO_WRITE(WORK(KRES1D),LRES1D,LURS1D,FNRS1D,INEW) 1047 IF (DOUBLES) THEN 1048 CALL SO_WRITE(WORK(KRES2E),LRES2E,LURS2E,FNRS2E,INEW) 1049 IF(DO_DEX) 1050 & CALL SO_WRITE(WORK(KRES2D),LRES2D,LURS2D,FNRS2D,INEW) 1051 ENDIF 1052C 1053C Write zeroes to D-solution vectors 1054 IF (.NOT. DO_DEX) THEN 1055 CALL DZERO(WORK(KRES1E),LRES1E) 1056 CALL SO_WRITE(WORK(KRES1E),LRES1E,LURS1D,FNRS1D,INEW) 1057 IF (DOUBLES) THEN 1058 CALL DZERO(WORK(KRES2E),LRES2E) 1059 CALL SO_WRITE(WORK(KRES2E),LRES2E,LURS2D,FNRS2D,INEW) 1060 END IF 1061 END IF 1062C 1063 100 CONTINUE 1064C 1065C================================== 1066C End of loop over excitations. 1067C================================== 1068C 1069#ifdef VAR_MPI 1070C----------------------------------------------------------- 1071C Communicate the second order density matrix if needed. 1072C----------------------------------------------------------- 1073 IF ( CALC_DENSAI ) THEN 1074 count_mpi = LDENSAI 1075 IF (MYNUM.EQ.0) THEN 1076 CALL MPI_REDUCE( MPI_IN_PLACE, DENSAI, count_mpi, 1077 & my_MPI_REAL8, MPI_SUM, sop_master, 1078 & SOPPA_COMM_ACTIVE, ierr_mpi) 1079 ELSE 1080 CALL MPI_REDUCE( DENSAI, DENSAI, count_mpi, 1081 & my_MPI_REAL8, MPI_SUM, sop_master, 1082 & SOPPA_COMM_ACTIVE, ierr_mpi) 1083 ENDIF 1084 ENDIF 1085C-------------------------------------- 1086C Communicate the timings if needed 1087C-------------------------------------- 1088 if ( loadbal_dyn ) then 1089 if (mynum .eq.0) then 1090C Master 1091C -------- 1092C Recieve the timings 1093 count_mpi = soppa_nint 1094 call mpi_reduce( mpi_in_place, work(ktiming), count_mpi, 1095 & my_MPI_REAL8, MPI_SUM, sop_master, 1096 & SOPPA_COMM_ACTIVE, ierr_mpi) 1097C 1098C Redo the loadbalancing based on the timings and distribute them 1099 ksorted = kend2 1100 ktmp = (soppa_num_active*maxnumjobs) 1101 knasjob = ksorted + ktmp/irat + mod(ktmp,irat) 1102 kswork = knasjob + soppa_num_active/irat + mod(ktmp,irat) 1103 kendf = kswork + soppa_num_active 1104C 1105 call dynloadbal_parsoppa( AssignedIndices, maxnumjobs, 1106 & work(ktiming), soppa_nint, 1107 & work(ksorted), work(knasjob), 1108 & work(kswork) ) 1109 ! add empty work-space... 1110 ! currently it can start at kend2 1111 else 1112C Slave 1113C --------- 1114C Send the timings 1115 count_mpi = soppa_nint 1116 call mpi_reduce( work(ktiming), work(ktiming), count_mpi, 1117 & my_MPI_REAL8, MPI_SUM, sop_master, 1118 & SOPPA_COMM_ACTIVE, ierr_mpi) 1119C Recieve the new indices 1120 count_mpi = maxnumjobs 1121 call mpi_scatter( AssignedIndices,count_mpi,my_mpi_integer, 1122 & AssignedIndices,count_mpi,my_mpi_integer, 1123 & sop_master, soppa_comm_active, ierr_mpi) 1124 endif 1125 endif 1126C Slaves are done 1127 IF (MYNUM .NE. 0) THEN 1128 IF ( CALC_DENSAI ) SOP_MP2AI_DONE = .TRUE. 1129 RETURN 1130 END IF 1131#endif 1132C---------------------------------------------------------------- 1133C Calculate the last part of the second order density matrix. 1134C---------------------------------------------------------------- 1135C 1136 DTIME = SECOND() 1137 IF ( CALC_DENSAI ) THEN 1138 CALL SO_DENSAI2(DENSAI,LDENSAI,FOCKD,LFOCKD) 1139 SOP_MP2AI_DONE = .TRUE. 1140 END IF 1141C 1142 DTIME = SECOND() - DTIME 1143 SOTIME(41) = SOTIME(41) + DTIME 1144C 1145 IF ( IPRSOP .GE. 7 ) THEN 1146C------------------------------------------ 1147C Write new resultvectors to output. 1148C------------------------------------------ 1149 DO 400 INEWTR = 1,NNEWTR 1150 INEW = NOLDTR + INEWTR 1151 WRITE(LUPRI,'(/,I3,A)') INEWTR, 1152 & '. new E[2] linear transformed trial vector' 1153 CALL SO_READ(WORK(KRES1E),LRES1E,LURS1E,FNRS1E,INEW) 1154 CALL SO_READ(WORK(KRES1D),LRES1E,LURS1D,FNRS1D,INEW) 1155 WRITE(LUPRI,'(I8,1X,F14.8,5X,F14.8)') 1156 & (I,WORK(KRES1E+I-1),WORK(KRES1D+I-1),I=1,LRES1E) 1157 IF (DOUBLES) THEN 1158 CALL SO_READ(WORK(KRES2E),LRES2E,LURS2E,FNRS2E,INEW) 1159C IF (DO_DEX) 1160 CALL SO_READ(WORK(KRES2D),LRES2E,LURS2D,FNRS2D,INEW) 1161 WRITE(LUPRI,'(I8,1X,F14.8,5X,F14.8)') 1162 & (I,WORK(KRES2E+I-1),WORK(KRES2D+I-1),I=1,LRES2E) 1163 ENDIF 1164 400 CONTINUE 1165C 1166 END IF 1167C 1168C----------------- 1169C Close files. 1170C----------------- 1171C 1172 CALL SO_CLOSE(LUTR1E,FNTR1E,'KEEP') 1173 CALL SO_CLOSE(LURS1E,FNRS1E,'KEEP') 1174 CALL SO_CLOSE(LURS1D,FNRS1D,'KEEP') 1175 IF (DO_DEX) THEN 1176 CALL SO_CLOSE(LUTR1D,FNTR1D,'KEEP') 1177 END IF 1178C 1179 IF (DOUBLES) THEN 1180 CALL SO_CLOSE(LUTR2E,FNTR2E,'KEEP') 1181 CALL SO_CLOSE(LURS2E,FNRS2E,'KEEP') 1182 CALL SO_CLOSE(LURS2D,FNRS2D,'KEEP') 1183 IF (DO_DEX) THEN 1184 CALL SO_CLOSE(LUTR2D,FNTR2D,'KEEP') 1185 ENDIF 1186 ENDIF 1187C 1188C 1189C----------------------- 1190C Remove from trace. 1191C----------------------- 1192C 1193 CALL QEXIT('SO_ERES') 1194C 1195 RETURN 1196C 1197C------------------------------------------------------------------------ 1198C The body of the above loop over integral distributions has been moved 1199C into these internal subroutines. 1200C------------------------------------------------------------------------ 1201C 1202 CONTAINS 1203C Be aware that variables fall through from the outer scope! 1204C 1205 1206 SUBROUTINE LOOP_BODY_SINGLET 1207C 1208 DOUBLE PRECISION DTIME 1209C 1210C 1211C---------------------------------------------- 1212C Calculate the AO-Fock matrix. 1213C---------------------------------------------- 1214C 1215 DTIME = SECOND() 1216 CALL SO_AOFOCK1(WORK(KXINT),WORK(KDENS),WORK(KFOCK), 1217 & WORK(KEND5),LWORK5, 1218 & IDEL,ISYMD, 1219 & ISYMTR) 1220 DTIME = SECOND() - DTIME 1221 SOTIME(11) = SOTIME(11) + DTIME 1222 IF (singles_second) THEN 1223C 1224C---------------------------------------------------------------------- 1225C Calculate part of the result vectors RES1E and RES1D, 1226C specifically the first and the second term in eqs. (34,35). 1227C Also calculate Aij and Aab in eqs. (43,44). 1228C---------------------------------------------------------------------- 1229C 1230 DTIME = SECOND() 1231 CALL SO_RES_A(WORK(KRES1E),LRES1E, 1232 & WORK(KRES1D),LRES1D, 1233 & WORK(KTR1E),LTR1E,WORK(KTR1D),LTR1D, 1234 & WORK(KDSRHF),LDSRHF,WORK(KCMO),LCMO, 1235 & WORK(KT2M1),LT2M1,WORK(KAIJ),LAIJ, 1236 & WORK(KAAB),LAAB,INEWTR,ISYMD,ISYDIS, 1237 & ISYRES,ISYMTR,DO_DEX, 1238 & WORK(KEND5),LWORK5) 1239 DTIME = SECOND() - DTIME 1240 SOTIME(14) = SOTIME(14) + DTIME 1241 ENDIF 1242C 1243 IF (DOUBLES.OR.SINGLES_SECOND) THEN 1244C 1245C------------------------------------------------------------------- 1246C Calculate the part of the result vectors RES1E and 1247C RES1D which originate from the C matrices. See 1248C eqs. (72) and (73). 1249C------------------------------------------------------------------- 1250CRF The X2M1 vectors is now an intermediate, which contain 1251CRF both the 2-particle trial vector AND the T2*x1 intermediate, 1252CRF ~ ~ ~ 1253CRF so both C*x2 and terms (1) and (5) of B*x1 is calculated here. 1254C 1255 DTIME = SECOND() 1256 CALL SO_RES_TCB(WORK(KRES1E),LRES1E, 1257 & WORK(KRES1D),LRES1D, 1258 & WORK(KX2EM1),LX2M1, 1259 & WORK(KX2DM1),LX2M1, 1260 & WORK(KDSRHF),LDSRHF, 1261 & WORK(KCMO),LCMO,IDEL,ISYMD,ISYDIS, 1262 & ISYMTR,DO_DEX,WORK(KEND5),LWORK5) 1263 DTIME = SECOND() - DTIME 1264 SOTIME(29) = SOTIME(29) + DTIME 1265 END IF 1266C 1267C---------------------------------------------------------------------- 1268C Construct C-contribution to 2p2h result vectors RES2E 1269C and RES2D. 1270C---------------------------------------------------------------------- 1271C 1272 IF(DOUBLES) THEN 1273 DTIME = SECOND() 1274 CALL SO_RES_CB(WORK(KRES2E),LRES2E, 1275 & WORK(KRES2D),LRES2D, 1276 & WORK(KDSRHF),LDSRHF, 1277 & WORK(KBTR1E),LBTR1E, 1278 & WORK(KBTR1D),LBTR1D, 1279 & WORK(KBTJ1E),LBTJ1E, 1280 & WORK(KBTJ1D),LBTJ1D,WORK(KCMO),LCMO, 1281 & IDEL,ISYMD,ISYDIS,ISYMTR,DO_DEX, 1282 & WORK(KEND5),LWORK5) 1283 DTIME = SECOND() - DTIME 1284 SOTIME(15) = SOTIME(15) + DTIME 1285C 1286 ENDIF 1287 1288 IF (SINGLES_SECOND) THEN 1289 DTIME = SECOND() 1290 ISYDIS2 = MULD2H(ISYDIS,ISYMTR) 1291 KEND6 = KEND4 + NDSRHF(MULD2H(ISYMD,ISYMTR)) 1292 LWORK6 = LWORK - KEND6 1293C ~ 1294C Calculate ( alpha, beta | j delta) 1295 CALL CCTRBT(WORK(KXINT),WORK(KDSRHF),WORK(KBTR1D), 1296 & ISYMTR,WORK(KEND5),LWORK5,ISYDIS) 1297C Calculate terms 2 and 6 of the B-matrix, by 1298C ~ 1299C (2) - ( c a | j delta ) * T2M1( ci, j) 1300C ~ 1301C (6) ( k i | j delta ) * T2M1( ak, j) 1302 1303 CALL SO_RES_B26 ( WORK(KRES1E), LRES1E, WORK(KT2M1), LT2M1, 1304 & WORK(KDSRHF),WORK(KCMO),LCMO,IDEL,ISYMD, 1305 & ISYDIS2,ISYMTR,WORK(KEND6),LWORK6) 1306 IF (DO_DEX) THEN 1307C 1308C Same for the D-part 1309 CALL CCTRBT(WORK(KXINT),WORK(KDSRHF),WORK(KBTR1E), 1310 & ISYMTR,WORK(KEND5),LWORK5,ISYDIS) 1311 CALL SO_RES_B26 ( WORK(KRES1D), LRES1D, 1312 & WORK(KT2M1), LT2M1, 1313 & WORK(KDSRHF),WORK(KCMO),LCMO, 1314 & IDEL,ISYMD, 1315 & ISYDIS2,ISYMTR,WORK(KEND6),LWORK6) 1316 END IF 1317 DTIME = SECOND() - DTIME 1318 SOTIME(16) = SOTIME(16) + DTIME 1319 ENDIF 1320 END SUBROUTINE 1321C 1322 SUBROUTINE LOOP_BODY_TRIPLET 1323C 1324C---------------------------------------------- 1325C Calculate the AO-Fock matrix. 1326C---------------------------------------------- 1327C 1328 DTIME = SECOND() 1329 CALL SO_AOFOCK3(WORK(KXINT),WORK(KDENS), 1330 & WORK(KFOCK), 1331 & IDEL,ISYMD,ISYMTR) 1332 DTIME = SECOND() - DTIME 1333 SOTIME(11) = SOTIME(11) + DTIME 1334C 1335 IF (singles_second) THEN 1336C 1337CRF A^(2) Doesn't mix spins - Same for singlet and triplet 1338C---------------------------------------------------------------------- 1339C Calculate part of the result vectors RES1E and RES1D, 1340C specifically the first and the second term in eqs. 1341C (34,35). Also calculate Aij and Aab in eqs. (43,44). 1342C---------------------------------------------------------------------- 1343C 1344 DTIME = SECOND() 1345 CALL SO_RES_A(WORK(KRES1E),LRES1E, 1346 & WORK(KRES1D),LRES1D, 1347 & WORK(KTR1E),LTR1E,WORK(KTR1D),LTR1D, 1348 & WORK(KDSRHF),LDSRHF,WORK(KCMO),LCMO, 1349 & WORK(KT2M1),LT2M1,WORK(KAIJ),LAIJ, 1350 & WORK(KAAB),LAAB,INEWTR,ISYMD,ISYDIS, 1351 & ISYRES,ISYMTR,DO_DEX, 1352 & WORK(KEND5),LWORK5) 1353 DTIME = SECOND() - DTIME 1354 SOTIME(14) = SOTIME(14) + DTIME 1355C 1356C-------------------------------------------------------------------- 1357C Transform the partially back-transformed T2 MP-amplitudes 1358C Currently they are stored as 1359C t2m1(ai,j) = 2t(ai,j) - t(aj,i) 1360C In the following we need to have 1361C t2m1(ai,j) = -t(aj,i) 1362C-------------------------------------------------------------------- 1363C 1364 DTIME = SECOND() 1365 CALL SO_M1SHUF(WORK(KT2M1),LT2M1,ISYMD,1) 1366 DTIME = SECOND() - DTIME 1367 SOTIME(12) = SOTIME(12) + DTIME 1368 END IF 1369 IF (DOUBLES.OR.SINGLES_SECOND) THEN 1370C 1371C------------------------------------------------------------------- 1372C Calculate the part of the result vectors RES1E and 1373C RES1D which originate from the C matrices. See 1374C eqs. (72) and (73). 1375C------------------------------------------------------------------- 1376CRF The X2M1 vectors are now intermediates, which contain 1377CRF the the T2*x1 intermediate, 1378CRF so terms (1) and (5) of B*x1 are calculated here. 1379C 1380 DTIME = SECOND() 1381 CALL SO_RES_TCB(WORK(KRES1E),LRES1E, 1382 & WORK(KRES1D),LRES1D, 1383 & WORK(KX2EM1),LX2M1, 1384 & WORK(KX2DM1),LX2M1, 1385 & WORK(KDSRHF),LDSRHF, 1386 & WORK(KCMO),LCMO,IDEL,ISYMD,ISYDIS, 1387 & ISYMTR,DO_DEX, 1388 & WORK(KEND5),LWORK5) 1389 DTIME = SECOND() - DTIME 1390 SOTIME(29) = SOTIME(29) + DTIME 1391 1392 END IF 1393C 1394 IF (DOUBLES) THEN 1395C 1396C---------------------------------------------------------------------- 1397C Construct C-contribution to 2p2h result vectors RES2E 1398C and RES2D. 1399C---------------------------------------------------------------------- 1400C 1401 DTIME = SECOND() 1402 CALL SO_RES_CBT(WORK(KRES2E),LRES2E, 1403 & WORK(KRES2D),LRES2D, 1404 & WORK(KDSRHF),LDSRHF, 1405 & WORK(KBTR1E),LBTR1E, 1406 & WORK(KBTR1D),LBTR1D, 1407 & WORK(KBTJ1E),LBTJ1E, 1408 & WORK(KBTJ1D),LBTJ1D,WORK(KCMO),LCMO, 1409 & IDEL,ISYMD,ISYDIS,ISYMTR,DO_DEX, 1410 & WORK(KEND5),LWORK5) 1411 DTIME = SECOND() - DTIME 1412 SOTIME(15) = SOTIME(15) + DTIME 1413C 1414 ENDIF 1415 IF(SINGLES_SECOND) THEN 1416 DTIME = SECOND() 1417 ISYDIS2 = MULD2H(ISYDIS,ISYMTR) 1418 KEND6 = KEND4 + NDSRHF(MULD2H(ISYMD,ISYMTR)) 1419 LWORK6 = LWORK - KEND6 1420C ~ 1421C Calculate ( alpha, beta | j delta) 1422 CALL CCTRBT(WORK(KXINT),WORK(KDSRHF),WORK(KBTR1D), 1423 & ISYMTR,WORK(KEND5),LWORK5,ISYDIS) 1424C Calculate terms 2 and 6 of the B-matrix, by 1425C ~ 1426C (2) - ( c a | j delta ) * T2M1( ci, j) 1427C ~ 1428C (6) ( k i | j delta ) * T2M1( ak, j) 1429C 1430 CALL SO_RES_B26 ( WORK(KRES1E), LRES1E, WORK(KT2M1), LT2M1, 1431 & WORK(KDSRHF),WORK(KCMO),LCMO,IDEL,ISYMD, 1432 & ISYDIS2,ISYMTR,WORK(KEND6),LWORK6) 1433C 1434 IF (DO_DEX) THEN 1435C Same for the D-part 1436 CALL CCTRBT(WORK(KXINT),WORK(KDSRHF),WORK(KBTR1E), 1437 & ISYMTR,WORK(KEND5),LWORK5,ISYDIS) 1438 CALL SO_RES_B26 (WORK(KRES1D), LRES1D, 1439 & WORK(KT2M1), LT2M1, 1440 & WORK(KDSRHF),WORK(KCMO),LCMO,IDEL,ISYMD, 1441 & ISYDIS2,ISYMTR,WORK(KEND6),LWORK6) 1442 END IF 1443 DTIME = SECOND() - DTIME 1444 SOTIME(16) = SOTIME(16) + DTIME 1445 ENDIF 1446 1447 END SUBROUTINE 1448 1449 SUBROUTINE TRANS_FCK(FOCK,ISYMFCK) 1450 DOUBLE PRECISION, INTENT(INOUT) :: FOCK(*) 1451 INTEGER, INTENT(IN) :: ISYMFCK 1452 1453 INTEGER :: IA, IB, ISIZE, NUMA, NUMB 1454 INTEGER :: ISYMA, ISYMB, IOFF, IOFF1, IOFF2 1455 DOUBLE PRECISION :: TMP 1456 1457 IF ( ISYMFCK .EQ. 1) THEN 1458 DO ISYMA = 1, NSYM 1459 IOFF = IAODIS(ISYMA,ISYMA) 1460 DO IA = 2, NBAS(ISYMA) 1461 DO IB = 1, IA-1 1462 IPOS1 = (IA-1)*NBAS(ISYMA)+IB+IOFF 1463 IPOS2 = (IB-1)*NBAS(ISYMA)+IA+IOFF 1464 TMP = FOCK(IPOS1) 1465 FOCK(IPOS1) = FOCK(IPOS2) 1466 FOCK(IPOS2) = TMP 1467 END DO 1468 END DO 1469 END DO 1470 ELSE 1471 DO ISYMA = 1, NSYM 1472 ISYMB = MULD2H(ISYMFCK,ISYMA) 1473 IF (ISYMB .GT.ISYMA) CYCLE 1474 NUMA = NBAS(ISYMA) 1475 NUMB = NBAS(ISYMB) 1476 IOFF1 = IAODIS(ISYMA,ISYMB) 1477 IOFF2 = IAODIS(ISYMB,ISYMA) 1478 ISIZE = NUMA*NUMB 1479 DO IB = 1, NUMB 1480 DO IA = 1, NBAS(ISYMA) 1481 IDX1 = IOFF1 + NUMA*(IB-1) +IA 1482 IDX2 = IOFF2 + NUMB*(IA-1) +IB 1483 TMP = FOCK(IDX1) 1484 FOCK(IDX1) = FOCK(IDX2) 1485 FOCK(IDX2) = TMP 1486 END DO 1487 END DO 1488 END DO 1489 ENDIF 1490 1491 END SUBROUTINE 1492 1493 SUBROUTINE SO_AOFOCK3(XINT,DENSIT,FOCK, 1494 & IDEL,ISYMD,ISYDEN) 1495 1496#include "symsq.h" 1497 1498 INTEGER,INTENT(IN) :: IDEL, ISYMD,ISYDEN 1499 DOUBLE PRECISION,INTENT(IN) :: XINT(*), DENSIT(*) 1500 DOUBLE PRECISION,INTENT(INOUT) :: FOCK(*) 1501 1502 INTEGER :: ISYMBG, ISYMAB,ISYMG,ISYDIS,ISYMB,ISYMA 1503 INTEGER :: IG, IA, IB 1504 INTEGER :: KOFFINT, KOFFINT1, KOFFDEN, KOFFOUT 1505 INTEGER :: NOFFB, NOFFA 1506 DOUBLE PRECISION :: TMP 1507 1508 ISYMBG = ISYDEN 1509 ISYDIS = ISYMD 1510 ISYMA = MULD2H(ISYDIS,ISYMBG) 1511 KOFFOUT = IAODIS(ISYMA,ISYMD) + NBAS(ISYMA)* 1512 & (IDEL- IBAS(ISYMD) - 1) 1513 1514 DO ISYMG = 1, NSYM 1515 ISYMB = MULD2H(ISYMBG,ISYMG) 1516 ISYMAB = MULD2H(ISYMA,ISYMB) 1517 KOFFINT1 = IDSAOG(ISYMG,ISYDIS) + IAODPK(ISYMA,ISYMB) 1518 1519 IF(ISYMAB.EQ.1) THEN ! integrals stored as triangle 1520 DO IG = 1, NBAS(ISYMG) 1521 KOFFINT = KOFFINT1 + (IG - 1) *NNBST(ISYMAB) 1522 KOFFDEN = IAODIS(ISYMB,ISYMG) + NBAS(ISYMB)*(IG-1) 1523 ! alpha =< beta : 1524 DO IB = 1, NBAS(ISYMB) 1525 NOFFB = IB*(IB-1)/2 + KOFFINT 1526 DO IA = 1, IB 1527 FOCK(KOFFOUT+IA) = -XINT(NOFFB+IA) 1528 & *DENSIT(KOFFDEN+IB) 1529 & +FOCK(KOFFOUT+IA) 1530 END DO 1531 END DO 1532 1533 ! alpha > beta 1534 DO IA = 2, NBAS(ISYMA) 1535 NOFFA = IA*(IA-1)/2 + KOFFINT 1536 TMP = 0.0D0 1537 DO IB = 1, IA - 1 1538 TMP = XINT(NOFFA+IB)*DENSIT(KOFFDEN+IB) + TMP 1539 END DO 1540 FOCK(KOFFOUT+IA) = FOCK(KOFFOUT+IA) - TMP 1541 END DO 1542 END DO ! LOOP IG 1543 1544 ELSEIF(ISYMA .LT.ISYMB) THEN 1545 ! Stored as alpha, beta 1546 DO IG = 1, NBAS(ISYMG) 1547 KOFFINT = KOFFINT1 + (IG - 1) *NNBST(ISYMAB) 1548 KOFFDEN = IAODIS(ISYMB,ISYMG) + NBAS(ISYMB)*(IG-1) 1549 1550 ! Loop B first 1551 DO IB = 1, NBAS(ISYMB) 1552 NOFFB = NBAS(ISYMA)*(IB-1) + KOFFINT 1553 DO IA = 1, NBAS(ISYMA) 1554 FOCK(KOFFOUT+IA) = -XINT(NOFFB+IA) 1555 & *DENSIT(KOFFDEN+IB) 1556 & +FOCK(KOFFOUT+IA) 1557 END DO 1558 END DO 1559 END DO 1560 ELSE ! ISYMA .GT. ISYMB 1561 ! Stored as beta, alpha 1562 DO IG = 1, NBAS(ISYMG) 1563 KOFFINT = KOFFINT1 + (IG - 1) *NNBST(ISYMAB) 1564 KOFFDEN = IAODIS(ISYMB,ISYMG) + NBAS(ISYMB)*(IG-1) 1565 ! LOOP A first 1566 DO IA = 1, NBAS(ISYMA) 1567 NOFFA = NBAS(ISYMB)*(IA-1) + KOFFINT 1568 TMP = 0.0D0 1569 DO IB = 1, NBAS(ISYMB) 1570 TMP = XINT(NOFFA+IB)*DENSIT(KOFFDEN+IB) + TMP 1571 END DO 1572 FOCK(KOFFOUT+IA) = FOCK(KOFFOUT+IA) - TMP 1573 END DO 1574 END DO ! IG 1575 END IF 1576 END DO ! ISYMG 1577 1578 END SUBROUTINE 1579C 1580 SUBROUTINE SO_SYM_DENS(DENS_SQ,DENS_SYM,ISYDENS) 1581 ! ~ 1582 ! Form the symmetric array D(alpha,beta) from 1583 ! the array D(alpha, beta) and store it in packed form 1584 ! (alpha =< beta). 1585 ! ~ 1586 ! D(alpha,beta) = D(alpha,beta)+ D(beta,alpha) 1587 ! ~ 1588 ! D(alpha,alpha) = D(alpha,alpha) 1589 ! 1590 ! D has symmetry ISYDENS 1591#include "symsq.h" 1592 DOUBLE PRECISION, INTENT(IN) :: DENS_SQ(*) 1593 DOUBLE PRECISION, INTENT(OUT):: DENS_SYM(*) 1594 INTEGER, INTENT(IN) :: ISYDENS 1595 1596 INTEGER :: ISYMA, ISYMB, IA, IB 1597 INTEGER :: NUMA, NUMB, IOFFA, IOFFB, IOFFPK, IOFFSQ 1598 INTEGER :: IOFF1, IOFF2,IOFFSQ1,IOFFSQ2 1599 1600 IF (ISYDENS.EQ.1) THEN 1601 ! Totally symmetric case, Triangular storage 1602 DO ISYMA = 1, NSYM 1603 IOFFPK = IAODPK(ISYMA,ISYMA) 1604 IOFFSQ = IAODIS(ISYMA,ISYMA) 1605 NUMA = NBAS(ISYMA) 1606 NUMB = NBAS(ISYMA) 1607 DO IB = 1, NUMB 1608 IOFF1 = IB*(IB-1)/2 + IOFFPK 1609 IOFFB = (IB-1)*NUMA + IOFFSQ 1610 DO IA = 1, IB-1 1611 IOFFA = (IA-1)*NUMB + IOFFSQ 1612 DENS_SYM(IOFF1+IA) = DENS_SQ(IOFFB+IA) + 1613 & DENS_SQ(IOFFA+IB) 1614 END DO 1615 DENS_SYM(IOFF1+IB) = DENS_SQ(IOFFB+IB) 1616 END DO 1617 END DO 1618 ELSE ! Else we deal with rectangular offdiagonal blocks 1619 DO ISYMB = 1, NSYM 1620 ISYMA = MULD2H(ISYMB,ISYDENS) 1621 IF (ISYMA.GT.ISYMB) CYCLE 1622 IOFFPK=IAODPK(ISYMA,ISYMB) 1623 IOFFSQ1 = IAODIS(ISYMB,ISYMA) 1624 IOFFSQ2 = IAODIS(ISYMA,ISYMB) 1625 NUMA = NBAS(ISYMA) 1626 NUMB = NBAS(ISYMB) 1627 DO IB = 1, NUMB 1628 IOFF1 = (IB-1)*NUMA + IOFFPK 1629 IOFFB = (IB-1)*NUMA + IOFFSQ2 1630 DO IA = 1, NUMA 1631 IOFFA = (IA-1)*NUMB + IOFFSQ1 1632 DENS_SYM(IOFF1+IA) = DENS_SQ(IOFFB+IA) + 1633 & DENS_SQ(IOFFA+IB) 1634 END DO 1635 END DO 1636 END DO 1637 ENDIF 1638 1639 END SUBROUTINE 1640 1641 SUBROUTINE SO_AOFOCK1(XINT,DENSIT,FOCK,WORK,LWORK, 1642 & IDEL,ISYMD,ISYDEN) 1643 ! Calculates here 1644 ! 2 ( alpha, beta | gamma ; delta) * D( alpha, beta) => 1645 ! F ( gamma ; delta) 1646 ! 1647 ! and in call to SO_AOFOCK3 : 1648 ! - ( alpha, beta | gamma ; delta) * D( beta, gamma) => 1649 ! F (alpha ; delta) 1650 ! 1651 DOUBLE PRECISION, PARAMETER :: TWO = 2.0D0, ONE =1.0D0 1652 1653 INTEGER,INTENT(IN) :: IDEL, ISYMD,ISYDEN, LWORK 1654 DOUBLE PRECISION,INTENT(IN) :: XINT(*), DENSIT(*) 1655 DOUBLE PRECISION,INTENT(INOUT) :: FOCK(*), WORK(LWORK) 1656 1657 INTEGER :: ISYDIS, ISYMG 1658 INTEGER :: NUMG, NAB, KGAM, KOUT 1659 1660 ! First do the singlet-only term 1661 ISYDIS = ISYMD 1662 ISYMG = MULD2H(ISYDIS,ISYDEN) 1663 NUMG = NBAS(ISYMG) 1664 IF (NUMG .GE. 1) THEN 1665 !Symmetrice AO-DENSITY and compress to packed form (could be done outside) 1666 CALL SO_SYM_DENS(DENSIT,WORK,ISYDEN) 1667 1668 ! Position of (alpha =< beta | gamma; delta ) block for this 1669 ! symmetry of gamma 1670 KGAM = IDSAOG(ISYMG,ISYDIS) + 1 1671 ! Position of the delta'th colunm in output 1672 KOUT = IAODIS(ISYMG,ISYMD) + 1673 & NBAS(ISYMG)*(IDEL-IBAS(ISYMD)-1) + 1 1674 NAB = MAX( 1, NNBST(ISYDEN) ) 1675 1676 CALL DGEMV('T',NNBST(ISYDEN),NUMG,TWO,XINT(KGAM),NAB, 1677 & WORK,1,ONE,FOCK(KOUT),1) 1678 END IF 1679 1680 ! Do the term also appearing in the triplet case 1681 CALL SO_AOFOCK3(XINT,DENSIT,FOCK,IDEL,ISYMD,ISYDEN) 1682 1683 1684 END SUBROUTINE 1685 1686 END SUBROUTINE 1687 1688C 1689#ifdef VAR_MPI 1690C 1691C 1692 subroutine dynloadbal_parsoppa( localIndices, maxnumjobs, 1693 & timings, ltimings, 1694 & sortedindices, numassignjobs, 1695 & sumofwork) 1696C Dynamic Load Balancing for the parallel SOPPA calculations (and parallel RPA). 1697C The routine assumed that the MPI processes that does the load balancing is the master. If any other routine enters, there will be issues with the updated ILLL indices in the AssignedIndices array. Right now, any slave that enters is immediately evicted from the routine. 1698C 1699C The routine takes an array of timings as input. The timings are creatd on the fly by every parallel process and the index corresponds to the ILLL index in the parallel calculation. The time is given in integers from calling system_clock. The timings are not given in any human-readable form. This choice was made so that one can reuse the subroutine getallocsize, which relies on an integer input array. 1700C The work is resorted once the actual time associated with an ILLL index is known and the indices are rebalanced once more and sent to all slaves. 1701C The improvement over the presorted balancing scheme is expected to be very small, but gets better the larger the basis set. 1702C 1703C F.Beyer Oct. 2014. 1704 1705 use so_parutils, only : soppa_comm_active, soppa_num_active, 1706 & soppa_nint, my_mpi_integer, sop_master 1707 1708 use so_info, only: sop_dp 1709 1710 implicit none 1711#include "mpif.h" 1712#include "maxorb.h" 1713#include "distcl.h" 1714#include "priunit.h" 1715 1716 ! Dummy parameters 1717 double precision, dimension(ltimings), intent(inout) :: timings 1718 integer, intent(inout) :: localIndices(maxnumjobs) 1719 integer :: sortedindices(maxnumjobs,soppa_num_active) 1720 integer :: numassignjobs(soppa_num_active) 1721 real(sop_dp) :: sumofwork(soppa_num_active) 1722 integer :: ltimings, maxnumjobs 1723 1724 ! Bookkeeping 1725 integer :: maxrows, maxcols, numrecipients 1726 integer :: getnumjobs, targetID, myid 1727 integer :: colindex, rowindex, assignILLL, col,i 1728 integer :: ntot 1729 1730 integer(mpi_integer_kind) :: ierr_mpi, maxnum_mpi 1731 1732 ntot = soppa_nint 1733 ! In case the amount of work is smaller than 1734 ! the number of MPI processes... 1735 maxcols = soppa_num_active 1736 1737 ! Explicitly set the maximum number of jobs a single MPI process can be allocated. 1738 maxrows = maxnumjobs 1739 1740 sortedindices(:,:) = 0 1741 numassignjobs(:) = 0 1742 sumofwork(:) = 0.0D0 1743 1744 1745 DO i=1, ntot 1746 ! Find the largest chunk of available work 1747 assignILLL = maxloc(timings,dim=1, mask=timings.ge.0.0D0) 1748 1749 ! Find the laziest slave 1750 colindex = minloc(sumofwork, dim=1) 1751 1752 ! Write the ILLL index to the correct row in the sortedmatrix 1753 numassignjobs(colindex) = numassignjobs(colindex) + 1 1754 rowindex = numassignjobs(colindex) 1755 sortedindices(rowindex, colindex) = assignILLL 1756 1757 ! Update the slave's expected work/walltime and 1758 ! remove the work from the timings array 1759 sumofwork(colindex) = sumofwork(colindex) 1760 & +timings(assignILLL) 1761 timings(assignILLL) = -1.0D0 1762 1763 ENDDO 1764 1765C PRINT HOW WORK IS DISTRIBUTED 1766 write (LUPRI,'(a)') 'AOSOPPA Work distribution' 1767 write (LUPRI,'(a)') 'NODE Expected time ' 1768 do i = 1, maxcols 1769 write (LUPRI, '(i5,f20.5)') i, sumofwork(i) 1770 enddo 1771 ! Send the info to every slave that does computation (some slaves might be stalled in the polling barrier in case there are too many MPI processes compared to the number of tasks). 1772 call mpi_scatter( sortedindices, maxnumjobs, my_mpi_integer, 1773 & localIndices, maxnumjobs, my_mpi_integer, 1774 & sop_master, soppa_comm_active, ierr_mpi) 1775 1776 ! Update the master's own array of ILLL indices 1777 ! should be done by the scatter 1778 1779 return 1780 end subroutine 1781 1782 1783 1784 1785 1786 1787 subroutine presortloadbal_parsoppa(localIndices, maxnumjobs, 1788 & indexb, 1789 & work, lwork) 1790C This subroutine load balances parallel SOPPA/RPA 1791C calculation. 1792C 1793C The routine makes a best guess of loadbalancing by giving 1794C every MPI process an equal number of distributions to handle. 1795C Testing shows this is a better first guess that giving every 1796C MPI process an equal number of ILLL indices to work with 1797C (since there is a different number of distributions 1798C associated with every ILLL index). 1799C 1800C F.Beyer Oct. 2014. 1801 1802 use so_parutils, only: soppa_comm_active, soppa_nint, sop_master, 1803 & soppa_num_active, my_mpi_integer 1804 1805 implicit none 1806C#include "implicit.h" 1807#include "priunit.h" 1808#include "mpif.h" 1809#include "maxorb.h" 1810#include "iratdef.h" 1811C fetch HERDIR 1812#include "ccsdinp.h" 1813C Dummy parameters 1814 integer, intent(in) :: lwork, maxnumjobs 1815 ! Declare work as integer, to avoid "irats" later 1816 integer :: work(irat*lwork) !intent in 1817 integer :: indexb(*) 1818 integer, dimension(maxnumjobs), intent(out) :: localIndices 1819 1820C Pre-sorting load balancing variables 1821 integer :: getnumjobs, getindices, numprocs 1822 INTEGER(MPI_INTEGER_KIND) :: ierr_mpi 1823C integer :: presortarray, finalsorted 1824 integer :: col, ntot 1825 integer :: kpresortarray, kend, kfinalsorted, ktmp 1826 1827 ntot = soppa_nint 1828 1829 ! numprocs = soppa_num_active !! not consistent with other use 1830! call mpi_comm_size(soppa_comm_active, numprocs, ierr_mpi) 1831 numprocs = soppa_num_active 1832 1833 1834 IF (.NOT. HERDIR) THEN 1835C ERI code 1836 1837C FIND THE AMOUNT OF WORK ASSOCIATED WITH EVERY AO INDEX 1838 kpresortarray = 1 1839 kend = kpresortarray + 2* ntot 1840 call presortaodist(ntot, indexb, work(kpresortarray) ) 1841 1842 1843C CREATE THE FINAL MATRIX OF PRE-SORTED AO INDICES 1844 kfinalsorted = kend 1845 ktmp = kfinalsorted + maxnumjobs * numprocs 1846 kend = ktmp + 2*numprocs 1847 if( kend .gt. lwork ) then 1848 call quit('Insufficient memory in presorted loadbalancer!') 1849 endif 1850 call partitionAOindices(ntot, maxnumjobs, numprocs, 1851 & work(kpresortarray), work(kfinalsorted), 1852 & work(ktmp) ) 1853 else 1854C HERDIR code 1855 kfinalsorted = 1 1856 kend = kfinalsorted + maxnumjobs * numprocs 1857 if( kend .gt. lwork ) then 1858 call quit('Insufficient memory in presorted loadbalancer!') 1859 endif 1860 call herdir_presort( work(kfinalsorted), maxnumjobs) 1861 endif 1862 1863 1864C TRANSFER AO INDICES TO SLAVES 1865C Use scatter 1866 call mpi_scatter( work(kfinalsorted), maxnumjobs, my_mpi_integer, 1867 & localIndices, maxnumjobs, my_mpi_integer, 1868 & sop_master, soppa_comm_active, ierr_mpi ) 1869 1870 return 1871 end subroutine 1872 1873 1874 subroutine herdir_presort ( sorted, maxnumjobs ) 1875C This is a subroutine for doing the initial distribution of the 1876C AO-indices in a parallel SOPPA calculation. 1877C This subroutine works with Hermite (HERDIR) integral generator, 1878C as the code by F. Beyer only supports using ERI 1879C 1880 use so_parutils, only: soppa_num_active, soppa_nint 1881 implicit none 1882 integer, intent(out) :: sorted( maxnumjobs, soppa_num_active ) 1883 integer, intent(in) :: maxnumjobs 1884 integer :: intnum, inext, icol, inum 1885 1886 sorted = 0 1887 inext = 0 1888 icol = 1 1889C For now just do it the stupid way ( round robin ) 1890 1891C initial implementation, is this what chokes ifort? 1892C do intnum = 1, soppa_nint 1893C inext = inext + 1 1894C if ( inext .gt. soppa_num_active ) then 1895C inext = 1 1896C icol = icol + 1 1897C endif 1898C sorted ( icol, inext ) = intnum 1899C enddo 1900 1901C Distribute first an even number 1902 inum = soppa_nint/soppa_num_active 1903 do icol = 1, soppa_num_active 1904 do inext = 1, inum 1905 sorted( inext, icol ) = inext + inum*(icol-1) 1906 enddo 1907 enddo 1908C Distribute the remainder 1909 do icol = 1, mod(soppa_nint, soppa_num_active) 1910 sorted ( inum+1, soppa_num_active-icol+1) = 1911 & inum*soppa_num_active + icol 1912 enddo 1913C Debug print 1914 do inext = 1, soppa_num_active 1915 print '(10i3)', sorted(:,inext) 1916 enddo 1917 return 1918 end subroutine 1919 1920 1921 1922 subroutine pollingbarrier(pollinginterval) 1923C This subroutine is implemented in case there is ever a need to 1924C remove certain processes from a calculation. 1925C 1926C The processes will repeatedly poll until they receive a non-blocking 1927C send from the master with a logical value equal to .true. 1928C 1929C F.Beyer Oct. 2014 1930 1931#ifdef VAR_MPI 1932 use so_parutils, only: my_mpi_logical 1933#endif 1934 1935#include "implicit.h" 1936#include "mpif.h" 1937 1938 integer, dimension(MPI_STATUS_SIZE) :: mpistatus 1939 integer, intent(in) :: pollinginterval !in milliseconds 1940 logical :: flag, exitbarrier 1941 volatile :: exitbarrier 1942 INTEGER(MPI_INTEGER_KIND) :: ierr_mpi, myid, request 1943 1944 call mpi_comm_rank(mpi_comm_world, myid, ierr_mpi) 1945 print *, 'I am in the polling barrier ', myid 1946 1947 ! Initiate the polling variables and ask for the non-blocking update 1948 exitbarrier = .false. 1949 call mpi_irecv(exitbarrier, 1, my_mpi_logical, 0, myid, 1950 & mpi_comm_world, request, ierr_mpi) 1951 1952 ! Polling barrier, the process will cycle repeatedly until released. 1953130 continue 1954#ifdef VAR_IFORT 1955 call sleepqq(pollinginterval) 1956#endif 1957 call mpi_test(request, flag, mpistatus, ierr_mpi) 1958 if (.not. flag) goto 130 1959 ! Warning, seems that we'll go into an infinite loop, 1960 ! if exitbarrier is ever sent as .false. 1961 if (.not.exitbarrier) then 1962 goto 130 ! Cycle to the top of the barrier. 1963 else 1964 return 1965 endif 1966 1967 return 1968 end subroutine 1969 1970C /* deck presortaodist*/ 1971 subroutine presortaodist(Nindex, indxbt, outlist) 1972C A subroutine associated with the atomic integral parallel RPA/SOPPA calculations. 1973C Pre-calculate IDEL2 indexes before starting a parallel calculation. 1974C In other words, this subroutine calculates how many distributions are associated with the calculation. 1975C 1976C This routine assembles a matrix that counts the number of AOs 1977C associated with an ILLL distribution index. This array is used by getallocsize 1978C and partitionAOindices to pre-sort the integrals that need to be done 1979C 1980C The first row in outlist is the number of distributions 1981C The second row in outlist is the associated ILLL index 1982#include "implicit.h" 1983#include "priunit.h" 1984#include "maxaqn.h" 1985#include "maxorb.h" 1986#include "mxcent.h" 1987#include "eridst.h" 1988 1989 integer, intent(in) :: Nindex 1990 integer, dimension(*), intent(in) :: indxbt 1991 integer, dimension(2, Nindex), intent(out):: outlist 1992 integer :: i 1993 1994 do i=1, Nindex 1995 call getdst(i, 0, 0) 1996 call pickao(0) 1997 call eridsi(indxbt, 0) 1998 outlist(1, i) = ndistr 1999 outlist(2, i) = i 2000 enddo 2001 2002 return 2003 2004 end subroutine 2005 2006 2007C /* deck getallocsize */ 2008 SUBROUTINE getallocsize(ntot, originalsort, maxnumjobs) 2009C A subroutine associated with the atomic integral parallel RPA/SOPPA calculations. 2010C 2011C This subroutine is used to get the amount of storage that needs 2012C to be allocated for a parallel SO_ERES run. 2013C 2014C The subroutine calculates which process will be assigned the 2015C most single jobs (not the largest total amount of work) for a 2016C parallel RPA/SOPPA calculation. Its output is used to allocate 2017C the right amount of storage by the master when it starts pre-sorting 2018C the integrals for the parallel calculation of the E matrix. 2019C 2020 2021#include "implicit.h" 2022#include "mpif.h" 2023 integer, intent(in) :: ntot 2024 integer, dimension(2, ntot), intent(in) :: originalsort 2025 integer, intent(out) :: maxnumjobs 2026 2027 integer, dimension(:,:), allocatable :: copysort 2028 integer, dimension(:,:), allocatable :: sumofwork 2029 integer, dimension(2) :: temploc, tempwork, tempout 2030 integer :: allocstatus, deallocstatus, numprocs 2031 INTEGER(MPI_INTEGER_KIND) :: ierr_mpi, numprocs_mpi 2032 2033 2034 2035 call mpi_comm_size(mpi_comm_world, numprocs_mpi, ierr_mpi) 2036 numprocs = numprocs_mpi 2037 allocate( copysort(2, ntot), sumofwork(2, numprocs) 2038 & ,stat=allocstatus) 2039 if(.not.(allocstatus.eq.0) ) then 2040 call quit('Allocation error in GETALLOCSIZE') 2041 endif 2042 2043 !call izero(sumofwork, (2*numprocs) ) 2044 sumofwork = 0 2045 copysort = originalsort 2046 2047 DO i=1, ntot 2048 ! Find location of largest chunk of work and the work itself 2049 temploc = maxloc(copysort, DIM=2, mask=copysort.gt.0) 2050 addwork = copysort(1, temploc(1) ) 2051 copysort( 1,temploc(1) ) = 0 2052 2053 ! Find laziest slave and simulate the workload on the slave 2054 tempwork = minloc(sumofwork, DIM=2) 2055 sumofwork(1, tempwork(1)) = sumofwork(1, tempwork(1)) + addwork! adding total number of distributions 2056 sumofwork(2, tempwork(1)) = sumofwork(2, tempwork(1)) + 1 !adding total number of assigned indexes 2057 ENDDO 2058 2059 tempout = maxloc(sumofwork, dim=2) 2060 maxnumjobs = sumofwork( 2,tempout(2) ) 2061 2062 deallocate(copysort, sumofwork, stat=deallocstatus) 2063 if(.not.(deallocstatus.eq.0) ) then 2064 call quit('Deallocation error in GETALLOCSIZE') 2065 endif 2066 2067 return 2068 2069 END SUBROUTINE 2070 2071 2072C /* deck partitionAOindices */ 2073 SUBROUTINE partitionAOindices(ntot, rows, cols, presortedarray, 2074 & sorted, sumofwork) 2075C A subroutine associated with the atomic integral parallel RPA/SOPPA calculations. 2076C 2077C The output from this routine is the array 'sorted'. 2078C The sorted matrix contains AO integral indexes for two-electron integrals. 2079C Every column contains a list of ILLL indexes that are to be assigned as work 2080C to a single process. The total amount of work per column is estimated based 2081C on the number of distributions that are related to the ILLL indexes in the column. 2082C 2083C This pre-sorting of ILLL indexes for a parallel calculation approximates an 2084C even distribution of total work for all processes when performing two-electron 2085C integrals in parallel for AOSOPPA and AORPA. 2086C 2087 !use mpi 2088 implicit none 2089#include "mpif.h" 2090 2091 integer, intent(in) :: ntot, rows, cols 2092 integer, dimension(2, ntot), intent(inout) :: presortedarray 2093 integer, dimension(rows, cols), intent(out) :: sorted 2094 2095 integer, dimension(2, cols), intent(out) :: sumofwork 2096 integer, dimension(2) :: tempavail, templazy 2097 integer :: numprocs, availloc, targetrow 2098 integer :: allocstatus, deallocstatus 2099 integer :: lazyloc, aoindex, i, numdists 2100 2101 numprocs = cols 2102 2103 !call izero(sorted, (rows*cols) ) 2104 !call izero(sumofwork, (2*numprocs) ) 2105 sorted = 0 2106 sumofwork = 0 2107 2108 DO i=1, ntot 2109C FIND LARGEST CHUNK OF AVAILABLE WORK AND ITS INDEX 2110 tempavail = maxloc(presortedarray, DIM=2, 2111 & mask=presortedarray.gt.0) 2112 availloc = tempavail(1) 2113 numdists = presortedarray(1, availloc) ! amount of available work 2114 aoindex = presortedarray(2, availloc) ! the index to be passed to process 2115 2116 presortedarray(1, availloc) = 0 2117 presortedarray(2, availloc) = 0 2118 2119C FIND THE LAZIEST PROCESS, GIVE IT WORK & INCREMENT THE ROW COUNTER 2120 templazy = minloc(sumofwork, DIM=2) 2121 lazyloc = templazy(1) ! This is equal to: MYID+1 2122 sumofwork( 1,lazyloc ) = sumofwork( 1,lazyloc ) + numdists 2123 sumofwork( 2,lazyloc ) = sumofwork( 2,lazyloc ) + 1 2124 targetrow = sumofwork( 2,lazyloc ) 2125 2126C ADD AO-INDEX TO FIRST AVAILABLE ROW IN THE LAZY SLAVE'S COLUMN 2127 sorted( targetrow, lazyloc ) = aoindex 2128 ENDDO 2129 2130 return 2131 2132 END SUBROUTINE 2133 2134 2135#endif 2136!VAR_MPI at the beginning of dynloadbal_parsoppa 2137