1c 2c This routine returns the Coulomb and exchange integral 3c operator matrices for the range of MO-indices as mo_indx_hi, mo_indx_lo 4c The g_coul, g_exch global arrays are ordered as 5c 6c 7c 8c ij 9c (ij|ab) = ( J ) = g_coul[ ij : (a-1)*N2 + b ] = g_coul [ ij : (b-1)*N2 + a ] 10c ab 11c 12c ij 13c (ia|jb) = ( K ) = g_exch[ ij : (a-1)*N2 + b ] 14c ab 15c 16c 17c 18c 19c 20c Note: if number MO-indices in the 2nd pair is equal to Nbf 21c then the half-transformed arrays will be aliased to the 22c passed globals, g_coul and g_exch. 23c Otherwise, temporary storage for the half-transformed 24c integrals will be allocated and destroyed. 25c 26c 27c 28 subroutine moints_build_JK( rtdb, basis, 29 $ mo1_lo, mo1_hi, 30 $ mo2_lo, mo2_hi, 31 $ g_movecs, 32 $ g_coul, ocoul, 33 $ g_exch, oexch, 34 $ oprintstats, 35 $ chunk_factor ) 36 implicit none 37#include "global.fh" 38#include "tcgmsg.fh" 39#include "bas.fh" 40#include "rtdb.fh" 41#include "context.fh" 42#include "mafdecls.fh" 43c 44c 45c 46 integer rtdb, basis ! [input] Handles to environments 47 integer mo1_lo, mo1_hi ! [input] Range of MO indices to transform (1st pair) 48 integer mo2_lo, mo2_hi ! [input] Range of MO indices to transform (2nd pair) 49 integer g_movecs ! [input] MO vectors 50 integer g_coul ! [output] Coulomb operator 51 integer g_exch ! [output] exchange operator 52 logical ocoul, oexch ! [input] Type selection 53 logical oprintstats ! [input] Print flag (ignored) 54 double precision chunk_factor 55c 56c Local 57c 58 integer nbf,noper,my_id,n2,noper_t,n2n2,jlo,jhi 59#ifdef OLD_ALGORITHM 60 integer g_jhalf, g_khalf 61#endif 62 logical status, oalias 63 double precision ttotal 64c 65 integer ga_create_moblocked 66 logical ga_check_moblocked 67 external ga_create_moblocked,ga_check_moblocked 68c 69c 70c 71 if ((.not.(ocoul)).and.(.not.(oexch))) return 72 if (.not. context_push('moints_build_JK')) 73 $ call errquit('moints_build_JK: context_push failed',0) 74 ttotal = tcgtime() 75 noper = mo1_hi - mo1_lo + 1 76 noper_t = (noper*(noper+1))/2 77 n2 = mo2_hi - mo2_lo + 1 78 n2n2 = n2*n2 79 if (.not.bas_numbf(basis,nbf)) 80 $ call errquit('moints_build_JK: cannot get basis info',0) 81 oalias = ((mo2_hi.eq.nbf).and.(mo2_lo.eq.1)) 82 my_id = ga_nodeid() 83c 84c Check dimension and distribution of passed global arrays 85c 86 if ((ocoul).and.(.not. 87 $ ga_check_moblocked(g_coul,noper,n2,jlo,jhi))) 88 $ call errquit( 89 $ 'moint_build_JK: wrong dimensions/distrib for Coul',my_id) 90 if ((oexch).and.(.not. 91 $ ga_check_moblocked(g_exch,noper,n2,jlo,jhi))) 92 $ call errquit( 93 $ 'moint_build_JK: wrong dimensions/distrib. for exchange',0) 94#ifndef OLD_ALGORITHM 95 call moints_build(basis, 96 $ mo1_lo, mo1_hi, 97 $ mo2_lo, mo2_hi, 98 $ g_movecs, 99 $ g_coul, ocoul, 100 $ g_exch, oexch, 101 $ oprintstats, 102 $ chunk_factor) 103#else 104c 105c Create temporary globals or alias if possible 106c 107 if (oalias) then 108 if (ocoul) g_jhalf = g_coul 109 if (oexch) g_khalf = g_exch 110 else 111 if (ocoul) g_jhalf = 112 $ ga_create_moblocked(noper,nbf,'J half-transformed') 113 if (oexch) g_khalf = 114 $ ga_create_moblocked(noper,nbf,'K half-transformed') 115 endif 116c 117c 1st half transformation 118c 119#ifdef WHITESIDE 120 call moints_1st_half_w(basis,mo1_lo,mo1_hi,g_movecs, 121 $ g_jhalf,ocoul,g_khalf,oexch) 122#else 123 call moints_1st_half(basis,mo1_lo,mo1_hi,g_movecs, 124 $ g_jhalf,ocoul,g_khalf,oexch) 125#endif 126c 127c 2nd half transformation 128c 129 call moints_2nd_half(noper,nbf,mo2_lo,mo2_hi,g_movecs, 130 $ g_jhalf,g_coul,ocoul, 131 $ g_khalf,g_exch,oexch) 132c 133c Clean up 134c 135 if (.not.(oalias)) then 136 status = .true. 137 if (ocoul) status = status.and.ga_destroy(g_jhalf) 138 if (oexch) status = status.and.ga_destroy(g_khalf) 139 if (.not.(status)) call 140 $ errquit('moints_build_JK: cannot free temp globals',0) 141 endif 142#endif 143 ttotal = tcgtime() - ttotal 144 if (ga_nodeid().eq.0) then 145 write(6,971) ttotal 146 971 format(/,10x,'Total transformation time:',5x,f10.2) 147 call util_flush(6) 148 endif 149 150 if (.not.context_pop('moints_build_JK')) 151 $ call errquit('moints: context_pop failed',0) 152 return 153 end 154 155 156 157 158 159 160 161 162 163c--------------------------------------------------------------------------------- 164c--------------------------------------------------------------------------------- 165c--------------------------------------------------------------------------------- 166c--------------------------------------------------------------------------------- 167c--------------------------------------------------------------------------------- 168c 169c 170c atw - 7 Sep 94 171c Original crappy algorithm 172c 173c 174c--------------------------------------------------------------------------------- 175c--------------------------------------------------------------------------------- 176c--------------------------------------------------------------------------------- 177c--------------------------------------------------------------------------------- 178c--------------------------------------------------------------------------------- 179 180 181#ifdef OLD_ALGORITHM 182#ifndef WHITESIDE 183 subroutine moints_1st_half(basis,mo_indx_lo,mo_indx_hi, 184 $ g_movecs, 185 $ g_jhalf,ocoul, 186 $ g_khalf,oexch) 187 implicit none 188#include "tcgmsg.fh" 189#include "global.fh" 190#include "mafdecls.fh" 191#include "bas.fh" 192#include "schwarz.fh" 193 integer MOINTS_SCHWARZ_STAT 194 parameter(MOINTS_SCHWARZ_STAT=1921) 195c 196c 197 integer basis ! Basis handle 198 integer mo_indx_lo, mo_indx_hi ! 1st pair of MO indices 199 integer g_movecs ! MO coefficients 200 integer g_jhalf ! Half-transformed Coulomb operator 201 integer g_khalf ! Half-transformed exchange operator 202 logical ocoul,oexch ! Type selection 203c 204c Local 205c 206 integer noper,nbf 207 integer g_mocols 208 integer g_j1idx, g_k1idx 209 integer g_j2idx, g_k2idx 210 integer l_jsssi,k_jsssi,l_tsssi,k_tsssi,l_ksssi,k_ksssi 211 integer l_jssni,k_jssni 212 integer l_iscr,k_iscr,l_eri,k_eri,l_erit,k_erit 213 integer l_mov,k_mov 214 integer mem2,max2e,sssi_block,ssni_block,maxbf_shell 215 integer nsh,ish,jsh,ksh,lsh 216 integer ish_bflo,ish_bfhi,jsh_bflo,jsh_bfhi 217 integer ksh_bflo,ksh_bfhi,lsh_bflo,lsh_bfhi 218 integer ileng,jleng,kleng 219 integer ncol_tij,row_dist,col_dist,nbf_sh_oper 220 integer num_nodes, jk, next 221 double precision tol2e,dschw_done,schw_ratio 222 double precision tt,t1qtr,t2qtr,thalf,txyz,tint 223 logical status 224 data tol2e/1.d-10/ 225c 226c 227c Get general info 228c 229 call ga_sync() 230 thalf = tcgtime() 231 status = bas_numbf(basis,nbf) 232 status = status.and.bas_numcont(basis,nsh) 233 status = status.and.bas_nbf_cn_max(basis,maxbf_shell) 234 if (.not.status) call errquit('moints: cannot get basis info',0) 235 noper = mo_indx_hi - mo_indx_lo + 1 236 if (ga_nodeid().eq.0) then 237 write(6,927) nsh,maxbf_shell 238 927 format(10x,'Number of shells:',19x,i5,/, 239 $ 10x,'Maximum shell length:',15x,i5) 240 call util_flush(6) 241 endif 242c 243c Ints allocation 244c 245 call int_mem_2e4c(max2e, mem2) 246 status = ma_push_get(MT_DBL, max2e, 'moints: buf', l_eri, k_eri) 247 status = status.and.ma_push_get(MT_DBL,max2e,'moints: buf2', 248 $ l_erit,k_erit) 249 status = status.and.ma_push_get(MT_DBL, mem2, 250 $ 'moints: scr', l_iscr, k_iscr) 251c 252c Allocate and get local & global MO coefficients !!!! FIX with ga_patch routines !!!! 253c 254 status = status.and.ma_push_get(MT_DBL,(noper*nbf), 255 $ 'movecs cols',l_mov,k_mov) 256 status = status.and.ga_create(MT_DBL,nbf,noper,'MO cols', 257 $ nbf,1,g_mocols) 258 if (.not.(status)) call errquit('failed allocate MO vectors',0) 259 call ga_get(g_movecs,1,nbf,mo_indx_lo,mo_indx_hi, 260 $ dbl_mb(k_mov),nbf) 261 call ga_copy_patch('n',g_movecs,1,nbf,mo_indx_lo,mo_indx_hi, 262 $ g_mocols,1,nbf,1,noper) 263c 264c Allocate local shell-shell-shell-active blocks 265c 266 sssi_block = noper*maxbf_shell*maxbf_shell*maxbf_shell 267 ssni_block = noper*maxbf_shell*maxbf_shell*nbf 268 status = status.and.ma_push_get(MT_DBL,sssi_block, 269 $ 'J sssi block',l_jsssi,k_jsssi) 270 status = status.and.ma_push_get(MT_DBL,sssi_block, 271 $ 'J Tsssi block',l_tsssi,k_tsssi) 272 status = status.and.ma_push_get(MT_DBL,sssi_block, 273 $ 'K sssi block',l_ksssi,k_ksssi) 274 status = status.and.ma_push_get(MT_DBL,ssni_block, 275 $ 'J ssni block',l_jssni,k_jssni) 276 if (.not.status) call 277 $ errquit('moints: cannot allocate local memory',0) 278c 279c Global arrays for accumulating intermediates 280c 281 nbf_sh_oper = nbf*maxbf_shell*noper 282 col_dist = 1 283 row_dist = 1 284 if (ocoul) then 285 status = status.and.ga_create(MT_DBL,nbf,nbf_sh_oper, 286 $ 'J 1indx',row_dist,col_dist,g_j1idx) 287 status = status.and.ga_create(MT_DBL,nbf_sh_oper,noper, 288 $ 'J 2indx',nbf_sh_oper,col_dist,g_j2idx) 289 endif 290 if (oexch) then 291 status = status.and.ga_create(MT_DBL,nbf,nbf_sh_oper, 292 $ 'K 1indx',row_dist,col_dist,g_k1idx) 293 status = status.and.ga_create(MT_DBL,nbf_sh_oper,noper, 294 $ 'K 2indx',nbf_sh_oper,col_dist,g_k2idx) 295 endif 296 if (.not.(status)) call 297 $ errquit('moints_1st_half: cannot allocate global memory',0) 298c 299c Start 4-fold shell loop 300c 301 t1qtr = 0.d0 302 t2qtr = 0.d0 303 tint = 0.d0 304 dschw_done = 0.d0 305 num_nodes = ga_nnodes() 306 do ish=1,nsh 307 if (.not. bas_cn2bfr(basis,ish,ish_bflo,ish_bfhi)) 308 $ call errquit('mo_ints: bas_cn2bfr',ish) 309 ileng = ish_bfhi - ish_bflo + 1 310 if (ocoul) call ga_zero(g_j1idx) 311 if (oexch) call ga_zero(g_k1idx) 312 call ga_sync() 313 tt = tcgtime() 314 jk = 0 315 next = nxtval(num_nodes) 316 do jsh=1,nsh 317 if (.not. bas_cn2bfr(basis,jsh,jsh_bflo,jsh_bfhi)) 318 $ call errquit('mo_ints: bas_cn2bfr',jsh) 319 jleng = jsh_bfhi - jsh_bflo + 1 320 if (schwarz_shell(ish,jsh)*schwarz_max().ge.tol2e) then 321 do ksh=1,nsh 322c 323c Parallelize over JK index here. 324c 325 if (jk.eq.next) then 326 if (.not. bas_cn2bfr(basis,ksh,ksh_bflo,ksh_bfhi)) 327 $ call errquit('mo_ints: bas_cn2bfr',ksh) 328 kleng = ksh_bfhi - ksh_bflo + 1 329 call dfill(sssi_block,0.d0,dbl_mb(k_jsssi),1) 330 call dfill(ssni_block,0.d0,dbl_mb(k_jssni),1) 331#ifdef PERMUT_SYMM 332 do lsh=1,ksh 333#else 334 do lsh=1,nsh 335#endif 336 if (schwarz_shell(ish,jsh)*schwarz_shell(ksh,lsh) 337 $ .ge.tol2e) then 338 call dfill(sssi_block,0.d0,dbl_mb(k_tsssi),1) 339 dschw_done = dschw_done + 1.d0 340 if (.not. bas_cn2bfr(basis,lsh,lsh_bflo,lsh_bfhi)) 341 $ call errquit('mo_ints: bas_cn2bfr',lsh) 342 txyz = tcgtime() 343 call int_2e4c(basis, ish, jsh, basis, ksh, lsh, 344 $ mem2, dbl_mb(k_iscr), max2e, dbl_mb(k_eri)) 345 tint = tint + tcgtime() - txyz 346#ifndef MOINTS_INTS_DUMMY 347c 348c Transform 1st index for range lsh_bflo<->lsh_bfhi 349c 350 call moints_1idx_trf( ish_bflo, ish_bfhi, 351 $ jsh_bflo, jsh_bfhi, 352 $ ksh_bflo, ksh_bfhi, 353 $ lsh_bflo, lsh_bfhi, 354 $ dbl_mb(k_eri), 355 $ dbl_mb(k_mov), nbf, 356 $ mo_indx_lo, mo_indx_hi, 357 $ dbl_mb(k_jsssi) ) 358#ifdef PERMUT_SYMM 359c 360c Transform 1st index for range ksh_bflo<->ksh_bfhi 361c 362 if (lsh.ne.ksh) then 363 call eri_tr( ish_bflo, ish_bfhi, 364 $ jsh_bflo, jsh_bfhi, 365 $ ksh_bflo, ksh_bfhi, 366 $ lsh_bflo, lsh_bfhi, 367 $ dbl_mb(k_eri), 368 $ dbl_mb(k_erit)) 369 370 call moints_1idx_trf( ish_bflo, ish_bfhi, 371 $ jsh_bflo, jsh_bfhi, 372 $ lsh_bflo, lsh_bfhi, 373 $ ksh_bflo, ksh_bfhi, 374 $ dbl_mb(k_erit), 375 $ dbl_mb(k_mov), nbf, 376 $ mo_indx_lo, mo_indx_hi, 377 $ dbl_mb(k_tsssi) ) 378 379 call moints_1idx_move( nbf, ish_bflo, ish_bfhi, 380 $ jsh_bflo, jsh_bfhi, 381 $ lsh_bflo, lsh_bfhi, 382 $ mo_indx_lo, mo_indx_hi, 383 $ dbl_mb(k_tsssi), 384 $ dbl_mb(k_jssni) ) 385 endif 386#endif 387 endif 388#endif 389 enddo 390 391c 392c Push into global memory 393c 394#ifndef MOINTS_INTS_DUMMY 395 if (oexch) then 396 call moints_ktransp(kleng,jleng,ileng,noper, 397 $ dbl_mb(k_jsssi),dbl_mb(k_ksssi)) 398 call moints_push_1idx( nbf, ish_bflo, ish_bfhi, 399 $ ksh_bflo, ksh_bfhi, 400 $ jsh_bflo, jsh_bfhi, 401 $ mo_indx_lo, mo_indx_hi, 402 $ dbl_mb(k_ksssi), g_k1idx ) 403#ifdef PERMUT_SYMM 404 call moints_push_1idx_b( nbf, ish_bflo, ish_bfhi, 405 $ ksh_bflo, ksh_bfhi, 406 $ mo_indx_lo, mo_indx_hi, 407 $ dbl_mb(k_jssni), g_k1idx ) 408#endif 409 endif 410 if (ocoul) then 411 call moints_push_1idx( nbf, ish_bflo, ish_bfhi, 412 $ jsh_bflo, jsh_bfhi, 413 $ ksh_bflo, ksh_bfhi, 414 $ mo_indx_lo, mo_indx_hi, 415 $ dbl_mb(k_jsssi), g_j1idx ) 416#ifdef PERMUT_SYMM 417 call moints_push_1idx_a( nbf, ish_bflo, ish_bfhi, 418 $ jsh_bflo, jsh_bfhi, 419 $ mo_indx_lo, mo_indx_hi, 420 $ dbl_mb(k_jssni), g_j1idx ) 421#endif 422 endif 423#endif 424 next = nxtval(num_nodes) 425 endif 426 jk = jk + 1 427 428c 429c End ksh-loop & jsh-loop 430c 431 enddo 432 endif 433 enddo 434 next = nxtval(-num_nodes) 435 t1qtr = t1qtr + tcgtime() - tt 436#ifndef MOINTS_INTS_DUMMY 437c 438c 2nd Index (global) transformation 439c Push half-transformed integrals (for i-shell) into global 440c operator matrices 441c 442 tt = tcgtime() 443 ncol_tij = noper*ileng*nbf 444 if (ocoul) then 445 call ga_dgemm('t','n',ncol_tij,noper,nbf,1.0d0,g_j1idx, 446 $ g_mocols,0.d0,g_j2idx) 447 call moints_push_halftr(nbf,ish_bflo,ish_bfhi,noper, 448 $ g_j2idx,g_jhalf) 449 endif 450 if (oexch) then 451 call ga_dgemm('t','n',ncol_tij,noper,nbf,1.0d0,g_k1idx, 452 $ g_mocols,0.d0,g_k2idx) 453 call moints_push_halftr(nbf,ish_bflo,ish_bfhi,noper, 454 $ g_k2idx,g_khalf) 455 endif 456 t2qtr = t2qtr + tcgtime() - tt 457#endif 458c 459c End loop over ish 460c 461 enddo 462 call ga_sync() 463 call ga_dgop(MOINTS_SCHWARZ_STAT,dschw_done,1,'+') 464#ifdef PERMUT_SYMM 465 schw_ratio = dschw_done/(nsh*nsh*((nsh*(nsh+1))/2))*100.d0 466#else 467 schw_ratio = dschw_done/(nsh*nsh*nsh*nsh)*100.d0 468#endif 469 thalf = tcgtime() - thalf 470 if (ga_nodeid().eq.0) then 471 write(6,986) schw_ratio,t1qtr,t2qtr,tint,thalf 472 986 format(10x,'Shell-quartets percentage:',4x,f10.1,'%',//, 473 $ 10x,'1st quarter time:',14x,f10.2,/, 474 $ 10x,'2nd quarter time:',14x,f10.2,/, 475 $ 10x,'Integrals only time:',11x,f10.2,/, 476 $ 10x,'Total half time:',15x,f10.2,/) 477 call util_flush(6) 478 endif 479#ifdef DEBUG_PRINT 480C if (ocoul) call moints_print_opermatrix(noper,nbf,g_jhalf) 481C if (oexch) call moints_print_opermatrix(noper,nbf,g_khalf) 482#endif 483c 484c 485c 486c Clean up 487c 488 status = ga_destroy(g_mocols) 489 if (oexch) then 490 status = status.and.ga_destroy(g_k2idx) 491 status = status.and.ga_destroy(g_k1idx) 492 endif 493 if (ocoul) then 494 status = status.and.ga_destroy(g_j2idx) 495 status = status.and.ga_destroy(g_j1idx) 496 endif 497 if (.not.(status)) 498 $ call errquit('moints_1st_half: cannot release globals',0) 499c 500 if (.not. ma_pop_stack(l_jssni)) 501 $ call errquit('moints: failed to pop temp transf', l_jssni) 502 if (.not. ma_pop_stack(l_ksssi)) 503 $ call errquit('moints: failed to pop temp transf', l_ksssi) 504 if (.not. ma_pop_stack(l_tsssi)) 505 $ call errquit('moints: failed to pop temp transf', l_tsssi) 506 if (.not. ma_pop_stack(l_jsssi)) 507 $ call errquit('moints: failed to pop temp transf', l_jsssi) 508 if (.not. ma_pop_stack(l_mov)) 509 $ call errquit('moints: failed to pop movectors', l_mov) 510 if (.not. ma_pop_stack(l_iscr)) 511 $ call errquit('moints: failed to pop int scratch', l_iscr) 512 if (.not. ma_pop_stack(l_erit)) 513 $ call errquit('moints: failed to pop int buffer', l_erit) 514 if (.not. ma_pop_stack(l_eri)) 515 $ call errquit('moints: failed to pop int buffer', l_eri) 516 517 return 518 end 519 520#endif /* !WHITESIDE */ 521 522 523 524 525 526 527 528 529c 530c Performs straightforward two-index transformation on 531c half-transformed integrals. 532c 533c This routines permits the aliasing of 534c g_jhalf -> g_coul 535c g_khalf -> g_exch 536c Provided the distributions and array sizes are 537c compatible. 538c 539c 540c 541 subroutine moints_2nd_half(noper,nbf,mo_lo,mo_hi,g_movecs, 542 $ g_jhalf,g_coul,ocoul, 543 $ g_khalf,g_exch,oexch) 544 implicit none 545#include "global.fh" 546#include "mafdecls.fh" 547#include "tcgmsg.fh" 548 integer MOINTS_J_SUM, MOINTS_K_SUM 549 parameter(MOINTS_J_SUM=1973,MOINTS_K_SUM=1974) 550 integer noper 551 integer nbf 552 integer mo_lo,mo_hi 553 integer g_movecs 554 integer g_jhalf 555 integer g_khalf 556 integer g_coul 557 integer g_exch 558 logical ocoul, oexch 559c 560c Local 561c 562 integer l_tmp, k_tmp, l_mov, k_mov 563 integer ij, jnonzero, knonzero, my_id, jlo, jhi 564 integer k_src, ld_src, k_dest, ld_dest 565 integer nn,n2,n2n2 566 double precision tt,t3qtr,t4qtr,thalf 567 logical status,oalias 568 integer moints_integ_count 569 external moints_integ_count 570 logical ga_check_moblocked 571 external ga_check_moblocked 572c 573c Allocate local scratch space 574c 575 nn = nbf*nbf 576 n2 = mo_hi - mo_lo + 1 577 n2n2 = n2*n2 578 status = ma_push_get(MT_DBL,(n2*nbf),'scratch 1',l_tmp,k_tmp) 579 status = status.and.ma_push_get(MT_DBL,(n2*nbf),'MO vectors', 580 $ l_mov,k_mov) 581 if (.not.(status)) 582 $ call errquit('moints_2nd_half: not enough local memory',0) 583 call ga_get(g_movecs,1,nbf,mo_lo,mo_hi,dbl_mb(k_mov),nbf) 584 jnonzero = 0 585 knonzero = 0 586 my_id = ga_nodeid() 587 thalf = tcgtime() 588 t3qtr = 0.d0 589 t4qtr = 0.d0 590c 591c Coulomb 592c 593 if (ocoul) then 594 oalias = g_jhalf.eq.g_coul 595 if (.not.ga_check_moblocked(g_jhalf,noper,nbf,jlo,jhi)) 596 $ call errquit( 597 $ 'moints_2nd_half: wrong dimensions/distrib half Coul',0) 598 if ((.not.(oalias)).and. 599 $ (.not.ga_check_moblocked(g_coul,noper,n2,jlo,jhi))) 600 $ call errquit( 601 $ 'moints_2nd_half: wrong dimensions/distrib Coul',0) 602 do ij=jlo,jhi 603 tt = tcgtime() 604 call ga_access(g_jhalf,1,nn,ij,ij,k_src,ld_src) 605 call dgemm('n','n',nbf,n2,nbf,1.d0,dbl_mb(k_src),nbf, 606 $ dbl_mb(k_mov),nbf,0.d0,dbl_mb(k_tmp),nbf) 607 t3qtr = t3qtr + tcgtime() - tt 608 tt = tcgtime() 609 if (oalias) then 610 k_dest = k_src 611 else 612 call ga_release(g_jhalf,1,nn,ij,ij) 613 call ga_access(g_coul,1,n2n2,ij,ij,k_dest,ld_dest) 614 endif 615 call dgemm('t','n',n2,n2,nbf,1.d0,dbl_mb(k_mov),nbf, 616 $ dbl_mb(k_tmp),nbf,0.d0,dbl_mb(k_dest),n2) 617 jnonzero = jnonzero + 618 $ moints_integ_count(n2,dbl_mb(k_dest)) 619 call ga_release_update(g_coul,1,n2n2,ij,ij) 620 t4qtr = t4qtr + tcgtime() - tt 621 enddo 622 call ga_sync() 623 endif 624c 625c Exchange 626c 627 if (oexch) then 628 oalias = g_khalf.eq.g_exch 629 if (.not.ga_check_moblocked(g_khalf,noper,nbf,jlo,jhi)) 630 $ call errquit( 631 $ 'moints_2nd_half: wrong dimensions/distrib half exch',0) 632 if ((.not.(oalias)).and. 633 $ (.not.ga_check_moblocked(g_exch,noper,n2,jlo,jhi))) 634 $ call errquit( 635 $ 'moints_2nd_half: wrong dimensions/distrib exch',0) 636 do ij=jlo,jhi 637 tt = tcgtime() 638 call ga_access(g_khalf,1,nn,ij,ij,k_src,ld_src) 639 call dgemm('n','n',nbf,n2,nbf,1.d0,dbl_mb(k_src),nbf, 640 $ dbl_mb(k_mov),nbf,0.d0,dbl_mb(k_tmp),nbf) 641 t3qtr = t3qtr + tcgtime() - tt 642 tt = tcgtime() 643 if (oalias) then 644 k_dest = k_src 645 else 646 call ga_release(g_khalf,1,nn,ij,ij) 647 call ga_access(g_exch,1,n2n2,ij,ij,k_dest,ld_dest) 648 endif 649 call dgemm('t','n',n2,n2,nbf,1.d0,dbl_mb(k_mov),nbf, 650 $ dbl_mb(k_tmp),nbf,0.d0,dbl_mb(k_dest),n2) 651 knonzero = knonzero + 652 $ moints_integ_count(n2,dbl_mb(k_dest)) 653 call ga_release_update(g_exch,1,n2n2,ij,ij) 654 t4qtr = t4qtr + tcgtime() - tt 655 enddo 656 call ga_sync() 657 endif 658c 659c Clean up 660c 661 if (.not. ma_pop_stack(l_mov)) 662 $ call errquit('moints: failed to pop temp transf', l_mov) 663 if (.not. ma_pop_stack(l_tmp)) 664 $ call errquit('moints: failed to pop temp transf', l_tmp) 665 thalf = tcgtime() - thalf 666c 667c Statistics 668c 669 if (ocoul) call ga_igop(MOINTS_J_SUM,jnonzero,1,'+') 670 if (oexch) call ga_igop(MOINTS_K_SUM,knonzero,1,'+') 671 if (ga_nodeid().eq.0) then 672 write(6,973) t3qtr,t4qtr,thalf 673 973 format(10x,'3rd quarter time:',14x,f10.2,/, 674 $ 10x,'4th quarter time:',14x,f10.2,/, 675 $ 10x,'Total half time:',15x,f10.2) 676 call util_flush(6) 677 ij = n2*n2*(noper*(noper+1))/2 678 if ((ocoul).and.(oexch)) ij = ij*2 679 write(6,901) ij 680 if (ocoul) write(6,902) jnonzero 681 if (oexch) write(6,903) knonzero 682 901 format(/,10x,'Total number of integrals:',5x,i10) 683 902 format( 10x,'Non-zero Coulomb integrals:',4x,i10) 684 903 format( 10x,'Non-zero exchange integrals:',3x,i10) 685 endif 686 return 687 end 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708c 709c Auxiliary routines to manage and manipulate memory 710c 711 712 subroutine moints_1idx_trf( ilo, ihi, jlo, jhi, klo, khi, 713 $ llo, lhi, eri, c, nbf, 714 $ mo_lo, mo_hi, x ) 715 implicit none 716 integer nbf 717 integer ilo,ihi,jlo,jhi,klo,khi,llo,lhi,mo_lo,mo_hi 718 double precision eri( llo:lhi, klo:khi, jlo:jhi, ilo:ihi ) 719 double precision c( 1:nbf, mo_lo: mo_hi ) 720 double precision x( klo:khi, jlo:jhi, ilo:ihi, mo_lo:mo_hi ) 721 integer nn,mm,ll 722 723 nn = (ihi-ilo+1)*(jhi-jlo+1)*(khi-klo+1) 724 mm = mo_hi-mo_lo+1 725 ll = lhi - llo + 1 726 call dgemm('t','n',nn,mm,ll,1.d0,eri,ll,c(llo,1),nbf,1.d0,x,nn) 727 return 728 end 729 730 731 732 733 734 735 subroutine moints_push_halftr(nbf,mulo,muhi,noper,g_j2idx, 736 $ g_joper) 737 implicit none 738#include "global.fh" 739#include "mafdecls.fh" 740 integer nbf, mulo, muhi, noper 741 integer g_j2idx, g_joper 742 integer i,j,ij,mu,muleng,nopert 743 integer l_scr, k_scr, k_local, ld_local 744 integer j_off,mu_off,nu_off 745 integer jlo,jhi 746 logical ga_check_moblocked 747 external ga_check_moblocked 748 749 muleng = muhi - mulo + 1 750 nopert = (noper*(noper+1))/2 751 if (.not.ga_check_moblocked(g_joper,noper,nbf,jlo,jhi)) 752 $ call errquit('moints_push_halftr: wrong distrib',0) 753 if (.not.ma_push_get(MT_DBL,nbf,'scr',l_scr,k_scr)) 754 $ call errquit('moints_push_halftr: cannot allocate',0) 755 do i=1,noper 756 do j=1,i 757 ij = (i*(i-1))/2 + j 758 if ((ij.ge.jlo).and.(ij.le.jhi)) then 759 call ga_access(g_joper,1,(nbf*nbf),ij,ij,k_local,ld_local) 760 j_off = (j-1)*muleng 761 do mu=mulo,muhi 762 nu_off = (j_off+(mu-mulo))*nbf 763 mu_off = (mu-1)*nbf 764 call ga_get(g_j2idx,nu_off+1,nu_off+nbf,i,i, 765 $ dbl_mb(k_scr),nbf) 766 call dcopy(nbf,dbl_mb(k_scr),1,dbl_mb(k_local+mu_off),1) 767C call ga_put(g_joper,mu_off+1,mu_off+nbf,ij,ij, 768C $ dbl_mb(k_scr),nbf) 769 enddo 770 call ga_release_update(g_joper,1,(nbf*nbf),ij,ij) 771 endif 772 enddo 773 enddo 774 call ga_sync() 775 if (.not.ma_pop_stack(l_scr)) 776 $ call errquit('moints_push_halftr: pop stack failed',0) 777 return 778 end 779 780 781 782 783 784 subroutine moints_push_1idx( nbf, ilo, ihi, jlo, jhi, klo, khi, 785 $ mo_lo, mo_hi, x, g_j1idx ) 786 implicit none 787#include "global.fh" 788#include "mafdecls.fh" 789 integer nbf,ilo,ihi,jlo,jhi,klo,khi,mo_lo,mo_hi 790 double precision x(klo:khi, jlo:jhi, ilo:ihi, mo_lo:mo_hi) 791 integer g_j1idx 792 integer t, i, j 793 integer ileng, kleng, ti, tij 794 795 ileng = ihi - ilo + 1 796 kleng = khi - klo + 1 797 do t=mo_lo,mo_hi 798 do i=ilo,ihi 799 ti = (t-mo_lo)*ileng + (i-ilo+1) 800 do j=jlo,jhi 801 tij = (ti-1)*nbf + j 802 call ga_acc(g_j1idx,klo,khi,tij,tij,x(klo,j,i,t), 803 $ kleng,1.d0) 804C call ga_put(g_j1idx,klo,khi,tij,tij,x(klo,j,i,t),kleng) 805 enddo 806 enddo 807 enddo 808 end 809 810 811 812 813 814 subroutine moints_ktransp(k,l,m,n,x,y) 815 implicit none 816 integer k,l,m,n 817 double precision x(k,l,m,n),y(l,k,m,n) 818 integer a,b,c,d 819 820 do a=1,n 821 do b=1,m 822 do c=1,k 823 do d=1,l 824 y(d,c,b,a) = x(c,d,b,a) 825 enddo 826 enddo 827 enddo 828 enddo 829 return 830 end 831 832 833 834 835 836 837 838 839 840 841 842 843 integer function moints_integ_count(nbf,xx) 844 implicit none 845 integer nbf 846 double precision xx(nbf,nbf) 847 integer i,j,count 848 849 count = 0 850 do i=1,nbf 851 do j=1,nbf 852 if (abs(xx(j,i)).gt.1.d-12) then 853 count = count + 1 854 endif 855 enddo 856 enddo 857 moints_integ_count = count 858 return 859 end 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 subroutine eri_tr(ilo,ihi,jlo,jhi,klo,khi,llo,lhi,x,y) 878 implicit none 879 integer ilo,ihi,jlo,jhi,klo,khi,llo,lhi 880 double precision x(llo:lhi,klo:khi,jlo:jhi,ilo:ihi) 881 double precision y(klo:khi,llo:lhi,jlo:jhi,ilo:ihi) 882 integer i,j,k,l 883 884 do i=ilo,ihi 885 do j=jlo,jhi 886 do k=klo,khi 887 do l=llo,lhi 888 y(k,l,j,i) = x(l,k,j,i) 889 enddo 890 enddo 891 enddo 892 enddo 893 return 894 end 895 896 897 898 899 900 901 902 subroutine moints_1idx_move(nbf,ilo,ihi,jlo,jhi,llo,lhi,mo_lo, 903 $ mo_hi,x,y) 904 implicit none 905 integer nbf,ilo,ihi,jhi,jlo,llo,lhi,mo_lo,mo_hi 906 double precision x( llo:lhi, jlo:jhi, ilo:ihi, mo_lo:mo_hi) 907 double precision y( nbf, jlo:jhi, ilo:ihi, mo_lo:mo_hi ) 908 integer t,i,j,l 909 910 do t=mo_lo,mo_hi 911 do i=ilo,ihi 912 do j=jlo,jhi 913 do l=llo,lhi 914 y(l,j,i,t) = x(l,j,i,t) 915 enddo 916 enddo 917 enddo 918 enddo 919 return 920 end 921 922 923 924 925 926 927 subroutine moints_push_1idx_a(nbf,ilo,ihi,jlo,jhi, 928 $ mo_lo,mo_hi,x,g_j1idx) 929 implicit none 930 integer nbf,ilo,ihi,jlo,jhi,mo_lo,mo_hi 931 double precision x(nbf,jlo:jhi,ilo:ihi,mo_lo:mo_hi) 932 integer g_j1idx 933 integer ileng 934 integer t,i,j,ti,tij 935 936 ileng = ihi - ilo + 1 937 do t=mo_lo,mo_hi 938 do i=ilo,ihi 939 ti = (t-mo_lo)*ileng + (i-ilo+1) 940 do j=jlo,jhi 941 tij = (ti-1)*nbf + j 942 call ga_acc(g_j1idx,1,nbf,tij,tij,x(1,j,i,t), 943 $ nbf,1.d0) 944 enddo 945 enddo 946 enddo 947 end 948 949 950 951 952 953 subroutine moints_push_1idx_b( nbf,ilo,ihi,jlo,jhi,mo_lo,mo_hi, 954 $ x, g_k1idx ) 955 implicit none 956 integer nbf,ilo,ihi,jlo,jhi,mo_lo,mo_hi 957 double precision x(nbf,jlo:jhi,ilo:ihi,mo_lo:mo_hi) 958 integer g_k1idx 959 integer ileng 960 integer t,i,j,l,ti,til 961 962 ileng = ihi - ilo + 1 963 do t=mo_lo,mo_hi 964 do i=ilo,ihi 965 ti = (t-mo_lo)*ileng + (i-ilo+1) 966 do l=1,nbf 967 til = (ti-1)*nbf + l 968 do j=jlo,jhi 969 call ga_acc(g_k1idx,j,j,til,til,x(l,j,i,t), 970 $ 1,1.d0) 971 enddo 972 enddo 973 enddo 974 enddo 975 return 976 end 977 978#endif 979 980 981 982 983 984 985 986#ifdef DEBUG_PRINT 987 subroutine moints_print_opermatrix(noper,nbf,g_a) 988 implicit none 989#include "global.fh" 990#include "mafdecls.fh" 991 integer noper,nbf 992 integer g_a 993 integer j,k,jk 994 integer my_id, ilo, ihi, jlo, jhi 995 integer k_local, ld_local 996 997 my_id = ga_nodeid() 998 call ga_distribution(g_a,my_id,ilo,ihi,jlo,jhi) 999 do j=1,noper 1000 do k=1,j 1001 jk = (j*(j-1))/2 + k 1002 if ((jk.ge.jlo).and.(jk.le.jhi)) then 1003 write(6,901) j,k 1004 901 format(//,'Operator: [',i2,',',i2,']',/) 1005 print*,my_id,' ',ilo,ihi,' ',jk 1006 call ga_access(g_a,ilo,ihi,jk,jk,k_local,ld_local) 1007 call moints_print_square(nbf,dbl_mb(k_local)) 1008 endif 1009 call ga_sync() 1010 enddo 1011 enddo 1012 1013 return 1014 end 1015 1016 1017 1018 1019 subroutine moints_print_square(n,x) 1020 implicit none 1021 integer n,i,j 1022 double precision x(n,n) 1023 1024 do i=1,n 1025 write(6,901) (x(i,j),j=1,n) 1026 901 format(15f9.4) 1027 enddo 1028 call util_flush(6) 1029 end 1030 1031 1032 1033 1034 1035 1036 subroutine moints_print_1(ilo,ihi,jlo,jhi, 1037 $ klo,khi,mo_lo,mo_hi,x) 1038 implicit none 1039 integer ilo,ihi,jlo,jhi,klo,khi,mo_lo,mo_hi 1040 double precision x(klo:khi,jlo:jhi,ilo:ihi,mo_lo:mo_hi) 1041 integer m,i,j,k 1042 1043 do m=mo_lo,mo_hi 1044 do i=ilo,ihi 1045 write(6,933) ((x(k,j,i,m),k=klo,khi),j=jlo,jhi) 1046 933 format(15f9.4) 1047 enddo 1048 write(6,*) 1049 enddo 1050 1051 return 1052 end 1053 1054#endif 1055 1056 1057 1058 1059 1060 1061#ifdef OLD_ALGORITHM 1062#ifdef WHITESIDE 1063C======================================================================== 1064C======================================================================== 1065C======================================================================== 1066C======================================================================== 1067C======================================================================== 1068C======================================================================== 1069C======================================================================== 1070c 1071c 1072c atw - 8/25/94 1073c 1074c This is an alternate algorithm for the 1075c half-transformed Coulomb operator, (the 1076c obvious algorithm from Whiteside et al). 1077c 1078c 1079C======================================================================== 1080C======================================================================== 1081C======================================================================== 1082C======================================================================== 1083C======================================================================== 1084C======================================================================== 1085C======================================================================== 1086 subroutine moints_1st_half_w(basis,mo_indx_lo,mo_indx_hi, 1087 $ g_movecs, 1088 $ g_jhalf,ocoul, 1089 $ g_khalf,oexch) 1090 implicit none 1091#include "tcgmsg.fh" 1092#include "global.fh" 1093#include "mafdecls.fh" 1094#include "bas.fh" 1095#include "schwarz.fh" 1096 integer MOINTS_SCHWARZ_STAT 1097 parameter(MOINTS_SCHWARZ_STAT=1921) 1098c 1099c 1100 integer basis ! Basis handle 1101 integer mo_indx_lo, mo_indx_hi ! 1st pair of MO indices 1102 integer g_movecs ! MO coefficients 1103 integer g_jhalf ! Half-transformed Coulomb operator 1104 integer g_khalf ! Half-transformed exchange operator (UNUSED !) 1105 logical ocoul,oexch ! Type selection 1106c 1107c Local 1108c 1109 integer noper,nbf 1110 integer ssbb_block, ssba_block, ssaa_block 1111 integer l_jssbb,k_jssbb 1112 integer l_jssba,k_jssba 1113 integer l_jssaa,k_jssaa 1114 integer l_zssaa,k_zssaa, l_yssaa, k_yssaa 1115 integer ssn, ssa 1116 integer l_iscr,k_iscr,l_eri,k_eri 1117 integer l_mov,k_mov 1118 integer mem2,max2e,maxbf_shell 1119 integer nsh,ish,jsh,ksh,lsh 1120 integer ish_bflo,ish_bfhi,jsh_bflo,jsh_bfhi 1121 integer ksh_bflo,ksh_bfhi,lsh_bflo,lsh_bfhi 1122 integer ileng,jleng 1123 integer num_nodes, next, ij 1124 double precision tol2e,dschw_done,schw_ratio 1125 double precision t1qtr,t2qtr,thalf,tzz,tint,txy 1126 logical status 1127 data tol2e/1.d-10/ 1128c 1129c 1130c Get general info 1131c 1132 call ga_sync() 1133 thalf = tcgtime() 1134 status = bas_numbf(basis,nbf) 1135 status = status.and.bas_numcont(basis,nsh) 1136 status = status.and.bas_nbf_cn_max(basis,maxbf_shell) 1137 if (.not.status) call errquit('moints: cannot get basis info',0) 1138 noper = mo_indx_hi - mo_indx_lo + 1 1139 if (ga_nodeid().eq.0) then 1140 write(6,927) nsh,maxbf_shell 1141 927 format(10x,'Number of shells:',19x,i5,/, 1142 $ 10x,'Maximum shell length:',15x,i5) 1143 call util_flush(6) 1144 endif 1145c 1146c Ints allocation 1147c 1148 call int_mem_2e4c(max2e, mem2) 1149 status = ma_push_get(MT_DBL, max2e, 'moints: buf', l_eri, k_eri) 1150 status = status.and.ma_push_get(MT_DBL, mem2, 1151 $ 'moints: scr', l_iscr, k_iscr) 1152c 1153c Allocate and get local 1154c 1155 status = status.and.ma_push_get(MT_DBL,(noper*nbf), 1156 $ 'movecs cols',l_mov,k_mov) 1157 if (.not.(status)) call errquit('failed allocate MO vectors',0) 1158 call ga_get(g_movecs,1,nbf,mo_indx_lo,mo_indx_hi, 1159 $ dbl_mb(k_mov),nbf) 1160c 1161c Allocate local shell-shell-basis-basis blocks 1162c 1163 ssbb_block = maxbf_shell*maxbf_shell*nbf*nbf 1164 ssba_block = maxbf_shell*maxbf_shell*nbf*noper 1165 ssaa_block = maxbf_shell*maxbf_shell*noper*noper 1166 status = status.and.ma_push_get(MT_DBL,ssbb_block, 1167 $ 'J ssbb block',l_jssbb,k_jssbb) 1168 status = status.and.ma_push_get(MT_DBL,ssba_block, 1169 $ 'J ssba block',l_jssba,k_jssba) 1170 status = status.and.ma_push_get(MT_DBL,ssaa_block, 1171 $ 'J ssaa block',l_jssaa,k_jssaa) 1172 status = status.and.ma_push_get(MT_DBL,ssaa_block, 1173 $ 'J transp 1 ssaa block',l_zssaa,k_zssaa) 1174 status = status.and.ma_push_get(MT_DBL,ssaa_block, 1175 $ 'J transp 2 ssaa block',l_yssaa,k_yssaa) 1176 if (.not.status) call 1177 $ errquit('moints: cannot allocate local memory',0) 1178c 1179c Start 4-fold shell loop 1180c 1181 if (ocoul) call ga_zero(g_jhalf) 1182 t1qtr = 0.d0 1183 t2qtr = 0.d0 1184 tint = 0.d0 1185 dschw_done = 0.d0 1186 num_nodes = ga_nnodes() 1187 ij = 0 1188 next = nxtval(num_nodes) 1189c 1190c Four-fold shell loop 1191c 1192 do ish=1,nsh 1193 if (.not. bas_cn2bfr(basis,ish,ish_bflo,ish_bfhi)) 1194 $ call errquit('mo_ints: bas_cn2bfr',ish) 1195 ileng = ish_bfhi - ish_bflo + 1 1196 do jsh=1,ish 1197 if (.not. bas_cn2bfr(basis,jsh,jsh_bflo,jsh_bfhi)) 1198 $ call errquit('mo_ints: bas_cn2bfr',jsh) 1199 jleng = jsh_bfhi - jsh_bflo + 1 1200 ssn = ileng*jleng*nbf 1201 ssa = ileng*jleng*noper 1202c 1203c Parallelize on (ij) 1204c 1205 if (ij.eq.next) then 1206 if (schwarz_shell(ish,jsh)*schwarz_max().ge.tol2e) then 1207 call dfill(ssbb_block,0.d0,dbl_mb(k_jssbb),1) 1208 tzz = tcgtime() 1209 do ksh=1,nsh 1210 if (.not. bas_cn2bfr(basis,ksh,ksh_bflo,ksh_bfhi)) 1211 $ call errquit('mo_ints: bas_cn2bfr',ksh) 1212 do lsh=1,ksh 1213 if (schwarz_shell(ish,jsh)*schwarz_shell(ksh,lsh) 1214 $ .ge.tol2e) then 1215c 1216c Compute shell-quartet integral block 1217c 1218 dschw_done = dschw_done + 1.d0 1219 if (.not. bas_cn2bfr(basis,lsh,lsh_bflo,lsh_bfhi)) 1220 $ call errquit('mo_ints: bas_cn2bfr',lsh) 1221 txy = tcgtime() 1222 call int_2e4c(basis, ish, jsh, basis, ksh, lsh, 1223 $ mem2, dbl_mb(k_iscr), max2e, dbl_mb(k_eri)) 1224 tint = tint + tcgtime() - txy 1225 call moints_push_ints1( ish_bflo, ish_bfhi, 1226 $ jsh_bflo, jsh_bfhi, 1227 $ ksh_bflo, ksh_bfhi, 1228 $ lsh_bflo, lsh_bfhi, 1229 $ dbl_mb(k_eri), 1230 $ nbf, dbl_mb(k_jssbb)) 1231 endif 1232 enddo 1233 enddo 1234c 1235c First half-transformation 1236c 1237 call sgemm('t','n',noper,ssn,nbf,1.d0,dbl_mb(k_mov), 1238 $ nbf,dbl_mb(k_jssbb),nbf,0.d0, 1239 $ dbl_mb(k_jssba),noper) 1240 t1qtr = t1qtr + tcgtime() - tzz 1241 tzz = tcgtime() 1242c 1243c Second half-transformation 1244c 1245 call moints_2ndi_tr( nbf, noper, jleng, ileng, 1246 $ dbl_mb(k_jssba), dbl_mb(k_mov), 1247 $ dbl_mb(k_jssaa)) 1248 call moints_transp_ss( noper, ileng, jleng, 1249 $ dbl_mb(k_jssaa), 1250 $ dbl_mb(k_zssaa), 1251 $ dbl_mb(k_yssaa) ) 1252c 1253c Push half-transformed into global 1254c 1255 call moints_push_ints2( ish_bflo, ish_bfhi, 1256 $ jsh_bflo, jsh_bfhi, 1257 $ nbf, noper, 1258 $ dbl_mb(k_zssaa), 1259 $ dbl_mb(k_yssaa), 1260 $ g_jhalf ) 1261 t2qtr = t2qtr + tcgtime() - tzz 1262 endif 1263 next = nxtval(num_nodes) 1264 endif 1265c 1266c End parallel task 1267c 1268 ij = ij + 1 1269 enddo 1270 enddo 1271 next = nxtval(-num_nodes) 1272 1273 call ga_sync() 1274 call ga_dgop(MOINTS_SCHWARZ_STAT,dschw_done,1,'+') 1275 schw_ratio = dschw_done/(nsh*nsh*nsh*nsh)*100.d0 1276 thalf = tcgtime() - thalf 1277 if (ga_nodeid().eq.0) then 1278 write(6,986) schw_ratio,tint,t1qtr,t2qtr,thalf 1279 986 format(10x,'Shell-quartets percentage:',4x,f10.1,'%',//, 1280 $ 10x,'Integrals time:',16x,f10.2,/, 1281 $ 10x,'1st quarter time:',14x,f10.2,/, 1282 $ 10x,'2nd quarter time:',14x,f10.2,/, 1283 $ 10x,'Total half time:',15x,f10.2,/) 1284 call util_flush(6) 1285 endif 1286c 1287c 1288c 1289c Clean up 1290c 1291c 1292 if (.not. ma_pop_stack(l_yssaa)) 1293 $ call errquit('moints: failed to pop temp transf', l_yssaa) 1294 if (.not. ma_pop_stack(l_zssaa)) 1295 $ call errquit('moints: failed to pop temp transf', l_zssaa) 1296 if (.not. ma_pop_stack(l_jssaa)) 1297 $ call errquit('moints: failed to pop temp transf', l_jssaa) 1298 if (.not. ma_pop_stack(l_jssba)) 1299 $ call errquit('moints: failed to pop temp transf', l_jssba) 1300 if (.not. ma_pop_stack(l_jssbb)) 1301 $ call errquit('moints: failed to pop temp transf', l_jssbb) 1302 if (.not. ma_pop_stack(l_mov)) 1303 $ call errquit('moints: failed to pop movectors', l_mov) 1304 if (.not. ma_pop_stack(l_iscr)) 1305 $ call errquit('moints: failed to pop int scratch', l_iscr) 1306 if (.not. ma_pop_stack(l_eri)) 1307 $ call errquit('moints: failed to pop int buffer', l_eri) 1308 1309 return 1310 end 1311 1312 1313 1314 1315 1316 1317c 1318c ==================================================================== 1319c Auxiliary routines to help manage array 1320c addressing, transposition, etc. 1321c ==================================================================== 1322c 1323c 1324c 1325 1326c 1327c Copy integrals from shell-quartet offset addressing 1328c to absolute AO indices 1329c 1330 subroutine moints_push_ints1( ilo, ihi, jlo, jhi, klo, khi, 1331 $ llo, lhi, eri, nbf, buf ) 1332 implicit none 1333 integer ilo, ihi, jlo, jhi 1334 integer klo, khi, llo, lhi 1335 integer nbf 1336 double precision eri(llo:lhi,klo:khi,jlo:jhi,ilo:ihi) 1337 double precision buf(nbf,nbf,jlo:jhi,ilo:ihi) 1338 integer i,j,k,l 1339 1340 do i=ilo,ihi 1341 do j=jlo,jhi 1342 do k=klo,khi 1343 do l=llo,lhi 1344 buf(l,k,j,i) = eri(l,k,j,i) 1345 buf(k,l,j,i) = eri(l,k,j,i) 1346 enddo 1347 enddo 1348 enddo 1349 enddo 1350 return 1351 end 1352 1353 1354 1355c 1356c Push half-transformed integrals into global 1357c array. Copies strips of length: (jhi-jlo+1) 1358c 1359 subroutine moints_push_ints2( ilo, ihi, jlo, jhi, 1360 $ nbf, noper, x, y, g_half ) 1361 implicit none 1362 integer ilo, ihi, jlo, jhi, nbf, noper 1363 double precision x(jlo:jhi,ilo:ihi,noper,noper) 1364 double precision y(ilo:ihi,jlo:jhi,noper,noper) 1365 integer g_half 1366 integer i,j,t,u,ij1,ij2,tu,ileng,jleng 1367 1368 do t=1,noper 1369 do u=1,t 1370 tu = ((t-1)*t)/2 + u 1371 do i=ilo,ihi 1372 ij1 = (i-1)*nbf+jlo 1373 ij2 = (i-1)*nbf+jhi 1374 jleng = jhi - jlo + 1 1375 call ga_put(g_half,ij1,ij2,tu,tu,x(jlo,i,u,t),jleng) 1376 enddo 1377 do j=jlo,jhi 1378 ij1 = (j-1)*nbf+ilo 1379 ij2 = (j-1)*nbf+ihi 1380 ileng = ihi - ilo + 1 1381 call ga_put(g_half,ij1,ij2,tu,tu,y(ilo,j,u,t),ileng) 1382 enddo 1383 enddo 1384 enddo 1385 return 1386 end 1387 1388 1389 1390 1391 1392c 1393c Loops over (ij)-shells and performs 2nd index 1394c transformation. (This cannot be done as in a 1395c single dgemm) 1396c 1397c 1398 subroutine moints_2ndi_tr( nbf, noper, jleng, ileng, 1399 $ xx, yy, zz ) 1400 implicit none 1401 integer nbf,noper,jleng,ileng 1402 double precision xx(noper,nbf,jleng,ileng) 1403 double precision yy(nbf,noper) 1404 double precision zz(noper,noper,jleng,ileng) 1405 integer i,j 1406 1407 1408 do i=1,ileng 1409 do j=1,jleng 1410 call sgemm('n','n',noper,noper,nbf,1.d0,xx(1,1,j,i), 1411 $ noper,yy,nbf,0.d0,zz(1,1,j,i),noper) 1412 enddo 1413 enddo 1414 return 1415 end 1416 1417 1418 1419 1420 1421 1422c 1423c Transpose half-transformed integrals so AO-indices 1424c varies faster, to accomodate ga_put() using 1425c shell-strips. 1426c 1427 subroutine moints_transp_ss( noper, ileng, jleng, x, y, z) 1428 implicit none 1429 integer noper, ileng, jleng 1430 double precision x(noper,noper,jleng,ileng) 1431 double precision y(jleng,ileng,noper,noper) 1432 double precision z(ileng,jleng,noper,noper) 1433 integer i,j,k,l 1434 1435 do i=1,ileng 1436 do j=1,jleng 1437 do k=1,noper 1438 do l=1,noper 1439 y(j,i,l,k) = x(l,k,j,i) 1440 z(i,j,l,k) = x(l,k,j,i) 1441 enddo 1442 enddo 1443 enddo 1444 enddo 1445 return 1446 end 1447 1448 1449 1450 1451#endif /* WHITESIDE */ 1452#endif /* OLD_ALGORITHM */ 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464C======================================================================== 1465C======================================================================== 1466C======================================================================== 1467C======================================================================== 1468C======================================================================== 1469C======================================================================== 1470C======================================================================== 1471c 1472c 1473c atw - 9/7/94 1474c 1475c This is a variation of Whiteside and 1476c is currently the one implemented. 1477c Uses 4-fold symmetry. 1478c Split-loop: Outer ish loop is split on 1479c for load-balancing 1480c Segmented-task: Tasks do not span across (nsh*(nsh+1))/2 1481c to minimise comms 1482c UseGemm: Use the BLAS calls for transformation steps 1483c (very slow) 1484c 1485c 1486C======================================================================== 1487C======================================================================== 1488C======================================================================== 1489C======================================================================== 1490C======================================================================== 1491C======================================================================== 1492C======================================================================== 1493#ifndef OLD_ALGORITHM 1494 subroutine moints_build(basis, 1495 $ mo1_lo,mo1_hi, 1496 $ mo2_lo,mo2_hi, 1497 $ g_movecs, 1498 $ g_coul,ocoul, 1499 $ g_exch,oexch, 1500 $ oprintstats, 1501 $ chunk_factor) 1502 implicit none 1503#include "tcgmsg.fh" 1504#include "global.fh" 1505#include "mafdecls.fh" 1506#include "bas.fh" 1507#include "schwarz.fh" 1508#include "msgids.fh" 1509c 1510c Arguments 1511c 1512 integer basis ! Basis handle 1513 integer mo1_lo, mo1_hi ! 1st Pair Index range 1514 integer mo2_lo, mo2_hi ! 2nd Pair Index range 1515 integer g_movecs ! MO coefficients 1516 integer g_coul ! Coulomb operator 1517 integer g_exch ! Exchange operator 1518 logical ocoul,oexch ! Type selection 1519 logical oprintstats ! Print flag 1520 double precision chunk_factor 1521c 1522c Local variables 1523c 1524 integer nmo1,nmo2,nbf,nsh,maxbfsh 1525 integer ish,jsh,ish2,msh 1526 integer ibflo1,ibfhi1,ibflo2,ibfhi2 1527 integer g_k2idx,g_j2idx 1528 integer l_sssi,k_sssi 1529 integer l_ssni,k_ssni 1530 integer l_ssai,k_ssai 1531 integer l_ssji,k_ssji 1532 integer l_ab1, k_ab1, l_ab2, k_ab2 1533 integer l_eri,k_eri,l_erit,k_erit 1534 integer l_iscr,k_iscr 1535 integer l_mo,k_mo 1536 integer n_sssi,n_ssni,n_ssai,n_ssji,n_ab,nrow,ncol 1537 integer mem2,max2e 1538 integer offset 1539 integer num_nodes,ploop,next,chunksiz,tri_shells 1540 integer i 1541 double precision tol2e 1542 double precision tt,t4sh,tyy,twait,tavg,tyyy,t12 1543 double precision schw_ratio 1544 logical status 1545 integer istatv(1000) 1546 double precision dstatv(100) 1547 double precision tpush_avg,tint_avg,twait_avg 1548#ifdef DEBUG_PARALLEL 1549 integer nptasks,npcomm 1550 double precision schw_done 1551 common/pstats/nptasks,npcomm,schw_done 1552 double precision t1qtr,t2qtr,t34,tpush,tint,tloop,t11 1553 common/ztime/t1qtr,t2qtr,t34,tpush,tint,tloop,t11 1554#endif 1555c 1556c 1557 integer nxtask,nxtseg 1558 external nxtask,nxtseg 1559c 1560c 1561c 1562 data tol2e/1.d-12/ 1563c 1564c 1565c General basis info 1566c 1567 num_nodes = ga_nnodes() 1568 status = bas_numbf(basis,nbf) 1569 status = status.and.bas_numcont(basis,nsh) 1570 status = status.and.bas_nbf_cn_max(basis,maxbfsh) 1571 if (.not.status) call errquit('moints: cannot get basis info',0) 1572 nmo1 = mo1_hi - mo1_lo + 1 1573 nmo2 = mo2_hi - mo2_lo + 1 1574c 1575c Integrals allocation 1576c 1577 call int_mem_2e4c(max2e, mem2) 1578 status = ma_push_get(MT_DBL, max2e, 'moints: buf', l_eri, k_eri) 1579 status = status.and.ma_push_get(MT_DBL, max2e, 1580 $ 'moints: buf', l_erit, k_erit) 1581 status = status.and.ma_push_get(MT_DBL, mem2, 1582 $ 'moints: scr', l_iscr, k_iscr) 1583c 1584c Local MO coefficients 1585c 1586 status = status.and.ma_push_get(MT_DBL,(nbf*nbf), 1587 $ 'movecs cols',l_mo,k_mo) 1588 call ga_get(g_movecs,1,nbf,1,nbf,dbl_mb(k_mo),nbf) 1589c 1590c Temporary partially-transformed arrays 1591c 1592 n_sssi = maxbfsh*maxbfsh*maxbfsh*nmo1 1593 n_ssni = maxbfsh*maxbfsh*nbf*nmo1 1594 n_ssai = maxbfsh*maxbfsh*nmo2*nmo1 1595 n_ssji = max((maxbfsh*maxbfsh*nmo1*nmo1),(maxbfsh*nmo2)) 1596 n_ab = max(nmo1,nmo2)*max(nmo1,nmo2) 1597 status = status.and.ma_push_get(MT_DBL,n_sssi, 1598 $ 'K sssi block',l_sssi,k_sssi) 1599 status = status.and.ma_push_get(MT_DBL,n_ssni, 1600 $ 'K ssni block',l_ssni,k_ssni) 1601 if (oexch) then 1602 status = status.and.ma_push_get(MT_DBL,n_ssai, 1603 $ 'K ssai block',l_ssai,k_ssai) 1604 endif 1605 if (ocoul) then 1606 status = status.and.ma_push_get(MT_DBL,n_ssji, 1607 $ 'K ssai block',l_ssji,k_ssji) 1608 endif 1609 status = status.and.ma_push_get(MT_DBL,n_ab, 1610 $ 'K ab1 block',l_ab1,k_ab1) 1611 status = status.and.ma_push_get(MT_DBL,n_ab, 1612 $ 'K ab2 block',l_ab2,k_ab2) 1613 if (.not.(status)) call errquit('cannot allocate local memory',0) 1614c 1615c Globals for accumulating partially-transformed (& transposing) 1616c 1617 nrow = maxbfsh*(nbf+1) 1618 ncol = nmo1*nmo2 1619 if (oexch) then 1620 if (.not.ga_create(MT_DBL,nrow,ncol,'K2idx',nrow,1,g_k2idx)) 1621 $ call errquit('moints: cannot allocate global',0) 1622 endif 1623 ncol = (nmo1*(nmo1+1))/2 1624 if (ocoul) then 1625 if (.not.ga_create(MT_DBL,nrow,ncol,'J2idx',nrow,1,g_j2idx)) 1626 $ call errquit('moints: cannot allocate global',0) 1627 endif 1628c 1629c Initialize 1630c 1631 tri_shells = (nsh*(nsh+1))/2 1632 chunksiz = tri_shells/chunk_factor 1633 t1qtr = 0.d0 1634 t2qtr = 0.d0 1635 t34 = 0.d0 1636 tint = 0.d0 1637 tpush = 0.d0 1638 twait = 0.d0 1639 tloop = 0.d0 1640 t11 = 0.d0 1641 schw_done = 0.d0 1642 nptasks = 0 1643 npcomm = 0 1644 if (oexch) call ga_zero(g_exch) 1645 if (ocoul) call ga_zero(g_coul) 1646c 1647c 4-fold shell loop 1648c 1649 t4sh = tcgtime() 1650 msh = nsh/2 + mod(nsh,2) 1651#ifdef SPLIT_LOOP 1652 do ish=1,msh 1653#else 1654 do ish=1,nsh 1655#endif 1656 tyyy = tcgtime() 1657#ifdef SEGMENT_TASK 1658 next = nxtseg(num_nodes,chunksiz,tri_shells) 1659#else 1660 next = nxtask(num_nodes,chunksiz) 1661#endif 1662 ploop = 0 1663 offset = 0 1664 if (oexch) call ga_zero(g_k2idx) 1665 if (ocoul) call ga_zero(g_j2idx) 1666 if (.not. bas_cn2bfr(basis,ish,ibflo1,ibfhi1)) 1667 $ call errquit('mo_ints: bas_cn2bfr',ish) 1668 do jsh=1,ish 1669 if (schwarz_shell(ish,jsh)*schwarz_max().ge.tol2e) then 1670 call moints_inner( basis, nbf, nsh, ish, jsh, 1671 $ ibflo1, ibfhi1, 1672 $ next, ploop, chunksiz, offset, 1673 $ ocoul, oexch, mo1_lo, mo1_hi, 1674 $ mo2_lo, mo2_hi, dbl_mb(k_mo), 1675 $ max2e, dbl_mb(k_eri), dbl_mb(k_erit), 1676 $ mem2, dbl_mb(k_iscr), 1677 $ dbl_mb(k_sssi), n_ssni, 1678 $ dbl_mb(k_ssni), dbl_mb(k_ssai), 1679 $ dbl_mb(k_ssji), g_k2idx, g_j2idx ) 1680 endif 1681 enddo 1682#ifdef SPLIT_LOOP 1683 ish2 = nsh - ish + 1 1684 if (ish2.eq.ish) goto 100 1685 if (.not. bas_cn2bfr(basis,ish2,ibflo2,ibfhi2)) 1686 $ call errquit('mo_ints: bas_cn2bfr',ish2) 1687 offset = (ibfhi1 - ibflo1 + 1)*ibfhi1 1688 do jsh=1,ish2 1689 if (schwarz_shell(ish2,jsh)*schwarz_max().ge.tol2e) then 1690 call moints_inner( basis, nbf, nsh, ish2, jsh, 1691 $ ibflo2, ibfhi2, 1692 $ next, ploop, chunksiz, offset, 1693 $ ocoul, oexch, mo1_lo, mo1_hi, 1694 $ mo2_lo, mo2_hi, dbl_mb(k_mo), 1695 $ max2e, dbl_mb(k_eri), dbl_mb(k_erit), 1696 $ mem2, dbl_mb(k_iscr), 1697 $ dbl_mb(k_sssi), n_ssni, 1698 $ dbl_mb(k_ssni), dbl_mb(k_ssai), 1699 $ dbl_mb(k_ssji), g_k2idx, g_j2idx ) 1700 endif 1701 enddo 1702 100 continue 1703 t12 = t12 + tcgtime() - tyyy 1704#endif 1705c 1706c Third and fourth qtr transformations can only be done 1707c when all 'jsh' contributions have been completed 1708c for some 'ish'. Must sync() and wait for all parallel 1709c tasks to finish. 1710c 1711 1712 tyy = tcgtime() 1713#ifdef SEGMENT_TASK 1714 next = nxtseg(-num_nodes,chunksiz,tri_shells) 1715#else 1716 next = nxtask(-num_nodes,chunksiz) 1717#endif 1718 tt = tcgtime() 1719 twait = twait + tt - tyy 1720 offset = 0 1721 if (oexch) 1722 $ call moints_trf34idx_K( nbf, mo1_lo, mo1_hi, mo2_lo, mo2_hi, 1723 $ ibflo1, ibfhi1, offset, dbl_mb(k_mo), 1724 $ g_k2idx, dbl_mb(k_ab1), 1725 $ dbl_mb(k_ab2), dbl_mb(k_ssai), 1726 $ g_exch ) 1727 if (ocoul) 1728 $ call moints_trf34idx_J( nbf, mo1_lo, mo1_hi, mo2_lo, mo2_hi, 1729 $ ibflo1, ibfhi1, offset, dbl_mb(k_mo), 1730 $ g_j2idx, dbl_mb(k_ab1), 1731 $ dbl_mb(k_ab2), dbl_mb(k_ssji), 1732 $ g_coul ) 1733#ifdef SPLIT_LOOP 1734 if (ish.eq.ish2) goto 101 1735 offset = (ibfhi1 - ibflo1 + 1)*ibfhi1 1736 if (oexch) 1737 $ call moints_trf34idx_K( nbf, mo1_lo, mo1_hi, mo2_lo, mo2_hi, 1738 $ ibflo2, ibfhi2, offset, dbl_mb(k_mo), 1739 $ g_k2idx, dbl_mb(k_ab1), 1740 $ dbl_mb(k_ab2), dbl_mb(k_ssai), 1741 $ g_exch ) 1742 if (ocoul) 1743 $ call moints_trf34idx_J( nbf, mo1_lo, mo1_hi, mo2_lo, mo2_hi, 1744 $ ibflo2, ibfhi2, offset, dbl_mb(k_mo), 1745 $ g_j2idx, dbl_mb(k_ab1), 1746 $ dbl_mb(k_ab2), dbl_mb(k_ssji), 1747 $ g_coul ) 1748 101 continue 1749#endif 1750 t34 = t34 + tcgtime() - tt 1751 enddo 1752 call ga_sync() 1753 t4sh = tcgtime() - t4sh 1754c 1755C call moints_print_opermatrix(nmo1,nmo2,g_exch) 1756C call moints_print_opermatrix(nmo1,nmo2,g_coul) 1757c 1758c Print statistics 1759c 1760 if (oprintstats) then 1761 call ga_dgop(Msg_MOSch,schw_done,1,'+') 1762 schw_ratio = (schw_done/(tri_shells*tri_shells))*100.d0 1763 if (ga_nodeid().eq.0) then 1764 write(6,986) schw_done,schw_ratio,chunksiz, 1765 $ t1qtr,t2qtr,t11,t12,t34,t4sh, 1766 $ tint,tpush,twait 1767 986 format(10x,'Shell-quartets done:',11x,f10.0,/, 1768 $ 10x,'Shell-quartets percentage:',4x,f10.1,'%',//, 1769 $ 10x,'Task chunksize:',19x,i7,/, 1770 $ 10x,'1st quarter time:',14x,f10.2,/, 1771 $ 10x,'2nd quarter time:',14x,f10.2,/, 1772 $ 10x,'1st quarter total time:',8x,f10.2,/, 1773 $ 10x,'Combined 1st and 2nd time:',5x,f10.2,/, 1774 $ 10x,'3rd and 4th quarter time:',6x,f10.2,/, 1775 $ 10x,'Four-fold shell loop time:',5x,f10.2,/, 1776 $ 10x,'Integral evaluation time:',6x,f10.2,/, 1777 $ 10x,'Transposition time:',12x,f10.2,/ 1778 $ 10x,'Synchronization time:',10x,f10.2) 1779 call util_flush(6) 1780 endif 1781 endif 1782 1783 1784#ifdef DEBUG_PARALLEL 1785 call ga_sync() 1786c$$$ call brdcst_ivec( nptasks, istatv ) 1787c$$$ if (ga_nodeid().eq.0) then 1788c$$$ write(6,966) 1789c$$$ 966 format(/,'Number of tasks:') 1790c$$$ write(6,901) (istatv(i),i=1,num_nodes) 1791c$$$ endif 1792c$$$ call brdcst_ivec( npcomm, istatv ) 1793c$$$ if (ga_nodeid().eq.0) then 1794c$$$ write(6,967) 1795c$$$ 967 format(/,'Number of comms:') 1796c$$$ write(6,901) (istatv(i),i=1,num_nodes) 1797c$$$ 901 format(10i10) 1798c$$$ endif 1799c$$$ call brdcst_dvec( twait, dstatv, tavg ) 1800c$$$ if (ga_nodeid().eq.0) then 1801c$$$ write(6,968) 1802c$$$ 968 format(/,'Sync times:') 1803c$$$ write(6,921) (dstatv(i),i=1,num_nodes) 1804c$$$ 921 format(10f10.3) 1805c$$$ endif 1806c$$$ call brdcst_dvec( tloop, dstatv, tavg ) 1807c$$$ if (ga_nodeid().eq.0) then 1808c$$$ write(6,969) 1809c$$$ 969 format(/,'Loop times:') 1810c$$$ write(6,921) (dstatv(i),i=1,num_nodes) 1811c$$$ write(6,*) 1812c$$$ write(6,*) 1813c$$$ endif 1814 call brdcst_dvec( tpush, dstatv, tpush_avg ) 1815 call brdcst_dvec( twait, dstatv, twait_avg ) 1816 call brdcst_dvec( tint, dstatv, tint_avg ) 1817 if (ga_nodeid().eq.0) write(6,956) num_nodes,chunksiz, 1818 $ tpush_avg, 1819 $ twait_avg,tint_avg, 1820 $ t4sh 1821 956 format(//,'Average times:', 1822 $ /,':: Number of nodes:',i10, 1823 $ /,':: Chunksize:',6x,i10, 1824 $ /,':: Communication:',2x,f10.3, 1825 $ /,':: Synchronization:',f10.3, 1826 $ /,':: Integrals:',6x,f10.3, 1827 $ /,':: Total:',10x,f10.3) 1828 1829#endif 1830c 1831c Clean-up 1832c 1833 if (.not. ma_pop_stack(l_ab2)) 1834 $ call errquit('moints: failed to pop', l_ab2) 1835 if (.not. ma_pop_stack(l_ab1)) 1836 $ call errquit('moints: failed to pop', l_ab1) 1837 if ((ocoul).and.(.not. ma_pop_stack(l_ssji))) 1838 $ call errquit('moints: failed to pop', l_ssji) 1839 if ((oexch).and.(.not.ma_pop_stack(l_ssai))) 1840 $ call errquit('moints: failed to pop', l_ssai) 1841 if (.not. ma_pop_stack(l_ssni)) 1842 $ call errquit('moints: failed to pop', l_ssni) 1843 if (.not. ma_pop_stack(l_sssi)) 1844 $ call errquit('moints: failed to pop', l_sssi) 1845 if (.not. ma_pop_stack(l_mo)) 1846 $ call errquit('moints: failed to pop', l_mo) 1847 if (.not. ma_pop_stack(l_iscr)) 1848 $ call errquit('moints: failed to pop', l_iscr) 1849 if (.not. ma_pop_stack(l_erit)) 1850 $ call errquit('moints: failed to pop', l_erit) 1851 if (.not. ma_pop_stack(l_eri)) 1852 $ call errquit('moints: failed to pop', l_eri) 1853 if ((oexch).and. 1854 $ (.not.ga_destroy(g_k2idx))) 1855 $ call errquit('moints: cannot destroy global',g_k2idx) 1856 if ((ocoul).and. 1857 $ (.not.ga_destroy(g_j2idx))) 1858 $ call errquit('moints: cannot destroy global',g_j2idx) 1859 1860 return 1861 end 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871c 1872c 1873c 1874c --------------------------- 1875c Auxiliary Routines 1876c --------------------------- 1877c 1878c 1879c 1880 subroutine moints_inttrsp( ni, nj, nk, nl, x, xt ) 1881 implicit none 1882 integer ni,nj,nk,nl 1883 double precision x(nl,nk,nj,ni) 1884 double precision xt(nk,nl,nj,ni) 1885 integer i,j,k,l 1886 1887 do i=1,ni 1888 do j=1,nj 1889 do l=1,nl 1890 do k=1,nk 1891 xt(k,l,j,i) = x(l,k,j,i) 1892 enddo 1893 enddo 1894 enddo 1895 enddo 1896 return 1897 end 1898 1899 1900 1901 1902 1903#ifdef USEGEMM 1904 subroutine moints_trf1idx( nbf, a1, a2, ni, nj, 1905 $ klo, khi, llo, lhi, 1906 $ scale, eri, c, x, y ) 1907 implicit none 1908 integer nbf,a1,a2,ni,nj,klo,khi,llo,lhi 1909 double precision scale 1910 double precision eri(llo:lhi,klo:khi,nj,ni) 1911 double precision c(nbf,nbf) 1912 double precision x(klo:khi,nj,ni,a1:a2) 1913 double precision y(nbf,ni,nj,a1:a2) 1914 integer na,nl,nsss,i,j,nk,a 1915 1916 na = a2 - a1 + 1 1917 nl = lhi - llo + 1 1918 nk = khi - klo + 1 1919 nsss = ni*nj*nk 1920 call sgemm('t','n',nsss,na,nl,scale,eri,nl,c(llo,a1),nbf, 1921 $ 0.d0,x,nsss) 1922 do a=a1,a2 1923 do i=1,ni 1924 do j=1,nj 1925 call saxpy(nk,1.d0,x(klo,j,i,a),1,y(klo,i,j,a),1) 1926 enddo 1927 enddo 1928 enddo 1929 return 1930 end 1931 1932#endif /* USEGEMM */ 1933 1934 1935 1936 1937 1938 1939 1940 subroutine moints_trf2idx( nbf, a1, a2, nb, ni, nj, 1941 $ scale, y, c, z ) 1942 implicit none 1943 integer nbf,a1,a2,nb,ni,nj 1944 double precision scale 1945 double precision c(nbf,nbf) 1946 double precision y(nbf,ni,nj,nb) 1947 double precision z(ni,nj,nb,a1:a2) 1948 integer ssb,na 1949 1950 ssb = nj*ni*nb 1951 na = a2 - a1 + 1 1952 call sgemm('t','n',ssb,na,nbf,scale,y,nbf,c(1,a1),nbf, 1953 $ 0.d0,z,ssb) 1954 return 1955 end 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 subroutine moints_push2idxK( nbf, alo, ahi, blo, bhi, 1966 $ ilo, ihi, jlo, jhi, koffset, 1967 $ x, g_k ) 1968 implicit none 1969#include "global.fh" 1970 integer nbf,alo,ahi,blo,bhi 1971 integer ilo,ihi,jlo,jhi,koffset 1972 double precision x(ilo:ihi,jlo:jhi,alo:ahi,blo:bhi) 1973 integer g_k 1974 integer a,b,ijlo,ijhi,bleng,ab,ij_leng,ileng 1975 1976 ileng = ihi - ilo + 1 1977 ij_leng = ileng*(jhi - jlo + 1) 1978 bleng = bhi - blo + 1 1979 ijlo = koffset + (jlo-1)*ileng + 1 1980 ijhi = koffset + jhi*ileng 1981 do a=alo,ahi 1982 do b=blo,bhi 1983 ab = (a-alo)*bleng + (b-blo+1) 1984 call ga_acc(g_k,ijlo,ijhi,ab,ab,x(ilo,jlo,a,b),ij_leng,1.d0) 1985 enddo 1986 enddo 1987 return 1988 end 1989 1990 1991 1992 1993 subroutine moints_push2idxJ( nbf, alo, ahi, blo, bhi, 1994 $ ilo, ihi, jlo, jhi, joffset, 1995 $ x, g_j ) 1996 implicit none 1997#include "global.fh" 1998 integer nbf,alo,ahi,blo,bhi 1999 integer ilo,ihi,jlo,jhi 2000 integer joffset 2001 double precision x(ilo:ihi,jlo:jhi,alo:ahi,alo:ahi) 2002 integer a,b,ijlo,ijhi,ab,ij_leng,ileng 2003 integer g_j 2004 2005 ileng = ihi - ilo + 1 2006 ij_leng = ileng*(jhi - jlo + 1) 2007 ijlo = joffset + (jlo-1)*ileng + 1 2008 ijhi = joffset + jhi*ileng 2009 do a=alo,ahi 2010 do b=alo,ahi 2011 ab = ((a-alo+1)*(a-alo))/2 + (b-alo+1) 2012 call ga_acc(g_j,ijlo,ijhi,ab,ab,x(ilo,jlo,a,b),ij_leng,1.d0) 2013 enddo 2014 enddo 2015 return 2016 end 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 subroutine moints_trf34idx_K( nbf, alo, ahi, blo, bhi, 2030 $ ilo, ihi, offset, c, g_k, 2031 $ t1, t2, tmp, g_exch ) 2032 implicit none 2033#include "global.fh" 2034#include "mafdecls.fh" 2035 integer nbf,alo,ahi,blo,bhi,ilo,ihi 2036 integer g_k,g_exch 2037 integer offset 2038 double precision c(nbf,nbf) 2039 double precision t1(alo:ahi,blo:bhi) 2040 double precision t2(blo:bhi,alo:ahi) 2041 double precision tmp(*) 2042 integer my_id,rlo,rhi,clo,chi,ileng 2043 integer k_local,ld_local 2044 integer a,b,ia,i,j,ij,bleng,jblo,jbhi 2045 2046 2047 ileng = ihi - ilo + 1 2048 bleng = bhi - blo + 1 2049 my_id = ga_nodeid() 2050 call ga_distribution(g_k,my_id,rlo,rhi,clo,chi) 2051 do i=alo,ahi 2052 do j=blo,bhi 2053 ij = (i-alo)*(bhi-blo+1) + (j-blo+1) 2054 if ((ij.ge.clo).and.(ij.le.chi)) then 2055 call ga_access(g_k,rlo,rhi,ij,ij,k_local,ld_local) 2056 call moints_trf34idx_a(nbf,ilo,ihi,alo,ahi,blo,bhi,c, 2057 $ dbl_mb(k_local+offset),tmp,t1) 2058 call moints_trf34idx_a(nbf,ilo,ihi,blo,bhi,alo,ahi,c, 2059 $ dbl_mb(k_local+offset),tmp,t2) 2060 call ga_release(g_k,rlo,rhi,ij,ij) 2061 jblo = (j-blo)*bleng + 1 2062 jbhi = (j-blo)*bleng + bleng 2063 do a=alo,i 2064 do b=blo,bhi 2065 t2(b,a) = t2(b,a) + t1(a,b) 2066 enddo 2067 ia = ((i-alo)*(i-alo+1))/2 + (a-alo+1) 2068 call ga_acc(g_exch,jblo,jbhi,ia,ia,t2(blo,a),bleng,1.d0) 2069 enddo 2070 endif 2071 enddo 2072 enddo 2073 2074 2075 return 2076 end 2077 2078 2079 2080 2081 2082 2083 2084 2085 subroutine moints_trf34idx_J( nbf, alo, ahi, blo, bhi, 2086 $ ilo, ihi, offset, c, g_j, 2087 $ t1, t2, tmp, g_coul ) 2088 implicit none 2089#include "global.fh" 2090#include "mafdecls.fh" 2091 integer nbf,alo,ahi,blo,bhi,ilo,ihi 2092 integer offset 2093 integer g_j,g_coul 2094 double precision c(nbf,nbf) 2095 double precision t1(blo:bhi,blo:bhi) 2096 double precision t2(blo:bhi,blo:bhi) 2097 double precision tmp(*) 2098 integer my_id,rlo,rhi,clo,chi,ileng 2099 integer k_local,ld_local 2100 integer a,b,i,j,ij,bleng,bbleng,bblo,bbhi 2101 2102 ileng = ihi - ilo + 1 2103 bleng = bhi - blo + 1 2104 bbleng = bleng*bleng 2105 bblo = 1 2106 bbhi = bbleng 2107 my_id = ga_nodeid() 2108 call ga_distribution(g_j,my_id,rlo,rhi,clo,chi) 2109 do i=alo,ahi 2110 do j=alo,i 2111 ij = ((i-alo)*(i-alo+1))/2 + (j-alo+1) 2112 if ((ij.ge.clo).and.(ij.le.chi)) then 2113 call ga_access(g_j,rlo,rhi,ij,ij,k_local,ld_local) 2114 call moints_trf34idx_a(nbf,ilo,ihi,blo,bhi,blo,bhi,c, 2115 $ dbl_mb(k_local+offset),tmp,t1) 2116 call ga_release(g_j,rlo,rhi,ij,ij) 2117 do a=blo,bhi 2118 do b=blo,a 2119 t2(a,b) = t1(a,b) + t1(b,a) 2120 t2(b,a) = t2(a,b) 2121 enddo 2122 enddo 2123 call ga_acc(g_coul,bblo,bbhi,ij,ij,t2,bbleng,1.d0) 2124 endif 2125 enddo 2126 enddo 2127 return 2128 end 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 subroutine moints_trf34idx_a( nbf, ilo, ihi, alo, ahi, 2140 $ blo, bhi, c, x, y, z ) 2141 implicit none 2142 integer nbf,ilo,ihi,alo,ahi,blo,bhi 2143 double precision c(nbf,nbf) 2144 double precision x(ilo:ihi,ihi) 2145 double precision y(alo:ahi,ilo:ihi) 2146 double precision z(alo:ahi,blo:bhi) 2147 integer na,nb,ileng 2148 2149 na = ahi - alo + 1 2150 nb = bhi - blo + 1 2151 ileng = ihi - ilo + 1 2152 2153 call sgemm('t','t',na,ileng,ihi,1.d0,c(1,alo),nbf,x,ileng, 2154 $ 0.d0,y,na) 2155 call sgemm('n','n',na,nb,ileng,1.d0,y,na,c(ilo,blo),nbf, 2156 $ 0.d0,z,na) 2157 end 2158 2159 2160 2161 2162 2163 2164 2165 subroutine moints_inner( basis, nbf, nsh, ish, jsh, 2166 $ ibflo, ibfhi, 2167 $ next, ploop, tasksiz, offset, 2168 $ ocoul, oexch, 2169 $ mo1_lo, mo1_hi, mo2_lo, mo2_hi, 2170 $ mo, max2e, eri, erit, mem2, scr, 2171 $ sssi, n_ssni, ssni, ssai, ssji, 2172 $ g_k2, g_j2 ) 2173 implicit none 2174#include "tcgmsg.fh" 2175#include "global.fh" 2176#include "bas.fh" 2177#include "schwarz.fh" 2178 integer basis 2179 integer nbf, nsh, ish, jsh 2180 integer ibflo, ibfhi 2181 integer next, ploop, tasksiz 2182 integer mo1_lo, mo1_hi, mo2_lo, mo2_hi 2183 integer mem2, max2e, n_ssni 2184 integer g_k2, g_j2 2185 integer offset 2186 logical oexch, ocoul 2187 double precision mo(nbf,nbf) 2188 double precision eri(max2e),erit(max2e),scr(mem2) 2189 double precision ssni(n_ssni),sssi(*),ssai(*),ssji(*) 2190c 2191c 2192 integer nmo1 2193 integer ksh,lsh 2194 integer jbflo,jbfhi,kbflo,kbfhi,lbflo,lbfhi 2195 integer ilen,jlen,klen,llen 2196 integer numnodes,next0,tri_shells 2197 double precision tol2e,permscale 2198 double precision tt,txy,tzz 2199#ifdef DEBUG_PARALLEL 2200 integer nptasks,npcomm 2201 double precision schw_done 2202 common/pstats/nptasks,npcomm,schw_done 2203 double precision t1qtr,t2qtr,t34,tpush,tint,tloop,t11 2204 common/ztime/t1qtr,t2qtr,t34,tpush,tint,tloop,t11 2205#endif 2206 2207c 2208c 2209 integer nxtask,nxtseg 2210 external nxtask,nxtseg 2211c 2212c 2213 data tol2e/1.e-12/ 2214 2215 2216 txy = tcgtime() 2217 ilen = ibfhi - ibflo + 1 2218 if (.not. bas_cn2bfr(basis,jsh,jbflo,jbfhi)) 2219 $ call errquit('mo_ints: bas_cn2bfr',jsh) 2220 jlen = jbfhi - jbflo + 1 2221 numnodes = ga_nnodes() 2222 2223 next0 = next 2224 tri_shells = (nsh*(nsh+1))/2 2225 nmo1 = mo1_hi - mo1_lo + 1 2226 call dfill(n_ssni,0.d0,ssni,1) 2227 do ksh=1,nsh 2228 if (.not. bas_cn2bfr(basis,ksh,kbflo,kbfhi)) 2229 $ call errquit('mo_ints: bas_cn2bfr',ksh) 2230 klen = kbfhi - kbflo + 1 2231 do lsh=1,ksh 2232 if (next.eq.ploop) then 2233 nptasks = nptasks + 1 2234 if (schwarz_shell(ish,jsh)*schwarz_shell(ksh,lsh) 2235 $ .ge.tol2e) then 2236 schw_done = schw_done + 1.d0 2237 if (.not. bas_cn2bfr(basis,lsh,lbflo,lbfhi)) 2238 $ call errquit('mo_ints: bas_cn2bfr',lsh) 2239 llen = lbfhi - lbflo + 1 2240 tt = tcgtime() 2241 call int_2e4c(basis, ish, jsh, basis, ksh, lsh, 2242 $ mem2, scr, max2e, eri ) 2243 tint = tint + tcgtime() - tt 2244 permscale = 1.d0 2245 if (ksh.eq.lsh) permscale = 0.5d0 2246#ifdef USEGEMM 2247 call moints_trf1idx( nbf, mo1_lo, mo1_hi, 2248 $ ilen, jlen, kbflo, kbfhi, 2249 $ lbflo, lbfhi, permscale, 2250 $ eri, mo, sssi, ssni ) 2251 call moints_inttrsp( ilen, jlen, klen, llen, eri, erit ) 2252 call moints_trf1idx( nbf, mo1_lo, mo1_hi, 2253 $ ilen, jlen, lbflo, lbfhi, 2254 $ kbflo, kbfhi, permscale, 2255 $ erit, mo, sssi, ssni ) 2256#else 2257 call moints_ytrf1idx( nbf, mo1_lo, mo1_hi, 2258 $ ilen, jlen, kbflo, kbfhi, 2259 $ lbflo, lbfhi, permscale, 2260 $ eri, mo, ssni ) 2261#endif 2262 t1qtr = t1qtr + tcgtime() - tt 2263 endif 2264#ifdef SEGMENT_TASK 2265 next = nxtseg(numnodes,tasksiz,tri_shells) 2266#else 2267 next = nxtask(numnodes,tasksiz) 2268#endif 2269 endif 2270 ploop = ploop + 1 2271 enddo 2272 enddo 2273 if (next.eq.next0) tloop = tloop + tcgtime() - txy 2274 t11 = t11 + tcgtime() - txy 2275c 2276c 2nd qtr transformation 2277c 2278 if (next.ne.next0) then 2279 tt = tcgtime() 2280 permscale = 1.d0 2281 if (jsh.eq.ish) permscale = 0.5d0 2282 if (oexch) 2283 $ call moints_trf2idx( nbf, mo2_lo, mo2_hi, nmo1, 2284 $ ilen, jlen, permscale, 2285 $ ssni, mo, ssai ) 2286 if (ocoul) 2287 $ call moints_trf2idx( nbf, mo1_lo, mo1_hi, nmo1, 2288 $ ilen, jlen, permscale, 2289 $ ssni, mo, ssji ) 2290c 2291c Push intermediates into global 2292c 2293 npcomm = npcomm + 1 2294 tzz = tcgtime() 2295 if (oexch) 2296 $ call moints_push2idxK( nbf, mo1_lo, mo1_hi, mo2_lo, mo2_hi, 2297 $ ibflo, ibfhi, jbflo, jbfhi, offset, 2298 $ ssai, g_k2 ) 2299 if (ocoul) 2300 $ call moints_push2idxJ( nbf, mo1_lo, mo1_hi, mo2_lo, mo2_hi, 2301 $ ibflo, ibfhi, jbflo, jbfhi, offset, 2302 $ ssji, g_j2 ) 2303 tpush = tpush + tcgtime() - tzz 2304 t2qtr = t2qtr + tcgtime() - tt 2305 endif 2306 return 2307 end 2308 2309 2310 2311#endif /* OLD_ALGORITHM */ 2312 2313 2314 2315#ifndef USEGEMM 2316 subroutine moints_ytrf1idx( nbf, a1, a2, ni, nj, 2317 $ klo, khi, llo, lhi, 2318 $ scale, eri, c, y ) 2319 implicit none 2320 integer nbf,a1,a2,ni,nj,klo,khi,llo,lhi 2321 double precision scale 2322 double precision eri(llo:lhi,klo:khi,nj,ni) 2323 double precision c(nbf,nbf) 2324 double precision y(nbf,ni,nj,a1:a2) 2325 integer i,j,a,k,l,nn 2326 double precision xx 2327 2328 if (scale.ne.1.d0) then 2329 nn = ni*nj*(khi-klo+1)*(lhi-llo+1) 2330 call sscal(nn,scale,eri,1) 2331 endif 2332 2333 do a=a1,a2 2334 do j=1,nj 2335 do i=1,ni 2336 do k=klo,khi 2337 xx = 0.d0 2338 do l=llo,lhi 2339 xx = xx + eri(l,k,j,i)*c(l,a) 2340 enddo 2341 y(k,i,j,a) = y(k,i,j,a) + xx 2342 enddo 2343 do l=llo,lhi 2344 xx = 0.d0 2345 do k=klo,khi 2346 xx = xx + eri(l,k,j,i)*c(k,a) 2347 enddo 2348 y(l,i,j,a) = y(l,i,j,a) + xx 2349 enddo 2350 enddo 2351 enddo 2352 enddo 2353 2354 return 2355 end 2356 2357#endif /* ! USEGEMM */ 2358c 2359c 2360c 2361c Unused routines (may come in handy later) 2362c 2363c 2364c 2365c 2366#if defined(MOINTS_PAR_STATS) 2367 2368c 2369c Broadcast all nodes' value. Result are 2370c returned in ordered vector 2371c 2372 2373 subroutine brdcst_ivec( value, v ) 2374C$Id$ 2375 implicit none 2376 integer intlen 2377#ifdef KSR 2378 parameter (intlen=8) 2379#else 2380 parameter (intlen=4) 2381#endif 2382#include "tcgmsg.fh" 2383#include "global.fh" 2384#include "msgids.fh" 2385 integer value, v(*) 2386 integer myid, numnodes, itmp, i 2387 2388 numnodes = ga_nnodes() 2389 myid = ga_nodeid() 2390 do i=0,numnodes-1 2391 if (i.eq.myid) itmp = value 2392 call ga_brdcst(Msg_BcstVec,itmp,intlen,i) 2393 v(i+1) = itmp 2394 enddo 2395 return 2396 end 2397 2398 2399 2400 subroutine brdcst_dvec( value, v, avg ) 2401 implicit none 2402 integer msgtype,dbllen 2403 parameter (msgtype=1973) 2404 parameter (dbllen=8) 2405#include "tcgmsg.fh" 2406#include "global.fh" 2407 double precision value, v(*), avg 2408 integer myid, numnodes, i 2409 double precision dtmp 2410 2411 avg = 0.d0 2412 numnodes = ga_nnodes() 2413 myid = ga_nodeid() 2414 do i=0,numnodes-1 2415 if (i.eq.myid) dtmp = value 2416 call ga_brdcst(msgtype,dtmp,dbllen,i) 2417 v(i+1) = dtmp 2418 avg = avg + dtmp 2419 enddo 2420 avg = avg/numnodes 2421 return 2422 end 2423#else 2424 subroutine brdcst_ivec( value, v ) 2425 implicit none 2426 integer value, v(*) 2427 return 2428 end 2429#endif 2430 2431 2432 2433 subroutine moints_trf2Kynew 2434 $ ( nbf, qlo, qhi, oseglo, oseghi, 2435 $ vlo, vhi, ilo, ihi, jlo, jhi, gloc, 2436 $ ssni, h, h2, ct, g_buf ) 2437 implicit none 2438#include "global.fh" 2439 integer nbf, qlo, qhi, oseglo, oseghi, vlo, vhi 2440 integer ilo, ihi, jlo, jhi 2441 integer gloc(nbf,nbf) 2442 double precision ssni(nbf,jlo:jhi,ilo:ihi,oseglo:oseghi) 2443 double precision h(vlo:vhi,jlo:jhi,ilo:ihi),h2(*) 2444 double precision ct(qlo:qhi,nbf),s, sum 2445 integer g_buf 2446 integer ilen, jlen, ijlen, vlen, qlen 2447 integer glo, ghi, i, j, ij,k, jtop, klo, khi 2448 integer o, v, vo, vtop, vbot 2449 integer kchunk, vchunk 2450 parameter (kchunk=32, vchunk=96) ! For cache reuse 2451 2452* integer ilen, jlen, ijlen 2453* integer glo, ghi, i, j, ij,k 2454* integer o, v, vo, jtop 2455 2456c 2457 ilen = ihi - ilo + 1 2458 jlen = jhi - jlo + 1 2459 vlen = vhi - vlo + 1 2460 qlen = qhi - qlo + 1 2461 glo = gloc(ilo,jlo) 2462 ghi = gloc(ihi,jhi) 2463 ijlen = ilen*jlen 2464 2465 do o = oseglo, oseghi 2466* call dgemm('n','n',vlen,ijlen,nbf,1.0d0,ct(vlo,1),qlen, 2467* $ ssni(1,1,1,o),nbf,0.0d0,h,vlen) 2468c 2469c Following is the same as the above dgemm but using 2470c sparsity of the integrals and restricting diagonal where possible 2471c 2472 call dfill((vhi-vlo+1)*ilen*jlen,0.0d0,h,1) 2473 do klo = 1, nbf, kchunk ! Blocking for cache 2474 khi = min(nbf,klo+kchunk-1) 2475 do vbot = vlo, vhi, vchunk ! Blocking for cache 2476 vtop = min(vhi,vbot+vchunk-1) 2477 do i = ilo, ihi 2478 jtop = jhi 2479 if (ilo .eq. jlo) jtop = i 2480 do j = jlo, jtop 2481 do k = klo, khi 2482 s = ssni(k,j,i,o) 2483 if (abs(s) .gt. 1d-12) then 2484 do v = vbot,vtop 2485 h(v,j,i) = h(v,j,i) + s*ct(v,k) 2486 enddo 2487 endif 2488 enddo 2489 enddo 2490 enddo 2491 enddo 2492 enddo 2493 do v=vlo,vhi 2494 vo = (v-vlo)*(oseghi-oseglo+1) + o - oseglo + 1 2495 ij = 0 2496 sum = 0.0d0 2497 do i=ilo,ihi 2498 jtop = jhi 2499 if (ihi.eq.jhi) jtop = i 2500 do j=jlo,jtop 2501 ij = ij + 1 2502 h2(ij) = h(v,j,i) 2503 sum = sum + h(v,j,i)*h(v,j,i) 2504 enddo 2505 enddo 2506c Note that sum is the sum of squares hence test against tol**2 2507 if (sum .gt. 1d-20) call ga_put(g_buf,glo,ghi,vo,vo,h2,1) 2508 enddo 2509 enddo 2510c 2511 return 2512 end 2513 2514 2515 2516 2517 2518 2519