1 2c 3c 4c Sigma vector Alpha - Alpha contribution 5c 6c 7 subroutine detci_sigmaaa( norb, nsym, nela, nelb, nstra, nstrb, 8 $ nexa, nexb, nekla, neklb, 9 $ osym, ijmap, exa, exb, 10 $ ataba, atabb, ntij, nsblk, 11 $ h, g, f, g_civec, g_sigma ) 12* 13* $Id$ 14* 15 implicit none 16#include "errquit.fh" 17#include "mafdecls.fh" 18#include "global.fh" 19#include "msgids.fh" 20#include "util.fh" 21#include "detciP.fh" 22#include "detci.fh" 23#include "cdetcistats.fh" 24 integer norb ! [input] Orbitals 25 integer nsym ! [input] Irreps 26 integer nela ! [input] Alpha electrons 27 integer nelb ! [input] Beta electrons 28 integer nstra ! [input] Alpha strings 29 integer nstrb ! [input] Beta strings 30 integer nexa ! [input] Alpha excitations 31 integer nexb ! [input] Beta excitations 32 integer nekla ! [input] Maximum non-zero alpha strings for E_kl 33 integer neklb ! [input] Maximum non-zero beta strings for E_kl 34 integer osym(norb) ! [input] Orbital irreps 35 integer ijmap(norb,norb) ! [input] Map of (i,j) -> ij (symmtery-blocked) 36 integer ataba(0:norb,0:nela,nsym) ! [input] Alpha arc weights 37 integer atabb(0:norb,0:nelb,nsym) ! [input] Beta arc weights 38 integer exa(6,nexa,nstra) ! [input] Alpha excitation lookup table 39 integer exb(6,nexb,nstrb) ! [input] Beta excitation lookup table 40 integer ntij ! [input] Symmtery-blocked triangular sum 41 integer nsblk ! [input] Blocking factor to increase parallelism 42 double precision h(ntij) ! [input] One-electron hamiltonian 43 double precision g(ntij,ntij) ! [input] ERI's 44 double precision f(nstrb,nsblk) ! [scratch] Scratch space 45 integer g_civec ! [input] CI-vector 46 integer g_sigma ! [input/output] Sigma vector 47c 48c 49c 50 integer istr, kstr, jstr 51 integer istrlo, istrhi, iistr 52 integer msblk, iscnt 53 integer iex, kex 54 integer rlo, rhi, cilo, cihi 55 integer k_ci, k_sig, ldc, lds 56 integer myid, numnodes 57 integer j, k, l 58c$$$ integer i 59 integer kl, ij, kj, lj 60 integer kphase, jphase 61 double precision xx, tff, tdot, tgop 62* double precision tx 63 double precision h1(detci_maxorb*detci_maxorb) 64c$$$ double precision sdot 65c$$$ external sdot 66 67c 68#include "symmdef.fh" 69#include "bitops.fh" 70#include "symmmul.fh" 71c 72c 73 tff = 0.d0 74 tdot = 0.d0 75 tgop = 0.d0 76 if (.not.ga_compare_distr( g_civec, g_sigma )) 77 $ call errquit('detci_sigmaaa: CI vectors do not match',0, 78 & GA_ERR) 79 myid = ga_nodeid() 80 numnodes = ga_nnodes() 81 call ga_distribution( g_civec, myid, rlo, rhi, cilo, cihi ) 82c 83 if (((cilo.ne.0).and.(cihi.ne.-1)).and. 84 $ ((rlo.ne.1).or.(rhi.ne.nstrb))) 85 $ call errquit('detci_sigmaaa: wrong distrib for CI vector',0, 86 & INPUT_ERR) 87c 88 if (cilo.gt.0 .and. cihi.gt.0) then 89 call ga_access( g_civec, rlo, rhi, cilo, cihi, k_ci, ldc ) 90 call ga_access( g_sigma, rlo, rhi, cilo, cihi, k_sig, lds ) 91 endif 92c 93c Precompute 2e 94c 95 do k=1,norb 96 do l=1,k 97 xx = 0.d0 98 do j=1,norb 99 kj = ijmap(k,j) 100 lj = ijmap(l,j) 101 xx = xx + g(kj,lj) 102 enddo 103 kl = ijmap(k,l) 104 h1(kl) = xx*0.5d0 105 enddo 106 enddo 107c 108c 109c 110 do istrlo=1,nstrb,nsblk 111 istrhi = min((istrlo + nsblk - 1),nstrb) 112 msblk = istrhi - istrlo + 1 113* tx = util_cpusec() 114 call dfill((msblk*nstrb),0.d0,f,1) 115 do istr=istrlo,istrhi 116 iistr = istr - istrlo + 1 117 iscnt = (iistr - 1)*nexb 118 do iex=1,nexb 119 if (mod((iscnt+iex-1),numnodes).eq.myid) then 120 kstr = exb(1,iex,istr) 121 kl = exb(3,iex,istr) 122 kphase = exb(4,iex,istr) 123 f(kstr,iistr) = f(kstr,iistr) + 124 $ kphase*(h(kl) - h1(kl)) 125c 126 do kex=1,nexb 127 jstr = exb(1,kex,kstr) 128 ij = exb(3,kex,kstr) 129 jphase = exb(4,kex,kstr)*kphase 130 f(jstr,iistr) = f(jstr,iistr) + 131 $ 0.5d0*jphase*g(ij,kl) 132 enddo 133 endif 134 enddo 135 enddo 136* tff = tff + util_cpusec() - tx 137* tx = util_cpusec() 138 call ga_dgop(Msg_detci_sum, f, (nstrb*msblk), '+' ) 139* tgop = tgop + util_cpusec() - tx 140c 141c Data parallel here.... 142c 143* tx = util_cpusec() 144c$$$ do i=cilo, cihi 145c$$$ do istr=istrlo,istrhi 146c$$$ iistr = istr - istrlo + 1 147c$$$ dbl_mb(k_sig+istr-1+(i-cilo)*lds) = 148c$$$ $ dbl_mb(k_sig+istr-1+(i-cilo)*lds) + 149c$$$ $ ddot(nstrb,f(1,iistr),1, 150c$$$ $ dbl_mb(k_ci+(i-cilo)*ldc),1) 151c$$$ enddo 152c$$$ enddo 153 call detci_saa_kernel(dbl_mb(k_sig), lds, 154 $ dbl_mb(k_ci), ldc, f, nstrb, 155 $ cilo, cihi, istrlo, istrhi, nstrb) 156* tdot = tdot + util_cpusec() - tx 157 enddo 158c 159 if (cilo.gt.0 .and. cihi.gt.0) then 160 call ga_release( g_civec, rlo, rhi, cilo, cihi ) 161 call ga_release_update( g_sigma, rlo, rhi, cilo, cihi ) 162 endif 163c 164c 165c Collect stats 166c 167 detci_aaff_etime = detci_aaff_etime + tff 168 detci_aadot_etime = detci_aadot_etime + tdot 169 detci_aagop_etime = detci_aagop_etime + tgop 170 return 171 end 172 subroutine detci_saa_kernel(s, lds, c, ldc, f, ldf, 173 $ cilo, cihi, istrlo, istrhi, nstrb) 174 implicit none 175 integer lds, ldc, ldf, cilo, cihi, istrlo, istrhi, nstrb 176 double precision s(lds,cilo:cihi), c(ldc,cilo:cihi) 177 double precision f(ldf,istrlo:istrhi), sum 178c 179 integer istr, i, j, jlo, jhi, nj, jj 180 integer ind(1024) 181c 182 do istr = istrlo, istrhi ! Short loop 183 do jlo = 1, nstrb, 1024 184 jhi = min(nstrb,jlo+1024-1) 185 nj = 0 186 do j = jlo, jhi 187 if (f(j,istr).ne.0.0d0) then 188 nj = nj + 1 189 ind(nj) = j 190 endif 191 enddo 192 do i = cilo, cihi 193 sum = 0.0d0 194 do jj = 1, nj 195 j = ind(jj) 196 sum = sum + f(j,istr)*c(j,i) 197 enddo 198 s(istr,i) = s(istr,i) + sum 199 enddo 200 enddo 201 enddo 202c 203 end 204c 205c 206c Sigma vector Alpha-Beta contribution 207c 208c 209 subroutine detci_sigmaab( norb, nsym, nela, nelb, nstra, nstrb, 210 $ nexa, nexb, nekla, neklb, 211 $ osym, ijmap, exa, exb, 212 $ ataba, atabb, ntij, g, 213 $ vrhs, vlhs, vphase, f, 214 $ cprime, sprime, g_civec, g_sigma ) 215 implicit none 216#include "errquit.fh" 217#include "global.fh" 218#include "util.fh" 219#include "detciP.fh" 220#include "detci.fh" 221#include "cdetcistats.fh" 222#include "mafdecls.fh" 223 integer norb ! [input] Orbitals 224 integer nsym ! [input] Irreps 225 integer nela ! [input] Alpha electrons 226 integer nelb ! [input] Beta electrons 227 integer nstra ! [input] Alpha strings 228 integer nstrb ! [input] Beta strings 229 integer nexa ! [input] Alpha excitations 230 integer nexb ! [input] Beta excitations 231 integer nekla ! [input] Max non-zero alpha strings for E_kl 232 integer neklb ! [input] Max non-zero beta strings for E_kl 233 integer osym(norb) ! [input] Orbital irreps 234 integer ijmap(norb,norb) ! [input] Map of (i,j) -> ij (symm-blocked) 235 integer ataba(0:norb,0:nela,nsym) ! [input] Alpha arc weights 236 integer atabb(0:norb,0:nelb,nsym) ! [input] Beta arc weights 237 integer exa(6,nexa,nstra) ! [input] Alpha excitation lookup table 238 integer exb(6,nexb,nstrb) ! [input] Beta excitation lookup table 239 integer ntij ! [input] Symmtery-blocked triangular sum 240 double precision g(ntij,ntij) ! [input] ERI's 241 integer vrhs(nekla) ! [scratch] Array of RHS Strings for E_kl 242 integer vlhs(*) ! [scratch] Array of LHS Strings for E_kl 243 integer vphase(nekla) ! [scratch] Array of Phases for E_kl 244 double precision f(nexb) ! [scratch] Scratch space 245 double precision cprime(nstrb,nekla) ! [scratch] Gathered CI-vector 246 double precision sprime(nstrb,nekla) ! [scratch] Gathered sigma vector 247 integer g_civec ! [input] CI-vector 248 integer g_sigma ! [input/output] Sigma vector 249c 250c 251 integer i, ij, kl, iii 252 integer ak, al, iph 253 integer jstr 254 integer ne_kl, nidx, iex 255 integer relv(detci_maxelec) 256 integer lelv(detci_maxelec) 257 integer oidx(detci_maxorb) 258 integer ip(detci_maxelec) 259 integer ploop, numnodes, next, myid 260 integer nsblk, isblk, slo, shi, sseg 261 double precision phase 262 double precision tstr, tgath, tdotab, tscat 263* double precision tsync, tx 264 integer k_ci, l_ci, k_sig, l_sig 265 logical oreplicated 266 integer nxtask 267 external nxtask 268c 269c If enuf memory is available simply replicate the CI vector 270c and accumulate into local sigmas, ending with global sum. 271c 272 oreplicated = ma_inquire_avail(mt_dbl) .gt. 2*nstra*nstrb 273 if (oreplicated) then 274 if (.not. ma_push_get(mt_dbl, nstra*nstrb, 'detci_sab: c', 275 $ l_ci, k_ci)) call errquit('detci_sab: ma', nstra*nstrb, 276 & MA_ERR) 277 if (.not. ma_push_get(mt_dbl, nstra*nstrb, 'detci_sab: s', 278 $ l_sig, k_sig)) call errquit('detci_sab: ma', nstra*nstrb, 279 & MA_ERR) 280 call dfill(nstra*nstrb, 0.0d0, dbl_mb(k_sig), 1) 281 if (ga_nodeid() .eq. 0) call ga_get(g_civec, 1, nstrb, 1, nstra, 282 $ dbl_mb(k_ci), nstrb) 283 call ga_brdcst(1, dbl_mb(k_ci), 284 $ ma_sizeof(mt_dbl,nstra*nstrb,mt_byte), 0) 285 else 286 k_ci = 1 ! to avoid segv 287 k_sig = 1 288 endif 289c 290c 291c Initialize parallel stuff 292c 293 tstr = 0.d0 294 tgath = 0.d0 295 tdotab = 0.d0 296 tscat = 0.d0 297 ploop = -1 298 numnodes = ga_nnodes() 299 myid = ga_nodeid() 300c 301c Block over strings for finer granularity 302c 303 nsblk = (20*numnodes)/(norb*norb) 304 nsblk = max(nsblk,1) 305 NSBLK = min(2,nekla) 306 if (numnodes.eq.1) nsblk = 1 307 sseg = (nekla/nsblk) 308 if (mod(nekla,nsblk).ne.0) sseg = sseg + 1 309c 310c Loop over all excitation operators 311c 312c t 313c E = a a 314c kl k l 315c 316 next = nxtask(numnodes, 1) 317 do ak=1,norb 318 do al=1,norb 319 shi = 0 320 do isblk=1,nsblk 321 slo = shi + 1 322 shi = min((slo + sseg - 1),nekla) 323 ploop = ploop + 1 324 if (ploop.eq.next) then 325* tx = util_cpusec() 326 kl = ijmap(ak,al) 327c 328c Vector of orbital indices except create/annih indices 329c Initialize pointer vector 330c 331 call ifill(norb,0,oidx,1) 332 nidx = 0 333 do i=1,norb 334 if ((i.ne.ak).and.(i.ne.al)) then 335 nidx = nidx + 1 336 oidx(nidx) = i 337 endif 338 enddo 339 do i=1,nela-1 340 ip(i) = i 341 enddo 342c 343c Loop through all strings for nidx and (nela-1) 344c Accept strings in the block range slo:shi 345c Insert orbital index k and l to create 346c LHS and RHS strings where 347c 348c |LHS> = E |RHS> 349c kl 350c 351c Push indices into gather/scatter arrays 352c 353c Note: special case when nela = norb then 354c 355c E !RHS> = 0 for k != l 356c kl 357c 358c thus ne_kl = 0 359c 360 ne_kl = 0 361 if (nela.eq.1) then 362 ne_kl = 1 363 vrhs(i) = al 364 vlhs(i) = ak 365 vphase(i) = 1 366 else if ((norb.ne.nela).or.(al.eq.ak)) then 367 iii = 0 368 101 continue 369 iii = iii + 1 370 if ((iii.ge.slo).and.(iii.le.shi)) then 371 ne_kl = ne_kl + 1 372 iph = 1 373 call detci_ptr2elv( norb, nela, (nela-1), nidx, ip, 374 $ oidx, al, relv, iph ) 375 call detci_ptr2elv( norb, nela, (nela-1), nidx, ip, 376 $ oidx, ak, lelv, iph ) 377 vphase(ne_kl) = iph 378 vrhs(ne_kl) = detci_elv2str( norb, nela, nsym, osym, 379 $ ataba, relv) 380 vlhs(ne_kl) = detci_elv2str( norb, nela, nsym, osym, 381 $ ataba, lelv) 382 endif 383 if (detci_getnextelv(nidx,(nela-1),ip)) goto 101 384 endif 385* tstr = tstr + util_cpusec() - tx 386* tx = util_cpusec() 387c 388c End loop over possible strings 389c 390c Gather in CI blocks 391c 392 call detci_cigather( nstrb, nstra, ne_kl, g_civec, vlhs, 393 $ vphase, sprime , dbl_mb(k_ci), 394 $ oreplicated) 395 call dfill((nstrb*ne_kl), 0.d0, cprime, 1 ) 396 call transpose_nw( nstrb, ne_kl, sprime, cprime ) 397 call dfill((nstrb*ne_kl),0.d0,sprime,1) 398* tgath = tgath + util_cpusec() - tx 399* tx = util_cpusec() 400c 401c Loop over all beta strings 402c 403 do jstr=1,nstrb 404 call dfill( nstrb, 0.d0, f, 1 ) 405 do iex=1,nexb 406 vlhs(iex) = exb(1,iex,jstr) 407 ij = exb(3,iex,jstr) 408 phase = exb(4,iex,jstr) 409 f(iex) = phase*g(ij,kl) 410 enddo 411 call detci_dotabx(jstr, ne_kl, nstrb, nexb, f, vlhs, 412 $ cprime, sprime ) 413 enddo 414* tdotab = tdotab + util_cpusec() - tx 415* tx = util_cpusec() 416c 417c Scatter accumulate result into sigma vector 418c 419 call transpose_nw( ne_kl, nstrb, sprime, cprime ) 420 call detci_ciscatter( nstrb, nstra, ne_kl, cprime, 421 $ vrhs, g_sigma, dbl_mb(k_sig), 422 $ oreplicated) 423* tscat = tscat + util_cpusec() - tx 424c 425c End parallel task 426c 427 next = nxtask(numnodes, 1) 428 endif 429 enddo 430 enddo 431 enddo 432* tx = util_cpusec() 433 next = nxtask(-numnodes, 1) 434* tsync = util_cpusec() - tx 435c 436 if (oreplicated) then 437 call ga_dgop(1, dbl_mb(k_sig), nstra*nstrb, '+') 438 if (ga_nodeid().eq.0) call ga_acc(g_sigma, 1, nstrb, 1, nstra, 439 $ dbl_mb(k_sig), nstrb, 1.0d0) 440 if (.not. ma_pop_stack(l_sig)) call errquit('detci_sab: ma?',0, 441 & MA_ERR) 442 if (.not. ma_pop_stack(l_ci)) call errquit('detci_sab: ma?',0, 443 & MA_ERR) 444 call ga_sync() 445 endif 446 detci_abstr_etime = detci_abstr_etime + tstr 447 detci_abgath_etime = detci_abgath_etime + tgath 448 detci_abdotab_etime = detci_abdotab_etime + tdotab 449 detci_abscat_etime = detci_abscat_etime + tscat 450* detci_absync_etime = detci_absync_etime + tsync 451 return 452 end 453