1 2c 3c 4c Sigma vector Alpha - Alpha contribution 5c 6c 7 subroutine detci_sigmaaa( norb, nsym, nela, nelb, nstra, nstrb, 8 $ nexa, nexb, nekla, neklb, 9 $ osym, ijmap, exa, exb, 10 $ ataba, atabb, ntij, nsblk, 11 $ h, g, f, g_civec, g_sigma ) 12* 13* $Id$ 14* 15 implicit none 16#include "errquit.fh" 17#include "mafdecls.fh" 18#include "global.fh" 19#include "msgids.fh" 20#include "util.fh" 21#include "detciP.fh" 22#include "detci.fh" 23#include "cdetcistats.fh" 24 integer norb ! [input] Orbitals 25 integer nsym ! [input] Irreps 26 integer nela ! [input] Alpha electrons 27 integer nelb ! [input] Beta electrons 28 integer nstra ! [input] Alpha strings 29 integer nstrb ! [input] Beta strings 30 integer nexa ! [input] Alpha excitations 31 integer nexb ! [input] Beta excitations 32 integer nekla ! [input] Maximum non-zero alpha strings for E_kl 33 integer neklb ! [input] Maximum non-zero beta strings for E_kl 34 integer osym(norb) ! [input] Orbital irreps 35 integer ijmap(norb,norb) ! [input] Map of (i,j) -> ij (symmtery-blocked) 36 integer ataba(0:norb,0:nela,nsym) ! [input] Alpha arc weights 37 integer atabb(0:norb,0:nelb,nsym) ! [input] Beta arc weights 38 integer exa(6,nexa,nstra) ! [input] Alpha excitation lookup table 39 integer exb(6,nexb,nstrb) ! [input] Beta excitation lookup table 40 integer ntij ! [input] Symmtery-blocked triangular sum 41 integer nsblk ! [input] Blocking factor to increase parallelism 42 double precision h(ntij) ! [input] One-electron hamiltonian 43 double precision g(ntij,ntij) ! [input] ERI's 44 double precision f(nstrb,nsblk) ! [scratch] Scratch space 45 integer g_civec ! [input] CI-vector 46 integer g_sigma ! [input/output] Sigma vector 47c 48c 49c 50 integer istr, kstr, jstr 51 integer istrlo, istrhi, iistr 52 integer msblk, iscnt 53 integer iex, kex 54 integer rlo, rhi, cilo, cihi 55 integer k_ci, k_sig, ldc, lds 56 integer myid, numnodes 57 integer j, k, l 58c$$$ integer i 59 integer kl, ij, kj, lj 60 integer kphase, jphase 61 double precision xx, tff, tdot, tgop 62* double precision tx 63 double precision h1(detci_maxorb*detci_maxorb) 64c$$$ double precision sdot 65c$$$ external sdot 66 67c 68#include "symmdef.fh" 69#include "bitops.fh" 70#include "symmmul.fh" 71c 72c 73 tff = 0.d0 74 tdot = 0.d0 75 tgop = 0.d0 76 if (.not.ga_compare_distr( g_civec, g_sigma )) 77 $ call errquit('detci_sigmaaa: CI vectors do not match',0, 78 & GA_ERR) 79 myid = ga_nodeid() 80 numnodes = ga_nnodes() 81 call ga_distribution( g_civec, myid, rlo, rhi, cilo, cihi ) 82c 83 if (((cilo.ne.0).and.(cihi.ne.-1)).and. 84 $ ((rlo.ne.1).or.(rhi.ne.nstrb))) 85 $ call errquit('detci_sigmaaa: wrong distrib for CI vector',0, 86 & INPUT_ERR) 87c 88 if (cilo.gt.0 .and. cihi.gt.0) then 89 call ga_access( g_civec, rlo, rhi, cilo, cihi, k_ci, ldc ) 90 call ga_access( g_sigma, rlo, rhi, cilo, cihi, k_sig, lds ) 91 endif 92c 93c Precompute 2e 94c 95 do k=1,norb 96 do l=1,k 97 xx = 0.d0 98 do j=1,norb 99 kj = ijmap(k,j) 100 lj = ijmap(l,j) 101 xx = xx + g(kj,lj) 102 enddo 103 kl = ijmap(k,l) 104 h1(kl) = xx*0.5d0 105 enddo 106 enddo 107c 108c 109c 110 do istrlo=1,nstrb,nsblk 111 istrhi = min((istrlo + nsblk - 1),nstrb) 112 msblk = istrhi - istrlo + 1 113* tx = util_cpusec() 114 call dfill((msblk*nstrb),0.d0,f,1) 115 do istr=istrlo,istrhi 116 iistr = istr - istrlo + 1 117 iscnt = (iistr - 1)*nexb 118 do iex=1,nexb 119 if (mod((iscnt+iex-1),numnodes).eq.myid) then 120 kstr = exb(1,iex,istr) 121 kl = exb(3,iex,istr) 122 kphase = exb(4,iex,istr) 123 f(kstr,iistr) = f(kstr,iistr) + 124 $ kphase*(h(kl) - h1(kl)) 125c 126 do kex=1,nexb 127 jstr = exb(1,kex,kstr) 128 ij = exb(3,kex,kstr) 129 jphase = exb(4,kex,kstr)*kphase 130 f(jstr,iistr) = f(jstr,iistr) + 131 $ 0.5d0*jphase*g(ij,kl) 132 enddo 133 endif 134 enddo 135 enddo 136* tff = tff + util_cpusec() - tx 137* tx = util_cpusec() 138 call ga_dgop(Msg_detci_sum, f, (nstrb*msblk), '+' ) 139* tgop = tgop + util_cpusec() - tx 140c 141c Data parallel here.... 142c 143* tx = util_cpusec() 144c$$$ do i=cilo, cihi 145c$$$ do istr=istrlo,istrhi 146c$$$ iistr = istr - istrlo + 1 147c$$$ dbl_mb(k_sig+istr-1+(i-cilo)*lds) = 148c$$$ $ dbl_mb(k_sig+istr-1+(i-cilo)*lds) + 149c$$$ $ ddot(nstrb,f(1,iistr),1, 150c$$$ $ dbl_mb(k_ci+(i-cilo)*ldc),1) 151c$$$ enddo 152c$$$ enddo 153 call detci_saa_kernel(dbl_mb(k_sig), lds, 154 $ dbl_mb(k_ci), ldc, f, nstrb, 155 $ cilo, cihi, istrlo, istrhi, nstrb) 156* tdot = tdot + util_cpusec() - tx 157 enddo 158c 159 if (cilo.gt.0 .and. cihi.gt.0) then 160 call ga_release( g_civec, rlo, rhi, cilo, cihi ) 161 call ga_release_update( g_sigma, rlo, rhi, cilo, cihi ) 162 endif 163c 164c 165c Collect stats 166c 167 detci_aaff_etime = detci_aaff_etime + tff 168 detci_aadot_etime = detci_aadot_etime + tdot 169 detci_aagop_etime = detci_aagop_etime + tgop 170 return 171 end 172 subroutine detci_saa_kernel(s, lds, c, ldc, f, ldf, 173 $ cilo, cihi, istrlo, istrhi, nstrb) 174 implicit none 175 integer lds, ldc, ldf, cilo, cihi, istrlo, istrhi, nstrb 176 double precision s(lds,cilo:cihi), c(ldc,cilo:cihi) 177 double precision f(ldf,istrlo:istrhi), sum 178c 179 integer istr, i, j, jlo, jhi, nj, jj 180 integer ind(1024) 181c 182 do istr = istrlo, istrhi ! Short loop 183 do jlo = 1, nstrb, 1024 184 jhi = min(nstrb,jlo+1024-1) 185 nj = 0 186 do j = jlo, jhi 187 if (f(j,istr).ne.0.0d0) then 188 nj = nj + 1 189 ind(nj) = j 190 endif 191 enddo 192 do i = cilo, cihi 193 sum = 0.0d0 194 do jj = 1, nj 195 j = ind(jj) 196 sum = sum + f(j,istr)*c(j,i) 197 enddo 198 s(istr,i) = s(istr,i) + sum 199c 200c Sigma block with non-zero only for sym(istr)*sym(i) = targetsym 201c j runs over all symmetries as sym(j)*sym(j) = A1 and A1*targetsym=targetsym 202c 203 enddo 204 enddo 205 enddo 206c 207 end 208c 209c 210c Sigma vector Alpha-Beta contribution 211c 212c 213 subroutine detci_sigmaab( norb, nsym, nela, nelb, nstra, nstrb, 214 $ nexa, nexb, nekla, neklb, 215 $ osym, ijmap, exa, exb, 216 $ ataba, atabb, ntij, g, 217 $ vrhs, vlhs, vphase, f, 218 $ cprime, sprime, g_civec, g_sigma ) 219 implicit none 220#include "errquit.fh" 221#include "global.fh" 222#include "util.fh" 223#include "detciP.fh" 224#include "detci.fh" 225#include "cdetcistats.fh" 226#include "mafdecls.fh" 227 integer norb ! [input] Orbitals 228 integer nsym ! [input] Irreps 229 integer nela ! [input] Alpha electrons 230 integer nelb ! [input] Beta electrons 231 integer nstra ! [input] Alpha strings 232 integer nstrb ! [input] Beta strings 233 integer nexa ! [input] Alpha excitations 234 integer nexb ! [input] Beta excitations 235 integer nekla ! [input] Max non-zero alpha strings for E_kl 236 integer neklb ! [input] Max non-zero beta strings for E_kl 237 integer osym(norb) ! [input] Orbital irreps 238 integer ijmap(norb,norb) ! [input] Map of (i,j) -> ij (symm-blocked) 239 integer ataba(0:norb,0:nela,nsym) ! [input] Alpha arc weights 240 integer atabb(0:norb,0:nelb,nsym) ! [input] Beta arc weights 241 integer exa(6,nexa,nstra) ! [input] Alpha excitation lookup table 242 integer exb(6,nexb,nstrb) ! [input] Beta excitation lookup table 243 integer ntij ! [input] Symmtery-blocked triangular sum 244 double precision g(ntij,ntij) ! [input] ERI's 245 integer vrhs(nekla) ! [scratch] Array of RHS Strings for E_kl 246 integer vlhs(*) ! [scratch] Array of LHS Strings for E_kl 247 integer vphase(nekla) ! [scratch] Array of Phases for E_kl 248 double precision f(nexb) ! [scratch] Scratch space 249 double precision cprime(nstrb,nekla) ! [scratch] Gathered CI-vector 250 double precision sprime(nstrb,nekla) ! [scratch] Gathered sigma vector 251 integer g_civec ! [input] CI-vector 252 integer g_sigma ! [input/output] Sigma vector 253c 254c 255 integer i, ij, kl, iii 256 integer ak, al, iph 257 integer jstr 258 integer ne_kl, nidx, iex 259 integer relv(detci_maxelec) 260 integer lelv(detci_maxelec) 261 integer oidx(detci_maxorb) 262 integer ip(detci_maxelec) 263 integer ploop, numnodes, next, myid 264 integer nsblk, isblk, slo, shi, sseg 265 double precision phase 266 double precision tstr, tgath, tdotab, tscat 267 double precision tsync, tx 268 integer k_ci, l_ci, k_sig, l_sig 269 logical oreplicated 270 integer nxtask 271 external nxtask 272c 273c If enuf memory is available simply replicate the CI vector 274c and accumulate into local sigmas, ending with global sum. 275c 276 oreplicated = ma_inquire_avail(mt_dbl) .gt. 2*nstra*nstrb 277 if (oreplicated) then 278 if (.not. ma_push_get(mt_dbl, nstra*nstrb, 'detci_sab: c', 279 $ l_ci, k_ci)) call errquit('detci_sab: ma', nstra*nstrb, 280 & MA_ERR) 281 if (.not. ma_push_get(mt_dbl, nstra*nstrb, 'detci_sab: s', 282 $ l_sig, k_sig)) call errquit('detci_sab: ma', nstra*nstrb, 283 & MA_ERR) 284 call dfill(nstra*nstrb, 0.0d0, dbl_mb(k_sig), 1) 285 if (ga_nodeid() .eq. 0) call ga_get(g_civec, 1, nstrb, 1, nstra, 286 $ dbl_mb(k_ci), nstrb) 287 call ga_brdcst(1, dbl_mb(k_ci), 288 $ ma_sizeof(mt_dbl,nstra*nstrb,mt_byte), 0) 289 else 290 k_ci = 1 ! to avoid segv 291 k_sig = 1 292 endif 293c 294c 295c Initialize parallel stuff 296c 297 tstr = 0.d0 298 tgath = 0.d0 299 tdotab = 0.d0 300 tscat = 0.d0 301 ploop = -1 302 numnodes = ga_nnodes() 303 myid = ga_nodeid() 304c 305c Block over strings for finer granularity 306c 307 nsblk = (20*numnodes)/(norb*norb) 308 nsblk = max(nsblk,1) 309 NSBLK = min(2,nekla) 310 if (numnodes.eq.1) nsblk = 1 311 sseg = (nekla/nsblk) 312 if (mod(nekla,nsblk).ne.0) sseg = sseg + 1 313c 314c Loop over all excitation operators 315c 316c t 317c E = a a 318c kl k l 319c 320 next = nxtask(numnodes, 1) 321 do ak=1,norb 322 do al=1,norb 323 shi = 0 324 do isblk=1,nsblk 325 slo = shi + 1 326 shi = min((slo + sseg - 1),nekla) 327 ploop = ploop + 1 328 if (ploop.eq.next) then 329 tx = util_cpusec() 330 kl = ijmap(ak,al) 331c 332c Vector of orbital indices except create/annih indices 333c Initialize pointer vector 334c 335 call ifill(norb,0,oidx,1) 336 nidx = 0 337 do i=1,norb 338 if ((i.ne.ak).and.(i.ne.al)) then 339 nidx = nidx + 1 340 oidx(nidx) = i 341 endif 342 enddo 343 do i=1,nela-1 344 ip(i) = i 345 enddo 346c 347c Loop through all strings for nidx and (nela-1) 348c Accept strings in the block range slo:shi 349c Insert orbital index k and l to create 350c LHS and RHS strings where 351c 352c |LHS> = E |RHS> 353c kl 354c 355c Push indices into gather/scatter arrays 356c 357c Note: special case when nela = norb then 358c 359c E !RHS> = 0 for k != l 360c kl 361c 362c thus ne_kl = 0 363c 364 ne_kl = 0 365 if (nela.eq.1) then 366 ne_kl = 1 367 vrhs(i) = al 368 vlhs(i) = ak 369 vphase(i) = 1 370 else if ((norb.ne.nela).or.(al.eq.ak)) then 371 iii = 0 372 101 continue 373 iii = iii + 1 374 if ((iii.ge.slo).and.(iii.le.shi)) then 375 ne_kl = ne_kl + 1 376 iph = 1 377 call detci_ptr2elv( norb, nela, (nela-1), nidx, ip, 378 $ oidx, al, relv, iph ) 379 call detci_ptr2elv( norb, nela, (nela-1), nidx, ip, 380 $ oidx, ak, lelv, iph ) 381 vphase(ne_kl) = iph 382 vrhs(ne_kl) = detci_elv2str( norb, nela, nsym, osym, 383 $ ataba, relv) 384 vlhs(ne_kl) = detci_elv2str( norb, nela, nsym, osym, 385 $ ataba, lelv) 386 endif 387 if (detci_getnextelv(nidx,(nela-1),ip)) goto 101 388 endif 389 tstr = tstr + util_cpusec() - tx 390 tx = util_cpusec() 391c 392c End loop over possible strings 393c 394c Gather in CI blocks 395c 396 call detci_cigather( nstrb, nstra, ne_kl, g_civec, vlhs, 397 $ vphase, sprime , dbl_mb(k_ci), 398 $ oreplicated) 399 call dfill((nstrb*ne_kl), 0.d0, cprime, 1 ) 400 call transpose_nw( nstrb, ne_kl, sprime, cprime ) 401 call dfill((nstrb*ne_kl),0.d0,sprime,1) 402 tgath = tgath + util_cpusec() - tx 403 tx = util_cpusec() 404c 405c Loop over all beta strings 406c 407 do jstr=1,nstrb 408 call dfill( nstrb, 0.d0, f, 1 ) 409 do iex=1,nexb 410 vlhs(iex) = exb(1,iex,jstr) 411 ij = exb(3,iex,jstr) 412 phase = exb(4,iex,jstr) 413 f(iex) = phase*g(ij,kl) 414 enddo 415 call detci_dotabx(jstr, ne_kl, nstrb, nexb, f, vlhs, 416 $ cprime, sprime ) 417 enddo 418 tdotab = tdotab + util_cpusec() - tx 419 tx = util_cpusec() 420c 421c Scatter accumulate result into sigma vector 422c 423 call transpose_nw( ne_kl, nstrb, sprime, cprime ) 424 call detci_ciscatter( nstrb, nstra, ne_kl, cprime, 425 $ vrhs, g_sigma, dbl_mb(k_sig), 426 $ oreplicated) 427 tscat = tscat + util_cpusec() - tx 428c 429c End parallel task 430c 431 next = nxtask(numnodes, 1) 432 endif 433 enddo 434 enddo 435 enddo 436 tx = util_cpusec() 437 next = nxtask(-numnodes, 1) 438 tsync = util_cpusec() - tx 439c 440 if (oreplicated) then 441 call ga_dgop(1, dbl_mb(k_sig), nstra*nstrb, '+') 442 if (ga_nodeid().eq.0) call ga_acc(g_sigma, 1, nstrb, 1, nstra, 443 $ dbl_mb(k_sig), nstrb, 1.0d0) 444 if (.not. ma_pop_stack(l_sig)) call errquit('detci_sab: ma?',0, 445 & MA_ERR) 446 if (.not. ma_pop_stack(l_ci)) call errquit('detci_sab: ma?',0, 447 & MA_ERR) 448 call ga_sync() 449 endif 450 detci_abstr_etime = detci_abstr_etime + tstr 451 detci_abgath_etime = detci_abgath_etime + tgath 452 detci_abdotab_etime = detci_abdotab_etime + tdotab 453 detci_abscat_etime = detci_abscat_etime + tscat 454* detci_absync_etime = detci_absync_etime + tsync 455 return 456 end 457