1
2c
3c
4c     Sigma vector Alpha - Alpha contribution
5c
6c
7      subroutine detci_sigmaaa( norb, nsym, nela, nelb, nstra, nstrb,
8     $     nexa, nexb, nekla, neklb,
9     $     osym, ijmap, exa, exb,
10     $     ataba, atabb, ntij, nsblk,
11     $     h, g, f, g_civec, g_sigma )
12*
13* $Id$
14*
15      implicit none
16#include "errquit.fh"
17#include "mafdecls.fh"
18#include "global.fh"
19#include "msgids.fh"
20#include "util.fh"
21#include "detciP.fh"
22#include "detci.fh"
23#include "cdetcistats.fh"
24      integer norb              ! [input] Orbitals
25      integer nsym              ! [input] Irreps
26      integer nela              ! [input] Alpha electrons
27      integer nelb              ! [input] Beta electrons
28      integer nstra             ! [input] Alpha strings
29      integer nstrb             ! [input] Beta strings
30      integer nexa              ! [input] Alpha excitations
31      integer nexb              ! [input] Beta excitations
32      integer nekla             ! [input] Maximum non-zero alpha strings for E_kl
33      integer neklb             ! [input] Maximum non-zero beta strings for E_kl
34      integer osym(norb)        ! [input] Orbital irreps
35      integer ijmap(norb,norb)  ! [input] Map of (i,j) -> ij (symmtery-blocked)
36      integer ataba(0:norb,0:nela,nsym) ! [input] Alpha arc weights
37      integer atabb(0:norb,0:nelb,nsym) ! [input] Beta arc weights
38      integer exa(6,nexa,nstra) ! [input] Alpha excitation lookup table
39      integer exb(6,nexb,nstrb) ! [input] Beta excitation lookup table
40      integer ntij              ! [input] Symmtery-blocked triangular sum
41      integer nsblk             ! [input] Blocking factor to increase parallelism
42      double precision h(ntij)  ! [input] One-electron hamiltonian
43      double precision g(ntij,ntij) ! [input] ERI's
44      double precision f(nstrb,nsblk) ! [scratch] Scratch space
45      integer g_civec           ! [input] CI-vector
46      integer g_sigma           ! [input/output] Sigma vector
47c
48c
49c
50      integer istr, kstr, jstr
51      integer istrlo, istrhi, iistr
52      integer msblk, iscnt
53      integer iex, kex
54      integer rlo, rhi, cilo, cihi
55      integer k_ci, k_sig, ldc, lds
56      integer myid, numnodes
57      integer j, k, l
58c$$$  integer i
59      integer kl, ij, kj, lj
60      integer kphase, jphase
61      double precision xx, tff, tdot, tgop
62*      double precision tx
63      double precision h1(detci_maxorb*detci_maxorb)
64c$$$  double precision sdot
65c$$$  external sdot
66
67c
68#include "symmdef.fh"
69#include "bitops.fh"
70#include "symmmul.fh"
71c
72c
73      tff = 0.d0
74      tdot = 0.d0
75      tgop = 0.d0
76      if (.not.ga_compare_distr( g_civec, g_sigma ))
77     $     call errquit('detci_sigmaaa: CI vectors do not match',0,
78     &       GA_ERR)
79      myid = ga_nodeid()
80      numnodes = ga_nnodes()
81      call ga_distribution( g_civec, myid, rlo, rhi, cilo, cihi )
82c
83      if (((cilo.ne.0).and.(cihi.ne.-1)).and.
84     $     ((rlo.ne.1).or.(rhi.ne.nstrb)))
85     $     call errquit('detci_sigmaaa: wrong distrib for CI vector',0,
86     &       INPUT_ERR)
87c
88      if (cilo.gt.0 .and. cihi.gt.0) then
89         call ga_access( g_civec, rlo, rhi, cilo, cihi, k_ci, ldc )
90         call ga_access( g_sigma, rlo, rhi, cilo, cihi, k_sig, lds )
91      endif
92c
93c     Precompute 2e
94c
95      do k=1,norb
96         do l=1,k
97            xx = 0.d0
98            do j=1,norb
99               kj = ijmap(k,j)
100               lj = ijmap(l,j)
101               xx = xx + g(kj,lj)
102            enddo
103            kl = ijmap(k,l)
104            h1(kl) = xx*0.5d0
105         enddo
106      enddo
107c
108c
109c
110      do istrlo=1,nstrb,nsblk
111         istrhi = min((istrlo + nsblk - 1),nstrb)
112         msblk = istrhi - istrlo + 1
113*     tx = util_cpusec()
114         call dfill((msblk*nstrb),0.d0,f,1)
115         do istr=istrlo,istrhi
116            iistr = istr - istrlo + 1
117            iscnt = (iistr - 1)*nexb
118            do iex=1,nexb
119               if (mod((iscnt+iex-1),numnodes).eq.myid) then
120                  kstr   = exb(1,iex,istr)
121                  kl     = exb(3,iex,istr)
122                  kphase = exb(4,iex,istr)
123                  f(kstr,iistr) = f(kstr,iistr) +
124     $                 kphase*(h(kl) - h1(kl))
125c
126                  do kex=1,nexb
127                     jstr   = exb(1,kex,kstr)
128                     ij     = exb(3,kex,kstr)
129                     jphase = exb(4,kex,kstr)*kphase
130                     f(jstr,iistr) = f(jstr,iistr) +
131     $                    0.5d0*jphase*g(ij,kl)
132                  enddo
133               endif
134            enddo
135         enddo
136*     tff = tff + util_cpusec() - tx
137*     tx = util_cpusec()
138         call ga_dgop(Msg_detci_sum, f, (nstrb*msblk), '+' )
139*     tgop = tgop + util_cpusec() - tx
140c
141c     Data parallel here....
142c
143*     tx = util_cpusec()
144c$$$         do i=cilo, cihi
145c$$$            do istr=istrlo,istrhi
146c$$$               iistr = istr - istrlo + 1
147c$$$               dbl_mb(k_sig+istr-1+(i-cilo)*lds) =
148c$$$     $              dbl_mb(k_sig+istr-1+(i-cilo)*lds) +
149c$$$     $              ddot(nstrb,f(1,iistr),1,
150c$$$     $              dbl_mb(k_ci+(i-cilo)*ldc),1)
151c$$$            enddo
152c$$$         enddo
153         call detci_saa_kernel(dbl_mb(k_sig), lds,
154     $        dbl_mb(k_ci), ldc, f, nstrb,
155     $        cilo, cihi, istrlo, istrhi, nstrb)
156*     tdot = tdot + util_cpusec() - tx
157      enddo
158c
159      if (cilo.gt.0 .and. cihi.gt.0) then
160         call ga_release( g_civec, rlo, rhi, cilo, cihi )
161         call ga_release_update( g_sigma, rlo, rhi, cilo, cihi )
162      endif
163c
164c
165c     Collect stats
166c
167      detci_aaff_etime = detci_aaff_etime + tff
168      detci_aadot_etime = detci_aadot_etime + tdot
169      detci_aagop_etime = detci_aagop_etime + tgop
170      return
171      end
172      subroutine detci_saa_kernel(s, lds, c, ldc, f, ldf,
173     $     cilo, cihi, istrlo, istrhi, nstrb)
174      implicit none
175      integer lds, ldc, ldf, cilo, cihi, istrlo, istrhi, nstrb
176      double precision s(lds,cilo:cihi), c(ldc,cilo:cihi)
177      double precision f(ldf,istrlo:istrhi), sum
178c
179      integer istr, i, j, jlo, jhi, nj, jj
180      integer ind(1024)
181c
182      do istr = istrlo, istrhi  ! Short loop
183         do jlo = 1, nstrb, 1024
184            jhi = min(nstrb,jlo+1024-1)
185            nj = 0
186            do j = jlo, jhi
187               if (f(j,istr).ne.0.0d0) then
188                  nj = nj + 1
189                  ind(nj) = j
190               endif
191            enddo
192            do i = cilo, cihi
193               sum = 0.0d0
194               do jj = 1, nj
195                  j = ind(jj)
196                  sum = sum + f(j,istr)*c(j,i)
197               enddo
198               s(istr,i) = s(istr,i) + sum
199            enddo
200         enddo
201      enddo
202c
203      end
204c
205c
206c  Sigma vector Alpha-Beta contribution
207c
208c
209      subroutine detci_sigmaab( norb, nsym, nela, nelb, nstra, nstrb,
210     $                          nexa, nexb, nekla, neklb,
211     $                          osym, ijmap, exa, exb,
212     $                          ataba, atabb, ntij, g,
213     $                          vrhs, vlhs, vphase, f,
214     $                          cprime, sprime, g_civec, g_sigma )
215      implicit none
216#include "errquit.fh"
217#include "global.fh"
218#include "util.fh"
219#include "detciP.fh"
220#include "detci.fh"
221#include "cdetcistats.fh"
222#include "mafdecls.fh"
223      integer norb                            ! [input] Orbitals
224      integer nsym                            ! [input] Irreps
225      integer nela                            ! [input] Alpha electrons
226      integer nelb                            ! [input] Beta electrons
227      integer nstra                           ! [input] Alpha strings
228      integer nstrb                           ! [input] Beta strings
229      integer nexa                            ! [input] Alpha excitations
230      integer nexb                            ! [input] Beta excitations
231      integer nekla                           ! [input] Max non-zero alpha strings for E_kl
232      integer neklb                           ! [input] Max non-zero beta strings for E_kl
233      integer osym(norb)                      ! [input] Orbital irreps
234      integer ijmap(norb,norb)                ! [input] Map of (i,j) -> ij (symm-blocked)
235      integer ataba(0:norb,0:nela,nsym)           ! [input] Alpha arc weights
236      integer atabb(0:norb,0:nelb,nsym)           ! [input] Beta arc weights
237      integer exa(6,nexa,nstra)               ! [input] Alpha excitation lookup table
238      integer exb(6,nexb,nstrb)               ! [input] Beta excitation lookup table
239      integer ntij                            ! [input] Symmtery-blocked triangular sum
240      double precision g(ntij,ntij)           ! [input] ERI's
241      integer vrhs(nekla)                     ! [scratch] Array of RHS Strings for E_kl
242      integer vlhs(*)                         ! [scratch] Array of LHS Strings for E_kl
243      integer vphase(nekla)                   ! [scratch] Array of Phases for E_kl
244      double precision f(nexb)                ! [scratch] Scratch space
245      double precision cprime(nstrb,nekla)    ! [scratch] Gathered CI-vector
246      double precision sprime(nstrb,nekla)    ! [scratch] Gathered sigma vector
247      integer g_civec                         ! [input] CI-vector
248      integer g_sigma                         ! [input/output] Sigma vector
249c
250c
251      integer i, ij, kl, iii
252      integer ak, al, iph
253      integer jstr
254      integer ne_kl, nidx, iex
255      integer relv(detci_maxelec)
256      integer lelv(detci_maxelec)
257      integer oidx(detci_maxorb)
258      integer ip(detci_maxelec)
259      integer ploop, numnodes, next, myid
260      integer nsblk, isblk, slo, shi, sseg
261      double precision phase
262      double precision tstr, tgath, tdotab, tscat
263*      double precision tsync, tx
264      integer k_ci, l_ci, k_sig, l_sig
265      logical oreplicated
266      integer nxtask
267      external nxtask
268c
269c  If enuf memory is available simply replicate the CI vector
270c  and accumulate into local sigmas, ending with global sum.
271c
272      oreplicated = ma_inquire_avail(mt_dbl) .gt. 2*nstra*nstrb
273      if (oreplicated) then
274        if (.not. ma_push_get(mt_dbl, nstra*nstrb, 'detci_sab: c',
275     $       l_ci, k_ci)) call errquit('detci_sab: ma', nstra*nstrb,
276     &       MA_ERR)
277        if (.not. ma_push_get(mt_dbl, nstra*nstrb, 'detci_sab: s',
278     $       l_sig, k_sig)) call errquit('detci_sab: ma', nstra*nstrb,
279     &       MA_ERR)
280	call dfill(nstra*nstrb, 0.0d0, dbl_mb(k_sig), 1)
281	if (ga_nodeid() .eq. 0) call ga_get(g_civec, 1, nstrb, 1, nstra,
282     $                          dbl_mb(k_ci), nstrb)
283        call ga_brdcst(1, dbl_mb(k_ci),
284     $                 ma_sizeof(mt_dbl,nstra*nstrb,mt_byte), 0)
285      else
286	k_ci = 1  ! to avoid segv
287	k_sig = 1
288      endif
289c
290c
291c  Initialize parallel stuff
292c
293      tstr = 0.d0
294      tgath = 0.d0
295      tdotab = 0.d0
296      tscat = 0.d0
297      ploop = -1
298      numnodes = ga_nnodes()
299      myid = ga_nodeid()
300c
301c  Block over strings for finer granularity
302c
303      nsblk = (20*numnodes)/(norb*norb)
304      nsblk = max(nsblk,1)
305      NSBLK = min(2,nekla)
306      if (numnodes.eq.1) nsblk = 1
307      sseg = (nekla/nsblk)
308      if (mod(nekla,nsblk).ne.0) sseg = sseg + 1
309c
310c  Loop over all excitation operators
311c
312c            t
313c      E  = a a
314c       kl   k l
315c
316      next = nxtask(numnodes, 1)
317      do ak=1,norb
318        do al=1,norb
319          shi = 0
320          do isblk=1,nsblk
321            slo = shi + 1
322            shi = min((slo + sseg - 1),nekla)
323            ploop = ploop + 1
324            if (ploop.eq.next) then
325*              tx = util_cpusec()
326              kl = ijmap(ak,al)
327c
328c Vector of orbital indices except create/annih indices
329c Initialize pointer vector
330c
331              call ifill(norb,0,oidx,1)
332              nidx = 0
333              do i=1,norb
334                if ((i.ne.ak).and.(i.ne.al)) then
335                  nidx = nidx + 1
336                  oidx(nidx) = i
337                endif
338              enddo
339              do i=1,nela-1
340                ip(i) = i
341              enddo
342c
343c Loop through all strings for nidx and (nela-1)
344c Accept strings in the block range slo:shi
345c Insert orbital index k and l to create
346c LHS and RHS strings where
347c
348c        |LHS> = E  |RHS>
349c                 kl
350c
351c Push indices into gather/scatter arrays
352c
353c Note: special case when nela = norb then
354c
355c       E  !RHS> = 0   for k != l
356c        kl
357c
358c thus ne_kl = 0
359c
360              ne_kl = 0
361              if (nela.eq.1) then
362                ne_kl = 1
363                vrhs(i) = al
364                vlhs(i) = ak
365                vphase(i) = 1
366              else if ((norb.ne.nela).or.(al.eq.ak)) then
367                iii = 0
368 101            continue
369                iii = iii + 1
370                if ((iii.ge.slo).and.(iii.le.shi)) then
371                  ne_kl = ne_kl + 1
372                  iph = 1
373                  call detci_ptr2elv( norb, nela, (nela-1), nidx, ip,
374     $                                oidx, al, relv, iph )
375                  call detci_ptr2elv( norb, nela, (nela-1), nidx, ip,
376     $                                oidx, ak, lelv, iph )
377                  vphase(ne_kl) = iph
378                  vrhs(ne_kl) = detci_elv2str( norb, nela, nsym, osym,
379     $                                         ataba, relv)
380                  vlhs(ne_kl) = detci_elv2str( norb, nela, nsym, osym,
381     $                                         ataba, lelv)
382                endif
383                if (detci_getnextelv(nidx,(nela-1),ip)) goto 101
384              endif
385*              tstr = tstr + util_cpusec() - tx
386*              tx = util_cpusec()
387c
388c End loop over possible strings
389c
390c Gather in CI blocks
391c
392              call detci_cigather( nstrb, nstra, ne_kl, g_civec, vlhs,
393     $                             vphase, sprime , dbl_mb(k_ci),
394     $                             oreplicated)
395              call dfill((nstrb*ne_kl), 0.d0, cprime, 1 )
396              call transpose_nw( nstrb, ne_kl, sprime, cprime )
397              call dfill((nstrb*ne_kl),0.d0,sprime,1)
398*              tgath = tgath + util_cpusec() - tx
399*              tx = util_cpusec()
400c
401c Loop over all beta strings
402c
403              do jstr=1,nstrb
404                call dfill( nstrb, 0.d0, f, 1 )
405                do iex=1,nexb
406                  vlhs(iex) = exb(1,iex,jstr)
407                  ij        = exb(3,iex,jstr)
408                  phase     = exb(4,iex,jstr)
409                  f(iex)    = phase*g(ij,kl)
410                enddo
411                call detci_dotabx(jstr, ne_kl, nstrb, nexb, f, vlhs,
412     $                            cprime, sprime )
413              enddo
414*              tdotab = tdotab + util_cpusec() - tx
415*              tx = util_cpusec()
416c
417c Scatter accumulate result into sigma vector
418c
419              call transpose_nw( ne_kl, nstrb, sprime, cprime )
420              call detci_ciscatter( nstrb, nstra, ne_kl, cprime,
421     $                              vrhs, g_sigma, dbl_mb(k_sig),
422     $                              oreplicated)
423*              tscat = tscat + util_cpusec() - tx
424c
425c End parallel task
426c
427              next = nxtask(numnodes, 1)
428            endif
429          enddo
430        enddo
431      enddo
432*      tx = util_cpusec()
433      next = nxtask(-numnodes, 1)
434*      tsync = util_cpusec() - tx
435c
436      if (oreplicated) then
437	call ga_dgop(1, dbl_mb(k_sig), nstra*nstrb, '+')
438        if (ga_nodeid().eq.0) call ga_acc(g_sigma, 1, nstrb, 1, nstra,
439     $                                    dbl_mb(k_sig), nstrb, 1.0d0)
440        if (.not. ma_pop_stack(l_sig)) call errquit('detci_sab: ma?',0,
441     &       MA_ERR)
442        if (.not. ma_pop_stack(l_ci)) call errquit('detci_sab: ma?',0,
443     &       MA_ERR)
444        call ga_sync()
445      endif
446      detci_abstr_etime   = detci_abstr_etime + tstr
447      detci_abgath_etime  = detci_abgath_etime + tgath
448      detci_abdotab_etime = detci_abdotab_etime + tdotab
449      detci_abscat_etime  = detci_abscat_etime + tscat
450*      detci_absync_etime  = detci_absync_etime + tsync
451      return
452      end
453