1
2c
3c
4c     Sigma vector Alpha - Alpha contribution
5c
6c
7      subroutine detci_sigmaaa( norb, nsym, nela, nelb, nstra, nstrb,
8     $     nexa, nexb, nekla, neklb,
9     $     osym, ijmap, exa, exb,
10     $     ataba, atabb, ntij, nsblk,
11     $     h, g, f, g_civec, g_sigma )
12*
13* $Id$
14*
15      implicit none
16#include "errquit.fh"
17#include "mafdecls.fh"
18#include "global.fh"
19#include "msgids.fh"
20#include "util.fh"
21#include "detciP.fh"
22#include "detci.fh"
23#include "cdetcistats.fh"
24      integer norb              ! [input] Orbitals
25      integer nsym              ! [input] Irreps
26      integer nela              ! [input] Alpha electrons
27      integer nelb              ! [input] Beta electrons
28      integer nstra             ! [input] Alpha strings
29      integer nstrb             ! [input] Beta strings
30      integer nexa              ! [input] Alpha excitations
31      integer nexb              ! [input] Beta excitations
32      integer nekla             ! [input] Maximum non-zero alpha strings for E_kl
33      integer neklb             ! [input] Maximum non-zero beta strings for E_kl
34      integer osym(norb)        ! [input] Orbital irreps
35      integer ijmap(norb,norb)  ! [input] Map of (i,j) -> ij (symmtery-blocked)
36      integer ataba(0:norb,0:nela,nsym) ! [input] Alpha arc weights
37      integer atabb(0:norb,0:nelb,nsym) ! [input] Beta arc weights
38      integer exa(6,nexa,nstra) ! [input] Alpha excitation lookup table
39      integer exb(6,nexb,nstrb) ! [input] Beta excitation lookup table
40      integer ntij              ! [input] Symmtery-blocked triangular sum
41      integer nsblk             ! [input] Blocking factor to increase parallelism
42      double precision h(ntij)  ! [input] One-electron hamiltonian
43      double precision g(ntij,ntij) ! [input] ERI's
44      double precision f(nstrb,nsblk) ! [scratch] Scratch space
45      integer g_civec           ! [input] CI-vector
46      integer g_sigma           ! [input/output] Sigma vector
47c
48c
49c
50      integer istr, kstr, jstr
51      integer istrlo, istrhi, iistr
52      integer msblk, iscnt
53      integer iex, kex
54      integer rlo, rhi, cilo, cihi
55      integer k_ci, k_sig, ldc, lds
56      integer myid, numnodes
57      integer j, k, l
58c$$$  integer i
59      integer kl, ij, kj, lj
60      integer kphase, jphase
61      double precision xx, tff, tdot, tgop
62*      double precision tx
63      double precision h1(detci_maxorb*detci_maxorb)
64c$$$  double precision sdot
65c$$$  external sdot
66
67c
68#include "symmdef.fh"
69#include "bitops.fh"
70#include "symmmul.fh"
71c
72c
73      tff = 0.d0
74      tdot = 0.d0
75      tgop = 0.d0
76      if (.not.ga_compare_distr( g_civec, g_sigma ))
77     $     call errquit('detci_sigmaaa: CI vectors do not match',0,
78     &       GA_ERR)
79      myid = ga_nodeid()
80      numnodes = ga_nnodes()
81      call ga_distribution( g_civec, myid, rlo, rhi, cilo, cihi )
82c
83      if (((cilo.ne.0).and.(cihi.ne.-1)).and.
84     $     ((rlo.ne.1).or.(rhi.ne.nstrb)))
85     $     call errquit('detci_sigmaaa: wrong distrib for CI vector',0,
86     &       INPUT_ERR)
87c
88      if (cilo.gt.0 .and. cihi.gt.0) then
89         call ga_access( g_civec, rlo, rhi, cilo, cihi, k_ci, ldc )
90         call ga_access( g_sigma, rlo, rhi, cilo, cihi, k_sig, lds )
91      endif
92c
93c     Precompute 2e
94c
95      do k=1,norb
96         do l=1,k
97            xx = 0.d0
98            do j=1,norb
99               kj = ijmap(k,j)
100               lj = ijmap(l,j)
101               xx = xx + g(kj,lj)
102            enddo
103            kl = ijmap(k,l)
104            h1(kl) = xx*0.5d0
105         enddo
106      enddo
107c
108c
109c
110      do istrlo=1,nstrb,nsblk
111         istrhi = min((istrlo + nsblk - 1),nstrb)
112         msblk = istrhi - istrlo + 1
113*     tx = util_cpusec()
114         call dfill((msblk*nstrb),0.d0,f,1)
115         do istr=istrlo,istrhi
116            iistr = istr - istrlo + 1
117            iscnt = (iistr - 1)*nexb
118            do iex=1,nexb
119               if (mod((iscnt+iex-1),numnodes).eq.myid) then
120                  kstr   = exb(1,iex,istr)
121                  kl     = exb(3,iex,istr)
122                  kphase = exb(4,iex,istr)
123                  f(kstr,iistr) = f(kstr,iistr) +
124     $                 kphase*(h(kl) - h1(kl))
125c
126                  do kex=1,nexb
127                     jstr   = exb(1,kex,kstr)
128                     ij     = exb(3,kex,kstr)
129                     jphase = exb(4,kex,kstr)*kphase
130                     f(jstr,iistr) = f(jstr,iistr) +
131     $                    0.5d0*jphase*g(ij,kl)
132                  enddo
133               endif
134            enddo
135         enddo
136*     tff = tff + util_cpusec() - tx
137*     tx = util_cpusec()
138         call ga_dgop(Msg_detci_sum, f, (nstrb*msblk), '+' )
139*     tgop = tgop + util_cpusec() - tx
140c
141c     Data parallel here....
142c
143*     tx = util_cpusec()
144c$$$         do i=cilo, cihi
145c$$$            do istr=istrlo,istrhi
146c$$$               iistr = istr - istrlo + 1
147c$$$               dbl_mb(k_sig+istr-1+(i-cilo)*lds) =
148c$$$     $              dbl_mb(k_sig+istr-1+(i-cilo)*lds) +
149c$$$     $              ddot(nstrb,f(1,iistr),1,
150c$$$     $              dbl_mb(k_ci+(i-cilo)*ldc),1)
151c$$$            enddo
152c$$$         enddo
153         call detci_saa_kernel(dbl_mb(k_sig), lds,
154     $        dbl_mb(k_ci), ldc, f, nstrb,
155     $        cilo, cihi, istrlo, istrhi, nstrb)
156*     tdot = tdot + util_cpusec() - tx
157      enddo
158c
159      if (cilo.gt.0 .and. cihi.gt.0) then
160         call ga_release( g_civec, rlo, rhi, cilo, cihi )
161         call ga_release_update( g_sigma, rlo, rhi, cilo, cihi )
162      endif
163c
164c
165c     Collect stats
166c
167      detci_aaff_etime = detci_aaff_etime + tff
168      detci_aadot_etime = detci_aadot_etime + tdot
169      detci_aagop_etime = detci_aagop_etime + tgop
170      return
171      end
172      subroutine detci_saa_kernel(s, lds, c, ldc, f, ldf,
173     $     cilo, cihi, istrlo, istrhi, nstrb)
174      implicit none
175      integer lds, ldc, ldf, cilo, cihi, istrlo, istrhi, nstrb
176      double precision s(lds,cilo:cihi), c(ldc,cilo:cihi)
177      double precision f(ldf,istrlo:istrhi), sum
178c
179      integer istr, i, j, jlo, jhi, nj, jj
180      integer ind(1024)
181c
182      do istr = istrlo, istrhi  ! Short loop
183         do jlo = 1, nstrb, 1024
184            jhi = min(nstrb,jlo+1024-1)
185            nj = 0
186            do j = jlo, jhi
187               if (f(j,istr).ne.0.0d0) then
188                  nj = nj + 1
189                  ind(nj) = j
190               endif
191            enddo
192            do i = cilo, cihi
193               sum = 0.0d0
194               do jj = 1, nj
195                  j = ind(jj)
196                  sum = sum + f(j,istr)*c(j,i)
197               enddo
198               s(istr,i) = s(istr,i) + sum
199c
200c  Sigma block with non-zero only for sym(istr)*sym(i) = targetsym
201c  j runs over all symmetries as sym(j)*sym(j) = A1 and A1*targetsym=targetsym
202c
203            enddo
204         enddo
205      enddo
206c
207      end
208c
209c
210c  Sigma vector Alpha-Beta contribution
211c
212c
213      subroutine detci_sigmaab( norb, nsym, nela, nelb, nstra, nstrb,
214     $                          nexa, nexb, nekla, neklb,
215     $                          osym, ijmap, exa, exb,
216     $                          ataba, atabb, ntij, g,
217     $                          vrhs, vlhs, vphase, f,
218     $                          cprime, sprime, g_civec, g_sigma )
219      implicit none
220#include "errquit.fh"
221#include "global.fh"
222#include "util.fh"
223#include "detciP.fh"
224#include "detci.fh"
225#include "cdetcistats.fh"
226#include "mafdecls.fh"
227      integer norb                            ! [input] Orbitals
228      integer nsym                            ! [input] Irreps
229      integer nela                            ! [input] Alpha electrons
230      integer nelb                            ! [input] Beta electrons
231      integer nstra                           ! [input] Alpha strings
232      integer nstrb                           ! [input] Beta strings
233      integer nexa                            ! [input] Alpha excitations
234      integer nexb                            ! [input] Beta excitations
235      integer nekla                           ! [input] Max non-zero alpha strings for E_kl
236      integer neklb                           ! [input] Max non-zero beta strings for E_kl
237      integer osym(norb)                      ! [input] Orbital irreps
238      integer ijmap(norb,norb)                ! [input] Map of (i,j) -> ij (symm-blocked)
239      integer ataba(0:norb,0:nela,nsym)           ! [input] Alpha arc weights
240      integer atabb(0:norb,0:nelb,nsym)           ! [input] Beta arc weights
241      integer exa(6,nexa,nstra)               ! [input] Alpha excitation lookup table
242      integer exb(6,nexb,nstrb)               ! [input] Beta excitation lookup table
243      integer ntij                            ! [input] Symmtery-blocked triangular sum
244      double precision g(ntij,ntij)           ! [input] ERI's
245      integer vrhs(nekla)                     ! [scratch] Array of RHS Strings for E_kl
246      integer vlhs(*)                         ! [scratch] Array of LHS Strings for E_kl
247      integer vphase(nekla)                   ! [scratch] Array of Phases for E_kl
248      double precision f(nexb)                ! [scratch] Scratch space
249      double precision cprime(nstrb,nekla)    ! [scratch] Gathered CI-vector
250      double precision sprime(nstrb,nekla)    ! [scratch] Gathered sigma vector
251      integer g_civec                         ! [input] CI-vector
252      integer g_sigma                         ! [input/output] Sigma vector
253c
254c
255      integer i, ij, kl, iii
256      integer ak, al, iph
257      integer jstr
258      integer ne_kl, nidx, iex
259      integer relv(detci_maxelec)
260      integer lelv(detci_maxelec)
261      integer oidx(detci_maxorb)
262      integer ip(detci_maxelec)
263      integer ploop, numnodes, next, myid
264      integer nsblk, isblk, slo, shi, sseg
265      double precision phase
266      double precision tstr, tgath, tdotab, tscat
267      double precision tsync, tx
268      integer k_ci, l_ci, k_sig, l_sig
269      logical oreplicated
270      integer nxtask
271      external nxtask
272c
273c  If enuf memory is available simply replicate the CI vector
274c  and accumulate into local sigmas, ending with global sum.
275c
276      oreplicated = ma_inquire_avail(mt_dbl) .gt. 2*nstra*nstrb
277      if (oreplicated) then
278        if (.not. ma_push_get(mt_dbl, nstra*nstrb, 'detci_sab: c',
279     $       l_ci, k_ci)) call errquit('detci_sab: ma', nstra*nstrb,
280     &       MA_ERR)
281        if (.not. ma_push_get(mt_dbl, nstra*nstrb, 'detci_sab: s',
282     $       l_sig, k_sig)) call errquit('detci_sab: ma', nstra*nstrb,
283     &       MA_ERR)
284	call dfill(nstra*nstrb, 0.0d0, dbl_mb(k_sig), 1)
285	if (ga_nodeid() .eq. 0) call ga_get(g_civec, 1, nstrb, 1, nstra,
286     $                          dbl_mb(k_ci), nstrb)
287        call ga_brdcst(1, dbl_mb(k_ci),
288     $                 ma_sizeof(mt_dbl,nstra*nstrb,mt_byte), 0)
289      else
290	k_ci = 1  ! to avoid segv
291	k_sig = 1
292      endif
293c
294c
295c  Initialize parallel stuff
296c
297      tstr = 0.d0
298      tgath = 0.d0
299      tdotab = 0.d0
300      tscat = 0.d0
301      ploop = -1
302      numnodes = ga_nnodes()
303      myid = ga_nodeid()
304c
305c  Block over strings for finer granularity
306c
307      nsblk = (20*numnodes)/(norb*norb)
308      nsblk = max(nsblk,1)
309      NSBLK = min(2,nekla)
310      if (numnodes.eq.1) nsblk = 1
311      sseg = (nekla/nsblk)
312      if (mod(nekla,nsblk).ne.0) sseg = sseg + 1
313c
314c  Loop over all excitation operators
315c
316c            t
317c      E  = a a
318c       kl   k l
319c
320      next = nxtask(numnodes, 1)
321      do ak=1,norb
322        do al=1,norb
323          shi = 0
324          do isblk=1,nsblk
325            slo = shi + 1
326            shi = min((slo + sseg - 1),nekla)
327            ploop = ploop + 1
328            if (ploop.eq.next) then
329              tx = util_cpusec()
330              kl = ijmap(ak,al)
331c
332c Vector of orbital indices except create/annih indices
333c Initialize pointer vector
334c
335              call ifill(norb,0,oidx,1)
336              nidx = 0
337              do i=1,norb
338                if ((i.ne.ak).and.(i.ne.al)) then
339                  nidx = nidx + 1
340                  oidx(nidx) = i
341                endif
342              enddo
343              do i=1,nela-1
344                ip(i) = i
345              enddo
346c
347c Loop through all strings for nidx and (nela-1)
348c Accept strings in the block range slo:shi
349c Insert orbital index k and l to create
350c LHS and RHS strings where
351c
352c        |LHS> = E  |RHS>
353c                 kl
354c
355c Push indices into gather/scatter arrays
356c
357c Note: special case when nela = norb then
358c
359c       E  !RHS> = 0   for k != l
360c        kl
361c
362c thus ne_kl = 0
363c
364              ne_kl = 0
365              if (nela.eq.1) then
366                ne_kl = 1
367                vrhs(i) = al
368                vlhs(i) = ak
369                vphase(i) = 1
370              else if ((norb.ne.nela).or.(al.eq.ak)) then
371                iii = 0
372 101            continue
373                iii = iii + 1
374                if ((iii.ge.slo).and.(iii.le.shi)) then
375                  ne_kl = ne_kl + 1
376                  iph = 1
377                  call detci_ptr2elv( norb, nela, (nela-1), nidx, ip,
378     $                                oidx, al, relv, iph )
379                  call detci_ptr2elv( norb, nela, (nela-1), nidx, ip,
380     $                                oidx, ak, lelv, iph )
381                  vphase(ne_kl) = iph
382                  vrhs(ne_kl) = detci_elv2str( norb, nela, nsym, osym,
383     $                                         ataba, relv)
384                  vlhs(ne_kl) = detci_elv2str( norb, nela, nsym, osym,
385     $                                         ataba, lelv)
386                endif
387                if (detci_getnextelv(nidx,(nela-1),ip)) goto 101
388              endif
389              tstr = tstr + util_cpusec() - tx
390              tx = util_cpusec()
391c
392c End loop over possible strings
393c
394c Gather in CI blocks
395c
396              call detci_cigather( nstrb, nstra, ne_kl, g_civec, vlhs,
397     $                             vphase, sprime , dbl_mb(k_ci),
398     $                             oreplicated)
399              call dfill((nstrb*ne_kl), 0.d0, cprime, 1 )
400              call transpose_nw( nstrb, ne_kl, sprime, cprime )
401              call dfill((nstrb*ne_kl),0.d0,sprime,1)
402              tgath = tgath + util_cpusec() - tx
403              tx = util_cpusec()
404c
405c Loop over all beta strings
406c
407              do jstr=1,nstrb
408                call dfill( nstrb, 0.d0, f, 1 )
409                do iex=1,nexb
410                  vlhs(iex) = exb(1,iex,jstr)
411                  ij        = exb(3,iex,jstr)
412                  phase     = exb(4,iex,jstr)
413                  f(iex)    = phase*g(ij,kl)
414                enddo
415                call detci_dotabx(jstr, ne_kl, nstrb, nexb, f, vlhs,
416     $                            cprime, sprime )
417              enddo
418              tdotab = tdotab + util_cpusec() - tx
419              tx = util_cpusec()
420c
421c Scatter accumulate result into sigma vector
422c
423              call transpose_nw( ne_kl, nstrb, sprime, cprime )
424              call detci_ciscatter( nstrb, nstra, ne_kl, cprime,
425     $                              vrhs, g_sigma, dbl_mb(k_sig),
426     $                              oreplicated)
427              tscat = tscat + util_cpusec() - tx
428c
429c End parallel task
430c
431              next = nxtask(numnodes, 1)
432            endif
433          enddo
434        enddo
435      enddo
436      tx = util_cpusec()
437      next = nxtask(-numnodes, 1)
438      tsync = util_cpusec() - tx
439c
440      if (oreplicated) then
441	call ga_dgop(1, dbl_mb(k_sig), nstra*nstrb, '+')
442        if (ga_nodeid().eq.0) call ga_acc(g_sigma, 1, nstrb, 1, nstra,
443     $                                    dbl_mb(k_sig), nstrb, 1.0d0)
444        if (.not. ma_pop_stack(l_sig)) call errquit('detci_sab: ma?',0,
445     &       MA_ERR)
446        if (.not. ma_pop_stack(l_ci)) call errquit('detci_sab: ma?',0,
447     &       MA_ERR)
448        call ga_sync()
449      endif
450      detci_abstr_etime   = detci_abstr_etime + tstr
451      detci_abgath_etime  = detci_abgath_etime + tgath
452      detci_abdotab_etime = detci_abdotab_etime + tdotab
453      detci_abscat_etime  = detci_abscat_etime + tscat
454*      detci_absync_etime  = detci_absync_etime + tsync
455      return
456      end
457