1c
2c     Evaluate the quantity W. Eq. 24, Furche & Ahlrichs
3c     There are three parts that need evaluating separately.
4c     Occupied-occupied, Occupied-Virtual, Virtual-Virtual parts.
5c     HvD 11/2007, NG 11/2012
6c
7      subroutine tddft_grad_compute_w(rtdb,ihdl_geom,ihdl_bfao,tol2e,
8     +           tda,ipol,nroots,nfc,naoc,nocc,nav,
9     +           nfv,nao,g_mo,g_p,g_z,g_xpy,g_xmy,eps,omega,g_w,
10     +           kfac,lhashf,otriplet)
11c
12      implicit none
13c
14#include "errquit.fh"
15#include "mafdecls.fh"
16#include "global.fh"
17#include "rtdb.fh"
18#include "tddft_grad_util.fh"
19#include "stdio.fh"
20c
21c     Input:
22c
23      integer rtdb      ! runtime database handle
24      integer ihdl_geom ! geometry handle
25      integer ihdl_bfao ! basis set handle
26      logical tda       ! .true. if Tamm-Dancoff approximation is used
27      integer ipol      ! =1 (restricted), =2 (unrestricted)
28      integer nfc(2)    ! the number of frozen core orbitals
29      integer naoc(2)   ! the number of active occupied orbitals
30      integer nocc(2)   ! the total number of occupied orbitals
31      integer nav(2)    ! the number of active virtual orbitals
32      integer nfv(2)    ! the number of frozen virtual orbitals
33      integer nao       ! the number of basis functions
34      integer nroots    ! the number of states to consider
35      integer g_mo(2)   ! global array handle for MO coefficients
36      integer g_p(2)    ! global array handle for Ppq
37      integer g_z(2)    ! global array handle for Zia
38      integer g_xpy(2)  ! global array handle for (X+Y)
39      integer g_xmy(2)  ! global array handle for (X-Y)
40c
41      logical lhashf    ! =.true.  hybrid functionals
42                        ! =.false. otherwise
43      logical otriplet  ! =.true.  triplet excited states
44                        ! =.false. singlet excited states
45                        ! value does not matter for TDUDFT
46c
47      double precision kfac          ! the weight of the Hartree-Fock
48                                     ! exchange contributions
49      double precision eps(nao,2)    ! orbital eigenvalues
50      double precision omega(nroots) ! the excitation energies
51      double precision tol2e         ! integral tolerance
52c
53c     Output:
54c
55      integer g_w(2)    ! global array handle for W
56c
57c     Functions:
58c
59      integer  ga_create_atom_blocked
60      external ga_create_atom_blocked
61      logical  xc_gotxc
62      external xc_gotxc
63c
64c     Local:
65c
66      integer g_puv     ! global array handle for Puv
67      integer g_apbp ! global array handle for (A+B)Puv
68      integer g_ambp ! global array handle for (A-B)Puv
69      integer g_x    ! global array handle for X
70      integer g_y    ! global array handle for Y
71      integer g_apbx ! global array handle for (A+B)X
72      integer g_ambx ! global array handle for (A-B)X
73      integer g_apby ! global array handle for (A+B)Y
74      integer g_amby ! global array handle for (A-B)Y
75      integer g_hij(2)  ! global array handle for Hij+-
76      integer g_t       ! global array handle for transposer
77c
78      integer alo(3) ! lower chunck limits on A
79      integer ahi(3) ! upper chunck limits on A
80      integer blo(3) ! lower chunck limits on B
81      integer bhi(3) ! upper chunck limits on B
82      integer clo(3) ! lower chunck limits on C
83      integer chi(3) ! upper chunck limits on C
84      integer ld(3)  ! leading dimensions
85c
86      integer idim(3)  ! dimensions for global arrays
87      integer ichnk(3) ! chunk sizes for distribution
88c
89      integer nproc  ! the number of processors
90      integer iproc  ! the rank of the current processor
91      integer jproc  ! the rank of some other processor
92c
93      integer ip     ! counter over spin components
94      integer ir     ! counter over roots
95      integer ic     ! counter for arbitrary purposes
96      integer icount ! counter for arbitrary purposes
97c
98      integer mini   ! minimum value of i-label
99      integer minj   ! minimum value of j-label
100      integer mina   ! minimum value of a-label
101      integer minb   ! minimum value of b-label
102      integer maxi   ! maximum value of i-label
103      integer maxj   ! maximum value of j-label
104      integer maxa   ! maximum value of a-label
105      integer maxb   ! maximum value of b-label
106      integer numi   ! the number of i-labels
107      integer numj   ! the number of j-labels
108      integer numa   ! the number of a-labels
109      integer numb   ! the number of b-labels
110c
111      integer i      ! counter over i-labels
112      integer j      ! counter over j-labels
113      integer a      ! counter over a-labels
114      integer b      ! counter over b-labels
115c
116      integer imo    ! molecular orbital label for accessing eps
117c
118      integer ihdl_a ! the memory handle for block A
119      integer ihdl_b ! the memory handle for block B
120      integer ihdl_r ! the memory handle for block R
121      integer ihdl_v ! the memory handle for block V
122c
123      integer iptr_a ! the memory index for block A
124      integer iptr_b ! the memory index for block B
125      integer iptr_r ! the memory index for block R
126      integer iptr_v ! the memory index for block V
127c
128      integer calc_type      ! calculation type for fock_xc
129      integer l_dens, k_dens ! memory for array of density handles
130      integer l_den2, k_den2 ! memory for array of density handles
131      integer ndens          ! the number of density matrices
132      integer l_gxc , k_gxc  ! memory for array of Gxc matrix handles
133      integer ngxc           ! the number of Gxc matrices
134      logical oroot
135      integer iwhich
136c
137      character*32 pname
138c
139      logical oskel
140      parameter (oskel=.false.)
141      double precision Exc(2)    ! Exchange-correlation energy
142c
143      logical tdaloc
144      logical doitw,doitz
145      logical doitxpy1,doitxpy2
146      logical doitxmy1,doitxmy2
147c
148      pname = "tddft_grad_compute_w: "
149      iwhich = 0   ! call to tddft_nga_cont()
150c
151c     General stuff
152c
153      iproc = ga_nodeid()
154      nproc = ga_nnodes()
155c
156      do ip = 1, ipol
157        call ga_zero(g_w(ip))
158      enddo
159c Daniel (12/1/12): Here, we have that the occupied-occupied block is
160c the sum of the ground state energy-weighted matrix with the
161c energy-weighted difference density matrix.  The former gives the
162c first term in the expression.  The ground state density matrix makes
163c no other contributions to W.
164c
165c     Compute the occupied-occupied block
166c
167c     Wijs = delta_ij eps_i
168c          + Omega Sum_a [(X+Y)ias(X-Y)jas+(X-Y)ias(X+Y)jas]/2
169c          - Sum_a eps_a [(X+Y)ias(X+Y)jas+(X-Y)ias(X-Y)jas]/2
170c          + Aijs'[P]
171c          + Sum_kat Sum_ldr Gxc_ijskatldr(X+Y)kat(X+Y)ldr
172c
173c     Do the first term, the parallelisation is driven off the
174c     distribution of g_w.
175c
176      do ip = 1, ipol
177        call nga_distribution(g_w(ip),iproc,alo,ahi)
178        doitw = .not.(alo(1).ne.1.or.ahi(1).ne.nroots)
179        if (doitw) then
180         mini = max(alo(2),alo(3))
181         maxi = min(ahi(2),ahi(3),naoc(ip))
182         do i = mini, maxi
183          imo = nfc(ip)+i
184          do ir = 1, nroots
185            blo(1) = ir
186            bhi(1) = ir
187            blo(2) = i
188            bhi(2) = i
189            blo(3) = i
190            bhi(3) = i
191            ld(1)  = 1
192            ld(2)  = 1
193            call nga_acc(g_w(ip),blo,bhi,(3-ipol)*eps(imo,ip),ld,1.0d0)
194          enddo
195         enddo
196        endif ! doitw
197      enddo ! ipol
198c
199c     Do the second term
200c
201      do ip = 1, ipol
202          do ir = 1, nroots
203            alo(1) = ir
204            ahi(1) = ir
205            alo(2) = 1
206            ahi(2) = naoc(ip)
207            alo(3) = 1
208            ahi(3) = nav(ip)
209            blo(1) = ir
210            bhi(1) = ir
211            blo(2) = 1
212            bhi(2) = nav(ip)
213            blo(3) = 1
214            bhi(3) = naoc(ip)
215            clo(1) = ir
216            chi(1) = ir
217            clo(2) = 1
218            chi(2) = naoc(ip)
219            clo(3) = 1
220            chi(3) = naoc(ip)
221            if (.not.tda) then
222               call tddft_patch3mxm('n','t',0.5d0*omega(ir),1.0d0,
223     +                                g_xpy(ip),alo,ahi,
224     +                                g_xmy(ip),blo,bhi,
225     +                                g_w(ip),clo,chi)
226               call tddft_patch3mxm('n','t',0.5d0*omega(ir),1.0d0,
227     +                                g_xmy(ip),alo,ahi,
228     +                                g_xpy(ip),blo,bhi,
229     +                                g_w(ip),clo,chi)
230            else
231c              g_xpy: X with CIS, Y = 0
232c              g_xmy: not created with CIS
233               call tddft_patch3mxm('n','t',omega(ir),1.0d0,
234     +                                g_xpy(ip),alo,ahi,
235     +                                g_xpy(ip),blo,bhi,
236     +                                g_w(ip),clo,chi)
237            endif
238          enddo  ! ir
239c
240c         Do the third term, drive the parallelization off the
241c         distribution of (X+-Y)ib. The reason for this choice is that
242c         the occ-occ block will be localized on a (small) sub-set of
243c         the processors. So driving the parallelization off the
244c         distribution of W will result in dreadful load imbalance
245c         problems. This choice will lead to a (slightly) less bad
246c         load imbalance.
247c
248c         Do the (X+Y) terms first
249c
250          call nga_distribution(g_xpy(ip),iproc,blo,bhi)
251          doitxpy1 = .not.(blo(1).ne.1.or.bhi(1).ne.nroots)
252c
253          if (doitxpy1) then
254c
255           minj = blo(2)
256           maxj = bhi(2)
257           numj = maxj-minj+1
258           mina = blo(3)
259           maxa = bhi(3)
260           numa = maxa-mina+1
261           if (.not.ma_push_get(mt_dbl,nroots*numa*numj,'block b',
262     +                         ihdl_b,iptr_b))
263     +      call errquit(pname//'failed to allocate blockb',0,MA_ERR)
264           ld(1)  = nroots
265           ld(2)  = numj
266           call nga_get(g_xpy(ip),blo,bhi,dbl_mb(iptr_b),ld)
267           ic = 0
268           do a = 1, numa
269             do j = 1, numj
270              imo = nocc(ip)+mina-1+a
271              do ir = 1, nroots
272                dbl_mb(iptr_b+ic) = 0.5d0*eps(imo,ip)*dbl_mb(iptr_b+ic)
273                ic = ic + 1
274              enddo
275            enddo
276           enddo
277c
278           do icount = 0, nproc-1
279           jproc = mod(iproc+icount,nproc)
280c
281           call nga_distribution(g_xpy(ip),jproc,alo,ahi)
282           doitxpy2 = .not.(alo(1).ne.1.or.ahi(1).ne.nroots)
283c
284           if (doitxpy2.and.doitxpy1) then
285            if (alo(3).eq.mina.and.ahi(3).eq.maxa) then
286              mini = alo(2)
287              maxi = ahi(2)
288              numi = maxi-mini+1
289c             alo(3) = mina
290c             ahi(3) = maxa
291              if (.not.ma_push_get(mt_dbl,nroots*numa*numi,'block a',
292     +                             ihdl_a,iptr_a))
293     +         call errquit(pname//'failed to allocate blockb',0,UERR)
294              if (.not.ma_push_get(mt_dbl,nroots*numi*numj,'block r',
295     +                             ihdl_r,iptr_r))
296     +         call errquit(pname//'failed to allocate blockr',0,UERR)
297              ld(1)  = nroots
298              ld(2)  = numi
299              call nga_get(g_xpy(ip),alo,ahi,dbl_mb(iptr_a),ld)
300              call dfill(nroots*numi*numj,0.0d0,dbl_mb(iptr_r),1)
301              do a = 0, numa-1
302                do j = 0, numj-1
303                  do i = 0, numi-1
304                    do ir = 0, nroots-1
305                      dbl_mb(iptr_r+ir+nroots*i+nroots*numi*j)
306     +                = dbl_mb(iptr_r+ir+nroots*i+nroots*numi*j)
307     +                - dbl_mb(iptr_a+ir+nroots*i+nroots*numi*a)
308     +                * dbl_mb(iptr_b+ir+nroots*j+nroots*numj*a)
309                    enddo
310                  enddo
311                enddo
312              enddo
313c
314c Daniel (12-3-12): We need to account for (X-Y) = X in TDA here.
315c We must either double the contribution from (X+Y) here or redo the
316c procedure in this block of code.  I prefer scaling here since it's
317c less messy.
318              if (tda) then
319                call dscal(nroots*numi*numj,2.0d0,dbl_mb(iptr_r),1)
320              endif
321              clo(1) = 1
322              chi(1) = nroots
323              clo(2) = mini
324              chi(2) = maxi
325              clo(3) = minj
326              chi(3) = maxj
327              ld(1) = nroots
328              ld(2) = numi
329              call nga_acc(g_w(ip),clo,chi,dbl_mb(iptr_r),ld,1.0d0)
330              if (.not.ma_pop_stack(ihdl_r))
331     +         call errquit(pname//'failed to deallocate blockr',0,UERR)
332              if (.not.ma_pop_stack(ihdl_a))
333     +         call errquit(pname//'failed to deallocate blocka',0,UERR)
334            endif
335           endif !doitxpy1 and doitxpy2
336           enddo  ! icount
337c
338             if (.not.ma_pop_stack(ihdl_b))
339     +         call errquit(pname//'failed to deallocate blockb',0,UERR)
340           endif ! doitxpy1
341
342          call ga_sync()
343c
344          if (.not.tda) then
345c
346c         Do the (X-Y) terms next
347c
348            call nga_distribution(g_xmy(ip),iproc,blo,bhi)
349            doitxmy1 = .not.(blo(1).ne.1.or.bhi(1).ne.nroots)
350c
351            if (doitxmy1) then
352c
353             minj = blo(2)
354             maxj = bhi(2)
355             numj = maxj-minj+1
356             mina = blo(3)
357             maxa = bhi(3)
358             numa = maxa-mina+1
359             if (.not.ma_push_get(mt_dbl,nroots*numa*numj,'block b',
360     +                           ihdl_b,iptr_b))
361     +        call errquit(pname//'failed to allocate blockb',0,UERR)
362             ld(1)  = nroots
363             ld(2)  = numj
364             call nga_get(g_xmy(ip),blo,bhi,dbl_mb(iptr_b),ld)
365             ic = 0
366             do a = 1, numa
367              do i = 1, numj
368                imo = nocc(ip)+mina-1+a
369                do ir = 1, nroots
370                  dbl_mb(iptr_b+ic) = 0.5d0*eps(imo,ip)*
371     +                                      dbl_mb(iptr_b+ic)
372                  ic = ic + 1
373                enddo
374              enddo
375             enddo
376c
377            do icount = 0, nproc-1
378              jproc = mod(iproc+icount,nproc)
379              call nga_distribution(g_xmy(ip),jproc,alo,ahi)
380              doitxmy2 = .not.(alo(1).ne.1.or.ahi(1).ne.nroots)
381c
382               if (doitxmy2.and.doitxmy1) then
383               if (alo(3).eq.mina.and.ahi(3).eq.maxa) then
384                mini = alo(2)
385                maxi = ahi(2)
386                numi = maxi-mini+1
387c               alo(3) = mina
388c               ahi(3) = maxa
389                if (.not.ma_push_get(mt_dbl,nroots*numa*numi,'block a',
390     +                               ihdl_a,iptr_a))
391     +           call errquit(pname//'failed to allocate blockb',0,UERR)
392                if (.not.ma_push_get(mt_dbl,nroots*numi*numj,'block r',
393     +                               ihdl_r,iptr_r))
394     +           call errquit(pname//'failed to allocate blockr',0,UERR)
395                ld(1)  = nroots
396                ld(2)  = numi
397                call nga_get(g_xmy(ip),alo,ahi,dbl_mb(iptr_a),ld)
398                call dfill(nroots*numi*numj,0.0d0,dbl_mb(iptr_r),1)
399                do a = 0, numa-1
400                  do i = 0, numi-1
401                    do j = 0, numj-1
402                      do ir = 0, nroots-1
403                        dbl_mb(iptr_r+ir+nroots*i+nroots*numi*j)
404     +                  = dbl_mb(iptr_r+ir+nroots*i+nroots*numi*j)
405     +                  - dbl_mb(iptr_a+ir+nroots*i+nroots*numi*a)
406     +                  * dbl_mb(iptr_b+ir+nroots*j+nroots*numj*a)
407                      enddo
408                    enddo
409                  enddo
410                enddo
411                clo(1) = 1
412                chi(1) = nroots
413                clo(2) = mini
414                chi(2) = maxi
415                clo(3) = minj
416                chi(3) = maxj
417                ld(1) = nroots
418                ld(2) = numi
419                call nga_acc(g_w(ip),clo,chi,dbl_mb(iptr_r),ld,1.0d0)
420                if (.not.ma_pop_stack(ihdl_r))
421     +            call errquit(pname//'failed to deallocate blockr',0,
422     +              UERR)
423                if (.not.ma_pop_stack(ihdl_a))
424     +            call errquit(pname//'failed to deallocate blocka',0,
425     +              UERR)
426              endif
427              endif ! doitxmy1 and doitxmy2
428            enddo ! icount
429c
430             if (.not.ma_pop_stack(ihdl_b))
431     +        call errquit(pname//'failed to deallocate blockb',0,UERR)
432            endif ! doitxmy1
433c
434          call ga_sync()
435c
436          endif ! tda
437c
438      enddo ! ipol
439c
440c     Now do the fourth term
441c
442c     - Create Puv
443c
444      idim(1) = nroots*ipol
445      idim(2) = nao
446      idim(3) = nao
447      ichnk(1) = nroots*ipol
448      ichnk(2) = -1
449      ichnk(3) = -1
450      if (.not.nga_create(mt_dbl,3,idim,'vec Puv',ichnk,g_puv))
451     +     call errquit(pname//'failed to create g_puv',0,GA_ERR)
452c
453c     - Puv = sum_pq Cup*Ppq*Cvq
454c
455      call tddft_grad_trans_mo2ao(ipol,nao,nfc,naoc,nocc,nav,nfv,
456     +     nroots,1.0d0,0.0d0,"pq",g_mo,g_p,"pq",g_puv)
457c
458c     - Create global array for (A+B)P in AO basis
459c     - Compute (A+B)P in AO basis (currently we have to compute
460c       (A-B)P as well although we do not need it...)
461c
462      if (.not.nga_create(mt_dbl,3,idim,'vec (A+B)P',ichnk,g_apbp))
463     +    call errquit(pname//'failed to create g_apbp',0,GA_ERR)
464      if (.not.nga_create(mt_dbl,3,idim,'vec (A-B)P',ichnk,g_ambp))
465     +    call errquit(pname//'failed to create g_ambp',0,GA_ERR)
466      call ga_zero(g_ambp)
467      call ga_zero(g_apbp)
468c Daniel (2-26-13): It was not obvious that we need to unset
469c fock_xc:triplet here for restricted triplet calculations to work.
470      if (otriplet) then
471        if (.not.rtdb_put(rtdb,'fock_xc:triplet',mt_log,1,.false.))
472     1    call errquit(pname//'failed to set triplet',0,RTDB_ERR)
473      endif
474      call tddft_nga_cont(rtdb,ihdl_geom,ihdl_bfao,g_puv,g_apbp,g_ambp,
475     +     nao,ipol,tol2e,tda,oskel,kfac,lhashf,.false.,nroots,iwhich)
476c Daniel (2-26-13): Reset fock_xc:triplet here for restricted triplet
477c calculations to work.
478      if (otriplet) then
479        if (.not.rtdb_put(rtdb,'fock_xc:triplet',mt_log,1,.true.))
480     1    call errquit(pname//'failed to set triplet',0,RTDB_ERR)
481      endif
482c Daniel (1-5-13): This line accidentally halves the contribution of
483c this term for CIS, but functions correctly for RPA.
484      if (.not.tda) then
485        call ga_add(+0.5d0,g_apbp,+0.5d0,g_ambp,g_apbp)
486      endif
487c
488c     - Destroy global array for Puv
489c
490      if (.not.ga_destroy(g_puv))
491     +    call errquit(pname//'failed to destroy g_puv',0,GA_ERR)
492c
493c     - Transform (A+B)P from AO basis to Wij in MO basis
494c
495      call tddft_grad_trans_ao2mo(ipol,nao,nfc,naoc,nocc,nav,nfv,
496     +     nroots,1.0d0,1.0d0,"ij",g_mo,g_apbp,g_w,"pq")
497c
498c     - Destroy (A+B)P and (A-B)P
499c
500      if (.not.ga_destroy(g_apbp))
501     +    call errquit(pname//'failed to destroy g_apbp',0,GA_ERR)
502      if (.not.ga_destroy(g_ambp))
503     +    call errquit(pname//'failed to destroy g_apbp',0,GA_ERR)
504c
505c     Now do the fifth term (this follows the same approach as in
506c     tddft_grad_comp_r)
507c
508      if (xc_gotxc()) then
509        ndens = ipol*(nroots+1)
510        ngxc  = ipol*nroots
511        if (.not.ma_push_get(mt_int,ngxc,'gxc-s',l_gxc,k_gxc))
512     +    call errquit(pname//'failed to allocate l_gxc',0,MA_ERR)
513        if (.not.ma_push_get(mt_int,ndens,'densities',l_dens,k_dens))
514     +    call errquit(pname//'failed to allocate l_dens',0,MA_ERR)
515        if (.not.ma_push_get(mt_int,ndens,'dens-tmps',l_den2,k_den2))
516     +    call errquit(pname//'failed to allocate l_den2',0,MA_ERR)
517c
518c       - Create and calculate the AO basis density matrices
519c
520        do ip = 0, ipol-1
521          int_mb(k_dens+nroots*ipol+ip) =
522     +       ga_create_atom_blocked(ihdl_geom,ihdl_bfao,"d_ao")
523        enddo
524        call tddft_grad_compute_dao(ipol,nao,nocc,g_mo,
525     +                           int_mb(k_dens+nroots*ipol))
526        if (ipol.eq.1) call ga_scale(int_mb(k_dens+nroots*ipol),2.0d0)
527c
528c       - Create and calculate the AO basis representation of (X+Y)
529c
530        do ip = 0, ipol-1
531          do ir = 0, nroots-1
532            int_mb(k_dens+ip*nroots+ir) = ga_create_atom_blocked(
533     +                                   ihdl_geom,ihdl_bfao,"xpy_ao")
534c Daniel (1-14-13): Added this call for consistency with
535c tddft_grad_compute_r
536            call ga_zero(int_mb(k_dens+ip*nroots+ir))
537          enddo
538        enddo
539        do ip = 1, ipol
540          do ir = 1, nroots
541            call tddft_grad_trans_mo2ao(1,nao,nfc(ip),naoc(ip),nocc(ip),
542     +           nav(ip),nfv(ip),ir,1.0d0,0.0d0,"ib",g_mo(ip),g_xpy(ip),
543     +           "ib",int_mb(k_dens+(ip-1)*nroots+ir-1))
544            call ga_symmetrize(int_mb(k_dens+(ip-1)*nroots+ir-1))
545c Daniel (2-16-13): This line is needed to get matching results from the
546c unrestricted code compared to the restricted one.
547            if (ipol.eq.1) then
548              call ga_scale(int_mb(k_dens+(ip-1)*nroots+ir-1),2.0d0)
549            endif
550          enddo
551        enddo
552c
553c       - Create and calculate Gxc in AO basis using fock_xc
554c
555        do i = 0, ngxc-1
556          int_mb(k_gxc+i) = ga_create_atom_blocked(ihdl_geom,ihdl_bfao,
557     +                                             "gxc_ao")
558          call ga_zero(int_mb(k_gxc+i))
559        enddo
560        if (.not.rtdb_get(rtdb,'fock_xc:calc_type',mt_int,1,calc_type))
561     +    calc_type=0
562        if (.not.rtdb_put(rtdb,'fock_xc:calc_type',mt_int,1,5))
563     +    call errquit(pname//'failed to set calc_type 5',0,RTDB_ERR)
564        if (.not.rtdb_put(rtdb,'fock_xc:calc_type_tddft_w',mt_int,1,
565     +      calc_type))
566     +    call errquit(pname//'failed to set calc_type_tddft_w',
567     +       0,RTDB_ERR)
568c
569c XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
570c Daniel (2-16-13): There is a line in fock_xc, involving lcgmin, that
571c will set l3d to true, even though we want it to be false for the
572c gradients.  We need to set the RTDB such that dft:cgmin is true so
573c that the l3d is set correctly in fock_xc.
574c **********************************************************************
575c WE NEED TO FIX THIS BECAUSE TDDFT GRADIENTS DON'T WORK with CGMIN SET.
576c **********************************************************************
577c Daniel (2-18-13): This fix will not work for optimizations.
578c        if (.not.rtdb_put(rtdb,'dft:cgmin',mt_log,1,.true.))
579c     1    call errquit(pname//'failed to set cgmin',0,RTDB_ERR)
580c XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
581c
582c       Reorder the densities to match the expectation in fock_xc
583c       See tddft_grad_comp_r for details.
584c
585        do i = 0, nroots
586          do ip = 0, ipol-1
587            int_mb(k_den2+i+ip*(nroots+1)) = int_mb(k_dens+ip+ipol*i)
588          enddo
589        enddo
590        call ga_sync()
591c Daniel (2-12-13): We need to set triplet here for fock_xc.
592        if (otriplet) then
593          if (.not.rtdb_put(rtdb,'fock_xc:triplet',mt_log,1,otriplet))
594     1      call errquit(pname//'failed to set triplet',0,RTDB_ERR)
595        endif
596        call fock_xc(ihdl_geom, nao, ihdl_bfao, ngxc, int_mb(k_den2),
597     +               int_mb(k_gxc), Exc, ipol, .false.)
598        call ga_sync()
599c DEBUG
600      if (tddft_grad_util_print('tddft grad w',print_debug)) then
601        oroot = ga_nodeid().eq.0
602        if (oroot) write(LuOut,*)'DEBUG: '//pname//'gxc'
603        call tddft_grad_print_array(ipol,nroots,int_mb(k_gxc),
604     +                              dble(ipol))
605      endif
606c DEBUG
607c
608        if (.not.rtdb_get(rtdb,'fock_xc:calc_type_tddft_w',mt_int,1,
609     +    calc_type))
610     +    call errquit(pname//'failed to get calc_type_tddft_w',0,
611     +        RTDB_ERR)
612        if (.not.rtdb_put(rtdb,'fock_xc:calc_type',mt_int,1,calc_type))
613     +    call errquit(pname//'failed to reset calc_type',0,RTDB_ERR)
614        if (.not.rtdb_delete(rtdb,'fock_xc:calc_type_tddft_w'))
615     +    call errquit(pname//'failed to delete calc_type_tddft_w',0,
616     +        RTDB_ERR)
617c
618c       - Transform the Gxc matrices to MO basis and add them to Wij
619c
620        do i = 0, ndens-1
621          if (.not.ga_destroy(int_mb(k_dens+i)))
622     +     call errquit(pname//'failed to destroy densities',0,GA_ERR)
623        enddo
624        if (.not.ma_pop_stack(l_den2))
625     +    call errquit(pname//'failed to deallocate l_den2', 0, MA_ERR)
626        if (.not.ma_pop_stack(l_dens))
627     +    call errquit(pname//'failed to deallocate l_dens', 0, MA_ERR)
628        do ip = 1, ipol
629          do ir = 1, nroots
630            call tddft_grad_trans_ao2mo(1,nao,nfc(ip),naoc(ip),nocc(ip),
631     +           nav(ip),nfv(ip),ir,1.0d0,1.0d0,"ij",g_mo(ip),
632     +           int_mb(k_gxc+(ip-1)*nroots+ir-1),g_w(ip),"ij")
633          enddo
634        enddo
635        do i = 0, ngxc-1
636          if (.not.ga_destroy(int_mb(k_gxc+i)))
637     +     call errquit(pname//'failed to destroy gxc_ao', 0, GA_ERR)
638        enddo
639        if (.not.ma_pop_stack(l_gxc))
640     +     call errquit(pname//'failed to deallocate l_gxc', 0, MA_ERR)
641c
642      endif
643c
644c     Compute the occupied-virtual block
645c
646c     Wias = eps_is Zias
647c          + Sum_j {(X+Y)jasHji+[X+Y]+(X-Y)jasHjis-[X-Y]}
648c
649c     Do the first term, drive the parallelization off the distribution
650c     of Zias
651c
652      do ip = 1, ipol
653c
654        call nga_distribution(g_z(ip),iproc,alo,ahi)
655        doitz = .not.(alo(1).ne.1.or.ahi(1).ne.nroots)
656c
657        if (doitz) then
658         mini = alo(2)
659         maxi = ahi(2)
660         numi = maxi-mini+1
661         mina = alo(3)
662         maxa = ahi(3)
663         numa = maxa-mina+1
664         if (.not.ma_push_get(mt_dbl,nroots*numi*numa,'block v',
665     +                       ihdl_v,iptr_v))
666     +   call errquit(pname//'failed to allocate block v',0, UERR)
667         ld(1) = nroots
668         ld(2) = numi
669         call nga_get(g_z(ip),alo,ahi,dbl_mb(iptr_v),ld)
670         do a = 0, numa-1
671          do i = 0, numi-1
672            imo = nfc(ip)+mini+i
673            do ir = 0, nroots-1
674              dbl_mb(iptr_v+ir+nroots*i+nroots*numi*a)
675     +        = dbl_mb(iptr_v+ir+nroots*i+nroots*numi*a)
676     +        * eps(imo,ip)
677            enddo
678          enddo
679         enddo
680        alo(3) = naoc(ip)+mina
681        ahi(3) = naoc(ip)+maxa
682        call nga_put(g_w(ip),alo,ahi,dbl_mb(iptr_v),ld)
683        if (.not.ma_pop_stack(ihdl_v))
684     +    call errquit(pname//'failed to deallocate block v',0, UERR)
685       endif ! doitz
686      enddo  ! ip
687c
688c     Do the second term
689c
690c     - Do the (X+Y) and (X-Y) contributions
691c
692c     - Create global arrays for X and Y in AO basis
693c
694      idim(1) = nroots*ipol
695      idim(2) = nao
696      idim(3) = nao
697      ichnk(1) = nroots*ipol
698      ichnk(2) = -1
699      ichnk(3) = -1
700      if (.not.nga_create(mt_dbl,3,idim,'vectors Xuv',ichnk,g_x))
701     +    call errquit(pname//'failed to create g_x',0,GA_ERR)
702      if (.not.nga_create(mt_dbl,3,idim,'vectors Yuv',ichnk,g_y))
703     +    call errquit(pname//'failed to create g_y',0,GA_ERR)
704c
705c     - Transform (X+Y) to AO basis in g_x and (X-Y) to AO basis
706c       in g_y
707c
708      call tddft_grad_trans_mo2ao(ipol,nao,nfc,naoc,nocc,nav,nfv,
709     +     nroots,1.0d0,0.0d0,"ib",g_mo,g_xpy,"ib",g_x)
710      if (.not.tda) then
711        call tddft_grad_trans_mo2ao(ipol,nao,nfc,naoc,nocc,nav,nfv,
712     +       nroots,1.0d0,0.0d0,"ib",g_mo,g_xmy,"ib",g_y)
713      else
714        call tddft_grad_trans_mo2ao(ipol,nao,nfc,naoc,nocc,nav,nfv,
715     +       nroots,1.0d0,0.0d0,"ib",g_mo,g_xpy,"ib",g_y)
716      endif
717c
718c     - Compute X and Y from (X+Y) and (X-Y) in place
719c
720      alo(1) = 1
721      alo(2) = 1
722      alo(3) = 1
723      ahi(1) = nroots*ipol
724      ahi(2) = nao
725      ahi(3) = nao
726c Daniel (1-7-13): With the modification above, the lines here behave
727c like you'd expect (i.e. g_x = X and g_y = Y = 0 for CIS, rather than
728c what was here before which made g_x = 0.50*X and g_y = 0.50*X).
729      call nga_add_patch(0.5d0,g_x,alo,ahi,0.5d0,g_y,alo,ahi,g_x,
730     +                   alo,ahi)
731      call nga_add_patch(1.0d0,g_x,alo,ahi,-1.0d0,g_y,alo,ahi,g_y,
732     +                   alo,ahi)
733c
734c     - Allocate various workspace arrays
735c
736      if (.not.nga_create(mt_dbl,3,idim,'vectors (A+B)X',ichnk,g_apbx))
737     +    call errquit(pname//'failed to create g_apbx',0, GA_ERR)
738      if (.not.nga_create(mt_dbl,3,idim,'vectors (A-B)X',ichnk,g_ambx))
739     +    call errquit(pname//'failed to create g_ambx',0, GA_ERR)
740      if (.not.nga_create(mt_dbl,3,idim,'vectors (A+B)Y',ichnk,g_apby))
741     +    call errquit(pname//'failed to create g_apby',0, GA_ERR)
742      if (.not.nga_create(mt_dbl,3,idim,'vectors (A-B)Y',ichnk,g_amby))
743     +    call errquit(pname//'failed to create g_amby',0, GA_ERR)
744c
745c     - Compute (A+B)X, (A-B)X, (A+B)Y and (A-B)Y
746c
747c Daniel (1-7-13): We manipulate the code here because the
748c R vector has the same number of terms for RPA and CIS.  This is a
749c consequence of (X-Y) = X.  It might be a good idea to avoid doing this
750c part for the Y vector since Y = 0.  Note that the coupling matrix
751c expressions H^+[V] and H^-[V] can both be nonzero for CIS, so it isn't
752c okay to skip the anti-symmetric part in the tddft_nga_cont routine.
753c What CIS does is makes the Y vector contribution zero in the following
754c routines.
755c Daniel (2-8-13): Set the local TDA variable so that we don't change
756c the global one.
757      if (lhashf) then
758        if (tda) then
759          tdaloc = .false. ! For CIS and TDDFT/TDA (hybrid) calculations
760        else
761          tdaloc = .false. ! For RPA and TDDFT (hybrid) calculations
762        endif
763      else
764        if (tda) then
765          tdaloc = .true.  ! For TDDFT/TDA (pure) calculations
766        else
767          tdaloc = .false. ! For TDDFT (pure) calculations
768        endif
769      endif
770      call tddft_nga_cont(rtdb,ihdl_geom,ihdl_bfao,g_x,g_apbx,g_ambx,
771     +   nao,ipol,tol2e,tdaloc,oskel,kfac,lhashf,otriplet,nroots,iwhich)
772      call tddft_nga_cont(rtdb,ihdl_geom,ihdl_bfao,g_y,g_apby,g_amby,
773     +   nao,ipol,tol2e,tdaloc,oskel,kfac,lhashf,otriplet,nroots,iwhich)
774c
775c     - Dispose of X and Y
776c
777      if (.not.ga_destroy(g_x))
778     +    call errquit(pname//'failed to destroy g_x',0, GA_ERR)
779      if (.not.ga_destroy(g_y))
780     +    call errquit(pname//'failed to destroy g_y',0, GA_ERR)
781c
782c     - Compute (A+B)(X+Y) and (A-B)(X-Y)
783cdbg            (A+B)(X-Y)     (A-B)(X+Y)
784c
785      call nga_add_patch(1.0d0,g_apbx,alo,ahi,+1.0d0,g_apby,alo,ahi,
786     +                   g_apbx,alo,ahi)
787      call nga_add_patch(1.0d0,g_ambx,alo,ahi,-1.0d0,g_amby,alo,ahi,
788     +                   g_ambx,alo,ahi)
789c
790c     - Dispose of (A+B)Y and (A-B)Y
791c
792      if (.not.ga_destroy(g_apby))
793     +   call errquit(pname//'failed to destroy g_apby',0, GA_ERR)
794      if (.not.ga_destroy(g_amby))
795     +   call errquit(pname//'failed to destroy g_amby',0, GA_ERR)
796c
797c     - Allocate (A+-B)(X+-Y)ij
798c
799      do ip = 1, ipol
800        idim(1) = nroots
801        idim(2) = naoc(ip)
802        idim(3) = naoc(ip)
803        ichnk(1) = nroots
804        ichnk(2) = -1
805        ichnk(3) = -1
806        if (.not.nga_create(mt_dbl,3,idim,'vec hij',ichnk,g_hij(ip)))
807     +    call errquit(pname//'failed to create g_hij',0, GA_ERR)
808      enddo
809c
810c     - Transform (A+B)(X+Y) to MO basis occupied-occupied block only
811cdbg              (A+B)(X-Y)
812c
813      call tddft_grad_trans_ao2mo(ipol,nao,nfc,naoc,nocc,nav,nfv,
814     +     nroots,1.0d0,0.0d0,"ij",g_mo,g_apbx,g_hij,"ij")
815c
816c     - Add sum_j (X+Y)ja [(A+B)(X+Y)ji] to Wia
817cdbg              (X-Y)ja [(A+B)(X-Y)ji]
818c       use symmetry of [(A+B)(X+Y)ji]
819c
820      do ip = 1, ipol
821        do ir=1,nroots
822          alo(1) = ir
823          ahi(1) = ir
824          alo(2) = 1
825          ahi(2) = naoc(ip)
826          alo(3) = 1
827          ahi(3) = naoc(ip)
828          blo(1) = ir
829          bhi(1) = ir
830          blo(2) = 1
831          bhi(2) = naoc(ip)
832          blo(3) = 1
833          bhi(3) = nav(ip)
834          clo(1) = ir
835          chi(1) = ir
836          clo(2) = 1
837          chi(2) = naoc(ip)
838          clo(3) = naoc(ip)+1
839          chi(3) = naoc(ip)+nav(ip)
840          call tddft_patch3mxm('n','n',1.0d0,1.0d0,g_hij(ip),alo,ahi,
841     +         g_xpy(ip),blo,bhi,g_w(ip),clo,chi)
842        enddo
843      enddo
844c
845c     - Transform (A-B)(X-Y) to MO basis occupied-occupied block only
846cdbg              (A-B)(X+Y)
847c
848      call tddft_grad_trans_ao2mo(ipol,nao,nfc,naoc,nocc,nav,nfv,
849     +     nroots,1.0d0,0.0d0,"ij",g_mo,g_ambx,g_hij,"ij")
850c
851c     - Add sum_j (X-Y)ja [(A-B)(X-Y)ji] to Wia
852cdbg              (X+Y)js [(A-B)(X+Y)ji]
853c       use anti-symmetry of [(A-B)(X-Y)ji]
854c
855c Daniel (1-7-13): We can definitely avoid this part of the routine if
856c we don't use exact exchange, since the linear transformation H^-[V]
857c is zero in that case.
858      if (lhashf) then
859        do ip = 1, ipol
860          do ir = 1, nroots
861            alo(1) = ir
862            ahi(1) = ir
863            alo(2) = 1
864            ahi(2) = naoc(ip)
865            alo(3) = 1
866            ahi(3) = naoc(ip)
867            blo(1) = ir
868            bhi(1) = ir
869            blo(2) = 1
870            bhi(2) = naoc(ip)
871            blo(3) = 1
872            bhi(3) = nav(ip)
873            clo(1) = ir
874            chi(1) = ir
875            clo(2) = 1
876            chi(2) = naoc(ip)
877            clo(3) = naoc(ip)+1
878            chi(3) = naoc(ip)+nav(ip)
879c Daniel (1-7-13): Manipulate the code here for CIS to use g_xpy here
880c since (X+Y) = (X-Y) = X.  This is a consequence of not allocating
881c a g_xmy array for CIS.  The linear transformation H^-[X] still exists
882c in CIS.
883            if (.not.tda) then
884              call tddft_patch3mxm('n','n',-1.0d0,1.0d0,
885     +             g_hij(ip),alo,ahi,
886     +             g_xmy(ip),blo,bhi,g_w(ip),clo,chi)
887            else
888              call tddft_patch3mxm('n','n',-1.0d0,1.0d0,
889     +             g_hij(ip),alo,ahi,
890     +             g_xpy(ip),blo,bhi,g_w(ip),clo,chi)
891            endif
892          enddo
893        enddo
894      endif ! lhashf
895c
896c     - Dispose of g_hij
897c
898      do ip = 1, ipol
899        if (.not.ga_destroy(g_hij(ip)))
900     +   call errquit(pname//'failed to destroy g_hij',0, GA_ERR)
901      enddo
902c
903c     - Dispose of (A+B)(X+Y) and (A+B)(X-Y)
904c
905      if (.not.ga_destroy(g_apbx))
906     +   call errquit(pname//'failed to destroy g_apbx',0, GA_ERR)
907      if (.not.ga_destroy(g_ambx))
908     +   call errquit(pname//'failed to destroy g_ambx',0, GA_ERR)
909c
910c
911c     Compute the virtual-virtual block
912c
913c     Wabs = Omega Sum_i  [(X+Y)ias(X-Y)ibs+(X-Y)ias(X+Y)ibs]/2
914c          + Sum_i eps_is [(X+Y)ias(X+Y)ibs+(X-Y)ias(X-Y)ibs]/2
915c
916      do ip = 1, ipol
917          do ir = 1, nroots
918            alo(1) = ir
919            ahi(1) = ir
920            alo(2) = 1
921            ahi(2) = nav(ip)
922            alo(3) = 1
923            ahi(3) = naoc(ip)
924            blo(1) = ir
925            bhi(1) = ir
926            blo(2) = 1
927            bhi(2) = naoc(ip)
928            blo(3) = 1
929            bhi(3) = nav(ip)
930            clo(1) = ir
931            chi(1) = ir
932            clo(2) = naoc(ip)+1
933            chi(2) = naoc(ip)+nav(ip)
934            clo(3) = naoc(ip)+1
935            chi(3) = naoc(ip)+nav(ip)
936c
937            if (.not.tda) then
938            call tddft_patch3mxm('t','n',0.5d0*omega(ir),0.0d0,
939     +                              g_xpy(ip),alo,ahi,
940     +                              g_xmy(ip),blo,bhi,
941     +                              g_w(ip),clo,chi)
942            call tddft_patch3mxm('t','n',0.5d0*omega(ir),1.0d0,
943     +                              g_xmy(ip),alo,ahi,
944     +                              g_xpy(ip),blo,bhi,
945     +                              g_w(ip),clo,chi)
946            else
947c            g_xpy: X with CIS, Y = 0
948c            g_xmy: not created with CIS
949             call tddft_patch3mxm('t','n',omega(ir),0.0d0,
950     +                            g_xpy(ip),alo,ahi,
951     +                            g_xpy(ip),blo,bhi,
952     +                            g_w(ip),clo,chi)
953            endif ! tda
954          enddo ! ir
955c
956c         Do the second term, drive the parallelization off the
957c         distribution of (X+-Y)ib
958c
959c         Do the (X+Y) terms first
960c
961          call nga_distribution(g_xpy(ip),iproc,blo,bhi)
962          doitxpy1 = .not.(blo(1).ne.1.or.bhi(1).ne.nroots)
963c
964          if (doitxpy1) then
965c
966           mini = blo(2)
967           maxi = bhi(2)
968           numi = maxi-mini+1
969           minb = blo(3)
970           maxb = bhi(3)
971           numb = maxb-minb+1
972           if (.not.ma_push_get(mt_dbl,nroots*numb*numi,'block b',
973     +                         ihdl_b,iptr_b))
974     +      call errquit(pname//'failed to allocate blockb',0,UERR)
975           ld(1)  = nroots
976           ld(2)  = numi
977           call nga_get(g_xpy(ip),blo,bhi,dbl_mb(iptr_b),ld)
978           ic = 0
979           do b = 1, numb
980            do i = 1, numi
981              imo = nfc(ip)+mini-1+i
982              do ir = 1, nroots
983                dbl_mb(iptr_b+ic) = 0.5d0*eps(imo,ip)*dbl_mb(iptr_b+ic)
984                ic = ic + 1
985              enddo
986            enddo
987           enddo
988c
989           do icount = 0, nproc-1
990           jproc = mod(iproc+icount,nproc)
991c
992           call nga_distribution(g_xpy(ip),jproc,alo,ahi)
993           doitxpy2 = .not.(alo(1).ne.1.or.ahi(1).ne.nroots)
994c
995           if (doitxpy2.and.doitxpy1) then
996           if (alo(2).eq.mini.and.ahi(2).eq.maxi) then
997           mina = alo(3)
998           maxa = ahi(3)
999           numa = maxa-mina+1
1000           alo(2) = mini
1001           ahi(2) = maxi
1002           if (.not.ma_push_get(mt_dbl,nroots*numa*numi,'block a',
1003     +                           ihdl_a,iptr_a))
1004     +      call errquit(pname//'failed to allocate blockb',0, MA_ERR)
1005            if (.not.ma_push_get(mt_dbl,nroots*numa*numb,'block r',
1006     +                           ihdl_r,iptr_r))
1007     +      call errquit(pname//'failed to allocate blockr',0, MA_ERR)
1008            ld(1)  = nroots
1009            ld(2)  = numi
1010            call nga_get(g_xpy(ip),alo,ahi,dbl_mb(iptr_a),ld)
1011            call dfill(nroots*numa*numb,0.0d0,dbl_mb(iptr_r),1)
1012            do b = 0, numb-1
1013              do a = 0, numa-1
1014                do i = 0, numi-1
1015                  do ir = 0, nroots-1
1016                    dbl_mb(iptr_r+ir+nroots*a+nroots*numa*b)
1017     +              = dbl_mb(iptr_r+ir+nroots*a+nroots*numa*b)
1018     +              + dbl_mb(iptr_a+ir+nroots*i+nroots*numi*a)
1019     +              * dbl_mb(iptr_b+ir+nroots*i+nroots*numi*b)
1020                  enddo
1021                enddo
1022              enddo
1023            enddo
1024c Daniel (12-3-12): We need to account for (X-Y) = X in TDA here.
1025c We must either double the contribution from (X+Y) here or redo the
1026c procedure in this block of code.  I prefer scaling here since it's
1027c less messy.
1028            if (tda) then
1029              call dscal(nroots*numa*numb,2.0d0,dbl_mb(iptr_r),1)
1030            endif
1031            clo(1) = 1
1032            chi(1) = nroots
1033            clo(2) = naoc(ip)+mina
1034            chi(2) = naoc(ip)+maxa
1035            clo(3) = naoc(ip)+minb
1036            chi(3) = naoc(ip)+maxb
1037            ld(1) = nroots
1038            ld(2) = numa
1039            call nga_acc(g_w(ip),clo,chi,dbl_mb(iptr_r),ld,1.0d0)
1040            if (.not.ma_pop_stack(ihdl_r))
1041     +       call errquit(pname//'failed to deallocate blockr',0,MA_ERR)
1042            if (.not.ma_pop_stack(ihdl_a))
1043     +       call errquit(pname//'failed to deallocate blocka',0,MA_ERR)
1044            endif
1045           endif ! doitxpy1 and doitxpy2
1046          enddo ! icount
1047c
1048           if (.not.ma_pop_stack(ihdl_b))
1049     +      call errquit(pname//'failed to deallocate blockb',0,MA_ERR)
1050          endif  ! doitxpy1
1051c
1052          call ga_sync()
1053c
1054          if (.not.tda) then
1055c
1056c         Do the (X-Y) terms next
1057c
1058          call nga_distribution(g_xmy(ip),iproc,blo,bhi)
1059          doitxmy1 = .not.(blo(1).ne.1.or.bhi(1).ne.nroots)
1060c
1061          if (doitxmy1) then
1062c
1063           mini = blo(2)
1064           maxi = bhi(2)
1065           numi = maxi-mini+1
1066           minb = blo(3)
1067           maxb = bhi(3)
1068           numb = maxb-minb+1
1069           if (.not.ma_push_get(mt_dbl,nroots*numb*numi,'block b',
1070     +                         ihdl_b,iptr_b))
1071     +      call errquit(pname//'failed to allocate blockb',0,UERR)
1072           ld(1)  = nroots
1073           ld(2)  = numi
1074           call nga_get(g_xmy(ip),blo,bhi,dbl_mb(iptr_b),ld)
1075           ic = 0
1076           do b = 1, numb
1077            do i = 1, numi
1078              imo = nfc(ip)+mini-1+i
1079              do ir = 1, nroots
1080                dbl_mb(iptr_b+ic) = 0.5d0*eps(imo,ip)*dbl_mb(iptr_b+ic)
1081                ic = ic + 1
1082              enddo
1083            enddo
1084           enddo
1085c
1086           do icount = 0, nproc-1
1087            jproc = mod(iproc+icount,nproc)
1088c
1089            call nga_distribution(g_xmy(ip),jproc,alo,ahi)
1090            doitxmy2 = .not.(alo(1).ne.1.or.ahi(1).ne.nroots)
1091c
1092            if (doitxmy2) then
1093            if (alo(2).eq.mini.and.ahi(2).eq.maxi) then
1094            mina = alo(3)
1095            maxa = ahi(3)
1096            numa = maxa-mina+1
1097            alo(2) = mini
1098            ahi(2) = maxi
1099            if (.not.ma_push_get(mt_dbl,nroots*numa*numi,'block a',
1100     +                           ihdl_a,iptr_a))
1101     +      call errquit(pname//'failed to allocate blockb',0, UERR)
1102            if (.not.ma_push_get(mt_dbl,nroots*numa*numb,'block r',
1103     +                          ihdl_r,iptr_r))
1104     +      call errquit(pname//'failed to allocate blockr',0, UERR)
1105            ld(1)  = nroots
1106            ld(2)  = numi
1107            call nga_get(g_xmy(ip),alo,ahi,dbl_mb(iptr_a),ld)
1108            call dfill(nroots*numa*numb,0.0d0,dbl_mb(iptr_r),1)
1109            do b = 0, numb-1
1110              do a = 0, numa-1
1111                do i = 0, numi-1
1112                  do ir = 0, nroots-1
1113                    dbl_mb(iptr_r+ir+nroots*a+nroots*numa*b)
1114     +              = dbl_mb(iptr_r+ir+nroots*a+nroots*numa*b)
1115     +              + dbl_mb(iptr_a+ir+nroots*i+nroots*numi*a)
1116     +              * dbl_mb(iptr_b+ir+nroots*i+nroots*numi*b)
1117                  enddo
1118                enddo
1119              enddo
1120            enddo
1121            clo(1) = 1
1122            chi(1) = nroots
1123            clo(2) = naoc(ip)+mina
1124            chi(2) = naoc(ip)+maxa
1125            clo(3) = naoc(ip)+minb
1126            chi(3) = naoc(ip)+maxb
1127            ld(1) = nroots
1128            ld(2) = numa
1129            call nga_acc(g_w(ip),clo,chi,dbl_mb(iptr_r),ld,1.0d0)
1130            if (.not.ma_pop_stack(ihdl_r))
1131     +      call errquit(pname//'failed to deallocate blockr',0,MA_ERR)
1132            if (.not.ma_pop_stack(ihdl_a))
1133     +      call errquit(pname//'failed to deallocate blocka',0,MA_ERR)
1134            endif
1135           endif ! doitxmy1 and doitxmy2
1136          enddo ! icount
1137c
1138            if (.not.ma_pop_stack(ihdl_b))
1139     +       call errquit(pname//'failed to deallocate blockb',0,MA_ERR)
1140          endif !doitxmy1
1141
1142          call ga_sync()
1143
1144          endif  ! tda
1145c
1146      enddo
1147c
1148c     Copy Wia to Wai
1149c
1150      do ip = 1, ipol
1151        alo(1) = 1
1152        ahi(1) = nroots
1153        alo(2) = 1
1154        ahi(2) = naoc(ip)
1155        alo(3) = naoc(ip)+1
1156        ahi(3) = naoc(ip)+nav(ip)
1157        call nga_scale_patch(g_w(ip),alo,ahi,0.5d0)
1158        idim(1) = naoc(ip)
1159        idim(2) = nav(ip)
1160        ichnk(1) = -1
1161        ichnk(2) = -1
1162        if (.not.nga_create(mt_dbl,2,idim,"transpose",ichnk,g_t))
1163     +     call errquit(pname//'failed to create g_t',0,GA_ERR)
1164        do ir = 1, nroots
1165          alo(1) = ir
1166          ahi(1) = ir
1167          alo(2) = 1
1168          ahi(2) = naoc(ip)
1169          alo(3) = naoc(ip)+1
1170          ahi(3) = naoc(ip)+nav(ip)
1171          blo(1) = 1
1172          bhi(1) = naoc(ip)
1173          blo(2) = 1
1174          bhi(2) = nav(ip)
1175          call nga_copy_patch('n',g_w(ip),alo,ahi,g_t,blo,bhi)
1176          alo(2) = naoc(ip)+1
1177          ahi(2) = naoc(ip)+nav(ip)
1178          alo(3) = 1
1179          ahi(3) = naoc(ip)
1180          bhi(2) = nav(ip)
1181          bhi(1) = naoc(ip)
1182          call nga_copy_patch('t',g_t,blo,bhi,g_w(ip),alo,ahi)
1183        enddo
1184        if (.not.ga_destroy(g_t))
1185     +     call errquit(pname//'failed to destroy g_t',0,GA_ERR)
1186      enddo
1187      call ga_sync()
1188c
1189      if (tddft_grad_util_print('tddft grad w',print_debug)) then
1190        oroot = ga_nodeid().eq.0
1191        if (oroot) write(LuOut,*)'DEBUG: '//pname//'W'
1192        call tddft_grad_print_array(ipol,nroots,g_w,dble(ipol))
1193      endif
1194c
1195      end
1196c $Id$
1197