1C:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
2C NAME
3C     CCDen_2PDMoooo -- Form coupled cluster 2-particle density matrix
4C
5C REVISION
6C     $Id$
7C
8C SYNOPSIS
9      Subroutine CCDen_2PDMoooo(NOcc, NVir, T1, LDT1, g_T2,
10     $     g_Z2, g_Q, g_G)
11      Implicit NONE
12#include "errquit.fh"
13      Integer NOcc, NVir, LDT1
14      Double Precision T1(LDT1, NVir)
15      Integer g_T2, g_Z2, g_Q, g_G
16C
17C ARGUMENTS
18C DESCRIPTION
19C     Compute the oooo part of the coupled cluster 2-PDM
20C
21C REFERENCES
22C     Alistair P. Rendell and Timothy J. Lee, J. Chem. Phys. 94,
23C     6219--6228 (1991). Especially Eq. 46
24C
25C INCLUDE FILES
26#include "global.fh"
27#include "mafdecls.fh"
28#include "stdio.fh"
29C
30C EXTERNAL ROUTINES
31C:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
32C LOCAL VARIABLES
33      Integer ILo, IHi, JLo, JHi, I, J, K
34      Integer h_Qi, i_Qi
35      Double precision Qij
36      Integer g_Tau
37C
38C     Clear the result
39C
40      Call GA_Zero(g_G)
41C
42C     Find out what portion of G is "local"
43C
44      Call CCSD_T2_MyIJ(GA_NodeID(), NOcc, g_G, ILo, IHi, JLo, JHi)
45C
46C     *********************************************************
47C     * G(i,j,k,l) = 1/8 P(i,j,k,l) {1/2 [TauZ](i,k,j,l)      *
48C     *              + 2 delta(k,l) ( Q(i,j) - delta(i,j) )   *
49C     *              -   delta(j,l) ( Q(i,k) - delta(i,k) ) } *
50C     * APR & TJL Eq. 46                                      *
51C     *********************************************************
52C
53C     ***********************
54C     * 1/2 [TauZ](i,j,k,l) *
55C     ***********************
56C
57      If ( .NOT. GA_Create(MT_Dbl, NVir*NOcc, NVir*NOcc,
58     $   'CCDen Tau', NVir, NVir, G_Tau) ) Call ErrQuit(
59     $   'CCDen_2PDMoooo: unable to allocate Tau',
60     $   NVir*NOcc*NVir*NOcc, GA_ERR)
61C
62      Call CCSD_MkTau_GA(NOcc, NVir, T1, LDT1, g_T2, g_Tau)
63      Call CCDen_tZoooo(NOcc, NVir, g_Tau, g_Z2, g_G)
64      Call GA_Scale(g_G, 0.50d0)
65C
66      If (.NOT. GA_Destroy(g_Tau) ) Call ErrQuit(
67     $   'CCDen_2PDMoooo: unable to free Tau', 0, GA_ERR)
68C
69C     Let's do the 1-PDM terms.  Note that the data distribution is
70C     such that each node has complete kl blocks for a particular subset
71C     of i and j.
72C
73C     Note too that the data distribution does not guarantee strict
74C     locality -- the edge portion of our patch (determined by
75C     CCSD_T2_MyIJ) may in fact be remote.  Thus it is possible this
76C     implementation is a bit too naive for good performance.
77C     With a little work, it can be made to do fewer comms by using
78C     a ga_scatter or other mechanism.
79C
80C     ****************************************
81C     * 2 delta(k,l) ( Q(i,j) - delta(i,j) ) *
82C     ****************************************
83C     Place the Q(i,j) element all along the diagonal of the block (k=l).
84C
85      Do I = ILo, IHi
86         Do J = JLo, JHi
87C
88C           Get the appropriate element of Q for this ij block
89C
90            Call GA_Get(g_Q, I, I, J, J, Qij, 1)
91            If ( I .eq. J ) Qij = Qij - 1.0d0
92C
93C           Now plop it into the diagonal elements of this block
94C
95            Do K = 1, NOcc
96               Call GA_Acc(g_G, (I-1)*NOcc+K, (I-1)*NOcc+K,
97     $              (J-1)*NOcc+K, (J-1)*NOcc+K, Qij, 1, 2.0d0)
98            EndDo
99         EndDo
100      EndDo
101C
102C     **************************************
103C     * delta(j,l) ( Q(i,k) - delta(i,k) ) *
104C     **************************************
105C     Put the Q(i,*) row into one one column (j=l) of the block.
106C
107      If (.NOT. MA_Push_Get(MT_Dbl, NOcc, 'CCDen Scr',
108     $     H_Qi, I_Qi) ) Call ErrQuit(
109     $     'CCDen_2PDMoooo: unable to allocate local Qi column', NOcc,
110     &       MA_ERR)
111C
112      Do I = ILo, IHi
113C
114C        This column of Q goes into one column in each J
115C
116         Call GA_Get(g_Q, I, I, 1, NOcc, Dbl_MB(I_Qi), 1)
117C
118C        Handle the delta(i,k) term
119C
120         Dbl_MB(I_Qi + I - 1) = Dbl_MB(I_Qi + I - 1) - 1.0d0
121C
122         Do J = JLo, JHi
123            Call GA_Acc(g_G, (I-1)*NOcc+1, I*NOcc,
124     $           (J-1)*NOcc+J, (J-1)*NOcc+J, Dbl_MB(i_Qi), 1,
125     $           -1.0d0)
126         EndDo
127      EndDo
128C
129      If ( .NOT. MA_Pop_Stack(H_Qi) ) Call ErrQuit(
130     $     'CCDen_2PDMoooo: unable to free Qi', NOcc, MA_ERR)
131C
132C
133C
134      Return
135      End
136C:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
137C NAME
138C     CCDen_tZoooo -- Form [tZ] intermediate for 2PDM
139C
140C REVISION
141C     $Id$
142C
143C SYNOPSIS
144      Subroutine CCDen_tZoooo(NOcc, NVir, g_t, g_Z, g_tZ)
145      Implicit NONE
146#include "errquit.fh"
147      Integer NOcc, NVir
148      Integer g_t, g_Z, g_tZ
149C
150C ARGUMENTS
151C DESCRIPTION
152C     Computes the [tZ](i,j,k,l) intermediate
153C
154C     In paper, the term is defined as (Eq. 58)
155C     [tZ](i,j,k,l) = sum(a>=b) t(i,j,a,b)(+) Z(k,l,a,b)(+)
156C                   + sum(a>b)  t(i,j,a,b)(-) Z(k,l,a,b)(-)
157C     with (Eq. 66--68)
158C     Z(k,l,a,b)(+/-) = 1/2 [ Z(k,l,a,b) +/- Z(l,k,a,b) ]
159C     t(i,j,a,b)(+/-) = 1/2 [ t(i,j,a,b) +/- t(j,i,a,b) ]
160C     t(i,j,a,a)(+)   = 1/2 [ t(i,j,a,a) ]
161C
162C     This can be simplified to
163C     [tZ](i,j,k,l) = 1/4 sum(a) [ t(i,j,a,a) {Z(k,l,a,a) + Z(l,k,a,a)} ]
164C                   + 1/2 sum(a>b) [ t(i,j,a,b) Z(k,l,a,b)
165C                                  + t(j,i,a,b) Z(l,k,a,b) ]
166C
167C     Note that [tZ](i,j,k,l) = [tZ](j,i,l,k)
168C
169C REFERENCES
170C     Alistair P. Rendell and Timothy J. Lee, J. Chem. Phys. 94,
171C     6219--6228 (1991). Especially Eq. 58, 66-68.
172C
173C INCLUDE FILES
174#include "global.fh"
175#include "mafdecls.fh"
176#include "stdio.fh"
177#include "util.fh"
178C
179C EXTERNAL ROUTINES
180C:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
181C LOCAL VARIABLES
182      Integer I, J, K, L, ILo, IHi, JLo, JHi, KLo, KHi, LLo, LHi, A, B
183      Integer Sorta_Node, Node
184      Integer H_Scr1, I_Scr1, H_Scr2, I_Scr2
185      Double precision tZ
186      Logical oprint
187C
188C Get print information
189C
190      oprint = util_print('information',print_low)
191C
192C     Find out what portion of t is "local"
193C
194      Call CCSD_T2_MyIJ(GA_NodeID(), NVir, g_t, ILo, IHi, JLo, JHi)
195C
196      If (oprint)  Write (LuOut, *) 'CCDen_tZoooo ', GA_NodeID(),
197     $     ': T2 region (', ILo, ':', IHi, ', ', JLo, ':', JHi, ')'
198C
199C     Allocate scratch space
200C
201      If (.NOT. MA_Push_Get(MT_Dbl, NVir*NVir,
202     $   'CCDen Scratch 1', H_Scr1, I_Scr1)) Call ErrQuit(
203     $   'CCDen_tZoooo: unable to allocate scratch', NVir*NVir, MA_ERR)
204      If (.NOT. MA_Push_Get(MT_Dbl, NVir*NVir,
205     $   'CCDen Scratch 1', H_Scr2, I_Scr2)) Call ErrQuit(
206     $   'CCDen_tZoooo: unable to allocate scratch', NVir*NVir, MA_ERR)
207C
208C     Basically, we get blocks of t and Z and dot-product them to
209C     produce an element [tZ].  For the local chunk of t, contract
210C     with all blocks of Z.  The comms can probably be handled smarter.
211C     Should check if this is a problem or not.
212C
213C
214C     Loop over processors.  Start with ours and work around the
215C     list to avoid everyone pounding on the same processor at once.
216C
217      Do Sorta_Node = GA_NodeID(), GA_NNodes() + GA_NodeID() - 1
218         Node = Mod(Sorta_Node, GA_NNodes() )
219C
220C        Figure out what Z patches are on the node and loop over them
221C
222         Call CCSD_T2_MyIJ(Node, NVir, g_Z, KLo, KHi, LLo, LHi)
223         If (oprint) Write (LuOut, *) 'CCDen_tZoooo ', GA_NodeID(),
224     $        ': Z2 region (', KLo, ':', KHi, ', ', LLo, ':', LHi, ')'
225C
226         Do K = KLo, KHi
227            Do L = LLo, LHi
228               Call GA_Get(g_Z, (K-1)*NVir+1, K*NVir, (L-1)*NVir+1,
229     $         L*NVir, Dbl_MB(I_Scr1), NVir)
230C
231C              Contract this Z block with each local t block
232C
233               Do I = ILo, IHi
234                  Do J = JLo, JHi
235                     Call GA_Get(g_t, (I-1)*NVir+1, I*NVir,
236     $                    (J-1)*NVir+1, J*NVir, Dbl_MB(I_Scr2), NVir)
237C
238                     tZ = 0.0d0
239                     Do A = 1, NVir
240                        Do B = 1, A-1
241                           tZ = tZ + Dbl_MB(i_Scr1 + (B-1)*NVir + A - 1)
242     $                             * Dbl_MB(i_Scr2 + (B-1)*NVir + A - 1)
243                        EndDo
244C
245C                       Now handle diagonal.  Note factor of 1/2 here.
246C                       Additional 1/2 applied to final result below.
247C
248                        tZ = tZ + 0.5d0
249     $                       * Dbl_MB(i_Scr1 + (A-1)*NVir + A - 1)
250     $                       * Dbl_MB(i_Scr2 + (A-1)*NVir + A - 1)
251                     EndDo
252C
253C                    Don't forget to apply factor of 1/2
254C
255                     tZ = 0.5d0 * tZ
256C
257C                    Put this in [tZ](i,j,k,l) and [tZ](j,i,l,k)
258C
259                     Call GA_Acc(g_tZ, (I-1)*NOcc+K, (I-1)*NOcc+K,
260     $                    (J-1)*NOcc+L, (J-1)*NOcc+L, tZ, 1, 1.0d0)
261                     Call GA_Acc(g_tZ, (J-1)*NOcc+L, (J-1)*NOcc+L,
262     $                    (I-1)*NOcc+K, (I-1)*NOcc+K, tZ, 1, 1.0d0)
263
264                  EndDo
265               EndDo
266C
267            EndDo
268         EndDo
269C
270      EndDo ! Nodes
271C
272C     Clean up scratch
273C
274      If (.NOT. MA_Pop_Stack(H_Scr2) ) Call ErrQuit(
275     $   'CCDen_tZoooo: unable to free scratch', 0, MA_ERR)
276      If (.NOT. MA_Pop_Stack(H_Scr1) ) Call ErrQuit(
277     $   'CCDen_tZoooo: unable to free scratch', 0, MA_ERR)
278C
279C
280C
281      Return
282      End
283C:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
284C NAME
285C     CCDen_tZvvvv -- Form [tZ] intermediate for 2PDM
286C
287C REVISION
288C     $Id$
289C
290C SYNOPSIS
291      Subroutine CCDen_tZvvvv(NOcc, NVir, g_t, g_Z, g_tZ)
292      Implicit NONE
293#include "errquit.fh"
294      Integer NOcc, NVir
295      Integer g_t, g_Z, g_tZ
296C
297C ARGUMENTS
298C DESCRIPTION
299C     Computes the [tZ](a,b,c,d) intermediate
300C
301C     In paper, the term is defined as (Eq. 59)
302C     [tZ](a,b,c,d) = sum(i>=j) t(i,j,a,b)(+) Z(i,j,c,d)(+)
303C                   + sum(i>j)  t(i,j,a,b)(-) Z(i,j,c,d)(-)
304C     with (Eq. 69--71)
305C     Z(i,j,c,d)(+/-) = 1/2 [ Z(i,j,c,d) +/- Z(i,j,d,c) ]
306C     t(i,j,a,b)(+/-) = 1/2 [ t(i,j,a,b) +/- t(j,i,a,b) ]
307C     t(i,i,a,b)(+)   = 1/2 [ t(i,i,a,b) ]
308C
309C     This can be simplified to
310C     [tZ](a,b,c,d) = 1/4 sum(i) [ t(i,i,a,b) {Z(i,j,c,d) + Z(i,j,d,c)} ]
311C                   + 1/2 sum(i>j) [ t(i,j,a,b) Z(i,j,c,d)
312C                                  + t(j,i,a,b) Z(i,j,d,c) ]
313C
314C     Note that [tZ](a,b,c,d) = [tZ](b,a,d,c)
315C
316C REFERENCES
317C     Alistair P. Rendell and Timothy J. Lee, J. Chem. Phys. 94,
318C     6219--6228 (1991). Especially Eq. 59, 69--71
319C
320C INCLUDE FILES
321#include "global.fh"
322#include "mafdecls.fh"
323#include "stdio.fh"
324#include "util.fh"
325C
326C EXTERNAL ROUTINES
327C:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
328C LOCAL VARIABLES
329      Integer I, J, K, L, ILo, IHi, JLo, JHi, KLo, KHi, LLo, LHi, A, B
330      Integer Sorta_Node, Node
331      Integer H_Scr1, I_Scr1, H_Scr2, I_Scr2
332      Double precision tZ
333      Logical oprint
334C
335C     Get print information
336C
337      oprint = util_print('information',print_low)
338C
339C     Find out what portion of t is "local"
340C
341      Call CCSD_T2_MyIJ(GA_NodeID(), NVir, g_t, ILo, IHi, JLo, JHi)
342C
343      If (oprint) Write (LuOut, *) 'CCDen_tZvvvv ', GA_NodeID(),
344     $     ': T2 region (', ILo, ':', IHi, ', ', JLo, ':', JHi, ')'
345C
346C     Allocate scratch space
347C
348      If (.NOT. MA_Push_Get(MT_Dbl, NVir*NVir,
349     $   'CCDen Scratch 1', H_Scr1, I_Scr1)) Call ErrQuit(
350     $   'CCDen_tZvvvv: unable to allocate scratch', NVir*NVir,
351     &       MA_ERR)
352      If (.NOT. MA_Push_Get(MT_Dbl, NVir*NVir,
353     $   'CCDen Scratch 1', H_Scr2, I_Scr2)) Call ErrQuit(
354     $   'CCDen_tZvvvv: unable to allocate scratch', NVir*NVir,
355     &       MA_ERR)
356C
357C     Basically, we get blocks of t and Z and dot-product them to
358C     produce an element [tZ].  For the local chunk of t, contract
359C     with all blocks of Z.  The comms can probably be handled smarter.
360C     Should check if this is a problem or not.
361C
362C
363C     Loop over processors.  Start with ours and work around the
364C     list to avoid everyone pounding on the same processor at once.
365C
366      Do Sorta_Node = GA_NodeID(), GA_NNodes() + GA_NodeID() - 1
367         Node = Mod(Sorta_Node, GA_NNodes() )
368C
369C        Figure out what Z patches are on the node and loop over them
370C
371         Call CCSD_T2_MyIJ(Node, NVir, g_Z, KLo, KHi, LLo, LHi)
372         If (oprint) Write (LuOut, *) 'CCDen_tZvvvv ', GA_NodeID(),
373     $        ': Z2 region (', KLo, ':', KHi, ', ', LLo, ':', LHi, ')'
374C
375         Do K = KLo, KHi
376            Do L = LLo, LHi
377               Call GA_Get(g_Z, (K-1)*NVir+1, K*NVir, (L-1)*NVir+1,
378     $         L*NVir, Dbl_MB(I_Scr1), NVir)
379C
380C              Contract this Z block with each local t block
381C
382               Do I = ILo, IHi
383                  Do J = JLo, JHi
384                     Call GA_Get(g_t, (I-1)*NVir+1, I*NVir,
385     $                    (J-1)*NVir+1, J*NVir, Dbl_MB(I_Scr2), NVir)
386C
387                     tZ = 0.0d0
388                     Do A = 1, NVir
389                        Do B = 1, A-1
390                           tZ = tZ + Dbl_MB(i_Scr1 + (B-1)*NVir + A - 1)
391     $                             * Dbl_MB(i_Scr2 + (B-1)*NVir + A - 1)
392                        EndDo
393C
394C                       Now handle diagonal.  Note factor of 1/2 here.
395C                       Additional 1/2 applied to final result below.
396C
397                        tZ = tZ + 0.5d0
398     $                       * Dbl_MB(i_Scr1 + (A-1)*NVir + A - 1)
399     $                       * Dbl_MB(i_Scr2 + (A-1)*NVir + A - 1)
400                     EndDo
401C
402C                    Don't forget to apply factor of 1/2
403C
404                     tZ = 0.5d0 * tZ
405C
406C                    Put this in [tZ](i,j,k,l) and [tZ](j,i,l,k)
407C
408                     Call GA_Acc(g_tZ, (I-1)*NOcc+K, (I-1)*NOcc+K,
409     $                    (J-1)*NOcc+L, (J-1)*NOcc+L, tZ, 1, 1.0d0)
410                     Call GA_Acc(g_tZ, (J-1)*NOcc+L, (J-1)*NOcc+L,
411     $                    (I-1)*NOcc+K, (I-1)*NOcc+K, tZ, 1, 1.0d0)
412
413                  EndDo
414               EndDo
415C
416            EndDo
417         EndDo
418C
419      EndDo ! Nodes
420C
421C     Clean up scratch
422C
423      If (.NOT. MA_Pop_Stack(H_Scr2) ) Call ErrQuit(
424     $   'CCDen_tZvvvv: unable to free scratch', 0, MA_ERR)
425      If (.NOT. MA_Pop_Stack(H_Scr1) ) Call ErrQuit(
426     $   'CCDen_tZvvvv: unable to free scratch', 0, MA_ERR)
427C
428C
429C
430      Return
431      End
432