1C  /* Deck so_eres */
2      SUBROUTINE SO_ERES(MODEL,  NOLDTR, NNEWTR,  DENSIJ,  LDENSIJ,
3     &                   DENSAB, LDENSAB, T2MP,    LT2MP,   FOCKD,
4     &                   LFOCKD, DENSAI,  LDENSAI, NIT,     ISYMTR,
5     &                   IDTYPE,
6#ifdef VAR_MPI
7     &                   AssignedIndices, maxnumjobs,
8#endif
9     &                   WORK,   LWORK)
10C
11C     This routine is part of the atomic integral direct SOPPA program.
12C
13C     Keld Bak, October 1995
14C     Stephan P. A. Sauer: 10.11.2003: merge with Dalton 2.0
15C     Frederik Beyer & Stephan P. A. Sauer: 27.08.2013: call to ERI corrected
16C
17C     PURPOSE: Driver routine for making a linear transformation of
18C              a trialvector with the SOPPA hessian matricx E[2].
19C              The trial vector consists of four parts TR1E, TR1D,
20C              TR2E, and TR2D. E refers to excitations and D to
21C              de-excitations. 1 refer to the one-particle part and
22C              2 to the two-particle part. The linear transformed
23C              trialvector is refered to as the resultvector and is
24C              kept in four corresponding arrays. For the linear
25C              transformation with E[2] the result vector is in RES1E,
26C              RES1D, RES2E, and RES2D.
27C              The linear transformation is driven over atomic orbitals,
28C              and E[2] is not constructed explicitly.
29C
30      use so_info, only: so_singles_first, so_singles_second,
31     &                   so_has_doubles, so_needs_densai,
32     &                   so_double_correction,
33     &                   sop_mp2ai_done
34
35#ifdef VAR_MPI
36      use so_parutils, only: soppa_comm_active, soppa_num_active,
37     &                       soppa_nint, my_mpi_integer, sop_master
38#ifdef USE_MPI_MOD_F90
39      use mpi
40#endif
41#endif
42
43#include "implicit.h"
44
45#ifdef VAR_MPI
46#ifndef USE_MPI_MOD_F90
47#include "mpif.h"
48#endif
49#endif
50
51#include "priunit.h"
52#include "maxorb.h"
53#include "maxash.h"
54#include "mxcent.h"
55#include "aovec.h"
56#include "iratdef.h"
57C
58      PARAMETER (ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0, TWO = 2.0D0)
59      DIMENSION INDEXA(MXCORB)
60      DIMENSION DENSIJ(LDENSIJ), DENSAB(LDENSAB), T2MP(LT2MP)
61      DIMENSION FOCKD(LFOCKD)
62      DIMENSION DENSAI(LDENSAI) !intent(inout)
63      DIMENSION WORK(LWORK)
64      CHARACTER MODEL*5
65      integer :: inewtr, nnewtr, isymd1, ntosym
66      integer :: thisinewtr
67C     idtype = 0 for dynamic (handle D part explicitly),
68C     idtype = 1 for real, static (Tr1D = - Tr1E)
69C     idtype = 2 for imaginary, static ( Tr1D = Tr1E )
70      integer, intent(in) :: idtype
71
72#ifdef VAR_MPI
73      integer :: maxnumjobs, nloopidx
74      integer :: AssignedIndices(maxnumjobs)
75      integer(kind=MPI_INTEGER_KIND) :: ierr_mpi, count_mpi
76#endif
77C
78#include "ccorb.h"
79#include "infind.h"
80#include "blocks.h"
81#include "ccsdinp.h"
82#include "ccsdsym.h"
83#include "ccsdio.h"
84#include "distcl.h"
85#include "cbieri.h"
86#include "eritap.h"
87#include "soppinf.h"
88
89
90      logical :: singles_first, singles_second, doubles, calc_densai
91C
92C     Logical variable which can be set to false if the dexcitation vector is
93C     to avoid calculating with only zeroes. Useful for static
94C     properties and first iteration of excitation energies.
95      logical :: do_dex
96      logical :: tr2_zero
97#ifdef VAR_MPI
98#include "iprtyp.h"
99#include "infpar.h"
100C     integer, save :: numprocs
101      integer :: numprocs
102      logical :: loadbal_dyn
103      double precision ::  timeini, timefin
104      integer(mpi_integer_kind) :: LTRTOT_mpi, lrestot, latot
105      integer(mpi_integer_kind) :: my_MPI_REAL8 = MPI_REAL8
106      numprocs = nodtot + 1
107C
108C  When to do dynamic load_balancing..?
109C  The sooner the better, but is the first iteration close to the
110C  average?
111      loadbal_dyn = .false.
112      if (nit .eq. 1) loadbal_dyn = .true.
113#endif
114C
115      do_dex = idtype .eq. 0
116
117      calc_densai = SO_NEEDS_DENSAI(MODEL) .AND. .NOT. SOP_MP2AI_DONE
118C
119C------------------
120C     Add to trace.
121C------------------
122C
123#ifdef VAR_MPI
124      if (mynum .eq. 0 ) then
125#endif
126      CALL QENTER('SO_ERES')
127#ifdef VAR_MPI
128C     Slaves need to zero DENSAI if it is (re)calculated.
129      elseif ( calc_densai ) THEN
130         CALL DZERO(DENSAI,LDENSAI)
131      endif
132#endif
133C
134C-------------------------------------------------------------
135C     Determine which terms to incude
136C-------------------------------------------------------------
137C
138      singles_first = so_singles_first(model)
139      singles_second = so_singles_second(model)
140      doubles = so_has_doubles(model)
141      tr2_zero = so_double_correction(model)
142C
143      LT2MPH = LT2MP
144      IT2MPH = 0
145C
146C------------------------------------------------------
147C     Write singlet and triplet T2 amplitudes to output
148C------------------------------------------------------
149C
150      IF ( IPRSOP. GE. 10 ) THEN
151C
152         CALL AROUND('singlet T2AM in HR_ERES')
153         CALL OUTPUT(T2MP,1,LT2MPH,1,1,LT2MPH,1,1,LUPRI)
154         IF (TRIPLET) THEN
155            CALL AROUND('triplet T2AM in HR_ERES')
156            CALL OUTPUT(T2MP(LT2MPH+1),1,LT2MPH,1,1,LT2MPH,1,1,LUPRI)
157         END IF
158C
159      END IF
160C
161C
162C------------------------------------------------------------------
163C     Determine the symmetry of the result vector from the symmetry
164C     of the trial vector ISYMTR, and the opperator symmtry ISYMOP.
165C------------------------------------------------------------------
166C
167      ISYRES  = MULD2H(ISYMOP,ISYMTR)
168C
169C---------------------------------
170C     Work space allocation no. 1.
171C---------------------------------
172C
173      LCMO   = NLAMDT
174C
175      KCMO    = 1
176      KEND1   = KCMO  + LCMO
177      LWORK1  = LWORK - KEND1
178C
179      CALL SO_MEMMAX ('SO_ERES.1',LWORK1)
180      IF (LWORK1 .LT. 0) CALL STOPIT('SO_ERES.1',' ',KEND1,LWORK)
181C
182C-------------------------------------------------------
183C     Get the matrix which contains the MO coefficients.
184C-------------------------------------------------------
185C
186#ifdef VAR_MPI
187C Only master reads...
188      IF (MYNUM .EQ. 0) THEN
189#endif
190         DTIME      = SECOND()
191         CALL SO_GETMO(WORK(KCMO),LCMO,WORK(KEND1),LWORK1)
192         DTIME      = SECOND()   - DTIME
193         SOTIME(1)  = SOTIME(1) + DTIME
194#ifdef VAR_MPI
195      ENDIF
196C Should probably use non-blocking collectives, where implemented
197!      IF (MPI_VERSION.GE.3) THEN
198!         MPI_IBCAST(WORK(KCMO),LCMO, my_MPI_INTEGER, 0,
199!     &              MPI_COMM_WORLD, ierr_mpi)
200!      ELSE
201      count_mpi = LCMO
202      CALL MPI_BCAST(WORK(KCMO), count_mpi, my_MPI_REAL8, SOP_MASTER,
203     &               SOPPA_COMM_ACTIVE, ierr_mpi)
204
205!      ENDIF
206#endif
207C
208C---------------------------------
209C     Work space allocation no. 2.
210C---------------------------------
211C
212      LTR1E   = NT1AM(ISYMTR)
213      LBTR1E  = NT1AO(ISYMTR)
214      LTR1D   = NT1AM(ISYMTR)
215      LRES1E  = NT1AM(ISYMTR)
216      IF (DO_DEX) THEN
217         LRES1D = NT1AM(ISYMTR)
218      ELSE
219         LRES1D = 0
220      END IF
221      LFOCK   = N2BST(ISYRES)
222      LDENS   = N2BST(ISYMTR)
223      LBTR1D  = NT1AO(ISYMTR)
224      LBTJ1E  = NMATAV(ISYMTR)
225      LBTJ1D  = NMATAV(ISYMTR)
226
227      IF(DOUBLES) THEN
228         IF (TRIPLET) THEN
229            LTR2E = NT2SQ(ISYMTR)
230         ELSE
231            LTR2E = N2P2HOP(ISYMTR)
232         ENDIF
233         LRES2E  = N2P2HOP(ISYMTR)
234         IF (DO_DEX) THEN
235            LTR2D   = LTR2E
236            LRES2D  = N2P2HOP(ISYMTR)
237         ELSE
238            LTR2D  = 0
239            LRES2D = 0
240         ENDIF
241      ELSE
242         LTR2E   = 0
243         LTR2D   = 0
244         LRES2E  = 0
245         LRES2D  = 0
246      ENDIF
247
248      IF (singles_second) THEN
249         LAIJ    = NRHFT*NRHFT
250         LAAB    = NVIRT*NVIRT
251      ELSE
252         LAIJ    = 0
253         LAAB    = 0
254      ENDIF
255C
256      KTR1E   = KEND1
257      KTR1D   = KTR1E   + LTR1E
258      KTR2E   = KTR1D   + LTR1D
259      KTR2D   = KTR2E   + LTR2E
260
261      KRES1E  = KTR2D   + LTR2D
262      KRES1D  = KRES1E  + LRES1E
263      KRES2E  = KRES1D  + LRES1D
264      KRES2D  = KRES2E  + LRES2E
265
266      KFOCK   = KRES2D  + LRES2D
267      KDENS   = KFOCK   + LFOCK
268      KBTR1E  = KDENS   + LDENS
269      KBTR1D  = KBTR1E  + LBTR1E
270      KBTJ1E  = KBTR1D  + LBTR1D
271      KBTJ1D  = KBTJ1E  + LBTJ1E
272
273      KAIJ    = KBTJ1D  + LBTJ1D
274      KAAB    = KAIJ    + LAIJ
275      KEND2   = KAAB    + LAAB
276#ifdef VAR_MPI
277C     MPI -- Allocate timings array
278      if ( loadbal_dyn ) then
279         KTIMING = KEND2
280         KEND2   = KTIMING + SOPPA_NINT
281         CALL DZERO(WORK(KTIMING), SOPPA_NINT)
282      endif
283#endif
284      LWORK2  = LWORK   - KEND2
285C
286      CALL SO_MEMMAX ('SO_ERES.2',LWORK2)
287      IF (LWORK2 .LT. 0) CALL STOPIT('SO_ERES.2',' ',KEND2,LWORK)
288C
289C----------------------------
290C     Initialize AIJ and AAB.
291C----------------------------
292C
293      IF(SINGLES_SECOND) THEN
294         CALL DZERO(WORK(KAIJ),LAIJ)
295         CALL DZERO(WORK(KAAB),LAAB)
296      ENDIF
297#ifdef VAR_MPI
298C MPI -- ONLY MASTER DOES THE READING
299      IF ( MYNUM .EQ. 0 ) THEN
300#endif
301C
302C----------------------------------------------
303C     Open files with trial and result vectors.
304C----------------------------------------------
305C
306      CALL SO_OPEN(LUTR1E,FNTR1E,LTR1E)
307      CALL SO_OPEN(LURS1E,FNRS1E,LRES1E)
308      CALL SO_OPEN(LURS1D,FNRS1D,LRES1E)
309      IF (DO_DEX) THEN
310         CALL SO_OPEN(LUTR1D,FNTR1D,LTR1D)
311      END IF
312C
313C     Note: open trial-vectors also with length LRES2E
314C          singlet: LRES2E = LTR2E anyway
315C          triplet: We read trial-vectors into result vector memory
316C                   initially, then create intermediate in
317C                   TR2 memory
318      IF (DOUBLES) THEN
319         CALL SO_OPEN(LUTR2E,FNTR2E,LRES2E)
320         CALL SO_OPEN(LURS2E,FNRS2E,LRES2E)
321            CALL SO_OPEN(LURS2D,FNRS2D,LRES2E)
322         IF (DO_DEX) THEN
323            CALL SO_OPEN(LUTR2D,FNTR2D,LRES2D)
324         ENDIF
325      ENDIF
326#ifdef VAR_MPI
327      ENDIF
328#endif
329C
330      IF ( IPRSOP. GE. 7
331#ifdef VAR_MPI
332     &      .AND. (MYNUM .EQ.0)
333#endif
334     &                     ) THEN ! Only printing related.
335C------------------------------------------
336C        Write new trial vectors to output.
337C------------------------------------------
338         DO 50 INEWTR = 1,NNEWTR
339C----------------------------------------------------
340C           Determine pointer to INEWTR trial vector.
341C----------------------------------------------------
342            INEW = NOLDTR + INEWTR
343C
344            CALL SO_READ(WORK(KTR1E),LTR1E,LUTR1E,FNTR1E,INEW)
345            IF(DO_DEX)
346     &               CALL SO_READ(WORK(KTR1D),LTR1D,LUTR1D,FNTR1D,INEW)
347            IF (DOUBLES .AND. .NOT. SO_DOUBLE_CORRECTION(MODEL) ) THEN
348               CALL SO_READ(WORK(KRES2E),LRES2E,LUTR2E,FNTR2E,INEW)
349               IF(DO_DEX)
350     &              CALL SO_READ(WORK(KRES2D),LRES2D,LUTR2D,FNTR2D,INEW)
351            ENDIF
352C
353            WRITE(LUPRI,'(/,I3,A)') INEWTR,'. new trial vector'
354            WRITE(LUPRI,'(I8,1X,F14.8,5X,F14.8)')
355     &           (I,WORK(KTR1E+I-1),WORK(KTR1D+I-1),I=1,LTR1E)
356            IF (DOUBLES) THEN
357               WRITE(LUPRI,'(I8,1X,F14.8,5X,F14.8)')
358     &            (I,WORK(KRES2E+I-1),WORK(KRES2D+I-1),I=1,LRES2E)
359            ENDIF
360   50    CONTINUE
361      END IF
362C
363C================================================
364C     Loop over number of excitations considered.
365C================================================
366C
367      DO 100 INEWTR = 1,NNEWTR
368C
369C-------------------------------------------------
370C        Determine pointer to INEWTR trial vector.
371C-------------------------------------------------
372C
373         INEW = NOLDTR + INEWTR
374C
375C--------------------------------------------------------------
376C        Initialize RES1E, RES1D, SIGAI1, SIGAI2,
377C                   SIGDA1, SIGDA2 and FOCK
378C--------------------------------------------------------------
379C
380         CALL DZERO(WORK(KRES1E),LRES1E)
381         IF (DO_DEX) CALL DZERO(WORK(KRES1D),LRES1D)
382
383         CALL DZERO(WORK(KFOCK),LFOCK)
384C
385C--------------------------
386C        Read trial vector.
387C--------------------------
388C
389#ifdef VAR_MPI
390         IF ( MYNUM .EQ. 0 ) THEN
391#endif
392            CALL SO_READ(WORK(KTR1E),LTR1E,LUTR1E,FNTR1E,INEW)
393            IF (DO_DEX) THEN
394               CALL SO_READ(WORK(KTR1D),LTR1D,LUTR1D,FNTR1D,INEW)
395            ELSE
396C     Static case: Generate D-vector from E-vector
397               CALL DCOPY(LTR1E,WORK(KTR1E),1,WORK(KTR1D),1)
398               IF (IDTYPE.EQ.1) CALL DSCAL(LTR1E,-1.0D0,WORK(KTR1D),1)
399            END IF
400            IF (DOUBLES) THEN
401C              Quick fix for RPA(D) -> Later use this variable to
402C              speed up first iteration for excitation energies
403               IF (TR2_ZERO) THEN
404                  CALL DZERO(WORK(KTR2E),LTR2E)
405                  CALL DZERO(WORK(KTR2D),LTR2D)
406               ENDIF
407
408            ENDIF
409#ifdef VAR_MPI
410         ENDIF
411C
412#endif
413         IF (DOUBLES) THEN
414C--------------------------------------------------------------------
415C        RES2E, RES2D is initialized with the D^0*Tr2 contribution
416C        --- For MPI only one process must do this
417C--------------------------------------------------------------------
418            IF ( (.NOT.TR2_ZERO)
419#ifdef VAR_MPI
420     &            .AND.(MYNUM .EQ. 0 )
421#endif
422     &                        ) THEN
423C
424               IF (TRIPLET) THEN
425C-----------------------------------------------------------
426C                 For triplet we read the trial-vector into
427C                 solution vector memory. Then we use that
428C                 to create the non-symmetric intermediate on
429C                 KTR2*
430C----------------------------------------------------------
431                  CALL SO_READ(WORK(KRES2E),LRES2E,LUTR2E,FNTR2E,INEW)
432                  CALL SO_TRANTRIP(WORK(KTR2E),WORK(KRES2E),ISYMTR)
433                  IF(DO_DEX) THEN
434                     CALL SO_READ(WORK(KRES2D),LRES2D,LUTR2D,
435     &                     FNTR2D,INEW)
436                     CALL SO_TRANTRIP(WORK(KTR2D),WORK(KRES2D),ISYMTR)
437                  END IF
438C
439                  DTIME      = SECOND()
440                  CALL SO_RES_CDT(WORK(KRES2E),LRES2E,
441     &                            WORK(KRES2D),LRES2D,
442     &                            FOCKD,LFOCKD,ISYRES,DO_DEX,
443     &                            WORK(KEND2),LWORK2)
444                  DTIME      = SECOND()   - DTIME
445                  SOTIME(30) = SOTIME(30) + DTIME
446C
447               ELSE
448C
449                  CALL SO_READ(WORK(KTR2E),LTR2E,LUTR2E,FNTR2E,INEW)
450                  IF(DO_DEX)
451     &               CALL SO_READ(WORK(KTR2D),LTR2D,LUTR2D,FNTR2D,INEW)
452C
453                  DTIME      = SECOND()
454                  CALL SO_RES_CD(WORK(KRES2E),LRES2E,
455     &                           WORK(KRES2D),LRES2D,
456     &                           WORK(KTR2E),LTR2E,WORK(KTR2D),LTR2D,
457     &                           FOCKD,LFOCKD,ISYRES,
458     &                           DO_DEX,WORK(KEND2),LWORK2)
459
460                  DTIME      = SECOND()   - DTIME
461                  SOTIME(30) = SOTIME(30) + DTIME
462C
463C   The trial-vectors are no longer needed in the original
464C   basis, transform them.
465C
466                  CALL CCSD_TCMEPKX(WORK(KTR2E),TWO,ISYMTR)
467                  IF (DO_DEX) CALL CCSD_TCMEPKX(WORK(KTR2D),TWO,ISYMTR)
468               ENDIF
469C
470            ELSE
471               CALL DZERO(WORK(KRES2E),LRES2E)
472               IF (DO_DEX) CALL DZERO(WORK(KRES2D),LRES2D)
473            ENDIF
474         END IF
475C
476#ifdef VAR_MPI
477C---------------------------
478C Communicate trial-vectors
479C---------------------------
480C Rememember that the trial-vectors are contigous in memory
481         LTRTOT_mpi = LTR1E + LTR1D + LTR2E + LTR2D
482         CALL MPI_BCAST(WORK(KTR1E), LTRTOT_mpi, my_MPI_REAL8,
483     &                  sop_master, SOPPA_COMM_ACTIVE, ierr_mpi)
484C Note for future adjustment:
485C It may be worthwile using non-blocking communication here
486C since the next two calls only use the singles vectors.
487C
488C Also, depending on the communication overhead, it may be better
489C to transform on host only and then send it to the slaves..?
490#endif
491C
492C---------------------------------------------------
493C        Calculate RPA-density matrices in AO basis.
494C---------------------------------------------------
495C
496         DTIME     = SECOND()
497         CALL SO_AODENS(WORK(KDENS),LDENS,WORK(KCMO),LCMO,
498     &                  WORK(KTR1E),LTR1E,WORK(KTR1D),LTR1D,ISYMTR,
499     &                  WORK(KEND2),LWORK2)
500         DTIME     = SECOND()  - DTIME
501         SOTIME(6) = SOTIME(6) + DTIME
502C
503C--------------------------------------------
504C        Backtransformation of trial vectors.
505C--------------------------------------------
506C
507         DTIME     = SECOND()
508         CALL SO_BCKTR(WORK(KTR1E),LTR1E,WORK(KTR1D),LTR1D,
509     &                 WORK(KBTR1E),LBTR1E,WORK(KBTR1D),LBTR1D,
510     &                 WORK(KBTJ1E),LBTJ1E,WORK(KBTJ1D),LBTJ1D,
511     &                 WORK(KCMO),LCMO,ISYMTR)
512         DTIME     = SECOND()  - DTIME
513         SOTIME(7) = SOTIME(7) + DTIME
514C
515C=======================================================
516C        Start the loop over distributions of integrals.
517C=======================================================
518C
519         IF (DIRECT) THEN
520            NTOSYM = 1
521            DTIME  = SECOND()
522            IF (HERDIR) THEN
523               CALL HERDI1(WORK(KEND2),LWORK2,IPRINT)
524               KINDXB = KEND2
525            ELSEIF(INEWTR.EQ.1) THEN
526C Can we get away with doing this once?
527C Because we CAN'T move KEND2 forwards in each loop cycle,
528C since that'll be leaking memory
529               KCCFB1 = KEND2
530               KINDXB = KCCFB1 + MXPRIM*MXCONT
531               KEND2  = KINDXB + (8*MXSHEL*MXCONT)/IRAT
532               LWORK2 = LWORK  - KEND2
533
534               CALL ERIDI1(KODCL1,KODCL2,KODBC1,KODBC2,KRDBC1,KRDBC2,
535     &                     KODPP1,KODPP2,KRDPP1,KRDPP2,KFREE,LFREE,
536     &                     KEND2,WORK(KCCFB1),WORK(KINDXB),WORK(KEND2),
537     &                     LWORK2,IPRINT)
538
539               KEND2  = KFREE
540               LWORK2 = LFREE
541            ENDIF
542            DTIME     = SECOND()  - DTIME
543            SOTIME(8) = SOTIME(8) + DTIME
544         ELSE
545            NTOSYM = NSYM
546         ENDIF
547C
548         ICDEL1  = 0
549C
550#ifdef VAR_MPI
551C-----------------------------------------------------------------
552C        In MPI calculations, we need to distribute the indices.
553C        On first pass, we set this using the pre-sorted approach.
554C-----------------------------------------------------------------
555C         IF ( (nit .ne. 1 .or. inewtr .ne. 1)
556         IF ( .not. (nit .eq. 1 .and. inewtr .eq. 1)
557     &                        .or.  numprocs .eq. 1 ) THEN
558C           After first pass, load-balancing has been done
559            CONTINUE
560         ELSEIF( mynum .eq. 0 )THEN
561C           Master does the balancing
562            call presortloadbal_parsoppa(AssignedIndices,   maxnumjobs,
563     &                     work(kindxb), work(kend2), lwork2)
564         ELSE
565            count_mpi = maxnumjobs
566C           Slaves receives the scatter, send arguments should be ignored
567            call mpi_scatter( AssignedIndices, count_mpi,my_mpi_integer,
568     &                        AssignedIndices, count_mpi,my_mpi_integer,
569     &                        sop_master, soppa_comm_active, ierr_mpi )
570         ENDIF
571#endif
572         DO 210 ISYMD1 = 1,NTOSYM
573C
574            IF (DIRECT) THEN
575               IF (HERDIR) THEN
576                  NTOT = MAXSHL
577               ELSE
578                  NTOT = MXCALL
579               ENDIF
580            ELSE
581               NTOT = NBAS(ISYMD1)
582            ENDIF
583#ifdef VAR_MPI
584            IF(numprocs .gt. 1) THEN
585               NLOOPIDX = maxnumjobs
586            ELSE
587               NLOOPIDX = NTOT
588               loadbal_dyn = .false.
589            ENDIF
590C
591#endif
592C
593C-------------------------------------------------
594C           Main loop over integral-distributions.
595C-------------------------------------------------
596
597#ifdef VAR_MPI
598C------------------------------------------------------------------
599C           For_parallel calculations, we have som stuff to set up.
600C------------------------------------------------------------------
601            DO 220 ILLLDUMMY = 1, nloopidx
602               if ( numprocs .gt. 1 ) then
603                  ILLL = assignedIndices(illldummy)
604C                 A zero indicates that we have no more work, exit loop
605                  IF (ILLL .eq. 0) exit
606                  if ( loadbal_dyn ) timeini = mpi_wtime()
607               else
608                  ILLL = ILLLDUMMY
609               endif
610#else
611            DO 220 ILLL = 1,NTOT
612#endif
613C               print *, ILLL
614C------------------------------------------------
615C              If direct calculate the integrals.
616C------------------------------------------------
617               IF (DIRECT) THEN
618C
619                  DTIME  = SECOND()
620                  IF (HERDIR) THEN
621C
622                    CALL HERDI2(WORK(KEND2),LWORK2,INDEXA,ILLL,NUMDIS,
623     &                          IPRINT)
624C
625                  ELSE
626C
627                     CALL ERIDI2(ILLL,INDEXA,NUMDIS,0,0,
628     &                           WORK(KODCL1),WORK(KODCL2),
629     &                           WORK(KODBC1),WORK(KODBC2),
630     &                           WORK(KRDBC1),WORK(KRDBC2),
631     &                           WORK(KODPP1),WORK(KODPP2),
632     &                           WORK(KRDPP1),WORK(KRDPP2),
633     &                           WORK(KCCFB1),WORK(KINDXB),
634     &                           WORK(KEND2),LWORK2,IPRINT)
635C
636                  ENDIF
637                  DTIME     = SECOND()  - DTIME
638                  SOTIME(9) = SOTIME(9) + DTIME
639C
640                  LRECNR  = ( (NBUFX(0) -1) / IRAT ) + 1
641                  KRECNR  = KEND2
642                  KEND2B   = KRECNR + LRECNR
643                  LWORK2B  = LWORK  - KEND2B
644C
645                  CALL SO_MEMMAX ('SO_ERES.2B',LWORK2B)
646                  IF (LWORK2 .LT. 0)
647     &                CALL STOPIT('SO_ERES.2B',' ',KEND2B,LWORK)
648C
649               ELSE
650                  NUMDIS = 1
651                  KEND2B = KEND2
652               ENDIF
653C
654C-------------------------------------------------------------------
655C   Loop over number of distributions in disk.
656C   In the case of ERI there are more than one distribution and IDEL2
657C   loops over them and the actual index of the delta orbital IDEL is
658C   then obtained from the array INDEXA. In the case of a not direct
659C   calculation there is only one distribution on the disk, which
660C   implies that IDEL2 is always 1 and that IDEL is systematically
661C   incremented by one each time.
662C--------------------------------------------------------------------
663C
664               DO 230 IDEL2 = 1,NUMDIS
665C
666                  IF (DIRECT) THEN
667                     IDEL  = INDEXA(IDEL2)
668                     ISYMD = ISAO(IDEL) !keeps track of current symmetry
669                  ELSE
670                     IDEL  = IBAS(ISYMD1) + ILLL
671                     ISYMD = ISYMD1
672                  ENDIF
673C
674                  ISYDIS = MULD2H(ISYMD,ISYMOP)
675C
676                  IT2DEL(IDEL) = ICDEL1
677                  ICDEL1       = ICDEL1 + NT2BCD(ISYDIS)
678C
679C---------------------------------------------
680C                 Work space allocation no. 3.
681C---------------------------------------------
682C
683                  LXINT  = NDISAO(ISYDIS)
684C
685                  KXINT   = KEND2B
686                  KEND3   = KXINT + LXINT
687                  LWORK3  = LWORK - KEND3
688C
689                  CALL SO_MEMMAX ('SO_ERES.3',LWORK3)
690                  IF (LWORK3 .LT. 0)
691     &                CALL STOPIT('SO_ERES.3',' ',KEND3,LWORK)
692C
693C--------------------------------------------
694C                 Read in batch of integrals.
695C--------------------------------------------
696C
697                  DTIME      = SECOND()
698                  CALL CCRDAO(WORK(KXINT),IDEL,IDEL2,WORK(KEND3),LWORK3,
699     &                        WORK(KRECNR),DIRECT)
700                  DTIME      = SECOND()   - DTIME
701                  SOTIME(10) = SOTIME(10) + DTIME
702C
703C
704C---------------------------------------------
705C                    Work space allocation no. 4.
706C---------------------------------------------
707C
708                  ISAIJ = MULD2H(ISYMD,1)
709C
710                  IF (DOUBLES.OR.SINGLES_SECOND) THEN
711C
712                     LT2M1 = NT2BCD(ISAIJ)
713                     LX2M1 = NT2BCD(MULD2H(ISYMD,ISYMTR))
714                     KT2M1  = KEND3
715                     KX2EM1 = KT2M1  + LT2M1
716                     KX2DM1 = KX2EM1 + LX2M1
717                     KEND4  = KX2DM1 + LX2M1
718                     LWORK4 = LWORK  - KEND4
719C
720                     CALL SO_MEMMAX ('SO_ERES.4',LWORK4)
721                     IF (LWORK4 .LT. 0)
722     &                   CALL STOPIT('SO_ERES.4',' ',KEND4,LWORK)
723C
724C
725C------------------------------------------------------------
726C                    Construct the partially back-transformed T2
727C                    MP-amplitudes.
728C------------------------------------------------------------
729C
730                     DTIME      = SECOND()
731CPi-140316 Change LT2MP -> LT2MPH
732CPi I think now we have this condition two times
733                     IF (NVIR(ISYMD).GT.0) THEN
734                        CALL SO_T2M1(WORK(KT2M1),LT2M1,T2MP,LT2MPH,
735Cend-Pi
736     &                               WORK(KCMO),LCMO,IDEL,ISYMD,ISYDIS,
737     &                               WORK(KEND4),LWORK4)
738                     ELSE
739                        CALL DZERO(WORK(KT2M1),LT2M1)
740                     END IF
741C
742C------------------------------------------------------------
743C                    Construct the partially back-transformed
744C                    trial vectors.
745C                    These are scaled by 1/sqrt(2)
746C------------------------------------------------------------
747C
748                     IF(DOUBLES.AND..NOT.TRIPLET.AND.
749     &                  (NVIR(ISYMD).GT.0)) THEN
750                        CALL SO_X2M1(WORK(KX2EM1),LX2M1,
751     &                               WORK(KTR2E),LTR2E,
752     &                               WORK(KCMO),LCMO,IDEL,ISYMD,
753     &                               ISYMTR,WORK(KEND4),LWORK4)
754                        IF (DO_DEX) THEN
755                           CALL SO_X2M1(WORK(KX2DM1),LX2M1,
756     &                                  WORK(KTR2D),LTR2D,
757     &                                  WORK(KCMO),LCMO,IDEL,ISYMD,
758     &                                  ISYMTR,WORK(KEND4),LWORK4)
759                        END IF
760                     ELSE
761                        CALL DZERO(WORK(KX2EM1),LX2M1)
762                        IF (DO_DEX) CALL DZERO(WORK(KX2DM1),LX2M1)
763                     END IF
764
765                     IF (SINGLES_SECOND) THEN
766C
767C---------------------------------------------------------------
768C                    Add T2^(ab)_(ij)*x1(b, delta) intermediates
769C                    to the backtransformed trialvectors.
770C---------------------------------------------------------------
771C                    (This takes care of terms (1), and (5) of the B
772C                    matrix)
773                        CALL SO_T2X1(WORK(KX2EM1),LX2M1,
774     &                               T2MP,LT2MPH,
775     &                               WORK(KBTJ1D),LBTJ1D,IDEL,ISYMD,
776     &                               ISYMTR,WORK(KEND4),LWORK4)
777                        IF (DO_DEX) THEN
778                           CALL SO_T2X1(WORK(KX2DM1),LX2M1,
779     &                                  T2MP,LT2MPH,
780     &                                  WORK(KBTJ1E),LBTJ1E,IDEL,ISYMD,
781     &                                  ISYMTR,WORK(KEND4),LWORK4)
782                        END IF
783C
784C     For triplet we need to transform the intermediates
785C
786                        IF (TRIPLET.AND.
787     &                      NVIR(MULD2H(ISYMD,ISYMTR)).GE.1) THEN
788                           CALL SO_M1SHUF(WORK(KX2EM1),LX2M1,
789     &                                    ISYMD,ISYMTR)
790                           IF (DO_DEX) THEN
791                              CALL SO_M1SHUF(WORK(KX2DM1),LX2M1,
792     &                                       ISYMD,ISYMTR)
793                           END IF
794                        END IF
795
796                     END IF
797C
798C     In the triplet case we create the partially back-transformed
799C     intermediate last.
800C
801                     IF(DOUBLES.AND.TRIPLET.AND.
802     &                  (NVIR(ISYMD).GT.0)) THEN
803                        CALL SO_X2SM1(WORK(KX2EM1),LX2M1,
804     &                               WORK(KTR2E),LTR2E,
805     &                               WORK(KCMO),LCMO,IDEL,ISYMD,
806     &                               ISYMTR,WORK(KEND4),LWORK4)
807                        IF (DO_DEX) THEN
808                           CALL SO_X2SM1(WORK(KX2DM1),LX2M1,
809     &                                  WORK(KTR2D),LTR2D,
810     &                                  WORK(KCMO),LCMO,IDEL,ISYMD,
811     &                                  ISYMTR,WORK(KEND4),LWORK4)
812                        END IF
813                     END IF
814C
815                     DTIME      = SECOND()   - DTIME
816                     SOTIME(12) = SOTIME(12) + DTIME
817
818                  ELSE
819                     KT2M1 = HUGE(LWORK) ! Crash if this is accessed
820                     KX2EM1 = HUGE(LWORK)
821                     KX2DM1 = HUGE(LWORK)
822                     LT2M1 = 0
823                     LX2M1 = 0
824                     KEND4 = KEND3
825                     LWORK4 = LWORK3
826                  ENDIF
827C
828C---------------------------------------------
829C                 Work space allocation no. 5.
830C---------------------------------------------
831C
832                  IF (DOUBLES.OR.SINGLES_SECOND) THEN
833                     LDSRHF = NDSRHF(ISYMD)
834C
835                     KDSRHF = KEND4
836                     KEND5  = KDSRHF + LDSRHF
837                     LWORK5 = LWORK  - KEND5
838C
839                     CALL SO_MEMMAX ('SO_ERES.5',LWORK5)
840                     IF (LWORK5 .LT. 0)
841     &                  CALL STOPIT('SO_ERES.5',' ',KEND5,LWORK)
842C
843C----------------------------------------------------------------
844C                    Transform one index in the integral batch to an
845C                    occupied index.
846C----------------------------------------------------------------
847C
848                     DTIME  = SECOND()
849                     ISYMLP = 1
850                     CALL CCTRBT(WORK(KXINT),WORK(KDSRHF),WORK(KCMO),
851     &                        ISYMLP,WORK(KEND5),LWORK5,ISYDIS)
852                     DTIME      = SECOND()   - DTIME
853                     SOTIME(13) = SOTIME(13) + DTIME
854                  ELSE
855                     LDSRHF = 0
856                     KEND5 = KEND4
857                     LWORK5 = LWORK4
858                     KDSRHF = HUGE(LWORK)
859                  END IF
860C
861C-------------------------------------------------------------------
862C                 Calculate part of the second order density matrix.
863C-------------------------------------------------------------------
864C
865                  DTIME      = SECOND()
866                  IF ( calc_densai .AND. (  INEWTR.EQ.1 ) ) THEN
867                     CALL SO_DENSAI1(DENSAI,LDENSAI,WORK(KDSRHF),LDSRHF,
868     &                               WORK(KCMO),LCMO,WORK(KT2M1),LT2M1,
869     &                               ISYMD,ISYDIS,WORK(KEND5),
870     &                               LWORK5)
871
872                  END IF
873                  DTIME      = SECOND()   - DTIME
874                  SOTIME(41) = SOTIME(41) + DTIME
875C
876                  IF (TRIPLET) THEN
877                     CALL LOOP_BODY_TRIPLET
878                  ELSE
879                     CALL LOOP_BODY_SINGLET
880                  ENDIF
881C
882  230          CONTINUE  ! End of IDEL2 loop
883C
884#ifdef VAR_MPI
885               IF (loadbal_dyn) then
886                   timefin = mpi_wtime()
887                   itimeilll = ktiming + illl - 1
888                   work(itimeilll) = work(itimeilll) + timefin - timeini
889               ENDIF
890#endif
891C
892  220       CONTINUE !End of ILLL loop
893C
894  210    CONTINUE !end of ISYMD1 loop
895C
896C====================================================
897C        End of loop over distributions of integrals.
898C====================================================
899C
900#ifdef VAR_MPI
901C-------------------------------------------------
902C        Communicate the result vectors to master.
903C-------------------------------------------------
904C  Note for further development:
905C  In the following, non-blocking reductions could be used,
906C  since the one-particle result-vector is first needed in the
907C  second call, sigai and sigda in the third and fourth call,
908C  and the two-particle vector is just written to file.
909C  The alternative approach, is to do all these calculations
910C  on the slaves, and only reduce in the end, this would save
911C  communication of sigai and sigda.
912C
913C  Use the fact that memory are together
914         lrestot = lres1e + lres1d + lres2e + lres2d
915         latot   = laij + laab
916C  Master use inplace operations
917         if (mynum .eq. 0 ) then
918C
919C           Fock-matrix
920            count_mpi = lfock
921            call mpi_reduce( mpi_in_place, work(kfock), count_mpi,
922     &                       my_MPI_REAL8, MPI_SUM, sop_master,
923     &                       soppa_comm_active, ierr_mpi)
924C
925C           Result-vectors
926            call mpi_reduce( mpi_in_place, work(kres1e), lrestot,
927     &                       my_MPI_REAL8, MPI_SUM, sop_master,
928     &                       soppa_comm_active, ierr_mpi)
929C
930            if (singles_second) then
931C
932C           Aij / Aab -- only calculated in first pass
933               if ( inewtr .eq. 1 ) then
934C               write(lupri,*) 'Aij'
935                  call mpi_reduce ( mpi_in_place, work(kaij), latot,
936     &                              my_MPI_REAL8, MPI_SUM, sop_master,
937     &                              soppa_comm_active, ierr_mpi)
938               end if
939            end if
940         else
941C  Slaves pass the same buffer as the recieve-buffer...
942C
943C           Fock-matrix
944            count_mpi = lfock
945            call mpi_reduce( work(kfock), work(kfock), count_mpi,
946     &                       my_MPI_REAL8, MPI_SUM, sop_master,
947     &                       soppa_comm_active, ierr_mpi)
948C
949C           Result-vectors
950            call mpi_reduce( work(kres1e), work(kres1e), lrestot,
951     &                       my_MPI_REAL8, MPI_SUM, sop_master,
952     &                       soppa_comm_active, ierr_mpi)
953C
954            if (singles_second) then
955C
956C              Aij / Aab
957               if ( inewtr .eq. 1 ) then
958                  call mpi_reduce ( work(kaij), work(kaij), latot,
959     &                              my_MPI_REAL8, MPI_SUM, sop_master,
960     &                              soppa_comm_active, ierr_mpi)
961               endif
962            endif
963C
964C  After the reductions, the slaves are done; cycle loop
965            goto 100
966         endif
967#endif
968C
969C---------------------------------------------
970C        Transform AO Fock matrix to MO basis.
971C---------------------------------------------
972C
973         DTIME      = SECOND()
974         CALL TRANS_FCK(WORK(KFOCK),ISYRES)
975         CALL CC_FCKMO(WORK(KFOCK),WORK(KCMO),WORK(KCMO),
976     &                    WORK(KEND2),LWORK2,ISYRES,1,1)
977         DTIME      = SECOND()   - DTIME
978         SOTIME(24) = SOTIME(24) + DTIME
979C
980C------------------------------------------------------------------
981C        Calculate and add the RPA two-particle parts to the result
982C        vectors.
983C------------------------------------------------------------------
984C
985
986         DTIME      = SECOND()
987         CALL SO_TWOFOCK(WORK(KRES1E),LRES1E,WORK(KRES1D),LRES1D,
988     &                   WORK(KFOCK),LFOCK,ISYRES,DO_DEX)
989         DTIME      = SECOND()   - DTIME
990         SOTIME(25) = SOTIME(25) + DTIME
991C
992        if (singles_second) then
993C-----------------------------------------------------------------
994C           Calculate and add the symmetry correcting term to A in
995C           eq. (44).
996C-----------------------------------------------------------------
997C
998            DTIME      = SECOND()
999
1000            CALL SO_RES_SYM(WORK(KRES1E),LRES1E,WORK(KRES1D),LRES1D,
1001     &                      WORK(KAIJ),LAIJ,WORK(KAAB),LAAB,WORK(KTR1E),
1002     &                      LTR1E,WORK(KTR1D),LTR1D,ISYRES,DO_DEX)
1003            DTIME      = SECOND()   - DTIME
1004            SOTIME(20) = SOTIME(20) + DTIME
1005C
1006C---------------------------------------------------------
1007C           Calculate and add the Fock-term to A in eq. (40).
1008C---------------------------------------------------------
1009C
1010            DTIME      = SECOND()
1011            CALL SO_RES_FCK(WORK(KRES1E),LRES1E,WORK(KRES1D),LRES1D,
1012     &                      WORK(KTR1E),LTR1E,WORK(KTR1D),
1013     &                      LTR1D,FOCKD,LFOCKD,DENSIJ,LDENSIJ,DENSAB,
1014     &                      LDENSAB,ISYRES,ISYMTR,DO_DEX)
1015
1016            DTIME      = SECOND()   - DTIME
1017            SOTIME(21) = SOTIME(21) + DTIME
1018C
1019         endif
1020C
1021C------------------------------------------------------------------
1022C        Calculate and add the RPA one-particle parts to the result
1023C        vectors.
1024C------------------------------------------------------------------
1025C
1026         DTIME      = SECOND()
1027
1028         CALL SO_ONEFOCK(WORK(KRES1E),LRES1E,WORK(KRES1D),LRES1D,FOCKD,
1029     &                   LFOCKD,WORK(KTR1E),LTR1E,WORK(KTR1D),LTR1D,
1030     &                   ISYRES,ISYMTR,DO_DEX)
1031         DTIME      = SECOND()   - DTIME
1032         SOTIME(26) = SOTIME(26) + DTIME
1033C
1034C
1035#ifdef VAR_MPI
1036C Slaves are done
1037         IF (MYNUM .NE. 0) GOTO 100
1038#endif
1039C
1040C----------------------------------------
1041C        Write new result vectors to file.
1042C----------------------------------------
1043C
1044         CALL SO_WRITE(WORK(KRES1E),LRES1E,LURS1E,FNRS1E,INEW)
1045         IF (DO_DEX)
1046     &      CALL SO_WRITE(WORK(KRES1D),LRES1D,LURS1D,FNRS1D,INEW)
1047         IF (DOUBLES) THEN
1048            CALL SO_WRITE(WORK(KRES2E),LRES2E,LURS2E,FNRS2E,INEW)
1049            IF(DO_DEX)
1050     &            CALL SO_WRITE(WORK(KRES2D),LRES2D,LURS2D,FNRS2D,INEW)
1051         ENDIF
1052C
1053C     Write zeroes to D-solution vectors
1054         IF (.NOT. DO_DEX) THEN
1055            CALL DZERO(WORK(KRES1E),LRES1E)
1056            CALL SO_WRITE(WORK(KRES1E),LRES1E,LURS1D,FNRS1D,INEW)
1057            IF (DOUBLES) THEN
1058               CALL DZERO(WORK(KRES2E),LRES2E)
1059               CALL SO_WRITE(WORK(KRES2E),LRES2E,LURS2D,FNRS2D,INEW)
1060            END IF
1061         END IF
1062C
1063  100 CONTINUE
1064C
1065C==================================
1066C     End of loop over excitations.
1067C==================================
1068C
1069#ifdef VAR_MPI
1070C-----------------------------------------------------------
1071C     Communicate the second order density matrix if needed.
1072C-----------------------------------------------------------
1073      IF ( CALC_DENSAI ) THEN
1074         count_mpi = LDENSAI
1075         IF (MYNUM.EQ.0) THEN
1076            CALL MPI_REDUCE( MPI_IN_PLACE, DENSAI, count_mpi,
1077     &                       my_MPI_REAL8, MPI_SUM, sop_master,
1078     &                       SOPPA_COMM_ACTIVE, ierr_mpi)
1079         ELSE
1080            CALL MPI_REDUCE( DENSAI, DENSAI, count_mpi,
1081     &                       my_MPI_REAL8, MPI_SUM, sop_master,
1082     &                       SOPPA_COMM_ACTIVE, ierr_mpi)
1083         ENDIF
1084      ENDIF
1085C--------------------------------------
1086C     Communicate the timings if needed
1087C--------------------------------------
1088      if ( loadbal_dyn ) then
1089         if (mynum .eq.0) then
1090C  Master
1091C --------
1092C Recieve the timings
1093            count_mpi = soppa_nint
1094            call mpi_reduce( mpi_in_place, work(ktiming), count_mpi,
1095     &                       my_MPI_REAL8, MPI_SUM, sop_master,
1096     &                       SOPPA_COMM_ACTIVE, ierr_mpi)
1097C
1098C Redo the loadbalancing based on the timings and distribute them
1099            ksorted = kend2
1100            ktmp    = (soppa_num_active*maxnumjobs)
1101            knasjob = ksorted + ktmp/irat + mod(ktmp,irat)
1102            kswork  = knasjob + soppa_num_active/irat + mod(ktmp,irat)
1103            kendf   = kswork + soppa_num_active
1104C
1105            call dynloadbal_parsoppa( AssignedIndices, maxnumjobs,
1106     &                                work(ktiming), soppa_nint,
1107     &                                work(ksorted), work(knasjob),
1108     &                                work(kswork) )
1109            !                    add empty work-space...
1110            !                    currently it can start at kend2
1111         else
1112C   Slave
1113C  ---------
1114C  Send the timings
1115            count_mpi = soppa_nint
1116            call mpi_reduce( work(ktiming), work(ktiming), count_mpi,
1117     &                       my_MPI_REAL8, MPI_SUM, sop_master,
1118     &                       SOPPA_COMM_ACTIVE, ierr_mpi)
1119C  Recieve the new indices
1120            count_mpi = maxnumjobs
1121            call mpi_scatter( AssignedIndices,count_mpi,my_mpi_integer,
1122     &                        AssignedIndices,count_mpi,my_mpi_integer,
1123     &                        sop_master, soppa_comm_active, ierr_mpi)
1124         endif
1125      endif
1126C Slaves are done
1127      IF (MYNUM .NE. 0) THEN
1128         IF ( CALC_DENSAI ) SOP_MP2AI_DONE = .TRUE.
1129         RETURN
1130      END IF
1131#endif
1132C----------------------------------------------------------------
1133C     Calculate the last part of the second order density matrix.
1134C----------------------------------------------------------------
1135C
1136      DTIME      = SECOND()
1137      IF ( CALC_DENSAI )  THEN
1138         CALL SO_DENSAI2(DENSAI,LDENSAI,FOCKD,LFOCKD)
1139         SOP_MP2AI_DONE = .TRUE.
1140      END IF
1141C
1142      DTIME      = SECOND()   - DTIME
1143      SOTIME(41) = SOTIME(41) + DTIME
1144C
1145      IF ( IPRSOP .GE. 7 ) THEN
1146C------------------------------------------
1147C        Write new resultvectors to output.
1148C------------------------------------------
1149         DO 400 INEWTR = 1,NNEWTR
1150            INEW = NOLDTR + INEWTR
1151            WRITE(LUPRI,'(/,I3,A)') INEWTR,
1152     &                '. new E[2] linear transformed trial vector'
1153            CALL SO_READ(WORK(KRES1E),LRES1E,LURS1E,FNRS1E,INEW)
1154            CALL SO_READ(WORK(KRES1D),LRES1E,LURS1D,FNRS1D,INEW)
1155            WRITE(LUPRI,'(I8,1X,F14.8,5X,F14.8)')
1156     &           (I,WORK(KRES1E+I-1),WORK(KRES1D+I-1),I=1,LRES1E)
1157            IF (DOUBLES) THEN
1158               CALL SO_READ(WORK(KRES2E),LRES2E,LURS2E,FNRS2E,INEW)
1159C               IF (DO_DEX)
1160               CALL SO_READ(WORK(KRES2D),LRES2E,LURS2D,FNRS2D,INEW)
1161               WRITE(LUPRI,'(I8,1X,F14.8,5X,F14.8)')
1162     &             (I,WORK(KRES2E+I-1),WORK(KRES2D+I-1),I=1,LRES2E)
1163            ENDIF
1164  400    CONTINUE
1165C
1166      END IF
1167C
1168C-----------------
1169C     Close files.
1170C-----------------
1171C
1172      CALL SO_CLOSE(LUTR1E,FNTR1E,'KEEP')
1173      CALL SO_CLOSE(LURS1E,FNRS1E,'KEEP')
1174         CALL SO_CLOSE(LURS1D,FNRS1D,'KEEP')
1175      IF (DO_DEX) THEN
1176         CALL SO_CLOSE(LUTR1D,FNTR1D,'KEEP')
1177      END IF
1178C
1179      IF (DOUBLES) THEN
1180         CALL SO_CLOSE(LUTR2E,FNTR2E,'KEEP')
1181         CALL SO_CLOSE(LURS2E,FNRS2E,'KEEP')
1182            CALL SO_CLOSE(LURS2D,FNRS2D,'KEEP')
1183         IF (DO_DEX) THEN
1184            CALL SO_CLOSE(LUTR2D,FNTR2D,'KEEP')
1185         ENDIF
1186      ENDIF
1187C
1188C
1189C-----------------------
1190C     Remove from trace.
1191C-----------------------
1192C
1193      CALL QEXIT('SO_ERES')
1194C
1195      RETURN
1196C
1197C------------------------------------------------------------------------
1198C  The body of the above loop over integral distributions has been moved
1199C  into these internal subroutines.
1200C------------------------------------------------------------------------
1201C
1202      CONTAINS
1203C  Be aware that variables fall through from the outer scope!
1204C
1205
1206         SUBROUTINE LOOP_BODY_SINGLET
1207C
1208         DOUBLE PRECISION DTIME
1209C
1210C
1211C----------------------------------------------
1212C        Calculate the AO-Fock matrix.
1213C----------------------------------------------
1214C
1215         DTIME      = SECOND()
1216         CALL SO_AOFOCK1(WORK(KXINT),WORK(KDENS),WORK(KFOCK),
1217     &                  WORK(KEND5),LWORK5,
1218     &                  IDEL,ISYMD,
1219     &                  ISYMTR)
1220         DTIME      = SECOND()   - DTIME
1221         SOTIME(11) = SOTIME(11) + DTIME
1222         IF (singles_second) THEN
1223C
1224C----------------------------------------------------------------------
1225C           Calculate part of the result vectors RES1E and RES1D,
1226C           specifically the first and the second term in eqs. (34,35).
1227C           Also calculate Aij and Aab in eqs. (43,44).
1228C----------------------------------------------------------------------
1229C
1230            DTIME      = SECOND()
1231            CALL SO_RES_A(WORK(KRES1E),LRES1E,
1232     &                    WORK(KRES1D),LRES1D,
1233     &                    WORK(KTR1E),LTR1E,WORK(KTR1D),LTR1D,
1234     &                    WORK(KDSRHF),LDSRHF,WORK(KCMO),LCMO,
1235     &                    WORK(KT2M1),LT2M1,WORK(KAIJ),LAIJ,
1236     &                    WORK(KAAB),LAAB,INEWTR,ISYMD,ISYDIS,
1237     &                    ISYRES,ISYMTR,DO_DEX,
1238     &                    WORK(KEND5),LWORK5)
1239            DTIME      = SECOND()   - DTIME
1240            SOTIME(14) = SOTIME(14) + DTIME
1241         ENDIF
1242C
1243         IF (DOUBLES.OR.SINGLES_SECOND) THEN
1244C
1245C-------------------------------------------------------------------
1246C        Calculate the part of the result vectors RES1E and
1247C        RES1D which originate from the C matrices. See
1248C                 eqs. (72) and (73).
1249C-------------------------------------------------------------------
1250CRF      The X2M1 vectors is now an intermediate, which contain
1251CRF      both the 2-particle trial vector AND the T2*x1 intermediate,
1252CRF              ~ ~                               ~
1253CRF      so both C*x2 and terms (1) and (5) of B*x1 is calculated here.
1254C
1255            DTIME      = SECOND()
1256            CALL SO_RES_TCB(WORK(KRES1E),LRES1E,
1257     &                      WORK(KRES1D),LRES1D,
1258     &                      WORK(KX2EM1),LX2M1,
1259     &                      WORK(KX2DM1),LX2M1,
1260     &                      WORK(KDSRHF),LDSRHF,
1261     &                      WORK(KCMO),LCMO,IDEL,ISYMD,ISYDIS,
1262     &                      ISYMTR,DO_DEX,WORK(KEND5),LWORK5)
1263            DTIME      = SECOND()   - DTIME
1264            SOTIME(29) = SOTIME(29) + DTIME
1265         END IF
1266C
1267C----------------------------------------------------------------------
1268C           Construct C-contribution to 2p2h result vectors RES2E
1269C                 and RES2D.
1270C----------------------------------------------------------------------
1271C
1272         IF(DOUBLES) THEN
1273            DTIME      = SECOND()
1274            CALL SO_RES_CB(WORK(KRES2E),LRES2E,
1275     &                    WORK(KRES2D),LRES2D,
1276     &                    WORK(KDSRHF),LDSRHF,
1277     &                    WORK(KBTR1E),LBTR1E,
1278     &                    WORK(KBTR1D),LBTR1D,
1279     &                    WORK(KBTJ1E),LBTJ1E,
1280     &                    WORK(KBTJ1D),LBTJ1D,WORK(KCMO),LCMO,
1281     &                    IDEL,ISYMD,ISYDIS,ISYMTR,DO_DEX,
1282     &                    WORK(KEND5),LWORK5)
1283            DTIME      = SECOND()   - DTIME
1284            SOTIME(15) = SOTIME(15) + DTIME
1285C
1286         ENDIF
1287
1288         IF (SINGLES_SECOND) THEN
1289            DTIME   = SECOND()
1290            ISYDIS2 = MULD2H(ISYDIS,ISYMTR)
1291            KEND6 = KEND4 + NDSRHF(MULD2H(ISYMD,ISYMTR))
1292            LWORK6 = LWORK - KEND6
1293C                                     ~
1294C           Calculate ( alpha, beta | j delta)
1295            CALL CCTRBT(WORK(KXINT),WORK(KDSRHF),WORK(KBTR1D),
1296     &                  ISYMTR,WORK(KEND5),LWORK5,ISYDIS)
1297C           Calculate terms 2 and 6 of the B-matrix, by
1298C                     ~
1299C    (2)    - ( c a | j delta ) * T2M1( ci, j)
1300C                     ~
1301C    (6)      ( k i | j delta ) * T2M1( ak, j)
1302
1303            CALL SO_RES_B26 ( WORK(KRES1E), LRES1E, WORK(KT2M1), LT2M1,
1304     &                        WORK(KDSRHF),WORK(KCMO),LCMO,IDEL,ISYMD,
1305     &                        ISYDIS2,ISYMTR,WORK(KEND6),LWORK6)
1306            IF (DO_DEX) THEN
1307C
1308C           Same for the D-part
1309               CALL CCTRBT(WORK(KXINT),WORK(KDSRHF),WORK(KBTR1E),
1310     &                     ISYMTR,WORK(KEND5),LWORK5,ISYDIS)
1311               CALL SO_RES_B26 ( WORK(KRES1D), LRES1D,
1312     &                           WORK(KT2M1), LT2M1,
1313     &                           WORK(KDSRHF),WORK(KCMO),LCMO,
1314     &                           IDEL,ISYMD,
1315     &                           ISYDIS2,ISYMTR,WORK(KEND6),LWORK6)
1316            END IF
1317            DTIME      = SECOND()   - DTIME
1318            SOTIME(16) = SOTIME(16) + DTIME
1319         ENDIF
1320         END SUBROUTINE
1321C
1322         SUBROUTINE LOOP_BODY_TRIPLET
1323C
1324C----------------------------------------------
1325C        Calculate the AO-Fock matrix.
1326C----------------------------------------------
1327C
1328         DTIME      = SECOND()
1329         CALL SO_AOFOCK3(WORK(KXINT),WORK(KDENS),
1330     &                   WORK(KFOCK),
1331     &                   IDEL,ISYMD,ISYMTR)
1332         DTIME      = SECOND()   - DTIME
1333         SOTIME(11) = SOTIME(11) + DTIME
1334C
1335         IF (singles_second) THEN
1336C
1337CRF         A^(2) Doesn't mix spins - Same for singlet and triplet
1338C----------------------------------------------------------------------
1339C           Calculate part of the result vectors RES1E and RES1D,
1340C           specifically the first and the second term in eqs.
1341C           (34,35). Also calculate Aij and Aab in eqs. (43,44).
1342C----------------------------------------------------------------------
1343C
1344            DTIME      = SECOND()
1345            CALL SO_RES_A(WORK(KRES1E),LRES1E,
1346     &                    WORK(KRES1D),LRES1D,
1347     &                    WORK(KTR1E),LTR1E,WORK(KTR1D),LTR1D,
1348     &                    WORK(KDSRHF),LDSRHF,WORK(KCMO),LCMO,
1349     &                    WORK(KT2M1),LT2M1,WORK(KAIJ),LAIJ,
1350     &                    WORK(KAAB),LAAB,INEWTR,ISYMD,ISYDIS,
1351     &                    ISYRES,ISYMTR,DO_DEX,
1352     &                    WORK(KEND5),LWORK5)
1353            DTIME      = SECOND()   - DTIME
1354            SOTIME(14) = SOTIME(14) + DTIME
1355C
1356C--------------------------------------------------------------------
1357C           Transform the partially back-transformed T2 MP-amplitudes
1358C           Currently they are stored as
1359C              t2m1(ai,j) = 2t(ai,j) - t(aj,i)
1360C           In the following we need to have
1361C              t2m1(ai,j) = -t(aj,i)
1362C--------------------------------------------------------------------
1363C
1364            DTIME      = SECOND()
1365            CALL SO_M1SHUF(WORK(KT2M1),LT2M1,ISYMD,1)
1366            DTIME      = SECOND()   - DTIME
1367            SOTIME(12) = SOTIME(12) + DTIME
1368         END IF
1369         IF (DOUBLES.OR.SINGLES_SECOND) THEN
1370C
1371C-------------------------------------------------------------------
1372C        Calculate the part of the result vectors RES1E and
1373C        RES1D which originate from the C matrices. See
1374C                 eqs. (72) and (73).
1375C-------------------------------------------------------------------
1376CRF      The X2M1 vectors are now intermediates, which contain
1377CRF      the the T2*x1 intermediate,
1378CRF      so terms (1) and (5) of B*x1 are calculated here.
1379C
1380            DTIME      = SECOND()
1381            CALL SO_RES_TCB(WORK(KRES1E),LRES1E,
1382     &                      WORK(KRES1D),LRES1D,
1383     &                      WORK(KX2EM1),LX2M1,
1384     &                      WORK(KX2DM1),LX2M1,
1385     &                      WORK(KDSRHF),LDSRHF,
1386     &                      WORK(KCMO),LCMO,IDEL,ISYMD,ISYDIS,
1387     &                      ISYMTR,DO_DEX,
1388     &                      WORK(KEND5),LWORK5)
1389            DTIME      = SECOND()   - DTIME
1390            SOTIME(29) = SOTIME(29) + DTIME
1391
1392         END IF
1393C
1394         IF (DOUBLES) THEN
1395C
1396C----------------------------------------------------------------------
1397C           Construct C-contribution to 2p2h result vectors RES2E
1398C                 and RES2D.
1399C----------------------------------------------------------------------
1400C
1401            DTIME      = SECOND()
1402            CALL SO_RES_CBT(WORK(KRES2E),LRES2E,
1403     &                      WORK(KRES2D),LRES2D,
1404     &                      WORK(KDSRHF),LDSRHF,
1405     &                      WORK(KBTR1E),LBTR1E,
1406     &                      WORK(KBTR1D),LBTR1D,
1407     &                      WORK(KBTJ1E),LBTJ1E,
1408     &                      WORK(KBTJ1D),LBTJ1D,WORK(KCMO),LCMO,
1409     &                      IDEL,ISYMD,ISYDIS,ISYMTR,DO_DEX,
1410     &                      WORK(KEND5),LWORK5)
1411            DTIME      = SECOND()   - DTIME
1412            SOTIME(15) = SOTIME(15) + DTIME
1413C
1414         ENDIF
1415         IF(SINGLES_SECOND) THEN
1416            DTIME   = SECOND()
1417            ISYDIS2 = MULD2H(ISYDIS,ISYMTR)
1418            KEND6 = KEND4 + NDSRHF(MULD2H(ISYMD,ISYMTR))
1419            LWORK6 = LWORK - KEND6
1420C                                     ~
1421C           Calculate ( alpha, beta | j delta)
1422            CALL CCTRBT(WORK(KXINT),WORK(KDSRHF),WORK(KBTR1D),
1423     &                  ISYMTR,WORK(KEND5),LWORK5,ISYDIS)
1424C           Calculate terms 2 and 6 of the B-matrix, by
1425C                     ~
1426C    (2)    - ( c a | j delta ) * T2M1( ci, j)
1427C                     ~
1428C    (6)      ( k i | j delta ) * T2M1( ak, j)
1429C
1430            CALL SO_RES_B26 ( WORK(KRES1E), LRES1E, WORK(KT2M1), LT2M1,
1431     &                        WORK(KDSRHF),WORK(KCMO),LCMO,IDEL,ISYMD,
1432     &                        ISYDIS2,ISYMTR,WORK(KEND6),LWORK6)
1433C
1434            IF (DO_DEX) THEN
1435C           Same for the D-part
1436               CALL CCTRBT(WORK(KXINT),WORK(KDSRHF),WORK(KBTR1E),
1437     &                    ISYMTR,WORK(KEND5),LWORK5,ISYDIS)
1438               CALL SO_RES_B26 (WORK(KRES1D), LRES1D,
1439     &                          WORK(KT2M1), LT2M1,
1440     &                          WORK(KDSRHF),WORK(KCMO),LCMO,IDEL,ISYMD,
1441     &                          ISYDIS2,ISYMTR,WORK(KEND6),LWORK6)
1442            END IF
1443            DTIME      = SECOND()   - DTIME
1444            SOTIME(16) = SOTIME(16) + DTIME
1445         ENDIF
1446
1447         END SUBROUTINE
1448
1449         SUBROUTINE TRANS_FCK(FOCK,ISYMFCK)
1450            DOUBLE PRECISION, INTENT(INOUT) :: FOCK(*)
1451            INTEGER, INTENT(IN) :: ISYMFCK
1452
1453            INTEGER :: IA, IB, ISIZE, NUMA, NUMB
1454            INTEGER :: ISYMA, ISYMB, IOFF, IOFF1, IOFF2
1455            DOUBLE PRECISION :: TMP
1456
1457            IF ( ISYMFCK .EQ. 1) THEN
1458               DO ISYMA = 1, NSYM
1459                  IOFF = IAODIS(ISYMA,ISYMA)
1460                  DO IA = 2, NBAS(ISYMA)
1461                     DO IB = 1, IA-1
1462                        IPOS1 = (IA-1)*NBAS(ISYMA)+IB+IOFF
1463                        IPOS2 = (IB-1)*NBAS(ISYMA)+IA+IOFF
1464                        TMP = FOCK(IPOS1)
1465                        FOCK(IPOS1) = FOCK(IPOS2)
1466                        FOCK(IPOS2) = TMP
1467                     END DO
1468                  END DO
1469               END DO
1470            ELSE
1471               DO ISYMA = 1, NSYM
1472                  ISYMB = MULD2H(ISYMFCK,ISYMA)
1473                  IF (ISYMB .GT.ISYMA) CYCLE
1474                  NUMA = NBAS(ISYMA)
1475                  NUMB = NBAS(ISYMB)
1476                  IOFF1 = IAODIS(ISYMA,ISYMB)
1477                  IOFF2 = IAODIS(ISYMB,ISYMA)
1478                  ISIZE = NUMA*NUMB
1479                  DO IB = 1, NUMB
1480                     DO IA = 1, NBAS(ISYMA)
1481                        IDX1 = IOFF1 + NUMA*(IB-1) +IA
1482                        IDX2 = IOFF2 + NUMB*(IA-1) +IB
1483                        TMP = FOCK(IDX1)
1484                        FOCK(IDX1) = FOCK(IDX2)
1485                        FOCK(IDX2) = TMP
1486                     END DO
1487                  END DO
1488               END DO
1489            ENDIF
1490
1491         END SUBROUTINE
1492
1493         SUBROUTINE SO_AOFOCK3(XINT,DENSIT,FOCK,
1494     &                         IDEL,ISYMD,ISYDEN)
1495
1496#include "symsq.h"
1497
1498            INTEGER,INTENT(IN) ::  IDEL, ISYMD,ISYDEN
1499            DOUBLE PRECISION,INTENT(IN)    :: XINT(*), DENSIT(*)
1500            DOUBLE PRECISION,INTENT(INOUT) :: FOCK(*)
1501
1502            INTEGER :: ISYMBG, ISYMAB,ISYMG,ISYDIS,ISYMB,ISYMA
1503            INTEGER :: IG, IA, IB
1504            INTEGER :: KOFFINT, KOFFINT1, KOFFDEN, KOFFOUT
1505            INTEGER :: NOFFB, NOFFA
1506            DOUBLE PRECISION :: TMP
1507
1508            ISYMBG = ISYDEN
1509            ISYDIS = ISYMD
1510            ISYMA  = MULD2H(ISYDIS,ISYMBG)
1511            KOFFOUT = IAODIS(ISYMA,ISYMD) + NBAS(ISYMA)*
1512     &                (IDEL- IBAS(ISYMD) - 1)
1513
1514            DO ISYMG = 1, NSYM
1515               ISYMB = MULD2H(ISYMBG,ISYMG)
1516               ISYMAB = MULD2H(ISYMA,ISYMB)
1517               KOFFINT1 = IDSAOG(ISYMG,ISYDIS) + IAODPK(ISYMA,ISYMB)
1518
1519               IF(ISYMAB.EQ.1) THEN ! integrals stored as triangle
1520                  DO IG = 1, NBAS(ISYMG)
1521                     KOFFINT = KOFFINT1 + (IG - 1) *NNBST(ISYMAB)
1522                     KOFFDEN = IAODIS(ISYMB,ISYMG) + NBAS(ISYMB)*(IG-1)
1523                     ! alpha =< beta :
1524                     DO IB = 1, NBAS(ISYMB)
1525                        NOFFB = IB*(IB-1)/2 + KOFFINT
1526                        DO IA = 1, IB
1527                           FOCK(KOFFOUT+IA) = -XINT(NOFFB+IA)
1528     &                                        *DENSIT(KOFFDEN+IB)
1529     &                                        +FOCK(KOFFOUT+IA)
1530                        END DO
1531                     END DO
1532
1533                     ! alpha > beta
1534                     DO IA = 2, NBAS(ISYMA)
1535                        NOFFA = IA*(IA-1)/2 + KOFFINT
1536                        TMP = 0.0D0
1537                        DO IB = 1, IA - 1
1538                           TMP = XINT(NOFFA+IB)*DENSIT(KOFFDEN+IB) + TMP
1539                        END DO
1540                        FOCK(KOFFOUT+IA) = FOCK(KOFFOUT+IA) - TMP
1541                     END DO
1542                 END DO ! LOOP IG
1543
1544               ELSEIF(ISYMA .LT.ISYMB) THEN
1545                  ! Stored as alpha, beta
1546                  DO IG = 1, NBAS(ISYMG)
1547                     KOFFINT = KOFFINT1 + (IG - 1) *NNBST(ISYMAB)
1548                     KOFFDEN = IAODIS(ISYMB,ISYMG) + NBAS(ISYMB)*(IG-1)
1549
1550                     ! Loop B first
1551                     DO IB = 1, NBAS(ISYMB)
1552                        NOFFB = NBAS(ISYMA)*(IB-1) + KOFFINT
1553                        DO IA = 1, NBAS(ISYMA)
1554                           FOCK(KOFFOUT+IA) = -XINT(NOFFB+IA)
1555     &                                        *DENSIT(KOFFDEN+IB)
1556     &                                        +FOCK(KOFFOUT+IA)
1557                        END DO
1558                     END DO
1559                  END DO
1560               ELSE ! ISYMA .GT. ISYMB
1561                  ! Stored as beta, alpha
1562                  DO IG = 1, NBAS(ISYMG)
1563                     KOFFINT = KOFFINT1 + (IG - 1) *NNBST(ISYMAB)
1564                     KOFFDEN = IAODIS(ISYMB,ISYMG) + NBAS(ISYMB)*(IG-1)
1565                     ! LOOP A first
1566                     DO IA = 1, NBAS(ISYMA)
1567                        NOFFA = NBAS(ISYMB)*(IA-1) + KOFFINT
1568                        TMP = 0.0D0
1569                        DO IB = 1, NBAS(ISYMB)
1570                           TMP = XINT(NOFFA+IB)*DENSIT(KOFFDEN+IB) + TMP
1571                        END DO
1572                        FOCK(KOFFOUT+IA) = FOCK(KOFFOUT+IA) - TMP
1573                     END DO
1574                  END DO ! IG
1575               END IF
1576            END DO ! ISYMG
1577
1578         END SUBROUTINE
1579C
1580         SUBROUTINE SO_SYM_DENS(DENS_SQ,DENS_SYM,ISYDENS)
1581            !                           ~
1582            !  Form the symmetric array D(alpha,beta) from
1583            !  the array D(alpha, beta) and store it in packed form
1584            !  (alpha =< beta).
1585            !  ~
1586            !  D(alpha,beta) = D(alpha,beta)+ D(beta,alpha)
1587            !  ~
1588            !  D(alpha,alpha) = D(alpha,alpha)
1589            !
1590            !  D has symmetry ISYDENS
1591#include "symsq.h"
1592            DOUBLE PRECISION, INTENT(IN) :: DENS_SQ(*)
1593            DOUBLE PRECISION, INTENT(OUT):: DENS_SYM(*)
1594            INTEGER, INTENT(IN)          :: ISYDENS
1595
1596            INTEGER :: ISYMA, ISYMB, IA, IB
1597            INTEGER :: NUMA, NUMB, IOFFA, IOFFB, IOFFPK, IOFFSQ
1598            INTEGER :: IOFF1, IOFF2,IOFFSQ1,IOFFSQ2
1599
1600            IF (ISYDENS.EQ.1) THEN
1601               ! Totally symmetric case, Triangular storage
1602               DO ISYMA = 1, NSYM
1603                  IOFFPK = IAODPK(ISYMA,ISYMA)
1604                  IOFFSQ = IAODIS(ISYMA,ISYMA)
1605                  NUMA = NBAS(ISYMA)
1606                  NUMB = NBAS(ISYMA)
1607                  DO IB = 1, NUMB
1608                     IOFF1 = IB*(IB-1)/2 + IOFFPK
1609                     IOFFB = (IB-1)*NUMA + IOFFSQ
1610                     DO IA = 1, IB-1
1611                        IOFFA = (IA-1)*NUMB + IOFFSQ
1612                        DENS_SYM(IOFF1+IA) = DENS_SQ(IOFFB+IA) +
1613     &                                       DENS_SQ(IOFFA+IB)
1614                     END DO
1615                     DENS_SYM(IOFF1+IB) = DENS_SQ(IOFFB+IB)
1616                  END DO
1617               END DO
1618            ELSE ! Else we deal with rectangular offdiagonal blocks
1619               DO ISYMB = 1, NSYM
1620                  ISYMA = MULD2H(ISYMB,ISYDENS)
1621                  IF (ISYMA.GT.ISYMB) CYCLE
1622                  IOFFPK=IAODPK(ISYMA,ISYMB)
1623                  IOFFSQ1 = IAODIS(ISYMB,ISYMA)
1624                  IOFFSQ2 = IAODIS(ISYMA,ISYMB)
1625                  NUMA = NBAS(ISYMA)
1626                  NUMB = NBAS(ISYMB)
1627                  DO IB = 1, NUMB
1628                     IOFF1 = (IB-1)*NUMA + IOFFPK
1629                     IOFFB = (IB-1)*NUMA + IOFFSQ2
1630                     DO IA = 1, NUMA
1631                        IOFFA = (IA-1)*NUMB + IOFFSQ1
1632                        DENS_SYM(IOFF1+IA) = DENS_SQ(IOFFB+IA) +
1633     &                                       DENS_SQ(IOFFA+IB)
1634                     END DO
1635                  END DO
1636               END DO
1637            ENDIF
1638
1639         END SUBROUTINE
1640
1641         SUBROUTINE SO_AOFOCK1(XINT,DENSIT,FOCK,WORK,LWORK,
1642     &                         IDEL,ISYMD,ISYDEN)
1643            !  Calculates here
1644            !  2 ( alpha, beta | gamma ; delta) * D( alpha, beta) =>
1645            !                 F ( gamma ; delta)
1646            !
1647            !  and in call to SO_AOFOCK3 :
1648            !  - ( alpha, beta | gamma ; delta) * D( beta, gamma) =>
1649            !                 F (alpha ; delta)
1650            !
1651            DOUBLE PRECISION, PARAMETER  :: TWO = 2.0D0, ONE =1.0D0
1652
1653            INTEGER,INTENT(IN) ::  IDEL, ISYMD,ISYDEN, LWORK
1654            DOUBLE PRECISION,INTENT(IN)    :: XINT(*), DENSIT(*)
1655            DOUBLE PRECISION,INTENT(INOUT) :: FOCK(*), WORK(LWORK)
1656
1657            INTEGER :: ISYDIS, ISYMG
1658            INTEGER :: NUMG, NAB, KGAM, KOUT
1659
1660            ! First do the singlet-only term
1661            ISYDIS = ISYMD
1662            ISYMG  = MULD2H(ISYDIS,ISYDEN)
1663            NUMG = NBAS(ISYMG)
1664            IF (NUMG .GE. 1) THEN
1665               !Symmetrice AO-DENSITY and compress to packed form (could be done outside)
1666               CALL SO_SYM_DENS(DENSIT,WORK,ISYDEN)
1667
1668               ! Position of (alpha =< beta | gamma; delta ) block for this
1669               ! symmetry of gamma
1670               KGAM = IDSAOG(ISYMG,ISYDIS) + 1
1671               ! Position of the delta'th colunm in output
1672               KOUT = IAODIS(ISYMG,ISYMD) +
1673     &                NBAS(ISYMG)*(IDEL-IBAS(ISYMD)-1) + 1
1674               NAB  = MAX( 1, NNBST(ISYDEN) )
1675
1676               CALL DGEMV('T',NNBST(ISYDEN),NUMG,TWO,XINT(KGAM),NAB,
1677     &                    WORK,1,ONE,FOCK(KOUT),1)
1678            END IF
1679
1680            ! Do the term also appearing in the triplet case
1681            CALL SO_AOFOCK3(XINT,DENSIT,FOCK,IDEL,ISYMD,ISYDEN)
1682
1683
1684         END SUBROUTINE
1685
1686      END SUBROUTINE
1687
1688C
1689#ifdef VAR_MPI
1690C
1691C
1692      subroutine dynloadbal_parsoppa( localIndices, maxnumjobs,
1693     &                                timings, ltimings,
1694     &                                sortedindices, numassignjobs,
1695     &                                sumofwork)
1696C    Dynamic Load Balancing for the parallel SOPPA calculations (and parallel RPA).
1697C    The routine assumed that the MPI processes that does the load balancing is the master. If any other routine enters, there will be issues with the updated ILLL indices in the AssignedIndices array.  Right now, any slave that enters is immediately evicted from the routine.
1698C
1699C The routine takes an array of timings as input. The timings are creatd on the fly by every parallel process and the index corresponds to the ILLL index in the parallel calculation. The time is given in integers from calling system_clock. The timings are not given in any human-readable form. This choice was made so that one can reuse the subroutine getallocsize, which relies on an integer input array.
1700C The work is resorted once the actual time associated with an ILLL index is known and the indices are rebalanced once more and sent to all slaves.
1701C The improvement over the presorted balancing scheme is expected to be very small, but gets better the larger the basis set.
1702C
1703C F.Beyer Oct. 2014.
1704
1705      use so_parutils, only : soppa_comm_active, soppa_num_active,
1706     &                        soppa_nint, my_mpi_integer, sop_master
1707
1708      use so_info, only: sop_dp
1709
1710      implicit none
1711#include "mpif.h"
1712#include "maxorb.h"
1713#include "distcl.h"
1714#include "priunit.h"
1715
1716      ! Dummy parameters
1717      double precision, dimension(ltimings), intent(inout) :: timings
1718      integer, intent(inout) :: localIndices(maxnumjobs)
1719      integer  :: sortedindices(maxnumjobs,soppa_num_active)
1720      integer  :: numassignjobs(soppa_num_active)
1721      real(sop_dp) :: sumofwork(soppa_num_active)
1722      integer  :: ltimings, maxnumjobs
1723
1724      ! Bookkeeping
1725      integer :: maxrows, maxcols, numrecipients
1726      integer :: getnumjobs, targetID, myid
1727      integer :: colindex, rowindex, assignILLL, col,i
1728      integer :: ntot
1729
1730      integer(mpi_integer_kind) :: ierr_mpi, maxnum_mpi
1731
1732      ntot = soppa_nint
1733      ! In case the amount of work is smaller than
1734      ! the number of MPI processes...
1735      maxcols = soppa_num_active
1736
1737      ! Explicitly set the maximum number of jobs a single MPI process can be allocated.
1738      maxrows = maxnumjobs
1739
1740      sortedindices(:,:) = 0
1741      numassignjobs(:) = 0
1742      sumofwork(:) = 0.0D0
1743
1744
1745      DO i=1, ntot
1746              ! Find the largest chunk of available work
1747              assignILLL = maxloc(timings,dim=1, mask=timings.ge.0.0D0)
1748
1749              ! Find the laziest slave
1750              colindex = minloc(sumofwork, dim=1)
1751
1752              ! Write the ILLL index to the correct row in the sortedmatrix
1753              numassignjobs(colindex) = numassignjobs(colindex) + 1
1754              rowindex = numassignjobs(colindex)
1755              sortedindices(rowindex, colindex) = assignILLL
1756
1757              ! Update the slave's expected work/walltime and
1758              ! remove the work from the timings array
1759              sumofwork(colindex) = sumofwork(colindex)
1760     &                             +timings(assignILLL)
1761              timings(assignILLL) = -1.0D0
1762
1763      ENDDO
1764
1765C     PRINT HOW WORK IS DISTRIBUTED
1766      write (LUPRI,'(a)') 'AOSOPPA Work distribution'
1767      write (LUPRI,'(a)') 'NODE       Expected time '
1768      do i = 1, maxcols
1769           write (LUPRI, '(i5,f20.5)') i, sumofwork(i)
1770      enddo
1771      ! Send the info to every slave that does computation (some slaves might be stalled in the polling barrier in case there are too many MPI processes compared to the number of tasks).
1772      call mpi_scatter( sortedindices, maxnumjobs, my_mpi_integer,
1773     &                  localIndices,  maxnumjobs, my_mpi_integer,
1774     &                  sop_master, soppa_comm_active, ierr_mpi)
1775
1776      ! Update the master's own array of ILLL indices
1777      ! should be done by the scatter
1778
1779      return
1780      end subroutine
1781
1782
1783
1784
1785
1786
1787      subroutine presortloadbal_parsoppa(localIndices, maxnumjobs,
1788     &           indexb,
1789     &           work, lwork)
1790C     This subroutine load balances parallel SOPPA/RPA
1791C     calculation.
1792C
1793C     The routine makes a best guess of loadbalancing by giving
1794C     every MPI process an equal number of distributions to handle.
1795C     Testing shows this is a better first guess that giving every
1796C     MPI process an equal number of ILLL indices to work with
1797C     (since there is a different number of distributions
1798C     associated with every ILLL index).
1799C
1800C     F.Beyer Oct. 2014.
1801
1802      use so_parutils, only: soppa_comm_active, soppa_nint, sop_master,
1803     &                       soppa_num_active, my_mpi_integer
1804
1805      implicit none
1806C#include "implicit.h"
1807#include "priunit.h"
1808#include "mpif.h"
1809#include "maxorb.h"
1810#include "iratdef.h"
1811C fetch HERDIR
1812#include "ccsdinp.h"
1813C     Dummy parameters
1814      integer, intent(in) :: lwork, maxnumjobs
1815      ! Declare work as integer, to avoid "irats" later
1816      integer :: work(irat*lwork) !intent in
1817      integer :: indexb(*)
1818      integer, dimension(maxnumjobs), intent(out) :: localIndices
1819
1820C     Pre-sorting load balancing variables
1821      integer :: getnumjobs, getindices, numprocs
1822      INTEGER(MPI_INTEGER_KIND) :: ierr_mpi
1823C      integer :: presortarray, finalsorted
1824      integer :: col, ntot
1825      integer :: kpresortarray, kend, kfinalsorted, ktmp
1826
1827      ntot = soppa_nint
1828
1829      ! numprocs = soppa_num_active !! not consistent with other use
1830!      call mpi_comm_size(soppa_comm_active, numprocs, ierr_mpi)
1831      numprocs = soppa_num_active
1832
1833
1834      IF (.NOT. HERDIR) THEN
1835C     ERI code
1836
1837C     FIND THE AMOUNT OF WORK ASSOCIATED WITH EVERY AO INDEX
1838         kpresortarray = 1
1839         kend = kpresortarray + 2* ntot
1840         call presortaodist(ntot, indexb, work(kpresortarray) )
1841
1842
1843C     CREATE THE FINAL MATRIX OF PRE-SORTED AO INDICES
1844         kfinalsorted = kend
1845         ktmp = kfinalsorted + maxnumjobs * numprocs
1846         kend = ktmp + 2*numprocs
1847         if( kend .gt. lwork ) then
1848            call quit('Insufficient memory in presorted loadbalancer!')
1849         endif
1850         call partitionAOindices(ntot, maxnumjobs, numprocs,
1851     &              work(kpresortarray), work(kfinalsorted),
1852     &              work(ktmp) )
1853      else
1854C     HERDIR code
1855         kfinalsorted = 1
1856         kend = kfinalsorted + maxnumjobs * numprocs
1857         if( kend .gt. lwork ) then
1858            call quit('Insufficient memory in presorted loadbalancer!')
1859         endif
1860         call herdir_presort( work(kfinalsorted), maxnumjobs)
1861      endif
1862
1863
1864C     TRANSFER AO INDICES TO SLAVES
1865C     Use scatter
1866      call mpi_scatter( work(kfinalsorted), maxnumjobs, my_mpi_integer,
1867     &                  localIndices,       maxnumjobs, my_mpi_integer,
1868     &                  sop_master, soppa_comm_active, ierr_mpi )
1869
1870      return
1871      end subroutine
1872
1873
1874      subroutine herdir_presort ( sorted, maxnumjobs )
1875C     This is a subroutine for doing the initial distribution of the
1876C     AO-indices in a parallel SOPPA calculation.
1877C     This subroutine works with Hermite (HERDIR) integral generator,
1878C     as the code by F. Beyer only supports using ERI
1879C
1880      use so_parutils, only: soppa_num_active, soppa_nint
1881      implicit none
1882      integer, intent(out) :: sorted( maxnumjobs, soppa_num_active )
1883      integer, intent(in)  :: maxnumjobs
1884      integer              :: intnum, inext, icol, inum
1885
1886      sorted = 0
1887      inext = 0
1888      icol = 1
1889C     For now just do it the stupid way ( round robin )
1890
1891C     initial implementation, is this what chokes ifort?
1892C      do intnum = 1, soppa_nint
1893C         inext = inext + 1
1894C         if ( inext .gt. soppa_num_active ) then
1895C             inext = 1
1896C             icol = icol + 1
1897C         endif
1898C         sorted ( icol, inext ) = intnum
1899C      enddo
1900
1901C     Distribute first an even number
1902      inum = soppa_nint/soppa_num_active
1903      do icol = 1, soppa_num_active
1904         do inext = 1, inum
1905            sorted( inext, icol ) = inext + inum*(icol-1)
1906         enddo
1907      enddo
1908C     Distribute the remainder
1909      do icol = 1, mod(soppa_nint, soppa_num_active)
1910         sorted ( inum+1, soppa_num_active-icol+1) =
1911     &            inum*soppa_num_active + icol
1912      enddo
1913C     Debug print
1914      do inext = 1, soppa_num_active
1915         print '(10i3)', sorted(:,inext)
1916      enddo
1917      return
1918      end subroutine
1919
1920
1921
1922      subroutine pollingbarrier(pollinginterval)
1923C     This subroutine is implemented in case there is ever a need to
1924C     remove certain processes from a calculation.
1925C
1926C     The processes will repeatedly poll until they receive a non-blocking
1927C     send from the master with a logical value equal to .true.
1928C
1929C     F.Beyer Oct. 2014
1930
1931#ifdef VAR_MPI
1932      use so_parutils, only: my_mpi_logical
1933#endif
1934
1935#include "implicit.h"
1936#include "mpif.h"
1937
1938      integer, dimension(MPI_STATUS_SIZE) :: mpistatus
1939      integer, intent(in)                 :: pollinginterval !in milliseconds
1940      logical                             :: flag, exitbarrier
1941      volatile                            :: exitbarrier
1942      INTEGER(MPI_INTEGER_KIND) :: ierr_mpi, myid, request
1943
1944      call mpi_comm_rank(mpi_comm_world, myid, ierr_mpi)
1945      print *, 'I am in the polling barrier ', myid
1946
1947      ! Initiate the polling variables and ask for the non-blocking update
1948      exitbarrier = .false.
1949      call mpi_irecv(exitbarrier, 1, my_mpi_logical, 0, myid,
1950     &               mpi_comm_world, request, ierr_mpi)
1951
1952      ! Polling barrier, the process will cycle repeatedly until released.
1953130   continue
1954#ifdef VAR_IFORT
1955      call sleepqq(pollinginterval)
1956#endif
1957      call mpi_test(request, flag, mpistatus, ierr_mpi)
1958      if (.not. flag) goto 130
1959      ! Warning, seems that we'll go into an infinite loop,
1960      ! if exitbarrier is ever sent as .false.
1961      if (.not.exitbarrier) then
1962         goto 130 ! Cycle to the top of the barrier.
1963      else
1964         return
1965      endif
1966
1967      return
1968      end subroutine
1969
1970C /* deck presortaodist*/
1971      subroutine presortaodist(Nindex, indxbt, outlist)
1972C A subroutine associated with the atomic integral parallel RPA/SOPPA calculations.
1973C Pre-calculate IDEL2 indexes before starting a parallel calculation.
1974C In other words, this subroutine calculates how many distributions are associated with the calculation.
1975C
1976C This routine assembles a matrix that counts the number of AOs
1977C associated with an ILLL distribution index. This array is used by getallocsize
1978C and partitionAOindices to pre-sort the integrals that need to be done
1979C
1980C The first row in outlist is the number of distributions
1981C The second row in outlist is the associated ILLL index
1982#include "implicit.h"
1983#include "priunit.h"
1984#include "maxaqn.h"
1985#include "maxorb.h"
1986#include "mxcent.h"
1987#include "eridst.h"
1988
1989      integer,                       intent(in) :: Nindex
1990      integer, dimension(*),         intent(in) :: indxbt
1991      integer, dimension(2, Nindex), intent(out):: outlist
1992      integer :: i
1993
1994      do i=1, Nindex
1995         call getdst(i, 0, 0)
1996         call pickao(0)
1997         call eridsi(indxbt, 0)
1998         outlist(1, i) = ndistr
1999         outlist(2, i) = i
2000      enddo
2001
2002      return
2003
2004      end subroutine
2005
2006
2007C     /* deck getallocsize */
2008      SUBROUTINE getallocsize(ntot, originalsort, maxnumjobs)
2009C A subroutine associated with the atomic integral parallel RPA/SOPPA calculations.
2010C
2011C This subroutine is used to get the amount of storage that needs
2012C to be allocated for a parallel SO_ERES run.
2013C
2014C The subroutine calculates which process will be assigned the
2015C most single jobs (not the largest total amount of work) for a
2016C parallel RPA/SOPPA calculation. Its output is used to allocate
2017C the right amount of storage by the master when it starts pre-sorting
2018C the integrals for the parallel calculation of the E matrix.
2019C
2020
2021#include "implicit.h"
2022#include "mpif.h"
2023      integer,                     intent(in)  :: ntot
2024      integer, dimension(2, ntot), intent(in)  :: originalsort
2025      integer,                     intent(out) :: maxnumjobs
2026
2027      integer, dimension(:,:), allocatable  :: copysort
2028      integer, dimension(:,:), allocatable  :: sumofwork
2029      integer, dimension(2) :: temploc, tempwork, tempout
2030      integer :: allocstatus, deallocstatus, numprocs
2031      INTEGER(MPI_INTEGER_KIND) :: ierr_mpi, numprocs_mpi
2032
2033
2034
2035      call mpi_comm_size(mpi_comm_world, numprocs_mpi, ierr_mpi)
2036      numprocs = numprocs_mpi
2037      allocate( copysort(2, ntot), sumofwork(2, numprocs)
2038     &         ,stat=allocstatus)
2039      if(.not.(allocstatus.eq.0) ) then
2040         call quit('Allocation error in GETALLOCSIZE')
2041      endif
2042
2043      !call izero(sumofwork, (2*numprocs) )
2044      sumofwork = 0
2045      copysort = originalsort
2046
2047      DO i=1, ntot
2048         ! Find location of largest chunk of work and the work itself
2049         temploc = maxloc(copysort, DIM=2, mask=copysort.gt.0)
2050         addwork = copysort(1, temploc(1) )
2051         copysort( 1,temploc(1) ) = 0
2052
2053         ! Find laziest slave and simulate the workload on the slave
2054         tempwork = minloc(sumofwork, DIM=2)
2055         sumofwork(1, tempwork(1)) = sumofwork(1, tempwork(1)) + addwork! adding total number of distributions
2056         sumofwork(2, tempwork(1)) = sumofwork(2, tempwork(1)) + 1  !adding total number of assigned indexes
2057      ENDDO
2058
2059      tempout = maxloc(sumofwork, dim=2)
2060      maxnumjobs = sumofwork( 2,tempout(2) )
2061
2062      deallocate(copysort, sumofwork, stat=deallocstatus)
2063      if(.not.(deallocstatus.eq.0) ) then
2064         call quit('Deallocation error in GETALLOCSIZE')
2065      endif
2066
2067      return
2068
2069      END SUBROUTINE
2070
2071
2072C     /* deck partitionAOindices */
2073      SUBROUTINE  partitionAOindices(ntot, rows, cols, presortedarray,
2074     &                               sorted, sumofwork)
2075C A subroutine associated with the atomic integral parallel RPA/SOPPA calculations.
2076C
2077C The output from this routine is the array 'sorted'.
2078C The sorted matrix contains AO integral indexes for two-electron integrals.
2079C Every column contains a list of ILLL indexes that are to be assigned as work
2080C to a single process. The total amount of work per column is estimated based
2081C on the number of distributions that are related to the ILLL indexes in the column.
2082C
2083C This pre-sorting of ILLL indexes for a parallel calculation approximates an
2084C even distribution of total work for all processes when performing two-electron
2085C integrals in parallel for AOSOPPA and AORPA.
2086C
2087      !use mpi
2088      implicit none
2089#include "mpif.h"
2090
2091      integer,                        intent(in)    :: ntot, rows, cols
2092      integer, dimension(2, ntot),    intent(inout) :: presortedarray
2093      integer, dimension(rows, cols), intent(out)   :: sorted
2094
2095      integer, dimension(2, cols), intent(out) :: sumofwork
2096      integer, dimension(2) :: tempavail, templazy
2097      integer :: numprocs, availloc, targetrow
2098      integer :: allocstatus, deallocstatus
2099      integer :: lazyloc, aoindex, i, numdists
2100
2101      numprocs = cols
2102
2103      !call izero(sorted, (rows*cols) )
2104      !call izero(sumofwork, (2*numprocs) )
2105      sorted = 0
2106      sumofwork = 0
2107
2108      DO i=1, ntot
2109C        FIND LARGEST CHUNK OF AVAILABLE WORK AND ITS INDEX
2110         tempavail = maxloc(presortedarray, DIM=2,
2111     &                      mask=presortedarray.gt.0)
2112         availloc = tempavail(1)
2113         numdists = presortedarray(1, availloc) ! amount of available work
2114         aoindex  = presortedarray(2, availloc) ! the index to be passed to process
2115
2116         presortedarray(1, availloc) = 0
2117         presortedarray(2, availloc) = 0
2118
2119C        FIND THE LAZIEST PROCESS, GIVE IT WORK & INCREMENT THE ROW COUNTER
2120         templazy = minloc(sumofwork, DIM=2)
2121         lazyloc = templazy(1) ! This is equal to: MYID+1
2122         sumofwork( 1,lazyloc ) = sumofwork( 1,lazyloc ) + numdists
2123         sumofwork( 2,lazyloc ) = sumofwork( 2,lazyloc ) + 1
2124         targetrow = sumofwork( 2,lazyloc )
2125
2126C        ADD AO-INDEX TO FIRST AVAILABLE ROW IN THE LAZY SLAVE'S COLUMN
2127         sorted( targetrow, lazyloc ) = aoindex
2128      ENDDO
2129
2130      return
2131
2132      END SUBROUTINE
2133
2134
2135#endif
2136!VAR_MPI at the beginning of dynloadbal_parsoppa
2137