1#if HAVE_CONFIG_H
2#   include "config.fh"
3#endif
4      program scf
5C$Id: scf.F,v 1.18 2007/03/23 19:24:36 d3g293 Exp $
6      implicit double precision (a-h, o-z)
7#include "cscf.h"
8#include "mafdecls.fh"
9#include "global.fh"
10c
11      integer*8 nints, maxint                                           !cste
12c
13c     CAUTION: integer precision requirements
14c     nints, maxint, etc. are proportional to the number of basis functions
15c     to the fourth power! 216**4 is greater than the largest number
16c     that can be represented as a 32-bit signed interger, so 64-bit
17c     arithmetic is needed to count integrals when calculating more than
18c     14 Be atoms with 15 basis functions each. Since integrals are counted
19c     over all iterations, 18 iterations with 7 atoms can result in precision
20c     problems. Note that the wave function could be calculated correctly
21c     for much larger basis sets without 64-bit integers because the required
22c     indexing is usually proportional to nbfn**2, which is good to 46,340
23c     basis functions, except that the task counter runs as (nbfn/ichunk)**4,
24c     so with ichunk = 10, 32-bit integers yield correct wavefunctions out to
25c     2145 basis functions (maxatom=143), or 4290 (maxatom=286) with ichunk = 20, ...
26c
27c     This warning applies to the Global Arrays implementation as well!
28c     functions of special concern are ga_igop and nga_read_inc.
29c
30#define USE_TRANSFORM 1
31      integer heap, stack
32      data tinit, tonel, ttwoel, tdiag, tdens, tprint /6*0.0d0/
33      data eone, etwo, energy, deltad /4*0.0d0/
34c
35c     initalize the parallel message passing environment
36c
37#include "mp3.fh"
38      call ga_initialize()
39c
40c   Allocate memory
41c
42      heap =  32000000
43      stack = 32000000
44      if (.not.ma_init(MT_DBL, stack, heap))
45     +   call ga_error("ma_init failed",-1)
46      call flush(6)
47      me = ga_nodeid()
48      nproc = ga_nnodes()
49c
50c     initialize a bunch of stuff and initial density matrix
51c
52      rjunk = timer()
53c
54c     get input from file be.inpt
55c
56      call input
57c
58c     create and allocate global arrays
59c
60      call setarrays
61      if (ga_nodeid().eq.0) write(6,*) 'bytes of memory used by node 0:'
62     +                                 ,ga_inquire_memory()
63      call ininrm
64c
65c     create initial guess for density matrix by using single atom
66c     densities
67c
68      call denges
69      tinit = timer()
70#if USE_TRANSFORM
71c
72c     make initial orthogonal orbital set for solution method using
73c     similarity transform
74c
75      call makeob
76#endif
77c
78c     make info for sparsity test
79c
80      call makesz(schwmax)
81c
82c     print preliminary data before any long compute segments start
83      if (ga_nodeid().eq.0) call flush(6)
84c
85c     *** iterate ***
86c
87      do 10 iter = 1, mxiter
88c
89c     make the one particle contribution to the fock matrix
90c     and get the partial contribution to the energy
91c
92         call oneel(schwmax, eone)
93         tonel = tonel + timer()
94c
95c     compute the two particle contributions to the fock matrix and
96c     get the total energy.
97c
98         call twoel(schwmax, etwo)
99         ttwoel = ttwoel + timer()
100c
101c     Diagonalize the fock matrix. The diagonalizers used in this
102c     subroutine are actually sequential, not parallel.
103c
104         call diagon(tester,iter)
105         tdiag = tdiag + timer()
106c
107c     make the new density matrix in g_work from orbitals in g_orbs,
108c     compute the norm of the change in the density matrix and
109c     then update the density matrix in g_dens with damping.
110c
111         call makden
112         deltad = dendif()
113         if (iter.eq.1) then
114            scale = 0.0d0
115         else if (iter .le. 5) then
116            if (nbfn .gt. 60) then
117               scale = 0.5d0
118            else
119               scale = 0.0d0
120            endif
121         else
122            scale = 0.0d0
123         endif
124         call damp(scale)
125         tdens = tdens + timer()
126c
127c     add up energy and print out convergence information
128c
129         if (me.eq.0) then
130           energy = enrep + eone + etwo
131           call prnout(iter, energy, deltad, tester)
132           tprint = tprint + timer()
133         endif
134c
135c     if converged then exit iteration loop
136c
137         if (deltad .lt. tol) goto 20
138         call ga_igop(9, icut4, 1, '+')                         !cste
139         if(icut4 .eq. 0) then                                  !cste
140c     something has gone wrong--print what you know and quit.
141            write(6,*) 'no two-electron integrals computed!'    !cste
142            goto 20                                             !cste
143         endif                                                  !cste
144 10   continue
145      iter = iter - 1                                           !cste
146      if(me.eq.0)
147     $     write(6,*) ' SCF failed to converge in ', iter, ' iters'
148c...v....1....v....2....v....3....v....4....v....5....v....6....v....7..
149c
150c     finished ... print out eigenvalues and occupied orbitals
151c
152 20   continue
153      call ga_igop(6, icut1, 1, '+')
154      call ga_igop(7, icut2, 1, '+')
155      call ga_igop(8, icut3, 1, '+')
156      if (me.eq.0) then
157c
158c     print out timing information
159c
160         call prnfin(energy)
161         write(6,1) tinit, tonel, ttwoel, tdiag, tdens, tprint,
162     $        nproc
163 1       format(/5x,' init ',4x,' onel ',4x,' twoel ',4x,' diag ',4x,
164     $          ' dens      print  ncpu'/
165     $           5x,'------',4x,'------',4x,'-------',4x,'------',4x,
166     $          '------    ------ ------'/
167     $          2f10.2,f11.2,3f10.2, i7/)
168         totsec = tinit+tonel+ttwoel+tdiag+tdens+tprint
169         write(6,*)'elapsed time in seconds ',totsec
170c
171c     print out information on # integrals evaluated each iteration
172c
173
174          nints = icut1+icut2+icut3
175                  frac  = dble(icut3)/dble(nints)
176
177               write(6,2) icut1, icut2, icut3, nints, frac
178 2       format(/'No. of integrals screened or computed (all iters) '
179     $          /'-------------------------------------'/
180     $        /1x,' failed #ij test failed #kl test',
181     $            '        #compute          #total',
182     $            ' fraction',
183     $        /1x,' --------------- ---------------',
184     $            ' --------------- ---------------',
185     $            ' --------',
186     $        /1x,4(1x,i15),f9.6)
187         maxint = nbfn                                  !cste
188         maxint = maxint**4 * iter                      !cste
189         if(nints .ne. maxint) then                     !cste
190            write(6,*)'Inconsistent number of integrals, should be ',   !cste
191     $                 maxint                                           !cste
192            write(6,*)'Note: largest 32-bit integer is 2,147,483,647'   !cste
193       write(6,*)'Probably due to insufficient integer precision in GA.'!cste
194         endif                                                          !cste
195#ifdef MSG_COMMS_MPI
196         call ga_print_stats()
197#else
198         call stats
199#endif
200      endif
201c
202      call closearrays
203      call ga_terminate
204#ifdef MSG_COMMS_MPI
205      call mpi_finalize
206#else
207      call pend
208#endif
209c
210      end
211c
212      subroutine makesz(schwmax)
213      implicit double precision (a-h, o-z)
214#include "cscf.h"
215#include "mafdecls.fh"
216#include "global.fh"
217#include "mp3def.fh"
218      dimension work(ichunk,ichunk)
219      integer lo(2),hi(2),i,j,iloc,jloc,ld
220      logical dotask, next_chunk
221c
222c     schwarz(ij) = (ij|ij) for sparsity test
223c
224      icut1 = 0
225      icut2 = 0
226      icut3 = 0
227c
228      call ga_zero(g_schwarz)
229      call ga_zero(g_counter)
230      schwmax = 0.0d0
231      dotask = next_chunk(lo,hi)
232      ld = ichunk
233      do while (dotask)
234        do i = lo(1), hi(1)
235          iloc = i - lo(1) + 1
236          do j = lo(2), hi(2)
237            jloc = j - lo(2) + 1
238            call g(gg,i,j,i,j)
239            work(iloc,jloc) = sqrt(gg)
240            schwmax = max(schwmax, work(iloc,jloc))
241          end do
242        end do
243        call nga_put(g_schwarz,lo,hi,work,ld)
244        dotask = next_chunk(lo,hi)
245      end do
246      call ga_dgop(11,schwmax,1,'max')
247c
248      return
249      end
250c
251      subroutine ininrm
252      implicit double precision (a-h, o-z)
253#include "cscf.h"
254#include "mafdecls.fh"
255#include "global.fh"
256#include "mp3def.fh"
257      integer*8 maxint
258c
259c     write a little welcome message
260c
261      maxint = nbfn
262      maxint = maxint**4
263      if (ga_nodeid().eq.0) then
264      write(6,1) natom, nocc, nbfn, maxint, tol, ichunk
2651     format(/' Example Direct Self Consistent Field Program '/
266     $        ' -------------------------------------------- '//
267     $        ' no. of atoms .............. ',i5/
268     $        ' no. of occupied orbitals .. ',i5/
269     $        ' no. of basis functions .... ',i5/
270     $        ' basis functions^4 '          ,i15/
271     $        ' convergence threshold ..... ',1pd9.2/
272     $        ' chunk size .................',i5)
273      write(6,*)                                                !cste
274      call flush(6)                                             !cste
275      endif                                                     !cste
276c
277c     generate normalisation coefficients for the basis functions
278c     and the index array iky
279c
280      do 10 i = 1, nbfn
281         iky(i) = i*(i-1)/2
282 10   continue
283c
284      do 20 i = 1, nbfn
285         rnorm(i) = (expnt(i)*2.0d0/pi)**0.75d0
286 20   continue
287c
288c     initialize common for computing f0
289c
290      call setfm
291c
292      end
293      double precision function h(i,j)
294      implicit double precision (a-h, o-z)
295#include "cscf.h"
296#include "mafdecls.fh"
297#include "global.fh"
298#include "mp3def.fh"
299cvd$r novector
300cvd$r noconcur
301c
302c     generate the one particle hamiltonian matrix element
303c     over the normalized primitive 1s functions i and j
304c
305      f0val = 0.0d0
306      sum = 0.0d0
307      rab2 = (x(i)-x(j))**2 + (y(i)-y(j))**2 + (z(i)-z(j))**2
308      facij = expnt(i)*expnt(j)/(expnt(i)+expnt(j))
309      expij = exprjh(-facij*rab2)
310      repij = (2.0d0*pi/(expnt(i)+expnt(j))) * expij
311c
312c     first do the nuclear attraction integrals
313c
314      do 10 iat = 1, natom
315         xp = (x(i)*expnt(i) + x(j)*expnt(j))/(expnt(i)+expnt(j))
316         yp = (y(i)*expnt(i) + y(j)*expnt(j))/(expnt(i)+expnt(j))
317         zp = (z(i)*expnt(i) + z(j)*expnt(j))/(expnt(i)+expnt(j))
318         rpc2 = (xp-ax(iat))**2 + (yp-ay(iat))**2 + (zp-az(iat))**2
319c
320         call f0(f0val, (expnt(i)+expnt(j))*rpc2)
321         sum = sum - repij * q(iat) * f0val
322 10   continue
323c
324c     add on the kinetic energy term
325c
326      sum = sum + facij*(3.0d0-2.0d0*facij*rab2) *
327     $     (pi/(expnt(i)+expnt(j)))**1.5d0 * expij
328c
329c     finally multiply by the normalization constants
330c
331      h = sum * rnorm(i) * rnorm(j)
332c
333      end
334      double precision function s(i,j)
335      implicit double precision (a-h, o-z)
336#include "cscf.h"
337#include "mafdecls.fh"
338#include "global.fh"
339#include "mp3def.fh"
340c
341c     generate the overlap matrix element between the normalized
342c     primitve gaussian 1s functions i and j
343c
344      rab2 = (x(i)-x(j))**2 + (y(i)-y(j))**2 + (z(i)-z(j))**2
345      facij = expnt(i)*expnt(j)/(expnt(i)+expnt(j))
346      s = (pi/(expnt(i)+expnt(j)))**1.5d0 * exprjh(-facij*rab2) *
347     $     rnorm(i)*rnorm(j)
348c
349      end
350      subroutine makden
351      implicit double precision (a-h, o-z)
352#include "cscf.h"
353#include "mafdecls.fh"
354#include "global.fh"
355#include "mp3def.fh"
356c      dimension work(maxnbfn,maxnbfn), torbs(maxnbfn,maxnbfn)
357      dimension work(ichunk,ichunk), orbsi(ichunk,maxnbfn)
358      dimension orbsj(ichunk,maxnbfn)
359      integer lo(2), hi(2), tlo(2), thi(2), i, j, iloc, jloc, ld
360      logical dotask, next_chunk
361c
362c     generate density matrix from orbitals in g_orbs. the first
363c     nocc orbitals are doubly occupied.
364c
365      call ga_zero(g_counter)
366      dotask = next_chunk(lo,hi)
367      ld = ichunk
368      do while (dotask)
369        tlo(1) = lo(1)
370        thi(1) = hi(1)
371        tlo(2) = 1
372        thi(2) = nocc
373        call nga_get(g_orbs,tlo,thi,orbsi,ld)
374        tlo(1) = lo(2)
375        thi(1) = hi(2)
376        call nga_get(g_orbs,tlo,thi,orbsj,ld)
377        do i = lo(1), hi(1)
378          iloc = i - lo(1) + 1
379          do j = lo(2), hi(2)
380            jloc = j - lo(2) + 1
381            p = 0.0d00
382            do k = 1, nocc
383              p = p + orbsi(iloc,k)*orbsj(jloc,k)
384            end do
385            work(iloc,jloc) = 2.0d00*p
386          end do
387        end do
388        call nga_put(g_work,lo,hi,work,ld)
389        dotask = next_chunk(lo,hi)
390      end do
391      return
392      end
393c
394      subroutine oneel(schwmax, eone)
395      implicit double precision (a-h, o-z)
396#include "cscf.h"
397#include "mafdecls.fh"
398#include "global.fh"
399#include "mp3def.fh"
400      integer lo(2), hi(2), i, j, iloc, jloc, ld
401      dimension work(ichunk,ichunk),tfock(ichunk,ichunk)
402      logical dotask, next_chunk
403c
404c     fill in the one-electron part of the fock matrix and
405c     compute the one-electron energy contribution
406c
407
408      me = ga_nodeid()
409      nproc = ga_nnodes()
410c
411      call ga_zero(g_counter)
412      dotask = next_chunk(lo,hi)
413      ld = ichunk
414      do while (dotask)
415        call nga_get(g_schwarz,lo,hi,work,ld)
416        do j = lo(2), hi(2)
417          jloc = j - lo(2) + 1
418          do i = lo(1), hi(1)
419            iloc = i - lo(1) + 1
420            tfock(iloc,jloc) = 0.0d00
421            if (work(iloc,jloc)*schwmax.gt.tol2e)
422     +        tfock(iloc,jloc) = h(i,j)
423          end do
424        end do
425        call nga_put(g_fock,lo,hi,tfock,ld)
426        dotask = next_chunk(lo,hi)
427      end do
428      eone = 0.5d00*contract_matrices(g_fock,g_dens)
429c
430      end
431#if 0
432      integer function nxtask(nproc)
433      parameter (ichunk = 10)
434      save icount, nleft
435      data nleft, icount /0, 0/
436c
437c     wrapper round nxtval() to increase granularity
438c     and thus reduce no. of requests to shared counter
439c
440      if(nproc.gt.0) then
441         if(nleft.eq.0) then
442#ifdef MSG_COMMS_MPI
443            icount = nxtval_ga(nproc) * ichunk
444#else
445            icount = nxtval(nproc) * ichunk
446#endif
447            nleft = ichunk
448         endif
449         nxtask = icount
450         icount = icount + 1
451         nleft = nleft -1
452      else
453          nleft = 0
454          nxtask = 0
455#ifdef MSG_COMMS_MPI
456          junk = nxtval_ga(nproc)
457#else
458          junk = nxtval(nproc)
459#endif
460      endif
461c
462c     following does dumb static load balancing
463c
464c$$$      if(nproc.gt.0) then
465c$$$         if (nleft .eq. 0) then
466c$$$            icount = ga_nodeid()
467c$$$            nleft = 1
468c$$$         endif
469c$$$         nxtask = icount
470c$$$         icount = icount + ga_nnodes()
471c$$$      else
472c$$$          nleft = 0
473c$$$          nxtask = 0
474c$$$      endif
475      end
476#endif
477c
478      logical function next_chunk(lo,hi)
479#include "cscf.h"
480      integer one
481      parameter (one = 1)
482      integer imax, lo(2), hi(2), ilo, jlo
483      itask = nga_read_inc(g_counter,one,one)
484      imax = nbfn/ichunk
485      if (nbfn - ichunk*imax.gt.0) imax = imax + 1
486      if (itask.lt.imax*imax) then
487        ilo = mod(itask,imax)
488        jlo = (itask-ilo)/imax
489        lo(1) = ilo*ichunk + 1
490        lo(2) = jlo*ichunk + 1
491        hi(1) = min((ilo+1)*ichunk,nbfn)
492        hi(2) = min((jlo+1)*ichunk,nbfn)
493        next_chunk = .true.
494      else
495        next_chunk = .false.
496      endif
497      return
498      end
499c
500      logical function next_4chunk(lo,hi,ilo,jlo,klo,llo)
501#include "cscf.h"
502      integer one
503      parameter (one = 1)
504      integer*8 imax, itask, itmp                               !cste
505      integer lo(4), hi(4), ilo, jlo, klo, llo                  !cste
506c
507      itask = nga_read_inc(g_counter,one,one)
508      imax = nbfn/ichunk
509      if (nbfn - ichunk*imax.gt.0) imax = imax + 1
510      if (itask. lt. 0) then                                    !cste
511         write(6,*) 'next_4chunk: itask negative:',itask,       !cste
512     *             ' imax:',imax,' nbfn:',nbfn,' ichunk:',ichunk!cste
513         write(6,*) 'probable GA integer precision problem if ' !cste
514     *             ,'imax^4 > 2^31'                             !cste
515         call flush(6)                                          !cste
516         stop 'next_4chunk'                                     !cste
517      end if                                                    !cste
518      if (itask.lt.imax**4) then
519        ilo = mod(itask,imax)
520        itmp = (itask - ilo)/imax
521        jlo = mod(itmp,imax)
522        itmp = (itmp - jlo)/imax
523        klo = mod(itmp,imax)
524        llo = (itmp - klo)/imax
525        lo(1) = ilo*ichunk + 1
526        lo(2) = jlo*ichunk + 1
527        lo(3) = klo*ichunk + 1
528        lo(4) = llo*ichunk + 1
529        hi(1) = min((ilo+1)*ichunk,nbfn)
530        hi(2) = min((jlo+1)*ichunk,nbfn)
531        hi(3) = min((klo+1)*ichunk,nbfn)
532        hi(4) = min((llo+1)*ichunk,nbfn)
533        next_4chunk = .true.
534      else
535        next_4chunk = .false.
536      endif
537      return
538      end
539c
540      subroutine clean_chunk(chunk)
541#include "cscf.h"
542      double precision chunk(ichunk,ichunk)
543      integer i,j
544      do j = 1, ichunk
545        do i = 1, ichunk
546          chunk(i,j) = 0.0d00
547        end do
548      end do
549      return
550      end
551c
552      subroutine twoel(schwmax, etwo)
553      implicit double precision (a-h, o-z)
554#include "cscf.h"
555#include "mafdecls.fh"
556#include "global.fh"
557#include "mp3def.fh"
558      double precision f_ij(ichunk,ichunk),d_kl(ichunk,ichunk)
559      double precision f_ik(ichunk,ichunk),d_jl(ichunk,ichunk)
560      double precision s_ij(ichunk,ichunk),s_kl(ichunk,ichunk)
561      double precision schwmax, one
562cste  integer nproc                                             !cste
563      integer*8 ijkls, ijcnt,klcnt,ijklcnt                      !cste
564      integer lo(4),hi(4),lo_ik(2),hi_ik(2),lo_jl(2),hi_jl(2)   !cste
565      integer i,j,k,l,iloc,jloc,kloc,lloc,ld,ich,it,jt,kt,lt    !cste
566      logical dotask, next_4chunk
567c
568c     add in the two-electron contribution to the fock matrix
569c
570cste  nproc = ga_nnodes()
571      one = 1.0d00
572      ijcnt = icut1                                     !cste
573      klcnt = icut2                                     !cste
574      ijklcnt = icut3                                   !cste
575c
576      call ga_zero(g_counter)
577      ld = maxnbfn
578      ich = ichunk
579      dotask = next_4chunk(lo,hi,it,jt,kt,lt)
580      itask = 0
581cste  ijkls = 0                                         !cste
582      do while (dotask)
583cste  ijkl=(hi(1)-lo(1)+1)*(hi(2)-lo(2)+1)*             !cste
584cste *     (hi(3)-lo(3)+1)*(hi(4)-lo(4)+1)              !cste
585cste  ijkls = ijkls + ijkl                              !cste
586cste  write(6,*)itask,lo,hi,ijkl,ijkls                  !cste
587        lo_ik(1) = lo(1)
588        lo_ik(2) = lo(3)
589        hi_ik(1) = hi(1)
590        hi_ik(2) = hi(3)
591        lo_jl(1) = lo(2)
592        lo_jl(2) = lo(4)
593        hi_jl(1) = hi(2)
594        hi_jl(2) = hi(4)
595        call nga_get(g_schwarz,lo,hi,s_ij,ich)
596        call nga_get(g_schwarz,lo(3),hi(3),s_kl,ich)
597        call nga_get(g_dens,lo(3),hi(3),d_kl,ich)
598        call nga_get(g_dens,lo_jl,hi_jl,d_jl,ich)
599        itask = itask + 1
600        call clean_chunk(f_ij)
601        call clean_chunk(f_ik)
602        do i = lo(1), hi(1)
603          iloc = i-lo(1) + 1
604          do j = lo(2), hi(2)
605            jloc = j-lo(2) + 1
606            if (s_ij(iloc,jloc)*schwmax .lt. tol2e) then
607              icut1 = icut1 + (hi(3)-lo(3)+1)*(hi(4)-lo(4)+1)   !cste
608            else
609              do k = lo(3), hi(3)
610                kloc = k-lo(3) + 1
611                do l = lo(4), hi(4)
612                  lloc = l-lo(4) + 1
613                  if (s_ij(iloc,jloc)*s_kl(kloc,lloc).lt.tol2e) then
614                    icut2 = icut2 + 1
615                  else
616                    call g(gg, i, j, k, l)
617                    f_ij(iloc,jloc) = f_ij(iloc,jloc)
618     +                              + gg*d_kl(kloc,lloc)
619                    f_ik(iloc,kloc) = f_ik(iloc,kloc)
620     +                              - 0.5d00*gg*d_jl(jloc,lloc)
621                    icut3 = icut3 + 1
622                  endif
623                end do
624              end do
625            endif
626          end do
627        end do
628        call nga_acc(g_fock,lo,hi,f_ij,ich,one)
629        call nga_acc(g_fock,lo_ik,hi_ik,f_ik,ich,one)
630        dotask = next_4chunk(lo,hi,it,jt,kt,lt)
631      end do
632      etwo = 0.5d00*contract_matrices(g_fock,g_dens)
633      ijcnt = icut1 - ijcnt
634      klcnt = icut2 - klcnt
635      ijklcnt = icut3 - ijklcnt
636cste  write(6,*) 'node ', ga_nodeid(), ijcnt, klcnt, ijklcnt    !cste
637cste *           ,icut1,icut2,icut3                             !cste
638cste  call flush(6)                                             !cste
639      icut4 = icut3                                             !cste
640      if (icut3 .gt. 0) return                                  !cste
641c
642c    no integrals may be calculated if there is no work for
643c    this node (ichunk too big), or, something is wrong
644c
645      write(6,*) 'no two-electron integrals computed by node',  !cste
646     *            ga_nodeid()                                   !cste
647      call flush(6)                                             !cste
648      return
649cste  stop 'twoel computed no integrals'                        !cste
650      end
651c
652      subroutine damp(fac)
653      implicit double precision (a-h, o-z)
654#include "cscf.h"
655#include "mafdecls.fh"
656#include "global.fh"
657#include "mp3def.fh"
658c
659c    create damped density matrix as a linear combination of
660c    old density matrix and density matrix formed from new orbitals
661c
662      ofac = 1.0d0 - fac
663      call ga_add(fac,g_dens,ofac,g_work,g_dens)
664      return
665      end
666c
667      subroutine prnout(iter, energy, deltad, tester)
668      implicit double precision (a-h, o-z)
669#include "cscf.h"
670#include "mafdecls.fh"
671#include "global.fh"
672#include "mp3def.fh"
673c
674c     printout results of each iteration
675c
676      if (ga_nodeid().ne.0) return
677      write(6,1) iter, energy, deltad, tester
678      call flush(6)
6791     format(' iter=',i3,', energy=',f15.8,', deltad=',1pd9.2,
680     $     ', deltaf=',d9.2)
681      return
682      end
683c
684      double precision function dendif()
685      implicit double precision (a-h, o-z)
686#include "cscf.h"
687#include "mafdecls.fh"
688#include "global.fh"
689#include "mp3def.fh"
690      double precision xdiff
691      dimension dens_c(ichunk,ichunk),work_c(ichunk,ichunk)
692      integer lo(2), hi(2), i, j, ld
693      logical dotask, next_chunk
694c
695c     compute largest change in density matrix elements
696c
697      denmax = 0.0d0
698      call ga_zero(g_counter)
699      dotask = next_chunk(lo,hi)
700      ld = ichunk
701      do while(dotask)
702        call nga_get(g_dens,lo,hi,dens_c,ld)
703        call nga_get(g_work,lo,hi,work_c,ld)
704        do j = 1, hi(2)-lo(2)+1
705          do i = 1, hi(1)-lo(1)+1
706            xdiff = abs(dens_c(i,j)-work_c(i,j))
707            if (xdiff.gt.denmax) denmax = xdiff
708          end do
709        end do
710        dotask = next_chunk(lo,hi)
711      end do
712      call ga_dgop(1,denmax,1,'max')
713      dendif = denmax
714      return
715      end
716c
717      double precision function testfock()
718      implicit double precision (a-h, o-z)
719#include "cscf.h"
720#include "mafdecls.fh"
721#include "global.fh"
722#include "mp3def.fh"
723      double precision xmax, xtmp
724      dimension work(ichunk,ichunk)
725      integer lo(2), hi(2), i, j, iloc, jloc, ld
726      logical dotask, next_chunk
727c
728c     compute largest change in density matrix elements
729c
730      xmax = 0.0d0
731      call ga_zero(g_counter)
732      dotask = next_chunk(lo,hi)
733      ld = ichunk
734      do while(dotask)
735        call nga_get(g_fock,lo,hi,work,ld)
736        do j = lo(2), hi(2)
737          jloc = j - lo(2) + 1
738          do i = lo(1), hi(1)
739            iloc = i - lo(1) + 1
740            if (i.ne.j) then
741              xtmp = abs(work(iloc,jloc))
742              if (xtmp.gt.xmax) xmax = xtmp
743            endif
744          end do
745        end do
746        dotask = next_chunk(lo,hi)
747      end do
748      call ga_dgop(1,xmax,1,'max')
749      testfock = xmax
750      return
751      end
752c
753      subroutine shiftfock(shift)
754      implicit double precision (a-h, o-z)
755#include "cscf.h"
756#include "mafdecls.fh"
757#include "global.fh"
758#include "mp3def.fh"
759      double precision shift
760      dimension work(ichunk,ichunk)
761      integer lo(2), hi(2), i, j, iloc, jloc, ld, icnt
762      logical dotask, next_chunk
763c
764c     compute largest change in density matrix elements
765c
766      call ga_zero(g_counter)
767      dotask = next_chunk(lo,hi)
768      ld = ichunk
769      do while(dotask)
770        call nga_get(g_fock,lo,hi,work,ld)
771        icnt = 0
772        do j = lo(2), hi(2)
773          jloc = j - lo(2) + 1
774          do i = lo(1), hi(1)
775            iloc = i - lo(1) + 1
776            if (i.eq.j.and.i.gt.nocc) then
777              work(iloc,jloc) = work(iloc,jloc) + shift
778              icnt = icnt + 1
779            endif
780          end do
781        end do
782        if (icnt.gt.0) call nga_put(g_fock,lo,hi,work,ld)
783        dotask = next_chunk(lo,hi)
784      end do
785      return
786      end
787c
788      subroutine prnfin(energy)
789      implicit double precision (a-h, o-z)
790#include "cscf.h"
791#include "mafdecls.fh"
792#include "global.fh"
793#include "mp3def.fh"
794      dimension orbs(maxnbfn, maxnbfn)
795      integer lo(2),hi(2),ld
796c
797c     printout final results
798c
799      if (ga_nodeid().ne.0) return
800      write(6,1) energy
801 1    format(//' final energy = ',f18.11//' eigenvalues')
802      call output(eigv, 1, min(nbfn,nocc+5), 1, 1, nbfn, 1, 1)
803c
804      return
805      end
806      subroutine g(value,i,j,k,l)
807      implicit double precision (a-h, o-z)
808#include "cscf.h"
809#include "mafdecls.fh"
810#include "global.fh"
811#include "mp3def.fh"
812c
813c     compute the two electon integral (ij|kl) over normalized
814c     primitive 1s gaussians
815c
816      f0val = 0.0d0
817      rab2 = (x(i)-x(j))**2 + (y(i)-y(j))**2 + (z(i)-z(j))**2
818      rcd2 = (x(k)-x(l))**2 + (y(k)-y(l))**2 + (z(k)-z(l))**2
819      facij = expnt(i)*expnt(j)/(expnt(i)+expnt(j))
820      fackl = expnt(k)*expnt(l)/(expnt(k)+expnt(l))
821      exijkl = exprjh(- facij*rab2 - fackl*rcd2)
822      denom = (expnt(i)+expnt(j))*(expnt(k)+expnt(l)) *
823     $        sqrt(expnt(i)+expnt(j)+expnt(k)+expnt(l))
824      fac = (expnt(i)+expnt(j))*(expnt(k)+expnt(l)) /
825     $        (expnt(i)+expnt(j)+expnt(k)+expnt(l))
826c
827      xp = (x(i)*expnt(i) + x(j)*expnt(j))/(expnt(i)+expnt(j))
828      yp = (y(i)*expnt(i) + y(j)*expnt(j))/(expnt(i)+expnt(j))
829      zp = (z(i)*expnt(i) + z(j)*expnt(j))/(expnt(i)+expnt(j))
830      xq = (x(k)*expnt(k) + x(l)*expnt(l))/(expnt(k)+expnt(l))
831      yq = (y(k)*expnt(k) + y(l)*expnt(l))/(expnt(k)+expnt(l))
832      zq = (z(k)*expnt(k) + z(l)*expnt(l))/(expnt(k)+expnt(l))
833      rpq2 = (xp-xq)**2 + (yp-yq)**2 + (zp-zq)**2
834c
835      call f0(f0val, fac*rpq2)
836      value = (2.0d0 * pi**2.5d0 / denom) * exijkl * f0val *
837     $    rnorm(i)*rnorm(j)*rnorm(k)*rnorm(l)
838      return
839      end
840c
841      subroutine diagon(tester, iter)
842c      subroutine diagon(fock, orbs, evals, work, tester, iter)
843      implicit double precision (a-h, o-z)
844#include "cscf.h"
845#include "mafdecls.fh"
846#include "global.fh"
847#include "mp3def.fh"
848      double precision r_zero, r_one, shift, tester
849c
850#if USE_TRANSFORM
851c
852c     use similarity transform to solve standard eigenvalue problem
853c     (overlap matrix has been transformed out of the problem)
854c
855      r_one = 1.0d00
856      r_zero = 0.0d00
857      call ga_dgemm('n','n',nbfn,nbfn,nbfn,r_one,g_fock,g_orbs,
858     +               r_zero,g_tfock)
859      call ga_dgemm('t','n',nbfn,nbfn,nbfn,r_one,g_orbs,g_tfock,
860     +               r_zero,g_fock)
861      tester = testfock()
862      shift = 0.0d00
863      if (tester.gt.0.3d0) then
864        shift = 0.3d0
865      else
866        if (nbfn .gt. 60) then
867          shift = 0.1d0
868        else
869          shift = 0.0d0
870        endif
871      endif
872      if (iter.ge.2.and.shift.ne.0.0d00) then
873        call shiftfock(shift)
874      endif
875      call ga_copy(g_orbs,g_tfock)
876      call ga_diag_std_seq(g_fock, g_work, eigv)
877c
878c     Back transform eigenvectors
879c
880      call ga_dgemm('n','n',nbfn,nbfn,nbfn,r_one,g_tfock,g_work,
881     +               r_zero,g_orbs)
882      if (iter.ge.2.and.shift.ne.0.0d00) then
883         do 50 i = nocc+1, nbfn
884            eigv(i) = eigv(i) - shift
885 50      continue
886      endif
887#else
888c
889c     Keep remaking overlap matrix since ga_diag_seq does not
890c     guarantee that g_ident is preserved.
891c
892         call makoverlap
893         call ga_diag_seq(g_fock, g_ident, g_orbs, eigv)
894         tester = 0.0d00
895#endif
896      return
897      end
898c
899      subroutine makeob
900      implicit double precision (a-h, o-z)
901#include "cscf.h"
902#include "mafdecls.fh"
903#include "global.fh"
904#include "mp3def.fh"
905      double precision work(ichunk,ichunk),orbs(ichunk,ichunk)
906      double precision eval(maxnbfn)
907      integer lo(2),hi(2),ld,me,i,j,iloc,jloc
908      logical dotask, next_chunk
909c
910c     generate set of orthonormal vectors by creating a random
911c     symmetric matrix and solving associated generalized eigenvalue
912c     problem using the correct overlap matrix.
913c
914      me = ga_nodeid()
915      call ga_zero(g_counter)
916      dotask = next_chunk(lo,hi)
917      ld = ichunk
918      do while (dotask)
919        do j = lo(2), hi(2)
920          jloc = j - lo(2) + 1
921          do i = lo(1), hi(1)
922            iloc = i - lo(1) + 1
923            work(iloc,jloc) = s(i,j)
924            orbs(iloc,jloc) = drand(0)
925          end do
926        end do
927        call nga_put(g_ident,lo,hi,work,ld)
928        call nga_put(g_fock,lo,hi,orbs,ld)
929        dotask = next_chunk(lo,hi)
930      end do
931      call ga_symmetrize(g_fock)
932      call ga_diag_seq(g_fock, g_ident, g_orbs, eval)
933c
934      return
935      end
936c
937      subroutine denges
938      implicit double precision (a-h, o-z)
939#include "cscf.h"
940#include "mafdecls.fh"
941#include "global.fh"
942#include "mp3def.fh"
943c
944c     Form guess density from superposition of atomic densities in the AO
945c     basis set ... instead of doing the atomic SCF hardwire for this
946c     small basis set for the Be atom.
947c
948      integer one, itask, lo(2), hi(2), ld
949      dimension atdens(15,15)
950      data atdens/
951     $     0.000002,0.000027,0.000129,0.000428,0.000950,0.001180,
952     $     0.000457,-0.000270,-0.000271,0.000004,0.000004,0.000004,
953     $     0.000004,0.000004,0.000004,0.000027,0.000102,0.000987,
954     $     0.003269,0.007254,0.009007,0.003492,-0.002099,-0.002108,
955     $     0.000035,0.000035,0.000035,0.000035,0.000035,0.000035,
956     $     0.000129,0.000987,0.002381,0.015766,0.034988,0.043433,
957     $     0.016835,-0.010038,-0.010082,0.000166,0.000166,0.000166,
958     $     0.000166,0.000166,0.000166,0.000428,0.003269,0.015766,
959     $     0.026100,0.115858,0.144064,0.055967,-0.035878,-0.035990,
960     $     0.000584,0.000584,0.000584,0.000584,0.000584,0.000584,
961     $     0.000950,0.007254,0.034988,0.115858,0.128586,0.320120,
962     $     0.124539,-0.083334,-0.083536,0.001346,0.001346,0.001346,
963     $     0.001346,0.001346,0.001346,0.001180,0.009007,0.043433,
964     $     0.144064,0.320120,0.201952,0.159935,-0.162762,-0.162267,
965     $     0.002471,0.002471,0.002471,0.002471,0.002471,0.002471,
966     $     0.000457,0.003492,0.016835,0.055967,0.124539,0.159935,
967     $     0.032378,-0.093780,-0.093202,0.001372,0.001372,0.001372,
968     $     0.001372,0.001372,0.001372,-0.000270,-0.002099,-0.010038,
969     $     -0.035878,-0.083334,-0.162762,-0.093780,0.334488,0.660918,
970     $     -0.009090,-0.009090,-0.009090,-0.009090,-0.009090,-0.009090,
971     $     -0.000271,-0.002108,-0.010082,-0.035990,-0.083536,-0.162267,
972     $     -0.093202,0.660918,0.326482,-0.008982,-0.008982,-0.008981,
973     $     -0.008981,-0.008981,-0.008982,0.000004,0.000035,0.000166,
974     $     0.000584,0.001346,0.002471,0.001372,-0.009090,-0.008982,
975     $     0.000062,0.000124,0.000124,0.000124,0.000124,0.000124,
976     $     0.000004,0.000035,0.000166,0.000584,0.001346,0.002471,
977     $     0.001372,-0.009090,-0.008982,0.000124,0.000062,0.000124,
978     $     0.000124,0.000124,0.000124,0.000004,0.000035,0.000166,
979     $     0.000584,0.001346,0.002471,0.001372,-0.009090,-0.008981,
980     $     0.000124,0.000124,0.000062,0.000124,0.000124,0.000124,
981     $     0.000004,0.000035,0.000166,0.000584,0.001346,0.002471,
982     $     0.001372,-0.009090,-0.008981,0.000124,0.000124,0.000124,
983     $     0.000062,0.000124,0.000124,0.000004,0.000035,0.000166,
984     $     0.000584,0.001346,0.002471,0.001372,-0.009090,-0.008981,
985     $     0.000124,0.000124,0.000124,0.000124,0.000062,0.000124,
986     $     0.000004,0.000035,0.000166,0.000584,0.001346,0.002471,
987     $     0.001372,-0.009090,-0.008982,0.000124,0.000124,0.000124,
988     $     0.000124,0.000124,0.000062/
989c
990c   Create initial guess for density matrix in global array
991c
992      call ga_zero(g_dens)
993      call ga_zero(g_counter)
994      one = 1
995      ld = 15
996c
997c   Correct for a factor of two along the diagonal
998c
999      do i = 1, ld
1000        atdens(i,i) = 2.0d00*atdens(i,i)
1001      end do
1002      itask = nga_read_inc(g_counter,one,one)
1003      do while(itask.lt.natom)
1004        ioff = itask*15
1005        lo(1) = ioff+1
1006        lo(2) = ioff+1
1007        hi(1) = ioff+15
1008        hi(2) = ioff+15
1009        call nga_put(g_dens,lo,hi,atdens,ld)
1010        itask = nga_read_inc(g_counter,one,one)
1011      end do
1012      call ga_sync
1013      return
1014      end
1015c
1016      subroutine setarrays
1017      implicit double precision (a-h, o-z)
1018#include "cscf.h"
1019#include "mafdecls.fh"
1020#include "global.fh"
1021#include "mp3def.fh"
1022      integer one, two, dims(2)
1023      logical status
1024      one = 1
1025      two = 2
1026      g_counter = ga_create_handle()
1027      call ga_set_data(g_counter,one,one,MT_INT)
1028      status = ga_allocate(g_counter)
1029      call ga_zero(g_counter)
1030
1031      dims(1) = nbfn
1032      dims(2) = nbfn
1033      g_dens = ga_create_handle()
1034      call ga_set_data(g_dens, two, dims, MT_DBL)
1035      status = ga_allocate(g_dens)
1036      call ga_zero(g_dens)
1037
1038      g_schwarz = ga_create_handle()
1039      call ga_set_data(g_schwarz, two, dims, MT_DBL)
1040      status = ga_allocate(g_schwarz)
1041      call ga_zero(g_schwarz)
1042
1043      g_fock = ga_create_handle()
1044      call ga_set_data(g_fock, two, dims, MT_DBL)
1045      status = ga_allocate(g_fock)
1046      call ga_zero(g_fock)
1047
1048
1049      g_tfock = ga_create_handle()
1050      call ga_set_data(g_tfock, two, dims, MT_DBL)
1051      status = ga_allocate(g_tfock)
1052      call ga_zero(g_tfock)
1053
1054      g_work = ga_create_handle()
1055      call ga_set_data(g_work, two, dims, MT_DBL)
1056      status = ga_allocate(g_work)
1057      call ga_zero(g_work)
1058
1059      g_ident = ga_create_handle()
1060      call ga_set_data(g_ident, two, dims, MT_DBL)
1061      status = ga_allocate(g_ident)
1062      call ga_zero(g_ident)
1063
1064      g_orbs = ga_create_handle()
1065      call ga_set_data(g_orbs, two, dims, MT_DBL)
1066      status = ga_allocate(g_orbs)
1067      call ga_zero(g_orbs)
1068
1069      return
1070      end
1071      subroutine closearrays
1072      implicit double precision (a-h, o-z)
1073#include "cscf.h"
1074#include "mafdecls.fh"
1075#include "global.fh"
1076#include "mp3def.fh"
1077      logical status
1078c
1079      status = ga_destroy(g_counter)
1080      status = ga_destroy(g_dens)
1081      status = ga_destroy(g_schwarz)
1082      status = ga_destroy(g_fock)
1083      status = ga_destroy(g_tfock)
1084      status = ga_destroy(g_work)
1085      status = ga_destroy(g_ident)
1086      status = ga_destroy(g_orbs)
1087c
1088      return
1089      end
1090c
1091      subroutine makoverlap
1092      implicit double precision (a-h, o-z)
1093#include "cscf.h"
1094#include "mafdecls.fh"
1095#include "global.fh"
1096#include "mp3def.fh"
1097      integer me, lo(2), hi(2), ptr, ld(2)
1098      integer ld1, ld2
1099      me = ga_nodeid()
1100      call nga_distribution(g_ident, me, lo, hi)
1101      call nga_access(g_ident, lo, hi, ptr, ld)
1102      ld1 = hi(1) - lo(1) + 1
1103      ld2 = hi(2) - lo(2) + 1
1104      call setoverlap(dbl_mb(ptr),lo,hi,ld1,ld2)
1105      call nga_release(g_ident)
1106      return
1107      end
1108c
1109      subroutine setoverlap(a,lo,hi,ld1,ld2)
1110#include "cscf.h"
1111#include "mafdecls.fh"
1112#include "global.fh"
1113#include "mp3def.fh"
1114      integer lo(2), hi(2)
1115      integer ld1, ld2, ii, jj
1116      double precision a(ld1,ld2)
1117      do i = 1, ld1
1118        ii = i + lo(1) - 1
1119        do j = 1, ld2
1120          jj = j + lo(2) - 1
1121#if USE_TRANSFORM
1122          if (ii.eq.jj) then
1123            a(i,j) = 1.0d00
1124          else
1125            a(i,j) = 0.0d00
1126          endif
1127#else
1128          a(i,j) = s(ii,jj)
1129#endif
1130        end do
1131      end do
1132      return
1133      end
1134c
1135      subroutine print_ga_block(g_a)
1136      implicit double precision(a-h,o-z)
1137#include "cscf.h"
1138#include "mafdecls.fh"
1139#include "global.fh"
1140#include "mp3def.fh"
1141      integer lo(2), hi(2), ptr, ld1, ld2
1142c
1143      me = ga_nodeid()
1144      call nga_distribution(g_a, me, lo, hi)
1145      ld1 = hi(1) - lo(1) + 1
1146      ld2 = hi(2) - lo(2) + 1
1147      call nga_access(g_a, lo, hi, ptr, ld)
1148      call dump_chunk(dbl_mb(ptr),ld1,ld2)
1149      call nga_release(g_a)
1150c
1151      return
1152      end
1153c
1154      subroutine print_ga_block_ij(g_a,tlo)
1155      implicit double precision(a-h,o-z)
1156#include "cscf.h"
1157#include "mafdecls.fh"
1158#include "global.fh"
1159#include "mp3def.fh"
1160      integer lo(2), hi(2), ptr, ld1, ld2
1161c
1162      me = ga_nodeid()
1163      call nga_distribution(g_a, me, lo, hi)
1164      ld1 = hi(1) - lo(1) + 1
1165      ld2 = hi(2) - lo(2) + 1
1166      call nga_access(g_a, tlo, hi, ptr, ld)
1167      call dump_chunk(dbl_mb(ptr),ld1,ld2)
1168      call nga_release(g_a)
1169c
1170      return
1171      end
1172c
1173      subroutine dump_chunk(a,ld1,ld2)
1174      implicit double precision (a-h, o-z)
1175#include "cscf.h"
1176#include "mafdecls.fh"
1177#include "global.fh"
1178#include "mp3def.fh"
1179      integer ld1, ld2
1180      double precision a(ld1, ld2)
1181      do i = 1, min(10,ld1)
1182        write(6,100) (a(i,j), j = 1, min(10,ld2))
1183      end do
1184      write(6,*)
1185      trace = 0.0d0
1186      do i=1,ld2
1187         trace = trace +a(i,i)
1188      end do
1189      write(6,*) 'trace=',trace
1190  100 format(10f10.4)
1191      return
1192      end
1193c
1194      double precision function contract_matrices(g_a,g_b)
1195      implicit double precision(a-h,o-z)
1196#include "cscf.h"
1197#include "mafdecls.fh"
1198#include "global.fh"
1199#include "mp3def.fh"
1200      integer lo(2), hi(2), ptr_a, ptr_b, ld, ld1, ld2
1201      double precision a(ichunk,ichunk),b(ichunk,ichunk)
1202      double precision value
1203      logical dotask, next_chunk
1204c
1205c   evalute sum_ij a_ij*b_ij
1206c
1207      value = 0.0d00
1208      call ga_zero(g_counter)
1209      dotask = next_chunk(lo,hi)
1210      ld = ichunk
1211      do while (dotask)
1212        call nga_get(g_a,lo,hi,a,ld)
1213        call nga_get(g_b,lo,hi,b,ld)
1214        do j = 1, hi(2)-lo(2)+1
1215          do i = 1, hi(1)-lo(1)+1
1216            value = value + a(i,j)*b(i,j)
1217          end do
1218        end do
1219        dotask = next_chunk(lo,hi)
1220      end do
1221      call ga_dgop(3,value,1,'+')
1222      contract_matrices=value
1223c
1224      return
1225      end
1226