#if HAVE_CONFIG_H
#   include "config.fh"
#endif
      program scf
C$Id: scf.F,v 1.18 2007/03/23 19:24:36 d3g293 Exp $
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
c
      integer*8 nints, maxint                                           !cste
c
c     CAUTION: integer precision requirements
c     nints, maxint, etc. are proportional to the number of basis functions
c     to the fourth power! 216**4 is greater than the largest number 
c     that can be represented as a 32-bit signed interger, so 64-bit 
c     arithmetic is needed to count integrals when calculating more than
c     14 Be atoms with 15 basis functions each. Since integrals are counted
c     over all iterations, 18 iterations with 7 atoms can result in precision
c     problems. Note that the wave function could be calculated correctly 
c     for much larger basis sets without 64-bit integers because the required 
c     indexing is usually proportional to nbfn**2, which is good to 46,340 
c     basis functions, except that the task counter runs as (nbfn/ichunk)**4, 
c     so with ichunk = 10, 32-bit integers yield correct wavefunctions out to 
c     2145 basis functions (maxatom=143), or 4290 (maxatom=286) with ichunk = 20, ...
c
c     This warning applies to the Global Arrays implementation as well! 
c     functions of special concern are ga_igop and nga_read_inc.
c
#define USE_TRANSFORM 1
      integer heap, stack
      data tinit, tonel, ttwoel, tdiag, tdens, tprint /6*0.0d0/
      data eone, etwo, energy, deltad /4*0.0d0/
c     
c     initalize the parallel message passing environment
c     
#include "mp3.fh"
      call ga_initialize()
c
c   Allocate memory
c
      heap =  32000000  
      stack = 32000000
      if (.not.ma_init(MT_DBL, stack, heap))
     +   call ga_error("ma_init failed",-1)
      call flush(6)
      me = ga_nodeid()
      nproc = ga_nnodes()
c     
c     initialize a bunch of stuff and initial density matrix
c     
      rjunk = timer()
c
c     get input from file be.inpt
c
      call input
c
c     create and allocate global arrays
c
      call setarrays
      if (ga_nodeid().eq.0) write(6,*) 'bytes of memory used by node 0:'
     +                                 ,ga_inquire_memory()
      call ininrm
c
c     create initial guess for density matrix by using single atom
c     densities
c
      call denges
      tinit = timer()
#if USE_TRANSFORM
c
c     make initial orthogonal orbital set for solution method using
c     similarity transform
c
      call makeob      
#endif
c     
c     make info for sparsity test
c     
      call makesz(schwmax)
c
c     print preliminary data before any long compute segments start
      if (ga_nodeid().eq.0) call flush(6)       
c     
c     *** iterate ***
c
      do 10 iter = 1, mxiter
c
c     make the one particle contribution to the fock matrix
c     and get the partial contribution to the energy
c    
         call oneel(schwmax, eone)
         tonel = tonel + timer()
c     
c     compute the two particle contributions to the fock matrix and
c     get the total energy.
c     
         call twoel(schwmax, etwo)
         ttwoel = ttwoel + timer()
c     
c     Diagonalize the fock matrix. The diagonalizers used in this
c     subroutine are actually sequential, not parallel.
c
         call diagon(tester,iter)
         tdiag = tdiag + timer()
c     
c     make the new density matrix in g_work from orbitals in g_orbs,
c     compute the norm of the change in the density matrix and
c     then update the density matrix in g_dens with damping.
c     
         call makden
         deltad = dendif()
         if (iter.eq.1) then
            scale = 0.0d0
         else if (iter .le. 5) then
            if (nbfn .gt. 60) then
               scale = 0.5d0
            else
               scale = 0.0d0
            endif
         else
            scale = 0.0d0
         endif
         call damp(scale)
         tdens = tdens + timer()
c     
c     add up energy and print out convergence information
c     
         if (me.eq.0) then
           energy = enrep + eone + etwo
           call prnout(iter, energy, deltad, tester)
           tprint = tprint + timer()
         endif
c     
c     if converged then exit iteration loop
c     
         if (deltad .lt. tol) goto 20
         call ga_igop(9, icut4, 1, '+')                         !cste
         if(icut4 .eq. 0) then                                  !cste
c     something has gone wrong--print what you know and quit.   
            write(6,*) 'no two-electron integrals computed!'    !cste
            goto 20                                             !cste
         endif                                                  !cste
 10   continue
      iter = iter - 1                                           !cste
      if(me.eq.0)
     $     write(6,*) ' SCF failed to converge in ', iter, ' iters'     
c...v....1....v....2....v....3....v....4....v....5....v....6....v....7..
c     
c     finished ... print out eigenvalues and occupied orbitals
c     
 20   continue
      call ga_igop(6, icut1, 1, '+')
      call ga_igop(7, icut2, 1, '+')
      call ga_igop(8, icut3, 1, '+')
      if (me.eq.0) then
c     
c     print out timing information
c     
         call prnfin(energy)
         write(6,1) tinit, tonel, ttwoel, tdiag, tdens, tprint,
     $        nproc
 1       format(/5x,' init ',4x,' onel ',4x,' twoel ',4x,' diag ',4x,   
     $          ' dens      print  ncpu'/                               
     $           5x,'------',4x,'------',4x,'-------',4x,'------',4x,   
     $          '------    ------ ------'/                            
     $          2f10.2,f11.2,3f10.2, i7/)
         totsec = tinit+tonel+ttwoel+tdiag+tdens+tprint 
         write(6,*)'elapsed time in seconds ',totsec   
c     
c     print out information on # integrals evaluated each iteration
c     

          nints = icut1+icut2+icut3
                  frac  = dble(icut3)/dble(nints)

               write(6,2) icut1, icut2, icut3, nints, frac
 2       format(/'No. of integrals screened or computed (all iters) '
     $          /'-------------------------------------'/
     $        /1x,' failed #ij test failed #kl test',
     $            '        #compute          #total',
     $            ' fraction',
     $        /1x,' --------------- ---------------',
     $            ' --------------- ---------------',
     $            ' --------',
     $        /1x,4(1x,i15),f9.6)
         maxint = nbfn                                  !cste   
         maxint = maxint**4 * iter                      !cste
         if(nints .ne. maxint) then                     !cste
            write(6,*)'Inconsistent number of integrals, should be ',   !cste
     $                 maxint                                           !cste
            write(6,*)'Note: largest 32-bit integer is 2,147,483,647'   !cste
       write(6,*)'Probably due to insufficient integer precision in GA.'!cste
         endif                                                          !cste
#ifdef MSG_COMMS_MPI
         call ga_print_stats()
#else
         call stats
#endif
      endif
c     
      call closearrays
      call ga_terminate
#ifdef MSG_COMMS_MPI
      call mpi_finalize
#else
      call pend
#endif
c     
      end
c
      subroutine makesz(schwmax)
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
      dimension work(ichunk,ichunk)
      integer lo(2),hi(2),i,j,iloc,jloc,ld
      logical dotask, next_chunk
c
c     schwarz(ij) = (ij|ij) for sparsity test
c
      icut1 = 0
      icut2 = 0
      icut3 = 0
c
      call ga_zero(g_schwarz)
      call ga_zero(g_counter)
      schwmax = 0.0d0
      dotask = next_chunk(lo,hi)
      ld = ichunk
      do while (dotask)
        do i = lo(1), hi(1)
          iloc = i - lo(1) + 1
          do j = lo(2), hi(2)
            jloc = j - lo(2) + 1
            call g(gg,i,j,i,j)
            work(iloc,jloc) = sqrt(gg)
            schwmax = max(schwmax, work(iloc,jloc))
          end do
        end do
        call nga_put(g_schwarz,lo,hi,work,ld)
        dotask = next_chunk(lo,hi)
      end do
      call ga_dgop(11,schwmax,1,'max')
c
      return
      end
c
      subroutine ininrm
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
      integer*8 maxint          
c
c     write a little welcome message
c
      maxint = nbfn             
      maxint = maxint**4        
      if (ga_nodeid().eq.0) then                                
      write(6,1) natom, nocc, nbfn, maxint, tol, ichunk
1     format(/' Example Direct Self Consistent Field Program '/
     $        ' -------------------------------------------- '//
     $        ' no. of atoms .............. ',i5/       
     $        ' no. of occupied orbitals .. ',i5/       
     $        ' no. of basis functions .... ',i5/       
     $        ' basis functions^4 '          ,i15/      
     $        ' convergence threshold ..... ',1pd9.2/
     $        ' chunk size .................',i5)   
      write(6,*)                                                !cste
      call flush(6)                                             !cste
      endif                                                     !cste
c
c     generate normalisation coefficients for the basis functions
c     and the index array iky
c
      do 10 i = 1, nbfn
         iky(i) = i*(i-1)/2
 10   continue
c
      do 20 i = 1, nbfn
         rnorm(i) = (expnt(i)*2.0d0/pi)**0.75d0
 20   continue
c
c     initialize common for computing f0
c
      call setfm
c
      end
      double precision function h(i,j)
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
cvd$r novector
cvd$r noconcur
c
c     generate the one particle hamiltonian matrix element
c     over the normalized primitive 1s functions i and j
c
      f0val = 0.0d0
      sum = 0.0d0
      rab2 = (x(i)-x(j))**2 + (y(i)-y(j))**2 + (z(i)-z(j))**2
      facij = expnt(i)*expnt(j)/(expnt(i)+expnt(j))
      expij = exprjh(-facij*rab2)
      repij = (2.0d0*pi/(expnt(i)+expnt(j))) * expij
c
c     first do the nuclear attraction integrals
c
      do 10 iat = 1, natom
         xp = (x(i)*expnt(i) + x(j)*expnt(j))/(expnt(i)+expnt(j))
         yp = (y(i)*expnt(i) + y(j)*expnt(j))/(expnt(i)+expnt(j))
         zp = (z(i)*expnt(i) + z(j)*expnt(j))/(expnt(i)+expnt(j))
         rpc2 = (xp-ax(iat))**2 + (yp-ay(iat))**2 + (zp-az(iat))**2
c
         call f0(f0val, (expnt(i)+expnt(j))*rpc2)
         sum = sum - repij * q(iat) * f0val
 10   continue
c
c     add on the kinetic energy term
c
      sum = sum + facij*(3.0d0-2.0d0*facij*rab2) *
     $     (pi/(expnt(i)+expnt(j)))**1.5d0 * expij
c
c     finally multiply by the normalization constants
c
      h = sum * rnorm(i) * rnorm(j)
c
      end
      double precision function s(i,j)
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
c
c     generate the overlap matrix element between the normalized
c     primitve gaussian 1s functions i and j
c
      rab2 = (x(i)-x(j))**2 + (y(i)-y(j))**2 + (z(i)-z(j))**2
      facij = expnt(i)*expnt(j)/(expnt(i)+expnt(j))
      s = (pi/(expnt(i)+expnt(j)))**1.5d0 * exprjh(-facij*rab2) *
     $     rnorm(i)*rnorm(j)
c
      end
      subroutine makden
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
c      dimension work(maxnbfn,maxnbfn), torbs(maxnbfn,maxnbfn)
      dimension work(ichunk,ichunk), orbsi(ichunk,maxnbfn)
      dimension orbsj(ichunk,maxnbfn)
      integer lo(2), hi(2), tlo(2), thi(2), i, j, iloc, jloc, ld
      logical dotask, next_chunk
c
c     generate density matrix from orbitals in g_orbs. the first
c     nocc orbitals are doubly occupied.
c
      call ga_zero(g_counter)
      dotask = next_chunk(lo,hi)
      ld = ichunk
      do while (dotask)
        tlo(1) = lo(1)
        thi(1) = hi(1)
        tlo(2) = 1
        thi(2) = nocc
        call nga_get(g_orbs,tlo,thi,orbsi,ld)
        tlo(1) = lo(2)
        thi(1) = hi(2)
        call nga_get(g_orbs,tlo,thi,orbsj,ld)
        do i = lo(1), hi(1)
          iloc = i - lo(1) + 1
          do j = lo(2), hi(2)
            jloc = j - lo(2) + 1
            p = 0.0d00
            do k = 1, nocc
              p = p + orbsi(iloc,k)*orbsj(jloc,k)
            end do
            work(iloc,jloc) = 2.0d00*p
          end do
        end do
        call nga_put(g_work,lo,hi,work,ld)
        dotask = next_chunk(lo,hi)
      end do
      return
      end
c
      subroutine oneel(schwmax, eone)
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
      integer lo(2), hi(2), i, j, iloc, jloc, ld
      dimension work(ichunk,ichunk),tfock(ichunk,ichunk)
      logical dotask, next_chunk
c
c     fill in the one-electron part of the fock matrix and
c     compute the one-electron energy contribution
c

      me = ga_nodeid()
      nproc = ga_nnodes()
c
      call ga_zero(g_counter)
      dotask = next_chunk(lo,hi)
      ld = ichunk
      do while (dotask)
        call nga_get(g_schwarz,lo,hi,work,ld)
        do j = lo(2), hi(2)
          jloc = j - lo(2) + 1
          do i = lo(1), hi(1)
            iloc = i - lo(1) + 1
            tfock(iloc,jloc) = 0.0d00
            if (work(iloc,jloc)*schwmax.gt.tol2e)
     +        tfock(iloc,jloc) = h(i,j)
          end do
        end do
        call nga_put(g_fock,lo,hi,tfock,ld)
        dotask = next_chunk(lo,hi)
      end do
      eone = 0.5d00*contract_matrices(g_fock,g_dens)
c
      end
#if 0
      integer function nxtask(nproc)
      parameter (ichunk = 10)
      save icount, nleft
      data nleft, icount /0, 0/
c
c     wrapper round nxtval() to increase granularity
c     and thus reduce no. of requests to shared counter
c
      if(nproc.gt.0) then
         if(nleft.eq.0) then
#ifdef MSG_COMMS_MPI
            icount = nxtval_ga(nproc) * ichunk
#else
            icount = nxtval(nproc) * ichunk
#endif
            nleft = ichunk
         endif
         nxtask = icount
         icount = icount + 1
         nleft = nleft -1
      else
          nleft = 0
          nxtask = 0
#ifdef MSG_COMMS_MPI
          junk = nxtval_ga(nproc)
#else
          junk = nxtval(nproc)
#endif
      endif
c
c     following does dumb static load balancing
c
c$$$      if(nproc.gt.0) then
c$$$         if (nleft .eq. 0) then
c$$$            icount = ga_nodeid()
c$$$            nleft = 1
c$$$         endif
c$$$         nxtask = icount
c$$$         icount = icount + ga_nnodes()
c$$$      else
c$$$          nleft = 0
c$$$          nxtask = 0
c$$$      endif
      end
#endif
c
      logical function next_chunk(lo,hi)
#include "cscf.h"
      integer one
      parameter (one = 1)
      integer imax, lo(2), hi(2), ilo, jlo
      itask = nga_read_inc(g_counter,one,one)
      imax = nbfn/ichunk
      if (nbfn - ichunk*imax.gt.0) imax = imax + 1 
      if (itask.lt.imax*imax) then
        ilo = mod(itask,imax)
        jlo = (itask-ilo)/imax
        lo(1) = ilo*ichunk + 1
        lo(2) = jlo*ichunk + 1
        hi(1) = min((ilo+1)*ichunk,nbfn)
        hi(2) = min((jlo+1)*ichunk,nbfn)
        next_chunk = .true.
      else
        next_chunk = .false.
      endif
      return
      end
c
      logical function next_4chunk(lo,hi,ilo,jlo,klo,llo)
#include "cscf.h"
      integer one
      parameter (one = 1)
      integer*8 imax, itask, itmp                               !cste
      integer lo(4), hi(4), ilo, jlo, klo, llo                  !cste
c
      itask = nga_read_inc(g_counter,one,one)
      imax = nbfn/ichunk
      if (nbfn - ichunk*imax.gt.0) imax = imax + 1
      if (itask. lt. 0) then                                    !cste
         write(6,*) 'next_4chunk: itask negative:',itask,       !cste
     *             ' imax:',imax,' nbfn:',nbfn,' ichunk:',ichunk!cste
         write(6,*) 'probable GA integer precision problem if ' !cste
     *             ,'imax^4 > 2^31'                             !cste
         call flush(6)                                          !cste
         stop 'next_4chunk'                                     !cste
      end if                                                    !cste
      if (itask.lt.imax**4) then
        ilo = mod(itask,imax)
        itmp = (itask - ilo)/imax
        jlo = mod(itmp,imax)
        itmp = (itmp - jlo)/imax
        klo = mod(itmp,imax)
        llo = (itmp - klo)/imax
        lo(1) = ilo*ichunk + 1
        lo(2) = jlo*ichunk + 1
        lo(3) = klo*ichunk + 1
        lo(4) = llo*ichunk + 1
        hi(1) = min((ilo+1)*ichunk,nbfn)
        hi(2) = min((jlo+1)*ichunk,nbfn)
        hi(3) = min((klo+1)*ichunk,nbfn)
        hi(4) = min((llo+1)*ichunk,nbfn)
        next_4chunk = .true.
      else
        next_4chunk = .false.
      endif
      return
      end
c
      subroutine clean_chunk(chunk)
#include "cscf.h"
      double precision chunk(ichunk,ichunk)
      integer i,j
      do j = 1, ichunk
        do i = 1, ichunk
          chunk(i,j) = 0.0d00
        end do
      end do
      return
      end
c
      subroutine twoel(schwmax, etwo)
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
      double precision f_ij(ichunk,ichunk),d_kl(ichunk,ichunk)
      double precision f_ik(ichunk,ichunk),d_jl(ichunk,ichunk)
      double precision s_ij(ichunk,ichunk),s_kl(ichunk,ichunk)
      double precision schwmax, one
cste  integer nproc                                             !cste
      integer*8 ijkls, ijcnt,klcnt,ijklcnt                      !cste
      integer lo(4),hi(4),lo_ik(2),hi_ik(2),lo_jl(2),hi_jl(2)   !cste
      integer i,j,k,l,iloc,jloc,kloc,lloc,ld,ich,it,jt,kt,lt    !cste
      logical dotask, next_4chunk
c     
c     add in the two-electron contribution to the fock matrix
c     
cste  nproc = ga_nnodes()
      one = 1.0d00
      ijcnt = icut1                                     !cste
      klcnt = icut2                                     !cste
      ijklcnt = icut3                                   !cste
c
      call ga_zero(g_counter)
      ld = maxnbfn
      ich = ichunk
      dotask = next_4chunk(lo,hi,it,jt,kt,lt)
      itask = 0
cste  ijkls = 0                                         !cste
      do while (dotask)
cste  ijkl=(hi(1)-lo(1)+1)*(hi(2)-lo(2)+1)*             !cste
cste *     (hi(3)-lo(3)+1)*(hi(4)-lo(4)+1)              !cste
cste  ijkls = ijkls + ijkl                              !cste
cste  write(6,*)itask,lo,hi,ijkl,ijkls                  !cste
        lo_ik(1) = lo(1)
        lo_ik(2) = lo(3)
        hi_ik(1) = hi(1)
        hi_ik(2) = hi(3)
        lo_jl(1) = lo(2)
        lo_jl(2) = lo(4)
        hi_jl(1) = hi(2)
        hi_jl(2) = hi(4)
        call nga_get(g_schwarz,lo,hi,s_ij,ich)
        call nga_get(g_schwarz,lo(3),hi(3),s_kl,ich)
        call nga_get(g_dens,lo(3),hi(3),d_kl,ich)
        call nga_get(g_dens,lo_jl,hi_jl,d_jl,ich)
        itask = itask + 1
        call clean_chunk(f_ij)
        call clean_chunk(f_ik)
        do i = lo(1), hi(1)
          iloc = i-lo(1) + 1
          do j = lo(2), hi(2)
            jloc = j-lo(2) + 1
            if (s_ij(iloc,jloc)*schwmax .lt. tol2e) then
              icut1 = icut1 + (hi(3)-lo(3)+1)*(hi(4)-lo(4)+1)   !cste
            else
              do k = lo(3), hi(3)
                kloc = k-lo(3) + 1
                do l = lo(4), hi(4)
                  lloc = l-lo(4) + 1
                  if (s_ij(iloc,jloc)*s_kl(kloc,lloc).lt.tol2e) then
                    icut2 = icut2 + 1
                  else
                    call g(gg, i, j, k, l)
                    f_ij(iloc,jloc) = f_ij(iloc,jloc)
     +                              + gg*d_kl(kloc,lloc)
                    f_ik(iloc,kloc) = f_ik(iloc,kloc)
     +                              - 0.5d00*gg*d_jl(jloc,lloc)
                    icut3 = icut3 + 1
                  endif
                end do
              end do
            endif
          end do
        end do
        call nga_acc(g_fock,lo,hi,f_ij,ich,one)
        call nga_acc(g_fock,lo_ik,hi_ik,f_ik,ich,one)
        dotask = next_4chunk(lo,hi,it,jt,kt,lt)
      end do
      etwo = 0.5d00*contract_matrices(g_fock,g_dens)
      ijcnt = icut1 - ijcnt
      klcnt = icut2 - klcnt
      ijklcnt = icut3 - ijklcnt
cste  write(6,*) 'node ', ga_nodeid(), ijcnt, klcnt, ijklcnt    !cste
cste *           ,icut1,icut2,icut3                             !cste
cste  call flush(6)                                             !cste
      icut4 = icut3                                             !cste
      if (icut3 .gt. 0) return                                  !cste
c
c    no integrals may be calculated if there is no work for 
c    this node (ichunk too big), or, something is wrong
c
      write(6,*) 'no two-electron integrals computed by node',  !cste
     *            ga_nodeid()                                   !cste
      call flush(6)                                             !cste
      return
cste  stop 'twoel computed no integrals'                        !cste
      end
c
      subroutine damp(fac)
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
c
c    create damped density matrix as a linear combination of
c    old density matrix and density matrix formed from new orbitals
c
      ofac = 1.0d0 - fac
      call ga_add(fac,g_dens,ofac,g_work,g_dens)
      return
      end
c
      subroutine prnout(iter, energy, deltad, tester)
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
c
c     printout results of each iteration
c
      if (ga_nodeid().ne.0) return
      write(6,1) iter, energy, deltad, tester
      call flush(6)
1     format(' iter=',i3,', energy=',f15.8,', deltad=',1pd9.2,  
     $     ', deltaf=',d9.2)
      return
      end
c
      double precision function dendif()
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
      double precision xdiff
      dimension dens_c(ichunk,ichunk),work_c(ichunk,ichunk)
      integer lo(2), hi(2), i, j, ld
      logical dotask, next_chunk
c
c     compute largest change in density matrix elements
c
      denmax = 0.0d0
      call ga_zero(g_counter)
      dotask = next_chunk(lo,hi)
      ld = ichunk
      do while(dotask)
        call nga_get(g_dens,lo,hi,dens_c,ld)
        call nga_get(g_work,lo,hi,work_c,ld)
        do j = 1, hi(2)-lo(2)+1
          do i = 1, hi(1)-lo(1)+1
            xdiff = abs(dens_c(i,j)-work_c(i,j))
            if (xdiff.gt.denmax) denmax = xdiff
          end do
        end do
        dotask = next_chunk(lo,hi)
      end do
      call ga_dgop(1,denmax,1,'max')
      dendif = denmax
      return
      end
c
      double precision function testfock()
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
      double precision xmax, xtmp
      dimension work(ichunk,ichunk)
      integer lo(2), hi(2), i, j, iloc, jloc, ld
      logical dotask, next_chunk
c
c     compute largest change in density matrix elements
c
      xmax = 0.0d0
      call ga_zero(g_counter)
      dotask = next_chunk(lo,hi)
      ld = ichunk
      do while(dotask)
        call nga_get(g_fock,lo,hi,work,ld)
        do j = lo(2), hi(2)
          jloc = j - lo(2) + 1
          do i = lo(1), hi(1)
            iloc = i - lo(1) + 1
            if (i.ne.j) then
              xtmp = abs(work(iloc,jloc))
              if (xtmp.gt.xmax) xmax = xtmp
            endif
          end do
        end do
        dotask = next_chunk(lo,hi)
      end do
      call ga_dgop(1,xmax,1,'max')
      testfock = xmax
      return
      end
c
      subroutine shiftfock(shift)
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
      double precision shift
      dimension work(ichunk,ichunk)
      integer lo(2), hi(2), i, j, iloc, jloc, ld, icnt
      logical dotask, next_chunk
c
c     compute largest change in density matrix elements
c
      call ga_zero(g_counter)
      dotask = next_chunk(lo,hi)
      ld = ichunk
      do while(dotask)
        call nga_get(g_fock,lo,hi,work,ld)
        icnt = 0
        do j = lo(2), hi(2)
          jloc = j - lo(2) + 1
          do i = lo(1), hi(1)
            iloc = i - lo(1) + 1
            if (i.eq.j.and.i.gt.nocc) then
              work(iloc,jloc) = work(iloc,jloc) + shift
              icnt = icnt + 1
            endif
          end do
        end do
        if (icnt.gt.0) call nga_put(g_fock,lo,hi,work,ld)
        dotask = next_chunk(lo,hi)
      end do
      return
      end
c
      subroutine prnfin(energy)
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
      dimension orbs(maxnbfn, maxnbfn)
      integer lo(2),hi(2),ld
c
c     printout final results
c
      if (ga_nodeid().ne.0) return
      write(6,1) energy
 1    format(//' final energy = ',f18.11//' eigenvalues') 
      call output(eigv, 1, min(nbfn,nocc+5), 1, 1, nbfn, 1, 1)
c
      return
      end
      subroutine g(value,i,j,k,l)
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
c
c     compute the two electon integral (ij|kl) over normalized
c     primitive 1s gaussians
c
      f0val = 0.0d0
      rab2 = (x(i)-x(j))**2 + (y(i)-y(j))**2 + (z(i)-z(j))**2
      rcd2 = (x(k)-x(l))**2 + (y(k)-y(l))**2 + (z(k)-z(l))**2
      facij = expnt(i)*expnt(j)/(expnt(i)+expnt(j))
      fackl = expnt(k)*expnt(l)/(expnt(k)+expnt(l))
      exijkl = exprjh(- facij*rab2 - fackl*rcd2)
      denom = (expnt(i)+expnt(j))*(expnt(k)+expnt(l)) *
     $        sqrt(expnt(i)+expnt(j)+expnt(k)+expnt(l))
      fac = (expnt(i)+expnt(j))*(expnt(k)+expnt(l)) /
     $        (expnt(i)+expnt(j)+expnt(k)+expnt(l))
c
      xp = (x(i)*expnt(i) + x(j)*expnt(j))/(expnt(i)+expnt(j))
      yp = (y(i)*expnt(i) + y(j)*expnt(j))/(expnt(i)+expnt(j))
      zp = (z(i)*expnt(i) + z(j)*expnt(j))/(expnt(i)+expnt(j))
      xq = (x(k)*expnt(k) + x(l)*expnt(l))/(expnt(k)+expnt(l))
      yq = (y(k)*expnt(k) + y(l)*expnt(l))/(expnt(k)+expnt(l))
      zq = (z(k)*expnt(k) + z(l)*expnt(l))/(expnt(k)+expnt(l))
      rpq2 = (xp-xq)**2 + (yp-yq)**2 + (zp-zq)**2
c
      call f0(f0val, fac*rpq2)
      value = (2.0d0 * pi**2.5d0 / denom) * exijkl * f0val *
     $    rnorm(i)*rnorm(j)*rnorm(k)*rnorm(l)
      return
      end
c
      subroutine diagon(tester, iter)
c      subroutine diagon(fock, orbs, evals, work, tester, iter)
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
      double precision r_zero, r_one, shift, tester
c
#if USE_TRANSFORM
c
c     use similarity transform to solve standard eigenvalue problem
c     (overlap matrix has been transformed out of the problem)
c
      r_one = 1.0d00
      r_zero = 0.0d00
      call ga_dgemm('n','n',nbfn,nbfn,nbfn,r_one,g_fock,g_orbs,
     +               r_zero,g_tfock)
      call ga_dgemm('t','n',nbfn,nbfn,nbfn,r_one,g_orbs,g_tfock,
     +               r_zero,g_fock)
      tester = testfock()
      shift = 0.0d00
      if (tester.gt.0.3d0) then
        shift = 0.3d0
      else
        if (nbfn .gt. 60) then
          shift = 0.1d0
        else
          shift = 0.0d0
        endif
      endif
      if (iter.ge.2.and.shift.ne.0.0d00) then
        call shiftfock(shift)
      endif
      call ga_copy(g_orbs,g_tfock)
      call ga_diag_std_seq(g_fock, g_work, eigv)
c
c     Back transform eigenvectors
c
      call ga_dgemm('n','n',nbfn,nbfn,nbfn,r_one,g_tfock,g_work,
     +               r_zero,g_orbs)
      if (iter.ge.2.and.shift.ne.0.0d00) then
         do 50 i = nocc+1, nbfn
            eigv(i) = eigv(i) - shift
 50      continue
      endif
#else
c
c     Keep remaking overlap matrix since ga_diag_seq does not
c     guarantee that g_ident is preserved.
c
         call makoverlap
         call ga_diag_seq(g_fock, g_ident, g_orbs, eigv)
         tester = 0.0d00
#endif
      return
      end
c
      subroutine makeob
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
      double precision work(ichunk,ichunk),orbs(ichunk,ichunk)
      double precision eval(maxnbfn)
      integer lo(2),hi(2),ld,me,i,j,iloc,jloc
      logical dotask, next_chunk
c
c     generate set of orthonormal vectors by creating a random
c     symmetric matrix and solving associated generalized eigenvalue
c     problem using the correct overlap matrix.
c
      me = ga_nodeid()
      call ga_zero(g_counter)
      dotask = next_chunk(lo,hi)
      ld = ichunk
      do while (dotask)
        do j = lo(2), hi(2)
          jloc = j - lo(2) + 1
          do i = lo(1), hi(1)
            iloc = i - lo(1) + 1
            work(iloc,jloc) = s(i,j)
            orbs(iloc,jloc) = drand(0)
          end do
        end do
        call nga_put(g_ident,lo,hi,work,ld)
        call nga_put(g_fock,lo,hi,orbs,ld)
        dotask = next_chunk(lo,hi)
      end do
      call ga_symmetrize(g_fock)
      call ga_diag_seq(g_fock, g_ident, g_orbs, eval)
c
      return
      end
c
      subroutine denges
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
c
c     Form guess density from superposition of atomic densities in the AO
c     basis set ... instead of doing the atomic SCF hardwire for this
c     small basis set for the Be atom.
c
      integer one, itask, lo(2), hi(2), ld
      dimension atdens(15,15)
      data atdens/
     $     0.000002,0.000027,0.000129,0.000428,0.000950,0.001180,
     $     0.000457,-0.000270,-0.000271,0.000004,0.000004,0.000004,
     $     0.000004,0.000004,0.000004,0.000027,0.000102,0.000987,
     $     0.003269,0.007254,0.009007,0.003492,-0.002099,-0.002108,
     $     0.000035,0.000035,0.000035,0.000035,0.000035,0.000035,
     $     0.000129,0.000987,0.002381,0.015766,0.034988,0.043433,
     $     0.016835,-0.010038,-0.010082,0.000166,0.000166,0.000166,
     $     0.000166,0.000166,0.000166,0.000428,0.003269,0.015766,
     $     0.026100,0.115858,0.144064,0.055967,-0.035878,-0.035990,
     $     0.000584,0.000584,0.000584,0.000584,0.000584,0.000584,
     $     0.000950,0.007254,0.034988,0.115858,0.128586,0.320120,
     $     0.124539,-0.083334,-0.083536,0.001346,0.001346,0.001346,
     $     0.001346,0.001346,0.001346,0.001180,0.009007,0.043433,
     $     0.144064,0.320120,0.201952,0.159935,-0.162762,-0.162267,
     $     0.002471,0.002471,0.002471,0.002471,0.002471,0.002471,
     $     0.000457,0.003492,0.016835,0.055967,0.124539,0.159935,
     $     0.032378,-0.093780,-0.093202,0.001372,0.001372,0.001372,
     $     0.001372,0.001372,0.001372,-0.000270,-0.002099,-0.010038,
     $     -0.035878,-0.083334,-0.162762,-0.093780,0.334488,0.660918,
     $     -0.009090,-0.009090,-0.009090,-0.009090,-0.009090,-0.009090,
     $     -0.000271,-0.002108,-0.010082,-0.035990,-0.083536,-0.162267,
     $     -0.093202,0.660918,0.326482,-0.008982,-0.008982,-0.008981,
     $     -0.008981,-0.008981,-0.008982,0.000004,0.000035,0.000166,
     $     0.000584,0.001346,0.002471,0.001372,-0.009090,-0.008982,
     $     0.000062,0.000124,0.000124,0.000124,0.000124,0.000124,
     $     0.000004,0.000035,0.000166,0.000584,0.001346,0.002471,
     $     0.001372,-0.009090,-0.008982,0.000124,0.000062,0.000124,
     $     0.000124,0.000124,0.000124,0.000004,0.000035,0.000166,
     $     0.000584,0.001346,0.002471,0.001372,-0.009090,-0.008981,
     $     0.000124,0.000124,0.000062,0.000124,0.000124,0.000124,
     $     0.000004,0.000035,0.000166,0.000584,0.001346,0.002471,
     $     0.001372,-0.009090,-0.008981,0.000124,0.000124,0.000124,
     $     0.000062,0.000124,0.000124,0.000004,0.000035,0.000166,
     $     0.000584,0.001346,0.002471,0.001372,-0.009090,-0.008981,
     $     0.000124,0.000124,0.000124,0.000124,0.000062,0.000124,
     $     0.000004,0.000035,0.000166,0.000584,0.001346,0.002471,
     $     0.001372,-0.009090,-0.008982,0.000124,0.000124,0.000124,
     $     0.000124,0.000124,0.000062/
c
c   Create initial guess for density matrix in global array
c
      call ga_zero(g_dens)
      call ga_zero(g_counter)
      one = 1
      ld = 15
c
c   Correct for a factor of two along the diagonal
c
      do i = 1, ld
        atdens(i,i) = 2.0d00*atdens(i,i)
      end do
      itask = nga_read_inc(g_counter,one,one)
      do while(itask.lt.natom)
        ioff = itask*15
        lo(1) = ioff+1
        lo(2) = ioff+1
        hi(1) = ioff+15
        hi(2) = ioff+15
        call nga_put(g_dens,lo,hi,atdens,ld)
        itask = nga_read_inc(g_counter,one,one)
      end do
      call ga_sync
      return
      end
c
      subroutine setarrays
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
      integer one, two, dims(2)
      logical status
      one = 1
      two = 2
      g_counter = ga_create_handle()
      call ga_set_data(g_counter,one,one,MT_INT)
      status = ga_allocate(g_counter)
      call ga_zero(g_counter)

      dims(1) = nbfn
      dims(2) = nbfn
      g_dens = ga_create_handle()
      call ga_set_data(g_dens, two, dims, MT_DBL)
      status = ga_allocate(g_dens)
      call ga_zero(g_dens)

      g_schwarz = ga_create_handle()
      call ga_set_data(g_schwarz, two, dims, MT_DBL)
      status = ga_allocate(g_schwarz)
      call ga_zero(g_schwarz)

      g_fock = ga_create_handle()
      call ga_set_data(g_fock, two, dims, MT_DBL)
      status = ga_allocate(g_fock)
      call ga_zero(g_fock)

      
      g_tfock = ga_create_handle()
      call ga_set_data(g_tfock, two, dims, MT_DBL)
      status = ga_allocate(g_tfock)
      call ga_zero(g_tfock)

      g_work = ga_create_handle()
      call ga_set_data(g_work, two, dims, MT_DBL)
      status = ga_allocate(g_work)
      call ga_zero(g_work)

      g_ident = ga_create_handle()
      call ga_set_data(g_ident, two, dims, MT_DBL)
      status = ga_allocate(g_ident)
      call ga_zero(g_ident)

      g_orbs = ga_create_handle()
      call ga_set_data(g_orbs, two, dims, MT_DBL)
      status = ga_allocate(g_orbs)
      call ga_zero(g_orbs)

      return
      end
      subroutine closearrays
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
      logical status
c
      status = ga_destroy(g_counter)
      status = ga_destroy(g_dens)
      status = ga_destroy(g_schwarz)
      status = ga_destroy(g_fock)
      status = ga_destroy(g_tfock)
      status = ga_destroy(g_work)
      status = ga_destroy(g_ident)
      status = ga_destroy(g_orbs)
c
      return
      end
c
      subroutine makoverlap
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
      integer me, lo(2), hi(2), ptr, ld(2)
      integer ld1, ld2
      me = ga_nodeid()
      call nga_distribution(g_ident, me, lo, hi)
      call nga_access(g_ident, lo, hi, ptr, ld)
      ld1 = hi(1) - lo(1) + 1
      ld2 = hi(2) - lo(2) + 1
      call setoverlap(dbl_mb(ptr),lo,hi,ld1,ld2)
      call nga_release(g_ident)
      return
      end
c
      subroutine setoverlap(a,lo,hi,ld1,ld2)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
      integer lo(2), hi(2)
      integer ld1, ld2, ii, jj
      double precision a(ld1,ld2)
      do i = 1, ld1
        ii = i + lo(1) - 1
        do j = 1, ld2
          jj = j + lo(2) - 1
#if USE_TRANSFORM
          if (ii.eq.jj) then
            a(i,j) = 1.0d00
          else
            a(i,j) = 0.0d00
          endif
#else
          a(i,j) = s(ii,jj)
#endif
        end do
      end do
      return
      end
c
      subroutine print_ga_block(g_a)
      implicit double precision(a-h,o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
      integer lo(2), hi(2), ptr, ld1, ld2
c
      me = ga_nodeid()
      call nga_distribution(g_a, me, lo, hi)
      ld1 = hi(1) - lo(1) + 1
      ld2 = hi(2) - lo(2) + 1
      call nga_access(g_a, lo, hi, ptr, ld)
      call dump_chunk(dbl_mb(ptr),ld1,ld2)
      call nga_release(g_a)
c
      return
      end
c
      subroutine print_ga_block_ij(g_a,tlo)
      implicit double precision(a-h,o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
      integer lo(2), hi(2), ptr, ld1, ld2
c
      me = ga_nodeid()
      call nga_distribution(g_a, me, lo, hi)
      ld1 = hi(1) - lo(1) + 1
      ld2 = hi(2) - lo(2) + 1
      call nga_access(g_a, tlo, hi, ptr, ld)
      call dump_chunk(dbl_mb(ptr),ld1,ld2)
      call nga_release(g_a)
c
      return
      end
c
      subroutine dump_chunk(a,ld1,ld2)
      implicit double precision (a-h, o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
      integer ld1, ld2
      double precision a(ld1, ld2)
      do i = 1, min(10,ld1)
        write(6,100) (a(i,j), j = 1, min(10,ld2))
      end do
      write(6,*)
      trace = 0.0d0
      do i=1,ld2
         trace = trace +a(i,i)
      end do
      write(6,*) 'trace=',trace
  100 format(10f10.4)
      return
      end
c
      double precision function contract_matrices(g_a,g_b)
      implicit double precision(a-h,o-z)
#include "cscf.h"
#include "mafdecls.fh"
#include "global.fh"
#include "mp3def.fh"
      integer lo(2), hi(2), ptr_a, ptr_b, ld, ld1, ld2
      double precision a(ichunk,ichunk),b(ichunk,ichunk)
      double precision value
      logical dotask, next_chunk
c
c   evalute sum_ij a_ij*b_ij
c
      value = 0.0d00
      call ga_zero(g_counter)
      dotask = next_chunk(lo,hi)
      ld = ichunk
      do while (dotask)
        call nga_get(g_a,lo,hi,a,ld)
        call nga_get(g_b,lo,hi,b,ld)
        do j = 1, hi(2)-lo(2)+1
          do i = 1, hi(1)-lo(1)+1
            value = value + a(i,j)*b(i,j)
          end do
        end do
        dotask = next_chunk(lo,hi)
      end do
      call ga_dgop(3,value,1,'+')
      contract_matrices=value
c
      return
      end