1! ---
2! Copyright (C) 1996-2016	The SIESTA group
3!  This file is distributed under the terms of the
4!  GNU General Public License: see COPYING in the top directory
5!  or http://www.gnu.org/copyleft/gpl.txt .
6! See Docs/Contributors.txt for a list of contributors.
7! ---
8module m_dminim
9
10use fdf,            only : fdf_boolean, fdf_integer, fdf_get, fdf_physical
11use files,          only : slabel
12use parallel,       only : ProcessorY, BlockSize, Node, Nodes
13use precision,      only : dp
14use siesta_options, only : fixspin
15use sys,            only : die
16#ifdef MPI
17use mpi_siesta,     only : mpi_integer, mpi_double_precision, mpi_comm_world, mpi_sum, mpi_status_size
18use parallelsubs,   only : GetNodeOrbs, GlobalToLocalOrb, WhichNodeOrb
19use parallelsubs,   only : set_BlockSizeDefault
20#endif
21
22implicit none
23
24external :: io_assign, io_close
25
26!**** PRIVATE ***********************************!
27
28private
29
30type multispin
31  real(dp), allocatable :: mtrx(:,:)
32end type multispin
33
34real(dp), parameter :: Pi=3.141592653589793238462643383279502884197_dp
35
36logical, save :: LongOut               ! print detailed output?
37logical, save :: UseCholesky           ! use Cholesky factorization?
38logical, save :: UseSparse             ! use sparse algebra?
39logical, save :: WriteCoeffs           ! write WF coeffs. at the end of each MD iteration?
40logical, save :: ReadCoeffs            ! read WF coeffs. at the start of the calculation?
41logical, save :: FirstCall(1:2)=.true. ! first time minim_cg is called? (up/down spin)
42#ifdef MPI
43logical, save :: Use2D                 ! use 2D data decomposition?
44#endif
45
46integer, save :: N_occ_loc_1D(1:2)         ! num. of WFs (local 1D) (up/down spin)
47integer, save :: h_dim_loc_1D              ! num. of AOs (local 1D)
48integer, save :: N_occ_loc(1:2,1:2)        ! num. of WFs (local 1D or 2D) (up/down spin)
49integer, save :: h_dim_loc(1:2)            ! num. of AOs (local 1D or 2D) (up/down spin)
50integer, save :: N_occ_diff                ! difference between num. of WFs for up/down spin
51#ifdef MPI
52integer, save :: BlockSize_c               ! ScaLAPACK blocking factor for WFs
53integer, save :: desc1(1:9)                ! descriptor for operator matrix in AO basis (1D or 2D)
54integer, save :: desc2(1:9,1:2)            ! descriptor for operator matrix in WF basis (1D or 2D) (up/down spin)
55integer, save :: desc3(1:9,1:2)            ! descriptor for WF coeffs. matrix (1D or 2D) (up/down spin)
56integer, allocatable :: numh_recv(:)       ! num. of nonzero elements of each row of sparse matrices
57integer, allocatable :: listhptr_recv(:)   ! pointer to start of row in listh
58integer, allocatable :: listh_recv(:)      ! list of nonzero elements of each row of sparse matrices
59integer, allocatable :: h_dim_l2g_recv(:)  ! local-to-global index transform for AOs
60integer, allocatable, save :: h_dim_l2g(:) ! local-to-global index transform for AOs
61#endif
62
63#ifdef MPI
64real(dp), allocatable :: As_recv(:) ! work matrix
65#endif
66
67type(multispin), save :: c(1:2)      ! WF coeffs. matrix (after Cholesky fact.) (up/down spin)
68type(multispin), save :: c_orig(1:2) ! WF coeffs. matrix (before Cholesky fact.) (up/down spin)
69
70!**** PUBLIC ************************************!
71
72public :: dminim
73
74!************************************************!
75
76contains
77
78!================================================!
79! use the orbital minimization method (OMM) to   !
80! solve the eigenvalue problem (double precision !
81! routine, for Gamma point-only calculations)    !
82!================================================!
83subroutine dminim(CalcE,PreviousCallDiagon,iscf,istp,nbasis,nspin,h_dim,nhmax,numh,listhptr,listh,d_sparse,eta,qs,h_sparse,&
84    s_sparse,t_sparse)
85
86  use densematrix, only : psi, allocDenseMatrix, resetDenseMatrix
87  implicit none
88
89  !**** INPUT ***********************************!
90
91  logical, intent(in) :: CalcE              ! Calculate the energy-density matrix from the existing coeffs.?
92  logical, intent(in) :: PreviousCallDiagon ! Previous SCF iteration solved by diagonalization?
93
94  integer, intent(in) :: iscf               ! SCF iteration num.
95  integer, intent(in) :: istp               ! MD iteration num.
96  integer, intent(in) :: nbasis             ! dimension of numh and listhptr
97  integer, intent(in) :: nspin              ! num. of spins
98  integer, intent(in) :: h_dim              ! num. of AOs (global)
99  integer, intent(in) :: nhmax              ! first dimension of listh and sparse matrices
100  integer, intent(in) :: numh(1:nbasis)     ! num. of nonzero elements of each row of sparse matrices
101  integer, intent(in) :: listhptr(1:nbasis) ! pointer to start of row in listh
102  integer, intent(in) :: listh(1:nhmax)     ! list of nonzero elements of each row of sparse matrices
103
104  real(dp), intent(in) :: qs(1:2)                             ! num. of electrons per spin
105  real(dp), intent(in) :: eta(1:2)                            ! chemical potential for Kim functional
106  real(dp), intent(in), optional :: h_sparse(1:nhmax,1:nspin) ! hamiltonian matrix (sparse)
107  real(dp), intent(in), optional :: t_sparse(1:nhmax)         ! kinetic energy matrix (sparse)
108  real(dp), intent(in), optional :: s_sparse(1:nhmax)         ! overlap matrix (sparse)
109
110  !**** OUTPUT **********************************!
111
112  real(dp), intent(out) :: d_sparse(1:nhmax,1:nspin) ! (energy-)density matrix (sparse)
113
114  !**** LOCAL ***********************************!
115
116  logical :: UpdatePrecon     ! update the preconditioner?
117  logical :: UsePrecon        ! use the preconditioner?
118  logical :: UpdateSparseComm ! update nhmax_max?
119
120  integer :: ispin
121  integer :: io
122  integer :: jo
123  integer :: j
124  integer :: k
125  integer :: ind
126  integer, save :: N_occ(1:2)           ! num. of WFs (global) (up/down spin)
127  integer, save :: precon_default       ! default number of SCF steps for which to use preconditioning
128  integer, save :: precon_first_step    ! number of SCF steps for which to use preconditioning (first MD step)
129  integer, save :: last_precon_update=0 ! istp value of last preconditioner update
130  integer, save :: last_call(1:2)=0     ! iscf and istp value of last module call
131#ifdef MPI
132  integer :: BlockSize_c_default
133#endif
134
135  real(dp), allocatable :: h_dense1D(:,:) ! hamiltonian matrix (dense 1D)
136  real(dp), allocatable :: t_dense1D(:,:) ! kinetic energy matrix (dense 1D)
137  real(dp), allocatable :: s_dense1D(:,:) ! overlap matrix (dense 1D)
138  real(dp), allocatable :: d_dense1D(:,:) ! (energy-)density matrix (dense 1D)
139
140  !**********************************************!
141
142  call timer('dmin',1)
143
144  if (all(FirstCall(1:2))) then
145
146    LongOut=fdf_boolean('OMM.LongOutput',.false.)
147
148    UseCholesky=fdf_boolean('OMM.UseCholesky',.true.)
149    UseSparse=fdf_boolean('OMM.UseSparse',.false.)
150    if (UseSparse .and. UseCholesky) call die('ERROR: sparse algebra not compatible with Cholesky factorization!')
151
152    WriteCoeffs=fdf_boolean('OMM.WriteCoeffs',.false.)
153    ReadCoeffs=fdf_boolean('OMM.ReadCoeffs',.false.)
154
155    if (.not. UseCholesky) then
156      precon_default=fdf_integer('OMM.Precon',-1)
157      precon_first_step=fdf_integer('OMM.PreconFirstStep',precon_default)
158    end if
159
160    if (nspin==1) then
161      N_occ(1)=nint(0.5_dp*qs(1))
162      if (abs(N_occ(1)-0.5_dp*qs(1))>1.0d-10) call die('ERROR: only integer number of electrons per spin allowed!')
163    else
164      if (.not. fixspin) call die('ERROR: OMM for spin unpolarized calculations only supports fixed spin!')
165      do ispin=1,nspin
166        N_occ(ispin)=nint(qs(ispin))
167        if (abs(N_occ(ispin)-qs(ispin))>1.0d-10) call die('ERROR: only integer number of electrons per spin allowed!')
168      end do
169      N_occ_diff=N_occ(1)-N_occ(2)
170    end if
171
172    h_dim_loc_1D=nbasis
173
174#ifdef MPI
175    Use2D=fdf_boolean('OMM.Use2D',.true.)
176    if (UseSparse .and. Use2D) call die('ERROR: sparse algebra not compatible with 2D data decomposition!')
177
178    ! calculate the ScaLAPACK blocking factor for distributing the WF coeffs. matrix
179    call set_blocksizedefault(Nodes,N_occ(1),BlockSize_c_default)
180    BlockSize_c=fdf_integer('OMM.BlockSize',BlockSize_c_default)
181#endif
182
183  end if
184
185  if (.not. UseSparse) allocate(d_dense1D(1:h_dim,1:h_dim_loc_1D))
186
187  if (CalcE) then
188
189    UsePrecon=.false.
190    UpdatePrecon=.false.
191    UpdateSparseComm=.false.
192
193  else
194
195    if (.not. UseSparse) allocate(h_dense1D(1:h_dim,1:h_dim_loc_1D))
196
197    ! decide whether preconditioning should be used in this minimization call
198    if (UseCholesky) then
199      UsePrecon=.false.
200      UpdatePrecon=.false.
201    else
202      if (istp==1) then
203        if ((iscf<=precon_first_step) .or. (precon_first_step<0)) then
204          UsePrecon=.true.
205        else
206          UsePrecon=.false.
207        end if
208      else
209        if ((iscf<=precon_default) .or. (precon_default<0)) then
210          UsePrecon=.true.
211        else
212          UsePrecon=.false.
213        end if
214      end if
215    end if
216
217    ! if this is the first time the module is called for this MD step, convert the new overlap
218    ! matrix from sparse to dense
219    if (UseSparse) then
220      if (istp/=last_call(2)) then
221        UpdateSparseComm=.true.
222      else
223        UpdateSparseComm=.false.
224      end if
225      if (UsePrecon .and. (istp/=last_precon_update)) then
226        allocate(s_dense1D(1:h_dim_loc_1D,1:h_dim))
227        s_dense1D=0.0_dp
228        do io=1,h_dim_loc_1D
229          do j=1,numh(io)
230            ind=listhptr(io)+j
231            jo=listh(ind)
232            s_dense1D(io,jo)=s_dense1D(io,jo)+s_sparse(ind)
233          end do
234        end do
235      end if
236    else
237      if (istp/=last_call(2)) then
238        allocate(s_dense1D(1:h_dim,1:h_dim_loc_1D))
239        s_dense1D=0.0_dp
240        do io=1,h_dim_loc_1D
241          do j=1,numh(io)
242            ind=listhptr(io)+j
243            jo=listh(ind)
244            s_dense1D(jo,io)=s_dense1D(jo,io)+s_sparse(ind)
245          end do
246        end do
247      end if
248    end if
249
250    ! if this is the first time we are using preconditioning for this MD step, convert also the new
251    ! kinetic energy matrix from sparse to dense
252    if (UsePrecon) then
253      if (istp/=last_precon_update) then
254        if (UseSparse) then
255          allocate(t_dense1D(1:h_dim_loc_1D,1:h_dim))
256          t_dense1D=0.0_dp
257          do io=1,h_dim_loc_1D
258            do j=1,numh(io)
259              ind=listhptr(io)+j
260              jo=listh(ind)
261              t_dense1D(io,jo)=t_dense1D(io,jo)+t_sparse(ind)
262            end do
263          end do
264        else
265          allocate(t_dense1D(1:h_dim,1:h_dim_loc_1D))
266          t_dense1D=0.0_dp
267          do io=1,h_dim_loc_1D
268            do j=1,numh(io)
269              ind=listhptr(io)+j
270              jo=listh(ind)
271              t_dense1D(jo,io)=t_dense1D(jo,io)+t_sparse(ind)
272            end do
273          end do
274        end if
275        UpdatePrecon=.true.
276        last_precon_update=istp
277      else
278        UpdatePrecon=.false.
279      end if
280    else
281      UpdatePrecon=.false.
282    end if
283
284  end if
285
286  ! The psi-array *must* be allocated even if it is not accessed when
287  ! PreviousCallDiagon is .false. or else compilers might flag as an
288  ! error the unallocated 'psi' dummy argument in some of the routines
289  ! below.
290  !
291  ! This call will keep the full 'psi' when needed (i.e., when
292  ! PreviousCallDiagon is .true. and 'psi' holds the eigenvectors
293  ! from a previous call to diagon that did not deallocate 'psi')
294  ! and allocate a minimal version when 'psi' has been deallocated
295  ! In order to achieve the former, the "shrink=.false." option in
296  ! allocDenseMatrix is essential.
297  call allocDenseMatrix(1, 1, 1)
298
299  do ispin=1,nspin
300
301    if (.not. calcE) then
302      ! convert the hamiltonian matrix from sparse to dense (and shift the eingevalue spectrum
303      ! w.r.t. the chemical potential reference)
304      if (.not. UseSparse) then
305        h_dense1D=0.0_dp
306        do io=1,h_dim_loc_1D
307          do j=1,numh(io)
308            ind=listhptr(io)+j
309            jo=listh(ind)
310            h_dense1D(jo,io)=h_dense1D(jo,io)+h_sparse(ind,ispin)-eta(ispin)*s_sparse(ind)
311          end do
312        end do
313      end if
314    end if
315
316    ! call the routine to perform the energy minimization
317    if (UseSparse) then
318      if (CalcE) then
319        call minim_cg_sparse(nhmax,numh,listhptr,listh,CalcE,PreviousCallDiagon,iscf,h_dim,N_occ(ispin),eta(ispin),&
320                             psi,nspin,ispin,UpdatePrecon,UsePrecon,UpdateSparseComm,d_sparse(1:nhmax,ispin))
321      else
322        if (allocated(s_dense1D)) then
323          if (allocated(t_dense1D)) then
324            call minim_cg_sparse(nhmax,numh,listhptr,listh,CalcE,PreviousCallDiagon,iscf,h_dim,N_occ(ispin),eta(ispin),&
325                                 psi,nspin,ispin,UpdatePrecon,UsePrecon,UpdateSparseComm,d_sparse(1:nhmax,ispin),&
326                                 h_sparse(1:nhmax,ispin)-eta(ispin)*s_sparse(1:nhmax),s_sparse,s_dense1D,t_dense1D)
327          else
328            call minim_cg_sparse(nhmax,numh,listhptr,listh,CalcE,PreviousCallDiagon,iscf,h_dim,N_occ(ispin),eta(ispin),&
329                                 psi,nspin,ispin,UpdatePrecon,UsePrecon,UpdateSparseComm,d_sparse(1:nhmax,ispin),&
330                                 h_sparse(1:nhmax,ispin)-eta(ispin)*s_sparse(1:nhmax),s_sparse,s_dense1D)
331          end if
332        else
333          if (allocated(t_dense1D)) then
334            call minim_cg_sparse(nhmax,numh,listhptr,listh,CalcE,PreviousCallDiagon,iscf,h_dim,N_occ(ispin),eta(ispin),&
335                                 psi,nspin,ispin,UpdatePrecon,UsePrecon,UpdateSparseComm,d_sparse(1:nhmax,ispin),&
336                                 h_sparse(1:nhmax,ispin)-eta(ispin)*s_sparse(1:nhmax),s_sparse,t_dense1D=t_dense1D)
337          else
338            call minim_cg_sparse(nhmax,numh,listhptr,listh,CalcE,PreviousCallDiagon,iscf,h_dim,N_occ(ispin),eta(ispin),&
339                                 psi,nspin,ispin,UpdatePrecon,UsePrecon,UpdateSparseComm,d_sparse(1:nhmax,ispin),&
340                                 h_sparse(1:nhmax,ispin)-eta(ispin)*s_sparse(1:nhmax),s_sparse)
341          end if
342        end if
343      end if
344    else
345      if (CalcE) then
346        call minim_cg(CalcE,PreviousCallDiagon,iscf,h_dim,N_occ(ispin),eta(ispin),psi,nspin,ispin,UpdatePrecon,UsePrecon,d_dense1D)
347      else
348        if (allocated(s_dense1D)) then
349          if (allocated(t_dense1D)) then
350            call minim_cg(CalcE,PreviousCallDiagon,iscf,h_dim,N_occ(ispin),eta(ispin),psi,nspin,ispin,UpdatePrecon,UsePrecon,&
351                          d_dense1D,h_dense1D,s_dense1D,t_dense1D)
352          else
353            call minim_cg(CalcE,PreviousCallDiagon,iscf,h_dim,N_occ(ispin),eta(ispin),psi,nspin,ispin,UpdatePrecon,UsePrecon,&
354                          d_dense1D,h_dense1D,s_dense1D)
355          end if
356        else
357          if (allocated(t_dense1D)) then
358            call minim_cg(CalcE,PreviousCallDiagon,iscf,h_dim,N_occ(ispin),eta(ispin),psi,nspin,ispin,UpdatePrecon,UsePrecon,&
359                          d_dense1D,h_dense1D,t_dense1D=t_dense1D)
360          else
361            call minim_cg(CalcE,PreviousCallDiagon,iscf,h_dim,N_occ(ispin),eta(ispin),psi,nspin,ispin,UpdatePrecon,UsePrecon,&
362                          d_dense1D,h_dense1D)
363          end if
364        end if
365      end if
366    end if
367
368    ! convert the (energy-)density matrix from dense to sparse
369    if (.not. UseSparse) then
370      do io=1,h_dim_loc_1D
371        do j=1,numh(io)
372          ind=listhptr(io)+j
373          jo=listh(ind)
374          d_sparse(ind,ispin)=d_dense1D(jo,io)
375        end do
376      end do
377    end if
378
379  end do
380
381  if (.not. CalcE) then
382    if (allocated(t_dense1D)) deallocate(t_dense1D)
383    if (allocated(s_dense1D)) deallocate(s_dense1D)
384    if (.not. UseSparse) deallocate(h_dense1D)
385  end if
386  if (.not. UseSparse) deallocate(d_dense1D)
387
388  last_call(1:2)=(/iscf,istp/)
389
390  ! Clean-up dense matrices holding seed eigenvectors
391  call resetDenseMatrix()
392
393  call timer('dmin',2)
394
395  end subroutine dminim
396
397!================================================!
398! minimize the Kim functional by conjugate       !
399! gradients (dense routine)                      !
400!================================================!
401subroutine minim_cg(CalcE,PreviousCallDiagon,iscf,h_dim,N_occ,eta,psi,nspin,ispin,UpdatePrecon,UsePrecon,d_dense1D,h_dense1D,&
402                    s_dense1D,t_dense1D)
403  implicit none
404
405  !**** INPUT ***********************************!
406
407  logical, intent(in) :: CalcE              ! calculate the energy-density matrix from the existing coeffs.?
408  logical, intent(in) :: PreviousCallDiagon ! Previous SCF iteration solved by diagonalization?
409  logical, intent(in) :: UpdatePrecon       ! update the preconditioner?
410  logical, intent(in) :: UsePrecon          ! use the preconditioner?
411
412  integer, intent(in) :: iscf  ! SCF iteration num.
413  integer, intent(in) :: h_dim ! num. of AOs (global)
414  integer, intent(in) :: N_occ ! num. of WFs (global)
415  integer, intent(in) :: nspin ! num. of spins
416  integer, intent(in) :: ispin ! up/down spin
417
418  real(dp), intent(in) :: eta                                 ! chemical potential for Kim functional
419  real(dp), intent(in) :: psi(1:h_dim,1:h_dim_loc_1D,1:nspin) ! eigenvectors from diagonalization
420  real(dp), intent(in), optional :: h_dense1D(:,:)            ! hamiltonian matrix in AO basis (dense 1D)
421  real(dp), intent(in), optional :: s_dense1D(:,:)            ! overlap matrix in AO basis (dense 1D)
422  real(dp), intent(in), optional :: t_dense1D(:,:)            ! kinetic energy matrix in AO basis (dense 1D)
423
424  !**** OUTPUT **********************************!
425
426  real(dp), intent(out) :: d_dense1D(:,:) ! (energy-)density matrix in AO basis (dense 1D)
427
428  !**** LOCAL ***********************************!
429
430  character(len=100) :: WF_COEFFS_filename
431#ifdef MPI
432  character(len=5) :: Node_name
433#endif
434
435  logical :: new_s
436  logical :: conv
437  logical :: ls_conv
438  logical :: ls_fail
439  logical :: ReadCoeffs2
440
441  integer :: i
442  integer :: j
443  integer :: k
444  integer :: l
445  integer :: m
446  integer :: n
447  integer :: info
448  integer :: icg                           ! CG step num.
449  integer :: n_step_max=100                ! max. num. steps for CG minimization
450  integer :: lwork
451  integer, allocatable :: ipiv(:)
452#ifdef MPI
453  integer :: liwork
454  integer :: mpi_status(1:mpi_status_size) ! MPI status
455  integer, save :: ictxt                   ! handle for main BLACS context (1D or 2D)
456  integer, save :: ictxt_1D                ! handle for additional BLACS context (1D)
457  integer, save :: ictxt_1D_T              ! handle for additional BLACS context (1D transposed)
458  integer, save :: desc1_1D(1:9)           ! descriptor for operator matrix in AO basis (1D)
459  integer, save :: desc3_1D_T(1:9,1:2)     ! descriptor for WF coeffs. matrix (1D transposed)
460  integer, allocatable :: iwork(:)
461  integer, external :: numroc
462#else
463  integer, external :: ilaenv
464#endif
465
466  real(dp) :: rn
467  real(dp) :: rn2
468  real(dp) :: E_diff
469  real(dp) :: TrQS
470  real(dp) :: lambda
471  real(dp) :: lambda_n
472  real(dp) :: lambda_d
473  real(dp) :: lambda_n_tot
474  real(dp) :: lambda_d_tot
475  real(dp) :: E_OMM                           ! OMM functional energy
476  real(dp) :: E_OMM_old                       ! OMM functional energy at previous step
477  real(dp) :: coeff(0:4)                      ! coeffs. of the quartic equation
478  real(dp), save :: x_min(1:2)                ! position of minimum
479  real(dp), save :: t_precon_scale            ! kinetic energy scale for the preconditioning
480  real(dp), save :: cg_tol                    ! convergence tolerance of CG minimization
481  real(dp), allocatable :: work1(:,:)         ! work matrix
482  real(dp), allocatable :: Sd(:,:)            ! g^T*s*g
483  real(dp), allocatable :: Sdd(:,:)           ! g^T*s*g
484  real(dp), allocatable :: g(:,:)             ! gradient
485  real(dp), allocatable :: g_p(:,:)           ! gradient at previous step
486  real(dp), allocatable :: pg(:,:)            ! preconditioned gradient
487  real(dp), allocatable :: pg_p(:,:)          ! preconditioned gradient at previous step
488  real(dp), allocatable :: d(:,:)             ! conjugate search direction
489  real(dp), allocatable :: hc(:,:)            ! h*c
490  real(dp), allocatable :: hg(:,:)            ! h*g
491  real(dp), allocatable :: h_dense(:,:)       ! hamiltonian matrix in AO basis (dense 2D or Cholesky transformed)
492  real(dp), allocatable, save :: s_dense(:,:) ! overlap matrix in AO basis (dense 1D or 2D)
493  real(dp), allocatable, save :: p_dense(:,:) ! preconditioning matrix in AO basis (dense 1D or 2D)
494  real(dp), external :: ddot
495#ifdef MPI
496  real(dp) :: Tr_loc
497  real(dp), save :: scale_Cholesky            ! Cholesky factorization eigenvalue scaling factor
498  real(dp), allocatable :: d_dense2D(:,:)     ! (energy-)density matrix in AO basis (dense 2D)
499  real(dp), allocatable :: work2(:,:)         ! work matrix
500  real(dp), allocatable :: work3(:,:)         ! work matrix
501#endif
502
503  type(multispin), save :: H(1:2)    ! hamiltonian matrix in WF basis
504  type(multispin), save :: S(1:2)    ! overlap matrix in WF basis
505  type(multispin), save :: Hd(1:2)   ! g^T*h*c
506  type(multispin), save :: Hdd(1:2)  ! g^T*h*g
507  type(multispin), save :: sc(1:2)   ! s*c
508  type(multispin), save :: sg(1:2)   ! s*g
509  type(multispin), save :: twoI(1:2) ! identity matrix x2
510  type(multispin), save :: cd(1:2)   ! work matrix
511
512  !**********************************************!
513
514  call timer('m_cg',1)
515
516  ! if this is the first time the minimization module is called, several things need to be done
517  ! (detailed below)
518  if (FirstCall(ispin)) then
519
520#ifdef MPI
521    if (all(FirstCall(1:2))) then
522
523      ! initialize the BLACS process grids
524      if (Use2D) then
525        call blacs_get(-1,0,ictxt_1D)
526        call blacs_gridinit(ictxt_1D,'C',1,Nodes)
527        call blacs_get(ictxt_1D,10,ictxt)
528        call blacs_gridinit(ictxt,'C',processorY,Nodes/processorY)
529        call blacs_get(ictxt,10,ictxt_1D_T)
530        call blacs_gridinit(ictxt_1D_T,'C',Nodes,1)
531      else
532        call blacs_get(-1,0,ictxt)
533        call blacs_gridinit(ictxt,'C',1,Nodes)
534        call blacs_get(ictxt,10,ictxt_1D_T)
535        call blacs_gridinit(ictxt_1D_T,'C',Nodes,1)
536      end if
537
538    end if
539
540    ! calculate the local dimensions of the AO and WF matrices
541    call blacs_gridinfo(ictxt_1D_T,i,j,k,l)
542    N_occ_loc_1D(ispin)=numroc(N_occ,BlockSize_c,k,0,Nodes)
543    if (Use2D) then
544      call blacs_gridinfo(ictxt,i,j,k,l)
545      h_dim_loc(1)=numroc(h_dim,BlockSize,k,0,processorY)
546      h_dim_loc(2)=numroc(h_dim,BlockSize,l,0,Nodes/processorY)
547      N_occ_loc(1,ispin)=numroc(N_occ,BlockSize_c,k,0,processorY)
548      N_occ_loc(2,ispin)=numroc(N_occ,BlockSize_c,l,0,Nodes/processorY)
549    else
550      h_dim_loc(1)=h_dim
551      h_dim_loc(2)=h_dim_loc_1D
552      N_occ_loc(1,ispin)=N_occ
553      N_occ_loc(2,ispin)=N_occ_loc_1D(ispin)
554    end if
555
556    ! initialize the matrix descriptors
557    if (Use2D) then
558      call descinit(desc1_1D,h_dim,h_dim,BlockSize,BlockSize,0,0,ictxt_1D,h_dim,info)
559      if (info/=0) call die('ERROR: desc1_1D setup has failed in minim!')
560    end if
561    call descinit(desc1,h_dim,h_dim,BlockSize,BlockSize,0,0,ictxt,h_dim_loc(1),info)
562    if (info/=0) call die('ERROR: desc1 setup has failed in minim!')
563    call descinit(desc2(1:9,ispin),N_occ,N_occ,BlockSize_c,BlockSize_c,0,0,ictxt,N_occ_loc(1,ispin),info)
564    if (info/=0) call die('ERROR: desc2 setup has failed in minim!')
565    call descinit(desc3(1:9,ispin),N_occ,h_dim,BlockSize_c,BlockSize,0,0,ictxt,N_occ_loc(1,ispin),info)
566    if (info/=0) call die('ERROR: desc3 setup has failed in minim!')
567    call descinit(desc3_1D_T(1:9,ispin),N_occ,h_dim,BlockSize_c,BlockSize,0,0,ictxt_1D_T,N_occ_loc_1D(ispin),info)
568    if (info/=0) call die('ERROR: desc3_1D_T setup has failed in minim!')
569#else
570    N_occ_loc_1D(ispin)=N_occ
571    h_dim_loc(1)=h_dim
572    h_dim_loc(2)=h_dim
573    N_occ_loc(1,ispin)=N_occ
574    N_occ_loc(2,ispin)=N_occ
575#endif
576
577    ! allocate the WF coeffs. matrix
578    allocate(c(ispin)%mtrx(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
579    if (UseCholesky) allocate(c_orig(ispin)%mtrx(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
580
581    ! if this is the first SCF step, then we need to initialize the WF coeffs. matrix with random
582    ! numbers between -0.5 and 0.5 (normalize at the end to avoid instabilities), unless we are
583    ! reading them from file
584    if (.not. PreviousCallDiagon) then
585      if (ReadCoeffs) then
586#ifdef MPI
587        write(Node_name,'(i5)') Node
588        if (nspin==1) then
589          WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS.'//trim(adjustl(Node_name))
590        else
591          if (ispin==1) then
592            WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS_UP.'//trim(adjustl(Node_name))
593          else if (ispin==2) then
594            WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS_DOWN.'//trim(adjustl(Node_name))
595          end if
596        end if
597#else
598        if (nspin==1) then
599          WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS'
600        else
601          if (ispin==1) then
602            WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS_UP'
603          else if (ispin==2) then
604            WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS_DOWN'
605          end if
606        end if
607#endif
608        inquire(file=trim(WF_COEFFS_filename),exist=ReadCoeffs2)
609      else
610        ReadCoeffs2=.false.
611      end if
612      if (ReadCoeffs2) then
613        call io_assign(i)
614        open(i,file=trim(WF_COEFFS_filename),form='unformatted',status='old',action='read')
615        if (UseCholesky) then
616          read(i) c_orig(ispin)%mtrx
617        else
618          read(i) c(ispin)%mtrx
619        end if
620        call io_close(i)
621      else
622        if ((ispin==1) .or. (N_occ_diff/=0)) then
623          call rand_init
624          do i=1,h_dim_loc(2)
625            do j=1,N_occ_loc(1,ispin)
626              call random_number(rn)
627              call random_number(rn2)
628              c(ispin)%mtrx(j,i)=sign(0.5_dp*rn,rn2-0.5_dp)
629            end do
630          end do
631          c(ispin)%mtrx=1.0d-2*c(ispin)%mtrx/sqrt(real(h_dim,dp))
632        else
633          c(2)%mtrx=c(1)%mtrx
634        end if
635      end if
636    end if
637
638    if (all(FirstCall(1:2))) then
639      t_precon_scale=fdf_physical('OMM.TPreconScale',10.0_dp,'Ry')
640      cg_tol=fdf_get('OMM.RelTol',1.0d-9)
641    end if
642
643  end if
644
645  ! if the previous SCF iteration was solved by diagonalization, then we take the lowest N_occ
646  ! eigenfunctions as our initial guess, but we must take care to distribute them properly amongst
647  ! the MPI processes for parallel runs
648  if (PreviousCallDiagon) then
649#ifdef MPI
650    allocate(work1(1:N_occ_loc_1D(ispin),1:h_dim))
651    allocate(work2(1:h_dim,1))
652    ! i: receiving node
653    ! j: local orbital num. on receiving node
654    ! k: global orbital num.
655    ! l: sending node
656    ! m: local orbital num. on sending node
657    k=0
658    do i=0,Nodes-1
659      n=N_occ_loc_1D(ispin)
660      call mpi_bcast(n,1,mpi_integer,i,mpi_comm_world,info)
661      do j=1,n
662        k=k+1
663        call WhichNodeOrb(k,Nodes,l)
664        call GlobalToLocalOrb(k,l,Nodes,m)
665        if (Node==l) then
666          if (Node==i) then
667            work1(j,1:h_dim)=psi(1:h_dim,m,ispin)
668          else
669            call mpi_send(psi(1,m,ispin),h_dim,mpi_double_precision,i,k,mpi_comm_world,info)
670          end if
671        else if (Node==i) then
672          call mpi_recv(work2(1,1),h_dim,mpi_double_precision,l,k,mpi_comm_world,mpi_status,info)
673          work1(j,1:h_dim)=work2(1:h_dim,1)
674        end if
675      end do
676    end do
677    deallocate(work2)
678    call pdgemr2d(N_occ,h_dim,work1,1,1,desc3_1D_T(1:9,ispin),c(ispin)%mtrx,1,1,desc3(1:9,ispin),ictxt)
679    deallocate(work1)
680#else
681    do i=1,N_occ
682      c(ispin)%mtrx(i,1:h_dim)=psi(1:h_dim,i,ispin)
683    end do
684#endif
685  end if
686
687#ifdef MPI
688  if (Use2D) allocate(d_dense2D(1:h_dim_loc(1),1:h_dim_loc(2)))
689#endif
690
691  if (CalcE) then
692
693    ! calculate the energy-density matrix: e=c^T*[(2*I-S)*(H+eta*S)]*c
694#ifdef MPI
695    call pdgeadd('T',N_occ,N_occ,x_min(ispin),Hd(ispin)%mtrx,1,1,desc2(1:9,ispin),1.0_dp,H(ispin)%mtrx,1,1,desc2(1:9,ispin))
696#else
697    do k=1,N_occ
698      do l=1,N_occ
699        H(ispin)%mtrx(k,l)=H(ispin)%mtrx(k,l)+x_min(ispin)*Hd(ispin)%mtrx(l,k)
700      end do
701    end do
702#endif
703    H(ispin)%mtrx=H(ispin)%mtrx+x_min(ispin)*Hd(ispin)%mtrx+x_min(ispin)**2*Hdd(ispin)%mtrx
704    allocate(work1(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
705    if (UseCholesky) then
706#ifdef MPI
707      if (Use2D) then
708        call calc_densmat(h_dim,N_occ,ispin,H(ispin)%mtrx+eta*S(ispin)%mtrx,c_orig(ispin)%mtrx,d_dense2D,work1,cd(ispin)%mtrx)
709      else
710        call calc_densmat(h_dim,N_occ,ispin,H(ispin)%mtrx+eta*S(ispin)%mtrx,c_orig(ispin)%mtrx,d_dense1D,work1,cd(ispin)%mtrx)
711      end if
712#else
713      call calc_densmat(h_dim,N_occ,ispin,H(ispin)%mtrx+eta*S(ispin)%mtrx,c_orig(ispin)%mtrx,d_dense1D,work1,cd(ispin)%mtrx)
714#endif
715    else
716#ifdef MPI
717      if (Use2D) then
718        call calc_densmat(h_dim,N_occ,ispin,H(ispin)%mtrx+eta*S(ispin)%mtrx,c(ispin)%mtrx,d_dense2D,work1,cd(ispin)%mtrx)
719      else
720        call calc_densmat(h_dim,N_occ,ispin,H(ispin)%mtrx+eta*S(ispin)%mtrx,c(ispin)%mtrx,d_dense1D,work1,cd(ispin)%mtrx)
721      end if
722#else
723      call calc_densmat(h_dim,N_occ,ispin,H(ispin)%mtrx+eta*S(ispin)%mtrx,c(ispin)%mtrx,d_dense1D,work1,cd(ispin)%mtrx)
724#endif
725    end if
726#ifdef MPI
727    if (Use2D) call pdgemr2d(h_dim,h_dim,d_dense2D,1,1,desc1,d_dense1D,1,1,desc1_1D,ictxt)
728#endif
729    if (nspin==1) d_dense1D=2.0_dp*d_dense1D
730
731    if (WriteCoeffs) then
732      call io_assign(i)
733#ifdef MPI
734      write(Node_name,'(i5)') Node
735      if (nspin==1) then
736        WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS.'//trim(adjustl(Node_name))
737      else
738        if (ispin==1) then
739          WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS_UP.'//trim(adjustl(Node_name))
740        else if (ispin==2) then
741          WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS_DOWN.'//trim(adjustl(Node_name))
742        end if
743      end if
744#else
745      if (nspin==1) then
746        WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS'
747      else
748        if (ispin==1) then
749          WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS_UP'
750        else if (ispin==2) then
751          WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS_DOWN'
752        end if
753      end if
754#endif
755      open(i,file=trim(WF_COEFFS_filename),form='unformatted',status='replace',action='write')
756      if (UseCholesky) then
757        write(i) c_orig(ispin)%mtrx
758      else
759        write(i) c(ispin)%mtrx
760      end if
761      call io_close(i)
762    end if
763
764    deallocate(work1)
765    deallocate(cd(ispin)%mtrx)
766    if (allocated(p_dense)) deallocate(p_dense)
767    if (.not. UseCholesky) then
768      deallocate(sg(ispin)%mtrx)
769      deallocate(sc(ispin)%mtrx)
770    end if
771    if (allocated(s_dense)) deallocate(s_dense)
772    deallocate(S(ispin)%mtrx)
773    deallocate(Hdd(ispin)%mtrx)
774    deallocate(Hd(ispin)%mtrx)
775    deallocate(H(ispin)%mtrx)
776#ifdef MPI
777    if (Use2D) deallocate(d_dense2D)
778#endif
779
780    call timer('m_cg',2)
781
782    return
783
784  end if
785
786  if (.not. allocated(H(ispin)%mtrx)) allocate(H(ispin)%mtrx(1:N_occ_loc(1,ispin),1:N_occ_loc(2,ispin)))
787  if (.not. allocated(Hd(ispin)%mtrx)) allocate(Hd(ispin)%mtrx(1:N_occ_loc(1,ispin),1:N_occ_loc(2,ispin)))
788  if (.not. allocated(Hdd(ispin)%mtrx)) allocate(Hdd(ispin)%mtrx(1:N_occ_loc(1,ispin),1:N_occ_loc(2,ispin)))
789  if (.not. allocated(S(ispin)%mtrx)) then
790    allocate(S(ispin)%mtrx(1:N_occ_loc(1,ispin),1:N_occ_loc(2,ispin)))
791    if (.not. allocated(s_dense)) then
792      allocate(s_dense(1:h_dim_loc(1),1:h_dim_loc(2)))
793#ifdef MPI
794      if (Use2D) then
795        call pdgemr2d(h_dim,h_dim,s_dense1D,1,1,desc1_1D,s_dense,1,1,desc1,ictxt)
796      else
797        s_dense=s_dense1D
798      end if
799#else
800      s_dense=s_dense1D
801#endif
802      if (UseCholesky) then
803#ifdef MPI
804        call pdpotrf('U',h_dim,s_dense,1,1,desc1,info)
805        if (info/=0) call die('ERROR: pdpotrf has failed in minim!')
806#else
807        call dpotrf('U',h_dim,s_dense,h_dim,info)
808        if (info/=0) call die('ERROR: dpotrf has failed in minim!')
809#endif
810      end if
811    end if
812    new_s=.true.
813  else
814    new_s=.false.
815  end if
816  if (.not. UseCholesky) then
817    if (.not. allocated(sc(ispin)%mtrx)) allocate(sc(ispin)%mtrx(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
818    if (.not. allocated(sg(ispin)%mtrx)) allocate(sg(ispin)%mtrx(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
819  end if
820
821  if (UseCholesky) then
822    if ((new_s .and. (.not. FirstCall(ispin))) .or. &
823        PreviousCallDiagon .or. &
824        (ReadCoeffs .and. FirstCall(ispin))) then
825      if (.not. PreviousCallDiagon) c(ispin)%mtrx=c_orig(ispin)%mtrx
826#ifdef MPI
827      call pdtrmm('R','U','T','N',N_occ,h_dim,1.0_dp,s_dense,1,1,desc1,c(ispin)%mtrx,1,1,desc3(1:9,ispin))
828#else
829      call dtrmm('R','U','T','N',N_occ,h_dim,1.0_dp,s_dense,h_dim,c(ispin)%mtrx,N_occ)
830#endif
831    end if
832  end if
833
834#ifdef MPI
835  if (UseCholesky .or. Use2D) then
836    allocate(h_dense(1:h_dim_loc(1),1:h_dim_loc(2)))
837    if (Use2D) then
838      call pdgemr2d(h_dim,h_dim,h_dense1D,1,1,desc1_1D,h_dense,1,1,desc1,ictxt)
839    else
840      h_dense=h_dense1D
841    end if
842  end if
843#else
844  if (UseCholesky) then
845    allocate(h_dense(1:h_dim_loc(1),1:h_dim_loc(2)))
846    h_dense=h_dense1D
847  end if
848#endif
849  if (UseCholesky) then
850#ifdef MPI
851    call pdsygst(1,'U',h_dim,h_dense,1,1,desc1,s_dense,1,1,desc1,scale_Cholesky,info)
852    if (info/=0) call die('ERROR: pdsygst has failed in minim!')
853    allocate(work1(1:h_dim_loc(1),1:h_dim_loc(2)))
854    allocate(work2(1:h_dim_loc(1),1:h_dim_loc(2)))
855    allocate(work3(1:h_dim_loc(1),1:h_dim_loc(2)))
856    call pdtran(h_dim,h_dim,1.0_dp,h_dense,1,1,desc1,0.0_dp,work1,1,1,desc1)
857    work2=0.0_dp
858    call pdlaset('U',h_dim,h_dim,1.0_dp,0.5_dp,work2,1,1,desc1)
859    work3=0.0_dp
860    call pdlaset('L',h_dim,h_dim,1.0_dp,0.5_dp,work3,1,1,desc1)
861    h_dense=work2*h_dense+work3*work1
862    deallocate(work3)
863    deallocate(work2)
864    deallocate(work1)
865#else
866    call dsygst(1,'U',h_dim,h_dense,h_dim,s_dense,h_dim,info)
867    if (info/=0) call die('ERROR: dsygst has failed in minim!')
868    do i=1,h_dim-1
869      do j=i+1,h_dim
870        h_dense(j,i)=h_dense(i,j)
871      end do
872    end do
873#endif
874  end if
875
876  ! calculate the preconditioning matrix (s+t/tau)^-1
877  if (UpdatePrecon .and. (ispin==1)) then
878    if (.not. allocated(p_dense)) allocate(p_dense(1:h_dim_loc(1),1:h_dim_loc(2)))
879#ifdef MPI
880    if (Use2D) then
881      call pdgemr2d(h_dim,h_dim,t_dense1D,1,1,desc1_1D,p_dense,1,1,desc1,ictxt)
882    else
883      p_dense=t_dense1D
884    end if
885#else
886    p_dense=t_dense1D
887#endif
888    p_dense=s_dense+p_dense/t_precon_scale
889#ifdef MPI
890    allocate(ipiv(1:h_dim_loc(1)+BlockSize))
891    call pdgetrf(h_dim,h_dim,p_dense,1,1,desc1,ipiv,info)
892    if (info/=0) call die('ERROR: pdgetrf has failed in minim!')
893    allocate(work1(1:1,1:1))
894    allocate(iwork(1:1))
895    call pdgetri(h_dim,p_dense,1,1,desc1,ipiv,work1,-1,iwork,-1,info)
896    if (info/=0) call die('ERROR: pdgetri has failed in minim!')
897    liwork=iwork(1)
898    deallocate(iwork)
899    lwork=work1(1,1)
900    deallocate(work1)
901    allocate(work1(1:lwork,1:1))
902    allocate(iwork(1:liwork))
903    call pdgetri(h_dim,p_dense,1,1,desc1,ipiv,work1,lwork,iwork,liwork,info)
904    if (info/=0) call die('ERROR: pdgetri has failed in minim!')
905    deallocate(iwork)
906    deallocate(work1)
907    deallocate(ipiv)
908#else
909    allocate(ipiv(1:h_dim))
910    lwork=h_dim*ilaenv(1,'dsytrf','U',h_dim,-1,-1,-1)
911    allocate(work1(1:lwork,1:1))
912    call dsytrf('U',h_dim,p_dense,h_dim,ipiv,work1,lwork,info)
913    if (info/=0) call die('ERROR: dsytrf has failed in minim!')
914    deallocate(work1)
915    allocate(work1(1:h_dim,1:1))
916    call dsytri('U',h_dim,p_dense,h_dim,ipiv,work1,info)
917    if (info/=0) call die('ERROR: dsytri has failed in minim!')
918    deallocate(work1)
919    deallocate(ipiv)
920    do i=1,h_dim-1
921      do j=i+1,h_dim
922        p_dense(j,i)=p_dense(i,j)
923      end do
924    end do
925#endif
926  end if
927
928  allocate(Sd(1:N_occ_loc(1,ispin),1:N_occ_loc(2,ispin)))
929  allocate(Sdd(1:N_occ_loc(1,ispin),1:N_occ_loc(2,ispin)))
930  allocate(g(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
931  allocate(g_p(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
932  if (UsePrecon) then
933    allocate(pg(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
934    allocate(pg_p(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
935  end if
936  allocate(d(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
937  allocate(hc(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
938  allocate(hg(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
939  allocate(work1(1:N_occ_loc(1,ispin),1:N_occ_loc(2,ispin)))
940
941  ! first we calculate the energy and gradient for our initial guess, with the following steps:
942  ! -calculate the hamiltonian in WF basis: H=c^T*h*c
943  if (allocated(h_dense)) then
944    call calc_A(h_dim,N_occ,ispin,h_dense,c(ispin)%mtrx,H(ispin)%mtrx,hc)
945  else
946    call calc_A(h_dim,N_occ,ispin,h_dense1D,c(ispin)%mtrx,H(ispin)%mtrx,hc)
947  end if
948  ! -calculate the overlap matrix in WF basis: S=c^T*s*c
949  if (UseCholesky) then
950#ifdef MPI
951    if (new_s .or. PreviousCallDiagon) call pdgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,c(ispin)%mtrx,1,1,desc3(1:9,ispin),&
952                                                   c(ispin)%mtrx,1,1,desc3(1:9,ispin),0.0_dp,S(ispin)%mtrx,1,1,desc2(1:9,ispin))
953#else
954    if (new_s .or. PreviousCallDiagon) call dgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,c(ispin)%mtrx,N_occ,c(ispin)%mtrx,N_occ,0.0_dp,&
955                                                  S(ispin)%mtrx,N_occ)
956#endif
957  else
958    if (new_s .or. PreviousCallDiagon) then
959      call calc_A(h_dim,N_occ,ispin,s_dense,c(ispin)%mtrx,S(ispin)%mtrx,sc(ispin)%mtrx)
960    else
961      sc(ispin)%mtrx=sc(ispin)%mtrx+x_min(ispin)*sg(ispin)%mtrx
962    end if
963  end if
964  ! -calculate the gradient: g=2*(2*h*c-s*c*H-h*c*S)
965  !  (note that we *reuse* h*c and s*c contained in hc and sc from the previous call to calc_A)
966  if (UseCholesky) then
967    call calc_grad(h_dim,N_occ,ispin,H(ispin)%mtrx,S(ispin)%mtrx,g,hc,c(ispin)%mtrx)
968  else
969    call calc_grad(h_dim,N_occ,ispin,H(ispin)%mtrx,S(ispin)%mtrx,g,hc,sc(ispin)%mtrx)
970  end if
971  ! -calculate the preconditioned gradient by premultiplying g by (s+t/tau)^-1
972  if (UsePrecon) then
973#ifdef MPI
974    call pdgemm('N','N',N_occ,h_dim,h_dim,1.0_dp,g,1,1,desc3(1:9,ispin),p_dense,1,1,desc1,0.0_dp,pg,1,1,desc3(1:9,ispin))
975#else
976    call dgemm('N','N',N_occ,h_dim,h_dim,1.0_dp,g,N_occ,p_dense,h_dim,0.0_dp,pg,N_occ)
977#endif
978  end if
979  ! -calculate the additional matrices:
980  !  Hd=g^T*h*c
981  !  Sd=g^T*s*c
982  !  Hdd=g^T*h*g
983  !  Sdd=g^T*s*g
984  !  (again, h*c has already been calculated, although h*g has not)
985  !  and, finally, the coeffs. of the quartic line search equation in the direction g
986  !  (the energy at c is given by the zeroth-order coeff. c(0))
987  if (UsePrecon) then
988#ifdef MPI
989    call pdgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,hc,1,1,desc3(1:9,ispin),pg,1,1,desc3(1:9,ispin),0.0_dp,Hd(ispin)%mtrx,1,1,&
990                desc2(1:9,ispin))
991    call pdgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,sc(ispin)%mtrx,1,1,desc3(1:9,ispin),pg,1,1,desc3(1:9,ispin),0.0_dp,Sd,1,1,&
992                desc2(1:9,ispin))
993#else
994    call dgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,hc,N_occ,pg,N_occ,0.0_dp,Hd(ispin)%mtrx,N_occ)
995    call dgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,sc(ispin)%mtrx,N_occ,pg,N_occ,0.0_dp,Sd,N_occ)
996#endif
997  else
998#ifdef MPI
999    call pdgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,hc,1,1,desc3(1:9,ispin),g,1,1,desc3(1:9,ispin),0.0_dp,Hd(ispin)%mtrx,1,1,&
1000                desc2(1:9,ispin))
1001#else
1002    call dgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,hc,N_occ,g,N_occ,0.0_dp,Hd(ispin)%mtrx,N_occ)
1003#endif
1004    if (UseCholesky) then
1005#ifdef MPI
1006      call pdgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,c(ispin)%mtrx,1,1,desc3(1:9,ispin),g,1,1,desc3(1:9,ispin),0.0_dp,Sd,1,1,&
1007                  desc2(1:9,ispin))
1008#else
1009      call dgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,c(ispin)%mtrx,N_occ,g,N_occ,0.0_dp,Sd,N_occ)
1010#endif
1011    else
1012#ifdef MPI
1013      call pdgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,sc(ispin)%mtrx,1,1,desc3(1:9,ispin),g,1,1,desc3(1:9,ispin),0.0_dp,Sd,1,1,&
1014                  desc2(1:9,ispin))
1015#else
1016      call dgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,sc(ispin)%mtrx,N_occ,g,N_occ,0.0_dp,Sd,N_occ)
1017#endif
1018    end if
1019  end if
1020  if (UsePrecon) then
1021    if (allocated(h_dense)) then
1022      call calc_A(h_dim,N_occ,ispin,h_dense,pg,Hdd(ispin)%mtrx,hg)
1023    else
1024      call calc_A(h_dim,N_occ,ispin,h_dense1D,pg,Hdd(ispin)%mtrx,hg)
1025    end if
1026    call calc_A(h_dim,N_occ,ispin,s_dense,pg,Sdd,sg(ispin)%mtrx)
1027  else
1028    if (allocated(h_dense)) then
1029      call calc_A(h_dim,N_occ,ispin,h_dense,g,Hdd(ispin)%mtrx,hg)
1030    else
1031      call calc_A(h_dim,N_occ,ispin,h_dense1D,g,Hdd(ispin)%mtrx,hg)
1032    end if
1033    if (UseCholesky) then
1034#ifdef MPI
1035      call pdgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,g,1,1,desc3(1:9,ispin),g,1,1,desc3(1:9,ispin),0.0_dp,Sdd,1,1,desc2(1:9,ispin))
1036#else
1037      call dgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,g,N_occ,g,N_occ,0.0_dp,Sdd,N_occ)
1038#endif
1039    else
1040      call calc_A(h_dim,N_occ,ispin,s_dense,g,Sdd,sg(ispin)%mtrx)
1041    end if
1042  end if
1043  call calc_coeff(h_dim,N_occ,ispin,H(ispin)%mtrx,S(ispin)%mtrx,Hd(ispin)%mtrx,Sd,Hdd(ispin)%mtrx,Sdd,coeff,work1)
1044  E_OMM=coeff(0)
1045
1046  ! this is the main loop of the CG algorithm. We perform a series of line minimizations, with the
1047  ! gradient g at each new step being modified to obtain the search direction d
1048  if (Node==0) then
1049    if (ispin==1) then
1050      print'(a)', '+---------------------------------------------+'
1051      if (UseCholesky) then
1052        print'(a)', '| OMM (Cholesky factorization)                |'
1053      else if (UsePrecon) then
1054        print'(a)', '| OMM (preconditioning)                       |'
1055      else
1056        print'(a)', '| OMM                                         |'
1057      end if
1058      print'(a)',      '+---------------------------------------------+'
1059    end if
1060    if (nspin==2) then
1061      if (ispin==1) then
1062        print'(a)',      '| up spin                                     |'
1063      else
1064        print'(a)',      '| down spin                                   |'
1065      end if
1066      print'(a)',      '+---------------------------------------------+'
1067    end if
1068    if (LongOut) print'(a)', '|             E_OMM            E_diff         |'
1069  end if
1070  conv=.false.
1071  d=0.0_dp
1072  icg=0
1073  do i=1,n_step_max
1074    lambda=0.0_dp
1075    do j=1,h_dim*N_occ-1
1076      if (UsePrecon) then
1077        d=pg+lambda*d
1078      else
1079        d=g+lambda*d
1080      end if
1081      g_p=g
1082      if (UsePrecon) pg_p=pg
1083      E_OMM_old=E_OMM
1084      ! if this is not the first CG step, we have to recalculate Hd, Sd, Hdd, Sdd, and the coeffs.
1085      if (icg>0) then
1086#ifdef MPI
1087        call pdgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,hc,1,1,desc3(1:9,ispin),d,1,1,desc3(1:9,ispin),0.0_dp,Hd(ispin)%mtrx,1,1,&
1088                    desc2(1:9,ispin))
1089#else
1090        call dgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,hc,N_occ,d,N_occ,0.0_dp,Hd(ispin)%mtrx,N_occ)
1091#endif
1092        if (UseCholesky) then
1093#ifdef MPI
1094          call pdgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,c(ispin)%mtrx,1,1,desc3(1:9,ispin),d,1,1,desc3(1:9,ispin),0.0_dp,Sd,1,1,&
1095                      desc2(1:9,ispin))
1096#else
1097          call dgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,c(ispin)%mtrx,N_occ,d,N_occ,0.0_dp,Sd,N_occ)
1098#endif
1099        else
1100#ifdef MPI
1101          call pdgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,sc(ispin)%mtrx,1,1,desc3(1:9,ispin),d,1,1,desc3(1:9,ispin),0.0_dp,Sd,1,1,&
1102                      desc2(1:9,ispin))
1103#else
1104          call dgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,sc(ispin)%mtrx,N_occ,d,N_occ,0.0_dp,Sd,N_occ)
1105#endif
1106        end if
1107        if (allocated(h_dense)) then
1108          call calc_A(h_dim,N_occ,ispin,h_dense,d,Hdd(ispin)%mtrx,hg)
1109        else
1110          call calc_A(h_dim,N_occ,ispin,h_dense1D,d,Hdd(ispin)%mtrx,hg)
1111        end if
1112        if (UseCholesky) then
1113#ifdef MPI
1114          call pdgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,d,1,1,desc3(1:9,ispin),d,1,1,desc3(1:9,ispin),0.0_dp,Sdd,1,1,&
1115                      desc2(1:9,ispin))
1116#else
1117          call dgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,d,N_occ,d,N_occ,0.0_dp,Sdd,N_occ)
1118#endif
1119        else
1120          call calc_A(h_dim,N_occ,ispin,s_dense,d,Sdd,sg(ispin)%mtrx)
1121        end if
1122        call calc_coeff(h_dim,N_occ,ispin,H(ispin)%mtrx,S(ispin)%mtrx,Hd(ispin)%mtrx,Sd,Hdd(ispin)%mtrx,Sdd,coeff,work1)
1123      end if
1124      ! using the coeffs. calculated anlytically, we can find the minimum of the functional in the
1125      ! search direction, and calculate the energy at that minimum
1126      call solve_quartic(coeff(0:4),x_min(ispin),ls_fail)
1127      ! in certain regions of the coeffs. space the line search gives no minimum--this occurs when there
1128      ! are positive eigenvalues in the eigenspecturm which are significantly occupied by our coeffs.
1129      ! matrix; the only known cure, unfortunately, is to scale down the entire matrix, thus returning to
1130      ! a  safe region of the coeffs. space.
1131      if (ls_fail) then
1132        if (Node==0) print'(a)', '| WARNING: Rescaling coefficients!            |'
1133        E_OMM=3.0*E_OMM
1134        c(ispin)%mtrx=0.5_dp*c(ispin)%mtrx
1135        ls_conv=.false.
1136      else
1137        ! if the line search is successful, move to the minimum
1138        E_OMM=coeff(4)*x_min(ispin)**4+&
1139              coeff(3)*x_min(ispin)**3+&
1140              coeff(2)*x_min(ispin)**2+&
1141              coeff(1)*x_min(ispin)+&
1142              coeff(0)
1143        c(ispin)%mtrx=c(ispin)%mtrx+x_min(ispin)*d
1144        ls_conv=.true.
1145      end if
1146      ! recalculate S at the minimum (or for the rescaled coeffs.)
1147      if (ls_fail) then
1148        S(ispin)%mtrx=0.25_dp*S(ispin)%mtrx
1149      else
1150#ifdef MPI
1151        call pdgeadd('T',N_occ,N_occ,x_min(ispin),Sd,1,1,desc2(1:9,ispin),1.0_dp,S(ispin)%mtrx,1,1,desc2(1:9,ispin))
1152#else
1153        do k=1,N_occ
1154          do l=1,N_occ
1155            S(ispin)%mtrx(k,l)=S(ispin)%mtrx(k,l)+x_min(ispin)*Sd(l,k)
1156          end do
1157        end do
1158#endif
1159        S(ispin)%mtrx=S(ispin)%mtrx+x_min(ispin)*Sd+x_min(ispin)**2*Sdd
1160      end if
1161      E_diff=2.0_dp*abs((E_OMM-E_OMM_old)/(E_OMM+E_OMM_old))
1162      if ((Node==0) .and. LongOut) print'(a,2(1x,i5),2(1x,es15.7e3),1x,a)', '|', i, j, E_OMM, E_diff, '|'
1163      icg=icg+1
1164      if (E_diff<=cg_tol) then
1165        conv=.true.
1166        exit
1167      end if
1168      ! recalculate H at the minimum (or for the rescaled coeffs.)
1169      if (ls_fail) then
1170        H(ispin)%mtrx=0.25_dp*H(ispin)%mtrx
1171      else
1172#ifdef MPI
1173        call pdgeadd('T',N_occ,N_occ,x_min(ispin),Hd(ispin)%mtrx,1,1,desc2(1:9,ispin),1.0_dp,H(ispin)%mtrx,1,1,desc2(1:9,ispin))
1174#else
1175        do k=1,N_occ
1176          do l=1,N_occ
1177            H(ispin)%mtrx(k,l)=H(ispin)%mtrx(k,l)+x_min(ispin)*Hd(ispin)%mtrx(l,k)
1178          end do
1179        end do
1180#endif
1181        H(ispin)%mtrx=H(ispin)%mtrx+x_min(ispin)*Hd(ispin)%mtrx+x_min(ispin)**2*Hdd(ispin)%mtrx
1182      end if
1183      ! recalculate g at the minimum (or for the rescaled coeffs.)
1184      if (ls_fail) then
1185        hc=0.5_dp*hc
1186        if (.not. UseCholesky) sc(ispin)%mtrx=0.5_dp*sc(ispin)%mtrx
1187        g=g_p+1.5_dp*hc
1188      else
1189        hc=hc+x_min(ispin)*hg
1190        if (UseCholesky) then
1191          call calc_grad(h_dim,N_occ,ispin,H(ispin)%mtrx,S(ispin)%mtrx,g,hc,c(ispin)%mtrx)
1192        else
1193          sc(ispin)%mtrx=sc(ispin)%mtrx+x_min(ispin)*sg(ispin)%mtrx
1194          call calc_grad(h_dim,N_occ,ispin,H(ispin)%mtrx,S(ispin)%mtrx,g,hc,sc(ispin)%mtrx)
1195        end if
1196      end if
1197      if (UsePrecon) then
1198#ifdef MPI
1199        call pdgemm('N','N',N_occ,h_dim,h_dim,1.0_dp,g,1,1,desc3(1:9,ispin),p_dense,1,1,desc1,0.0_dp,pg,1,1,desc3(1:9,ispin))
1200#else
1201        call dgemm('N','N',N_occ,h_dim,h_dim,1.0_dp,g,N_occ,p_dense,h_dim,0.0_dp,pg,N_occ)
1202#endif
1203      end if
1204      if (ls_conv) then
1205        if (UsePrecon) then
1206          lambda_n=ddot(N_occ_loc(1,ispin)*h_dim_loc(2),pg,1,g-g_p,1)
1207          lambda_d=ddot(N_occ_loc(1,ispin)*h_dim_loc(2),pg_p,1,g_p,1)
1208        else
1209          lambda_n=ddot(N_occ_loc(1,ispin)*h_dim_loc(2),g,1,g-g_p,1)
1210          lambda_d=ddot(N_occ_loc(1,ispin)*h_dim_loc(2),g_p,1,g_p,1)
1211        end if
1212#ifdef MPI
1213        call mpi_allreduce(lambda_n,lambda_n_tot,1,mpi_double_precision,mpi_sum,mpi_comm_world,info)
1214        call mpi_allreduce(lambda_d,lambda_d_tot,1,mpi_double_precision,mpi_sum,mpi_comm_world,info)
1215        lambda=lambda_n_tot/lambda_d_tot
1216#else
1217        lambda=lambda_n/lambda_d
1218#endif
1219      else
1220        exit
1221      end if
1222    end do
1223    if (conv) exit
1224  end do
1225  if (i>n_step_max) then
1226    if (Node==0) print'(a)', '| WARNING: OMM failed to converge!            |'
1227  end if
1228  if ((Node==0) .and. LongOut) print'(a)', '+---------------------------------------------+'
1229
1230  deallocate(work1)
1231  deallocate(hg)
1232  deallocate(hc)
1233  deallocate(d)
1234  if (UsePrecon) then
1235    deallocate(pg_p)
1236    deallocate(pg)
1237  end if
1238  deallocate(g_p)
1239  deallocate(g)
1240  deallocate(Sdd)
1241  deallocate(Sd)
1242  if (allocated(h_dense)) deallocate(h_dense)
1243
1244  ! calculate the density matrix: d=c*(2*I-S)*c^T
1245  if (.not. allocated(twoI(ispin)%mtrx)) then
1246    allocate(twoI(ispin)%mtrx(1:N_occ_loc(1,ispin),1:N_occ_loc(2,ispin)))
1247#ifdef MPI
1248    call pdlaset('A',N_occ,N_occ,0.0_dp,2.0_dp,twoI(ispin)%mtrx,1,1,desc2(1:9,ispin))
1249#else
1250    twoI(ispin)%mtrx=0.0_dp
1251    do i=1,N_occ
1252      twoI(ispin)%mtrx(i,i)=2.0_dp
1253    end do
1254#endif
1255  end if
1256  if (.not. allocated(cd(ispin)%mtrx)) allocate(cd(ispin)%mtrx(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
1257  if (UseCholesky) then
1258    c_orig(ispin)%mtrx=c(ispin)%mtrx
1259#ifdef MPI
1260    call pdtrsm('R','U','T','N',N_occ,h_dim,1.0_dp,s_dense,1,1,desc1,c_orig(ispin)%mtrx,1,1,desc3(1:9,ispin))
1261    if (Use2D) then
1262      call calc_densmat(h_dim,N_occ,ispin,twoI(ispin)%mtrx-S(ispin)%mtrx,c_orig(ispin)%mtrx,d_dense2D,cd(ispin)%mtrx)
1263    else
1264      call calc_densmat(h_dim,N_occ,ispin,twoI(ispin)%mtrx-S(ispin)%mtrx,c_orig(ispin)%mtrx,d_dense1D,cd(ispin)%mtrx)
1265    end if
1266#else
1267    call dtrsm('R','U','T','N',N_occ,h_dim,1.0_dp,s_dense,h_dim,c_orig(ispin)%mtrx,N_occ)
1268    call calc_densmat(h_dim,N_occ,ispin,twoI(ispin)%mtrx-S(ispin)%mtrx,c_orig(ispin)%mtrx,d_dense1D,cd(ispin)%mtrx)
1269#endif
1270  else
1271#ifdef MPI
1272    if (Use2D) then
1273      call calc_densmat(h_dim,N_occ,ispin,twoI(ispin)%mtrx-S(ispin)%mtrx,c(ispin)%mtrx,d_dense2D,cd(ispin)%mtrx)
1274    else
1275      call calc_densmat(h_dim,N_occ,ispin,twoI(ispin)%mtrx-S(ispin)%mtrx,c(ispin)%mtrx,d_dense1D,cd(ispin)%mtrx)
1276    end if
1277#else
1278    call calc_densmat(h_dim,N_occ,ispin,twoI(ispin)%mtrx-S(ispin)%mtrx,c(ispin)%mtrx,d_dense1D,cd(ispin)%mtrx)
1279#endif
1280  end if
1281#ifdef MPI
1282  if (Use2D) then
1283    call pdgemr2d(h_dim,h_dim,d_dense2D,1,1,desc1,d_dense1D,1,1,desc1_1D,ictxt)
1284    deallocate(d_dense2D)
1285  end if
1286#endif
1287  if (nspin==1) d_dense1D=2.0_dp*d_dense1D
1288
1289  ! calculate the trace of S to make sure we are occupying the right number of eigenstates in our
1290  ! solution
1291#ifdef MPI
1292  Tr_loc=ddot(N_occ_loc(1,ispin)*N_occ_loc(2,ispin),twoI(ispin)%mtrx-S(ispin)%mtrx,1,S(ispin)%mtrx,1)
1293  call mpi_allreduce(Tr_loc,TrQS,1,mpi_double_precision,mpi_sum,mpi_comm_world,info)
1294#else
1295  TrQS=ddot(N_occ*N_occ,twoI(ispin)%mtrx-S(ispin)%mtrx,1,S(ispin)%mtrx,1)
1296#endif
1297  if (Node==0) then
1298    if (nspin==1) then
1299      print'(a,i5,a)',    '| minim: icg             = ', icg, '              |'
1300      print'(a,f13.7,a)', '| minim: 2*Tr[(2*I-S)*S] = ', 2.0_dp*TrQS, '      |'
1301    else
1302      print'(a,i5,a)',    '| minim: icg           = ', icg, '                |'
1303      print'(a,f13.7,a)', '| minim: Tr[(2*I-S)*S] = ', TrQS, '        |'
1304    end if
1305    print'(a)',       '+---------------------------------------------+'
1306  end if
1307
1308  if (FirstCall(ispin)) FirstCall(ispin)=.false.
1309
1310  call timer('m_cg',2)
1311
1312end subroutine minim_cg
1313
1314!================================================!
1315! minimize the Kim functional by conjugate       !
1316! gradients (sparse routine)                     !
1317!================================================!
1318subroutine minim_cg_sparse(nhmax,numh,listhptr,listh,CalcE,PreviousCallDiagon,iscf,h_dim,N_occ,eta,psi,nspin,ispin,&
1319                           UpdatePrecon,UsePrecon,UpdateSparseComm,d_sparse,h_sparse,s_sparse,s_dense1D,t_dense1D)
1320  implicit none
1321
1322  !**** INPUT ***********************************!
1323
1324  logical, intent(in) :: CalcE              ! calculate the energy-density matrix from the existing coeffs.?
1325  logical, intent(in) :: PreviousCallDiagon ! Previous SCF iteration solved by diagonalization?
1326  logical, intent(in) :: UpdatePrecon       ! update the preconditioner?
1327  logical, intent(in) :: UsePrecon          ! use the preconditioner?
1328  logical, intent(in) :: UpdateSparseComm   ! update nhmax_max?
1329
1330  integer, intent(in) :: nhmax       ! first dimension of listh and sparse matrices
1331  integer, intent(in) :: numh(:)     ! num. of nonzero elements of each row of sparse matrices
1332  integer, intent(in) :: listhptr(:) ! pointer to start of row in listh
1333  integer, intent(in) :: listh(:)    ! list of nonzero elements of each row of sparse matrices
1334  integer, intent(in) :: iscf        ! SCF iteration num.
1335  integer, intent(in) :: h_dim       ! num. of AOs (global)
1336  integer, intent(in) :: N_occ       ! num. of WFs (global)
1337  integer, intent(in) :: nspin       ! num. of spins
1338  integer, intent(in) :: ispin       ! up/down spin
1339
1340  real(dp), intent(in) :: eta                                 ! chemical potential for Kim functional
1341  real(dp), intent(in) :: psi(1:h_dim,1:h_dim_loc_1D,1:nspin) ! eigenvectors from diagonalization
1342  real(dp), intent(in), optional :: h_sparse(:)               ! hamiltonian matrix in AO basis (sparse)
1343  real(dp), intent(in), optional :: s_sparse(:)               ! overlap matrix in AO basis (sparse)
1344  real(dp), intent(in), optional :: s_dense1D(:,:)            ! overlap matrix in AO basis (dense 1D)
1345  real(dp), intent(in), optional :: t_dense1D(:,:)            ! kinetic energy matrix in AO basis (dense 1D)
1346
1347  !**** INOUT ***********************************!
1348
1349  real(dp), intent(inout) :: d_sparse(:) ! (energy-)density matrix in AO basis (sparse)
1350
1351  !**** LOCAL ***********************************!
1352
1353  character(len=100) :: WF_COEFFS_filename
1354#ifdef MPI
1355  character(len=5) :: Node_name
1356#endif
1357
1358  logical :: new_s
1359  logical :: conv
1360  logical :: ls_conv
1361  logical :: ls_fail
1362  logical :: ReadCoeffs2
1363
1364  integer :: i
1365  integer :: j
1366  integer :: k
1367  integer :: l
1368  integer :: m
1369  integer :: n
1370  integer :: info
1371  integer :: icg                           ! CG step num.
1372  integer :: n_step_max=100                ! max. num. steps for CG minimization
1373  integer :: lwork
1374  integer, allocatable :: ipiv(:)
1375#ifdef MPI
1376  integer :: liwork
1377  integer :: mpi_status(1:mpi_status_size) ! MPI status
1378  integer, save :: ictxt                   ! handle for main BLACS context (1D)
1379  integer, save :: nhmax_max
1380  integer, save :: h_dim_loc_max
1381  integer, allocatable :: iwork(:)
1382  integer, external :: numroc
1383#else
1384  integer, external :: ilaenv
1385#endif
1386
1387  real(dp) :: rn
1388  real(dp) :: rn2
1389  real(dp) :: E_diff
1390  real(dp) :: TrQS
1391  real(dp) :: lambda
1392  real(dp) :: lambda_n
1393  real(dp) :: lambda_d
1394  real(dp) :: lambda_n_tot
1395  real(dp) :: lambda_d_tot
1396  real(dp) :: E_OMM                             ! OMM functional energy
1397  real(dp) :: E_OMM_old                         ! OMM functional energy at previous step
1398  real(dp) :: coeff(0:4)                        ! coeffs. of the quartic equation
1399  real(dp), save :: x_min(1:2)                  ! position of minimum
1400  real(dp), save :: t_precon_scale              ! kinetic energy scale for the preconditioning
1401  real(dp), save :: cg_tol                      ! convergence tolerance of CG minimization
1402  real(dp), allocatable :: work1(:,:)           ! work matrix
1403  real(dp), allocatable :: Sd(:,:)              ! g^T*s*g
1404  real(dp), allocatable :: Sdd(:,:)             ! g^T*s*g
1405  real(dp), allocatable :: g(:,:)               ! gradient
1406  real(dp), allocatable :: g_p(:,:)             ! gradient at previous step
1407  real(dp), allocatable :: pg(:,:)              ! preconditioned gradient
1408  real(dp), allocatable :: pg_p(:,:)            ! preconditioned gradient at previous step
1409  real(dp), allocatable :: d(:,:)               ! conjugate search direction
1410  real(dp), allocatable :: hc(:,:)              ! h*c
1411  real(dp), allocatable :: hg(:,:)              ! h*g
1412  real(dp), allocatable, save :: p_dense1D(:,:) ! preconditioning matrix in AO basis (dense 1D)
1413  real(dp), external :: ddot
1414#ifdef MPI
1415  real(dp) :: Tr_loc
1416  real(dp), allocatable :: work2(:,:)           ! work matrix
1417  real(dp), allocatable :: work3(:,:)           ! work matrix
1418#endif
1419
1420  type(multispin), save :: H(1:2)    ! hamiltonian matrix in WF basis
1421  type(multispin), save :: S(1:2)    ! overlap matrix in WF basis
1422  type(multispin), save :: Hd(1:2)   ! g^T*h*c
1423  type(multispin), save :: Hdd(1:2)  ! g^T*h*g
1424  type(multispin), save :: sc(1:2)   ! s*c
1425  type(multispin), save :: sg(1:2)   ! s*g
1426  type(multispin), save :: twoI(1:2) ! identity matrix x2
1427  type(multispin), save :: cd(1:2)   ! work matrix
1428
1429  !**********************************************!
1430
1431  call timer('m_cg',1)
1432
1433  ! if this is the first time the minimization module is called, several things need to be done
1434  ! (detailed below)
1435  if (FirstCall(ispin)) then
1436
1437#ifdef MPI
1438    if (all(FirstCall(1:2))) then
1439
1440      ! calculate the local-to-global index transformations needed for the sparse matrix-matrix
1441      ! multiplications
1442      allocate(h_dim_l2g(1:h_dim_loc_1D))
1443      j=0
1444      k=0
1445      l=0
1446      do i=1,h_dim
1447        k=k+1
1448        if (j==Node) then
1449          l=l+1
1450          h_dim_l2g(l)=i
1451        end if
1452        if (k==BlockSize) then
1453          k=0
1454          j=j+1
1455          if (j==Nodes) j=0
1456        end if
1457      end do
1458
1459      ! find the largest value of h_dim_loc over all MPI processes needed for the sparse matrix
1460      ! multiplications
1461      h_dim_loc_max=0
1462      do i=0,Nodes-1
1463        if (i==Node) then
1464          k=h_dim_loc_1D
1465        end if
1466        call mpi_bcast(j,1,mpi_integer,i,mpi_comm_world,info)
1467        call mpi_bcast(k,1,mpi_integer,i,mpi_comm_world,info)
1468        if (k>h_dim_loc_max) h_dim_loc_max=k
1469      end do
1470
1471      ! initialize the BLACS process grids
1472      call blacs_get(-1,0,ictxt)
1473      call blacs_gridinit(ictxt,'C',Nodes,1)
1474
1475    end if
1476
1477    ! calculate the local dimensions of the AO and WF matrices
1478    call blacs_gridinfo(ictxt,i,j,k,l)
1479    N_occ_loc_1D(ispin)=numroc(N_occ,BlockSize_c,k,0,Nodes)
1480    h_dim_loc(1)=h_dim_loc_1D
1481    h_dim_loc(2)=h_dim
1482    N_occ_loc(1,ispin)=N_occ_loc_1D(ispin)
1483    N_occ_loc(2,ispin)=N_occ
1484
1485    ! initialize the matrix descriptors
1486    call descinit(desc1,h_dim,h_dim,BlockSize,BlockSize,0,0,ictxt,h_dim_loc(1),info)
1487    if (info/=0) call die('ERROR: desc1 setup has failed in minim!')
1488    call descinit(desc2(1:9,ispin),N_occ,N_occ,BlockSize_c,BlockSize_c,0,0,ictxt,N_occ_loc(1,ispin),info)
1489    if (info/=0) call die('ERROR: desc2 setup has failed in minim!')
1490    call descinit(desc3(1:9,ispin),N_occ,h_dim,BlockSize_c,BlockSize,0,0,ictxt,N_occ_loc(1,ispin),info)
1491    if (info/=0) call die('ERROR: desc3 setup has failed in minim!')
1492#else
1493    N_occ_loc_1D(ispin)=N_occ
1494    h_dim_loc(1)=h_dim
1495    h_dim_loc(2)=h_dim
1496    N_occ_loc(1,ispin)=N_occ
1497    N_occ_loc(2,ispin)=N_occ
1498#endif
1499
1500    ! allocate the WF coeffs. matrix
1501    allocate(c(ispin)%mtrx(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
1502
1503    ! if this is the first SCF step, then we need to initialize the WF coeffs. matrix with random
1504    ! numbers between -0.5 and 0.5 (normalize at the end to avoid instabilities), unless we are
1505    ! reading them from file
1506    if (.not. PreviousCallDiagon) then
1507      if (ReadCoeffs) then
1508#ifdef MPI
1509        write(Node_name,'(i5)') Node
1510        if (nspin==1) then
1511          WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS.'//trim(adjustl(Node_name))
1512        else
1513          if (ispin==1) then
1514            WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS_UP.'//trim(adjustl(Node_name))
1515          else if (ispin==2) then
1516            WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS_DOWN.'//trim(adjustl(Node_name))
1517          end if
1518        end if
1519#else
1520        if (nspin==1) then
1521          WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS'
1522        else
1523          if (ispin==1) then
1524            WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS_UP'
1525          else if (ispin==2) then
1526            WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS_DOWN'
1527          end if
1528        end if
1529#endif
1530        inquire(file=trim(WF_COEFFS_filename),exist=ReadCoeffs2)
1531      else
1532        ReadCoeffs2=.false.
1533      end if
1534      if (ReadCoeffs2) then
1535        call io_assign(i)
1536        open(i,file=trim(WF_COEFFS_filename),form='unformatted',status='old',action='read')
1537        read(i) c(ispin)%mtrx
1538        call io_close(i)
1539      else
1540        if ((ispin==1) .or. (N_occ_diff/=0)) then
1541          call rand_init
1542          do i=1,h_dim_loc(2)
1543            do j=1,N_occ_loc(1,ispin)
1544              call random_number(rn)
1545              call random_number(rn2)
1546              c(ispin)%mtrx(j,i)=sign(0.5_dp*rn,rn2-0.5_dp)
1547            end do
1548          end do
1549          c(ispin)%mtrx=1.0d-2*c(ispin)%mtrx/sqrt(real(h_dim,dp))
1550        else
1551          c(2)%mtrx=c(1)%mtrx
1552        end if
1553      end if
1554    end if
1555
1556    if (all(FirstCall(1:2))) then
1557      t_precon_scale=fdf_physical('OMM.TPreconScale',10.0_dp,'Ry')
1558      cg_tol=fdf_get('OMM.RelTol',1.0d-9)
1559    end if
1560
1561  end if
1562
1563  ! if the previous SCF iteration was solved by diagonalization, then we take the lowest N_occ
1564  ! eigenfunctions as our initial guess, but we must take care to distribute them properly amongst
1565  ! the MPI processes for parallel runs
1566  if (PreviousCallDiagon) then
1567#ifdef MPI
1568    allocate(work2(1:h_dim,1))
1569    ! i: receiving node
1570    ! j: local orbital num. on receiving node
1571    ! k: global orbital num.
1572    ! l: sending node
1573    ! m: local orbital num. on sending node
1574    k=0
1575    do i=0,Nodes-1
1576      n=N_occ_loc_1D(ispin)
1577      call mpi_bcast(n,1,mpi_integer,i,mpi_comm_world,info)
1578      do j=1,n
1579        k=k+1
1580        call WhichNodeOrb(k,Nodes,l)
1581        call GlobalToLocalOrb(k,l,Nodes,m)
1582        if (Node==l) then
1583          if (Node==i) then
1584            c(ispin)%mtrx(j,1:h_dim)=psi(1:h_dim,m,ispin)
1585          else
1586            call mpi_send(psi(1,m,ispin),h_dim,mpi_double_precision,i,k,mpi_comm_world,info)
1587          end if
1588        else if (Node==i) then
1589          call mpi_recv(work2(1,1),h_dim,mpi_double_precision,l,k,mpi_comm_world,mpi_status,info)
1590          c(ispin)%mtrx(j,1:h_dim)=work2(1:h_dim,1)
1591        end if
1592      end do
1593    end do
1594    deallocate(work2)
1595#else
1596    do i=1,N_occ
1597      c(ispin)%mtrx(i,1:h_dim)=psi(1:h_dim,i,ispin)
1598    end do
1599#endif
1600  end if
1601
1602#ifdef MPI
1603  ! find the largest value of nhmax over all MPI processes needed for the sparse matrix
1604  ! multiplications
1605  if (UpdateSparseComm) then
1606    nhmax_max=0
1607    do i=0,Nodes-1
1608      if (i==Node) then
1609        j=nhmax
1610      end if
1611      call mpi_bcast(j,1,mpi_integer,i,mpi_comm_world,info)
1612      call mpi_bcast(k,1,mpi_integer,i,mpi_comm_world,info)
1613      if (j>nhmax_max) nhmax_max=j
1614    end do
1615  end if
1616
1617  allocate(numh_recv(1:h_dim_loc_max))
1618  allocate(listhptr_recv(1:h_dim_loc_max))
1619  allocate(listh_recv(1:nhmax_max))
1620  allocate(h_dim_l2g_recv(1:h_dim_loc_max))
1621  allocate(As_recv(1:nhmax_max))
1622#endif
1623
1624  if (CalcE) then
1625
1626    ! calculate the energy-density matrix: e=c^T*[(2*I-S)*(H+eta*S)]*c
1627#ifdef MPI
1628    call pdgeadd('T',N_occ,N_occ,x_min(ispin),Hd(ispin)%mtrx,1,1,desc2(1:9,ispin),1.0_dp,H(ispin)%mtrx,1,1,desc2(1:9,ispin))
1629#else
1630    do k=1,N_occ
1631      do l=1,N_occ
1632        H(ispin)%mtrx(k,l)=H(ispin)%mtrx(k,l)+x_min(ispin)*Hd(ispin)%mtrx(l,k)
1633      end do
1634    end do
1635#endif
1636    H(ispin)%mtrx=H(ispin)%mtrx+x_min(ispin)*Hd(ispin)%mtrx+x_min(ispin)**2*Hdd(ispin)%mtrx
1637    allocate(work1(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
1638    call calc_densmat_sparse(h_dim,N_occ,ispin,nhmax,numh,listhptr,listh,H(ispin)%mtrx+eta*S(ispin)%mtrx,c(ispin)%mtrx,&
1639                             d_sparse,work1,cd(ispin)%mtrx)
1640    if (nspin==1) d_sparse=2.0_dp*d_sparse
1641
1642    if (WriteCoeffs) then
1643      call io_assign(i)
1644#ifdef MPI
1645      write(Node_name,'(i5)') Node
1646      if (nspin==1) then
1647        WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS.'//trim(adjustl(Node_name))
1648      else
1649        if (ispin==1) then
1650          WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS_UP.'//trim(adjustl(Node_name))
1651        else if (ispin==2) then
1652          WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS_DOWN.'//trim(adjustl(Node_name))
1653        end if
1654      end if
1655#else
1656      if (nspin==1) then
1657        WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS'
1658      else
1659        if (ispin==1) then
1660          WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS_UP'
1661        else if (ispin==2) then
1662          WF_COEFFS_filename=trim(slabel)//'.WF_COEFFS_DOWN'
1663        end if
1664      end if
1665#endif
1666      open(i,file=trim(WF_COEFFS_filename),form='unformatted',status='replace',action='write')
1667      write(i) c(ispin)%mtrx
1668      call io_close(i)
1669    end if
1670
1671    deallocate(work1)
1672    deallocate(cd(ispin)%mtrx)
1673    if (allocated(p_dense1D)) deallocate(p_dense1D)
1674    deallocate(sg(ispin)%mtrx)
1675    deallocate(sc(ispin)%mtrx)
1676    deallocate(S(ispin)%mtrx)
1677    deallocate(Hdd(ispin)%mtrx)
1678    deallocate(Hd(ispin)%mtrx)
1679    deallocate(H(ispin)%mtrx)
1680
1681#ifdef MPI
1682    deallocate(As_recv)
1683    deallocate(h_dim_l2g_recv)
1684    deallocate(listh_recv)
1685    deallocate(listhptr_recv)
1686    deallocate(numh_recv)
1687#endif
1688
1689    call timer('m_cg',2)
1690
1691    return
1692
1693  end if
1694
1695  if (.not. allocated(H(ispin)%mtrx)) allocate(H(ispin)%mtrx(1:N_occ_loc(1,ispin),1:N_occ_loc(2,ispin)))
1696  if (.not. allocated(Hd(ispin)%mtrx)) allocate(Hd(ispin)%mtrx(1:N_occ_loc(1,ispin),1:N_occ_loc(2,ispin)))
1697  if (.not. allocated(Hdd(ispin)%mtrx)) allocate(Hdd(ispin)%mtrx(1:N_occ_loc(1,ispin),1:N_occ_loc(2,ispin)))
1698  if (.not. allocated(S(ispin)%mtrx)) then
1699    allocate(S(ispin)%mtrx(1:N_occ_loc(1,ispin),1:N_occ_loc(2,ispin)))
1700    new_s=.true.
1701  else
1702    new_s=.false.
1703  end if
1704  if (.not. allocated(sc(ispin)%mtrx)) allocate(sc(ispin)%mtrx(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
1705  if (.not. allocated(sg(ispin)%mtrx)) allocate(sg(ispin)%mtrx(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
1706
1707  ! calculate the preconditioning matrix (s+t/tau)^-1
1708  if (UpdatePrecon .and. (ispin==1)) then
1709    if (.not. allocated(p_dense1D)) allocate(p_dense1D(1:h_dim_loc(1),1:h_dim_loc(2)))
1710    p_dense1D=s_dense1D+t_dense1D/t_precon_scale
1711#ifdef MPI
1712    allocate(ipiv(1:h_dim_loc(1)+BlockSize))
1713    call pdgetrf(h_dim,h_dim,p_dense1D,1,1,desc1,ipiv,info)
1714    if (info/=0) then
1715print*, info
1716call die('ERROR: pdgetrf has failed in minim!')
1717end if
1718    allocate(work1(1:1,1:1))
1719    allocate(iwork(1:1))
1720    call pdgetri(h_dim,p_dense1D,1,1,desc1,ipiv,work1,-1,iwork,-1,info)
1721    if (info/=0) call die('ERROR: pdgetri has failed in minim!')
1722    liwork=iwork(1)
1723    deallocate(iwork)
1724    lwork=work1(1,1)
1725    deallocate(work1)
1726    allocate(work1(1:lwork,1:1))
1727    allocate(iwork(1:liwork))
1728    call pdgetri(h_dim,p_dense1D,1,1,desc1,ipiv,work1,lwork,iwork,liwork,info)
1729    if (info/=0) call die('ERROR: pdgetri has failed in minim!')
1730    deallocate(iwork)
1731    deallocate(work1)
1732    deallocate(ipiv)
1733#else
1734    allocate(ipiv(1:h_dim))
1735    lwork=h_dim*ilaenv(1,'dsytrf','U',h_dim,-1,-1,-1)
1736    allocate(work1(1:lwork,1:1))
1737    call dsytrf('U',h_dim,p_dense1D,h_dim,ipiv,work1,lwork,info)
1738    if (info/=0) call die('ERROR: dsytrf has failed in minim!')
1739    deallocate(work1)
1740    allocate(work1(1:h_dim,1:1))
1741    call dsytri('U',h_dim,p_dense1D,h_dim,ipiv,work1,info)
1742    if (info/=0) call die('ERROR: dsytri has failed in minim!')
1743    deallocate(work1)
1744    deallocate(ipiv)
1745    do i=1,h_dim-1
1746      do j=i+1,h_dim
1747        p_dense1D(j,i)=p_dense1D(i,j)
1748      end do
1749    end do
1750#endif
1751  end if
1752
1753  allocate(Sd(1:N_occ_loc(1,ispin),1:N_occ_loc(2,ispin)))
1754  allocate(Sdd(1:N_occ_loc(1,ispin),1:N_occ_loc(2,ispin)))
1755  allocate(g(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
1756  allocate(g_p(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
1757  if (UsePrecon) then
1758    allocate(pg(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
1759    allocate(pg_p(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
1760  end if
1761  allocate(d(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
1762  allocate(hc(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
1763  allocate(hg(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
1764  allocate(work1(1:N_occ_loc(1,ispin),1:N_occ_loc(2,ispin)))
1765
1766  ! first we calculate the energy and gradient for our initial guess, with the following steps:
1767  ! -calculate the hamiltonian in WF basis: H=c^T*h*c
1768  call calc_A_sparse(h_dim,N_occ,ispin,nhmax,numh,listhptr,listh,h_sparse,c(ispin)%mtrx,H(ispin)%mtrx,hc)
1769  ! -calculate the overlap matrix in WF basis: S=c^T*s*c
1770  if (new_s .or. PreviousCallDiagon) then
1771    call calc_A_sparse(h_dim,N_occ,ispin,nhmax,numh,listhptr,listh,s_sparse,c(ispin)%mtrx,S(ispin)%mtrx,sc(ispin)%mtrx)
1772  else
1773    sc(ispin)%mtrx=sc(ispin)%mtrx+x_min(ispin)*sg(ispin)%mtrx
1774  end if
1775  ! -calculate the gradient: g=2*(2*h*c-s*c*H-h*c*S)
1776  !  (note that we *reuse* h*c and s*c contained in hc and sc from the previous call to calc_A)
1777  call calc_grad(h_dim,N_occ,ispin,H(ispin)%mtrx,S(ispin)%mtrx,g,hc,sc(ispin)%mtrx)
1778  ! -calculate the preconditioned gradient by premultiplying g by (s+t/tau)^-1
1779  if (UsePrecon) then
1780#ifdef MPI
1781    call pdgemm('N','N',N_occ,h_dim,h_dim,1.0_dp,g,1,1,desc3(1:9,ispin),p_dense1D,1,1,desc1,0.0_dp,pg,1,1,desc3(1:9,ispin))
1782#else
1783    call dgemm('N','N',N_occ,h_dim,h_dim,1.0_dp,g,N_occ,p_dense1D,h_dim,0.0_dp,pg,N_occ)
1784#endif
1785  end if
1786  ! -calculate the additional matrices:
1787  !  Hd=g^T*h*c
1788  !  Sd=g^T*s*c
1789  !  Hdd=g^T*h*g
1790  !  Sdd=g^T*s*g
1791  !  (again, h*c has already been calculated, although h*g has not)
1792  !  and, finally, the coeffs. of the quartic line search equation in the direction g
1793  !  (the energy at c is given by the zeroth-order coeff. c(0))
1794  if (UsePrecon) then
1795#ifdef MPI
1796    call pdgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,hc,1,1,desc3(1:9,ispin),pg,1,1,desc3(1:9,ispin),0.0_dp,Hd(ispin)%mtrx,1,1,&
1797                desc2(1:9,ispin))
1798    call pdgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,sc(ispin)%mtrx,1,1,desc3(1:9,ispin),pg,1,1,desc3(1:9,ispin),0.0_dp,Sd,1,1,&
1799                desc2(1:9,ispin))
1800#else
1801    call dgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,hc,N_occ,pg,N_occ,0.0_dp,Hd(ispin)%mtrx,N_occ)
1802    call dgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,sc(ispin)%mtrx,N_occ,pg,N_occ,0.0_dp,Sd,N_occ)
1803#endif
1804  else
1805#ifdef MPI
1806    call pdgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,hc,1,1,desc3(1:9,ispin),g,1,1,desc3(1:9,ispin),0.0_dp,Hd(ispin)%mtrx,1,1,&
1807                desc2(1:9,ispin))
1808#else
1809    call dgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,hc,N_occ,g,N_occ,0.0_dp,Hd(ispin)%mtrx,N_occ)
1810#endif
1811#ifdef MPI
1812    call pdgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,sc(ispin)%mtrx,1,1,desc3(1:9,ispin),g,1,1,desc3(1:9,ispin),0.0_dp,Sd,1,1,&
1813                desc2(1:9,ispin))
1814#else
1815    call dgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,sc(ispin)%mtrx,N_occ,g,N_occ,0.0_dp,Sd,N_occ)
1816#endif
1817  end if
1818  if (UsePrecon) then
1819    call calc_A_sparse(h_dim,N_occ,ispin,nhmax,numh,listhptr,listh,h_sparse,pg,Hdd(ispin)%mtrx,hg)
1820    call calc_A_sparse(h_dim,N_occ,ispin,nhmax,numh,listhptr,listh,s_sparse,pg,Sdd,sg(ispin)%mtrx)
1821  else
1822    call calc_A_sparse(h_dim,N_occ,ispin,nhmax,numh,listhptr,listh,h_sparse,g,Hdd(ispin)%mtrx,hg)
1823    call calc_A_sparse(h_dim,N_occ,ispin,nhmax,numh,listhptr,listh,s_sparse,g,Sdd,sg(ispin)%mtrx)
1824  end if
1825  call calc_coeff(h_dim,N_occ,ispin,H(ispin)%mtrx,S(ispin)%mtrx,Hd(ispin)%mtrx,Sd,Hdd(ispin)%mtrx,Sdd,coeff,work1)
1826  E_OMM=coeff(0)
1827
1828  ! this is the main loop of the CG algorithm. We perform a series of line minimizations, with the
1829  ! gradient g at each new step being modified to obtain the search direction d
1830  if (Node==0) then
1831    if (ispin==1) then
1832      print'(a)', '+---------------------------------------------+'
1833      if (UsePrecon) then
1834        print'(a)', '| OMM (sparse algebra+preconditioning)        |'
1835      else
1836        print'(a)', '| OMM (sparse algebra)                        |'
1837      end if
1838      print'(a)', '+---------------------------------------------+'
1839    end if
1840    if (nspin==2) then
1841      if (ispin==1) then
1842        print'(a)',      '| up spin                                     |'
1843      else
1844        print'(a)',      '| down spin                                   |'
1845      end if
1846      print'(a)',      '+---------------------------------------------+'
1847    end if
1848    if (LongOut) print'(a)', '|             E_OMM            E_diff         |'
1849  end if
1850  conv=.false.
1851  d=0.0_dp
1852  icg=0
1853  do i=1,n_step_max
1854    lambda=0.0_dp
1855    do j=1,h_dim*N_occ-1
1856      if (UsePrecon) then
1857        d=pg+lambda*d
1858      else
1859        d=g+lambda*d
1860      end if
1861      g_p=g
1862      if (UsePrecon) pg_p=pg
1863      E_OMM_old=E_OMM
1864      ! if this is not the first CG step, we have to recalculate Hd, Sd, Hdd, Sdd, and the coeffs.
1865      if (icg>0) then
1866#ifdef MPI
1867        call pdgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,hc,1,1,desc3(1:9,ispin),d,1,1,desc3(1:9,ispin),0.0_dp,Hd(ispin)%mtrx,1,1,&
1868                    desc2(1:9,ispin))
1869        call pdgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,sc(ispin)%mtrx,1,1,desc3(1:9,ispin),d,1,1,desc3(1:9,ispin),0.0_dp,Sd,1,1,&
1870                    desc2(1:9,ispin))
1871#else
1872        call dgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,hc,N_occ,d,N_occ,0.0_dp,Hd(ispin)%mtrx,N_occ)
1873        call dgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,sc(ispin)%mtrx,N_occ,d,N_occ,0.0_dp,Sd,N_occ)
1874#endif
1875        call calc_A_sparse(h_dim,N_occ,ispin,nhmax,numh,listhptr,listh,h_sparse,d,Hdd(ispin)%mtrx,hg)
1876        call calc_A_sparse(h_dim,N_occ,ispin,nhmax,numh,listhptr,listh,s_sparse,d,Sdd,sg(ispin)%mtrx)
1877        call calc_coeff(h_dim,N_occ,ispin,H(ispin)%mtrx,S(ispin)%mtrx,Hd(ispin)%mtrx,Sd,Hdd(ispin)%mtrx,Sdd,coeff,work1)
1878      end if
1879      ! using the coeffs. calculated anlytically, we can find the minimum of the functional in the
1880      ! search direction, and calculate the energy at that minimum
1881      call solve_quartic(coeff(0:4),x_min(ispin),ls_fail)
1882      ! in certain regions of the coeffs. space the line search gives no minimum--this occurs when there
1883      ! are positive eigenvalues in the eigenspecturm which are significantly occupied by our coeffs.
1884      ! matrix; the only known cure, unfortunately, is to scale down the entire matrix, thus returning to
1885      ! a  safe region of the coeffs. space.
1886      if (ls_fail) then
1887        if (Node==0) print'(a)', '| WARNING: Rescaling coefficients!            |'
1888        E_OMM=3.0*E_OMM
1889        c(ispin)%mtrx=0.5_dp*c(ispin)%mtrx
1890        ls_conv=.false.
1891      else
1892        ! if the line search is successful, move to the minimum
1893        E_OMM=coeff(4)*x_min(ispin)**4+&
1894              coeff(3)*x_min(ispin)**3+&
1895              coeff(2)*x_min(ispin)**2+&
1896              coeff(1)*x_min(ispin)+&
1897              coeff(0)
1898        c(ispin)%mtrx=c(ispin)%mtrx+x_min(ispin)*d
1899        ls_conv=.true.
1900      end if
1901      ! recalculate S at the minimum (or for the rescaled coeffs.)
1902      if (ls_fail) then
1903        S(ispin)%mtrx=0.25_dp*S(ispin)%mtrx
1904      else
1905#ifdef MPI
1906        call pdgeadd('T',N_occ,N_occ,x_min(ispin),Sd,1,1,desc2(1:9,ispin),1.0_dp,S(ispin)%mtrx,1,1,desc2(1:9,ispin))
1907#else
1908        do k=1,N_occ
1909          do l=1,N_occ
1910            S(ispin)%mtrx(k,l)=S(ispin)%mtrx(k,l)+x_min(ispin)*Sd(l,k)
1911          end do
1912        end do
1913#endif
1914        S(ispin)%mtrx=S(ispin)%mtrx+x_min(ispin)*Sd+x_min(ispin)**2*Sdd
1915      end if
1916      E_diff=2.0_dp*abs((E_OMM-E_OMM_old)/(E_OMM+E_OMM_old))
1917      if ((Node==0) .and. LongOut) print'(a,2(1x,i5),2(1x,es15.7e3),1x,a)', '|', i, j, E_OMM, E_diff, '|'
1918      icg=icg+1
1919      if (E_diff<=cg_tol) then
1920        conv=.true.
1921        exit
1922      end if
1923      ! recalculate H at the minimum (or for the rescaled coeffs.)
1924      if (ls_fail) then
1925        H(ispin)%mtrx=0.25_dp*H(ispin)%mtrx
1926      else
1927#ifdef MPI
1928        call pdgeadd('T',N_occ,N_occ,x_min(ispin),Hd(ispin)%mtrx,1,1,desc2(1:9,ispin),1.0_dp,H(ispin)%mtrx,1,1,desc2(1:9,ispin))
1929#else
1930        do k=1,N_occ
1931          do l=1,N_occ
1932            H(ispin)%mtrx(k,l)=H(ispin)%mtrx(k,l)+x_min(ispin)*Hd(ispin)%mtrx(l,k)
1933          end do
1934        end do
1935#endif
1936        H(ispin)%mtrx=H(ispin)%mtrx+x_min(ispin)*Hd(ispin)%mtrx+x_min(ispin)**2*Hdd(ispin)%mtrx
1937      end if
1938      ! recalculate g at the minimum (or for the rescaled coeffs.)
1939      if (ls_fail) then
1940        hc=0.5_dp*hc
1941        sc(ispin)%mtrx=0.5_dp*sc(ispin)%mtrx
1942        g=g_p+1.5_dp*hc
1943      else
1944        hc=hc+x_min(ispin)*hg
1945        sc(ispin)%mtrx=sc(ispin)%mtrx+x_min(ispin)*sg(ispin)%mtrx
1946        call calc_grad(h_dim,N_occ,ispin,H(ispin)%mtrx,S(ispin)%mtrx,g,hc,sc(ispin)%mtrx)
1947      end if
1948      if (UsePrecon) then
1949#ifdef MPI
1950        call pdgemm('N','N',N_occ,h_dim,h_dim,1.0_dp,g,1,1,desc3(1:9,ispin),p_dense1D,1,1,desc1,0.0_dp,pg,1,1,desc3(1:9,ispin))
1951#else
1952        call dgemm('N','N',N_occ,h_dim,h_dim,1.0_dp,g,N_occ,p_dense1D,h_dim,0.0_dp,pg,N_occ)
1953#endif
1954      end if
1955      if (ls_conv) then
1956        if (UsePrecon) then
1957          lambda_n=ddot(N_occ_loc(1,ispin)*h_dim_loc(2),pg,1,g-g_p,1)
1958          lambda_d=ddot(N_occ_loc(1,ispin)*h_dim_loc(2),pg_p,1,g_p,1)
1959        else
1960          lambda_n=ddot(N_occ_loc(1,ispin)*h_dim_loc(2),g,1,g-g_p,1)
1961          lambda_d=ddot(N_occ_loc(1,ispin)*h_dim_loc(2),g_p,1,g_p,1)
1962        end if
1963#ifdef MPI
1964        call mpi_allreduce(lambda_n,lambda_n_tot,1,mpi_double_precision,mpi_sum,mpi_comm_world,info)
1965        call mpi_allreduce(lambda_d,lambda_d_tot,1,mpi_double_precision,mpi_sum,mpi_comm_world,info)
1966        lambda=lambda_n_tot/lambda_d_tot
1967#else
1968        lambda=lambda_n/lambda_d
1969#endif
1970      else
1971        exit
1972      end if
1973    end do
1974    if (conv) exit
1975  end do
1976  if (i>n_step_max) then
1977    if (Node==0) print'(a)', '| WARNING: OMM failed to converge!            |'
1978  end if
1979  if ((Node==0) .and. LongOut) print'(a)', '+---------------------------------------------+'
1980
1981  deallocate(work1)
1982  deallocate(hg)
1983  deallocate(hc)
1984  deallocate(d)
1985  if (UsePrecon) then
1986    deallocate(pg_p)
1987    deallocate(pg)
1988  end if
1989  deallocate(g_p)
1990  deallocate(g)
1991  deallocate(Sdd)
1992  deallocate(Sd)
1993
1994  ! calculate the density matrix: d=c*(2*I-S)*c^T
1995  if (.not. allocated(twoI(ispin)%mtrx)) then
1996    allocate(twoI(ispin)%mtrx(1:N_occ_loc(1,ispin),1:N_occ_loc(2,ispin)))
1997#ifdef MPI
1998    call pdlaset('A',N_occ,N_occ,0.0_dp,2.0_dp,twoI(ispin)%mtrx,1,1,desc2(1:9,ispin))
1999#else
2000    twoI(ispin)%mtrx=0.0_dp
2001    do i=1,N_occ
2002      twoI(ispin)%mtrx(i,i)=2.0_dp
2003    end do
2004#endif
2005  end if
2006  if (.not. allocated(cd(ispin)%mtrx)) allocate(cd(ispin)%mtrx(1:N_occ_loc(1,ispin),1:h_dim_loc(2)))
2007  call calc_densmat_sparse(h_dim,N_occ,ispin,nhmax,numh,listhptr,listh,twoI(ispin)%mtrx-S(ispin)%mtrx,c(ispin)%mtrx,&
2008                           d_sparse,cd(ispin)%mtrx)
2009  if (nspin==1) d_sparse=2.0_dp*d_sparse
2010
2011#ifdef MPI
2012    deallocate(As_recv)
2013    deallocate(h_dim_l2g_recv)
2014    deallocate(listh_recv)
2015    deallocate(listhptr_recv)
2016    deallocate(numh_recv)
2017#endif
2018
2019  ! calculate the trace of S to make sure we are occupying the right number of eigenstates in our
2020  ! solution
2021#ifdef MPI
2022  Tr_loc=ddot(N_occ_loc(1,ispin)*N_occ_loc(2,ispin),twoI(ispin)%mtrx-S(ispin)%mtrx,1,S(ispin)%mtrx,1)
2023  call mpi_allreduce(Tr_loc,TrQS,1,mpi_double_precision,mpi_sum,mpi_comm_world,info)
2024#else
2025  TrQS=ddot(N_occ*N_occ,twoI(ispin)%mtrx-S(ispin)%mtrx,1,S(ispin)%mtrx,1)
2026#endif
2027  if (Node==0) then
2028    if (nspin==1) then
2029      print'(a,i5,a)',    '| minim: icg             = ', icg, '              |'
2030      print'(a,f13.7,a)', '| minim: 2*Tr[(2*I-S)*S] = ', 2.0_dp*TrQS, '      |'
2031    else
2032      print'(a,i5,a)',    '| minim: icg           = ', icg, '                |'
2033      print'(a,f13.7,a)', '| minim: Tr[(2*I-S)*S] = ', TrQS, '        |'
2034    end if
2035    print'(a)',       '+---------------------------------------------+'
2036  end if
2037
2038  if (FirstCall(ispin)) FirstCall(ispin)=.false.
2039
2040  call timer('m_cg',2)
2041
2042end subroutine minim_cg_sparse
2043
2044!================================================!
2045! calculate the coeffs. of the quartic line      !
2046! search equation from three energy points and   !
2047! two gradient points                            !
2048!================================================!
2049subroutine fit_quartic(x,y,g,c)
2050  implicit none
2051
2052  !**** INPUT ***********************************!
2053
2054  real(dp), intent(in) :: x(1:3) ! three x-points {x_i}
2055  real(dp), intent(in) :: y(1:3) ! y(x_i) at the three points
2056  real(dp), intent(in) :: g(1:2) ! (dy/dx)|x_i at the three points
2057
2058  !**** OUTPUT **********************************!
2059
2060  real(dp), intent(out) :: c(0:4) ! coeffs. of the quartic equation
2061
2062  !**********************************************!
2063
2064  !call timer('m_solve_quartic',1)
2065
2066  ! the following expressions for the coeffs. were produced automatically using Maple 12
2067  c(4)=(x(3)**3*x(2)*g(1)-3*x(1)*x(2)**2*y(1)+3*y(3)*x(1)*x(2)**2+x(1)**2*x(2)**2*g(1)+x(3)*x(2)**3*&
2068       g(1)+2*x(1)**2*x(3)**2*g(2)-3*x(2)*x(3)**2*y(1)+3*y(2)*x(1)**2*x(2)-x(3)**3*x(1)*g(1)+x(3)**3*&
2069       x(2)*g(2)-x(2)**2*x(3)**2*g(2)-x(1)**2*x(2)**2*g(2)-2*x(2)**2*x(3)**2*g(1)+3*x(2)*x(3)**2*&
2070       y(2)+x(1)**2*x(3)**2*g(1)-x(1)*x(2)**3*g(1)-3*x(1)*x(3)**2*y(1)-x(3)**3*x(1)*g(2)+3*x(1)*&
2071       x(3)**2*y(2)+x(2)*g(2)*x(1)**3-3*y(3)*x(1)**2*x(2)-x(3)*g(2)*x(1)**3+6*x(1)*x(3)*x(2)*y(1)+2*&
2072       x(1)*x(3)*x(2)**2*g(2)+x(1)*x(3)**2*x(2)*g(1)-x(1)*x(3)**2*x(2)*g(2)-2*x(1)**2*x(3)*x(2)*g(1)-&
2073       x(1)**2*x(3)*x(2)*g(2)+x(1)*x(3)*x(2)**2*g(1)-6*x(1)*x(3)*x(2)*y(2)+2*x(3)**3*y(1)-2*x(3)**3*&
2074       y(2)+x(2)**3*y(1)+y(3)*x(1)**3-y(3)*x(2)**3-y(2)*x(1)**3)/(-2*x(3)**3*x(1)**4+x(3)**4*x(1)**3-&
2075       x(1)**2*x(2)**5-x(3)**4*x(2)**3-x(2)**5*x(3)**2-3*x(1)**4*x(2)**3+2*x(3)**3*x(2)**4+x(1)**5*&
2076       x(3)**2+3*x(2)**4*x(1)**3+4*x(3)**3*x(2)*x(1)**3-4*x(3)**3*x(1)*x(2)**3+2*x(1)*x(3)*x(2)**5+4*&
2077       x(1)**4*x(3)*x(2)**2+8*x(1)**2*x(3)**2*x(2)**3+x(1)**4*x(3)**2*x(2)-x(1)*x(3)**2*x(2)**4-2*&
2078       x(1)**5*x(3)*x(2)-4*x(1)**2*x(3)*x(2)**4+x(1)**5*x(2)**2-8*x(2)**2*x(3)**2*x(1)**3-3*x(3)**4*&
2079       x(1)**2*x(2)+3*x(3)**4*x(1)*x(2)**2)
2080
2081  c(3)=-(-x(1)*g(1)+2*c(4)*x(1)**4-x(1)*g(2)+4*x(1)*c(4)*x(2)**3+x(2)*g(1)-4*x(2)*c(4)*x(1)**3+x(2)*&
2082       g(2)-2*c(4)*x(2)**4+2*y(1)-2*y(2))/(x(1)**3+3*x(1)*x(2)**2-3*x(2)*x(1)**2-x(2)**3)
2083
2084  c(2)=-(-y(2)+c(4)*x(2)**4+c(3)*x(2)**3+x(2)*g(1)-4*x(2)*c(4)*x(1)**3-3*x(2)*c(3)*x(1)**2+y(1)+3*&
2085       c(4)*x(1)**4+2*c(3)*x(1)**3-x(1)*g(1))/(x(1)**2-2*x(1)*x(2)+x(2)**2)
2086
2087  c(1)=g(1)-4*c(4)*x(1)**3-3*c(3)*x(1)**2-2*c(2)*x(1)
2088
2089  c(0)=y(1)-c(4)*x(1)**4-c(3)*x(1)**3-c(2)*x(1)**2-c(1)*x(1)
2090
2091  !if (Node==0) print*, 'f(x)=',c(4),'*x**4+',c(3),'*x**3+',c(2),'*x**2+',c(1),'*x+',c(0)
2092
2093  !call timer('m_fit_quartic',2)
2094
2095end subroutine fit_quartic
2096
2097!================================================!
2098! find the minimum for the quartic line search   !
2099! equation                                       !
2100!================================================!
2101subroutine solve_quartic(c,x_min,fail)
2102  implicit none
2103
2104  !**** INPUT ***********************************!
2105
2106  real(dp), intent(in) :: c(0:4) ! coeffs. of the quartic equation
2107
2108  !**** OUTPUT **********************************!
2109
2110  logical, intent(out) :: fail ! did we fail to find a minimum?
2111
2112  real(dp), intent(out) :: x_min ! position of minimum
2113
2114  !**** LOCAL ***********************************!
2115
2116  integer :: i
2117  integer :: x_order(1:3)
2118
2119  real(dp) :: t(1:3)
2120  real(dp) :: z(1:3)
2121  real(dp) :: a
2122  real(dp) :: b
2123  real(dp) :: d
2124  real(dp) :: Q
2125  real(dp) :: R
2126  real(dp) :: theta
2127  real(dp) :: S
2128  real(dp) :: U
2129
2130  !**********************************************!
2131
2132  !call timer('m_solve_quartic',1)
2133
2134  fail=.false.
2135
2136  !if (c(4)<0.0_dp) then
2137  !  if (Node==0) print*, '#WARNING: Function is unbounded!'
2138  !  !stop
2139  !end if
2140
2141  ! in order to find the minimum of the quartic equation, we have to solve a cubic equation; the
2142  ! following method is taken from Numerical Recipes
2143  a=3.0_dp*c(3)/(4.0_dp*c(4))
2144  b=2.0_dp*c(2)/(4.0_dp*c(4))
2145  if ((abs(b)>=1.0d11) .or. (abs(c(4))<=1.0d-11)) then
2146    !if (Node==0) print*, '#WARNING: Function is quadratic!'
2147    x_min=-0.5_dp*c(1)/c(2)
2148    return
2149  end if
2150  d=c(1)/(4.0_dp*c(4))
2151
2152  Q=(a**2-3.0_dp*b)/9.0_dp
2153  R=(2.0_dp*a**3-9.0_dp*a*b+27.0_dp*d)/54.0_dp
2154  if (R**2<Q**3) then
2155    theta=acos(R/sqrt(Q**3))
2156    t(1)=-2.0_dp*sqrt(Q)*cos(theta/3.0_dp)-a/3.0_dp
2157    t(2)=-2.0_dp*sqrt(Q)*cos((theta+2.0_dp*Pi)/3.0_dp)-a/3.0_dp
2158    t(3)=-2.0_dp*sqrt(Q)*cos((theta-2.0_dp*Pi)/3.0_dp)-a/3.0_dp
2159    z(1:3)=c(4)*t(1:3)**4+c(3)*t(1:3)**3+c(2)*t(1:3)**2+c(1)*t(1:3)+c(0)
2160    if (c(4)>0.0_dp) then
2161      if (all(z(1)>=z(2:3))) then
2162        x_order(1:3)=(/1,2,3/)
2163      else if (z(2)>z(3)) then
2164        x_order(1:3)=(/2,3,1/)
2165      else
2166        x_order(1:3)=(/3,1,2/)
2167      end if
2168      if ((0.0_dp<=t(x_order(1))) .and. (t(x_order(2))<=t(x_order(1)))) then
2169        x_min=t(x_order(2))
2170      else
2171        x_min=t(x_order(3))
2172      end if
2173    else
2174      if (all(z(1)<=z(2:3))) then
2175        x_min=t(1)
2176      else if (z(2)<z(3)) then
2177        x_min=t(2)
2178      else
2179        x_min=t(3)
2180      end if
2181    end if
2182  else
2183    S=-sign(1.0_dp,R)*(abs(R)+sqrt(R**2-Q**3))**(1.0_dp/3.0_dp)
2184    if (S==0.0_dp) then
2185      U=0.0_dp
2186    else
2187      U=Q/S
2188    end if
2189    x_min=(S+U)-(a/3.0_dp)
2190    if (c(4)<0.0_dp) fail=.true.
2191  end if
2192
2193  !call timer('m_solve_quartic',2)
2194
2195end subroutine solve_quartic
2196
2197!================================================!
2198! calculate the gradient of the Kim functional:  !
2199! g=2*(2*h*C-s*C*H-h*C*S)                        !
2200!================================================!
2201subroutine calc_grad(h_dim,N_occ,ispin,H,S,grad,hc,sc)
2202  implicit none
2203
2204  !**** INPUT ***********************************!
2205
2206  integer, intent(in) :: h_dim ! num. of AOs (global)
2207  integer, intent(in) :: N_occ ! num. of WFs (global)
2208  integer, intent(in) :: ispin ! up/down spin (1/2)
2209
2210  real(dp), intent(in) :: H(:,:) ! hamiltonian matrix in WF basis
2211  real(dp), intent(in) :: S(:,:) ! overlap matrix in WF basis
2212
2213  !**** INOUT ***********************************!
2214
2215  real(dp), intent(inout) :: grad(:,:) ! gradient of Kim functional
2216  real(dp), intent(inout) :: hc(:,:)   ! h*c
2217  real(dp), intent(inout) :: sc(:,:)   ! s*c
2218
2219  !**********************************************!
2220
2221  call timer('m_calc_grad',1)
2222
2223  grad=4.0_dp*hc
2224
2225#ifdef MPI
2226  call pdgemm('N','N',N_occ,h_dim,N_occ,-2.0_dp,S,1,1,desc2(1:9,ispin),hc,1,1,desc3(1:9,ispin),1.0_dp,grad,1,1,desc3(1:9,ispin))
2227  call pdgemm('N','N',N_occ,h_dim,N_occ,-2.0_dp,H,1,1,desc2(1:9,ispin),sc,1,1,desc3(1:9,ispin),1.0_dp,grad,1,1,desc3(1:9,ispin))
2228#else
2229  call dgemm('N','N',N_occ,h_dim,N_occ,-2.0_dp,S,N_occ,hc,N_occ,1.0_dp,grad,N_occ)
2230  call dgemm('N','N',N_occ,h_dim,N_occ,-2.0_dp,H,N_occ,sc,N_occ,1.0_dp,grad,N_occ)
2231#endif
2232
2233  call timer('m_calc_grad',2)
2234
2235end subroutine calc_grad
2236
2237!================================================!
2238! calculate operator matrix in WF basis (dense   !
2239! routine)                                       !
2240!================================================!
2241subroutine calc_A(h_dim,N_occ,ispin,Ap,c,A,Apc)
2242  implicit none
2243
2244  !**** INPUT ***********************************!
2245
2246  integer, intent(in) :: h_dim ! num. of AOs (global)
2247  integer, intent(in) :: N_occ ! num. of WFs (global)
2248  integer, intent(in) :: ispin ! up/down spin (1/2)
2249
2250  real(dp), intent(in) :: Ap(:,:) ! operator matrix in AO basis
2251  real(dp), intent(in) :: c(:,:)  ! WF coeffs. matrix
2252
2253  !**** INOUT ***********************************!
2254
2255  real(dp), intent(inout) :: A(:,:)   ! operator matrix in WF basis
2256  real(dp), intent(inout) :: Apc(:,:) ! work matrix
2257
2258  !**********************************************!
2259
2260  call timer('m_calc_A',1)
2261
2262#ifdef MPI
2263  call pdgemm('N','N',N_occ,h_dim,h_dim,1.0_dp,c,  1,1,desc3(1:9,ispin),Ap,1,1,desc1,           0.0_dp,Apc,1,1,desc3(1:9,ispin))
2264  call pdgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,Apc,1,1,desc3(1:9,ispin),c, 1,1,desc3(1:9,ispin),0.0_dp,A,  1,1,desc2(1:9,ispin))
2265#else
2266  call dgemm('N','N',N_occ,h_dim,h_dim,1.0_dp,c,  N_occ,Ap,h_dim,0.0_dp,Apc,N_occ)
2267  call dgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,Apc,N_occ,c, N_occ,0.0_dp,A,  N_occ)
2268#endif
2269
2270  call timer('m_calc_A',2)
2271
2272end subroutine calc_A
2273
2274!================================================!
2275! calculate operator matrix in WF basis (sparse  !
2276! routine)                                       !
2277!================================================!
2278subroutine calc_A_sparse(h_dim,N_occ,ispin,nhmax,numh,listhptr,listh,As,c,A,Asc)
2279  implicit none
2280
2281  !**** INPUT ***********************************!
2282
2283  integer, intent(in) :: h_dim       ! num. of AOs (global)
2284  integer, intent(in) :: N_occ       ! num. of WFs (global)
2285  integer, intent(in) :: ispin       ! up/down spin (1/2)
2286  integer, intent(in) :: nhmax       ! first dimension of listh and sparse matrices
2287  integer, intent(in) :: numh(:)     ! num. of nonzero elements of each row of sparse matrices
2288  integer, intent(in) :: listhptr(:) ! pointer to start of row in listh
2289  integer, intent(in) :: listh(:)    ! list of nonzero elements of each row of sparse matrices
2290
2291  real(dp), intent(in) :: As(:)  ! operator matrix in AO basis (sparse)
2292  real(dp), intent(in) :: c(:,:) ! WF coeffs. matrix
2293
2294  !**** INOUT ***********************************!
2295
2296  real(dp), intent(inout) :: A(:,:)   ! operator matrix in WF basis
2297  real(dp), intent(inout) :: Asc(:,:) ! work matrix
2298
2299  !**** LOCAL ***********************************!
2300
2301  integer :: i
2302  integer :: j
2303  integer :: l
2304#ifdef MPI
2305  integer :: n_comm
2306  integer :: info
2307  integer :: h_dim_loc_recv
2308  integer :: nhmax_recv
2309#endif
2310
2311  !**********************************************!
2312
2313  call timer('m_calc_A',1)
2314
2315#ifdef MPI
2316  Asc=0.0_dp
2317  do n_comm=0,Nodes-1
2318    if (n_comm==Node) then
2319      h_dim_loc_recv=h_dim_loc_1D
2320      nhmax_recv=nhmax
2321      numh_recv(1:h_dim_loc_1D)=numh(1:h_dim_loc_1D)
2322      listhptr_recv(1:h_dim_loc_1D)=listhptr(1:h_dim_loc_1D)
2323      listh_recv(1:nhmax)=listh(1:nhmax)
2324      As_recv(1:nhmax)=As(1:nhmax)
2325      h_dim_l2g_recv(1:h_dim_loc_1D)=h_dim_l2g(1:h_dim_loc_1D)
2326    end if
2327    call mpi_bcast(h_dim_loc_recv,   1,              mpi_integer,         n_comm,mpi_comm_world,info)
2328    call mpi_bcast(nhmax_recv,       1,              mpi_integer,         n_comm,mpi_comm_world,info)
2329    call mpi_bcast(numh_recv(1),     h_dim_loc_recv, mpi_integer,         n_comm,mpi_comm_world,info)
2330    call mpi_bcast(listhptr_recv(1), h_dim_loc_recv, mpi_integer,         n_comm,mpi_comm_world,info)
2331    call mpi_bcast(listh_recv(1),    nhmax_recv,     mpi_integer,         n_comm,mpi_comm_world,info)
2332    call mpi_bcast(As_recv(1),       nhmax_recv,     mpi_double_precision,n_comm,mpi_comm_world,info)
2333    call mpi_bcast(h_dim_l2g_recv(1),h_dim_loc_recv, mpi_integer,         n_comm,mpi_comm_world,info)
2334    if (n_comm==Node) then
2335      do i=1,h_dim_loc_1D
2336        do j=1,numh(i)
2337          l=listhptr(i)+j
2338          Asc(:,h_dim_l2g(i))=Asc(:,h_dim_l2g(i))+As(l)*c(:,listh(l))
2339        end do
2340      end do
2341    else
2342      do i=1,h_dim_loc_recv
2343        do j=1,numh_recv(i)
2344          l=listhptr_recv(i)+j
2345          Asc(:,h_dim_l2g_recv(i))=Asc(:,h_dim_l2g_recv(i))+As_recv(l)*c(:,listh_recv(l))
2346        end do
2347      end do
2348    end if
2349  end do
2350  call pdgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,Asc,1,1,desc3(1:9,ispin),c,1,1,desc3(1:9,ispin),0.0_dp,A,1,1,desc2(1:9,ispin))
2351#else
2352  Asc=0.0_dp
2353  do i=1,h_dim
2354    do j=1,numh(i)
2355      l=listhptr(i)+j
2356      Asc(:,i)=Asc(:,i)+As(l)*c(:,listh(l))
2357    end do
2358  end do
2359  call dgemm('N','T',N_occ,N_occ,h_dim,1.0_dp,Asc,N_occ,c,N_occ,0.0_dp,A,N_occ)
2360#endif
2361
2362  call timer('m_calc_A',2)
2363
2364end subroutine calc_A_sparse
2365
2366!================================================!
2367! calculate operator matrix in AO basis (dense   !
2368! routine)                                       !
2369!================================================!
2370subroutine calc_densmat(h_dim,N_occ,ispin,A,c1,Ap,cA,c2)
2371  implicit none
2372
2373  !**** INPUT ***********************************!
2374
2375  integer, intent(in) :: h_dim ! num. of AOs (global)
2376  integer, intent(in) :: N_occ ! num. of WFs (global)
2377  integer, intent(in) :: ispin ! up/down spin (1/2)
2378
2379  real(dp), intent(in) :: A(:,:)            ! Operator matrix in WF basis
2380  real(dp), intent(in) :: c1(:,:)           ! WF coeffs. matrix
2381  real(dp), intent(in), optional :: c2(:,:) ! pre-multiplied WF coeffs. matrix
2382
2383  !**** INOUT ***********************************!
2384
2385  real(dp), intent(inout) :: Ap(:,:) ! operator matrix in orbital basis
2386  real(dp), intent(inout) :: cA(:,:) ! work matrix
2387
2388  !**********************************************!
2389
2390  call timer('m_calc_densmat',1)
2391
2392#ifdef MPI
2393  if (present(c2)) then
2394    call pdgemm('N','N',N_occ,h_dim,N_occ,1.0_dp,A,1,1,desc2(1:9,ispin),c2,1,1,desc3(1:9,ispin),0.0_dp,cA,1,1,desc3(1:9,ispin))
2395  else
2396    call pdgemm('N','N',N_occ,h_dim,N_occ,1.0_dp,A,1,1,desc2(1:9,ispin),c1,1,1,desc3(1:9,ispin),0.0_dp,cA,1,1,desc3(1:9,ispin))
2397  end if
2398  call pdgemm('T','N',h_dim,h_dim,N_occ,1.0_dp,c1,1,1,desc3(1:9,ispin),cA,1,1,desc3(1:9,ispin),0.0_dp,Ap,1,1,desc1)
2399#else
2400  if (present(c2)) then
2401    call dgemm('N','N',N_occ,h_dim,N_occ,1.0_dp,A,N_occ,c2,N_occ,0.0_dp,cA,N_occ)
2402  else
2403    call dgemm('N','N',N_occ,h_dim,N_occ,1.0_dp,A,N_occ,c1,N_occ,0.0_dp,cA,N_occ)
2404  end if
2405  call dgemm('T','N',h_dim,h_dim,N_occ,1.0_dp,c1,N_occ,cA,N_occ,0.0_dp,Ap,h_dim)
2406#endif
2407
2408  call timer('m_calc_densmat',2)
2409
2410end subroutine calc_densmat
2411
2412!================================================!
2413! calculate operator matrix in AO basis (sparse  !
2414! routine)                                       !
2415!================================================!
2416subroutine calc_densmat_sparse(h_dim,N_occ,ispin,nhmax,numh,listhptr,listh,A,c1,As,cA,c2)
2417  implicit none
2418
2419  !**** INPUT ***********************************!
2420
2421  integer, intent(in) :: h_dim       ! num. of AOs (global)
2422  integer, intent(in) :: N_occ       ! num. of WFs (global)
2423  integer, intent(in) :: ispin       ! up/down spin (1/2)
2424  integer, intent(in) :: nhmax       ! first dimension of listh and sparse matrices
2425  integer, intent(in) :: numh(:)     ! num. of nonzero elements of each row of sparse matrices
2426  integer, intent(in) :: listhptr(:) ! pointer to start of row in listh
2427  integer, intent(in) :: listh(:)    ! list of nonzero elements of each row of sparse matrices
2428
2429  real(dp), intent(in) :: A(:,:)            ! operator matrix in WF basis
2430  real(dp), intent(in) :: c1(:,:)           ! WF coeffs. matrix
2431  real(dp), intent(in), optional :: c2(:,:) ! pre-multiplied WF coeffs. matrix
2432
2433  !**** OUTPUT **********************************!
2434
2435  real(dp), intent(out) :: As(:) ! operator matrix in AO basis (sparse)
2436
2437  !**** INOUT ***********************************!
2438
2439  real(dp), intent(inout) :: cA(:,:) ! work matrix
2440
2441  !**** LOCAL ***********************************!
2442
2443  integer :: i
2444  integer :: j
2445  integer :: k
2446  integer :: l
2447#ifdef MPI
2448  integer :: n_comm
2449  integer :: info
2450  integer :: h_dim_loc_recv
2451  integer :: nhmax_recv
2452#endif
2453
2454  !**********************************************!
2455
2456  call timer('m_calc_densmat',1)
2457
2458#ifdef MPI
2459  if (present(c2)) then
2460    call pdgemm('N','N',N_occ,h_dim,N_occ,1.0_dp,A,1,1,desc2(1:9,ispin),c2,1,1,desc3(1:9,ispin),0.0_dp,cA,1,1,desc3(1:9,ispin))
2461  else
2462    call pdgemm('N','N',N_occ,h_dim,N_occ,1.0_dp,A,1,1,desc2(1:9,ispin),c1,1,1,desc3(1:9,ispin),0.0_dp,cA,1,1,desc3(1:9,ispin))
2463  end if
2464  do n_comm=0,Nodes-1
2465    if (n_comm==Node) then
2466      h_dim_loc_recv=h_dim_loc_1D
2467      nhmax_recv=nhmax
2468      numh_recv(1:h_dim_loc_1D)=numh(1:h_dim_loc_1D)
2469      listhptr_recv(1:h_dim_loc_1D)=listhptr(1:h_dim_loc_1D)
2470      listh_recv(1:nhmax)=listh(1:nhmax)
2471      h_dim_l2g_recv(1:h_dim_loc_1D)=h_dim_l2g(1:h_dim_loc_1D)
2472    end if
2473    call mpi_bcast(h_dim_loc_recv,   1,              mpi_integer,n_comm,mpi_comm_world,info)
2474    call mpi_bcast(nhmax_recv,       1,              mpi_integer,n_comm,mpi_comm_world,info)
2475    call mpi_bcast(numh_recv(1),     h_dim_loc_recv, mpi_integer,n_comm,mpi_comm_world,info)
2476    call mpi_bcast(listhptr_recv(1), h_dim_loc_recv, mpi_integer,n_comm,mpi_comm_world,info)
2477    call mpi_bcast(listh_recv(1),    nhmax_recv,     mpi_integer,n_comm,mpi_comm_world,info)
2478    call mpi_bcast(h_dim_l2g_recv(1),h_dim_loc_recv, mpi_integer,n_comm,mpi_comm_world,info)
2479    As_recv=0.0_dp
2480    do i=1,h_dim_loc_recv
2481      do j=1,numh_recv(i)
2482        l=listhptr_recv(i)+j
2483        do k=1,N_occ_loc_1D(ispin)
2484          As_recv(l)=As_recv(l)+cA(k,listh_recv(l))*c1(k,h_dim_l2g_recv(i))
2485        end do
2486      end do
2487    end do
2488    call mpi_reduce(As_recv(1),As(1),nhmax_recv,mpi_double_precision,mpi_sum,n_comm,mpi_comm_world,info)
2489  end do
2490#else
2491  if (present(c2)) then
2492    call dgemm('N','N',N_occ,h_dim,N_occ,1.0_dp,A,N_occ,c2,N_occ,0.0_dp,cA,N_occ)
2493  else
2494    call dgemm('N','N',N_occ,h_dim,N_occ,1.0_dp,A,N_occ,c1,N_occ,0.0_dp,cA,N_occ)
2495  end if
2496  As=0.0_dp
2497  do i=1,h_dim
2498    do j=1,numh(i)
2499      l=listhptr(i)+j
2500      do k=1,N_occ
2501        As(l)=As(l)+cA(k,listh(l))*c1(k,i)
2502      end do
2503    end do
2504  end do
2505#endif
2506
2507  call timer('m_calc_densmat',2)
2508
2509end subroutine calc_densmat_sparse
2510
2511!================================================!
2512! calculate coeffs. of the quartic line search   !
2513! equation using analytical expressions          !
2514!================================================!
2515subroutine calc_coeff(h_dim,N_occ,ispin,H,S,Hd,Sd,Hdd,Sdd,coeff,SdT)
2516  implicit none
2517
2518  !**** INPUT ***********************************!
2519
2520  integer, intent(in) :: h_dim ! num. of AOs (global)
2521  integer, intent(in) :: N_occ ! num. of WFs (global)
2522  integer, intent(in) :: ispin ! up/down spin (1/2)
2523
2524  real(dp), intent(in) :: H(:,:)   ! hamiltonian matrix in WF basis
2525  real(dp), intent(in) :: S(:,:)   ! overlap matrix in WF basis
2526  real(dp), intent(in) :: Hd(:,:)  ! g^T*h*c
2527  real(dp), intent(in) :: Sd(:,:)  ! g^T*s*c
2528  real(dp), intent(in) :: Hdd(:,:) ! g^T*h*g
2529  real(dp), intent(in) :: Sdd(:,:) ! g^T*h*g
2530
2531  !**** INOUT ***********************************!
2532
2533  real(dp), intent(inout) :: coeff(0:4) ! coeffs. of the quartic equation
2534  real(dp), intent(inout) :: SdT(:,:)   ! work matrix
2535
2536  !**** LOCAL ***********************************!
2537
2538#ifdef MPI
2539  integer :: info
2540#else
2541  integer :: i, j
2542#endif
2543
2544  real(dp) :: Tr_loc
2545  real(dp) :: TrH
2546  real(dp) :: TrHS
2547  real(dp) :: TrHd
2548  real(dp) :: TrHdS
2549  real(dp) :: TrHSd
2550  real(dp) :: TrHdd
2551  real(dp) :: TrHddS
2552  real(dp) :: TrHSdd
2553  real(dp) :: TrHdSd
2554  real(dp) :: TrHdSdT
2555  real(dp) :: TrHddSd
2556  real(dp) :: TrHdSdd
2557  real(dp) :: TrHddSdd
2558  real(dp), external :: ddot
2559#ifdef MPI
2560  real(dp), external :: pdlatra
2561#endif
2562
2563  !**********************************************!
2564
2565  call timer('m_calc_coeff',1)
2566
2567#ifdef MPI
2568  TrH=pdlatra(N_occ,H,1,1,desc2(1:9,ispin))
2569  Tr_loc=ddot(N_occ_loc(1,ispin)*N_occ_loc(2,ispin),H,1,S,1)
2570  call mpi_allreduce(Tr_loc,TrHS,1,mpi_double_precision,mpi_sum,mpi_comm_world,info)
2571
2572  TrHd=pdlatra(N_occ,Hd,1,1,desc2(1:9,ispin))
2573  Tr_loc=ddot(N_occ_loc(1,ispin)*N_occ_loc(2,ispin),Hd,1,S,1)
2574  call mpi_allreduce(Tr_loc,TrHdS,1,mpi_double_precision,mpi_sum,mpi_comm_world,info)
2575  Tr_loc=ddot(N_occ_loc(1,ispin)*N_occ_loc(2,ispin),H,1,Sd,1)
2576  call mpi_allreduce(Tr_loc,TrHSd,1,mpi_double_precision,mpi_sum,mpi_comm_world,info)
2577
2578  TrHdd=pdlatra(N_occ,Hdd,1,1,desc2(1:9,ispin))
2579  Tr_loc=ddot(N_occ_loc(1,ispin)*N_occ_loc(2,ispin),Hdd,1,S,1)
2580  call mpi_allreduce(Tr_loc,TrHddS,1,mpi_double_precision,mpi_sum,mpi_comm_world,info)
2581  Tr_loc=ddot(N_occ_loc(1,ispin)*N_occ_loc(2,ispin),H,1,Sdd,1)
2582  call mpi_allreduce(Tr_loc,TrHSdd,1,mpi_double_precision,mpi_sum,mpi_comm_world,info)
2583  call pdtran(N_occ,N_occ,1.0_dp,Sd,1,1,desc2(1:9,ispin),0.0_dp,SdT,1,1,desc2(1:9,ispin))
2584  Tr_loc=ddot(N_occ_loc(1,ispin)*N_occ_loc(2,ispin),Hd,1,SdT,1)
2585  call mpi_allreduce(Tr_loc,TrHdSd,1,mpi_double_precision,mpi_sum,mpi_comm_world,info)
2586  Tr_loc=ddot(N_occ_loc(1,ispin)*N_occ_loc(2,ispin),Hd,1,Sd,1)
2587  call mpi_allreduce(Tr_loc,TrHdSdT,1,mpi_double_precision,mpi_sum,mpi_comm_world,info)
2588
2589  Tr_loc=ddot(N_occ_loc(1,ispin)*N_occ_loc(2,ispin),Hdd,1,Sd,1)
2590  call mpi_allreduce(Tr_loc,TrHddSd,1,mpi_double_precision,mpi_sum,mpi_comm_world,info)
2591  Tr_loc=ddot(N_occ_loc(1,ispin)*N_occ_loc(2,ispin),Hd,1,Sdd,1)
2592  call mpi_allreduce(Tr_loc,TrHdSdd,1,mpi_double_precision,mpi_sum,mpi_comm_world,info)
2593
2594  Tr_loc=ddot(N_occ_loc(1,ispin)*N_occ_loc(2,ispin),Hdd,1,Sdd,1)
2595  call mpi_allreduce(Tr_loc,TrHddSdd,1,mpi_double_precision,mpi_sum,mpi_comm_world,info)
2596#else
2597  TrH=0.0_dp
2598  do i=1,N_occ
2599    TrH=TrH+H(i,i)
2600  end do
2601  TrHS=ddot(N_occ*N_occ,H,1,S,1)
2602
2603  TrHd=0.0_dp
2604  do i=1,N_occ
2605    TrHd=TrHd+Hd(i,i)
2606  end do
2607  TrHdS=ddot(N_occ*N_occ,Hd,1,S,1)
2608  TrHSd=ddot(N_occ*N_occ,H,1,Sd,1)
2609
2610  TrHdd=0.0_dp
2611  do i=1,N_occ
2612    TrHdd=TrHdd+Hdd(i,i)
2613  end do
2614  TrHddS=ddot(N_occ*N_occ,Hdd,1,S,1)
2615  TrHSdd=ddot(N_occ*N_occ,H,1,Sdd,1)
2616  do i=1,N_occ
2617    do j=1,N_occ
2618      SdT(j,i)=Sd(i,j)
2619    end do
2620  end do
2621  TrHdSd=ddot(N_occ*N_occ,Hd,1,SdT,1)
2622  TrHdSdT=ddot(N_occ*N_occ,Hd,1,Sd,1)
2623
2624  TrHddSd=ddot(N_occ*N_occ,Hdd,1,Sd,1)
2625  TrHdSdd=ddot(N_occ*N_occ,Hd,1,Sdd,1)
2626
2627  TrHddSdd=ddot(N_occ*N_occ,Hdd,1,Sdd,1)
2628#endif
2629
2630  coeff(0)=2.0_dp*TrH-TrHS
2631  coeff(1)=2.0_dp*(2.0_dp*TrHd-TrHdS-TrHSd)
2632  coeff(2)=2.0_dp*(TrHdd-TrHdSd-TrHdSdT)-TrHddS-TrHSdd
2633  coeff(3)=-2.0_dp*(TrHddSd+TrHdSdd)
2634  coeff(4)=-TrHddSdd
2635
2636  call timer('m_calc_coeff',2)
2637
2638end subroutine calc_coeff
2639
2640!================================================!
2641! random number generator                        !
2642! -initialize with:                              !
2643!  call rand_init()                              !
2644! -generate new number with:                     !
2645!  call random_numer(rn)                         !
2646!  where where rn is a real(dp) variable         !
2647!================================================!
2648subroutine rand_init
2649  implicit none
2650
2651  !**** LOCAL ***********************************!
2652
2653  character(10) :: system_time
2654
2655  integer :: i
2656  integer :: rand_size
2657  integer, allocatable :: rand_seed(:)
2658
2659  real(dp) :: rtime
2660  real(dp) :: rn
2661
2662  !**********************************************!
2663
2664  call random_seed(size=rand_size)
2665  allocate(rand_seed(1:rand_size))
2666  call date_and_time(time=system_time)
2667  read (system_time,*) rtime
2668  !rand_seed=(Node+1)*int(rtime*1000.0_dp)
2669  rand_seed=(Node+1)*123456
2670  call random_seed(put=rand_seed)
2671  deallocate(rand_seed)
2672
2673  do i=1,10000
2674    call random_number(rn)
2675  end do
2676
2677end subroutine rand_init
2678
2679end module m_dminim
2680