1!================================================================================
2!
3! Module:
4!
5! (1) mtxel_cor()          Originally By ?       Last Modified 8/15/2015 (FHJ)
6!
7!     Computes the correlation contribution to the QP self-energy with
8!     different approaches (SX+CH, or COR directly), and with different levels
9!     of approximations to the dynamical effects of W (static, GPP, real-axis
10!     integration and contour deformation).
11!
12!     More specifically, this routine computes the contribution from the
13!     current processor to the correlation part of the self-energy operator
14!     between bands `n` (sig%diag(in)) and `m` (sig%diag(im)) at the frequency
15!     of the band `l` (sig%diag(il)).
16!
17!     Screened exchange is put into asxt(iw), Coulomb hole into acht(iw),
18!     iw goes from 1 to 3 (see below for what it is).
19!
20!     On entry, asxt(:) and acht(:) are zeroed.
21!
22!     Physically, this routine will compute:
23!
24!     < psi_n ( k ) | Sigma_cor ( E ) | psi_m ( k ) >
25!
26!     where Sigma_cor(E) = Sigma(E) - Sigma_X(E). We compute the retarded
27!     self-energy operator, except for contour-deformation calculations,
28!     in which case we compute the time-ordered Sigma.
29!
30!     Sigma_cor(E) is computed within different approximations:
31!     - static COHSEX approximation, both exact and partial sum CH (gsm)
32!     - Generalized Plasmon Pole model (?)
33!     - full frequency dependent inverse dielectric matrix (CDS/CHP/gsm)
34!
35!     ch2 variables are used to calculate the CH part of the imaginary part
36!     of the self energy assuming a zero energy broadening in the energy
37!     denominator for the evaluation of the self energy (CHP).
38!
39!================================================================================
40
41#include "f_defs.h"
42
43module mtxel_cor_m
44
45  use global_m
46  use misc_m
47  use blas_m
48  use timing_m, only: timing => sigma_timing
49
50  implicit none
51
52  private
53
54  public :: mtxel_cor
55
56contains
57
58subroutine mtxel_cor(in,il,ispin,ncouls,neps,gvec,eps,ph,ind, &
59  indinv,isrtrqi,isrtrq,vcoul,crys,sig,wpg,wfnk,wfnkq,ncoulch, &
60  aqsn,aqsm,aqsch,asigt_imag,acht_n1,asxt,acht,achtcor,achtcor_n1,nspin,qk, &
61  coulfact,inv_igp_index,ngpown,epsR,epsA, &
62  achtD_n1,asxtDyn,achtDyn,achtDyn_cor,achtDyn_corb,ach2tDyn,icalc)
63
64  integer, intent(in) :: in,il,ispin,ncouls,neps
65  type (gspace), intent(in) :: gvec
66  !> (neps,ngpown) Uninitialized unless we are running the complex version
67  SCALAR, pointer, intent(in) :: eps(:,:)
68  SCALAR, intent(in) :: ph(:) !< (gvec%ng)
69  integer, intent(in) :: ind(:), indinv(:), isrtrq(:) !< (gvec%ng)
70  !> (gvec%ng) Uninitialized unless freq_dep=0 .or. exact_ch=1
71  integer, pointer, intent(in) :: isrtrqi(:)
72  real(DP), intent(in) :: vcoul(:) !< (ncoul)
73  type (crystal), intent(in) :: crys
74  type (siginfo), intent(in) :: sig
75  type (wpgen), intent(in) :: wpg
76  type (wfnkstates), intent(in) :: wfnk
77  type (wfnkqstates), intent(in) :: wfnkq
78  integer, intent(in) :: ncoulch
79  SCALAR, intent(in) :: aqsn(:,:), aqsm(:,:) !< (peinf%ntband_max,ncoul)
80  !> (ncoulch) Uninitialized unless freq_dep=0 .or. exact_ch=1
81  SCALAR, pointer, intent(in) :: aqsch(:)
82  !> (sig%ntband) Uninitialized in FF calculations
83  SCALAR, pointer, intent(out) :: acht_n1(:)
84  !> (3 or sig%nfreqeval) Uninitialized unless freq_dep=1
85  SCALAR, pointer, intent(out) :: asxt(:), acht(:)
86! FHJ: static remainder, resolved over bands
87  SCALAR, pointer, intent(out) :: achtcor_n1(:)
88  SCALAR, intent(out) :: achtcor
89  SCALAR, intent(out) :: asigt_imag
90  integer, intent(in) :: nspin
91  real(DP), intent(in) :: qk(:) !< (3)
92  real(DP), intent(in) :: coulfact
93  integer, intent(in) :: inv_igp_index(:) !< (neps)
94  integer, intent(in) :: ngpown
95  !> The following pointers are uninitialized unless we are running the complex version
96  complex(DPC), pointer, intent(in) :: epsR(:,:,:),epsA(:,:,:) !< (neps,ngpown,sig%nFreq)
97  complex(DPC), pointer, intent(out) :: achtD_n1(:) !< (sig%ntband)
98  complex(DPC), pointer, intent(out) :: asxtDyn(:), achtDyn(:), achtDyn_cor(:), ach2tDyn(:) !< (sig%nfreqeval)
99  complex(DPC), pointer, intent(out) :: achtDyn_corb(:) !< (sig%nfreqeval)
100  integer, intent(in) :: icalc
101
102  SCALAR, allocatable :: epstemp(:), aqsntemp(:,:),aqsmtemp(:,:)
103  complex(DPC), allocatable :: epsRtemp(:,:),epsAtemp(:,:)
104  SCALAR, allocatable:: acht_n1_loc(:)
105
106  integer :: my_igp, indigp, ipe, j, iout, my_id
107  real(DP) :: diff, diffmin
108  logical :: flag_occ
109  integer :: ig,igp,igpp,igpp2,iw,iwlda,n1,iband,n1true,nstart,nend,gpp(3)
110  real(DP), allocatable :: wx_array(:)
111! chs - partial sum static CH, chx - exact static CH
112  SCALAR :: achstemp, achxtemp, schx, epsggp, I_epsggp, achxtemp_gp, Omega2, wtilde2
113  SCALAR, allocatable :: asxtemp(:),achtemp(:)
114  SCALAR, allocatable :: wpmtx(:),I_eps_array(:,:)
115  real(DP) :: e_lk, e_n1kq, lambda, phi, freq0, wx, occ
116  complex(DPC) :: wtilde, wtilde2_temp, ctemp
117  complex(DPC), allocatable :: wtilde_array(:,:)
118
119! full-frequency
120  integer :: ifreq, nfreq_real
121  real(DP) :: E_max, pref_zb, freqStart, freqEnd
122  real(DP), allocatable :: pref(:), wxi(:)
123  complex(DPC), allocatable :: asxDtemp(:), achDtemp(:), ach2Dtemp(:), achDtemp_cor(:)
124  complex(DPC), allocatable :: achDtemp_corb(:), sch2Di(:), ssxDi(:)
125  complex(DPC), allocatable :: schDi(:), schDi_cor(:), schDi_corb(:)
126  complex(DPC) :: ssxDit,ssxDitt,ssxDittt,schDt,schDtt
127  complex(DPC), allocatable :: epsRggp(:),I_epsRggp(:)
128  complex(DPC), allocatable :: I_epsR_array(:,:,:),I_epsA_array(:,:,:)
129  SCALAR, allocatable :: I_eps_imag(:,:,:)
130  SCALAR :: mcph
131
132! subspace global variables
133  logical :: do_sigma_subspace
134  integer :: actual_nm
135  integer :: ngpown_sub_max, Nbas_own_max
136  integer :: ngpown_sub,  Nbas_own, wing_pos
137  integer, pointer :: eps_sub_info(:,:)
138  complex(DPC), pointer :: eigenvec_sub(:,:)
139  complex(DPC), pointer :: eps_sub(:,:,:)
140  complex(DPC), pointer :: eps_wings_correction_rows(:,:)
141  complex(DPC), pointer :: eps_wings_correction_cols(:,:)
142  real(DP), pointer :: vcoul_sub(:)
143! subspace local variables
144  logical :: my_do_subspace
145  logical :: fix_wings, fix_wings_res
146  integer :: my_G_start, my_G_end, my_G
147  integer :: my_Nb_start, my_Nb_end, my_Nb, Nb_tot, my_ib
148  integer :: wing_pos_igp
149  integer, allocatable :: ipe_2_LocSubSize(:,:)
150  complex(DPC), allocatable :: wings_Re(:,:,:)  !(:,:,1/2) 1=row, 2=col
151  complex(DPC), allocatable :: Re_eps_sub(:,:,:)
152  complex(DPC) :: wings_Im_elem
153  SCALAR, allocatable :: wings_Im(:,:,:)
154  SCALAR, allocatable :: Im_eps_sub(:,:,:)
155  SCALAR, allocatable :: Msub_m(:,:), Msub_n(:,:)
156  SCALAR, allocatable :: Caux(:,:), Caux_send(:,:), Caux_rec(:,:), Maux(:,:)
157  SCALAR, allocatable :: n_q_order(:,:), m_q_order(:,:)
158  SCALAR, allocatable :: Msub_m_temp(:,:), Msub_n_temp(:,:)
159  ! non-blocking cyclic communication
160  integer :: ipe_inx
161  integer :: isend_static, irec_static
162  integer :: actual_send, actual_rec
163  integer :: req_send_n, tag_send_n, req_rec_n, tag_rec_n
164  integer :: req_send_m, tag_send_m, req_rec_m, tag_rec_m
165  integer :: ib_start, ib_end, ib_size, my_num_band
166#ifdef MPI
167  integer :: stat(MPI_STATUS_SIZE)
168#endif
169
170  complex(DPC), allocatable :: mat_1(:,:), mat_2(:,:)
171
172  PUSH_SUB(mtxel_cor)
173
174  my_do_subspace = .false.
175  if(sig%do_sigma_subspace) then
176    ! check if all pointers are assosiated
177    if(associated(sig%epssub%eps_sub_info) .and. &
178       associated(sig%epssub%eigenvec_sub) .and. &
179       associated(sig%epssub%eps_sub) .and. &
180       associated(sig%epssub%eps_wings_correction_rows) .and. &
181       associated(sig%epssub%eps_wings_correction_cols) .and. &
182       associated(sig%epssub%vcoul_sub)) then
183      my_do_subspace = sig%do_sigma_subspace
184    end if
185  end if
186
187  if(my_do_subspace) then
188    !
189    actual_nm = sig%epssub%actual_nm
190    nullify(eps_sub_info, eigenvec_sub, eps_sub, &
191            eps_wings_correction_rows, eps_wings_correction_cols, &
192            vcoul_sub)
193    eps_sub_info               => sig%epssub%eps_sub_info(:,:,actual_nm)
194    eigenvec_sub               => sig%epssub%eigenvec_sub(:,:,actual_nm)
195    eps_sub                    => sig%epssub%eps_sub(:,:,:,actual_nm)
196    eps_wings_correction_rows  => sig%epssub%eps_wings_correction_rows(:,:)
197    eps_wings_correction_cols  => sig%epssub%eps_wings_correction_cols(:,:)
198    vcoul_sub                  => sig%epssub%vcoul_sub(:,actual_nm)
199    !
200    wing_pos       = sig%epssub%wing_pos(actual_nm)
201    !
202    ngpown_sub_max = sig%epssub%ngpown_sub_max
203    Nbas_own_max   = sig%epssub%Nbas_own_max
204    ngpown_sub     = sig%epssub%ngpown_sub
205    Nbas_own       = sig%epssub%Nbas_own
206    !
207    my_Nb_start = eps_sub_info(1,1)
208    my_Nb_end   = eps_sub_info(2,1)
209    Nb_tot      = eps_sub_info(3,1)
210    my_Nb = my_Nb_end - my_Nb_start + 1
211    !
212    my_G_start  = eps_sub_info(1,2)
213    my_G_end    = eps_sub_info(2,2)
214    my_G = my_G_end - my_G_start + 1
215
216    SAFE_ALLOCATE(ipe_2_LocSubSize, (3, peinf%npes_pool))
217    ipe_2_LocSubSize = 0
218    ipe_2_LocSubSize(1,peinf%pool_rank+1) = my_Nb_start
219    ipe_2_LocSubSize(2,peinf%pool_rank+1) = my_Nb_end
220    ipe_2_LocSubSize(3,peinf%pool_rank+1) = my_Nb
221#ifdef MPI
222    call MPI_Allreduce(MPI_IN_PLACE, ipe_2_LocSubSize(:,:), &
223                       3 * peinf%npes_pool, MPI_INTEGER, MPI_SUM, &
224                       peinf%pool_comm, mpierr)
225#endif
226
227    if( (my_Nb_start .ne. my_G_start) .or. &
228        (my_Nb_end   .ne. my_G_end) .or. &
229        (my_Nb       .ne. my_G) ) then
230      call die("BUG: Subspace Epsilon and eigenvector distribution don't match!")
231    end if
232
233
234    !XX SAFE_ALLOCATE(Caux, (ncouls,Nb_tot))
235    !XX !XXXX replicate
236    !XX Caux = ZERO
237    !XX if(my_G_start > 0) then
238    !XX   ! Caux(my_G_start:my_G_end,1:Nb_tot) = eigenvec_sub(1:my_G,1:Nb_tot)
239    !XX   Caux(1:ncouls,my_G_start:my_G_end) = eigenvec_sub(1:ncouls,1:my_G)
240    !XX end if
241    !XX call MPI_Allreduce(MPI_IN_PLACE, Caux(:,:), &
242    !XX                    ncouls * Nb_tot, MPI_COMPLEX_DPC, MPI_SUM, &
243    !XX                    peinf%pool_comm, mpierr)
244
245    ! matrix elements in subspace
246    SAFE_ALLOCATE(Msub_m, (Nb_tot,peinf%ntband_max))
247    SAFE_ALLOCATE(Msub_n, (Nb_tot,peinf%ntband_max))
248  end if
249
250  if(sig%freq_dep .eq. -1) then
251    call die("BUG: cannot call mtxel_cor for Hartree-Fock!")
252  endif
253  nfreq_real = sig%nFreq - sig%nfreq_imag
254
255  SAFE_ALLOCATE(acht_n1_loc, (sig%ntband))
256  if (sig%freq_dep/=0 .and. sig%exact_ch==1) then
257    achtcor_n1 = ZERO
258  endif
259
260! Initialize Output Arrays
261! SIB: Zero contribution to asx(n) and ach(n) for this irq
262
263  if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.3) then
264    asxt(:) = ZERO
265    acht(:) = ZERO
266    acht_n1(:) = ZERO
267    acht_n1_loc(:) = ZERO
268  elseif (sig%freq_dep.eq.2) then
269    asxtDyn(:) = (0.0d0,0.0d0)
270    achtDyn(:) = (0.0d0,0.0d0)
271    achtDyn_cor(:) = (0.0d0,0.0d0)
272    achtDyn_corb(:) = (0.0d0,0.0d0)
273    ach2tDyn(:) = (0.0d0,0.0d0)
274    achtD_n1(:) = (0.0d0,0.0d0)
275  endif
276  achtcor = ZERO
277  asigt_imag = ZERO
278
279  if (peinf%my_pool .eq. -1) then
280    POP_SUB(mtxel_cor)
281    return
282  endif
283
284
285!-----------------------
286! Initialization for full-frequency CH integral
287  call timing%start(timing%m_cor_init)
288  if (sig%freq_dep.eq.2) then
289    SAFE_ALLOCATE(pref, (sig%nFreq))
290#ifdef CPLX
291    pref_zb = 0.5D0 / PI_D
292#else
293    pref_zb = 1D0 / PI_D
294#endif
295    do ifreq=1,sig%nFreq
296      if (ifreq .lt. sig%nFreq) then
297        pref(ifreq)=(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))/PI_D
298      else
299        pref(ifreq)=pref(ifreq-1)
300      endif
301    enddo
302    pref(1)=pref(1)*0.5d0
303    pref(sig%nFreq)=pref(sig%nFreq)*0.5d0
304#ifdef CPLX
305    do ifreq=1,sig%nFreq
306      pref(ifreq)=pref(ifreq)*0.5d0
307    enddo
308#endif
309    E_max=sig%dFreqGrid(sig%nFreq)
310    SAFE_ALLOCATE(asxDtemp, (sig%nfreqeval))
311    asxDtemp = 0D0
312    SAFE_ALLOCATE(achDtemp, (sig%nfreqeval))
313    achDtemp = 0D0
314    SAFE_ALLOCATE(achDtemp_cor, (sig%nfreqeval))
315    achDtemp_cor = 0D0
316    SAFE_ALLOCATE(achDtemp_corb, (sig%nfreqeval))
317    achDtemp_corb = 0D0
318    SAFE_ALLOCATE(ach2Dtemp, (sig%nfreqeval))
319    ach2Dtemp = 0D0
320    SAFE_ALLOCATE(schDi, (sig%nfreqeval))
321    schDi=0D0
322    SAFE_ALLOCATE(schDi_cor, (sig%nfreqeval))
323    schDi_cor=0D0
324    SAFE_ALLOCATE(schDi_corb, (sig%nfreqeval))
325    schDi_corb=0D0
326    SAFE_ALLOCATE(sch2Di, (sig%nfreqeval))
327    sch2Di=0D0
328    SAFE_ALLOCATE(ssxDi, (sig%nfreqeval))
329    ssxDi=0D0
330    SAFE_ALLOCATE(wxi, (sig%nfreqeval))
331    wxi=0D0
332    if(.not. (my_do_subspace)) then
333      ! in the subspace case we allocate this later
334      SAFE_ALLOCATE(I_epsR_array, (ncouls,ngpown,nfreq_real))
335      if (sig%need_advanced) then
336        SAFE_ALLOCATE(I_epsA_array, (ncouls,ngpown,nfreq_real))
337      endif
338      if (sig%freq_dep_method==2) then
339        SAFE_ALLOCATE(I_eps_imag, (ncouls,ngpown,sig%nfreq_imag))
340      endif
341    end if ! .not.(my_do_subspace)
342  else if (sig%freq_dep.eq.1) then
343    SAFE_ALLOCATE(I_eps_array, (ncouls,ngpown))
344    SAFE_ALLOCATE(wtilde_array, (ncouls,ngpown))
345!$OMP PARALLEL DO private(j)
346    do j = 1, ngpown
347      I_eps_array(:,j)=ZERO
348      wtilde_array(:,j)=(0D0,0D0)
349    enddo
350!$OMP END PARALLEL DO
351  else if (sig%freq_dep.eq.3) then
352    SAFE_ALLOCATE(I_eps_array, (ncouls,ngpown))
353    SAFE_ALLOCATE(wtilde_array, (ncouls,ngpown))
354    I_eps_array(:,:)=ZERO
355    wtilde_array(:,:)=(0D0,0D0)
356  else
357    SAFE_ALLOCATE(I_eps_array, (ncouls,ngpown))
358  endif
359
360  if(my_do_subspace) then
361     !XXX ! a bit stupid, just deallocate and reallocate
362     !XXX SAFE_DEALLOCATE(I_epsR_array)
363     !XXX SAFE_DEALLOCATE(I_eps_imag)
364     SAFE_ALLOCATE(I_epsR_array,  (ncouls,ngpown,1))
365     SAFE_ALLOCATE(I_eps_imag, (1,1,1))
366     !
367     SAFE_ALLOCATE(Im_eps_sub, (Nb_tot,my_Nb,sig%nfreq_imag))
368     SAFE_ALLOCATE(Re_eps_sub, (Nb_tot,my_Nb,nfreq_real))
369     Im_eps_sub = ZERO
370     Re_eps_sub = (0D0,0D0)
371     SAFE_ALLOCATE(wings_Im, (ncouls, sig%nfreq_imag, 2))
372     SAFE_ALLOCATE(wings_Re, (ncouls, nfreq_real, 2))
373     wings_Im = ZERO
374     wings_Re = (0D0,0D0)
375     ! reordered matrix elements
376     SAFE_ALLOCATE(n_q_order, (ncouls,peinf%ntband_max))
377     SAFE_ALLOCATE(m_q_order, (ncouls,peinf%ntband_max))
378     n_q_order = ZERO
379     m_q_order = ZERO
380  end if
381
382! SIB: the array wfnk%ek has bands indexed by sig%diag
383! take the right band index
384! DVF : sig%ncore_excl has to be substracted here because wfnk%ek is defined in the
385! read_wavefunction subroutine in input.f90 to be referenced to the case with
386! no core states.
387
388  iband = sig%diag(il)
389  e_lk = wfnk%ek(iband-sig%ncore_excl,ispin)
390  ! FHJ: Figure out starting frequency for freq. grid
391  if (sig%freq_grid_shift<2) then
392    freq0 = sig%freqevalmin
393  else
394    freq0 = e_lk - sig%freqevalstep*(sig%nfreqeval-1)/2
395    if (sig%freq_dep==2) then
396      !FHJ: Avoid hitting a pole
397      freq0 = freq0 + TOL_SMALL
398    endif
399  endif
400
401! JRD: Initial frequencies for plasmon pole case.
402! Note: below assumes that we will use iw=1,2,3 in
403! subroutine shiftenergy depending on value of sig%fdf
404
405  if (sig%freq_dep.eq.1 .or. sig%freq_dep.eq.0.or.sig%freq_dep.eq.3) then
406    if (sig%fdf.eq.-3) then
407      SAFE_ALLOCATE(asxtemp,(sig%nfreqeval))
408      SAFE_ALLOCATE(achtemp,(sig%nfreqeval))
409    else
410      SAFE_ALLOCATE(asxtemp,(3))
411      SAFE_ALLOCATE(achtemp,(3))
412    endif
413    asxtemp(:) = ZERO
414    achtemp(:) = ZERO
415  endif
416
417  if (sig%freq_dep.eq.1.or.sig%freq_dep.eq.3) then
418    if (sig%fdf.eq.-3) then
419      nstart=1
420      nend=sig%nfreqeval
421      SAFE_ALLOCATE(wxi, (sig%nfreqeval))
422      SAFE_ALLOCATE(wx_array, (sig%nfreqeval))
423      do iw=1,sig%nfreqeval
424        wx = freq0 + (iw-1)*sig%freqevalstep
425        wxi(iw) = wx
426      enddo
427    elseif (sig%fdf.eq.-1) then
428      nstart = 1
429      nend = 2
430      SAFE_ALLOCATE(wx_array, (3))
431    elseif (sig%fdf.eq.0) then
432      nstart = 1
433      nend = 3
434      SAFE_ALLOCATE(wx_array, (3))
435    elseif (sig%fdf.eq.1.or.(sig%fdf.eq.2.and.icalc.eq.1)) then
436      nstart = 2
437      nend = 3
438      SAFE_ALLOCATE(wx_array, (3))
439    else
440      nstart = 2
441      nend = 2
442      SAFE_ALLOCATE(wx_array, (3))
443    endif
444  endif
445
446  call timing%stop(timing%m_cor_init)
447!-------------- Main loop over G` (igp) ---------------------------------------
448  call timing%start(timing%m_cor_epsinit)
449  achxtemp = ZERO
450
451! In below OMP region:
452! I_eps_array, I_epsR_array, I_epsA_array, I_eps_imag, wtilde_array are shared
453
454!$OMP PARALLEL private (ig,epsggp,I_epsggp,gpp,iout,schx,igpp,igpp2,achxtemp_gp,igp,indigp, &
455!$OMP                      epstemp,epsRtemp,I_epsRggp,epsRggp, &
456#ifdef CPLX
457!$OMP                      epsAtemp,wtilde2_temp,lambda,phi, &
458#endif
459!$OMP                      wpmtx,wtilde,wtilde2,Omega2,iw,mcph,ctemp,wings_Im_elem)
460
461#ifdef OMP
462  my_id = omp_get_thread_num()
463#else
464  my_id = 0
465#endif
466
467! Allocate Temporary Arrays
468
469  select case(sig%freq_dep)
470  case(0)
471    SAFE_ALLOCATE(epstemp, (neps))
472  case(1)
473    SAFE_ALLOCATE(epstemp, (neps))
474    SAFE_ALLOCATE(wpmtx, (neps))
475  case(2)
476    SAFE_ALLOCATE(epstemp, (neps))
477  case(3)
478    SAFE_ALLOCATE(epstemp, (neps))
479    SAFE_ALLOCATE(epsRtemp, (neps,sig%nFreq))
480    SAFE_ALLOCATE(epsRggp, (sig%nFreq))
481    SAFE_ALLOCATE(I_epsRggp, (sig%nFreq))
482  end select
483
484!$OMP DO reduction(+:achxtemp)
485  do my_igp=1,ngpown
486
487    indigp = inv_igp_index(my_igp)
488    igp = indinv(indigp)
489
490    if (igp .gt. ncouls) write(6,*) "CATASTROPHE", peinf%inode, my_igp, igp
491    if (igp .gt. ncouls .or. igp .le. 0) cycle
492
493!!------------- Initialize eps^-1 for this G` ---------------------------------
494
495    select case(sig%freq_dep)
496    case(0,1)
497      epstemp(:)=eps(:,my_igp)
498    case(2)
499      epstemp(:)=SCALARIFY(epsR(:,my_igp,1))
500    case(3)
501      epstemp(:)=SCALARIFY(epsR(:,my_igp,1))
502      epsRtemp(:,:)=epsR(:,my_igp,:)
503
504    end select
505
506!------------------------------------------------------------------------------
507! (gsm) Below you`ll find the code to compute SX & CH in the static COHSEX
508!       approximation (both the exact and partial sum expressions for CH),
509!       within the GPP model, and using the full frequency dependent RPA
510!       inverse dielectric matrix. The GPP section of the code is well
511!       commented (thanks to Sohrab), while the COHSEX and RPA sections
512!       are not... But most of the variables are the same anyways,
513!       so just look at the comments in the GPP section.
514!------------------------------------------------------------------------------
515
516! (gsm) <<<<<< static COHSEX approximation - exact static CH >>>>>>
517
518! < n k | \Sigma_{CH} (r, r`; 0) | m k > =
519! \frac{1}{2} \sum_{q G G`}
520! < n k | e^{i (G - G`) \cdot r} | m k >
521! [\eps_{G G`}^{-1} (q; 0) - \delta_{G G`}] v (q + G`)
522
523    if (sig%freq_dep.eq.0.or.sig%exact_ch.eq.1) then
524
525! Only Computed on One Processor Per Pool
526
527      ! JRD: Since, we are distributed over igp now, all procs need to do this
528
529      achxtemp_gp = ZERO
530
531      do ig=1,ncouls
532
533        epsggp = ph(ig)*MYCONJG(ph(igp))*SCALARIFY(epstemp(ind(ig)))
534
535        if (ig.eq.igp) then
536          I_epsggp = epsggp - 1.0d0
537        else
538          I_epsggp = epsggp
539        endif
540        if (abs(I_epsggp).lt.TOL_Small) cycle
541        if (ig.ne.igp) then
542          gpp(:)=gvec%components(:,isrtrq(ig))-gvec%components(:,isrtrq(igp))
543          call findvector(iout,gpp,gvec)
544          if (iout.eq.0) cycle
545          igpp=isrtrqi(iout)
546          if (igpp.lt.1.or.igpp.gt.ncoulch) cycle
547          gpp(:)=gvec%components(:,isrtrq(igp))-gvec%components(:,isrtrq(ig))
548          call findvector(iout,gpp,gvec)
549          if (iout.eq.0) cycle
550          igpp2=isrtrqi(iout)
551          if (igpp2.lt.1.or.igpp2.gt.ncoulch) cycle
552        else
553          gpp(:)=0
554          call findvector(iout,gpp,gvec)
555          if (iout.eq.0) cycle
556          igpp=isrtrqi(iout)
557          if (igpp.lt.1.or.igpp.gt.ncoulch) cycle
558        endif
559        schx = aqsch(igpp) * I_epsggp
560        achxtemp_gp = achxtemp_gp + schx
561      enddo ! over G (ig)
562
563      achxtemp = achxtemp + achxtemp_gp * vcoul(igp) * 0.5d0
564
565    endif ! sig%freq_dep.eq.0.or.sig%exact_ch.eq.1
566
567!!-----------------------------------------------------------------------------------------
568! (gsm) <<<<<< static COHSEX approximation - CH as a partial sum over empty bands >>>>>>
569
570! < n k | \Sigma_{SX} (r, r`; 0) | m k > =
571! - \sum_{n_1}^{occ} \sum_{q G G`}
572! < n k | e^{i (q + G) \cdot r} | n_1 k - q >
573! < n_1 k - q | e^{- i (q + G`) \cdot r`} | m k >
574! [\eps_{G G`}^{-1} (q; 0) - \delta_{G G`}] v (q + G`)
575
576! < n k | \Sigma_{CH} (r, r`; 0) | m k > =
577! \frac{1}{2} \sum_{n_1} \sum_{q G G`}
578! < n k | e^{i (q + G) \cdot r} | n_1 k - q >
579! < n_1 k - q | e^{- i (q + G`) \cdot r`} | m k >
580! [\eps_{G G`}^{-1} (q; 0) - \delta_{G G`}] v (q + G`)
581
582
583    if (sig%freq_dep.eq.0) then
584
585      do ig=1,ncouls
586        epsggp = ph(ig)*MYCONJG(ph(igp))*epstemp(ind(ig))
587        if (ig.eq.igp) then
588          I_epsggp = epsggp - 1.0d0
589        else
590          I_epsggp = epsggp
591        endif
592
593        I_eps_array(ig,my_igp) = I_epsggp
594
595      enddo ! over G (ig)
596
597! (gsm) <<<<<< Generalized Plasmon Pole model >>>>>>
598
599    elseif (sig%freq_dep.eq.1) then
600
601! Zero out temporary accumulation variables
602
603!----------------------
604! Calculate Plasma Frequencies
605!
606! SIB: Here we get the plasmon-pole effective plasma frequecies
607! Omega(G,G`)^2 (equation (31) of Hybersten & Louie, PRB 34, 1986, p 5396)
608! which come in the vector wp(G) for current G`.
609
610! SIB: We calculate wp(G) for a given G` (trade-off for memory)
611! Note that wp(G) G=1,ncouls requires O(N) operations
612! Even if we redo it for each band n, not so bad
613
614! JRD: I changed this to use qk instead of qkk because we use vcoul at q=0
615! (instead of small q) throughout
616
617
618! given how many times this routine is called, timing it appears to take a non-negligible amount of time
619
620      call wpeff(crys,gvec,wpg,sig,neps,isrtrq,igp,ncouls,wpmtx,nspin,qk,vcoul,coulfact)
621
622!!------ For given G` (igp), loop over all G vectors (ig) in lower triangle ---
623
624      if ( my_id == 0 ) call timing%start(timing%m_cor_pp_prep)
625
626      do ig=1,ncouls
627
628! Put epsilon(G,G`) into epsggp
629
630        epsggp = ph(ig)*MYCONJG(ph(igp))*epstemp(ind(ig))
631
632! I_epsggp = Kronecker(G,G`) - eps(G,G`)
633
634        if (ig.eq.igp) then
635          I_epsggp = ONE - epsggp
636        else
637          I_epsggp = ZERO - epsggp
638        endif
639
640! If I_epsggp is too small, then we skip this (G,G`) entry
641! This only happens when eps is 1 on diagonal or 0 off diagonal
642! but, this means no screening correct and is already treated properly in bare
643! exchange term
644
645        if (abs(I_epsggp).lt.TOL_Small) cycle
646
647        I_eps_array(ig,my_igp) = I_epsggp
648
649! Omega2 = Omega(G,G`)^2 = effective plasma freq^2
650
651        Omega2 = wpmtx(ig)
652
653! If Omega2 is too small, then we skip this (G,G`) entry
654! JRD: I am not sure why we have to cycle here... :/ Probably not needed
655! FHJ: If Omega2->0, both the SX and the CH terms vanish. But the CH term
656! goes to zero as Omega2/wtilde ~ Omega2/sqrt(Omega2). Skipping this term is
657! probably better than risking a 0/sqrt(0) division.
658
659
660        if (abs(Omega2).lt.TOL_Small) cycle
661
662#ifdef CPLX
663
664! <<<<<<<<<<<< COMPLEX GPP >>>>>>>>>>>>
665
666! (gsm) equations (17), (20), (21) from [PRB 40, 3162 (1989)]
667
668        wtilde2_temp = Omega2 / I_epsggp
669
670        lambda = abs(wtilde2_temp)
671        if (lambda .lt. TOL_Small) cycle
672        phi = atan2(IMAG(wtilde2_temp), dble(wtilde2_temp))
673        if (abs(cos(phi)) .lt. TOL_Small) cycle
674        wtilde2 = lambda / cos(phi)
675! this is not needed because we recalculate Omega2 below
676!        Omega2 = Omega2 * CMPLX(1.0d0, -tan(phi))
677
678#else
679
680! <<<<<<<<<<<< REAL GPP >>>>>>>>>>>>
681
682! (gsm) equation (30) from [PRB 34, 5390 (1986)]
683
684        wtilde2 = Omega2 / I_epsggp
685
686        if (abs(wtilde2) .lt. TOL_Small) cycle
687
688#endif
689
690        !FHJ: What do we do if we find an invalid mode with wtilde2<0?
691        if (dble(wtilde2)<0) then
692          select case (sig%invalid_gpp_mode)
693            case (0) ! Skip invalid GPP mode and ignore its contribution to the self energy.
694              wtilde = (0d0,0d0)
695            case (1) ! "Find" a purely imaginary mode frequency (GSM & JRD).
696              wtilde = sqrt(COMPLEXIFY(wtilde2))
697            case (2) ! Set the GPP mode frequency to a fixed value of 2 Ry.
698              wtilde = CMPLX(2d0*ryd, 0d0)
699            case (3) ! Treat that mode within the static COHSEX approximation.
700              wtilde = CMPLX(1d0/TOL_ZERO, 0d0)
701            case default
702              wtilde = CMPLX(1d0/TOL_ZERO, 0d0)
703          endselect
704        else
705          wtilde = CMPLX(sqrt(dble(wtilde2)), 0d0)
706        endif
707        wtilde_array(ig,my_igp) = wtilde
708
709      enddo ! G Loop for GPP Setup
710
711      if (my_id.eq.0) call timing%stop(timing%m_cor_pp_prep)
712
713    elseif (sig%freq_dep.eq.3) then
714
715      if (my_id.eq.0) call timing%start(timing%m_cor_pp_prep)
716
717      do ig=1,ncouls
718
719! Put epsilon(G,G`) into epsRggp
720! JRD XXX This is bad. But it is FJR code, so who cares...
721        epsRggp(:) = ph(ig)*MYCONJG(ph(igp))*epsRtemp(ind(ig),:)
722
723! I_epsRggp = Kronecker(G,G`) - eps(G,G`)
724        if (ig.eq.igp) then
725          I_epsRggp(:) = ONE - epsRggp(:)
726        else
727          I_epsRggp(:) = ZERO - epsRggp(:)
728        endif
729
730! If I_epsggp is too small, then we skip this (G,G`) entry
731! This only happens when eps is 1 on diagonal or 0 off diagonal
732! but, this means no screening correct and is already treated properly in bare
733! exchange term
734
735        if (all(abs(I_epsRggp(:)).lt.TOL_Small)) cycle
736
737        I_eps_array(ig,my_igp) = SCALARIFY(I_epsRggp(1))
738        wtilde2 = abs(sig%dFreqBrd(2))**2 * I_epsRggp(2)/ ( I_epsRggp(1) - I_epsRggp(2) )
739
740        !FHJ: What do we do if we find an invalid mode with wtilde2<0?
741        if (dble(wtilde2)<0) then
742          select case (sig%invalid_gpp_mode)
743            case (0) ! Skip invalid mode and ignore its contribution to the self energy.
744              wtilde = (0d0,0d0)
745            case (1) ! "Find" a purely complex mode frequency (GSM & JRD).
746              wtilde = sqrt(COMPLEXIFY(wtilde2))
747            case (2) ! Set the mode frequency to a fixed value of 2 Ry.
748              wtilde = CMPLX(2d0*ryd, 0d0)
749            case (3) ! Treat that mode within the static COHSEX approximation
750              wtilde = CMPLX(1d0/TOL_ZERO, 0d0)
751            case default
752              wtilde = CMPLX(1d0/TOL_ZERO, 0d0)
753          endselect
754        else
755          wtilde = sqrt(dble(wtilde2))
756        endif
757        wtilde_array(ig,my_igp) = wtilde
758
759      enddo ! G Loop for GPP Setup
760
761      if (my_id.eq.0) call timing%stop(timing%m_cor_pp_prep)
762
763!!--------------------------------------------------------------------
764! (gsm) <<<<<< full frequency dependent inverse dielectric matrix >>>>>>
765
766! The code below makes use of the following relations:
767!
768! {eps_{G G`}^r}^{-1}(q, -E) = {eps_{G G`}^a}^{-1}(q, E)
769! for general systems (both real and complex versions of the code)
770!
771! {eps_{G G`}^a}^{-1}(q, E) = {{eps_{G G`}^r}^{-1}}^{*}(q, E)
772! for systems with inversion symmetry (the real version of the code)
773! since plane-wave matrix elements are real
774!
775! {eps_{G G}^a}^{-1}(q, E) = {{eps_{G G}^r}^{-1}}^{*}(q, E)
776! for general systems, the diagonal of the matrix (G` = G)
777! since plane-wave matrix elements are complex conjugates of each other
778!
779! The complex version of the code uses
780! {eps_{G G`}^r}^{-1}(q, E) and {eps_{G G`}^a}^{-1}(q, E) for E >= 0
781!
782! The real version of the code uses
783! {eps_{G G`}^r}^{-1}(q, E) for E >= 0
784
785! CHP: full frequency routine - the case for sig%ggpsum == 1
786!
787! On top of the above relations, we need the following:
788! Assuming that W_{G,G`}={eps_{G,G`}}^{-1} v(q+G`),
789!
790! W_{G`,G}^r(E) = {W_{G,G`}^a}^{*}(E)
791!               = {W_{G,G`}^r}^{*}(-E) (this form is used if E<0)
792! for general systems (both real and complex version of the code)
793!
794! W_{G`,G}^a(E) = {W_{G,G`}^r}^{*}(E)
795!               = {W_{G,G`}^a}^{*}(-E) (this form is used if E<0)
796! for general systems (both real and complex version of the code)
797!
798! W_{G`,G}^r(E) = W_{G,G`}^r(E)
799! for systems with inversion symmetry
800!
801! W_{G`,G}^a(E) = W_{G,G`}^a(E)
802! for systems with inversion symmetry
803!
804! Note that eps^{-1} does not have these symmetries.
805
806    endif
807  enddo
808!$OMP END DO
809
810  if (sig%freq_dep.eq.2) then
811    if (sig%freq_dep_method==0) then
812
813!$OMP DO
814      do iw = 1, sig%nFreq
815        do my_igp=1,ngpown
816
817          indigp = inv_igp_index(my_igp)
818          igp = indinv(indigp)
819
820          if (igp .gt. ncouls .or. igp .le. 0) cycle
821
822          mcph = MYCONJG(ph(igp))
823! JRD: XXX ind() may kill us here
824          do ig = 1, ncouls
825            I_epsR_array(ig,my_igp,iw) = - mcph * ph(ig) * epsR(ind(ig),my_igp,iw)
826          enddo
827          I_epsR_array(igp,my_igp,iw) = I_epsR_array(igp,my_igp,iw) + 1.0d0
828        enddo
829      enddo
830!$OMP END DO
831
832    else
833
834! JRD XXX
835! May want to add schedule dynamic here since iw > nfreq_real has more work. Or move OMP in one level
836!$OMP DO
837      do iw = 1, sig%nFreq
838        if (iw .le. nfreq_real) then
839
840          if(my_do_subspace) then
841            ! indigp = inv_igp_index(my_igp) -> local to global index for full Eps mat
842            ! indinv() -> from the global Epsilon index to the k+q indexing
843            ! ind() -> from the global k+q indexing go th the Epsilon indexing
844            ! ph -> phase in the k+q indexing
845            if(iw == 1) then
846              ! copy full epsilon only for the static case
847              I_epsR_array(:,:,1) = (0.0d0, 0.0d0)
848              do my_igp=1,ngpown
849                indigp = inv_igp_index(my_igp)
850                igp = indinv(indigp)
851
852                if (igp .gt. ncouls .or. igp .le. 0) cycle
853                !XX if (indigp .gt. ncouls .or. indigp .le. 0) cycle
854
855                mcph = MYCONJG(ph(igp))
856                do ig = 1, ncouls
857                  I_epsR_array(ig,my_igp,iw) = - mcph * ph(indinv(ig)) * epsR(ig, my_igp, iw)
858                enddo
859                I_epsR_array(indigp,my_igp,iw) = I_epsR_array(indigp,my_igp,iw) + 1.0d0
860
861              end do
862            end if
863
864            ! fix wings, here we go with the ordering driven by the epsilon matrix
865            ! each MPI task own the full wings correction, both row and col
866            wing_pos_igp = indinv(wing_pos)
867            do ig = 1, ncouls
868              ! row
869              ! transform eps -> k+q index
870              igp = indinv(ig)
871              wings_Re(ig,iw,1) = - eps_wings_correction_rows(ig,iw) * MYCONJG(ph(igp)) * ph(wing_pos_igp)
872              ! wings_Re(ig,iw,1) = wings_Re(ig,iw,1) * vcoul_sub(ig) / vcoul_sub(wing_pos)
873            end do
874            do ig = 1, ncouls
875              ! column
876              igp = indinv(ig)
877              wings_Re(ig,iw,2) = - eps_wings_correction_cols(ig,iw) * ph(igp) * MYCONJG(ph(wing_pos_igp))
878              ! wings_Re(ig,iw,2) = wings_Re(ig,iw,2) * vcoul_sub(wing_pos) / vcoul_sub(ig)
879            end do
880            ! and here the famous factor 1.0
881            wings_Re(wing_pos,iw,1) = wings_Re(wing_pos,iw,1) + 1.0d0
882            ! copy subspace epsilon
883            Re_eps_sub(1:Nb_tot,1:my_Nb,iw) = - eps_sub(1:Nb_tot,1:my_Nb,iw)
884
885          else ! my_do_subspace
886
887            do my_igp=1,ngpown
888
889              indigp = inv_igp_index(my_igp)
890              igp = indinv(indigp)
891
892              if (igp .gt. ncouls .or. igp .le. 0) cycle
893
894              mcph = MYCONJG(ph(igp))
895! JRD: XXX ind() may kill us here
896              do ig = 1, ncouls
897                I_epsR_array(ig,my_igp,iw) = - mcph * ph(ig) * epsR(ind(ig),my_igp,iw)
898              enddo
899              I_epsR_array(igp,my_igp,iw) = I_epsR_array(igp,my_igp,iw) + 1.0d0
900            enddo
901
902          end if ! my_do_subspace
903
904        endif
905        if (iw .gt. nfreq_real) then
906
907          if(my_do_subspace) then
908            ! fix wings
909            wing_pos_igp = indinv(wing_pos)
910            do ig = 1, ncouls
911              ! row
912              ! transform eps -> k+q index
913              igp = indinv(ig)
914              wings_Im_elem = - eps_wings_correction_rows(ig,iw) * MYCONJG(ph(igp)) * ph(wing_pos_igp)
915              wings_Im(ig,iw-nfreq_real,1) = SCALARIFY(wings_Im_elem)
916              !XXX wings_Im(ig,iw-nfreq_real,1) = wings_Im(ig,iw-nfreq_real,1) * vcoul_sub(ig) / vcoul_sub(wing_pos)
917            end do
918            do ig = 1, ncouls
919              ! column
920              igp = indinv(ig)
921              wings_Im_elem = - eps_wings_correction_cols(ig,iw) * ph(igp) * MYCONJG(ph(wing_pos_igp))
922              wings_Im(ig,iw-nfreq_real,2) = SCALARIFY(wings_Im_elem)
923              !XXX wings_Im(ig,iw-nfreq_real,2) = wings_Im(ig,iw-nfreq_real,2) * vcoul_sub(wing_pos) / vcoul_sub(ig)
924            end do
925            ! and here the famous factor 1.0
926            wings_Im(wing_pos,iw-nfreq_real,1) = wings_Im(wing_pos,iw-nfreq_real,1) + 1.0d0
927            ! copy subspace epsilon
928            Im_eps_sub(1:Nb_tot,1:my_Nb,iw-nfreq_real) = - SCALARIFY(eps_sub(1:Nb_tot,1:my_Nb,iw))
929          else ! my_do_subspace
930
931            do my_igp=1,ngpown
932
933              indigp = inv_igp_index(my_igp)
934              igp = indinv(indigp)
935
936              if (igp .gt. ncouls .or. igp .le. 0) cycle
937
938              mcph = MYCONJG(ph(igp))
939! JRD: XXX ind() may kill us here
940              do ig = 1, ncouls
941                ctemp = - mcph * ph(ig) * epsR(ind(ig),my_igp,iw)
942                I_eps_imag(ig,my_igp,iw-nfreq_real) = SCALARIFY(ctemp)
943              enddo
944              I_eps_imag(igp,my_igp,iw-nfreq_real) = I_eps_imag(igp,my_igp,iw-nfreq_real) + 1.0d0
945            enddo
946
947          end if ! my_do_subspace
948
949        endif
950      enddo
951!OMP END DO
952
953    endif
954
955    if (sig%need_advanced) then
956
957!$OMP DO
958      do iw = 1, sig%nFreq
959        do my_igp=1,ngpown
960
961          indigp = inv_igp_index(my_igp)
962          igp = indinv(indigp)
963
964          if (igp .gt. ncouls .or. igp .le. 0) cycle
965
966          mcph = MYCONJG(ph(igp))
967! JRD: XXX ind() may kill us here
968          do ig = 1, ncouls
969            I_epsA_array(ig,my_igp,iw) = - mcph * ph(ig) * epsA(ind(ig),my_igp,iw)
970          enddo
971          I_epsA_array(igp,my_igp,iw) = I_epsA_array(igp,my_igp,iw) + 1.0d0
972        enddo
973      enddo
974!$OMP END DO
975
976    endif
977  endif
978
979  if (sig%freq_dep.eq.0) then
980    SAFE_DEALLOCATE(epstemp)
981  endif
982  if (sig%freq_dep.eq.1) then
983    SAFE_DEALLOCATE(wpmtx)
984    SAFE_DEALLOCATE(epstemp)
985  endif
986  if (sig%freq_dep.eq.2) then
987    SAFE_DEALLOCATE(epstemp)
988  endif
989  if (sig%freq_dep.eq.3) then
990    SAFE_DEALLOCATE(epstemp)
991    SAFE_DEALLOCATE(epsRtemp)
992    SAFE_DEALLOCATE(epsRggp)
993    SAFE_DEALLOCATE(I_epsRggp)
994  endif
995
996!$OMP END PARALLEL
997  call timing%stop(timing%m_cor_epsinit)
998
999  ! transform basis, moved outside OMP parallel region
1000  call timing%start(timing%sub_transf_tot)
1001  if(my_do_subspace) then
1002    ! transform basis function
1003    ! reorder wfn matrix elements
1004    n_q_order = ZERO
1005    m_q_order = ZERO
1006    do ig = 1, ncouls
1007      igp = indinv(ig)
1008      if (igp .gt. ncouls .or. igp .le. 0) cycle
1009      n_q_order(ig,:) = aqsn(igp,:)
1010      m_q_order(ig,:) = aqsm(igp,:)
1011    end do
1012
1013    ! copy matrix elements and scale them with the coulomb operator
1014    SAFE_ALLOCATE(aqsntemp,(ncouls,peinf%ntband_max))
1015    SAFE_ALLOCATE(aqsmtemp,(ncouls,peinf%ntband_max))
1016    aqsntemp = n_q_order
1017    aqsmtemp = m_q_order
1018    do ig = 1, ncouls
1019      igp = indinv(ig)
1020      if (igp .gt. ncouls .or. igp .le. 0) cycle
1021      aqsntemp(ig,:) = aqsntemp(ig,:) * ph(igp) * SQRT(vcoul_sub(ig))
1022      aqsmtemp(ig,:) = aqsmtemp(ig,:) * ph(igp) * vcoul(igp) / SQRT(vcoul_sub(ig))
1023    end do
1024
1025    SAFE_ALLOCATE(Caux, (ncouls, Nbas_own_max))
1026    SAFE_ALLOCATE(Caux_send, (ncouls, Nbas_own_max))
1027    SAFE_ALLOCATE(Caux_rec, (ncouls, Nbas_own_max))
1028    SAFE_ALLOCATE(Maux, (Nbas_own_max, peinf%ntband_max))
1029
1030    Caux      = ZERO  ! (0D0,0D0)
1031    Caux_send = ZERO  ! (0D0,0D0)
1032    Caux_rec  = ZERO  ! (0D0,0D0)
1033    Maux      = ZERO  ! (0D0,0D0)
1034
1035    ! get ready for the first cycle
1036    isend_static = MOD(peinf%pool_rank + 1 + peinf%npes_pool, peinf%npes_pool)
1037    irec_static  = MOD(peinf%pool_rank - 1 + peinf%npes_pool, peinf%npes_pool)
1038    if(my_Nb_start > 0) then
1039      Caux(1:ncouls, 1:Nbas_own) = SCALARIFY(eigenvec_sub(1:ncouls, 1:Nbas_own))
1040    end if
1041    Msub_n = ZERO  ! (0D0,0D0)
1042    Msub_m = ZERO  ! (0D0,0D0)
1043
1044    my_num_band = peinf%ntband_dist(peinf%pool_rank+1)
1045
1046    ipe = peinf%pool_rank + 1
1047    do ipe_inx = 1, peinf%npes_pool
1048      actual_send = MOD(peinf%pool_rank + ipe_inx + peinf%npes_pool, peinf%npes_pool)
1049      actual_rec  = MOD(peinf%pool_rank - ipe_inx + peinf%npes_pool, peinf%npes_pool)
1050#ifdef MPI
1051      call timing%start(timing%sub_transf_com)
1052      ! post receiving mex
1053      tag_rec_n = 1
1054      req_rec_n = MPI_REQUEST_NULL
1055      CALL MPI_Irecv(Caux_rec, ncouls*Nbas_own_max, MPI_SCALAR, irec_static,&
1056                     tag_rec_n, peinf%pool_comm, req_rec_n, mpierr)
1057      ! post send mex
1058      tag_send_n = 1
1059      req_send_n = MPI_REQUEST_NULL
1060      Caux_send  = Caux
1061      CALL MPI_Isend(Caux_send, ncouls*Nbas_own_max, MPI_SCALAR, isend_static,&
1062                     tag_send_n, peinf%pool_comm, req_send_n, mpierr)
1063      call timing%stop(timing%sub_transf_com)
1064#endif
1065      ! go with zgemm
1066      ib_start = ipe_2_LocSubSize(1,ipe)
1067      ib_end   = ipe_2_LocSubSize(2,ipe)
1068      ib_size  = ipe_2_LocSubSize(3,ipe)
1069      call timing%start(timing%sub_transf_gemm)
1070      if(ib_start > 0) then
1071#ifdef CPLX
1072        call zgemm('T','N', Nbas_own_max, peinf%ntband_max, ncouls, &
1073                   (1D0,0D0), Caux(:,:), ncouls, &
1074                          aqsntemp(:,:), ncouls, &
1075                   (0D0,0D0), Maux, Nbas_own_max)
1076#else
1077        call dgemm('T','N', Nbas_own_max, peinf%ntband_max, ncouls, &
1078                    1.0D0, Caux(:,:), ncouls, &
1079                          aqsntemp(:,:), ncouls, &
1080                    0.0D0, Maux, Nbas_own_max)
1081#endif
1082        Msub_n(ib_start:ib_end, 1:my_num_band) = Msub_n(ib_start:ib_end, 1:my_num_band) + &
1083                                                 Maux(1:ib_size,1:my_num_band)
1084
1085#ifdef CPLX
1086        call zgemm('T','N', Nbas_own_max, peinf%ntband_max, ncouls, &
1087                   (1D0,0D0), Caux(:,:), ncouls, &
1088                          aqsmtemp(:,:), ncouls, &
1089                   (0D0,0D0), Maux, Nbas_own_max)
1090#else
1091        call dgemm('T','N', Nbas_own_max, peinf%ntband_max, ncouls, &
1092                   1.0D0, Caux(:,:), ncouls, &
1093                          aqsmtemp(:,:), ncouls, &
1094                   0.0D0, Maux, Nbas_own_max)
1095#endif
1096        Msub_m(ib_start:ib_end, 1:my_num_band) = Msub_m(ib_start:ib_end, 1:my_num_band) + &
1097                                                 Maux(1:ib_size,1:my_num_band)
1098      end if
1099      call timing%stop(timing%sub_transf_gemm)
1100#ifdef MPI
1101      call timing%start(timing%sub_transf_com)
1102      CALL MPI_Wait(req_rec_n,stat,mpierr)
1103      CALL MPI_Wait(req_send_n,stat,mpierr)
1104      call timing%stop(timing%sub_transf_com)
1105#endif
1106      ! get ready for next cycle
1107      Caux = Caux_rec
1108      ipe = actual_rec + 1
1109    end do
1110    SAFE_DEALLOCATE(aqsntemp)
1111    SAFE_DEALLOCATE(aqsmtemp)
1112  end if  ! my_do_subspace
1113  ! finish with basis transformation
1114  call timing%stop(timing%sub_transf_tot)
1115
1116!-----------------------------------------------------------------------------
1117! End of setup. Begin computation of Sigma.
1118!-----------------------------------------------------------------------------
1119
1120  if (sig%freq_dep/=0 .and. sig%exact_ch==1) then
1121    ! Initialize the static part of the static remainder correction, resolved
1122    ! over bands. We put the static term into band #1, and later on
1123    ! substract the CH part. Note the famous factor of 1/2 here.
1124    achtcor_n1(1) = 0.5d0 * achxtemp
1125  endif
1126  SAFE_ALLOCATE(aqsntemp,(ncouls,peinf%ntband_max))
1127  SAFE_ALLOCATE(aqsmtemp,(ncouls,peinf%ntband_max))
1128
1129  if (my_do_subspace) then
1130    ! create buffers for communication
1131    SAFE_ALLOCATE(Msub_n_temp, (Nb_tot,peinf%ntband_max))
1132    SAFE_ALLOCATE(Msub_m_temp, (Nb_tot,peinf%ntband_max))
1133    Msub_n_temp = ZERO
1134    Msub_m_temp = ZERO
1135    ! set the flag for fixing the wings
1136    fix_wings = .true.
1137    fix_wings_res = .true.
1138    !XXXX
1139    if(sig%subsample) then
1140      ! in the subsample case no fix is done, set flag to false
1141      fix_wings = .false.
1142      fix_wings_res = .false.
1143    end if
1144    !XXXX
1145    ! dry run just to fix wings, we pass Msub_m_temp and Msub_n_temp with
1146    ! zeros and we pretend to run for our actual proc.
1147    call timing%start(timing%m_cor_sub_wings)
1148    aqsntemp = ZERO
1149    aqsmtemp = ZERO
1150    aqsntemp(1:ncouls,1:my_num_band) = n_q_order(1:ncouls,1:my_num_band)
1151    aqsmtemp(1:ncouls,1:my_num_band) = m_q_order(1:ncouls,1:my_num_band)
1152    ipe = peinf%pool_rank + 1
1153    call sigma_cd_subspace()
1154    call timing%stop(timing%m_cor_sub_wings)
1155    !XXXX
1156  end if
1157
1158  ! FHJ: Loop over each processor/pool, and over each band that ipe owns.
1159  ! Then, calculate contribution to Sigma due to that band, stored in the
1160  ! aqs*temp arrays.
1161  do ipe = 1, peinf%npes_pool
1162
1163    call timing%start(timing%m_cor_comm)
1164    if (peinf%pool_rank .eq. ipe-1) then
1165      if(my_do_subspace) then
1166        !XXX aqsntemp(:,:) = n_q_order(:,:)
1167        !XXX aqsmtemp(:,:) = m_q_order(:,:)
1168        Msub_n_temp(:,:) = Msub_n(:,:)
1169        Msub_m_temp(:,:) = Msub_m(:,:)
1170      else
1171        aqsntemp(:,:) = aqsn(1:ncouls,:)
1172        aqsmtemp(:,:) = aqsm(1:ncouls,:)
1173      end if
1174    endif
1175
1176#ifdef MPI
1177    if (peinf%my_pool/=-1) then
1178      if (my_do_subspace) then
1179        call MPI_Bcast(Msub_n_temp, Nb_tot * peinf%ntband_max, MPI_SCALAR, ipe-1, &
1180                       peinf%pool_comm, mpierr)
1181        call MPI_Bcast(Msub_m_temp, Nb_tot * peinf%ntband_max, MPI_SCALAR, ipe-1, &
1182                       peinf%pool_comm, mpierr)
1183      else
1184
1185        call MPI_Bcast(aqsntemp, peinf%ntband_max*ncouls, MPI_SCALAR, ipe-1, &
1186          peinf%pool_comm, mpierr)
1187        ! FHJ: Only bother to BCast aqsmtemp if this is an off-diag. calculation
1188        if (icalc==2) then
1189          call MPI_Bcast(aqsmtemp, peinf%ntband_max*ncouls, MPI_SCALAR, ipe-1, &
1190            peinf%pool_comm, mpierr)
1191        else
1192          aqsmtemp = aqsntemp
1193        endif
1194
1195      end if ! my_do_subspace
1196    endif
1197#endif
1198    call timing%stop(timing%m_cor_comm)
1199
1200    ! here we want to move the band index inside the actual routine such
1201    ! that we can calculate sigma by calling ZGEMM
1202    if(my_do_subspace) then
1203
1204      call sigma_cd_subspace()
1205
1206    else ! my_do_subspace
1207
1208      do n1 = 1, peinf%ntband_dist(ipe)
1209
1210        ! n1true = "True" band index of the band n1 w.r.t. all bands
1211        n1true = peinf%indext_dist(n1,ipe)
1212        ! energy of the |n1,k-q> state
1213        e_n1kq = wfnkq%ekq(n1true,ispin)
1214        ! occupation of the |n1,k-q> state
1215        flag_occ = (n1true<=(sig%nvband+sig%ncrit)) &
1216          .and.((sig%ncrit==0).or.(e_n1kq<=(sig%efermi+sig%tol)))
1217        if (abs(e_n1kq-sig%efermi)<sig%tol) then
1218          occ = 0.5d0 ! Fermi-Dirac distribution = 1/2 at Fermi level
1219        else
1220          occ = 1.0d0
1221        endif
1222
1223        ! FHJ: CH term used for static remainder for current band "n1true".
1224        achstemp = ZERO
1225
1226        !FHJ: Call specialized code to calculate contribution to <nk|Sigma^c|mk>
1227        !due to current band n1
1228        if (sig%freq_dep==0) then
1229          call sigma_cohsex()
1230        else if (sig%freq_dep==1 .or. sig%freq_dep==3) then
1231          call sigma_gpp()
1232        else if (sig%freq_dep==2 .and. sig%freq_dep_method==0) then
1233          call sigma_ra()
1234        else if (sig%freq_dep==2 .and. sig%freq_dep_method==2) then
1235          call sigma_cd()
1236        endif
1237
1238        if (sig%freq_dep/=0 .and. sig%exact_ch==1) then
1239          ! Compute the static remainder resolved over bands (just the CH part here).
1240          ! Note the famous factor of 1/2 here.
1241          achtcor_n1(n1true) = achtcor_n1(n1true) - 0.5d0*achstemp
1242        endif
1243      enddo ! over bands (n1)
1244
1245    end if! my_do_subspace
1246
1247  enddo ! over bands (ipe)
1248
1249  ! FHJ: Sum contribution to Sigma due to different bands into a single
1250  ! contribution due to the portion of the dielectric matrix that I own. We`ll
1251  ! reduce these contributions in sigma_main.f90 after we loop over all q points.
1252
1253  if (sig%freq_dep/=0 .and. sig%exact_ch==1) then
1254    ! Compute the static remainder integrated over bands.
1255    ! Note the famous factor of 1/2 here.
1256    achtcor = sum(achtcor_n1)
1257  endif
1258
1259  if (sig%freq_dep==0) then
1260
1261    if (sig%exact_ch==1) then
1262      achtemp(2) = achxtemp
1263      achxtemp = ZERO
1264    endif
1265    asxt(:) = asxt(:) + asxtemp(2)
1266    acht(:) = acht(:) + achtemp(2)
1267    if (sig%exact_ch==0) then
1268      achtcor = achtcor + (achxtemp - achtemp(2))
1269    endif
1270
1271  elseif (sig%freq_dep==1 .or. sig%freq_dep==3) then
1272
1273    do iw=nstart,nend
1274      asxt(iw) = asxt(iw) + asxtemp(iw)
1275      acht(iw) = acht(iw) + achtemp(iw)
1276    enddo
1277
1278  elseif (sig%freq_dep==2 .and. sig%freq_dep_method==0) then
1279
1280    do iw = 1, sig%nfreqeval
1281      asxtDyn(iw) = asxtDyn(iw) + asxDtemp(iw)
1282      achtDyn(iw) = achtDyn(iw) + achDtemp(iw)
1283      achtDyn_cor(iw) = achtDyn_cor(iw) + achDtemp_cor(iw)
1284      achtDyn_corb(iw) = achtDyn_corb(iw) + achDtemp_corb(iw)
1285      ach2tDyn(iw) = ach2tDyn(iw) + ach2Dtemp(iw)
1286    enddo
1287
1288  elseif (sig%freq_dep==2 .and. sig%freq_dep_method==2) then
1289
1290    do iw = 1, sig%nfreqeval
1291      asxtDyn(iw) = asxtDyn(iw) + asxDtemp(iw)
1292      achtDyn(iw) = achtDyn(iw) + achDtemp(iw)
1293      achtDyn_cor(iw) = achtDyn_cor(iw) + achDtemp(iw)
1294      achtDyn_corb(iw) = achtDyn_corb(iw) + achDtemp(iw) + asxDtemp(iw)
1295    enddo
1296
1297  endif
1298
1299  SAFE_DEALLOCATE(aqsntemp)
1300  SAFE_DEALLOCATE(aqsmtemp)
1301
1302
1303!-----------------------------------------------------------------------------
1304! Done calculating Sigma! Deallocate and finish.
1305!-----------------------------------------------------------------------------
1306
1307  if (sig%freq_dep.eq.1 .or.sig%freq_dep.eq.3) then
1308    SAFE_DEALLOCATE(wx_array)
1309    SAFE_DEALLOCATE(wtilde_array)
1310    if (sig%fdf.eq.-3) then
1311      SAFE_DEALLOCATE(wxi)
1312    endif
1313  endif
1314
1315  if (sig%freq_dep.eq.1 .or. sig%freq_dep.eq.0 .or. sig%freq_dep.eq.3)  then
1316    SAFE_DEALLOCATE(asxtemp)
1317    SAFE_DEALLOCATE(achtemp)
1318    SAFE_DEALLOCATE(I_eps_array)
1319    acht_n1(1:sig%ntband) = acht_n1_loc(1:sig%ntband)
1320  endif
1321
1322  if (sig%freq_dep.eq.2) then
1323    SAFE_DEALLOCATE(pref)
1324    SAFE_DEALLOCATE(asxDtemp)
1325    SAFE_DEALLOCATE(achDtemp)
1326    SAFE_DEALLOCATE(achDtemp_cor)
1327    SAFE_DEALLOCATE(achDtemp_corb)
1328    SAFE_DEALLOCATE(ach2Dtemp)
1329    SAFE_DEALLOCATE(schDi)
1330    SAFE_DEALLOCATE(schDi_cor)
1331    SAFE_DEALLOCATE(schDi_corb)
1332    SAFE_DEALLOCATE(sch2Di)
1333    SAFE_DEALLOCATE(ssxDi)
1334    SAFE_DEALLOCATE(wxi)
1335    SAFE_DEALLOCATE(I_epsR_array)
1336    if (sig%need_advanced) then
1337      SAFE_DEALLOCATE(I_epsA_array)
1338    endif
1339    if (sig%freq_dep_method==2) then
1340      SAFE_DEALLOCATE(I_eps_imag)
1341    endif
1342  endif
1343  SAFE_DEALLOCATE(acht_n1_loc)
1344
1345  POP_SUB(mtxel_cor)
1346  return
1347
1348contains
1349
1350!==============================================================================
1351!                                   COHSEX
1352!==============================================================================
1353  !> Calculate Sigma matrix elements, COHSEX formalism
1354  subroutine sigma_cohsex()
1355    SCALAR :: aqsn_Ieps, asxtemp_loc, achtemp_loc
1356
1357    PUSH_SUB(mtxel_cor.sigma_cohsex)
1358
1359    call timing%start(timing%m_cor_sx_ch)
1360!$OMP PARALLEL DO private (my_igp,igp,indigp,ig,aqsn_Ieps,achtemp_loc, &
1361!$OMP                       asxtemp_loc) reduction(+:achtemp,asxtemp,acht_n1_loc)
1362    do my_igp = 1, ngpown
1363      indigp = inv_igp_index(my_igp)
1364      igp = indinv(indigp)
1365      if (igp>ncouls .or. igp<=0) cycle
1366      achtemp_loc = ZERO
1367      asxtemp_loc = ZERO
1368
1369      do ig = 1, ncouls
1370        aqsn_Ieps = aqsntemp(ig,n1) * I_eps_array(ig,my_igp)
1371        if (sig%exact_ch==0) then
1372          achtemp_loc = achtemp_loc + 0.5d0*aqsn_Ieps
1373        endif
1374        if (flag_occ) asxtemp_loc = asxtemp_loc - aqsn_Ieps
1375      enddo ! ig
1376
1377      asxtemp(2) = asxtemp(2) + asxtemp_loc*occ*vcoul(igp)*MYCONJG(aqsmtemp(igp,n1))
1378      if (sig%exact_ch==0) then
1379        achtemp(2) = achtemp(2) + achtemp_loc*vcoul(igp)*MYCONJG(aqsmtemp(igp,n1))
1380        acht_n1_loc(n1true) = acht_n1_loc(n1true) + achtemp_loc*vcoul(igp)*MYCONJG(aqsmtemp(igp,n1))
1381      endif
1382    enddo ! my_igp
1383!$OMP END PARALLEL DO
1384    call timing%stop(timing%m_cor_sx_ch)
1385
1386    POP_SUB(mtxel_cor.sigma_cohsex)
1387
1388  end subroutine sigma_cohsex
1389
1390
1391!==============================================================================
1392!                                     GPP
1393!==============================================================================
1394  !> Calculate Sigma matrix elements, GPP (HL and GN) formalism
1395  subroutine sigma_gpp()
1396    SCALAR, allocatable :: ssx_array(:), sch_array(:)
1397    real(DP) :: delwr, delw2, wdiffr, rden, wxt, ssxcutoff, limitone, limittwo
1398    complex(DPC) :: halfinvwtilde, delw, wdiff, cden
1399    integer :: igbeg, igend, igblk
1400    SCALAR :: ssx, sch, ssxt, scht, schtt
1401
1402    PUSH_SUB(mtxel_cor.sigma_gpp)
1403
1404    igblk = 512
1405
1406    ! Some constants used in the loop below, computed here to save
1407    ! floating point operations
1408    limitone=1D0/(TOL_Small*4D0)
1409    limittwo=sig%gamma**2
1410
1411    ! GSM: compute the static CH for the static remainder
1412    if (sig%exact_ch.eq.1) then
1413      call acc_static_ch(ngpown, ncouls, inv_igp_index, indinv, vcoul, &
1414        aqsntemp(:,n1), aqsmtemp(:,n1), achstemp, eps_scalar=I_eps_array)
1415    endif
1416
1417!!!!--- Loop over three energy values which we compute Sigma -----------
1418!
1419! SIB: In terms of formulae, the two terms we will compute are given in
1420! formulae (34a,34b) on p. 5397 of above reference.
1421!
1422!                                         Omega^2(G,G`)
1423!    SX(E) = M(n,G)*conj(M(m,G`)) * ------------------------ * Vcoul(G`)
1424!                                   (E-E_n1(k-q))^2-wtilde^2
1425!
1426!                                           Omega^2(G,G`)
1427!    CH(E) = M(n,G)*conj(M(m,G`)) * ----------------------------- * Vcoul(G`)
1428!                                   2*wtilde*[E-E_n1(k-q)-wtilde]
1429!
1430! and we are evaluating both at E = E_l(k) and E = E_l(k) +/- dE.
1431! SX only gets contributions for the band n1 being an occupied state.
1432!
1433! For diagonal elements, we need matrix elements at various
1434! energies (to get quasi-particle energies), i.e. for iw=1,2,3;
1435! but for off-diagonal elements, we only need them at iw=2
1436! (the actual energy) so we won`t even bother calculating
1437! them at other energies
1438
1439    call timing%start(timing%m_cor_sx_ch)
1440    do iw=nstart,nend
1441! wx = E_l(k) - E_n1(k-q) + dE = difference in energies
1442! of the two states appearing as E - E_n1(k-q) above.
1443      if (sig%fdf .eq. -3) then
1444        wx_array(iw) = wxi(iw) - e_n1kq
1445      else
1446        wx_array(iw) = e_lk + sig%dw*(iw-2) - e_n1kq
1447      endif
1448      if (abs(wx_array(iw)) .lt. TOL_Zero) wx_array(iw) = TOL_Zero
1449    enddo
1450
1451! JRD: This Loop is Performance critical. Make Sure you don`t mess it up
1452
1453!$OMP PARALLEL private (my_igp,igp,indigp,ssx_array,sch_array,ig, &
1454!$OMP                      wtilde,wtilde2,halfinvwtilde,ssxcutoff,sch,ssx, &
1455!$OMP                      iw,delw,delw2,Omega2,scht,schtt,ssxt,wxt, &
1456!$OMP                      rden,cden,delwr,wdiffr,wdiff,igbeg,igend)
1457
1458    if (sig%fdf.eq.-3) then
1459      SAFE_ALLOCATE(ssx_array,(sig%nfreqeval))
1460      SAFE_ALLOCATE(sch_array,(sig%nfreqeval))
1461    else
1462      SAFE_ALLOCATE(ssx_array,(3))
1463      SAFE_ALLOCATE(sch_array,(3))
1464    endif
1465
1466!$OMP DO reduction(+:asxtemp,acht_n1_loc,achtemp)
1467    do my_igp = 1, ngpown
1468
1469      indigp = inv_igp_index(my_igp)
1470      igp = indinv(indigp)
1471
1472      if (igp .gt. ncouls .or. igp .le. 0) cycle
1473
1474      ssx_array = ZERO
1475      sch_array = ZERO
1476
1477! delw measures how close to zero the difference
1478! wx - wtilde = E - E_n1(k-q) - wtilde is relative to wtilde:
1479! delw = (E - E_n1(k-q) - wtilde) / (2 * wtilde)
1480! delw2 is the squared absolute value of delw
1481
1482! If delw is small, both SX and CH blow up, but their sum (for
1483! an occupied state n1) is finite.  In such a case, their sum
1484! is Omega^2 / (4 * wtilde2) / (1 + delw). Then the sum is
1485! assigned to ssx and sch is set to zero.
1486
1487! If ssx is too large (which can happen due to the pole at
1488! wx + wtilde = 0 of the SX term), then we should drop this term.
1489! See the discussion at the bottom of p. 5411-5412 of Hybertsen & Louie.
1490
1491! If G.neq.G`, then since we sum over only lower triangle,
1492! we include the contribution we would have had from (G`,G).
1493
1494      if (flag_occ) then
1495
1496        do iw=nstart,nend
1497
1498          scht=0D0
1499          ssxt=0D0
1500          wxt = wx_array(iw)
1501
1502          do ig = 1, ncouls
1503
1504! Here we recompute Omega2 = wtilde2 * I_eps_array. This contains
1505! the factor of (1 - i tan phi) from Eqs. 21 & 22 of arXiv paper.
1506
1507!FIXME: Here we use temporary variables wtilde, wtilde2, Omega2 while
1508! in the following sections we use wtilde_array and I_eps_array directly.
1509! JRD, please write a comment here explaining whether this is critical
1510! for performance or it doesn`t matter.
1511
1512            wtilde = wtilde_array(ig,my_igp)
1513            wtilde2 = wtilde**2
1514            Omega2 = wtilde2 * I_eps_array(ig,my_igp)
1515
1516! Cycle bad for vectorization. Not needed wtilde is zero
1517!                  if (abs(Omega2) .lt. TOL_Zero) cycle
1518
1519            wdiff = wxt - wtilde
1520
1521            cden = wdiff
1522            rden = cden * CONJG(cden)
1523            rden = 1D0 / rden
1524            delw = wtilde * CONJG(cden) * rden
1525            delwr = delw*CONJG(delw)
1526            wdiffr = wdiff*CONJG(wdiff)
1527
1528! This Practice is bad for vectorization and understanding of the output.
1529! JRD: Complex division is hard to vectorize. So, we help the compiler.
1530            if (wdiffr.gt.limittwo .and. delwr.lt.limitone) then
1531              sch = delw * I_eps_array(ig,my_igp)
1532              cden = wxt**2 - wtilde2
1533              rden = cden*CONJG(cden)
1534              rden = 1D0 / rden
1535              ssx = Omega2 * CONJG(cden) * rden
1536            else if ( delwr .gt. TOL_Zero) then
1537              sch = 0.0d0
1538              cden = (4.0d0 * wtilde2 * (delw + 0.5D0 ))
1539              rden = cden*MYCONJG(cden)
1540              rden = 1D0 / rden
1541              ssx = -Omega2 * MYCONJG(cden) * rden * delw
1542            else
1543              sch = 0.0d0
1544              ssx = 0.0d0
1545            endif
1546
1547! JRD: Breaks vectorization. But, I will have to fix later because
1548! leaving it out breaks GSM example.
1549            ssxcutoff = sig%sexcut*abs(I_eps_array(ig,my_igp))
1550            if (abs(ssx) .gt. ssxcutoff .and. wxt .lt. 0.0d0) ssx=0.0d0
1551
1552            ssxt = ssxt + aqsntemp(ig,n1)*ssx
1553            scht = scht + aqsntemp(ig,n1)*sch
1554
1555          enddo ! loop over g
1556          ssx_array(iw) = ssx_array(iw) + ssxt*MYCONJG(aqsmtemp(igp,n1))
1557          sch_array(iw) = sch_array(iw) + 0.5D0*scht*MYCONJG(aqsmtemp(igp,n1))
1558        enddo
1559
1560      else
1561
1562        do igbeg = 1,ncouls,igblk
1563        igend = min(igbeg+igblk-1,ncouls)
1564        do iw=nstart,nend
1565
1566          scht=0D0
1567          ssxt=0D0
1568          wxt = wx_array(iw)
1569
1570!dir$ no unroll
1571          do ig = igbeg, igend
1572
1573! Here we recompute Omega2 = wtilde2 * I_eps_array. This contains
1574! the factor of (1 - i tan phi) from Eqs. 21 & 22 of arXiv paper.
1575
1576!FIXME: Here we use wtilde_array and I_eps_array directly while in the
1577! previous sections we use temporary variables wtilde, wtilde2, Omega2.
1578! JRD, please write a comment here explaining whether this is critical
1579! for performance or it doesn`t matter.
1580
1581            wdiff = wxt - wtilde_array(ig,my_igp)
1582
1583            cden = wdiff
1584            rden = cden * CONJG(cden)
1585            rden = 1D0 / rden
1586            delw = wtilde_array(ig,my_igp) * CONJG(cden) * rden
1587            delwr = delw*CONJG(delw)
1588            wdiffr = wdiff*CONJG(wdiff)
1589
1590            schtt = aqsntemp(ig,n1) * delw * I_eps_array(ig,my_igp)
1591
1592! JRD: This if is OK for vectorization
1593            if (wdiffr.gt.limittwo .and. delwr.lt.limitone) then
1594              scht = scht + schtt
1595            endif
1596
1597          enddo ! loop over g
1598
1599          sch_array(iw) = sch_array(iw) + 0.5D0*scht*MYCONJG(aqsmtemp(igp,n1))
1600
1601        enddo
1602        enddo
1603
1604      endif
1605
1606! If a valence band, then accumulate SX contribution.
1607
1608      if (flag_occ) then
1609        do iw=nstart,nend
1610          asxtemp(iw) = asxtemp(iw) - ssx_array(iw) * occ * vcoul(igp)
1611        enddo
1612      endif
1613
1614! Accumulate CH contribution.
1615
1616      do iw=nstart,nend
1617        achtemp(iw) = achtemp(iw) + sch_array(iw) * vcoul(igp)
1618      enddo
1619
1620! Logging CH convergence.
1621
1622      acht_n1_loc(n1true) = acht_n1_loc(n1true) + sch_array(2) * vcoul(igp)
1623
1624    enddo ! igp
1625!$OMP END DO
1626    SAFE_DEALLOCATE(ssx_array)
1627    SAFE_DEALLOCATE(sch_array)
1628!$OMP END PARALLEL
1629    call timing%stop(timing%m_cor_sx_ch)
1630
1631    POP_SUB(mtxel_cor.sigma_gpp)
1632
1633  end subroutine sigma_gpp
1634
1635
1636!==============================================================================
1637!                                  REAL AXIS
1638!==============================================================================
1639  !> Calculate Sigma matrix elements, full-freq / REAL-AXIS formalism
1640  subroutine sigma_ra()
1641    complex(DPC) :: I_epsRggp_int, I_epsAggp_int
1642    real(DP) :: cedifft_zb,intfact,cedifft_zb_left,cedifft_zb_right
1643    complex(DPC) :: cedifft_coh, cedifft_cor
1644    complex(DPC) :: schDt_avg, schDt_right, schDt_left, schDt_lin, schDt_lin2, schDt_lin3
1645    complex(DPC) :: schDttt, schDttt_cor, schD, sch2Dt, sch2Dtt
1646    complex(DPC), allocatable :: schDt_array(:)
1647    real(DP) :: fact1, fact2
1648    integer :: ijk
1649    logical, save :: warned=.false.
1650
1651    PUSH_SUB(mtxel_cor.sigma_ra)
1652
1653    ! JRD: Find iw closest to e_lk
1654    diffmin = INF
1655    do iw=1,sig%nfreqeval
1656      diff = abs(freq0 + (iw-1)*sig%freqevalstep - e_lk)
1657      if (diff .lt. diffmin) then
1658        diffmin=diff
1659        iwlda=iw
1660      endif
1661    enddo
1662
1663    do iw=1,sig%nfreqeval
1664      wx = freq0 - e_n1kq + (iw-1)*sig%freqevalstep
1665      wxi(iw) = wx
1666    enddo
1667
1668    ! GSM: compute the static CH for the static remainder
1669    if (sig%exact_ch.eq.1) then
1670      call acc_static_ch(ngpown, ncouls, inv_igp_index, indinv, vcoul, &
1671        aqsntemp(:,n1), aqsmtemp(:,n1), achstemp, eps_cplx=I_epsR_array(:,:,1))
1672    endif
1673
1674    ssxDi = (0D0,0D0)
1675    schDi = (0D0,0D0)
1676    schDi_cor = (0D0,0D0)
1677    schDi_corb = (0D0,0D0)
1678    sch2Di = (0D0,0D0)
1679
1680    call timing%start(timing%m_cor_ra_sx)
1681    ! JRD: Don`t want to thread here, nfreqeval could be small
1682    do iw=1,sig%nfreqeval
1683      wx = wxi(iw)
1684      ! SX and CH terms: equation (1.42) of Catalin`s thesis
1685      ! Note the negative sign in I_epsRggp and I_epsAggp
1686
1687      if (flag_occ) then
1688
1689        if(wx.ge.0.0d0) then
1690          ifreq=0
1691          do ijk = 1, sig%nFreq-1
1692            if (wx .ge. sig%dFreqGrid(ijk) .and. wx .lt. sig%dFreqGrid(ijk+1)) then
1693              ifreq=ijk
1694            endif
1695          enddo
1696          if (ifreq .eq. 0) then
1697            ifreq = sig%nFreq+3 ! three is for luck
1698          endif
1699        else
1700          ifreq=0
1701          do ijk = 1, sig%nFreq-1
1702            if (-wx .ge. sig%dFreqGrid(ijk) .and. -wx .lt. sig%dFreqGrid(ijk+1)) then
1703              ifreq=ijk
1704            endif
1705          enddo
1706          if (ifreq .eq. 0) then
1707            ifreq = sig%nFreq+3 ! three is for luck
1708          endif
1709        endif
1710
1711        if(ifreq.ge.sig%nFreq) then
1712          if (.not.warned .and. peinf%pool_rank==0) then
1713            write(0,777) iband, n1true, e_lk, e_n1kq, wx, E_max
1714          endif
1715          warned = .true.
1716          ifreq=sig%nFreq-1
1717        endif
1718
1719#ifdef CPLX
1720        if(wx.ge.0.d0) then
1721
1722          fact1 = (sig%dFreqGrid(ifreq+1)-wx)/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
1723          fact2 = (wx-sig%dFreqGrid(ifreq))/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
1724
1725          ssxDittt = 0D0
1726
1727!$OMP PARALLEL do private (my_igp,igp,indigp,ssxDitt,ig, &
1728!$OMP                       ssxDit) reduction(+:ssxDittt)
1729          do my_igp = 1, ngpown
1730            indigp = inv_igp_index(my_igp)
1731            igp = indinv(indigp)
1732
1733            if (igp .gt. ncouls .or. igp .le. 0) cycle
1734
1735            ssxDitt = (0D0,0D0)
1736            do ig = 1, ncouls
1737              ssxDit=I_epsR_array(ig,my_igp,ifreq)*fact1 + &
1738              I_epsR_array(ig,my_igp,ifreq+1)*fact2
1739              ssxDitt = ssxDitt + aqsntemp(ig,n1)*ssxDit
1740            enddo
1741            ssxDittt = ssxDittt + ssxDitt*vcoul(igp)*MYCONJG(aqsmtemp(igp,n1))
1742          enddo
1743!$OMP END PARALLEL DO
1744
1745          ssxDi(iw) = ssxDi(iw) + ssxDittt
1746
1747        else
1748
1749          fact1 = (sig%dFreqGrid(ifreq+1)+wx)/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
1750          fact2 = (-sig%dFreqGrid(ifreq)-wx)/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
1751
1752          ssxDittt = 0D0
1753
1754!$OMP PARALLEL do private (my_igp,igp,indigp,ssxDitt,ig, &
1755!$OMP                       ssxDit) reduction(+:ssxDittt)
1756          do my_igp = 1, ngpown
1757            indigp = inv_igp_index(my_igp)
1758            igp = indinv(indigp)
1759
1760            if (igp .gt. ncouls .or. igp .le. 0) cycle
1761
1762            ssxDitt = (0D0,0D0)
1763            do ig = 1, ncouls
1764              ssxDit=I_epsA_array(ig,my_igp,ifreq)*fact1+ &
1765                I_epsA_array(ig,my_igp,ifreq+1)*fact2
1766              ssxDitt = ssxDitt + aqsntemp(ig,n1)*ssxDit
1767            enddo
1768            ssxDittt = ssxDittt + ssxDitt*vcoul(igp)*MYCONJG(aqsmtemp(igp,n1))
1769          enddo
1770!$OMP END PARALLEL DO
1771
1772          ssxDi(iw) = ssxDi(iw) + ssxDittt
1773
1774        endif
1775#else
1776        if(wx.ge.0.d0) then
1777
1778          fact1 = (sig%dFreqGrid(ifreq+1)-wx)/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
1779          fact2 = (wx-sig%dFreqGrid(ifreq))/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
1780
1781          ssxDittt = 0D0
1782
1783!$OMP PARALLEL do private (my_igp,igp,indigp,ssxDitt,ig, &
1784!$OMP                       ssxDit) reduction(+:ssxDittt)
1785          do my_igp = 1, ngpown
1786            indigp = inv_igp_index(my_igp)
1787            igp = indinv(indigp)
1788
1789            if (igp .gt. ncouls .or. igp .le. 0) cycle
1790
1791            ssxDitt = (0D0,0D0)
1792            do ig = 1, ncouls
1793              ssxDit=I_epsR_array(ig,my_igp,ifreq)*fact1+ &
1794                I_epsR_array(ig,my_igp,ifreq+1)*fact2
1795              ssxDitt = ssxDitt + aqsntemp(ig,n1)*ssxDit
1796            enddo
1797            ssxDittt = ssxDittt + ssxDitt*vcoul(igp)*MYCONJG(aqsmtemp(igp,n1))
1798          enddo
1799!$OMP END PARALLEL DO
1800
1801          ssxDi(iw) = ssxDi(iw) + ssxDittt
1802
1803        else
1804
1805          fact1 = (sig%dFreqGrid(ifreq+1)+wx)/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
1806          fact2 = (-sig%dFreqGrid(ifreq)-wx)/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
1807
1808          ssxDittt = 0D0
1809
1810!$OMP PARALLEL do private (my_igp,igp,indigp,ssxDitt,ig, &
1811!$OMP                       ssxDit) reduction(+:ssxDittt)
1812          do my_igp = 1, ngpown
1813            indigp = inv_igp_index(my_igp)
1814            igp = indinv(indigp)
1815
1816            if (igp .gt. ncouls .or. igp .le. 0) cycle
1817
1818            ssxDitt = (0D0,0D0)
1819            do ig = 1, ncouls
1820              ssxDit=CONJG(I_epsR_array(ig,my_igp,ifreq))*fact1 + &
1821                CONJG(I_epsR_array(ig,my_igp,ifreq+1))*fact2
1822              ssxDitt = ssxDitt + aqsntemp(ig,n1)*ssxDit
1823            enddo
1824            ssxDittt = ssxDittt + ssxDitt*vcoul(igp)*MYCONJG(aqsmtemp(igp,n1))
1825          enddo
1826!$OMP END PARALLEL DO
1827
1828          ssxDi(iw) = ssxDi(iw) + ssxDittt
1829
1830        endif
1831#endif
1832      endif
1833    enddo
1834    call timing%stop(timing%m_cor_ra_sx)
1835
1836    ! JRD: Now do CH term
1837    call timing%start(timing%m_cor_ra_ch)
1838    SAFE_ALLOCATE(schDt_array,(sig%Nfreq))
1839    schDt_array(:) = 0D0
1840!$OMP PARALLEL do private (my_igp,igp,indigp,ig,schDtt,I_epsRggp_int, &
1841!$OMP                      I_epsAggp_int,schD,schDt)
1842    do ifreq=1,sig%Nfreq
1843        schDt = (0D0,0D0)
1844
1845        do my_igp = 1, ngpown
1846          indigp = inv_igp_index(my_igp)
1847          igp = indinv(indigp)
1848
1849          if (igp .gt. ncouls .or. igp .le. 0) cycle
1850
1851! JRD: The below loop is performance critical
1852
1853          schDtt = (0D0,0D0)
1854          do ig = 1, ncouls
1855
1856            I_epsRggp_int = I_epsR_array(ig,my_igp,ifreq)
1857
1858#ifdef CPLX
1859            I_epsAggp_int = I_epsA_array(ig,my_igp,ifreq)
1860
1861            ! for G,G` components
1862            schD=I_epsRggp_int-I_epsAggp_int
1863
1864            ! for G`,G components
1865            schDtt = schDtt + aqsntemp(ig,n1)*schD
1866#else
1867            schD= CMPLX(IMAG(I_epsRggp_int),0.0d0)
1868            schDtt = schDtt + aqsntemp(ig,n1)*schD
1869#endif
1870          enddo
1871          schDt = schDt + schDtt*vcoul(igp)*MYCONJG(aqsmtemp(igp,n1))
1872        enddo
1873
1874! XXX: Threads could be stomping on each-other`s cache over this... try reduction?
1875        schDt_array(ifreq) = schDt
1876
1877    enddo
1878!$OMP END PARALLEL DO
1879    call timing%stop(timing%m_cor_ra_ch)
1880
1881    call timing%start(timing%m_cor_ra_sum)
1882!$OMP PARALLEL do private (ifreq,schDt,cedifft_zb,cedifft_coh,cedifft_cor, &
1883!$OMP                      cedifft_zb_right,cedifft_zb_left,schDt_right,schDt_left, &
1884!$OMP                      schDt_avg,schDt_lin,schDt_lin2,intfact,iw, &
1885!$OMP                      schDt_lin3) reduction(+:schDi,schDi_corb,schDi_cor,sch2Di)
1886    do ifreq=1,sig%Nfreq
1887
1888        schDt = schDt_array(ifreq)
1889
1890        cedifft_zb = sig%dFreqGrid(ifreq)
1891        cedifft_coh = CMPLX(cedifft_zb,0D0)- sig%dFreqBrd(ifreq)
1892
1893        if( flag_occ) then
1894          cedifft_cor = -1.0d0*CMPLX(cedifft_zb,0D0) - sig%dFreqBrd(ifreq)
1895        else
1896          cedifft_cor = CMPLX(cedifft_zb,0D0) - sig%dFreqBrd(ifreq)
1897        endif
1898
1899        if (ifreq .ne. 1) then
1900          cedifft_zb_right = cedifft_zb
1901          cedifft_zb_left = sig%dFreqGrid(ifreq-1)
1902          schDt_right = schDt
1903          schDt_left = schDt_array(ifreq-1)
1904          schDt_avg = 0.5D0 * ( schDt_right + schDt_left )
1905          schDt_lin = schDt_right - schDt_left
1906          schDt_lin2 = schDt_lin/(cedifft_zb_right-cedifft_zb_left)
1907        endif
1908
1909#ifdef CPLX
1910! The below two lines are for sigma1 and sigma3
1911        if (ifreq .ne. sig%Nfreq) then
1912          schDi(:) = schDi(:) - CMPLX(0.d0,pref(ifreq)) * schDt / ( wxi(:)-cedifft_coh)
1913          schDi_corb(:) = schDi_corb(:) - CMPLX(0.d0,pref(ifreq)) * schDt / ( wxi(:)-cedifft_cor)
1914        endif
1915        if (ifreq .ne. 1) then
1916          do iw = 1, sig%nfreqeval
1917!These lines are for sigma2
1918            intfact=abs((wxi(iw)-cedifft_zb_right)/(wxi(iw)-cedifft_zb_left))
1919            if (intfact .lt. 1d-4) intfact = 1d-4
1920            if (intfact .gt. 1d4) intfact = 1d4
1921            intfact = -log(intfact)
1922            sch2Di(iw) = sch2Di(iw) - CMPLX(0.d0,pref_zb) * schDt_avg * intfact
1923!These lines are for sigma4
1924            if (flag_occ) then
1925              intfact=abs((wxi(iw)+cedifft_zb_right)/(wxi(iw)+cedifft_zb_left))
1926              if (intfact .lt. 1d-4) intfact = 1d-4
1927              if (intfact .gt. 1d4) intfact = 1d4
1928              intfact = log(intfact)
1929              schDt_lin3 = (schDt_left + schDt_lin2*(-wxi(iw)-cedifft_zb_left))*intfact
1930            else
1931              schDt_lin3 = (schDt_left + schDt_lin2*(wxi(iw)-cedifft_zb_left))*intfact
1932            endif
1933            schDt_lin3 = schDt_lin3 + schDt_lin
1934            schDi_cor(iw) = schDi_cor(iw) - CMPLX(0.d0,pref_zb) * schDt_lin3
1935          enddo
1936        endif
1937#else
1938! The below two lines are for sigma1 and sigma3
1939        if (ifreq .ne. sig%Nfreq) then
1940          schDi(:) = schDi(:) + pref(ifreq)*schDt / ( wxi(:)-cedifft_coh)
1941          schDi_corb(:) = schDi_corb(:) + pref(ifreq)*schDt / ( wxi(:)-cedifft_cor)
1942        endif
1943        if (ifreq .ne. 1) then
1944          do iw = 1, sig%nfreqeval
1945!These lines are for sigma2
1946            intfact=abs((wxi(iw)-cedifft_zb_right)/(wxi(iw)-cedifft_zb_left))
1947            if (intfact .lt. 1d-4) intfact = 1d-4
1948            if (intfact .gt. 1d4) intfact = 1d4
1949            intfact = -log(intfact)
1950            sch2Di(iw) = sch2Di(iw) + pref_zb * schDt_avg * intfact
1951!These lines are for sigma4
1952            if (flag_occ) then
1953              intfact=abs((wxi(iw)+cedifft_zb_right)/(wxi(iw)+cedifft_zb_left))
1954              if (intfact .lt. 1d-4) intfact = 1d-4
1955              if (intfact .gt. 1d4) intfact = 1d4
1956              intfact = log(intfact)
1957              schDt_lin3 = (schDt_left + schDt_lin2*(-wxi(iw)-cedifft_zb_left))*intfact
1958            else
1959              schDt_lin3 = (schDt_left + schDt_lin2*(wxi(iw)-cedifft_zb_left))*intfact
1960            endif
1961            schDt_lin3 = schDt_lin3 + schDt_lin
1962            schDi_cor(iw) = schDi_cor(iw) + pref_zb * schDt_lin3
1963          enddo
1964        endif
1965#endif
1966    enddo
1967!$OMP END PARALLEL DO
1968    SAFE_DEALLOCATE(schDt_array)
1969    call timing%stop(timing%m_cor_ra_sum)
1970
1971    ! JRD: Compute Sigma2 and Sigma4 delta function contributions
1972    call timing%start(timing%m_cor_ra_ch2)
1973    do iw = 1, sig%nfreqeval
1974      wx = wxi(iw)
1975      if(wx .ge. 0.0d0) then
1976        ifreq=0
1977        do ijk = 1, sig%nFreq-1
1978          if (wx .ge. sig%dFreqGrid(ijk) .and. wx .lt. sig%dFreqGrid(ijk+1)) then
1979            ifreq=ijk
1980          endif
1981        enddo
1982        if (ifreq .eq. 0) then
1983          ifreq=sig%nFreq-1
1984        endif
1985
1986        fact1=(sig%dFreqGrid(ifreq+1)-wx)/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
1987        fact2=(wx-sig%dFreqGrid(ifreq))/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
1988
1989        schDttt = 0D0
1990        schDttt_cor = 0D0
1991
1992!$OMP PARALLEL do private (my_igp,igp,indigp,ig, &
1993!$OMP                      sch2Dt,sch2Dtt) reduction(+:schDttt,schDttt_cor)
1994        do my_igp = 1, ngpown
1995          indigp = inv_igp_index(my_igp)
1996          igp = indinv(indigp)
1997
1998          if (igp .gt. ncouls .or. igp .le. 0) cycle
1999
2000          sch2Dtt = (0D0,0D0)
2001          do ig = 1, ncouls
2002#ifdef CPLX
2003            sch2Dt=-0.5D0*((I_epsR_array(ig,my_igp,ifreq)-I_epsA_array(ig,my_igp,ifreq))*fact1 + &
2004                   (I_epsR_array(ig,my_igp,ifreq+1)-I_epsA_array(ig,my_igp,ifreq+1))*fact2)
2005#else
2006            sch2Dt = CMPLX(0D0,-1D0)* &
2007              (IMAG(I_epsR_array(ig,my_igp,ifreq))*fact1 + IMAG(I_epsR_array(ig,my_igp,ifreq+1))*fact2)
2008#endif
2009            sch2Dtt = sch2Dtt + aqsntemp(ig,n1)*sch2Dt
2010          enddo
2011          schDttt = schDttt + sch2Dtt*vcoul(igp)*MYCONJG(aqsmtemp(igp,n1))
2012          if (.not.flag_occ) then
2013            schDttt_cor = schDttt_cor + sch2Dtt*vcoul(igp)*MYCONJG(aqsmtemp(igp,n1))
2014          endif
2015        enddo
2016!$OMP END PARALLEL DO
2017
2018        sch2Di(iw) = sch2Di(iw) + schDttt
2019        schDi_cor(iw) = schDi_cor(iw) + schDttt_cor
2020      else if (flag_occ) then
2021        wx=-wx
2022        ifreq=0
2023        do ijk = 1, sig%nFreq-1
2024          if (wx .ge. sig%dFreqGrid(ijk) .and. wx .lt. sig%dFreqGrid(ijk+1)) then
2025            ifreq=ijk
2026          endif
2027        enddo
2028        if (ifreq .eq. 0) then
2029          ifreq=sig%nFreq-1
2030        endif
2031
2032        fact1=(sig%dFreqGrid(ifreq+1)-wx)/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
2033        fact2=(wx-sig%dFreqGrid(ifreq))/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
2034
2035        schDttt_cor = 0D0
2036
2037!$OMP PARALLEL do private (my_igp,igp,indigp,ig, &
2038!$OMP                      sch2Dt,sch2Dtt) reduction(+:schDttt_cor)
2039        do my_igp = 1, ngpown
2040          indigp = inv_igp_index(my_igp)
2041          igp = indinv(indigp)
2042
2043          if (igp .gt. ncouls .or. igp .le. 0) cycle
2044
2045          sch2Dtt = (0D0,0D0)
2046          do ig = 1, ncouls
2047#ifdef CPLX
2048            sch2Dt=-0.5D0*((I_epsR_array(ig,my_igp,ifreq)-I_epsA_array(ig,my_igp,ifreq))*fact1 + &
2049                   (I_epsR_array(ig,my_igp,ifreq+1)-I_epsA_array(ig,my_igp,ifreq+1))*fact2)
2050#else
2051            sch2Dt = CMPLX(0D0,-1D0)* &
2052              (IMAG(I_epsR_array(ig,my_igp,ifreq))*fact1 + IMAG(I_epsR_array(ig,my_igp,ifreq+1))*fact2)
2053#endif
2054            sch2Dtt = sch2Dtt + aqsntemp(ig,n1)*sch2Dt
2055          enddo
2056          schDttt_cor = schDttt_cor + sch2Dtt*vcoul(igp)*MYCONJG(aqsmtemp(igp,n1))
2057        enddo
2058!$OMP END PARALLEL DO
2059        schDi_cor(iw) = schDi_cor(iw) + schDttt_cor
2060      endif
2061    enddo
2062    call timing%stop(timing%m_cor_ra_ch2)
2063
2064    do iw = 1, sig%nfreqeval
2065      if (flag_occ) then
2066        asxDtemp(iw) = asxDtemp(iw) + ssxDi(iw)*occ
2067      endif
2068      achDtemp(iw) = achDtemp(iw) + schDi(iw)
2069      achDtemp_cor(iw) = achDtemp_cor(iw) + schDi_cor(iw)
2070      achDtemp_corb(iw) = achDtemp_corb(iw) + schDi_corb(iw)
2071      ach2Dtemp(iw) = ach2Dtemp(iw) + sch2Di(iw)
2072    ! JRD: This is now close to LDA
2073      if (iw.eq.iwlda) achtD_n1(n1true) = &
2074        achtD_n1(n1true) + schDi(iw)
2075    enddo ! over iw
2076777         format(1x,"WARNING: The real frequency range is too small." &
2077              ,/,3x,"l =",i3,1x,"n1 =",i5,1x,"E_l =",f8.3,1x,"E_n1" &
2078              ,1x,"=",f8.3,1x,"wx =",f8.3,1x,"E_max =",f8.3)
2079
2080    POP_SUB(mtxel_cor.sigma_ra)
2081
2082  end subroutine sigma_ra
2083
2084
2085!==============================================================================
2086!                             CONTOUR DEFORMATION
2087!==============================================================================
2088  !> Calculate Sigma matrix elements, full-freq / CONTOUR-DEFORMATION formalism
2089  subroutine sigma_cd()
2090    real(DP) :: imag_freqs(sig%nfreq_imag)
2091    SCALAR :: c0, c1, c2, c3, y1, y2, y3, m1, m2, mm(sig%nfreq_imag)
2092    real(DP) :: x1, x2, x3, dx
2093    SCALAR :: sW_imag_freqs(sig%nfreq_imag), sint(sig%nfreqeval)
2094    SCALAR :: sW_iomega, sW_iomega_acc
2095    complex(DPC) :: sres(sig%nfreqeval), sres_omega, sres_omega_acc
2096    integer :: occ_sign, ijk, ifreq_my_igp
2097    real(DP) :: fact1, fact2
2098    logical, save :: warned=.false.
2099
2100    ! FHJ: WARNING: we calc. the TO Sigma here, while RA/FF calculates Ret Sigma.
2101    ! Everything we implement here is Eqn. (14) from PRB 67, 155208 (2003).
2102    ! Note the apparent sign flip; that`s because we have (1-epsinv)*v instead
2103    ! of (epsinv-1)*v = W^c.
2104    !
2105    ! FHJ: Definitions (following PRB 67, 155208 notation):
2106    ! sW_imag_freqs(i \omega) = - \sum_{G, G`} M^*_{G} W^c_{G G`}(i \omega) M(G`)
2107    ! sint(\omega) = \frac{1}{\pi} \int_0^{\inf} d{\omega`}
2108    !   sW_imag_freqs(i \omega`) (\omega-Elk)/((\omega-Elk)^2 + (\omega`)^2)
2109    !
2110    ! Note: we actually evaluate sint(\omega) using the quadrature scheme
2111    ! from Fabien`s thesis.
2112
2113    PUSH_SUB(mtxel_cor.sigma_cd)
2114
2115    ! JRD: Find iw closest to e_lk
2116    diffmin = INF
2117    do iw=1,sig%nfreqeval
2118      diff = abs(freq0 + (iw-1)*sig%freqevalstep - e_lk)
2119      if (diff .lt. diffmin) then
2120        diffmin=diff
2121        iwlda=iw
2122      endif
2123    enddo
2124
2125    ! wxi = omega - e_n``k
2126    do iw=1,sig%nfreqeval
2127      wx = freq0 - e_n1kq + (iw-1)*sig%freqevalstep
2128      wxi(iw) = wx
2129    enddo
2130
2131    ! GSM: compute the static CH for the static remainder
2132    if (sig%exact_ch.eq.1) then
2133      call acc_static_ch(ngpown, ncouls, inv_igp_index, indinv, vcoul, &
2134        aqsntemp(:,n1), aqsmtemp(:,n1), achstemp, eps_cplx=I_epsR_array(:,:,1))
2135    endif
2136
2137    sres(:) = (0D0,0D0) ! Residue contrib.
2138    sint(:) = ZERO ! Integral contrib.
2139
2140    ! FHJ: residue contribution to the correlation self energy
2141    ! JRD: Don`t want to thread here, nfreqeval could be small
2142    call timing%start(timing%m_cor_cd_res)
2143    do iw=1,sig%nfreqeval
2144
2145      wx = wxi(iw)
2146      ! FHJ: need either wx>=0 and empty or wx<0 and occupied
2147      if ( (wx>=0.0d0) .eqv. flag_occ) cycle
2148      occ_sign = idint(sign(1d0, wx))
2149      ! FHJ: and from now on we can work on | omega - e_n``k |
2150      wx = abs(wx)
2151
2152      ifreq = 0
2153      do ijk = 1, nfreq_real-1
2154        if (wx>=sig%dFreqGrid(ijk) .and. wx<sig%dFreqGrid(ijk+1)) then
2155          ifreq = ijk
2156        endif
2157      enddo
2158      if (ifreq==0) then
2159        ifreq = nfreq_real+3 ! three is for luck
2160      endif
2161      if(ifreq>=nfreq_real) then
2162        if (.not.warned .and. peinf%pool_rank==0) then
2163          write(0,777) iband, n1true, e_lk, e_n1kq, wx, E_max
2164        endif
2165        warned = .true.
2166        ifreq = nfreq_real-1
2167      endif
2168
2169      sres_omega = 0D0
2170      if (nfreq_real>1) then
2171        fact1 = (sig%dFreqGrid(ifreq+1)-wx)/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
2172        fact2 = (wx-sig%dFreqGrid(ifreq))/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
2173      endif
2174
2175!$OMP PARALLEL do private (my_igp,igp,indigp,sres_omega_acc,ig) reduction(+:sres_omega)
2176      do my_igp = 1, ngpown
2177        indigp = inv_igp_index(my_igp)
2178        igp = indinv(indigp)
2179        if (igp>ncouls .or. igp<=0) cycle
2180
2181        sres_omega_acc = (0D0,0D0)
2182        ! JRD: The below loop is performance critical
2183        if (nfreq_real>1) then
2184          do ig = 1, ncouls
2185            sres_omega_acc = sres_omega_acc + aqsntemp(ig,n1)*( &
2186              I_epsR_array(ig,my_igp,ifreq)*fact1 + &
2187              I_epsR_array(ig,my_igp,ifreq+1)*fact2)
2188          enddo
2189        else
2190          do ig = 1, ncouls
2191            sres_omega_acc = sres_omega_acc + aqsntemp(ig,n1)*I_epsR_array(ig,my_igp,1)
2192          enddo
2193        endif
2194        sres_omega = sres_omega + MYCONJG(aqsmtemp(igp,n1))*sres_omega_acc*vcoul(igp)
2195      enddo
2196!$OMP END PARALLEL DO
2197
2198      sres(iw) = sres(iw) - occ_sign*sres_omega
2199
2200    enddo ! iw loop
2201    call timing%stop(timing%m_cor_cd_res)
2202
2203    ! JRD: Now do Integral contribution
2204    call timing%start(timing%m_cor_cd_int)
2205    sW_imag_freqs(:) = ZERO
2206!$OMP PARALLEL DO PRIVATE(igp,indigp,ig,sW_iomega_acc,ifreq,my_igp,ifreq_my_igp) &
2207!$OMP REDUCTION(+:sW_imag_freqs)
2208    do ifreq_my_igp = 0, sig%nfreq_imag*ngpown - 1
2209      ifreq = ifreq_my_igp/ngpown + 1
2210      my_igp = mod(ifreq_my_igp, ngpown) + 1
2211      indigp = inv_igp_index(my_igp)
2212      igp = indinv(indigp)
2213      if (igp>ncouls .or. igp<=0) cycle
2214
2215      sW_iomega_acc = ZERO
2216      ! JRD: The below loop is performance critical
2217      do ig = 1, ncouls
2218        sW_iomega_acc = sW_iomega_acc + aqsntemp(ig,n1)*I_eps_imag(ig,my_igp,ifreq)
2219      enddo
2220      sW_imag_freqs(ifreq) = sW_imag_freqs(ifreq) + &
2221        MYCONJG(aqsmtemp(igp,n1))*sW_iomega_acc*vcoul(igp)
2222    enddo
2223!$OMP END PARALLEL DO
2224    call timing%stop(timing%m_cor_cd_int)
2225
2226    call timing%start(timing%m_cor_cd_sum)
2227    imag_freqs(:) = IMAG(sig%dFreqGrid(nfreq_real+1:)+sig%dFreqBrd(nfreq_real+1:))
2228    select case (sig%cd_int_method)
2229
2230      case (0)
2231        ! FHJ: Integration from Fabien`s thesis; assumes that the matrix elements
2232        ! are piecewise constant functions, and perform the integral over each
2233        ! interval centered are the frequency evaluation point.
2234
2235        ! JRD: XXX Should center this integral....
2236        ! FHJ: What do you mean?
2237        ifreq = 1
2238        sW_iomega = sW_imag_freqs(ifreq)
2239        freqStart = imag_freqs(ifreq)
2240        freqEnd = (imag_freqs(ifreq) + imag_Freqs(ifreq+1)) * 0.5d0
2241        sint(:) = sint(:) + sW_iomega * ( atan(freqEnd/wxi(:)) - atan(freqStart/wxi(:)) )
2242
2243        !FHJ: no need to do OMP here, each trip is very short, and OMP can
2244        !hurt vectorization.
2245        do ifreq = 2, sig%nfreq_imag-1
2246            sW_iomega = sW_imag_freqs(ifreq)
2247            freqStart = (imag_freqs(ifreq-1) + imag_Freqs(ifreq)) * 0.5d0
2248            freqEnd = (imag_freqs(ifreq) + imag_Freqs(ifreq+1)) * 0.5d0
2249            sint(:) = sint(:) + sW_iomega * ( atan(freqEnd/wxi(:)) - atan(freqStart/wxi(:)) )
2250        enddo
2251
2252        sW_iomega = sW_imag_freqs(sig%nfreq_imag)
2253        freqStart = (imag_freqs(sig%nfreq_imag-1) + imag_Freqs(sig%nfreq_imag)) * 0.5d0
2254        freqEnd = (-imag_freqs(sig%nfreq_imag-1) + 3d0*imag_Freqs(sig%nfreq_imag)) * 0.5d0
2255        sint(:) = sint(:) + sW_iomega * ( atan(freqEnd/wxi(:)) - atan(freqStart/wxi(:)) )
2256
2257
2258      case (2)
2259        ! FHJ: Integration scheme: assume piecewise quadratic matrix elements
2260        ! on each integration segment
2261
2262        ! FHJ: First segment is special. TODO: assert that freqStart==0d0
2263        freqStart = imag_freqs(1)
2264        freqEnd = imag_freqs(2)
2265        c0 = sW_imag_freqs(1)
2266        c1 = 0d0
2267        c2 = (sW_imag_freqs(2) - c0) / (freqEnd-freqStart)**2
2268        sint(:) = sint(:) + &
2269          c2 * wxi(:) * (freqEnd-freqStart) + &
2270          (c0 - c2*wxi(:)**2) * (atan(freqEnd/wxi(:)) - atan(freqStart/wxi(:))) + &
2271          c1*wxi(:)*0.5d0*log((wxi(:)**2 + freqEnd**2)/(wxi(:)**2 + freqStart**2))
2272
2273        ! FHJ: other segments: find c0, c1 and c2 by using each previous point.
2274        ! This also works for complex y`s. This sum is over the starting points.
2275        do ifreq = 2, sig%nfreq_imag-1
2276          x1 = imag_freqs(ifreq-1)
2277          x2 = imag_freqs(ifreq)
2278          x3 = imag_freqs(ifreq+1)
2279          y1 = sW_imag_freqs(ifreq-1)
2280          y2 = sW_imag_freqs(ifreq)
2281          y3 = sW_imag_freqs(ifreq+1)
2282          c2 = ((y2-y1)*(x1-x3) + (y3-y1)*(x2-x1)) / &
2283               ((x1-x3)*(x2**2-x1**2) + (x2-x1)*(x3**2-x1**2))
2284          c1 = ((y2-y1) - c2*(x2**2-x1**2)) / (x2-x1)
2285          c0 = y1 - c2*x1**2 - c1*x1
2286          freqEnd = x3
2287          freqStart = x2
2288          sint(:) = sint(:) + &
2289            c2 * wxi(:) * (freqEnd-freqStart) + &
2290            (c0 - c2*wxi(:)**2) * (atan(freqEnd/wxi(:)) - atan(freqStart/wxi(:))) + &
2291            c1*wxi(:)*0.5d0*log((wxi(:)**2 + freqEnd**2)/(wxi(:)**2 + freqStart**2))
2292        enddo
2293
2294        ! FHJ: add Lorentzian tail? This is only a good idea is the last freq. is really large
2295        !freqStart = x3
2296        !AA = y3*x3**2
2297        !sint(:) = sint(:) + AA/(2d0*wxi(:)**2*freqStart)*( &
2298        !  2d0*wxi(:) - sign(1d0,wxi(:))*PI_D*freqStart + 2d0*freqStart*atan(freqStart/wxi(:)) )
2299
2300
2301      case (3)
2302        ! FHJ: Integration scheme: assume piecewise cubic matrix elements
2303        ! on each integration segment. Find cubic Hermite spline representation
2304        ! using a finite difference approximations for the derivatives
2305
2306        ! Estimate derivatives
2307        mm(1) = ZERO
2308        mm(2:sig%nfreq_imag-1) = &
2309          0.5d0*(sW_imag_freqs(3:sig%nfreq_imag) - sW_imag_freqs(2:sig%nfreq_imag-1)) / &
2310          ( imag_freqs(3:sig%nfreq_imag) - imag_freqs(2:sig%nfreq_imag-1) ) + &
2311          0.5d0*(sW_imag_freqs(2:sig%nfreq_imag-1) - sW_imag_freqs(1:sig%nfreq_imag-2)) / &
2312          ( imag_freqs(2:sig%nfreq_imag-1) - imag_freqs(1:sig%nfreq_imag-2) )
2313        mm(sig%nfreq_imag) = mm(sig%nfreq_imag-1) + &
2314          (imag_freqs(sig%nfreq_imag)-imag_freqs(sig%nfreq_imag-1)) * &
2315          (mm(sig%nfreq_imag-1)-mm(sig%nfreq_imag-2)) / &
2316          (imag_freqs(sig%nfreq_imag-1)-imag_freqs(sig%nfreq_imag-2))
2317
2318        ! FHJ: other segments: find c0, c1 and c2 by using each previous point.
2319        ! This also works for complex y`s. This sum is over the starting points.
2320        do ifreq = 1, sig%nfreq_imag-1
2321          x1 = imag_freqs(ifreq)
2322          x2 = imag_freqs(ifreq+1)
2323          y1 = sW_imag_freqs(ifreq)
2324          y2 = sW_imag_freqs(ifreq+1)
2325          dx = x2 - x1
2326
2327          c0 = -(m1*x1) - (2*m1*x1**2)/dx - (m2*x1**2)/dx - (m1*x1**3)/dx**2 - &
2328                (m2*x1**3)/dx**2 + y1 - (3*x1**2*y1)/dx**2 - (2*x1**3*y1)/dx**3 + &
2329                (3*x1**2*y2)/dx**2 + (2*x1**3*y2)/dx**3
2330          c1 =  m1 + (4*m1*x1)/dx + (2*m2*x1)/dx + (3*m1*x1**2)/dx**2 + &
2331                (3*m2*x1**2)/dx**2 + (6*x1*y1)/dx**2 + (6*x1**2*y1)/dx**3 - &
2332                (6*x1*y2)/dx**2 - (6*x1**2*y2)/dx**3
2333          c2 =  (-2*m1)/dx - m2/dx - (3*m1*x1)/dx**2 - (3*m2*x1)/dx**2 - &
2334                (3*y1)/dx**2 - (6*x1*y1)/dx**3 + (3*y2)/dx**2 + (6*x1*y2)/dx**3
2335          c3 = m1/dx**2 + m2/dx**2 + (2*y1)/dx**3 - (2*y2)/dx**3
2336
2337          sint(:) = sint(:) + ((wxi*dx)*(2*c2 + c3*(x1+x2)) - &
2338            2d0*(c0 - wxi**2*c2)*(atan(x1/wxi) - atan(x2/wxi)) + &
2339            wxi*(-c1 + wxi**2*c3)*log((wxi**2 + x1**2)/(wxi**2 + x2**2)))/2d0
2340        enddo
2341
2342      case default
2343        call die('Invalid integration method')
2344
2345    endselect !sig%cd_int_method
2346    call timing%stop(timing%m_cor_cd_sum)
2347
2348    do iw = 1, sig%nfreqeval
2349      !FHJ: the output accumulated arrays "asxtDyn" and "asxtDyn" actually store
2350      !the residue and integral contribution to the GW self energy for CD
2351      !calculations, respectively.
2352      asxDtemp(iw) = asxDtemp(iw) + sres(iw)
2353      achDtemp(iw) = achDtemp(iw) + sint(iw)/PI_D
2354      ! JRD: This is now close to LDA
2355      if (iw==iwlda) achtD_n1(n1true) = achtD_n1(n1true) + sint(iw)/PI_D
2356    enddo ! over iw
2357
2358777         format(1x,"WARNING: The real frequency range is too small." &
2359              ,/,3x,"l =",i3,1x,"n1 =",i5,1x,"E_l =",f8.3,1x,"E_n1" &
2360              ,1x,"=",f8.3,1x,"wx =",f8.3,1x,"E_max =",f8.3)
2361
2362    POP_SUB(mtxel_cor.sigma_cd)
2363
2364  end subroutine sigma_cd
2365
2366!==============================================================================
2367!                        SUBSPACE CONTOUR DEFORMATION
2368!==============================================================================
2369  !> Calculate Subspace Sigma matrix elements, full-freq / CONTOUR-DEFORMATION formalism
2370  subroutine sigma_cd_subspace()
2371    real(DP) :: imag_freqs(sig%nfreq_imag)
2372    SCALAR :: c0, c1, c2, c3, y1, y2, y3, m1, m2, mm(sig%nfreq_imag)
2373    real(DP) :: x1, x2, x3, dx
2374    SCALAR :: sW_imag_freqs(sig%nfreq_imag, peinf%ntband_max), sint(sig%nfreqeval)
2375    SCALAR :: sW_iomega, sW_iomega_acc
2376    complex(DPC) :: sres(sig%nfreqeval), sres_omega, sres_omega_acc
2377    integer :: occ_sign, ijk, ifreq_my_igp
2378    real(DP) :: fact1, fact2
2379    logical, save :: warned=.false.
2380
2381    integer :: n1
2382    SCALAR, allocatable :: tempz(:,:,:), achstemp_array(:)
2383    ! variables for static reminder
2384    SCALAR, ALLOCATABLE :: schs_array(:,:), eps_scalar(:,:), achs_n1(:)
2385    logical :: use_zgemm
2386    integer :: ipe_block, ipe_real, my_ib
2387    integer :: n1_start, n1_end, n1_size_actual
2388    integer :: n1_ipe
2389
2390    PUSH_SUB(mtxel_cor.sigma_cd_subspace)
2391
2392    ! check if we need to calculate the static reminder
2393    if (sig%exact_ch.eq.1) then
2394      call timing%start(timing%m_cor_remain)
2395      SAFE_ALLOCATE(schs_array, (my_Nb, peinf%ntband_dist(ipe)))
2396      SAFE_ALLOCATE(achs_n1, (peinf%ntband_dist(ipe)))
2397
2398!$OMP PARALLEL do private (n1)
2399      do n1 = 1, peinf%ntband_dist(ipe)
2400        achs_n1(n1) = ZERO
2401      end do
2402!$OMP END PARALLEL DO
2403
2404      if (my_Nb_start > 0) then
2405#ifdef CPLX
2406        call zgemm('t','n', my_Nb,  peinf%ntband_dist(ipe), Nb_tot, (-1D0,0D0), &
2407                   Re_eps_sub(:,:,1), Nb_tot, &
2408                   Msub_n_temp, Nb_tot, (0D0,0D0), &
2409                   schs_array, my_Nb)
2410#else
2411        SAFE_ALLOCATE(eps_scalar, (Nb_tot,my_Nb))
2412        eps_scalar(:,:) = SCALARIFY(Re_eps_sub(:,:,1))
2413        CALL dgemm('t','n',  my_Nb,  peinf%ntband_dist(ipe), Nb_tot, -1.0D+00, &
2414                   eps_scalar(:,:), Nb_tot, &
2415                   Msub_n_temp, Nb_tot, 0.0D+00, &
2416                   schs_array, my_Nb)
2417        SAFE_DEALLOCATE(eps_scalar)
2418#endif
2419
2420!$OMP PARALLEL do private (n1,my_ib) reduction(+:achs_n1)
2421        do n1 = 1, peinf%ntband_dist(ipe)
2422          do my_ib = 1, my_Nb
2423            achs_n1(n1) = achs_n1(n1) + MYCONJG(Msub_m_temp(my_ib+my_Nb_start-1,n1)) * &
2424                                        schs_array(my_ib,n1) * 0.5D0
2425          enddo
2426        end do
2427!$OMP END PARALLEL DO
2428      end if ! my_Nb_start > 0
2429
2430      ! correct wings, same flag as residual part (computed in the same loop later)
2431      ! note the sign is changed and the famous factor 0.5
2432      if(ipe-1 == peinf%pool_rank) then
2433        if(fix_wings_res) then
2434          do n1 = 1, peinf%ntband_dist(ipe)
2435            achs_n1(n1) = achs_n1(n1) - MYCONJG(aqsmtemp(wing_pos,n1)) * &
2436                                        SUM(wings_Re(1:ncouls,1,2) * aqsntemp(1:ncouls,n1)) * &
2437                                            vcoul(indinv(wing_pos)) * 0.5d0
2438            do ig = 1, ncouls
2439              achs_n1(n1) = achs_n1(n1) - MYCONJG(aqsmtemp(ig,n1)) * &
2440                                              wings_Re(ig,1,1) * &
2441                                              aqsntemp(wing_pos,n1) * &
2442                                              vcoul(indinv(ig)) * 0.5d0
2443            end do
2444          end do
2445        end if
2446      end if ! wings
2447
2448      do n1 = 1, peinf%ntband_dist(ipe)
2449        n1true = peinf%indext_dist(n1,ipe)
2450        achtcor_n1(n1true) = achtcor_n1(n1true) - 0.5d0 * achs_n1(n1)
2451      end do
2452
2453      SAFE_DEALLOCATE(schs_array)
2454      SAFE_DEALLOCATE(achs_n1)
2455
2456      call timing%stop(timing%m_cor_remain)
2457    end if
2458
2459    ! residual contribution
2460    do n1 = 1, peinf%ntband_dist(ipe)
2461
2462      ! n1true = "True" band index of the band n1 w.r.t. all bands
2463      n1true = peinf%indext_dist(n1,ipe)
2464      ! energy of the |n1,k-q> state
2465      e_n1kq = wfnkq%ekq(n1true,ispin)
2466      ! occupation of the |n1,k-q> state
2467      flag_occ = (n1true<=(sig%nvband+sig%ncrit)) &
2468        .and.((sig%ncrit==0).or.(e_n1kq<=(sig%efermi+sig%tol)))
2469      if (abs(e_n1kq-sig%efermi)<sig%tol) then
2470        occ = 0.5d0 ! Fermi-Dirac distribution = 1/2 at Fermi level
2471      else
2472        occ = 1.0d0
2473      endif
2474
2475      ! JRD: Find iw closest to e_lk
2476      diffmin = INF
2477      do iw=1,sig%nfreqeval
2478        diff = abs(freq0 + (iw-1)*sig%freqevalstep - e_lk)
2479        if (diff .lt. diffmin) then
2480          diffmin=diff
2481          iwlda=iw
2482        endif
2483      enddo
2484
2485      ! wxi = omega - e_n``k
2486      do iw=1,sig%nfreqeval
2487        wx = freq0 - e_n1kq + (iw-1)*sig%freqevalstep
2488        wxi(iw) = wx
2489      enddo
2490
2491      ! FHJ: residue contribution to the correlation self energy
2492      ! JRD: Don`t want to thread here, nfreqeval could be small
2493      call timing%start(timing%m_cor_cd_res)
2494      sres(:) = (0D0,0D0) ! Residue contrib.
2495      do iw=1,sig%nfreqeval
2496
2497        wx = wxi(iw)
2498        ! FHJ: need either wx>=0 and empty or wx<0 and occupied
2499        if ( (wx>=0.0d0) .eqv. flag_occ) cycle
2500        occ_sign = idint(sign(1d0, wx))
2501        ! FHJ: and from now on we can work on | omega - e_n``k |
2502        wx = abs(wx)
2503
2504        ifreq = 0
2505        do ijk = 1, nfreq_real-1
2506          if (wx>=sig%dFreqGrid(ijk) .and. wx<sig%dFreqGrid(ijk+1)) then
2507            ifreq = ijk
2508          endif
2509        enddo
2510        if (ifreq==0) then
2511          ifreq = nfreq_real+3 ! three is for luck
2512        endif
2513        if(ifreq>=nfreq_real) then
2514          if (.not.warned .and. peinf%pool_rank==0) then
2515            write(0,777) iband, n1true, e_lk, e_n1kq, wx, E_max
2516          endif
2517          warned = .true.
2518          ifreq = nfreq_real-1
2519        endif
2520
2521        sres_omega = 0D0
2522        if (nfreq_real>1) then
2523          fact1 = (sig%dFreqGrid(ifreq+1)-wx)/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
2524          fact2 = (wx-sig%dFreqGrid(ifreq))/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
2525        endif
2526
2527        if (my_Nb_start > 0) then
2528
2529!$OMP PARALLEL do private (my_ib, sres_omega_acc, ig) &
2530!$OMP          reduction(+:sres_omega)
2531          do my_ib = 1, my_Nb
2532            sres_omega_acc = (0D0,0D0)
2533
2534            ! JRD: The below loop is performance critical
2535            if (nfreq_real>1) then
2536              do ig = 1, Nb_tot
2537                sres_omega_acc = sres_omega_acc + Msub_n_temp(ig,n1)*(&
2538                     Re_eps_sub(ig,my_ib,ifreq)*fact1 + &
2539                     Re_eps_sub(ig,my_ib,ifreq+1)*fact2)
2540              end do
2541            else
2542              do ig = 1, Nb_tot
2543                sres_omega_acc = sres_omega_acc + Msub_n_temp(ig,n1)*Re_eps_sub(ig,my_ib,1)
2544              end do
2545            end if
2546
2547            sres_omega = sres_omega + MYCONJG(Msub_m_temp(my_ib+my_Nb_start-1,n1))*sres_omega_acc
2548          end do
2549!$OMP END PARALLEL DO
2550
2551        end if ! my_Nb_start > 0
2552
2553        if(ipe-1 == peinf%pool_rank) then
2554          if(fix_wings_res) then
2555            !XXXX if (nfreq_real>1) then
2556            !XXXX   sres_omega = sres_omega + MYCONJG(aqsmtemp(wing_pos,n1)) * &
2557            !XXXX                             SUM((wings_Re(1:ncouls,ifreq,2)*fact1 + &
2558            !XXXX                                  wings_Re(1:ncouls,ifreq+1,2)*fact2) * aqsntemp(1:ncouls,n1)) * &
2559            !XXXX                             vcoul(indinv(wing_pos))
2560            !XXXX   do ig = 1, ncouls
2561            !XXXX     sres_omega = sres_omega + MYCONJG(aqsmtemp(ig,n1)) * &
2562            !XXXX                               (wings_Re(ig,ifreq,1)*fact1 + &
2563            !XXXX                                wings_Re(ig,ifreq+1,1)*fact2) * aqsntemp(wing_pos,n1) * &
2564            !XXXX                               vcoul(indinv(ig))
2565            !XXXX   end do
2566            !XXXX else
2567            !XXXX   sres_omega = sres_omega + MYCONJG(aqsmtemp(wing_pos,n1)) * &
2568            !XXXX                             SUM(wings_Re(1:ncouls,1,2) * aqsntemp(1:ncouls,n1)) * &
2569            !XXXX                             vcoul(indinv(wing_pos))
2570            !XXXX   do ig = 1, ncouls
2571            !XXXX     sres_omega = sres_omega + MYCONJG(aqsmtemp(ig,n1)) * &
2572            !XXXX                               wings_Re(ig,1,1) * &
2573            !XXXX                               aqsntemp(wing_pos,n1) * &
2574            !XXXX                               vcoul(indinv(ig))
2575            !XXXX   end do
2576            !XXXX end if
2577            sres_omega_acc = (0D0,0D0)
2578            if (nfreq_real>1) then
2579!$OMP PARALLEL do private (ig) &
2580!$OMP          reduction(+:sres_omega_acc)
2581               do ig = 1, ncouls
2582                 sres_omega_acc = sres_omega_acc + MYCONJG(aqsmtemp(wing_pos,n1)) * &
2583                                           (wings_Re(ig,ifreq,2)*fact1 + &
2584                                            wings_Re(ig,ifreq+1,2)*fact2) * aqsntemp(ig,n1) * &
2585                                            vcoul(indinv(wing_pos))
2586                 sres_omega_acc = sres_omega_acc + MYCONJG(aqsmtemp(ig,n1)) * &
2587                                           (wings_Re(ig,ifreq,1)*fact1 + &
2588                                            wings_Re(ig,ifreq+1,1)*fact2) * aqsntemp(wing_pos,n1) * &
2589                                            vcoul(indinv(ig))
2590               end do
2591!$END OMP PARALLEL DO
2592            else
2593!$OMP PARALLEL do private (ig) &
2594!$OMP          reduction(+:sres_omega_acc)
2595               do ig = 1, ncouls
2596                 sres_omega_acc = sres_omega_acc + MYCONJG(aqsmtemp(wing_pos,n1)) * &
2597                                            wings_Re(ig,1,2) * aqsntemp(ig,n1) * &
2598                                            vcoul(indinv(wing_pos))
2599                 sres_omega_acc = sres_omega_acc + MYCONJG(aqsmtemp(ig,n1)) * &
2600                                            wings_Re(ig,1,1) * aqsntemp(wing_pos,n1) * &
2601                                            vcoul(indinv(ig))
2602               end do
2603!$END OMP PARALLEL DO
2604            end if
2605            sres_omega = sres_omega + sres_omega_acc
2606          end if
2607        end if ! fix tail-wings
2608
2609        sres(iw) = sres(iw) - occ_sign*sres_omega
2610
2611      enddo ! iw loop
2612      call timing%stop(timing%m_cor_cd_res)
2613
2614      do iw = 1, sig%nfreqeval
2615        !FHJ: the output accumulated arrays "asxtDyn" and "asxtDyn" actually store
2616        !the residue and integral contribution to the GW self energy for CD
2617        !calculations, respectively.
2618        asxDtemp(iw) = asxDtemp(iw) + sres(iw)
2619      enddo ! over iw
2620
2621    end do ! n1 RESIDUAL PART
2622    if(ipe-1 == peinf%pool_rank) fix_wings_res = .false.
2623
2624    ! integral part
2625!$OMP PARALLEL do private (n1,ifreq)
2626      do n1 = 1, peinf%ntband_max
2627        do ifreq = 1, sig%nfreq_imag
2628          sW_imag_freqs(ifreq,n1) = ZERO
2629        end do
2630      end do
2631!$OMP END PARALLEL DO
2632
2633    if(my_Nb_start > 0) then
2634
2635      call timing%start(timing%m_cor_cd_int)
2636      ! go with ZGEMM
2637      SAFE_ALLOCATE(tempz, (my_Nb, sig%nfreq_imag, peinf%ntband_dist(ipe)))
2638      !
2639      call timing%start(timing%m_cor_cd_gemm)
2640#ifdef CPLX
2641      call zgemm('t','n', my_Nb*sig%nfreq_imag,  peinf%ntband_dist(ipe), Nb_tot, &
2642                 (1D0,0D0), Im_eps_sub, Nb_tot, &
2643                            Msub_n_temp, Nb_tot,&
2644                 (0D0,0D0), tempz, my_Nb*sig%nfreq_imag)
2645#else
2646      call dgemm('t','n', my_Nb*sig%nfreq_imag,  peinf%ntband_dist(ipe), Nb_tot, &
2647                 1.0D0, Im_eps_sub, Nb_tot, &
2648                        Msub_n_temp, Nb_tot,&
2649                 0.0D0, tempz, my_Nb*sig%nfreq_imag)
2650
2651#endif
2652     call timing%stop(timing%m_cor_cd_gemm)
2653
2654!$OMP PARALLEL do private (n1,my_ib,my_igp,igp,indigp,sW_iomega_acc,ifreq) &
2655!$OMP          reduction(+:sW_imag_freqs)
2656      do n1 = 1, peinf%ntband_dist(ipe)
2657        do ifreq = 1, sig%nfreq_imag
2658          sW_iomega_acc = ZERO
2659          do my_ib = 1, my_Nb
2660            sW_iomega_acc = sW_iomega_acc + &
2661                MYCONJG(Msub_m_temp(my_Nb_start+my_ib-1,n1))*tempz(my_ib,ifreq,n1)
2662          end do
2663          sW_imag_freqs(ifreq,n1) = sW_imag_freqs(ifreq,n1) + sW_iomega_acc
2664        end do
2665      end do
2666!$OMP END PARALLEL DO
2667
2668      SAFE_DEALLOCATE(tempz)
2669    end if ! my_Nb_start > 0
2670
2671    ! fix wings
2672    if(ipe-1 == peinf%pool_rank) then
2673      if(fix_wings) then
2674!$OMP PARALLEL do private (n1,ifreq)  collapse(2)
2675        do n1 = 1, peinf%ntband_dist(ipe)
2676          do ifreq = 1, sig%nfreq_imag
2677            sW_imag_freqs(ifreq,n1) = sW_imag_freqs(ifreq,n1) + &
2678                                      MYCONJG(aqsmtemp(wing_pos,n1)) * &
2679                                      SUM(wings_Im(1:ncouls,ifreq,2) * aqsntemp(1:ncouls,n1)) * &
2680                                      vcoul(indinv(wing_pos))
2681          end do
2682        end do
2683!$OMP END PARALLEL DO
2684
2685!$OMP PARALLEL do private (n1,ifreq,ig)  collapse(2)
2686        do n1 = 1, peinf%ntband_dist(ipe)
2687          do ifreq = 1, sig%nfreq_imag
2688            do ig = 1, ncouls
2689            sW_imag_freqs(ifreq,n1) = sW_imag_freqs(ifreq,n1) + &
2690                                      MYCONJG(aqsmtemp(ig,n1)) * &
2691                                      wings_Im(ig,ifreq,1) * aqsntemp(wing_pos,n1) * &
2692                                      vcoul(indinv(ig))
2693            end do
2694          end do
2695        end do
2696!$OMP END PARALLEL DO
2697
2698        fix_wings = .false.
2699      end if
2700    end if
2701    call timing%stop(timing%m_cor_cd_int)
2702
2703    call timing%start(timing%m_cor_cd_sum)
2704    ! calculate integral contribution
2705    do n1 = 1, peinf%ntband_dist(ipe)
2706
2707      ! n1true = "True" band index of the band n1 w.r.t. all bands
2708      n1true = peinf%indext_dist(n1,ipe)
2709      ! energy of the |n1,k-q> state
2710      e_n1kq = wfnkq%ekq(n1true,ispin)
2711      ! occupation of the |n1,k-q> state
2712      flag_occ = (n1true<=(sig%nvband+sig%ncrit)) &
2713        .and.((sig%ncrit==0).or.(e_n1kq<=(sig%efermi+sig%tol)))
2714      if (abs(e_n1kq-sig%efermi)<sig%tol) then
2715        occ = 0.5d0 ! Fermi-Dirac distribution = 1/2 at Fermi level
2716      else
2717        occ = 1.0d0
2718      endif
2719
2720      ! FHJ: CH term used for static remainder for current band "n1true".
2721      achstemp = ZERO
2722
2723      ! JRD: Find iw closest to e_lk
2724      diffmin = INF
2725      do iw=1,sig%nfreqeval
2726        diff = abs(freq0 + (iw-1)*sig%freqevalstep - e_lk)
2727        if (diff .lt. diffmin) then
2728          diffmin=diff
2729          iwlda=iw
2730        endif
2731      enddo
2732
2733      ! wxi = omega - e_n``k
2734      do iw=1,sig%nfreqeval
2735        wx = freq0 - e_n1kq + (iw-1)*sig%freqevalstep
2736        wxi(iw) = wx
2737      enddo
2738
2739      sint(:) = ZERO ! Integral contrib.
2740      imag_freqs(:) = IMAG(sig%dFreqGrid(nfreq_real+1:)+sig%dFreqBrd(nfreq_real+1:))
2741      select case (sig%cd_int_method)
2742
2743        case (0)
2744          ! FHJ: Integration from Fabien`s thesis; assumes that the matrix elements
2745          ! are piecewise constant functions, and perform the integral over each
2746          ! interval centered are the frequency evaluation point.
2747
2748          ! JRD: XXX Should center this integral....
2749          ! FHJ: What do you mean?
2750          ! MDB: explicit OMP parallelization over the sigma grid index
2751
2752!$OMP PARALLEL DO private (iw,ifreq,sW_iomega,freqStart,freqEnd) &
2753!$OMP             reduction(+:sint)
2754          do iw=1,sig%nfreqeval
2755            ifreq = 1
2756            sW_iomega = sW_imag_freqs(ifreq,n1)
2757            freqStart = imag_freqs(ifreq)
2758            freqEnd = (imag_freqs(ifreq) + imag_Freqs(ifreq+1)) * 0.5d0
2759
2760            !XXX sint(:) = sint(:) + sW_iomega * ( atan(freqEnd/wxi(:)) - atan(freqStart/wxi(:)) )
2761            sint(iw) = sint(iw) + sW_iomega * ( atan(freqEnd/wxi(iw)) - atan(freqStart/wxi(iw)) )
2762
2763            !FHJ: no need to do OMP here, each trip is very short, and OMP can
2764            !hurt vectorization.
2765            do ifreq = 2, sig%nfreq_imag-1
2766                sW_iomega = sW_imag_freqs(ifreq,n1)
2767                freqStart = (imag_freqs(ifreq-1) + imag_Freqs(ifreq)) * 0.5d0
2768                freqEnd = (imag_freqs(ifreq) + imag_Freqs(ifreq+1)) * 0.5d0
2769
2770                !XXXX sint(:) = sint(:) + sW_iomega * ( atan(freqEnd/wxi(:)) - atan(freqStart/wxi(:)) )
2771                sint(iw) = sint(iw) + sW_iomega * ( atan(freqEnd/wxi(iw)) - atan(freqStart/wxi(iw)) )
2772
2773            enddo
2774
2775            sW_iomega = sW_imag_freqs(sig%nfreq_imag,n1)
2776            freqStart = (imag_freqs(sig%nfreq_imag-1) + imag_Freqs(sig%nfreq_imag)) * 0.5d0
2777            freqEnd = (-imag_freqs(sig%nfreq_imag-1) + 3d0*imag_Freqs(sig%nfreq_imag)) * 0.5d0
2778
2779            !XXXX sint(:) = sint(:) + sW_iomega * ( atan(freqEnd/wxi(:)) - atan(freqStart/wxi(:)) )
2780            sint(iw) = sint(iw) + sW_iomega * ( atan(freqEnd/wxi(iw)) - atan(freqStart/wxi(iw)) )
2781
2782          end do  ! iw
2783!$OMP END PARALLEL DO
2784
2785        case (2)
2786          ! FHJ: Integration scheme: assume piecewise quadratic matrix elements
2787          ! on each integration segment
2788
2789          ! FHJ: First segment is special. TODO: assert that freqStart==0d0
2790          freqStart = imag_freqs(1)
2791          freqEnd = imag_freqs(2)
2792          c0 = sW_imag_freqs(1,n1)
2793          c1 = 0d0
2794          c2 = (sW_imag_freqs(2,n1) - c0) / (freqEnd-freqStart)**2
2795          sint(:) = sint(:) + &
2796            c2 * wxi(:) * (freqEnd-freqStart) + &
2797            (c0 - c2*wxi(:)**2) * (atan(freqEnd/wxi(:)) - atan(freqStart/wxi(:))) + &
2798            c1*wxi(:)*0.5d0*log((wxi(:)**2 + freqEnd**2)/(wxi(:)**2 + freqStart**2))
2799
2800          ! FHJ: other segments: find c0, c1 and c2 by using each previous point.
2801          ! This also works for complex y`s. This sum is over the starting points.
2802          do ifreq = 2, sig%nfreq_imag-1
2803            x1 = imag_freqs(ifreq-1)
2804            x2 = imag_freqs(ifreq)
2805            x3 = imag_freqs(ifreq+1)
2806            y1 = sW_imag_freqs(ifreq-1,n1)
2807            y2 = sW_imag_freqs(ifreq,n1)
2808            y3 = sW_imag_freqs(ifreq+1,n1)
2809            c2 = ((y2-y1)*(x1-x3) + (y3-y1)*(x2-x1)) / &
2810                 ((x1-x3)*(x2**2-x1**2) + (x2-x1)*(x3**2-x1**2))
2811            c1 = ((y2-y1) - c2*(x2**2-x1**2)) / (x2-x1)
2812            c0 = y1 - c2*x1**2 - c1*x1
2813            freqEnd = x3
2814            freqStart = x2
2815            sint(:) = sint(:) + &
2816              c2 * wxi(:) * (freqEnd-freqStart) + &
2817              (c0 - c2*wxi(:)**2) * (atan(freqEnd/wxi(:)) - atan(freqStart/wxi(:))) + &
2818              c1*wxi(:)*0.5d0*log((wxi(:)**2 + freqEnd**2)/(wxi(:)**2 + freqStart**2))
2819          enddo
2820
2821          ! FHJ: add Lorentzian tail? This is only a good idea is the last freq. is really large
2822          !freqStart = x3
2823          !AA = y3*x3**2
2824          !sint(:) = sint(:) + AA/(2d0*wxi(:)**2*freqStart)*( &
2825          !  2d0*wxi(:) - sign(1d0,wxi(:))*PI_D*freqStart + 2d0*freqStart*atan(freqStart/wxi(:)) )
2826
2827
2828        case (3)
2829          ! FHJ: Integration scheme: assume piecewise cubic matrix elements
2830          ! on each integration segment. Find cubic Hermite spline representation
2831          ! using a finite difference approximations for the derivatives
2832
2833          ! Estimate derivatives
2834          mm(1) = ZERO
2835          mm(2:sig%nfreq_imag-1) = &
2836            0.5d0*(sW_imag_freqs(3:sig%nfreq_imag,n1) - sW_imag_freqs(2:sig%nfreq_imag-1,n1)) / &
2837            ( imag_freqs(3:sig%nfreq_imag) - imag_freqs(2:sig%nfreq_imag-1) ) + &
2838            0.5d0*(sW_imag_freqs(2:sig%nfreq_imag-1,n1) - sW_imag_freqs(1:sig%nfreq_imag-2,n1)) / &
2839            ( imag_freqs(2:sig%nfreq_imag-1) - imag_freqs(1:sig%nfreq_imag-2) )
2840          mm(sig%nfreq_imag) = mm(sig%nfreq_imag-1) + &
2841            (imag_freqs(sig%nfreq_imag)-imag_freqs(sig%nfreq_imag-1)) * &
2842            (mm(sig%nfreq_imag-1)-mm(sig%nfreq_imag-2)) / &
2843            (imag_freqs(sig%nfreq_imag-1)-imag_freqs(sig%nfreq_imag-2))
2844
2845          ! FHJ: other segments: find c0, c1 and c2 by using each previous point.
2846          ! This also works for complex y`s. This sum is over the starting points.
2847          do ifreq = 1, sig%nfreq_imag-1
2848            x1 = imag_freqs(ifreq)
2849            x2 = imag_freqs(ifreq+1)
2850            y1 = sW_imag_freqs(ifreq,n1)
2851            y2 = sW_imag_freqs(ifreq+1,n1)
2852            dx = x2 - x1
2853
2854            c0 = -(m1*x1) - (2*m1*x1**2)/dx - (m2*x1**2)/dx - (m1*x1**3)/dx**2 - &
2855                  (m2*x1**3)/dx**2 + y1 - (3*x1**2*y1)/dx**2 - (2*x1**3*y1)/dx**3 + &
2856                  (3*x1**2*y2)/dx**2 + (2*x1**3*y2)/dx**3
2857            c1 =  m1 + (4*m1*x1)/dx + (2*m2*x1)/dx + (3*m1*x1**2)/dx**2 + &
2858                  (3*m2*x1**2)/dx**2 + (6*x1*y1)/dx**2 + (6*x1**2*y1)/dx**3 - &
2859                  (6*x1*y2)/dx**2 - (6*x1**2*y2)/dx**3
2860            c2 =  (-2*m1)/dx - m2/dx - (3*m1*x1)/dx**2 - (3*m2*x1)/dx**2 - &
2861                  (3*y1)/dx**2 - (6*x1*y1)/dx**3 + (3*y2)/dx**2 + (6*x1*y2)/dx**3
2862            c3 = m1/dx**2 + m2/dx**2 + (2*y1)/dx**3 - (2*y2)/dx**3
2863
2864            sint(:) = sint(:) + ((wxi*dx)*(2*c2 + c3*(x1+x2)) - &
2865              2d0*(c0 - wxi**2*c2)*(atan(x1/wxi) - atan(x2/wxi)) + &
2866              wxi*(-c1 + wxi**2*c3)*log((wxi**2 + x1**2)/(wxi**2 + x2**2)))/2d0
2867          enddo
2868
2869        case default
2870          call die('Invalid integration method')
2871
2872      endselect !sig%cd_int_method
2873      call timing%stop(timing%m_cor_cd_sum)
2874
2875      do iw = 1, sig%nfreqeval
2876        !FHJ: the output accumulated arrays "asxtDyn" and "asxtDyn" actually store
2877        !the residue and integral contribution to the GW self energy for CD
2878        !calculations, respectively.
2879        achDtemp(iw) = achDtemp(iw) + sint(iw)/PI_D
2880        ! JRD: This is now close to LDA
2881        if (iw==iwlda) achtD_n1(n1true) = achtD_n1(n1true) + sint(iw)/PI_D
2882      enddo ! over iw
2883
2884    end do ! n1
2885
2886777         format(1x,"WARNING: The real frequency range is too small." &
2887              ,/,3x,"l =",i3,1x,"n1 =",i5,1x,"E_l =",f8.3,1x,"E_n1" &
2888              ,1x,"=",f8.3,1x,"wx =",f8.3,1x,"E_max =",f8.3)
2889
2890    POP_SUB(mtxel_cor.sigma_cd_subspace)
2891
2892  end subroutine sigma_cd_subspace
2893
2894end subroutine mtxel_cor
2895
2896!> Accumulates the partial static CH as a sum over all bands. We use this to
2897!! calculate the static remainder later on as 1/2 of the difference between the
2898!! exact static CH and the partial static CH obtained here.
2899subroutine acc_static_ch(ngpown, ncouls, inv_igp_index, indinv, vcoul, &
2900  aqsn_n1, aqsm_n1, achs, eps_scalar, eps_cplx)
2901  integer, intent(in) :: ngpown, ncouls, inv_igp_index(:), indinv(:)
2902  real(DP), intent(in) :: vcoul(:)
2903  SCALAR, intent(in) :: aqsn_n1(:), aqsm_n1(:)
2904  SCALAR, intent(inout) :: achs
2905  SCALAR, intent(in), optional :: eps_scalar(:,:)
2906  complex(DPC), intent(in), optional :: eps_cplx(:,:)
2907
2908  SCALAR :: schs
2909  integer :: my_igp, igp, indigp, ig
2910
2911  PUSH_SUB(acc_static_ch)
2912
2913  if (present(eps_scalar).eqv.present(eps_cplx)) &
2914    call die('Internal error calling acc_static_ch', only_root_writes=.true.)
2915
2916  call timing%start(timing%m_cor_remain)
2917!$OMP PARALLEL do private (my_igp,indigp,igp,ig,schs) reduction(+:achs)
2918  do my_igp = 1, ngpown
2919    indigp = inv_igp_index(my_igp)
2920    igp = indinv(indigp)
2921    if (igp>ncouls .or. igp<=0) cycle
2922
2923    schs = ZERO
2924    if (present(eps_scalar)) then
2925      do ig = 1, ncouls
2926        schs = schs - aqsn_n1(ig)*eps_scalar(ig,my_igp)
2927      enddo
2928    else
2929      do ig = 1, ncouls
2930        schs = schs - aqsn_n1(ig)*SCALARIFY(eps_cplx(ig,my_igp))
2931      enddo
2932    endif
2933    achs = achs + MYCONJG(aqsm_n1(igp))*schs*vcoul(igp)*0.5D0
2934  enddo
2935!$OMP END PARALLEL DO
2936  call timing%stop(timing%m_cor_remain)
2937
2938  POP_SUB(acc_static_ch)
2939
2940end subroutine acc_static_ch
2941
2942end module mtxel_cor_m
2943
2944