1! Copyrigh(C) 2005-2018 Quantum ESPRESSO group
2! This file is distributed under the terms of the
3! GNU General Public License. See the file `License'
4! in the root directory of the present distribution,
5! or http://www.gnu.org/copyleft/gpl.txt .
6!
7!-----------------------------------------------------------------------------
8MODULE exx
9  !-----------------------------------------------------------------------------
10  !! Variables and subroutines for calculation of exact-exchange contribution.
11  !! Implements ACE: Lin Lin, J. Chem. Theory Comput. 2016, 12, 2242.
12  !! Contains code for band parallelization over pairs of bands: see T. Barnes,
13  !! T. Kurth, P. Carrier, N. Wichmann, D. Prendergast, P.R.C. Kent, J. Deslippe
14  !! Computer Physics Communications 2017, dx.doi.org/10.1016/j.cpc.2017.01.008.
15  !
16  USE kinds,                ONLY : DP
17  USE noncollin_module,     ONLY : noncolin, npol
18  USE io_global,            ONLY : ionode, stdout
19  !
20  USE control_flags,        ONLY : gamma_only, tqr
21  USE fft_types,            ONLY : fft_type_descriptor
22  USE stick_base,           ONLY : sticks_map, sticks_map_deallocate
23  !
24  IMPLICIT NONE
25  !
26  SAVE
27  !
28  ! ... general purpose vars
29  !
30  REAL(DP):: exxalfa=0._DP
31  !! the parameter multiplying the exact-exchange part
32  REAL(DP), ALLOCATABLE :: x_occupation(:,:)
33  !! the weights of auxiliary functions in the density matrix
34  INTEGER :: x_nbnd_occ
35  !! number of bands of auxiliary functions with at least
36  !! some x_occupation > eps_occ
37  REAL(DP), PARAMETER :: eps_occ = 1.d-8
38  !! occupation threshold
39  COMPLEX(DP), ALLOCATABLE :: exxbuff(:,:,:)
40  !! Buffers: temporary (complex) buffer for wfc storage
41  REAL(DP), ALLOCATABLE :: locbuff(:,:,:)
42  !! temporary (real) buffer for wfc storage
43  REAL(DP), ALLOCATABLE :: locmat(:,:,:)
44  !! buffer for matrix of localization integrals
45  REAL(DP), ALLOCATABLE :: exxmat(:,:,:,:)
46  !! buffer for matrix of localization integrals (K)
47  !
48#if defined(__USE_INTEL_HBM_DIRECTIVES)
49!DIR$ ATTRIBUTES FASTMEM :: exxbuff
50#elif defined(__USE_CRAY_HBM_DIRECTIVES)
51!DIR$ memory(bandwidth) exxbuff
52#endif
53  !
54  LOGICAL :: use_ace
55  !! if .TRUE. use Lin Lin's ACE method, if .FALSE. do not use ACE,
56  !! use old algorithm instead
57  COMPLEX(DP), ALLOCATABLE :: xi(:,:,:)
58  !! ACE projectors
59  COMPLEX(DP), ALLOCATABLE :: evc0(:,:,:)
60  !! old wfc (G-space) needed to compute fock3
61  INTEGER :: nbndproj
62  !!
63  LOGICAL :: domat
64  !!
65  REAL(DP)::  local_thr
66  !! threshold for Lin Lin's SCDM localized orbitals: discard
67  !! contribution to V_x if overlap between localized orbitals
68  !! is smaller than "local_thr".
69  !
70  ! ... energy related variables
71  !
72  REAL(DP) :: fock0 = 0.0_DP
73  !! sum <old|Vx(old)|old>
74  REAL(DP) :: fock1 = 0.0_DP
75  !! sum <new|vx(old)|new>
76  REAL(DP) :: fock2 = 0.0_DP
77  !! sum <new|vx(new)|new>
78  REAL(DP) :: fock3 = 0.0_DP
79  !! sum <old|vx(new)|old>
80  REAL(DP) :: dexx  = 0.0_DP
81  !! fock1 - 0.5*(fock2+fock0)
82  !
83  ! ... custom fft grid and related G-vectors
84  !
85  TYPE(fft_type_descriptor) :: dfftt
86  LOGICAL :: exx_fft_initialized = .FALSE.
87  REAL(kind=DP), DIMENSION(:), POINTER :: ggt => null()
88  !! G-vectors in custom gri
89  REAL(kind=DP), DIMENSION(:,:),POINTER :: gt => null()
90  !! G-vectors in custom grid
91  INTEGER :: gstart_t
92  !! gstart_t=2 if ggt(1)=0, =1 otherwise
93  INTEGER :: npwt
94  !! number of plane waves in custom grid (Gamma-only)
95  INTEGER :: ngmt_g
96  !! Total number of G-vectors in custom grid
97  REAL(DP)  :: ecutfock
98  !! energy cutoff for custom grid
99  INTEGER :: ibnd_start = 0
100  !! starting band index used in bgrp parallelization
101  INTEGER :: ibnd_end = 0
102  !! ending band index used in bgrp parallelization
103  INTEGER :: ibnd_buff_start
104  !! starting buffer index used in bgrp parallelization
105  INTEGER :: ibnd_buff_end
106  !! ending buffer index used in bgrp parallelization
107  !
108 CONTAINS
109#define _CX(A)  CMPLX(A,0._dp,kind=DP)
110#define _CY(A)  CMPLX(0._dp,-A,kind=DP)
111  !
112  !------------------------------------------------------------------------
113  SUBROUTINE exx_fft_create()
114    !------------------------------------------------------------------------
115    !! Initialise the custom grid that allows to put the wavefunction
116    !! onto the new (smaller) grid for \rho=\psi_{k+q}\psi^*_k and vice versa.
117    !! Set up fft descriptors, including parallel stuff: sticks, planes, etc.
118    !
119    USE gvecw,          ONLY : ecutwfc
120    USE gvect,          ONLY : ecutrho, ngm, g, gg, gstart, mill
121    USE cell_base,      ONLY : at, bg, tpiba2
122    USE recvec_subs,    ONLY : ggen, ggens
123    USE fft_base,       ONLY : smap
124    USE fft_types,      ONLY : fft_type_init
125    USE symm_base,      ONLY : fft_fact
126    USE mp_exx,         ONLY : nproc_egrp, negrp, intra_egrp_comm
127    USE mp_bands,       ONLY : nproc_bgrp, intra_bgrp_comm, nyfft
128    !
129    USE klist,          ONLY : nks, xk
130    USE mp_pools,       ONLY : inter_pool_comm
131    USE mp,             ONLY : mp_max, mp_sum
132    !
133    USE control_flags,  ONLY : tqr
134    USE realus,         ONLY : qpointlist, tabxx, tabp
135    USE exx_band,       ONLY : smap_exx
136    USE command_line_options, ONLY : nmany_
137    !
138    IMPLICIT NONE
139    !
140    ! ... local variables
141    !
142    INTEGER :: ik, ngmt
143    INTEGER, ALLOCATABLE :: ig_l2gt(:), millt(:,:)
144    INTEGER, EXTERNAL :: n_plane_waves
145    REAL(DP) :: gkcut, gcutmt
146    LOGICAL :: lpara
147    !
148    IF ( exx_fft_initialized ) RETURN
149    !
150    ! Initialise the custom grid that allows us to put the wavefunction
151    ! onto the new (smaller) grid for \rho=\psi_{k+q}\psi^*_k and vice versa
152    !
153    ! gkcut is such that all |k+G|^2 < gkcut (in units of (2pi/a)^2)
154    ! Note that with k-points, gkcut > ecutwfc/(2pi/a)^2
155    ! gcutmt is such that |q+G|^2 < gcutmt
156    !
157    IF ( gamma_only ) THEN
158       gkcut = ecutwfc / tpiba2
159       gcutmt = ecutfock / tpiba2
160    ELSE
161       gkcut = 0.0_DP
162       DO ik = 1, nks
163          gkcut = MAX(gkcut, SQRT(SUM(xk(:,ik)**2)))
164       ENDDO
165       CALL mp_max( gkcut, inter_pool_comm )
166       ! Alternatively, variable "qnorm" earlier computed in "exx_grid_init"
167       ! could be used as follows:
168       ! gkcut = ( SQRT(ecutwfc/tpiba2) + qnorm )**2
169       gkcut = ( SQRT(ecutwfc/tpiba2) + gkcut )**2
170       !
171       ! ... the following instruction may be needed if ecutfock \simeq ecutwfc
172       ! and guarantees that all k+G are included
173       !
174       gcutmt = MAX(ecutfock/tpiba2, gkcut)
175    ENDIF
176    !
177    ! ... set up fft descriptors, including parallel stuff: sticks, planes, etc.
178    !
179    IF (negrp == 1) THEN
180       !
181       ! ... no band parallelization: exx grid is a subgrid of general grid
182       !
183       lpara = ( nproc_bgrp > 1 )
184       CALL fft_type_init( dfftt, smap, "rho", gamma_only, lpara,         &
185                           intra_bgrp_comm, at, bg, gcutmt, gcutmt/gkcut, &
186                           fft_fact=fft_fact, nyfft=nyfft, nmany=nmany_ )
187       CALL ggens( dfftt, gamma_only, at, g, gg, mill, gcutmt, ngmt, gt, ggt )
188       gstart_t = gstart
189       npwt = n_plane_waves(ecutwfc/tpiba2, nks, xk, gt, ngmt)
190       ngmt_g = ngmt
191       CALL mp_sum( ngmt_g, intra_bgrp_comm )
192       !
193    ELSE
194       !
195       WRITE( 6, "(5X,'Exchange parallelized over bands (',i4,' band groups)')" ) &
196              negrp
197       lpara = ( nproc_egrp > 1 )
198       CALL fft_type_init( dfftt, smap_exx, "rho", gamma_only, lpara,     &
199                           intra_egrp_comm, at, bg, gcutmt, gcutmt/gkcut, &
200                           fft_fact=fft_fact, nyfft=nyfft, nmany=nmany_ )
201       ngmt = dfftt%ngm
202       ngmt_g = ngmt
203       CALL mp_sum( ngmt_g, intra_egrp_comm )
204       ALLOCATE( gt(3,dfftt%ngm) )
205       ALLOCATE( ggt(dfftt%ngm)  )
206       ALLOCATE( millt(3,dfftt%ngm) )
207       ALLOCATE( ig_l2gt(dfftt%ngm) )
208       !
209       CALL ggen( dfftt, gamma_only, at, bg, gcutmt, ngmt_g, ngmt, &
210                  gt, ggt, millt, ig_l2gt, gstart_t )
211       !
212       DEALLOCATE( ig_l2gt )
213       DEALLOCATE( millt )
214       npwt = n_plane_waves( ecutwfc/tpiba2, nks, xk, gt, ngmt )
215       !
216    ENDIF
217    ! define clock labels (this enables the corresponding fft too)
218    dfftt%rho_clock_label = 'fftc' ; dfftt%wave_clock_label = 'fftcw'
219    !
220    WRITE( stdout, '(/5x,"EXX grid: ",i8," G-vectors", 5x,       &
221         &   "FFT dimensions: (",i4,",",i4,",",i4,")")') ngmt_g, &
222         &   dfftt%nr1, dfftt%nr2, dfftt%nr3
223    !
224    exx_fft_initialized = .TRUE.
225    !
226    IF (tqr) THEN
227       IF (ecutfock == ecutrho) THEN
228          WRITE( stdout, '(5x,"Real-space augmentation: EXX grid -> DENSE grid")' )
229          tabxx => tabp
230       ELSE
231          WRITE( stdout, '(5x,"Real-space augmentation: initializing EXX grid")' )
232          CALL qpointlist( dfftt, tabxx )
233       ENDIF
234    ENDIF
235    !
236    RETURN
237    !
238  END SUBROUTINE exx_fft_create
239  !
240  !
241  !------------------------------------------------------------------------
242  SUBROUTINE exx_gvec_reinit( at_old )
243    !----------------------------------------------------------------------
244    !! Re-initialize g-vectors after rescaling.
245    !
246    USE cell_base,  ONLY : bg
247    !
248    IMPLICIT NONE
249    !
250    REAL(DP), INTENT(IN) :: at_old(3,3)
251    !! the lattice vectors at the previous step
252    !
253    ! ... local variables
254    !
255    INTEGER :: ig
256    REAL(DP) :: gx, gy, gz
257    !
258    ! ... rescale g-vectors
259    !
260    CALL cryst_to_cart( dfftt%ngm, gt, at_old, -1 )
261    CALL cryst_to_cart( dfftt%ngm, gt, bg,     +1 )
262    !
263    DO ig = 1, dfftt%ngm
264       gx = gt(1,ig)
265       gy = gt(2,ig)
266       gz = gt(3,ig)
267       ggt(ig) = gx*gx + gy*gy + gz*gz
268    ENDDO
269    !
270  END SUBROUTINE exx_gvec_reinit
271  !
272  !
273  !------------------------------------------------------------------------
274  SUBROUTINE deallocate_exx()
275    !------------------------------------------------------------------------
276    !! Deallocates exx objects.
277    !
278    USE becmod,    ONLY : deallocate_bec_type, is_allocated_bec_type, bec_type
279    USE us_exx,    ONLY : becxx
280    USE exx_base,  ONLY : xkq_collect, index_xkq, index_xk, index_sym, rir, &
281                          working_pool, exx_grid_initialized
282    !
283    IMPLICIT NONE
284    !
285    INTEGER :: ikq
286    !
287    exx_grid_initialized = .FALSE.
288    !
289    IF ( ALLOCATED(index_xkq) )    DEALLOCATE( index_xkq )
290    IF ( ALLOCATED(index_xk ) )    DEALLOCATE( index_xk  )
291    IF ( ALLOCATED(index_sym) )    DEALLOCATE( index_sym )
292    IF ( ALLOCATED(rir) )          DEALLOCATE( rir )
293    IF ( ALLOCATED(x_occupation) ) DEALLOCATE( x_occupation )
294    IF ( ALLOCATED(xkq_collect ) ) DEALLOCATE( xkq_collect  )
295    IF ( ALLOCATED(exxbuff) )      DEALLOCATE( exxbuff )
296    IF ( ALLOCATED(locbuff) )      DEALLOCATE( locbuff )
297    IF ( ALLOCATED(locmat) )       DEALLOCATE( locmat )
298    IF ( ALLOCATED(exxmat) )       DEALLOCATE( exxmat )
299    IF ( ALLOCATED(xi)   )         DEALLOCATE( xi   )
300    IF ( ALLOCATED(evc0) )         DEALLOCATE( evc0 )
301    !
302    IF ( ALLOCATED(becxx) ) THEN
303      DO ikq = 1, SIZE(becxx)
304        IF (is_allocated_bec_type(becxx(ikq))) CALL deallocate_bec_type( becxx(ikq) )
305      ENDDO
306      !
307      DEALLOCATE( becxx )
308    ENDIF
309    !
310    IF ( ALLOCATED(working_pool) )  DEALLOCATE( working_pool )
311    !
312    exx_fft_initialized = .FALSE.
313    IF ( ASSOCIATED(gt)  )  DEALLOCATE( gt  )
314    IF ( ASSOCIATED(ggt) )  DEALLOCATE( ggt )
315    !
316  END SUBROUTINE deallocate_exx
317  !
318  !
319  !------------------------------------------------------------------------
320  SUBROUTINE exxinit( DoLoc )
321    !------------------------------------------------------------------------
322    !! This subroutine is run before the first H_psi() of each iteration.
323    !! It saves the wavefunctions for the right density matrix, in real space.
324    !
325    USE wavefunctions,        ONLY : evc
326    USE io_files,             ONLY : nwordwfc, iunwfc_exx
327    USE buffers,              ONLY : get_buffer
328    USE wvfct,                ONLY : nbnd, npwx, wg, current_k
329    USE klist,                ONLY : ngk, nks, nkstot, xk, wk, igk_k
330    USE symm_base,            ONLY : nsym, s, sr
331    USE mp_pools,             ONLY : npool, nproc_pool, me_pool, inter_pool_comm
332    USE mp_exx,               ONLY : me_egrp, negrp, init_index_over_band,  &
333                                     my_egrp_id, inter_egrp_comm,           &
334                                     intra_egrp_comm, iexx_start, iexx_end, &
335                                     all_start, all_end
336    USE mp,                   ONLY : mp_sum, mp_bcast
337    USE funct,                ONLY : get_exx_fraction, start_exx,exx_is_active, &
338                                     get_screening_parameter, get_gau_parameter
339    USE scatter_mod,          ONLY : gather_grid, scatter_grid
340    USE fft_interfaces,       ONLY : invfft
341    USE uspp,                 ONLY : nkb, vkb, okvan
342    USE us_exx,               ONLY : rotate_becxx
343    USE paw_variables,        ONLY : okpaw
344    USE paw_exx,              ONLY : PAW_init_fock_kernel
345    USE mp_orthopools,        ONLY : intra_orthopool_comm
346    USE exx_base,             ONLY : nkqs, xkq_collect, index_xk, index_sym,  &
347                                     exx_set_symm, rir, working_pool, exxdiv, &
348                                     erfc_scrlen, gau_scrlen, exx_divergence
349    USE exx_band,             ONLY : change_data_structure, nwordwfc_exx, &
350                                     transform_evc_to_exx, igk_exx, evc_exx
351    !
352    IMPLICIT NONE
353    !
354    LOGICAL :: DoLoc
355    !! TRUE:  Real Array locbuff(ir, nbnd, nkqs);
356    !! FALSE: Complex Array exxbuff(ir, nbnd/2, nkqs).
357    !
358    ! ... local variables
359    !
360    INTEGER :: ik, ibnd, i, j, k, ir, isym, ikq, ig
361    INTEGER :: ibnd_loop_start
362    INTEGER :: ipol, jpol
363    REAL(DP), ALLOCATABLE :: occ(:,:)
364    COMPLEX(DP),ALLOCATABLE :: temppsic(:)
365#if defined(__USE_INTEL_HBM_DIRECTIVES)
366!DIR$ ATTRIBUTES FASTMEM :: temppsic
367#elif defined(__USE_CRAY_HBM_DIRECTIVES)
368!DIR$ memory(bandwidth) temppsic
369#endif
370    COMPLEX(DP),ALLOCATABLE :: temppsic_nc(:,:), psic_nc(:,:)
371    COMPLEX(DP),ALLOCATABLE :: psic_exx(:)
372    INTEGER :: nxxs, nrxxs
373#if defined(__MPI)
374    COMPLEX(DP),ALLOCATABLE  :: temppsic_all(:), psic_all(:)
375    COMPLEX(DP), ALLOCATABLE :: temppsic_all_nc(:,:), psic_all_nc(:,:)
376#endif
377    COMPLEX(DP) :: d_spin(2,2,48)
378    INTEGER :: npw, current_ik
379    INTEGER, EXTERNAL :: global_kpoint_index
380    INTEGER :: ibnd_start_new, ibnd_end_new, max_buff_bands_per_egrp
381    INTEGER :: ibnd_exx, evc_offset
382    !
383    CALL start_clock ('exxinit')
384    IF ( Doloc ) THEN
385        WRITE(stdout,'(/,5X,"Using localization algorithm with threshold: ",&
386                & D10.2)') local_thr
387        ! IF (.NOT.gamma_only) CALL errore('exxinit','SCDM with K-points NYI',1)
388        IF (okvan .OR. okpaw) CALL errore( 'exxinit','SCDM with USPP/PAW not &
389                                           &implemented', 1 )
390    ENDIF
391    IF ( use_ace ) &
392        WRITE(stdout,'(/,5X,"Using ACE for calculation of exact exchange")')
393    !
394    CALL transform_evc_to_exx( 2 )
395    !
396    ! ... prepare the symmetry matrices for the spin part
397    !
398    IF (noncolin) THEN
399       DO isym = 1, nsym
400          CALL find_u( sr(:,:,isym), d_spin(:,:,isym) )
401       ENDDO
402    ENDIF
403    !
404    CALL exx_fft_create()
405    !
406    ! Note that nxxs is not the same as nrxxs in parallel case
407    nxxs = dfftt%nr1x * dfftt%nr2x * dfftt%nr3x
408    nrxxs = dfftt%nnr
409#if defined(__MPI)
410    IF (noncolin) THEN
411       ALLOCATE( psic_all_nc(nxxs,npol), temppsic_all_nc(nxxs,npol) )
412    ELSEIF ( .NOT. gamma_only ) THEN
413       ALLOCATE( psic_all(nxxs), temppsic_all(nxxs) )
414    ENDIF
415#endif
416    IF (noncolin) THEN
417       ALLOCATE( temppsic_nc(nrxxs, npol), psic_nc(nrxxs, npol) )
418    ELSEIF ( .NOT. gamma_only ) THEN
419       ALLOCATE( temppsic(nrxxs) )
420    ENDIF
421    !
422    ALLOCATE( psic_exx(nrxxs) )
423    !
424    IF (.NOT.exx_is_active()) THEN
425       !
426       erfc_scrlen = get_screening_parameter()
427       gau_scrlen = get_gau_parameter()
428       exxdiv  = exx_divergence()
429       exxalfa = get_exx_fraction()
430       !
431       CALL start_exx()
432    ENDIF
433    !
434    IF (.NOT. gamma_only) CALL exx_set_symm( dfftt%nr1,  dfftt%nr2,  dfftt%nr3, &
435                                             dfftt%nr1x, dfftt%nr2x, dfftt%nr3x )
436    ! set occupations of wavefunctions used in the calculation of exchange term
437    IF (.NOT. ALLOCATED(x_occupation)) ALLOCATE( x_occupation(nbnd,nkstot) )
438    ALLOCATE( occ(nbnd,nks) )
439    !
440    DO ik = 1, nks
441       IF (ABS(wk(ik)) > eps_occ) THEN
442          occ(1:nbnd,ik) = wg(1:nbnd,ik) / wk(ik)
443       ELSE
444          occ(1:nbnd,ik) = 0._DP
445       ENDIF
446    ENDDO
447    !
448    CALL poolcollect( nbnd, nks, occ, nkstot, x_occupation )
449    !
450    DEALLOCATE( occ )
451    !
452    ! ... find an upper bound to the number of bands with non zero occupation.
453    ! Useful to distribute bands among band groups
454    !
455    x_nbnd_occ = 0
456    DO ik = 1, nkstot
457       DO ibnd = MAX(1,x_nbnd_occ), nbnd
458          IF (ABS(x_occupation(ibnd,ik)) > eps_occ) x_nbnd_occ = ibnd
459       ENDDO
460    ENDDO
461    !
462    IF (nbndproj == 0) nbndproj = nbnd
463    !
464    CALL divide( inter_egrp_comm, x_nbnd_occ, ibnd_start, ibnd_end )
465    CALL init_index_over_band( inter_egrp_comm, nbnd, nbnd )
466    !
467    ! ... this will cause exxbuff to be calculated for every band
468    ibnd_start_new = iexx_start
469    ibnd_end_new = iexx_end
470    !
471    IF ( gamma_only ) THEN
472        ibnd_buff_start = (ibnd_start_new+1)/2
473        ibnd_buff_end   = (ibnd_end_new+1)/2
474        max_buff_bands_per_egrp = MAXVAL((all_end(:)+1)/2-(all_start(:)+1)/2)+1
475    ELSE
476        ibnd_buff_start = ibnd_start_new
477        ibnd_buff_end   = ibnd_end_new
478        max_buff_bands_per_egrp = MAXVAL(all_end(:)-all_start(:))+1
479    ENDIF
480    !
481    IF (DoLoc) THEN
482      !
483      IF (gamma_only) THEN
484        IF (.NOT. ALLOCATED(locbuff)) ALLOCATE( locbuff(nrxxs*npol,nbnd,nks) )
485        IF (.NOT. ALLOCATED(locmat))  ALLOCATE( locmat(nbnd,nbnd,nks) )
486        locbuff = 0.0d0
487        locmat = 0.0d0
488      ELSE
489        IF (.NOT. ALLOCATED(exxbuff)) ALLOCATE( exxbuff(nrxxs*npol,nbnd,nkqs) )
490        IF (.NOT. ALLOCATED(exxmat) ) ALLOCATE( exxmat(nbnd,nkqs,nbnd,nks) )
491        exxbuff = (0.0d0, 0.0d0)
492        exxmat = 0.0d0
493      ENDIF
494      !
495      IF (.NOT. ALLOCATED(evc0)) then
496        ALLOCATE( evc0(npwx*npol,nbndproj,nks) )
497        evc0 = (0.0d0,0.0d0)
498      ENDIF
499      !
500    ELSE
501      !
502      IF (.NOT. ALLOCATED(exxbuff)) THEN
503         IF (gamma_only) THEN
504            ALLOCATE( exxbuff(nrxxs*npol,ibnd_buff_start:ibnd_buff_start + &
505                                          max_buff_bands_per_egrp-1,nkqs) ) ! THIS WORKS as for k
506         ELSE
507            ALLOCATE( exxbuff(nrxxs*npol,ibnd_buff_start:ibnd_buff_start + &
508                                          max_buff_bands_per_egrp-1,nkqs) )
509         ENDIF
510      ENDIF
511    ENDIF
512    !
513    !assign buffer
514    IF(DoLoc) THEN
515      IF(gamma_only) THEN
516!$omp parallel do collapse(3) default(shared) firstprivate(npol,nrxxs,nkqs, &
517!$omp                ibnd_buff_start,ibnd_buff_end) private(ir,ibnd,ikq,ipol)
518        DO ikq=1,SIZE(locbuff,3)
519          DO ibnd=1, x_nbnd_occ
520            DO ir=1,nrxxs*npol
521              locbuff(ir,ibnd,ikq)=0.0_DP
522            ENDDO
523          ENDDO
524        ENDDO
525      ENDIF
526    ELSE
527!$omp parallel do collapse(3) default(shared) firstprivate(npol,nrxxs,nkqs, &
528!$omp                ibnd_buff_start,ibnd_buff_end) private(ir,ibnd,ikq,ipol)
529      DO ikq = 1, SIZE(exxbuff,3)
530         DO ibnd = ibnd_buff_start, ibnd_buff_end
531            DO ir = 1, nrxxs*npol
532               exxbuff(ir,ibnd,ikq) = (0.0_DP,0.0_DP)
533            ENDDO
534         ENDDO
535      ENDDO
536      ! the above loops will replaced with the following line soon
537      !CALL threaded_memset(exxbuff, 0.0_DP, nrxxs*npol*SIZE(exxbuff,2)*nkqs*2)
538    ENDIF
539    !
540    ! ... This is parallelized over pools. Each pool computes only its k-points
541    !
542    KPOINTS_LOOP : &
543    DO ik = 1, nks
544       !
545       IF ( nks > 1 ) CALL get_buffer( evc_exx, nwordwfc_exx, iunwfc_exx, ik )
546       !
547       ! ik         = index of k-point in this pool
548       ! current_ik = index of k-point over all pools
549       !
550       current_ik = global_kpoint_index( nkstot, ik )
551       !
552       IF_GAMMA_ONLY : &
553       IF (gamma_only) THEN
554          !
555          IF (MOD(iexx_start,2) == 0) THEN
556             ibnd_loop_start = iexx_start-1
557          ELSE
558             ibnd_loop_start = iexx_start
559          ENDIF
560          !
561          evc_offset = 0
562          DO ibnd = ibnd_loop_start, iexx_end, 2
563             !
564             psic_exx(:) = ( 0._DP, 0._DP )
565             !
566             IF ( ibnd < iexx_end ) THEN
567                IF ( ibnd == ibnd_loop_start .AND. MOD(iexx_start,2) == 0 ) THEN
568                   DO ig = 1, npwt
569                      psic_exx(dfftt%nl(ig))  = ( 0._DP, 1._DP )*evc_exx(ig,1)
570                      psic_exx(dfftt%nlm(ig)) = ( 0._DP, 1._DP )*CONJG(evc_exx(ig,1))
571                   ENDDO
572                   evc_offset = -1
573                ELSE
574                   DO ig = 1, npwt
575                      psic_exx(dfftt%nl(ig))  = evc_exx(ig,ibnd-ibnd_loop_start+evc_offset+1) &
576                           + ( 0._DP, 1._DP ) * evc_exx(ig,ibnd-ibnd_loop_start+evc_offset+2)
577                      psic_exx(dfftt%nlm(ig)) = CONJG( evc_exx(ig,ibnd-ibnd_loop_start+evc_offset+1) ) &
578                           + ( 0._DP, 1._DP ) * CONJG( evc_exx(ig,ibnd-ibnd_loop_start+evc_offset+2) )
579                   ENDDO
580                ENDIF
581             ELSE
582                DO ig=1,npwt
583                   psic_exx(dfftt%nl (ig)) = evc_exx(ig,ibnd-ibnd_loop_start+evc_offset+1)
584                   psic_exx(dfftt%nlm(ig)) = CONJG( evc_exx(ig,ibnd-ibnd_loop_start+evc_offset+1) )
585                ENDDO
586             ENDIF
587             !
588             CALL invfft( 'Wave', psic_exx, dfftt )
589             !
590             IF (DoLoc) THEN
591               locbuff(1:nrxxs,ibnd-ibnd_loop_start+evc_offset+1,ik) = DBLE(  psic_exx(1:nrxxs) )
592               IF (ibnd-ibnd_loop_start+evc_offset+2 <= nbnd) &
593                  locbuff(1:nrxxs,ibnd-ibnd_loop_start+evc_offset+2,ik) = AIMAG( psic_exx(1:nrxxs) )
594             ELSE
595               exxbuff(1:nrxxs,(ibnd+1)/2,current_ik)=psic_exx(1:nrxxs)
596             ENDIF
597             !
598          ENDDO
599          !
600       ELSE IF_GAMMA_ONLY
601          !
602          npw = ngk (ik)
603          IBND_LOOP_K : &
604          DO ibnd = iexx_start, iexx_end
605             !
606             ibnd_exx = ibnd
607             IF (noncolin) THEN
608!$omp parallel do default(shared) private(ir) firstprivate(nrxxs)
609                DO ir = 1, nrxxs
610                   temppsic_nc(ir,1) = ( 0._DP, 0._DP )
611                   temppsic_nc(ir,2) = ( 0._DP, 0._DP )
612                ENDDO
613!$omp parallel do default(shared) private(ig) firstprivate(npw,ik,ibnd_exx)
614                DO ig = 1, npw
615                   temppsic_nc(dfftt%nl(igk_exx(ig,ik)),1) = evc_exx(ig,ibnd-iexx_start+1)
616                ENDDO
617!$omp end parallel do
618                CALL invfft( 'Wave', temppsic_nc(:,1), dfftt )
619!$omp parallel do default(shared) private(ig) firstprivate(npw,ik,ibnd_exx,npwx)
620                DO ig = 1, npw
621                   temppsic_nc(dfftt%nl(igk_exx(ig,ik)),2) = evc_exx(ig+npwx,ibnd-iexx_start+1)
622                ENDDO
623!$omp end parallel do
624                CALL invfft( 'Wave', temppsic_nc(:,2), dfftt )
625             ELSE
626!$omp parallel do default(shared) private(ir) firstprivate(nrxxs)
627                DO ir = 1, nrxxs
628                   temppsic(ir) = ( 0._DP, 0._DP )
629                ENDDO
630!$omp parallel do default(shared) private(ig) firstprivate(npw,ik,ibnd_exx)
631                DO ig = 1, npw
632                   temppsic(dfftt%nl(igk_exx(ig,ik))) = evc_exx(ig,ibnd-iexx_start+1)
633                ENDDO
634!$omp end parallel do
635                CALL invfft( 'Wave', temppsic, dfftt )
636             ENDIF
637             !
638             DO ikq = 1, nkqs
639                !
640                IF (index_xk(ikq) /= current_ik) CYCLE
641                isym = ABS(index_sym(ikq) )
642                !
643                IF (noncolin) THEN ! noncolinear
644#if defined(__MPI)
645                   DO ipol = 1, npol
646                      CALL gather_grid( dfftt, temppsic_nc(:,ipol), temppsic_all_nc(:,ipol) )
647                   ENDDO
648                   !
649                   IF ( me_egrp == 0 ) THEN
650!$omp parallel do default(shared) private(ir) firstprivate(npol,nxxs)
651                      DO ir = 1, nxxs
652                         !DIR$ UNROLL_AND_JAM (2)
653                         DO ipol = 1, npol
654                            psic_all_nc(ir,ipol) = (0.0_DP, 0.0_DP)
655                         ENDDO
656                      ENDDO
657!$omp end parallel do
658!$omp parallel do default(shared) private(ir) firstprivate(npol,isym,nxxs) reduction(+:psic_all_nc)
659                      DO ir = 1, nxxs
660                         !DIR$ UNROLL_AND_JAM (4)
661                         DO ipol = 1, npol
662                            DO jpol = 1, npol
663                               psic_all_nc(ir,ipol) = psic_all_nc(ir,ipol) + &
664                                             CONJG(d_spin(jpol,ipol,isym)) * &
665                                             temppsic_all_nc(rir(ir,isym),jpol)
666                            ENDDO
667                         ENDDO
668                      ENDDO
669!$omp end parallel do
670                   ENDIF
671                   !
672                   DO ipol = 1, npol
673                      CALL scatter_grid( dfftt, psic_all_nc(:,ipol), psic_nc(:,ipol) )
674                   ENDDO
675#else
676!$omp parallel do default(shared) private(ir) firstprivate(npol,nxxs)
677                   DO ir = 1, nxxs
678                      !DIR$ UNROLL_AND_JAM (2)
679                      DO ipol = 1, npol
680                         psic_nc(ir,ipol) = (0._DP,0._DP)
681                      ENDDO
682                   ENDDO
683!$omp end parallel do
684!$omp parallel do default(shared) private(ipol,jpol,ir) firstprivate(npol,isym,nxxs) reduction(+:psic_nc)
685                   DO ir = 1, nxxs
686                      !DIR$ UNROLL_AND_JAM (4)
687                      DO ipol = 1, npol
688                         DO jpol = 1, npol
689                            psic_nc(ir,ipol) = psic_nc(ir,ipol) + CONJG(d_spin(jpol,ipol,isym))* &
690                                               temppsic_nc(rir(ir,isym),jpol)
691                         ENDDO
692                      ENDDO
693                   ENDDO
694!$omp end parallel do
695#endif
696                   IF (index_sym(ikq) > 0 ) THEN
697                      ! sym. op. without time reversal: normal case
698!$omp parallel do default(shared) private(ir) firstprivate(ibnd,isym,ikq)
699                      DO ir = 1, nrxxs
700                         exxbuff(ir,ibnd,ikq) = psic_nc(ir,1)
701                         exxbuff(ir+nrxxs,ibnd,ikq) = psic_nc(ir,2)
702                      ENDDO
703!$omp end parallel do
704                   ELSE
705                      ! sym. op. with time reversal: spin 1->2*, 2->-1*
706!$omp parallel do default(shared) private(ir) firstprivate(ibnd,isym,ikq)
707                      DO ir = 1, nrxxs
708                         exxbuff(ir,ibnd,ikq) = CONJG(psic_nc(ir,2))
709                         exxbuff(ir+nrxxs,ibnd,ikq) = -CONJG(psic_nc(ir,1))
710                      ENDDO
711!$omp end parallel do
712                   ENDIF
713                ELSE ! noncolinear
714#if defined(__MPI)
715                   CALL gather_grid( dfftt, temppsic, temppsic_all )
716                   IF ( me_egrp == 0 ) THEN
717!$omp parallel do default(shared) private(ir) firstprivate(isym)
718                      DO ir = 1, nxxs
719                         psic_all(ir) = temppsic_all(rir(ir,isym))
720                      ENDDO
721!$omp end parallel do
722                   ENDIF
723                   CALL scatter_grid( dfftt, psic_all, psic_exx )
724#else
725!$omp parallel do default(shared) private(ir) firstprivate(isym)
726                   DO ir = 1, nrxxs
727                      psic_exx(ir) = temppsic(rir(ir,isym))
728                   ENDDO
729!$omp end parallel do
730#endif
731!$omp parallel do default(shared) private(ir) firstprivate(isym,ibnd,ikq)
732                   DO ir = 1, nrxxs
733                      IF (index_sym(ikq) < 0 ) THEN
734                         psic_exx(ir) = CONJG(psic_exx(ir))
735                      ENDIF
736                      exxbuff(ir,ibnd,ikq) = psic_exx(ir)
737                   ENDDO
738!$omp end parallel do
739                   !
740                ENDIF ! noncolinear
741                !
742             ENDDO
743             !
744          ENDDO&
745          IBND_LOOP_K
746          !
747       ENDIF&
748       IF_GAMMA_ONLY
749    ENDDO&
750    KPOINTS_LOOP
751    !
752    DEALLOCATE( psic_exx )
753    IF (noncolin) THEN
754       DEALLOCATE( temppsic_nc, psic_nc )
755#if defined(__MPI)
756       DEALLOCATE( temppsic_all_nc, psic_all_nc )
757#endif
758    ELSE IF ( .NOT. gamma_only ) THEN
759       DEALLOCATE( temppsic )
760#if defined(__MPI)
761       DEALLOCATE( temppsic_all, psic_all )
762#endif
763    ENDIF
764    !
765    ! Each wavefunction in exxbuff is computed by a single pool, collect among
766    ! pools in a smart way (i.e. without doing all-to-all sum and bcast)
767    ! See also the initialization of working_pool in exx_mp_init
768    ! Note that in Gamma-only LSDA can be parallelized over two pools, and there
769    ! is no need to communicate anything: each pools deals with its own spin
770    !
771    IF ( .NOT. gamma_only ) THEN
772       DO ikq = 1, nkqs
773         CALL mp_bcast( exxbuff(:,:,ikq), working_pool(ikq), intra_orthopool_comm )
774       ENDDO
775    ENDIF
776    !
777    ! For US/PAW only: compute <beta_I|psi_j,k+q> for the entire
778    ! de-symmetrized k+q grid by rotating the ones from the irreducible wedge
779    !
780    IF (okvan) CALL rotate_becxx( nkqs, index_xk, index_sym, xkq_collect )
781    !
782    ! Initialize 4-wavefunctions one-center Fock integrals
783    !    \int \psi_a(r)\phi_a(r)\phi_b(r')\psi_b(r')/|r-r'|
784    !
785    IF (okpaw) CALL PAW_init_fock_kernel()
786    !
787    CALL change_data_structure( .FALSE. )
788    !
789    CALL stop_clock( 'exxinit' )
790    !
791  END SUBROUTINE exxinit
792  !
793  !
794  !-----------------------------------------------------------------------
795  SUBROUTINE vexx( lda, n, m, psi, hpsi, becpsi )
796    !-----------------------------------------------------------------------
797    !! Wrapper routine computing V_x\psi, V_x = exchange potential.
798    !! Calls generic version vexx_k or Gamma-specific one vexx_gamma.
799    !
800    USE becmod,         ONLY : bec_type
801    USE uspp,           ONLY : okvan
802    USE paw_variables,  ONLY : okpaw
803    USE us_exx,         ONLY : becxx
804    USE mp_exx,         ONLY : negrp, inter_egrp_comm, init_index_over_band
805    USE wvfct,          ONLY : nbnd
806    USE exx_band,       ONLY : transform_psi_to_exx, transform_hpsi_to_local, &
807                               psi_exx, hpsi_exx, igk_exx
808    !
809    IMPLICIT NONE
810    !
811    INTEGER :: lda
812    !! input: leading dimension of arrays psi and hpsi
813    INTEGER :: n
814    !! input: true dimension of psi and hpsi
815    INTEGER :: m
816    !! input: number of states psi
817    COMPLEX(DP) :: psi(lda*npol,m)
818    !! input: m wavefunctions
819    COMPLEX(DP) :: hpsi(lda*npol,m)
820    !! output: V_x*psi
821    TYPE(bec_type), OPTIONAL :: becpsi
822    !! input: <beta|psi>, optional but needed for US and PAW case
823    !
824    INTEGER :: i
825    !
826    IF ((okvan.OR.okpaw) .AND. .NOT. PRESENT(becpsi)) &
827       CALL errore( 'vexx','becpsi needed for US/PAW case', 1 )
828    CALL start_clock( 'vexx' )
829    !
830    IF (negrp > 1) THEN
831       CALL init_index_over_band( inter_egrp_comm, nbnd, m )
832       !
833       ! ... transform psi to the EXX data structure
834       CALL transform_psi_to_exx( lda, n, m, psi )
835    ENDIF
836    !
837    ! ... calculate the EXX contribution to hpsi
838    !
839    IF ( gamma_only ) THEN
840       IF (negrp == 1)THEN
841          CALL vexx_gamma( lda, n, m, psi, hpsi, becpsi )
842       ELSE
843          CALL vexx_gamma( lda, n, m, psi_exx, hpsi_exx, becpsi )
844       ENDIF
845    ELSE
846       IF (negrp.eq.1)THEN
847          CALL vexx_k( lda, n, m, psi, hpsi, becpsi )
848       ELSE
849          CALL vexx_k( lda, n, m, psi_exx, hpsi_exx, becpsi )
850       ENDIF
851    ENDIF
852    !
853    IF (negrp > 1) THEN
854       !
855       ! ... transform hpsi to the local data structure
856       !
857       CALL transform_hpsi_to_local(lda,n,m,hpsi)
858       !
859    ENDIF
860    !
861    CALL stop_clock( 'vexx' )
862    !
863  END SUBROUTINE vexx
864  !
865  !
866  !-----------------------------------------------------------------------
867  SUBROUTINE vexx_gamma( lda, n, m, psi, hpsi, becpsi )
868    !-----------------------------------------------------------------------
869    !! Gamma-specific version of vexx.
870    !
871    USE constants,      ONLY : fpi, e2, pi
872    USE cell_base,      ONLY : omega
873    USE gvect,          ONLY : ngm, g
874    USE wvfct,          ONLY : npwx, current_k, nbnd
875    USE klist,          ONLY : xk, nks, nkstot, igk_k
876    USE fft_interfaces, ONLY : fwfft, invfft
877    USE becmod,         ONLY : bec_type
878    USE mp_exx,         ONLY : inter_egrp_comm, my_egrp_id, &
879                               intra_egrp_comm, me_egrp, &
880                               negrp, max_pairs, egrp_pairs, ibands, nibands, &
881                               iexx_istart, iexx_iend, &
882                               all_start, all_end, iexx_start, jblock
883    USE mp,             ONLY : mp_sum, mp_barrier, mp_circular_shift_left
884    USE uspp,           ONLY : nkb, okvan
885    USE paw_variables,  ONLY : okpaw
886    USE us_exx,         ONLY : bexg_merge, becxx, addusxx_g, addusxx_r, &
887                               newdxx_g, newdxx_r, add_nlxx_pot, &
888                               qvan_init, qvan_clean
889    USE paw_exx,        ONLY : PAW_newdxx
890    USE exx_base,       ONLY : nqs, index_xkq, index_xk, xkq_collect, &
891                               coulomb_fac, g2_convolution_all
892    USE exx_band,       ONLY : result_sum, igk_exx
893    !
894    IMPLICIT NONE
895    !
896    INTEGER :: lda
897    !! input: leading dimension of arrays psi and hpsi
898    INTEGER :: n
899    !! input: true dimension of psi and hpsi
900    INTEGER :: m
901    !! input: number of states psi
902    COMPLEX(DP) :: psi(lda*npol,m)
903    !! input: m wavefunctions
904    COMPLEX(DP) :: hpsi(lda*npol,m)
905    !! output: V_x*psi
906    TYPE(bec_type), OPTIONAL :: becpsi ! or call a calbec(...psi) instead
907    !! input: <beta|psi>, optional but needed for US and PAW case
908    !
909    ! ... local variables
910    !
911    COMPLEX(DP), ALLOCATABLE :: result(:,:)
912    REAL(DP), ALLOCATABLE :: temppsic_dble (:)
913    REAL(DP), ALLOCATABLE :: temppsic_aimag(:)
914    !
915    COMPLEX(DP), ALLOCATABLE :: vc(:,:), deexx(:,:)
916    REAL(DP), ALLOCATABLE :: fac(:)
917    INTEGER :: ibnd, ik, im , ikq, iq, ipol
918    INTEGER :: ir, ig
919    INTEGER :: current_ik
920    INTEGER :: ibnd_loop_start
921    INTEGER :: nrxxs
922    REAL(DP) :: x1, x2, xkp(3)
923    REAL(DP) :: xkq(3)
924    INTEGER, EXTERNAL :: global_kpoint_index
925    INTEGER :: ialloc
926    COMPLEX(DP), ALLOCATABLE :: big_result(:,:)
927    INTEGER :: iproc, nproc_egrp, ii, ipair
928    INTEGER :: jbnd, jstart, jend
929    ! ... scratch space for fft of psi and rho
930    COMPLEX(DP), ALLOCATABLE :: psi_rhoc_work(:)
931    INTEGER :: jblock_start, jblock_end
932    INTEGER :: iegrp, wegrp
933    INTEGER :: exxbuff_index
934    INTEGER :: ending_im
935    !
936    ialloc = nibands(my_egrp_id+1)
937    !
938    ALLOCATE( fac(dfftt%ngm) )
939    !
940    nrxxs = dfftt%nnr
941    !
942    !ALLOCATE( result(nrxxs), temppsic_DBLE(nrxxs), temppsic_aimag(nrxxs) )
943    ALLOCATE( result(nrxxs,ialloc), temppsic_DBLE(nrxxs) )
944    ALLOCATE( temppsic_aimag(nrxxs) )
945    ALLOCATE( psi_rhoc_work(nrxxs) )
946    !
947    ALLOCATE( vc(nrxxs,ialloc) )
948    IF (okvan) ALLOCATE( deexx(nkb,ialloc) )
949    !
950    current_ik = global_kpoint_index( nkstot, current_k )
951    xkp = xk(:,current_k)
952    !
953    ALLOCATE( big_result(n,m) )
954    big_result = 0.0_DP
955    result = 0.0_DP
956    !
957    DO ii = 1, nibands(my_egrp_id+1)
958       IF (okvan) deexx(:,ii) = 0.0_DP
959    ENDDO
960    !
961    ! Here the loops start
962    !
963    INTERNAL_LOOP_ON_Q : &
964    DO iq = 1, nqs
965       !
966       ikq  = index_xkq(current_ik,iq)
967       ik   = index_xk(ikq)
968       xkq  = xkq_collect(:,ikq)
969       !
970       ! calculate the 1/|r-r'| (actually, k+q+g) factor and place it in fac
971       CALL g2_convolution_all( dfftt%ngm, gt, xkp, xkq, iq, current_k )
972       IF ( okvan .AND..NOT.tqr ) CALL qvan_init( dfftt%ngm, xkq, xkp )
973       !
974       DO iegrp = 1, negrp
975          !
976          ! compute the id of group whose data is currently worked on
977          wegrp = MOD(iegrp+my_egrp_id-1, negrp)+1
978          !
979          jblock_start = all_start(wegrp)
980          jblock_end   = all_end(wegrp)
981          !
982          LOOP_ON_PSI_BANDS : &
983          DO ii = 1,  nibands(my_egrp_id+1)
984             !
985             ibnd = ibands(ii,my_egrp_id+1)
986             !
987             IF (ibnd==0 .OR. ibnd>m) CYCLE
988             !
989             IF (MOD(ii,2) == 1) THEN
990                !
991                psi_rhoc_work = (0._DP,0._DP)
992                !
993                IF ((ii+1) <= MIN(m,nibands(my_egrp_id+1))) THEN
994                   ! deal with double bands
995!$omp parallel do  default(shared), private(ig)
996                   DO ig = 1, npwt
997                      psi_rhoc_work( dfftt%nl(ig) )  =       psi(ig, ii) + (0._DP,1._DP) * psi(ig, ii+1)
998                      psi_rhoc_work( dfftt%nlm(ig) ) = CONJG(psi(ig, ii) - (0._DP,1._DP) * psi(ig, ii+1))
999                   ENDDO
1000!$omp end parallel do
1001                ENDIF
1002                !
1003                IF ( ii == MIN(m,nibands(my_egrp_id+1)) ) THEN
1004                   ! deal with a single last band
1005!$omp parallel do  default(shared), private(ig)
1006                   DO ig = 1, npwt
1007                      psi_rhoc_work( dfftt%nl(ig) )  =       psi(ig,ii)
1008                      psi_rhoc_work( dfftt%nlm(ig) ) = CONJG(psi(ig,ii))
1009                   ENDDO
1010!$omp end parallel do
1011                ENDIF
1012                !
1013                CALL invfft( 'Wave', psi_rhoc_work, dfftt )
1014!$omp parallel do default(shared), private(ir)
1015                DO ir = 1, nrxxs
1016                   temppsic_DBLE(ir)  = DBLE( psi_rhoc_work(ir) )
1017                   temppsic_aimag(ir) = AIMAG( psi_rhoc_work(ir) )
1018                ENDDO
1019!$omp end parallel do
1020                !
1021             ENDIF
1022             !
1023             !
1024             !determine which j-bands to calculate
1025             jstart = 0
1026             jend = 0
1027             DO ipair = 1, max_pairs
1028                IF (egrp_pairs(1,ipair,my_egrp_id+1) == ibnd) THEN
1029                   IF (jstart == 0)THEN
1030                      jstart = egrp_pairs(2,ipair,my_egrp_id+1)
1031                      jend = jstart
1032                   ELSE
1033                      jend = egrp_pairs(2,ipair,my_egrp_id+1)
1034                   ENDIF
1035                ENDIF
1036             ENDDO
1037             !
1038             jstart = MAX(jstart,jblock_start)
1039             jend = MIN(jend,jblock_end)
1040             !
1041             IF (MOD(jstart,2) ==0 ) THEN
1042                ibnd_loop_start = jstart-1
1043             ELSE
1044                ibnd_loop_start = jstart
1045             ENDIF
1046             !
1047             IBND_LOOP_GAM : &
1048             DO jbnd = ibnd_loop_start, jend, 2 !for each band of psi
1049                !
1050                exxbuff_index = (jbnd+1)/2-(all_start(wegrp)+1)/2+(iexx_start+1)/2
1051                !
1052                IF ( jbnd < jstart ) THEN
1053                   x1 = 0.0_DP
1054                ELSE
1055                   x1 = x_occupation(jbnd,ik)
1056                ENDIF
1057                IF ( jbnd == jend) THEN
1058                   x2 = 0.0_DP
1059                ELSE
1060                   x2 = x_occupation(jbnd+1,ik)
1061                ENDIF
1062                IF ( ABS(x1) < eps_occ .AND. ABS(x2) < eps_occ ) CYCLE
1063                !
1064                ! ... calculate rho in real space. Gamma tricks are used.
1065                ! temppsic is real; tempphic contains one band in the real part,
1066                ! another one in the imaginary part; the same applies to rhoc
1067                !
1068                IF ( MOD(ii,2) == 0 ) THEN
1069!$omp parallel do default(shared), private(ir)
1070                   DO ir = 1, nrxxs
1071                      psi_rhoc_work(ir) = exxbuff(ir,exxbuff_index,ikq) * temppsic_aimag(ir) / omega
1072                   ENDDO
1073!$omp end parallel do
1074                ELSE
1075!$omp parallel do default(shared), private(ir)
1076                   DO ir = 1, nrxxs
1077                      psi_rhoc_work(ir) = exxbuff(ir,exxbuff_index,ikq) * temppsic_DBLE(ir) / omega
1078                   ENDDO
1079!$omp end parallel do
1080                ENDIF
1081                !
1082                ! ... bring rho to G-space
1083                !
1084                !   >>> add augmentation in REAL SPACE here
1085                IF (okvan .AND. tqr) THEN
1086                   IF (jbnd >= jstart) &
1087                        CALL addusxx_r( psi_rhoc_work, &
1088                       _CX(becxx(ikq)%r(:,jbnd)), _CX(becpsi%r(:,ibnd)))
1089                   IF (jbnd < jend) &
1090                        CALL addusxx_r( psi_rhoc_work, &
1091                       _CY(becxx(ikq)%r(:,jbnd+1)),_CX(becpsi%r(:,ibnd)))
1092                ENDIF
1093                !
1094                CALL fwfft( 'Rho', psi_rhoc_work, dfftt )
1095                !   >>>> add augmentation in G SPACE here
1096                IF (okvan .AND. .NOT. tqr) THEN
1097                   ! ... contribution from one band added to real (in real space) part of rhoc
1098                   IF (jbnd >= jstart) &
1099                        CALL addusxx_g( dfftt, psi_rhoc_work, xkq,  xkp, 'r', &
1100                        becphi_r=becxx(ikq)%r(:,jbnd), becpsi_r=becpsi%r(:,ibnd) )
1101                   ! ... contribution from following band added to imaginary (in real space) part of rhoc
1102                   IF (jbnd < jend) &
1103                        CALL addusxx_g( dfftt, psi_rhoc_work, xkq,  xkp, 'i', &
1104                        becphi_r=becxx(ikq)%r(:,jbnd+1), becpsi_r=becpsi%r(:,ibnd) )
1105                ENDIF
1106                !   >>>> charge density done
1107                !
1108                vc(:,ii) = 0._DP
1109                !
1110!$omp parallel do default(shared), private(ig)
1111                DO ig = 1, dfftt%ngm
1112                   !
1113                   vc(dfftt%nl(ig),ii)  = coulomb_fac(ig,iq,current_k) * psi_rhoc_work(dfftt%nl(ig))
1114                   vc(dfftt%nlm(ig),ii) = coulomb_fac(ig,iq,current_k) * psi_rhoc_work(dfftt%nlm(ig))
1115                   !
1116                ENDDO
1117!$omp end parallel do
1118                !
1119                !   >>>>  compute <psi|H_fock G SPACE here
1120                IF (okvan .AND. .NOT. tqr) THEN
1121                   IF (jbnd >= jstart) &
1122                        CALL newdxx_g( dfftt, vc(:,ii), xkq, xkp, 'r', deexx(:,ii), &
1123                           becphi_r=x1*becxx(ikq)%r(:,jbnd) )
1124                   IF (jbnd<jend) &
1125                        CALL newdxx_g( dfftt, vc(:,ii), xkq, xkp, 'i', deexx(:,ii), &
1126                            becphi_r=x2*becxx(ikq)%r(:,jbnd+1) )
1127                ENDIF
1128                !
1129                !brings back v in real space
1130                CALL invfft( 'Rho', vc(:,ii), dfftt )
1131                !
1132                !   >>>>  compute <psi|H_fock REAL SPACE here
1133                IF (okvan .AND. tqr) THEN
1134                   IF (jbnd >= jstart) &
1135                        CALL newdxx_r( dfftt,vc(:,ii), _CX(x1*becxx(ikq)%r(:,jbnd)), deexx(:,ii) )
1136                   IF (jbnd < jend) &
1137                        CALL newdxx_r( dfftt,vc(:,ii), _CY(x2*becxx(ikq)%r(:,jbnd+1)), deexx(:,ii) )
1138                ENDIF
1139                !
1140                IF (okpaw) THEN
1141                   IF (jbnd >= jstart) &
1142                        CALL PAW_newdxx( x1/nqs, _CX(becxx(ikq)%r(:,jbnd)),&
1143                                                _CX(becpsi%r(:,ibnd)), deexx(:,ii) )
1144                   IF (jbnd < jend) &
1145                        CALL PAW_newdxx( x2/nqs, _CX(becxx(ikq)%r(:,jbnd+1)),&
1146                                                _CX(becpsi%r(:,ibnd)), deexx(:,ii) )
1147                ENDIF
1148                !
1149                ! ... accumulates over bands and k points
1150                !
1151!$omp parallel do default(shared), private(ir)
1152                DO ir = 1, nrxxs
1153                   result(ir,ii) = result(ir,ii) &
1154                                 + x1* DBLE(vc(ir,ii))* DBLE(exxbuff(ir,exxbuff_index,ikq)) &
1155                                 + x2*AIMAG(vc(ir,ii))*AIMAG(exxbuff(ir,exxbuff_index,ikq))
1156                ENDDO
1157!$omp end parallel do
1158                !
1159             ENDDO &
1160             IBND_LOOP_GAM
1161             !
1162          ENDDO &
1163          LOOP_ON_PSI_BANDS
1164          !
1165          ! get the next nbnd/negrp data
1166          IF (negrp > 1) CALL mp_circular_shift_left( exxbuff(:,:,ikq), me_egrp, inter_egrp_comm )
1167          !
1168       ENDDO ! iegrp
1169       IF (okvan .AND. .NOT.tqr) CALL qvan_clean()
1170    ENDDO &
1171    INTERNAL_LOOP_ON_Q
1172    !
1173    DO ii = 1, nibands(my_egrp_id+1)
1174       !
1175       ibnd = ibands(ii,my_egrp_id+1)
1176       !
1177       IF (ibnd == 0 .OR. ibnd>m) CYCLE
1178       !
1179       IF (okvan) THEN
1180          CALL mp_sum( deexx(:,ii), intra_egrp_comm )
1181       ENDIF
1182       !
1183       ! ... brings back result in G-space
1184       !
1185       CALL fwfft( 'Wave' , result(:,ii), dfftt )
1186       !
1187       ! ... communicate result
1188       DO ig = 1, n
1189          big_result(ig,ibnd) = big_result(ig,ibnd) - exxalfa*result(dfftt%nl(igk_exx(ig,current_k)),ii)
1190       ENDDO
1191       !
1192       ! ... add non-local \sum_I |beta_I> \alpha_Ii (the sum on i is outside)
1193       IF (okvan) CALL add_nlxx_pot( lda, big_result(:,ibnd), xkp, n, &
1194                                     igk_exx(1,current_k), deexx(:,ii), eps_occ, exxalfa )
1195    ENDDO
1196    !
1197    CALL result_sum( n*npol, m, big_result )
1198    !
1199    IF (iexx_istart(my_egrp_id+1) > 0) THEN
1200       IF (negrp == 1) THEN
1201          ending_im = m
1202       ELSE
1203          ending_im = iexx_iend(my_egrp_id+1) - iexx_istart(my_egrp_id+1) + 1
1204       END IF
1205       DO im = 1, ending_im
1206!$omp parallel do default(shared), private(ig) firstprivate(im,n)
1207           DO ig = 1, n
1208              hpsi(ig,im) = hpsi(ig,im) + big_result(ig,im+iexx_istart(my_egrp_id+1)-1)
1209           ENDDO
1210!$omp end parallel do
1211       ENDDO
1212    ENDIF
1213    !
1214    DEALLOCATE( big_result )
1215    DEALLOCATE( result, temppsic_dble, temppsic_aimag )
1216    DEALLOCATE( psi_rhoc_work )
1217    DEALLOCATE( vc, fac )
1218    IF (okvan) DEALLOCATE( deexx )
1219    !
1220  END SUBROUTINE vexx_gamma
1221  !
1222  !
1223  !-----------------------------------------------------------------------
1224  SUBROUTINE vexx_k( lda, n, m, psi, hpsi, becpsi )
1225    !-----------------------------------------------------------------------
1226    !! Generic, k-point version of vexx.
1227    !
1228    USE constants,      ONLY : fpi, e2, pi
1229    USE cell_base,      ONLY : omega
1230    USE gvect,          ONLY : ngm, g
1231    USE wvfct,          ONLY : npwx, current_k, nbnd
1232    USE klist,          ONLY : xk, nks, nkstot
1233    USE fft_interfaces, ONLY : fwfft, invfft
1234    USE becmod,         ONLY : bec_type
1235    USE mp_exx,         ONLY : inter_egrp_comm, my_egrp_id, negrp, &
1236                               intra_egrp_comm, me_egrp, &
1237                               max_pairs, egrp_pairs, ibands, nibands, &
1238                               max_ibands, iexx_istart, iexx_iend, &
1239                               all_start, all_end, iexx_start, jblock
1240    USE mp,             ONLY : mp_sum, mp_barrier, mp_circular_shift_left
1241    USE uspp,           ONLY : nkb, okvan
1242    USE paw_variables,  ONLY : okpaw
1243    USE us_exx,         ONLY : bexg_merge, becxx, addusxx_g, addusxx_r, &
1244                               newdxx_g, newdxx_r, add_nlxx_pot, &
1245                               qvan_init, qvan_clean
1246    USE paw_exx,        ONLY : PAW_newdxx
1247    USE exx_base,       ONLY : nqs, xkq_collect, index_xkq, index_xk, &
1248                               coulomb_fac, g2_convolution_all
1249    USE exx_band,       ONLY : result_sum, igk_exx
1250    USE io_global,      ONLY : stdout
1251    !
1252    !
1253    IMPLICIT NONE
1254    !
1255    INTEGER :: lda
1256    !! input: leading dimension of arrays psi and hpsi
1257    INTEGER :: n
1258    !! input: true dimension of psi and hpsi
1259    INTEGER :: m
1260    !! input: number of states psi
1261    COMPLEX(DP) :: psi(lda*npol,max_ibands)
1262    !! input: m wavefunctions
1263    COMPLEX(DP) :: hpsi(lda*npol,max_ibands)
1264    !! output: V_x*psi
1265    TYPE(bec_type), OPTIONAL :: becpsi  ! or call a calbec(...psi) instead
1266    !! input: <beta|psi>, optional but needed for US and PAW case
1267    !
1268    ! ... local variables
1269    !
1270    COMPLEX(DP),ALLOCATABLE :: temppsic(:,:), result(:,:)
1271#if defined(__USE_INTEL_HBM_DIRECTIVES)
1272!DIR$ ATTRIBUTES FASTMEM :: result
1273#elif defined(__USE_CRAY_HBM_DIRECTIVES)
1274!DIR$ memory(bandwidth) result
1275#endif
1276    COMPLEX(DP),ALLOCATABLE :: temppsic_nc(:,:,:),result_nc(:,:,:)
1277    INTEGER :: request_send, request_recv
1278    !
1279    COMPLEX(DP),ALLOCATABLE :: deexx(:,:)
1280    COMPLEX(DP),ALLOCATABLE,TARGET :: rhoc(:,:), vc(:,:)
1281#if defined(__USE_MANY_FFT)
1282    COMPLEX(DP),POINTER :: prhoc(:), pvc(:)
1283#endif
1284#if defined(__USE_INTEL_HBM_DIRECTIVES)
1285!DIR$ ATTRIBUTES FASTMEM :: rhoc, vc
1286#elif defined(__USE_CRAY_HBM_DIRECTIVES)
1287!DIR$ memory(bandwidth) rhoc, vc
1288#endif
1289    REAL(DP),   ALLOCATABLE :: fac(:), facb(:)
1290    INTEGER :: ibnd, ik, im , ikq, iq, ipol
1291    INTEGER :: ir, ig, ir_start, ir_end
1292    INTEGER :: irt, nrt, nblock
1293    INTEGER :: current_ik
1294    INTEGER :: ibnd_loop_start
1295    INTEGER :: nrxxs
1296    REAL(DP) :: x1, x2, xkp(3), omega_inv, nqs_inv
1297    REAL(DP) :: xkq(3)
1298    INTEGER, EXTERNAL :: global_kpoint_index
1299    DOUBLE PRECISION :: max, tempx
1300    COMPLEX(DP), ALLOCATABLE :: big_result(:,:)
1301    INTEGER :: ir_out, ipair, jbnd
1302    INTEGER :: ii, jstart, jend, jcount, jind
1303    INTEGER :: ialloc, ending_im
1304    INTEGER :: ijt, njt, jblock_start, jblock_end
1305    INTEGER :: iegrp, wegrp
1306    !
1307    ialloc = nibands(my_egrp_id+1)
1308    !
1309    ALLOCATE( fac(dfftt%ngm) )
1310    nrxxs= dfftt%nnr
1311    ALLOCATE( facb(nrxxs) )
1312    !
1313    IF (noncolin) THEN
1314       ALLOCATE( temppsic_nc(nrxxs,npol,ialloc), result_nc(nrxxs,npol,ialloc) )
1315    ELSE
1316       ALLOCATE( temppsic(nrxxs,ialloc), result(nrxxs,ialloc) )
1317    ENDIF
1318    !
1319    IF (okvan) ALLOCATE( deexx(nkb,ialloc) )
1320    !
1321    current_ik = global_kpoint_index( nkstot, current_k )
1322    xkp = xk(:,current_k)
1323    !
1324    ALLOCATE( big_result(n*npol,m) )
1325    big_result = 0.0_DP
1326    !
1327    ! allocate arrays for rhoc and vc
1328    ALLOCATE( rhoc(nrxxs,jblock), vc(nrxxs,jblock) )
1329#if defined(__USE_MANY_FFT)
1330    prhoc(1:nrxxs*jblock) => rhoc(:,:)
1331    pvc(1:nrxxs*jblock) => vc(:,:)
1332#endif
1333    !
1334    DO ii = 1, nibands(my_egrp_id+1)
1335       !
1336       ibnd = ibands(ii,my_egrp_id+1)
1337       !
1338       IF (ibnd==0 .OR. ibnd>m) CYCLE
1339       IF (okvan) deexx(:,ii) = 0._DP
1340       !
1341       IF (noncolin) THEN
1342          temppsic_nc(:,:,ii) = 0._DP
1343       ELSE
1344!$omp parallel do  default(shared), private(ir) firstprivate(nrxxs)
1345          DO ir = 1, nrxxs
1346             temppsic(ir,ii) = 0._DP
1347          ENDDO
1348       ENDIF
1349       !
1350       IF (noncolin) THEN
1351          !
1352!$omp parallel do  default(shared), private(ig)
1353          DO ig = 1, n
1354             temppsic_nc(dfftt%nl(igk_exx(ig,current_k)),1,ii) = psi(ig,ii)
1355             temppsic_nc(dfftt%nl(igk_exx(ig,current_k)),2,ii) = psi(npwx+ig,ii)
1356          ENDDO
1357!$omp end parallel do
1358          !
1359          CALL invfft( 'Wave', temppsic_nc(:,1,ii), dfftt )
1360          CALL invfft( 'Wave', temppsic_nc(:,2,ii), dfftt )
1361          !
1362       ELSE
1363          !
1364!$omp parallel do  default(shared), private(ig)
1365          DO ig = 1, n
1366             temppsic( dfftt%nl(igk_exx(ig,current_k)), ii ) = psi(ig,ii)
1367          ENDDO
1368!$omp end parallel do
1369          !
1370          CALL invfft( 'Wave', temppsic(:,ii), dfftt )
1371          !
1372       ENDIF
1373       !
1374       IF (noncolin) THEN
1375!$omp parallel do default(shared) firstprivate(nrxxs) private(ir)
1376          DO ir = 1, nrxxs
1377             result_nc(ir,1,ii) = 0.0_DP
1378             result_nc(ir,2,ii) = 0.0_DP
1379          ENDDO
1380       ELSE
1381!$omp parallel do default(shared) firstprivate(nrxxs) private(ir)
1382          DO ir = 1, nrxxs
1383             result(ir,ii) = 0.0_DP
1384          ENDDO
1385       ENDIF
1386       !
1387    ENDDO
1388    !
1389    !precompute these guys
1390    omega_inv = 1.0 / omega
1391    nqs_inv = 1.0 / nqs
1392    !
1393    !------------------------------------------------------------------------!
1394    ! Beginning of main loop
1395    !------------------------------------------------------------------------!
1396    DO iq = 1, nqs
1397       !
1398       ikq = index_xkq(current_ik,iq)
1399       ik  = index_xk(ikq)
1400       xkq = xkq_collect(:,ikq)
1401       !
1402       ! calculate the 1/|r-r'| (actually, k+q+g) factor and place it in fac
1403       CALL g2_convolution_all( dfftt%ngm, gt, xkp, xkq, iq, current_k )
1404       !
1405! JRD - below not threaded
1406       facb = 0D0
1407       DO ig = 1, dfftt%ngm
1408          facb(dfftt%nl(ig)) = coulomb_fac(ig,iq,current_k)
1409       ENDDO
1410       !
1411       IF ( okvan .AND..NOT.tqr ) CALL qvan_init( dfftt%ngm, xkq, xkp )
1412       !
1413       DO iegrp = 1, negrp
1414          !
1415          ! compute the id of group whose data is currently worked on
1416          wegrp = MOD(iegrp+my_egrp_id-1, negrp)+1
1417          njt = (all_end(wegrp)-all_start(wegrp)+jblock)/jblock
1418          !
1419          DO ijt = 1, njt
1420             !
1421             jblock_start = (ijt - 1) * jblock + all_start(wegrp)
1422             jblock_end = MIN(jblock_start+jblock-1,all_end(wegrp))
1423             !
1424             DO ii = 1, nibands(my_egrp_id+1)
1425                !
1426                ibnd = ibands(ii,my_egrp_id+1)
1427                !
1428                IF (ibnd==0 .OR. ibnd>m) CYCLE
1429                !
1430                !determine which j-bands to calculate
1431                jstart = 0
1432                jend = 0
1433                !
1434                DO ipair = 1, max_pairs
1435                   IF (egrp_pairs(1,ipair,my_egrp_id+1) == ibnd) THEN
1436                      IF (jstart == 0) THEN
1437                         jstart = egrp_pairs(2,ipair,my_egrp_id+1)
1438                         jend = jstart
1439                      ELSE
1440                         jend = egrp_pairs(2,ipair,my_egrp_id+1)
1441                      ENDIF
1442                   ENDIF
1443                ENDDO
1444                !
1445                jstart = MAX( jstart, jblock_start )
1446                jend = MIN( jend, jblock_end )
1447                !
1448                !how many iters
1449                jcount = jend-jstart+1
1450                IF (jcount <= 0) CYCLE
1451                !
1452                !----------------------------------------------------------------------!
1453                !INNER LOOP START
1454                !----------------------------------------------------------------------!
1455                !
1456                nblock = 2048
1457                nrt = (nrxxs+nblock-1)/nblock
1458                !
1459!$omp parallel do collapse(2) private(ir_start,ir_end)
1460                DO irt = 1, nrt
1461                   DO jbnd = jstart, jend
1462                      ir_start = (irt - 1) * nblock + 1
1463                      ir_end = MIN(ir_start+nblock-1, nrxxs)
1464                      IF (noncolin) THEN
1465                         DO ir = ir_start, ir_end
1466                            rhoc(ir,jbnd-jstart+1) = ( CONJG(exxbuff(ir,jbnd-all_start(wegrp)+ &
1467                                                       iexx_start,ikq))*temppsic_nc(ir,1,ii) + &
1468                                                       CONJG(exxbuff(nrxxs+ir,jbnd-all_start(wegrp)+ &
1469                                                       iexx_start,ikq))*temppsic_nc(ir,2,ii) )/omega
1470                         ENDDO
1471                      ELSE
1472!DIR$ vector nontemporal (rhoc)
1473                         DO ir = ir_start, ir_end
1474                            rhoc(ir,jbnd-jstart+1) = CONJG(exxbuff(ir,jbnd-all_start(wegrp)+ &
1475                                                     iexx_start,ikq))*temppsic(ir,ii)*omega_inv
1476                         ENDDO
1477                      ENDIF
1478                   ENDDO
1479                ENDDO
1480!$omp end parallel do
1481                !
1482                !   ... add augmentation in REAL space HERE
1483                IF (okvan .AND. tqr) THEN ! augment the "charge" in real space
1484                   DO jbnd = jstart, jend
1485                      CALL addusxx_r(rhoc(:,jbnd-jstart+1), becxx(ikq)%k(:,jbnd), becpsi%k(:,ibnd))
1486                   ENDDO
1487                ENDIF
1488                !
1489                !   ... brings it to G-space
1490#if defined(__USE_MANY_FFT)
1491                CALL fwfft( 'Rho', prhoc, dfftt, howmany=jcount )
1492#else
1493                DO jbnd=jstart, jend
1494                   CALL fwfft( 'Rho', rhoc(:,jbnd-jstart+1), dfftt )
1495                ENDDO
1496#endif
1497                !
1498                !   ... add augmentation in G space HERE
1499                IF (okvan .AND. .NOT. tqr) THEN
1500                   DO jbnd = jstart, jend
1501                      CALL addusxx_g( dfftt, rhoc(:,jbnd-jstart+1), xkq, xkp, &
1502                      'c', becphi_c=becxx(ikq)%k(:,jbnd),becpsi_c=becpsi%k(:,ibnd) )
1503                   ENDDO
1504                ENDIF
1505                !   ... charge done
1506                !
1507!call start_collection()
1508!$omp parallel do collapse(2) private(ir_start,ir_end)
1509                DO irt = 1, nrt
1510                   DO jbnd = jstart, jend
1511                      ir_start = (irt - 1) * nblock + 1
1512                      ir_end = MIN(ir_start+nblock-1,nrxxs)
1513!DIR$ vector nontemporal (vc)
1514                      DO ir = ir_start, ir_end
1515                         vc(ir,jbnd-jstart+1) = facb(ir) * rhoc(ir,jbnd-jstart+1)*&
1516                                                x_occupation(jbnd,ik) * nqs_inv
1517                      ENDDO
1518                   ENDDO
1519                ENDDO
1520!$omp end parallel do
1521!call stop_collection()
1522                !
1523                ! Add ultrasoft contribution (RECIPROCAL SPACE)
1524                ! compute alpha_I,j,k+q = \sum_J \int <beta_J|phi_j,k+q> V_i,j,k,q Q_I,J(r) d3r
1525                IF (okvan .AND. .NOT. tqr) THEN
1526                   DO jbnd=jstart, jend
1527                      CALL newdxx_g( dfftt, vc(:,jbnd-jstart+1), xkq, xkp, 'c',&
1528                                     deexx(:,ii), becphi_c=becxx(ikq)%k(:,jbnd) )
1529                   ENDDO
1530                ENDIF
1531                !
1532                !brings back v in real space
1533#if defined(__USE_MANY_FFT)
1534                !fft many
1535                CALL invfft( 'Rho', pvc, dfftt, howmany=jcount )
1536#else
1537                DO jbnd = jstart, jend
1538                   CALL invfft( 'Rho', vc(:,jbnd-jstart+1), dfftt )
1539                ENDDO
1540#endif
1541                !
1542                ! Add ultrasoft contribution (REAL SPACE)
1543                IF (okvan .AND. tqr) THEN
1544                   DO jbnd = jstart, jend
1545                      CALL newdxx_r( dfftt, vc(:,jbnd-jstart+1), becxx(ikq)%k(:,jbnd),deexx(:,ii) )
1546                   ENDDO
1547                ENDIF
1548                !
1549                ! ... Add PAW one-center contribution
1550                IF (okpaw) THEN
1551                   DO jbnd = jstart, jend
1552                      CALL PAW_newdxx( x_occupation(jbnd,ik)/nqs, becxx(ikq)%k(:,jbnd), &
1553                                       becpsi%k(:,ibnd), deexx(:,ii) )
1554                   ENDDO
1555                ENDIF
1556                !
1557                ! ... accumulates over bands and k points
1558                !
1559!call start_collection()
1560!$omp parallel do private(ir_start,ir_end)
1561                DO irt = 1, nrt
1562                   DO jbnd = jstart, jend
1563                      ir_start = (irt - 1) * nblock + 1
1564                      ir_end = MIN(ir_start+nblock-1, nrxxs)
1565                      IF (noncolin) THEN
1566                         DO ir = ir_start, ir_end
1567                            result_nc(ir,1,ii) = result_nc(ir,1,ii) + vc(ir,jbnd-jstart+1) * &
1568                                                 exxbuff(ir,jbnd-all_start(wegrp)+iexx_start,ikq)
1569                            result_nc(ir,2,ii) = result_nc(ir,2,ii) + vc(ir,jbnd-jstart+1) * &
1570                                                 exxbuff(ir+nrxxs,jbnd-all_start(wegrp)+iexx_start,ikq)
1571                         ENDDO
1572                      ELSE
1573!!dir$ vector nontemporal (result)
1574                         DO ir = ir_start, ir_end
1575                            result(ir,ii) = result(ir,ii) + vc(ir,jbnd-jstart+1)* &
1576                                            exxbuff(ir,jbnd-all_start(wegrp)+iexx_start,ikq)
1577                         ENDDO
1578                      ENDIF
1579                   ENDDO
1580                ENDDO
1581!$omp end parallel do
1582!call stop_collection()
1583                !
1584                !----------------------------------------------------------------------!
1585                !INNER LOOP END
1586                !----------------------------------------------------------------------!
1587                !
1588             ENDDO !I-LOOP
1589          ENDDO !IJT
1590          !
1591          ! get the next nbnd/negrp data
1592          IF (negrp > 1) CALL mp_circular_shift_left( exxbuff(:,:,ikq), me_egrp, inter_egrp_comm )
1593          !
1594       ENDDO !iegrp
1595       !
1596       IF ( okvan .AND..NOT.tqr ) CALL qvan_clean()
1597       !
1598    ENDDO
1599    !
1600    !
1601    !
1602    DO ii = 1, nibands(my_egrp_id+1)
1603       !
1604       ibnd = ibands(ii,my_egrp_id+1)
1605       !
1606       IF (ibnd==0 .OR. ibnd>m) CYCLE
1607       !
1608       IF (okvan) THEN
1609          CALL mp_sum( deexx(:,ii), intra_egrp_comm )
1610       ENDIF
1611       !
1612       IF (noncolin) THEN
1613          !brings back result in G-space
1614          CALL fwfft( 'Wave', result_nc(:,1,ii), dfftt )
1615          CALL fwfft( 'Wave', result_nc(:,2,ii), dfftt )
1616          !
1617          DO ig = 1, n
1618             big_result(ig,ibnd) = big_result(ig,ibnd) - exxalfa* &
1619                                   result_nc(dfftt%nl(igk_exx(ig,current_k)),1,ii)
1620             big_result(n+ig,ibnd) = big_result(n+ig,ibnd) - exxalfa* &
1621                                     result_nc(dfftt%nl(igk_exx(ig,current_k)),2,ii)
1622          ENDDO
1623       ELSE
1624          !
1625          CALL fwfft( 'Wave', result(:,ii), dfftt )
1626          !
1627          DO ig = 1, n
1628             big_result(ig,ibnd) = big_result(ig,ibnd) - exxalfa* &
1629                                   result(dfftt%nl(igk_exx(ig,current_k)),ii)
1630          ENDDO
1631       ENDIF
1632       !
1633       ! add non-local \sum_I |beta_I> \alpha_Ii (the sum on i is outside)
1634       IF (okvan) CALL add_nlxx_pot( lda, big_result(:,ibnd), xkp, n, igk_exx(:,current_k), &
1635                                    deexx(:,ii), eps_occ, exxalfa )
1636       !
1637    ENDDO
1638    !
1639    !deallocate temporary arrays
1640    DEALLOCATE( rhoc, vc )
1641    !
1642    !sum result
1643    CALL result_sum( n*npol, m, big_result )
1644    !
1645    IF (iexx_istart(my_egrp_id+1) > 0) THEN
1646       !
1647       IF (negrp == 1) THEN
1648          ending_im = m
1649       ELSE
1650          ending_im = iexx_iend(my_egrp_id+1) - iexx_istart(my_egrp_id+1) + 1
1651       ENDIF
1652       !
1653       IF (noncolin) THEN
1654          DO im = 1, ending_im
1655!$omp parallel do default(shared), private(ig) firstprivate(im,n)
1656             DO ig = 1, n
1657                hpsi(ig,im) = hpsi(ig,im) + big_result(ig,im+iexx_istart(my_egrp_id+1)-1)
1658             ENDDO
1659!$omp end parallel do
1660!$omp parallel do default(shared), private(ig) firstprivate(im,n)
1661             DO ig = 1, n
1662                hpsi(lda+ig,im) = hpsi(lda+ig,im) + big_result(n+ig,im+iexx_istart(my_egrp_id+1)-1)
1663             ENDDO
1664!$omp end parallel do
1665          ENDDO
1666       ELSE
1667          DO im = 1, ending_im
1668!$omp parallel do default(shared), private(ig) firstprivate(im,n)
1669             DO ig = 1, n
1670                hpsi(ig,im) = hpsi(ig,im) + big_result(ig,im+iexx_istart(my_egrp_id+1)-1)
1671             ENDDO
1672!$omp end parallel do
1673          ENDDO
1674       ENDIF
1675    ENDIF
1676    !
1677    IF (noncolin) THEN
1678       DEALLOCATE( temppsic_nc, result_nc )
1679    ELSE
1680       DEALLOCATE( temppsic, result )
1681    ENDIF
1682    !
1683    DEALLOCATE( big_result )
1684    DEALLOCATE( fac, facb )
1685    IF (okvan) DEALLOCATE( deexx )
1686    !
1687  END SUBROUTINE vexx_k
1688  !
1689  !
1690  !-----------------------------------------------------------------------
1691  FUNCTION exxenergy()
1692    !-----------------------------------------------------------------------
1693    !! NB: This function is meant to give the SAME RESULT as exxenergy2.
1694    !! It is worth keeping it in the repository because in spite of being
1695    !! slower it is a simple driver using vexx potential routine so it is
1696    !! good, from time to time, to replace exxenergy2 with it to check that
1697    !! everything is ok and energy and potential are consistent as they should.
1698    !
1699    USE io_files,               ONLY : iunwfc_exx, nwordwfc
1700    USE buffers,                ONLY : get_buffer
1701    USE wvfct,                  ONLY : nbnd, npwx, wg, current_k
1702    USE gvect,                  ONLY : gstart
1703    USE wavefunctions,          ONLY : evc
1704    USE lsda_mod,               ONLY : lsda, current_spin, isk
1705    USE klist,                  ONLY : ngk, nks, xk
1706    USE mp_pools,               ONLY : inter_pool_comm
1707    USE mp_exx,                 ONLY : intra_egrp_comm, intra_egrp_comm, &
1708                                       negrp
1709    USE mp,                     ONLY : mp_sum
1710    USE becmod,                 ONLY : bec_type, allocate_bec_type, &
1711                                       deallocate_bec_type, calbec
1712    USE uspp,                   ONLY : okvan,nkb,vkb
1713    USE exx_band,               ONLY : nwordwfc_exx, igk_exx
1714    !
1715    IMPLICIT NONE
1716    !
1717    TYPE(bec_type) :: becpsi
1718    REAL(DP) :: exxenergy,  energy
1719    INTEGER :: npw, ibnd, ik
1720    COMPLEX(DP) :: vxpsi(npwx*npol,nbnd), psi(npwx*npol,nbnd)
1721    !
1722    exxenergy = 0._DP
1723    !
1724    CALL start_clock( 'exxenergy' )
1725    !
1726    IF (okvan) CALL allocate_bec_type( nkb, nbnd, becpsi )
1727    energy = 0._dp
1728    !
1729    DO ik = 1, nks
1730       npw = ngk(ik)
1731       ! setup variables for usage by vexx (same logic as for H_psi)
1732       current_k = ik
1733       IF ( lsda ) current_spin = isk(ik)
1734       ! end setup
1735       IF ( nks > 1 ) THEN
1736          CALL get_buffer( psi, nwordwfc_exx, iunwfc_exx, ik )
1737       ELSE
1738          psi(1:npwx*npol,1:nbnd) = evc(1:npwx*npol,1:nbnd)
1739       ENDIF
1740       !
1741       IF (okvan) THEN
1742          ! prepare the |beta> function at k+q
1743          CALL init_us_2( npw, igk_exx(1,ik), xk(:,ik), vkb )
1744          ! compute <beta_I|psi_j> at this k+q point, for all band and all projectors
1745          CALL calbec( npw, vkb, psi, becpsi, nbnd )
1746       ENDIF
1747       !
1748       vxpsi(:,:) = (0._dp, 0._dp)
1749       CALL vexx( npwx, npw, nbnd, psi, vxpsi, becpsi )
1750       !
1751       DO ibnd = 1, nbnd
1752          energy = energy + DBLE(wg(ibnd,ik) * dot_product(psi(1:npw,ibnd),vxpsi(1:npw,ibnd)))
1753          IF (noncolin) energy = energy + &
1754                  DBLE(wg(ibnd,ik) * dot_product(psi(npwx+1:npwx+npw,ibnd),vxpsi(npwx+1:npwx+npw,ibnd)))
1755          !
1756       ENDDO
1757       IF (gamma_only .AND. gstart == 2) THEN
1758           DO ibnd = 1, nbnd
1759              energy = energy - &
1760                       DBLE(0.5_dp * wg(ibnd,ik) * CONJG(psi(1,ibnd)) * vxpsi(1,ibnd))
1761           ENDDO
1762       ENDIF
1763    ENDDO
1764    !
1765    IF (gamma_only) energy = 2 * energy
1766    !
1767    CALL mp_sum( energy, intra_egrp_comm )
1768    CALL mp_sum( energy, inter_pool_comm )
1769    IF (okvan)  CALL deallocate_bec_type( becpsi )
1770    !
1771    exxenergy = energy
1772    !
1773    CALL stop_clock( 'exxenergy' )
1774    !
1775  END FUNCTION exxenergy
1776  !
1777  !
1778  !-----------------------------------------------------------------------
1779  FUNCTION exxenergy2()
1780    !-----------------------------------------------------------------------
1781    !! Wrapper to \(\texttt{exxenergy2_gamma}\) and \(\texttt{exxenergy2_k}\).
1782    !
1783    IMPLICIT NONE
1784    !
1785    REAL(DP) :: exxenergy2
1786    !
1787    CALL start_clock( 'exxenergy' )
1788    !
1789    IF ( gamma_only ) THEN
1790       exxenergy2 = exxenergy2_gamma()
1791    ELSE
1792       exxenergy2 = exxenergy2_k()
1793    ENDIF
1794    !
1795    CALL stop_clock( 'exxenergy' )
1796    !
1797  END FUNCTION  exxenergy2
1798  !
1799  !
1800  !-----------------------------------------------------------------------
1801  FUNCTION exxenergy2_gamma()
1802    !-----------------------------------------------------------------------
1803    !
1804    USE constants,               ONLY : fpi, e2, pi
1805    USE io_files,                ONLY : iunwfc_exx, nwordwfc
1806    USE buffers,                 ONLY : get_buffer
1807    USE cell_base,               ONLY : alat, omega, bg, at, tpiba
1808    USE symm_base,               ONLY : nsym, s
1809    USE gvect,                   ONLY : ngm, gstart, g
1810    USE wvfct,                   ONLY : nbnd, npwx, wg
1811    USE wavefunctions,           ONLY : evc
1812    USE klist,                   ONLY : xk, ngk, nks, nkstot
1813    USE lsda_mod,                ONLY : lsda, current_spin, isk
1814    USE mp_pools,                ONLY : inter_pool_comm
1815    USE mp_bands,                ONLY : intra_bgrp_comm
1816    USE mp_exx,                  ONLY : inter_egrp_comm, my_egrp_id, negrp, &
1817                                        intra_egrp_comm, me_egrp, &
1818                                        max_pairs, egrp_pairs, ibands, nibands, &
1819                                        iexx_istart, iexx_iend, &
1820                                        all_start, all_end, iexx_start, &
1821                                        init_index_over_band, jblock
1822    USE mp,                      ONLY : mp_sum, mp_circular_shift_left
1823    USE fft_interfaces,          ONLY : fwfft, invfft
1824    USE gvect,                   ONLY : ecutrho
1825    USE klist,                   ONLY : wk
1826    USE uspp,                    ONLY : okvan,nkb,vkb
1827    USE becmod,                  ONLY : bec_type, allocate_bec_type, &
1828                                        deallocate_bec_type, calbec
1829    USE paw_variables,           ONLY : okpaw
1830    USE paw_exx,                 ONLY : PAW_xx_energy
1831    USE us_exx,                  ONLY : bexg_merge, becxx, addusxx_g, &
1832                                        addusxx_r, qvan_init, qvan_clean
1833    USE exx_base,                ONLY : nqs, xkq_collect, index_xkq, index_xk, &
1834                                        coulomb_fac, g2_convolution_all
1835    USE exx_band,                ONLY : igk_exx, change_data_structure, &
1836                                        transform_evc_to_exx, nwordwfc_exx, &
1837                                        evc_exx
1838    !
1839    IMPLICIT NONE
1840    !
1841    REAL(DP)   :: exxenergy2_gamma
1842    !
1843    ! ... local variables
1844    !
1845    REAL(DP) :: energy
1846    COMPLEX(DP), ALLOCATABLE :: temppsic(:)
1847    COMPLEX(DP), ALLOCATABLE :: rhoc(:)
1848    REAL(DP),    ALLOCATABLE :: fac(:)
1849    COMPLEX(DP), ALLOCATABLE :: vkb_exx(:,:)
1850    INTEGER  :: jbnd, ibnd, ik, ikk, ig, ikq, iq, ir
1851    INTEGER  :: nrxxs, current_ik, ibnd_loop_start
1852    REAL(DP) :: x1, x2
1853    REAL(DP) :: xkq(3), xkp(3), vc
1854    INTEGER, EXTERNAL :: global_kpoint_index
1855    !
1856    TYPE(bec_type) :: becpsi
1857    COMPLEX(DP), ALLOCATABLE :: psi_t(:), prod_tot(:)
1858    REAL(DP),ALLOCATABLE :: temppsic_dble (:)
1859    REAL(DP),ALLOCATABLE :: temppsic_aimag(:)
1860    INTEGER :: npw
1861    INTEGER :: istart, iend, ipair, ii, ialloc
1862    INTEGER :: ijt, njt, jblock_start, jblock_end
1863    INTEGER :: exxbuff_index
1864    INTEGER :: calbec_start, calbec_end
1865    INTEGER :: intra_bgrp_comm_
1866    INTEGER :: iegrp, wegrp
1867    !
1868    CALL init_index_over_band( inter_egrp_comm, nbnd, nbnd )
1869    !
1870    CALL transform_evc_to_exx( 0 )
1871    !
1872    ialloc = nibands(my_egrp_id+1)
1873    !
1874    nrxxs = dfftt%nnr
1875    ALLOCATE( fac(dfftt%ngm) )
1876    !
1877    ALLOCATE( temppsic(nrxxs), temppsic_DBLE(nrxxs), temppsic_aimag(nrxxs) )
1878    ALLOCATE( rhoc(nrxxs) )
1879    ALLOCATE( vkb_exx(npwx,nkb) )
1880    !
1881    energy = 0.0_DP
1882    !
1883    CALL allocate_bec_type( nkb, nbnd, becpsi )
1884    !
1885    IKK_LOOP : &
1886    DO ikk = 1, nks
1887       current_ik = global_kpoint_index( nkstot, ikk )
1888       xkp = xk(:,ikk)
1889       !
1890       IF ( lsda ) current_spin = isk(ikk)
1891       npw = ngk (ikk)
1892       IF ( nks > 1 ) CALL get_buffer( evc_exx, nwordwfc_exx, iunwfc_exx, ikk )
1893       !
1894       ! ... prepare the |beta> function at k+q
1895       CALL init_us_2( npw, igk_exx(:,ikk), xkp, vkb_exx )
1896       !
1897       ! ... compute <beta_I|psi_j> at this k+q point, for all band and all projectors
1898       calbec_start = ibands(1,my_egrp_id+1)
1899       calbec_end = ibands(nibands(my_egrp_id+1),my_egrp_id+1)
1900       !
1901       intra_bgrp_comm_ = intra_bgrp_comm
1902       intra_bgrp_comm = intra_egrp_comm
1903       !
1904       CALL calbec( npw, vkb_exx, evc_exx, becpsi, nibands(my_egrp_id+1) )
1905       !
1906       intra_bgrp_comm = intra_bgrp_comm_
1907       !
1908       IQ_LOOP : &
1909       DO iq = 1,nqs
1910          !
1911          ikq  = index_xkq(current_ik,iq)
1912          ik   = index_xk(ikq)
1913          !
1914          xkq = xkq_collect(:,ikq)
1915          !
1916          CALL g2_convolution_all( dfftt%ngm, gt, xkp, xkq, iq, current_ik )
1917          !
1918          fac = coulomb_fac(:,iq,current_ik)
1919          fac(gstart_t:) = 2 * coulomb_fac(gstart_t:,iq,current_ik)
1920          !
1921          IF ( okvan .AND. .NOT.tqr ) CALL qvan_init( dfftt%ngm, xkq, xkp )
1922          !
1923          DO iegrp = 1, negrp
1924             !
1925             ! ... compute the id of group whose data is currently worked on
1926             wegrp = MOD(iegrp+my_egrp_id-1, negrp)+1
1927             !
1928             jblock_start = all_start(wegrp)
1929             jblock_end   = all_end(wegrp)
1930             !
1931             JBND_LOOP : &
1932             DO ii = 1, nibands(my_egrp_id+1)
1933                !
1934                jbnd = ibands(ii,my_egrp_id+1)
1935                !
1936                IF (jbnd==0 .OR. jbnd>nbnd) CYCLE
1937                !
1938                IF ( MOD(ii,2)==1 ) THEN
1939                   !
1940                   temppsic = (0._DP,0._DP)
1941                   !
1942                   IF ( (ii+1) <= nibands(my_egrp_id+1) ) THEN
1943                      ! deal with double bands
1944!$omp parallel do  default(shared), private(ig)
1945                      DO ig = 1, npwt
1946                         temppsic( dfftt%nl(ig) )  = &
1947                              evc_exx(ig,ii) + (0._DP,1._DP) * evc_exx(ig,ii+1)
1948                         temppsic( dfftt%nlm(ig) ) = &
1949                              CONJG(evc_exx(ig,ii) - (0._DP,1._DP) * evc_exx(ig,ii+1))
1950                      ENDDO
1951!$omp end parallel do
1952                   ENDIF
1953                   !
1954                   IF (ii == nibands(my_egrp_id+1)) THEN
1955                      ! deal with a single last band
1956!$omp parallel do  default(shared), private(ig)
1957                      DO ig = 1, npwt
1958                         temppsic( dfftt%nl(ig) ) = evc_exx(ig,ii)
1959                         temppsic( dfftt%nlm(ig) ) = CONJG(evc_exx(ig,ii))
1960                      ENDDO
1961!$omp end parallel do
1962                   ENDIF
1963                   !
1964                   CALL invfft( 'Wave', temppsic, dfftt )
1965!$omp parallel do default(shared), private(ir)
1966                   DO ir = 1, nrxxs
1967                      temppsic_DBLE(ir) = DBLE( temppsic(ir) )
1968                      temppsic_aimag(ir) = AIMAG( temppsic(ir) )
1969                   ENDDO
1970!$omp end parallel do
1971                   !
1972                ENDIF
1973                !
1974                !determine which j-bands to calculate
1975                istart = 0
1976                iend = 0
1977                !
1978                DO ipair = 1, max_pairs
1979                   IF (egrp_pairs(1,ipair,my_egrp_id+1) == jbnd) THEN
1980                      IF (istart == 0) THEN
1981                         istart = egrp_pairs(2,ipair,my_egrp_id+1)
1982                         iend = istart
1983                      ELSE
1984                         iend = egrp_pairs(2,ipair,my_egrp_id+1)
1985                      ENDIF
1986                   ENDIF
1987                ENDDO
1988                !
1989                istart = MAX(istart,jblock_start)
1990                iend = MIN(iend,jblock_end)
1991                !
1992                IF (MOD(istart,2) == 0) THEN
1993                   ibnd_loop_start = istart-1
1994                ELSE
1995                   ibnd_loop_start = istart
1996                ENDIF
1997                !
1998                IBND_LOOP_GAM : &
1999                DO ibnd = ibnd_loop_start, iend, 2       !for each band of psi
2000                   !
2001                   exxbuff_index = (ibnd+1)/2-(all_start(wegrp)+1)/2+(iexx_start+1)/2
2002                   !
2003                   IF ( ibnd < istart ) THEN
2004                      x1 = 0.0_DP
2005                   ELSE
2006                      x1 = x_occupation(ibnd,ik)
2007                   ENDIF
2008                   !
2009                   IF ( ibnd < iend ) THEN
2010                      x2 = x_occupation(ibnd+1,ik)
2011                   ELSE
2012                      x2 = 0.0_DP
2013                   ENDIF
2014                   ! calculate rho in real space. Gamma tricks are used.
2015                   ! temppsic is real; tempphic contains band 1 in the real part,
2016                   ! band 2 in the imaginary part; the same applies to rhoc
2017                   !
2018                   IF ( MOD(ii,2) == 0 ) THEN
2019                      rhoc = 0.0_DP
2020!$omp parallel do default(shared), private(ir)
2021                      DO ir = 1, nrxxs
2022                         rhoc(ir) = exxbuff(ir,exxbuff_index,ikq) * temppsic_aimag(ir) / omega
2023                      ENDDO
2024!$omp end parallel do
2025                   ELSE
2026!$omp parallel do default(shared), private(ir)
2027                      DO ir = 1, nrxxs
2028                         rhoc(ir) = exxbuff(ir,exxbuff_index,ikq) * temppsic_DBLE(ir) / omega
2029                      ENDDO
2030!$omp end parallel do
2031                   ENDIF
2032                   !
2033                   IF (okvan .AND. tqr) THEN
2034                      IF (ibnd >= istart) &
2035                           CALL addusxx_r( rhoc,_CX(becxx(ikq)%r(:,ibnd)), &
2036                                          _CX(becpsi%r(:,jbnd)) )
2037                      IF (ibnd<iend) &
2038                           CALL addusxx_r(rhoc,_CY(becxx(ikq)%r(:,ibnd+1)), &
2039                                          _CX(becpsi%r(:,jbnd)))
2040                   ENDIF
2041                   !
2042                   ! bring rhoc to G-space
2043                   CALL fwfft( 'Rho', rhoc, dfftt )
2044                   !
2045                   IF (okvan .AND. .NOT.tqr) THEN
2046                      IF (ibnd >= istart) &
2047                           CALL addusxx_g( dfftt, rhoc, xkq, xkp, 'r', &
2048                           becphi_r=becxx(ikq)%r(:,ibnd), becpsi_r=becpsi%r(:,jbnd-calbec_start+1) )
2049                      IF (ibnd < iend) &
2050                           CALL addusxx_g( dfftt, rhoc, xkq, xkp, 'i', &
2051                           becphi_r=becxx(ikq)%r(:,ibnd+1), becpsi_r=becpsi%r(:,jbnd-calbec_start+1) )
2052                   ENDIF
2053                   !
2054                   vc = 0.0_DP
2055!$omp parallel do  default(shared), private(ig),  reduction(+:vc)
2056                   DO ig = 1, dfftt%ngm
2057                      !
2058                      ! The real part of rhoc contains the contribution from band ibnd
2059                      ! The imaginary part    contains the contribution from band ibnd+1
2060                      !
2061                      vc = vc + fac(ig) * ( x1 * &
2062                           ABS( rhoc(dfftt%nl(ig)) + CONJG(rhoc(dfftt%nlm(ig))) )**2 &
2063                                 +x2 * &
2064                           ABS( rhoc(dfftt%nl(ig)) - CONJG(rhoc(dfftt%nlm(ig))) )**2 )
2065                   ENDDO
2066!$omp end parallel do
2067                   !
2068                   vc = vc * omega * 0.25_DP / nqs
2069                   energy = energy - exxalfa * vc * wg(jbnd,ikk)
2070                   !
2071                   IF (okpaw) THEN
2072                      IF (ibnd >= ibnd_start) &
2073                           energy = energy + exxalfa*wg(jbnd,ikk)*&
2074                           x1 * PAW_xx_energy(_CX(becxx(ikq)%r(:,ibnd)),_CX(becpsi%r(:,jbnd)) )
2075                      IF (ibnd < ibnd_end) &
2076                           energy = energy + exxalfa*wg(jbnd,ikk)*&
2077                           x2 * PAW_xx_energy(_CX(becxx(ikq)%r(:,ibnd+1)), _CX(becpsi%r(:,jbnd)) )
2078                   ENDIF
2079                   !
2080                ENDDO &
2081                IBND_LOOP_GAM
2082             ENDDO &
2083             JBND_LOOP
2084             !
2085             ! get the next nbnd/negrp data
2086             IF (negrp > 1) CALL mp_circular_shift_left( exxbuff(:,:,ikq), me_egrp, inter_egrp_comm )
2087             !
2088          ENDDO ! iegrp
2089          IF ( okvan .AND. .NOT.tqr ) CALL qvan_clean( )
2090          !
2091       ENDDO &
2092       IQ_LOOP
2093    ENDDO &
2094    IKK_LOOP
2095    !
2096    DEALLOCATE( temppsic, temppsic_dble, temppsic_aimag )
2097    !
2098    DEALLOCATE( rhoc, fac )
2099    CALL deallocate_bec_type( becpsi )
2100    !
2101    CALL mp_sum( energy, inter_egrp_comm )
2102    CALL mp_sum( energy, intra_egrp_comm )
2103    CALL mp_sum( energy, inter_pool_comm )
2104    !
2105    exxenergy2_gamma = energy
2106    !
2107    CALL change_data_structure( .FALSE. )
2108    !
2109  END FUNCTION  exxenergy2_gamma
2110  !
2111  !
2112  !-----------------------------------------------------------------------
2113  FUNCTION exxenergy2_k()
2114    !-----------------------------------------------------------------------
2115    !
2116    USE constants,               ONLY : fpi, e2, pi
2117    USE io_files,                ONLY : iunwfc_exx, nwordwfc
2118    USE buffers,                 ONLY : get_buffer
2119    USE cell_base,               ONLY : alat, omega, bg, at, tpiba
2120    USE symm_base,               ONLY : nsym, s
2121    USE gvect,                   ONLY : ngm, gstart, g
2122    USE wvfct,                   ONLY : nbnd, npwx, wg
2123    USE wavefunctions,           ONLY : evc
2124    USE klist,                   ONLY : xk, ngk, nks, nkstot
2125    USE lsda_mod,                ONLY : lsda, current_spin, isk
2126    USE mp_pools,                ONLY : inter_pool_comm
2127    USE mp_exx,                  ONLY : inter_egrp_comm, my_egrp_id, negrp, &
2128                                        intra_egrp_comm, me_egrp, &
2129                                        max_pairs, egrp_pairs, ibands, nibands, &
2130                                        iexx_istart, iexx_iend, &
2131                                        all_start, all_end, iexx_start, &
2132                                        init_index_over_band, jblock
2133    USE mp_bands,                ONLY : intra_bgrp_comm
2134    USE mp,                      ONLY : mp_sum, mp_circular_shift_left
2135    USE fft_interfaces,          ONLY : fwfft, invfft
2136    USE gvect,                   ONLY : ecutrho
2137    USE klist,                   ONLY : wk
2138    USE uspp,                    ONLY : okvan,nkb,vkb
2139    USE becmod,                  ONLY : bec_type, allocate_bec_type, &
2140                                        deallocate_bec_type, calbec
2141    USE paw_variables,           ONLY : okpaw
2142    USE paw_exx,                 ONLY : PAW_xx_energy
2143    USE us_exx,                  ONLY : bexg_merge, becxx, addusxx_g, &
2144                                        addusxx_r, qvan_init, qvan_clean
2145    USE exx_base,                ONLY : nqs, xkq_collect, index_xkq, index_xk, &
2146                                        coulomb_fac, g2_convolution_all
2147    USE exx_band,                ONLY : change_data_structure, &
2148                                        transform_evc_to_exx, nwordwfc_exx, &
2149                                        igk_exx, evc_exx
2150    !
2151    IMPLICIT NONE
2152    !
2153    REAL(DP)   :: exxenergy2_k
2154    !
2155    ! ... local variables
2156    !
2157    REAL(DP) :: energy
2158    COMPLEX(DP), ALLOCATABLE :: temppsic(:,:)
2159    COMPLEX(DP), ALLOCATABLE :: temppsic_nc(:,:,:)
2160    COMPLEX(DP), ALLOCATABLE,TARGET :: rhoc(:,:)
2161#if defined(__USE_MANY_FFT)
2162    COMPLEX(DP), POINTER :: prhoc(:)
2163#endif
2164    REAL(DP),    ALLOCATABLE :: fac(:)
2165    INTEGER  :: npw, jbnd, ibnd, ibnd_inner_start, ibnd_inner_end, ibnd_inner_count, &
2166                ik, ikk, ig, ikq, iq, ir
2167    INTEGER  :: h_ibnd, nrxxs, current_ik, ibnd_loop_start, nblock, nrt, irt, &
2168                ir_start, ir_end
2169    REAL(DP) :: x1, x2
2170    REAL(DP) :: xkq(3), xkp(3), vc, omega_inv
2171    INTEGER, EXTERNAL :: global_kpoint_index
2172    !
2173    TYPE(bec_type) :: becpsi
2174    COMPLEX(DP), ALLOCATABLE :: psi_t(:), prod_tot(:)
2175    INTEGER :: intra_bgrp_comm_
2176    INTEGER :: ii, ialloc, jstart, jend, ipair
2177    INTEGER :: ijt, njt, jblock_start, jblock_end
2178    INTEGER :: iegrp, wegrp
2179    !
2180    CALL init_index_over_band( inter_egrp_comm, nbnd, nbnd )
2181    !
2182    CALL transform_evc_to_exx( 0 )
2183    !
2184    ialloc = nibands(my_egrp_id+1)
2185    !
2186    nrxxs = dfftt%nnr
2187    ALLOCATE( fac(dfftt%ngm) )
2188    !
2189    IF (noncolin) THEN
2190       ALLOCATE( temppsic_nc(nrxxs,npol,ialloc) )
2191    ELSE
2192       ALLOCATE( temppsic(nrxxs,ialloc) )
2193    ENDIF
2194    !
2195    energy = 0.0_DP
2196    !
2197    CALL allocate_bec_type( nkb, nbnd, becpsi )
2198    !
2199    !precompute that stuff
2200    omega_inv = 1.0/omega
2201    !
2202    IKK_LOOP : &
2203    DO ikk = 1, nks
2204       !
2205       current_ik = global_kpoint_index ( nkstot, ikk )
2206       xkp = xk(:,ikk)
2207       !
2208       IF ( lsda ) current_spin = isk(ikk)
2209       npw = ngk(ikk)
2210       IF ( nks > 1 ) CALL get_buffer( evc_exx, nwordwfc_exx, iunwfc_exx, ikk )
2211       !
2212       ! compute <beta_I|psi_j> at this k+q point, for all band and all projectors
2213       intra_bgrp_comm_ = intra_bgrp_comm
2214       intra_bgrp_comm = intra_egrp_comm
2215       !
2216       IF (okvan .OR. okpaw) THEN
2217          CALL compute_becpsi( npw, igk_exx(:,ikk), xkp, evc_exx, &
2218                               becpsi%k(:,ibands(1,my_egrp_id+1)) )
2219       ENDIF
2220       !
2221       intra_bgrp_comm = intra_bgrp_comm_
2222       !
2223       ! ... precompute temppsic
2224       !
2225       IF (noncolin) THEN
2226          temppsic_nc = 0.0_DP
2227       ELSE
2228          temppsic = 0.0_DP
2229       ENDIF
2230       !
2231       DO ii = 1, nibands(my_egrp_id+1)
2232          !
2233          jbnd = ibands(ii,my_egrp_id+1)
2234          !
2235          IF (jbnd == 0 .OR. jbnd > nbnd) CYCLE
2236          !
2237          !IF ( abs(wg(jbnd,ikk)) < eps_occ) CYCLE
2238          !
2239          IF (noncolin) THEN
2240             !
2241!$omp parallel do default(shared), private(ig)
2242             DO ig = 1, npw
2243                temppsic_nc(dfftt%nl(igk_exx(ig,ikk)),1,ii) = evc_exx(ig,ii)
2244                temppsic_nc(dfftt%nl(igk_exx(ig,ikk)),2,ii) = evc_exx(npwx+ig,ii)
2245             ENDDO
2246!$omp end parallel do
2247             !
2248             CALL invfft( 'Wave', temppsic_nc(:,1,ii), dfftt )
2249             CALL invfft( 'Wave', temppsic_nc(:,2,ii), dfftt )
2250             !
2251          ELSE
2252!$omp parallel do default(shared), private(ig)
2253             DO ig = 1, npw
2254                temppsic(dfftt%nl(igk_exx(ig,ikk)),ii) = evc_exx(ig,ii)
2255             ENDDO
2256!$omp end parallel do
2257             !
2258             CALL invfft( 'Wave', temppsic(:,ii), dfftt )
2259             !
2260          ENDIF
2261       ENDDO
2262       !
2263       IQ_LOOP : &
2264       DO iq = 1,nqs
2265          !
2266          ikq  = index_xkq(current_ik,iq)
2267          ik   = index_xk(ikq)
2268          !
2269          xkq = xkq_collect(:,ikq)
2270          !
2271          CALL g2_convolution_all( dfftt%ngm, gt, xkp, xkq, iq, ikk )
2272          IF ( okvan .AND..NOT.tqr ) CALL qvan_init( dfftt%ngm, xkq, xkp )
2273          !
2274          DO iegrp = 1, negrp
2275             !
2276             ! ... compute the id of group whose data is currently worked on
2277             wegrp = MOD(iegrp+my_egrp_id-1, negrp)+1
2278             njt = (all_end(wegrp)-all_start(wegrp)+jblock)/jblock
2279             !
2280             IJT_LOOP : &
2281             DO ijt = 1, njt
2282                !
2283                jblock_start = (ijt - 1) * jblock + all_start(wegrp)
2284                jblock_end = MIN(jblock_start+jblock-1,all_end(wegrp))
2285                !
2286                JBND_LOOP : &
2287                DO ii = 1, nibands(my_egrp_id+1)
2288                   !
2289                   jbnd = ibands(ii,my_egrp_id+1)
2290                   !
2291                   IF (jbnd==0 .OR. jbnd>nbnd) CYCLE
2292                   !
2293                   !determine which j-bands to calculate
2294                   jstart = 0
2295                   jend = 0
2296                   !
2297                   DO ipair = 1, max_pairs
2298                      IF (egrp_pairs(1,ipair,my_egrp_id+1) == jbnd) THEN
2299                         IF (jstart == 0) THEN
2300                            jstart = egrp_pairs(2,ipair,my_egrp_id+1)
2301                            jend = jstart
2302                         ELSE
2303                            jend = egrp_pairs(2,ipair,my_egrp_id+1)
2304                         ENDIF
2305                      ENDIF
2306                   ENDDO
2307                   !
2308                   !these variables prepare for inner band parallelism
2309                   jstart = MAX(jstart,jblock_start)
2310                   jend = MIN(jend,jblock_end)
2311                   ibnd_inner_start = jstart
2312                   ibnd_inner_end = jend
2313                   ibnd_inner_count = jend-jstart+1
2314                   !
2315                   !allocate arrays
2316                   ALLOCATE( rhoc(nrxxs,ibnd_inner_count) )
2317#if defined(__USE_MANY_FFT)
2318                   prhoc(1:nrxxs*ibnd_inner_count) => rhoc
2319#endif
2320                   !calculate rho in real space
2321                   nblock = 2048
2322                   nrt = (nrxxs+nblock-1) / nblock
2323!$omp parallel do collapse(2) private(ir_start,ir_end)
2324                   DO irt = 1, nrt
2325                      DO ibnd = ibnd_inner_start, ibnd_inner_end
2326                         ir_start = (irt - 1) * nblock + 1
2327                         ir_end = MIN(ir_start+nblock-1,nrxxs)
2328                         IF (noncolin) THEN
2329                            DO ir = ir_start, ir_end
2330                               rhoc(ir,ibnd-ibnd_inner_start+1) = &
2331                                 ( CONJG(exxbuff(ir,ibnd-all_start(wegrp)+iexx_start,ikq)) * &
2332                                 temppsic_nc(ir,1,ii) + &
2333                                 CONJG(exxbuff(nrxxs+ir,ibnd-all_start(wegrp)+iexx_start,ikq)) * &
2334                                 temppsic_nc(ir,2,ii) ) * omega_inv
2335                            ENDDO
2336                         ELSE
2337                            DO ir = ir_start, ir_end
2338                               rhoc(ir,ibnd-ibnd_inner_start+1) = omega_inv * &
2339                                 CONJG(exxbuff(ir,ibnd-all_start(wegrp)+iexx_start,ikq)) * &
2340                                 temppsic(ir,ii)
2341                            ENDDO
2342                         ENDIF
2343                      ENDDO
2344                   ENDDO
2345!$omp end parallel do
2346                   !
2347                   ! augment the "charge" in real space
2348                   IF (okvan .AND. tqr) THEN
2349!$omp parallel do default(shared) private(ibnd) firstprivate(ibnd_inner_start,ibnd_inner_end)
2350                      DO ibnd = ibnd_inner_start, ibnd_inner_end
2351                         CALL addusxx_r( rhoc(:,ibnd-ibnd_inner_start+1), &
2352                                        becxx(ikq)%k(:,ibnd), becpsi%k(:,jbnd))
2353                      ENDDO
2354!$omp end parallel do
2355                   ENDIF
2356                   !
2357                   ! bring rhoc to G-space
2358#if defined(__USE_MANY_FFT)
2359                   CALL fwfft ('Rho', prhoc, dfftt, howmany=ibnd_inner_count)
2360#else
2361                   DO ibnd = ibnd_inner_start, ibnd_inner_end
2362                      CALL fwfft('Rho', rhoc(:,ibnd-ibnd_inner_start+1), dfftt)
2363                   ENDDO
2364#endif
2365                   ! augment the "charge" in G space
2366                   IF (okvan .AND. .NOT. tqr) THEN
2367                      DO ibnd = ibnd_inner_start, ibnd_inner_end
2368                         CALL addusxx_g(dfftt, rhoc(:,ibnd-ibnd_inner_start+1), &
2369                              xkq, xkp, 'c', becphi_c=becxx(ikq)%k(:,ibnd),     &
2370                              becpsi_c=becpsi%k(:,jbnd))
2371                      ENDDO
2372                   ENDIF
2373                   !
2374!$omp parallel do reduction(+:energy) private(vc)
2375                   DO ibnd = ibnd_inner_start, ibnd_inner_end
2376                      vc=0.0_DP
2377                      DO ig=1,dfftt%ngm
2378                         vc = vc + coulomb_fac(ig,iq,ikk) * &
2379                             DBLE(rhoc(dfftt%nl(ig),ibnd-ibnd_inner_start+1) *&
2380                             CONJG(rhoc(dfftt%nl(ig),ibnd-ibnd_inner_start+1)))
2381                      ENDDO
2382                      vc = vc * omega * x_occupation(ibnd,ik) / nqs
2383                      energy = energy - exxalfa * vc * wg(jbnd,ikk)
2384                      !
2385                      IF (okpaw) THEN
2386                         energy = energy +exxalfa*x_occupation(ibnd,ik)/nqs*wg(jbnd,ikk) &
2387                              *PAW_xx_energy(becxx(ikq)%k(:,ibnd), becpsi%k(:,jbnd))
2388                      ENDIF
2389                   ENDDO
2390!$omp end parallel do
2391                   !
2392                   !deallocate memory
2393                   DEALLOCATE( rhoc )
2394                ENDDO &
2395                JBND_LOOP
2396                !
2397             ENDDO&
2398             IJT_LOOP
2399             ! get the next nbnd/negrp data
2400             IF (negrp > 1) call mp_circular_shift_left( exxbuff(:,:,ikq), me_egrp, inter_egrp_comm )
2401             !
2402          END DO !iegrp
2403          !
2404          IF ( okvan .AND. .NOT.tqr ) CALL qvan_clean()
2405       ENDDO &
2406       IQ_LOOP
2407       !
2408    ENDDO &
2409    IKK_LOOP
2410    !
2411    IF (noncolin) THEN
2412       DEALLOCATE( temppsic_nc )
2413    ELSE
2414       DEALLOCATE( temppsic )
2415    ENDIF
2416    !
2417    DEALLOCATE( fac )
2418    !
2419    CALL deallocate_bec_type( becpsi )
2420    !
2421    CALL mp_sum( energy, inter_egrp_comm )
2422    CALL mp_sum( energy, intra_egrp_comm )
2423    CALL mp_sum( energy, inter_pool_comm )
2424    !
2425    exxenergy2_k = energy
2426    CALL change_data_structure( .FALSE. )
2427    !
2428  END FUNCTION  exxenergy2_k
2429  !
2430  !
2431  !-----------------------------------------------------------------------
2432  FUNCTION exx_stress()
2433    !-----------------------------------------------------------------------
2434    !! This is Eq.(10) of PRB 73, 125120 (2006).
2435    !
2436    USE constants,            ONLY : fpi, e2, pi, tpi
2437    USE io_files,             ONLY : iunwfc_exx, nwordwfc
2438    USE buffers,              ONLY : get_buffer
2439    USE cell_base,            ONLY : alat, omega, bg, at, tpiba
2440    USE symm_base,            ONLY : nsym, s
2441    USE wvfct,                ONLY : nbnd, npwx, wg, current_k
2442    USE wavefunctions,        ONLY : evc
2443    USE klist,                ONLY : xk, ngk, nks
2444    USE lsda_mod,             ONLY : lsda, current_spin, isk
2445    USE gvect,                ONLY : g
2446    USE mp_pools,             ONLY : npool, inter_pool_comm
2447    USE mp_exx,               ONLY : inter_egrp_comm, intra_egrp_comm, &
2448                                     ibands, nibands, my_egrp_id, jblock, &
2449                                     egrp_pairs, max_pairs, negrp, me_egrp, &
2450                                     all_start, all_end, iexx_start
2451    USE mp,                   ONLY : mp_sum, mp_circular_shift_left
2452    USE fft_base,             ONLY : dffts
2453    USE fft_interfaces,       ONLY : fwfft, invfft
2454    USE uspp,                 ONLY : okvan
2455    !
2456    USE exx_base,             ONLY : nq1, nq2, nq3, nqs, eps, exxdiv,       &
2457                                     x_gamma_extrapolation, on_double_grid, &
2458                                     grid_factor, yukawa, erfc_scrlen,      &
2459                                     use_coulomb_vcut_ws, use_coulomb_vcut_spheric, &
2460                                     gau_scrlen, vcut, index_xkq, index_xk, index_sym
2461    USE exx_band,             ONLY : change_data_structure, transform_evc_to_exx, &
2462                                     g_exx, igk_exx, nwordwfc_exx, evc_exx
2463    USE coulomb_vcut_module,  ONLY : vcut_get,  vcut_spheric_get
2464    !
2465    IMPLICIT NONE
2466    !
2467    ! ... local variables
2468    !
2469    REAL(DP) :: exx_stress(3,3), exx_stress_(3,3)
2470    !
2471    COMPLEX(DP),ALLOCATABLE :: tempphic(:), temppsic(:), result(:)
2472    COMPLEX(DP),ALLOCATABLE :: tempphic_nc(:,:), temppsic_nc(:,:), &
2473                               result_nc(:,:)
2474    COMPLEX(DP),ALLOCATABLE :: rhoc(:)
2475    REAL(DP),   ALLOCATABLE :: fac(:), fac_tens(:,:,:), fac_stress(:)
2476    INTEGER  :: npw, jbnd, ibnd, ik, ikk, ig, ir, ikq, iq, isym
2477    INTEGER  :: nqi, iqi, beta, nrxxs, ngm
2478    INTEGER  :: ibnd_loop_start
2479    REAL(DP) :: x1, x2
2480    REAL(DP) :: qq, xk_cryst(3), sxk(3), xkq(3), vc(3,3), x, q(3)
2481    REAL(DP) :: delta(3,3)
2482    INTEGER :: jstart, jend, ii, ipair, jblock_start, jblock_end
2483    INTEGER :: iegrp, wegrp
2484    INTEGER :: exxbuff_index
2485    !
2486    CALL start_clock( 'exx_stress' )
2487    !
2488    CALL transform_evc_to_exx( 0 )
2489    !
2490    IF (npool>1) CALL errore( 'exx_stress', 'stress not available with pools', 1 )
2491    IF (noncolin) CALL errore( 'exx_stress', 'noncolinear stress not implemented', 1 )
2492    IF (okvan) CALL infomsg( 'exx_stress', 'USPP stress not tested' )
2493    !
2494    nrxxs = dfftt%nnr
2495    ngm   = dfftt%ngm
2496    delta = RESHAPE( (/1._dp,0._dp,0._dp, 0._dp,1._dp,0._dp, 0._dp,0._dp,1._dp/), (/3,3/))
2497    exx_stress_ = 0._dp
2498    !
2499    ALLOCATE( tempphic(nrxxs), temppsic(nrxxs), rhoc(nrxxs), fac(ngm) )
2500    ALLOCATE( fac_tens(3,3,ngm), fac_stress(ngm) )
2501    !
2502    nqi = nqs
2503    !
2504    ! ... loop over k-points
2505    DO ikk = 1, nks
2506       current_k = ikk
2507       IF (lsda) current_spin = isk(ikk)
2508       npw = ngk(ikk)
2509       !
2510       IF (nks > 1) CALL get_buffer( evc_exx, nwordwfc_exx, iunwfc_exx, ikk )
2511       !
2512       DO iqi = 1, nqi
2513          !
2514          iq = iqi
2515          !
2516          ikq  = index_xkq(current_k,iq)
2517          ik   = index_xk(ikq)
2518          isym = ABS(index_sym(ikq))
2519          !
2520
2521          ! FIXME: use cryst_to_cart and company as above..
2522          xk_cryst(:) = at(1,:)*xk(1,ik)+at(2,:)*xk(2,ik)+at(3,:)*xk(3,ik)
2523          IF (index_sym(ikq) < 0) xk_cryst = -xk_cryst
2524          sxk(:) = s(:,1,isym)*xk_cryst(1) + &
2525                   s(:,2,isym)*xk_cryst(2) + &
2526                   s(:,3,isym)*xk_cryst(3)
2527          xkq(:) = bg(:,1)*sxk(1) + bg(:,2)*sxk(2) + bg(:,3)*sxk(3)
2528          !
2529          !CALL start_clock ('exxen2_ngmloop')
2530          !
2531!$omp parallel do default(shared), private(ig, beta, q, qq, on_double_grid, x)
2532          DO ig = 1, ngm
2533             IF (negrp == 1) THEN
2534                q(1) = xk(1,current_k) - xkq(1) + g(1,ig)
2535                q(2) = xk(2,current_k) - xkq(2) + g(2,ig)
2536                q(3) = xk(3,current_k) - xkq(3) + g(3,ig)
2537             ELSE
2538                q(1) = xk(1,current_k) - xkq(1) + g_exx(1,ig)
2539                q(2) = xk(2,current_k) - xkq(2) + g_exx(2,ig)
2540                q(3) = xk(3,current_k) - xkq(3) + g_exx(3,ig)
2541             ENDIF
2542             !
2543             q = q * tpiba
2544             qq = ( q(1)*q(1) + q(2)*q(2) + q(3)*q(3) )
2545             !
2546             DO beta = 1, 3
2547                fac_tens(1:3,beta,ig) = q(1:3)*q(beta)
2548             ENDDO
2549             !
2550             IF (x_gamma_extrapolation) THEN
2551                on_double_grid = .TRUE.
2552                x= 0.5d0/tpiba*(q(1)*at(1,1)+q(2)*at(2,1)+q(3)*at(3,1))*nq1
2553                on_double_grid = on_double_grid .AND. (ABS(x-NINT(x))<eps)
2554                x= 0.5d0/tpiba*(q(1)*at(1,2)+q(2)*at(2,2)+q(3)*at(3,2))*nq2
2555                on_double_grid = on_double_grid .AND. (ABS(x-NINT(x))<eps)
2556                x= 0.5d0/tpiba*(q(1)*at(1,3)+q(2)*at(2,3)+q(3)*at(3,3))*nq3
2557                on_double_grid = on_double_grid .AND. (ABS(x-NINT(x))<eps)
2558             ELSE
2559                on_double_grid = .FALSE.
2560             ENDIF
2561             !
2562             IF (use_coulomb_vcut_ws) THEN
2563                fac(ig) = vcut_get(vcut, q)
2564                fac_stress(ig) = 0._dp   ! not implemented
2565                IF (gamma_only .AND. qq > 1.d-8) fac(ig) = 2.d0 * fac(ig)
2566                !
2567             ELSEIF ( use_coulomb_vcut_spheric ) THEN
2568                fac(ig) = vcut_spheric_get(vcut, q)
2569                fac_stress(ig) = 0._dp   ! not implemented
2570                IF (gamma_only .AND. qq > 1.d-8) fac(ig) = 2.d0 * fac(ig)
2571                !
2572             ELSEIF (gau_scrlen > 0) THEN
2573                fac(ig) = e2*((pi/gau_scrlen)**(1.5d0))* &
2574                          EXP(-qq/4.d0/gau_scrlen) * grid_factor
2575                fac_stress(ig) =  e2*2.d0/4.d0/gau_scrlen  * &
2576                                  EXP(-qq/4.d0/gau_scrlen) * &
2577                                  ((pi/gau_scrlen)**(1.5d0))*grid_factor
2578                IF (gamma_only) fac(ig) = 2.d0 * fac(ig)
2579                IF (gamma_only) fac_stress(ig) = 2.d0 * fac_stress(ig)
2580                IF (on_double_grid) fac(ig) = 0._dp
2581                IF (on_double_grid) fac_stress(ig) = 0._dp
2582                !
2583             ELSEIF (qq > 1.d-8) THEN
2584                IF ( erfc_scrlen > 0 ) THEN
2585                  fac(ig)=e2*fpi/qq*(1._dp-EXP(-qq/4.d0/erfc_scrlen**2)) * grid_factor
2586                  fac_stress(ig) = -e2*fpi * 2.d0/qq**2 * ( &
2587                      (1._dp+qq/4.d0/erfc_scrlen**2)*EXP(-qq/4.d0/erfc_scrlen**2) - 1._dp) * &
2588                      grid_factor
2589                ELSE
2590                  fac(ig)=e2*fpi/( qq + yukawa ) * grid_factor
2591                  fac_stress(ig) = 2.d0 * e2*fpi/(qq+yukawa)**2 * grid_factor
2592                ENDIF
2593                !
2594                IF (gamma_only) fac(ig) = 2.d0 * fac(ig)
2595                IF (gamma_only) fac_stress(ig) = 2.d0 * fac_stress(ig)
2596                IF (on_double_grid) fac(ig) = 0._dp
2597                IF (on_double_grid) fac_stress(ig) = 0._dp
2598                !
2599             ELSE
2600                !
2601                fac(ig) = -exxdiv ! or rather something else (see f.gygi)
2602                fac_stress(ig) = 0._dp  ! or -exxdiv_stress (not yet implemented)
2603                IF ( yukawa> 0._dp .AND. .NOT. x_gamma_extrapolation) THEN
2604                  fac(ig) = fac(ig) + e2*fpi/( qq + yukawa )
2605                  fac_stress(ig) = 2.d0 * e2*fpi/(qq+yukawa)**2
2606                ENDIF
2607                IF (erfc_scrlen > 0._dp .AND. .NOT. x_gamma_extrapolation) THEN
2608                  fac(ig) = e2*fpi / (4.d0*erfc_scrlen**2)
2609                  fac_stress(ig) = e2*fpi / (8.d0*erfc_scrlen**4)
2610                ENDIF
2611                !
2612             ENDIF
2613             !
2614          ENDDO
2615!$omp end parallel do
2616          !CALL stop_clock ('exxen2_ngmloop')
2617          DO iegrp = 1, negrp
2618             !
2619             ! compute the id of group whose data is currently worked on
2620             wegrp = MOD(iegrp+my_egrp_id-1, negrp)+1
2621             !
2622             jblock_start = all_start(wegrp)
2623             jblock_end = all_end(wegrp)
2624             !
2625             ! loop over bands
2626             DO ii = 1, nibands(my_egrp_id+1)
2627                !
2628                jbnd = ibands(ii,my_egrp_id+1)
2629                !
2630                IF (jbnd==0 .OR. jbnd>nbnd) CYCLE
2631                !
2632                !determine which j-bands to calculate
2633                jstart = 0
2634                jend = 0
2635                DO ipair=1, max_pairs
2636                   IF (egrp_pairs(1,ipair,my_egrp_id+1).eq.jbnd)THEN
2637                      IF (jstart == 0)THEN
2638                         jstart = egrp_pairs(2,ipair,my_egrp_id+1)
2639                         jend = jstart
2640                      ELSE
2641                         jend = egrp_pairs(2,ipair,my_egrp_id+1)
2642                      ENDIF
2643                   ENDIF
2644                ENDDO
2645                !
2646                jstart = MAX(jstart,jblock_start)
2647                jend = MIN(jend,jblock_end)
2648                !
2649                temppsic(:) = ( 0._dp, 0._dp )
2650!$omp parallel do default(shared), private(ig)
2651                DO ig = 1, npw
2652                   temppsic(dfftt%nl(igk_exx(ig,ikk))) = evc_exx(ig,ii)
2653                ENDDO
2654!$omp end parallel do
2655                !
2656                IF (gamma_only) THEN
2657!$omp parallel do default(shared), private(ig)
2658                   DO ig = 1, npw
2659                      temppsic(dfftt%nlm(igk_exx(ig,ikk))) = CONJG(evc_exx(ig,ii))
2660                   ENDDO
2661!$omp end parallel do
2662                ENDIF
2663                !
2664                CALL invfft( 'Wave', temppsic, dfftt )
2665                !
2666                IF (gamma_only) THEN
2667                   !
2668                   IF (MOD(jstart,2) == 0) THEN
2669                      ibnd_loop_start = jstart-1
2670                   ELSE
2671                      ibnd_loop_start = jstart
2672                   ENDIF
2673                   !
2674                   DO ibnd = ibnd_loop_start, jend, 2     !for each band of psi
2675                      !
2676                      exxbuff_index = (ibnd+1)/2-(all_start(wegrp)+1)/2+(iexx_start+1)/2
2677                      !
2678                      IF ( ibnd < jstart ) THEN
2679                         x1 = 0._dp
2680                      ELSE
2681                         x1 = x_occupation(ibnd,ik)
2682                      ENDIF
2683                      !
2684                      IF ( ibnd == jend) THEN
2685                         x2 = 0._dp
2686                      ELSE
2687                         x2 = x_occupation(ibnd+1,ik)
2688                      ENDIF
2689                      !
2690                      IF ( ABS(x1) < eps_occ .AND. ABS(x2) < eps_occ ) CYCLE
2691                      !
2692                      ! calculate rho in real space
2693!$omp parallel do default(shared), private(ir)
2694                      DO ir = 1, nrxxs
2695                         tempphic(ir) = exxbuff(ir,exxbuff_index,ikq)
2696                         rhoc(ir) = CONJG(tempphic(ir))*temppsic(ir) / omega
2697                      ENDDO
2698!$omp end parallel do
2699                      ! bring it to G-space
2700                      CALL fwfft( 'Rho', rhoc, dfftt )
2701                      !
2702                      vc = 0._dp
2703!$omp parallel do default(shared), private(ig), reduction(+:vc)
2704                      DO ig = 1, ngm
2705                         !
2706                         vc(:,:) = vc(:,:) + x1 * 0.25_dp * &
2707                                   ABS( rhoc(dfftt%nl(ig)) + &
2708                                   CONJG(rhoc(dfftt%nlm(ig))))**2 * &
2709                                   (fac_tens(:,:,ig)*fac_stress(ig)/2.d0 - delta(:,:)*fac(ig))
2710                         vc(:,:) = vc(:,:) + x2 * 0.25_dp * &
2711                                   ABS( rhoc(dfftt%nl(ig)) - &
2712                                   CONJG(rhoc(dfftt%nlm(ig))))**2 * &
2713                                   (fac_tens(:,:,ig)*fac_stress(ig)/2.d0 - delta(:,:)*fac(ig))
2714                      ENDDO
2715!$omp end parallel do
2716                      vc = vc / nqs / 4.d0
2717                      exx_stress_ = exx_stress_ + exxalfa * vc * wg(jbnd,ikk)
2718                   ENDDO
2719                   !
2720                ELSE
2721                   !
2722                   DO ibnd = jstart, jend    !for each band of psi
2723                      !
2724                      IF ( ABS(x_occupation(ibnd,ik)) < 1.d-6) CYCLE
2725                      !
2726                      ! calculate rho in real space
2727!$omp parallel do default(shared), private(ir)
2728                      DO ir = 1, nrxxs
2729                         tempphic(ir) = exxbuff(ir,ibnd-all_start(wegrp)+iexx_start,ikq)
2730                         rhoc(ir) = CONJG(tempphic(ir))*temppsic(ir) / omega
2731                      ENDDO
2732!$omp end parallel do
2733                      !
2734                      ! bring it to G-space
2735                      CALL fwfft( 'Rho', rhoc, dfftt )
2736                      !
2737                      vc = 0._dp
2738!$omp parallel do default(shared), private(ig), reduction(+:vc)
2739                      DO ig = 1, ngm
2740                         vc(:,:) = vc(:,:) + rhoc(dfftt%nl(ig))  * &
2741                                   CONJG(rhoc(dfftt%nl(ig))) *     &
2742                                   (fac_tens(:,:,ig)*fac_stress(ig)/2.d0 - &
2743                                   delta(:,:)*fac(ig))
2744                      ENDDO
2745!$omp end parallel do
2746                      !
2747                      vc = vc * x_occupation(ibnd,ik) / nqs / 4.d0
2748                      exx_stress_ = exx_stress_ + exxalfa * vc * wg(jbnd,ikk)
2749                      !
2750                   ENDDO
2751                   !
2752                ENDIF ! gamma or k-points
2753                !
2754             ENDDO ! jbnd
2755             !
2756             ! get the next nbnd/negrp data
2757             IF (negrp > 1) CALL mp_circular_shift_left( exxbuff(:,:,ikq), me_egrp, &
2758                                                         inter_egrp_comm )
2759             !
2760          ENDDO ! iegrp
2761          !
2762       ENDDO ! iqi
2763       !
2764    ENDDO ! ikk
2765    !
2766    DEALLOCATE( tempphic, temppsic, rhoc, fac, fac_tens, fac_stress )
2767    !
2768    CALL mp_sum( exx_stress_, intra_egrp_comm )
2769    CALL mp_sum( exx_stress_, inter_egrp_comm )
2770    CALL mp_sum( exx_stress_, inter_pool_comm )
2771    !
2772    exx_stress = exx_stress_
2773    !
2774    CALL change_data_structure( .FALSE. )
2775    !
2776    CALL stop_clock( 'exx_stress' )
2777    !
2778  END FUNCTION exx_stress
2779  !
2780  !
2781  !----------------------------------------------------------------------
2782  SUBROUTINE compute_becpsi( npw_, igk_, q_, evc_exx, becpsi_k )
2783  !----------------------------------------------------------------------
2784  !! Calculates beta functions (Kleinman-Bylander projectors), with
2785  !! structure factor, for all atoms, in reciprocal space.
2786  !! FIXME: why so much replicated code?
2787  !
2788  USE kinds,         ONLY : DP
2789  USE ions_base,     ONLY : nat, ntyp => nsp, ityp, tau
2790  USE cell_base,     ONLY : tpiba, omega
2791  USE constants,     ONLY : tpi
2792  USE gvect,         ONLY : eigts1, eigts2, eigts3, mill, g
2793  USE wvfct,         ONLY : npwx, nbnd
2794  USE us,            ONLY : nqx, dq, tab, tab_d2y, spline_ps
2795  USE m_gth,         ONLY : mk_ffnl_gth
2796  USE splinelib
2797  USE uspp,          ONLY : nkb, nhtol, nhtolm, indv
2798  USE uspp_param,    ONLY : upf, lmaxkb, nhm, nh
2799  USE becmod,        ONLY : calbec
2800  USE mp_exx,        ONLY : ibands, nibands, my_egrp_id
2801  !
2802  IMPLICIT NONE
2803  !
2804  INTEGER, INTENT(IN) :: npw_
2805  !! number of PWs
2806  INTEGER, INTENT(IN) :: igk_(npw_)
2807  !! indices of G in the list of q+G vectors
2808  REAL(DP), INTENT(IN) :: q_(3)
2809  !! q vector (2pi/a units)
2810  COMPLEX(DP), INTENT(IN) :: evc_exx(npwx,nibands(my_egrp_id+1))
2811  !! wavefunctions from the PW set to exx
2812  COMPLEX(DP), INTENT(OUT) :: becpsi_k(nkb,nibands(my_egrp_id+1))
2813  !! <beta|psi> for k points
2814  !
2815  ! ... local variables
2816  !
2817  COMPLEX(DP) :: vkb_(npwx,1) !beta functions (npw_ <= npwx)
2818  !
2819  INTEGER :: i0, i1, i2, i3, ig, lm, na, nt, nb, ih, jkb
2820  !
2821  REAL(DP) :: px, ux, vx, wx, arg
2822  REAL(DP), ALLOCATABLE :: gk(:,:), qg(:), vq(:), ylm(:,:), vkb1(:,:)
2823  !
2824  COMPLEX(DP) :: phase, pref
2825  COMPLEX(DP), ALLOCATABLE :: sk(:)
2826  !
2827  REAL(DP), ALLOCATABLE :: xdata(:)
2828  INTEGER :: iq
2829  INTEGER :: istart, iend
2830  !
2831  istart = ibands(1,my_egrp_id+1)
2832  iend = ibands(nibands(my_egrp_id+1),my_egrp_id+1)
2833  !
2834  IF (lmaxkb < 0) RETURN
2835  !
2836  ALLOCATE( vkb1(npw_,nhm) )
2837  ALLOCATE( sk(npw_) )
2838  ALLOCATE( qg(npw_) )
2839  ALLOCATE( vq(npw_) )
2840  ALLOCATE( ylm(npw_,(lmaxkb+1)**2) )
2841  ALLOCATE( gk(3,npw_) )
2842  !
2843  ! write(*,'(3i4,i5,3f10.5)') size(tab,1), size(tab,2), size(tab,3), size(vq), q_
2844  !
2845  DO ig = 1, npw_
2846     gk(1,ig) = q_(1) + g(1,igk_(ig))
2847     gk(2,ig) = q_(2) + g(2,igk_(ig))
2848     gk(3,ig) = q_(3) + g(3,igk_(ig))
2849     qg(ig) = gk(1,ig)**2 + gk(2,ig)**2 + gk(3,ig)**2
2850  ENDDO
2851  !
2852  CALL ylmr2( (lmaxkb+1)**2, npw_, gk, qg, ylm )
2853  !
2854  ! ... set now qg=|q+G| in atomic units
2855  !
2856  DO ig = 1, npw_
2857     qg(ig) = SQRT(qg(ig))*tpiba
2858  ENDDO
2859  !
2860  IF (spline_ps) THEN
2861     ALLOCATE( xdata(nqx) )
2862     DO iq = 1, nqx
2863       xdata(iq) = (iq - 1) * dq
2864     ENDDO
2865  ENDIF
2866  ! |beta_lm(q)> = (4pi/omega).Y_lm(q).f_l(q).(i^l).S(q)
2867  jkb = 0
2868  !
2869  DO nt = 1, ntyp
2870     ! ... calculate beta in G-space using an interpolation table:
2871     ! f_l(q)=\int _0 ^\infty dr r^2 f_l(r) j_l(q.r)
2872     DO nb = 1, upf(nt)%nbeta
2873        IF ( upf(nt)%is_gth ) THEN
2874           CALL mk_ffnl_gth( nt, nb, npw_, omega, qg, vq )
2875        ELSE
2876           DO ig = 1, npw_
2877              IF (spline_ps) THEN
2878                vq(ig) = splint(xdata, tab(:,nb,nt), tab_d2y(:,nb,nt), qg(ig))
2879              ELSE
2880                px = qg (ig) / dq - INT(qg (ig) / dq)
2881                ux = 1.d0 - px
2882                vx = 2.d0 - px
2883                wx = 3.d0 - px
2884                i0 = INT( qg (ig) / dq ) + 1
2885                i1 = i0 + 1
2886                i2 = i0 + 2
2887                i3 = i0 + 3
2888                vq (ig) = tab (i0, nb, nt) * ux * vx * wx / 6.d0 + &
2889                          tab (i1, nb, nt) * px * vx * wx / 2.d0 - &
2890                          tab (i2, nb, nt) * px * ux * wx / 2.d0 + &
2891                          tab (i3, nb, nt) * px * ux * vx / 6.d0
2892              ENDIF
2893           ENDDO
2894        ENDIF
2895        !
2896        ! ... add spherical harmonic part  (Y_lm(q)*f_l(q))
2897        DO ih = 1, nh(nt)
2898           IF (nb == indv(ih,nt)) THEN
2899              !l = nhtol (ih,nt)
2900              lm = nhtolm(ih,nt)
2901              DO ig = 1, npw_
2902                 vkb1(ig,ih) = ylm(ig,lm) * vq(ig)
2903              ENDDO
2904           ENDIF
2905        ENDDO
2906        !
2907     ENDDO
2908     !
2909     ! ... vkb1 contains all betas including angular part for type nt
2910     ! now add the structure factor and factor (-i)^l
2911     !
2912     DO na = 1, nat
2913        ! ordering: first all betas for atoms of type 1
2914        !           then  all betas for atoms of type 2  and so on
2915        IF (ityp(na) == nt) THEN
2916           arg = (q_(1) * tau(1,na) + &
2917                  q_(2) * tau(2,na) + &
2918                  q_(3) * tau(3,na) ) * tpi
2919           phase = CMPLX(COS(arg), - SIN(arg), KIND=DP)
2920           DO ig = 1, npw_
2921              sk (ig) = eigts1(mill(1,igk_(ig)), na) * &
2922                        eigts2(mill(2,igk_(ig)), na) * &
2923                        eigts3(mill(3,igk_(ig)), na)
2924           ENDDO
2925           !
2926           DO ih = 1, nh (nt)
2927              jkb = jkb + 1
2928              pref = (0.d0, -1.d0)**nhtol(ih,nt) * phase
2929              DO ig = 1, npw_
2930                 vkb_(ig,1) = vkb1(ig,ih) * sk(ig) * pref
2931              ENDDO
2932              !
2933              DO ig = npw_+1, npwx
2934                 vkb_(ig, 1) = (0.0_DP, 0.0_DP)
2935              ENDDO
2936              !
2937              CALL calbec( npw_, vkb_, evc_exx, becpsi_k(jkb:jkb,:), &
2938                           nibands(my_egrp_id+1) )
2939           ENDDO
2940           !
2941        ENDIF
2942        !
2943     ENDDO
2944  ENDDO
2945  !
2946  DEALLOCATE( gk )
2947  DEALLOCATE( ylm )
2948  DEALLOCATE( vq )
2949  DEALLOCATE( qg )
2950  DEALLOCATE( sk )
2951  DEALLOCATE( vkb1 )
2952  !
2953  RETURN
2954  !
2955  END SUBROUTINE compute_becpsi
2956  !
2957  !
2958  !-----------------------------------------------------------------------------
2959  SUBROUTINE aceinit( DoLoc, exex )
2960    !----------------------------------------------------------------------------
2961    !! ACE Initialization
2962    !
2963    USE wvfct,            ONLY : nbnd, npwx, current_k
2964    USE klist,            ONLY : nks, xk, ngk, igk_k
2965    USE uspp,             ONLY : nkb, vkb, okvan
2966    USE becmod,           ONLY : allocate_bec_type, deallocate_bec_type, &
2967                                 bec_type, calbec
2968    USE lsda_mod,         ONLY : current_spin, lsda, isk
2969    USE io_files,         ONLY : nwordwfc, iunwfc
2970    USE buffers,          ONLY : get_buffer
2971    USE mp_pools,         ONLY : inter_pool_comm
2972    USE mp_bands,         ONLY : intra_bgrp_comm
2973    USE mp,               ONLY : mp_sum
2974    USE wavefunctions,    ONLY : evc
2975    !
2976    IMPLICIT NONE
2977    !
2978    LOGICAL, INTENT(IN) :: DoLoc
2979    !! if TRUE calculates exact exchange with SCDM orbitals
2980    REAL(DP), OPTIONAL, INTENT(OUT) :: exex
2981    !! ACE energy
2982    !
2983    ! ... local variables
2984    !
2985    REAL(DP) :: ee, eexx
2986    INTEGER :: ik, npw
2987    TYPE(bec_type) :: becpsi
2988    !
2989    IF (nbndproj < x_nbnd_occ .OR. nbndproj > nbnd) THEN
2990       WRITE( stdout, '(3(A,I4))' ) ' occ = ', x_nbnd_occ, ' proj = ', nbndproj, &
2991                                    ' tot = ', nbnd
2992       CALL errore( 'aceinit', 'n_proj must be between occ and tot.', 1 )
2993    ENDIF
2994    !
2995    IF (.NOT. ALLOCATED(xi)) ALLOCATE( xi(npwx*npol,nbndproj,nks) )
2996    IF ( okvan ) CALL allocate_bec_type( nkb, nbnd, becpsi )
2997    !
2998    eexx = 0.0d0
2999    xi = (0.0d0,0.0d0)
3000    !
3001    DO ik = 1, nks
3002       npw = ngk(ik)
3003       current_k = ik
3004       IF ( lsda ) current_spin = isk(ik)
3005       IF ( nks > 1 ) CALL get_buffer( evc, nwordwfc, iunwfc, ik )
3006       IF ( okvan ) THEN
3007          CALL init_us_2( npw, igk_k(1,ik), xk(:,ik), vkb )
3008          CALL calbec( npw, vkb, evc, becpsi, nbnd )
3009       ENDIF
3010       IF (gamma_only) THEN
3011          CALL aceinit_gamma( DoLoc, npw, nbnd, evc, xi(1,1,ik), becpsi, ee )
3012       ELSE
3013          CALL aceinit_k( DoLoc, npw, nbnd, evc, xi(1,1,ik), becpsi, ee )
3014       ENDIF
3015       eexx = eexx + ee
3016    ENDDO
3017    !
3018    CALL mp_sum( eexx, inter_pool_comm )
3019    ! WRITE(stdout,'(/,5X,"ACE energy",f15.8)') eexx
3020    !
3021    IF (PRESENT(exex)) exex = eexx
3022    IF ( okvan ) CALL deallocate_bec_type( becpsi )
3023    !
3024    domat = .FALSE.
3025    !
3026  END SUBROUTINE aceinit
3027  !
3028  !
3029  !---------------------------------------------------------------------------------
3030  SUBROUTINE aceinit_gamma( DoLoc, nnpw, nbnd, phi, xitmp, becpsi, exxe )
3031    !-------------------------------------------------------------------------------
3032    !! Compute xi(npw,nbndproj) for the ACE method.
3033    !
3034    USE becmod,         ONLY : bec_type
3035    USE lsda_mod,       ONLY : current_spin
3036    USE mp,             ONLY : mp_stop
3037    !
3038    IMPLICIT NONE
3039    !
3040    LOGICAL, INTENT(IN) :: DoLoc
3041    !! if TRUE calculates exact exchange with SCDM orbitals
3042    INTEGER :: nnpw
3043    !! number of pw
3044    INTEGER :: nbnd
3045    !! number of bands
3046    COMPLEX(DP) :: phi(nnpw,nbnd)
3047    !! wavefunction
3048    COMPLEX(DP) :: xitmp(nnpw,nbndproj)
3049    !! xi(npw,nbndproj)
3050    TYPE(bec_type), INTENT(IN) :: becpsi
3051    !! <beta|psi>
3052    REAL(DP) :: exxe
3053    !! exx energy
3054    !
3055    ! ... local variables
3056    !
3057    INTEGER :: nrxxs
3058    REAL(DP), ALLOCATABLE :: mexx(:,:)
3059    REAL(DP), PARAMETER :: Zero=0.0d0, One=1.0d0, Two=2.0d0, Pt5=0.50d0
3060    LOGICAL :: domat0
3061    !
3062    CALL start_clock( 'aceinit' )
3063    !
3064    nrxxs = dfftt%nnr * npol
3065    !
3066    ALLOCATE( mexx(nbndproj,nbndproj) )
3067    xitmp = (Zero,Zero)
3068    mexx = Zero
3069    !
3070    IF ( DoLoc ) then
3071      CALL vexx_loc( nnpw, nbndproj, xitmp, mexx )
3072      CALL MatSymm( 'S', 'L', mexx,nbndproj )
3073    ELSE
3074    ! |xi> = Vx[phi]|phi>
3075    CALL vexx( nnpw, nnpw, nbndproj, phi, xitmp, becpsi )
3076    ! mexx = <phi|Vx[phi]|phi>
3077    CALL matcalc( 'exact', .TRUE., 0, nnpw, nbndproj, nbndproj, phi, xitmp, mexx, exxe )
3078    ! |xi> = -One * Vx[phi]|phi> * rmexx^T
3079    ENDIF
3080    !
3081    CALL aceupdate( nbndproj, nnpw, xitmp, mexx )
3082    !
3083    DEALLOCATE( mexx )
3084    !
3085    IF ( local_thr > 0.0d0 ) THEN
3086      domat0 = domat
3087      domat = .TRUE.
3088      CALL vexxace_gamma( nnpw, nbndproj, evc0(1,1,current_spin), exxe )
3089      evc0(:,:,current_spin) = phi(:,:)
3090      domat = domat0
3091    ENDIF
3092    !
3093    CALL stop_clock( 'aceinit' )
3094    !
3095  END SUBROUTINE aceinit_gamma
3096  !
3097  !
3098  !----------------------------------------------------------------------------------
3099  SUBROUTINE vexxace_gamma( nnpw, nbnd, phi, exxe, vphi )
3100    !-------------------------------------------------------------------------------
3101    !! Do the ACE potential and (optional) print the ACE matrix representation.
3102    !
3103    USE wvfct,        ONLY : current_k, wg
3104    USE lsda_mod,     ONLY : current_spin
3105    !
3106    IMPLICIT NONE
3107    !
3108    INTEGER :: nnpw
3109    !! number of plane waves
3110    INTEGER :: nbnd
3111    !! number of bands
3112    COMPLEX(DP) :: phi(nnpw,nbnd)
3113    !! wave function
3114    REAL(DP) :: exxe
3115    !! exx energy
3116    COMPLEX(DP), OPTIONAL :: vphi(nnpw,nbnd)
3117    !! v times phi
3118    !
3119    ! ... local variables
3120    !
3121    INTEGER :: i, ik
3122    REAL*8, ALLOCATABLE :: rmexx(:,:)
3123    COMPLEX(DP),ALLOCATABLE :: cmexx(:,:), vv(:,:)
3124    REAL*8, PARAMETER :: Zero=0.0d0, One=1.0d0, Two=2.0d0, Pt5=0.50d0
3125    !
3126    CALL start_clock( 'vexxace' )
3127    !
3128    ALLOCATE( vv(nnpw,nbnd) )
3129    !
3130    IF (PRESENT(vphi)) THEN
3131      vv = vphi
3132    ELSE
3133      vv = (Zero,Zero)
3134    ENDIF
3135    !
3136    ! do the ACE potential
3137    ALLOCATE( rmexx(nbndproj,nbnd), cmexx(nbndproj,nbnd) )
3138    !
3139    rmexx = Zero
3140    cmexx = (Zero,Zero)
3141    ! <xi|phi>
3142    CALL matcalc( '<xi|phi>', .FALSE. , 0, nnpw, nbndproj, nbnd, xi(1,1,current_k), &
3143                  phi, rmexx, exxe )
3144    ! |vv> = |vphi> + (-One) * |xi> * <xi|phi>
3145    cmexx = (One,Zero)*rmexx
3146    !
3147    CALL ZGEMM( 'N', 'N', nnpw, nbnd, nbndproj, -(One,Zero), xi(1,1,current_k), &
3148                nnpw, cmexx, nbndproj, (One,Zero), vv, nnpw )
3149    !
3150    DEALLOCATE( cmexx, rmexx )
3151    !
3152    IF (domat) THEN
3153      ALLOCATE( rmexx(nbnd,nbnd) )
3154      CALL matcalc( 'ACE', .TRUE., 0, nnpw, nbnd, nbnd, phi, vv, rmexx, exxe )
3155      DEALLOCATE( rmexx )
3156#if defined(__DEBUG)
3157        WRITE(stdout,'(4(A,I3),A,I9,A,f12.6)') 'vexxace: nbnd=', nbnd, ' nbndproj=',nbndproj, &
3158                                               ' k=',current_k,' spin=',current_spin,' npw=', &
3159                                               nnpw, ' E=',exxe
3160      ELSE
3161        WRITE(stdout,'(4(A,I3),A,I9)')         'vexxace: nbnd=', nbnd, ' nbndproj=',nbndproj, &
3162                                               ' k=',current_k,' spin=',current_spin,' npw=', &
3163                                               nnpw
3164#endif
3165      ENDIF
3166      !
3167      IF (PRESENT(vphi)) vphi = vv
3168      DEALLOCATE( vv )
3169      !
3170      CALL stop_clock( 'vexxace' )
3171      !
3172  END SUBROUTINE vexxace_gamma
3173  !
3174  !
3175  !-------------------------------------------------------------------------------------------
3176  SUBROUTINE aceupdate( nbndproj, nnpw, xitmp, rmexx )
3177    !----------------------------------------------------------------------------------------
3178    !! Build the ACE operator from the potential amd matrix (rmexx is assumed symmetric
3179    !! and only the Lower Triangular part is considered).
3180    !
3181    IMPLICIT NONE
3182    !
3183    INTEGER :: nbndproj
3184    !! number of bands
3185    INTEGER :: nnpw
3186    !! number of PW
3187    COMPLEX(DP) :: xitmp(nnpw,nbndproj)
3188    !! xi(nnpw,nbndproj)
3189    REAL(DP) :: rmexx(nbndproj,nbndproj)
3190    !! |xi> = -One * Vx[phi]|phi> * rmexx^T
3191    !
3192    ! ... local variables
3193    !
3194    COMPLEX(DP), ALLOCATABLE :: cmexx(:,:)
3195    REAL(DP), PARAMETER :: Zero=0.0d0, One=1.0d0, Two=2.0d0, Pt5=0.50d0
3196    !
3197    CALL start_clock( 'aceupdate' )
3198    !
3199    ! rmexx = -(Cholesky(rmexx))^-1
3200    rmexx = -rmexx
3201    ! CALL invchol( nbndproj, rmexx )
3202    CALL MatChol( nbndproj, rmexx )
3203    CALL MatInv( 'L', nbndproj, rmexx )
3204    !
3205    ! |xi> = -One * Vx[phi]|phi> * rmexx^T
3206    ALLOCATE( cmexx(nbndproj,nbndproj) )
3207    cmexx = (One,Zero)*rmexx
3208    CALL ZTRMM( 'R', 'L', 'C', 'N', nnpw, nbndproj, (One,Zero), cmexx, nbndproj, xitmp, nnpw )
3209    !
3210    DEALLOCATE( cmexx )
3211    !
3212    CALL stop_clock( 'aceupdate' )
3213    !
3214  END SUBROUTINE
3215  !
3216  !
3217  !---------------------------------------------------------------------------------------------
3218  SUBROUTINE aceinit_k( DoLoc, nnpw, nbnd, phi, xitmp, becpsi, exxe )
3219    !-----------------------------------------------------------------------------------------
3220    !! Compute xi(npw,nbndproj) for the ACE method.
3221    !
3222    USE becmod,               ONLY : bec_type
3223    USE wvfct,                ONLY : current_k, npwx
3224    USE klist,                ONLY : wk
3225    USE noncollin_module,     ONLY : npol
3226    !
3227    IMPLICIT NONE
3228    !
3229    LOGICAL, INTENT(IN) :: DoLoc
3230    !! if TRUE calculates exact exchange with SCDM orbitals
3231    INTEGER :: nnpw
3232    !! number of PW
3233    INTEGER :: nbnd
3234    !! number of bands
3235    COMPLEX(DP) :: phi(npwx*npol,nbnd)
3236    !! wave function
3237    COMPLEX(DP) :: xitmp(npwx*npol,nbndproj)
3238    !! xi(nnpw,nbndproj)
3239    TYPE(bec_type), INTENT(IN) :: becpsi
3240    !! <beta|psi>
3241    REAL(DP) :: exxe
3242    !! exx energy
3243    !
3244    ! ... local variables
3245    !
3246    COMPLEX(DP), ALLOCATABLE :: mexx(:,:)
3247    REAL(DP) :: exxe0
3248    REAL(DP), PARAMETER :: Zero=0.0d0, One=1.0d0, Two=2.0d0, Pt5=0.50d0
3249    INTEGER :: i
3250    LOGICAL :: domat0
3251    !
3252    CALL start_clock( 'aceinit' )
3253    !
3254    IF (nbndproj>nbnd) CALL errore( 'aceinit_k', 'nbndproj greater than nbnd.', 1 )
3255    IF (nbndproj<=0)   CALL errore( 'aceinit_k', 'nbndproj le 0.', 1 )
3256    !
3257    ALLOCATE( mexx(nbndproj,nbndproj) )
3258    xitmp = (Zero,Zero)
3259    mexx  = (Zero,Zero)
3260    IF ( DoLoc ) THEN
3261      CALL vexx_loc_k( nnpw, nbndproj, xitmp, mexx, exxe )
3262      CALL MatSymm_k( 'S', 'L', mexx, nbndproj )
3263    ELSE
3264      ! |xi> = Vx[phi]|phi>
3265      CALL vexx( npwx, nnpw, nbndproj, phi, xitmp, becpsi )
3266      ! mexx = <phi|Vx[phi]|phi>
3267      CALL matcalc_k( 'exact', .TRUE., 0, current_k, npwx*npol, nbndproj, nbndproj, &
3268                      phi, xitmp, mexx, exxe )
3269    ENDIF
3270#if defined(__DEBUG)
3271      WRITE( stdout,'(3(A,I3),A,I9,A,f12.6)') 'aceinit_k: nbnd=', nbnd, ' nbndproj=',nbndproj, &
3272                                              ' k=',current_k,' npw=',nnpw,' Ex(k)=',exxe
3273#endif
3274    ! Skip k-points that have exactly zero weight
3275    IF(wk(current_k)/=0._dp)THEN
3276      ! |xi> = -One * Vx[phi]|phi> * rmexx^T
3277      CALL aceupdate_k( nbndproj, nnpw, xitmp, mexx )
3278    ENDIF
3279    !
3280    DEALLOCATE( mexx )
3281    !
3282    IF ( DoLoc ) THEN
3283       domat0 = domat
3284       domat = .TRUE.
3285       CALL vexxace_k( nnpw, nbnd, evc0(1,1,current_k), exxe )
3286       evc0(:,:,current_k) = phi(:,:)
3287       domat = domat0
3288    ENDIF
3289    !
3290    CALL stop_clock( 'aceinit' )
3291    !
3292  END SUBROUTINE aceinit_k
3293  !
3294  !
3295  !------------------------------------------------------------------------------
3296  SUBROUTINE aceupdate_k( nbndproj, nnpw, xitmp, mexx )
3297    !----------------------------------------------------------------------------
3298    !! Updates xi(npw,nbndproj) for the ACE method.
3299    !
3300    USE wvfct,                ONLY : npwx
3301    USE noncollin_module,     ONLY : noncolin, npol
3302    !
3303    IMPLICIT NONE
3304    !
3305    INTEGER :: nbndproj
3306    !! number of bands
3307    INTEGER :: nnpw
3308    !! number of PW
3309    COMPLEX(DP) :: mexx(nbndproj,nbndproj)
3310    !! mexx = -(Cholesky(mexx))^-1
3311    COMPLEX(DP) :: xitmp(npwx*npol,nbndproj)
3312    !! |xi> = -One * Vx[phi]|phi> * mexx^T
3313    !
3314    CALL start_clock( 'aceupdate' )
3315    !
3316    ! mexx = -(Cholesky(mexx))^-1
3317    mexx = -mexx
3318    CALL invchol_k( nbndproj, mexx )
3319    !
3320    ! |xi> = -One * Vx[phi]|phi> * mexx^T
3321    CALL ZTRMM( 'R', 'L', 'C', 'N', npwx*npol, nbndproj, (1.0_dp,0.0_dp), mexx,nbndproj, &
3322                xitmp, npwx*npol )
3323    !
3324    CALL stop_clock( 'aceupdate' )
3325    !
3326  END SUBROUTINE aceupdate_k
3327  !
3328  !
3329  !--------------------------------------------------------------------------------------
3330  SUBROUTINE vexxace_k( nnpw, nbnd, phi, exxe, vphi )
3331    !-----------------------------------------------------------------------------------
3332    !! Do the ACE potential and (optional) print the ACE matrix representation.
3333    !
3334    USE becmod,               ONLY : calbec
3335    USE wvfct,                ONLY : current_k, npwx
3336    USE noncollin_module,     ONLY : npol
3337    !
3338    IMPLICIT NONE
3339    !
3340    REAL(DP) :: exxe
3341    !! exx energy
3342    INTEGER :: nnpw
3343    !! number of PW
3344    INTEGER :: nbnd
3345    !! number of bands
3346    COMPLEX(DP) :: phi(npwx*npol,nbnd)
3347    !! wave function
3348    COMPLEX(DP), OPTIONAL :: vphi(npwx*npol,nbnd)
3349    !! ACE potential
3350    !
3351    ! ... local variables
3352    !
3353    INTEGER :: i
3354    COMPLEX(DP), ALLOCATABLE :: cmexx(:,:), vv(:,:)
3355    REAL*8, PARAMETER :: Zero=0.0d0, One=1.0d0, Two=2.0d0, Pt5=0.50d0
3356    !
3357    CALL start_clock( 'vexxace' )
3358    !
3359    ALLOCATE( vv(npwx*npol,nbnd) )
3360    IF (PRESENT(vphi)) THEN
3361      vv = vphi
3362    ELSE
3363      vv = (Zero, Zero)
3364    ENDIF
3365    !
3366    ! do the ACE potential!
3367    ALLOCATE( cmexx(nbndproj,nbnd) )
3368    cmexx = (Zero,Zero)
3369    ! <xi|phi>
3370    CALL matcalc_k( '<xi|phi>', .FALSE., 0, current_k, npwx*npol, nbndproj, nbnd, &
3371                    xi(1,1,current_k), phi, cmexx, exxe )
3372    !
3373    ! |vv> = |vphi> + (-One) * |xi> * <xi|phi>!
3374    CALL ZGEMM( 'N', 'N', npwx*npol, nbnd, nbndproj, -(One,Zero), xi(1,1,current_k), &
3375                npwx*npol, cmexx, nbndproj, (One,Zero), vv, npwx*npol )
3376    !
3377    IF (domat) THEN
3378       !
3379       IF ( nbndproj /= nbnd) THEN
3380          DEALLOCATE( cmexx )
3381          ALLOCATE( cmexx(nbnd,nbnd) )
3382       ENDIF
3383       !
3384       CALL matcalc_k( 'ACE', .TRUE., 0, current_k, npwx*npol, nbnd, nbnd, phi, vv, cmexx, exxe )
3385       !
3386#if defined(__DEBUG)
3387       WRITE(stdout,'(3(A,I3),A,I9,A,f12.6)') 'vexxace_k: nbnd=', nbnd, ' nbndproj=',nbndproj, &
3388                    ' k=',current_k,' npw=',nnpw, ' Ex(k)=',exxe
3389    ELSE
3390       WRITE(stdout,'(3(A,I3),A,I9)') 'vexxace_k: nbnd=', nbnd, ' nbndproj=',nbndproj, &
3391                       ' k=',current_k,' npw=',nnpw
3392#endif
3393    ENDIF
3394    !
3395    IF (PRESENT(vphi)) vphi = vv
3396    DEALLOCATE( vv, cmexx )
3397    !
3398    CALL stop_clock( 'vexxace' )
3399    !
3400  END SUBROUTINE vexxace_k
3401  !
3402  !
3403  !---------------------------------------------------------------------------------
3404  SUBROUTINE vexx_loc( npw, nbnd, hpsi, mexx )
3405    !---------------------------------------------------------------------------------
3406    !! Exact exchange with SCDM orbitals.
3407    !! Vx|phi> =  Vx|psi> <psi|Vx|psi>^(-1) <psi|Vx|phi>.
3408    !! locmat contains localization integrals.
3409    !
3410    USE noncollin_module,  ONLY : npol
3411    USE cell_base,         ONLY : omega, alat
3412    USE wvfct,             ONLY : current_k
3413    USE klist,             ONLY : xk, nks, nkstot
3414    USE fft_interfaces,    ONLY : fwfft, invfft
3415    USE mp,                ONLY : mp_stop, mp_barrier, mp_sum
3416    USE mp_bands,          ONLY : intra_bgrp_comm, me_bgrp, nproc_bgrp
3417    USE exx_base,          ONLY : nqs, xkq_collect, index_xkq, index_xk, &
3418                                  g2_convolution
3419    !
3420    IMPLICIT NONE
3421    !
3422    INTEGER :: npw
3423    !! number of PW
3424    INTEGER :: nbnd
3425    !! number of bands
3426    COMPLEX(DP) :: hpsi(npw,nbnd)
3427    !! hpsi
3428    REAL(DP) :: mexx(nbnd,nbnd)
3429    !! mexx contains in output the exchange matrix
3430    !
3431    ! ... local variables
3432    !
3433    INTEGER :: nrxxs, npairs, ntot, NBands
3434    INTEGER :: ig, ir, ik, ikq, iq, ibnd, jbnd, kbnd, NQR
3435    INTEGER :: current_ik
3436    REAL(DP) :: exxe
3437    COMPLEX(DP), ALLOCATABLE :: rhoc(:), vc(:), RESULT(:,:)
3438    REAL(DP), ALLOCATABLE :: fac(:)
3439    REAL(DP) :: xkp(3), xkq(3)
3440    INTEGER, EXTERNAL  :: global_kpoint_index
3441    !
3442    WRITE( stdout, '(5X,A)' ) ' '
3443    WRITE( stdout, '(5X,A)' ) 'Exact-exchange with localized orbitals'
3444    !
3445    CALL start_clock( 'vexxloc' )
3446    !
3447    WRITE( stdout,'(7X,A,f24.12)' ) 'local_thr =', local_thr
3448    nrxxs = dfftt%nnr
3449    NQR = nrxxs*npol
3450    !
3451    ! ... exchange projected onto localized orbitals
3452    WRITE( stdout,'(A)' ) 'Allocating exx quantities...'
3453    ALLOCATE( fac(dfftt%ngm) )
3454    ALLOCATE( rhoc(nrxxs), vc(NQR) )
3455    ALLOCATE( RESULT(nrxxs,nbnd) )
3456    WRITE( stdout,'(A)' ) 'Allocations done.'
3457    !
3458    current_ik = global_kpoint_index( nkstot, current_k )
3459    xkp = xk(:,current_k)
3460    !
3461    vc = (0.0d0, 0.0d0)
3462    npairs = 0
3463    !
3464    DO iq = 1, nqs
3465       ikq  = index_xkq(current_ik,iq)
3466       ik   = index_xk(ikq)
3467       xkq  = xkq_collect(:,ikq)
3468       !
3469       CALL g2_convolution( dfftt%ngm, gt, xkp, xkq, fac )
3470       !
3471       RESULT = (0.0d0, 0.0d0)
3472       !
3473       DO ibnd = 1, nbnd
3474         !
3475         IF (x_occupation(ibnd,ikq) > 0.0d0) THEN
3476           !
3477           DO ir = 1, NQR
3478             rhoc(ir) = locbuff(ir,ibnd,ikq) * locbuff(ir,ibnd,ikq) / omega
3479           ENDDO
3480           !
3481           CALL fwfft( 'Rho', rhoc, dfftt )
3482           !
3483           vc = (0.0d0, 0.0d0)
3484           DO ig = 1, dfftt%ngm
3485               vc(dfftt%nl(ig))  = fac(ig) * rhoc(dfftt%nl(ig))
3486               vc(dfftt%nlm(ig)) = fac(ig) * rhoc(dfftt%nlm(ig))
3487           ENDDO
3488           !
3489           CALL invfft( 'Rho', vc, dfftt )
3490           !
3491           DO ir = 1, NQR
3492             RESULT(ir,ibnd) = RESULT(ir,ibnd) + locbuff(ir,ibnd,ikq) * vc(ir)
3493           ENDDO
3494           !
3495         ENDIF
3496         !
3497         DO kbnd = 1, ibnd-1
3498           IF ( (locmat(ibnd,kbnd,ikq) > local_thr) .AND. &
3499                ( (x_occupation(ibnd,ikq) > 0.0d0) .OR.   &
3500                  (x_occupation(kbnd,ikq) > 0.0d0) ) ) THEN
3501             !
3502             !write(stdout,'(3I4,3f12.6,A)') ikq, ibnd, kbnd, x_occupation(ibnd,ikq), &
3503             !                    x_occupation(kbnd,ikq), locmat(ibnd,kbnd,ikq), ' IN '
3504             !
3505             DO ir = 1, NQR
3506               rhoc(ir) = locbuff(ir,ibnd,ikq) * locbuff(ir,kbnd,ikq) / omega
3507             ENDDO
3508             !
3509             npairs = npairs + 1
3510             !
3511             CALL fwfft( 'Rho', rhoc, dfftt )
3512             !
3513             vc = (0.0d0, 0.0d0)
3514             !
3515             DO ig = 1, dfftt%ngm
3516                 vc(dfftt%nl(ig))  = fac(ig) * rhoc(dfftt%nl(ig))
3517                 vc(dfftt%nlm(ig)) = fac(ig) * rhoc(dfftt%nlm(ig))
3518             ENDDO
3519             !
3520             CALL invfft( 'Rho', vc, dfftt )
3521             !
3522             DO ir = 1, NQR
3523               RESULT(ir,kbnd) = RESULT(ir,kbnd) + x_occupation(ibnd,ikq) * locbuff(ir,ibnd,ikq) * vc(ir)
3524             ENDDO
3525             !
3526             DO ir = 1, NQR
3527               RESULT(ir,ibnd) = RESULT(ir,ibnd) + x_occupation(kbnd,ikq) * locbuff(ir,kbnd,ikq) * vc(ir)
3528             ENDDO
3529             ! ELSE
3530             !   write(stdout,'(3I4,3f12.6,A)') ikq, ibnd, kbnd, x_occupation(ibnd,ikq), &
3531             !               x_occupation(kbnd,ikq), locmat(ibnd,kbnd,ikq), '      OUT '
3532           ENDIF
3533           !
3534         ENDDO
3535         !
3536       ENDDO
3537       !
3538       DO jbnd = 1, nbnd
3539         !
3540         CALL fwfft( 'Wave', RESULT(:,jbnd), dfftt )
3541         !
3542         DO ig = 1, npw
3543            hpsi(ig,jbnd) = hpsi(ig,jbnd) - exxalfa*RESULT(dfftt%nl(ig),jbnd)
3544         ENDDO
3545         !
3546       ENDDO
3547       !
3548    ENDDO
3549    !
3550    DEALLOCATE( fac, vc )
3551    DEALLOCATE( RESULT )
3552    !
3553    ! ... localized functions to G-space and exchange matrix onto localized functions
3554    ALLOCATE( RESULT(npw,nbnd) )
3555    RESULT = (0.0d0,0.0d0)
3556    !
3557    DO jbnd = 1, nbnd
3558      rhoc(:) = DBLE(locbuff(:,jbnd,ikq)) + (0.0d0,1.0d0)*0.0d0
3559      CALL fwfft( 'Wave' , rhoc, dfftt )
3560      DO ig = 1, npw
3561        RESULT(ig,jbnd) = rhoc(dfftt%nl(ig))
3562      ENDDO
3563    ENDDO
3564    !
3565    DEALLOCATE( rhoc )
3566    !
3567    CALL matcalc( 'M1-', .TRUE., 0, npw, nbnd, nbnd, RESULT, hpsi, mexx, exxe )
3568    !
3569    DEALLOCATE( RESULT )
3570    !
3571    NBands = INT(SUM(x_occupation(:,ikq)))
3572    ntot = NBands * (NBands-1)/2 + NBands * (nbnd-NBands)
3573    WRITE( stdout,'(7X,2(A,I12),A,f12.2)') '  Pairs(full): ',      ntot, &
3574                  '   Pairs(included): ', npairs, &
3575                  '   Pairs(%): ', DBLE(npairs)/DBLE(ntot)*100.0d0
3576    !
3577    CALL stop_clock( 'vexxloc' )
3578    !
3579  END SUBROUTINE vexx_loc
3580  !
3581  !
3582  !----------------------------------------------------------------------------------------------
3583  SUBROUTINE compute_density( DoPrint, Shift, CenterPBC, SpreadPBC, Overlap, PsiI, PsiJ, NQR, &
3584                              ibnd, jbnd )
3585    !-------------------------------------------------------------------------------------------
3586    !! Manipulate density: get pair density, center, spread, absolute overlap.
3587    !! Shift:
3588    !! .FALSE. refer the centers to the cell -L/2 ... +L/2;
3589    !! .TRUE.  shift the centers to the cell 0 ... L (presumably the one given in input).
3590    !
3591    USE constants,        ONLY : pi, bohr_radius_angs
3592    USE cell_base,        ONLY : alat, omega
3593    USE mp,               ONLY : mp_sum
3594    USE mp_bands,         ONLY : intra_bgrp_comm
3595    USE fft_types,        ONLY : fft_index_to_3d
3596    !
3597    IMPLICIT NONE
3598    !
3599    LOGICAL, INTENT(IN) :: DoPrint
3600    !! whether to print or not the quantities
3601    LOGICAL,  INTENT(IN) :: Shift
3602    !! .FALSE. Centers with respect to the minimum image cell convention;
3603    !! .TRUE.  Centers shifted to the input cell.
3604    REAL(DP), INTENT(OUT) :: CenterPBC(3)
3605    REAL(DP), INTENT(OUT) :: SpreadPBC(3)
3606    REAL(DP), INTENT(OUT) :: Overlap
3607    INTEGER,  INTENT(IN) :: NQR
3608    REAL(DP), INTENT(IN) :: PsiI(NQR)
3609    REAL(DP), INTENT(IN) :: PsiJ(NQR)
3610    INTEGER, INTENT(IN) :: ibnd
3611    INTEGER, INTENT(IN) :: jbnd
3612    !
3613    ! ... local variables
3614    !
3615    REAL(DP) :: vol, rbuff, TotSpread
3616    INTEGER :: ir, i, j, k
3617    LOGICAL :: offrange
3618    COMPLEX(DP) :: cbuff(3)
3619    REAL(DP), PARAMETER :: Zero=0.0d0, One=1.0d0, Two=2.0d0
3620    !
3621    vol = omega / DBLE(dfftt%nr1 * dfftt%nr2 * dfftt%nr3)
3622    !
3623    CenterPBC = Zero
3624    SpreadPBC = Zero
3625    Overlap = Zero
3626    cbuff = (Zero,Zero)
3627    rbuff = Zero
3628    !
3629    DO ir = 1, dfftt%nr1x*dfftt%my_nr2p*dfftt%my_nr3p
3630       !
3631       ! ... three dimensional indexes
3632       !
3633       CALL fft_index_to_3d (ir, dfftt, i,j,k, offrange)
3634       IF ( offrange ) CYCLE
3635       !
3636       rbuff = PsiI(ir) * PsiJ(ir) / omega
3637       Overlap = Overlap + ABS(rbuff)*vol
3638       cbuff(1) = cbuff(1) + rbuff*EXP((Zero,One)*Two*pi*DBLE(i)/DBLE(dfftt%nr1))*vol
3639       cbuff(2) = cbuff(2) + rbuff*EXP((Zero,One)*Two*pi*DBLE(j)/DBLE(dfftt%nr2))*vol
3640       cbuff(3) = cbuff(3) + rbuff*EXP((Zero,One)*Two*pi*DBLE(k)/DBLE(dfftt%nr3))*vol
3641    ENDDO
3642    !
3643    CALL mp_sum( cbuff, intra_bgrp_comm )
3644    CALL mp_sum( Overlap, intra_bgrp_comm )
3645    !
3646    CenterPBC(1) =  alat/Two/pi*AIMAG( LOG(cbuff(1)) )
3647    CenterPBC(2) =  alat/Two/pi*AIMAG( LOG(cbuff(2)) )
3648    CenterPBC(3) =  alat/Two/pi*AIMAG( LOG(cbuff(3)) )
3649    !
3650    IF (Shift) THEN
3651      IF (CenterPBC(1) < Zero) CenterPBC(1) = CenterPBC(1) + alat
3652      IF (CenterPBC(2) < Zero) CenterPBC(2) = CenterPBC(2) + alat
3653      IF (CenterPBC(3) < Zero) CenterPBC(3) = CenterPBC(3) + alat
3654    ENDIF
3655    !
3656    rbuff = DBLE(cbuff(1))**2 + AIMAG(cbuff(1))**2
3657    SpreadPBC(1) = -(alat/Two/pi)**2 * DLOG(rbuff)
3658    rbuff = DBLE(cbuff(2))**2 + AIMAG(cbuff(2))**2
3659    SpreadPBC(2) = -(alat/Two/pi)**2 * DLOG(rbuff)
3660    rbuff = DBLE(cbuff(3))**2 + AIMAG(cbuff(3))**2
3661    SpreadPBC(3) = -(alat/Two/pi)**2 * DLOG(rbuff)
3662    TotSpread = (SpreadPBC(1) + SpreadPBC(2) + SpreadPBC(3))*bohr_radius_angs**2
3663    !
3664    IF (DoPrint) THEN
3665      WRITE(stdout,'(A,2I4)')     'MOs:                  ', ibnd, jbnd
3666      WRITE(stdout,'(A,10f12.6)') 'Absolute Overlap:     ', Overlap
3667      WRITE(stdout,'(A,10f12.6)') 'Center(PBC)[A]:       ', CenterPBC(1)*bohr_radius_angs, &
3668              CenterPBC(2)*bohr_radius_angs, CenterPBC(3)*bohr_radius_angs
3669      WRITE(stdout,'(A,10f12.6)') 'Spread [A**2]:        ', SpreadPBC(1)*bohr_radius_angs**2, &
3670              SpreadPBC(2)*bohr_radius_angs**2, SpreadPBC(3)*bohr_radius_angs**2
3671      WRITE(stdout,'(A,10f12.6)') 'Total Spread [A**2]:  ', TotSpread
3672    ENDIF
3673    !
3674    IF (TotSpread < Zero) CALL errore( 'compute_density', 'Negative spread found', 1 )
3675    !
3676  END SUBROUTINE compute_density
3677  !
3678  !
3679  !------------------------------------------------------------------------------------
3680  SUBROUTINE compute_density_k( DoPrint, Shift, CenterPBC, SpreadPBC, Overlap, PsiI, &
3681                                PsiJ, NQR, ibnd, jbnd )
3682    !----------------------------------------------------------------------------------
3683    !! Manipulate density: get pair density, center, spread, absolute overlap.
3684    !! Shift:
3685    !! .FALSE. refer the centers to the cell -L/2 ... +L/2
3686    !! .TRUE.  shift the centers to the cell 0 ... L (presumably the
3687    !!         one given in input )
3688    !
3689    USE constants,        ONLY : pi, bohr_radius_angs
3690    USE cell_base,        ONLY : alat, omega
3691    USE mp,               ONLY : mp_sum
3692    USE mp_bands,         ONLY : intra_bgrp_comm
3693    USE fft_types,        ONLY : fft_index_to_3d
3694    !
3695    IMPLICIT NONE
3696    !
3697    LOGICAL, INTENT(IN) :: DoPrint
3698    !! whether to print or not the quantities
3699    LOGICAL, INTENT(IN) :: Shift
3700    !! .FALSE. Centers with respect to the minimum image cell convention
3701    !! .TRUE.  Centers shifted to the input cell
3702    REAL(DP), INTENT(OUT) :: CenterPBC(3)
3703    !! Coordinates of the center
3704    REAL(DP), INTENT(OUT) :: SpreadPBC(3)
3705    REAL(DP), INTENT(OUT) :: Overlap
3706    INTEGER,  INTENT(IN) :: NQR
3707    COMPLEX(DP), INTENT(IN) :: PsiI(NQR)
3708    COMPLEX(DP), INTENT(IN) :: PsiJ(NQR)
3709    INTEGER,  INTENT(IN) :: ibnd
3710    INTEGER,  INTENT(IN) :: jbnd
3711    !
3712    ! ... local variables
3713    !
3714    REAL(DP) :: vol, TotSpread, rbuff
3715    INTEGER :: ir, i, j, k
3716    LOGICAL :: offrange
3717    COMPLEX(DP) :: cbuff(3)
3718    REAL(DP), PARAMETER :: Zero=0.0d0, One=1.0d0, Two=2.0d0
3719    !
3720    vol = omega / DBLE(dfftt%nr1 * dfftt%nr2 * dfftt%nr3)
3721    !
3722    CenterPBC = Zero
3723    SpreadPBC = Zero
3724    Overlap = Zero
3725    cbuff = (Zero, Zero)
3726    rbuff = Zero
3727    !
3728    DO ir = 1, dfftt%nr1x*dfftt%my_nr2p*dfftt%my_nr3p
3729       !
3730       ! ... three dimensional indexes
3731       !
3732       CALL fft_index_to_3d (ir, dfftt, i,j,k, offrange)
3733       IF ( offrange ) CYCLE
3734       !
3735       rbuff = ABS(PsiI(ir) * CONJG(PsiJ(ir)) / omega )
3736       Overlap = Overlap + rbuff*vol
3737       cbuff(1) = cbuff(1) + rbuff*EXP((Zero,One)*Two*pi*DBLE(i)/DBLE(dfftt%nr1))*vol
3738       cbuff(2) = cbuff(2) + rbuff*EXP((Zero,One)*Two*pi*DBLE(j)/DBLE(dfftt%nr2))*vol
3739       cbuff(3) = cbuff(3) + rbuff*EXP((Zero,One)*Two*pi*DBLE(k)/DBLE(dfftt%nr3))*vol
3740    ENDDO
3741    !
3742    CALL mp_sum( cbuff, intra_bgrp_comm )
3743    CALL mp_sum( Overlap, intra_bgrp_comm )
3744    !
3745    CenterPBC(1) =  alat/Two/pi*AIMAG(LOG(cbuff(1)))
3746    CenterPBC(2) =  alat/Two/pi*AIMAG(LOG(cbuff(2)))
3747    CenterPBC(3) =  alat/Two/pi*AIMAG(LOG(cbuff(3)))
3748    !
3749    IF (Shift) THEN
3750      IF (CenterPBC(1) < Zero) CenterPBC(1) = CenterPBC(1) + alat
3751      IF (CenterPBC(2) < Zero) CenterPBC(2) = CenterPBC(2) + alat
3752      IF (CenterPBC(3) < Zero) CenterPBC(3) = CenterPBC(3) + alat
3753    ENDIF
3754    !
3755    rbuff = DBLE(cbuff(1))**2 + AIMAG(cbuff(1))**2
3756    SpreadPBC(1) = -(alat/Two/pi)**2 * LOG(rbuff)
3757    rbuff = DBLE(cbuff(2))**2 + AIMAG(cbuff(2))**2
3758    SpreadPBC(2) = -(alat/Two/pi)**2 * LOG(rbuff)
3759    rbuff = DBLE(cbuff(3))**2 + AIMAG(cbuff(3))**2
3760    SpreadPBC(3) = -(alat/Two/pi)**2 * LOG(rbuff)
3761    TotSpread = (SpreadPBC(1) + SpreadPBC(2) + SpreadPBC(3))*bohr_radius_angs**2
3762    !
3763    IF (DoPrint) THEN
3764      WRITE(stdout,'(A,2I4)')     'MOs:                  ', ibnd, jbnd
3765      WRITE(stdout,'(A,10f12.6)') 'Absolute Overlap:     ', Overlap
3766      WRITE(stdout,'(A,10f12.6)') 'Center(PBC)[A]:       ', CenterPBC(1)*bohr_radius_angs, &
3767              CenterPBC(2)*bohr_radius_angs, CenterPBC(3)*bohr_radius_angs
3768      WRITE(stdout,'(A,10f12.6)') 'Spread [A**2]:        ', SpreadPBC(1)*bohr_radius_angs**2, &
3769              SpreadPBC(2)*bohr_radius_angs**2, SpreadPBC(3)*bohr_radius_angs**2
3770      WRITE(stdout,'(A,10f12.6)') 'Total Spread [A**2]:  ', TotSpread
3771    ENDIF
3772    !
3773    IF (TotSpread < Zero) CALL errore( 'compute_density_k', 'Negative spread found', 1 )
3774    !
3775  END SUBROUTINE compute_density_k
3776  !
3777  !
3778  !------------------------------------------------------------------------
3779  SUBROUTINE vexx_loc_k( npw, NBands, hpsi, mexx, exxe )
3780    !-----------------------------------------------------------------------
3781    !! Generic, k-point version of \(\texttt{vexx}\).
3782    !
3783    USE cell_base,       ONLY : omega
3784    USE gvect,           ONLY : ngm, g
3785    USE wvfct,           ONLY : current_k, npwx
3786    USE klist,           ONLY : xk, nks, nkstot
3787    USE fft_interfaces,  ONLY : fwfft, invfft
3788    USE exx_base,        ONLY : index_xkq, nqs, index_xk, xkq_collect, &
3789                                g2_convolution
3790    USE exx_band,        ONLY : igk_exx
3791    !
3792    IMPLICIT NONE
3793    !
3794    INTEGER :: npw
3795    !! number of PW
3796    INTEGER :: NBands
3797    !! number of bands
3798    COMPLEX(DP) :: hpsi(npwx*npol,NBands)
3799    !! h psi
3800    COMPLEX(DP) :: mexx(NBands,NBands)
3801    !! mexx contains in output the exchange matrix
3802    REAL(DP) :: exxe
3803    !! exx energy
3804    !
3805    ! ... local variables
3806    !
3807    COMPLEX(DP), ALLOCATABLE :: RESULT(:), RESULT2(:,:)
3808    COMPLEX(DP), ALLOCATABLE :: rhoc(:), vc(:)
3809    REAL(DP), ALLOCATABLE :: fac(:)
3810    INTEGER :: ibnd, jbnd, ik, ikq, iq
3811    INTEGER :: ir, ig, NBin, NBtot
3812    INTEGER :: current_ik, current_jk
3813    INTEGER :: nrxxs
3814    REAL(DP) :: xkp(3)
3815    REAL(DP) :: xkq(3)
3816    !
3817    INTEGER, EXTERNAL :: global_kpoint_index
3818    !
3819    CALL start_clock( 'vexxloc' )
3820    !
3821    ALLOCATE( fac(dfftt%ngm) )
3822    !
3823    nrxxs = dfftt%nnr
3824    !
3825    ALLOCATE( RESULT(nrxxs) )
3826    ALLOCATE( rhoc(nrxxs), vc(nrxxs) )
3827    !
3828    current_ik = global_kpoint_index ( nkstot, current_k )
3829    current_jk = index_xkq(current_ik,1)
3830    !
3831    NBin = 0
3832    NBtot  = 0
3833    xkp = xk(:,current_k)
3834    DO jbnd = 1, NBands
3835       RESULT = (0.0_DP, 0.0_DP)
3836       DO iq = 1, nqs
3837          ikq = index_xkq(current_ik,iq)
3838          ik  = index_xk(ikq)
3839          xkq = xkq_collect(:,ikq)
3840          CALL g2_convolution( dfftt%ngm, gt, xkp, xkq, fac )
3841          DO ibnd = 1, NBands
3842             ! IF ( abs(x_occupation(ibnd,ik)) < eps_occ) CYCLE
3843             !
3844             NBtot = NBtot + 1
3845             IF ((exxmat(ibnd,ikq,jbnd,current_k) > local_thr).AND. &
3846                ((x_occupation(ibnd,ik) > eps_occ))) THEN
3847                  NBin = NBin + 1
3848               !
3849               ! write(stdout,'(4I4,f12.6,A)') ibnd, ikq, jbnd, current_k, exxmat(ibnd,ikq,jbnd,current_k), ' IN '
3850!$omp parallel do default(shared), private(ir)
3851               DO ir = 1, nrxxs
3852                  rhoc(ir)=CONJG(exxbuff(ir,ibnd,ikq))*exxbuff(ir,jbnd,current_jk) / omega
3853               ENDDO
3854!$omp end parallel do
3855               CALL fwfft( 'Rho', rhoc, dfftt )
3856               vc = (0._DP, 0._DP)
3857!$omp parallel do default(shared), private(ig)
3858               DO ig = 1, dfftt%ngm
3859                  vc(dfftt%nl(ig)) = &
3860                        fac(ig) * rhoc(dfftt%nl(ig)) * x_occupation(ibnd,ik) / nqs
3861               ENDDO
3862!$omp end parallel do
3863               CALL invfft( 'Rho', vc, dfftt )
3864!$omp parallel do default(shared), private(ir)
3865               DO ir = 1, nrxxs
3866                  RESULT(ir) = RESULT(ir) + vc(ir)*exxbuff(ir,ibnd,ikq)
3867               ENDDO
3868!$omp end parallel do
3869!            ELSE
3870!              write(stdout,'(4I4,f12.6,A)') ibnd, ikq, jbnd, current_k, exxmat(ibnd,ikq,jbnd,current_k), ' OUT'
3871             ENDIF
3872         ENDDO
3873       ENDDO
3874       !
3875       CALL fwfft( 'Wave', RESULT, dfftt )
3876!$omp parallel do default(shared), private(ig)
3877       DO ig = 1, npw
3878          hpsi(ig,jbnd) = hpsi(ig,jbnd) - exxalfa*RESULT(dfftt%nl(igk_exx(ig,current_k)))
3879       ENDDO
3880!$omp end parallel do
3881    ENDDO
3882    !
3883    DEALLOCATE( RESULT )
3884    DEALLOCATE( vc, fac )
3885    !
3886    ! ... Localized functions to G-space and exchange matrix onto localized functions
3887    ALLOCATE( RESULT2(npwx,NBands) )
3888    RESULT2 = (0.0d0,0.0d0)
3889    !
3890    DO jbnd = 1, NBands
3891      rhoc(:) = exxbuff(:,jbnd,current_jk)
3892      CALL fwfft( 'Wave' , rhoc, dfftt )
3893      DO ig = 1, npw
3894        RESULT2(ig,jbnd) = rhoc(dfftt%nl(igk_exx(ig,current_k)))
3895      ENDDO
3896    ENDDO
3897    !
3898    DEALLOCATE( rhoc )
3899    CALL matcalc_k( 'M1-', .TRUE., 0, current_k, npwx*npol, NBands, NBands, RESULT2, hpsi, mexx, exxe )
3900    DEALLOCATE( RESULT2 )
3901    !
3902    WRITE(stdout,'(7X,2(A,I12),A,f12.2)') '  Pairs(full): ',  NBtot, &
3903            '   Pairs(included): ', NBin, &
3904            '   Pairs(%): ', DBLE(NBin)/DBLE(NBtot)*100.0d0
3905    !
3906    CALL stop_clock( 'vexxloc' )
3907    !
3908  END SUBROUTINE vexx_loc_k
3909  !
3910  !
3911END MODULE exx
3912