1SUBROUTINE exx_gs(nfi, c)
2    !=======================================================================================
3    ! Code Version 1.0 (Princeton University, September 2014)
4    !=======================================================================================
5    ! Note:  This routine returns exx_potential already scaled by the mixing parameter exxalfa.
6    !        The scaled exx_potential is later added to the GGA potential in forces.f90.
7    !        In the future, the full exx_potential should be returned and the mixing parameter
8    !        exxalfa should be applied in forces.f90.
9    !=======================================================================================
10    !
11    USE kinds,                   ONLY  : DP
12    USE constants,               ONLY  : fpi
13    USE fft_base,                ONLY  : dffts, dfftp
14    USE mp,                      ONLY  : mp_barrier, mp_sum
15    USE mp_global,               ONLY  : nproc_image, me_image, root_image, intra_image_comm, intra_bgrp_comm, me_bgrp
16    USE parallel_include
17    USE io_global,               ONLY  : stdout
18    USE cell_base,               ONLY  : omega, ainv, h
19    USE cell_base,               ONLY  : ibrav
20    USE cell_base,               ONLY  : isotropic  !True if volume option is chosen for cell_dofree
21    USE electrons_base,          ONLY  : nbsp, nbspx, nspin
22    USE gvecw,                   ONLY  : ngw
23    USE wannier_module,          ONLY  : wfc
24    USE exx_module,              ONLY  : my_nbspx, my_nbsp, my_nxyz, index_my_nbsp,rk_of_obtl, lindex_of_obtl
25    USE exx_module,              ONLY  : selfv, pairv, pair_dist
26    USE exx_module,              ONLY  : pair_label, pair_status, pair_step
27    USE exx_module,              ONLY  : exx_potential
28    USE exx_module,              ONLY  : n_exx, sc_fac ! EXX step and the performance scaling factor
29    USE exx_module,              ONLY  : coe_1st_derv, coeke, nord1, nord2, fornberg
30    USE exx_module,              ONLY  : thdtood, odtothd_in_sp, thdtood_in_sp
31    USE exx_module,              ONLY  : np_in_sp_s   , np_in_sp_me_s   , np_in_sp_p   , np_in_sp_me_p
32    USE exx_module,              ONLY  : xx_in_sp,yy_in_sp,zz_in_sp,sc_xx_in_sp,sc_yy_in_sp,sc_zz_in_sp
33    USE energies,                ONLY  : exx
34    USE printout_base,           ONLY  : printout_base_open, printout_base_unit, printout_base_close
35    USE wannier_base,            ONLY  : neigh, dis_cutoff, texx_cube
36    USE mp_wave,                 ONLY  : redistwfr
37    !
38    USE time_step,               ONLY  : tps                !md time in picoseconds
39    USE io_global,               ONLY  : ionode             !logical for I/O node
40    USE cp_main_variables,       ONLY  : iprint_stdout      !print control
41    USE printout_base,           ONLY  : printout_base_close!close print unit
42    USE printout_base,           ONLY  : printout_base_open !open print unit
43    USE printout_base,           ONLY  : printout_base_unit !printout_base_unit
44    USE exx_module,              ONLY  : dexx_dh
45    USE exx_module,              ONLY  : exx_energy_cell_derivative
46    USE exx_module,              ONLY  : exxalfa
47    USE fft_helper_subroutines
48    use exx_module, only : psime_pair_recv, psime_pair_send
49    ! cubic domain related "use" variables
50    USE exx_module,              ONLY  : selfrho, pairrho
51    USE exx_module,              ONLY  : nrg
52    USE exx_module,              ONLY  : s_me_r, s_ps_r, p_me_r, p_ps_r
53    USE exx_module,              ONLY  : n_s_me, n_s_ps, n_p_me, n_p_ps
54    USE exx_module,              ONLY  : lmax
55    USE exx_module,              ONLY  : me_cs
56    USE exx_module,              ONLY  : me_rs
57    USE exx_module,              ONLY  : me_ri
58    USE exx_module,              ONLY  : me_rc
59    USE exx_module,              ONLY  : coemicf !MCA/HK : dirty hack for std CG
60    USE exx_module,              ONLY  : exx_energy_cell_derivative_cube
61    !
62    IMPLICIT NONE
63    COMPLEX(DP)   c(ngw, nbspx)        ! wave functions at time t
64#if defined(__MPI)
65    !
66    INTEGER  :: istatus(MPI_STATUS_SIZE)
67#endif
68    INTEGER     ir, ip, i, j,nfi, ierr, nnrtot, nr1s,nr2s,nr3s
69    INTEGER     nj_max, iunit
70    REAl(DP)    sa1,a(3),ha, hb, hc
71    REAl(DP)    hcub, centerx, centery, centerz
72    REAL(DP)    middle(3)
73    REAL(DP)    d_pair            ! pair distance
74    !
75    REAL(DP),    ALLOCATABLE ::   vpsil(:,:)
76    REAL(DP),    ALLOCATABLE ::   rhol(:),rho_in_sp(:),vl(:)
77    ! cubic domain related variables
78    INTEGER     l,m
79    INTEGER     lid,gid
80    INTEGER     s_me_r1,s_me_r2,s_me_r3,s_me_r4,s_me_r5,s_me_r6
81    REAl(DP)    inv_omega
82    REAl(DP)    dist, dxy, costheta, sintheta
83    COMPLEX(DP) cxy
84    REAl(DP)    plm(0:lmax,0:lmax)
85    REAl(DP)    ha_proj(3), hb_proj(3), hc_proj(3)
86    REAL(DP)    dq1,dq2,dq3
87    REAL(DP)    dqs1,dqs2,dqs3
88    REAL(DP)    start_timer, stop_timer
89    REAL(DP),   ALLOCATABLE :: rhome(:),rhops(:),potme(:)
90    INTEGER                 :: pos, oldest_step, guess_status
91    REAl(DP),   ALLOCATABLE :: psime(:)
92    REAL(DP),    ALLOCATABLE ::   psi(:,:)
93    !
94    INTEGER   iobtl, gindex_of_iobtl, irank, rk_of_obtl_trcv, rk_of_obtl_tbs
95    INTEGER   obtl_tbs, lindex_obtl_tbs, obtl_trcv, lindex_obtl_trcv
96    INTEGER   obtl_tbadd
97    REAL(DP)  totalenergy, totalenergyg, tot_energy(nbsp)
98    REAL(DP)  total_exx_derv(3,3), total_exx_derv_g(3,3)
99    REAL(DP)  selfe, paire(neigh/2), &
100        self_dexx_dhab(3,3), pair_dexx_dhab(3,3,neigh/2)
101    !
102    INTEGER,   ALLOCATABLE   :: isendreq(:)
103    INTEGER,   ALLOCATABLE   :: irecvreq(:)
104    INTEGER                  :: irecv_count
105    INTEGER                  :: isend_count
106    INTEGER                  :: itr
107    INTEGER                  :: jtr
108    INTEGER                  :: tran(3)
109    INTEGER                  :: proc
110    INTEGER                  :: tmp_iobtl
111    INTEGER                  :: me
112    !
113    INTEGER                  :: k,jj,ii,ia,ib,ic,my_var,my_var2,my_var3,i_fac,va,cgstep
114    INTEGER                  :: ndim,nogrp
115    INTEGER,    ALLOCATABLE  :: obtl_recv(:,:), obtl_send(:,:), num_recv(:), num_send(:)
116    REAl(DP),   ALLOCATABLE  :: wannierc(:,:),wannierc_tmp(:,:)
117    REAl(DP),   ALLOCATABLE  :: psil(:)
118    REAL(DP),   ALLOCATABLE  :: exx_tmp(:,:),exx_tmp3(:,:)
119    INTEGER,    ALLOCATABLE  :: sdispls(:), sendcount(:)
120    INTEGER,    ALLOCATABLE  :: rdispls(:), recvcount(:)
121    INTEGER,    ALLOCATABLE  :: sdispls1(:), sendcount1(:)
122    INTEGER,    ALLOCATABLE  :: rdispls1(:), recvcount1(:)
123    !
124    REAL(DP) :: Jim(3,3)  ! jacobian [d/d x]        [d/d a]
125    !                                |d/d y| = [J]  |d/d b|
126    !                                [d/d z]        [d/d c]
127    INTEGER            :: psgsn=3 !MCA: number of previous steps for which the arrays are stored. Memory overhead from here?
128    !
129    !=============================================================================================
130    !
131    CALL start_clock('exx_gs_setup')
132    CALL exx_gs_setup_common
133    if (texx_cube) then
134      CALL exx_gs_setup_cube
135    else
136      CALL exx_gs_setup_sphere
137    end if
138    !
139    !========================================================================
140    !
141    CALL stop_clock('exx_gs_setup')
142    !
143    !-------------------------------------------------------------------------
144    ! Get the Wannier center and compute the pair overlap matrix
145    !-------------------------------------------------------------------------
146    !
147    CALL start_clock('exx_pairs')
148    !
149    ndim=MAX(nproc_image, nbsp)
150    ALLOCATE (wannierc(3,ndim)); wannierc=0.0_DP
151    ALLOCATE (wannierc_tmp(3,nbsp)); wannierc_tmp=0.0_DP
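    ! ndim = MAX(nproc_image, nbsp), so that the pair bookkeeping below can be indexed
    ! either by band or by MPI task when there are more tasks than bands
    ! (the extra wannierc entries are filled with copies further down)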
152    !
153    ! Adjust Cartesian coordinates of wannier centres according to periodic boundary conditions...
154    ! N.B.: PBC are imposed here in the range [0,1)...
155    !
156    DO iobtl=1,nbsp
157      !
158      wannierc_tmp(1,iobtl)=ainv(1,1)*wfc(1,iobtl)+ainv(1,2)*wfc(2,iobtl)+ainv(1,3)*wfc(3,iobtl)   ! s = h^-1 r
159      wannierc_tmp(2,iobtl)=ainv(2,1)*wfc(1,iobtl)+ainv(2,2)*wfc(2,iobtl)+ainv(2,3)*wfc(3,iobtl)   ! s = h^-1 r
160      wannierc_tmp(3,iobtl)=ainv(3,1)*wfc(1,iobtl)+ainv(3,2)*wfc(2,iobtl)+ainv(3,3)*wfc(3,iobtl)   ! s = h^-1 r
161      !
162      wannierc_tmp(1,iobtl)=wannierc_tmp(1,iobtl)-FLOOR(wannierc_tmp(1,iobtl))   ! impose PBC on s in range: [0,1)
163      wannierc_tmp(2,iobtl)=wannierc_tmp(2,iobtl)-FLOOR(wannierc_tmp(2,iobtl))   ! impose PBC on s in range: [0,1)
164      wannierc_tmp(3,iobtl)=wannierc_tmp(3,iobtl)-FLOOR(wannierc_tmp(3,iobtl))   ! impose PBC on s in range: [0,1)
165      !
166      wannierc(1,iobtl)=h(1,1)*wannierc_tmp(1,iobtl)+h(1,2)*wannierc_tmp(2,iobtl)+h(1,3)*wannierc_tmp(3,iobtl)   ! r = h s
167      wannierc(2,iobtl)=h(2,1)*wannierc_tmp(1,iobtl)+h(2,2)*wannierc_tmp(2,iobtl)+h(2,3)*wannierc_tmp(3,iobtl)   ! r = h s
168      wannierc(3,iobtl)=h(3,1)*wannierc_tmp(1,iobtl)+h(3,2)*wannierc_tmp(2,iobtl)+h(3,3)*wannierc_tmp(3,iobtl)   ! r = h s
169      !
170      wannierc_tmp(:,iobtl)=wannierc(:,iobtl) ! keep a temporary copy to compute pair indices
171      !
172    END DO
173    !
174    ! make copy of wannier centres when number of processors > number of bands
175    !
176    IF(nproc_image.GT.nbsp) THEN
177      !
178      DO iobtl=nbsp+1,nproc_image
179        !
180        ir=MOD(iobtl,nbsp)
181        IF(ir.EQ.0 )THEN
182          wannierc(:,iobtl)=wannierc(:,nbsp)
183        ELSE
184          wannierc(:,iobtl)=wannierc(:,ir)
185        END IF
186        !
187      END DO
188      !
189    END IF
190    !
191    ! obtl_recv is the unique neighbor list for each band or processor image (ndim entries)
192    ! num_recv  is the number of unique neighbors for each band or processor image (ndim entries)
193    !---------------------------------------------------------------------------------------------
194    ALLOCATE (obtl_recv(neigh/2, ndim)); obtl_recv=0
195    ALLOCATE (obtl_send(neigh, ndim));   obtl_send=0
196    ALLOCATE (num_recv(ndim)); num_recv=0
197    ALLOCATE (num_send(ndim)); num_send=0
198    ! generate the unique neighbor list
199    !
200    CALL exx_index_pair(wannierc_tmp, obtl_recv, num_recv, nj_max, ndim)
201    !
202    IF (ALLOCATED(wannierc_tmp))            DEALLOCATE(wannierc_tmp)
203    !---------------------------------------------------------------------------------------------
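    ! invert the unique receive list into a send list: if band itr is to receive the
    ! orbital of band k = obtl_recv(jtr,itr), then band k must send its orbital to itr;
    ! record itr in obtl_send(:,k) and count it in num_send(k)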
204    DO itr = 1, ndim
205      !
206      DO jtr = 1, neigh/2
207        !
208        IF (obtl_recv(jtr, itr) .NE. 0) THEN
209          !
210          num_send( obtl_recv(jtr, itr) ) = num_send( obtl_recv(jtr, itr) ) + 1
211          obtl_send( num_send( obtl_recv(jtr, itr) ), obtl_recv(jtr, itr) ) = itr
212          !
213        END IF
214        !
215      END DO
216      !
217    END DO
218    !---------------------------------------------------------------------------------------------
219    !
220    CALL stop_clock('exx_pairs')
221    !
222    !-------------------------------------------------------------------------
223    !
224    ! Allocate variables to store potentials of the previous psgsn steps ...
225    !
226    IF (n_exx.EQ.0) THEN
227      !
228      ! the following variables are used in the extrapolation of exx potentials
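      ! (selfv/pairv keep the potentials of up to psgsn previous steps, selfrho/pairrho the
      !  corresponding densities in the cube case, and pair_dist the pair distances in the
      !  sphere case; pair_label, pair_step and pair_status track which neighbor occupies
      !  each storage slot and how recent it is, so the data can serve as an initial guess
      !  for the conjugate-gradient Poisson solve in getvofr_*)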
229      if (texx_cube) then
230        ALLOCATE( selfv  ( n_s_ps, psgsn, my_nbspx), stat=ierr ); selfv=0.0_DP
231        ALLOCATE( pairv  ( n_p_ps, psgsn, neigh, my_nbspx), stat=ierr ); pairv=0.0_DP
232        ALLOCATE( selfrho( n_s_ps, psgsn, my_nbspx), stat=ierr ); selfrho=0.0_DP
233        ALLOCATE( pairrho( n_p_ps, psgsn, neigh, my_nbspx), stat=ierr ); pairrho=0.0_DP
234        ALLOCATE( pair_label( neigh, my_nbspx ), stat=ierr ); pair_label=0.0_DP
235        ALLOCATE( pair_step( neigh, my_nbspx ), stat=ierr ); pair_step=0.0_DP
236        ALLOCATE( pair_status( neigh, my_nbspx ), stat=ierr ); pair_status=0.0_DP
237      else
238        ALLOCATE( selfv  ( np_in_sp_s, psgsn, my_nbspx), stat=ierr ); selfv=0.0_DP
239        ALLOCATE( pairv  ( np_in_sp_p, psgsn, neigh, my_nbspx), stat=ierr ); pairv=0.0_DP
240        ALLOCATE( pair_dist( psgsn, neigh, my_nbspx), stat=ierr ); pair_dist=0.0_DP
241      end if
242      !
243    END IF
244    !
245    !-------------------------------------------------------------------------
246    !
247    ! update exx step ...
248    !
249    n_exx = n_exx + 1
250    !
251    !=========================================================================
252    !
253    ! obtain orbitals on each local processor, stored in psi
254    !
255    ALLOCATE ( psi(  nnrtot, my_nbsp(me ) ) ); psi=0.0_DP
256    ALLOCATE ( vpsil(nnrtot, my_nbsp(me ) ) ); vpsil=0.0_DP
257    !
258    CALL start_clock('r_orbital')
259    !
260    ! The wavefunction c (plane-wave representation) is distributed in nr3 parts
261    ! over the parallel MPI tasks (processors).
262    ! exx_psi maps c into psi, which lives on the real-space grid.
263    !
264    ! c   -- one processor has a part of information from  all bands
265    ! psi -- one processor has complete information of one band (or more)
266    !
267    CALL exx_psi(c, psi, nnrtot, my_nbsp, my_nxyz, nbsp)
268    !
269    CALL stop_clock('r_orbital')
270    !
271    !===============================================================================
272    !
273    !           PAIR AND SELF POTENTIALS ARE CALCULATED IN ONE BIG LOOP
274    !
275    !===============================================================================
276    !
277    !========================================================================
278    !                      THE MOST OUTER LOOP STARTS:
279    !========================================================================
280    !
281    ! request handles for tracking completion of the non-blocking pair communication
282    !
283    ALLOCATE( irecvreq( neigh * my_nbsp(me) ) )
284    ALLOCATE( isendreq( neigh * my_nbsp(me) ) )
285    !
286    ! obtain psi in sphere (psil) for neighbors
287    !
288    call start_clock('exx_big_alloc')
289    if (texx_cube) then
290      if (.not.allocated(psime_pair_send)) then
291        ALLOCATE ( psime_pair_send(n_p_me, neigh, my_nbsp(me)) ); psime_pair_send=0.0_DP
292      end if
293      if (.not.allocated(psime_pair_recv)) then
294        ALLOCATE ( psime_pair_recv(n_p_me, neigh, my_nbsp(me)) ); psime_pair_recv=0.0_DP
295      end if
296    else
297      if (.not.allocated(psime_pair_send)) then
298        ALLOCATE ( psime_pair_send(np_in_sp_me_p, neigh, my_nbsp(me)) ); psime_pair_send=0.0_DP
299      end if
300      if (.not.allocated(psime_pair_recv)) then
301        ALLOCATE ( psime_pair_recv(np_in_sp_me_p, neigh, my_nbsp(me)) ); psime_pair_recv=0.0_DP
302      end if
303    end if
304    call stop_clock('exx_big_alloc')
305    !
306    ! initialize totalenergy and derivatives
307    !
308    totalenergy = 0.0_DP
309    total_exx_derv(:,:) =0.0_DP
310    !
311    ! my_var is the maximum of nbsp and nproc_image
312    my_var = MAX(nproc_image, nbsp)
313    !
314#if defined(__MPI)
315    irecv_count = 0
316    isend_count = 0
317    ! ---------- start of Non-blocking communication (1) -----------
318    ! In this region of the code, we start with non-blocking sends of
319    ! local orbitals, used to form MLWF product potentials on the non-self
320    ! multipole expansion domain. This allows all pairs to be
321    ! sent simultaneously instead of one-by-one...
322    ! --------------------------------------------------------------
323    CALL start_clock('send_psi')
324    !
325    ! we should use my_nbspx (i.e. maxval(my_nbsp)) here
326    !
327    DO iobtl = 1, my_nbspx
328      !
329      gindex_of_iobtl = index_my_nbsp(iobtl, me)
330      if (gindex_of_iobtl.gt.nbsp) cycle ! index_my_nbsp is set to nbsp+1 when not assigned
331      !
332      !========================================================================
333      !
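    ! message tags encode the ordered orbital pair as
    !   tag = (global index of the orbital being sent)*ndim + (global index of the receiving orbital)
    ! which is unique for every (sender, receiver) orbital pair, so the matching
    ! IRECV/ISEND calls below cannot be confused with one another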
334      !prep receives
335      DO itr = 1, neigh/2
336        !
337        obtl_tbs = obtl_recv( itr, gindex_of_iobtl )
338        !
339        IF ( obtl_tbs .NE. 0 ) THEN
340          !
341          rk_of_obtl_tbs  = rk_of_obtl(obtl_tbs)
342          !
343          irecv_count = irecv_count + 1
344          if (texx_cube) then
345            CALL MPI_IRECV( psime_pair_recv(1, itr, iobtl), n_p_me, MPI_DOUBLE_PRECISION, rk_of_obtl_tbs, &
346              obtl_tbs*ndim+gindex_of_iobtl, intra_image_comm, irecvreq(irecv_count), ierr)
347          else
348            CALL MPI_IRECV( psime_pair_recv(1, itr, iobtl), np_in_sp_me_p, MPI_DOUBLE_PRECISION, rk_of_obtl_tbs, &
349              obtl_tbs*ndim+gindex_of_iobtl, intra_image_comm, irecvreq(irecv_count), ierr)
350          end if
351          !
352        END IF
353        !
354      END DO
355      !
356      !prep sends
357      DO itr = 1, neigh
358        !
359        obtl_trcv = obtl_send( itr, gindex_of_iobtl )
360        !
361        IF ( obtl_trcv .NE. 0 ) THEN
362          !
363          rk_of_obtl_trcv = rk_of_obtl(obtl_trcv)
364          !
365          ! calculate mid point of two wannier centers
366          CALL getmiddlewc( wannierc(1, gindex_of_iobtl), wannierc(1, obtl_trcv), h, ainv, middle )
367          !
368          ! calculate translation vector from the center of the box
369          CALL getsftv( nr1s, nr2s, nr3s, h, ainv, middle, tran)
370          !
371          ! get the localized psi around the mid point of two wannier centers
372          ! note: the psime is centered at the center of the box
373          ! (using the translation vector "tran" from middle of wfc to the center of box)
374          if (texx_cube) then
375            CALL getpsicb( nrg, p_me_r, psi(1,iobtl), psime_pair_send(1, itr, iobtl), tran)
376          else
377            CALL getpsil( nnrtot, np_in_sp_me_p, psi(1, iobtl), psime_pair_send(1, itr, iobtl), tran)
378          end if
379          !
380          isend_count = isend_count + 1
381          if (texx_cube) then
382            CALL MPI_ISEND( psime_pair_send(1, itr, iobtl), n_p_me, MPI_DOUBLE_PRECISION, rk_of_obtl_trcv, &
383                            gindex_of_iobtl*ndim+obtl_trcv, intra_image_comm, isendreq(isend_count), ierr)
384          else
385            CALL MPI_ISEND( psime_pair_send(1, itr, iobtl), np_in_sp_me_p, MPI_DOUBLE_PRECISION, rk_of_obtl_trcv, &
386              gindex_of_iobtl*ndim+obtl_trcv, intra_image_comm, isendreq(isend_count), ierr)
387          end if
388          !
389        END IF
390        !
391      END DO
392      !========================================================================
393      !
394    END DO
395    !
396    !==========================================================================
397    !
398    CALL stop_clock('send_psi')
399    !
400    !==========================================================================
401    !
402    CALL start_clock('send_psi_wait')
403    !
404    DO itr = 1, isend_count
405      CALL MPI_WAIT(isendreq(itr), istatus, ierr)
406    END DO
407    !
408    DO itr = 1, irecv_count
409      CALL MPI_WAIT(irecvreq(itr), istatus, ierr)
410    END DO
411    !
412    CALL stop_clock('send_psi_wait')
413    ! ------- end of Non-blocking communication (1): synchronize by wait ------
414#endif
415    !
416    !=========================================================================
417    ! after this loop, every processor has received all of the overlapping orbitals
418    ! for its iobtl orbitals and is ready to calculate the pair potentials
419    !=========================================================================
420    !
421    !=========================================================================
422    !                              CALCULATION STARTS HERE
423    !=========================================================================
424    ! printout header for the cgsteps
425    !
426    IF ((MOD(nfi,iprint_stdout).EQ.0)) THEN
427      !
428      IF (ionode) THEN
429        !
430        iunit=printout_base_unit("ncg")
431        !
432        CALL printout_base_open("ncg")
433        !
434        WRITE(iunit,'(I8,F16.8)')nfi,tps
435        !
436      END IF
437      !
438    END IF
439    !
440    CALL start_clock('getpairv')
441    !
442    if (texx_cube) then
443      ! Do some allocations
444      IF(.not.ALLOCATED(psime )) ALLOCATE( psime(max(n_p_me,n_s_me)) );
445      IF(.not.ALLOCATED(rhome )) ALLOCATE( rhome(max(n_p_me,n_s_me)) );
446      IF(.not.ALLOCATED(rhops )) ALLOCATE( rhops(max(n_p_ps,n_s_ps)) );
447      IF(.not.ALLOCATED(potme )) ALLOCATE( potme(max(n_p_me,n_s_me)) );
448    end if
449    DO iobtl = 1, my_nbspx
450      !
451      middle(:)=0.0_DP
452      !
453      gindex_of_iobtl = index_my_nbsp(iobtl, me)
454      if (gindex_of_iobtl.gt.nbsp) cycle ! index_my_nbsp is set to nbsp+1 when not assigned
455      !
456      IF ( gindex_of_iobtl .LE. my_var) THEN
457        !
458        !-- second loop starts: calculate overlapping potential with the j_th orbitals --
459        !
460        ! my_var3 is the number of unique neighbors of gindex_of_iobtl
461        my_var3=num_recv( gindex_of_iobtl )
462        !
463        do j = 1, my_var3
464          !
465          ! my_var2 is the global index of the unique pair to gindex_of_iobtl
466          my_var2=obtl_recv(j,gindex_of_iobtl)
467          !
468          IF (my_var2 .NE. 0) THEN
469            !==================================================================================
470            !                      find the position of previous v/rho
471            !==================================================================================
472            !
473            if (texx_cube) then
474              call solve_a_nonself_pair_cube
475            else
476              call solve_a_nonself_pair_sphere
477            end if
478            !
479            paire(j) = paire(j) * 0.5_DP* hcub             ! volume element hcub and trapezoidal rule prefactor 0.5_DP are included
480            totalenergy = totalenergy + 2.0_DP*paire(j)    ! the factor of two comes from the equivalence of the ij and ji pairs
481            !
482            IF (.NOT. (isotropic .AND. (ibrav.EQ.1) )) THEN
483              CALL start_clock('exx_cell_derv')
484              !
485              ! EXX cell derivative (note: exxalfa is included in vofrho.f90 when calculate stress)
486              !
487              if (texx_cube) then
488                CALL exx_energy_cell_derivative_cube(p_me_r, p_ps_r, tran, rhome, potme, &
489                  ha_proj, hb_proj, hc_proj, Jim, pair_dexx_dhab(:,:,j))
490              else
491                CALL exx_energy_cell_derivative(np_in_sp_me_p, np_in_sp_p, tran,&
492                  vl, ha, hb, hc, rhol, pair_dexx_dhab(:,:,j))
493              end if
494              !
495              ! volume element hcub and trapezoidal rule prefactor 0.5_DP are included
496              !
497              pair_dexx_dhab(:,:,j) = pair_dexx_dhab(:,:,j)*0.5_DP*hcub
498              !
499              ! accumulate the derivative from different pair terms
500              !
501              total_exx_derv(:,:) = total_exx_derv(:,:) + 2.0_DP*pair_dexx_dhab(:,:,j)
502              !
503              !
504              ! if isotropic => calculate the stress tensor in vofrho.f90
505              !
506              CALL stop_clock('exx_cell_derv')
507            END IF
508            !
509            if (.not.texx_cube) then
510              IF (ALLOCATED(psil))            DEALLOCATE(psil)
511              IF (ALLOCATED(rhol))            DEALLOCATE(rhol)
512              IF (ALLOCATED(rho_in_sp))       DEALLOCATE(rho_in_sp)
513              IF (ALLOCATED(vl))              DEALLOCATE(vl)
514            end if
515            !
516          END IF
517          !
518        END DO !for j
519        !
520      END IF !gindex_of_iobtl <= my_var
521      !
522    END DO
523    CALL stop_clock('getpairv')
524    !
525    !===============================================================================
526    ! After this loop, each processor has finished the pair potentials for its
527    ! iobtl orbitals and will now send/receive the force contributions for the partner orbitals
528    !===============================================================================
529    !
530    !===============================================================================
531    !                INITIALIZE SEND VPSI BEFORE CALCULATE PAIR
532    !===============================================================================
533    !
534#if defined(__MPI)
535    irecv_count = 0
536    isend_count = 0
537    !
538    ! ---------- start of Non-blocking communication (2) -----------
539    ! In this region of the code, we use non-blocking MPI feature to
540    ! send the EXX contributions to orbital force and asynchronously
541    ! overlap with the self-exchange computation...
542    ! --------------------------------------------------------------
543    CALL start_clock('send_v')
544    !
545    !========================================================================
546    !
547    DO iobtl = 1, my_nbspx
548      !
549      !========================================================================
550      !
551      gindex_of_iobtl = index_my_nbsp(iobtl, me)
552      if (gindex_of_iobtl.gt.nbsp) cycle ! index_my_nbsp is set to nbsp+1 when not assigned
553      !
554      DO itr = 1, neigh/2
555        !
556        obtl_trcv = obtl_recv( itr, gindex_of_iobtl )
557        !
558        IF ( obtl_trcv .NE. 0 ) THEN
559          !
560          rk_of_obtl_trcv  = rk_of_obtl(obtl_trcv)
561          !
562          isend_count = isend_count + 1
563          if (texx_cube) then
564            CALL MPI_ISEND( psime_pair_recv(1,itr,iobtl), n_p_me, MPI_DOUBLE_PRECISION, rk_of_obtl_trcv, &
565              gindex_of_iobtl*ndim+obtl_trcv, intra_image_comm, isendreq(isend_count), ierr)
566          else
567            CALL MPI_ISEND( psime_pair_recv(1,itr,iobtl), np_in_sp_me_p, MPI_DOUBLE_PRECISION, rk_of_obtl_trcv, &
568              gindex_of_iobtl*ndim+obtl_trcv, intra_image_comm, isendreq(isend_count), ierr)
569          end if
570          !
571          ! WRITE(my_unit,*) "itr = ", itr
572          ! WRITE(my_unit,*) "iobtl = ", iobtl
573          ! WRITE(my_unit,*) "gindex_of_iobtl = ",gindex_of_iobtl
574          ! WRITE(my_unit,*) "obtl_trcv: ", obtl_trcv
575          ! WRITE(my_unit,*) "rk_of_obtl_trcv: ", rk_of_obtl_trcv
576          ! WRITE(my_unit,*) "tag: ", gindex_of_iobtl*nbsp+obtl_trcv
577          !
578        END IF
579        !
580      END DO
581      !
582      !========================================================================
583      !
584    !MCA: send and receive now change their roles. Before, we were communicating
585    !the orbitals. Now, it is the potential obtained from the Poisson solver. Send is
586    !receiving and recv is sending. From now on, only psime_pair_send is used for
587    !computation.
588      !
589      DO itr = 1, neigh
590        !
591        obtl_tbs = obtl_send( itr, gindex_of_iobtl )
592        !
593        IF ( obtl_tbs .NE. 0 ) THEN
594          !
595          rk_of_obtl_tbs = rk_of_obtl(obtl_tbs)
596          !
597          irecv_count = irecv_count + 1
598          if (texx_cube) then
599            CALL MPI_IRECV( psime_pair_send(1, itr, iobtl), n_p_me, MPI_DOUBLE_PRECISION, rk_of_obtl_tbs, &
600              obtl_tbs*ndim+gindex_of_iobtl, intra_image_comm, irecvreq(irecv_count), ierr)
601          else
602            CALL MPI_IRECV( psime_pair_send(1, itr, iobtl), np_in_sp_me_p, MPI_DOUBLE_PRECISION, rk_of_obtl_tbs, &
603              obtl_tbs*ndim+gindex_of_iobtl, intra_image_comm, irecvreq(irecv_count), ierr)
604          end if
605          !
606          ! WRITE(my_unit,*) "itr = ", itr
607          ! WRITE(my_unit,*) "iobtl = ", iobtl
608          ! WRITE(my_unit,*) "gindex_of_iobtl = ", gindex_of_iobtl
609          ! WRITE(my_unit,*) "obtl_tbs: ", obtl_tbs
610          ! WRITE(my_unit,*) "rk_of_obtl_tbs: ", rk_of_obtl_tbs
611          ! WRITE(my_unit,*) "tag: ", obtl_tbs*nbsp+gindex_of_iobtl
612          !
613          !
614        END IF
615        !
616      END DO
617      !
618      !========================================================================
619      !
620    END DO
621    !
622    !========================================================================
623    !
624    CALL stop_clock('send_v')
625#endif
626    !
627    !=========================================================================
628    !
629    !              SELF POTENTIAL FOR EACH ORBITAL STARTS HERE
630    !
631    !=========================================================================
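    ! the "self pair" is an orbital paired with itself: the product psi_i*psi_i defines
    ! the density that is solved for on the self domains s_me_r / s_ps_r
    ! (np_in_sp_me_s / np_in_sp_s in the spherical case)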
632    !
633    CALL start_clock('getselfv')
634    !
635    !=========================================================================
636    DO iobtl = 1, my_nbspx
637      !
638      IF (iobtl.LE.my_nbsp(me)) THEN ! skip when the my_nbspx loop index exceeds the number of locally owned bands
639        !
640        IF (me.GT.(nbsp*(sc_fac-1))) THEN ! compatible with more processors than nbsp
641          !
642          gindex_of_iobtl =  index_my_nbsp(iobtl, me)
643          if (gindex_of_iobtl.gt.nbsp) cycle ! index_my_nbsp is set to nbsp+1 when not assigned
644          !
645          if (texx_cube) then
646            call solve_a_self_pair_cube
647          else
648            call solve_a_self_pair_sphere
649          end if
650          selfe = selfe * 0.5_DP * hcub               ! volume element hcub and trapezoidal rule prefactor 0.5_DP are included
651          totalenergy = totalenergy + selfe
652          !
653          !IF(me .GT. nbsp) THEN
654          !  vpsil(:,iobtl) = 0.0_DP
655          !END IF
656          !
657          IF (.NOT. (isotropic .AND. (ibrav.EQ.1))) THEN
658            !
659            !  EXX cell derivative (note: need to include exxalfa later)
660            CALL start_clock('exx_cell_derv')
661            !
662            if (texx_cube) then
663              CALL exx_energy_cell_derivative_cube(s_me_r, s_ps_r, tran, rhome, potme, &
664                ha_proj, hb_proj, hc_proj, Jim, self_dexx_dhab(:,:))
665            else
666              CALL exx_energy_cell_derivative(np_in_sp_me_s, np_in_sp_s, tran,&
667                vl, ha, hb, hc, rhol, self_dexx_dhab(:,:))
668            end if
669            !
670            ! volume element hcub and trapezoidal rule prefactor 0.5_DP are included
671            !
672            self_dexx_dhab(:,:) = self_dexx_dhab(:,:)*0.5_DP*hcub
673            !
674            ! combine derivative with pair terms
675            !
676            total_exx_derv(:,:) = total_exx_derv(:,:) + self_dexx_dhab(:,:)
677            !
678            CALL stop_clock('exx_cell_derv')
679            !
680            ! if isotropic => calculate the stress tensor in vofrho.f90
681            !
682          END IF
683          !
684          if (.not.texx_cube) then
685            IF (ALLOCATED(psil))            DEALLOCATE(psil)
686            IF (ALLOCATED(rhol))            DEALLOCATE(rhol)
687            IF (ALLOCATED(rho_in_sp))       DEALLOCATE(rho_in_sp)
688            IF (ALLOCATED(vl))              DEALLOCATE(vl)
689          end if
690          !
691        END IF ! me
692        !
693      END IF !iobtl
694      !
695    END DO ! iobtl
696    if (texx_cube) then
697      IF (ALLOCATED(psime))           DEALLOCATE(psime)
698      IF (ALLOCATED(rhome))           DEALLOCATE(rhome)
699      IF (ALLOCATED(rhops))           DEALLOCATE(rhops)
700      IF (ALLOCATED(potme))           DEALLOCATE(potme)
701    end if
702    CALL stop_clock('getselfv')
703    !========================================================================
704    !
705#if defined(__MPI)
706    CALL start_clock('send_v_wait')
707    !
708    DO itr = 1, isend_count
709      CALL MPI_WAIT(isendreq(itr), istatus, ierr)
710    END DO
711    !
712    DO itr = 1, irecv_count
713      CALL MPI_WAIT(irecvreq(itr), istatus, ierr)
714    END DO
715    !
716    CALL stop_clock('send_v_wait')
717    ! ------- end of Non-blocking communication (2): synchronize by wait ------
718    !
719    !========================================================================
720    CALL start_clock('force_rec')
721    !
722    DO iobtl = 1, my_nbspx
723      !
724      gindex_of_iobtl =  index_my_nbsp(iobtl, me)
725      if (gindex_of_iobtl.gt.nbsp) cycle ! index_my_nbsp is set to nbsp+1 when not assigned
726      !
727      DO itr = 1, neigh
728        !
729        obtl_tbadd = obtl_send(itr, gindex_of_iobtl)
730        !
731        IF ( obtl_tbadd .NE. 0 ) THEN
732          !
733          CALL getmiddlewc( wannierc(1,gindex_of_iobtl), wannierc(1,obtl_tbadd), h, ainv, middle )
734          !
735          ! calculate translation vector from the center of the box
736          CALL getsftv( nr1s, nr2s, nr3s, h, ainv, middle, tran )
737          !
738          ! update vpsil (PBE0)
739          !
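          ! after communication round (2), psime_pair_send(:,itr,iobtl) holds the force
          ! contribution computed by the task that owns the partner orbital (for the
          ! spherical path this is -exxalfa*vl*psil, cf. solve_a_nonself_pair_sphere);
          ! here it is accumulated into the local orbital's force array vpsil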
740          if (texx_cube) then
741            CALL updateforce_rec(nrg, p_me_r, vpsil(1,iobtl), psime_pair_send(1,itr,iobtl), tran)
742          else
743            !$omp parallel do private(ir)
744            DO ip = 1, np_in_sp_me_p
745              CALL l2goff (ip,ir,tran) ! local is centered at box center; global index is offset by tran
746              vpsil(ir,iobtl) = vpsil(ir,iobtl) + psime_pair_send(ip,itr,iobtl)
747            END DO
748            !$omp end parallel do
749          end if
750          !
751        END IF
752        !
753      END DO
754      !
755    END DO ! iobtl
756    !
757    CALL stop_clock('force_rec')
758    !========================================================================
759#endif
760    !
761    CALL start_clock('totalenergy')
762    !
763    totalenergyg=0.0_DP ! mpi reduction variable initialization
764    exx=0.0_DP          ! exx energy (used to handle the open/closed shell energy)
765    !
766#if defined(__MPI)
767    ! collect the totalenergy of each mpi task to totalenergyg
768    CALL MPI_ALLREDUCE(totalenergy, totalenergyg, 1, MPI_DOUBLE_PRECISION, &
769        &                        MPI_SUM, intra_image_comm, ierr)
770    !
771#else
772    totalenergyg = totalenergy
773#endif
774    exx = totalenergyg
775    IF (nspin .EQ. 1) exx = exx + totalenergyg ! if closed shell double the totalenergy
776    !
777    !WRITE(stdout, '("EXX Energy",2F30.14," step",I7)')exx,totalenergyg*2.0_DP, nfi
778    !
779    CALL stop_clock('totalenergy')
780    !
781    CALL start_clock('exx_cell_derv')
782    !
783    total_exx_derv_g(:,:) = 0.0_DP ! mpi reduction variable initialization
784    !
785    IF (.NOT. (isotropic .AND. (ibrav.EQ.1))) THEN
786#if defined(__MPI)
787      ! collect the total_exx_derv of each mpi task to total_exx_derv_g
788      CALL MPI_ALLREDUCE(total_exx_derv(:,:), total_exx_derv_g(:,:), 9, &
789          MPI_DOUBLE_PRECISION, MPI_SUM, intra_image_comm, ierr)
790      !
791#else
792      total_exx_derv_g(:,:) = total_exx_derv(:,:)
793#endif
794    END IF
795    !
796    ! for the closed-shell case include the spin factor of 2
797    dexx_dh(:,:) = total_exx_derv_g(:,:)
798    IF (nspin .EQ. 1) dexx_dh(:,:) = dexx_dh(:,:) + total_exx_derv_g(:,:)
799    !
800    CALL stop_clock('exx_cell_derv')
801    !
802    ! Local to global distribution of EXX potential
803    ! vpsil (local) --> exx_potential (global)
804    !
805    CALL start_clock('vl2vg')
806    exx_potential=0.0_DP
807    !
808    IF (nproc_image .LE. nbsp) THEN
809      !
810#ifdef __MPI
811      CALL redistwfr ( exx_potential, vpsil, my_nxyz, my_nbsp, intra_image_comm, -1 )
812#else
813      exx_potential = vpsil
814#endif
815      !
816    ELSE
817      !
818      !-----------Zhaofeng's vpsil (local) to exx_potential (global) -----------
819      !
820      nogrp = fftx_ntgrp(dffts)
821      !
822      ALLOCATE( sdispls(nproc_image), sendcount(nproc_image) ); sdispls=0; sendcount=0
823      ALLOCATE( rdispls(nproc_image), recvcount(nproc_image) ); rdispls=0; recvcount=0
824      ALLOCATE( sdispls1(nogrp), sendcount1(nogrp) ); sdispls1=0; sendcount1=0
825      ALLOCATE( rdispls1(nogrp), recvcount1(nogrp) ); rdispls1=0; recvcount1=0
826      !
827      DO proc = 1, nproc_image
828        !
829        IF (me <= nogrp*nr3s) THEN
830          sendcount(proc) =nr1s*nr2s/nogrp
831        ELSE
832          sendcount(proc) = 0
833        END IF
834        !example: procs 1 to nogrp*nr3s (say 640 out of 1024) each hold
835        !nr1s*nr2s/nogrp points of the orbital data (duplicated),
836        !while the remaining procs (641 to 1024 in this example) stay idle
837        IF (proc <= nogrp*nr3s) THEN
838          recvcount(proc)=nr1s*nr2s/nogrp
839        ELSE
840          recvcount(proc)=0
841        END IF
842        !
843      END DO
844      !
845      sdispls(1) = 0
846      rdispls(1) = 0
847      !
848      DO proc = 2,  nproc_image
849        sdispls(proc)=  sdispls(proc-1) + sendcount(proc-1)
850        rdispls(proc) = rdispls(proc-1) + recvcount(proc-1)
851      END DO
852      !
853      ALLOCATE(exx_tmp (dffts%nnr,nproc_image/nogrp)); exx_tmp=0.0_DP
854      ALLOCATE(exx_tmp3(dffts%nnr,nproc_image/nogrp)); exx_tmp3=0.0_DP
855      !
856#if defined(__MPI)
857      !
858      CALL mp_barrier( intra_image_comm )
859      CALL MPI_ALLTOALLV(vpsil(1,1), recvcount,rdispls,MPI_DOUBLE_PRECISION, &
860          &           exx_tmp, sendcount,sdispls, MPI_DOUBLE_PRECISION, &
861          &           intra_image_comm, ierr)
862#endif
863      !
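      ! regroup the chunks delivered by the all-to-all: slices of length va are copied from
      ! exx_tmp into exx_tmp3 so that each column of exx_tmp3 is laid out contiguously for
      ! the per-task-group all-to-all that follows below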
864      va = dffts%nnr/nogrp
865      DO j=1,nproc_image/nogrp,2
866        DO i=1,2*nogrp
867          ii=((i-1)/2)*va
868          jj=j+mod(i-1,2)
869          ia=(i-1-((i-1)/nogrp)*nogrp)*va
870          ib=j+(i-1)/nogrp
871          !$omp parallel do
872          DO ir=1,va
873            exx_tmp3(ii+ir,jj)=exx_tmp(ia+ir,ib)
874          END DO
875          !$omp end parallel do
876        END DO
877      END DO
878      !
879      DO proc = 1 , nogrp
880        sendcount1(proc) = dffts%nnr/nogrp
881        recvcount1(proc) = dffts%nnr/nogrp
882      END DO
883      !
884      rdispls1(1) = 0
885      sdispls1(1) = 0
886      !
887      DO proc = 2, nogrp
888        sdispls1(proc) = sdispls1(proc-1) + sendcount1(proc-1)
889        rdispls1(proc) = rdispls1(proc-1) + recvcount1(proc-1)
890      END DO
891      !
892#if defined(__MPI)
893      !
894      DO ir=1,nproc_image/nogrp
895        CALL mp_barrier( fftx_tgcomm(dffts) )
896        CALL MPI_ALLTOALLV(exx_tmp3(1,ir), sendcount1, sdispls1, MPI_DOUBLE_PRECISION, &
897            &         exx_potential(1,ir),recvcount1, rdispls1, MPI_DOUBLE_PRECISION, &
898            &         fftx_tgcomm(dffts), ierr)
899      END DO
900#endif
901      !
902      DO ir=1,nbsp/nogrp
903        DO i=1,sc_fac-1
904          ii=i*nbsp/nogrp
905          !$omp parallel do
906          DO ia=1,dffts%nnr
907            exx_potential(ia,ir)=exx_potential(ia,ir)+exx_potential(ia,ir+ii)
908          END DO
909          !$omp end parallel do
910        END DO
911      END DO
912      !
913      !-----------Zhaofeng's vpsil (local) to exx_potential (global) -----------
914      !
915    END IF ! vl2vg
916    !
917    CALL stop_clock('vl2vg')
918    !
919    !==============================================================================
920    IF (ALLOCATED(vpsil))           DEALLOCATE(vpsil)
921    IF (ALLOCATED(psi))             DEALLOCATE(psi)
922    IF (ALLOCATED(isendreq))        DEALLOCATE(isendreq)
923    IF (ALLOCATED(irecvreq))        DEALLOCATE(irecvreq)
924    IF (ALLOCATED(wannierc))        DEALLOCATE(wannierc)
925    IF (ALLOCATED(obtl_recv))       DEALLOCATE(obtl_recv)
926    IF (ALLOCATED(obtl_send))       DEALLOCATE(obtl_send)
927    IF (ALLOCATED(num_recv))        DEALLOCATE(num_recv)
928    IF (ALLOCATED(num_send))        DEALLOCATE(num_send)
929    IF (ALLOCATED(exx_tmp))         DEALLOCATE(exx_tmp)
930    IF (ALLOCATED(exx_tmp3))        DEALLOCATE(exx_tmp3)
931    IF (ALLOCATED(sdispls))         DEALLOCATE(sdispls)
932    IF (ALLOCATED(rdispls))         DEALLOCATE(rdispls)
933    IF (ALLOCATED(sdispls1))        DEALLOCATE(sdispls1)
934    IF (ALLOCATED(rdispls1))        DEALLOCATE(rdispls1)
935    IF (ALLOCATED(sendcount))       DEALLOCATE(sendcount)
936    IF (ALLOCATED(recvcount))       DEALLOCATE(recvcount)
937    IF (ALLOCATED(sendcount1))      DEALLOCATE(sendcount1)
938    IF (ALLOCATED(recvcount1))      DEALLOCATE(recvcount1)
939    !
940    RETURN
941  contains
942
943    SUBROUTINE  exx_gs_setup_common()
944      IMPLICIT NONE
945      !
946      ! make processor index start from 1
947      !
948      me = me_image+1
949      !
950      ! number of real-space grid points along each lattice vector direction (a1, a2, a3)
951      !
952      nr1s=dfftp%nr1; nr2s=dfftp%nr2; nr3s=dfftp%nr3
953      !
954      ! the length of each lattice vector
955      !
956      a(1)=DSQRT(h(1,1)*h(1,1)+h(2,1)*h(2,1)+h(3,1)*h(3,1))   ! lattice 1
957      a(2)=DSQRT(h(1,2)*h(1,2)+h(2,2)*h(2,2)+h(3,2)*h(3,2))   ! lattice 2
958      a(3)=DSQRT(h(1,3)*h(1,3)+h(2,3)*h(2,3)+h(3,3)*h(3,3))   ! lattice 3
959      !
960      ! grid spacing along each lattice vector
961      !
962      ha = a(1) / DBLE(nr1s)  !grid spacing in Lattice 1 direction
963      hb = a(2) / DBLE(nr2s)  !grid spacing in Lattice 2 direction
964      hc = a(3) / DBLE(nr3s)  !grid spacing in Lattice 3 direction
965      !
966      ! total number of real space grid points in the global mesh
967      ! and the corresponding volume elements for each grid point
968      !
969      nnrtot = nr1s * nr2s * nr3s
970      hcub = omega / DBLE(nnrtot) ! volume element per grid point (nnrtot is the global grid size)
971      !
972      ! the x,y,z coordinates of the center of the box (grid center)
973      ! NOTE: center of the box is set to grid point at int(nr1/2), int(nr2/2), int(nr3/2) for every cell
974      !
975      centerx = h(1,1)*DBLE(INT(nr1s/2))+h(1,2)*DBLE(INT(nr2s/2))+h(1,3)*DBLE(INT(nr3s/2))   ! r = h s
976      centery = h(2,1)*DBLE(INT(nr1s/2))+h(2,2)*DBLE(INT(nr2s/2))+h(2,3)*DBLE(INT(nr3s/2))   ! r = h s
977      centerz = h(3,1)*DBLE(INT(nr1s/2))+h(3,2)*DBLE(INT(nr2s/2))+h(3,3)*DBLE(INT(nr3s/2))   ! r = h s
978      !
979      ! inverse volume
980      !
981      sa1 = 1.0_DP/omega
982      !
983      ! Compute coeke and renormalize
984      ! This part needs to be done once in a constant-volume simulation and
985      ! at every step in variable-cell simulations ...
986      !
987      ! get ha*d/da, ha^2*d^2/da^2 stencil and cross coefficients
988      !   1. for the finite difference coefficients, we follow B. Fornberg in
989      !       Math. Comp. 51 (1988), 699-706
990      !
991      CALL fornberg(nord1, nord2,coe_1st_derv(:,1),coeke(:,1,1),coeke(:,1,2),ierr)
992      !
993      IF (ierr .ne. 0) THEN
994        WRITE(stdout,*) ' ERROR: Wrong parameter in CALL of Fornberg'
995        WRITE(stdout,*) ' STOP in exx_gs'
996        RETURN
997      END IF
998      !RENORMALIZE COEKE WITH RESPECT TO THE GRID SPACING
999      ! first derivative coefficients
1000      !
1001      coe_1st_derv(:,3) = coe_1st_derv(:,1)/hc ! d/dc stencil
1002      coe_1st_derv(:,2) = coe_1st_derv(:,1)/hb ! d/db stencil
1003      coe_1st_derv(:,1) = coe_1st_derv(:,1)/ha ! d/da stencil
1004      !
1005      ! NOTE: in the second derivatives there is an additional factor of
1006      !       -4*pi because we merge it in from the Poisson equation
1007      !
1008      !                \nabla^2 V = -4*\pi \rho
1009      !
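      ! so the stencil stored in coeke effectively applies -(1/(4*pi))*nabla^2, and the
      ! conjugate-gradient solver can work directly with  A*V = rho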
1010      ! axial derivatives
1011      !
1012      coeke(:,3,3) = -coeke(:,1,1)/(hc*hc*fpi) ! -d^2/dc^2/4pi stencil
1013      coeke(:,2,2) = -coeke(:,1,1)/(hb*hb*fpi) ! -d^2/db^2/4pi stencil
1014      coeke(:,1,1) = -coeke(:,1,1)/(ha*ha*fpi) ! -d^2/da^2/4pi stencil
1015      !
1016      ! cross derivatives
1017      !
1018      coeke(:,2,3) = -coeke(:,1,2)/(hb*hc*fpi) ! -d^2/dbdc/4pi stencil
1019      coeke(:,1,3) = -coeke(:,1,2)/(ha*hc*fpi) ! -d^2/dadc/4pi stencil
1020      coeke(:,1,2) = -coeke(:,1,2)/(ha*hb*fpi) ! -d^2/dadb/4pi stencil
1021      !
1022      ! -- Jacobian for the general (non-orthogonal) cell --
1023      ! please see the following reference for details <todo: EXX paper>
1024      !
1025      ! J = transpose(ainv).(diag(a))
1026      !
1027      Jim(:,1) = ainv(1,:)*a(1)
1028      Jim(:,2) = ainv(2,:)*a(2) ! i={xyz}, m={abc}
1029      Jim(:,3) = ainv(3,:)*a(3)
1030      !
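      ! With r = h s and grid spacings h_m = a_m/nr_m, the chain rule gives
      !     d/dx_i = sum_m Jim(i,m) * (1/h_m) * d/du_m ,
      ! where u_m counts grid points along lattice vector m. Squaring and summing over i,
      !     nabla^2 = sum_{m,n} [ sum_i Jim(i,m)*Jim(i,n) ] * (1/(h_m*h_n)) * d^2/(du_m du_n) ,
      ! which is exactly the weighting applied to coeke below; the mixed (m /= n) terms
      ! pick up a factor of 2 because each unordered pair appears twice in the double sum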
1031      ! -- weigh coeke with the Jacobian --
1032      !
1033      ! axial derivatives
1034      !
1035      coeke(:,3,3) = (Jim(1,3)**2+Jim(2,3)**2+Jim(3,3)**2)*coeke(:,3,3)
1036      coeke(:,2,2) = (Jim(1,2)**2+Jim(2,2)**2+Jim(3,2)**2)*coeke(:,2,2)
1037      coeke(:,1,1) = (Jim(1,1)**2+Jim(2,1)**2+Jim(3,1)**2)*coeke(:,1,1)
1038      !
1039      ! cross derivatives (needed for non-orthogonal grids in the second derivatives)
1040      !
1041      coeke(:,2,3) = 2.0_DP*(Jim(1,2)*Jim(1,3)+Jim(2,2)*Jim(2,3)+Jim(3,2)*Jim(3,3))*coeke(:,2,3)
1042      coeke(:,1,3) = 2.0_DP*(Jim(1,1)*Jim(1,3)+Jim(2,1)*Jim(2,3)+Jim(3,1)*Jim(3,3))*coeke(:,1,3)
1043      coeke(:,1,2) = 2.0_DP*(Jim(1,1)*Jim(1,2)+Jim(2,1)*Jim(2,2)+Jim(3,1)*Jim(3,2))*coeke(:,1,2)
1044      coeke(:,3,2) = coeke(:,2,3) ! symmetry of coeke
1045      coeke(:,2,1) = coeke(:,1,2) ! symmetry of coeke
1046      coeke(:,3,1) = coeke(:,1,3) ! symmetry of coeke
1047      !
1048      ! a small check on the shape of the user-defined cell (if any)
1049      !
1050      IF ((ibrav.EQ.0).AND.(nfi.EQ.1)) THEN
1051        WRITE(stdout,*) 'EXX info: If you are using an orthogonal cell without its cell vectors&
1052          & aligned to the xyz directions, the EXX calculation may be up to twice as expensive.'
1053      END IF
1054      RETURN
1055    END SUBROUTINE exx_gs_setup_common
1056
1057    subroutine  exx_gs_setup_cube()
1058      implicit none
1059      psgsn=1
1060      ! treating one grid cell as a unit cell, these are the lattice vectors of the grid
1061      ha_proj(:) = ha*h(:,1)/a(1)
1062      hb_proj(:) = hb*h(:,2)/a(2)
1063      hc_proj(:) = hc*h(:,3)/a(3)
1064      inv_omega = 1.0_DP/omega
1065      s_me_r1=s_me_r(1)
1066      s_me_r2=s_me_r(2)
1067      s_me_r3=s_me_r(3)
1068      s_me_r4=s_me_r(4)
1069      s_me_r5=s_me_r(5)
1070      s_me_r6=s_me_r(6)
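      !
      ! precompute, for every grid point of the self multipole-expansion domain, the
      ! elementary quantities from which the multipole expansion is assembled:
      !   me_cs(1:3,i,j,k) : Cartesian offset of the grid point from the grid center
      !   me_rs(l,i,j,k)   : r**l                      (l = 0 .. lmax)
      !   me_ri(l,i,j,k)   : 1/r**l                    (l = 0 .. lmax+1; zeroed for l>=1 when r ~ 0)
      !   me_rc(m,i,j,k)   : exp(i*m*phi) = ((x+i*y)/sqrt(x**2+y**2))**m   (m = 0 .. lmax)
      !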
1071      DO k = s_me_r(3),s_me_r(6)
1072        DO j = s_me_r(2),s_me_r(5)
1073          DO i = s_me_r(1),s_me_r(4)
1074            !---------------------------------------------------------------------------------------
1075            dqs1 = (DBLE(i)/DBLE(nr1s)) - DBLE(INT(nr1s/2))/DBLE(nr1s)
1076            dqs2 = (DBLE(j)/DBLE(nr2s)) - DBLE(INT(nr2s/2))/DBLE(nr2s)
1077            dqs3 = (DBLE(k)/DBLE(nr3s)) - DBLE(INT(nr3s/2))/DBLE(nr3s)
1078            !
1079            ! Here we compute the distance between each grid point and the center of the
1080            ! simulation cell in R space; no minimum-image convention (MIC) is needed ...
1081            !
1082            dq1=h(1,1)*dqs1+h(1,2)*dqs2+h(1,3)*dqs3   !r_i = h s_i
1083            dq2=h(2,1)*dqs1+h(2,2)*dqs2+h(2,3)*dqs3   !r_i = h s_i
1084            dq3=h(3,1)*dqs1+h(3,2)*dqs2+h(3,3)*dqs3   !r_i = h s_i
1085            !
1086            dist = DSQRT(dq1*dq1+dq2*dq2+dq3*dq3)
1087            !-------------------------------------------------
1088            me_cs(1,i,j,k)=dq1
1089            me_cs(2,i,j,k)=dq2
1090            me_cs(3,i,j,k)=dq3
1091            !-------------------------------------------------
1092            me_rs(0,i,j,k)=1.0_DP
1093            me_rs(1,i,j,k)=dist
1094            !-------------------------------------------------
1095            me_ri(0,i,j,k)=1.0_DP
1096            !
1097            IF(dist.GE.1.0E-10) THEN
1098              me_ri(1,i,j,k)=1.0_DP/dist
1099            ELSE
1100              me_ri(1,i,j,k)=0.0_DP ! JJ: this seems wrong, but is correct
1101            END IF
1102            !-------------------------------------------------
1103            DO l=2,lmax
1104              me_rs(l,i,j,k)=me_rs(l-1,i,j,k)*me_rs(1,i,j,k)
1105            END DO
1106            !-------------------------------------------------
1107            DO l=2,lmax+1
1108              me_ri(l,i,j,k)=me_ri(l-1,i,j,k)*me_ri(1,i,j,k)
1109            END DO
1110            !-------------------------------------------------
1111            IF( (i.GE.s_me_r1).AND.(i.LE.s_me_r4).AND. &
1112                (j.GE.s_me_r2).AND.(j.LE.s_me_r5).AND. &
1113                (k.GE.s_me_r3).AND.(k.LE.s_me_r6) ) THEN
1114              !-------------------------------------------------
1115              dxy      = DSQRT(dq1*dq1+dq2*dq2)
1116              !-------------------------------------------------
1117              me_rc(0,i,j,k)     =1.0_DP
1118              me_rc(1:lmax,i,j,k)=0.0_DP
1119              !-------------------------------------------------
1120              IF (dxy .GT. 1.0E-10) THEN
1121                !-----------------------------------------------
1122                cxy = CMPLX(dq1,dq2,KIND=DP)/dxy ! KIND=DP keeps the phase factor in double precision
1123                !-----------------------------------------------
1124                DO m=1,lmax
1125                  me_rc(m,i,j,k)=me_rc(m-1,i,j,k)*cxy
1126                END DO
1127                !-----------------------------------------------
1128              END IF
1129              !-------------------------------------------------
1130            END IF
1131            !-------------------------------------------------
1132          END DO
1133        END DO
1134      END DO
1135      !---------------------------------------------------------------------------------------------
1136      return
1137    end subroutine exx_gs_setup_cube
1138
1139    subroutine  exx_gs_setup_sphere()
1140      implicit none
1141      !========================================================================
1142      ! Compute distances between grid points and the center of the simulation cell in R space.
1143      ! This part needs to be done once in a constant-volume simulation and
1144      ! at every step in variable-cell simulations ...
1145      !
1146      !========================================================================
1147      DO i=1,np_in_sp_me_s
1148        xx_in_sp(i)=h(1,1)*sc_xx_in_sp(i)+h(1,2)*sc_yy_in_sp(i)+h(1,3)*sc_zz_in_sp(i)   ! r = h s
1149        yy_in_sp(i)=h(2,1)*sc_xx_in_sp(i)+h(2,2)*sc_yy_in_sp(i)+h(2,3)*sc_zz_in_sp(i)   ! r = h s
1150        zz_in_sp(i)=h(3,1)*sc_xx_in_sp(i)+h(3,2)*sc_yy_in_sp(i)+h(3,3)*sc_zz_in_sp(i)   ! r = h s
1151      END DO
1152      return
1153    end subroutine exx_gs_setup_sphere
1154
1155    subroutine  solve_a_nonself_pair_cube()
1156      implicit none
1157      call generate_pe_guess_cube
1158      call start_clock('exx_grid_trans')
1159      CALL getmiddlewc(wannierc(1,gindex_of_iobtl),wannierc(1,my_var2), h, ainv, middle )
1160      !
1161      ! calculate translation vector from the center of the box
1162      CALL getsftv( nr1s, nr2s, nr3s, h, ainv, middle, tran)
1163      call stop_clock('exx_grid_trans')
1164      !
1165      ! get the localized psi around the mid point of two wannier centers
1166      ! note: the psime is centered at the center of the box
1167      ! (using the translation vector "tran" from middle of wfc to the center of box)
1168      call start_clock('exx_psicb')
1169      CALL getpsicb( nrg, p_me_r, psi(1,iobtl), psime(1), tran)
1170#if ! defined(__MPI)
1171      ! does not need communication so construct it here; my_var2 is simultaneously the global and local index (serial case)
1172      CALL getpsicb( nrg, p_me_r, psi(1,my_var2), psime_pair_recv(1, j, iobtl), tran)
1173#endif
1174      call stop_clock('exx_psicb')
1175      !
1176      ! the localized density rhome
1177      call start_clock('exx_getrhol')
1178      CALL getrhol_cube(p_me_r, p_ps_r, psime(1), psime_pair_recv(1, j, iobtl), rhome, rhops, inv_omega)
1179      call stop_clock('exx_getrhol')
1180      !
1181      ! calculate the exx potential from the pair density by solving Poisson
1182      !
1183      !--------------------------------------------------------------------------------------
1184      call start_clock('exx_vofr')
1185      CALL getvofr_cube( p_me_r, p_ps_r, max(n_p_me,n_s_me), max(n_p_ps,n_s_ps), hcub, &
1186        rhops, potme, pair_status(pos, iobtl), psgsn, pairrho(:,:,pos,iobtl), &
1187        pairv(:,:,pos,iobtl), cgstep)
1188      call stop_clock('exx_vofr')
1189      !--------------------------------------------------------------------------------------
1190      !
1191      !--------------------------------------------------------------------------------------
1192      ! write cgsteps in the suffix.ncg (unit=44)
1193      !--------------------------------------------------------------------------------------
1194      IF ((MOD(nfi,iprint_stdout).EQ.0)) THEN
1195        IF (ionode) THEN ! the ionode restriction could be dropped if one wants more information
1196          WRITE(iunit,'(3X,"(i,j,cgsteps)",3I6)') gindex_of_iobtl, my_var2, cgstep
1197        END IF
1198      END IF
1199      !--------------------------------------------------------------------------------------
1200      !
1201      !--------------------------------------------------------------------------------------
1202      ! update force and energy
1203      !--------------------------------------------------------------------------------------
1204      call start_clock('exx_force_loc')
1205      CALL updateforce_loc(nrg, p_me_r, vpsil(:,iobtl), potme, psime, psime_pair_recv(1,j,iobtl),tran)
1206#if ! defined(__MPI)
1207      ! does not need communication so construct it here; my_var2 is simultaneously the global and local index (serial case)
1208      CALL updateforce_loc(nrg, p_me_r, vpsil(:,my_var2), potme, psime_pair_recv(1,j,iobtl), psime, tran)
1209#endif
1210      call stop_clock('exx_force_loc')
1211      !
1212      call start_clock('exx_penergy')
1213      CALL vvprod_cube(p_me_r, rhome, potme, paire(j))    ! dot product of the rho and potme
1214      call stop_clock('exx_penergy')
1215      return
1216    end subroutine solve_a_nonself_pair_cube
1217
1218    subroutine  solve_a_nonself_pair_sphere()
1219      implicit none
1220      call start_clock('exx_grid_trans')
1221      CALL getmiddlewc(wannierc(1,gindex_of_iobtl),wannierc(1,my_var2), h, ainv, middle )
1222      ! d_pair is used in the extrapolation scheme (sphere only)
1223      CALL get_pair_dist(wannierc(1,gindex_of_iobtl),wannierc(1,my_var2),d_pair)
1224      !
1225      ! calculate translation vector from the center of the box
1226      CALL getsftv( nr1s, nr2s, nr3s, h, ainv, middle, tran)
1227      call stop_clock('exx_grid_trans')
1228      !
1229      ! get the localized psi around the mid point of two wannier centers
1230      ! note: the psil is centered at the center of the box
1231      ! (using the translation vector "tran" from middle of wfc to the center of box)
1232      ALLOCATE ( psil(np_in_sp_me_p) ); psil=0.0_DP ! HK/MCA: (TODO) the allocation could be done in a reusable fashion
1233      CALL getpsil( nnrtot, np_in_sp_me_p, psi(1, iobtl), psil(1), tran)
1234#if ! defined(__MPI)
1235      ! does not need communication so construct it here; my_var2 is simultaneously the global and local index (serial case)
1236      CALL getpsil( nnrtot, np_in_sp_me_p, psi(1, my_var2), psime_pair_recv(1, j, iobtl), tran)
1237#endif
1238      !
1239      ! the localized density rhol
1240      ! HK/MCA: (TODO) need to make these array allocations reusable
1241      ALLOCATE ( rhol(np_in_sp_me_p) ); rhol=0.0_DP
1242      ALLOCATE ( rho_in_sp(np_in_sp_p) ); rho_in_sp=0.0_DP
1243      CALL getrhol_sphere( np_in_sp_me_p, np_in_sp_p, psil(1), psime_pair_recv(1, j, iobtl), rhol, rho_in_sp, tran, sa1)
1244      !
1245      ! calculate the exx potential from the pair density by solving Poisson
1246      !
1249      ALLOCATE ( vl(np_in_sp_me_p) ); vl=0.0_DP ! compute potential (vl) in ME sphere
1250      CALL start_clock('getvofr')
1251      ! HK/MCA: d_pair is used in the extrapolation scheme (check if still working) see also ``call get_pair_dist''
1252      CALL getvofr_sphere( np_in_sp_me_p, np_in_sp_p, hcub, rho_in_sp, vl,&
1253        pairv(1,1,j,iobtl), pairv(1,2,j,iobtl), pairv(1,3,j,iobtl),&
1254        .FALSE., d_pair, pair_dist(1,j,iobtl), pair_dist(2,j,iobtl),&
1255        pair_dist(3,j,iobtl),cgstep)
1256      CALL stop_clock('getvofr')
1257      !
1258      ! write cgsteps in the suffix.ncg (unit=44)
1259      !
1260      IF ((MOD(nfi,iprint_stdout).EQ.0)) THEN
1261        !
1262        IF (ionode) THEN ! the ionode restriction could be dropped if one wants more information
1263          !
1264          WRITE(iunit,'(3X,"(i,j,cgsteps)",3I6)') gindex_of_iobtl, my_var2, cgstep
1265          !
1266        END IF
1267        !
1268      END IF
1269      !
1270      ! update vpsil in the global grid (exxalfa is 0.25 for PBE0)
1271      !$omp parallel do private(ir)
1272      DO ip = 1, np_in_sp_me_p
1273        CALL l2goff (ip,ir,tran) ! local is centered at box center; global index is offset by tran
1274        vpsil(ir,iobtl) = vpsil(ir,iobtl) - exxalfa*vl(ip)*psime_pair_recv(ip,j,iobtl) ! to remain
1275#if defined(__MPI)
1276        psime_pair_recv(ip,j,iobtl) =            - exxalfa*vl(ip)*psil(ip)             ! to be sent
1277#else
1278        ! does not need communication so construct it here; my_var2 is simultaneously the global and local index (serial case)
1279        vpsil(ir,my_var2) = vpsil(ir,my_var2)    - exxalfa*vl(ip)*psil(ip)
1280#endif
1281      END DO
1282      !$omp end parallel do
1283      !
1284      CALL vvprod_sphere(np_in_sp_me_p, rhol, vl, paire(j)) ! dot product of the rho and vl !HK (todo): do we need to do PS+ME ?? rho_in_sp may be enough
1285      return
1286    end subroutine solve_a_nonself_pair_sphere
1287
1288    subroutine  generate_pe_guess_cube()
1289      implicit none
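      !
      ! pick a slot among the neigh guess buffers of orbital iobtl for the current
      ! pair partner my_var2: reuse the slot already labeled with this partner if
      ! present, otherwise take an empty slot or evict the one with the oldest
      ! pair_step; pair_status counts how many consecutive EXX steps the slot has
      ! been refreshed, which is used when deciding whether the stored solution
      ! can serve as an initial guess for the Poisson solver
      !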
      guess_status = 0        ! no guess stored for this pair yet
1291      pos = 0
1292      oldest_step = 100000000
1293      DO itr = 1, neigh
1294        IF ( pair_label(itr, iobtl) .EQ. 0 ) THEN
1295          pos = itr
1296          EXIT
1297        ELSE IF ( pair_label(itr, iobtl) .EQ. my_var2 ) THEN
1298          guess_status = 1
1299          pos = itr
1300          EXIT
1301        ELSE IF ( pair_step(itr, iobtl) < oldest_step ) THEN
1302          pos = itr
1303          oldest_step = pair_step(itr, iobtl)
1304        END IF
1305      END DO
1306      !
1307      IF (guess_status .EQ. 1) THEN
1308        IF (pair_step(pos, iobtl) .EQ. n_exx - 1) THEN
1309          pair_status(pos, iobtl) = pair_status(pos, iobtl) + 1
1310        ELSE
1311          pair_status(pos, iobtl) = 1
1312        END IF
1313      ELSE
1314        pair_status(pos, iobtl) = 0
1315      END IF
1316      !
1317      pair_label(pos, iobtl) = my_var2
1318      pair_step(pos, iobtl)  = n_exx
1319      return
1320    end subroutine generate_pe_guess_cube
1321
1322    subroutine  solve_a_self_pair_cube()
1323      implicit none
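      !
      ! self (i == j) exchange in the cubic domain: translate the orbital to the
      ! center of the box, form the self density, solve the Poisson equation for
      ! its potential, then accumulate the force (vpsil) and self energy selfe
      !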
1324      ! calculate translation vector from the center of the box
1325      CALL getsftv(nr1s, nr2s, nr3s, h, ainv, wannierc(1, gindex_of_iobtl), tran)
1326      !
1327      ! get the localized psi around the wannier centers
1328      ! note: the psime is centered at the center of the box
1329      ! (using the translation vector "tran" from the wfc to the center of box)
1330      !
1331      CALL getpsicb( nrg, s_me_r, psi(1,iobtl), psime(1), tran)
1332      ! get the localized density rhome
1333      CALL getrhol_cube(s_me_r, s_ps_r, psime(1), psime(1), rhome, rhops, inv_omega)
1334      !
1335      ! calculate the exx potential from the pair density by solving Poisson
1336      !
1337      !--------------------------------------------------------------------------------------
1338      CALL start_clock('getvofr')
1339      !
1340      CALL getvofr_cube( s_me_r, s_ps_r, n_s_me, n_s_ps, hcub, rhops, potme, n_exx-1, psgsn, &
1341        selfrho(:,:,iobtl), selfv(:,:,iobtl), cgstep)
1342      !
1343      CALL stop_clock('getvofr')
1344      !--------------------------------------------------------------------------------------
1345      !
1346      !--------------------------------------------------------------------------------------
1347      ! write cgsteps in the suffix.ncg (unit=44)
1348      !--------------------------------------------------------------------------------------
1349      IF ((MOD(nfi,iprint_stdout).EQ.0)) THEN
        IF (ionode) THEN ! the ionode restriction could be dropped if per-task output is wanted
1351          WRITE(44,'(3X,"(i,i,cgsteps)",3I6)') gindex_of_iobtl, gindex_of_iobtl, cgstep
1352          CALL printout_base_close("ncg")
1353        END IF
1354      END IF
1355      !--------------------------------------------------------------------------------------
1356      !
1357      !--------------------------------------------------------------------------------------
1358      ! update force and energy
1359      !--------------------------------------------------------------------------------------
1360      !
1361      CALL updateforce_slf(nrg, s_me_r, vpsil(1,iobtl), potme, psime, tran)
1362      CALL vvprod_cube(s_me_r, rhome, potme, selfe)    ! dot product of the rho and potme
1363      return
1364    end subroutine solve_a_self_pair_cube
1365
1366    subroutine  solve_a_self_pair_sphere()
1367      implicit none
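      !
      ! self (i == j) exchange in the spherical domain: same flow as the cubic
      ! version, but with the localized orbital, density, and potential defined
      ! on the ME/PS spheres
      !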
1368      ! calculate translation vector from the center of the box
1369      CALL getsftv(nr1s, nr2s, nr3s, h, ainv, wannierc(1, gindex_of_iobtl), tran)
1370      !
1371      ! get the localized psi around the wannier centers
1372      ! note: the psil is centered at the center of the box
1373      ! (using the translation vector "tran" from the wfc to the center of box)
1374      ALLOCATE ( psil(np_in_sp_me_s) ); psil=0.0_DP
1375      CALL getpsil( nnrtot, np_in_sp_me_s, psi(1, iobtl), psil(1), tran)
1376      !
1377      ! get the localized density rhol
1378      ALLOCATE ( rhol(np_in_sp_me_s) ); rhol=0.0_DP
1379      ALLOCATE ( rho_in_sp(np_in_sp_s) ); rho_in_sp=0.0_DP
1380      CALL getrhol_sphere( np_in_sp_me_s, np_in_sp_s, psil(1), psil(1), rhol, rho_in_sp, tran, sa1)
1381      !
1382      ! calculate the exx potential from the pair density by solving Poisson
1383      !
1384      ALLOCATE ( vl(np_in_sp_me_s) ); vl=0.0_DP ! compute potential (vl) in ME sphere
1385      CALL start_clock('getvofr')
1386      CALL getvofr_sphere( np_in_sp_me_s,np_in_sp_s,&
1387        hcub, rho_in_sp, vl, selfv(1,1,iobtl), selfv(1,2,iobtl),&
        selfv(1,3,iobtl), .TRUE., 0.0_DP, 0.0_DP, 0.0_DP, 0.0_DP, cgstep)
1389      !
1390      CALL stop_clock('getvofr')
1391      !
1392      ! write cgsteps in the suffix.ncg (unit=44)
1393      !
1394      IF ((MOD(nfi,iprint_stdout).EQ.0)) THEN
1395        !
        IF (ionode) THEN ! the ionode restriction could be dropped if per-task output is wanted
1397          !
1398          WRITE(44,'(3X,"(i,i,cgsteps)",3I6)') gindex_of_iobtl, gindex_of_iobtl, cgstep
1399          !
1400          CALL printout_base_close("ncg")
1401          !
1402        END IF
1403        !
1404      END IF
1405      !
1406      ! update vpsil in the global grid (exxalfa is 0.25 for PBE0)
1407      !$omp parallel do private(ir)
1408      DO ip = 1, np_in_sp_me_s
1409        CALL l2goff (ip,ir,tran) ! local is centered at box center; global index is offset by tran
1410        vpsil(ir,iobtl) = vpsil(ir,iobtl) - exxalfa*vl(ip)*psil(ip) ! PBE0
1411      END DO
1412      !$omp end parallel do
1413      !
1414      ! compute exchange energy in ME sphere
1415      CALL vvprod_sphere(np_in_sp_me_s, rhol, vl, selfe) ! dot product of the rho and vl !HK (todo): do we need to do PS+ME ?? rho_in_sp may be enough
1416      return
1417    end subroutine solve_a_self_pair_sphere
1418
1419END SUBROUTINE exx_gs
1420!====================================================================================
1421
1422!==============================================================================
1423SUBROUTINE getsftv(nr1s, nr2s, nr3s, h, ainv, wc, tran)
1424    !
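    ! Compute the integer translation vector tran (in grid points along each
    ! lattice direction) that shifts the point wc to the center of the dense
    ! real-space grid (nr1s x nr2s x nr3s).
    !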
1425    USE kinds, ONLY  : DP
1426    !
1427    IMPLICIT NONE
1428    !
1429    INTEGER  nr1s, nr2s, nr3s, tran(3)
1430    REAL(DP) wc(3), wcs(3), h(3,3), ainv(3,3)
1431    !
1432    INTEGER  i, bcm(3), wcm(3)
1433    !
1434    bcm(1) = INT(nr1s/2)
1435    bcm(2) = INT(nr2s/2)
1436    bcm(3) = INT(nr3s/2)
1437    !
    wcs(1)=ainv(1,1)*wc(1)+ainv(1,2)*wc(2)+ainv(1,3)*wc(3)   ! s = h^-1 r (crystal coordinates)
    wcs(2)=ainv(2,1)*wc(1)+ainv(2,2)*wc(2)+ainv(2,3)*wc(3)   ! s = h^-1 r (crystal coordinates)
    wcs(3)=ainv(3,1)*wc(1)+ainv(3,2)*wc(2)+ainv(3,3)*wc(3)   ! s = h^-1 r (crystal coordinates)
1441    !
1442    ! we map to nearest grid point along each lattice vector
1443    !
1444    wcm(1) = IDNINT( wcs(1)*DBLE(nr1s) )
1445    wcm(2) = IDNINT( wcs(2)*DBLE(nr2s) )
1446    wcm(3) = IDNINT( wcs(3)*DBLE(nr3s) )
1447    !
1448    DO i = 1, 3
1449      tran(i) = bcm(i) - wcm(i)
1450    ENDDO
1451    !
1452    RETURN
1453END SUBROUTINE getsftv
1454!==============================================================================
1455
1456!==============================================================================
1457SUBROUTINE getmiddlewc(wc1, wc2, h, ainv, mid)
1458    !
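    ! Compute the midpoint mid of the two Wannier centers wc1 and wc2, with the
    ! separation taken in the minimum image convention so that the nearest
    ! periodic image of wc2 with respect to wc1 is used.
    !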
1459    USE kinds, ONLY  : DP
1460    !
1461    IMPLICIT NONE
1462    !
1463    REAL(DP)  wc1(3), wc2(3), mid(3)
1464    REAL(DP)  dij(3), dij2(3), h(3,3), ainv(3,3), tmp(3)
1465    !
1466    INTEGER   i
1467    !
1468    dij(1)=wc2(1)-wc1(1)   ! r_ij = r_i - r_j
1469    dij(2)=wc2(2)-wc1(2)   ! r_ij = r_i - r_j
1470    dij(3)=wc2(3)-wc1(3)   ! r_ij = r_i - r_j
1471    !
1472    dij2(1)=ainv(1,1)*dij(1)+ainv(1,2)*dij(2)+ainv(1,3)*dij(3)   ! s_ij = h^-1 r_ij
1473    dij2(2)=ainv(2,1)*dij(1)+ainv(2,2)*dij(2)+ainv(2,3)*dij(3)   ! s_ij = h^-1 r_ij
1474    dij2(3)=ainv(3,1)*dij(1)+ainv(3,2)*dij(2)+ainv(3,3)*dij(3)   ! s_ij = h^-1 r_ij
1475    !
1476    dij2(1)=dij2(1)-IDNINT(dij2(1))   ! impose MIC on s_ij in range: [-0.5,+0.5]
1477    dij2(2)=dij2(2)-IDNINT(dij2(2))   ! impose MIC on s_ij in range: [-0.5,+0.5]
1478    dij2(3)=dij2(3)-IDNINT(dij2(3))   ! impose MIC on s_ij in range: [-0.5,+0.5]
1479    !
1480    dij(1)=h(1,1)*dij2(1)+h(1,2)*dij2(2)+h(1,3)*dij2(3)   ! r_ij = h s_ij (MIC)
1481    dij(2)=h(2,1)*dij2(1)+h(2,2)*dij2(2)+h(2,3)*dij2(3)   ! r_ij = h s_ij (MIC)
1482    dij(3)=h(3,1)*dij2(1)+h(3,2)*dij2(2)+h(3,3)*dij2(3)   ! r_ij = h s_ij (MIC)
1483    !
1484    mid(1) = wc1(1) + 0.5_DP*dij(1)
1485    mid(2) = wc1(2) + 0.5_DP*dij(2)
1486    mid(3) = wc1(3) + 0.5_DP*dij(3)
1487    !
1488    RETURN
1489END SUBROUTINE getmiddlewc
1490!==============================================================================
1491
1492!==============================================================================
1493SUBROUTINE get_pair_dist (wc1, wc2, P_dist)
1494    !
1495    USE kinds,                   ONLY  : DP
1496    USE cell_base,               ONLY  : omega, ainv, h
1497    !
1498    IMPLICIT NONE
    ! P_dist is the pair distance within the minimum image convention
1500    REAL(DP)  wc1(3), wc2(3), P_dist
1501    REAL(DP)  dist_vec_in_r(3), dist_vec_in_s(3)
1502    !
1503    INTEGER i
1504    !
1505    P_dist = 0.D0
1506    ! dist vector
1507    DO i = 1,3
1508      dist_vec_in_r(i) = wc1(i) - wc2(i)
1509    END DO
1510    !
1511    dist_vec_in_s(1)=ainv(1,1)*dist_vec_in_r(1)+ainv(1,2)*dist_vec_in_r(2)+ainv(1,3)*dist_vec_in_r(3)   ! s = h^-1 r
1512    dist_vec_in_s(2)=ainv(2,1)*dist_vec_in_r(1)+ainv(2,2)*dist_vec_in_r(2)+ainv(2,3)*dist_vec_in_r(3)   ! s = h^-1 r
1513    dist_vec_in_s(3)=ainv(3,1)*dist_vec_in_r(1)+ainv(3,2)*dist_vec_in_r(2)+ainv(3,3)*dist_vec_in_r(3)   ! s = h^-1 r
1514    !
1515    DO i = 1,3
1516      dist_vec_in_s(i)=dist_vec_in_s(i)-IDNINT(dist_vec_in_s(i))
1517    END DO
1518    !
1519    dist_vec_in_r(1)=h(1,1)*dist_vec_in_s(1)+h(1,2)*dist_vec_in_s(2)+h(1,3)*dist_vec_in_s(3)   ! r = h s
1520    dist_vec_in_r(2)=h(2,1)*dist_vec_in_s(1)+h(2,2)*dist_vec_in_s(2)+h(2,3)*dist_vec_in_s(3)   ! r = h s
1521    dist_vec_in_r(3)=h(3,1)*dist_vec_in_s(1)+h(3,2)*dist_vec_in_s(2)+h(3,3)*dist_vec_in_s(3)   ! r = h s
1522    !
1523    DO i = 1, 3
1524      P_dist = P_dist + dist_vec_in_r(i)*dist_vec_in_r(i)
1525    END DO
1526    !
1527    P_dist = DSQRT(P_dist)
1528    !
1529    RETURN
1530END SUBROUTINE get_pair_dist
1531!==============================================================================
1532
1533!==============================================================================
1534SUBROUTINE vvprod_sphere(n, v1, v2, prod)
1535    !
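    ! OpenMP-reduced dot product of v1 and v2 over n grid points; no grid-volume
    ! factor is applied here.
    !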
1536    USE kinds, ONLY  : DP
1537    !
1538    IMPLICIT NONE
1539    !
1540    INTEGER  n
1541    REAL(DP) prod, v1(n), v2(n), vp
1542    !
1543    INTEGER  i
1544    !
1545    prod = 0.0_DP
1546    vp = 0.0_DP
1547    !
1548    !$omp parallel do reduction(+:vp)
1549    DO i = 1, n
1550      vp = vp + v1(i) * v2(i)
1551    END DO
1552    !$omp end parallel do
1553    !
1554    prod = vp
1555    !
1556    RETURN
1557END SUBROUTINE vvprod_sphere
1558!==============================================================================
1559
1560!==============================================================================
1561SUBROUTINE vvprod_cube(me_r, v1, v2, prod)
1562    !
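    ! Same dot product as vvprod_sphere, for arrays defined on the cubic
    ! subdomain whose index bounds are stored in me_r(1:6).
    !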
1563    USE kinds, ONLY  : DP
1564    !
1565    IMPLICIT NONE
1566    !
1567    !----------------------------------------------------------------
1568    INTEGER      :: me_r(6)
1569    REAL(DP)     :: v1(me_r(1):me_r(4),me_r(2):me_r(5),me_r(3):me_r(6))
1570    REAL(DP)     :: v2(me_r(1):me_r(4),me_r(2):me_r(5),me_r(3):me_r(6))
1571    REAL(DP)     :: prod
1572    !----------------------------------------------------------------
1573    INTEGER      :: i,j,k
1574    REAL(DP)     :: prodp
1575    !----------------------------------------------------------------
1576    !
1577    prodp=0.0D0
1578    !
1579    ! WRITE(*,*) "vvprod"
1580    !
1581    !$omp parallel do private(i,j,k) reduction(+:prodp)
1582    DO k=me_r(3),me_r(6)
1583      DO j=me_r(2),me_r(5)
1584        DO i=me_r(1),me_r(4)
1585          !----------------------------------------------------------
1586          prodp = prodp + v1(i,j,k)*v2(i,j,k)
1587          !----------------------------------------------------------
1588          ! WRITE(*,"(I4,I4,I4,F15.11,F15.11,F15.11)") i,j,k,v1(i,j,k),v2(i,j,k),v1(i,j,k)*v2(i,j,k)
1589          !----------------------------------------------------------
1590        END DO
1591      END DO
1592    END DO
1593    !----------------------------------------------------------------
1594    !$omp end parallel do
1595    !
1596    prod = prodp
1597    !
1598    RETURN
1599END SUBROUTINE vvprod_cube
1600!==============================================================================
1601
1602!==============================================================================
1603SUBROUTINE getpsil( ntot, np_in_sp_me, psi, psi2, tran)
1604    !
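    ! Gather the wavefunction psi from the global real-space grid (ntot points)
    ! into the local spherical ordering psi2 (np_in_sp_me points), using l2goff
    ! to map each local index onto its translated global index.
    !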
1605    USE kinds, ONLY  : DP
1606    !
1607    IMPLICIT NONE
1608    !
1609    INTEGER  ntot, tran(3), np_in_sp_me
1610    REAL(DP) psi(ntot), psi2(np_in_sp_me)
1611    !
1612    INTEGER  ir, ip, i, j, k, ii, jj, kk
1613    !
1614    !$omp parallel do private(ir)
1615    DO ip = 1, np_in_sp_me
1616      CALL l2goff (ip,ir,tran)
1617      psi2(ip) = psi(ir)
1618    END DO
1619    !$omp end parallel do
1620    !
1621    RETURN
1622END SUBROUTINE getpsil
1623!==============================================================================
1624
1625!==============================================================================
1626SUBROUTINE getrhol_cube(me_r, ps_r, psi1, psi2, rhome, rhops, inv_omega)
1627    !
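    ! Build the (pair) density psi1*psi2*inv_omega on both the inner ps_r and
    ! the outer me_r cubic subdomains (rhops and rhome, respectively).
    !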
1628    USE kinds, ONLY  : DP
1629    !
1630    IMPLICIT NONE
1631    !
1632    INTEGER  me_r(6), ps_r(6)
1633    REAL(DP) psi1 (me_r(1):me_r(4),me_r(2):me_r(5),me_r(3):me_r(6))
1634    REAL(DP) psi2 (me_r(1):me_r(4),me_r(2):me_r(5),me_r(3):me_r(6))
1635    REAL(DP) rhome(me_r(1):me_r(4),me_r(2):me_r(5),me_r(3):me_r(6))
1636    REAL(DP) rhops(ps_r(1):ps_r(4),ps_r(2):ps_r(5),ps_r(3):ps_r(6))
1637    REAL(DP) inv_omega
1638    !
1639    INTEGER  i, j, k
1640    !
1641    !$omp parallel do private(i,j,k)
1642    DO k=ps_r(3),ps_r(6)
1643      DO j=ps_r(2),ps_r(5)
1644        DO i=ps_r(1),ps_r(4)
1645          rhops(i,j,k) = psi1(i,j,k) * psi2(i,j,k) * inv_omega
1646        END DO
1647      END DO
1648    END DO
1649    !$omp end parallel do
1650    !--------------------------------------------------------------------------
1651    !
1652    !--------------------------------------------------------------------------
1653    !$omp parallel do private(i,j,k)
1654    DO k=me_r(3),me_r(6)
1655      DO j=me_r(2),me_r(5)
1656        DO i=me_r(1),me_r(4)
1657          rhome(i,j,k) = psi1(i,j,k)*psi2(i,j,k) * inv_omega
1658        END DO
1659      END DO
1660    END DO
1661    !$omp end parallel do
1662    !--------------------------------------------------------------------------
1663    !
1664    RETURN
1665END SUBROUTINE getrhol_cube
1666!==============================================================================
1667
1668!==============================================================================
1669SUBROUTINE getrhol_sphere( np_in_sp_me, np_in_sp, psi, psi2, rho, rho_in_sp, tran, sa1)
1670    !
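    ! Build the (pair) density psi*psi2*sa1 on the larger ME sphere (rho) and
    ! copy the first np_in_sp points, i.e. the inner sphere, into rho_in_sp.
    !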
1671    USE kinds, ONLY  : DP
1672    !
1673    IMPLICIT NONE
1674    !
1675    INTEGER  np_in_sp_me, tran(3),np_in_sp
1676    REAL(DP) psi(np_in_sp_me), psi2(np_in_sp_me), rho(np_in_sp_me),sa1, rho_in_sp(np_in_sp)
1677    !
1678    INTEGER  ir, ip, i, j, k, ii, jj, kk
1679    rho_in_sp(:) = 0.D0
1680    !
1681    !$omp parallel do
1682    DO ip = 1, np_in_sp_me
1683      rho(ip) = psi(ip) * psi2(ip) * sa1
1684      IF( ip.LE.np_in_sp ) THEN
1685        rho_in_sp( ip ) = rho(ip)
1686      END IF
1687    ENDDO
1688    !$omp end parallel do
1689    !
1690    RETURN
1691END SUBROUTINE getrhol_sphere
1692!==============================================================================
1693
1694!==============================================================================
1695SUBROUTINE l2goff (lind,gind,tran)
1696    !
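    ! Convert a local (sphere) one-dimensional index lind into the global
    ! one-dimensional grid index gind: look up the local 3-D point in the sphere
    ! tables, shift it back by the translation tran, and wrap it periodically
    ! onto the [1,nr] grid.
    !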
1697    USE exx_module,       ONLY  : odtothd_in_sp, thdtood
1698    USE fft_base,         ONLY  : dfftp
1699    !
1700    IMPLICIT NONE
1701    !
1702    INTEGER  tran(3),lind,gind
1703    INTEGER  ir, ip, i, j, k, ii, jj, kk, nr1s, nr2s, nr3s
1704    !
1705    nr1s=dfftp%nr1; nr2s=dfftp%nr2; nr3s=dfftp%nr3
1706    !
1707    i  = odtothd_in_sp(1, lind)
1708    j  = odtothd_in_sp(2, lind)
1709    k  = odtothd_in_sp(3, lind)
1710    !
1711    ii = i - tran(1)
1712    jj = j - tran(2)
1713    kk = k - tran(3)
1714    !
1715    IF ( ii .GT. nr1s)ii = ii - nr1s
1716    IF ( jj .GT. nr2s)jj = jj - nr2s
1717    IF ( kk .GT. nr3s)kk = kk - nr3s
1718    !
1719    IF ( ii .LT. 1)ii = ii + nr1s
1720    IF ( jj .LT. 1)jj = jj + nr2s
1721    IF ( kk .LT. 1)kk = kk + nr3s
1722    !
1723    gind = thdtood(ii, jj, kk)
1724    !
1725    RETURN
1726END SUBROUTINE l2goff
!==============================================================================

!==============================================================================
1728SUBROUTINE getpsicb(nrg,nrl,psig,psil,tran)
1729    !
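    ! Gather psig from the global grid (dimensions nrg) into the local cube psil
    ! whose index bounds are stored in nrl(1:6), using l2gcb for the translated,
    ! periodically wrapped global indices.
    !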
1730    USE kinds, ONLY  : DP
1731    USE fft_base,         ONLY  : dfftp
1732    !
1733    IMPLICIT NONE
1734    !
1735    INTEGER      :: nrg(3)
1736    INTEGER      :: nrl(6)
1737    REAL(DP)     :: psig(nrg(1),nrg(2),nrg(3))
1738    REAL(DP)     :: psil(nrl(1):nrl(4),nrl(2):nrl(5),nrl(3):nrl(6))
1739    INTEGER      :: tran(3)
1740    INTEGER      :: gid(3)
1741    !INTEGER      :: lid(3)
1742    INTEGER      :: i,j,k
1743    INTEGER      :: ti, tj, tk
1744    INTEGER      :: gi, gj, gk
1745    integer, external :: l2gcb
1746    !
1747    ti = tran(1); tj = tran(2); tk = tran(3)
1748    !$omp parallel do private(i,j,k,gi,gj,gk)
1749    DO k = nrl(3),nrl(6)
1750      DO j = nrl(2),nrl(5)
1751        DO i = nrl(1),nrl(4)
1752          !----------------------------------------------------
1753          gi = l2gcb(dfftp%nr1,i,ti)
1754          gj = l2gcb(dfftp%nr2,j,tj)
1755          gk = l2gcb(dfftp%nr3,k,tk)
1756          psil(i,j,k)=psig(gi,gj,gk)
1757          !----------------------------------------------------
1758        END DO
1759      END DO
1760    END DO
1761    !$omp end parallel do
1762    !----------------------------------------------------------
1763    !
1764    RETURN
1765END SUBROUTINE getpsicb
1766!==============================================================================
1767
!==============================================================================
integer function l2gcb(n,l,t)
1769  implicit none
1770  integer :: n, l, t
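  ! map the local index l, shifted by the translation t, back onto the global
  ! grid index in [1,n] with a periodic wrap (assumes l-t-1+n is non-negative)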
1771  l2gcb = MOD(l-t-1+n, n)+1
end function l2gcb
!==============================================================================

!==============================================================================
1773SUBROUTINE updateforce_loc(nrg, me_r, vpsil, potme, psime1, psime2, tran)
1774    !
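    ! Accumulate the pair-exchange force -exxalfa*potme*psime2 into vpsil on the
    ! global grid (acting on the locally stored orbital). In the MPI case psime2
    ! is overwritten with -exxalfa*potme*psime1, the contribution to be sent back
    ! to the process that owns the partner orbital.
    !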
1775    USE kinds,                   ONLY  : DP
1776    USE exx_module,              ONLY  : exxalfa
1777    USE fft_base,         ONLY  : dfftp
1778    !
1779    IMPLICIT NONE
1780    !
1781    INTEGER      :: nrg(3)
1782    INTEGER      :: me_r(6)
1783    REAL(DP)     :: vpsil(nrg(1),nrg(2),nrg(3))
1784    REAL(DP)     :: potme(me_r(1):me_r(4),me_r(2):me_r(5),me_r(3):me_r(6))
1785    REAL(DP)     :: psime1(me_r(1):me_r(4),me_r(2):me_r(5),me_r(3):me_r(6))
1786    REAL(DP)     :: psime2(me_r(1):me_r(4),me_r(2):me_r(5),me_r(3):me_r(6))
1787    INTEGER      :: tran(3)
1788    !----------------------------------------------------------------
1789    INTEGER      :: gid(3)
1790    INTEGER      :: lid(3)
1791    INTEGER      :: i,j,k
1792    INTEGER      :: gi,gj,gk,ti,tj,tk
1793    integer, external :: l2gcb
1794    !
1795    ti=tran(1);tj=tran(2);tk=tran(3)
1796    !----------------------------------------------------------------
1797    ! update vpsil in the global grid (exxalfa is 0.25 for PBE0)
1798    !----------------------------------------------------------------
1799    !$omp parallel do private(i,j,k,gi,gj,gk)
1800    !----------------------------------------------------------------
1801    DO k=me_r(3),me_r(6)
1802      DO j=me_r(2),me_r(5)
1803        DO i=me_r(1),me_r(4)
1804          !----------------------------------------------------------
1805          !CALL l2gcb(lid,gid,tran)
1806          gi = l2gcb(dfftp%nr1,i,ti)
1807          gj = l2gcb(dfftp%nr2,j,tj)
1808          gk = l2gcb(dfftp%nr3,k,tk)
1809          !----------------------------------------------------------
1810          vpsil(gi,gj,gk) = vpsil(gi,gj,gk) &
1811                                         - exxalfa*potme(i,j,k)*psime2(i,j,k)
1812#if defined(__MPI)
1813          psime2(i,j,k) = - exxalfa*potme(i,j,k)*psime1(i,j,k)
1814#endif
1815          !----------------------------------------------------------
1816        END DO
1817      END DO
1818    END DO
1819    !----------------------------------------------------------------
1820    !$omp end parallel do
1821    !----------------------------------------------------------------
1822
1823    !----------------------------------------------------------------
1824    RETURN
1825    !----------------------------------------------------------------
1826END SUBROUTINE updateforce_loc
1827!==============================================================================
1828
1829!==============================================================================
1830SUBROUTINE updateforce_slf(nrg, me_r, vpsil, potme, psime, tran)
1831    !
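    ! Accumulate the self-exchange force -exxalfa*potme*psime into vpsil on the
    ! global grid; there is no pair partner, so nothing needs to be sent back.
    !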
1832    USE kinds,                   ONLY  : DP
1833    USE exx_module,              ONLY  : exxalfa
1834    USE fft_base,         ONLY  : dfftp
1835    !
1836    IMPLICIT NONE
1837    !
1838    INTEGER      :: nrg(3)
1839    INTEGER      :: me_r(6)
1840    REAL(DP)     :: vpsil(nrg(1),nrg(2),nrg(3))
1841    REAL(DP)     :: potme(me_r(1):me_r(4),me_r(2):me_r(5),me_r(3):me_r(6))
1842    REAL(DP)     :: psime(me_r(1):me_r(4),me_r(2):me_r(5),me_r(3):me_r(6))
1843    INTEGER      :: tran(3)
1844    !----------------------------------------------------------------
1845    INTEGER      :: gid(3)
1846    INTEGER      :: lid(3)
1847    INTEGER      :: i,j,k
1848    !
1849    INTEGER      :: gi,gj,gk,ti,tj,tk
1850    integer, external :: l2gcb
1851    !
1852    ti=tran(1);tj=tran(2);tk=tran(3)
1853    !----------------------------------------------------------------
1854    ! update vpsil in the global grid (exxalfa is 0.25 for PBE0)
1855    !----------------------------------------------------------------
1856    !$omp parallel do private(i,j,k,gi,gj,gk)
1857    DO k=me_r(3),me_r(6)
1858      DO j=me_r(2),me_r(5)
1859        DO i=me_r(1),me_r(4)
1860          !----------------------------------------------------------
1861          gi = l2gcb(dfftp%nr1,i,ti)
1862          gj = l2gcb(dfftp%nr2,j,tj)
1863          gk = l2gcb(dfftp%nr3,k,tk)
1864          !----------------------------------------------------------
1865          vpsil(gi,gj,gk) = vpsil(gi,gj,gk) &
1866                                         - exxalfa*potme(i,j,k)*psime(i,j,k)
1867          !----------------------------------------------------------
1868        END DO
1869      END DO
1870    END DO
1871    !----------------------------------------------------------------
1872    !$omp end parallel do
1873    !----------------------------------------------------------------
1874
1875    !----------------------------------------------------------------
1876    RETURN
1877    !----------------------------------------------------------------
1878END SUBROUTINE updateforce_slf
1879!==============================================================================
1880
1881!==============================================================================
1882SUBROUTINE updateforce_rec(nrg, me_r, vpsil, force, tran)
1883    !
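    ! Add a force contribution received from the pair partner (already scaled by
    ! -exxalfa on the sending side) into vpsil on the global grid.
    !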
1884    USE kinds,                   ONLY  : DP
1885    USE exx_module,              ONLY  : exxalfa
1886    USE fft_base,                ONLY  : dfftp
1887    !
1888    IMPLICIT NONE
1889    !
1890    INTEGER      :: nrg(3)
1891    INTEGER      :: me_r(6)
1892    REAL(DP)     :: vpsil(nrg(1),nrg(2),nrg(3))
1893    REAL(DP)     :: force(me_r(1):me_r(4),me_r(2):me_r(5),me_r(3):me_r(6))
1894    INTEGER      :: tran(3)
1895    !----------------------------------------------------------------
1896    INTEGER      :: gi, gj, gk
1897    INTEGER      :: i,j,k
1898    INTEGER      :: ti,tj,tk
1899    integer, external :: l2gcb
1900    !
1901    ti=tran(1);tj=tran(2);tk=tran(3)
1902    !
1903    !----------------------------------------------------------------
1904    ! update vpsil in the global grid (exxalfa is 0.25 for PBE0)
1905    !----------------------------------------------------------------
1906    !$omp parallel do private(i,j,k,gi,gj,gk)
1907    !----------------------------------------------------------------
1908    DO k=me_r(3),me_r(6)
1909      DO j=me_r(2),me_r(5)
1910        DO i=me_r(1),me_r(4)
1911          !----------------------------------------------------------
1912          gi = l2gcb(dfftp%nr1,i,ti)
1913          gj = l2gcb(dfftp%nr2,j,tj)
1914          gk = l2gcb(dfftp%nr3,k,tk)
1915          !----------------------------------------------------------
1916          vpsil(gi,gj,gk) = vpsil(gi,gj,gk) + force(i,j,k)
1917          !----------------------------------------------------------
1918        END DO
1919      END DO
1920    END DO
1921    !----------------------------------------------------------------
1922    !$omp end parallel do
1923    !----------------------------------------------------------------
1924
1925    !----------------------------------------------------------------
1926    RETURN
1927    !----------------------------------------------------------------
1928END SUBROUTINE updateforce_rec
1929!==============================================================================
1930