1subroutine khamiltonian
2     !------------------------------------------------------------------------------------------
3     !
4     !  This subroutine calculates and saves to disk the elements needed for the construction of
5     !  the k-dependent Hamiltonian in the optimal basis |e_i>:
6     !  K1_ij, K0_ij, K1_ij, Vloc_ij, V_nonloc_ij (only the projectors <\beta_l|exp(ik.r)|e_j>)
7     !  It calculates also the nonlocal commutator for the the velocity matrix elements
8     !
9     USE kinds, ONLY : DP
10     USE input_simple
11     USE scf, ONLY : vrs
12     USE wavefunctions, ONLY : psic
13     USE cell_base, ONLY : tpiba2, tpiba, bg, alat, at
14     USE gvect, ONLY : gg, g
15     USE io_global, ONLY : stdout, ionode, ionode_id
16     USE mp_world, ONLY : world_comm
17     USE mp, ONLY : mp_sum,mp_barrier, mp_bcast
18     USE noncollin_module, ONLY: npol, noncolin
19     USE io_files,  ONLY : prefix, tmp_dir
20     USE fft_base,         ONLY : dffts
21     USE fft_interfaces,   ONLY : fwfft, invfft
22     USE spin_orb,      ONLY : domag
23     USE cell_base, ONLY : omega
24     USE ions_base,     ONLY: nat, ntyp => nsp, ityp
25     USE uspp_param,    ONLY: nh, nhm
26     USE uspp,          ONLY: nkb, deeq, indv_ijkb0, deeq_nc
27     USE lsda_mod,      ONLY : nspin
28     USE becmod,        ONLY : bec_type, calbec, allocate_bec_type, deallocate_bec_type
29     USE wvfct, ONLY : npwx
30     USE klist, ONLY : nelec
31     USE wannier_gw, ONLY : num_nbndv
32     !
33     IMPLICIT NONE
34     !
35    COMPLEX(kind=DP), ALLOCATABLE :: g2e(:,:) , g2e_mat(:,:) , rwfc(:,:) , v_rwfc(:,:)
36    COMPLEX(kind=DP), ALLOCATABLE :: deeqc(:,:,:)
37    COMPLEX(kind=DP), ALLOCATABLE :: d_wfc_e(:,:)
38    COMPLEX(kind=DP), ALLOCATABLE :: commut_mat(:,:) , sum_beck_nc(:,:,:,:) , sum_beckc(:,:,:) , sum_commut_mat(:,:,:,:)
39    COMPLEX(kind=DP) :: sup, sdwn
40    INTEGER :: ipol, ig, itot_e, iun, iun1, idir, ii, ir, jj, kk, jtot_e, kindex, sum_kindex
41    INTEGER, EXTERNAL :: find_free_unit
42    INTEGER, ALLOCATABLE :: igkk(:)
43    REAL(kind=DP) :: qk(3)
44    TYPE(bec_type) :: beck, beck2
45    !
46    call start_clock('khamiltonian')
47    !
48    write(stdout,*) '  '
49    write(stdout,*) 'Compute and store (in files prefix.hamiltonian and prefix.hamiltonian_k) the'
50    write(stdout,*) 'matrix elements needed for the k-dependent Hamiltonian in the optimal basis:'
51    !
52    allocate(g2e(npw_max*npol,ntot_e),g2e_mat(ntot_e,ntot_e),d_wfc_e(npw_max*npol,ntot_e))
53    !
54    if (noncolin) then
55      allocate(sum_beck_nc(nkb,npol,ntot_e,nkpoints(3)))
56    else
57      allocate(sum_beckc(nkb,ntot_e,nkpoints(3)))
58    endif
59    !
60    if (nonlocal_commutator) then
61      allocate(commut_mat(ntot_e,ntot_e), sum_commut_mat(ntot_e,ntot_e,3,nkpoints(3)))
62    endif
63    !
64    g2e(1:npw_max*npol,1:ntot_e) = wfc_e(1:npw_max*npol,1:ntot_e)
65    !
66    ! K0_ij (kinetic part)
67    call start_clock('K0')
68    write(stdout,*) 'K0_ij'
69    do itot_e=1,ntot_e
70     do ipol=0,npol-1
71        do ig=1,npw_max
72           g2e(ig + ipol*npw_max,itot_e) = gg(ig)*g2e(ig + ipol*npw_max,itot_e)
73        enddo
74     enddo
75    enddo
76    ! Scalar product: at the end we have \sum_G e_i*(G) G**2 e_j(G)
77    call ZGEMM('C','N',ntot_e,ntot_e,npol*npw_max,(1.d0,0.d0),g2e,npol*npw_max,wfc_e,npol*npw_max,(0.d0,0.d0),g2e_mat,ntot_e)
78    call mp_sum(g2e_mat,world_comm)
79    !
80    g2e_mat = g2e_mat*tpiba2
81    !
82    if(ionode) then
83      iun=find_free_unit()
84      open( unit= iun, file=trim(tmp_dir)//trim(prefix)//'.hamiltonian', &
85                &status='unknown',form='unformatted')
86      write(iun) ntot_e
87    endif
88    if (ionode) write(iun) g2e_mat(1:ntot_e,1:ntot_e)
89    call stop_clock('K0')
90    ! End K0_ij
91    !
92    ! K1_ij (kinetic part)
93    call start_clock('K1')
94    write(stdout,*) 'K1_ij'
95    do idir=1,3
96      g2e(1:npw_max*npol,1:ntot_e) = wfc_e(1:npw_max*npol,1:ntot_e)
97      do itot_e=1,ntot_e
98       do ipol=0,npol-1
99          do ig=1,npw_max
100             g2e(ig + ipol*npw_max,itot_e) = g(idir,ig)*g2e(ig + ipol*npw_max,itot_e)
101          enddo
102       enddo
103      enddo
104     ! Scalar product: at the end we have \sum_G e_i*(G) G e_j(G)
105     call ZGEMM('C','N',ntot_e,ntot_e,npol*npw_max,(1.d0,0.d0),g2e, &
106             npol*npw_max,wfc_e,npol*npw_max,(0.d0,0.d0),g2e_mat,ntot_e)
107     call mp_sum(g2e_mat,world_comm)
108     !
109     g2e_mat = g2e_mat*tpiba*2.0 ! In the definition of K1_ij there is a factor 2 in front
110     !
111     if (ionode) write(iun) g2e_mat(1:ntot_e,1:ntot_e) ! write to disk
112     !
113    enddo
114    call stop_clock('K1')
115    ! End K1_ij
116    !
117    ! Vloc_ij
118    call start_clock('Vloc')
119    write(stdout,*) 'Vloc_ij'
120    allocate(rwfc(dffts%nnr*npol,ntot_e))
121    allocate(v_rwfc(dffts%nnr*npol,ntot_e))
122    !
123    do ii=1,ntot_e
124         psic(1:dffts%nnr)=0.d0
125         psic(dffts%nl(1:npw_max))=wfc_e(1:npw_max,ii)
126         CALL invfft ('Wave', psic, dffts)      ! Inverse fourier transform of |e_i> on the smooth grid
127         rwfc(1:dffts%nnr,ii)=psic(1:dffts%nnr) ! Wavefunction in real space
128
129         if(npol>1) then
130            psic(1:dffts%nnr)=0.d0
131            psic(dffts%nl(1:npw_max))=wfc_e(npw_max+1:npw_max+npw_max,ii)
132            CALL invfft ('Wave', psic, dffts)
133            rwfc(dffts%nnr+1:2*dffts%nnr,ii)=psic(1:dffts%nnr)
134         endif
135    enddo
136    !
137    v_rwfc(1:dffts%nnr*npol,1:ntot_e) = rwfc(1:dffts%nnr*npol,1:ntot_e)
138    if (noncolin) then
139     do ii=1,ntot_e
140       if (domag) then
141               do ir=1, dffts%nnr
142                  sup = v_rwfc(ir,ii) * (vrs(ir,1)+vrs(ir,4)) + &
143                        v_rwfc(ir+ dffts%nnr,ii) * (vrs(ir,2)-(0.d0,1.d0)*vrs(ir,3))
144                  sdwn =  v_rwfc(ir+ dffts%nnr,ii) * (vrs(ir,1)-vrs(ir,4)) + &
145                         v_rwfc(ir,ii) * (vrs(ir,2)+(0.d0,1.d0)*vrs(ir,3))
146                  v_rwfc(ir,ii)=sup
147                  v_rwfc(ir+ dffts%nnr,ii)=sdwn
148               enddo
149       else
150               do ir=1, dffts%nnr
151                  v_rwfc(ir,ii) = v_rwfc(ir,ii) * vrs(ir,1)
152                  v_rwfc(dffts%nnr+ir,ii) = v_rwfc(dffts%nnr+ir,ii) * vrs(ir,1)
153               enddo
154               !v_rwfc(dffts%nnr+1:dffts%nnr*npol,ii) = 0.d0
155       endif
156     enddo
157    else
158      do ii=1,ntot_e
159        do ir=1,dffts%nnr
160          v_rwfc(ir,ii) = v_rwfc(ir,ii)*vrs(ir,1)  ! V_loc(r) x e_i(r)
161        enddo
162      enddo
163    end if
164
165    ! Calculate integral of e_j(r)* x V_loc(r) x e_i(r)
166    call ZGEMM('C','N',ntot_e,ntot_e,npol*dffts%nnr,(1.d0,0.d0),rwfc, &
167            npol*dffts%nnr,v_rwfc,npol*dffts%nnr,(0.d0,0.d0),g2e_mat,ntot_e)
168    call mp_sum(g2e_mat,world_comm)
169    !
170    g2e_mat = g2e_mat / dble(dffts%nr1*dffts%nr2*dffts%nr3)
171    !
172    if (ionode) write(iun) g2e_mat(1:ntot_e,1:ntot_e)
173    !
174    deallocate(rwfc,v_rwfc)
175    call stop_clock('Vloc')
176    ! End Vloc_ij
177
178    ! Vnonloc_ij
179    write(stdout,*) 'Vnonloc_ij(k) (projectors)'
180    if (nonlocal_commutator) then
181      write(stdout,*) '! Non local commutator from the velocity operator will also be computed !'
182    else
183      write(stdout,*) 'WARNING: Non local commutator from the velocity operator will NOT be computed'
184    endif
185    !
186    if(ionode) then
187      write(iun) noncolin
188      write(iun) nat
189      write(iun) ntyp
190      write(iun) nhm
191      write(iun) nspin
192      write(iun) nkb
193      write(iun) npol
194      write(iun) ityp(1:nat)
195      write(iun) nh(1:ntyp)
196      write(iun) indv_ijkb0(1:nat)
197      write(iun) nkpoints
198    endif
199    !
200    allocate(deeqc(nhm,nhm,nat), vkb_max(npw_max,nkb), igkk(npw_max))
201    if (noncolin) then
202     if (ionode) write(iun) deeq_nc(1:nhm,1:nhm,1:nat,1:nspin)
203    else
204     deeqc(1:nhm,1:nhm,1:nat) = cmplx(deeq(1:nhm,1:nhm,1:nat,1), 0.d0, KIND=DP)
205     if (ionode) write(iun) deeqc(1:nhm,1:nhm,1:nat)
206    endif
207    !
208    do ig=1,npw_max
209      igkk(ig) = ig
210    enddo
211    !
212    call allocate_bec_type(nkb,ntot_e,beck)
213    call allocate_bec_type(nkb,ntot_e,beck2)
214    !
215    if(ionode) then
216      iun1=find_free_unit()
217      open( unit= iun1, file=trim(tmp_dir)//trim(prefix)//'.hamiltonian_k', &
218                &status='unknown',form='unformatted')
219    endif
220    !
221    !! Calculation of the non-local part of the pseudopotential on a uniform k-mesh
222    write(stdout,'(a,i7)')  ' Total number of k-points:', (nkpoints(1))*(nkpoints(2))*(nkpoints(3))
223    do ii=0,nkpoints(1)-1
224          do jj=0,nkpoints(2)-1
225             !
226             do kk=0,nkpoints(3)-1
227                kindex = kk + jj*nkpoints(3) + ii*nkpoints(2)*nkpoints(3) + 1  ! global kindex
228                sum_kindex =  kk +  1
229                write(stdout,'(a,i7)')  ' k-point:', kindex
230                qk(1:3)=bg(1:3,1)*dble(ii)/dble(nkpoints(1))+bg(1:3,2)*dble(jj)/dble(nkpoints(2))+&
231                     &  bg(1:3,3)*dble(kk)/dble(nkpoints(3))
232                !
233                call start_clock('Vnloc')
234                if (nkb>0) then
235                   call init_us_2_max(npw_max,igkk,qk,vkb_max)  ! get the projectors \beta_Ilm (k-dependent)
236                endif
237                !vkb_max(npwx+1:npw_max,1:nkb) = 0.d0 ! WARNING: HERE I PUT TO ZERO THE ELEMENTS OF BETA WiTH G > npwx
238                !
239                call calbec(npw_max,vkb_max,wfc_e,beck,ntot_e) ! scalar product <\beta_Ilm|exp(ik.r)|e_j>
240                !
241                if (noncolin) then
242                   sum_beck_nc(1:nkb,1:npol,1:ntot_e,sum_kindex) = beck%nc(1:nkb,1:npol,1:ntot_e)
243                else
244                   sum_beckc(1:nkb,1:ntot_e,sum_kindex) = beck%k(1:nkb,1:ntot_e)
245                endif
246                call stop_clock('Vnloc')
247                !
248                call start_clock('commutator')
249                if (nonlocal_commutator) then
250                    ! Calculation of commutator
251                    do ipol=1,3
252                        !
253                        call commutator_Hx_psi_simple(qk, npw_max, ntot_e, beck, beck2, ipol, d_wfc_e, wfc_e, vkb_max)
254                        !
255                        ! Calculate the matrix <e_i|[V(k),r]|e_j>
256                        commut_mat = 0.d0
257                        call start_clock('commut_zgemm')
258                        call ZGEMM('C','N',ntot_e,ntot_e,npol*npw_max,(1.d0,0.d0),wfc_e,npol*npw_max,d_wfc_e, &
259                        & npol*npw_max,(0.d0,0.d0),commut_mat,ntot_e)
260                        call stop_clock('commut_zgemm')
261                        !
262                        call start_clock('commut_mp_sum')
263                        call mp_sum(commut_mat,world_comm)
264                        call stop_clock('commut_mp_sum')
265                        !
266                        ! save i[V_nl,r] = -i [r,V_nl]
267                        sum_commut_mat(1:ntot_e,1:ntot_e,ipol,sum_kindex) = (0.d0,1.d0)*commut_mat(1:ntot_e,1:ntot_e)
268                        !
269                    enddo
270                    ! End calculation of commutator
271                endif
272                call stop_clock('commutator')
273                !
274             enddo
275             !
276             call start_clock('Vnloc_write')
277             if (noncolin) then
278               if (ionode) write(iun1) sum_beck_nc(1:nkb,1:npol,1:ntot_e,1:nkpoints(3))
279             else
280               if (ionode) write(iun1) sum_beckc(1:nkb,1:ntot_e,1:nkpoints(3))
281             endif
282             call stop_clock('Vnloc_write')
283             !
284             if (nonlocal_commutator) then
285               call start_clock('commut_write')
286               if (ionode) write(iun1) sum_commut_mat(1:ntot_e,1:ntot_e,1:3,1:nkpoints(3))
287               call stop_clock('commut_write')
288             endif
289             !
290          enddo
291    enddo
292    !
293    deallocate(deeqc,g2e_mat,d_wfc_e,g2e)
294    if (noncolin) then
295      deallocate(sum_beck_nc)
296    else
297      deallocate(sum_beckc)
298    endif
299    !
300    if (nonlocal_commutator) then
301      deallocate(commut_mat, sum_commut_mat)
302    endif
303    !
304    call deallocate_bec_type(beck)
305    call deallocate_bec_type(beck2)
306    deallocate(vkb_max,igkk)
307    !End Vnonloc_ij
308    !
309    if(ionode) then
310       write(iun) alat
311       write(iun) bg(1:3,1:3)
312       write(iun) at(1:3,1:3)
313       write(iun) nelec
314       write(iun) omega
315       write(iun) num_val
316       write(iun) num_cond
317       write(iun) num_nbndv
318       write(iun) nonlocal_commutator
319       write(iun) s_bands
320    endif
321    !
322    if(ionode) then
323     close(iun)
324     close(iun1)
325    endif
326    !
327    write(stdout,*) ' '
328    write(stdout,*) '**********************************'
329    write(stdout,*) 'Matrix elements computed and saved'
330    write(stdout,*) '**********************************'
331    !
332    call stop_clock('khamiltonian')
333    !
334    ! Write in output the input parameters
335    write(stdout,*) '                     '
336    write(stdout,*) 'INPUT PARAMETERS:'
337    write(stdout,'(a,f10.5)') ' s_bands [a.u.] = ', s_bands
338    write(stdout,'(a,I3,I3,I3)')           ' k-grid = ', nkpoints(1:3)
339    if (.not. nonlocal_commutator) then
340        write(stdout,*)           'nonlocal_commutator =    False'
341    elseif (nonlocal_commutator) then
342        write(stdout,*)           'nonlocal_commutator =    True'
343    endif
344    write(stdout,*) '                     '
345    !
346    call print_clock('K0')
347    call print_clock('K1')
348    call print_clock('Vloc')
349    call print_clock('Vnloc')
350    call print_clock('Vnloc_write')
351    call print_clock('commutator')
352    call print_clock('commut_Hx_psi')
353    !call print_clock('gen_beta1')
354    !call print_clock('gen_beta2')
355    call print_clock('commut_nbnd')
356    call print_clock('commut_zgemm')
357    call print_clock('commut_mp_sum')
358    call print_clock('commut_write')
359    call print_clock('khamiltonian')
360    !
361end subroutine khamiltonian
362