1!
2! Copyright (C) 2002-2009 Quantum ESPRESSO group
3! This file is distributed under the terms of the
4! GNU General Public License. See the file `License'
5! in the root directory of the present distribution,
6! or http://www.gnu.org/copyleft/gpl.txt .
7!
8
9
10!-----------------------------------------------------------------------
11   subroutine calcmt( nrlx, fdiag, zmat, fmat )
12!-----------------------------------------------------------------------
13!
14!  constructs fmat=z0^t.fdiag.z0    zmat = z0^t
15!
16      USE kinds,             ONLY: DP
17      use electrons_base,    ONLY: nudx, nspin, nupdwn, iupdwn, nx => nbspx
18      USE cp_main_variables, ONLY: idesc
19      USE mp,                ONLY: mp_sum, mp_bcast
20
21      implicit none
22
23      include 'laxlib.fh'
24
25      integer, intent(in)  :: nrlx
26      real(DP) :: zmat( nrlx, nudx, nspin ), fmat( nrlx, nudx, nspin )
27      !  NOTE: zmat and fmat are distributed by row across processors
28      real(dp) :: fdiag( nx ) !  fdiag is replicated
29
30      integer  :: iss, nss, istart, i, j, k, ii, jj, kk
31      integer  :: np_rot, me_rot, nrl, comm_rot, ip, nrl_ip
32
33      real(DP), ALLOCATABLE :: mtmp(:,:)
34      real(DP) :: f_z0t
35
36
37      call start_clock('calcmt')
38
39      fmat = 0.0d0
40
41      DO iss = 1, nspin
42
43         nss      = nupdwn( iss )
44         istart   = iupdwn( iss )
45         np_rot   = idesc( LAX_DESC_NPR, iss ) * idesc( LAX_DESC_NPC, iss )
46         me_rot   = idesc( LAX_DESC_MYPE, iss )
47         nrl      = idesc( LAX_DESC_NRL, iss )
48         comm_rot = idesc( LAX_DESC_COMM, iss )
49
50         IF( idesc( LAX_DESC_ACTIVE_NODE, iss ) > 0 ) THEN
51
52            ALLOCATE( mtmp( MAXVAL(idesc(LAX_DESC_NRLX, :)), nudx ) )
53
54            DO ip = 1, np_rot
55
56               IF( me_rot == ( ip - 1 ) ) THEN
57                  mtmp = zmat(:,:,iss)
58               END IF
59               nrl_ip = ldim_cyclic( nss, np_rot, ip - 1 )
60               CALL mp_bcast( mtmp , ip - 1 , comm_rot )
61
62               DO j = 1, nss
63                  ii = ip
64                  DO i = 1, nrl_ip
65                     f_z0t = fdiag( j + istart - 1 ) * mtmp( i, j )
66                     DO k = 1, nrl
67                        fmat( k, ii, iss ) = fmat( k, ii, iss )+ zmat( k, j, iss ) * f_z0t
68                     END DO
69                     ii = ii + np_rot
70                  END DO
71               END DO
72
73            END DO
74
75            DEALLOCATE( mtmp )
76
77         END IF
78
79      END DO
80
81      call stop_clock('calcmt')
82
83      RETURN
84      END SUBROUTINE calcmt
85
86
87!-----------------------------------------------------------------------
88      subroutine rotate( nrlx, z0, c0, bec, c0diag, becdiag )
89!-----------------------------------------------------------------------
90      use kinds, only: dp
91      use electrons_base, only: nudx, nspin, nupdwn, iupdwn, nx => nbspx, n => nbsp
92      use uspp_param, only: nh, upf
93      use uspp, only :nkb, nkbus, qq_nt, indv_ijkb0
94      use gvecw, only: ngw
95      use ions_base, only: nat, ityp
96      USE cp_main_variables, ONLY: idesc
97      USE cp_interfaces,     ONLY: protate
98
99      implicit none
100      include 'laxlib.fh'
101      integer, intent(in) :: nrlx
102      real(kind=DP)    z0( nrlx, nudx, nspin )
103      real(kind=DP)    bec( nkb, n ), becdiag( nkb, n )
104      complex(kind=DP) c0( ngw, nx ), c0diag( ngw, nx )
105      integer :: np_rot, me_rot, nrl, comm_rot
106      integer iss, nss, istart
107
108      CALL start_clock( 'rotate' )
109
110      DO iss = 1, nspin
111         istart   = iupdwn( iss )
112         nss      = nupdwn( iss )
113         np_rot   = idesc( LAX_DESC_NPR, iss ) * idesc( LAX_DESC_NPC, iss )
114         me_rot   = idesc( LAX_DESC_MYPE, iss )
115         nrl      = idesc( LAX_DESC_NRL, iss )
116         comm_rot = idesc( LAX_DESC_COMM, iss )
117         CALL protate ( c0, bec, c0diag, becdiag, ngw, nss, istart, z0(:,:,iss), nrl, &
118                        ityp, nat, indv_ijkb0, nh, np_rot, me_rot, comm_rot )
119      END DO
120
121      CALL stop_clock( 'rotate' )
122      return
123      end subroutine rotate
124
125
126    subroutine minparabola(ene0,dene0,ene1,passop,passo,stima)
127!this subroutines finds the minimum of a quadratic real function
128
129      use kinds, only : dp
130
131      implicit none
132      real(dp) ene0,dene0,ene1,passop,passo,stima
133      real(dp) a,b,c!a*x^2+b*x+c
134
135      c=ene0
136      b=dene0
137      a=(ene1-b*passop-c)/(passop**2.d0)
138
139      passo = -b/(2.d0*a)
140      if( a.lt.0.d0) then
141         if(ene1.lt.ene0) then
142            passo=passop
143         else
144            passo=0.5d0*passop
145         endif
146      endif
147
148
149      stima=a*passo**2.d0+b*passo+c
150
151
152      return
153    end subroutine minparabola
154
155
156subroutine pc2(a,beca,b,becb)
157
158! this function applies the operator Pc
159
160!    this subroutine applies the Pc operator
161!    a input :unperturbed wavefunctions
162!    b input :first order wavefunctions
163!    b output:b_i =b_i-a_j><a_j|S|b_i>
164
165      use kinds, only: dp
166      use ions_base, only: na, nsp, nat, ityp
167      use io_global, only: stdout
168      use mp_global, only: intra_bgrp_comm
169      use gvecw, only: ngw
170      use constants, only: pi, fpi
171      use gvect, only: gstart
172      use mp, only: mp_sum
173      use electrons_base, only: n => nbsp, ispin,  nupdwn, iupdwn, nspin
174      use uspp_param, only: nh, upf
175      use uspp, only :nkb, nkbus, indv_ijkb0
176      use uspp, only :qq_nt
177
178
179      implicit none
180
181      complex(kind=DP) a(ngw,n), b(ngw,n)
182
183      real(kind=DP)    beca(nkb,n),becb(nkb,n)
184! local variables
185      integer is, iv, jv, ia, inl, jnl, i, j,ig
186      real(kind=DP) sca
187      real(DP), allocatable :: bectmp(:,:)
188      real(DP), allocatable :: qq_tmp(:,:), qqb_tmp(:,:)
189      complex(DP), allocatable :: zbectmp(:,:)
190      integer :: nl_max
191      integer :: nss,iss, istart
192
193      logical :: mat_par=.true.!if true uses parallel routines
194
195      CALL start_clock( 'pc2' )
196
197      do iss= 1, nspin
198         nss= nupdwn( iss )
199         istart= iupdwn( iss )
200
201         allocate(bectmp(nss,nss))
202         allocate(zbectmp(nss,nss))
203         bectmp(:,:)=0.d0
204
205         call zgemm('C','N',nss,nss,ngw,(1.d0,0.d0),a(:,istart),ngw,b(:,istart),ngw,(0.d0,0.d0),zbectmp,nss)
206
207         do j=1,nss
208            do i=1,nss
209               bectmp(i,j)=2.d0*dble(zbectmp(i,j))
210               if(gstart==2) bectmp(i,j)=bectmp(i,j)-DBLE(a(1,j))*DBLE(b(1,i))
211
212            enddo
213         enddo
214         deallocate(zbectmp)
215         call mp_sum( bectmp(:,:), intra_bgrp_comm)
216         if(nkbus >= 0) then
217
218            nl_max=0
219            do is=1,nsp
220               nl_max=nl_max+nh(is)*na(is)
221            enddo
222            allocate (qq_tmp(nl_max,nl_max))
223            allocate (qqb_tmp(nl_max,nss))
224            qq_tmp(:,:)=0.d0
225            do ia=1,nat
226               is = ityp(ia)
227               IF( upf(is)%tvanp ) THEN
228                  do iv=1,nh(is)
229                     do jv=1,nh(is)
230                        inl = indv_ijkb0(ia) + iv
231                        jnl = indv_ijkb0(ia) + jv
232                        qq_tmp(inl,jnl)=qq_nt(iv,jv,is)
233                     enddo
234                  enddo
235               ENDIF
236            enddo
237            if(.not. mat_par)  then
238               call dgemm('N','N',nl_max,nss,nl_max,1.d0,qq_tmp,nl_max,becb(:,istart),nkb,0.d0,qqb_tmp,nl_max)
239               call dgemm('T','N',nss,nss,nl_max,1.d0,beca(:,istart),nkb,qqb_tmp,nl_max,1.d0,bectmp,nss)
240            else
241               call para_dgemm &
242& ('N','N',nl_max,nss,nl_max,1.d0,qq_tmp,nl_max,becb(:,istart),nkb,0.d0,qqb_tmp,nl_max, intra_bgrp_comm)
243               call para_dgemm &
244&('T','N',nss,nss,nl_max,1.d0,beca(:,istart),nkb,qqb_tmp,nl_max,1.d0,bectmp,nss, intra_bgrp_comm)
245            endif
246            deallocate(qq_tmp,qqb_tmp)
247         endif
248         allocate(zbectmp(nss,nss))
249         do i=1,nss
250            do j=1,nss
251               zbectmp(i,j)=CMPLX(bectmp(i,j),0.d0,kind=dp)
252            enddo
253         enddo
254         call zgemm('N','N',ngw,nss,nss,(-1.d0,0.d0),a(:,istart),ngw,zbectmp,nss,(1.d0,0.d0),b(:,istart),ngw)
255         deallocate(zbectmp)
256         call dgemm('N','N',nkb,nss,nss,1.0d0,beca(:,istart),nkb,bectmp,nss,1.0d0,becb(:,istart),nkb)
257         deallocate(bectmp)
258      enddo!on spin
259      CALL stop_clock( 'pc2' )
260      return
261    end subroutine pc2
262
263
264    subroutine pcdaga2(a,as ,b )
265
266! this function applies the operator Pc
267
268!    this subroutine applies the Pc^dagerr operator
269!    a input :unperturbed wavefunctions
270!    b input :first order wavefunctions
271!    b output:b_i =b_i - S|a_j><a_j|b_i>
272
273      use kinds
274      use io_global, only: stdout
275      use mp_global, only: intra_bgrp_comm
276      use gvecw, only: ngw
277      use constants, only: pi, fpi
278      use gvect, only: gstart
279      use mp, only: mp_sum
280      use electrons_base, only: n => nbsp, ispin
281
282      implicit none
283
284      complex(dp) a(ngw,n), b(ngw,n), as(ngw,n)
285      ! local variables
286      integer is, iv, jv, ia, inl, jnl, i, j,ig
287      real(dp) sca
288      real(DP), allocatable:: scar(:)
289      !
290      call start_clock('pcdaga2')
291      allocate(scar(n))
292      do j=1,n
293         do i=1,n
294            sca=0.0d0
295            if(ispin(i) == ispin(j)) then
296               if (gstart==2) b(1,i) = CMPLX(dble(b(1,i)),0.0d0,kind=dp)
297               do  ig=1,ngw           !loop on g vectors
298                  sca=sca+DBLE(CONJG(a(ig,j))*b(ig,i))
299               enddo
300               sca = sca*2.0d0  !2. for real weavefunctions
301               if (gstart==2) sca = sca - dble(a(1,j))*dble(b(1,i))
302            endif
303            scar(i) = sca
304         enddo
305
306
307         call mp_sum( scar, intra_bgrp_comm )
308
309
310         do i=1,n
311            if(ispin(i) == ispin(j)) then
312               sca = scar(i)
313               do ig=1,ngw
314                  b(ig,i)=b(ig,i)-sca*as(ig,j)
315               enddo
316               ! this to prevent numerical errors
317               if (gstart==2) b(1,i) = CMPLX(dble(b(1,i)),0.0d0,kind=dp)
318            endif
319         enddo
320      enddo
321      deallocate(scar)
322      call stop_clock('pcdaga2')
323      return
324      end subroutine pcdaga2
325
326     subroutine set_x_minus1(betae,m_minus1,ema0bg,use_ema)
327
328! this function calculates the factors for the inverse of the US K  matrix
329! it takes care of the preconditioning
330
331      use kinds, only: dp
332      use ions_base, only: nsp, nat, ityp
333      use io_global, only: stdout
334      use mp_global, only: intra_bgrp_comm
335      use gvecw, only: ngw
336      use constants, only: pi, fpi
337      use gvect, only: gstart
338      use mp, only: mp_sum, mp_bcast
339      use electrons_base, only: n => nbsp, ispin
340      use uspp_param, only: nh, upf
341      use uspp, only :nkb,qq_nt,nkbus, indv_ijkb0
342      use io_global, ONLY: ionode, ionode_id
343
344      implicit none
345
346      complex(DP) :: betae(ngw,nkb)
347      real(DP)    :: m_minus1(nkb,nkb)
348      real(DP)    :: ema0bg(ngw)
349      logical     :: use_ema
350
351
352! local variables
353      real(DP),allocatable :: q_matrix(:,:), b_matrix(:,:),c_matrix(:,:)
354      integer is, iv, jv, ia, inl, jnl, i, j, k,ig, js, ja
355      real(DP) sca
356      integer info, lwork
357      integer, allocatable :: ipiv(:)
358      real(dp),allocatable :: work(:)
359
360      call start_clock('set_x_minus1')
361      allocate(ipiv(nkb))
362      allocate(work(nkb))
363
364      lwork=nkb
365
366      allocate(q_matrix(nkb,nkb),c_matrix(nkb,nkb))
367!construct q matrix
368      q_matrix(:,:) = 0.d0
369
370      do ia=1,nat
371         is = ityp(ia)
372         IF( upf(is)%tvanp ) THEN
373            do iv=1,nh(is)
374               do jv=1,nh(is)
375                    inl = indv_ijkb0(ia) + iv
376                    jnl = indv_ijkb0(ia) + jv
377                    q_matrix(inl,jnl)= qq_nt(iv,jv,is)
378               enddo
379            enddo
380         END IF
381      enddo
382
383!construct b matrix
384! m_minus1 used to be b matrix
385      m_minus1(:,:) = 0.d0
386      do ia=1,nat
387         is = ityp(ia)
388         IF( upf(is)%tvanp ) THEN
389            do iv=1,nh(is)
390               do ja=1,nat
391                  js=ityp(ja)
392                  IF( upf(js)%tvanp ) THEN
393                     do jv=1,nh(js)
394                        inl = indv_ijkb0(ia) + iv
395                        jnl = indv_ijkb0(ja) + jv
396                        sca=0.d0
397                        if (use_ema) then
398                           ! k_minus case
399                        do  ig=1,ngw           !loop on g vectors
400                           sca=sca+ema0bg(ig)*DBLE(CONJG(betae(ig,inl))*betae(ig,jnl))
401                        enddo
402                        sca = sca*2.0d0  !2. for real weavefunctions
403                        if (gstart==2) sca = sca - ema0bg(1)*DBLE(CONJG(betae(1,inl))*betae(1,jnl))
404                        else
405                           ! s_minus case
406                        do  ig=1,ngw           !loop on g vectors
407                           sca=sca+DBLE(CONJG(betae(ig,inl))*betae(ig,jnl))
408                        enddo
409                        sca = sca*2.0d0  !2. for real weavefunctions
410                        if (gstart==2) sca = sca - DBLE(CONJG(betae(1,inl))*betae(1,jnl))
411                        endif
412                        m_minus1(inl,jnl)=sca
413                     enddo
414                  END IF
415               enddo
416            enddo
417         END IF
418      enddo
419      call mp_sum( m_minus1, intra_bgrp_comm )
420
421!calculate -(1+QB)**(-1) * Q
422      CALL dgemm('N','N',nkb,nkb,nkb,1.0d0,q_matrix,nkb,m_minus1,nkb,0.0d0,c_matrix,nkb)
423
424      do i=1,nkb
425         c_matrix(i,i)=c_matrix(i,i)+1.d0
426      enddo
427
428      if(ionode) then
429        call dgetrf(nkb,nkb,c_matrix,nkb,ipiv,info)
430        if(info .ne. 0) write(stdout,*) 'set_k_minus1 Problem with dgetrf :', info
431        call dgetri(nkb,c_matrix,nkb,ipiv,work,lwork,info)
432        if(info .ne. 0) write(stdout,*) 'set_k_minus1 Problem with dgetri :', info
433      endif
434      call mp_bcast( c_matrix, ionode_id, intra_bgrp_comm )
435
436
437      CALL dgemm('N','N',nkb,nkb,nkb,-1.0d0,c_matrix,nkb,q_matrix,nkb,0.0d0,m_minus1,nkb)
438
439      deallocate(q_matrix,c_matrix)
440      deallocate(ipiv,work)
441      call stop_clock('set_x_minus1')
442      return
443    end subroutine set_x_minus1
444!
445      subroutine xminus1(c0,betae,ema0bg,beck,m_minus1,do_k)
446! if (do_k) then
447!-----------------------------------------------------------------------
448!     input: c0 , bec=<c0|beta>, betae=|beta>
449!     computes the matrix phi (with the old positions)
450!       where  |phi> = K^{-1}|c0>
451! else
452!-----------------------------------------------------------------------
453!     input: c0 , bec=<c0|beta>, betae=|beta>
454!     computes the matrix phi (with the old positions)
455!       where  |phi> = s^{-1}|c0>
456! endif
457      use kinds, only: dp
458      use ions_base, only: nsp, nat, ityp
459      use io_global, only: stdout
460      use mp_global, only: intra_bgrp_comm
461      use uspp_param, only: nh, upf
462      use uspp, only :nkb, nkbus, qq_nt, indv_ijkb0
463      use electrons_base, only: n => nbsp
464      use gvecw, only: ngw
465      use constants, only: pi, fpi
466      use mp, only: mp_sum
467      use gvect, only: gstart
468!
469      implicit none
470      complex(dp) c0(ngw,n), betae(ngw,nkb)
471      real(dp)    beck(nkb,n), ema0bg(ngw)
472      real(DP)    :: m_minus1(nkb,nkb)
473      logical :: do_k
474! local variables
475      complex(dp), allocatable :: phi(:,:)
476      real(dp) , allocatable   :: qtemp(:,:)
477      integer is, iv, jv, ia, inl, jnl, i, j, js, ja,ig
478      real(dp) becktmp
479
480
481      logical :: mat_par=.true.!if true uses parallel routines
482
483      call start_clock('xminus1')
484
485      if (nkbus.gt.0) then
486!calculates beck
487         if (do_k) then
488            beck(:,:) = 0.d0
489            do ia=1,nat
490               is=ityp(ia)
491               IF( upf(is)%tvanp ) THEN
492                  do iv=1,nh(is)
493                     inl = indv_ijkb0(ia) + iv
494                     do i=1,n
495                        becktmp = 0.0d0
496                        do ig=1,ngw
497                           becktmp=becktmp+ema0bg(ig)*DBLE(CONJG(betae(ig,inl))*c0(ig,i))
498                        enddo
499                        becktmp = becktmp*2.0d0
500                        if (gstart==2) becktmp = becktmp-ema0bg(1)*DBLE(CONJG(betae(1,inl))*c0(1,i))
501                        beck(inl,i) = beck(inl,i) + becktmp
502                     enddo
503                  enddo
504               END IF
505            enddo
506            call mp_sum( beck, intra_bgrp_comm )
507         endif
508!
509!
510      allocate(phi(ngw,n))
511      allocate(qtemp(nkb,n))
512      phi(1:ngw,1:n) = 0.0d0
513      qtemp(:,:) = 0.0d0
514      if(.not.mat_par) then
515         call dgemm( 'N', 'N', nkb, n, nkb, 1.0d0, m_minus1,nkb ,    &
516                    beck, nkb, 0.0d0, qtemp,nkb )
517      else
518         call para_dgemm( 'N', 'N', nkb, n, nkb, 1.0d0, m_minus1,nkb ,    &
519                    beck, nkb, 0.0d0, qtemp,nkb,intra_bgrp_comm )
520      endif
521
522!NB  nkb is the total number of US projectors
523!    it works because the first pseudos are the vanderbilt's ones
524
525         CALL dgemm( 'N', 'N', 2*ngw, n, nkb, 1.0d0, betae, 2*ngw,    &
526                    qtemp, nkb, 0.0d0, phi, 2*ngw )
527         if (do_k) then
528            do j=1,n
529               do ig=1,ngw
530                  c0(ig,j)=(phi(ig,j)+c0(ig,j))*ema0bg(ig)
531               end do
532            end do
533         else
534            do j=1,n
535               do i=1,ngw
536                  c0(i,j)=(phi(i,j)+c0(i,j))
537               end do
538            end do
539         endif
540      deallocate(qtemp,phi)
541
542      else
543         if (do_k) then
544            do j=1,n
545               do ig=1,ngw
546                  c0(ig,j)=c0(ig,j)*ema0bg(ig)
547               end do
548            end do
549         endif
550      endif
551      call stop_clock('xminus1')
552      return
553     end subroutine xminus1
554
555      SUBROUTINE emass_precond_tpa( ema0bg, tpiba2, emaec )
556       use kinds, ONLY : dp
557       use gvecw, ONLY : g2kin,ngw
558       IMPLICIT NONE
559       REAL(DP), INTENT(OUT) :: ema0bg(ngw)
560       REAL(DP), INTENT(IN) ::  tpiba2, emaec
561       INTEGER :: i
562
563       real(DP) :: x
564
565       call start_clock('emass_p_tpa')
566       do i = 1, ngw
567
568          x=0.5d0*tpiba2*g2kin(i)/emaec
569          ema0bg(i) = 1.d0/(1.d0+(16.d0*x**4)/(27.d0+18.d0*x+12.d0*x**2+8.d0*x**3))
570       end do
571       call stop_clock('emass_p_tpa')
572      RETURN
573      END SUBROUTINE emass_precond_tpa
574
575      subroutine ave_kin( c, ngwx, n, ene_ave )
576!this subroutine calculates the average kinetic energy of
577!each state , to be used for preconditioning
578
579
580      USE kinds,              ONLY: DP
581      USE constants,          ONLY: pi, fpi
582      USE gvecw,              ONLY: ngw
583      USE gvect, ONLY: gstart
584      USE gvecw,              ONLY: g2kin
585      USE mp,                 ONLY: mp_sum
586      USE mp_global,          ONLY: intra_bgrp_comm
587      USE cell_base,          ONLY: tpiba2
588
589      IMPLICIT NONE
590
591
592      ! input
593
594      INTEGER,     INTENT(IN) :: ngwx, n
595      COMPLEX(kind=DP), INTENT(IN) :: c( ngwx, n )
596      REAL(kind=DP), INTENT(out) :: ene_ave(n)!average kinetic energy to be calculated
597      !
598      ! local
599
600      INTEGER  :: ig, i
601
602      !
603      DO i=1,n
604         ene_ave(i)=0.d0
605         DO ig=gstart,ngw
606            ene_ave(i)=ene_ave(i)+DBLE(CONJG(c(ig,i))*c(ig,i))*g2kin(ig)
607         END DO
608      END DO
609
610
611      CALL mp_sum( ene_ave(1:n), intra_bgrp_comm )
612      ene_ave(:)=ene_ave(:)*tpiba2
613
614      RETURN
615    END subroutine ave_kin
616
617
618
619      subroutine xminus1_state(c0,betae,ema0bg,beck,m_minus1,do_k,ave_kin)
620! if (do_k) then
621!-----------------------------------------------------------------------
622!     input: c0 , bec=<c0|beta>, betae=|beta>
623!     computes the matrix phi (with the old positions)
624!       where  |phi> = K^{-1}|c0>
625! else
626!-----------------------------------------------------------------------
627!     input: c0 , bec=<c0|beta>, betae=|beta>
628!     computes the matrix phi (with the old positions)
629!       where  |phi> = s^{-1}|c0>
630! endif
631!adapted for state by state
632      use kinds, only: dp
633      use ions_base, only: nat, ityp
634      use io_global, only: stdout
635      use mp_global, only: intra_bgrp_comm
636      use uspp_param, only: nh, upf
637      use uspp, only :nkb, nkbus, qq_nt, indv_ijkb0
638      use electrons_base, only: n => nbsp
639      use gvecw, only: ngw
640      use constants, only: pi, fpi
641      use mp, only: mp_sum
642      use gvect, only: gstart
643      USE gvecw,              ONLY: g2kin
644      USE cell_base,          ONLY: tpiba2
645
646
647!
648      implicit none
649      complex(dp) c0(ngw,n), betae(ngw,nkb)
650      real(dp)    beck(nkb,n), ema0bg(ngw)
651      real(DP)    :: m_minus1(nkb,nkb)
652      logical :: do_k
653      real(kind=DP) :: ave_kin(n)!average kinetic energy per state
654! local variables
655      complex(dp), allocatable :: phi(:,:)
656      real(dp) , allocatable   :: qtemp(:,:)
657      integer is, iv, jv, ia, inl, jnl, i, j, js, ja,ig
658      real(dp) becktmp
659      real(kind=DP) :: prec_fact, x
660
661
662      call start_clock('xminus1')
663      if (nkbus.gt.0) then
664!calculates beck
665         if (do_k) then
666            beck(:,:) = 0.d0
667
668            do ia=1,nat
669               is = ityp(ia)
670               IF( upf(is)%tvanp ) THEN
671                  do iv=1,nh(is)
672                     inl = indv_ijkb0(ia) + iv
673                     do i=1,n
674                        becktmp = 0.0d0
675                        do ig=1,ngw
676                           becktmp=becktmp+ema0bg(ig)*DBLE(CONJG(betae(ig,inl))*c0(ig,i))
677                        enddo
678                        becktmp = becktmp*2.0d0
679                        if (gstart==2) becktmp = becktmp-ema0bg(1)*DBLE(CONJG(betae(1,inl))*c0(1,i))
680                        beck(inl,i) = beck(inl,i) + becktmp
681                     enddo
682                  enddo
683               END IF
684            enddo
685            call mp_sum( beck, intra_bgrp_comm )
686         endif
687!
688!
689      allocate(phi(ngw,n))
690      allocate(qtemp(nkb,n))
691      phi(1:ngw,1:n) = 0.0d0
692      qtemp(:,:) = 0.0d0
693      call dgemm( 'N', 'N', nkb, n, nkb, 1.0d0, m_minus1,nkb ,    &
694                    beck, nkb, 0.0d0, qtemp,nkb )
695
696
697
698!NB nkb is the total number of US projectors, it works because the first pseudos are the vanderbilt's ones
699
700      CALL dgemm( 'N', 'N', 2*ngw, n, nkb, 1.0d0, betae, 2*ngw,    &
701           qtemp, nkb, 0.0d0, phi, 2*ngw )
702      if (do_k) then
703         do j=1,n
704            do ig=1,ngw
705               x=tpiba2*g2kin(i)/ave_kin(j)
706               prec_fact = 1.d0/(1.d0+(16.d0*x**4)/(27.d0+18.d0*x+12.d0*x**2+8.d0*x**3))
707                c0(ig,j)=c0(ig,j)*prec_fact
708               !c0(ig,j)=(phi(ig,j)+c0(ig,j))*ema0bg(ig)
709            end do
710         end do
711      else
712         do j=1,n
713            do i=1,ngw
714               c0(i,j)=(phi(i,j)+c0(i,j))
715            end do
716         end do
717      endif
718      deallocate(qtemp,phi)
719
720   else
721      if (do_k) then
722         do j=1,n
723            do ig=1,ngw
724               x=tpiba2*g2kin(ig)/ave_kin(j)
725               prec_fact = 1.d0/(1.d0+(16.d0*x**4)/(27.d0+18.d0*x+12.d0*x**2+8.d0*x**3))
726               c0(ig,j)=c0(ig,j)*prec_fact
727            end do
728         end do
729      endif
730   endif
731   call stop_clock('xminus1')
732   return
733 end subroutine xminus1_state
734!
735! ... some simple routines for parallel linear algebra (the matrices are
736! ... always replicated on all the cpus)
737!
738! ... written by carlo sbraccia ( 2006 )
739!
740!----------------------------------------------------------------------------
741SUBROUTINE para_dgemm( transa, transb, m, n, k, &
742                       alpha, a, lda, b, ldb, beta, c, ldc, comm )
743  !----------------------------------------------------------------------------
744  !
745  ! ... trivial parallelization (splitting matrix B by columns) of dgemm
746  !
747  USE kinds, ONLY : DP
748  !
749  IMPLICIT NONE
750  !
751  CHARACTER(LEN=1), INTENT(IN)    :: transa, transb
752  INTEGER,          INTENT(IN)    :: m, n, k
753  REAL(DP),         INTENT(IN)    :: alpha, beta
754  INTEGER,          INTENT(IN)    :: lda, ldb, ldc
755  REAL(DP),         INTENT(INOUT) :: a(lda,*), b(ldb,*), c(ldc,*)
756  INTEGER,          INTENT(IN)    :: comm
757  !
758  ! ... quick return if possible
759  !
760  IF ( m == 0 .OR. n == 0 .OR. &
761       ( ( alpha == 0.0_DP .OR. k == 0 ) .AND. beta == 1.0_DP ) ) RETURN
762  !
763!write(*,*) 'DEBUG: para_dgemm'
764  !
765  CALL rep_matmul_drv( transa, transb, m, n, k, &
766                       alpha, a, lda, b, ldb, beta, c, ldc, comm )
767  RETURN
768  !
769END SUBROUTINE para_dgemm
770