1!
2! Copyright (C) 2001-2015 Quantum ESPRESSO group
3! This file is distributed under the terms of the
4! GNU General Public License. See the file `License'
5! in the root directory of the present distribution,
6! or http://www.gnu.org/copyleft/gpl.txt .
7!
8#define ZERO ( 0.D0, 0.D0 )
9#define ONE  ( 1.D0, 0.D0 )
10!
11!----------------------------------------------------------------------------
12SUBROUTINE cegterg( h_psi, s_psi, uspp, g_psi, &
13                    npw, npwx, nvec, nvecx, npol, evc, ethr, &
14                    e, btype, notcnv, lrot, dav_iter, nhpsi )
15  !----------------------------------------------------------------------------
16  !
17  ! ... iterative solution of the eigenvalue problem:
18  !
19  ! ... ( H - e S ) * evc = 0
20  !
21  ! ... where H is an hermitean operator, e is a real scalar,
22  ! ... S is an overlap matrix, evc is a complex vector
23  !
24  USE util_param,    ONLY : DP
25  USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id,&
26                            nbgrp, my_bgrp_id, me_bgrp, root_bgrp
27  USE mp,            ONLY : mp_sum, mp_gather, mp_bcast, mp_size,&
28                            mp_type_create_column_section, mp_type_free
29  !
30  IMPLICIT NONE
31  !
32  include 'laxlib.fh'
33  !
34  INTEGER, INTENT(IN) :: npw, npwx, nvec, nvecx, npol
35    ! dimension of the matrix to be diagonalized
36    ! leading dimension of matrix evc, as declared in the calling pgm unit
37    ! integer number of searched low-lying roots
38    ! maximum dimension of the reduced basis set :
39    !    (the basis set is refreshed when its dimension would exceed nvecx)
40    ! umber of spin polarizations
41  INTEGER, PARAMETER :: blocksize = 256
42  INTEGER :: numblock
43    ! chunking parameters
44  COMPLEX(DP), INTENT(INOUT) :: evc(npwx*npol,nvec)
45    !  evc contains the  refined estimates of the eigenvectors
46  REAL(DP), INTENT(IN) :: ethr
47    ! energy threshold for convergence :
48    !   root improvement is stopped, when two consecutive estimates of the root
49    !   differ by less than ethr.
50  LOGICAL, INTENT(IN) :: uspp
51    ! if .FALSE. : do not calculate S|psi>
52  INTEGER, INTENT(IN) :: btype(nvec)
53    ! band type ( 1 = occupied, 0 = empty )
54  LOGICAL, INTENT(IN) :: lrot
55    ! .TRUE. if the wfc have already been rotated
56  REAL(DP), INTENT(OUT) :: e(nvec)
57    ! contains the estimated roots.
58  INTEGER, INTENT(OUT) :: dav_iter, notcnv
59    ! integer number of iterations performed
60    ! number of unconverged roots
61  INTEGER, INTENT(OUT) :: nhpsi
62    ! total number of indivitual hpsi
63  !
64  ! ... LOCAL variables
65  !
66  INTEGER, PARAMETER :: maxter = 20
67    ! maximum number of iterations
68  !
69  INTEGER :: kter, nbase, np, kdim, kdmx, n, m, ipol, nb1, nbn
70    ! counter on iterations
71    ! dimension of the reduced basis
72    ! counter on the reduced basis vectors
73    ! adapted npw and npwx
74    ! do-loop counters
75  INTEGER :: n_start, n_end, my_n
76  INTEGER :: column_section_type
77    ! defines a column section for communication
78  INTEGER :: ierr
79  COMPLEX(DP), ALLOCATABLE :: hc(:,:), sc(:,:), vc(:,:)
80    ! Hamiltonian on the reduced basis
81    ! S matrix on the reduced basis
82    ! the eigenvectors of the Hamiltonian
83  REAL(DP), ALLOCATABLE :: ew(:)
84    ! eigenvalues of the reduced hamiltonian
85  COMPLEX(DP), ALLOCATABLE :: psi(:,:), hpsi(:,:), spsi(:,:)
86    ! work space, contains psi
87    ! the product of H and psi
88    ! the product of S and psi
89  LOGICAL, ALLOCATABLE  :: conv(:)
90    ! true if the root is converged
91  REAL(DP) :: empty_ethr
92    ! threshold for empty bands
93  INTEGER, ALLOCATABLE :: recv_counts(:), displs(:)
94    ! receive counts and memory offsets
95  !
96  REAL(DP), EXTERNAL :: ddot
97  !
98  EXTERNAL  h_psi,    s_psi,    g_psi
99    ! h_psi(npwx,npw,nvec,psi,hpsi)
100    !     calculates H|psi>
101    ! s_psi(npwx,npw,nvec,spsi)
102    !     calculates S|psi> (if needed)
103    !     Vectors psi,hpsi,spsi are dimensioned (npwx*npol,nvec)
104    ! g_psi(npwx,npw,notcnv,psi,e)
105    !    calculates (diag(h)-e)^-1 * psi, diagonal approx. to (h-e)^-1*psi
106    !    the first nvec columns contain the trial eigenvectors
107  !
108  nhpsi = 0
109  CALL start_clock( 'cegterg' ); !write(*,*) 'start cegterg' ; FLUSH(6)
110  !
111  IF ( nvec > nvecx / 2 ) CALL errore( 'cegterg', 'nvecx is too small', 1 )
112  !
113  ! ... threshold for empty bands
114  !
115  empty_ethr = MAX( ( ethr * 5.D0 ), 1.D-5 )
116  !
117  IF ( npol == 1 ) THEN
118     !
119     kdim = npw
120     kdmx = npwx
121     !
122  ELSE
123     !
124     kdim = npwx*npol
125     kdmx = npwx*npol
126     !
127  END IF
128  !
129  ! compute the number of chuncks
130  numblock  = (npw+blocksize-1)/blocksize
131  !
132  ALLOCATE(  psi( npwx*npol, nvecx ), STAT=ierr )
133  IF( ierr /= 0 ) &
134     CALL errore( ' cegterg ',' cannot allocate psi ', ABS(ierr) )
135  ALLOCATE( hpsi( npwx*npol, nvecx ), STAT=ierr )
136  IF( ierr /= 0 ) &
137     CALL errore( ' cegterg ',' cannot allocate hpsi ', ABS(ierr) )
138  !
139  IF ( uspp ) THEN
140     ALLOCATE( spsi( npwx*npol, nvecx ), STAT=ierr )
141     IF( ierr /= 0 ) &
142        CALL errore( ' cegterg ',' cannot allocate spsi ', ABS(ierr) )
143  END IF
144  !
145  ALLOCATE( sc( nvecx, nvecx ), STAT=ierr )
146  IF( ierr /= 0 ) &
147     CALL errore( ' cegterg ',' cannot allocate sc ', ABS(ierr) )
148  ALLOCATE( hc( nvecx, nvecx ), STAT=ierr )
149  IF( ierr /= 0 ) &
150     CALL errore( ' cegterg ',' cannot allocate hc ', ABS(ierr) )
151  ALLOCATE( vc( nvecx, nvecx ), STAT=ierr )
152  IF( ierr /= 0 ) &
153     CALL errore( ' cegterg ',' cannot allocate vc ', ABS(ierr) )
154  ALLOCATE( ew( nvecx ), STAT=ierr )
155  IF( ierr /= 0 ) &
156     CALL errore( ' cegterg ',' cannot allocate ew ', ABS(ierr) )
157  ALLOCATE( conv( nvec ), STAT=ierr )
158  IF( ierr /= 0 ) &
159     CALL errore( ' cegterg ',' cannot allocate conv ', ABS(ierr) )
160  ALLOCATE( recv_counts(mp_size(inter_bgrp_comm)), displs(mp_size(inter_bgrp_comm)) )
161  !
162  notcnv = nvec
163  nbase  = nvec
164  conv   = .FALSE.
165  !
166  CALL threaded_memcpy(psi, evc, nvec*npol*npwx*2)
167  !
168  ! ... hpsi contains h times the basis vectors
169  !
170  CALL h_psi( npwx, npw, nvec, psi, hpsi ) ; nhpsi = nhpsi + nvec
171  !
172  ! ... spsi contains s times the basis vectors
173  !
174  IF ( uspp ) CALL s_psi( npwx, npw, nvec, psi, spsi )
175  !
176  ! ... hc contains the projection of the hamiltonian onto the reduced
177  ! ... space vc contains the eigenvectors of hc
178  !
179  CALL start_clock( 'cegterg:init' )
180  !
181  CALL divide_all(inter_bgrp_comm,nbase,n_start,n_end,recv_counts,displs)
182  CALL mp_type_create_column_section(sc(1,1), 0, nbase, nvecx, column_section_type)
183  my_n = n_end - n_start + 1; !write (*,*) nbase,n_start,n_end
184  !
185  if (n_start .le. n_end) &
186  CALL ZGEMM( 'C','N', nbase, my_n, kdim, ONE, psi, kdmx, hpsi(1,n_start), kdmx, ZERO, hc(1,n_start), nvecx )
187  !
188  if (n_start .le. n_end) CALL mp_sum( hc( 1:nbase, n_start:n_end ), intra_bgrp_comm )
189  CALL mp_gather( hc, column_section_type, recv_counts, displs, root_bgrp_id, inter_bgrp_comm )
190  !
191  IF ( uspp ) THEN
192     !
193     if (n_start .le. n_end) &
194     CALL ZGEMM( 'C','N', nbase, my_n, kdim, ONE, psi, kdmx, spsi(1,n_start), kdmx, &
195                 ZERO, sc(1,n_start), nvecx )
196     !
197  ELSE
198     !
199     if (n_start .le. n_end) &
200     CALL ZGEMM( 'C','N', nbase, my_n, kdim, ONE, psi, kdmx, psi(1,n_start), kdmx, &
201                 ZERO, sc(1,n_start), nvecx )
202     !
203  END IF
204  !
205  if (n_start .le. n_end) CALL mp_sum( sc( 1:nbase, n_start:n_end ), intra_bgrp_comm )
206  CALL mp_gather( sc, column_section_type, recv_counts, displs, root_bgrp_id, inter_bgrp_comm )
207  !
208  CALL mp_type_free( column_section_type )
209  !
210  DO n = 1, nbase
211     !
212     ! ... the diagonal of hc and sc must be strictly real
213     !
214     hc(n,n) = CMPLX( REAL( hc(n,n) ), 0.D0 ,kind=DP)
215     sc(n,n) = CMPLX( REAL( sc(n,n) ), 0.D0 ,kind=DP)
216     !
217     DO m = n + 1, nbase
218        !
219        hc(n,m) = CONJG( hc(m,n) )
220        sc(n,m) = CONJG( sc(m,n) )
221        !
222     END DO
223     !
224  END DO
225  !
226  CALL stop_clock( 'cegterg:init' )
227  !
228  IF ( lrot ) THEN
229     !
230     vc(1:nbase,1:nbase) = ZERO
231     !
232     DO n = 1, nbase
233        !
234        e(n) = REAL( hc(n,n) )
235        !
236        vc(n,n) = ONE
237        !
238     END DO
239     !
240     CALL mp_bcast( e, root_bgrp_id, inter_bgrp_comm )
241     !
242  ELSE
243     !
244     ! ... diagonalize the reduced hamiltonian
245     !
246     CALL start_clock( 'cegterg:diag' )
247     IF( my_bgrp_id == root_bgrp_id ) THEN
248        CALL diaghg( nbase, nvec, hc, sc, nvecx, ew, vc, me_bgrp, root_bgrp, intra_bgrp_comm )
249     END IF
250     IF( nbgrp > 1 ) THEN
251        CALL mp_bcast( vc, root_bgrp_id, inter_bgrp_comm )
252        CALL mp_bcast( ew, root_bgrp_id, inter_bgrp_comm )
253     ENDIF
254     CALL stop_clock( 'cegterg:diag' )
255     !
256     e(1:nvec) = ew(1:nvec)
257     !
258  END IF
259  !
260  ! ... iterate
261  !
262  iterate: DO kter = 1, maxter
263     !
264     dav_iter = kter ; !write(*,*) kter, notcnv, conv
265     !
266     CALL start_clock( 'cegterg:update' )
267     !
268     np = 0
269     !
270     DO n = 1, nvec
271        !
272        IF ( .NOT. conv(n) ) THEN
273           !
274           ! ... this root not yet converged ...
275           !
276           np = np + 1
277           !
278           ! ... reorder eigenvectors so that coefficients for unconverged
279           ! ... roots come first. This allows to use quick matrix-matrix
280           ! ... multiplications to set a new basis vector (see below)
281           !
282           IF ( np /= n ) vc(:,np) = vc(:,n)
283           !
284           ! ... for use in g_psi
285           !
286           ew(nbase+np) = e(n)
287           !
288        END IF
289        !
290     END DO
291     !
292     nb1 = nbase + 1
293     !
294     ! ... expand the basis set with new basis vectors ( H - e*S )|psi> ...
295     !
296     CALL divide(inter_bgrp_comm,nbase,n_start,n_end)
297     my_n = n_end - n_start + 1; !write (*,*) nbase,n_start,n_end
298     IF ( uspp ) THEN
299        !
300        if (n_start .le. n_end) &
301        CALL ZGEMM( 'N','N', kdim, notcnv, my_n, ONE, spsi(1,n_start), kdmx, vc(n_start,1), nvecx, &
302                    ZERO, psi(1,nb1), kdmx )
303        !
304     ELSE
305        !
306        if (n_start .le. n_end) &
307        CALL ZGEMM( 'N','N', kdim, notcnv, my_n, ONE, psi(1,n_start), kdmx, vc(n_start,1), nvecx, &
308                    ZERO, psi(1,nb1), kdmx )
309        !
310     END IF
311! NB: must not call mp_sum over inter_bgrp_comm here because it is done later to the full correction
312     !
313     !$omp parallel do collapse(3)
314     DO n = 1, notcnv
315        DO ipol = 1, npol
316           DO m = 1, numblock
317              psi( (m-1)*blocksize+(ipol-1)*npwx+1: &
318                    MIN(npw, m*blocksize)+(ipol-1)*npwx,nbase+n) = &
319                         - ew(nbase+n) * &
320              psi( (m-1)*blocksize+(ipol-1)*npwx+1: &
321                    MIN(npw, m*blocksize)+(ipol-1)*npwx,nbase+n)
322           END DO
323        END DO
324     END DO
325     !$omp end parallel do
326     !
327     if (n_start .le. n_end) &
328     CALL ZGEMM( 'N','N', kdim, notcnv, my_n, ONE, hpsi(1,n_start), kdmx, vc(n_start,1), nvecx, &
329                 ONE, psi(1,nb1), kdmx )
330     CALL mp_sum( psi(:,nb1:nbase+notcnv), inter_bgrp_comm )
331     !
332     ! clean up garbage if there is any
333     IF (npw < npwx) psi(npw+1:npwx,nb1:nbase+notcnv) = ZERO
334     IF (npol == 2)  psi(npwx+npw+1:2*npwx,nb1:nbase+notcnv) = ZERO
335     !
336     CALL stop_clock( 'cegterg:update' )
337     !
338     ! ... approximate inverse iteration
339     !
340     CALL g_psi( npwx, npw, notcnv, npol, psi(1,nb1), ew(nb1) )
341     !
342     ! ... "normalize" correction vectors psi(:,nb1:nbase+notcnv) in
343     ! ... order to improve numerical stability of subspace diagonalization
344     ! ... (cdiaghg) ew is used as work array :
345     !
346     ! ...         ew = <psi_i|psi_i>,  i = nbase + 1, nbase + notcnv
347     !
348     DO n = 1, notcnv
349        !
350        nbn = nbase + n
351        !
352        IF ( npol == 1 ) THEN
353           !
354           ew(n) = ddot( 2*npw, psi(1,nbn), 1, psi(1,nbn), 1 )
355           !
356        ELSE
357           !
358           ew(n) = ddot( 2*npw, psi(1,nbn), 1, psi(1,nbn), 1 ) + &
359                   ddot( 2*npw, psi(npwx+1,nbn), 1, psi(npwx+1,nbn), 1 )
360           !
361        END IF
362        !
363     END DO
364     !
365     CALL mp_sum( ew( 1:notcnv ), intra_bgrp_comm )
366     !
367     !$omp parallel do collapse(3)
368     DO n = 1, notcnv
369        DO ipol = 1, npol
370           DO m = 1, numblock
371              psi( (m-1)*blocksize+(ipol-1)*npwx+1: &
372                    MIN(npw, m*blocksize)+(ipol-1)*npwx,nbase+n) = &
373              psi( (m-1)*blocksize+(ipol-1)*npwx+1: &
374                    MIN(npw, m*blocksize)+(ipol-1)*npwx,nbase+n) / &
375                    SQRT( ew(n) )
376           END DO
377        END DO
378     END DO
379     !$omp end parallel do
380     !
381     ! ... here compute the hpsi and spsi of the new functions
382     !
383     CALL h_psi( npwx, npw, notcnv, psi(1,nb1), hpsi(1,nb1) ) ; nhpsi = nhpsi + notcnv
384     !
385     IF ( uspp ) CALL s_psi( npwx, npw, notcnv, psi(1,nb1), spsi(1,nb1) )
386     !
387     ! ... update the reduced hamiltonian
388     !
389     CALL start_clock( 'cegterg:overlap' )
390     !
391     CALL divide_all(inter_bgrp_comm,nbase+notcnv,n_start,n_end,recv_counts,displs)
392     CALL mp_type_create_column_section(sc(1,1), nbase, notcnv, nvecx, column_section_type)
393     my_n = n_end - n_start + 1; !write (*,*) nbase+notcnv,n_start,n_end
394     !
395     CALL ZGEMM( 'C','N', notcnv, my_n, kdim, ONE, hpsi(1,nb1), kdmx, psi(1,n_start), kdmx, &
396                 ZERO, hc(nb1,n_start), nvecx )
397     !
398     if (n_start .le. n_end) CALL mp_sum( hc( nb1:nbase+notcnv, n_start:n_end ), intra_bgrp_comm )
399     CALL mp_gather( hc, column_section_type, recv_counts, displs, root_bgrp_id, inter_bgrp_comm )
400     !
401     CALL divide(inter_bgrp_comm,nbase+notcnv,n_start,n_end)
402     my_n = n_end - n_start + 1; !write (*,*) nbase+notcnv,n_start,n_end
403     IF ( uspp ) THEN
404        !
405        CALL ZGEMM( 'C','N', notcnv, my_n, kdim, ONE, spsi(1,nb1), kdmx, psi(1,n_start), kdmx, &
406                    ZERO, sc(nb1,n_start), nvecx )
407        !
408     ELSE
409        !
410        CALL ZGEMM( 'C','N', notcnv, my_n, kdim, ONE, psi(1,nb1), kdmx, psi(1,n_start), kdmx, &
411                    ZERO, sc(nb1,n_start), nvecx )
412        !
413     END IF
414     !
415     if (n_start .le. n_end) CALL mp_sum( sc( nb1:nbase+notcnv, n_start:n_end ), intra_bgrp_comm )
416     CALL mp_gather( sc, column_section_type, recv_counts, displs, root_bgrp_id, inter_bgrp_comm )
417     !
418     CALL mp_type_free( column_section_type )
419     !
420     CALL stop_clock( 'cegterg:overlap' )
421     !
422     nbase = nbase + notcnv
423     !
424     DO n = 1, nbase
425        !
426        ! ... the diagonal of hc and sc must be strictly real
427        !
428        IF( n>=nb1 ) THEN
429           hc(n,n) = CMPLX( REAL( hc(n,n) ), 0.D0 ,kind=DP)
430           sc(n,n) = CMPLX( REAL( sc(n,n) ), 0.D0 ,kind=DP)
431        ENDIF
432        !
433        DO m = MAX(n+1,nb1), nbase
434           !
435           hc(n,m) = CONJG( hc(m,n) )
436           sc(n,m) = CONJG( sc(m,n) )
437           !
438        END DO
439        !
440     END DO
441     !
442     ! ... diagonalize the reduced hamiltonian
443     !
444     CALL start_clock( 'cegterg:diag' )
445     IF( my_bgrp_id == root_bgrp_id ) THEN
446        CALL diaghg( nbase, nvec, hc, sc, nvecx, ew, vc, me_bgrp, root_bgrp, intra_bgrp_comm )
447     END IF
448     IF( nbgrp > 1 ) THEN
449        CALL mp_bcast( vc, root_bgrp_id, inter_bgrp_comm )
450        CALL mp_bcast( ew, root_bgrp_id, inter_bgrp_comm )
451     ENDIF
452     CALL stop_clock( 'cegterg:diag' )
453     !
454     ! ... test for convergence
455     !
456     WHERE( btype(1:nvec) == 1 )
457        !
458        conv(1:nvec) = ( ( ABS( ew(1:nvec) - e(1:nvec) ) < ethr ) )
459        !
460     ELSEWHERE
461        !
462        conv(1:nvec) = ( ( ABS( ew(1:nvec) - e(1:nvec) ) < empty_ethr ) )
463        !
464     END WHERE
465     ! ... next line useful for band parallelization of exact exchange
466     IF ( nbgrp > 1 ) CALL mp_bcast(conv,root_bgrp_id,inter_bgrp_comm)
467     !
468     notcnv = COUNT( .NOT. conv(:) )
469     !
470     e(1:nvec) = ew(1:nvec)
471     !
472     ! ... if overall convergence has been achieved, or the dimension of
473     ! ... the reduced basis set is becoming too large, or in any case if
474     ! ... we are at the last iteration refresh the basis set. i.e. replace
475     ! ... the first nvec elements with the current estimate of the
476     ! ... eigenvectors;  set the basis dimension to nvec.
477     !
478     IF ( notcnv == 0 .OR. &
479          nbase+notcnv > nvecx .OR. dav_iter == maxter ) THEN
480        !
481        CALL start_clock( 'cegterg:last' )
482        !
483        CALL divide(inter_bgrp_comm,nbase,n_start,n_end)
484        my_n = n_end - n_start + 1; !write (*,*) nbase,n_start,n_end
485        CALL ZGEMM( 'N','N', kdim, nvec, my_n, ONE, psi(1,n_start), kdmx, vc(n_start,1), nvecx, &
486                    ZERO, evc, kdmx )
487        CALL mp_sum( evc, inter_bgrp_comm )
488        !
489        IF ( notcnv == 0 ) THEN
490           !
491           ! ... all roots converged: return
492           !
493           CALL stop_clock( 'cegterg:last' )
494           !
495           EXIT iterate
496           !
497        ELSE IF ( dav_iter == maxter ) THEN
498           !
499           ! ... last iteration, some roots not converged: return
500           !
501           !!!WRITE( stdout, '(5X,"WARNING: ",I5, &
502           !!!     &   " eigenvalues not converged")' ) notcnv
503           !
504           CALL stop_clock( 'cegterg:last' )
505           !
506           EXIT iterate
507           !
508        END IF
509        !
510        ! ... refresh psi, H*psi and S*psi
511        !
512        CALL threaded_memcpy(psi, evc, nvec*npol*npwx*2)
513        !
514        IF ( uspp ) THEN
515           !
516           CALL ZGEMM( 'N','N', kdim, nvec, my_n, ONE, spsi(1,n_start), kdmx, vc(n_start,1), nvecx, &
517                       ZERO, psi(1,nvec+1), kdmx)
518           CALL threaded_memcpy(spsi, psi(1,nvec+1), nvec*npol*npwx*2)
519           CALL mp_sum( spsi(:,1:nvec), inter_bgrp_comm )
520           !
521        END IF
522        !
523        CALL ZGEMM( 'N','N', kdim, nvec, my_n, ONE, hpsi(1,n_start), kdmx, vc(n_start,1), nvecx, &
524                    ZERO, psi(1,nvec+1), kdmx )
525        CALL threaded_memcpy(hpsi, psi(1,nvec+1), nvec*npol*npwx*2)
526        CALL mp_sum( hpsi(:,1:nvec), inter_bgrp_comm )
527        !
528        ! ... refresh the reduced hamiltonian
529        !
530        nbase = nvec
531        !
532        hc(1:nbase,1:nbase) = ZERO
533        sc(1:nbase,1:nbase) = ZERO
534        vc(1:nbase,1:nbase) = ZERO
535        !
536        DO n = 1, nbase
537           !
538           hc(n,n) = CMPLX( e(n), 0.0_DP ,kind=DP)
539           !
540           sc(n,n) = ONE
541           vc(n,n) = ONE
542           !
543        END DO
544        !
545        CALL stop_clock( 'cegterg:last' )
546        !
547     END IF
548     !
549  END DO iterate
550  !
551  DEALLOCATE( recv_counts )
552  DEALLOCATE( displs )
553  DEALLOCATE( conv )
554  DEALLOCATE( ew )
555  DEALLOCATE( vc )
556  DEALLOCATE( hc )
557  DEALLOCATE( sc )
558  !
559  IF ( uspp ) DEALLOCATE( spsi )
560  !
561  DEALLOCATE( hpsi )
562  DEALLOCATE( psi )
563  !
564  CALL stop_clock( 'cegterg' ); !write(*,*) 'stop cegterg' ; FLUSH(6)
565  !call print_clock( 'cegterg' )
566  !call print_clock( 'cegterg:init' )
567  !call print_clock( 'cegterg:diag' )
568  !call print_clock( 'cegterg:update' )
569  !call print_clock( 'cegterg:overlap' )
570  !call print_clock( 'cegterg:last' )
571  !
572  RETURN
573  !
574END SUBROUTINE cegterg
575
576!
577!  Subroutine with distributed matrixes
578!  (written by Carlo Cavazzoni)
579!
580!----------------------------------------------------------------------------
581SUBROUTINE pcegterg(h_psi, s_psi, uspp, g_psi, &
582                    npw, npwx, nvec, nvecx, npol, evc, ethr, &
583                    e, btype, notcnv, lrot, dav_iter, nhpsi )
584  !----------------------------------------------------------------------------
585  !
586  ! ... iterative solution of the eigenvalue problem:
587  !
588  ! ... ( H - e S ) * evc = 0
589  !
590  ! ... where H is an hermitean operator, e is a real scalar,
591  ! ... S is an uspp matrix, evc is a complex vector
592  !
593  USE util_param,       ONLY : DP, stdout
594  USE mp_bands_util,    ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, nbgrp, my_bgrp_id
595  USE mp,               ONLY : mp_bcast, mp_root_sum, mp_sum, mp_barrier, &
596                               mp_size, mp_type_free, mp_allgather
597  !
598  IMPLICIT NONE
599  !
600  include 'laxlib.fh'
601  !
602  INTEGER, INTENT(IN) :: npw, npwx, nvec, nvecx, npol
603    ! dimension of the matrix to be diagonalized
604    ! leading dimension of matrix evc, as declared in the calling pgm unit
605    ! integer number of searched low-lying roots
606    ! maximum dimension of the reduced basis set
607    !    (the basis set is refreshed when its dimension would exceed nvecx)
608    ! number of spin polarizations
609  INTEGER, PARAMETER :: blocksize = 256
610  INTEGER :: numblock
611    ! chunking parameters
612  COMPLEX(DP), INTENT(INOUT) :: evc(npwx*npol,nvec)
613    !  evc   contains the  refined estimates of the eigenvectors
614  REAL(DP), INTENT(IN) :: ethr
615    ! energy threshold for convergence: root improvement is stopped,
616    ! when two consecutive estimates of the root differ by less than ethr.
617  LOGICAL, INTENT(IN) :: uspp
618    ! if .FALSE. : S|psi> not needed
619  INTEGER, INTENT(IN) :: btype(nvec)
620    ! band type ( 1 = occupied, 0 = empty )
621  LOGICAL, INTENT(IN) :: lrot
622    ! .TRUE. if the wfc have already been rotated
623  REAL(DP), INTENT(OUT) :: e(nvec)
624    ! contains the estimated roots.
625  INTEGER, INTENT(OUT) :: dav_iter, notcnv
626    ! integer  number of iterations performed
627    ! number of unconverged roots
628  INTEGER, INTENT(OUT) :: nhpsi
629    ! total number of indivitual hpsi
630  !
631  ! ... LOCAL variables
632  !
633  INTEGER, PARAMETER :: maxter = 20
634    ! maximum number of iterations
635  !
636  INTEGER :: kter, nbase, np, kdim, kdmx, n, m, ipol, nb1, nbn
637    ! counter on iterations
638    ! dimension of the reduced basis
639    ! counter on the reduced basis vectors
640    ! do-loop counters
641  INTEGER :: ierr
642  REAL(DP), ALLOCATABLE :: ew(:)
643  COMPLEX(DP), ALLOCATABLE :: hl(:,:), sl(:,:), vl(:,:)
644    ! Hamiltonian on the reduced basis
645    ! S matrix on the reduced basis
646    ! eigenvectors of the Hamiltonian
647    ! eigenvalues of the reduced hamiltonian
648  COMPLEX(DP), ALLOCATABLE :: psi(:,:), hpsi(:,:), spsi(:,:)
649    ! work space, contains psi
650    ! the product of H and psi
651    ! the product of S and psi
652  LOGICAL, ALLOCATABLE :: conv(:)
653    ! true if the root is converged
654  REAL(DP) :: empty_ethr
655    ! threshold for empty bands
656  INTEGER :: idesc(LAX_DESC_SIZE), idesc_old(LAX_DESC_SIZE)
657  INTEGER, ALLOCATABLE :: irc_ip( : )
658  INTEGER, ALLOCATABLE :: nrc_ip( : )
659  INTEGER, ALLOCATABLE :: rank_ip( :, : )
660    ! matrix distribution descriptors
661  INTEGER :: nx
662    ! maximum local block dimension
663  LOGICAL :: la_proc
664    ! flag to distinguish procs involved in linear algebra
665  INTEGER, ALLOCATABLE :: notcnv_ip( : )
666  INTEGER, ALLOCATABLE :: ic_notcnv( : )
667  !
668  INTEGER :: np_ortho(2), ortho_parent_comm
669  LOGICAL :: do_distr_diag_inside_bgrp
670  !
671  REAL(DP), EXTERNAL :: ddot
672  !
673  EXTERNAL  h_psi, s_psi, g_psi
674    ! h_psi(npwx,npw,nvec,psi,hpsi)
675    !     calculates H|psi>
676    ! s_psi(npwx,npw,nvec,psi,spsi)
677    !     calculates S|psi> (if needed)
678    !     Vectors psi,hpsi,spsi are dimensioned (npwx,nvec)
679    ! g_psi(npwx,npw,notcnv,psi,e)
680    !    calculates (diag(h)-e)^-1 * psi, diagonal approx. to (h-e)^-1*psi
681    !    the first nvec columns contain the trial eigenvectors
682  !
683  nhpsi = 0
684  CALL start_clock( 'cegterg' )
685  !
686  CALL laxlib_getval( np_ortho = np_ortho, ortho_parent_comm = ortho_parent_comm, &
687    do_distr_diag_inside_bgrp = do_distr_diag_inside_bgrp )
688  !
689  IF ( nvec > nvecx / 2 ) CALL errore( 'pcegterg', 'nvecx is too small', 1 )
690  !
691  ! ... threshold for empty bands
692  !
693  empty_ethr = MAX( ( ethr * 5.D0 ), 1.D-5 )
694  !
695  IF ( npol == 1 ) THEN
696     !
697     kdim = npw
698     kdmx = npwx
699     !
700  ELSE
701     !
702     kdim = npwx*npol
703     kdmx = npwx*npol
704     !
705  END IF
706  !
707  ! compute the number of chuncks
708  numblock  = (npw+blocksize-1)/blocksize
709  !
710  ALLOCATE(  psi( npwx*npol, nvecx ), STAT=ierr )
711  IF( ierr /= 0 ) &
712     CALL errore( ' pcegterg ',' cannot allocate psi ', ABS(ierr) )
713  !
714  ALLOCATE( hpsi( npwx*npol, nvecx ), STAT=ierr )
715  IF( ierr /= 0 ) &
716     CALL errore( ' pcegterg ',' cannot allocate hpsi ', ABS(ierr) )
717  !
718  IF ( uspp ) THEN
719     ALLOCATE( spsi( npwx*npol, nvecx ), STAT=ierr )
720     IF( ierr /= 0 ) &
721        CALL errore( ' pcegterg ',' cannot allocate spsi ', ABS(ierr) )
722  END IF
723  !
724  ! ... Initialize the matrix descriptor
725  !
726  ALLOCATE( ic_notcnv( np_ortho(2) ), STAT=ierr )
727  IF( ierr /= 0 ) &
728     CALL errore( ' pcegterg ',' cannot allocate ic_notcnv ', ABS(ierr) )
729  !
730  ALLOCATE( notcnv_ip( np_ortho(2) ), STAT=ierr )
731  IF( ierr /= 0 ) &
732     CALL errore( ' pcegterg ',' cannot allocate notcnv_ip ', ABS(ierr) )
733  !
734  CALL desc_init( nvec, nx, la_proc, idesc, rank_ip, irc_ip, nrc_ip )
735  !
736  IF( la_proc ) THEN
737     !
738     ! only procs involved in the diagonalization need to allocate local
739     ! matrix block.
740     !
741     ALLOCATE( vl( nx , nx ), STAT=ierr )
742     IF( ierr /= 0 ) &
743        CALL errore( ' pcegterg ',' cannot allocate vl ', ABS(ierr) )
744     !
745     ALLOCATE( sl( nx , nx ), STAT=ierr )
746     IF( ierr /= 0 ) &
747        CALL errore( ' pcegterg ',' cannot allocate sl ', ABS(ierr) )
748     !
749     ALLOCATE( hl( nx , nx ), STAT=ierr )
750     IF( ierr /= 0 ) &
751        CALL errore( ' pcegterg ',' cannot allocate hl ', ABS(ierr) )
752     !
753  ELSE
754     !
755     ALLOCATE( vl( 1 , 1 ), STAT=ierr )
756     IF( ierr /= 0 ) &
757        CALL errore( ' pcegterg ',' cannot allocate vl ', ABS(ierr) )
758     !
759     ALLOCATE( sl( 1 , 1 ), STAT=ierr )
760     IF( ierr /= 0 ) &
761        CALL errore( ' pcegterg ',' cannot allocate sl ', ABS(ierr) )
762     !
763     ALLOCATE( hl( 1 , 1 ), STAT=ierr )
764     IF( ierr /= 0 ) &
765        CALL errore( ' pcegterg ',' cannot allocate hl ', ABS(ierr) )
766     !
767  END IF
768  !
769  ALLOCATE( ew( nvecx ), STAT=ierr )
770  IF( ierr /= 0 ) &
771     CALL errore( ' pcegterg ',' cannot allocate ew ', ABS(ierr) )
772  !
773  ALLOCATE( conv( nvec ), STAT=ierr )
774  IF( ierr /= 0 ) &
775     CALL errore( ' pcegterg ',' cannot allocate conv ', ABS(ierr) )
776  !
777  notcnv = nvec
778  nbase  = nvec
779  conv   = .FALSE.
780  !
781  CALL threaded_memcpy(psi, evc, nvec*npol*npwx*2)
782  !
783  ! ... hpsi contains h times the basis vectors
784  !
785  CALL h_psi( npwx, npw, nvec, psi, hpsi ) ; nhpsi = nhpsi + nvec
786  !
787  IF ( uspp ) CALL s_psi( npwx, npw, nvec, psi, spsi )
788  !
789  ! ... hl contains the projection of the hamiltonian onto the reduced
790  ! ... space, vl contains the eigenvectors of hl. Remember hl, vl and sl
791  ! ... are all distributed across processors, global replicated matrixes
792  ! ... here are never allocated
793  !
794  CALL start_clock( 'cegterg:init' )
795
796  CALL compute_distmat( hl, psi, hpsi )
797  !
798  IF ( uspp ) THEN
799     !
800     CALL compute_distmat( sl, psi, spsi )
801     !
802  ELSE
803     !
804     CALL compute_distmat( sl, psi, psi )
805     !
806  END IF
807  CALL stop_clock( 'cegterg:init' )
808  !
809  IF ( lrot ) THEN
810     !
811     CALL set_e_from_h()
812     !
813     CALL set_to_identity( vl, idesc )
814     !
815  ELSE
816     !
817     ! ... diagonalize the reduced hamiltonian
818     !     Calling block parallel algorithm
819     !
820     CALL start_clock( 'cegterg:diag' )
821     IF ( do_distr_diag_inside_bgrp ) THEN ! NB on output of pdiaghg ew and vl are the same across ortho_parent_comm
822        ! only the first bgrp performs the diagonalization
823        IF( my_bgrp_id == root_bgrp_id ) CALL pdiaghg( nbase, hl, sl, nx, ew, vl, idesc )
824        IF( nbgrp > 1 ) THEN ! results must be brodcast to the other band groups
825           CALL mp_bcast( vl, root_bgrp_id, inter_bgrp_comm )
826           CALL mp_bcast( ew, root_bgrp_id, inter_bgrp_comm )
827        ENDIF
828     ELSE
829        CALL pdiaghg( nbase, hl, sl, nx, ew, vl, idesc )
830     END IF
831     CALL stop_clock( 'cegterg:diag' )
832     !
833     e(1:nvec) = ew(1:nvec)
834     !
835  END IF
836  !
837  ! ... iterate
838  !
839  iterate: DO kter = 1, maxter
840     !
841     dav_iter = kter ; !write(*,*) kter, notcnv, conv
842     !
843     CALL start_clock( 'cegterg:update' )
844     !
845     CALL reorder_v()
846     !
847     nb1 = nbase + 1
848     !
849     ! ... expand the basis set with new basis vectors ( H - e*S )|psi> ...
850     !
851     CALL hpsi_dot_v()
852     !
853     CALL stop_clock( 'cegterg:update' )
854     !
855     ! ... approximate inverse iteration
856     !
857     CALL g_psi( npwx, npw, notcnv, npol, psi(1,nb1), ew(nb1) )
858     !
859     ! ... "normalize" correction vectors psi(:,nb1:nbase+notcnv) in
860     ! ... order to improve numerical stability of subspace diagonalization
861     ! ... (cdiaghg) ew is used as work array :
862     !
863     ! ...         ew = <psi_i|psi_i>,  i = nbase + 1, nbase + notcnv
864     !
865     DO n = 1, notcnv
866        !
867        nbn = nbase + n
868        !
869        IF ( npol == 1 ) THEN
870           !
871           ew(n) = ddot( 2*npw, psi(1,nbn), 1, psi(1,nbn), 1 )
872           !
873        ELSE
874           !
875           ew(n) = ddot( 2*npw, psi(1,nbn), 1, psi(1,nbn), 1 ) + &
876                   ddot( 2*npw, psi(npwx+1,nbn), 1, psi(npwx+1,nbn), 1 )
877           !
878        END IF
879        !
880     END DO
881     !
882     CALL mp_sum( ew( 1:notcnv ), intra_bgrp_comm )
883     !
884     !$omp parallel do collapse(3)
885     DO n = 1, notcnv
886        DO ipol = 1, npol
887           DO m = 1, numblock
888              psi( (m-1)*blocksize+(ipol-1)*npwx+1: &
889                    MIN(npw, m*blocksize)+(ipol-1)*npwx,nbase+n) = &
890              psi( (m-1)*blocksize+(ipol-1)*npwx+1: &
891                    MIN(npw, m*blocksize)+(ipol-1)*npwx,nbase+n) / &
892                    SQRT( ew(n) )
893           END DO
894        END DO
895     END DO
896     !$omp end parallel do
897     !
898     ! ... here compute the hpsi and spsi of the new functions
899     !
900     CALL h_psi( npwx, npw, notcnv, psi(1,nb1), hpsi(1,nb1) ) ; nhpsi = nhpsi + notcnv
901     !
902     IF ( uspp ) CALL s_psi( npwx, npw, notcnv, psi(1,nb1), spsi(1,nb1) )
903     !
904     ! ... update the reduced hamiltonian
905     !
906     CALL start_clock( 'cegterg:overlap' )
907     !
908     ! we need to save the old descriptor in order to redistribute matrices
909     !
910     idesc_old = idesc
911     !
912     ! ... RE-Initialize the matrix descriptor
913     !
914     CALL desc_init( nbase+notcnv, nx, la_proc, idesc, rank_ip, irc_ip, nrc_ip )
915     !
916     IF( la_proc ) THEN
917
918        !  redistribute hl and sl (see dsqmred), since the dimension of the subspace has changed
919        !
920        vl = hl
921        DEALLOCATE( hl )
922        ALLOCATE( hl( nx , nx ), STAT=ierr )
923        IF( ierr /= 0 ) &
924           CALL errore( ' pcegterg ',' cannot allocate hl ', ABS(ierr) )
925
926        CALL laxlib_zsqmred( nbase, vl, idesc_old(LAX_DESC_NRCX), idesc_old, nbase+notcnv, hl, nx, idesc )
927
928        vl = sl
929        DEALLOCATE( sl )
930        ALLOCATE( sl( nx , nx ), STAT=ierr )
931        IF( ierr /= 0 ) &
932           CALL errore( ' pcegterg ',' cannot allocate sl ', ABS(ierr) )
933
934        CALL laxlib_zsqmred( nbase, vl, idesc_old(LAX_DESC_NRCX), idesc_old, nbase+notcnv, sl, nx, idesc )
935
936        DEALLOCATE( vl )
937        ALLOCATE( vl( nx , nx ), STAT=ierr )
938        IF( ierr /= 0 ) &
939           CALL errore( ' pcegterg ',' cannot allocate vl ', ABS(ierr) )
940
941     END IF
942     !
943     !
944     CALL update_distmat( hl, psi, hpsi )
945     !
946     IF ( uspp ) THEN
947        !
948        CALL update_distmat( sl, psi, spsi )
949        !
950     ELSE
951        !
952        CALL update_distmat( sl, psi, psi )
953        !
954     END IF
955     !
956     CALL stop_clock( 'cegterg:overlap' )
957     !
958     nbase = nbase + notcnv
959     !
960     ! ... diagonalize the reduced hamiltonian
961     !     Call block parallel algorithm
962     !
963     CALL start_clock( 'cegterg:diag' )
964     IF ( do_distr_diag_inside_bgrp ) THEN ! NB on output of pdiaghg ew and vl are the same across ortho_parent_comm
965        ! only the first bgrp performs the diagonalization
966        IF( my_bgrp_id == root_bgrp_id ) CALL pdiaghg( nbase, hl, sl, nx, ew, vl, idesc )
967        IF( nbgrp > 1 ) THEN ! results must be brodcast to the other band groups
968           CALL mp_bcast( vl, root_bgrp_id, inter_bgrp_comm )
969           CALL mp_bcast( ew, root_bgrp_id, inter_bgrp_comm )
970        ENDIF
971     ELSE
972        CALL pdiaghg( nbase, hl, sl, nx, ew, vl, idesc )
973     END IF
974     CALL stop_clock( 'cegterg:diag' )
975     !
976     ! ... test for convergence
977     !
978     WHERE( btype(1:nvec) == 1 )
979        !
980        conv(1:nvec) = ( ( ABS( ew(1:nvec) - e(1:nvec) ) < ethr ) )
981        !
982     ELSEWHERE
983        !
984        conv(1:nvec) = ( ( ABS( ew(1:nvec) - e(1:nvec) ) < empty_ethr ) )
985        !
986     END WHERE
987     ! ... next line useful for band parallelization of exact exchange
988     IF ( nbgrp > 1 ) CALL mp_bcast(conv,root_bgrp_id,inter_bgrp_comm)
989     !
990     notcnv = COUNT( .NOT. conv(:) )
991     !
992     e(1:nvec) = ew(1:nvec)
993     !
994     ! ... if overall convergence has been achieved, or the dimension of
995     ! ... the reduced basis set is becoming too large, or in any case if
996     ! ... we are at the last iteration refresh the basis set. i.e. replace
997     ! ... the first nvec elements with the current estimate of the
998     ! ... eigenvectors;  set the basis dimension to nvec.
999     !
1000     IF ( notcnv == 0 .OR. nbase+notcnv > nvecx .OR. dav_iter == maxter ) THEN
1001        !
1002        CALL start_clock( 'cegterg:last' )
1003        !
1004        CALL refresh_evc()
1005        !
1006        IF ( notcnv == 0 ) THEN
1007           !
1008           ! ... all roots converged: return
1009           !
1010           CALL stop_clock( 'cegterg:last' )
1011           !
1012           EXIT iterate
1013           !
1014        ELSE IF ( dav_iter == maxter ) THEN
1015           !
1016           ! ... last iteration, some roots not converged: return
1017           !
1018           !!!WRITE( stdout, '(5X,"WARNING: ",I5, &
1019           !!!     &   " eigenvalues not converged")' ) notcnv
1020           !
1021           CALL stop_clock( 'cegterg:last' )
1022           !
1023           EXIT iterate
1024           !
1025        END IF
1026        !
1027        ! ... refresh psi, H*psi and S*psi
1028        !
1029        CALL threaded_memcpy(psi, evc, nvec*npol*npwx*2)
1030        !
1031        IF ( uspp ) THEN
1032           !
1033           CALL refresh_spsi()
1034           !
1035        END IF
1036        !
1037        CALL refresh_hpsi()
1038        !
1039        ! ... refresh the reduced hamiltonian
1040        !
1041        nbase = nvec
1042        !
1043        CALL desc_init( nvec, nx, la_proc, idesc, rank_ip, irc_ip, nrc_ip )
1044        !
1045        IF( la_proc ) THEN
1046           !
1047           ! note that nx has been changed by desc_init
1048           ! we need to re-alloc with the new size.
1049           !
1050           DEALLOCATE( vl, hl, sl )
1051           ALLOCATE( vl( nx, nx ), STAT=ierr )
1052           IF( ierr /= 0 ) &
1053              CALL errore( ' pcegterg ',' cannot allocate vl ', ABS(ierr) )
1054           ALLOCATE( hl( nx, nx ), STAT=ierr )
1055           IF( ierr /= 0 ) &
1056              CALL errore( ' pcegterg ',' cannot allocate hl ', ABS(ierr) )
1057           ALLOCATE( sl( nx, nx ), STAT=ierr )
1058           IF( ierr /= 0 ) &
1059              CALL errore( ' pcegterg ',' cannot allocate sl ', ABS(ierr) )
1060           !
1061        END IF
1062        !
1063        CALL set_h_from_e( )
1064        !
1065        CALL set_to_identity( vl, idesc )
1066        CALL set_to_identity( sl, idesc )
1067        !
1068        CALL stop_clock( 'cegterg:last' )
1069        !
1070     END IF
1071     !
1072  END DO iterate
1073  !
1074  DEALLOCATE( vl, hl, sl )
1075  !
1076  DEALLOCATE( rank_ip )
1077  DEALLOCATE( irc_ip )
1078  DEALLOCATE( nrc_ip )
1079  DEALLOCATE( ic_notcnv )
1080  DEALLOCATE( notcnv_ip )
1081  DEALLOCATE( conv )
1082  DEALLOCATE( ew )
1083  !
1084  IF ( uspp ) DEALLOCATE( spsi )
1085  !
1086  DEALLOCATE( hpsi )
1087  DEALLOCATE( psi )
1088  !
1089  CALL stop_clock( 'cegterg' )
1090  !call print_clock( 'cegterg' )
1091  !call print_clock( 'cegterg:init' )
1092  !call print_clock( 'cegterg:diag' )
1093  !call print_clock( 'cegterg:update' )
1094  !call print_clock( 'cegterg:overlap' )
1095  !call print_clock( 'cegterg:last' )
1096  !
1097  RETURN
1098  !
1099  !
1100CONTAINS
1101  !
1102  SUBROUTINE set_to_identity( distmat, idesc )
1103     INTEGER, INTENT(IN)  :: idesc(LAX_DESC_SIZE)
1104     COMPLEX(DP), INTENT(OUT) :: distmat(:,:)
1105     INTEGER :: i
1106     distmat = ( 0_DP , 0_DP )
1107     IF( idesc(LAX_DESC_MYC) == idesc(LAX_DESC_MYR) .AND. idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
1108        DO i = 1, idesc(LAX_DESC_NC)
1109           distmat( i, i ) = ( 1_DP , 0_DP )
1110        END DO
1111     END IF
1112     RETURN
1113  END SUBROUTINE set_to_identity
1114  !
1115  !
1116  SUBROUTINE reorder_v()
1117     !
1118     INTEGER :: ipc
1119     INTEGER :: nc, ic
1120     INTEGER :: nl, npl
1121     !
1122     np = 0
1123     !
1124     notcnv_ip = 0
1125     !
1126     n = 0
1127     !
1128     DO ipc = 1, idesc(LAX_DESC_NPC)
1129        !
1130        nc = nrc_ip( ipc )
1131        ic = irc_ip( ipc )
1132        !
1133        npl = 0
1134        !
1135        IF( ic <= nvec ) THEN
1136           !
1137           DO nl = 1, min( nvec - ic + 1, nc )
1138              !
1139              n  = n  + 1
1140              !
1141              IF ( .NOT. conv(n) ) THEN
1142                 !
1143                 ! ... this root not yet converged ...
1144                 !
1145                 np  = np  + 1
1146                 npl = npl + 1
1147                 IF( npl == 1 ) ic_notcnv( ipc ) = np
1148                 !
1149                 ! ... reorder eigenvectors so that coefficients for unconverged
1150                 ! ... roots come first. This allows to use quick matrix-matrix
1151                 ! ... multiplications to set a new basis vector (see below)
1152                 !
1153                 notcnv_ip( ipc ) = notcnv_ip( ipc ) + 1
1154                 !
1155                 IF ( npl /= nl ) THEN
1156                    IF( la_proc .AND. idesc(LAX_DESC_MYC) == ipc-1 ) THEN
1157                       vl( :, npl) = vl( :, nl )
1158                    END IF
1159                 END IF
1160                 !
1161                 ! ... for use in g_psi
1162                 !
1163                 ew(nbase+np) = e(n)
1164                 !
1165              END IF
1166              !
1167           END DO
1168           !
1169        END IF
1170        !
1171     END DO
1172     !
1173  END SUBROUTINE reorder_v
1174  !
1175  !
1176  SUBROUTINE hpsi_dot_v()
1177     !
1178     INTEGER :: ipc, ipr
1179     INTEGER :: nr, ir, ic, notcl, root, np, ipol, ib
1180     COMPLEX(DP), ALLOCATABLE :: vtmp( :, : )
1181     COMPLEX(DP), ALLOCATABLE :: ptmp( :, : )
1182     COMPLEX(DP) :: beta
1183
1184     ALLOCATE( vtmp( nx, nx ) )
1185     ALLOCATE( ptmp( npwx*npol, nx ) )
1186
1187     DO ipc = 1, idesc(LAX_DESC_NPC)
1188        !
1189        IF( notcnv_ip( ipc ) > 0 ) THEN
1190
1191           notcl = notcnv_ip( ipc )
1192           ic    = ic_notcnv( ipc )
1193
1194           beta = ZERO
1195
1196           DO ipr = 1, idesc(LAX_DESC_NPR)
1197              !
1198              nr = nrc_ip( ipr )
1199              ir = irc_ip( ipr )
1200              !
1201              root = rank_ip( ipr, ipc )
1202
1203              IF( ipr-1 == idesc(LAX_DESC_MYR) .AND. ipc-1 == idesc(LAX_DESC_MYC) .AND. la_proc ) THEN
1204                 vtmp(:,1:notcl) = vl(:,1:notcl)
1205              END IF
1206
1207              CALL mp_bcast( vtmp(:,1:notcl), root, ortho_parent_comm )
1208              !
1209              IF ( uspp ) THEN
1210                 !
1211                 CALL ZGEMM( 'N', 'N', kdim, notcl, nr, ONE, &
1212                    spsi(1, ir), kdmx, vtmp, nx, beta, psi(1,nb1+ic-1), kdmx )
1213                 !
1214              ELSE
1215                 !
1216                 CALL ZGEMM( 'N', 'N', kdim, notcl, nr, ONE, &
1217                    psi(1, ir), kdmx, vtmp, nx, beta, psi(1,nb1+ic-1), kdmx )
1218                 !
1219              END IF
1220              !
1221              CALL ZGEMM( 'N', 'N', kdim, notcl, nr, ONE, &
1222                      hpsi(1, ir), kdmx, vtmp, nx, beta, ptmp, kdmx )
1223
1224              beta = ONE
1225
1226           END DO
1227
1228           !$omp parallel do collapse(3)
1229           DO np = 1, notcl
1230              DO ipol = 1, npol
1231                 DO ib = 1, numblock
1232                    !
1233                    psi( (ib-1)*blocksize+(ipol-1)*npwx+1: &
1234                         MIN(npw, ib*blocksize)+(ipol-1)*npwx,nbase+np+ic-1) = &
1235                    ptmp((ib-1)*blocksize+(ipol-1)*npwx+1: &
1236                         MIN(npw, ib*blocksize)+(ipol-1)*npwx,np) - &
1237                    ew(nbase+np+ic-1) * psi((ib-1)*blocksize+(ipol-1)*npwx+1:&
1238                       MIN(npw, ib*blocksize)+(ipol-1)*npwx,nbase+np+ic-1)
1239                    !
1240                 END DO
1241              END DO
1242           END DO
1243           !$omp end parallel do
1244           !
1245           ! clean up garbage if there is any
1246           IF (npw < npwx) psi(npw+1:npwx,nbase+ic:nbase+notcl+ic-1) = ZERO
1247           IF (npol == 2)  psi(npwx+npw+1:2*npwx,nbase+ic:nbase+notcl+ic-1) = ZERO
1248           !
1249        END IF
1250        !
1251     END DO
1252
1253     DEALLOCATE( vtmp )
1254     DEALLOCATE( ptmp )
1255
1256     RETURN
1257  END SUBROUTINE hpsi_dot_v
1258  !
1259  !
1260  SUBROUTINE refresh_evc( )
1261     !
1262     INTEGER :: ipc, ipr
1263     INTEGER :: nr, nc, ir, ic, root
1264     COMPLEX(DP), ALLOCATABLE :: vtmp( :, : )
1265     COMPLEX(DP) :: beta
1266
1267     ALLOCATE( vtmp( nx, nx ) )
1268     !
1269     DO ipc = 1, idesc(LAX_DESC_NPC)
1270        !
1271        nc = nrc_ip( ipc )
1272        ic = irc_ip( ipc )
1273        !
1274        IF( ic <= nvec ) THEN
1275           !
1276           nc = min( nc, nvec - ic + 1 )
1277           !
1278           beta = ZERO
1279
1280           DO ipr = 1, idesc(LAX_DESC_NPR)
1281              !
1282              nr = nrc_ip( ipr )
1283              ir = irc_ip( ipr )
1284              !
1285              root = rank_ip( ipr, ipc )
1286
1287              IF( ipr-1 == idesc(LAX_DESC_MYR) .AND. ipc-1 == idesc(LAX_DESC_MYC) .AND. la_proc ) THEN
1288                 !
1289                 !  this proc sends his block
1290                 !
1291                 CALL mp_bcast( vl(:,1:nc), root, ortho_parent_comm )
1292                 CALL ZGEMM( 'N', 'N', kdim, nc, nr, ONE, &
1293                          psi(1,ir), kdmx, vl, nx, beta, evc(1,ic), kdmx )
1294              ELSE
1295                 !
1296                 !  all other procs receive
1297                 !
1298                 CALL mp_bcast( vtmp(:,1:nc), root, ortho_parent_comm )
1299                 CALL ZGEMM( 'N', 'N', kdim, nc, nr, ONE, &
1300                          psi(1,ir), kdmx, vtmp, nx, beta, evc(1,ic), kdmx )
1301              END IF
1302              !
1303
1304              beta = ONE
1305
1306           END DO
1307           !
1308        END IF
1309        !
1310     END DO
1311     !
1312     DEALLOCATE( vtmp )
1313
1314     RETURN
1315  END SUBROUTINE refresh_evc
1316  !
1317  !
1318  SUBROUTINE refresh_spsi( )
1319     !
1320     INTEGER :: ipc, ipr
1321     INTEGER :: nr, nc, ir, ic, root
1322     COMPLEX(DP), ALLOCATABLE :: vtmp( :, : )
1323     COMPLEX(DP) :: beta
1324
1325     ALLOCATE( vtmp( nx, nx ) )
1326     !
1327     DO ipc = 1, idesc(LAX_DESC_NPC)
1328        !
1329        nc = nrc_ip( ipc )
1330        ic = irc_ip( ipc )
1331        !
1332        IF( ic <= nvec ) THEN
1333           !
1334           nc = min( nc, nvec - ic + 1 )
1335           !
1336           beta = ZERO
1337           !
1338           DO ipr = 1, idesc(LAX_DESC_NPR)
1339              !
1340              nr = nrc_ip( ipr )
1341              ir = irc_ip( ipr )
1342              !
1343              root = rank_ip( ipr, ipc )
1344
1345              IF( ipr-1 == idesc(LAX_DESC_MYR) .AND. ipc-1 == idesc(LAX_DESC_MYC) .AND. la_proc ) THEN
1346                 !
1347                 !  this proc sends his block
1348                 !
1349                 CALL mp_bcast( vl(:,1:nc), root, ortho_parent_comm )
1350                 CALL ZGEMM( 'N', 'N', kdim, nc, nr, ONE, &
1351                          spsi(1,ir), kdmx, vl, nx, beta, psi(1,nvec+ic), kdmx )
1352              ELSE
1353                 !
1354                 !  all other procs receive
1355                 !
1356                 CALL mp_bcast( vtmp(:,1:nc), root, ortho_parent_comm )
1357                 CALL ZGEMM( 'N', 'N', kdim, nc, nr, ONE, &
1358                          spsi(1,ir), kdmx, vtmp, nx, beta, psi(1,nvec+ic), kdmx )
1359              END IF
1360              !
1361              beta = ONE
1362
1363           END DO
1364           !
1365        END IF
1366        !
1367     END DO
1368     !
1369     CALL threaded_memcpy(spsi, psi(1,nvec+1), nvec*npol*npwx*2)
1370     !
1371     DEALLOCATE( vtmp )
1372
1373     RETURN
1374  END SUBROUTINE refresh_spsi
1375  !
1376  !
1377  !
1378  SUBROUTINE refresh_hpsi( )
1379     !
1380     INTEGER :: ipc, ipr
1381     INTEGER :: nr, nc, ir, ic, root
1382     COMPLEX(DP), ALLOCATABLE :: vtmp( :, : )
1383     COMPLEX(DP) :: beta
1384
1385     ALLOCATE( vtmp( nx, nx ) )
1386     !
1387     DO ipc = 1, idesc(LAX_DESC_NPC)
1388        !
1389        nc = nrc_ip( ipc )
1390        ic = irc_ip( ipc )
1391        !
1392        IF( ic <= nvec ) THEN
1393           !
1394           nc = min( nc, nvec - ic + 1 )
1395           !
1396           beta = ZERO
1397           !
1398           DO ipr = 1, idesc(LAX_DESC_NPR)
1399              !
1400              nr = nrc_ip( ipr )
1401              ir = irc_ip( ipr )
1402              !
1403              root = rank_ip( ipr, ipc )
1404
1405              IF( ipr-1 == idesc(LAX_DESC_MYR) .AND. ipc-1 == idesc(LAX_DESC_MYC) .AND. la_proc ) THEN
1406                 !
1407                 !  this proc sends his block
1408                 !
1409                 CALL mp_bcast( vl(:,1:nc), root, ortho_parent_comm )
1410                 CALL ZGEMM( 'N', 'N', kdim, nc, nr, ONE, &
1411                          hpsi(1,ir), kdmx, vl, nx, beta, psi(1,nvec+ic), kdmx )
1412              ELSE
1413                 !
1414                 !  all other procs receive
1415                 !
1416                 CALL mp_bcast( vtmp(:,1:nc), root, ortho_parent_comm )
1417                 CALL ZGEMM( 'N', 'N', kdim, nc, nr, ONE, &
1418                          hpsi(1,ir), kdmx, vtmp, nx, beta, psi(1,nvec+ic), kdmx )
1419              END IF
1420              !
1421              beta = ONE
1422
1423           END DO
1424           !
1425        END IF
1426        !
1427     END DO
1428     !
1429     DEALLOCATE( vtmp )
1430     !
1431     CALL threaded_memcpy(hpsi, psi(1,nvec+1), nvec*npol*npwx*2)
1432     !
1433     RETURN
1434  END SUBROUTINE refresh_hpsi
1435  !
1436  !
1437  SUBROUTINE compute_distmat( dm, v, w )
1438     !
1439     !  This subroutine compute <vi|wj> and store the
1440     !  result in distributed matrix dm
1441     !
1442     INTEGER :: ipc, ipr
1443     INTEGER :: nr, nc, ir, ic, root
1444     COMPLEX(DP), INTENT(OUT) :: dm( :, : )
1445     COMPLEX(DP) :: v(:,:), w(:,:)
1446     COMPLEX(DP), ALLOCATABLE :: work( :, : )
1447     !
1448     ALLOCATE( work( nx, nx ) )
1449     !
1450     work = ZERO
1451     !
1452     !  Only upper triangle is computed, then the matrix is hermitianized
1453     !
1454     DO ipc = 1, idesc(LAX_DESC_NPC) !  loop on column procs
1455        !
1456        nc = nrc_ip( ipc )
1457        ic = irc_ip( ipc )
1458        !
1459        DO ipr = 1, ipc ! idesc(LAX_DESC_NPR) ! ipc ! use symmetry for the loop on row procs
1460           !
1461           nr = nrc_ip( ipr )
1462           ir = irc_ip( ipr )
1463           !
1464           !  rank of the processor for which this block (ipr,ipc) is destinated
1465           !
1466           root = rank_ip( ipr, ipc )
1467
1468           ! use blas subs. on the matrix block
1469
1470           CALL ZGEMM( 'C', 'N', nr, nc, kdim, ONE , &
1471                       v(1,ir), kdmx, w(1,ic), kdmx, ZERO, work, nx )
1472
1473           ! accumulate result on dm of root proc.
1474
1475           CALL mp_root_sum( work, dm, root, ortho_parent_comm )
1476
1477        END DO
1478        !
1479     END DO
1480     if (ortho_parent_comm.ne.intra_bgrp_comm .and. nbgrp > 1) dm = dm/nbgrp
1481     !
1482     !  The matrix is hermitianized using upper triangle
1483     !
1484     CALL laxlib_zsqmher( nbase, dm, nx, idesc )
1485     !
1486     DEALLOCATE( work )
1487     !
1488     RETURN
1489  END SUBROUTINE compute_distmat
1490  !
1491  !
1492  SUBROUTINE update_distmat( dm, v, w )
1493     !
1494     INTEGER :: ipc, ipr
1495     INTEGER :: nr, nc, ir, ic, root, icc, ii
1496     COMPLEX(DP) :: dm( :, : )
1497     COMPLEX(DP) :: v(:,:), w(:,:)
1498     COMPLEX(DP), ALLOCATABLE :: vtmp( :, : )
1499
1500     ALLOCATE( vtmp( nx, nx ) )
1501     !
1502     vtmp = ZERO
1503     !
1504     DO ipc = 1, idesc(LAX_DESC_NPC)
1505        !
1506        nc = nrc_ip( ipc )
1507        ic = irc_ip( ipc )
1508        !
1509        IF( ic+nc-1 >= nb1 ) THEN
1510           !
1511           nc = MIN( nc, ic+nc-1 - nb1 + 1 )
1512           IF( ic >= nb1 ) THEN
1513              ii = ic
1514              icc = 1
1515           ELSE
1516              ii = nb1
1517              icc = nb1-ic+1
1518           END IF
1519           !
1520           ! icc to nc is the local index of the unconverged bands
1521           ! ii is the global index of the first unconverged bands
1522           !
1523           DO ipr = 1, ipc ! idesc(LAX_DESC_NPR) use symmetry
1524              !
1525              nr = nrc_ip( ipr )
1526              ir = irc_ip( ipr )
1527              !
1528              root = rank_ip( ipr, ipc )
1529
1530              CALL ZGEMM( 'C', 'N', nr, nc, kdim, ONE, v(1, ir), &
1531                          kdmx, w(1,ii), kdmx, ZERO, vtmp, nx )
1532              IF (ortho_parent_comm.ne.intra_bgrp_comm .and. nbgrp > 1) vtmp = vtmp/nbgrp
1533              !
1534              IF(  (idesc(LAX_DESC_ACTIVE_NODE) > 0) .AND. &
1535                   (ipr-1 == idesc(LAX_DESC_MYR)) .AND. (ipc-1 == idesc(LAX_DESC_MYC)) ) THEN
1536                 CALL mp_root_sum( vtmp(:,1:nc), dm(:,icc:icc+nc-1), root, ortho_parent_comm )
1537              ELSE
1538                 CALL mp_root_sum( vtmp(:,1:nc), dm, root, ortho_parent_comm )
1539              END IF
1540
1541           END DO
1542           !
1543        END IF
1544        !
1545     END DO
1546     !
1547     CALL laxlib_zsqmher( nbase+notcnv, dm, nx, idesc )
1548     !
1549     DEALLOCATE( vtmp )
1550     RETURN
1551  END SUBROUTINE update_distmat
1552  !
1553  !
1554  SUBROUTINE set_e_from_h()
1555     INTEGER :: nc, ic, i
1556     e(1:nbase) = 0_DP
1557     IF( idesc(LAX_DESC_MYC) == idesc(LAX_DESC_MYR) .AND. la_proc ) THEN
1558        nc = idesc(LAX_DESC_NC)
1559        ic = idesc(LAX_DESC_IC)
1560        DO i = 1, nc
1561           e( i + ic - 1 ) = REAL( hl( i, i ) )
1562        END DO
1563     END IF
1564     CALL mp_sum( e(1:nbase), ortho_parent_comm )
1565     RETURN
1566  END SUBROUTINE set_e_from_h
1567  !
1568  SUBROUTINE set_h_from_e()
1569     INTEGER :: nc, ic, i
1570     IF( la_proc ) THEN
1571        hl = ZERO
1572        IF( idesc(LAX_DESC_MYC) == idesc(LAX_DESC_MYR) ) THEN
1573           nc = idesc(LAX_DESC_NC)
1574           ic = idesc(LAX_DESC_IC)
1575           DO i = 1, nc
1576              hl(i,i) = CMPLX( e( i + ic - 1 ), 0_DP ,kind=DP)
1577           END DO
1578        END IF
1579     END IF
1580     RETURN
1581  END SUBROUTINE set_h_from_e
1582  !
1583END SUBROUTINE pcegterg
1584