1!=================================================================================
2!
3! Routines:
4!
5! (1) epscopy()                                 Last Modified 2014 (FHJ)
6!
7!     Copy dielectric matrices from eps0mat(.h5)/epsmat(.h5) files to memory.
8!
9!     This routine reads the eps0mat and epsmat files (units 10 and 11)
10!     and copies the relevant information about those dielectric matrices
11!     to memory (epsmpi). Processor 0 reads the data and redistributes
12!     it to the other processors. The eps0mat file can have more than one
13!     q-point, which is useful when performing Voronoi averages for the q->0 point.
14!
15!==================================================================================
16
17#include "f_defs.h"
18
19module epscopy_m
20
21  use global_m
22  use misc_m
23  use io_utils_m
24  use epsread_hdf5_m
25  use scalapack_m
26  use timing_m, only: timing => sigma_timing
27
28  implicit none
29
30  private
31
32  public :: epscopy_init, epscopy
33
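  !> Buffer used to exchange blocks of matrix columns between a pair of
  !! processes when the subspace matrices are redistributed (see epscopy_subspace).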
34  type :: comm_buffer
35    complex(DPC), allocatable :: msg(:,:)
36    integer, allocatable :: col_global_indx(:)
37    integer :: nrow, ncol
38    integer :: proc
39  end type
40
41contains
42
43!> Determines the number of q-points, the maximum rank of the dielectric matrix,
44!! and the number of frequencies.
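!! It also detects whether the epsilon files were generated with the subspace
!! truncation method and (with MPI) broadcasts the resulting sizes and flags to all processors.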
45subroutine epscopy_init(sig, neps)
46  type (siginfo), intent(inout) :: sig
47  integer, intent(out) :: neps !< Max. rank of dielectric matrix
48
49  integer :: ngarbage1, nmtx, itrash, ig, igp, iq
50  integer :: nFreq, nfreq_imag, freq_dep_flag
51  real(DP), allocatable :: dFreqGrid(:)
52  complex(DPC), allocatable :: dFreqBrd(:)
53  logical, allocatable :: is_imag(:)
54  logical :: file_exists
55  logical :: subtrunc_flags(6)
56  integer :: neig_sub, ierr
57  real(DP) :: chi_eigenvalue_cutoff
58#ifdef HDF5
59  integer :: ngarbage3(3), nmtxmax
60  real(DP) :: dgarbage1
61#endif
62
63  PUSH_SUB(epscopy_init)
64
65  sig%subspace_q0 = .FALSE.
66  sig%matrix_in_subspace_basis_q0 = .FALSE.
67  sig%keep_full_eps_static_q0 = .FALSE.
68  sig%subspace = .FALSE.
69  sig%matrix_in_subspace_basis = .FALSE.
70  sig%keep_full_eps_static = .FALSE.
71  subtrunc_flags(:) = .FALSE.
72  sig%neig_sub_max = 0
73
74  if (sig%freq_dep>=0.and.sig%freq_dep<=3) then
75#ifdef HDF5
76    if (sig%use_hdf5) then
77      if (peinf%inode==0) then
78        call read_eps_grid_sizes_hdf5(ngarbage1, sig%nq0, dgarbage1, sig%nFreq, &
79          sig%nfreq_imag, nmtxmax, ngarbage3, freq_dep_flag, 'eps0mat.h5')
80        neps = nmtxmax
81
82        ! Consistency check, before continuing
83        if (freq_dep_flag==2.and.sig%freq_dep/=2) &
84          call die('eps0mat is frequency-dependent, but this Sigma calculation is not.')
85        if (freq_dep_flag==0.and.(sig%freq_dep==2.or.sig%freq_dep==3)) then
86          call die('This Sigma calculation is frequency-dependent, but eps0mat is not.')
87        endif
88        if (freq_dep_flag==3.and.sig%freq_dep/=3) then
89          call die('This Sigma calculation is for GN GPP, but eps0mat is not.')
90        endif
91
92        ! Check if subspace method has been used
93        neig_sub = 0
94        call read_eps_subspace_info(sig%subspace_q0, &
95                                    sig%matrix_in_subspace_basis_q0, &
96                                    sig%keep_full_eps_static_q0, &
97                                    neig_sub, chi_eigenvalue_cutoff, 'eps0mat.h5')
98        sig%neig_sub_max = MAX(neig_sub, sig%neig_sub_max)
99        ! Consistency check for subspace
100
101        if ( sig%subspace_q0 ) then
102          if ( sig%matrix_in_subspace_basis_q0 ) then
103#if !defined MPI || !defined USESCALAPACK
104            call die('Subspace method only works with MPI and SCALAPACK')
105#endif
106          end if
107          if ( sig%freq_dep/=2 ) then
108            call die('Subspace approximation implemented only for freq_dep = 2')
109          end if
110        end if
111
112        inquire(file="epsmat.h5", exist=file_exists)
113        if (file_exists) then
114          sig%igamma = 0
115        else
116          sig%igamma = 1
117        endif
118
119        if(sig%igamma/=0) then ! Gamma-only calculation
120          sig%nq1 = 0
121        else
122          call read_eps_grid_sizes_hdf5(ngarbage1, sig%nq1, dgarbage1, nfreq, nfreq_imag, &
123            nmtxmax, ngarbage3, freq_dep_flag, 'epsmat.h5')
124          if (nFreq/=sig%nFreq) then
125            call die('nFreq mismatch between eps0mat.h5 and epsmat.h5')
126          endif
127          if (nfreq_imag/=sig%nfreq_imag) then
128            call die('nfreq_imag mismatch between eps0mat.h5 and epsmat.h5')
129          endif
130          if (nmtxmax>neps) neps = nmtxmax
131
132          ! check subspace also for epsmat.h5
133          neig_sub = 0
134          call read_eps_subspace_info(sig%subspace, &
135                                      sig%matrix_in_subspace_basis, &
136                                      sig%keep_full_eps_static, &
137                                      neig_sub, chi_eigenvalue_cutoff, 'epsmat.h5')
138          sig%neig_sub_max = MAX(neig_sub, sig%neig_sub_max)
139          ! Consistency check for subspace
140
141          if ( sig%subspace ) then
142            if ( sig%matrix_in_subspace_basis ) then
143#if !defined MPI || !defined USESCALAPACK
144              call die('Subspace method only works with MPI and SCALAPACK')
145#endif
146            end if
147            if ( sig%freq_dep/=2 ) then
148              call die('Subspace approximation implemented only for freq_dep = 2')
149            end if
150          end if ! sig%subspace
151
152        endif ! sig%igamma/=0
153      endif !peinf%inode==0
154    else
155#endif
156      if (peinf%inode==0) then
157        call open_file(unit=10,file='eps0mat',form='unformatted',status='old')
158        read(10)
159        ierr = 0
160        read(10,IOSTAT=ierr) freq_dep_flag, sig%nFreq, &
161                 sig%subspace_q0, sig%matrix_in_subspace_basis_q0, sig%keep_full_eps_static_q0
162        IF(ierr .NE. 0) THEN
163          ! this is probably due to the absence of the subspace flags in the file
164          sig%subspace_q0 = .FALSE.
165          sig%matrix_in_subspace_basis_q0 = .FALSE.
166          sig%keep_full_eps_static_q0 = .FALSE.
167        END IF
168
169! make sure that MPI and ScaLAPACK are linked when the subspace method is used
170#if !defined MPI || !defined USESCALAPACK
171        IF(sig%subspace_q0) THEN
172          call die('Subspace method only works with MPI and SCALAPACK')
173        END IF
174#endif
175
176        ! Consistency check, before continuing
177        if (freq_dep_flag==2.and.sig%freq_dep/=2) &
178          call die('eps0mat is frequency-dependent, but this Sigma calculation is not.')
179        if (freq_dep_flag==0.and.sig%freq_dep==2) then
180          call die('This Sigma calculation is frequency-dependent, but eps0mat is not.')
181        endif
182        if (sig%freq_dep/=2 .and. sig%subspace_q0) then
183          call die('Subspace approximation implemented only for freq_dep = 2')
184        end if
185
186        read(10)
187        if (sig%freq_dep==2.or.sig%freq_dep==3) then
188          SAFE_ALLOCATE(dFreqGrid, (sig%nFreq))
189          SAFE_ALLOCATE(dFreqBrd, (sig%nFreq))
190          SAFE_ALLOCATE(is_imag, (sig%nFreq))
191          read(10) dFreqGrid, dFreqBrd
192          is_imag = (dabs(dFreqGrid)<TOL_ZERO)
193          is_imag(1) = .false.
194          sig%nfreq_imag = count(is_imag)
195          SAFE_DEALLOCATE(dFreqGrid)
196          SAFE_DEALLOCATE(dFreqBrd)
197          SAFE_DEALLOCATE(is_imag)
198        else
199          read(10)
200        endif
201        read(10)
202        read(10)
203        read(10)
204        read(10) sig%nq0
205        read(10)
206        !XXXXXXXX
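        ! When eps0mat was written in the subspace basis, each q-point block
        ! contains: a record with the matrix rank (read into neps), the kinetic
        ! energies, the q-point, vcoul, optionally the full static epsilon^-1
        ! (one record per column), then neig_sub followed by neps eigenvector
        ! records, and the neig_sub x neig_sub subspace matrices (one record
        ! per element). Only neps and neig_sub are kept here; everything else
        ! is skipped.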
207        IF(sig%subspace_q0 .AND. sig%matrix_in_subspace_basis_q0) THEN
208          do iq=1,sig%nq0
209            read(10) itrash, neps
210            read(10) ! ekin
211            read(10) ! q
212            read(10) ! vcoul
213            ! full epsilon omega zero
214            IF(sig%keep_full_eps_static_q0) THEN
215              do igp=1, neps
216                read(10) ! For epsRDyn only
217              end do
218            END IF
219            ! eigenvectors
220            read(10) neig_sub
221            IF(sig%neig_sub_max < neig_sub) sig%neig_sub_max = neig_sub
222            do igp=1, neps
223              read(10) ! For epsRDyn only
224            end do
225            ! subspace matrices
226            do igp=1, neig_sub
227              do ig=1, neig_sub
228                read(10)
229              end do
230#ifdef CPLX
231              do ig=1, neig_sub
232                read(10) ! For epsADyn (empty line...)
233              enddo
234#endif
235            end do
236          end do ! iq
237        ELSE
238          read(10) itrash, neps
239        END IF
240        !XXXXXXXX
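        ! Rewind eps0mat by closing and reopening it, so that epscopy()/read_epsmat()
        ! can later re-read the file from the beginning.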
241        call close_file(10)
242        call open_file(unit=10,file='eps0mat',form='unformatted',status='old')
243
244        inquire(file="epsmat", exist=file_exists)
245        if (file_exists) then
246          sig%igamma = 0
247        else
248          sig%igamma = 1
249        endif
250        if(sig%igamma/=0) then ! Gamma-only calculation
251          sig%nq1 = 0
252        else
253          call open_file(unit=11,file='epsmat',form='unformatted',status='old')
254          read(11)
255          ierr = 0
256          read(11,IOSTAT=ierr) ngarbage1, nFreq, &
257                   sig%subspace, sig%matrix_in_subspace_basis, sig%keep_full_eps_static
258          IF(ierr .NE. 0) THEN
259            sig%subspace = .FALSE.
260            sig%matrix_in_subspace_basis = .FALSE.
261            sig%keep_full_eps_static = .FALSE.
262          END IF
263
264! make sure that MPI and ScaLAPACK are linked when the subspace method is used
265#if !defined MPI || !defined USESCALAPACK
266          IF(sig%subspace) THEN
267            call die('Subspace method only works with MPI and SCALAPACK')
268          END IF
269#endif
270
271          if (nFreq/=sig%nFreq) then
272            call die('nFreq mismatch between eps0mat and epsmat')
273          endif
274          if (sig%freq_dep/=2 .and. sig%subspace) then
275            call die('Subspace approximation implemented only for freq_dep = 2')
276          end if
277
278          read(11)
279          if (sig%freq_dep==2.or.sig%freq_dep==3) then
280            SAFE_ALLOCATE(dFreqGrid, (sig%nFreq))
281            SAFE_ALLOCATE(dFreqBrd, (sig%nFreq))
282            SAFE_ALLOCATE(is_imag, (sig%nFreq))
283            read(11) dFreqGrid, dFreqBrd
284            is_imag = (dabs(dFreqGrid)<TOL_ZERO)
285            is_imag(1) = .false.
286            if (sig%nfreq_imag/=count(is_imag)) then
287              call die('nfreq_imag mismatch between eps0mat and epsmat')
288            endif
289            SAFE_DEALLOCATE(dFreqGrid)
290            SAFE_DEALLOCATE(dFreqBrd)
291            SAFE_DEALLOCATE(is_imag)
292          else
293            read(11)
294          endif
295          read(11)
296          read(11)
297          read(11)
298          read(11) sig%nq1
299          read(11)
300
301          do iq=1,sig%nq1
302            read(11) itrash, nmtx
303            read(11)
304            read(11)
305
306            IF(sig%subspace .AND. sig%matrix_in_subspace_basis) THEN
307              read(11) ! vcoul
308              ! full epsilon omega zero
309              IF(sig%keep_full_eps_static) THEN
310                do igp=1, nmtx
311                  read(11) ! For epsRDyn only
312                end do
313              END IF
314              ! eigenvectors
315              read(11) neig_sub
316              do igp=1, nmtx
317                read(11) ! For epsRDyn only
318              end do
319              ! subspace matrices
320              do igp=1, neig_sub
321                do ig=1, neig_sub
322                  read(11)
323                end do
324#ifdef CPLX
325                do ig=1, neig_sub
326                  read(11) ! For epsADyn (empty line...)
327                enddo
328#endif
329              end do
330              ! write(*,*) neig_sub, nmtx
331              if(sig%neig_sub_max < neig_sub) sig%neig_sub_max=neig_sub
332            ELSE
333              if (sig%freq_dep==0.or.sig%freq_dep==1) then
334                do ig=1,nmtx
335                  read(11)
336                enddo
337              endif
338              if (sig%freq_dep==2.or.sig%freq_dep==3) then
339                do igp=1,nmtx
340                  do ig=1,nmtx
341                    read(11) ! For epsRDyn
342                  enddo
343#ifdef CPLX
344                  do ig=1,nmtx
345                    read(11) ! For epsADyn
346                  enddo
347#endif
348                enddo
349              endif
350            END IF
351            if(neps<nmtx) neps = nmtx
352          enddo
353          call close_file(11)
354          call open_file(unit=11,file='epsmat',form='unformatted',status='old')
355        endif
356      endif ! peinf%inode==0
357#ifdef HDF5
358    endif
359#endif
360#ifdef MPI
361    call MPI_Bcast(sig%igamma, 1, MPI_INTEGER, 0 ,MPI_COMM_WORLD, mpierr)
362    call MPI_Bcast(sig%nq0, 1, MPI_INTEGER, 0 ,MPI_COMM_WORLD, mpierr)
363    call MPI_Bcast(sig%nq1, 1, MPI_INTEGER, 0 ,MPI_COMM_WORLD, mpierr)
364    call MPI_Bcast(neps, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, mpierr)
365    call MPI_Bcast(sig%nFreq, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, mpierr)
366    call MPI_Bcast(sig%nfreq_imag, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, mpierr)
367!XXXX
368    subtrunc_flags(1) = sig%subspace_q0
369    subtrunc_flags(2) = sig%matrix_in_subspace_basis_q0
370    subtrunc_flags(3) = sig%keep_full_eps_static_q0
371    subtrunc_flags(4) = sig%subspace
372    subtrunc_flags(5) = sig%matrix_in_subspace_basis
373    subtrunc_flags(6) = sig%keep_full_eps_static
374    call MPI_Bcast(subtrunc_flags, 6, MPI_LOGICAL, 0, MPI_COMM_WORLD, mpierr)
375    sig%subspace_q0 = subtrunc_flags(1)
376    sig%matrix_in_subspace_basis_q0 = subtrunc_flags(2)
377    sig%keep_full_eps_static_q0 = subtrunc_flags(3)
378    sig%subspace = subtrunc_flags(4)
379    sig%matrix_in_subspace_basis = subtrunc_flags(5)
380    sig%keep_full_eps_static = subtrunc_flags(6)
381    call MPI_Bcast(sig%neig_sub_max, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, mpierr)
382!XXXX
383#endif
384    if (sig%freq_dep==2.or.sig%freq_dep==3) then
385      SAFE_ALLOCATE(sig%dFreqGrid,(sig%nFreq))
386      SAFE_ALLOCATE(sig%dFreqBrd,(sig%nFreq))
387    endif
388  else ! sig%freq_dep>=0.and.sig%freq_dep<=3
389    neps = 0
390    sig%nFreq = 0
391  endif
392  if (sig%nq0/=1.and..not.sig%subsample) call die('epscopy_init: nq0/=1')
393  if (peinf%inode==0) then
394    write(6,'(/1x,a,i0)') 'Total number of frequencies in the dielectric matrix: ', sig%nfreq
395    write(6,'(1x,a,i0/)') 'Number of imag. frequencies in the dielectric matrix: ', sig%nfreq_imag
396    !XXXXX
397    if(sig%subspace_q0) then
398      write(6,'(1x,a)') 'Eps0mat calculated with subspace truncation method'
399      IF(sig%matrix_in_subspace_basis_q0) THEN
400        write(6,'(1x,a)') 'Eps0mat matrices written within the subspace of omega 0'
401      END IF
402      IF(sig%keep_full_eps_static_q0) THEN
403        write(6,'(1x,a)') 'Eps0mat retains the full EpsInv for omega=0'
404      END IF
405      write(6,*)
406    end if
407    if(sig%subspace .AND. sig%igamma==0) then
408      write(6,'(1x,a)') 'Epsmat calculated with subspace truncation method'
409      IF(sig%matrix_in_subspace_basis) THEN
410        write(6,'(1x,a)') 'Epsmat matrices written within the subspace of omega 0'
411      END IF
412      IF(sig%keep_full_eps_static) THEN
413        write(6,'(1x,a)') 'Epsmat retains the full EpsInv for omega=0'
414      END IF
415      write(6,*)
416    end if
417    !XXXXX
418  endif
419
420  ! check if we can perform the calc in the subspace basis
421  if(sig%do_sigma_subspace) then
422    if(sig%igamma==0) then
423      ! k-point, we need both
424      if(sig%matrix_in_subspace_basis_q0 .AND. sig%matrix_in_subspace_basis) then
425        if (peinf%inode==0) write(6,'(1x,a)') 'DIRECT evaluation of sigma in subspace basis'
426      else
427        if (peinf%inode==0) write(6,'(1x,a)') 'Epsilon matrices not in subspace basis!'
428        if (peinf%inode==0) write(6,'(1x,a)') 'Direct evaluation of sigma in subspace basis turned off'
429        sig%do_sigma_subspace = .false.
430      end if
431    else
432      if(sig%matrix_in_subspace_basis_q0) then
433        if (peinf%inode==0) write(6,'(1x,a)') 'DIRECT evaluation of sigma in subspace basis'
434      else
435        if (peinf%inode==0) write(6,'(1x,a)') 'Epsilon matrix not in subspace basis!'
436        if (peinf%inode==0) write(6,'(1x,a)') 'Direct evaluation of sigma in subspace basis turned off'
437        sig%do_sigma_subspace = .false.
438      end if
439    end if
440  end if
441
442  POP_SUB(epscopy_init)
443
444end subroutine epscopy_init
445
446
447subroutine epscopy(crys,gvec,sig,neps,epsmpi,epshead,iunit_eps,fne)
448  type (crystal), intent(in) :: crys
449  type (gspace), intent(in) :: gvec
450  type (siginfo), intent(inout) :: sig
451  integer, intent(in) :: neps !< number of G-vectors up to epsilon cutoff defined in sigma.inp
452  type (epsmpiinfo), intent(inout) :: epsmpi
453  SCALAR, intent(out) :: epshead !< for full frequency, this is the retarded static head.
454  integer, intent(in) :: iunit_eps
455  character*20, intent(in) :: fne
456
457  SCALAR, allocatable :: eps(:)
458  complex(DPC), allocatable :: epsRDyn(:,:)
459  complex(DPC), allocatable :: epsADyn(:,:)
460  logical :: is_static
461#ifdef HDF5
462  integer, allocatable :: sub_neig_of_q(:)
463#endif
464
465  PUSH_SUB(epscopy)
466
467  epshead = ZERO
468  is_static = sig%freq_dep/=2 .AND. sig%freq_dep/=3
469
470  ! FHJ: Initialize buffers
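  ! Each process stores all G rows for its own set of G' columns
  ! (epsmpi%ngpown columns, distributed in blocks of epsmpi%nb over the ranks
  ! of each pool), for every q-point and, in the dynamic case, every frequency.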
471  if (is_static) then
472    SAFE_ALLOCATE(epsmpi%eps, (neps,epsmpi%ngpown,sig%nq))
473  else
474    if(sig%do_sigma_subspace) then
475      ! if doing the calculation within the subspace basis, allocate the subspace buffers
476      SAFE_ALLOCATE(sig%epssub%eps_sub, (sig%neig_sub_max, sig%epssub%Nbas_own, sig%nFreq, sig%nq))
477      !XXXX SAFE_ALLOCATE(sig%epssub%eigenvec_sub, (sig%epssub%ngpown_sub, sig%neig_sub_max,  sig%nq))
478      SAFE_ALLOCATE(sig%epssub%eigenvec_sub, (neps, sig%epssub%Nbas_own,  sig%nq))
479      SAFE_ALLOCATE(sig%epssub%eps_wings_rows, (neps, sig%nFreq,  sig%nq))
480      SAFE_ALLOCATE(sig%epssub%eps_wings_cols, (neps, sig%nFreq,  sig%nq))
481      SAFE_ALLOCATE(sig%epssub%eps_wings_correction_rows, (neps, sig%nFreq))
482      SAFE_ALLOCATE(sig%epssub%eps_wings_correction_cols, (neps, sig%nFreq))
483      sig%epssub%eps_sub = (0.0d0,0.0d0)
484      sig%epssub%eigenvec_sub = (0.0d0,0.0d0)
485      sig%epssub%eps_wings_rows = (0.0d0,0.0d0)
486      sig%epssub%eps_wings_cols = (0.0d0,0.0d0)
487      sig%epssub%eps_wings_correction_rows = (0.0d0,0.0d0)
488      sig%epssub%eps_wings_correction_cols = (0.0d0,0.0d0)
489      SAFE_ALLOCATE(sig%epssub%eps_sub_info, (3, 2, sig%nq))
490      SAFE_ALLOCATE(sig%epssub%wing_pos, (sig%nq))
491      sig%epssub%wing_pos = 0
492      SAFE_ALLOCATE(sig%epssub%vcoul_sub, (neps, sig%nq))
493      sig%epssub%vcoul_sub = 0.0d0
494      !XXX deallocate and reallocate epsmpi%epsR with a single frequency
495      !XXX SAFE_DEALLOCATE_P(epsmpi%epsR)
496      SAFE_ALLOCATE(epsmpi%epsR, (neps,epsmpi%ngpown,1,sig%nq))
497    else
498      ! standard case
499      SAFE_ALLOCATE(epsmpi%epsR, (neps,epsmpi%ngpown,sig%nFreq,sig%nq))
500      if (sig%need_advanced) then
501        SAFE_ALLOCATE(epsmpi%epsA, (neps,epsmpi%ngpown,sig%nFreq,sig%nq))
502      endif
503    end if
504  endif
505
506  ! FHJ: Temp buffers.
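  ! These hold a single G' column at a time (all G rows, and all frequencies
  ! in the dynamic case) while it is read on rank 0 and broadcast.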
507  if (is_static) then
508    SAFE_ALLOCATE(eps, (neps))
509  else
510    SAFE_ALLOCATE(epsRDyn, (neps,sig%nFreq))
511    if (sig%need_advanced) then
512      SAFE_ALLOCATE(epsADyn, (neps,sig%nFreq))
513    endif
514  endif
515  !------------------------------------------------
516  ! FHJ: Read dielectric matrices from eps0mat(.h5)
517  !------------------------------------------------
518  call read_epsmat(.true.)
519#ifdef MPI
520  call MPI_Bcast(epshead, 1, MPI_SCALAR, 0, MPI_COMM_WORLD, mpierr)
521#endif
522  if (dble(epshead)<1d-3 .and. sig%iscreen==SCREEN_SEMICOND .and. peinf%inode==0) then
523    write(0,'(a)') 'WARNING: You are using semiconductor screening, but the'
524    write(0,'(a)') 'head of epsilon inverse is very small and seems metallic.'
525  endif
526  !------------------------------------------------
527  ! FHJ: Read dielectric matrices from epsmat(.h5)
528  !------------------------------------------------
529  if (sig%igamma==0) call read_epsmat(.false.)
530
531#ifdef MPI
532  call MPI_Bcast(sig%qgrid,3,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr)
533#endif
534  epsmpi%qk(:,:) = sig%qpt(:,:)
535
536  ! FHJ: Free temp. buffers
537  if (is_static) then
538    SAFE_DEALLOCATE(eps)
539  else
540    SAFE_DEALLOCATE(epsRDyn)
541    if (sig%need_advanced) then
542      SAFE_DEALLOCATE(epsADyn)
543    endif
544  endif
545
546  POP_SUB(epscopy)
547
548  return
549
550contains
551
552
553  subroutine read_epsmat(is_q0)
554    logical, intent(in) :: is_q0
555
556    character(len=16) :: fname
557    integer :: qoffset, nq_file, iunit
558    integer :: freq_dep_flag, nFreq, nfreq_imag, ng_old, ngq, nmtx
559    integer :: ii, ig, igp, nq_tmp, iq, qgrid(3), gg(3), iout, iw, dest, igp_loc
560    real(DP) :: ecuts, qk(3), ekin
561    real(DP), allocatable :: ekold(:)
562#ifdef HDF5
563    integer :: nmtxmax
564    integer, allocatable :: nmtx_of_q(:), isrtinvdummy(:)
565#endif
566    integer, allocatable :: isrtq(:), isrtqi(:), isrtold(:), gvecs_old(:,:)
567    type(progress_info) :: prog_info !< a user-friendly progress report
568    character(len=6) :: sname
569    character(len=11) :: sdate
570    logical :: matrix_in_subspace_basis, keep_full_eps_static
571    real(DP), allocatable :: vcoul(:)
572
573    PUSH_SUB(epscopy.read_epsmat)
574
575    if (is_q0) then
576      fname = 'eps0mat'
577      qoffset = 0
578      nq_file = sig%nq0
579      iunit = 10
580    else
581      fname = 'epsmat'
582      qoffset = sig%nq0
583      nq_file = sig%nq1
584      iunit = 11
585    endif
586    if (sig%use_hdf5) fname = TRUNC(fname) // '.h5'
587
588    if (peinf%inode==0) then
589      SAFE_ALLOCATE(ekold, (gvec%ng))
590      SAFE_ALLOCATE(isrtold, (gvec%ng))
591      SAFE_ALLOCATE(isrtq, (gvec%ng))
592      SAFE_ALLOCATE(isrtqi, (gvec%ng))
593    endif
594
595    matrix_in_subspace_basis = .FALSE.
596    keep_full_eps_static = .FALSE.
597    if (is_q0) then
598      if(sig%subspace_q0) then
599        matrix_in_subspace_basis = sig%matrix_in_subspace_basis_q0
600        keep_full_eps_static = sig%keep_full_eps_static_q0
601      end if
602    else
603      if(sig%subspace) then
604        matrix_in_subspace_basis = sig%matrix_in_subspace_basis
605        keep_full_eps_static = sig%keep_full_eps_static
606      end if
607    end if
608
609!------------------------------------------------------------------------------
610! Read q-grid, q-points, G-vectors and freq. grid
611!------------------------------------------------------------------------------
612    if (peinf%inode==0) then
613#ifdef HDF5
614      if (sig%use_hdf5) then
615        sname = 'chiGG0'
616        sdate = 'nodate'
617        call read_eps_grid_sizes_hdf5(ng_old, nq_tmp, ecuts, nfreq, nfreq_imag, &
618          nmtxmax, qgrid, freq_dep_flag, TRUNC(fname))
619        if (.not.is_static.and.is_q0) then
620          call read_eps_freqgrid_hdf5(nFreq, sig%dFreqGrid, sig%dFreqBrd, TRUNC(fname))
621        endif
622
623        SAFE_ALLOCATE(nmtx_of_q, (nq_file))
624        call read_eps_qgrid_hdf5(nq_tmp, sig%qpt(:,qoffset+1:), nmtx_of_q, TRUNC(fname))
625        ! Note: it seems like the old format stores isort up to ngq, while the
626        ! HDF5 format stores it up to ng_old
627        SAFE_ALLOCATE(gvecs_old, (3,ng_old))
628        call read_eps_old_gvecs_hdf5(ng_old, gvecs_old, TRUNC(fname))
629        ! subspace: read number of eigenvectors
630        if(matrix_in_subspace_basis) then
631          SAFE_ALLOCATE(sub_neig_of_q, (nq_file))
632          sub_neig_of_q = 0
633          call read_eps_sub_neig_of_q(nq_tmp, sub_neig_of_q, TRUNC(fname))
634        end if
635      else
636#endif
637        read(iunit) sname, sdate
638        read(iunit) freq_dep_flag, nFreq
639        read(iunit) qgrid(1:3)
640        if (.not.is_static.and.is_q0) then
641          read(iunit) sig%dFreqGrid, sig%dFreqBrd
642        else
643          read(iunit)
644        endif
645        read(iunit)
646        read(iunit)
647        read(iunit) ecuts
648        read(iunit) nq_tmp, ((sig%qpt(ii,qoffset+iq), ii=1,3), iq=1,nq_tmp)
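        ! Read ng_old first, then backspace and re-read the same record once
        ! gvecs_old has been allocated with the correct size.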
649        read(iunit) ng_old
650        backspace(iunit)
651        SAFE_ALLOCATE(gvecs_old, (3, ng_old))
652        read(iunit) ng_old, ((gvecs_old(ii,iq), ii=1,3), iq=1,ng_old)
653#ifdef HDF5
654      endif
655#endif
656      if (nq_tmp/=nq_file) call die('nq mismatch for '//TRUNC(fname))
657      ! FHJ: only substitute sig%qgrid (used in Voronoi averages) if sig%qgrid
658      ! is empty, taking the grid from epsmat(.h5) when that file is available
659      if (all(sig%qgrid(1:3)==0).and.((.not.is_q0).or.(sig%igamma==1))) then
660        sig%qgrid(:) = qgrid(:)
661      endif
662    endif ! peinf%inode==0
663
664
665!------------------------------------------------------------------------------
666! Bcast q-points and allocate/bcast full-frequency and epsilon buffers
667!------------------------------------------------------------------------------
668#ifdef MPI
669    call MPI_Bcast(sig%qpt(1,qoffset+1), 3*nq_file, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpierr)
670#endif
671    if (is_q0) then
672      sig%q0vec(:) = sig%qpt(:,1)
673      ! We need to manually set the q0 point in sig%qpt to zero
674      if (.not.sig%subsample) sig%qpt(:,:sig%nq0) = 0d0
675      if (.not.is_static) then
676#ifdef MPI
677        call MPI_Bcast(sig%dFreqGrid,sig%nFreq,MPI_DOUBLE_PRECISION,0,MPI_COMM_WORLD,mpierr)
678        call MPI_Bcast(sig%dFreqBrd,sig%nFreq,MPI_DOUBLE_COMPLEX,0,MPI_COMM_WORLD,mpierr)
679#endif
680      endif
681    endif !is_q0
682
683
684!------------------------------------------------------------------------------
685! Loop over all q-points, map G-vectors and read/store dielectric matrices
686!------------------------------------------------------------------------------
687    call progress_init(prog_info, 'reading '//TRUNC(fname), 'q-point', nq_file)
688    do iq=1,nq_file
689      call progress_step(prog_info, iq)
690      call logit('Storing eps to memory')
691
692      ! FHJ: Map old isrt (from epsilon gspace) to new isrt (to gvec gspace)
693      !----------------------------------------------------
694      if (peinf%inode==0) then
695#ifdef HDF5
696        if (sig%use_hdf5) then
697          SAFE_ALLOCATE(isrtinvdummy, (ng_old))
698          call read_eps_gvecsofq_hdf5(ng_old,isrtold,isrtinvdummy,ekold,iq,TRUNC(fname))
699          SAFE_DEALLOCATE(isrtinvdummy)
700          nmtx = nmtx_of_q(iq)
701          ngq = ng_old
702          ! in case of subspace read the coulomb potential
703          if(matrix_in_subspace_basis) then
704            SAFE_ALLOCATE(vcoul, (nmtx))
705            call read_vcoul_hdf5(vcoul, iq, TRUNC(fname))
706          end if
707        else
708#endif
709          read(iunit) ngq, nmtx, (isrtold(ig),ii,ig=1,ngq)
710          read(iunit) (ekold(ig),ig=1,ngq)
711          read(iunit) (qk(ii),ii=1,3)
712          ! in case of subspace read the coulomb potential
713          if(matrix_in_subspace_basis) then
714            SAFE_ALLOCATE(vcoul, (nmtx))
715            read(iunit) (vcoul(ig),ig=1,nmtx)
716          end if
717#ifdef HDF5
718        endif
719#endif
720        isrtq = 0
721        isrtqi = 0
722        qk = sig%qpt(:,iq+qoffset)
723        if (is_q0) qk(:) = 0d0
724        do ig=1,ngq
725          if (ekold(isrtold(ig))<=sig%ecutb) then
726            gg(:) = gvecs_old(:, isrtold(ig))
727            call findvector(iout, gg, gvec)
728            isrtq(ig) = iout
729            isrtqi(iout) = ig
730            if (ig==1) then
731              ! just check the first so we do not waste time
732              ekin = DOT_PRODUCT(gvec%components(:,iout)+qk(:), MATMUL(crys%bdot, gvec%components(:,iout)+qk(:)))
733              if (abs(ekin-ekold(isrtold(ig))) > TOL_Zero) then
734                write(0,*) 'iq = ', iq, ' ig = ',ig, ' qk = ', qk
735                write(0,*) 'epsmat: ekold(isrtold(i)) = ', ekold(isrtold(ig)), ' ekin = ', ekin
736                call die("Incorrect kinetic energies in "//TRUNC(fname)//".")
737              endif
738            endif
739          endif  ! if (ekold(isrtold(ig))<=sig%ecutb)
740        enddo  ! do ig=1,ngq
741      endif  ! if (peinf%inode==0)
742#ifdef MPI
743      call MPI_Bcast(nmtx,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr)
744#endif
745      if (peinf%inode==0) then
746        epsmpi%isrtq(:,iq+qoffset) = isrtq(:)
747        epsmpi%isrtqi(:,iq+qoffset) = isrtqi(:)
748      endif
749      epsmpi%nmtx(iq+qoffset) = nmtx
750
751      IF(matrix_in_subspace_basis) THEN
752        ! broadcast coulomb potential
753        IF(peinf%inode/=0) THEN
754          SAFE_ALLOCATE(vcoul, (nmtx))
755          vcoul = 0.0_DP
756        END IF
757#ifdef MPI
758        call MPI_Bcast(vcoul,nmtx,MPI_REAL_DP,0,MPI_COMM_WORLD,mpierr)
759#endif
760
761#if defined MPI && defined USESCALAPACK
762        call timing%start(timing%epscopy_sub)
763        call epscopy_subspace(nmtx,vcoul,keep_full_eps_static,iq,qoffset,iunit,fname)
764        call timing%stop(timing%epscopy_sub)
765#endif
766
767        IF(sig%need_advanced) THEN
768          write(6,'(1x,a)') &
769          'Subspace truncation method only implemented for sig%freq_dep==2 and sig%freq_dep_method==2'
770          call die('need_advanced is an invalid option for subspace + cd_integration_method.')
771        END IF
772
773        if (is_q0.and.iq==1.and.peinf%inode==0) then
774          epshead = epsmpi%epsR(1,1,1,1)
775        endif
776
777        SAFE_DEALLOCATE(vcoul)
778      ELSE ! subspace read
779#ifdef HDF5
780        if (sig%use_hdf5) then
781          ! FHJ: Read & store epsinv === HDF5 ===
782          !--------------------------------------
783          if (is_static) then
784            call timing%start(timing%epscopy_io)
785            call read_eps_matrix_par_hdf5(epsmpi%eps(:,:,iq+qoffset), &
786              epsmpi%nb, peinf%pool_rank, peinf%npes_pool, nmtx, &
787              iq, 1, fname)
788            call timing%stop(timing%epscopy_io)
789          else
790            call timing%start(timing%epscopy_io)
791            if (sig%need_advanced) then
792              call read_eps_matrix_par_f_hdf5(epsmpi%epsR(:,:,:,iq+qoffset), &
793                epsmpi%nb, peinf%pool_comm, peinf%my_pool, peinf%npools, nmtx, &
794                sig%nFreq, iq, 1, fname, advanced=epsmpi%epsA(:,:,:,iq+qoffset))
795            else
796              call read_eps_matrix_par_f_hdf5(epsmpi%epsR(:,:,:,iq+qoffset), &
797                epsmpi%nb, peinf%pool_comm, peinf%my_pool, peinf%npools, nmtx, &
798                sig%nFreq, iq, 1, fname)
799            endif
800            call timing%stop(timing%epscopy_io)
801          endif
802          if (is_q0.and.iq==1.and.peinf%inode==0) then
803            if (is_static) then
804              epshead = epsmpi%eps(1,1,1)
805            else
806              epshead = epsmpi%epsR(1,1,1,1)
807            endif
808          endif
809        else !HDF5
810#endif
811          ! FHJ: Read & store epsinv === Fortran binary  ===
812          !-------------------------------------------------
813          do igp=1,nmtx
814            if (is_static) then
815              if (peinf%inode==0) then
816                call timing%start(timing%epscopy_io)
817                read(iunit) (eps(ig),ig=1,nmtx)
818                call timing%stop(timing%epscopy_io)
819                if (is_q0.and.iq==1.and.igp==1) epshead = eps(1)
820                call timing%start(timing%epscopy_comm)
821                call timing%stop(timing%epscopy_comm)
822              endif
823
824              call timing%start(timing%epscopy_comm)
825#ifdef MPI
826              call MPI_Bcast(eps, neps, MPI_SCALAR, 0, MPI_COMM_WORLD, mpierr)
827#endif
828              call timing%stop(timing%epscopy_comm)
829            else ! is_static
830              if (peinf%inode==0) then
831                do ig=1,nmtx
832                  call timing%start(timing%epscopy_io)
833                  read(iunit) (epsRDyn(ig,iw),iw=1,sig%nFreq) ! Retarded part
834                  call timing%stop(timing%epscopy_io)
835                  if (is_q0.and.iq==1.and.igp==1) epshead = epsRDyn(1,1)
836                  call timing%start(timing%epscopy_comm)
837                  call timing%stop(timing%epscopy_comm)
838                enddo
839#ifdef CPLX
840                do ig=1,nmtx
841                  call timing%start(timing%epscopy_io)
842                  if (sig%need_advanced) then
843                    read(iunit) (epsADyn(ig,iw),iw=1,sig%nFreq) ! Advanced part
844                  else
845                    read(iunit) ! Advanced part
846                  endif
847                  call timing%stop(timing%epscopy_io)
848                enddo
849#endif
850              endif
851
852              call timing%start(timing%epscopy_comm)
853#ifdef MPI
854              call MPI_Bcast(epsRDyn, sig%nFreq*neps, MPI_COMPLEX_DPC, 0, MPI_COMM_WORLD, mpierr)
855              if (sig%need_advanced) then
856                call MPI_Bcast(epsADyn, sig%nFreq*neps, MPI_COMPLEX_DPC, 0, MPI_COMM_WORLD, mpierr)
857              endif
858#endif
859              call timing%stop(timing%epscopy_comm)
860            endif ! is_static
861
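            ! Block-cyclic ownership of column igp: INDXG2P returns the pool rank
            ! that owns it (block size epsmpi%nb) and INDXG2L its local index on
            ! that rank. For example, with nb=2 and 3 ranks, columns 1-2 live on
            ! rank 0, 3-4 on rank 1, 5-6 on rank 2, 7-8 on rank 0 again, and so on.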
862            dest = INDXG2P(igp, epsmpi%nb, peinf%pool_rank, 0, peinf%npes_pool)
863            igp_loc = INDXG2L(igp, epsmpi%nb, peinf%pool_rank, 0, peinf%npes_pool)
864            call timing%start(timing%epscopy_comm)
865            if (peinf%pool_rank==dest) then
866              if (is_static) then
867                epsmpi%eps(:,igp_loc,iq+qoffset) = eps(:)
868              else
869                epsmpi%epsR(:,igp_loc,:,iq+qoffset) = epsRDyn(:,:)
870                if (sig%need_advanced) then
871                  epsmpi%epsA(:,igp_loc,:,iq+qoffset) = epsADyn(:,:)
872                endif
873              endif ! is_static
874            endif ! if (peinf%pool_rank==dest)
875            call timing%stop(timing%epscopy_comm)
876          enddo  ! do igp=1,nmtx
877#ifdef HDF5
878        endif !HDF5
879#endif
880      END IF ! subspace read
881    enddo ! iq
882    call progress_free(prog_info)
883
884!------------------------------------------------------------------------------
885! Done!
886!------------------------------------------------------------------------------
887
888    if (peinf%inode==0) then
889#ifdef HDF5
890      if (sig%use_hdf5) then
891        SAFE_DEALLOCATE(nmtx_of_q)
892        if(matrix_in_subspace_basis) then
893          SAFE_DEALLOCATE(sub_neig_of_q)
894        end if
895      else
896#endif
897        call close_file(iunit)
898#ifdef HDF5
899      endif
900#endif
901      SAFE_DEALLOCATE(ekold)
902      SAFE_DEALLOCATE(isrtold)
903      SAFE_DEALLOCATE(isrtq)
904      SAFE_DEALLOCATE(isrtqi)
905    endif
906
907    POP_SUB(epscopy.read_epsmat)
908
909  end subroutine read_epsmat
910
911#if defined MPI && defined USESCALAPACK
912  subroutine epscopy_subspace(nmtx,vcoul,keep_full_eps_static,iq,qoffset,iunit,fname)
913    integer  :: nmtx
914    real(DP) :: vcoul(nmtx)
915    logical  :: keep_full_eps_static
916    integer  :: iq, qoffset, iunit
917    character(len=16), intent(in) :: fname
918
919    complex(DPC), allocatable :: eigenvect(:,:), C_aux(:,:), buffer(:,:)
920    complex(DPC), allocatable :: eps_sub_f(:,:,:), E_aux(:,:,:), buffer_f(:,:,:)
921    complex(DPC), allocatable :: eps1Aux(:,:), C_Pgemm(:,:)
922    complex(DPC) :: C_send(nmtx), C_recv(nmtx)
923    real(DP) :: vc
924    type (scalapack) :: scal, scal_sub, scal_1d, scal_aux
925    integer, allocatable :: row_indices(:,:), col_indices(:,:)
926    integer, allocatable :: row_indices_sub(:,:), col_indices_sub(:,:)
927    integer :: desca(9), desc_sub(9), info
928    integer :: max_npr, max_npc, tag
929    integer :: nrow_loc_1d, ncol_loc_1d
930    integer :: nrow_loc_sub, ncol_loc_sub, max_npr_sub, max_npc_sub
931    integer :: igp, ig, ipe, neig_sub, Nfreq, ifreq, ig_l, ig_g, igp_l, igp_g
932    integer :: i_local, i_global, j_local, j_global
933    integer :: icurr, i, irow, j, icol, irowm, icolm
934
935    integer :: Nmsg_to_send, Nmsg_to_recv
936    type(comm_buffer), allocatable :: buf_send(:), buf_recv(:)
937    integer, allocatable :: pe2grid(:,:), grid2pe(:,:), pool2pe(:,:)
938    integer, allocatable :: num_col_to_send(:), num_col_to_recv(:)
939    integer, allocatable :: proc2msg_send(:), local_col_count(:)
940    integer, allocatable :: req_send(:), req_recv(:)
941    integer, allocatable :: my_status(:,:)
942    integer :: iproc_dum, iproc_send, iproc_rec, pool_send, iproc_row_rec
943    integer :: Nbuf_send, Nbuf_recv, imsg_recv, imsg_send, my_col
944    ! variables for sigma-subspace
945    complex(DPC), allocatable :: buffer_sub_eigen(:,:),  buffer_sub_eps(:,:,:)
946    integer :: indx_start, indx_end, sub_size
947    integer :: iproc_pool, iproc_row, iproc_col
948    integer :: max_head, gg(3), iout
949    integer :: ipe_wing, my_pos_loc_wing
950    ! variables for fast vcoul scaling
951    real(DP), allocatable :: row_vcoul_aux(:), col_vcoul_aux(:)
952    integer, allocatable :: my_diag_indeces(:,:)
953    integer :: idiag, my_num_diag_elem
954    ! variables for cyclic redistribution of eigenvectors and subspace epsilon
955    logical :: cyclic_redistr
956    complex(DPC), allocatable :: rec_buffer_sub_eigen(:,:), send_buffer_sub_eigen(:,:)
957    complex(DPC), allocatable :: rec_buffer_sub_eps(:,:,:), send_buffer_sub_eps(:,:,:)
958    integer :: max_npc_lt_Neig, npc_lt_Neig
959    integer :: req_send_singl, tag_send, req_rec_singl, tag_rec
960    integer :: irec_static, isend_static
961    integer :: actual_send, actual_rec, ipe_real
962    logical :: do_copy
963    integer :: ncol_to_copy
964    integer, allocatable :: copy_col_inx(:)
965
966    PUSH_SUB(epscopy.epscopy_subspace)
967
968    ! set default cyclic_redistr flag
969    cyclic_redistr = .true.
970    if( sig%sub_collective_redistr ) cyclic_redistr = .false.
971
972    ! set up the BLACS env for the full matrices
973    call blacs_setup(scal, nmtx, .true.)
974
975    ! exchange info
976    SAFE_ALLOCATE(scal%nprd, (peinf%npes))
977    SAFE_ALLOCATE(scal%npcd, (peinf%npes))
978    scal%nprd = 0
979    scal%npcd = 0
980    scal%nprd(peinf%inode + 1) = scal%npr
981    scal%npcd(peinf%inode + 1) = scal%npc
982    !
983    call MPI_ALLREDUCE(MPI_IN_PLACE,scal%nprd,peinf%npes, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD,mpierr)
984    call MPI_ALLREDUCE(MPI_IN_PLACE,scal%npcd,peinf%npes, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD,mpierr)
985    ! get max number of row/col
986    max_npr = MAXVAL(scal%nprd)
987    max_npc = MAXVAL(scal%npcd)
988
989    ! precompute indices
990    SAFE_ALLOCATE(row_indices, (max_npr, peinf%npes))
991    row_indices = 0
992    DO i_local = 1, scal%npr
993      i_global = indxl2g(i_local,  scal%nbl, scal%myprow, 0, scal%nprow)
994      row_indices(i_local,peinf%inode + 1) = i_global
995    END DO
996    SAFE_ALLOCATE(col_indices, (max_npc, peinf%npes))
997    col_indices = 0
998    DO i_local = 1, scal%npc
999      i_global = indxl2g(i_local,  scal%nbl, scal%mypcol, 0, scal%npcol)
1000      col_indices(i_local,peinf%inode + 1) = i_global
1001    END DO
1002    call MPI_ALLREDUCE(MPI_IN_PLACE,row_indices,peinf%npes*max_npr, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD,mpierr)
1003    call MPI_ALLREDUCE(MPI_IN_PLACE,col_indices,peinf%npes*max_npc, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD,mpierr)
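    ! After the reductions, row_indices(i,p)/col_indices(j,p) give, on every
    ! rank, the global row/column stored as local index i/j by process p-1 in
    ! the 2D ScaLAPACK grid.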
1004
1005    !XXXXXXXXXX Here we read and distribute the full omega=0 epsilon^-1 (can be problematic in terms of memory...)
1006    IF(keep_full_eps_static) THEN
1007      ! exchange info (low memory communication scheme)
1008      SAFE_ALLOCATE(pool2pe, (peinf%npools,peinf%npes_pool))
1009      pool2pe = 0
1010      IF(peinf%my_pool >= 0 .AND. peinf%pool_rank >= 0) THEN
1011        pool2pe(peinf%my_pool+1, peinf%pool_rank+1) = peinf%inode
1012      END IF
1013      call MPI_ALLREDUCE(MPI_IN_PLACE, pool2pe, peinf%npools*peinf%npes_pool, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, mpierr)
1014      !
1015      ! go with h5
1016#ifdef HDF5
1017      if (sig%use_hdf5) then
1018        ! read from .h5
1019        ! here only a single pool of processes reads, using a 1D block-cyclic
1020        ! ScaLAPACK layout
1021        scal_1d%nprow   = 1
1022        scal_1d%npcol   = peinf%npes_pool
1023        scal_1d%myprow  = 0
1024        scal_1d%mypcol  = peinf%pool_rank
1025        scal_1d%nbl     = epsmpi%nb
1026        scal_1d%icntxt  = scal%icntxt
1027        nrow_loc_1d = numroc(nmtx, scal_1d%nbl, scal_1d%myprow, 0, scal_1d%nprow)
1028        ncol_loc_1d = numroc(nmtx, scal_1d%nbl, scal_1d%mypcol, 0, scal_1d%npcol)
1029        scal_1d%npr     = nrow_loc_1d
1030        scal_1d%npc     = ncol_loc_1d
1031        ! write(*,*) peinf%inode, peinf%pool_rank, peinf%my_pool, nrow_loc_1d, ncol_loc_1d, epsmpi%ngpown
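        ! This 1 x npes_pool grid with block size epsmpi%nb matches the column
        ! distribution of epsmpi%epsR, so the slab read into eps1Aux below can
        ! be copied straight into epsmpi%epsR.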
1032
1033        ! temporary buffer
1034        SAFE_ALLOCATE(eps1Aux, (scal_1d%npr, scal_1d%npc))
1035        eps1Aux = (0.0d0, 0.0d0)
1036        if( peinf%my_pool == 0) then
1037          !
1038          call timing%start(timing%epscopy_io)
1039          call read_eps_matrix_par_hdf5_general(scal_1d, eps1Aux, nmtx, nmtx, iq, 1, fname, &
1040                                                'mats/matrix_fulleps0', peinf%pool_comm)
1041          call timing%stop(timing%epscopy_io)
1042          !
1043          call timing%start(timing%epscopy_comm)
1044          do iproc_dum = 1, peinf%npools - 1
1045            iproc_send = pool2pe(iproc_dum+1, peinf%pool_rank+1)
1046            tag = 0
1047            CALL MPI_SEND(eps1Aux, scal_1d%npr*scal_1d%npc, &
1048                          MPI_COMPLEX_DPC, iproc_send, tag, MPI_COMM_WORLD, mpierr)
1049          end do
1050          call timing%stop(timing%epscopy_comm)
1051          ! nrow_loc_1d should be equal to nmtx
1052          epsmpi%epsR(1:nrow_loc_1d,1:ncol_loc_1d,1,iq+qoffset) = eps1Aux(:,:)
1053        else
1054          iproc_dum = pool2pe(1, peinf%pool_rank+1)
1055          ! this condition should ensure that the processes left over will not be
1056          ! involved in communication
1057          if(peinf%my_pool >= 0 .AND. peinf%pool_rank >= 0) then
1058            tag = 0
1059            CALL MPI_RECV(eps1Aux, scal_1d%npr*scal_1d%npc, &
1060                          MPI_COMPLEX_DPC, iproc_dum, tag, MPI_COMM_WORLD, mpistatus, mpierr)
1061          end if
1062          epsmpi%epsR(1:nrow_loc_1d,1:ncol_loc_1d,1,iq+qoffset) = eps1Aux(:,:)
1063        end if
1064        SAFE_DEALLOCATE(eps1Aux)
1065
1066      else  ! sig%use_hdf5
1067#endif
1068        ! standard binary read
1069        !
1070        IF(peinf%inode == 0) THEN
1071          DO j_global = 1, nmtx
1072            ! read full column
1073            read(iunit) (C_send(ig), ig=1, nmtx)
1074            !
1075            ! global to pool_rank (in the new sigma distribution)
1076            pool_send = INDXG2P( j_global, epsmpi%nb, iproc_dum, 0, peinf%npes_pool)
1077            DO iproc_dum = 0, peinf%npools - 1
1078              ! select the process that I will have to send to
1079              iproc_send = pool2pe(iproc_dum+1,pool_send+1)
1080              IF(iproc_send == 0) THEN
1081                j_local = INDXG2L( j_global, epsmpi%nb, peinf%pool_rank, 0, peinf%npes_pool)
1082                epsmpi%epsR(1:nmtx,j_local,1,iq+qoffset) = C_send(1:nmtx)
1083              ELSE
1084                tag = 0
1085                CALL MPI_SEND(C_send, nmtx, &
1086                              MPI_COMPLEX_DPC, iproc_send, tag, MPI_COMM_WORLD, mpierr)
1087              END IF
1088            END DO
1089          END DO
1090        ELSE
1091          DO j_global = 1, nmtx
1092            ! global to pool_rank (in the new sigma distribution)
1093            pool_send = INDXG2P( j_global, epsmpi%nb, iproc_dum, 0, peinf%npes_pool)
1094            DO iproc_dum = 0, peinf%npools - 1
1095              ! select the process that I will have to send to
1096              iproc_send = pool2pe(iproc_dum+1,pool_send+1)
1097              IF(iproc_send == peinf%inode) THEN
1098                j_local = INDXG2L( j_global, epsmpi%nb, peinf%pool_rank, 0, peinf%npes_pool)
1099                tag = 0
1100                CALL MPI_RECV(C_recv, nmtx, &
1101                              MPI_COMPLEX_DPC, 0, tag, MPI_COMM_WORLD, mpistatus, mpierr)
1102                epsmpi%epsR(1:nmtx,j_local,1,iq+qoffset) = C_recv
1103              END IF
1104            END DO
1105          END DO
1106        END IF
1107        !
1108#ifdef HDF5
1109      end if ! h5
1110#endif
1111      !
1112      SAFE_DEALLOCATE(pool2pe)
1113    END IF ! keep_full_eps_static
1114
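    ! Read the (nmtx x neig_sub) eigenvector matrix into the 2D block-cyclic
    ! layout described by scal, either from the HDF5 file or from the binary
    ! file; columns beyond neig_sub are left as zero.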
1115    SAFE_ALLOCATE(eigenvect, (scal%npr, scal%npc))
1116    eigenvect = (0.0d0,0.0d0)
1117
1118#ifdef HDF5
1119    if (sig%use_hdf5) then
1120      ! read from .h5
1121      ! get number of eigenvectors
1122      neig_sub = 0
1123      IF(peinf%inode==0) THEN
1124        neig_sub = sub_neig_of_q(iq)
1125      END IF
1126      call MPI_Bcast(neig_sub,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr)
1127
1128      scal_aux%nprow   = scal%nprow
1129      scal_aux%npcol   = scal%npcol
1130      scal_aux%myprow  = scal%myprow
1131      scal_aux%mypcol  = scal%mypcol
1132      scal_aux%nbl     = scal%nbl
1133      scal_aux%icntxt  = scal%icntxt
1134      scal_aux%npr     = scal%npr
1135      scal_aux%npc     = numroc(neig_sub, scal%nbl, scal%mypcol, 0, scal%npcol)
1136      !
1137      call timing%start(timing%epscopy_io)
1138      if(.true.) then
1139        call read_eps_matrix_par_hdf5_general(scal_aux, eigenvect, nmtx, neig_sub, iq, 1, fname, &
1140                                              'mats/matrix_eigenvec', MPI_COMM_WORLD)
1141      else
1142        call read_eps_matrix_par_hdf5_general(scal, eigenvect, nmtx, nmtx, iq, 1, fname, &
1143                                              'mats/matrix_eigenvec', MPI_COMM_WORLD)
1144      end if
1145      call timing%stop(timing%epscopy_io)
1146      !
1147    else  ! sig%use_hdf5
1148#endif
1149      ! standard binary read
1150      !
1151      neig_sub = 0
1152      IF(peinf%inode==0) THEN
1153        read(iunit) neig_sub
1154      END IF
1155      call MPI_Bcast(neig_sub,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr)
1156
1157      SAFE_ALLOCATE(buffer, (max_npr,max_npc))
1158      buffer = (0.0d0,0.0d0)
1159
1160      ! read eigenvector and redistribute
1161      IF(peinf%inode==0) THEN
1162        call timing%start(timing%sub_io_vec)
1163        SAFE_ALLOCATE(C_aux, (nmtx,neig_sub))
1164        C_aux = (0.0d0,0.0d0)
1165        do igp=1, neig_sub
1166          read(iunit) (C_aux(ig,igp),ig=1,nmtx)
1167        end do
1168        do igp=neig_sub + 1, nmtx
1169          read(iunit)
1170        end do
1171        call timing%stop(timing%sub_io_vec)
1172        ! prepare the messages and send them
1173        DO ipe = 1, peinf%npes - 1
1174          call timing%start(timing%sub_prep_vec)
1175          buffer = (0.0d0,0.0d0)
1176          DO j_local=1, scal%npcd(ipe + 1)
1177            j_global = col_indices(j_local, ipe + 1)
1178            IF(j_global > neig_sub) CYCLE
1179            DO i_local=1, scal%nprd(ipe + 1)
1180              i_global = row_indices(i_local, ipe + 1)
1181              buffer(i_local, j_local) = C_aux(i_global,j_global)
1182            END DO
1183          END DO
1184          call timing%stop(timing%sub_prep_vec)
1185          ! send
1186          call timing%start(timing%sub_comm_vec)
1187          tag = 0
1188          CALL MPI_SEND(buffer, max_npr*max_npc, MPI_COMPLEX_DPC, ipe, tag, MPI_COMM_WORLD,mpierr)
1189          call timing%stop(timing%sub_comm_vec)
1190        END DO
1191        ! myself
1192        DO j_local=1, scal%npc
1193          j_global = col_indices(j_local, peinf%inode + 1)
1194          IF(j_global > neig_sub) CYCLE
1195          DO i_local=1, scal%npr
1196            i_global = row_indices(i_local, peinf%inode + 1)
1197            eigenvect(i_local, j_local) = C_aux(i_global,j_global)
1198          END DO
1199        END DO
1200        ! done
1201
1202        !XXX if(peinf%inode == 0) then
1203        !XXX   do ig = 1, neig_sub
1204        !XXX     do igp = 1, nmtx
1205        !XXX       write(2000,*) ig, igp, C_aux(igp,ig)
1206        !XXX     end do
1207        !XXX   end do
1208        !XXX   write(2000,*)
1209        !XXX end if
1210
1211        SAFE_DEALLOCATE(C_aux)
1212      ELSE
1213        ! receive my stuff
1214        tag = 0
1215        CALL MPI_RECV(buffer, max_npr*max_npc, MPI_COMPLEX_DPC, 0, tag, MPI_COMM_WORLD, mpistatus, mpierr)
1216        eigenvect(1:scal%npr, 1:scal%npc) = buffer(1:scal%npr, 1:scal%npc)
1217      END IF
1218      call MPI_Barrier(MPI_COMM_WORLD, mpierr)
1219      SAFE_DEALLOCATE(buffer)
1220      !
1221#ifdef HDF5
1222    end if ! h5
1223#endif
1224
1225      !XXX do j_local = 1, scal%npc
1226      !XXX   do i_local = 1, scal%npr
1227      !XXX     write(1000+peinf%inode,*) i_local, j_local, eigenvect(i_local,j_local)
1228      !XXX   end do
1229      !XXX end do
1230      !XXX flush(1000+peinf%inode)
1231      !XXX write(*,*) 'AAAAA'
1232
1233    ! Do the same for the subspace epsilon (all frequencies at once).
1234    ! First create the ScaLAPACK env, retaining the same block/grid layout
1235    ! as the eigenvector matrix, then
1236    ! get the row/col sizes for the subspace matrices
1237    ! call MPI_Bcast(neig_sub,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr)
1238    nrow_loc_sub = numroc(neig_sub, scal%nbl, scal%myprow, 0, scal%nprow)
1239    ncol_loc_sub = numroc(neig_sub, scal%nbl, scal%mypcol, 0, scal%npcol)
1240    scal_sub%nprow   = scal%nprow
1241    scal_sub%npcol   = scal%npcol
1242    scal_sub%myprow  = scal%myprow
1243    scal_sub%mypcol  = scal%mypcol
1244    scal_sub%nbl     = scal%nbl
1245    scal_sub%icntxt  = scal%icntxt
1246    scal_sub%npr     = nrow_loc_sub
1247    scal_sub%npc     = ncol_loc_sub
1248    !
1249    SAFE_ALLOCATE(scal_sub%nprd, (peinf%npes))
1250    SAFE_ALLOCATE(scal_sub%npcd, (peinf%npes))
1251    scal_sub%nprd = 0
1252    scal_sub%npcd = 0
1253    scal_sub%nprd(peinf%inode + 1) = scal_sub%npr
1254    scal_sub%npcd(peinf%inode + 1) = scal_sub%npc
1255    !
1256    call MPI_ALLREDUCE(MPI_IN_PLACE,scal_sub%nprd,peinf%npes, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD,mpierr)
1257    call MPI_ALLREDUCE(MPI_IN_PLACE,scal_sub%npcd,peinf%npes, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD,mpierr)
1258    ! get max number of row/col
1259    max_npr_sub = MAXVAL(scal_sub%nprd)
1260    max_npc_sub = MAXVAL(scal_sub%npcd)
1261
1262    ! precompute indices for subspace matrices
1263    SAFE_ALLOCATE(row_indices_sub, (max_npr_sub, peinf%npes))
1264    row_indices_sub = 0
1265    DO i_local = 1, scal_sub%npr
1266      i_global = indxl2g(i_local,  scal_sub%nbl, scal_sub%myprow, 0, scal_sub%nprow)
1267      row_indices_sub(i_local, peinf%inode + 1) = i_global
1268    END DO
1269    SAFE_ALLOCATE(col_indices_sub, (max_npc_sub, peinf%npes))
1270    col_indices_sub = 0
1271    DO i_local = 1, scal_sub%npc
1272      i_global = indxl2g(i_local,  scal_sub%nbl, scal_sub%mypcol, 0, scal_sub%npcol)
1273      col_indices_sub(i_local, peinf%inode + 1) = i_global
1274    END DO
1275    call MPI_ALLREDUCE(MPI_IN_PLACE, row_indices_sub,peinf%npes*max_npr_sub, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD,mpierr)
1276    call MPI_ALLREDUCE(MPI_IN_PLACE, col_indices_sub,peinf%npes*max_npc_sub, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD,mpierr)
1277
1278    ! here we go with copy/redistribution
1279    Nfreq = sig%Nfreq
1280    SAFE_ALLOCATE(eps_sub_f, (MAX(1,scal_sub%npr), MAX(1,scal_sub%npc), Nfreq))
1281    eps_sub_f = (0.0d0,0.0d0)
1282#ifdef HDF5
1283    if (sig%use_hdf5) then
1284      ! read from .h5
1285      call timing%start(timing%epscopy_io)
1286      call read_eps_matrix_par_hdf5_general_f(scal_sub, eps_sub_f, neig_sub, neig_sub, Nfreq, iq, 1, fname, &
1287                                              'mats/matrix_subspace', MPI_COMM_WORLD)
1288      call timing%stop(timing%epscopy_io)
1289      !
1290    else  ! sig%use_hdf5
1291#endif
1292      ! binary case
1293      SAFE_ALLOCATE(buffer_f, (Nfreq, max_npr_sub, max_npc_sub))
1294      buffer_f  = (0.0d0,0.0d0)
1295      IF(peinf%inode==0) THEN
1296        call timing%start(timing%sub_io_eps)
1297        SAFE_ALLOCATE(E_aux, (Nfreq,neig_sub,neig_sub))
1298        E_aux = (0.0d0,0.0d0)
1299        do igp= 1, neig_sub
1300          do ig = 1, neig_sub
1301            read(iunit) (E_aux(ifreq,ig,igp),ifreq=1,Nfreq)
1302          end do
1303#ifdef CPLX
1304          do ig=1, neig_sub
1305            read(iunit) ! For epsADyn (empty line...)
1306          enddo
1307#endif
1308        end do
1309        call timing%stop(timing%sub_io_eps)
1310        ! prepare the messages and send them
1311        DO ipe = 1, peinf%npes - 1
1312          call timing%start(timing%sub_prep_eps)
1313          buffer_f = (0.0d0,0.0d0)
1314          DO j_local=1, scal_sub%npcd(ipe + 1)
1315            j_global = col_indices_sub(j_local, ipe + 1)
1316            DO i_local=1, scal_sub%nprd(ipe + 1)
1317              i_global = row_indices_sub(i_local, ipe + 1)
1318              buffer_f(1:Nfreq, i_local, j_local) = E_aux(1:Nfreq, i_global, j_global)
1319            END DO
1320          END DO
1321          call timing%stop(timing%sub_prep_eps)
1322          ! send
1323          call timing%start(timing%sub_comm_eps)
1324          tag = 0
1325          CALL MPI_SEND(buffer_f, Nfreq*max_npr_sub*max_npc_sub, MPI_COMPLEX_DPC, ipe, tag, MPI_COMM_WORLD, mpierr)
1326          call timing%stop(timing%sub_comm_eps)
1327        END DO
1328        ! myself
1329        DO j_local=1, scal_sub%npc
1330          j_global = col_indices_sub(j_local, peinf%inode + 1)
1331          DO i_local=1, scal_sub%npr
1332            i_global = row_indices_sub(i_local, peinf%inode + 1)
1333            eps_sub_f(i_local, j_local, 1:Nfreq) = E_aux(1:Nfreq, i_global, j_global)
1334          END DO
1335        END DO
1336        ! done
1337        SAFE_DEALLOCATE(E_aux)
1338      ELSE
1339        ! receive my stuff
1340        tag = 0
1341        CALL MPI_RECV(buffer_f, max_npr_sub*max_npc_sub*Nfreq, MPI_COMPLEX_DPC, 0, tag, MPI_COMM_WORLD, mpistatus, mpierr)
1342        DO ifreq = 1, Nfreq
1343          eps_sub_f(1:scal_sub%npr, 1:scal_sub%npc, ifreq) = buffer_f(ifreq, 1:scal_sub%npr, 1:scal_sub%npc)
1344        END DO
1345      END IF
1346      SAFE_DEALLOCATE(buffer_f)
1347      !
1348#ifdef HDF5
1349    end if ! sig%use_hdf5
1350#endif
1351
1352    ! initialize descriptors
1353    call descinit(desca, nmtx, nmtx, scal%nbl, scal%nbl, 0, 0, &
1354                  scal%icntxt, max(1,scal%npr), info)
1355    if(info < 0) then
1356      write(0,'(a,i3,a)') 'Argument number ', -info, ' had an illegal value on entry.'
1357      call die("descinit error for descaA in subspace epscopy")
1358    else if(info > 0) then
1359      write(0,*) 'info = ', info
1360      call die("descinit error for descaA in subspace epscopy")
1361    endif
1362
1363    call descinit(desc_sub, neig_sub, neig_sub, scal_sub%nbl, scal_sub%nbl, 0, 0, &
1364                  scal_sub%icntxt, max(1,nrow_loc_sub), info)
1365    if(info < 0) then
1366      write(0,'(a,i3,a)') 'Argument number ', -info, ' had an illegal value on entry.'
1367      call die("descinit error for desca_sub in subspace epscopy")
1368    else if(info > 0) then
1369      write(0,*) 'info = ', info
1370      call die("descinit error for desca_sub in subspace epscopy")
1371    endif
1372
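    ! The setup below prepares the redistribution of columns from the 2D
    ! block-cyclic (ScaLAPACK) epsilon layout to Sigma's 1D column distribution
    ! over pool ranks: build maps between the process grid and the pools, count
    ! how many columns each pair of processes exchanges, and allocate the
    ! per-destination send/receive buffers.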
1373    ! 0) initialize, map proc to rank_pool
1374    SAFE_ALLOCATE(grid2pe, (scal%nprow, scal%npcol))
1375    grid2pe = 0
1376    grid2pe(scal%myprow+1, scal%mypcol+1) = peinf%inode
1377    call MPI_ALLREDUCE(MPI_IN_PLACE, grid2pe, scal%nprow*scal%npcol , MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, mpierr)
1378    !
1379    SAFE_ALLOCATE(pe2grid, (2,peinf%npes))
1380    pe2grid = 0
1381    pe2grid(1, peinf%inode+1) = scal%myprow
1382    pe2grid(2, peinf%inode+1) = scal%mypcol
1383    call MPI_ALLREDUCE(MPI_IN_PLACE, pe2grid, 2*peinf%npes, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, mpierr)
1384    !
1385    SAFE_ALLOCATE(pool2pe, (peinf%npools,peinf%npes_pool))
1386    pool2pe = 0
1387    IF(peinf%my_pool >= 0 .AND. peinf%pool_rank >= 0) THEN
1388      pool2pe(peinf%my_pool+1, peinf%pool_rank+1) = peinf%inode
1389    END IF
1390    call MPI_ALLREDUCE(MPI_IN_PLACE, pool2pe, peinf%npools*peinf%npes_pool, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, mpierr)
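    ! The three maps above are built with the same trick: every rank writes its own entry
    ! (its global rank or grid coordinates) into an otherwise zero-initialized array, and
    ! the MPI_IN_PLACE allreduce with MPI_SUM makes the complete map available on all
    ! ranks. grid2pe maps BLACS grid coordinates to global ranks, pe2grid is its inverse,
    ! and pool2pe maps (pool, pool_rank) to global ranks.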
    ! loop over my local columns and count how many of them I have to send to each of the other processes
1392    SAFE_ALLOCATE(num_col_to_send, (peinf%npes))
1393    num_col_to_send = 0
1394    do j_local = 1, scal%npc
1395      ! local to global (in the epsilon distribution)
1396      j_global = col_indices(j_local, peinf%inode + 1)
1397      IF(j_global > nmtx) CYCLE
1398      ! global to pool_rank (in the new sigma distribution)
1399      pool_send = INDXG2P( j_global, epsmpi%nb, iproc_dum, 0, peinf%npes_pool)
1400      DO iproc_dum = 0, peinf%npools - 1
        ! select the process that I will have to send to
1402        iproc_send = pool2pe(iproc_dum+1,pool_send+1)
1403        num_col_to_send(iproc_send+1) = num_col_to_send(iproc_send+1) + 1
1404      END DO
1405    end do
    ! loop over my local columns (in the new sigma distribution) and precompute how many columns I have
    ! to receive from each of the other processes; this should only happen when peinf%my_pool >= 0
1408    SAFE_ALLOCATE(num_col_to_recv, (peinf%npes))
1409    num_col_to_recv = 0
1410    IF(peinf%my_pool >= 0) THEN
1411      do j_local = 1, epsmpi%ngpown
1412        ! local to global (in the sigma distribution)
1413        j_global = INDXL2G(j_local, epsmpi%nb, peinf%pool_rank, 0, peinf%npes_pool)
1414        IF(j_global > nmtx) CYCLE
        ! get the process column in the epsilon distribution that holds this column
1416        iproc_row_rec = INDXG2P( j_global, scal%nbl, iproc_dum, 0, scal%npcol)
1417        DO iproc_dum = 0, scal%nprow - 1
1418          iproc_rec = grid2pe(iproc_dum+1, iproc_row_rec+1)
1419          num_col_to_recv(iproc_rec+1) = num_col_to_recv(iproc_rec+1) + 1
1420        END DO
1421      end do
1422    END IF
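    ! For reference, the block-cyclic index helpers used here follow the ScaLAPACK TOOLS
    ! conventions (sketch of the reference formulas; the process-coordinate argument of
    ! INDXG2P is a dummy and is not referenced):
    !   INDXG2P(ig, nb, idum, isrc, np) = mod(isrc + (ig-1)/nb, np)
    !   INDXL2G(il, nb, ip, isrc, np)   = np*nb*((il-1)/nb) + mod(il-1,nb) + mod(np+ip-isrc,np)*nb + 1
    !   INDXG2L(ig, nb, ip, isrc, np)   = nb*((ig-1)/(nb*np)) + mod(ig-1,nb) + 1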
1423
1424    ! allocate receiving buffer
1425    Nbuf_recv = 0
1426    DO ipe = 1, peinf%npes
1427      IF(num_col_to_recv(ipe) > 0) Nbuf_recv = Nbuf_recv + 1
1428    END DO
1429    SAFE_ALLOCATE(buf_recv, (Nbuf_recv))
1430    imsg_recv = 0
1431    DO ipe = 1, peinf%npes
1432      IF(num_col_to_recv(ipe) > 0) THEN
1433        imsg_recv = imsg_recv + 1
1434        buf_recv(imsg_recv)%proc = ipe - 1
1435        buf_recv(imsg_recv)%ncol = num_col_to_recv(ipe)
1436        buf_recv(imsg_recv)%nrow = scal%nprd(ipe)
1437        SAFE_ALLOCATE(buf_recv(imsg_recv)%msg, (buf_recv(imsg_recv)%nrow,buf_recv(imsg_recv)%ncol))
1438        SAFE_ALLOCATE(buf_recv(imsg_recv)%col_global_indx, (buf_recv(imsg_recv)%ncol))
1439        buf_recv(imsg_recv)%col_global_indx = -100
1440      END IF
1441    END DO
1442
1443    ! allocate sending buffer
1444    Nbuf_send = 0
1445    DO ipe = 1, peinf%npes
1446      IF(num_col_to_send(ipe) > 0) Nbuf_send = Nbuf_send + 1
1447    END DO
1448    SAFE_ALLOCATE(proc2msg_send, (peinf%npes))
1449    proc2msg_send = 0
1450    SAFE_ALLOCATE(buf_send, (Nbuf_send))
1451    imsg_send = 0
1452    DO ipe = 1, peinf%npes
1453      IF(num_col_to_send(ipe) > 0) THEN
1454        imsg_send = imsg_send + 1
1455        buf_send(imsg_send)%proc = ipe - 1
1456        buf_send(imsg_send)%ncol = num_col_to_send(ipe)
1457        buf_send(imsg_send)%nrow = scal%npr
1458        SAFE_ALLOCATE(buf_send(imsg_send)%msg, (buf_send(imsg_send)%nrow, buf_send(imsg_send)%ncol))
1459        SAFE_ALLOCATE(buf_send(imsg_send)%col_global_indx, (buf_send(imsg_send)%ncol))
1460        buf_send(imsg_send)%col_global_indx = -100
1461        !
1462        proc2msg_send(ipe) = imsg_send
1463      END IF
1464    END DO
1465    SAFE_ALLOCATE(local_col_count, (Nbuf_send))
1466    SAFE_ALLOCATE(req_send, (2 * Nbuf_send))
1467    SAFE_ALLOCATE(my_status, (MPI_STATUS_SIZE, 2*Nbuf_send))
1468    SAFE_ALLOCATE(req_recv, (2 * Nbuf_recv))
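    ! Each comm_buffer prepared above carries one message per communicating pair: the
    ! nrow x ncol block of matrix elements in %msg plus the corresponding global column
    ! indices in %col_global_indx. These are exchanged as two separate (data + index)
    ! messages below, which is why req_send/req_recv hold two requests per buffer.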
1469
    ! in the case of sigma subspace, find the head/wings position for this specific q-point
1472    if(sig%do_sigma_subspace) then
1473      max_head = 1
      if (peinf%inode==0) then
        gg = 0
        call findvector(iout, gg, gvec)
        max_head = epsmpi%isrtqi(iout,iq+qoffset)
      end if
      ! every rank takes part in the broadcast, so it can be hoisted out of the if block
      call MPI_Bcast(max_head,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr)
1482      sig%epssub%wing_pos(iq+qoffset) = max_head
1483    end if
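    ! max_head is the position of the G=0 vector in the sorted G-list of this q-point,
    ! i.e. the row/column of the head of eps^-1; the wings saved further below are the
    ! row and column of the full matrix passing through this position.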
1484
    ! precompute the auxiliary Coulomb factors to avoid recomputing SQRT(vcoul) many times
1486    call timing%start(timing%epscopy_vcoul)
1487    SAFE_ALLOCATE(row_vcoul_aux, (scal%npr)) ! row -> * SQRT(vc)
1488    SAFE_ALLOCATE(col_vcoul_aux, (scal%npc)) ! col -> * 1/SQRT(vc)
1489    ! initialize to one since it will be a scaling factor
1490    row_vcoul_aux = 1.0d+00
1491    do ig_l = 1, scal%npr
1492      ig_g = indxl2g(ig_l, scal%nbl, scal%myprow, 0, scal%nprow)
1493      row_vcoul_aux(ig_l) = sqrt(vcoul(ig_g))
1494    enddo
1495    col_vcoul_aux = 1.0d+00
1496    do igp_l = 1, scal%npc
1497      igp_g = indxl2g(igp_l, scal%nbl, scal%mypcol, 0, scal%npcol)
1498      vc = 1.0d+00 / sqrt(vcoul(igp_g))
1499      col_vcoul_aux(igp_l) = vc
1500    end do
    ! count the locally owned diagonal elements
1502    my_num_diag_elem = 0
1503    do igp_l = 1, scal%npc
1504      igp_g = indxl2g(igp_l, scal%nbl, scal%mypcol, 0, scal%npcol)
1505      do ig_l = 1, scal%npr
1506        ig_g = indxl2g(ig_l, scal%nbl, scal%myprow, 0, scal%nprow)
1507        if (ig_g==igp_g) my_num_diag_elem = my_num_diag_elem + 1
1508      enddo
1509    enddo
1510    ! allocate index array for diagonal elements
1511    SAFE_ALLOCATE(my_diag_indeces, (2,my_num_diag_elem))
1512    my_diag_indeces = 0
1513    idiag = 0
1514    do igp_l = 1, scal%npc
1515      igp_g = indxl2g(igp_l, scal%nbl, scal%mypcol, 0, scal%npcol)
1516      vc = sqrt(vcoul(igp_g))
1517      do ig_l = 1, scal%npr
1518        ig_g = indxl2g(ig_l, scal%nbl, scal%myprow, 0, scal%nprow)
1519        if (ig_g==igp_g) then
1520          idiag = idiag + 1
1521          my_diag_indeces(1, idiag) = ig_l
1522          my_diag_indeces(2, idiag) = igp_l
1523        end if
1524      enddo
1525    enddo
1526    call timing%stop(timing%epscopy_vcoul)
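    ! The factors above implement the "unsymmetrization" applied after the subspace
    ! expansion: the stored symmetric form is scaled element-wise as
    !   epsinv(G,G') <- epsinv(G,G') * sqrt(vcoul(G)) / sqrt(vcoul(G')),
    ! with the row factor in row_vcoul_aux and the inverse column factor in col_vcoul_aux;
    ! my_diag_indeces lists the locally owned (G,G) elements where the identity has to be
    ! added back.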
1527
    ! for the subspace epsinv, ONE has already been subtracted from the diagonal...
1529    ! expand the subspace inverse epsilon
1530    ! allocate temporary matrices
1531    SAFE_ALLOCATE(eps1Aux, (scal%npr, scal%npc))
1532    SAFE_ALLOCATE(C_Pgemm, (scal%npr, scal%npc))
1533    ! loop over frequencies
1534    DO ifreq = 1, Nfreq
1535      call timing%start(timing%epscopy_pgemm)
1536      CALL pzgemm('N','N', nmtx, neig_sub, neig_sub, (1.0d0,0.0d0), eigenvect, 1, 1, desca, &
1537                  eps_sub_f(:,:,ifreq), 1, 1, desc_sub, (0.0d0,0.0d0), &
1538                  C_Pgemm, 1, 1, desca)
1539      if((.not.sig%do_sigma_subspace) .or. ifreq == 1) then
        ! here we always calculate the full epsilon^{-1} for the first frequency; this
        ! appears to be needed when we have to calculate the static CH/SH
1542        CALL pzgemm('N','C', nmtx, nmtx, neig_sub, (1.0d0,0.0d0), C_Pgemm, 1, 1, desca, &
1543                    eigenvect, 1, 1, desca, (0.0d0,0.0d0), &
1544                    eps1Aux, 1, 1, desca)
1545      else
1546         CALL pzgemm('N','C', max_head, nmtx, neig_sub, (1.0d0,0.0d0), C_Pgemm, 1, 1, desca, &
1547                     eigenvect, 1, 1, desca, (0.0d0,0.0d0), &
1548                     eps1Aux, 1, 1, desca)
1549         do j_local = 1, scal%npc
1550           j_global = col_indices(j_local, peinf%inode + 1)
1551           if( j_global <= max_head) eps1Aux(:,j_local) = (0.0d0,0.0d0)
1552         end do
1553         CALL pzgemm('N','C', nmtx, max_head, neig_sub, (1.0d0,0.0d0), C_Pgemm, 1, 1, desca, &
1554                     eigenvect, 1, 1, desca, (1.0d0,0.0d0), &
1555                     eps1Aux, 1, 1, desca)
1556      end if
1557      call timing%stop(timing%epscopy_pgemm)
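      ! The two pzgemm calls above expand the subspace matrix back to the full G-space,
      !   eps1Aux = eigenvect * eps_sub_f(:,:,ifreq) * eigenvect^H ,
      ! for the first frequency (or whenever the full matrices are kept). For the
      ! remaining frequencies in the sigma-subspace case only the head block and the
      ! wings (first max_head rows and columns) are expanded, since the body stays in the
      ! subspace basis; the head columns are zeroed in between so the head block is not
      ! added twice.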
1558      ! here we have the full matrix for frequency ifreq, restore one on diagonal
1559      ! and "unsymmetrize" with vcoul
1560      ! FHJ: WARNING: never perform a nested loop over the global rows and
1561      ! columns, as these dimensions may be huge, and the code will spend
1562      ! a long time doing nothing. Instead, always loop over the local rows
1563      ! and columns and use indxl2g to get the corresponding global index.
1564      call timing%start(timing%epscopy_vcoul)
1565      if(.true.) then
1566        ! new fast scheme
1567        ! sum one on diagonal
1568        do idiag = 1, my_num_diag_elem
1569          i_local = my_diag_indeces(1, idiag)
1570          j_local = my_diag_indeces(2, idiag)
1571          eps1Aux(i_local, j_local) = eps1Aux(i_local, j_local) + ONE
1572        end do
1573        ! scale with coulomb operator
1574        do igp_l = 1, scal%npc
1575          vc = col_vcoul_aux(igp_l)
1576          do ig_l = 1, scal%npr
1577            eps1Aux(ig_l,igp_l) = eps1Aux(ig_l,igp_l) * row_vcoul_aux(ig_l) * vc
1578          end do
1579        end do
1580        !
1581      else
1582        ! keep old slow scheme for debug
1583        do igp_l = 1, scal%npc
1584          igp_g = indxl2g(igp_l, scal%nbl, scal%mypcol, 0, scal%npcol)
1585          vc = sqrt(vcoul(igp_g))
1586          do ig_l = 1, scal%npr
1587            ig_g = indxl2g(ig_l, scal%nbl, scal%myprow, 0, scal%nprow)
1588            if (ig_g==igp_g) eps1Aux(ig_l,igp_l) = eps1Aux(ig_l,igp_l) + ONE
1589            eps1Aux(ig_l,igp_l) = eps1Aux(ig_l,igp_l) * sqrt(vcoul(ig_g)) / vc
1590          enddo
1591        enddo
1592        !
1593      end if
1594      call timing%stop(timing%epscopy_vcoul)
1595
      ! in the case of subspace in sigma, we only save the static case (first
      ! frequency, omega = 0) in the fully expanded form
1598      IF((.not.sig%do_sigma_subspace) .OR. ifreq == 1) THEN
        ! here we have the epsinv matrix as it would come out of a standard epsilon calculation;
        ! redistribute it according to the new layout required in sigma (for omega=0 there
        ! is the option to keep the full epsinv)
1602        IF((.NOT.(keep_full_eps_static)) .OR. ifreq > 1) THEN
1603          ! post all messages to be received
1604          DO imsg_recv = 1, Nbuf_recv
1605            ! WRITE(2000+peinf%inode,*) peinf%inode, imsg_recv, buf_recv(imsg_recv)%proc
1606            tag = 0
1607            CALL MPI_IRECV(buf_recv(imsg_recv)%msg, buf_recv(imsg_recv)%ncol*buf_recv(imsg_recv)%nrow, &
1608                           MPI_COMPLEX_DPC, buf_recv(imsg_recv)%proc, tag, MPI_COMM_WORLD, req_recv((imsg_recv-1)*2+1), &
1609                           mpierr)
1610            tag = 0
1611            CALL MPI_IRECV(buf_recv(imsg_recv)%col_global_indx, buf_recv(imsg_recv)%ncol, &
1612                           MPI_INTEGER, buf_recv(imsg_recv)%proc, tag, MPI_COMM_WORLD, req_recv(imsg_recv*2), &
1613                           mpierr)
1614          END DO
1615          ! fill send buffer
          ! loop over the local columns
1617          local_col_count = 0
1618          do j_local = 1, scal%npc
1619            j_global = col_indices(j_local, peinf%inode + 1)
1620            IF(j_global > nmtx) CYCLE
1621            pool_send = INDXG2P( j_global, epsmpi%nb, iproc_dum, 0, peinf%npes_pool)
1622            DO iproc_dum = 0, peinf%npools - 1
1623              iproc_send = pool2pe(iproc_dum+1,pool_send+1)
1624              imsg_send = proc2msg_send(iproc_send+1)
1625              local_col_count(imsg_send) = local_col_count(imsg_send) + 1
1626              buf_send(imsg_send)%msg(1:scal%npr,local_col_count(imsg_send)) = eps1Aux(1:scal%npr,j_local)
1627              buf_send(imsg_send)%col_global_indx(local_col_count(imsg_send)) = j_global
1628            END DO
1629          end do
          ! send messages
1631          DO imsg_send = 1, Nbuf_send
1632            ! WRITE(3000+peinf%inode,*) peinf%inode, imsg_send, buf_send(imsg_send)%proc
1633            tag = 0
1634            CALL MPI_ISEND(buf_send(imsg_send)%msg, buf_send(imsg_send)%ncol*buf_send(imsg_send)%nrow, &
1635                           MPI_COMPLEX_DPC, buf_send(imsg_send)%proc, tag, MPI_COMM_WORLD, req_send((imsg_send-1)*2+1), &
1636                           mpierr)
1637            tag = 0
1638            CALL MPI_ISEND(buf_send(imsg_send)%col_global_indx, buf_send(imsg_send)%ncol, &
1639                           MPI_INTEGER, buf_send(imsg_send)%proc, tag, MPI_COMM_WORLD, req_send(imsg_send*2), &
1640                           mpierr)
1641          END DO
1642          ! collect messages
1643          DO imsg_recv = 1, Nbuf_recv
1644            CALL MPI_WAIT(req_recv((imsg_recv-1)*2+1), mpistatus, mpierr)
1645            CALL MPI_WAIT(req_recv(imsg_recv*2), mpistatus, mpierr)
1646            ! fill in epsmpi
1647            ! iproc_row_rec = pe2grid(1, buf_recv(imsg_recv)%proc+1)
1648            DO j_local = 1, buf_recv(imsg_recv)%ncol
1649              j_global = buf_recv(imsg_recv)%col_global_indx(j_local)
1650              my_col = INDXG2L(j_global, epsmpi%nb, peinf%pool_rank, 0, peinf%npes_pool)
1651              DO i_local = 1, buf_recv(imsg_recv)%nrow
1652                i_global = row_indices(i_local, buf_recv(imsg_recv)%proc+1)
1653                epsmpi%epsR(i_global,my_col,ifreq,iq+qoffset) = buf_recv(imsg_recv)%msg(i_local,j_local)
1654              END DO
1655            END DO
1656          END DO
1657
          ! wait for all sends to complete
1659          CALL MPI_WAITALL(Nbuf_send*2,req_send,my_status,mpierr)
1660        END IF
1661        call MPI_Barrier(MPI_COMM_WORLD, mpierr)
      END IF ! (.not.sig%do_sigma_subspace) .or. ifreq==1
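      ! Summary of the redistribution above: receives are posted first (MPI_IRECV), each
      ! rank then packs its local columns of eps1Aux into the per-destination send
      ! buffers together with their global column indices and posts the matching
      ! MPI_ISENDs; on completion of each receive the columns are stored into
      ! epsmpi%epsR, using INDXG2L to get the local column in the sigma layout and the
      ! sender's row distribution to recover the global row indices.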
1663
1664      IF(sig%do_sigma_subspace) THEN
        ! in the subspace case we need to save the wings of the full matrix here;
        ! for simplicity we always read from eps1Aux, even when (keep_full_eps_static .and. ifreq==1)
1667
1668        sig%epssub%eps_wings_cols(:,ifreq,iq+qoffset) = (0.0d0,0.0d0)
1669        ipe_wing = INDXG2P(max_head, scal%nbl, iproc_dum, 0, scal%npcol)
1670        if (ipe_wing == scal%mypcol) then
1671          my_pos_loc_wing = INDXG2L(max_head, scal%nbl, scal%mypcol, 0, scal%npcol)
1672          do i_local = 1,  scal%npr
1673            i_global = row_indices(i_local, peinf%inode + 1)
1674            sig%epssub%eps_wings_cols(i_global,ifreq,iq+qoffset) = eps1Aux(i_local,my_pos_loc_wing)
1675          end do
1676        end if
1677
1678        sig%epssub%eps_wings_rows(:,ifreq,iq+qoffset) = (0.0d0,0.0d0)
1679        ipe_wing = INDXG2P(max_head, scal%nbl, iproc_dum, 0, scal%nprow)
1680        if (ipe_wing == scal%myprow) then
1681          my_pos_loc_wing = INDXG2L(max_head, scal%nbl, scal%myprow, 0, scal%nprow)
1682          do j_local = 1, scal%npc
1683            j_global = col_indices(j_local, peinf%inode + 1)
1684            sig%epssub%eps_wings_rows(j_global,ifreq,iq+qoffset) = eps1Aux(my_pos_loc_wing,j_local)
1685            if (j_global == max_head) then
1686              sig%epssub%eps_wings_rows(max_head,ifreq,iq+qoffset) = &
1687              sig%epssub%eps_wings_rows(max_head,ifreq,iq+qoffset) - 1.0d0
1688            end if
1689          end do
1690        end if
1691
1692      END IF
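      ! Note: each rank only fills the wing entries it owns locally (everything else is
      ! left at zero), and 1 is subtracted at the head element of the row wing,
      ! presumably so that the head is not counted twice when rows and columns are later
      ! combined; the MPI_Allreduce after the frequency loop sums the partial
      ! contributions into the complete wings on every rank.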
1693
1694    END DO ! ifreq
1695
1696    SAFE_DEALLOCATE(row_vcoul_aux)
1697    SAFE_DEALLOCATE(col_vcoul_aux)
1698    SAFE_DEALLOCATE(my_diag_indeces)
1699
1700    ! check if we have to redistribute eigenvectors for sigma-subspace calc
1701    if(sig%do_sigma_subspace) then
1702      call timing%start(timing%epscopy_redstr)
1703      !
      ! sum up the wings computed in the previous step
1705      call MPI_Allreduce(MPI_IN_PLACE, sig%epssub%eps_wings_rows(1:Neps,1:sig%nFreq,iq+qoffset), &
1706                         sig%epssub%neps * sig%nFreq, MPI_COMPLEX_DPC, MPI_SUM, MPI_COMM_WORLD, mpierr)
1707      call MPI_Allreduce(MPI_IN_PLACE, sig%epssub%eps_wings_cols(1:Neps,1:sig%nFreq,iq+qoffset), &
1708                         sig%epssub%neps * sig%nFreq, MPI_COMPLEX_DPC, MPI_SUM, MPI_COMM_WORLD, mpierr)
1709
1710      if ( cyclic_redistr ) then
1711        ! cyclic style redistribution (not the most efficient but easier than other methods)
        ! start by allocating and initializing the buffers
        ! eigenvectors:
        ! figure out how many columns this process owns with global index not exceeding neig_sub
        ! (assuming that j_local < j_local' implies j_global < j_global')
1716        npc_lt_Neig = 0
1717        do j_local = 1, scal%npc
1718          j_global = col_indices(j_local, peinf%inode + 1)
1719          if ( j_global > neig_sub ) exit
1720          npc_lt_Neig = npc_lt_Neig + 1
1721        end do
1722        max_npc_lt_Neig = npc_lt_Neig
1723        call mpi_allreduce(MPI_IN_PLACE, max_npc_lt_Neig, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, mpierr)
1724        !
1725        SAFE_ALLOCATE(buffer_sub_eigen,      (max_npr, max_npc_lt_Neig))
1726        SAFE_ALLOCATE(rec_buffer_sub_eigen,  (max_npr, max_npc_lt_Neig))
1727        SAFE_ALLOCATE(send_buffer_sub_eigen, (max_npr, max_npc_lt_Neig))
1728!$OMP PARALLEL do private (j_local, i_local) collapse(2)
1729        do j_local = 1, max_npc_lt_Neig
1730          do i_local = 1, max_npr
1731            buffer_sub_eigen(i_local, j_local)      = (0.0d0,0.0d0)
1732            rec_buffer_sub_eigen(i_local, j_local)  = (0.0d0,0.0d0)
1733            send_buffer_sub_eigen(i_local, j_local) = (0.0d0,0.0d0)
1734          end do
1735        end do
1736!$OMP END PARALLEL DO
1737        !
1738!$OMP PARALLEL do private (j_local, i_local) collapse(2)
1739        do j_local = 1, npc_lt_Neig
1740          do i_local = 1, scal%npr
1741            buffer_sub_eigen(i_local, j_local)      = eigenvect(i_local, j_local)
1742            send_buffer_sub_eigen(i_local, j_local) = eigenvect(i_local, j_local)
1743          end do
1744        end do
1745!$OMP END PARALLEL DO
1746
1747        ! calculate my index range and save info
1748        do_copy = .true.
1749        sig%epssub%eps_sub_info(:,2,iq+qoffset) = 0
1750        indx_start = peinf%pool_rank * sig%epssub%Nbas_own_max + 1
1751        if(indx_start <= neig_sub) then
1752          indx_end = MIN((peinf%pool_rank + 1) * sig%epssub%Nbas_own_max, neig_sub)
1753          sub_size = indx_end - indx_start + 1
1754          sig%epssub%eps_sub_info(1,2,iq+qoffset) = indx_start
1755          sig%epssub%eps_sub_info(2,2,iq+qoffset) = indx_end
1756        else
1757          indx_start = 0
1758          indx_end   = 0
1759          sub_size   = 0
1760          do_copy = .false.
1761        end if
1762        sig%epssub%eps_sub_info(3,2,iq+qoffset) = nmtx
1763
        ! keep track of the indices to copy
1765        SAFE_ALLOCATE(copy_col_inx, (max_npc_lt_Neig))
1766        ! get process for communication
1767        isend_static = MOD(peinf%inode + 1 + peinf%npes, peinf%npes)
1768        irec_static  = MOD(peinf%inode - 1 + peinf%npes, peinf%npes)
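        ! Cyclic (ring) pattern: at every step each rank sends its current buffer to
        ! inode+1 and receives the buffer of inode-1 (modulo npes), so after npes steps
        ! every rank has seen the local block of every other rank exactly once;
        ! ipe_real tracks which rank's block currently sits in buffer_sub_eigen.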
1769        ! start cyclic loop (starting with myself)
1770        ipe_real = peinf%inode + 1
1771        do ipe = 1, peinf%npes
1772          !
1773          actual_send = MOD(peinf%inode + ipe + peinf%npes, peinf%npes)
1774          actual_rec  = MOD(peinf%inode - ipe + peinf%npes, peinf%npes)
1775          !
1776          ! post msgs
1777          tag_rec  = 1
1778          tag_send = 1
1779          req_rec_singl  = MPI_REQUEST_NULL
1780          req_send_singl = MPI_REQUEST_NULL
1781          CALL MPI_Irecv(rec_buffer_sub_eigen, max_npr*max_npc_lt_Neig, &
1782                         MPI_COMPLEX_DPC, irec_static, &
1783                         tag_rec, MPI_COMM_WORLD, req_rec_singl, mpierr)
1784          CALL MPI_Isend(send_buffer_sub_eigen, max_npr*max_npc_lt_Neig, &
1785                         MPI_COMPLEX_DPC, isend_static, &
1786                         tag_send, MPI_COMM_WORLD, req_send_singl, mpierr)
1787
1788          if ( do_copy ) then
            ! precompute the number of columns to copy
1790            ncol_to_copy = 0
1791            copy_col_inx = 0
1792            do j_local = 1, MIN(max_npc_lt_Neig, scal%npcd(ipe_real))
1793              j_global = col_indices(j_local, ipe_real)
1794              if ( j_global >= indx_start .and. j_global <= indx_end) then
1795                ncol_to_copy = ncol_to_copy + 1
1796                copy_col_inx( ncol_to_copy ) = j_local
1797              end if
1798              if (j_global > indx_end) exit
1799            end do
            ! copy the message into the local buffer
1801!$OMP PARALLEL do private (j, j_local, j_global, i_local, i_global)
1802            do j = 1, ncol_to_copy
1803              j_local  = copy_col_inx( j )
1804              j_global = col_indices(j_local, ipe_real) - indx_start + 1
1805              do i_local = 1, scal%nprd(ipe_real)
1806                i_global = row_indices(i_local, ipe_real)
1807                sig%epssub%eigenvec_sub(i_global, j_global, iq+qoffset) =  buffer_sub_eigen(i_local,j_local)
1808              end do
1809            end do
1810!$OMP END PARALLEL DO
1811            !
1812          end if
1813
1814          ! finalize communication
1815          CALL MPI_Wait(req_rec_singl,  mpistatus, mpierr)
1816          ! swap buffers
1817!$OMP PARALLEL do private (j_local, i_local) collapse(2)
1818          do j_local = 1, max_npc_lt_Neig
1819            do i_local = 1, max_npr
1820              buffer_sub_eigen(i_local, j_local) = rec_buffer_sub_eigen(i_local, j_local)
1821            end do
1822          end do
1823!$OMP END PARALLEL DO
          ! do the same for the sending buffer
          CALL MPI_Wait(req_send_singl, mpistatus, mpierr)
          ! copy the received block into the sending buffer for the next cycle
1827!$OMP PARALLEL do private (j_local, i_local) collapse(2)
1828          do j_local = 1, max_npc_lt_Neig
1829            do i_local = 1, max_npr
1830              send_buffer_sub_eigen(i_local, j_local) = rec_buffer_sub_eigen(i_local, j_local)
1831            end do
1832          end do
1833!$OMP END PARALLEL DO
1834
1835          ! be ready for the next cycle
1836          ipe_real = actual_rec + 1
1837
1838        end do ! ipe
        ! deallocate the buffers
1840        SAFE_DEALLOCATE(buffer_sub_eigen)
1841        SAFE_DEALLOCATE(rec_buffer_sub_eigen)
1842        SAFE_DEALLOCATE(send_buffer_sub_eigen)
1843        SAFE_DEALLOCATE(copy_col_inx)
1844
1845        ! the same for the subspace epsilon
1846        SAFE_ALLOCATE(buffer_sub_eps,      (max_npr_sub, max_npc_sub, Nfreq))
1847        SAFE_ALLOCATE(rec_buffer_sub_eps,  (max_npr_sub, max_npc_sub, Nfreq))
1848        SAFE_ALLOCATE(send_buffer_sub_eps, (max_npr_sub, max_npc_sub, Nfreq))
1849!$OMP PARALLEL do private (j_local, i_local, ifreq) collapse(2)
1850        do ifreq = 1, Nfreq
1851          do j_local = 1, max_npc_sub
1852            do i_local = 1, max_npr_sub
1853              buffer_sub_eps(i_local, j_local, ifreq)      = (0.0d0,0.0d0)
1854              rec_buffer_sub_eps(i_local, j_local, ifreq)  = (0.0d0,0.0d0)
1855              send_buffer_sub_eps(i_local, j_local, ifreq) = (0.0d0,0.0d0)
1856            end do
1857          end do
1858        end do
1859!$OMP END PARALLEL DO
1860!$OMP PARALLEL do private (j_local, i_local, ifreq) collapse(2)
1861        do ifreq = 1, Nfreq
1862          do j_local = 1, scal_sub%npc
1863            do i_local = 1, scal_sub%npr
1864              buffer_sub_eps(i_local, j_local, ifreq)      = eps_sub_f(i_local, j_local, ifreq)
1865              send_buffer_sub_eps(i_local, j_local, ifreq) = eps_sub_f(i_local, j_local, ifreq)
1866            end do
1867          end do
1868        end do
1869!$OMP END PARALLEL DO
1870
1871        ! calculate my index range and save info
1872        do_copy = .true.
1873        sig%epssub%eps_sub_info(:,1,iq+qoffset) = 0
1874        indx_start = peinf%pool_rank * sig%epssub%Nbas_own_max + 1
1875        if(indx_start <= neig_sub) then
1876          indx_end = MIN((peinf%pool_rank + 1) * sig%epssub%Nbas_own_max, neig_sub)
1877          sub_size = indx_end - indx_start + 1
1878          sig%epssub%eps_sub_info(1,1,iq+qoffset) = indx_start
1879          sig%epssub%eps_sub_info(2,1,iq+qoffset) = indx_end
1880        else
1881          indx_start = 0
1882          indx_end   = 0
1883          sub_size   = 0
1884          do_copy = .false.
1885        end if
1886        sig%epssub%eps_sub_info(3,1,iq+qoffset) = neig_sub
1887
        ! keep track of the indices to copy
1889        SAFE_ALLOCATE(copy_col_inx, (max_npc_sub))
1890        ! get process for communication
1891        isend_static = MOD(peinf%inode + 1 + peinf%npes, peinf%npes)
1892        irec_static  = MOD(peinf%inode - 1 + peinf%npes, peinf%npes)
1893        ! start cyclic loop (starting with myself)
1894        ipe_real = peinf%inode + 1
1895        do ipe = 1, peinf%npes
1896          !
1897          actual_send = MOD(peinf%inode + ipe + peinf%npes, peinf%npes)
1898          actual_rec  = MOD(peinf%inode - ipe + peinf%npes, peinf%npes)
1899          !
1900          ! post msgs
1901          tag_rec  = 1
1902          tag_send = 1
1903          req_rec_singl  = MPI_REQUEST_NULL
1904          req_send_singl = MPI_REQUEST_NULL
1905          CALL MPI_Irecv(rec_buffer_sub_eps, max_npr_sub*max_npc_sub*Nfreq, &
1906                         MPI_COMPLEX_DPC, irec_static, &
1907                         tag_rec, MPI_COMM_WORLD, req_rec_singl, mpierr)
1908          CALL MPI_Isend(send_buffer_sub_eps, max_npr_sub*max_npc_sub*Nfreq, &
1909                         MPI_COMPLEX_DPC, isend_static, &
1910                         tag_send, MPI_COMM_WORLD, req_send_singl, mpierr)
1911
1912          if ( do_copy ) then
            ! precompute the number of columns to copy
1914            ncol_to_copy = 0
1915            copy_col_inx = 0
1916            do j_local = 1, scal_sub%npcd(ipe_real)
1917              j_global = col_indices_sub(j_local, ipe_real)
1918              if ( j_global >= indx_start .and. j_global <= indx_end) then
1919                ncol_to_copy = ncol_to_copy + 1
1920                copy_col_inx( ncol_to_copy ) = j_local
1921              end if
1922              if (j_global > indx_end) exit
1923            end do
            ! copy the message into the local buffer
1925!$OMP PARALLEL do private (j, j_local, j_global, i_local, i_global, ifreq)
1926            do ifreq = 1, Nfreq
1927              do j = 1, ncol_to_copy
1928                j_local  = copy_col_inx( j )
1929                j_global = col_indices_sub(j_local, ipe_real) - indx_start + 1
1930                do i_local = 1, scal_sub%nprd(ipe_real)
1931                  i_global = row_indices_sub(i_local, ipe_real)
1932                  sig%epssub%eps_sub(i_global, j_global, ifreq, iq+qoffset) = &
1933                                    buffer_sub_eps(i_local, j_local, ifreq)
1934                end do
1935              end do
1936            end do
1937!$OMP END PARALLEL DO
1938          end if ! do_copy
1939
1940          ! finalize communication
1941          CALL MPI_Wait(req_rec_singl,  mpistatus, mpierr)
1942          ! swap buffers
1943!$OMP PARALLEL do private (j_local, i_local, ifreq) collapse(2)
1944          do ifreq = 1, Nfreq
1945            do j_local = 1, max_npc_sub
1946              do i_local = 1, max_npr_sub
1947                buffer_sub_eps(i_local, j_local, ifreq) = rec_buffer_sub_eps(i_local, j_local, ifreq)
1948              end do
1949            end do
1950          end do
1951!$OMP END PARALLEL DO
          ! do the same for the sending buffer
          CALL MPI_Wait(req_send_singl, mpistatus, mpierr)
          ! copy the received block into the sending buffer for the next cycle
1955!$OMP PARALLEL do private (j_local, i_local, ifreq) collapse(2)
1956          do ifreq = 1, Nfreq
1957            do j_local = 1, max_npc_sub
1958              do i_local = 1, max_npr_sub
1959                send_buffer_sub_eps(i_local, j_local, ifreq) = rec_buffer_sub_eps(i_local, j_local, ifreq)
1960              end do
1961            end do
1962          end do
1963!$OMP END PARALLEL DO
1964
1965          ! be ready for the next cycle
1966          ipe_real = actual_rec + 1
1967
1968        end do ! ipe
1969        !
1970        SAFE_DEALLOCATE(copy_col_inx)
1971        SAFE_DEALLOCATE(buffer_sub_eps)
1972        SAFE_DEALLOCATE(rec_buffer_sub_eps)
1973        SAFE_DEALLOCATE(send_buffer_sub_eps)
1974        ! DONE
1975      else  ! cyclic_redistr
1976        ! redistribution based on collective operations
1977        if (peinf%inode==0) write(6,'(1x,a)') 'Redistribute using collective operations!'
1978        !
1979        SAFE_ALLOCATE(buffer_sub_eps,   (neig_sub, sig%epssub%Nbas_own_max, Nfreq))
1980        !XXX SAFE_ALLOCATE(buffer_sub_eigen, (sig%epssub%ngpown_sub_max, neig_sub))
1981        SAFE_ALLOCATE(buffer_sub_eigen, (nmtx, sig%epssub%Nbas_own_max))
1982        do iproc_pool = 1, peinf%npes_pool
1983
1984          ! eigenvectors
1985          indx_start = (iproc_pool-1) * sig%epssub%Nbas_own_max + 1
1986          if(indx_start <= neig_sub) then
1987            indx_end = MIN(iproc_pool * sig%epssub%Nbas_own_max, neig_sub)
1988            sub_size = indx_end - indx_start + 1
1989
1990            buffer_sub_eigen = (0.0d0,0.0d0)
1991            icurr = 0
1992            do j_global = indx_start, indx_end
1993              icurr = icurr + 1
1994              iproc_col = INDXG2P( j_global, scal%nbl, iproc_dum, 0, scal%npcol)
1995              if(iproc_col == scal%mypcol) then
1996                 j_local = INDXG2L(j_global, scal%nbl, scal%mypcol, 0, scal%npcol)
1997                 do i_local = 1, scal%npr
1998                   i_global = row_indices(i_local, peinf%inode + 1)
1999                   if(i_global > nmtx) cycle
2000                   buffer_sub_eigen(i_global, icurr) = eigenvect(i_local, j_local)
2001                 end do
2002              end if
2003            end do
2004
2005            iproc_send = pool2pe(1,iproc_pool)
2006            if(iproc_send == peinf%inode) then
2007              call MPI_Reduce(MPI_IN_PLACE, buffer_sub_eigen, &
2008                              nmtx*sig%epssub%Nbas_own_max, MPI_COMPLEX_DPC,&
2009                              MPI_SUM, iproc_send, MPI_COMM_WORLD, mpierr)
2010              ! copy data
2011              sig%epssub%eigenvec_sub(1:nmtx, 1:sig%epssub%Nbas_own, iq+qoffset) = &
2012                     buffer_sub_eigen(1:nmtx, 1:sig%epssub%Nbas_own)
2013            else
2014              call MPI_Reduce(buffer_sub_eigen, buffer_sub_eigen, &
2015                              nmtx*sig%epssub%Nbas_own_max, MPI_COMPLEX_DPC,&
2016                              MPI_SUM, iproc_send, MPI_COMM_WORLD, mpierr)
2017            end if
2018          end if
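          ! note on the MPI_Reduce above: per the MPI standard the receive-buffer
          ! argument is significant only at the root, so on non-root ranks passing
          ! buffer_sub_eigen for both arguments is effectively a placeholder and the
          ! local data is not overwritten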
2019
2020          ! save information
2021          if(iproc_pool-1 == peinf%pool_rank) then
2022            sig%epssub%eps_sub_info(:,2,iq+qoffset) = 0
2023            if(indx_start <= neig_sub) then
2024              sig%epssub%eps_sub_info(1,2,iq+qoffset) = indx_start
2025              sig%epssub%eps_sub_info(2,2,iq+qoffset) = indx_end
2026            end if
2027            sig%epssub%eps_sub_info(3,2,iq+qoffset) = nmtx
2028          end if
2029
2030          ! subspace epsilon
2031          indx_start = (iproc_pool-1) * sig%epssub%Nbas_own_max + 1
2032          if(indx_start <= neig_sub) then
2033            indx_end = MIN(iproc_pool * sig%epssub%Nbas_own_max, neig_sub)
2034            sub_size = indx_end - indx_start + 1
2035
2036            buffer_sub_eps = (0.0d0,0.0d0)
2037            icurr = 0
2038            do j_global = indx_start, indx_end
2039              icurr = icurr + 1
2040              iproc_col = INDXG2P( j_global, scal_sub%nbl, iproc_dum, 0, scal_sub%npcol)
2041              if(iproc_col == scal_sub%mypcol) then
2042                j_local = INDXG2L(j_global, scal_sub%nbl, scal_sub%mypcol, 0, scal_sub%npcol)
2043                do i_local = 1, scal_sub%npr
2044                  i_global = row_indices_sub(i_local, peinf%inode + 1)
2045                  if(i_global > neig_sub) cycle
2046                  buffer_sub_eps(i_global, icurr, :) = eps_sub_f(i_local, j_local, :)
2047                end do
2048              end if
2049            end do
2050
2051            iproc_send = pool2pe(1,iproc_pool)
2052            if(iproc_send == peinf%inode) then
2053              call MPI_Reduce(MPI_IN_PLACE, buffer_sub_eps, &
2054                              neig_sub * sig%epssub%Nbas_own_max * Nfreq, MPI_COMPLEX_DPC,&
2055                              MPI_SUM, iproc_send, MPI_COMM_WORLD, mpierr)
2056              ! copy data
2057              sig%epssub%eps_sub(1:neig_sub, 1:sig%epssub%Nbas_own, 1:Nfreq, iq+qoffset) = &
2058              buffer_sub_eps(1:neig_sub, 1:sig%epssub%Nbas_own, 1:Nfreq)
2059            else
2060              call MPI_Reduce(buffer_sub_eps, buffer_sub_eps, &
2061                              neig_sub * sig%epssub%Nbas_own_max * Nfreq, MPI_COMPLEX_DPC,&
2062                              MPI_SUM, iproc_send, MPI_COMM_WORLD, mpierr)
2063            end if
2064          end if
2065
2066          ! save information
2067          if(iproc_pool-1 == peinf%pool_rank) then
2068            sig%epssub%eps_sub_info(:,1,iq+qoffset) = 0
2069            if(indx_start <= neig_sub) then
2070              sig%epssub%eps_sub_info(1,1,iq+qoffset) = indx_start
2071              sig%epssub%eps_sub_info(2,1,iq+qoffset) = indx_end
2072            end if
2073            sig%epssub%eps_sub_info(3,1,iq+qoffset) = neig_sub
2074          end if
2075
2076        end do ! iproc_pool
2077        SAFE_DEALLOCATE(buffer_sub_eigen)
2078        SAFE_DEALLOCATE(buffer_sub_eps)
2079
2080        ! replicate over pools
2081        if(peinf%inode == pool2pe(1, peinf%pool_rank + 1)) then
2082          do pool_send = 2, peinf%npools
2083            iproc_send = pool2pe(pool_send,  peinf%pool_rank + 1)
2084            tag = 0
2085            CALL MPI_SEND(sig%epssub%eps_sub(:,:,:,iq+qoffset), &
2086                          sig%neig_sub_max * sig%epssub%Nbas_own * sig%nFreq, &
2087                          MPI_COMPLEX_DPC, iproc_send, tag, MPI_COMM_WORLD, mpierr)
2088            tag = 0
2089            CALL MPI_SEND(sig%epssub%eigenvec_sub(:,:,iq+qoffset), &
2090                          sig%epssub%neps * sig%epssub%Nbas_own, &
2091                          MPI_COMPLEX_DPC, iproc_send, tag, MPI_COMM_WORLD, mpierr)
2092          end do
2093        else
2094          if(peinf%my_pool >= 0) then
2095            iproc_rec = pool2pe(1, peinf%pool_rank + 1)
2096            tag = 0
2097            CALL MPI_RECV(sig%epssub%eps_sub(:,:,:,iq+qoffset), &
2098                          sig%neig_sub_max * sig%epssub%Nbas_own * sig%nFreq, &
2099                          MPI_COMPLEX_DPC, iproc_rec, tag, MPI_COMM_WORLD, mpistatus, mpierr)
2100            tag = 0
2101            CALL MPI_RECV(sig%epssub%eigenvec_sub(:,:,iq+qoffset), &
2102                          sig%epssub%neps * sig%epssub%Nbas_own, &
2103                          MPI_COMPLEX_DPC, iproc_rec, tag, MPI_COMM_WORLD, mpistatus, mpierr)
2104          end if
2105        end if
2106        !
2107      end if ! cyclic_redistr
2108
2109      ! copy coulomb operator
2110      sig%epssub%vcoul_sub(1:nmtx,iq+qoffset) = vcoul(1:nmtx)
2111      !
2112      call timing%stop(timing%epscopy_redstr)
2113    end if  ! do_sigma_subspace
2114
2115    SAFE_DEALLOCATE(eps1Aux)
2116    SAFE_DEALLOCATE(C_Pgemm)
2117    SAFE_DEALLOCATE(eigenvect)
2118    SAFE_DEALLOCATE(eps_sub_f)
2119
2120    ! deallocate buffer
2121    DO imsg_recv = 1, Nbuf_recv
2122      SAFE_DEALLOCATE( buf_recv(imsg_recv)%msg )
2123      SAFE_DEALLOCATE( buf_recv(imsg_recv)%col_global_indx )
2124    END DO
2125    SAFE_DEALLOCATE(buf_recv)
2126    DO imsg_send = 1, Nbuf_send
2127      SAFE_DEALLOCATE( buf_send(imsg_send)%msg )
2128      SAFE_DEALLOCATE( buf_send(imsg_send)%col_global_indx )
2129    END DO
2130    SAFE_DEALLOCATE(buf_send)
2131    SAFE_DEALLOCATE(proc2msg_send)
2132    SAFE_DEALLOCATE(local_col_count)
2133    SAFE_DEALLOCATE(req_send)
2134    SAFE_DEALLOCATE(my_status)
2135    SAFE_DEALLOCATE(req_recv)
2136
2137    !WRITE(*,*) sig%need_advanced
2138
2139    POP_SUB(epscopy.epscopy_subspace)
2140
2141  end subroutine
2142#endif
2143
2144end subroutine epscopy
2145
2146end module epscopy_m
2147