1!==============================================================================
2!
3! Modules:
4!
5! chi_summation_m                               Last Modified: 04/19/2012 (FHJ)
6!
7!   This module creates the (distributed) polarizability matrix chi by summing
!   the (distributed) pol%gme matrices. It provides routines that communicate
!   the gme matrix using either the matrix or the element partitioning scheme.
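!
!   Schematically (a sketch only, up to normalization and sign conventions):
!     chi(G,G`) ~ sum over (v,c,k) of gme(G) * conj(gme(G`)),
!   where for static runs the energy denominators are already folded into
!   pol%gme, while the full-frequency branches apply them per frequency.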
10!
11!==============================================================================
12
13#include "f_defs.h"
14
15module chi_summation_m
16
17  use global_m
18  use blas_m
19  use mtxelmultiply_m
20  use scalapack_m
21  use lin_denominator_m
22  use io_utils_m
23  use vcoul_generator_m
24  use elpa_interface_m
25  use misc_m, only : procmem
26
27  use timing_m, only: timing => epsilon_timing
28
29  implicit none
30
31  private
32
33  public :: create_chi_summator, free_chi_summator,&
34    chi_summation_comm_matrix, chi_summation_comm_elements
35#if defined MPI && defined USESCALAPACK
36  public :: chi_summation_sub_trunc
37#endif
38
39  !> FHJ: the chi_summator "object"
40  type chi_summator_t
41    real(DP) :: fact
    !> DVF: temporary buffers needed for the chi summation.
    !! gme = G-vector matrix element.
    !! gmetempX, with X = n,r,c, are temporary arrays for static calculations:
    !! n = gme normalized by the proper factor used in BGW;
    !! r = row: the matrix elements for all transitions (nv*nc*nk) that the proc
    !! owns, for the G-vectors in the rows of the chi chunk currently being computed;
    !! c = column: the matrix elements for all transitions (nv*nc*nk) that the proc
    !! owns, for the G`-vectors in the columns of the chi chunk currently being computed.
    !! The RDyn arrays are needed for full-frequency (FF) calculations, real and complex.
    !! r2 is used in FF with matrix communication because there is a different
    !! denominator for each frequency; only the r2 array (not the r array) is used
    !! for element communication. The denominators are built into the gme`s for
    !! static calculations.
    !! The eden arrays hold the energy denominators for FF.
    !! chilocal holds the contribution of a given processor to the GG` chunk of
    !! epsilon being computed; chilocal2 holds the MPI-reduced GG` chunk.
59    SCALAR, allocatable :: chilocal(:,:)
60    SCALAR, allocatable :: chilocal2(:,:,:)
61    complex(DPC), allocatable :: chilocalRDyn(:,:,:)
62    complex(DPC), allocatable :: chilocal2RDyn(:,:,:,:)
63    complex(DPC), allocatable :: chiRDyntmp(:)
64    complex(DPC), allocatable :: chiRDynOld(:,:,:,:)
65    SCALAR, allocatable :: gmetempr(:,:),gmetempc(:,:)
66    SCALAR, allocatable :: gmetempn(:)
67    complex(DPC), allocatable :: gmeRDyntempn(:)
68    complex(DPC), allocatable :: gmeRDyntempr2(:,:,:)
69    complex(DPC), allocatable :: gmeRDyntempr3(:,:)
70    complex(DPC), allocatable :: gmeRDyntempc(:,:)
71    complex(DPC), allocatable :: gmeRDyntempcs(:,:)
72    integer, allocatable :: deltaCount(:,:)
73    integer, allocatable :: deltaCountReduce(:,:)
74  end type chi_summator_t
75
76  public :: chi_summator_t
77
78contains
79
80  subroutine create_chi_summator(this, pol, scal, fact, nspin)
81    type(chi_summator_t), intent(INOUT) :: this !<the chi_summator_t object
82    type(polarizability), intent(IN) :: pol
83    type(scalapack), intent(IN) :: scal
84    real(DP), intent(IN) :: fact
85    integer, intent(IN) :: nspin
86
87    PUSH_SUB(create_chi_summator)
88
89    this%fact = fact
90
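    ! Which buffers are needed depends on the frequency dependence of the run
    ! (the cases handled here: pol%freq_dep = 0, static; pol%freq_dep = 2 and 3,
    ! frequency-dependent, with pol%freq_dep_method selecting the full-frequency
    ! formalism). The branches below mirror those in the summation routines.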
91    if (pol%freq_dep .eq. 0) then
92      SAFE_ALLOCATE(this%gmetempn, (pol%nmtx))
93    endif
94    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0 .or. pol%freq_dep_method .eq. 2)) then
95      SAFE_ALLOCATE(this%gmeRDyntempn, (pol%nmtx))
96      SAFE_ALLOCATE(this%chiRDyntmp, (pol%nfreq_in_group))
97    endif
98    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
99      SAFE_ALLOCATE(this%gmeRDyntempn, (pol%nmtx))
100      SAFE_ALLOCATE(this%chiRDyntmp, (pol%nfreq_in_group))
101      SAFE_ALLOCATE(this%chiRDynOld, (pol%nfreq_in_group,scal%npr,scal%npc,nspin))
102    endif
103    if (pol%freq_dep .eq. 3) then
104      SAFE_ALLOCATE(this%gmeRDyntempn, (pol%nmtx))
105      SAFE_ALLOCATE(this%chiRDyntmp, (pol%nfreq_in_group))
106    endif
107
108    POP_SUB(create_chi_summator)
109    return
110
111  end subroutine create_chi_summator
112
113  subroutine free_chi_summator(this, pol)
114    type(chi_summator_t), intent(INOUT) :: this !<the chi_summator_t object
115    type(polarizability), intent(IN) :: pol
116
117    PUSH_SUB(free_chi_summator)
118
119    if (pol%freq_dep .eq. 0) then
120      SAFE_DEALLOCATE(this%gmetempn)
121    endif
122    if (pol%freq_dep .eq. 2) then
123      SAFE_DEALLOCATE(this%gmeRDyntempn)
124      SAFE_DEALLOCATE(this%chiRDyntmp)
125      if ((pol%freq_dep_method .eq. 1)) then
126        SAFE_DEALLOCATE(this%chiRDynOld)
127      endif
128    endif
129    if (pol%freq_dep .eq. 3) then
130      SAFE_DEALLOCATE(this%gmeRDyntempn)
131      SAFE_DEALLOCATE(this%chiRDyntmp)
132    endif
133
134    POP_SUB(free_chi_summator)
135    return
136
137  end subroutine free_chi_summator
138
139  !-----------------------------------------------------------------------------
140  !                              GCOMM_MATRIX
141  !-----------------------------------------------------------------------------
142
  !> Create the pol%chi matrix using the gcomm_matrix scheme.
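  !! In this scheme, for every target processor ipe each rank assembles the
  !! gme rows/columns (gmetempr/gmetempc) corresponding to the chi block owned
  !! by ipe, multiplies them with a level-3 BLAS call, and the partial products
  !! are summed onto ipe with MPI_Reduce (or circulated around a ring when
  !! pol%nonblocking_cyclic is set).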
144  subroutine chi_summation_comm_matrix(this,pol,scal,kp,kpq,vwfn,&
145    nst,nrk,indt,pht)
146    type(chi_summator_t), intent(INOUT) :: this
147    type(polarizability), intent(INOUT) :: pol
148    type(scalapack), intent(in) :: scal
149    type(kpoints), intent(IN) :: kp,kpq
150    type(valence_wfns), intent(IN) :: vwfn
151
152    integer, intent(IN) :: nst(:)
153    integer, intent(IN) :: nrk
154    integer, intent(INOUT) :: indt(:,:,:)
155    SCALAR,  intent(INOUT) :: pht(:,:,:)
156
157    integer :: ntot_members(pol%nfreq_group)
158    integer :: icurr,ntot,ntot2,itot,ntotmax,ifreq_para
159    integer :: ipe, ilimit, ii, jj, kk, ll, iv, irk, it, ispin,im,mytot
160    integer :: i_myband,tag,irank,tmp_iv,im_proc
161    complex(DPC) :: negfact
162    real(DP) :: zvalue, cv_energy
163    type(cvpair_info) :: cvpair_temp
164    integer, allocatable :: tmprowindex(:),tmpcolindex(:)
165    SCALAR, allocatable :: tmprowph(:),tmpcolph(:), sendbuf(:,:)
166    complex(DPC), allocatable :: gmeRDyntempr(:,:)
167    complex(DPC), allocatable :: edenDRtemp(:,:)
168
169    ! frequency points for the spectral functions of the polarizability
170    integer :: isfreql, isfreqr, nwarn
171    real(DP) :: sfreql, sfreqr, wr, wl
172    ! Hilbert transform coefficients
173    complex(DPC) :: htwR(pol%nfreq,pol%nsfreq), htwA(pol%nfreq,pol%nsfreq)
174    complex(DPC) :: c11,c12,c13,c14,c21,c22,c23,c24
175    complex(DPC) :: cA11,cA12,cA13,cA14,cA21,cA22,cA23,cA24
176
177    integer :: isf,nf,nsf, request
178    real(DP) :: sf1,sf2
179    real(DP) :: step1,step2,fqt,eta
180    complex(DPC) :: c1,c2,j_dpc,cA1,cA2
181    logical :: first_reduce
182
183    ! variables for non-blocking cyclic scheme
184    integer :: isend_static, irec_static
185    integer :: actual_send, actual_rec
186    integer :: nsend_row, nsend_col, nrec_row, nrec_col
187    integer :: req_send, tag_send, req_rec, tag_rec
188#ifdef MPI
189    integer :: stat(MPI_STATUS_SIZE)
190#endif
191    SCALAR, allocatable :: buf_send(:,:,:), buf_rec(:,:,:), buf_temp(:,:,:)
192
193    type(progress_info) :: prog_info
194
195    PUSH_SUB(chi_summation_comm_matrix)
196
197    !call alloc_summation_buffers(pol, this%fact)
198
199    if (pol%freq_dep.eq.0) then
200      SAFE_ALLOCATE(this%chilocal2, (scal%npr,scal%npc,kp%nspin))
201!$OMP PARALLEL DO collapse(3)
202      do ii = 1, kp%nspin
203        do jj = 1, scal%npc
204          do kk = 1, scal%npr
205            this%chilocal2(kk,jj,ii)=0D0
206          enddo
207        enddo
208      enddo
209    endif
210
211    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0 .or. pol%freq_dep_method .eq. 2)) then
212      SAFE_ALLOCATE(this%chilocal2RDyn, (scal%npr,scal%npc,pol%nfreq_in_group,kp%nspin))
213!$OMP PARALLEL DO collapse(4)
214      do ii = 1, kp%nspin
215        do jj = 1, pol%nfreq_in_group
216          do kk = 1, scal%npc
217            do ll = 1, scal%npr
218              this%chilocal2RDyn(ll,kk,jj,ii)=0D0
219            enddo
220          enddo
221        enddo
222      enddo
223    endif
224
! At this moment the spectral method only works for gcomm_elements;
! the invalid combination is caught by a call to die in inread.f90.
227    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
228      SAFE_ALLOCATE(this%chilocal2RDyn, (scal%npr,scal%npc,pol%nfreq_in_group,kp%nspin))
229!$OMP PARALLEL DO collapse(4)
230      do ii = 1, kp%nspin
231        do jj = 1, pol%nfreq_in_group
232          do kk = 1, scal%npc
233            do ll = 1, scal%npr
234              this%chilocal2RDyn(ll,kk,jj,ii)=0D0
235            enddo
236          enddo
237        enddo
238      enddo
239
240    endif
241
242    if (pol%freq_dep .eq. 3) then
243      SAFE_ALLOCATE(this%chilocal2RDyn, (scal%npr,scal%npc,pol%nfreq_in_group,kp%nspin))
244!$OMP PARALLEL DO collapse(4)
245      do ii = 1, kp%nspin
246        do jj = 1, pol%nfreq_in_group
247          do kk = 1, scal%npc
248            do ll = 1, scal%npr
249              this%chilocal2RDyn(ll,kk,jj,ii)=0D0
250            enddo
251          enddo
252        enddo
253      enddo
254    endif
255
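    ! ntot counts the (v,c) transitions owned by this rank times the number of
    ! symmetry-related k-points (sum of nst over the irreducible points), i.e.
    ! the inner dimension of the matrix products below.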
256    ntot=0
257    ntot2=0
258
259    ntot = peinf%nvownactual*peinf%ncownactual
260    do irk = 1, nrk
261      ntot2=ntot2 + nst(irk)
262    enddo
263    ntot=ntot*ntot2
264
265    !-------------------------------------------------------------------
266    ! Static Be Here
267
268
269    if (pol%freq_dep .eq. 0) then
270
271#ifdef MPI
272      if(peinf%inode .eq. 0) then
273        call timing%start(timing%chi_sum_bar)
274      endif
275      call MPI_barrier(MPI_COMM_WORLD,mpierr)
276      if(peinf%inode .eq. 0) then
277        call timing%stop(timing%chi_sum_bar)
278      endif
279#endif
280
281      first_reduce = .true.
282
283      call progress_init(prog_info, 'building polarizability matrix', 'processor', &
284        peinf%npes)
285
286      if(pol%nonblocking_cyclic) then
        ! initialize buffers for the non-blocking cyclic scheme:
        ! the send/receive partner processes are fixed (static) throughout the loop
289        isend_static = MOD(peinf%inode + 1 + peinf%npes, peinf%npes)
290        irec_static = MOD(peinf%inode - 1 + peinf%npes, peinf%npes)
291        ! allocate my size for the first send
292        nsend_row = scal%nprd(peinf%inode+1)
293        nsend_col = scal%npcd(peinf%inode+1)
294        SAFE_ALLOCATE(buf_send, (nsend_row, nsend_col, kp%nspin))
295        do ispin = 1 , kp%nspin
296          !$OMP PARALLEL DO collapse(2)
297          do ii = 1, nsend_col
298            do jj = 1, nsend_row
299              buf_send(jj,ii,ispin) = ZERO
300            enddo
301          enddo
302        end do
303      end if
304
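      ! Sketch of the non-blocking cyclic scheme (illustrative description):
      ! at step ipe each rank computes its contribution to the chi chunk owned
      ! by rank mod(inode-ipe, npes) and adds it to the partial sum received
      ! from inode-1; the running total is forwarded to inode+1 at the next
      ! step. After npes steps every rank holds the fully accumulated chunk
      ! it owns.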
305      do ipe = 1, peinf%npes
306        call progress_step(prog_info)
307
308        if(pol%nonblocking_cyclic) then
          ! determine the actual processes we send to and receive from at this step
310          actual_send = MOD(peinf%inode + ipe + peinf%npes, peinf%npes)
311          actual_rec  = MOD(peinf%inode - ipe + peinf%npes, peinf%npes)
312          nrec_row = scal%nprd(actual_rec+1)
313          nrec_col = scal%npcd(actual_rec+1)
          ! allocate receiving buffer
315          SAFE_ALLOCATE(buf_rec, (nrec_row,nrec_col,kp%nspin))
316          do ispin = 1 , kp%nspin
317            !$OMP PARALLEL DO collapse(2)
318            do ii = 1, nrec_col
319              do jj = 1, nrec_row
320                buf_rec(jj,ii,ispin) = ZERO
321              enddo
322            enddo
323          end do
324#ifdef MPI
325          ! post message
326          if(peinf%inode .eq. 0) then
327            call timing%start(timing%chi_sum_comm)
328          endif
329          tag_rec = 1
330          tag_send = 1
331          req_rec = MPI_REQUEST_NULL
332          req_send = MPI_REQUEST_NULL
333          CALL MPI_Irecv(buf_rec, nrec_row*nrec_col*kp%nspin,MPI_SCALAR,irec_static,&
334                         tag_rec, MPI_COMM_WORLD, req_rec, mpierr)
335          CALL MPI_Isend(buf_send, nsend_row*nsend_col*kp%nspin,MPI_SCALAR,isend_static,&
336                         tag_send, MPI_COMM_WORLD, req_send, mpierr)
337#endif
338          if(peinf%inode .eq. 0) then
339            call timing%stop(timing%chi_sum_comm)
340          endif
341          ! allocate working array
342          if(peinf%inode .eq. 0) then
343            call timing%start(timing%chi_sum_array_alloc)
344          endif
345          SAFE_ALLOCATE(this%chilocal, (nrec_row, nrec_col))
346!$OMP PARALLEL DO collapse(2)
347          do ii = 1, nrec_col
348            do jj = 1, nrec_row
349              this%chilocal(jj,ii)=0D0
350            enddo
351          enddo
352          SAFE_ALLOCATE(buf_temp, (nrec_row, nrec_col, kp%nspin))
353          do ispin = 1 , kp%nspin
354!$OMP PARALLEL DO collapse(2)
355            do ii = 1, nrec_col
356              do jj = 1, nrec_row
357                buf_temp(jj,ii,ispin) = ZERO
358              enddo
359            enddo
360          end do
361          SAFE_ALLOCATE(this%gmetempr, (nrec_row, ntot))
362          SAFE_ALLOCATE(this%gmetempc, (nrec_col, ntot))
363          if(peinf%inode .eq. 0) then
364            call timing%stop(timing%chi_sum_array_alloc)
365          endif
366        else
367
368          if(peinf%inode .eq. 0) then
369            call timing%start(timing%chi_sum_array_alloc)
370          endif
371          SAFE_ALLOCATE(this%chilocal, (scal%nprd(ipe),scal%npcd(ipe)))
372!$OMP PARALLEL DO collapse(2)
373          do ii = 1, scal%npcd(ipe)
374            do jj = 1, scal%nprd(ipe)
375              this%chilocal(jj,ii)=0D0
376            enddo
377          enddo
378          SAFE_ALLOCATE(this%gmetempr, (scal%nprd(ipe),ntot))
! JRD XXX Should change order here like we do in the FF case. I think I did...
380          SAFE_ALLOCATE(this%gmetempc, (scal%npcd(ipe),ntot))
381          if(peinf%inode .eq. 0) then
382            call timing%stop(timing%chi_sum_array_alloc)
383          endif
384        end if  ! pol%nonblocking_cyclic
385
386        do ispin = 1 , kp%nspin
387
388          if(pol%nonblocking_cyclic) then
389            call mtxelmultiply(scal,ntot,nrk,nst,this%fact,vwfn, &
390              this%gmetempr,this%gmetempc,this%chilocal,pol%gme,pol,indt,pht,actual_rec+1,ispin)
391!$OMP PARALLEL DO collapse(2)
392            do ii = 1, nrec_col
393              do jj = 1, nrec_row
394                buf_temp(jj,ii,ispin) = this%chilocal(jj,ii)
395              enddo
396            enddo
397          else
398
399            call mtxelmultiply(scal,ntot,nrk,nst,this%fact,vwfn, &
400              this%gmetempr,this%gmetempc,this%chilocal,pol%gme,pol,indt,pht,ipe,ispin)
401
402#ifdef NONBLOCKING
403          if(peinf%inode .eq. 0) then
404            call timing%start(timing%chi_sum_comm)
405          endif
406          if (.not. first_reduce) then
407            call MPI_WAIT(request,mpistatus,mpierr)
408            SAFE_DEALLOCATE(sendbuf)
409          endif
410          if(peinf%inode .eq. 0) then
411            call timing%stop(timing%chi_sum_comm)
412          endif
413
414          if(peinf%inode .eq. 0) then
415            call timing%start(timing%chi_sum_flt)
416          endif
417          SAFE_ALLOCATE(sendbuf, (scal%nprd(ipe),scal%npcd(ipe)))
418!$OMP PARALLEL DO collapse(2)
419          do ii = 1, scal%npcd(ipe)
420            do jj = 1, scal%nprd(ipe)
421              sendbuf(jj,ii) = this%chilocal(jj,ii)
422            enddo
423          enddo
424          if(peinf%inode .eq. 0) then
425            call timing%stop(timing%chi_sum_flt)
426          endif
427          if(peinf%inode .eq. 0) then
428            call timing%start(timing%chi_sum_ht_nb)
429          endif
430#ifdef MPI
431! JRD XXX Barrier probably implicit in IReduce
432          call MPI_IReduce(sendbuf(1,1),this%chilocal2(1,1,ispin),scal%npcd(ipe)*scal%nprd(ipe),MPI_SCALAR, &
433            MPI_SUM,ipe-1,MPI_COMM_WORLD,request,mpierr)
434#else
435          this%chilocal2(:,:,ispin)=this%chilocal(:,:)
436#endif
437          first_reduce = .false.
438          if(peinf%inode .eq. 0) then
439            call timing%stop(timing%chi_sum_ht_nb)
440          endif
441
442#else
443          if(peinf%inode .eq. 0) then
444            call timing%start(timing%chi_sum_comm)
445          endif
446#ifdef MPI
447          !call MPI_barrier(MPI_COMM_WORLD,mpierr)
448          call MPI_Reduce(this%chilocal(1,1),this%chilocal2(1,1,ispin),scal%npcd(ipe)*scal%nprd(ipe),MPI_SCALAR, &
449            MPI_SUM,ipe-1,MPI_COMM_WORLD,mpierr)
450#else
451          this%chilocal2(:,:,ispin)=this%chilocal(:,:)
452#endif
453          if(peinf%inode .eq. 0) then
454            call timing%stop(timing%chi_sum_comm)
455          endif
456#endif
          ! in the nonblocking_cyclic case the communication is finalized outside the spin loop
458          end if ! pol%nonblocking_cyclic
459
460        enddo ! ispin
461
462        SAFE_DEALLOCATE(this%chilocal)
463        SAFE_DEALLOCATE(this%gmetempr)
464        SAFE_DEALLOCATE(this%gmetempc)
465
466        if(pol%nonblocking_cyclic) then
467#ifdef MPI
468          if(peinf%inode .eq. 0) then
469            call timing%start(timing%chi_sum_comm)
470          endif
471          ! make sure the buffer is received
472          CALL MPI_Wait(req_rec,stat,mpierr)
473#endif
474          ! accumulate contribution into receiving buffer
475          ! buf_rec(:,:,:) = buf_rec(:,:,:) + buf_temp
476          do ispin = 1 , kp%nspin
477!$OMP PARALLEL DO collapse(2)
478            do ii = 1, nrec_col
479              do jj = 1, nrec_row
480                buf_rec(jj,ii,ispin) = buf_rec(jj,ii,ispin) + buf_temp(jj,ii,ispin)
481              enddo
482            enddo
483          end do
484          SAFE_DEALLOCATE(buf_temp)
485#ifdef MPI
          ! wait for the message to be sent
487          CALL MPI_Wait(req_send,stat,mpierr)
488#endif
          ! copy the message to the sending buffer for the next cycle
490          SAFE_DEALLOCATE(buf_send)
491          SAFE_ALLOCATE(buf_send, (nrec_row, nrec_col, kp%nspin))
492          buf_send = buf_rec
493          nsend_row = nrec_row
494          nsend_col = nrec_col
495          ! deallocate receiving buffer
496          SAFE_DEALLOCATE(buf_rec)
497#ifdef MPI
498          if(peinf%inode .eq. 0) then
499            call timing%stop(timing%chi_sum_comm)
500          endif
501#endif
502        end if
503
504      enddo ! ipe
505
506      if(pol%nonblocking_cyclic) then
507        ! done
508        this%chilocal2(:,:,:) = buf_send(:,:,:)
509        SAFE_DEALLOCATE(buf_send)
510      else
511#ifdef NONBLOCKING
512        if(peinf%inode .eq. 0) then
513          call timing%start(timing%chi_sum_comm)
514        endif
515        call MPI_WAIT(request,mpistatus,mpierr)
516        SAFE_DEALLOCATE(sendbuf)
517        first_reduce = .true.
518        if(peinf%inode .eq. 0) then
519          call timing%stop(timing%chi_sum_comm)
520        endif
521#endif
522      end if
523
524      call progress_free(prog_info)
525
526      do ispin =1, kp%nspin
527        pol%chi(:,:,ispin) = this%chilocal2(:,:,ispin)
528      enddo
529      SAFE_DEALLOCATE(this%chilocal2)
530
531      !XXXXXXXXXXx
532      ! call diagonalize_scalapack(pol%nmtx, scal, pol%chi(:,:,1), 1.0D-3, icurr, this%chilocal)
533      !XXXXXXXXXXX
534
535    endif ! pol%freq_dep .eq. 0
536
537    !-------------------------------------
538    ! Full Frequency Be Here
539
540    negfact = -1D0*this%fact
541
542    if ( ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0 .or. pol%freq_dep_method .eq. 2)) .or. pol%freq_dep .eq. 3 ) then
543      if(pol%nfreq_group .gt. 1 .and. peinf%inode .lt. peinf%npes) then
544#ifdef MPI
545        tag = 10 ! MPI tag
546        if(peinf%rank_mtxel .eq. 0) then
547          ntot_members(1) = ntot
548        endif
549        do irank = 1, pol%nfreq_group-1
550          if(peinf%rank_mtxel .eq. irank) then
551            call MPI_Send(ntot,1,MPI_INTEGER,0,tag,peinf%mtxel_comm,mpierr)
552          endif
553          if(peinf%rank_mtxel .eq. 0) then
554            call MPI_Recv(ntot_members(irank+1),1,MPI_INTEGER,irank,tag,peinf%mtxel_comm,mpistatus,mpierr)
555          endif
556        enddo
557        if(peinf%rank_mtxel .eq. 0) then
558          ntot = sum(ntot_members(1:pol%nfreq_group))
559        endif
560        call MPI_Bcast(ntot_members,pol%nfreq_group,MPI_INTEGER,0,peinf%mtxel_comm,mpierr)
561        call MPI_Bcast(ntot,1,MPI_INTEGER,0,peinf%mtxel_comm,mpierr)
562        call MPI_Bcast(ntotmax,1,MPI_INTEGER,0,peinf%mtxel_comm,mpierr)
563        ntotmax = peinf%nvownmax*peinf%ncownmax*nrk
564        !Communicate gme`s and energy denominators among processors in matrix element communication groups
565        if(peinf%inode .eq. 0) then
566          call timing%start(timing%chi_sum_comm)
567        endif
568        call MPI_Allgather(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,pol%gme(1,1,1,1,1,1),&
569                          ntotmax*kp%nspin*pol%nmtx,MPI_SCALAR,peinf%mtxel_comm,mpierr)
570        call MPI_Allgather(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,pol%edenDyn(1,1,1,1,1),&
571                          ntotmax*kp%nspin,MPI_REAL_DP,peinf%mtxel_comm,mpierr)
572        if(peinf%inode .eq. 0) then
573          call timing%stop(timing%chi_sum_comm)
574        endif
575#endif
576      endif
577
578      call progress_init(prog_info, 'building polarizability matrix', 'processor', &
579        peinf%npes_freqgrp)
580      do ipe = 1, peinf%npes_freqgrp
581        call progress_step(prog_info)
582        if (peinf%verb_debug .and. peinf%inode==0) then
583          write(6,'(A,i8,6x,A,i8,A)') '### ipe=',ipe,'(npes=',peinf%npes,')'
584        endif
585#ifdef MPI
586        call MPI_barrier(MPI_COMM_WORLD,mpierr)
587#endif
588        SAFE_ALLOCATE(this%gmeRDyntempr3, (scal%nprd(ipe),ntot))
589        SAFE_ALLOCATE(this%gmeRDyntempc, (scal%npcd(ipe),ntot))
590        SAFE_ALLOCATE(this%chilocalRDyn, (scal%nprd(ipe),scal%npcd(ipe),pol%nfreq_in_group))
591
592        SAFE_ALLOCATE(gmeRDyntempr, (scal%nprd(ipe),ntot))
593        SAFE_ALLOCATE(edenDRtemp, (ntot, pol%nfreq_in_group))
594
595
596!$OMP PARALLEL DO collapse(2)
597        do ii = 1, pol%nfreq_in_group
598          do jj = 1, scal%npcd(ipe)
599            do kk = 1, scal%nprd(ipe)
600              this%chilocalRDyn(kk,jj,ii)=0D0
601            enddo
602          enddo
603        enddo
604
605        do ispin = 1 , kp%nspin
606
607          itot = 0
608
609          if(peinf%inode .eq. 0) then
610            call timing%start(timing%chi_sum_prep)
611          endif
612
613          SAFE_ALLOCATE(tmprowindex,(scal%nprd(ipe)))
614          SAFE_ALLOCATE(tmpcolindex,(scal%npcd(ipe)))
615          SAFE_ALLOCATE(tmprowph,(scal%nprd(ipe)))
616          SAFE_ALLOCATE(tmpcolph,(scal%npcd(ipe)))
617
          do im=1,pol%nfreq_group                          ! im labels which member of the mtxel comm group you are
619            im_proc=peinf%rank_f+1+(im-1)*peinf%npes_freqgrp ! im_proc gives this mtxel comm group member`s global
620            do irk = 1, nrk                                  ! proc number
621              do it = 1, nst(irk)
622
623                do icurr=1,scal%nprd(ipe)
624                  tmprowindex(icurr) = indt(scal%imyrowd(icurr,ipe),it,irk)
625                  tmprowph(icurr) = pht(scal%imyrowd(icurr,ipe),it,irk)
626                enddo
627                do icurr=1,scal%npcd(ipe)
628                  tmpcolindex(icurr) = indt(scal%imycold(icurr,ipe),it,irk)
629                  tmpcolph(icurr) = pht(scal%imycold(icurr,ipe),it,irk)
630                enddo
631
632! JRD XXX - Would rather put OMP here with collapse 2 - but unbalanced... need to loop over tmp_iv = 1, nvown instead
633                do iv = 1,vwfn%nband+pol%ncrit
634
635                  tmp_iv = peinf%global_indexv(iv,im_proc)
636
637                  if (peinf%does_it_ownv(iv,im_proc)) then
638                    ilimit = peinf%global_ncown(im_proc)
639                  else
640                    ilimit = 0
641                  endif
642
643                  !$OMP PARALLEL DO private (mytot,zvalue,jj,icurr,ifreq_para,cvpair_temp,cv_energy)
644                  do i_myband = 1, ilimit
645                    mytot = itot + i_myband
646                    zvalue = pol%edenDyn(peinf%global_indexv(iv,im_proc),i_myband,ispin,irk,im)
647                    if(pol%lin_denominator<TOL_Zero) then
648                      ! this is when the lin_denominator mode is not active.
649
                      ! DVF: the factor (peinf%inode)/peinf%npes*100000 ensures that edenDRtemp and
                      ! other similar arrays are never accessed on processors that are unused when
                      ! pol%nfreq_group does not divide the original number of processors in the
                      ! calculation (peinf%npes_orig): on those processors the loop starts at a
                      ! huge number and therefore does not execute at all.
655
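                      ! Sketch of the retarded denominator assembled below (Ry units),
                      ! with zvalue the transition energy difference stored in pol%edenDyn
                      ! and w + i*eta = (dFreqGrid + dFreqBrd)/ryd:
                      !   edenDRtemp(w) = -1/2 * [ 1/(zvalue - (w+i*eta)) + 1/(zvalue + (w+i*eta)) ]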
656                      do jj=1+peinf%rank_mtxel+(peinf%inode)/peinf%npes*100000,pol%nfreq,pol%nfreq_group
657                        ifreq_para=(jj+pol%nfreq_group-1)/pol%nfreq_group
658                        if (abs(zvalue) .gt. Tol_Zero) then
659! JRD XXX - INDEX ORDER here worries me
660                          edenDRtemp(mytot,ifreq_para)= -0.5d0*( &
661                            1d0/(zvalue-(pol%dFreqBrd(jj)+pol%dFreqGrid(jj))/ryd)+ &
662                            1d0/(zvalue+(pol%dFreqBrd(jj)+pol%dFreqGrid(jj))/ryd))
663                        else
664                          edenDRtemp(mytot,ifreq_para)= 0D0
665                        endif
666                      enddo
667
668                    endif
669                    do icurr=1,scal%nprd(ipe)
670                      gmeRDyntempr(icurr,mytot)=pol%gme(tmprowindex(icurr), &
671                        i_myband,tmp_iv,ispin,irk,im) * tmprowph(icurr)
672                    enddo
673                    !do jj = 1+peinf%rank_mtxel+(peinf%inode)/peinf%npes*100000,pol%nfreq,pol%nfreq_group
674                    !  ifreq_para=(jj+pol%nfreq_group-1)/pol%nfreq_group
675                    !  this%gmeRDyntempr2(:,mytot,ifreq_para)=gmeRDyntempr(:)*edenDRtemp(ifreq_para)
676                    !enddo
677                    do icurr=1,scal%npcd(ipe)
678                      this%gmeRDyntempc(icurr,mytot) = &
679                        MYCONJG( pol%gme(tmpcolindex(icurr),i_myband,tmp_iv,ispin,irk,im) * tmpcolph(icurr) )
680                    enddo
681                  enddo ! i_myband
682                  !$OMP END PARALLEL DO
683                  itot = itot+ilimit
684                enddo ! iv
685              enddo ! it
686            enddo ! irk
687          enddo ! im
688
689          SAFE_DEALLOCATE(tmprowindex)
690          SAFE_DEALLOCATE(tmpcolindex)
691          SAFE_DEALLOCATE(tmprowph)
692          SAFE_DEALLOCATE(tmpcolph)
693
694          if(peinf%inode .eq. 0) then
695            call timing%stop(timing%chi_sum_prep)
696          endif
697
698
699          !Do the zgemm`s
700          if(ntot > 0) then
701            do jj =1+peinf%rank_mtxel+(peinf%inode)/peinf%npes*100000,pol%nfreq,pol%nfreq_group
702              ifreq_para=(jj+pol%nfreq_group-1)/pol%nfreq_group
703              if(peinf%inode .eq. 0) then
704                call timing%start(timing%chi_sum_gemm)
705              endif
706
707              !$omp parallel do private(icurr)
708              do mytot = 1, ntot
709                do icurr=1,scal%nprd(ipe)
710                  this%gmeRDyntempr3(icurr,mytot)=gmeRDyntempr(icurr,mytot)*edenDRtemp(mytot,ifreq_para)
711                enddo
712              enddo
713
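              ! The zgemm below computes, for this frequency,
              !   chilocalRDyn(:,:,ifreq_para) = -fact * gmeRDyntempr3 * transpose(gmeRDyntempc),
              ! i.e. a rank-ntot product over all transitions owned by this group.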
714              call zgemm('n','t',scal%nprd(ipe),scal%npcd(ipe),ntot, &
715                negfact,this%gmeRDyntempr3(:,:),scal%nprd(ipe),this%gmeRDyntempc(:,:),scal%npcd(ipe),&
716                (0D0,0D0),this%chilocalRDyn(:,:,ifreq_para),scal%nprd(ipe))
717
718              if(peinf%inode .eq. 0) then
719                call timing%stop(timing%chi_sum_gemm)
720              endif
721            enddo
722          endif
723
724          if(peinf%inode .eq. 0) then
725            call timing%start(timing%chi_sum_comm)
726          endif
727
728          if(pol%nfreq_group .eq. 1) then
729#ifdef MPI
730            call MPI_Reduce(this%chilocalRDyn(1,1,1),this%chilocal2RDyn(1,1,1,ispin), &
731              pol%nfreq_in_group*scal%npcd(ipe)*scal%nprd(ipe),MPI_COMPLEX_DPC,&
732              MPI_SUM,ipe-1,MPI_COMM_WORLD,mpierr)
733#endif
734          elseif(pol%nfreq_group .gt. 1 .and. peinf%inode .lt. peinf%npes) then
735#ifdef MPI
736            call MPI_Reduce(this%chilocalRDyn(1,1,1),this%chilocal2RDyn(1,1,1,ispin), &
737              pol%nfreq_in_group*scal%npcd(ipe)*scal%nprd(ipe),MPI_COMPLEX_DPC,&
738              MPI_SUM,ipe-1,peinf%freq_comm,mpierr)
739#endif
740          endif
741#ifndef MPI
742          this%chilocal2RDyn(:,:,:,ispin)=this%chilocalRDyn(:,:,:)
743#endif
744          if(peinf%inode .eq. 0) then
745            call timing%stop(timing%chi_sum_comm)
746          endif
747
748        enddo ! ispin
749        if(peinf%inode .eq. 0) then
750          call timing%start(timing%chi_sum_array_alloc)
751        endif
752        SAFE_DEALLOCATE(this%chilocalRDyn)
753        SAFE_DEALLOCATE(edenDRtemp)
754        SAFE_DEALLOCATE(gmeRDyntempr)
755        SAFE_DEALLOCATE(this%gmeRDyntempr3)
756        SAFE_DEALLOCATE(this%gmeRDyntempc)
757        if(peinf%inode .eq. 0) then
758          call timing%stop(timing%chi_sum_array_alloc)
759        endif
760      enddo ! ipe
761      call progress_free(prog_info)
762
763      do ispin =1, kp%nspin
764        do jj=1+peinf%rank_mtxel+(peinf%inode)/peinf%npes*100000,pol%nfreq,pol%nfreq_group
765          ifreq_para=(jj+pol%nfreq_group-1)/pol%nfreq_group
766! JRD XXX This assignment is now a waste of time and memory. Should just set pol%chiR(A)Dyn
767! directly above
768          pol%chiRDyn(:,:,ifreq_para,ispin) = this%chilocal2RDyn(:,:,ifreq_para,ispin)
769        enddo ! jj
770      enddo ! ispin
771      SAFE_DEALLOCATE(this%chilocal2RDyn)
772      !XXXXXXXXXXX
773      ! DO jj = 1, pol%nfreq
774      !   IF(peinf%inode .eq. 0) WRITE(2000,*) jj
775      !   call diagonalize_scalapack(pol%nmtx, scal, pol%chiRDyn(:,:,jj,1), 1.0D-3, icurr, this%chilocal, eigenval)
776      !   DEALLOCATE(eigenval)
777      !   DEALLOCATE(this%chilocal)
778      ! END DO
779      !XXXXXXXXXXX
780    endif ! (pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0)
781
782    !-------------------------------------
783    ! Full Frequency Be Here.
784    ! M. Shishkin and G. Kresse, Implementation and performance of the
785    ! frequency-dependent GW method within the PAW framework,
    ! Phys. Rev. B 74, 035101 (2006).
787
788    negfact = -1D0*this%fact
789
790    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
791      if(pol%nfreq_group .gt. 1 .and. peinf%inode .lt. peinf%npes) then
792#ifdef MPI
793        tag = 10 ! MPI tag
794        if(peinf%rank_mtxel .eq. 0) then
795          ntot_members(1) = ntot
796        endif
797        do irank = 1, pol%nfreq_group-1
798          if(peinf%rank_mtxel .eq. irank) then
799            call MPI_Send(ntot,1,MPI_INTEGER,0,tag,peinf%mtxel_comm,mpierr)
800          endif
801          if(peinf%rank_mtxel .eq. 0) then
802            call MPI_Recv(ntot_members(irank+1),1,MPI_INTEGER,irank,tag,peinf%mtxel_comm,mpistatus,mpierr)
803          endif
804        enddo
805        if(peinf%rank_mtxel .eq. 0) then
806          ntot = sum(ntot_members(1:pol%nfreq_group))
807        endif
808        call MPI_Bcast(ntot_members,pol%nfreq_group,MPI_INTEGER,0,peinf%mtxel_comm,mpierr)
809        call MPI_Bcast(ntot,1,MPI_INTEGER,0,peinf%mtxel_comm,mpierr)
810        call MPI_Bcast(ntotmax,1,MPI_INTEGER,0,peinf%mtxel_comm,mpierr)
811        ntotmax = peinf%nvownmax*peinf%ncownmax*nrk
812        !Communicate gme`s among processors in matrix element communication groups
813        if(peinf%inode .eq. 0) then
814          call timing%start(timing%chi_sum_comm)
815        endif
        call MPI_Allgather(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,pol%gme(1,1,1,1,1,1),&
                          ntotmax*kp%nspin*pol%nmtx,MPI_SCALAR,peinf%mtxel_comm,mpierr)
818        if(peinf%inode .eq. 0) then
819          call timing%stop(timing%chi_sum_comm)
820        endif
821#endif
822      endif
823
824      ! -------------------------------------------
825      ! compute the Hilbert transform coefficients
826      ! -------------------------------------------
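      ! htwR(jj,isf) is the quadrature weight with which spectral-function node
      ! isf contributes to the polarizability at frequency jj: the atan/log
      ! expressions below result from integrating the resonant and antiresonant
      ! kernels 1/(w - s + i*eta) and 1/(w + s + i*eta) analytically over each
      ! dSFreqGrid interval, assuming a piecewise-linear spectral function
      ! (a sketch of the intent; see the Shishkin-Kresse reference cited above).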
827      j_dpc=(0.0,1.0)
828      htwR(:,:)=(0.0,0.0)
829
830      nf=pol%nfreq
831      nsf=pol%nsfreq
832      do jj=1,nf
833        eta=pol%dBrdning/ryd
834        fqt=pol%dFreqGrid(jj)/ryd
835        do isf=1,nsf
836          if (isf==1) then
837            c1=(0.0,0.0)
838            step1=1.0
839          else
840            sf1=pol%dSFreqGrid(isf-1)/ryd
841            sf2=pol%dSFreqGrid(isf)/ryd
842            step1=sf2-sf1
843
844            c11=((sf1-fqt)*(-1.0*j_dpc)-eta)*(atan((fqt-sf2)/(eta))&
845              &          -atan((fqt-sf1)/(eta)))
846            c12=0.50*(sf1-fqt+(-1.0*j_dpc)*eta)*log(((fqt-sf2)**2.0+eta*eta)&
847              &          /(((fqt-sf1)**2.0+eta*eta)))
848
849            c13=-((sf1+fqt)*j_dpc-eta)*(atan((fqt+sf2)/(eta))&
850              &          -atan((fqt+sf1)/(eta)))
851            c14=0.50*(sf1+fqt+j_dpc*eta)*log(((fqt+sf2)**2.0+eta*eta)&
852              &          /(((fqt+sf1)**2.0+eta*eta)))
853
854            c1=c11+c12+c13+c14
855            c1=c1/step1
856          endif
857
858          if (isf==nsf) then
859            c2=(0.0,0.0)
860            step2=1.0
861          else
862            sf1=pol%dSFreqGrid(isf)/ryd
863            sf2=pol%dSFreqGrid(isf+1)/ryd
864
865            step2=sf2-sf1
866
867            c21=((sf2-fqt)*(-1.0*j_dpc)-eta)*(atan((fqt-sf1)/(eta))&
868              &          -atan((fqt-sf2)/(eta)))
869            c22=0.50*(sf2-fqt+(-1.0*j_dpc)*eta)*log(((fqt-sf1)**2.0+eta*eta)&
870              &          /(((fqt-sf2)**2.0+eta*eta)))
871
872            c23=-((sf2+fqt)*j_dpc-eta)*(atan((fqt+sf1)/(eta))&
873              &          -atan((fqt+sf2)/(eta)))
874            c24=0.50*(sf2+fqt+j_dpc*eta)*log(((fqt+sf1)**2.0+eta*eta)&
875              &          /(((fqt+sf2)**2.0+eta*eta)))
876
877            c2=c21+c22+c23+c24
878
879            c2=c2/step2
880          endif
881
882          if (isf==1.or.isf==nsf) then
883            htwR(jj,isf)=0.5d0*(c1/step1+c2/step2)
884          else
885            htwR(jj,isf)=1.0d0*(c1+c2)/(step1+step2)
886          endif
887
888        enddo
889      enddo
890
891      ! ----------------------------------------------------
892      ! compute the spectral functions of the polarizability
893      ! ----------------------------------------------------
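      ! Each transition of energy zvalue is spread linearly over the two
      ! spectral-frequency nodes that bracket it: with sfreql < zvalue < sfreqr,
      ! the weights below are wl = (sfreqr-zvalue)/(sfreqr-sfreql) and
      ! wr = (zvalue-sfreql)/(sfreqr-sfreql), so that wl + wr = 1.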
894
895      call progress_init(prog_info, 'building polarizability matrix', 'processor', &
896        peinf%npes_freqgrp)
897
898      nwarn=0
899      do ipe = 1, peinf%npes_freqgrp
900        call progress_step(prog_info)
901        if (peinf%verb_debug .and. peinf%inode==0) then
902          write(6,'(A,i8,6x,A,i8,A)') '### ipe=',ipe,'(npes=',peinf%npes,')'
903        endif
904#ifdef MPI
905        call MPI_barrier(MPI_COMM_WORLD,mpierr)
906#endif
907
908        SAFE_ALLOCATE(gmeRDyntempr, (scal%nprd(ipe),1))
909        SAFE_ALLOCATE(this%gmeRDyntempr2, (scal%nprd(ipe),ntot,pol%os_nsfreq_para))
910        SAFE_ALLOCATE(this%gmeRDyntempc, (ntot,scal%npcd(ipe)))
911        SAFE_ALLOCATE(this%chilocalRDyn, (scal%nprd(ipe),scal%npcd(ipe),pol%os_nsfreq_para))
912
913! JRD XXX should thread if keep COMM elements
914        this%chilocalRDyn=0
915
916! JRD XXX if we keep COMM elements we should thread this
917        gmeRDyntempr=(0.0,0.0)
918        this%gmeRDyntempr2=(0.0,0.0)
919        this%gmeRDyntempc=(0.0,0.0)
920
921        do ispin = 1 , kp%nspin
922
923          itot = 0
924
925          if(peinf%inode .eq. 0) then
926            call timing%start(timing%chi_sum_prep)
927          endif
928
929          SAFE_ALLOCATE(tmprowindex,(scal%nprd(ipe)))
930          SAFE_ALLOCATE(tmpcolindex,(scal%npcd(ipe)))
931          SAFE_ALLOCATE(tmprowph,(scal%nprd(ipe)))
932          SAFE_ALLOCATE(tmpcolph,(scal%npcd(ipe)))
933
          do im=1,pol%nfreq_group                          ! im labels which member of the mtxel comm group you are
935            im_proc=peinf%rank_f+1+(im-1)*peinf%npes_freqgrp ! im_proc gives this mtxel comm group member`s global
936            do irk = 1, nrk                                  ! proc number
937              do it = 1, nst(irk)
938
939                do icurr=1,scal%nprd(ipe)
940                  tmprowindex(icurr) = indt(scal%imyrowd(icurr,ipe),it,irk)
941                  tmprowph(icurr) = pht(scal%imyrowd(icurr,ipe),it,irk)
942                enddo
943                do icurr=1,scal%npcd(ipe)
944                  tmpcolindex(icurr) = indt(scal%imycold(icurr,ipe),it,irk)
945                  tmpcolph(icurr) = pht(scal%imycold(icurr,ipe),it,irk)
946                enddo
947
948                do iv = 1,vwfn%nband+pol%ncrit
949
950                  tmp_iv = peinf%global_indexv(iv,im_proc)
951
952                  if (peinf%does_it_ownv(iv,im_proc)) then
953                    ilimit = peinf%global_ncown(im_proc)
954                  else
955                    ilimit = 0
956                  endif
957
958                  !$OMP PARALLEL private (mytot,zvalue,gmeRDyntempr,icurr, &
959                  !$OMP                   isfreql,isfreqr,sfreql,sfreqr, &
960                  !$OMP                   wl,wr,i_myband,jj)
961                  !$OMP DO
962                  do i_myband = 1, ilimit
963
964                    zvalue = -pol%edenDyn(peinf%global_indexv(iv,im_proc),i_myband,ispin,irk,im)
965                    if (abs(zvalue) .gt. Tol_Zero) then
966
967                      mytot = itot + i_myband
968                      isfreql=-1
969
970                      do jj=pol%nsfreq,1,-1
971                        if ((pol%dSFreqGrid(jj)/ryd)<zvalue) then
972                          isfreql=jj
973                          EXIT
974                        endif
975                      enddo
976
977                      if (isfreql.eq.pol%nsfreq) then
978                        nwarn=nwarn+1
979                        if (nwarn==1.and.peinf%inode.eq.0) then
                          write(0,*) 'WARNING: for accuracy, sfrequency_high_cutoff should be '
                          write(0,*) 'larger than the energy difference between the highest '
                          write(0,*) 'unoccupied state and the lowest occupied state.'
983                        endif
984                        cycle
985                      endif
986
987                      sfreql=pol%dSFreqGrid(isfreql)/ryd
988                      isfreqr=isfreql+1
989                      sfreqr=pol%dSFreqGrid(isfreqr)/ryd
990
991                      wr=  (zvalue-sfreql)/(sfreqr-sfreql)
992                      wl= -(zvalue-sfreqr)/(sfreqr-sfreql)
993
994                      do icurr=1,scal%nprd(ipe)
995                        gmeRDyntempr(icurr,1)=pol%gme(tmprowindex(icurr), &
996                          i_myband,tmp_iv,ispin,irk,im) * tmprowph(icurr)
997                      enddo
998
999                      this%gmeRDyntempr2(:,mytot,isfreql)=gmeRDyntempr(:,1)*wl
1000                      this%gmeRDyntempr2(:,mytot,isfreqr)=gmeRDyntempr(:,1)*wr
1001
1002                      do icurr=1,scal%npcd(ipe)
1003                        this%gmeRDyntempc(mytot,icurr) = &
1004                          MYCONJG( pol%gme(tmpcolindex(icurr),i_myband,tmp_iv,ispin,irk,im) * tmpcolph(icurr) )
1005                      enddo
1006                    endif
1007                  enddo ! i_myband
1008                  !$OMP END DO
1009                  !$OMP END PARALLEL
1010                  itot = itot+ilimit
1011
1012                enddo ! iv
1013              enddo ! it
1014            enddo ! irk
1015          enddo ! im
1016
1017          SAFE_DEALLOCATE(tmprowindex)
1018          SAFE_DEALLOCATE(tmpcolindex)
1019          SAFE_DEALLOCATE(tmprowph)
1020          SAFE_DEALLOCATE(tmpcolph)
1021
1022          if(peinf%inode .eq. 0) then
1023            call timing%stop(timing%chi_sum_prep)
1024          endif
1025
1026          !Do the zgemm`s
1027          if(ntot > 0) then
1028            do jj =1+peinf%rank_mtxel, pol%nsfreq,pol%nfreq_group
1029
1030              if(peinf%inode .eq. 0) then
1031                call timing%start(timing%chi_sum_gemm)
1032              endif
1033
1034              call zgemm('n','n',scal%nprd(ipe),scal%npcd(ipe),ntot, &
1035                negfact,this%gmeRDyntempr2(:,:,jj),scal%nprd(ipe),this%gmeRDyntempc(:,:),ntot,&
1036                (0D0,0D0),this%chilocalRDyn(:,:,jj),scal%nprd(ipe))
1037
1038              if(peinf%inode .eq. 0) then
1039                call timing%stop(timing%chi_sum_gemm)
1040              endif
1041
1042            enddo
1043          endif
1044
1045          if(peinf%inode .eq. 0) then
1046            call timing%start(timing%chi_sum_comm)
1047          endif
1048
1049          if(pol%nfreq_group .eq. 1) then
1050#ifdef MPI
1051            call MPI_Reduce(this%chilocalRDyn(1,1,1),this%chilocal2RDyn(1,1,1,ispin), &
1052              pol%os_nsfreq_para*scal%npcd(ipe)*scal%nprd(ipe),MPI_COMPLEX_DPC,&
1053              MPI_SUM,ipe-1,MPI_COMM_WORLD,mpierr)
1054#endif
1055          elseif(pol%nfreq_group .gt. 1 .and. peinf%inode .lt. peinf%npes) then
1056#ifdef MPI
1057            call MPI_Reduce(this%chilocalRDyn(1,1,1),this%chilocal2RDyn(1,1,1,ispin), &
1058              pol%os_nsfreq_para*scal%npcd(ipe)*scal%nprd(ipe),MPI_COMPLEX_DPC,&
1059              MPI_SUM,ipe-1,peinf%freq_comm,mpierr)
1060#endif
1061          endif
1062#ifndef MPI
1063          this%chilocal2RDyn(:,:,:,ispin)=this%chilocalRDyn(:,:,:)
1064#endif
1065          if(peinf%inode .eq. 0) then
1066            call timing%stop(timing%chi_sum_comm)
1067          endif
1068
1069
1070        enddo ! ispin
1071
1072        if(peinf%inode .eq. 0) then
1073          call timing%start(timing%chi_sum_array_alloc)
1074        endif
1075
1076        SAFE_DEALLOCATE(this%chilocalRDyn)
1077        SAFE_DEALLOCATE(gmeRDyntempr)
1078        SAFE_DEALLOCATE(this%gmeRDyntempr2)
1079        SAFE_DEALLOCATE(this%gmeRDyntempc)
1080
1081        if(peinf%inode .eq. 0) then
1082          call timing%stop(timing%chi_sum_array_alloc)
1083        endif
1084
1085      enddo ! ipe
1086      call progress_free(prog_info)
1087
1088      do ispin =1, kp%nspin
1089        do jj=1+peinf%rank_mtxel,pol%nsfreq,pol%nfreq_group
1090          pol%chiTDyn(jj,:,:,ispin) = this%chilocal2RDyn(:,:,jj,ispin)
1091        enddo ! jj
1092      enddo ! ispin
1093      SAFE_DEALLOCATE(this%chilocal2RDyn)
1094
1095      ! -----------------------------
1096      ! Hilbert transform
1097      ! ------------------------------
1098
1099
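      ! The zgemm below applies the Hilbert-transform weights to the spectral
      ! matrix: chiRDynOld(w,:,:,:) = - sum over s of htwR(w,s) * chiTDyn(s,:,:,:),
      ! treating the (G,G`,spin) indices as one long column dimension.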
! JRD XXX - pol%chi... arrays need to be reordered here.
! I think we are specifying LDC wrong here - shouldn`t it be pol%nfreq_in_group rather than pol%nfreq?
1102      call zgemm('n','n',pol%nfreq,scal%npr*scal%npc*kp%nspin,pol%os_nsfreq_para, &
1103        (-1D0,0D0),htwR(:,:),pol%nfreq,pol%chiTDyn(:,:,:,:),pol%os_nsfreq_para, &
1104        (0D0,0D0),this%chiRDynOld(:,:,:,:),pol%nfreq)
1105
1106      do ispin =1, kp%nspin
1107        do jj=1,pol%nfreq_in_group
1108          pol%chiRDyn(:,:,jj,ispin) = this%chiRDynOld(jj,:,:,ispin)
1109        enddo ! jj
1110      enddo ! ispin
1111
1112
1113    endif ! pol%freq_dep.eq.2.and.pol%freq_dep_method.eq.1
1114
1115    !call free_summation_buffers(pol)
1116
1117    POP_SUB(chi_summation_comm_matrix)
1118    return
1119
1120  end subroutine chi_summation_comm_matrix
1121
1122  !-----------------------------------------------------------------------------
1123  !                              GCOMM_ELEMENTS
1124  !-----------------------------------------------------------------------------
1125
  !> Create the pol%chi matrix using the gcomm_elements scheme.
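  !! In this scheme each (v,c) pair of gme`s is broadcast from the rank that
  !! owns it to all ranks, which scatter it into the rows/columns of their own
  !! ScaLAPACK block before the level-3 BLAS accumulation.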
1127  subroutine chi_summation_comm_elements(this,pol,scal,kp,vwfn,cwfn,&
1128    nst,nrk,indt,pht)
1129    type(chi_summator_t), intent(INOUT) :: this
1130    type(polarizability), intent(INOUT) :: pol
1131    type(scalapack), intent(in) :: scal
1132    type(kpoints), intent(IN) :: kp
1133    type(valence_wfns), intent(IN) :: vwfn
1134    type(conduction_wfns), intent(IN) :: cwfn
1135
1136    integer, intent(IN) :: nst(:)
1137    integer, intent(IN) :: nrk
1138    integer, intent(INOUT) :: indt(:,:,:)
1139    SCALAR,  intent(INOUT) :: pht(:,:,:)
1140
1141    integer :: icurr,ntot,itot
1142    integer :: iv, ic, irk, it, ispin
1143
1144    SCALAR :: temp_gme
1145    integer, allocatable :: iowna(:)
1146    integer :: isend
1147
1148    complex(DPC), allocatable :: edenDRtemp(:)
1149
1150    real(DP) :: zvalue
1151    ! frequency points for the spectral functions of the polarizability
1152    integer :: isfreql, isfreqr
1153    real(DP) :: sfreql, sfreqr, wr, wl
    ! Hilbert transform coefficients
1155    complex(DPC) :: htwR(pol%nfreq,pol%nsfreq), htwA(pol%nfreq,pol%nsfreq)
1156    complex(DPC) :: c11,c12,c13,c14,c21,c22,c23,c24
1157    complex(DPC) :: cA11,cA12,cA13,cA14,cA21,cA22,cA23,cA24
1158
1159    integer :: isf,nf,nsf,ifreq_para
1160    real(DP) :: sf1,sf2
1161    real(DP) :: step1,step2,fqt,eta
1162    complex(DPC) :: c1,c2,j_dpc,cA1,cA2
1163
1164    integer :: ii, jj
1165    type(progress_info) :: prog_info
1166
1167    integer :: nsftot, il, ir, n1, n2, n3, max_nv, nwarn
1168    integer, allocatable :: count_v(:), ind_v(:,:), ind_sf(:)
1169
1170    PUSH_SUB(chi_summation_comm_elements)
1171
1172    !call alloc_summation_buffers(pol, this%fact)
1173
1174    if(peinf%inode .eq. 0) then
1175      call timing%start(timing%chi_sum_array_alloc)
1176    endif
1177
1178    SAFE_ALLOCATE(iowna, (vwfn%nband+pol%ncrit))
1179
1180    if (pol%freq_dep .eq. 0) then
1181      SAFE_ALLOCATE(this%chilocal, (scal%npr,scal%npc))
1182    endif
1183
1184    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0 .or. pol%freq_dep_method .eq. 2)) then
1185      SAFE_ALLOCATE(this%chilocalRDyn, (scal%npr,scal%npc,pol%nfreq_in_group))
1186    endif
1187
1188    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
1189      SAFE_ALLOCATE(this%chilocalRDyn, (scal%npr,scal%npc,pol%os_nsfreq_para))
1190
1191      SAFE_ALLOCATE(count_v, (pol%os_nsfreq_para))
1192      SAFE_ALLOCATE(ind_sf, (pol%os_nsfreq_para))
1193    endif
1194
1195    if(peinf%inode .eq. 0) then
1196      call timing%stop(timing%chi_sum_array_alloc)
1197    endif
1198
1199    call logit("Starting chi Sum")
1200    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
1201      ! -------------------------------------------
1202      ! compute the Hilbert transform coefficients
1203      ! -------------------------------------------
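      ! (Same piecewise-linear quadrature weights as computed in
      !  chi_summation_comm_matrix above.)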
1204
1205      if(peinf%inode .eq. 0) then
1206        call timing%start(timing%chi_sum_ht_nb)
1207      endif
1208
1209      j_dpc=(0.0,1.0)
1210      htwR(:,:)=(0.0,0.0)
1211
1212      nf=pol%nfreq
1213      nsf=pol%nsfreq
1214      do jj=1,nf
1215        eta=pol%dBrdning/ryd
1216        fqt=pol%dFreqGrid(jj)/ryd
1217        do isf=1,nsf
1218          if (isf==1) then
1219            c1=(0.0,0.0)
1220            step1=1.0
1221          else
1222            sf1=pol%dSFreqGrid(isf-1)/ryd
1223            sf2=pol%dSFreqGrid(isf)/ryd
1224            step1=sf2-sf1
1225
1226            c11=((sf1-fqt)*(-1.0*j_dpc)-eta)*(atan((fqt-sf2)/(eta))&
1227              &          -atan((fqt-sf1)/(eta)))
1228            c12=0.50*(sf1-fqt+(-1.0*j_dpc)*eta)*log(((fqt-sf2)**2.0+eta*eta)&
1229              &          /(((fqt-sf1)**2.0+eta*eta)))
1230
1231            c13=-((sf1+fqt)*j_dpc-eta)*(atan((fqt+sf2)/(eta))&
1232              &          -atan((fqt+sf1)/(eta)))
1233            c14=0.50*(sf1+fqt+j_dpc*eta)*log(((fqt+sf2)**2.0+eta*eta)&
1234              &          /(((fqt+sf1)**2.0+eta*eta)))
1235
1236            c1=c11+c12+c13+c14
1237            c1=c1/step1
1238          endif
1239
1240          if (isf==nsf) then
1241            c2=(0.0,0.0)
1242            step2=1.0
1243          else
1244            sf1=pol%dSFreqGrid(isf)/ryd
1245            sf2=pol%dSFreqGrid(isf+1)/ryd
1246
1247            step2=sf2-sf1
1248
1249            c21=((sf2-fqt)*(-1.0*j_dpc)-eta)*(atan((fqt-sf1)/(eta))&
1250              &          -atan((fqt-sf2)/(eta)))
1251            c22=0.50*(sf2-fqt+(-1.0*j_dpc)*eta)*log(((fqt-sf1)**2.0+eta*eta)&
1252              &          /(((fqt-sf2)**2.0+eta*eta)))
1253
1254            c23=-((sf2+fqt)*j_dpc-eta)*(atan((fqt+sf1)/(eta))&
1255              &          -atan((fqt+sf2)/(eta)))
1256            c24=0.50*(sf2+fqt+j_dpc*eta)*log(((fqt+sf1)**2.0+eta*eta)&
1257              &          /(((fqt+sf2)**2.0+eta*eta)))
1258
1259            c2=c21+c22+c23+c24
1260
1261            c2=c2/step2
1262          endif
1263
1264          if (isf==1.or.isf==nsf) then
1265            htwR(jj,isf)=0.5d0*(c1/step1+c2/step2)
1266          else
1267            htwR(jj,isf)=1.0d0*(c1+c2)/(step1+step2)
1268          endif
1269
1270        enddo
1271      enddo
1272
1273      if(peinf%inode .eq. 0) then
1274        call timing%stop(timing%chi_sum_ht_nb)
1275      endif
1276
1277    endif!(pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)
1278
1279    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
1280      pol%chiTDyn=(0.0,0.0)
1281    endif
1282
1283    call progress_init(prog_info, 'building polarizability matrix', 'blocks', &
1284      nrk*kp%nspin*(cwfn%nband-vwfn%nband))
1285    nwarn=0
1286    do irk=1,nrk
1287      if (peinf%verb_debug .and. peinf%inode==0) then
1288        write(6,'(A,i8,6x,A,i8,A)') '### irk=',irk,'(nrk=',nrk,')'
1289      endif
1290#ifdef MPI
1291      call MPI_barrier(MPI_COMM_WORLD,mpierr)
1292#endif
1293      do ispin=1,kp%nspin
1294        if(peinf%inode .eq. 0) then
1295          call timing%start(timing%chi_sum_array_alloc)
1296        endif
1297        iowna(:)=1
1298        ntot=(vwfn%nband+pol%ncrit)*nst(irk)
1299        if (pol%freq_dep .eq. 0) then
1300!JRD XXX Should thread
1301          this%chilocal=0
1302          SAFE_ALLOCATE(this%gmetempr, (scal%npr,ntot))
1303          SAFE_ALLOCATE(this%gmetempc, (ntot,scal%npc))
1304        endif
1305        if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0 .or. pol%freq_dep_method .eq. 2)) then
1306!JRD XXX Should thread
1307          this%chilocalRDyn=0
1308          SAFE_ALLOCATE(this%gmeRDyntempr2, (scal%npr,ntot,pol%nfreq_in_group))
1309          SAFE_ALLOCATE(this%gmeRDyntempc, (ntot,scal%npc))
1310        endif
1311
1312        if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
1313!JRD XXX Should thread
1314          this%chilocalRDyn=(0.0,0.0)
1315
1316          max_nv=0
1317          do ic=1,cwfn%nband-vwfn%nband
1318            count_v=0
1319            do iv=1,(vwfn%nband+pol%ncrit)
1320
1321              isend=peinf%global_pairowner(iv,ic)-1
1322              if (isend .lt. 0) then
1323                write(0,*) 'Illegal value for mpi proc, isend: ',iv,ic
1324                call die("internal error in chi_summation")
1325              endif
1326              if (isend .eq. peinf%inode) then
1327                zvalue=-pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irk,1)
1328                iowna(iv)=iowna(iv)+1
1329              endif
1330
1331#ifdef MPI
1332              call MPI_Bcast(zvalue,1,MPI_REAL_DP,isend,MPI_COMM_WORLD,mpierr)
1333              call MPI_Bcast(pol%dSFreqGrid,pol%os_nsfreq_para,MPI_REAL_DP,isend,MPI_COMM_WORLD,mpierr)
1334#endif
1335
1336              if (scal%npr*scal%npc .ne. 0) then
1337                do it=1, nst(irk)
1338
1339                  if (abs(zvalue) .gt. Tol_Zero) then
1340                    isfreql=-1
1341                    do jj=pol%nsfreq,1,-1
1342                      if ((pol%dSFreqGrid(jj)/ryd)<zvalue) then
1343                        isfreql=jj
1344                        EXIT
1345                      endif
1346                    enddo
1347
1348                    if (isfreql.eq.pol%nsfreq) then
1349                      nwarn=nwarn+1
1350                      if (nwarn==1.and.peinf%inode.eq.0) then
                        write(0,*) 'WARNING: for accuracy, sfrequency_high_cutoff should be '
                        write(0,*) 'larger than the energy difference between the highest '
                        write(0,*) 'unoccupied state and the lowest occupied state.'
1354                      endif
1355                      cycle
1356                    endif
1357
1358                    isfreqr=isfreql+1
1359
1360                    count_v(isfreql)=count_v(isfreql)+1
1361                    count_v(isfreqr)=count_v(isfreqr)+1
1362                  endif
1363                enddo !it
1364              endif
1365            enddo !iv
1366
1367            if (max_nv<maxval(count_v(:))) then
1368              max_nv=maxval(count_v(:))
1369            endif
1370          enddo !ic
1371
1372          SAFE_ALLOCATE(this%gmeRDyntempr2, (scal%npr,max_nv,pol%os_nsfreq_para))
1373          SAFE_ALLOCATE(this%gmeRDyntempc, (ntot,scal%npc))
1374          SAFE_ALLOCATE(this%gmeRDyntempcs, (max_nv,scal%npc))
1375
1376          this%gmeRDyntempr2=(0.0,0.0)
1377          this%gmeRDyntempc=(0.0,0.0)
1378          this%gmeRDyntempcs=(0.0,0.0)
1379
1380          SAFE_ALLOCATE(ind_v, (pol%os_nsfreq_para, max_nv))
1381
1382          this%gmeRDyntempr2=(0.0,0.0)
1383          iowna(:)=1
1384        endif!(pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)
1385
1386        if(peinf%inode .eq. 0) then
1387          call timing%stop(timing%chi_sum_array_alloc)
1388        endif
1389
1390        do ic=1,cwfn%nband-vwfn%nband
1391          call progress_step(prog_info)
1392
1393          ! We do two giant loops here for freq_dep cases
1394          if (pol%freq_dep .eq. 0) then
1395            itot=0
            if(peinf%inode .eq. 0) then
              call timing%start(timing%chi_sum_comm)
            endif
            do iv=1,(vwfn%nband+pol%ncrit)
1400              isend=peinf%global_pairowner(iv,ic)-1
1401              if (isend .lt. 0) then
1402                write(0,*) 'Illegal value for mpi proc, isend:',&
1403                  peinf%inode,iv,ic,peinf%global_pairowner(iv,ic)
1404                call die("internal error in chi_summation")
1405              endif
1406              if (isend .eq. peinf%inode) then
1407                if (iowna(iv) .gt. peinf%ncownactual) call die('iowna(iv) bigger than ncownactual')
1408                this%gmetempn(:) = pol%gme(:,iowna(iv),peinf%indexv(iv), &
1409                  ispin,irk,1) * sqrt(this%fact)
1410                iowna(iv)=iowna(iv)+1
1411              endif
1412#ifdef MPI
1413              call MPI_Bcast(this%gmetempn,pol%nmtx,MPI_SCALAR,isend,MPI_COMM_WORLD,mpierr)
1414#endif
1415              if (scal%npr*scal%npc .ne. 0) then
1416
1417                do it =1, nst(irk)
1418                  itot = itot + 1
1419
1420                  do icurr=1,scal%npr
1421                    this%gmetempr(icurr,itot)=this%gmetempn(indt(scal%imyrow(icurr),it,irk)) * &
1422                      pht(scal%imyrow(icurr),it,irk)
1423                  enddo
1424
1425                  do icurr=1,scal%npc
1426                    temp_gme = this%gmetempn(indt(scal%imycol(icurr),it,irk))
1427                    this%gmetempc(itot,icurr)=MYCONJG(temp_gme * pht(scal%imycol(icurr),it,irk) )
1428                  enddo
1429                enddo ! it
1430              endif
1431            enddo ! iv
1432            if(peinf%inode .eq. 0) then
1433              call timing%stop(timing%chi_sum_comm)
1434            endif
1435
1436
1437            ! JRD: Using Level3 BLAS here for better performance
1438
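            ! The gemm below performs chilocal = chilocal - gmetempr * gmetempc,
            ! a rank-ntot update that accumulates the transitions of this
            ! conduction band (all valence bands and symmetry-related k-points)
            ! into the local chi block.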
1439            if (scal%npr*scal%npc .ne. 0 .and. ntot > 0) then
1440              if(peinf%inode .eq. 0) then
1441                call timing%start(timing%chi_sum_gemm)
1442              endif
1443
1444              call X(gemm)('n','n',scal%npr,scal%npc,ntot, &
1445                -ONE,this%gmetempr,scal%npr,this%gmetempc,ntot,ONE,this%chilocal,scal%npr)
1446
1447              if(peinf%inode .eq. 0) then
1448                call timing%stop(timing%chi_sum_gemm)
1449              endif
1450
1451            endif
1452
1453          endif ! pol%freq_dep .eq. 0
1454
1455          !---------------------
1456          ! JRD: Full Frequency Be Here
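          ! For each (v,c) transition this branch applies the dynamical factor
          !   edenDRtemp(jj) = -1/2 * [ 1/(eden - w_jj) + 1/(eden + w_jj) ],
          ! with eden = pol%edenDyn and the complex frequency
          ! w_jj = (pol%dFreqGrid(jj) + pol%dFreqBrd(jj))/ryd, to the row matrix elements;
          ! chi(G,G',w) is then accumulated with one zgemm per frequency below.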
1457
1458          if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0 .or. pol%freq_dep_method .eq. 2)) then
1459
1460            if(peinf%inode .eq. 0) then
1461              call timing%start(timing%chi_sum_array_alloc)
1462            endif
1463            SAFE_ALLOCATE(edenDRtemp, (pol%nfreq))
1464            if(peinf%inode .eq. 0) then
1465              call timing%stop(timing%chi_sum_array_alloc)
1466            endif
1467
1468            itot=0
1469!JRD XXX Should thread
1470            do iv=1,(vwfn%nband+pol%ncrit)
1471              if(peinf%inode .eq. 0) then
1472                call timing%start(timing%chi_sum_comm)
1473              endif
1474
1475              isend=peinf%global_pairowner(iv,ic)-1
1476              if (isend .lt. 0) then
1477                write(0,*) 'Illegal value for mpi proc, isend: ',iv,ic
1478                call die("internal error in chi_summation")
1479              endif
1480              if (isend .eq. peinf%inode) then
1481                this%gmeRDyntempn(:) = pol%gme(:,iowna(iv),peinf%indexv(iv), &
1482                  ispin,irk,1) * sqrt(this%fact)
1483                if (abs(pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irk,1)) .gt. Tol_Zero) then
1484                  do jj=1,pol%nfreq
1485                    edenDRtemp(jj)= -0.50d0 * ( 1d0/(pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irk,1) &
1486                      -(pol%dFreqGrid(jj)+pol%dFreqBrd(jj))/ryd)+&
1487                      1d0/(pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irk,1)+&
1488                      (pol%dFreqGrid(jj)+pol%dFreqBrd(jj))/ryd))
1489                  enddo
1490                else
1491                  edenDRtemp(:)=0D0
1492                endif
1493                iowna(iv)=iowna(iv)+1
1494              endif
1495
1496#ifdef MPI
1497              call MPI_Bcast(this%gmeRDyntempn,pol%nmtx,MPI_COMPLEX_DPC,isend,MPI_COMM_WORLD,mpierr)
1498              call MPI_Bcast(edenDRtemp,pol%nfreq,MPI_COMPLEX_DPC,isend,MPI_COMM_WORLD,mpierr)
1499#endif
1500              if(peinf%inode .eq. 0) then
1501                call timing%stop(timing%chi_sum_comm)
1502              endif
1503
1504              if (scal%npr*scal%npc .ne. 0) then
1505                do it =1, nst(irk)
1506                  if(peinf%inode .eq. 0) then
1507                    call timing%start(timing%chi_sum_row)
1508                  endif
1509
1510                  itot = itot + 1
1511                  do icurr=1,scal%npr
1512                    do jj =1+peinf%rank_mtxel+(peinf%inode)/peinf%npes*100000,pol%nfreq,pol%nfreq_group
1513                      ifreq_para=(jj+pol%nfreq_group-1)/pol%nfreq_group
1514                      this%gmeRDyntempr2(icurr,itot,ifreq_para)= &
1515                        (this%gmeRDyntempn(indt(scal%imyrow(icurr),it,irk))*pht(scal%imyrow(icurr),it,irk))*edenDRtemp(jj)
1516                    enddo
1517                  enddo
1518
1519                  if(peinf%inode .eq. 0) then
1520                    call timing%stop(timing%chi_sum_row)
1521                  endif
1522
1523                  if(peinf%inode .eq. 0) then
1524                    call timing%start(timing%chi_sum_column)
1525                  endif
1526
1527                  do icurr=1,scal%npc
1528                    this%gmeRDyntempc(itot,icurr) = &
1529                      CONJG(this%gmeRDyntempn(indt(scal%imycol(icurr),it,irk))*pht(scal%imycol(icurr),it,irk))
1530                  enddo
1531
1532                  if(peinf%inode .eq. 0) then
1533                    call timing%stop(timing%chi_sum_column)
1534                  endif
1535
1536                enddo ! it
1537              endif
1538
1539            enddo ! iv
1540
1541
1542            ! JRD: Using Level3 BLAS here for better performance
1543
1544            if (scal%npr*scal%npc .ne. 0 .and. ntot > 0) then
1545              do jj =1+peinf%rank_mtxel+(peinf%inode)/peinf%npes*100000,pol%nfreq,pol%nfreq_group
1546                if(peinf%inode .eq. 0) then
1547                  call timing%start(timing%chi_sum_gemm)
1548                endif
1549
1550                ifreq_para=(jj+pol%nfreq_group-1)/pol%nfreq_group
1551                call zgemm('n','n',scal%npr,scal%npc,ntot,(-1D0,0D0),this%gmeRDyntempr2(:,:,ifreq_para),scal%npr, &
1552                  this%gmeRDyntempc(:,:),ntot,(1D0,0D0),this%chilocalRDyn(:,:,ifreq_para),scal%npr)
1553                if(peinf%inode .eq. 0) then
1554                  call timing%stop(timing%chi_sum_gemm)
1555                endif
1556
1557              enddo
1558            endif
1559
1560            if(peinf%inode .eq. 0) then
1561              call timing%start(timing%chi_sum_array_alloc)
1562            endif
1563
1564            SAFE_DEALLOCATE(edenDRtemp)
1565
1566            if(peinf%inode .eq. 0) then
1567              call timing%stop(timing%chi_sum_array_alloc)
1568            endif
1569
1570          endif ! (pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0)
1571
1572          !---------------------
          ! Full Frequency Be Here (Shishkin and Kresse 2006)
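          ! Spectral-function approach: each transition with energy zvalue is distributed
          ! between the two neighboring points of the spectral frequency grid (isfreql,
          ! isfreqr) with linear weights wl and wr; the resulting spectral function is
          ! accumulated in chilocalRDyn / pol%chiTDyn and Hilbert-transformed to
          ! pol%chiRDyn after the irk/ispin loops.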
1574
1575          if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
1576            count_v=0
1577            ind_v=0
1578            ind_sf=0
1579            nsftot=0
1580            itot=0
1581
1582            do iv=1,(vwfn%nband+pol%ncrit)
1583
1584              if(peinf%inode .eq. 0) then
1585                call timing%start(timing%chi_sum_comm)
1586              endif
1587
1588              isend=peinf%global_pairowner(iv,ic)-1
1589              if (isend .lt. 0) then
1590                write(0,*) 'Illegal value for mpi proc, isend: ',iv,ic
1591                call die("internal error in chi_summation")
1592              endif
1593              if (isend .eq. peinf%inode) then
1594                this%gmeRDyntempn(:) = pol%gme(:,iowna(iv),peinf%indexv(iv), &
1595                  ispin,irk,1) * sqrt(this%fact)
1596                zvalue=-pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irk,1)
1597                iowna(iv)=iowna(iv)+1
1598              endif
1599
1600#ifdef MPI
1601              call MPI_Bcast(this%gmeRDyntempn,pol%nmtx,MPI_COMPLEX_DPC,isend,MPI_COMM_WORLD,mpierr)
1602              call MPI_Bcast(zvalue,1,MPI_REAL_DP,isend,MPI_COMM_WORLD,mpierr)
1603              call MPI_Bcast(pol%dSFreqGrid,pol%os_nsfreq_para,MPI_REAL_DP,isend,MPI_COMM_WORLD,mpierr)
1604#endif
1605
1606              if(peinf%inode .eq. 0) then
1607                call timing%stop(timing%chi_sum_comm)
1608              endif
1609              ! compute spectral functions of the polarizability
1610
1611              if (scal%npr*scal%npc .ne. 0) then
1612                do it=1, nst(irk)
1613
1614                  if (abs(zvalue) .gt. Tol_Zero) then
1615
1616                    if(peinf%inode .eq. 0) then
1617                      call timing%start(timing%chi_sum_row)
1618                    endif
1619
1620                    itot=itot+1
1621                    isfreql=-1
1622                    do jj=pol%nsfreq,1,-1
1623                      if ((pol%dSFreqGrid(jj)/ryd)<zvalue) then
1624                        isfreql=jj
1625                        EXIT
1626                      endif
1627                    enddo
1628
1629                    if (isfreql.eq.pol%nsfreq) then
1630                      cycle
1631                    endif
1632
1633                    isfreqr=isfreql+1
1634
1635                    count_v(isfreql)=count_v(isfreql)+1
1636                    count_v(isfreqr)=count_v(isfreqr)+1
1637
1638                    il=count_v(isfreql)
1639                    ir=count_v(isfreqr)
1640
1641                    ind_v(isfreql,il)=itot
1642                    ind_v(isfreqr,ir)=itot
1643
1644                    sfreql=pol%dSFreqGrid(isfreql)/ryd
1645                    sfreqr=pol%dSFreqGrid(isfreqr)/ryd
1646
1647                    wl=-(zvalue-sfreqr)/(sfreqr-sfreql)
1648                    wr=(zvalue-sfreql)/(sfreqr-sfreql)
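                    ! wl + wr = 1: the transition is split linearly between the two
                    ! neighboring spectral frequencies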
1649
1650                    do icurr=1,scal%npr
1651                      this%gmeRDyntempr2(icurr,il,isfreql)=this%gmeRDyntempn( &
1652                        indt(scal%imyrow(icurr),it,irk))*pht(scal%imyrow(icurr),it,irk)*wl
1653                      this%gmeRDyntempr2(icurr,ir,isfreqr)=this%gmeRDyntempn( &
1654                        indt(scal%imyrow(icurr),it,irk))*pht(scal%imyrow(icurr),it,irk)*wr
1655                    enddo
1656
1657                    if(peinf%inode .eq. 0) then
1658                      call timing%stop(timing%chi_sum_row)
1659                    endif
1660
1661                    if(peinf%inode .eq. 0) then
1662                      call timing%start(timing%chi_sum_column)
1663                    endif
1664
1665                    do icurr=1,scal%npc
1666                      this%gmeRDyntempc(itot,icurr) = &
1667                        CONJG(this%gmeRDyntempn(indt(scal%imycol(icurr),it,irk))*pht(scal%imycol(icurr),it,irk))
1668                    enddo
1669
1670                    if(peinf%inode .eq. 0) then
1671                      call timing%stop(timing%chi_sum_column)
1672                    endif
1673
1674                  endif
1675
1676                enddo ! it
1677              endif
1678
1679            enddo ! iv
1680
1681            if(peinf%inode .eq. 0) then
1682              call timing%start(timing%chi_sum_array_alloc)
1683            endif
1684
1685            jj=0
1686            do ii=1+peinf%rank_mtxel,pol%nsfreq,pol%nfreq_group
1687              if (count_v(ii)>0) then
1688                jj=jj+1
1689                ind_sf(jj)=ii
1690              endif
1691            enddo
1692            nsftot=jj
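            ! ind_sf(1:nsftot) lists the spectral frequencies of this frequency group
            ! that received at least one transition for this conduction band; only
            ! those blocks need a zgemm below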
1693            if(peinf%inode .eq. 0) then
1694              call timing%stop(timing%chi_sum_array_alloc)
1695            endif
1696
1697
1698            if (scal%npr*scal%npc .ne. 0 .and. ntot > 0) then
1699              do ii=1, nsftot
1700                if(peinf%inode .eq. 0) then
1701                  call timing%start(timing%chi_sum_gemm)
1702                endif
1703
1704                n1=ind_sf(ii)
1705                n2=count_v(n1)
1706
1707                do jj=1,n2
1708                  n3=ind_v(n1,jj)
1709                  this%gmeRDyntempcs(jj,:)=this%gmeRDyntempc(n3,:)
1710                enddo
1711
1712                call zgemm('n','n',scal%npr,scal%npc,n2,(-1D0,0D0),this%gmeRDyntempr2(:,:,n1),scal%npr, &
1713                  this%gmeRDyntempcs(:,:),max_nv,(1D0,0D0),this%chilocalRDyn(:,:,n1),scal%npr)
1714
1715                if(peinf%inode .eq. 0) then
1716                  call timing%stop(timing%chi_sum_gemm)
1717                endif
1718
1719              enddo
1720            endif
1721          endif ! (pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)
1722
1723        enddo ! ic (loop over conduction bands)
1724
1725
1726        if(peinf%inode .eq. 0) then
1727          call timing%start(timing%chi_sum_array_alloc)
1728        endif
1729        if (pol%freq_dep .eq. 0) then
1730          if (scal%npr*scal%npc .ne. 0) then
1731            pol%chi(:,:,ispin) = pol%chi(:,:,ispin) + this%chilocal(:,:)
1732          endif
1733          SAFE_DEALLOCATE(this%gmetempr)
1734          SAFE_DEALLOCATE(this%gmetempc)
1735        endif
1736
1737        if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0 .or. pol%freq_dep_method .eq. 2)) then
1738          if (scal%npr*scal%npc .ne. 0) then
1739            do jj = 1+peinf%rank_mtxel+(peinf%inode)/peinf%npes*100000,pol%nfreq,pol%nfreq_group
1740              ifreq_para=(jj+pol%nfreq_group-1)/pol%nfreq_group
1741! JRD XXX This copy is now a waste of time. Should set pol%chi directly above in the zgemm
1742              pol%chiRDyn(:,:,ifreq_para,ispin) = pol%chiRDyn(:,:,ifreq_para,ispin) + this%chilocalRDyn(:,:,ifreq_para)
1743            enddo
1744          endif
1745          SAFE_DEALLOCATE(this%gmeRDyntempr2)
1746          SAFE_DEALLOCATE(this%gmeRDyntempc)
1747        endif
1748        if(peinf%inode .eq. 0) then
1749          call timing%stop(timing%chi_sum_array_alloc)
1750        endif
1751
1752
1753        if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
1754
1755          if(peinf%inode .eq. 0) then
1756            call timing%start(timing%chi_sum_array_alloc)
1757          endif
1758
1759          if (scal%npr*scal%npc .ne. 0) then
1760            do jj = 1+peinf%rank_mtxel, pol%nsfreq,pol%nfreq_group
1761              pol%chiTDyn(jj,:,:,ispin) = pol%chiTDyn(jj,:,:,ispin) + this%chilocalRDyn(:,:,jj)
1762            enddo
1763          endif
1764          SAFE_DEALLOCATE(this%gmeRDyntempr2)
1765          SAFE_DEALLOCATE(this%gmeRDyntempc)
1766          SAFE_DEALLOCATE(this%gmeRDyntempcs)
1767          SAFE_DEALLOCATE(ind_v)
1768          if(peinf%inode .eq. 0) then
1769            call timing%stop(timing%chi_sum_array_alloc)
1770          endif
1771
1772        endif
1773
1774      enddo ! ispin (loop over spins)
1775    enddo ! irk (loop over k-points in set rk)
1776    call progress_free(prog_info)
1777
1778    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
1779
1780      ! -------------------------
1781      ! Hilbert transform
1782      ! -------------------------
1783
1784      if(peinf%inode .eq. 0) then
1785        call timing%start(timing%chi_sum_ht_nb)
1786      endif
1787
1788
1789! JRD XXX pol%chi arrays out of order below
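      ! Hilbert transform as a single zgemm over the flattened (G,G',spin) index:
      !   chiRDynOld(iw,:,:,:) = - sum_isf htwR(iw,isf) * chiTDyn(isf,:,:,:)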
1790      call zgemm('n','n',pol%nfreq,scal%npr*scal%npc*kp%nspin,pol%os_nsfreq_para, &
1791        (-1D0,0D0),htwR(:,:),pol%nfreq,pol%chiTDyn(:,:,:,:),pol%os_nsfreq_para, &
1792        (0D0,0D0),this%chiRDynOld(:,:,:,:),pol%nfreq)
1793
1794      do ispin =1, kp%nspin
1795        do jj=1,pol%nfreq_in_group
1796          pol%chiRDyn(:,:,jj,ispin) = this%chiRDynOld(jj,:,:,ispin)
1797        enddo ! jj
1798      enddo ! ispin
1799
1800      if(peinf%inode .eq. 0) then
1801        call timing%stop(timing%chi_sum_ht_nb)
1802      endif
1803
1804
1805    endif !(pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)
1806
1807    if(peinf%inode .eq. 0) then
1808      call timing%start(timing%chi_sum_array_alloc)
1809    endif
1810
1811    if (pol%freq_dep .eq. 0) then
1812      SAFE_DEALLOCATE(this%chilocal)
1813    endif
1814    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0 .or. pol%freq_dep_method .eq. 2)) then
1815      SAFE_DEALLOCATE(this%chilocalRDyn)
1816    endif
1817    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
1818      SAFE_DEALLOCATE(this%chilocalRDyn)
1819      SAFE_DEALLOCATE(count_v)
1820      SAFE_DEALLOCATE(ind_sf)
1821    endif
1822
1823    SAFE_DEALLOCATE(iowna)
1824    !call free_summation_buffers(pol)
1825
1826    if(peinf%inode .eq. 0) then
1827      call timing%stop(timing%chi_sum_array_alloc)
1828    endif
1829
1830
1831    POP_SUB(chi_summation_comm_elements)
1832    return
1833
1834  end subroutine chi_summation_comm_elements
1835
1836#if defined MPI && defined USESCALAPACK
1837  subroutine chi_summation_sub_trunc(this,pol,scal,kp,kpq,vwfn,cwfn,&
1838                                     nst,nrk,indt,pht,gvec,crys,q0,iq)
1839    use inversion_m
1840    type(chi_summator_t), intent(INOUT) :: this
1841    type(polarizability), intent(INOUT) :: pol
1842    type(scalapack), intent(in) :: scal
1843    type(kpoints), intent(IN) :: kp, kpq
1844    type(valence_wfns), intent(IN) :: vwfn
1845    type(conduction_wfns), intent(IN) :: cwfn
1846
1847    integer, intent(IN) :: nst(:)
1848    integer, intent(IN) :: nrk
1849    integer, intent(INOUT) :: indt(:,:,:)
1850    SCALAR,  intent(INOUT) :: pht(:,:,:)
1851    type (gspace), intent(in) :: gvec
1852    type (crystal), intent(in) :: crys
1853    real(DP), intent(in) :: q0(3)
1854    integer, intent(in)  :: iq
1855    real(DP) :: zvalue, cv_energy
1856
1857    SCALAR, ALLOCATABLE :: chilocal(:,:), chiRDyn_local(:,:)
1858    SCALAR, allocatable :: chi_omega0_save(:,:,:)
1859    INTEGER :: ntot, ntot2
1860    INTEGER :: neig_sub
1861    INTEGER :: irk, ipe, ispin, itot, it, iv, i, j, ii, jj
1862    INTEGER :: irow, icol, irowm, icolm, icurr, ilimit, freq_idx, &
1863               i_myband, mytot
1864    integer :: ig_l, ig_g, igp_l, igp_g
1865    INTEGER :: ipe_real
1866
1867    integer, allocatable :: tmprowindex(:),tmpcolindex(:)
1868    SCALAR, allocatable :: tmprowph(:),tmpcolph(:)
1869    type(cvpair_info) :: cvpair_temp
1870    SCALAR :: negfact
1871    SCALAR :: edenDRtemp
1872
1873    SCALAR :: epsheaddummy, wcoul0
1874    type (twork_scell) :: work_scell
1875    real(DP), allocatable :: vcoul(:)
1876    integer, allocatable :: isorti(:)
1877    real(DP) :: vc, oneoverq, avgcut
1878    integer :: iscreen, nfk, iparallel
1879    integer :: qgrid(3)
1880    real(DP) :: q0vec(3)
1881
1882    ! variables for non-blocking cyclic scheme
1883    integer :: isend_static, irec_static
1884    integer :: actual_send, actual_rec
1885    integer :: nsend_row, nsend_col, nrec_row, nrec_col
1886    integer :: req_send, tag_send, req_rec, tag_rec
1887    integer :: stat(MPI_STATUS_SIZE)
1888    SCALAR, allocatable :: buf_send(:,:,:), buf_rec(:,:,:), buf_temp(:,:,:)
1889
1890    type(progress_info) :: prog_info
1891    ! variables for fixed buffer size
1892    logical :: keep_fix_buf
1893    integer :: size_N, size_M, size_K
1894    ! variables for matrix dumping
1895    logical :: do_read_dump_matrix, do_dump_matrix
1896
1897    PUSH_SUB(chi_summation_sub_trunc)
1898
    ! Keep these variables hardcoded; advanced users can use them to dump
    ! the block-cyclic distributed polarizability as a checkpoint. For the
    ! specific format see the two routines at the bottom of the module
    ! (dump_matrix and read_dump_matrix).
1903    do_dump_matrix = .false.
1904    do_read_dump_matrix = .false.
    ! if we read a dumped matrix, make sure we don't write, so that we don't
    ! overwrite the results
1907    if ( do_read_dump_matrix ) do_dump_matrix = .false.
1908
    ! this is the new communication scheme, which avoids continuous allocation and
    ! deallocation of the communication / computation buffers. It is enabled by
    ! default (.true.) and can be turned off with the corresponding input keyword.
1912    keep_fix_buf = .false.
    ! keep_fix_buf only works with pol%nonblocking_cyclic
1914    if( pol%nonblocking_cyclic ) then
1915      keep_fix_buf = .true.
1916      if( pol%dont_keep_fix_buffers ) then
1917         keep_fix_buf = .false.
1918      end if
1919    end if
1920
1921    !XXXXXXXXX
1922    ! WRITE(2000,*)  pol%gme
1923    ! WRITE(2000,*)  'XXXX'
1924    ! WRITE(2000,*)  pht
1925    ! WRITE(*,*) SIZE(pol%gme,1)
1926    ! WRITE(*,*) SIZE(pol%gme,2)
1927    ! WRITE(*,*) SIZE(pol%gme,3)
1928    ! WRITE(*,*) SIZE(pol%gme,4)
1929    ! WRITE(*,*) SIZE(pol%gme,5)
1930    ! WRITE(*,*) SIZE(pol%gme,6)
1931    ! DO irk = 1, nrk
1932    !   DO j = 1, SIZE(pol%gme,3)
1933    !     DO i = 1, SIZE(pol%gme,2)
1934    !       WRITE(2000,*) pht(i,j,irk)
1935    !       WRITE(2000,*) pol%gme(:,i,j,1,irk,1)
1936    !     END DO
1937    !   END DO
1938    ! END DO
1939    !XXXXXXXXX
1940
1941    ! first calculate polarizability at omega=0 using all processes (similar to static case)
1942    ntot=0
1943    ntot2=0
1944    ntot = peinf%nvownactual*peinf%ncownactual
1945    do irk = 1, nrk
1946      ntot2=ntot2 + nst(irk)
1947    enddo
1948    ntot=ntot*ntot2
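    ! ntot = peinf%nvownactual*peinf%ncownactual * sum(nst(1:nrk)): the number of
    ! transition columns this rank contributes to the matrix multiplication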
1949
1950    SAFE_ALLOCATE(this%chilocal2, (scal%npr,scal%npc,kp%nspin))
1951    this%chilocal2=0
1952
1953    freq_idx = pol%nfreq_group
1954    IF(pol%nfreq_group .gt. 1) THEN
1955      freq_idx = peinf%rank_mtxel+1
1956    END IF
1957
1958    negfact = -1D0*this%fact
1959
    ! time the chi calculation at omega=0
1961    if(peinf%inode .eq. 0) then
1962      call timing%start(timing%chi_sum_sub_omega_0)
1963    endif
1964
1965
    ! here we use all processes to compute omega=0; parallelization over
    ! frequencies is not handled at this stage
1968    call progress_init(prog_info, 'building polarizability matrix omega=0', 'processor', &
1969      peinf%npes)
1970
1971    if(pol%nonblocking_cyclic) then
      ! initialize buffers for the non-blocking cyclic scheme;
      ! the send/receive partners are fixed (ring communication)
1974      isend_static = MOD(peinf%inode + 1 + peinf%npes, peinf%npes)
1975      irec_static = MOD(peinf%inode - 1 + peinf%npes, peinf%npes)
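      ! Ring scheme: every rank always sends to inode+1 and receives from inode-1.
      ! At step ipe a rank adds its contribution for the chi block owned by rank
      ! (inode-ipe) to the buffer it just received, so after npes steps each rank
      ! holds the fully summed block it owns; this replaces the MPI_Reduce used in
      ! the standard scheme.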
      ! use my own local block sizes for the first send
1977      nsend_row = scal%nprd(peinf%inode+1)
1978      nsend_col = scal%npcd(peinf%inode+1)
1979
1980      if(keep_fix_buf) then
1981        ! precompute max sizes
1982        size_N = MAXVAL(scal%nprd(:))
1983        size_M = MAXVAL(scal%npcd(:))
1984        size_K = ntot
1985        call mpi_allreduce(MPI_IN_PLACE, size_K, 1, &
1986                           MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, mpierr)
1987        ! allocate only once
1988        SAFE_ALLOCATE(buf_send, (size_N, size_M, kp%nspin))
1989        SAFE_ALLOCATE(buf_rec,  (size_N, size_M, kp%nspin))
1990        SAFE_ALLOCATE(buf_temp, (size_N, size_M, kp%nspin))
1991        do ispin = 1 , kp%nspin
1992          !$OMP PARALLEL DO collapse(2)
1993          do ii = 1, size_M
1994            do jj = 1, size_N
1995              buf_send(jj,ii,ispin) = ZERO
1996            enddo
1997          enddo
1998          !$OMP PARALLEL DO collapse(2)
1999          do ii = 1, size_M
2000            do jj = 1, size_N
2001              buf_rec(jj,ii,ispin)  = ZERO
2002            end do
2003          end do
2004          !$OMP PARALLEL DO collapse(2)
2005          do ii = 1, size_M
2006            do jj = 1, size_N
2007              buf_temp(jj,ii,ispin) = ZERO
2008            end do
2009          end do
2010        end do
2011        ! allocate for index mapping
2012        SAFE_ALLOCATE(tmprowindex, (size_N)) ! (scal%nprd(ipe_real)))
2013        SAFE_ALLOCATE(tmpcolindex, (size_M)) ! (scal%npcd(ipe_real)))
2014        SAFE_ALLOCATE(tmprowph,  (size_N)) ! (scal%nprd(ipe_real)))
2015        SAFE_ALLOCATE(tmpcolph,  (size_M)) ! (scal%npcd(ipe_real)))
2016        ! allocate for matrix multiplication
2017        SAFE_ALLOCATE(this%chilocal, (size_N, size_M)) ! (scal%nprd(ipe),scal%npcd(ipe))
2018        SAFE_ALLOCATE(this%gmetempr, (size_N, size_K)) ! (scal%nprd(ipe),ntot)
2019        SAFE_ALLOCATE(this%gmetempc, (size_K, size_M)) ! (ntot,scal%npcd(ipe))
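        ! buffers are padded to the largest block/transition sizes over all ranks
        ! (size_N, size_M, size_K) so that they can be reused at every ipe step
        ! without reallocation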
2020        !$OMP PARALLEL DO collapse(2)
2021        do ii = 1, size_M
2022          do jj = 1, size_N
2023            this%chilocal(jj,ii) = ZERO
2024          end do
2025        end do
2026        !$OMP PARALLEL DO collapse(2)
2027        do ii = 1, size_K
2028          do jj = 1, size_N
2029            this%gmetempr(jj,ii) = ZERO
2030          end do
2031        end do
2032        !$OMP PARALLEL DO collapse(2)
2033        do ii = 1, size_M
2034          do jj = 1, size_K
2035            this%gmetempc(jj,ii) = ZERO
2036          end do
2037        end do
2038        !
2039      else  ! keep_fix_buf
2040        !
2041        ! standard case
2042        SAFE_ALLOCATE(buf_send, (nsend_row, nsend_col, kp%nspin))
2043        do ispin = 1 , kp%nspin
2044          !$OMP PARALLEL DO collapse(2)
2045          do ii = 1, nsend_col
2046            do jj = 1, nsend_row
2047              buf_send(jj,ii,ispin) = ZERO
2048            enddo
2049          enddo
2050        end do
2051        !
2052      end if  ! keep_fix_buf
2053    end if
2054
2055    do ipe = 1, peinf%npes
2056      call progress_step(prog_info)
2057
2058      if ( do_read_dump_matrix ) cycle
2059
2060      if(pol%nonblocking_cyclic) then
        ! determine the actual processes we send to and receive from at this step
2062        actual_send = MOD(peinf%inode + ipe + peinf%npes, peinf%npes)
2063        actual_rec  = MOD(peinf%inode - ipe + peinf%npes, peinf%npes)
2064        nrec_row = scal%nprd(actual_rec+1)
2065        nrec_col = scal%npcd(actual_rec+1)
2066        !
2067        if( keep_fix_buf ) then
2068          ! just make sure sizes will always be the same
2069          nsend_row = size_N
2070          nrec_row  = size_N
2071          nsend_col = size_M
2072          nrec_col  = size_M
2073        else
          ! allocate receiving buffer
2075          SAFE_ALLOCATE(buf_rec, (nrec_row,nrec_col,kp%nspin))
2076        end if
2077        !
2078        do ispin = 1 , kp%nspin
2079          !$OMP PARALLEL DO collapse(2)
2080          do ii = 1, nrec_col
2081            do jj = 1, nrec_row
2082              buf_rec(jj,ii,ispin) = ZERO
2083            enddo
2084          enddo
2085        end do
        ! post the non-blocking receive and send
2087        if(peinf%inode .eq. 0) then
2088          call timing%start(timing%chi_sum_comm)
2089        endif
2090        tag_rec = 1
2091        tag_send = 1
2092        req_rec = MPI_REQUEST_NULL
2093        req_send = MPI_REQUEST_NULL
2094        CALL MPI_Irecv(buf_rec, nrec_row*nrec_col*kp%nspin,MPI_SCALAR,irec_static,&
2095                       tag_rec, MPI_COMM_WORLD, req_rec, mpierr)
2096        CALL MPI_Isend(buf_send, nsend_row*nsend_col*kp%nspin,MPI_SCALAR,isend_static,&
2097                       tag_send, MPI_COMM_WORLD, req_send, mpierr)
2098        if(peinf%inode .eq. 0) then
2099          call timing%stop(timing%chi_sum_comm)
2100        endif
2101
2102        ! allocate working array
2103        if(peinf%inode .eq. 0) then
2104          call timing%start(timing%chi_sum_array_alloc)
2105        endif
2106        !
2107        if( .not. keep_fix_buf ) then
2108          SAFE_ALLOCATE(this%chilocal, (nrec_row, nrec_col))
2109        end if
2110        !
2111!$OMP PARALLEL DO collapse(2)
2112        do ii = 1, nrec_col
2113          do jj = 1, nrec_row
2114            this%chilocal(jj,ii)=0D0
2115          enddo
2116        enddo
2117        !
2118        if( .not. keep_fix_buf ) then
2119          SAFE_ALLOCATE(buf_temp, (nrec_row, nrec_col, kp%nspin))
2120        end if
2121        !
2122        do ispin = 1 , kp%nspin
2123!$OMP PARALLEL DO collapse(2)
2124          do ii = 1, nrec_col
2125            do jj = 1, nrec_row
2126              buf_temp(jj,ii,ispin) = ZERO
2127            enddo
2128          enddo
2129        end do
2130        !
2131        if( keep_fix_buf ) then
          ! make sure to zero the buffers
2133          !
2134          !$OMP PARALLEL DO collapse(2)
2135          do ii = 1, size_K
2136            do jj = 1, size_N
2137              this%gmetempr(jj,ii) = ZERO
2138            end do
2139          end do
2140          !$OMP PARALLEL DO collapse(2)
2141          do ii = 1, size_M
2142            do jj = 1, size_K
2143              this%gmetempc(jj,ii) = ZERO
2144            end do
2145          end do
2146          !
2147        else
2148          SAFE_ALLOCATE(this%gmetempr, (nrec_row, ntot))
2149          SAFE_ALLOCATE(this%gmetempc, (ntot, nrec_col))
2150        end if
2151        if(peinf%inode .eq. 0) then
2152          call timing%stop(timing%chi_sum_array_alloc)
2153        endif
2154
          ! in this case ipe_real is the rank (1-based) we actually receive from
2156        ipe_real = actual_rec+1
2157      else
2158        ! standard calculation
2159        SAFE_ALLOCATE(this%chilocal, (scal%nprd(ipe),scal%npcd(ipe)))
2160        this%chilocal=0D0
2161        SAFE_ALLOCATE(this%gmetempr, (scal%nprd(ipe),ntot))
2162        SAFE_ALLOCATE(this%gmetempc, (ntot,scal%npcd(ipe)))
2163        ! this is the standard case
2164        ipe_real = ipe
2165      end if
2166
2167      do ispin = 1 , kp%nspin
2168
2169        itot = 0
2170
2171        if(peinf%inode .eq. 0) then
2172          call timing%start(timing%chi_sum_prep)
2173        endif
2174
2175        if ( .not. keep_fix_buf ) then
2176          SAFE_ALLOCATE(tmprowindex,(scal%nprd(ipe_real)))
2177          SAFE_ALLOCATE(tmpcolindex,(scal%npcd(ipe_real)))
2178          SAFE_ALLOCATE(tmprowph,(scal%nprd(ipe_real)))
2179          SAFE_ALLOCATE(tmpcolph,(scal%npcd(ipe_real)))
2180        end if
2181
2182        do irk = 1, nrk
2183          do it = 1, nst(irk)
2184
2185            do icurr=1,scal%nprd(ipe_real)
2186              tmprowindex(icurr) = indt(scal%imyrowd(icurr,ipe_real),it,irk)
2187              tmprowph(icurr) = pht(scal%imyrowd(icurr,ipe_real),it,irk)
2188            enddo
2189            do icurr=1,scal%npcd(ipe_real)
2190              tmpcolindex(icurr) = indt(scal%imycold(icurr,ipe_real),it,irk)
2191              tmpcolph(icurr) = pht(scal%imycold(icurr,ipe_real),it,irk)
2192            enddo
2193
2194            do iv = 1, vwfn%nband+pol%ncrit
               ! check if I own this particular band
2196               if (peinf%doiownv(iv)) then
2197                 ilimit = peinf%ncownactual
2198               else
2199                 ilimit = 0
2200               endif
2201
2202               !$OMP PARALLEL private (mytot, zvalue, edenDRtemp, icurr, &
2203               !$OMP                   cvpair_temp, cv_energy )
2204               !$OMP DO
2205               do i_myband = 1, ilimit
2206                 mytot = itot + i_myband
2207
2208                 zvalue = pol%edenDyn(peinf%indexv(iv),i_myband,ispin,irk,freq_idx)
2209                 if(pol%lin_denominator<TOL_Zero) then
2210                   ! this is when the lin_denominator mode is not active.
2211                   if(abs(zvalue) .gt. Tol_Zero) then
2212                     edenDRtemp = -0.5D+00 * ( &
2213                            1d0/(zvalue - pol%dFreqBrd(1)/ryd)+ &
2214                            1d0/(zvalue + pol%dFreqBrd(1)/ryd))
2215                   else
2216                     edenDRtemp = 0.0D+00
2217                   end if
2218                 else
                   ! this is when the lin_denominator mode is active
2220                   cvpair_temp = pol%lin_edenDyn(peinf%indexv(iv),i_myband,ispin,irk,freq_idx)
2221                   cv_energy = cvpair_temp%ev - cvpair_temp%ec
2222
2223                   if(abs(cv_energy) - pol%lin_denominator > (-TOL_Zero)) then
2224                     !
2225                     if (abs(zvalue) .gt. Tol_Zero) then
2226                       edenDRtemp = -0.5d0*( &
2227                       1d0/(zvalue-pol%dFreqBrd(1)/ryd)+ &
2228                       1d0/(zvalue+pol%dFreqBrd(1)/ryd))
2229                     else
2230                       edenDRtemp = 0D0
2231                     endif
2232                     !
2233                   else ! if denominator is small, do linearized energy denominator
2234                     !
2235                     if(cvpair_temp%vltc) then
2236                       edenDRtemp = -0.5d0*(&
2237
2238                         conjg(integrate_mbz(kp,kpq%rk(1:3,cvpair_temp%idx_kp),&
2239                           cvpair_temp%ev,cvpair_temp%ec,&
2240
2241                           cvpair_temp%vv,cvpair_temp%vc,&
2242                           -pol%dFreqGrid(1)/ryd,pol%efermi/ryd))&
2243
2244                         +integrate_mbz(kp,kpq%rk(1:3,cvpair_temp%idx_kp),&
2245                           cvpair_temp%ev,cvpair_temp%ec,&
2246                           cvpair_temp%vv,cvpair_temp%vc,&
2247                           pol%dFreqGrid(1)/ryd,pol%efermi/ryd))
2248                     else
2249                       edenDRtemp = 0D0
2250                     endif
2251                     !
2252                   end if
2253                   !
2254                 end if ! lin denominator
2255
                 ! divide the matrix elements by the corresponding energy denominator
                 ! right-hand-side matrix
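                 ! gmetempr(G,t) = M_t(G) * phase * edenDRtemp and
                 ! gmetempc(t,G') = conjg(M_t(G') * phase), so the gemm over t below
                 ! gives this block's contribution to chi(G,G') at omega = 0 (first frequency)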
2258                 do icurr=1,scal%nprd(ipe_real)
2259                   this%gmetempr(icurr,mytot)=pol%gme(tmprowindex(icurr), &
2260                     i_myband,peinf%indexv(iv),ispin,irk,freq_idx) * tmprowph(icurr) * edenDRtemp
2261                 enddo
2262                 ! left hand side matrix
2263                 do icurr=1,scal%npcd(ipe_real)
2264                   this%gmetempc(mytot,icurr) = &
2265                     MYCONJG( pol%gme(tmpcolindex(icurr),i_myband,peinf%indexv(iv),ispin,irk,freq_idx) * tmpcolph(icurr) )
2266                 enddo
2267
2268               end do ! i_myband
2269               !$OMP END DO
2270               !$OMP END PARALLEL
2271               itot = itot+ilimit
2272
2273            end do  ! iv
2274
2275          end do   ! it
2276        end do   ! irk
2277
2278        if ( .not. keep_fix_buf ) then
2279          SAFE_DEALLOCATE(tmprowindex)
2280          SAFE_DEALLOCATE(tmpcolindex)
2281          SAFE_DEALLOCATE(tmprowph)
2282          SAFE_DEALLOCATE(tmpcolph)
2283        end if
2284
2285        if(peinf%inode .eq. 0) then
2286          call timing%stop(timing%chi_sum_prep)
2287        endif
2288
2289
        ! Proceed with the matrix multiplication.
        ! With fixed-size buffers we always multiply; the padded entries are zero.
2292        if( keep_fix_buf ) then
2293          !
2294          if(peinf%inode .eq. 0) then
2295            call timing%start(timing%chi_sum_gemm)
2296          endif
2297
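          ! multiply the full padded buffers, (size_N x size_K) * (size_K x size_M);
          ! rows/columns beyond the actual local block sizes are zero and do not
          ! contribute to chilocal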
2298#ifdef CPLX
2299          call zgemm('n','n', size_N, size_M, size_K, &
2300            negfact,this%gmetempr(:,:), size_N, this%gmetempc(:,:), size_K,&
2301            (0D0,0D0), this%chilocal(:,:), size_N )
2302#else
2303          call dgemm('n','n', size_N, size_M, size_K, &
2304                      negfact,this%gmetempr(:,:), size_N, this%gmetempc(:,:), size_K, &
2305                      0D0,this%chilocal(:,:), size_N )
2306#endif
2307          if(peinf%inode .eq. 0) then
2308            call timing%stop(timing%chi_sum_gemm)
2309          endif
2310
2311          !
2312        else
2313        !
2314        if(ntot > 0) then
2315          if(peinf%inode .eq. 0) then
2316            call timing%start(timing%chi_sum_gemm)
2317          endif
2318
          ! in the pol%nonblocking_cyclic case, ipe is the shift with respect to this
          ! rank (inode) and ipe_real is the absolute rank we are receiving from;
          ! in the standard case ipe == ipe_real
2322#ifdef CPLX
2323          call zgemm('n','n',scal%nprd(ipe_real),scal%npcd(ipe_real),ntot, &
2324            negfact,this%gmetempr(:,:),scal%nprd(ipe_real),this%gmetempc(:,:),ntot,&
2325            (0D0,0D0),this%chilocal(:,:),scal%nprd(ipe_real))
2326#else
2327          call dgemm('n','n',scal%nprd(ipe_real),scal%npcd(ipe_real),ntot, &
2328                     negfact,this%gmetempr(:,:),scal%nprd(ipe_real),this%gmetempc(:,:),ntot,&
2329                     0D0,this%chilocal(:,:),scal%nprd(ipe_real))
2330#endif
2331          if(peinf%inode .eq. 0) then
2332            call timing%stop(timing%chi_sum_gemm)
2333          endif
2334
2335        endif
2336        !
2337        end if !keep_fix_buf
2338
2339        if(pol%nonblocking_cyclic) then
2340          ! just copy the result
2341!$OMP PARALLEL DO collapse(2)
2342          do ii = 1, nrec_col
2343            do jj = 1, nrec_row
2344              buf_temp(jj,ii,ispin) = this%chilocal(jj,ii)
2345            enddo
2346          enddo
2347        else
2348          if(peinf%inode .eq. 0) then
2349            call timing%start(timing%chi_sum_comm)
2350          endif
2351
2352#ifdef MPI
2353          ! here ipe == ipe_real
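          ! each rank contributes its partial sum for the chi block owned by rank
          ! ipe-1; the reduced block ends up in chilocal2 on that rank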
2354          call MPI_Reduce(this%chilocal(1,1),this%chilocal2(1,1,ispin),scal%npcd(ipe)*scal%nprd(ipe),MPI_SCALAR, &
2355                          MPI_SUM,ipe-1,MPI_COMM_WORLD,mpierr)
2356#else
2357          this%chilocal2(:,:,ispin)=this%chilocal(:,:)
2358#endif
2359          if(peinf%inode .eq. 0) then
2360            call timing%stop(timing%chi_sum_comm)
2361          endif
2362
2363        end if
2364
2365      enddo ! ispin
2366      if(peinf%inode .eq. 0) then
2367        call timing%start(timing%chi_sum_array_alloc)
2368      endif
2369
2370      !
2371      if( .not. keep_fix_buf ) then
2372        SAFE_DEALLOCATE(this%chilocal)
2373        SAFE_DEALLOCATE(this%gmetempr)
2374        SAFE_DEALLOCATE(this%gmetempc)
2375      end if
2376      !
2377      if(peinf%inode .eq. 0) then
2378        call timing%stop(timing%chi_sum_array_alloc)
2379      endif
2380
2381
2382      ! finalize non-blocking cyclic communication
2383      if(pol%nonblocking_cyclic) then
2384          if(peinf%inode .eq. 0) then
2385            call timing%start(timing%chi_sum_comm)
2386          endif
2387
2388          ! make sure the buffer is received
2389          CALL MPI_Wait(req_rec,stat,mpierr)
2390          ! accumulate contribution into receiving buffer
2391          ! buf_rec(:,:,:) = buf_rec(:,:,:) + buf_temp
2392          do ispin = 1 , kp%nspin
2393!$OMP PARALLEL DO collapse(2)
2394            do ii = 1, nrec_col
2395              do jj = 1, nrec_row
2396                buf_rec(jj,ii,ispin) = buf_rec(jj,ii,ispin) + buf_temp(jj,ii,ispin)
2397              enddo
2398            enddo
2399          end do
2400          !
2401          if( .not. keep_fix_buf ) then
2402            SAFE_DEALLOCATE(buf_temp)
2403          end if
2404          !
          ! wait for the message to be sent
2406          CALL MPI_Wait(req_send,stat,mpierr)
2407          if(peinf%inode .eq. 0) then
2408            call timing%stop(timing%chi_sum_comm)
2409          endif
2410
2411          if(peinf%inode .eq. 0) then
2412            call timing%start(timing%chi_sum_array_alloc)
2413          endif
2414
          ! copy the message to the sending buffer for the next cycle
2416          !
2417          if( .not. keep_fix_buf ) then
2418            SAFE_DEALLOCATE(buf_send)
2419            SAFE_ALLOCATE(buf_send, (nrec_row, nrec_col, kp%nspin))
2420          end if
2421          !
2422          do ispin = 1 , kp%nspin
2423!$OMP PARALLEL DO collapse(2)
2424            do ii = 1, nrec_col
2425              do jj = 1, nrec_row
2426                buf_send(jj,ii,ispin) = buf_rec(jj,ii,ispin)
2427              enddo
2428            enddo
2429          end do
2430          !
2431          !XXX buf_send = buf_rec
2432          !
2433          nsend_row = nrec_row
2434          nsend_col = nrec_col
2435          ! deallocate receiving buffer
2436          if( .not. keep_fix_buf ) then
2437            SAFE_DEALLOCATE(buf_rec)
2438          end if
2439          !
2440          if(peinf%inode .eq. 0) then
2441            call timing%stop(timing%chi_sum_array_alloc)
2442          endif
2443
2444        end if
2445
2446    enddo ! ipe
2447    call progress_free(prog_info)
2448
2449    ! time for chi omega = 0
2450    if(peinf%inode .eq. 0) then
2451      call timing%stop(timing%chi_sum_sub_omega_0)
2452    endif
2453
2454    if(pol%nonblocking_cyclic) then
      ! after npes ring steps, buf_send holds the fully accumulated block owned
      ! by this rank; copy it into chilocal2
2456      do ispin = 1 , kp%nspin
2457        !$OMP PARALLEL DO collapse(2)
2458        do ii = 1, scal%npc
2459          do jj = 1, scal%npr
2460            this%chilocal2(jj,ii,ispin) = buf_send(jj,ii,ispin)
2461          enddo
2462        enddo
2463      end do
2464      !XXXXX this%chilocal2(:,:,:) = buf_send(:,:,:)
2465      !
2466      if(keep_fix_buf) then
2467        ! clean-up
2468        SAFE_DEALLOCATE(buf_send)
2469        SAFE_DEALLOCATE(buf_rec)
2470        SAFE_DEALLOCATE(buf_temp)
2471        !
2472        SAFE_DEALLOCATE(tmprowindex)
2473        SAFE_DEALLOCATE(tmpcolindex)
2474        SAFE_DEALLOCATE(tmprowph)
2475        SAFE_DEALLOCATE(tmpcolph)
2476        !
2477        SAFE_DEALLOCATE(this%chilocal)
2478        SAFE_DEALLOCATE(this%gmetempr)
2479        SAFE_DEALLOCATE(this%gmetempc)
2480      else
2481        ! standard case
2482        SAFE_DEALLOCATE(buf_send)
2483      end if
2484    end if
2485
2486    SAFE_ALLOCATE(chiRDyn_local, (scal%npr,scal%npc))
2487    chiRDyn_local(:,:) = this%chilocal2(:,:,1)
2488    if(kp%nspin.eq.2)  chiRDyn_local(:,:) = chiRDyn_local(:,:) + this%chilocal2(:,:,2)
2489
2490    ! make a copy of chi at omega zero (only in case full chi is needed)
2491    if(pol%need_full_chi) then
2492      SAFE_ALLOCATE(chi_omega0_save, (scal%npr,scal%npc,kp%nspin))
2493      chi_omega0_save(:,:,:) = this%chilocal2(:,:,:)
2494    end if
2495
2496    SAFE_DEALLOCATE(this%chilocal2)
2497
    ! Generate the Coulomb interaction array vcoul
2499    SAFE_ALLOCATE(isorti, (gvec%ng))
2500    SAFE_ALLOCATE(vcoul, (pol%nmtx))
2501
2502    vcoul(:)=0.0d0
2503    do i=1,gvec%ng
2504      isorti(pol%isrtx(i)) = i
2505    end do
2506
2507    avgcut=TOL_ZERO
2508    iscreen=0
2509    nfk=product(kp%kgrid(1:3))
2510    q0vec=0d0
2511    iparallel=1
2512
2513    if(peinf%inode .eq. 0) then
2514      call timing%start(timing%chi_sum_sub_vcoul)
2515    endif
2516
2517
2518    qgrid(:)=1
2519
2520    epsheaddummy=0.0d0
2521    wcoul0=0.0d0
2522
2523    if(pol%nfreq_group .eq. 1) then
2524      call vcoul_generator(pol%icutv,pol%truncval,gvec,crys%bdot,crys%celvol, &
2525        nfk,pol%nmtx,pol%isrtx,iscreen,q0,q0vec,vcoul, &
2526        pol%iwritecoul,iparallel,avgcut,oneoverq,qgrid,epsheaddummy, &
2527        work_scell,.false.,wcoul0)
2528    else
2529      call vcoul_generator(pol%icutv,pol%truncval,gvec,crys%bdot,crys%celvol, &
2530        nfk,pol%nmtx,pol%isrtx,iscreen,q0,q0vec,vcoul, &
2531        pol%iwritecoul,iparallel,avgcut,oneoverq,qgrid,epsheaddummy, &
2532        work_scell,.false.,wcoul0,nfreq_group=pol%nfreq_group)
2533    endif
2534
2535    if(peinf%inode .eq. 0) then
2536      call timing%stop(timing%chi_sum_sub_vcoul)
2537    endif
2538
2539
2540    ! calculate symmetrized polarizability
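    ! chi_sym(G,G') = sqrt(v(G)) * chi(G,G') * sqrt(v(G')); indxl2g maps the local
    ! ScaLAPACK indices to global G indices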
2541    do igp_l = 1, scal%npc
2542      igp_g = indxl2g(igp_l, scal%nbl, scal%mypcol, 0, scal%npcol)
2543      vc = sqrt(vcoul(igp_g))
2544      do ig_l = 1, scal%npr
2545        ig_g = indxl2g(ig_l, scal%nbl, scal%myprow, 0, scal%nprow)
2546        chiRDyn_local(ig_l,igp_l) = chiRDyn_local(ig_l,igp_l) * sqrt(vcoul(ig_g)) * vc
2547      enddo
2548    enddo
2549
    ! Dump (write) or read back the distributed matrix if requested
2551    if( do_dump_matrix ) then
2552      call dump_matrix(scal, chiRDyn_local, peinf%inode, 100000*iq)
2553    else
2554      if( do_read_dump_matrix ) then
2555        chiRDyn_local = ZERO
2556        call read_dump_matrix(scal, chiRDyn_local, peinf%inode, 100000*iq)
2557      end if
2558    end if
2559
2560    ! copy the symmetrized chi for omega = 0 only if we work within the subspace
2561    IF(.NOT.pol%need_full_chi) THEN
2562      SAFE_ALLOCATE(pol%chiRDyn_sym_omega0, (scal%npr,scal%npc))
2563      pol%chiRDyn_sym_omega0(:,:) = chiRDyn_local(:,:)
2564    END IF
2565
2566    !XXXXXXXXXXXXXXXXx
2567    ! ALLOCATE(chilocal(pol%nmtx,pol%nmtx))
2568    ! chilocal = ZERO
2569    ! icurr=0
2570    ! do i=1,pol%nmtx
2571    !   irow=MOD(INT(((i-1)/scal%nbl)+TOL_SMALL),scal%nprow)
2572    !   if(irow.ne.scal%myprow) cycle
2573    !   vc = SQRT(vcoul(i))
2574    !   do j=1,pol%nmtx
2575    !     icol=MOD(INT(((j-1)/scal%nbl)+TOL_SMALL),scal%npcol)
2576    !     if(icol .eq. scal%mypcol) then
2577    !       icurr=icurr+1
2578    !       irowm=INT((icurr-1)/scal%npc+TOL_SMALL)+1
2579    !       icolm=MOD((icurr-1),scal%npc)+1
2580
2581    !       chilocal(i,j) = chiRDyn_local(irowm,icolm)
2582    !     endif
2583    !   end do
2584    ! end do
2585    ! CALL MPI_ALLREDUCE(MPI_IN_PLACE,chilocal(1,1),pol%nmtx*pol%nmtx,MPI_SCALAR,MPI_SUM,MPI_COMM_WORLD,mpierr)
2586    ! IF(peinf%inode.eq.0) THEN
2587    !   WRITE(3000,*) pol%nmtx
2588    !   DO j = 1, pol%nmtx
2589    !     DO i = 1, pol%nmtx
2590    !       WRITE(3000,*) i, j, chilocal(i,j)
2591    !     END DO
2592    !   END DO
2593    ! END IF
2594    ! DEALLOCATE(chilocal)
2595    !XXXXXXXXXXXXXXXXx
2596
2597    ! diagonalize
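    ! diagonalize the symmetrized chi(omega=0); eigenvectors whose |eigenvalue| is at
    ! least pol%chi_eigenvalue_cutoff relative to the largest |eigenvalue| define the
    ! subspace basis (dimension neig_sub) used below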
2598    if(peinf%inode .eq. 0) then
2599      call timing%start(timing%chi_sum_sub_diag)
2600    endif
2601
2602    SAFE_ALLOCATE(pol%eigenvect_omega0, (scal%npr,scal%npc))
2603    SAFE_ALLOCATE(pol%eigenval_omega0, (pol%nmtx))
2604
2605#ifdef USEELPA
2606#ifdef CPLX
2607    IF(pol%use_elpa) THEN
2608      CALL diagonalize_elpa(pol%nmtx, scal, chiRDyn_local, &
2609                            pol%chi_eigenvalue_cutoff, pol%neig_sub_input, neig_sub, &
2610                            pol%eigenvect_omega0, pol%eigenval_omega0)
2611    ELSE
2612#endif
2613#endif
2614      CALL diagonalize_scalapack(pol%nmtx, scal, chiRDyn_local, &
2615                                 pol%chi_eigenvalue_cutoff, pol%neig_sub_input, neig_sub, &
2616                                 pol%eigenvect_omega0, pol%eigenval_omega0)
2617#ifdef USEELPA
2618#ifdef CPLX
2619    END IF
2620#endif
2621#endif
2622
2623    SAFE_DEALLOCATE(chiRDyn_local)
2624    if(peinf%inode .eq. 0) then
2625      call timing%stop(timing%chi_sum_sub_diag)
2626    endif
2627
2628
2629#ifdef MPI
2630    call MPI_barrier(MPI_COMM_WORLD,mpierr)
2631#endif
2632
    ! here we have the eigenvectors at omega=0; redistribute gme into ScaLAPACK format
    ! using the easy communication scheme
2635    CALL calc_pol_sub_trunc_easy(this,pol,scal,kp,kpq,vwfn,cwfn,&
2636                                 nst,nrk,indt,pht,gvec,crys,q0,&
2637                                 neig_sub, pol%eigenvect_omega0, pol%eigenval_omega0, vcoul)
2638
2639    ! replace the polarizability at omega zero with the non-truncated chi
2640    ! (only in case full chi is needed)
2641    if(pol%need_full_chi) then
2642      DO ispin = 1, kp%nspin
2643        pol%chiRDyn(:,:,1,ispin) = chi_omega0_save(:,:,ispin)
2644      END DO
2645      SAFE_DEALLOCATE(chi_omega0_save)
      ! in this case we don`t need the eigenvectors and eigenvalues
2647      SAFE_DEALLOCATE(pol%eigenvect_omega0)
2648      SAFE_DEALLOCATE(pol%eigenval_omega0)
2649    else
      ! this is the case in which we work only within the subspace (except for omega = 0);
      ! save the Coulomb potential so it can be reused in epsinv
2653      SAFE_ALLOCATE(pol%vcoul_sub, (pol%nmtx))
2654      pol%vcoul_sub(:) = vcoul(:)
2655    end if
2656#ifdef MPI
2657    call MPI_barrier(MPI_COMM_WORLD,mpierr)
2658#endif
2659
2660    POP_SUB(chi_summation_sub_trunc)
2661
2662  end subroutine chi_summation_sub_trunc
2663
2664#ifdef USEELPA
2665#ifdef CPLX
2666  subroutine diagonalize_elpa(nmtx, scal, matrix, chi_eigenvalue_cutoff, neig_sub_input,&
2667                              neig_sub_out, eigen_vect, eigenval)
2668  integer, intent(in) :: nmtx
2669  type (scalapack), intent(in) :: scal
2670  SCALAR, intent(inout) :: matrix(scal%npr,scal%npc)
2671  SCALAR, intent(out)   :: eigen_vect(scal%npr,scal%npc)
2672  real(DP), intent(out) :: eigenval(nmtx)
2673  real(DP)              :: chi_eigenvalue_cutoff, max_abs_eigen
2674  INTEGER               :: neig_sub_input, neig_sub_out
2675
2676  real(DP)              :: scaling
2677  integer               :: neig_sub, neig_sub_temp, i
2678
2679  PUSH_SUB(diagonalize_elpa)
2680
2681  ! if not given in input use 25% of eigenvectors (fixed number, move in input?)
2682  scaling = 0.25D+00
2683  neig_sub = MIN(nmtx,INT(nmtx/scaling)+1)
2684  IF(neig_sub_input > 0) THEN
2685    neig_sub = MIN(neig_sub_input,nmtx)
2686  END IF
2687  IF(peinf%inode .eq. 0) then
2688    write(6,'(1x,A)')      'Diagonalization with ELPA'
2689    write(6,'(1x,A,e8.2)') 'EPS Truncation = ', chi_eigenvalue_cutoff
2690    write(6,'(1x,A,F8.2)') 'Scaling =        ', scaling
2691    write(6,'(1x,A,I8)')   'Scaled basis =   ', neig_sub
2692  END IF
2693
2694  CALL diagonalize_elpa_complex(nmtx, neig_sub, scal, matrix, eigen_vect, eigenval)
2695
2696  ! truncate
2697  neig_sub_temp = 0
2698  max_abs_eigen = MAXVAL(ABS(eigenval))
2699  DO i = 1, neig_sub
2700    IF(ABS(eigenval(i))/max_abs_eigen >= chi_eigenvalue_cutoff) neig_sub_temp = neig_sub_temp + 1
2701    ! IF(ABS(eigen(i)) >= chi_eigenvalue_cutoff) neig_sub_temp = neig_sub_temp + 1
2702    ! IF(peinf%inode .eq. 0) WRITE(2000,*) eigen(i)
2703  END DO
2704  neig_sub = neig_sub_temp
2705  IF(peinf%inode .eq. 0) then
2706     write(6,*) 'N eigen with relative MAX ABS value >= chi_eigenvalue_cutoff', neig_sub
2707  END IF
2708  neig_sub_out = neig_sub
2709
2710  POP_SUB(diagonalize_elpa)
2711
2712  end subroutine diagonalize_elpa
2713#endif
2714#endif
2715
2716  subroutine diagonalize_scalapack(nmtx, scal, matrix, chi_eigenvalue_cutoff, neig_sub_input,&
2717                                   neig_sub_out, eigen_vect, eigenval)
2718  integer, intent(in) :: nmtx
2719  type (scalapack), intent(in) :: scal
2720  SCALAR, intent(inout) :: matrix(scal%npr,scal%npc)
2721  complex(DPC), intent(out)  :: eigen_vect(scal%npr,scal%npc)
2722  real(DP), intent(out) :: eigenval(nmtx)
2723  real(DP)              :: chi_eigenvalue_cutoff
2724  INTEGER               :: neig_sub_input, neig_sub_out
2725
2726  integer :: info, desca(9), lwork, lrwork, liwork, ipiv(scal%npr+scal%nbl)
2727  integer :: descb(9)
2728  integer, allocatable :: iwork(:)
2729  SCALAR, allocatable :: work(:)
2730
2731  SCALAR, allocatable :: work_2(:,:)
2732  DOUBLE PRECISION, ALLOCATABLE :: eigen(:), rwork(:)
2733  DOUBLE PRECISION :: vl, vu
2734  INTEGER :: iii, jjj, jj, i, j, icurr, irow, icol, irowm, icolm
2735  INTEGER :: neig_sub, neig_sub_temp
2736  DOUBLE PRECISION :: scaling, max_abs_eigen
2737  DOUBLE PRECISION :: ABSTOL, ORFAC
2738  DOUBLE PRECISION, ALLOCATABLE :: gap(:)
2739  INTEGER, ALLOCATABLE :: iclustr(:), ifail(:)
2740  INTEGER :: Neigen_found, Neigenvect_found, clustersize, locsize, nbc, nbr
2741  INTEGER :: nn, nnp, np0, mq0, nq0
2742  character(len=100) :: tmpstr
2743  logical :: find_eigen_in_range
2744
2745  PUSH_SUB(diagonalize_scalapack)
2746
2747  call descinit(desca, nmtx, nmtx, scal%nbl, scal%nbl, 0, 0, &
2748                scal%icntxt, max(1,scal%npr), info)
2749  if(info < 0) then
2750    write(0,'(a,i3,a)') 'Argument number ', -info, ' had an illegal value on entry.'
2751    call die("descinit error for descaA in inversion")
2752  else if(info > 0) then
2753    write(0,*) 'info = ', info
2754    call die("descinit error for descaA in inversion")
2755  endif
2756
2757  SAFE_ALLOCATE(work_2, (scal%npr,scal%npc))
2758  work_2 = matrix
2759
2760  find_eigen_in_range = .TRUE.
2761
2762  scaling = 1.0D+00
2763  neig_sub = MIN(nmtx,INT(nmtx/scaling)+1)
2764  IF(neig_sub_input > 0) THEN
2765    neig_sub = MIN(neig_sub_input,nmtx)
2766    find_eigen_in_range = .FALSE.
2767  END IF
2768  IF(peinf%inode .eq. 0) then
2769    write(6,'(1x,A)')      'Diagonalization with Scalapack'
2770    write(6,'(1x,A,e8.2)') 'EPS Truncation = ', chi_eigenvalue_cutoff
2771    write(6,'(1x,A,F8.2)') 'Scaling =        ', scaling
2772    write(6,'(1x,A,I8)')   'Scaled basis =   ', neig_sub
2773  END IF
2774
2775  vl = -100.0D+00
2776  vu =   -ABS(chi_eigenvalue_cutoff)
2777  ABSTOL = 0.0D+00
2778  ORFAC  = 5.0D-7
2779  ! ABSTOL = 1.0D-7
2780  ! ORFAC  = 5.0D-7
2781  Neigen_found = 0
2782  Neigenvect_found = 0
2783
2784  clustersize = MIN(50,neig_sub)
2785  ! clustersize = nmtx/MAX(INT(SQRT(DBLE(scal%npcol*scal%nprow))),1)
2786  nbc = nmtx/(scal%nbl * scal%npcol) + 1
2787  nbr = nmtx/(scal%nbl * scal%nprow) + 1
2788  locsize = scal%nbl * scal%nbl * nbc * nbr
2789  nn  = MAX(nmtx, scal%nbl, 2)
2790  nnp = MAX(nmtx, scal%nprow*scal%npcol+1, 4)
2791  np0 = numroc(nn, scal%nbl, 0, 0, scal%nprow)
2792  mq0 = numroc(MAX(neig_sub,scal%nbl,2), scal%nbl, 0, 0, scal%npcol)
2793
2794#ifdef CPLX
2795  nq0    = numroc(nn, scal%nbl, 0, 0, scal%npcol)
2796  lwork  = nmtx + (np0+nq0+scal%nbl)*scal%nbl
2797  lrwork = 4*nmtx + MAX(5*nn,np0*mq0) + iceil(neig_sub,scal%nprow*scal%npcol)*nn + &
2798           (clustersize-1)*nmtx
2799#else
2800  lwork  = 5*nmtx + MAX(5*nn,np0*mq0+2*scal%nbl*scal%nbl) + &
2801           iceil(neig_sub,scal%nprow*scal%npcol)*nn + (clustersize-1)*nmtx
2802  lrwork = 0
2803#endif
2804  liwork = 6*nnp
2805
2806  SAFE_ALLOCATE(work, (lwork))
2807#ifdef CPLX
2808  SAFE_ALLOCATE(rwork, (lrwork))
2809#endif
2810  SAFE_ALLOCATE(iwork, (liwork))
2811  SAFE_ALLOCATE(iclustr, (2*scal%nprow*scal%npcol))
2812  SAFE_ALLOCATE(gap, (scal%nprow*scal%npcol))
2813  SAFE_ALLOCATE(ifail, (nmtx))
2814  SAFE_ALLOCATE(eigen, (nmtx))
2815  eigen = 0.0D+00
2816
2817  info = 0
2818#ifdef CPLX
2819  IF(find_eigen_in_range) THEN
2820    CALL pzheevx('V', 'V', 'U', nmtx, work_2, 1, 1, desca, vl, vu, 1, neig_sub, ABSTOL, Neigen_found, Neigenvect_found, &
2821                 eigen, ORFAC,  matrix, 1, 1, desca, work, lwork, rwork, lrwork, iwork, liwork, ifail, iclustr, &
2822                 gap, info)
2823  ELSE
2824    CALL pzheevx('V', 'I', 'U', nmtx, work_2, 1, 1, desca, vl, vu, 1, neig_sub, ABSTOL, Neigen_found, Neigenvect_found, &
2825                 eigen, ORFAC,  matrix, 1, 1, desca, work, lwork, rwork, lrwork, iwork, liwork, ifail, iclustr, &
2826                 gap, info)
2827  END IF
2828#else
2829  CALL pdsyevx('V', 'I', 'U', nmtx, work_2, 1, 1, desca, vl, vu, 1, neig_sub, ABSTOL, Neigen_found, Neigenvect_found, &
2830               eigen, ORFAC,  matrix, 1, 1, desca, work, lwork, iwork, liwork, ifail, iclustr, &
2831               gap, info)
2832#endif
2833
2834  IF(info/=0) THEN
2835    IF(info < 0) THEN
2836      call die(" error in parameters for pzheevx/pdsyevx")
2837    ELSE
2838      ! Source: http://www.netlib.org/lapack-dev/Patch/SRC/pzheevx.f
2839      ! if (MOD(INFO,2).NE.0), then one or more eigenvectors failed to converge.  Their indices are stored
2840      ! in IFAIL.  Ensure ABSTOL=2.0*PDLAMCH( 'U' ). Send e-mail to scalapack@cs.utk.edu
2841      if(mod(info, 2) /= 0) then
2842        if(peinf%inode == 0) then
2843          write(0,*) " info = ", info
2844          write(0,'(a)',advance='no') "Unconverged eigenvalues from ifail: "
2845          do jj = 1, size(ifail)
2846            if(ifail(jj) == 0) exit
2847            write(0,'(i8)',advance='no') ifail(jj)
2848          enddo
2849        endif
2850        call die("Convergence problems in pzheevx/pdsyevx.")
2851      endif
2852
2853      ! if (MOD(INFO/2,2).NE.0),then eigenvectors corresponding to one or more clusters of eigenvalues could not be
2854      ! reorthogonalized because of insufficient workspace. The indices of the clusters are stored in the array ICLUSTR.
2855      if(mod(info / 2, 2) /= 0) then
2856        if(peinf%inode == 0) then
2857          write(0,*) " info = ", info
2858          do jj = 1, size(iclustr)
2859            if(iclustr(jj) == 0) exit
2860            write(0,'(i8)',advance='no') iclustr(jj)
2861          enddo
2862        endif
2863        call die("Could not reorthogonalize due to insufficient workspace in pzheevx/pdsyevx.")
2864      endif
2865
2866      ! if (MOD(INFO/4,2).NE.0), then space limit prevented PZHEEVX from computing all of the eigenvectors
2867      ! between VL and VU.  The number of eigenvectors computed is returned in NZ.
2868      if(mod(info / 4, 2) /= 0) then
2869        if(peinf%inode == 0) then
2870          write(0,*) " info = ", info
2871          write(0,*) " number eigenvalues found = ", Neigenvect_found
2872        endif
2873        call die("Space limit prevented computing all eigenvectors in pzheevx/pdsyevx.")
2874      endif
2875
2876      ! if (MOD(INFO/8,2).NE.0), then PZSTEBZ failed to compute eigenvalues.  Ensure ABSTOL=2.0*PDLAMCH( 'U' )
2877      ! Send e-mail to scalapack@cs.utk.edu
2878      if(mod(info / 8, 2) /= 0) then
2879        if(peinf%inode == 0) then
2880          write(0,*) " info = ", info
2881        endif
2882        call die("PZSTEBZ failed to compute eigenvalues in pzheevx/pdsyevx.")
2883      endif
2884    END IF
2885  END IF
2886
2887  SAFE_DEALLOCATE(work)
2888#ifdef CPLX
2889  SAFE_DEALLOCATE(rwork)
2890#endif
2891  SAFE_DEALLOCATE(iwork)
2892  SAFE_DEALLOCATE(iclustr)
2893  SAFE_DEALLOCATE(gap)
2894  SAFE_DEALLOCATE(ifail)
2895
2896  SAFE_DEALLOCATE(work_2)
2897
2898  IF(Neigen_found/=neig_sub) THEN
2899    neig_sub = Neigen_found
2900    ! print a warning
2901    IF(peinf%inode .eq. 0) then
2902      write(6,*) 'Eigenvalue found = ', neig_sub
2903    END IF
2904  END IF
2905  IF(Neigenvect_found < neig_sub) THEN
2906    write(tmpstr,'(a, i10, a, i10, a)') 'Diagonalization with pzheevx/pdsyevx failed: only ', &
2907       Neigenvect_found, ' of ', neig_sub, ' eigenvectors found.'
2908    call die(tmpstr)
2909  END IF
2910
2911  ! truncate
2912  neig_sub_temp = 0
2913  max_abs_eigen = MAXVAL(ABS(eigen))
2914  DO i = 1, neig_sub
2915    IF(ABS(eigen(i))/max_abs_eigen >= chi_eigenvalue_cutoff) neig_sub_temp = neig_sub_temp + 1
2916    ! IF(ABS(eigen(i)) >= chi_eigenvalue_cutoff) neig_sub_temp = neig_sub_temp + 1
2917    ! IF(peinf%inode .eq. 0) WRITE(2000,*) eigen(i)
2918  END DO
2919  neig_sub = neig_sub_temp
2920
2921  IF(peinf%inode .eq. 0) then
2922     write(6,*) 'N eigen with relative MAX ABS value >= chi_eigenvalue_cutoff', neig_sub
2923  END IF
2924  neig_sub_out = neig_sub
2925
  ! at this point, matrix contains the eigenvectors;
  ! make a copy into eigen_vect (eigenvectors are declared complex)
2928  eigen_vect = (0D0,0D0)
2929  eigen_vect = matrix
2930  eigenval = eigen
2931
2932  SAFE_DEALLOCATE(eigen)
2933
2934  POP_SUB(diagonalize_scalapack)
2935
2936  end subroutine diagonalize_scalapack
2937
  ! This uses the same communication scheme as comm_matrix,
  ! in which the eigenvectors of the symmetrized polarizability are
  ! replicated on each MPI task
2941  subroutine calc_pol_sub_trunc_easy(this,pol,scal,kp,kpq,vwfn,cwfn,&
2942                                     nst,nrk,indt,pht,gvec,crys,q0,&
2943                                     neig_sub, eigen_vect, eigenval, vcoul)
2944    type(chi_summator_t), intent(INOUT) :: this
2945    type(polarizability), intent(INOUT) :: pol
2946    type(scalapack), intent(in) :: scal
2947    type(kpoints), intent(IN) :: kp, kpq
2948    type(valence_wfns), intent(IN) :: vwfn
2949    type(conduction_wfns), intent(IN) :: cwfn
2950    INTEGER :: neig_sub
2951    complex(DPC), allocatable :: eigen_vect(:,:)
2952    real(DP), allocatable :: eigenval(:)
2953    real(DP), allocatable :: vcoul(:)
2954
2955    integer, intent(IN) :: nst(:)
2956    integer, intent(IN) :: nrk
2957    integer, intent(INOUT) :: indt(:,:,:)
2958    SCALAR,  intent(INOUT) :: pht(:,:,:)
2959    type (gspace), intent(in) :: gvec
2960    type (crystal), intent(in) :: crys
2961    real(DP), intent(in) :: q0(3)
2962    real(DP) :: zvalue, cv_energy
2963    complex(DPC), allocatable :: edenDRtemp(:)
2964
2965    INTEGER :: ntot, ntot2
2966    INTEGER :: irk, ipe, ipe_row, ipe_col, ipe_real, ispin, itot, it, iv, ic, i, j, iv_local
2967    INTEGER :: irow, icol, irowm, icolm, icurr, ilimit, freq_idx, &
2968               i_myband, mytot
2969
2970    integer, allocatable :: tmprowindex(:),tmpcolindex(:)
2971    SCALAR, allocatable :: tmprowph(:),tmpcolph(:)
2972    type(cvpair_info) :: cvpair_temp
2973    complex(DPC) :: negfact
2974
2975    SCALAR :: epsheaddummy, wcoul0
2976    type (twork_scell) :: work_scell
2977    integer, allocatable :: isorti(:)
2978    real(DP) :: vc, oneoverq, avgcut
2979    integer :: iscreen, nfk, iparallel
2980    integer :: qgrid(3)
2981    real(DP) :: q0vec(3)
2982
2983    type(progress_info) :: prog_info
2984
2985    INTEGER :: n_vc_tot, n_v_tot, n_c_tot, nmtx, kk, jj, ii, info
2986    INTEGER :: my_sub_nlocal_col, my_sub_nlocal_row, itot_rk2, &
2987      irow_global, icol_global
2988
2989    complex(DPC), allocatable :: C_aux(:,:), gme_sub(:,:,:,:,:), gme_temp(:,:,:)
2990    complex(DPC), allocatable :: C_Pgemm(:,:)
2991    integer, allocatable :: grid2D_2inode(:,:), row_col_loc(:,:), inode_2grid2D(:,:)
2992    INTEGER :: pe_col_size, pe_row_size, desc_sub(9), desca(9)
2993    INTEGER, ALLOCATABLE :: row_indx(:), col_indx(:)
2994    ! variables for block transformation
2995    integer :: iproc_dum
2996    integer :: nmpinode
2997    integer :: iblock, block_size_max, ib_start, ib_end, ib_size, Nblocks
2998    real(DP) :: mem, avail_mem, mem_offset, mem_single_block
2999    ! variables for non-blocking cyclic scheme
3000    integer :: isend_static, irec_static
3001    integer :: actual_send, actual_rec
3002    integer :: nsend_row, nsend_col, nrec_row, nrec_col
3003    integer :: req_send, tag_send, req_rec, tag_rec
3004    integer :: stat(MPI_STATUS_SIZE)
3005    complex(DPC), allocatable :: buf_send(:,:,:,:), buf_rec(:,:,:,:), buf_temp(:,:,:,:)
3006    integer :: size_N, size_M, size_K
3007    ! variables for fast eigenvectors communication
3008    logical :: fast_eig_comm
3009    complex(DPC), allocatable :: C_elem(:), C_elem_send(:), C_elem_rec(:)
3010    real(DP), allocatable :: sqrt_vcoul_local(:)
3011    integer, allocatable :: elem_per_block(:)
3012    integer, allocatable :: glob_C_indec(:,:)
3013    integer, allocatable :: glob_C_indec_send(:,:), glob_C_indec_rec(:,:)
3014    integer :: num_loc_col_within_Neig, max_num_elem, iii, jjj
3015    integer :: req_send_info, tag_send_info, req_rec_info, tag_rec_info
3016
3017    PUSH_SUB(calc_pol_sub_trunc_easy)
3018
3019    fast_eig_comm = .true.
3020    if( pol%sub_collective_eigen_redistr ) then
3021      ! slow redistribution scheme, for debug only
3022      fast_eig_comm = .false.
3023    end if
3024
3025    ! precompute the dimensions of the different objects
3026    ntot=0
3027    ntot2=0
3028    ntot = peinf%nvownactual*peinf%ncownactual
3029    do irk = 1, nrk
3030      ntot2=ntot2 + nst(irk)
3031    enddo
3032    ntot=ntot*ntot2
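    ! ntot2 = total number of symmetry-related k-points (sum of nst over irk);
    ! ntot  = locally owned (v,c) pairs times ntot2, i.e. the number of transition
    !         matrix elements this process contributes to the chi summation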
3033
3034    ! WRITE(*,*) ntot2, nrk, maxval(nst), minval(nst)
3035
3036    n_c_tot = cwfn%nband
3037    n_v_tot = vwfn%nband+pol%ncrit
3038    n_vc_tot = n_c_tot * n_v_tot
3039    nmtx = pol%nmtx
3040
    ! first replicate the eigenvectors of the symmetrized polarizability at omega=0
3042    ! on all processes (do it by blocks)
3043    ! calculate block size according to available memory
3044    call procmem(mem,nmpinode)
3045    avail_mem = mem
3046    mem_offset = dble(neig_sub) * dble(peinf%ncownmax) * dble(peinf%nvownmax) * &
3047                 dble(ntot2) * dble(kp%nspin) * 16D+00
3048    mem_single_block = MAX(dble(neig_sub)*2.0D+00, dble(neig_sub + peinf%ncownmax*peinf%nvownmax)) * 16D+00
    !XXXXX  factor 2 (the in-place allreduce probably allocates an additional buffer internally)
3050    mem_single_block = mem_single_block * 2.0D+00
3051    !XXXXX
3052    block_size_max = int((avail_mem - mem_offset) / mem_single_block)
3053    block_size_max = MAX(block_size_max,1)
3054    block_size_max = MIN(block_size_max,nmtx)
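    ! block_size_max = floor((avail_mem - mem_offset)/mem_single_block), clamped to
    ! [1, nmtx]; the estimates above assume 16 bytes per double-complex element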
3055    if(peinf%inode .eq. 0) then
3056      write(6,*)
3057      write(6,'(1x,"Basis Transformation info:")')
3058      write(6,'(1x,"Memory available: ",f0.1," MB per PE")') avail_mem / 1024**2
3059      write(6,'(1x,"Memory offset: ",f0.1," MB")') mem_offset / 1024**2
      write(6,'(1x,"Memory for a single block: ",f0.1," kB")') mem_single_block / 1024
3061      write(6,'(1x,"Max Block Size: ",i0)') block_size_max
3062    end if
    ! broadcast the result to make sure all processes use the same value
3064#ifdef MPI
3065    call MPI_Bcast(block_size_max, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, mpierr)
3066#endif
3067    Nblocks = (nmtx-1) / block_size_max + 1
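    ! i.e. Nblocks = ceiling(nmtx/block_size_max); each block of at most block_size_max
    ! G-vector rows of the eigenvector matrix is replicated and transformed in turn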
3068    SAFE_ALLOCATE(gme_sub, (neig_sub,peinf%ncownmax,peinf%nvownmax,ntot2,kp%nspin))
3069    gme_sub = ZERO
3070
    ! initialize bookkeeping for the fast eigenvector communication (fast_eig_comm)
3072    if ( fast_eig_comm ) then
      ! count how many local columns have a global index not larger than neig_sub
3074      num_loc_col_within_Neig = 0
3075      do icolm = 1, scal%npc
3076        j = INDXL2G (icolm, scal%nbl, scal%mypcol, 0, scal%npcol)
3077        if ( j <= neig_sub ) then
3078          num_loc_col_within_Neig = num_loc_col_within_Neig + 1
3079        end if
3080      end do
      ! determine the largest number of elements that will be exchanged and the
      ! actual number of elements for each block
3083      SAFE_ALLOCATE(elem_per_block, (Nblocks))
3084      elem_per_block = 0
3085      iii = 0
3086      do iblock = 1, nmtx, block_size_max
3087        ib_start = iblock
3088        ib_end = ib_start + block_size_max - 1
3089        ib_end = MIN(ib_end, nmtx)
3090        ib_size = ib_end - ib_start + 1
3091        ! increase block index
3092        iii = iii + 1
        ! count how many local rows fall in the range [ib_start,ib_end] and
        ! accumulate the corresponding number of local columns
3095        do i = ib_start, ib_end
3096          irow = INDXG2P( i, scal%nbl, iproc_dum, 0, scal%nprow)
3097          if ( irow .ne. scal%myprow) cycle
3098          elem_per_block(iii) = elem_per_block(iii) + num_loc_col_within_Neig
3099        end do
3100      end do
3101      max_num_elem = MAXVAL(elem_per_block(:))
3102      call mpi_allreduce(MPI_IN_PLACE, max_num_elem, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, mpierr)
3103      ! allocate communication buffers
3104      SAFE_ALLOCATE(C_elem, (max_num_elem))
3105      SAFE_ALLOCATE(C_elem_send, (max_num_elem))
3106      SAFE_ALLOCATE(C_elem_rec, (max_num_elem))
3107      SAFE_ALLOCATE(glob_C_indec, (2,max_num_elem+1))
3108      SAFE_ALLOCATE(glob_C_indec_send, (2,max_num_elem+1))
3109      SAFE_ALLOCATE(glob_C_indec_rec, (2,max_num_elem+1))
      ! precompute the square root of the Coulomb potential
3111      SAFE_ALLOCATE(sqrt_vcoul_local, (scal%npr))
3112      do irowm = 1, scal%npr
3113        i = INDXL2G (irowm, scal%nbl, scal%myprow, 0, scal%nprow)
3114        sqrt_vcoul_local(irowm) = SQRT(vcoul(i))
3115      end do
      ! static send/receive partners for the ring communication
3117      isend_static = MOD(peinf%inode + 1 + peinf%npes, peinf%npes)
3118      irec_static = MOD(peinf%inode - 1 + peinf%npes, peinf%npes)
3119    end if
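    ! fast_eig_comm ring scheme: for every block of G-vectors each process packs its
    ! locally owned eigenvector elements (scaled by sqrt(vcoul)) whose global column
    ! index is <= neig_sub, together with their global (row,column) indices, and the
    ! packed buffers are passed around the ring of peinf%npes processes, so that each
    ! process assembles the full replicated block C_aux(ib_size,neig_sub)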
3120
3121    call progress_init(prog_info, 'transforming basis', 'block', Nblocks)
3122    icurr=0
3123    iii = 0
3124    do iblock = 1, nmtx, block_size_max
3125      call progress_step(prog_info)
      ! actual block indices
3127      ib_start = iblock
3128      ib_end = ib_start + block_size_max - 1
3129      ib_end = MIN(ib_end, nmtx)
3130      ib_size = ib_end - ib_start + 1
3131      ! allocate C_aux
3132      SAFE_ALLOCATE(C_aux, (ib_size,neig_sub))
3133      C_aux = ZERO
3134
3135      if ( fast_eig_comm ) then
3136        ! increase block index
3137        iii = iii + 1
3138        ! copy information into communication buffers
3139        jjj = 0
3140        do i = ib_start, ib_end
3141          irow = INDXG2P( i, scal%nbl, iproc_dum, 0, scal%nprow)
3142          if ( irow .ne. scal%myprow) cycle
3143          irowm = INDXG2L(i, scal%nbl, scal%myprow, 0, scal%nprow)
3144          vc = sqrt_vcoul_local(irowm)
3145          do icolm = 1, scal%npc
3146            j = INDXL2G (icolm, scal%nbl, scal%mypcol, 0, scal%npcol)
3147            if ( j <= neig_sub ) then
3148              jjj = jjj + 1
3149              C_elem(jjj) = eigen_vect(irowm,icolm) * vc
3150              glob_C_indec(1,jjj) = i
3151              glob_C_indec(2,jjj) = j
3152            end if
3153          end do
3154        end do
3155        glob_C_indec(1,max_num_elem+1) = elem_per_block(iii)
3156
3157        !XXX if ( jjj .ne. elem_per_block(iii) ) STOP
3158
        ! loop over processes and start the ring communication
3160        glob_C_indec_send(:,:) = glob_C_indec(:,:)
3161        C_elem_send(:) = C_elem(:)
3162
3163        if(peinf%inode .eq. 0) then
3164          call timing%start(timing%chi_sum_sub_eigvet_comm)
3165        endif
3166
3167        do ipe = 1, peinf%npes
3168          !
3169          tag_rec_info = 1
3170          tag_send_info = 1
3171          req_rec_info = MPI_REQUEST_NULL
3172          req_send_info = MPI_REQUEST_NULL
3173          CALL MPI_Irecv(glob_C_indec_rec, 2*(max_num_elem+1), &
3174                         MPI_INTEGER, irec_static, &
3175                         tag_rec_info, MPI_COMM_WORLD, req_rec_info, mpierr)
3176          CALL MPI_Isend(glob_C_indec_send, 2*(max_num_elem+1), &
3177                         MPI_INTEGER, isend_static, &
3178                         tag_send_info, MPI_COMM_WORLD, req_send_info, mpierr)
3179          !
3180          tag_rec = 1
3181          tag_send = 1
3182          req_rec = MPI_REQUEST_NULL
3183          req_send = MPI_REQUEST_NULL
3184          CALL MPI_Irecv(C_elem_rec, max_num_elem, &
3185                         MPI_COMPLEX_DPC, irec_static, &
3186                         tag_rec, MPI_COMM_WORLD, req_rec, mpierr)
3187          CALL MPI_Isend(C_elem_send, max_num_elem, &
3188                         MPI_COMPLEX_DPC, isend_static, &
3189                         tag_send, MPI_COMM_WORLD, req_send, mpierr)
3190
3191
3192          do jjj = 1, glob_C_indec(1,max_num_elem+1)
3193            i = glob_C_indec(1,jjj)
3194            j = glob_C_indec(2,jjj)
3195            C_aux(i-ib_start+1,j) = C_elem(jjj)
3196          end do
3197
          ! make sure the index buffer has been received/sent
3199          CALL MPI_Wait(req_rec_info,stat,mpierr)
3200          CALL MPI_Wait(req_send_info,stat,mpierr)
3201          !
3202          glob_C_indec_send(:,:) = glob_C_indec_rec(:,:)
3203          glob_C_indec(:,:)      = glob_C_indec_rec(:,:)
3204          !
3205          CALL MPI_Wait(req_rec,stat,mpierr)
3206          CALL MPI_Wait(req_send,stat,mpierr)
3207          !
3208          C_elem_send(:) = C_elem_rec(:)
3209          C_elem(:)      = C_elem_rec(:)
3210          !
3211        end do
3212        if(peinf%inode .eq. 0) then
3213          call timing%stop(timing%chi_sum_sub_eigvet_comm)
3214        endif
3215
3216      else ! fast_eig_comm
3217        !
3218        if ( .true. ) then
3219          !
3220          do i = ib_start, ib_end
3221            irow = INDXG2P( i, scal%nbl, iproc_dum, 0, scal%nprow)
3222            if ( irow .ne. scal%myprow) cycle
3223            vc = SQRT(vcoul(i))
3224            irowm = INDXG2L(i, scal%nbl, scal%myprow, 0, scal%nprow)
3225            !
3226            do icolm = 1, scal%npc
3227              j = INDXL2G (icolm, scal%nbl, scal%mypcol, 0, scal%npcol)
3228              if ( j <= neig_sub ) then
3229                C_aux(i-ib_start+1,j) = eigen_vect(irowm,icolm) * vc
3230              end if
3231            end do
3232            !
3233          end do
3234          !
3235        else
3236          !
3237          ! old and slow
3238          do i = ib_start, ib_end
3239            irow=MOD(INT(((i-1)/scal%nbl)+TOL_SMALL),scal%nprow)
3240            if(irow.ne.scal%myprow) cycle
3241            vc = SQRT(vcoul(i))
3242            do j=1, nmtx
3243              icol=MOD(INT(((j-1)/scal%nbl)+TOL_SMALL),scal%npcol)
3244              if(icol .eq. scal%mypcol) then
3245                icurr=icurr+1
3246                irowm=INT((icurr-1)/scal%npc+TOL_SMALL)+1
3247                icolm=MOD((icurr-1),scal%npc)+1
3248
3249                IF(j<=neig_sub) THEN
3250                  C_aux(i-ib_start+1,j) = eigen_vect(irowm,icolm) * vc
3251                END IF
3252
3253              endif
3254            end do
3255          end do
3256          !
3257        end if
3258        ! WRITE(*,*) nmtx,neig_sub
3259        if(peinf%inode .eq. 0) then
3260          call timing%start(timing%chi_sum_sub_eigvet_comm)
3261        endif
3262
3263#ifdef MPI
3264        CALL MPI_ALLREDUCE(MPI_IN_PLACE,C_aux(1,1),ib_size*neig_sub,MPI_COMPLEX_DPC,MPI_SUM,MPI_COMM_WORLD,mpierr)
3265#endif
3266        if(peinf%inode .eq. 0) then
3267          call timing%stop(timing%chi_sum_sub_eigvet_comm)
3268        endif
3269
3270        !
3271      end if ! fast_eig_comm
3272
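      ! at this point C_aux holds rows ib_start:ib_end of the first neig_sub eigenvector
      ! columns scaled by sqrt(vcoul), replicated on every process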
3273      ! transform basis
3274      if(peinf%inode .eq. 0) then
3275        call timing%start(timing%chi_sum_sub_transf)
3276      endif
3277
3278      SAFE_ALLOCATE(gme_temp, (ib_size,peinf%ncownmax,peinf%nvownmax))
3279      gme_temp = ZERO
3280      do ispin = 1 , kp%nspin
3281        itot_rk2 = 0
3282        do irk = 1, nrk
3283          do it = 1, nst(irk)
3284            itot_rk2 = itot_rk2 + 1
3285
3286            gme_temp = ZERO
3287            !$OMP PARALLEL DO
3288            do i = ib_start, ib_end
3289              gme_temp(i-ib_start+1,1:peinf%ncownactual,1:peinf%nvownactual) = &
3290                      pol%gme(indt(i,it,irk),1:peinf%ncownactual,1:peinf%nvownactual,ispin,irk,1) * &
3291                      pht(i,it,irk)
3292            end do
3293            !$OMP END PARALLEL DO
3294
3295            do iv = 1, vwfn%nband+pol%ncrit
              ! check if I own this particular valence band
3297              if (peinf%doiownv(iv)) then
3298                ilimit = peinf%ncownactual
3299              else
3300                cycle
3301              endif
3302              iv_local = peinf%indexv(iv)
3303
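              ! accumulate the subspace projection for this G-block:
              ! gme_sub(1:neig_sub,:,iv_local,itot_rk2,ispin) += C_aux^H * gme_temp
              ! (zgemm with beta=1 sums the contributions of successive blocks)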
3304              call zgemm('c','n',neig_sub,peinf%ncownactual,ib_size, &
3305                         (1D0,0D0),C_aux(1:ib_size,1:neig_sub),ib_size,&
3306                         gme_temp(1:ib_size,1:peinf%ncownactual,peinf%indexv(iv)),ib_size,&
3307                         (1D0,0D0),gme_sub(1:neig_sub,1:peinf%ncownactual,peinf%indexv(iv),itot_rk2,ispin),neig_sub)
3308
3309            end do
3310
3311          end do
3312        end do
3313      end do
3314      SAFE_DEALLOCATE(gme_temp)
3315      SAFE_DEALLOCATE(C_aux)
3316      if(peinf%inode .eq. 0) then
3317        call timing%stop(timing%chi_sum_sub_transf)
3318      endif
3319
3320    end do !iblock
3321    call progress_free(prog_info)
3322    SAFE_DEALLOCATE_P(pol%gme)
    ! pol%gme is deallocated outside this routine; keep a dummy 1-element array allocated
3324    SAFE_ALLOCATE(pol%gme, (1,1,1,1,1,1))
3325
3326    if ( fast_eig_comm ) then
3327      SAFE_DEALLOCATE(elem_per_block)
3328      SAFE_DEALLOCATE(C_elem)
3329      SAFE_DEALLOCATE(C_elem_send)
3330      SAFE_DEALLOCATE(C_elem_rec)
3331      SAFE_DEALLOCATE(glob_C_indec)
3332      SAFE_DEALLOCATE(glob_C_indec_send)
3333      SAFE_DEALLOCATE(glob_C_indec_rec)
3334      SAFE_DEALLOCATE(sqrt_vcoul_local)
3335    end if
3336
3337
3338    !XXXXX
3339    ! initialize the descriptor for the subspace matrix
3340    my_sub_nlocal_row = numroc(neig_sub, scal%nbl, scal%myprow, 0, scal%nprow)
3341    my_sub_nlocal_col = numroc(neig_sub, scal%nbl, scal%mypcol, 0, scal%npcol)
3342    call descinit(desc_sub, neig_sub, neig_sub, scal%nbl, scal%nbl, 0, 0, &
3343                  scal%icntxt, max(1,my_sub_nlocal_row), info)
3344    if(info < 0) then
3345      write(0,'(a,i3,a)') 'Argument number ', -info, ' had an illegal value on entry.'
3346      call die("descinit error for desca sub in sub_chi_sum")
3347    end if
    ! exchange the local layout information among all processes
3349    SAFE_ALLOCATE(row_col_loc, (2,peinf%npes))
3350    SAFE_ALLOCATE(grid2D_2inode, (scal%nprow,scal%npcol))
3351    SAFE_ALLOCATE(inode_2grid2D, (2,peinf%npes))
3352    row_col_loc   = 0
3353    grid2D_2inode = 0
3354    inode_2grid2D = 0
3355    row_col_loc(1,peinf%inode+1) = my_sub_nlocal_row
3356    row_col_loc(2,peinf%inode+1) = my_sub_nlocal_col
3357    grid2D_2inode(scal%myprow+1,scal%mypcol+1) = peinf%inode
3358    inode_2grid2D(1,peinf%inode+1) = scal%myprow
3359    inode_2grid2D(2,peinf%inode+1) = scal%mypcol
3360#ifdef MPI
3361    CALL MPI_ALLREDUCE(MPI_IN_PLACE,row_col_loc(1,1),2*peinf%npes,MPI_INTEGER,MPI_SUM,MPI_COMM_WORLD,mpierr)
3362    CALL MPI_ALLREDUCE(MPI_IN_PLACE,grid2D_2inode(1,1),scal%nprow*scal%npcol,MPI_INTEGER,MPI_SUM,MPI_COMM_WORLD,mpierr)
3363    CALL MPI_ALLREDUCE(MPI_IN_PLACE,inode_2grid2D(1,1),2*peinf%npes,MPI_INTEGER,MPI_SUM,MPI_COMM_WORLD,mpierr)
3364#endif
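    ! after the reductions: row_col_loc(:,rank+1) holds each rank's local subspace
    ! (row,col) counts, grid2D_2inode maps a (prow,pcol) grid position to its rank,
    ! and inode_2grid2D is the inverse map from rank to (prow,pcol)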
3365    !XXXXX
3366
3367    freq_idx = pol%nfreq_group
3368    IF(pol%nfreq_group .gt. 1) THEN
3369      freq_idx = peinf%rank_mtxel+1
3370    END IF
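    ! freq_idx is used below as the frequency-group index into pol%edenDyn and
    ! pol%lin_edenDyn; it equals peinf%rank_mtxel+1 only when the frequencies are
    ! distributed over more than one group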
3371
    ! avoid allocating a zero-size array (for the MPI reduce?)
3373    SAFE_ALLOCATE(this%chilocal2RDyn, (MAX(1,my_sub_nlocal_row),MAX(1,my_sub_nlocal_col),pol%nfreq_in_group,kp%nspin))
3374
3375    negfact = -1D0*this%fact
3376
    ! set up the non-blocking cyclic communication (keep fixed-size buffers allocated throughout)
3378    if(pol%nonblocking_cyclic) then
3379      ! initialize buffer for non-blocking cyclic scheme
3380      ! static process for the communication
3381      isend_static = MOD(peinf%inode + 1 + peinf%npes, peinf%npes)
3382      irec_static = MOD(peinf%inode - 1 + peinf%npes, peinf%npes)
      ! buffer sizes: maximum local dimensions over all processes
3384      size_N = my_sub_nlocal_row
3385      size_M = my_sub_nlocal_col
3386      size_K = ntot
3387      call mpi_allreduce(MPI_IN_PLACE, size_N, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, mpierr)
3388      call mpi_allreduce(MPI_IN_PLACE, size_M, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, mpierr)
3389      call mpi_allreduce(MPI_IN_PLACE, size_K, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, mpierr)
3390      ! allocate only once
3391      SAFE_ALLOCATE(buf_send, (size_N, size_M, pol%nfreq_in_group, kp%nspin))
3392      SAFE_ALLOCATE(buf_rec,  (size_N, size_M, pol%nfreq_in_group, kp%nspin))
3393      SAFE_ALLOCATE(buf_temp, (size_N, size_M, pol%nfreq_in_group, kp%nspin))
3394      !
3395      do ispin = 1 , kp%nspin
3396        do kk = 1, pol%nfreq_in_group
3397          !
3398          !$OMP PARALLEL DO collapse(2)
3399          do ii = 1, size_M
3400            do jj = 1, size_N
3401              buf_send(jj,ii,kk,ispin) = (0.0D+00,0.0D+00)
3402            enddo
3403          enddo
3404          !$OMP PARALLEL DO collapse(2)
3405          do ii = 1, size_M
3406            do jj = 1, size_N
3407              buf_rec(jj,ii,kk,ispin)  = (0.0D+00,0.0D+00)
3408            end do
3409          end do
3410          !$OMP PARALLEL DO collapse(2)
3411          do ii = 1, size_M
3412            do jj = 1, size_N
3413              buf_temp(jj,ii,kk,ispin) = (0.0D+00,0.0D+00)
3414            end do
3415          end do
3416          !
3417        end do
3418      end do
3419      !
3420      ! allocate for matrix multiplication
3421      SAFE_ALLOCATE(this%chilocalRDyn,  (size_N, size_M, pol%nfreq_in_group))  ! (pe_row_size,pe_col_size,pol%nfreq_in_group))
3422      SAFE_ALLOCATE(this%gmeRDyntempr2, (size_N, size_K, pol%nfreq_in_group))  ! (pe_row_size,ntot,pol%nfreq_in_group))
3423      SAFE_ALLOCATE(this%gmeRDyntempc,  (size_K, size_M))  ! (ntot,pe_col_size))
3424      !
3425      do kk = 1, pol%nfreq_in_group
3426        !$OMP PARALLEL DO collapse(2)
3427        do ii = 1, size_M
3428          do jj = 1, size_N
3429            this%chilocalRDyn(jj,ii,kk) = (0.0D+00,0.0D+00)
3430          end do
3431        end do
3432        !$OMP PARALLEL DO collapse(2)
3433        do ii = 1, size_K
3434          do jj = 1, size_N
3435            this%gmeRDyntempr2(jj,ii,kk) = (0.0D+00,0.0D+00)
3436          end do
3437        end do
3438      end do
3439      !$OMP PARALLEL DO collapse(2)
3440      do ii = 1, size_M
3441        do jj = 1, size_K
3442          this%gmeRDyntempc(jj,ii) = (0.0D+00,0.0D+00)
3443        end do
3444      end do
3445      !
3446    end if
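    ! non-blocking cyclic scheme: fixed-size buffers circulate around the process ring.
    ! At step ipe each rank computes the contribution destined for rank (inode-ipe),
    ! adds it to the buffer received from rank (inode-1) and forwards the sum to rank
    ! (inode+1); after npes steps each rank's buffer holds its own chunk of chi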
3447
3448
3449    ! total time for omega neq zero
3450    if(peinf%inode .eq. 0) then
3451      call timing%start(timing%chi_sum_sub_omega_neq_0)
3452    endif
3453
3454
3455    call progress_init(prog_info, 'building polarizability matrix within the subspace of omega=0', 'processor', &
3456                       peinf%npes)
    ! main loop over processes
3458    DO ipe = 1, peinf%npes
3459      call progress_step(prog_info)
3460      !
3461      ! in the standard case there is no difference between ipe_real and ipe
3462      ! this only matters for pol%nonblocking_cyclic
3463      ipe_real = ipe
3464      !
3465      if(pol%nonblocking_cyclic) then
        ! determine the actual processes we send to and receive from
3467        actual_send = MOD(peinf%inode + ipe + peinf%npes, peinf%npes)
3468        actual_rec  = MOD(peinf%inode - ipe + peinf%npes, peinf%npes)
3469        ! fixed size
3470        nsend_row = size_N
3471        nrec_row  = size_N
3472        nsend_col = size_M
3473        nrec_col  = size_M
3474        !
3475        do ispin = 1 , kp%nspin
3476          do kk = 1, pol%nfreq_in_group
3477            !$OMP PARALLEL DO collapse(2)
3478            do ii = 1, nrec_col
3479              do jj = 1, nrec_row
3480                buf_rec(jj,ii,kk,ispin) = ZERO
3481              enddo
3482            enddo
3483          end do
3484        end do
3485        !
3486        ! post message
3487        if(peinf%inode .eq. 0) then
3488          call timing%start(timing%chi_sum_comm)
3489        endif
3490
3491        tag_rec = 1
3492        tag_send = 1
3493        req_rec = MPI_REQUEST_NULL
3494        req_send = MPI_REQUEST_NULL
3495        CALL MPI_Irecv(buf_rec, nrec_row*nrec_col*pol%nfreq_in_group*kp%nspin, &
3496                       MPI_COMPLEX_DPC, irec_static, &
3497                       tag_rec, MPI_COMM_WORLD, req_rec, mpierr)
3498        CALL MPI_Isend(buf_send, nsend_row*nsend_col*pol%nfreq_in_group*kp%nspin, &
3499                       MPI_COMPLEX_DPC, isend_static, &
3500                       tag_send, MPI_COMM_WORLD, req_send, mpierr)
3501        if(peinf%inode .eq. 0) then
3502          call timing%stop(timing%chi_sum_comm)
3503        endif
3504
        ! zero the work buffers before computing this step's contribution
3506        do kk = 1, pol%nfreq_in_group
3507          !$OMP PARALLEL DO collapse(2)
3508          do ii = 1, size_M
3509            do jj = 1, size_N
3510              this%chilocalRDyn(jj,ii,kk) = (0.0D+00,0.0D+00)
3511            end do
3512          end do
3513          !$OMP PARALLEL DO collapse(2)
3514          do ii = 1, size_K
3515            do jj = 1, size_N
3516              this%gmeRDyntempr2(jj,ii,kk) = (0.0D+00,0.0D+00)
3517            end do
3518          end do
3519        end do
3520        !$OMP PARALLEL DO collapse(2)
3521        do ii = 1, size_M
3522          do jj = 1, size_K
3523            this%gmeRDyntempc(jj,ii) = (0.0D+00,0.0D+00)
3524          end do
3525        end do
3526        !
        ! in the cyclic scheme, ipe_real is the process whose locally owned chunk is computed in this step
3528        ipe_real = actual_rec+1
3529      end if  ! pol%nonblocking_cyclic
3530      !
3531
3532      !
3533      pe_row_size = row_col_loc(1,ipe_real)
3534      pe_col_size = row_col_loc(2,ipe_real)
3535      IF(pe_row_size==0 .OR. pe_col_size==0) CYCLE
3536      ! precompute indices
3537      ! row
3538      SAFE_ALLOCATE(row_indx, (pe_row_size))
3539      ipe_row = inode_2grid2D(1,ipe_real)
3540      row_indx = 0
3541      DO irow = 1, pe_row_size
3542        row_indx(irow) = INDXL2G( irow, scal%nbl, ipe_row, 0, scal%nprow)
3543      END DO
3544      ! col
3545      SAFE_ALLOCATE(col_indx, (pe_col_size))
3546      ipe_col = inode_2grid2D(2,ipe_real)
3547      col_indx = 0
3548      DO icol = 1, pe_col_size
3549        col_indx(icol) = INDXL2G( icol, scal%nbl, ipe_col, 0, scal%npcol)
3550      END DO
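      ! row_indx/col_indx are the global subspace indices stored locally by the target
      ! process ipe_real in the block-cyclic layout (via INDXL2G)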
3551
3552      ! IF(peinf%inode==0) WRITE(*,*) ipe, ipe_real-1
3553      ! IF(peinf%inode==0) WRITE(*,*) row_indx
3554      ! IF(peinf%inode==0) WRITE(*,*) col_indx
3555      ! IF(peinf%inode==0) WRITE(*,*)
3556
3557      do ispin = 1, kp%nspin
3558        if(peinf%inode .eq. 0) then
3559          call timing%start(timing%chi_sum_prep)
3560        endif
3561
3562        if ( .not. pol%nonblocking_cyclic ) then
3563          ! standard calculation
3564          SAFE_ALLOCATE(this%chilocalRDyn, (pe_row_size,pe_col_size,pol%nfreq_in_group))
3565          SAFE_ALLOCATE(this%gmeRDyntempr2, (pe_row_size,ntot,pol%nfreq_in_group))
3566          SAFE_ALLOCATE(this%gmeRDyntempc, (ntot,pe_col_size))
3567          this%chilocalRDyn  = ZERO
3568          this%gmeRDyntempr2 = ZERO
3569          this%gmeRDyntempc  = ZERO
3570          !
3571        end if
3572
3573        itot = 0
3574        itot_rk2 = 0
3575        do irk = 1, nrk
3576          do it = 1, nst(irk)
3577            itot_rk2 = itot_rk2 + 1
3578            do iv = 1,vwfn%nband+pol%ncrit
3579              if (peinf%doiownv(iv)) then
3580                ilimit = peinf%ncownactual
3581              else
3582                ilimit = 0
3583              endif
3584
3585              !$OMP PARALLEL private (mytot, zvalue, edenDRtemp, jj, icurr, &
3586              !$OMP                   cvpair_temp, cv_energy, irow, irow_global, icol, icol_global)
3587              SAFE_ALLOCATE(edenDRtemp, (pol%nfreq_in_group))
3588              !$OMP DO
3589              do i_myband = 1, ilimit
3590                 mytot = itot + i_myband
3591
3592                 zvalue = pol%edenDyn(peinf%indexv(iv),i_myband,ispin,irk,freq_idx)
3593                 if(pol%lin_denominator<TOL_Zero) then
3594                   ! this is when the lin_denominator mode is not active.
3595
3596                   do jj=1, pol%nfreq_in_group
3597                     if (abs(zvalue) .gt. Tol_Zero) then
3598                       edenDRtemp(jj)= -0.5d0*( &
3599                         1d0/(zvalue-(pol%dFreqBrd(jj)+pol%dFreqGrid(jj))/ryd)+ &
3600                         1d0/(zvalue+(pol%dFreqBrd(jj)+pol%dFreqGrid(jj))/ryd))
3601                     else
3602                       edenDRtemp(jj)= 0D0
3603                     endif
3604                   enddo
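                   ! i.e. edenDRtemp(jj) = -(1/2) * [ 1/(zvalue - w_jj) + 1/(zvalue + w_jj) ]
                   ! with the complex frequency w_jj = (dFreqGrid(jj) + dFreqBrd(jj))/ryd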
3605
3606                 else
3607                   !this is when lin_denominator mode is active
3608
3609                   cvpair_temp = pol%lin_edenDyn(peinf%indexv(iv),i_myband,ispin,irk,freq_idx)
3610                   cv_energy = cvpair_temp%ev - cvpair_temp%ec
3611                   do jj=1, pol%nfreq_in_group
                     ! check whether the denominator is small; if it is not, use the usual expression
3613
3614                     if(abs(cv_energy+pol%dFreqGrid(jj)/ryd)-pol%lin_denominator>-TOL_Zero .and. &
3615                        abs(cv_energy-pol%dFreqGrid(jj)/ryd)-pol%lin_denominator>-TOL_Zero) then
3616
3617                       if (abs(zvalue) .gt. Tol_Zero) then
3618                         edenDRtemp(jj)= -0.5d0*( &
3619                           1d0/(zvalue-(pol%dFreqBrd(jj)+pol%dFreqGrid(jj))/ryd)+ &
3620                           1d0/(zvalue+(pol%dFreqBrd(jj)+pol%dFreqGrid(jj))/ryd))
3621                       else
3622                         edenDRtemp(jj)= 0D0
3623                       endif
3624
3625                     else ! if denominator is small, do linearized energy denominator
3626                       if(cvpair_temp%vltc) then
                        edenDRtemp(jj) = -0.5d0*(&
                          conjg(integrate_mbz(kp,kpq%rk(1:3,cvpair_temp%idx_kp),&
                            cvpair_temp%ev,cvpair_temp%ec,&
                            cvpair_temp%vv,cvpair_temp%vc,&
                            -pol%dFreqGrid(jj)/ryd,pol%efermi/ryd))&
                          +integrate_mbz(kp,kpq%rk(1:3,cvpair_temp%idx_kp),&
                            cvpair_temp%ev,cvpair_temp%ec,&
                            cvpair_temp%vv,cvpair_temp%vc,&
                            pol%dFreqGrid(jj)/ryd,pol%efermi/ryd))
3639                       else
3640                         edenDRtemp(jj)= 0D0
3641                       endif
3642                     endif
3643                   end do
3644
3645                 end if
3646
                 ! copy matrix elements: the row buffer gets gme_sub times the energy
                 ! denominator, the column buffer gets the conjugated gme_sub
3648                 DO irow = 1, pe_row_size
3649                   irow_global = row_indx(irow)
3650                   DO jj = 1, pol%nfreq_in_group
3651                     this%gmeRDyntempr2(irow,mytot,jj) = gme_sub(irow_global,i_myband,peinf%indexv(iv),itot_rk2,ispin)*&
3652                                                         edenDRtemp(jj)
3653                   END DO
3654                 END DO
3655                 DO icol = 1, pe_col_size
3656                   icol_global = col_indx(icol)
3657                   this%gmeRDyntempc(mytot,icol) = MYCONJG(gme_sub(icol_global,i_myband,peinf%indexv(iv),itot_rk2,ispin))
3658                 END DO
3659
3660              end do ! i_band
3661              !$OMP END DO
3662              SAFE_DEALLOCATE(edenDRtemp)
3663              !$OMP END PARALLEL
3664              itot = itot+ilimit
3665            end do ! iv
3666          end do ! it
3667        end do ! irk
3668
3669        if(peinf%inode .eq. 0) then
3670          call timing%stop(timing%chi_sum_prep)
3671        endif
3672
3673
3674        if( pol%nonblocking_cyclic ) then
3675          !
3676          if(peinf%inode .eq. 0) then
3677            call timing%start(timing%chi_sum_gemm)
3678          endif
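          ! chilocalRDyn(:,:,jj) = negfact * gmeRDyntempr2(:,:,jj) * gmeRDyntempc,
          ! i.e. a (size_N x size_K) times (size_K x size_M) product with negfact = -fact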
3679          do jj = 1, pol%nfreq_in_group
3680            call zgemm('n','n', size_N, size_M, size_K, &
3681                negfact,this%gmeRDyntempr2(:,:,jj), size_N, &
3682                        this%gmeRDyntempc(:,:), size_K,&
3683                (0D0,0D0),this%chilocalRDyn(:,:,jj), size_N)
3684          end do
3685          if(peinf%inode .eq. 0) then
3686            call timing%stop(timing%chi_sum_gemm)
3687          endif
3688          !
3689        else ! pol%nonblocking_cyclic
3690          !
3691          ! go with zgemm (standard calculation)
3692          if(ntot > 0) then
3693            do jj =1, pol%nfreq_in_group
3694              if(peinf%inode .eq. 0) then
3695                call timing%start(timing%chi_sum_gemm)
3696              endif
3697              call zgemm('n','n',pe_row_size,pe_col_size,ntot, &
3698                negfact,this%gmeRDyntempr2(:,:,jj),pe_row_size,this%gmeRDyntempc(:,:),ntot,&
3699                (0D0,0D0),this%chilocalRDyn(:,:,jj),pe_row_size)
3700              if(peinf%inode .eq. 0) then
3701                call timing%stop(timing%chi_sum_gemm)
3702              endif
3703            enddo
3704          endif
3705          !
3706        end if ! pol%nonblocking_cyclic
3707
3708        if( pol%nonblocking_cyclic ) then
3709          !
3710          ! just copy the result
3711          do kk = 1, pol%nfreq_in_group
3712!$OMP PARALLEL DO collapse(2)
3713            do ii = 1, nrec_col
3714              do jj = 1, nrec_row
3715                buf_temp(jj,ii,kk,ispin) = this%chilocalRDyn(jj,ii,kk)
3716              enddo
3717            enddo
3718          end do
3719          !
3720        else ! pol%nonblocking_cyclic
3721        !
3722        ! standard calculation
3723        if(peinf%inode .eq. 0) then
3724          call timing%start(timing%chi_sum_comm)
3725        endif
3726
3727#ifdef MPI
        ! reduce this contribution onto the process that owns this chunk (root ipe_real-1)
3729        call MPI_Reduce(this%chilocalRDyn(1,1,1),this%chilocal2RDyn(1,1,1,ispin), &
3730                        pol%nfreq_in_group*pe_row_size*pe_col_size,MPI_COMPLEX_DPC,&
3731                        MPI_SUM,ipe_real-1,MPI_COMM_WORLD,mpierr)
3732#endif
3733        if(peinf%inode .eq. 0) then
3734          call timing%stop(timing%chi_sum_comm)
3735        endif
3736
3737#ifndef MPI
3738          this%chilocal2RDyn(:,:,:,ispin)=this%chilocalRDyn(:,:,:)
3739#endif
3740          !
3741        end if ! pol%nonblocking_cyclic
3742
3743
3744        if(peinf%inode .eq. 0) then
3745          call timing%start(timing%chi_sum_array_alloc)
3746        endif
3747        if ( .not. pol%nonblocking_cyclic ) then
3748          ! deallocate
3749          SAFE_DEALLOCATE(this%chilocalRDyn)
3750          SAFE_DEALLOCATE(this%gmeRDyntempr2)
3751          SAFE_DEALLOCATE(this%gmeRDyntempc)
3752        end if
3753        if(peinf%inode .eq. 0) then
3754          call timing%stop(timing%chi_sum_array_alloc)
3755        endif
3756
3757      end do ! ispin
3758
3759      SAFE_DEALLOCATE(row_indx)
3760      SAFE_DEALLOCATE(col_indx)
3761
3762      ! finalize non-blocking cyclic communication
3763      if(pol%nonblocking_cyclic) then
3764        if(peinf%inode .eq. 0) then
3765          call timing%start(timing%chi_sum_comm)
3766        endif
3767
3768          ! make sure the buffer is received
3769          CALL MPI_Wait(req_rec,stat,mpierr)
3770          ! accumulate contribution into receiving buffer
3771          !XXXX  buf_rec(:,:,:) = buf_rec(:,:,:) + buf_temp
3772          do ispin = 1 , kp%nspin
3773            do kk = 1, pol%nfreq_in_group
3774!$OMP PARALLEL DO collapse(2)
3775              do ii = 1, nrec_col
3776                do jj = 1, nrec_row
3777                  buf_rec(jj,ii,kk,ispin) = buf_rec(jj,ii,kk,ispin) + buf_temp(jj,ii,kk,ispin)
3778                enddo
3779              enddo
3780            end do
3781          end do
          ! wait for the message to be sent
3783          CALL MPI_Wait(req_send,stat,mpierr)
3784          if(peinf%inode .eq. 0) then
3785            call timing%stop(timing%chi_sum_comm)
3786          endif
3787
3788          if(peinf%inode .eq. 0) then
3789            call timing%start(timing%chi_sum_array_alloc)
3790          endif
3791
          ! copy the received message to the sending buffer for the next cycle
3793          !
3794          do ispin = 1 , kp%nspin
3795            do kk = 1, pol%nfreq_in_group
3796!$OMP PARALLEL DO collapse(2)
3797              do ii = 1, nrec_col
3798                do jj = 1, nrec_row
3799                  buf_send(jj,ii,kk,ispin) = buf_rec(jj,ii,kk,ispin)
3800                enddo
3801              enddo
3802            end do
3803          end do
3804
3805          !XXXX buf_send = buf_rec
3806          !
3807          nsend_row = nrec_row
3808          nsend_col = nrec_col
3809          !
3810          if(peinf%inode .eq. 0) then
3811            call timing%stop(timing%chi_sum_array_alloc)
3812          endif
3813
3814      end if ! pol%nonblocking_cyclic
3815
3816    END DO ! ipe
3817    call progress_free(prog_info)
3818    !
3819#ifdef MPI
3820    call MPI_barrier(MPI_COMM_WORLD,mpierr)
3821#endif
3822    !
3823    if(peinf%inode .eq. 0) then
3824      call timing%stop(timing%chi_sum_sub_omega_neq_0)
3825    endif
3826
3827
3828    if(pol%nonblocking_cyclic) then
      ! ring complete: buf_send now holds the fully accumulated chunk owned by this process
3830      do ispin = 1 , kp%nspin
3831        do kk = 1, pol%nfreq_in_group
3832          !$OMP PARALLEL DO collapse(2)
3833          do ii = 1, my_sub_nlocal_col
3834            do jj = 1, my_sub_nlocal_row
3835              this%chilocal2RDyn(jj,ii,kk,ispin) = buf_send(jj,ii,kk,ispin)
3836            enddo
3837          enddo
3838        end do
3839      end do
3840      !XXXX this%chilocal2RDyn(:,:,:,:) = buf_send(:,:,:,:)
3841      !
3842      ! clean-up
3843      SAFE_DEALLOCATE(buf_send)
3844      SAFE_DEALLOCATE(buf_rec)
3845      SAFE_DEALLOCATE(buf_temp)
3846      SAFE_DEALLOCATE(this%chilocalRDyn)
3847      SAFE_DEALLOCATE(this%gmeRDyntempr2)
3848      SAFE_DEALLOCATE(this%gmeRDyntempc)
3849      !
3850    end if
3851
    ! deallocate work arrays
3853    SAFE_DEALLOCATE(gme_sub)
3854    SAFE_DEALLOCATE(row_col_loc)
3855    SAFE_DEALLOCATE(grid2D_2inode)
3856
3857    IF(pol%need_full_chi) THEN
      ! allocate the full pol%chiRDyn (case in which the full chi is needed)
3859      SAFE_ALLOCATE(pol%chiRDyn, (scal%npr,scal%npc,pol%nfreq_in_group,kp%nspin))
3860      pol%chiRDyn = ZERO
3861
3862      ! initialize descriptor for the eigenvector matrix
3863      call descinit(desca, nmtx, nmtx, scal%nbl, scal%nbl, 0, 0, &
3864                    scal%icntxt, max(1,scal%npr), info)
3865      if(info < 0) then
3866        write(0,'(a,i3,a)') 'Argument number ', -info, ' had an illegal value on entry.'
3867        call die("descinit error for descaA in sub_chi_sum")
3868      end if
3869
      ! transform back to the original basis, using C_Pgemm as a temporary array
3871      SAFE_ALLOCATE(C_Pgemm, (scal%npr,scal%npc))
3872      C_Pgemm = ZERO
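      ! back-transform chi from the subspace to the plane-wave basis in two steps:
      ! C_Pgemm = V * chi_sub, then chiRDyn = C_Pgemm * V^H, where V (eigen_vect) are
      ! the neig_sub eigenvectors of the omega=0 symmetrized polarizability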
3873      do ispin = 1, kp%nspin
3874        do jj = 1, pol%nfreq_in_group
3875          if(peinf%inode .eq. 0) then
3876            call timing%start(timing%subspace_pgemm)
3877          endif
3878
3879          CALL pzgemm('N','N', nmtx, neig_sub, neig_sub, (1.0d0,0.0d0), eigen_vect, 1, 1, desca, &
3880                      this%chilocal2RDyn(:,:,jj,ispin), 1, 1, desc_sub, (0.0d0,0.0d0), &
3881                      C_Pgemm, 1, 1, desca)
3882          if(peinf%inode .eq. 0) then
3883            call timing%stop(timing%subspace_pgemm)
3884          endif
3885          if(peinf%inode .eq. 0) then
3886            call timing%start(timing%subspace_pgemm)
3887          endif
3888          CALL pzgemm('N','C', nmtx, nmtx, neig_sub, (1.0d0,0.0d0), C_Pgemm, 1, 1, desca, &
3889                      eigen_vect, 1, 1, desca, (0.0d0,0.0d0), &
3890                      pol%chiRDyn(:,:,jj,ispin), 1, 1, desca)
3891          if(peinf%inode .eq. 0) then
3892            call timing%stop(timing%subspace_pgemm)
3893          endif
3894        end do
3895      end do
3896      SAFE_DEALLOCATE(C_Pgemm)
3897      SAFE_DEALLOCATE(this%chilocal2RDyn)
3898
      ! remove the Coulomb potential
3900      do ispin = 1, kp%nspin
3901        ! go back to polarizability (remove v^{1/2})
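        ! i.e. chi(G,G',w) = chi_sym(G,G',w) / ( sqrt(vcoul(G)) * sqrt(vcoul(G')) )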
3902        icurr=0
3903        do i=1, nmtx
3904          irow=MOD(INT(((i-1)/scal%nbl)+TOL_SMALL),scal%nprow)
3905          if(irow.ne.scal%myprow) cycle
3906
3907          vc = SQRT(vcoul(i))
3908
3909          do j=1, nmtx
3910            icol=MOD(INT(((j-1)/scal%nbl)+TOL_SMALL),scal%npcol)
3911            if(icol .eq. scal%mypcol) then
3912              icurr=icurr+1
3913              irowm=INT((icurr-1)/scal%npc+TOL_SMALL)+1
3914              icolm=MOD((icurr-1),scal%npc)+1
3915
3916              pol%chiRDyn(irowm,icolm,1:pol%nfreq,ispin) = pol%chiRDyn(irowm,icolm,1:pol%nfreq,ispin) / vc / SQRT(vcoul(j))
3917
3918            endif
3919          end do
3920        end do
3921      end do
3922    ELSE
3923      ! save information for epsinv_sub
3924      pol%nrow_local_sub = my_sub_nlocal_row
3925      pol%ncol_local_sub = my_sub_nlocal_col
3926      pol%neig_sub = neig_sub
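      ! chi is kept in the subspace basis here; the saved local sizes and neig_sub
      ! allow epsinv_sub to interpret the distributed matrix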
3927
3928      SAFE_ALLOCATE(pol%chiRDyn, (MAX(1,my_sub_nlocal_row),MAX(1,my_sub_nlocal_col),pol%nfreq_in_group,kp%nspin))
3929      pol%chiRDyn = ZERO
3930      pol%chiRDyn(:,:,:,:) = this%chilocal2RDyn(:,:,:,:)
3931      SAFE_DEALLOCATE(this%chilocal2RDyn)
3932    END IF
3933
3934    POP_SUB(calc_pol_sub_trunc_easy)
3935
3936  end subroutine calc_pol_sub_trunc_easy
3937
3938  subroutine dump_matrix(scal, matrix, inode, unit)
3939    type (scalapack), intent(in) :: scal
3940    SCALAR, intent(in)           :: matrix(scal%npr,scal%npc)
3941    integer :: inode, unit
3942    character(len=7) :: nam
3943    integer :: jjj
3944
3945    PUSH_SUB(dump_matrix)
3946
3947    ! write(nam,'(I5)') unit+inode
3948    ! open(unit+inode,file='chi.'//TRIM(nam),form='unformatted',status='replace')
3949    do jjj = 1, scal%npc
3950      write(unit+inode) matrix(1:scal%npr,jjj)
3951    end do
3952    flush(unit+inode)
3953    ! close(unit+inode)
3954
3955    POP_SUB(dump_matrix)
3956
3957  end subroutine dump_matrix
3958
3959  subroutine read_dump_matrix(scal, matrix, inode, unit)
3960    type (scalapack), intent(in) :: scal
3961    SCALAR, intent(inout)        :: matrix(scal%npr,scal%npc)
3962    integer :: inode, unit
3963    character(len=7) :: nam
3964    integer :: jjj
3965
3966    PUSH_SUB(read_dump_matrix)
3967
3968    ! write(nam,'(I5)') unit+inode
3969    ! open(unit+inode,file='chi.'//TRIM(nam),form='unformatted',status='old')
3970    do jjj = 1, scal%npc
3971      read(unit+inode) matrix(1:scal%npr,jjj)
3972    end do
3973    ! close(unit+inode)
3974
3975    POP_SUB(read_dump_matrix)
3976
3977  end subroutine
3978
3979#endif
! closes the conditional (if defined MPI && defined USESCALAPACK) for the subspace method
3981
3982end module chi_summation_m
3983