1!============================================================================== 2! 3! Modules: 4! 5! chi_summation_m Last Modified: 04/19/2012 (FHJ) 6! 7! This module creates the (distributed) polarizability matrix chi by summing 8! the (distributed) pol%gme matrices. There are routines that communicate the 9! gme matrix using either matrix or element partitioning scheme. 10! 11!============================================================================== 12 13#include "f_defs.h" 14 15module chi_summation_m 16 17 use global_m 18 use blas_m 19 use mtxelmultiply_m 20 use scalapack_m 21 use lin_denominator_m 22 use io_utils_m 23 use vcoul_generator_m 24 use elpa_interface_m 25 use misc_m, only : procmem 26 27 use timing_m, only: timing => epsilon_timing 28 29 implicit none 30 31 private 32 33 public :: create_chi_summator, free_chi_summator,& 34 chi_summation_comm_matrix, chi_summation_comm_elements 35#if defined MPI && defined USESCALAPACK 36 public :: chi_summation_sub_trunc 37#endif 38 39 !> FHJ: the chi_summator "object" 40 type chi_summator_t 41 real(DP) :: fact 42 !> DVF: below are some temporary buffers needed for the chi summation. They are 43 !! described in detail in this comment. 44 !! gme = g-vector matrix element 45 !! gmetempX where X = n,r,c are temporary arrays for static calculations 46 !! n = normalized by the proper factor used in BGW 47 !! r = row, meaning the matrix elements for all bands (nv*nc*nk) that the proc owns 48 !! for the G-vectors in the rows of chi currently being computed 49 !! c = column, the matrix elements for all bands (nv*nc*nk) that the proc owns 50 !! for the G`-vectors in the rows of chi currently being computed 51 !! the RDyn arrays are needed for full-frequency (FF) calculations, real and complex 52 !! r2 is used in FF with matrix communication because you have a different denominator for 53 !! each frequency. Only the r2 array (not the r) array is used for element communication 54 !! the denominators are built into the gme`s for static calculations 55 !! eden arrays hold the energy denominators for FF 56 !! chilocal holds the contribution of a given processor to the GG` chunk of epsilon 57 !! being computed 58 !! chilocal2 holds the MPI reduced GG` chunk of epsilon being computed 59 SCALAR, allocatable :: chilocal(:,:) 60 SCALAR, allocatable :: chilocal2(:,:,:) 61 complex(DPC), allocatable :: chilocalRDyn(:,:,:) 62 complex(DPC), allocatable :: chilocal2RDyn(:,:,:,:) 63 complex(DPC), allocatable :: chiRDyntmp(:) 64 complex(DPC), allocatable :: chiRDynOld(:,:,:,:) 65 SCALAR, allocatable :: gmetempr(:,:),gmetempc(:,:) 66 SCALAR, allocatable :: gmetempn(:) 67 complex(DPC), allocatable :: gmeRDyntempn(:) 68 complex(DPC), allocatable :: gmeRDyntempr2(:,:,:) 69 complex(DPC), allocatable :: gmeRDyntempr3(:,:) 70 complex(DPC), allocatable :: gmeRDyntempc(:,:) 71 complex(DPC), allocatable :: gmeRDyntempcs(:,:) 72 integer, allocatable :: deltaCount(:,:) 73 integer, allocatable :: deltaCountReduce(:,:) 74 end type chi_summator_t 75 76 public :: chi_summator_t 77 78contains 79 80 subroutine create_chi_summator(this, pol, scal, fact, nspin) 81 type(chi_summator_t), intent(INOUT) :: this !<the chi_summator_t object 82 type(polarizability), intent(IN) :: pol 83 type(scalapack), intent(IN) :: scal 84 real(DP), intent(IN) :: fact 85 integer, intent(IN) :: nspin 86 87 PUSH_SUB(create_chi_summator) 88 89 this%fact = fact 90 91 if (pol%freq_dep .eq. 0) then 92 SAFE_ALLOCATE(this%gmetempn, (pol%nmtx)) 93 endif 94 if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0 .or. pol%freq_dep_method .eq. 2)) then 95 SAFE_ALLOCATE(this%gmeRDyntempn, (pol%nmtx)) 96 SAFE_ALLOCATE(this%chiRDyntmp, (pol%nfreq_in_group)) 97 endif 98 if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then 99 SAFE_ALLOCATE(this%gmeRDyntempn, (pol%nmtx)) 100 SAFE_ALLOCATE(this%chiRDyntmp, (pol%nfreq_in_group)) 101 SAFE_ALLOCATE(this%chiRDynOld, (pol%nfreq_in_group,scal%npr,scal%npc,nspin)) 102 endif 103 if (pol%freq_dep .eq. 3) then 104 SAFE_ALLOCATE(this%gmeRDyntempn, (pol%nmtx)) 105 SAFE_ALLOCATE(this%chiRDyntmp, (pol%nfreq_in_group)) 106 endif 107 108 POP_SUB(create_chi_summator) 109 return 110 111 end subroutine create_chi_summator 112 113 subroutine free_chi_summator(this, pol) 114 type(chi_summator_t), intent(INOUT) :: this !<the chi_summator_t object 115 type(polarizability), intent(IN) :: pol 116 117 PUSH_SUB(free_chi_summator) 118 119 if (pol%freq_dep .eq. 0) then 120 SAFE_DEALLOCATE(this%gmetempn) 121 endif 122 if (pol%freq_dep .eq. 2) then 123 SAFE_DEALLOCATE(this%gmeRDyntempn) 124 SAFE_DEALLOCATE(this%chiRDyntmp) 125 if ((pol%freq_dep_method .eq. 1)) then 126 SAFE_DEALLOCATE(this%chiRDynOld) 127 endif 128 endif 129 if (pol%freq_dep .eq. 3) then 130 SAFE_DEALLOCATE(this%gmeRDyntempn) 131 SAFE_DEALLOCATE(this%chiRDyntmp) 132 endif 133 134 POP_SUB(free_chi_summator) 135 return 136 137 end subroutine free_chi_summator 138 139 !----------------------------------------------------------------------------- 140 ! GCOMM_MATRIX 141 !----------------------------------------------------------------------------- 142 143 !> Create the pol%chi matrix using gcomm_matrix sceheme 144 subroutine chi_summation_comm_matrix(this,pol,scal,kp,kpq,vwfn,& 145 nst,nrk,indt,pht) 146 type(chi_summator_t), intent(INOUT) :: this 147 type(polarizability), intent(INOUT) :: pol 148 type(scalapack), intent(in) :: scal 149 type(kpoints), intent(IN) :: kp,kpq 150 type(valence_wfns), intent(IN) :: vwfn 151 152 integer, intent(IN) :: nst(:) 153 integer, intent(IN) :: nrk 154 integer, intent(INOUT) :: indt(:,:,:) 155 SCALAR, intent(INOUT) :: pht(:,:,:) 156 157 integer :: ntot_members(pol%nfreq_group) 158 integer :: icurr,ntot,ntot2,itot,ntotmax,ifreq_para 159 integer :: ipe, ilimit, ii, jj, kk, ll, iv, irk, it, ispin,im,mytot 160 integer :: i_myband,tag,irank,tmp_iv,im_proc 161 complex(DPC) :: negfact 162 real(DP) :: zvalue, cv_energy 163 type(cvpair_info) :: cvpair_temp 164 integer, allocatable :: tmprowindex(:),tmpcolindex(:) 165 SCALAR, allocatable :: tmprowph(:),tmpcolph(:), sendbuf(:,:) 166 complex(DPC), allocatable :: gmeRDyntempr(:,:) 167 complex(DPC), allocatable :: edenDRtemp(:,:) 168 169 ! frequency points for the spectral functions of the polarizability 170 integer :: isfreql, isfreqr, nwarn 171 real(DP) :: sfreql, sfreqr, wr, wl 172 ! Hilbert transform coefficients 173 complex(DPC) :: htwR(pol%nfreq,pol%nsfreq), htwA(pol%nfreq,pol%nsfreq) 174 complex(DPC) :: c11,c12,c13,c14,c21,c22,c23,c24 175 complex(DPC) :: cA11,cA12,cA13,cA14,cA21,cA22,cA23,cA24 176 177 integer :: isf,nf,nsf, request 178 real(DP) :: sf1,sf2 179 real(DP) :: step1,step2,fqt,eta 180 complex(DPC) :: c1,c2,j_dpc,cA1,cA2 181 logical :: first_reduce 182 183 ! variables for non-blocking cyclic scheme 184 integer :: isend_static, irec_static 185 integer :: actual_send, actual_rec 186 integer :: nsend_row, nsend_col, nrec_row, nrec_col 187 integer :: req_send, tag_send, req_rec, tag_rec 188#ifdef MPI 189 integer :: stat(MPI_STATUS_SIZE) 190#endif 191 SCALAR, allocatable :: buf_send(:,:,:), buf_rec(:,:,:), buf_temp(:,:,:) 192 193 type(progress_info) :: prog_info 194 195 PUSH_SUB(chi_summation_comm_matrix) 196 197 !call alloc_summation_buffers(pol, this%fact) 198 199 if (pol%freq_dep.eq.0) then 200 SAFE_ALLOCATE(this%chilocal2, (scal%npr,scal%npc,kp%nspin)) 201!$OMP PARALLEL DO collapse(3) 202 do ii = 1, kp%nspin 203 do jj = 1, scal%npc 204 do kk = 1, scal%npr 205 this%chilocal2(kk,jj,ii)=0D0 206 enddo 207 enddo 208 enddo 209 endif 210 211 if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0 .or. pol%freq_dep_method .eq. 2)) then 212 SAFE_ALLOCATE(this%chilocal2RDyn, (scal%npr,scal%npc,pol%nfreq_in_group,kp%nspin)) 213!$OMP PARALLEL DO collapse(4) 214 do ii = 1, kp%nspin 215 do jj = 1, pol%nfreq_in_group 216 do kk = 1, scal%npc 217 do ll = 1, scal%npr 218 this%chilocal2RDyn(ll,kk,jj,ii)=0D0 219 enddo 220 enddo 221 enddo 222 enddo 223 endif 224 225! At this moment the spectral method only works for gcomm_elements. 226! call die in inread.f90 227 if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then 228 SAFE_ALLOCATE(this%chilocal2RDyn, (scal%npr,scal%npc,pol%nfreq_in_group,kp%nspin)) 229!$OMP PARALLEL DO collapse(4) 230 do ii = 1, kp%nspin 231 do jj = 1, pol%nfreq_in_group 232 do kk = 1, scal%npc 233 do ll = 1, scal%npr 234 this%chilocal2RDyn(ll,kk,jj,ii)=0D0 235 enddo 236 enddo 237 enddo 238 enddo 239 240 endif 241 242 if (pol%freq_dep .eq. 3) then 243 SAFE_ALLOCATE(this%chilocal2RDyn, (scal%npr,scal%npc,pol%nfreq_in_group,kp%nspin)) 244!$OMP PARALLEL DO collapse(4) 245 do ii = 1, kp%nspin 246 do jj = 1, pol%nfreq_in_group 247 do kk = 1, scal%npc 248 do ll = 1, scal%npr 249 this%chilocal2RDyn(ll,kk,jj,ii)=0D0 250 enddo 251 enddo 252 enddo 253 enddo 254 endif 255 256 ntot=0 257 ntot2=0 258 259 ntot = peinf%nvownactual*peinf%ncownactual 260 do irk = 1, nrk 261 ntot2=ntot2 + nst(irk) 262 enddo 263 ntot=ntot*ntot2 264 265 !------------------------------------------------------------------- 266 ! Static Be Here 267 268 269 if (pol%freq_dep .eq. 0) then 270 271#ifdef MPI 272 if(peinf%inode .eq. 0) then 273 call timing%start(timing%chi_sum_bar) 274 endif 275 call MPI_barrier(MPI_COMM_WORLD,mpierr) 276 if(peinf%inode .eq. 0) then 277 call timing%stop(timing%chi_sum_bar) 278 endif 279#endif 280 281 first_reduce = .true. 282 283 call progress_init(prog_info, 'building polarizability matrix', 'processor', & 284 peinf%npes) 285 286 if(pol%nonblocking_cyclic) then 287 ! initialize buffer for non-blocking cyclic scheme 288 ! static process for the communication 289 isend_static = MOD(peinf%inode + 1 + peinf%npes, peinf%npes) 290 irec_static = MOD(peinf%inode - 1 + peinf%npes, peinf%npes) 291 ! allocate my size for the first send 292 nsend_row = scal%nprd(peinf%inode+1) 293 nsend_col = scal%npcd(peinf%inode+1) 294 SAFE_ALLOCATE(buf_send, (nsend_row, nsend_col, kp%nspin)) 295 do ispin = 1 , kp%nspin 296 !$OMP PARALLEL DO collapse(2) 297 do ii = 1, nsend_col 298 do jj = 1, nsend_row 299 buf_send(jj,ii,ispin) = ZERO 300 enddo 301 enddo 302 end do 303 end if 304 305 do ipe = 1, peinf%npes 306 call progress_step(prog_info) 307 308 if(pol%nonblocking_cyclic) then 309 ! calculate the actual process we have to send and we are receiving 310 actual_send = MOD(peinf%inode + ipe + peinf%npes, peinf%npes) 311 actual_rec = MOD(peinf%inode - ipe + peinf%npes, peinf%npes) 312 nrec_row = scal%nprd(actual_rec+1) 313 nrec_col = scal%npcd(actual_rec+1) 314 ! allocate reciving buffer 315 SAFE_ALLOCATE(buf_rec, (nrec_row,nrec_col,kp%nspin)) 316 do ispin = 1 , kp%nspin 317 !$OMP PARALLEL DO collapse(2) 318 do ii = 1, nrec_col 319 do jj = 1, nrec_row 320 buf_rec(jj,ii,ispin) = ZERO 321 enddo 322 enddo 323 end do 324#ifdef MPI 325 ! post message 326 if(peinf%inode .eq. 0) then 327 call timing%start(timing%chi_sum_comm) 328 endif 329 tag_rec = 1 330 tag_send = 1 331 req_rec = MPI_REQUEST_NULL 332 req_send = MPI_REQUEST_NULL 333 CALL MPI_Irecv(buf_rec, nrec_row*nrec_col*kp%nspin,MPI_SCALAR,irec_static,& 334 tag_rec, MPI_COMM_WORLD, req_rec, mpierr) 335 CALL MPI_Isend(buf_send, nsend_row*nsend_col*kp%nspin,MPI_SCALAR,isend_static,& 336 tag_send, MPI_COMM_WORLD, req_send, mpierr) 337#endif 338 if(peinf%inode .eq. 0) then 339 call timing%stop(timing%chi_sum_comm) 340 endif 341 ! allocate working array 342 if(peinf%inode .eq. 0) then 343 call timing%start(timing%chi_sum_array_alloc) 344 endif 345 SAFE_ALLOCATE(this%chilocal, (nrec_row, nrec_col)) 346!$OMP PARALLEL DO collapse(2) 347 do ii = 1, nrec_col 348 do jj = 1, nrec_row 349 this%chilocal(jj,ii)=0D0 350 enddo 351 enddo 352 SAFE_ALLOCATE(buf_temp, (nrec_row, nrec_col, kp%nspin)) 353 do ispin = 1 , kp%nspin 354!$OMP PARALLEL DO collapse(2) 355 do ii = 1, nrec_col 356 do jj = 1, nrec_row 357 buf_temp(jj,ii,ispin) = ZERO 358 enddo 359 enddo 360 end do 361 SAFE_ALLOCATE(this%gmetempr, (nrec_row, ntot)) 362 SAFE_ALLOCATE(this%gmetempc, (nrec_col, ntot)) 363 if(peinf%inode .eq. 0) then 364 call timing%stop(timing%chi_sum_array_alloc) 365 endif 366 else 367 368 if(peinf%inode .eq. 0) then 369 call timing%start(timing%chi_sum_array_alloc) 370 endif 371 SAFE_ALLOCATE(this%chilocal, (scal%nprd(ipe),scal%npcd(ipe))) 372!$OMP PARALLEL DO collapse(2) 373 do ii = 1, scal%npcd(ipe) 374 do jj = 1, scal%nprd(ipe) 375 this%chilocal(jj,ii)=0D0 376 enddo 377 enddo 378 SAFE_ALLOCATE(this%gmetempr, (scal%nprd(ipe),ntot)) 379! JRD XXX Should change order here like do in FF case. I think I did... 380 SAFE_ALLOCATE(this%gmetempc, (scal%npcd(ipe),ntot)) 381 if(peinf%inode .eq. 0) then 382 call timing%stop(timing%chi_sum_array_alloc) 383 endif 384 end if ! pol%nonblocking_cyclic 385 386 do ispin = 1 , kp%nspin 387 388 if(pol%nonblocking_cyclic) then 389 call mtxelmultiply(scal,ntot,nrk,nst,this%fact,vwfn, & 390 this%gmetempr,this%gmetempc,this%chilocal,pol%gme,pol,indt,pht,actual_rec+1,ispin) 391!$OMP PARALLEL DO collapse(2) 392 do ii = 1, nrec_col 393 do jj = 1, nrec_row 394 buf_temp(jj,ii,ispin) = this%chilocal(jj,ii) 395 enddo 396 enddo 397 else 398 399 call mtxelmultiply(scal,ntot,nrk,nst,this%fact,vwfn, & 400 this%gmetempr,this%gmetempc,this%chilocal,pol%gme,pol,indt,pht,ipe,ispin) 401 402#ifdef NONBLOCKING 403 if(peinf%inode .eq. 0) then 404 call timing%start(timing%chi_sum_comm) 405 endif 406 if (.not. first_reduce) then 407 call MPI_WAIT(request,mpistatus,mpierr) 408 SAFE_DEALLOCATE(sendbuf) 409 endif 410 if(peinf%inode .eq. 0) then 411 call timing%stop(timing%chi_sum_comm) 412 endif 413 414 if(peinf%inode .eq. 0) then 415 call timing%start(timing%chi_sum_flt) 416 endif 417 SAFE_ALLOCATE(sendbuf, (scal%nprd(ipe),scal%npcd(ipe))) 418!$OMP PARALLEL DO collapse(2) 419 do ii = 1, scal%npcd(ipe) 420 do jj = 1, scal%nprd(ipe) 421 sendbuf(jj,ii) = this%chilocal(jj,ii) 422 enddo 423 enddo 424 if(peinf%inode .eq. 0) then 425 call timing%stop(timing%chi_sum_flt) 426 endif 427 if(peinf%inode .eq. 0) then 428 call timing%start(timing%chi_sum_ht_nb) 429 endif 430#ifdef MPI 431! JRD XXX Barrier probably implicit in IReduce 432 call MPI_IReduce(sendbuf(1,1),this%chilocal2(1,1,ispin),scal%npcd(ipe)*scal%nprd(ipe),MPI_SCALAR, & 433 MPI_SUM,ipe-1,MPI_COMM_WORLD,request,mpierr) 434#else 435 this%chilocal2(:,:,ispin)=this%chilocal(:,:) 436#endif 437 first_reduce = .false. 438 if(peinf%inode .eq. 0) then 439 call timing%stop(timing%chi_sum_ht_nb) 440 endif 441 442#else 443 if(peinf%inode .eq. 0) then 444 call timing%start(timing%chi_sum_comm) 445 endif 446#ifdef MPI 447 !call MPI_barrier(MPI_COMM_WORLD,mpierr) 448 call MPI_Reduce(this%chilocal(1,1),this%chilocal2(1,1,ispin),scal%npcd(ipe)*scal%nprd(ipe),MPI_SCALAR, & 449 MPI_SUM,ipe-1,MPI_COMM_WORLD,mpierr) 450#else 451 this%chilocal2(:,:,ispin)=this%chilocal(:,:) 452#endif 453 if(peinf%inode .eq. 0) then 454 call timing%stop(timing%chi_sum_comm) 455 endif 456#endif 457 ! in case nonblocking_cyclic communication will be finalize outside the spin-loop 458 end if ! pol%nonblocking_cyclic 459 460 enddo ! ispin 461 462 SAFE_DEALLOCATE(this%chilocal) 463 SAFE_DEALLOCATE(this%gmetempr) 464 SAFE_DEALLOCATE(this%gmetempc) 465 466 if(pol%nonblocking_cyclic) then 467#ifdef MPI 468 if(peinf%inode .eq. 0) then 469 call timing%start(timing%chi_sum_comm) 470 endif 471 ! make sure the buffer is received 472 CALL MPI_Wait(req_rec,stat,mpierr) 473#endif 474 ! accumulate contribution into receiving buffer 475 ! buf_rec(:,:,:) = buf_rec(:,:,:) + buf_temp 476 do ispin = 1 , kp%nspin 477!$OMP PARALLEL DO collapse(2) 478 do ii = 1, nrec_col 479 do jj = 1, nrec_row 480 buf_rec(jj,ii,ispin) = buf_rec(jj,ii,ispin) + buf_temp(jj,ii,ispin) 481 enddo 482 enddo 483 end do 484 SAFE_DEALLOCATE(buf_temp) 485#ifdef MPI 486 ! wait for the massage to be sent 487 CALL MPI_Wait(req_send,stat,mpierr) 488#endif 489 ! copy the messega to the sending buffer for the next cycle 490 SAFE_DEALLOCATE(buf_send) 491 SAFE_ALLOCATE(buf_send, (nrec_row, nrec_col, kp%nspin)) 492 buf_send = buf_rec 493 nsend_row = nrec_row 494 nsend_col = nrec_col 495 ! deallocate receiving buffer 496 SAFE_DEALLOCATE(buf_rec) 497#ifdef MPI 498 if(peinf%inode .eq. 0) then 499 call timing%stop(timing%chi_sum_comm) 500 endif 501#endif 502 end if 503 504 enddo ! ipe 505 506 if(pol%nonblocking_cyclic) then 507 ! done 508 this%chilocal2(:,:,:) = buf_send(:,:,:) 509 SAFE_DEALLOCATE(buf_send) 510 else 511#ifdef NONBLOCKING 512 if(peinf%inode .eq. 0) then 513 call timing%start(timing%chi_sum_comm) 514 endif 515 call MPI_WAIT(request,mpistatus,mpierr) 516 SAFE_DEALLOCATE(sendbuf) 517 first_reduce = .true. 518 if(peinf%inode .eq. 0) then 519 call timing%stop(timing%chi_sum_comm) 520 endif 521#endif 522 end if 523 524 call progress_free(prog_info) 525 526 do ispin =1, kp%nspin 527 pol%chi(:,:,ispin) = this%chilocal2(:,:,ispin) 528 enddo 529 SAFE_DEALLOCATE(this%chilocal2) 530 531 !XXXXXXXXXXx 532 ! call diagonalize_scalapack(pol%nmtx, scal, pol%chi(:,:,1), 1.0D-3, icurr, this%chilocal) 533 !XXXXXXXXXXX 534 535 endif ! pol%freq_dep .eq. 0 536 537 !------------------------------------- 538 ! Full Frequency Be Here 539 540 negfact = -1D0*this%fact 541 542 if ( ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0 .or. pol%freq_dep_method .eq. 2)) .or. pol%freq_dep .eq. 3 ) then 543 if(pol%nfreq_group .gt. 1 .and. peinf%inode .lt. peinf%npes) then 544#ifdef MPI 545 tag = 10 ! MPI tag 546 if(peinf%rank_mtxel .eq. 0) then 547 ntot_members(1) = ntot 548 endif 549 do irank = 1, pol%nfreq_group-1 550 if(peinf%rank_mtxel .eq. irank) then 551 call MPI_Send(ntot,1,MPI_INTEGER,0,tag,peinf%mtxel_comm,mpierr) 552 endif 553 if(peinf%rank_mtxel .eq. 0) then 554 call MPI_Recv(ntot_members(irank+1),1,MPI_INTEGER,irank,tag,peinf%mtxel_comm,mpistatus,mpierr) 555 endif 556 enddo 557 if(peinf%rank_mtxel .eq. 0) then 558 ntot = sum(ntot_members(1:pol%nfreq_group)) 559 endif 560 call MPI_Bcast(ntot_members,pol%nfreq_group,MPI_INTEGER,0,peinf%mtxel_comm,mpierr) 561 call MPI_Bcast(ntot,1,MPI_INTEGER,0,peinf%mtxel_comm,mpierr) 562 call MPI_Bcast(ntotmax,1,MPI_INTEGER,0,peinf%mtxel_comm,mpierr) 563 ntotmax = peinf%nvownmax*peinf%ncownmax*nrk 564 !Communicate gme`s and energy denominators among processors in matrix element communication groups 565 if(peinf%inode .eq. 0) then 566 call timing%start(timing%chi_sum_comm) 567 endif 568 call MPI_Allgather(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,pol%gme(1,1,1,1,1,1),& 569 ntotmax*kp%nspin*pol%nmtx,MPI_SCALAR,peinf%mtxel_comm,mpierr) 570 call MPI_Allgather(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,pol%edenDyn(1,1,1,1,1),& 571 ntotmax*kp%nspin,MPI_REAL_DP,peinf%mtxel_comm,mpierr) 572 if(peinf%inode .eq. 0) then 573 call timing%stop(timing%chi_sum_comm) 574 endif 575#endif 576 endif 577 578 call progress_init(prog_info, 'building polarizability matrix', 'processor', & 579 peinf%npes_freqgrp) 580 do ipe = 1, peinf%npes_freqgrp 581 call progress_step(prog_info) 582 if (peinf%verb_debug .and. peinf%inode==0) then 583 write(6,'(A,i8,6x,A,i8,A)') '### ipe=',ipe,'(npes=',peinf%npes,')' 584 endif 585#ifdef MPI 586 call MPI_barrier(MPI_COMM_WORLD,mpierr) 587#endif 588 SAFE_ALLOCATE(this%gmeRDyntempr3, (scal%nprd(ipe),ntot)) 589 SAFE_ALLOCATE(this%gmeRDyntempc, (scal%npcd(ipe),ntot)) 590 SAFE_ALLOCATE(this%chilocalRDyn, (scal%nprd(ipe),scal%npcd(ipe),pol%nfreq_in_group)) 591 592 SAFE_ALLOCATE(gmeRDyntempr, (scal%nprd(ipe),ntot)) 593 SAFE_ALLOCATE(edenDRtemp, (ntot, pol%nfreq_in_group)) 594 595 596!$OMP PARALLEL DO collapse(2) 597 do ii = 1, pol%nfreq_in_group 598 do jj = 1, scal%npcd(ipe) 599 do kk = 1, scal%nprd(ipe) 600 this%chilocalRDyn(kk,jj,ii)=0D0 601 enddo 602 enddo 603 enddo 604 605 do ispin = 1 , kp%nspin 606 607 itot = 0 608 609 if(peinf%inode .eq. 0) then 610 call timing%start(timing%chi_sum_prep) 611 endif 612 613 SAFE_ALLOCATE(tmprowindex,(scal%nprd(ipe))) 614 SAFE_ALLOCATE(tmpcolindex,(scal%npcd(ipe))) 615 SAFE_ALLOCATE(tmprowph,(scal%nprd(ipe))) 616 SAFE_ALLOCATE(tmpcolph,(scal%npcd(ipe))) 617 618 do im=1,pol%nfreq_group ! im labels which member of the mtxel comm group are you 619 im_proc=peinf%rank_f+1+(im-1)*peinf%npes_freqgrp ! im_proc gives this mtxel comm group member`s global 620 do irk = 1, nrk ! proc number 621 do it = 1, nst(irk) 622 623 do icurr=1,scal%nprd(ipe) 624 tmprowindex(icurr) = indt(scal%imyrowd(icurr,ipe),it,irk) 625 tmprowph(icurr) = pht(scal%imyrowd(icurr,ipe),it,irk) 626 enddo 627 do icurr=1,scal%npcd(ipe) 628 tmpcolindex(icurr) = indt(scal%imycold(icurr,ipe),it,irk) 629 tmpcolph(icurr) = pht(scal%imycold(icurr,ipe),it,irk) 630 enddo 631 632! JRD XXX - Would rather put OMP here with collapse 2 - but unbalanced... need to loop over tmp_iv = 1, nvown instead 633 do iv = 1,vwfn%nband+pol%ncrit 634 635 tmp_iv = peinf%global_indexv(iv,im_proc) 636 637 if (peinf%does_it_ownv(iv,im_proc)) then 638 ilimit = peinf%global_ncown(im_proc) 639 else 640 ilimit = 0 641 endif 642 643 !$OMP PARALLEL DO private (mytot,zvalue,jj,icurr,ifreq_para,cvpair_temp,cv_energy) 644 do i_myband = 1, ilimit 645 mytot = itot + i_myband 646 zvalue = pol%edenDyn(peinf%global_indexv(iv,im_proc),i_myband,ispin,irk,im) 647 if(pol%lin_denominator<TOL_Zero) then 648 ! this is when the lin_denominator mode is not active. 649 650 ! DVF: These factors of (peinf%inode)/peinf%npes*100000 are so that you don`t access the 651 ! array edenDRtemp and other similar arrays for the processors that are not used 652 ! when pol%nfreq_group does not divide the original number of processors in the 653 ! calculation (peinf%npes_orig). This factor makes the loop start at some huge 654 ! number, so the loop will in fact not execute at all 655 656 do jj=1+peinf%rank_mtxel+(peinf%inode)/peinf%npes*100000,pol%nfreq,pol%nfreq_group 657 ifreq_para=(jj+pol%nfreq_group-1)/pol%nfreq_group 658 if (abs(zvalue) .gt. Tol_Zero) then 659! JRD XXX - INDEX ORDER here worries me 660 edenDRtemp(mytot,ifreq_para)= -0.5d0*( & 661 1d0/(zvalue-(pol%dFreqBrd(jj)+pol%dFreqGrid(jj))/ryd)+ & 662 1d0/(zvalue+(pol%dFreqBrd(jj)+pol%dFreqGrid(jj))/ryd)) 663 else 664 edenDRtemp(mytot,ifreq_para)= 0D0 665 endif 666 enddo 667 668 endif 669 do icurr=1,scal%nprd(ipe) 670 gmeRDyntempr(icurr,mytot)=pol%gme(tmprowindex(icurr), & 671 i_myband,tmp_iv,ispin,irk,im) * tmprowph(icurr) 672 enddo 673 !do jj = 1+peinf%rank_mtxel+(peinf%inode)/peinf%npes*100000,pol%nfreq,pol%nfreq_group 674 ! ifreq_para=(jj+pol%nfreq_group-1)/pol%nfreq_group 675 ! this%gmeRDyntempr2(:,mytot,ifreq_para)=gmeRDyntempr(:)*edenDRtemp(ifreq_para) 676 !enddo 677 do icurr=1,scal%npcd(ipe) 678 this%gmeRDyntempc(icurr,mytot) = & 679 MYCONJG( pol%gme(tmpcolindex(icurr),i_myband,tmp_iv,ispin,irk,im) * tmpcolph(icurr) ) 680 enddo 681 enddo ! i_myband 682 !$OMP END PARALLEL DO 683 itot = itot+ilimit 684 enddo ! iv 685 enddo ! it 686 enddo ! irk 687 enddo ! im 688 689 SAFE_DEALLOCATE(tmprowindex) 690 SAFE_DEALLOCATE(tmpcolindex) 691 SAFE_DEALLOCATE(tmprowph) 692 SAFE_DEALLOCATE(tmpcolph) 693 694 if(peinf%inode .eq. 0) then 695 call timing%stop(timing%chi_sum_prep) 696 endif 697 698 699 !Do the zgemm`s 700 if(ntot > 0) then 701 do jj =1+peinf%rank_mtxel+(peinf%inode)/peinf%npes*100000,pol%nfreq,pol%nfreq_group 702 ifreq_para=(jj+pol%nfreq_group-1)/pol%nfreq_group 703 if(peinf%inode .eq. 0) then 704 call timing%start(timing%chi_sum_gemm) 705 endif 706 707 !$omp parallel do private(icurr) 708 do mytot = 1, ntot 709 do icurr=1,scal%nprd(ipe) 710 this%gmeRDyntempr3(icurr,mytot)=gmeRDyntempr(icurr,mytot)*edenDRtemp(mytot,ifreq_para) 711 enddo 712 enddo 713 714 call zgemm('n','t',scal%nprd(ipe),scal%npcd(ipe),ntot, & 715 negfact,this%gmeRDyntempr3(:,:),scal%nprd(ipe),this%gmeRDyntempc(:,:),scal%npcd(ipe),& 716 (0D0,0D0),this%chilocalRDyn(:,:,ifreq_para),scal%nprd(ipe)) 717 718 if(peinf%inode .eq. 0) then 719 call timing%stop(timing%chi_sum_gemm) 720 endif 721 enddo 722 endif 723 724 if(peinf%inode .eq. 0) then 725 call timing%start(timing%chi_sum_comm) 726 endif 727 728 if(pol%nfreq_group .eq. 1) then 729#ifdef MPI 730 call MPI_Reduce(this%chilocalRDyn(1,1,1),this%chilocal2RDyn(1,1,1,ispin), & 731 pol%nfreq_in_group*scal%npcd(ipe)*scal%nprd(ipe),MPI_COMPLEX_DPC,& 732 MPI_SUM,ipe-1,MPI_COMM_WORLD,mpierr) 733#endif 734 elseif(pol%nfreq_group .gt. 1 .and. peinf%inode .lt. peinf%npes) then 735#ifdef MPI 736 call MPI_Reduce(this%chilocalRDyn(1,1,1),this%chilocal2RDyn(1,1,1,ispin), & 737 pol%nfreq_in_group*scal%npcd(ipe)*scal%nprd(ipe),MPI_COMPLEX_DPC,& 738 MPI_SUM,ipe-1,peinf%freq_comm,mpierr) 739#endif 740 endif 741#ifndef MPI 742 this%chilocal2RDyn(:,:,:,ispin)=this%chilocalRDyn(:,:,:) 743#endif 744 if(peinf%inode .eq. 0) then 745 call timing%stop(timing%chi_sum_comm) 746 endif 747 748 enddo ! ispin 749 if(peinf%inode .eq. 0) then 750 call timing%start(timing%chi_sum_array_alloc) 751 endif 752 SAFE_DEALLOCATE(this%chilocalRDyn) 753 SAFE_DEALLOCATE(edenDRtemp) 754 SAFE_DEALLOCATE(gmeRDyntempr) 755 SAFE_DEALLOCATE(this%gmeRDyntempr3) 756 SAFE_DEALLOCATE(this%gmeRDyntempc) 757 if(peinf%inode .eq. 0) then 758 call timing%stop(timing%chi_sum_array_alloc) 759 endif 760 enddo ! ipe 761 call progress_free(prog_info) 762 763 do ispin =1, kp%nspin 764 do jj=1+peinf%rank_mtxel+(peinf%inode)/peinf%npes*100000,pol%nfreq,pol%nfreq_group 765 ifreq_para=(jj+pol%nfreq_group-1)/pol%nfreq_group 766! JRD XXX This assignment is now a waste of time and memory. Should just set pol%chiR(A)Dyn 767! directly above 768 pol%chiRDyn(:,:,ifreq_para,ispin) = this%chilocal2RDyn(:,:,ifreq_para,ispin) 769 enddo ! jj 770 enddo ! ispin 771 SAFE_DEALLOCATE(this%chilocal2RDyn) 772 !XXXXXXXXXXX 773 ! DO jj = 1, pol%nfreq 774 ! IF(peinf%inode .eq. 0) WRITE(2000,*) jj 775 ! call diagonalize_scalapack(pol%nmtx, scal, pol%chiRDyn(:,:,jj,1), 1.0D-3, icurr, this%chilocal, eigenval) 776 ! DEALLOCATE(eigenval) 777 ! DEALLOCATE(this%chilocal) 778 ! END DO 779 !XXXXXXXXXXX 780 endif ! (pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0) 781 782 !------------------------------------- 783 ! Full Frequency Be Here. 784 ! M. Shishkin and G. Kresse, Implementation and performance of the 785 ! frequency-dependent GW method within the PAW framework, 786 ! PHYSICAL REVIEW B 74, 035101, 2006. 787 788 negfact = -1D0*this%fact 789 790 if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then 791 if(pol%nfreq_group .gt. 1 .and. peinf%inode .lt. peinf%npes) then 792#ifdef MPI 793 tag = 10 ! MPI tag 794 if(peinf%rank_mtxel .eq. 0) then 795 ntot_members(1) = ntot 796 endif 797 do irank = 1, pol%nfreq_group-1 798 if(peinf%rank_mtxel .eq. irank) then 799 call MPI_Send(ntot,1,MPI_INTEGER,0,tag,peinf%mtxel_comm,mpierr) 800 endif 801 if(peinf%rank_mtxel .eq. 0) then 802 call MPI_Recv(ntot_members(irank+1),1,MPI_INTEGER,irank,tag,peinf%mtxel_comm,mpistatus,mpierr) 803 endif 804 enddo 805 if(peinf%rank_mtxel .eq. 0) then 806 ntot = sum(ntot_members(1:pol%nfreq_group)) 807 endif 808 call MPI_Bcast(ntot_members,pol%nfreq_group,MPI_INTEGER,0,peinf%mtxel_comm,mpierr) 809 call MPI_Bcast(ntot,1,MPI_INTEGER,0,peinf%mtxel_comm,mpierr) 810 call MPI_Bcast(ntotmax,1,MPI_INTEGER,0,peinf%mtxel_comm,mpierr) 811 ntotmax = peinf%nvownmax*peinf%ncownmax*nrk 812 !Communicate gme`s among processors in matrix element communication groups 813 if(peinf%inode .eq. 0) then 814 call timing%start(timing%chi_sum_comm) 815 endif 816 call MPI_Allgather(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,pol%gme(1,1,1,1,1,1),& 817 ntotmax*kp%nspin*pol%nmtx,MPI_SCALAR,peinf%mtxel_comm,mpierr) 818 if(peinf%inode .eq. 0) then 819 call timing%stop(timing%chi_sum_comm) 820 endif 821#endif 822 endif 823 824 ! ------------------------------------------- 825 ! compute the Hilbert transform coefficients 826 ! ------------------------------------------- 827 j_dpc=(0.0,1.0) 828 htwR(:,:)=(0.0,0.0) 829 830 nf=pol%nfreq 831 nsf=pol%nsfreq 832 do jj=1,nf 833 eta=pol%dBrdning/ryd 834 fqt=pol%dFreqGrid(jj)/ryd 835 do isf=1,nsf 836 if (isf==1) then 837 c1=(0.0,0.0) 838 step1=1.0 839 else 840 sf1=pol%dSFreqGrid(isf-1)/ryd 841 sf2=pol%dSFreqGrid(isf)/ryd 842 step1=sf2-sf1 843 844 c11=((sf1-fqt)*(-1.0*j_dpc)-eta)*(atan((fqt-sf2)/(eta))& 845 & -atan((fqt-sf1)/(eta))) 846 c12=0.50*(sf1-fqt+(-1.0*j_dpc)*eta)*log(((fqt-sf2)**2.0+eta*eta)& 847 & /(((fqt-sf1)**2.0+eta*eta))) 848 849 c13=-((sf1+fqt)*j_dpc-eta)*(atan((fqt+sf2)/(eta))& 850 & -atan((fqt+sf1)/(eta))) 851 c14=0.50*(sf1+fqt+j_dpc*eta)*log(((fqt+sf2)**2.0+eta*eta)& 852 & /(((fqt+sf1)**2.0+eta*eta))) 853 854 c1=c11+c12+c13+c14 855 c1=c1/step1 856 endif 857 858 if (isf==nsf) then 859 c2=(0.0,0.0) 860 step2=1.0 861 else 862 sf1=pol%dSFreqGrid(isf)/ryd 863 sf2=pol%dSFreqGrid(isf+1)/ryd 864 865 step2=sf2-sf1 866 867 c21=((sf2-fqt)*(-1.0*j_dpc)-eta)*(atan((fqt-sf1)/(eta))& 868 & -atan((fqt-sf2)/(eta))) 869 c22=0.50*(sf2-fqt+(-1.0*j_dpc)*eta)*log(((fqt-sf1)**2.0+eta*eta)& 870 & /(((fqt-sf2)**2.0+eta*eta))) 871 872 c23=-((sf2+fqt)*j_dpc-eta)*(atan((fqt+sf1)/(eta))& 873 & -atan((fqt+sf2)/(eta))) 874 c24=0.50*(sf2+fqt+j_dpc*eta)*log(((fqt+sf1)**2.0+eta*eta)& 875 & /(((fqt+sf2)**2.0+eta*eta))) 876 877 c2=c21+c22+c23+c24 878 879 c2=c2/step2 880 endif 881 882 if (isf==1.or.isf==nsf) then 883 htwR(jj,isf)=0.5d0*(c1/step1+c2/step2) 884 else 885 htwR(jj,isf)=1.0d0*(c1+c2)/(step1+step2) 886 endif 887 888 enddo 889 enddo 890 891 ! ---------------------------------------------------- 892 ! compute the spectral functions of the polarizability 893 ! ---------------------------------------------------- 894 895 call progress_init(prog_info, 'building polarizability matrix', 'processor', & 896 peinf%npes_freqgrp) 897 898 nwarn=0 899 do ipe = 1, peinf%npes_freqgrp 900 call progress_step(prog_info) 901 if (peinf%verb_debug .and. peinf%inode==0) then 902 write(6,'(A,i8,6x,A,i8,A)') '### ipe=',ipe,'(npes=',peinf%npes,')' 903 endif 904#ifdef MPI 905 call MPI_barrier(MPI_COMM_WORLD,mpierr) 906#endif 907 908 SAFE_ALLOCATE(gmeRDyntempr, (scal%nprd(ipe),1)) 909 SAFE_ALLOCATE(this%gmeRDyntempr2, (scal%nprd(ipe),ntot,pol%os_nsfreq_para)) 910 SAFE_ALLOCATE(this%gmeRDyntempc, (ntot,scal%npcd(ipe))) 911 SAFE_ALLOCATE(this%chilocalRDyn, (scal%nprd(ipe),scal%npcd(ipe),pol%os_nsfreq_para)) 912 913! JRD XXX should thread if keep COMM elements 914 this%chilocalRDyn=0 915 916! JRD XXX if we keep COMM elements we should thread this 917 gmeRDyntempr=(0.0,0.0) 918 this%gmeRDyntempr2=(0.0,0.0) 919 this%gmeRDyntempc=(0.0,0.0) 920 921 do ispin = 1 , kp%nspin 922 923 itot = 0 924 925 if(peinf%inode .eq. 0) then 926 call timing%start(timing%chi_sum_prep) 927 endif 928 929 SAFE_ALLOCATE(tmprowindex,(scal%nprd(ipe))) 930 SAFE_ALLOCATE(tmpcolindex,(scal%npcd(ipe))) 931 SAFE_ALLOCATE(tmprowph,(scal%nprd(ipe))) 932 SAFE_ALLOCATE(tmpcolph,(scal%npcd(ipe))) 933 934 do im=1,pol%nfreq_group ! im labels which member of the mtxel comm group are you 935 im_proc=peinf%rank_f+1+(im-1)*peinf%npes_freqgrp ! im_proc gives this mtxel comm group member`s global 936 do irk = 1, nrk ! proc number 937 do it = 1, nst(irk) 938 939 do icurr=1,scal%nprd(ipe) 940 tmprowindex(icurr) = indt(scal%imyrowd(icurr,ipe),it,irk) 941 tmprowph(icurr) = pht(scal%imyrowd(icurr,ipe),it,irk) 942 enddo 943 do icurr=1,scal%npcd(ipe) 944 tmpcolindex(icurr) = indt(scal%imycold(icurr,ipe),it,irk) 945 tmpcolph(icurr) = pht(scal%imycold(icurr,ipe),it,irk) 946 enddo 947 948 do iv = 1,vwfn%nband+pol%ncrit 949 950 tmp_iv = peinf%global_indexv(iv,im_proc) 951 952 if (peinf%does_it_ownv(iv,im_proc)) then 953 ilimit = peinf%global_ncown(im_proc) 954 else 955 ilimit = 0 956 endif 957 958 !$OMP PARALLEL private (mytot,zvalue,gmeRDyntempr,icurr, & 959 !$OMP isfreql,isfreqr,sfreql,sfreqr, & 960 !$OMP wl,wr,i_myband,jj) 961 !$OMP DO 962 do i_myband = 1, ilimit 963 964 zvalue = -pol%edenDyn(peinf%global_indexv(iv,im_proc),i_myband,ispin,irk,im) 965 if (abs(zvalue) .gt. Tol_Zero) then 966 967 mytot = itot + i_myband 968 isfreql=-1 969 970 do jj=pol%nsfreq,1,-1 971 if ((pol%dSFreqGrid(jj)/ryd)<zvalue) then 972 isfreql=jj 973 EXIT 974 endif 975 enddo 976 977 if (isfreql.eq.pol%nsfreq) then 978 nwarn=nwarn+1 979 if (nwarn==1.and.peinf%inode.eq.0) then 980 write(0,*) 'WARNING: for accuracy, sfrequency_high_cutoff should be ' 981 write(0,*) 'larger than energy difference between highest unoccupied ' 982 write(0,*) 'state and lowest occupied state.' 983 endif 984 cycle 985 endif 986 987 sfreql=pol%dSFreqGrid(isfreql)/ryd 988 isfreqr=isfreql+1 989 sfreqr=pol%dSFreqGrid(isfreqr)/ryd 990 991 wr= (zvalue-sfreql)/(sfreqr-sfreql) 992 wl= -(zvalue-sfreqr)/(sfreqr-sfreql) 993 994 do icurr=1,scal%nprd(ipe) 995 gmeRDyntempr(icurr,1)=pol%gme(tmprowindex(icurr), & 996 i_myband,tmp_iv,ispin,irk,im) * tmprowph(icurr) 997 enddo 998 999 this%gmeRDyntempr2(:,mytot,isfreql)=gmeRDyntempr(:,1)*wl 1000 this%gmeRDyntempr2(:,mytot,isfreqr)=gmeRDyntempr(:,1)*wr 1001 1002 do icurr=1,scal%npcd(ipe) 1003 this%gmeRDyntempc(mytot,icurr) = & 1004 MYCONJG( pol%gme(tmpcolindex(icurr),i_myband,tmp_iv,ispin,irk,im) * tmpcolph(icurr) ) 1005 enddo 1006 endif 1007 enddo ! i_myband 1008 !$OMP END DO 1009 !$OMP END PARALLEL 1010 itot = itot+ilimit 1011 1012 enddo ! iv 1013 enddo ! it 1014 enddo ! irk 1015 enddo ! im 1016 1017 SAFE_DEALLOCATE(tmprowindex) 1018 SAFE_DEALLOCATE(tmpcolindex) 1019 SAFE_DEALLOCATE(tmprowph) 1020 SAFE_DEALLOCATE(tmpcolph) 1021 1022 if(peinf%inode .eq. 0) then 1023 call timing%stop(timing%chi_sum_prep) 1024 endif 1025 1026 !Do the zgemm`s 1027 if(ntot > 0) then 1028 do jj =1+peinf%rank_mtxel, pol%nsfreq,pol%nfreq_group 1029 1030 if(peinf%inode .eq. 0) then 1031 call timing%start(timing%chi_sum_gemm) 1032 endif 1033 1034 call zgemm('n','n',scal%nprd(ipe),scal%npcd(ipe),ntot, & 1035 negfact,this%gmeRDyntempr2(:,:,jj),scal%nprd(ipe),this%gmeRDyntempc(:,:),ntot,& 1036 (0D0,0D0),this%chilocalRDyn(:,:,jj),scal%nprd(ipe)) 1037 1038 if(peinf%inode .eq. 0) then 1039 call timing%stop(timing%chi_sum_gemm) 1040 endif 1041 1042 enddo 1043 endif 1044 1045 if(peinf%inode .eq. 0) then 1046 call timing%start(timing%chi_sum_comm) 1047 endif 1048 1049 if(pol%nfreq_group .eq. 1) then 1050#ifdef MPI 1051 call MPI_Reduce(this%chilocalRDyn(1,1,1),this%chilocal2RDyn(1,1,1,ispin), & 1052 pol%os_nsfreq_para*scal%npcd(ipe)*scal%nprd(ipe),MPI_COMPLEX_DPC,& 1053 MPI_SUM,ipe-1,MPI_COMM_WORLD,mpierr) 1054#endif 1055 elseif(pol%nfreq_group .gt. 1 .and. peinf%inode .lt. peinf%npes) then 1056#ifdef MPI 1057 call MPI_Reduce(this%chilocalRDyn(1,1,1),this%chilocal2RDyn(1,1,1,ispin), & 1058 pol%os_nsfreq_para*scal%npcd(ipe)*scal%nprd(ipe),MPI_COMPLEX_DPC,& 1059 MPI_SUM,ipe-1,peinf%freq_comm,mpierr) 1060#endif 1061 endif 1062#ifndef MPI 1063 this%chilocal2RDyn(:,:,:,ispin)=this%chilocalRDyn(:,:,:) 1064#endif 1065 if(peinf%inode .eq. 0) then 1066 call timing%stop(timing%chi_sum_comm) 1067 endif 1068 1069 1070 enddo ! ispin 1071 1072 if(peinf%inode .eq. 0) then 1073 call timing%start(timing%chi_sum_array_alloc) 1074 endif 1075 1076 SAFE_DEALLOCATE(this%chilocalRDyn) 1077 SAFE_DEALLOCATE(gmeRDyntempr) 1078 SAFE_DEALLOCATE(this%gmeRDyntempr2) 1079 SAFE_DEALLOCATE(this%gmeRDyntempc) 1080 1081 if(peinf%inode .eq. 0) then 1082 call timing%stop(timing%chi_sum_array_alloc) 1083 endif 1084 1085 enddo ! ipe 1086 call progress_free(prog_info) 1087 1088 do ispin =1, kp%nspin 1089 do jj=1+peinf%rank_mtxel,pol%nsfreq,pol%nfreq_group 1090 pol%chiTDyn(jj,:,:,ispin) = this%chilocal2RDyn(:,:,jj,ispin) 1091 enddo ! jj 1092 enddo ! ispin 1093 SAFE_DEALLOCATE(this%chilocal2RDyn) 1094 1095 ! ----------------------------- 1096 ! Hilbert transform 1097 ! ------------------------------ 1098 1099 1100! JRD XXX - pol%chi... arrays need to be reordered here. 1101! I think we specifying LDC wrong here - should pol%nfreq_in_group not pol%nfreq right? 1102 call zgemm('n','n',pol%nfreq,scal%npr*scal%npc*kp%nspin,pol%os_nsfreq_para, & 1103 (-1D0,0D0),htwR(:,:),pol%nfreq,pol%chiTDyn(:,:,:,:),pol%os_nsfreq_para, & 1104 (0D0,0D0),this%chiRDynOld(:,:,:,:),pol%nfreq) 1105 1106 do ispin =1, kp%nspin 1107 do jj=1,pol%nfreq_in_group 1108 pol%chiRDyn(:,:,jj,ispin) = this%chiRDynOld(jj,:,:,ispin) 1109 enddo ! jj 1110 enddo ! ispin 1111 1112 1113 endif ! pol%freq_dep.eq.2.and.pol%freq_dep_method.eq.1 1114 1115 !call free_summation_buffers(pol) 1116 1117 POP_SUB(chi_summation_comm_matrix) 1118 return 1119 1120 end subroutine chi_summation_comm_matrix 1121 1122 !----------------------------------------------------------------------------- 1123 ! GCOMM_ELEMENTS 1124 !----------------------------------------------------------------------------- 1125 1126 !> Create the pol%chi matrix using gcomm_elements sceheme 1127 subroutine chi_summation_comm_elements(this,pol,scal,kp,vwfn,cwfn,& 1128 nst,nrk,indt,pht) 1129 type(chi_summator_t), intent(INOUT) :: this 1130 type(polarizability), intent(INOUT) :: pol 1131 type(scalapack), intent(in) :: scal 1132 type(kpoints), intent(IN) :: kp 1133 type(valence_wfns), intent(IN) :: vwfn 1134 type(conduction_wfns), intent(IN) :: cwfn 1135 1136 integer, intent(IN) :: nst(:) 1137 integer, intent(IN) :: nrk 1138 integer, intent(INOUT) :: indt(:,:,:) 1139 SCALAR, intent(INOUT) :: pht(:,:,:) 1140 1141 integer :: icurr,ntot,itot 1142 integer :: iv, ic, irk, it, ispin 1143 1144 SCALAR :: temp_gme 1145 integer, allocatable :: iowna(:) 1146 integer :: isend 1147 1148 complex(DPC), allocatable :: edenDRtemp(:) 1149 1150 real(DP) :: zvalue 1151 ! frequency points for the spectral functions of the polarizability 1152 integer :: isfreql, isfreqr 1153 real(DP) :: sfreql, sfreqr, wr, wl 1154 ! Hilbert tranform coefficients 1155 complex(DPC) :: htwR(pol%nfreq,pol%nsfreq), htwA(pol%nfreq,pol%nsfreq) 1156 complex(DPC) :: c11,c12,c13,c14,c21,c22,c23,c24 1157 complex(DPC) :: cA11,cA12,cA13,cA14,cA21,cA22,cA23,cA24 1158 1159 integer :: isf,nf,nsf,ifreq_para 1160 real(DP) :: sf1,sf2 1161 real(DP) :: step1,step2,fqt,eta 1162 complex(DPC) :: c1,c2,j_dpc,cA1,cA2 1163 1164 integer :: ii, jj 1165 type(progress_info) :: prog_info 1166 1167 integer :: nsftot, il, ir, n1, n2, n3, max_nv, nwarn 1168 integer, allocatable :: count_v(:), ind_v(:,:), ind_sf(:) 1169 1170 PUSH_SUB(chi_summation_comm_elements) 1171 1172 !call alloc_summation_buffers(pol, this%fact) 1173 1174 if(peinf%inode .eq. 0) then 1175 call timing%start(timing%chi_sum_array_alloc) 1176 endif 1177 1178 SAFE_ALLOCATE(iowna, (vwfn%nband+pol%ncrit)) 1179 1180 if (pol%freq_dep .eq. 0) then 1181 SAFE_ALLOCATE(this%chilocal, (scal%npr,scal%npc)) 1182 endif 1183 1184 if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0 .or. pol%freq_dep_method .eq. 2)) then 1185 SAFE_ALLOCATE(this%chilocalRDyn, (scal%npr,scal%npc,pol%nfreq_in_group)) 1186 endif 1187 1188 if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then 1189 SAFE_ALLOCATE(this%chilocalRDyn, (scal%npr,scal%npc,pol%os_nsfreq_para)) 1190 1191 SAFE_ALLOCATE(count_v, (pol%os_nsfreq_para)) 1192 SAFE_ALLOCATE(ind_sf, (pol%os_nsfreq_para)) 1193 endif 1194 1195 if(peinf%inode .eq. 0) then 1196 call timing%stop(timing%chi_sum_array_alloc) 1197 endif 1198 1199 call logit("Starting chi Sum") 1200 if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then 1201 ! ------------------------------------------- 1202 ! compute the Hilbert transform coefficients 1203 ! ------------------------------------------- 1204 1205 if(peinf%inode .eq. 0) then 1206 call timing%start(timing%chi_sum_ht_nb) 1207 endif 1208 1209 j_dpc=(0.0,1.0) 1210 htwR(:,:)=(0.0,0.0) 1211 1212 nf=pol%nfreq 1213 nsf=pol%nsfreq 1214 do jj=1,nf 1215 eta=pol%dBrdning/ryd 1216 fqt=pol%dFreqGrid(jj)/ryd 1217 do isf=1,nsf 1218 if (isf==1) then 1219 c1=(0.0,0.0) 1220 step1=1.0 1221 else 1222 sf1=pol%dSFreqGrid(isf-1)/ryd 1223 sf2=pol%dSFreqGrid(isf)/ryd 1224 step1=sf2-sf1 1225 1226 c11=((sf1-fqt)*(-1.0*j_dpc)-eta)*(atan((fqt-sf2)/(eta))& 1227 & -atan((fqt-sf1)/(eta))) 1228 c12=0.50*(sf1-fqt+(-1.0*j_dpc)*eta)*log(((fqt-sf2)**2.0+eta*eta)& 1229 & /(((fqt-sf1)**2.0+eta*eta))) 1230 1231 c13=-((sf1+fqt)*j_dpc-eta)*(atan((fqt+sf2)/(eta))& 1232 & -atan((fqt+sf1)/(eta))) 1233 c14=0.50*(sf1+fqt+j_dpc*eta)*log(((fqt+sf2)**2.0+eta*eta)& 1234 & /(((fqt+sf1)**2.0+eta*eta))) 1235 1236 c1=c11+c12+c13+c14 1237 c1=c1/step1 1238 endif 1239 1240 if (isf==nsf) then 1241 c2=(0.0,0.0) 1242 step2=1.0 1243 else 1244 sf1=pol%dSFreqGrid(isf)/ryd 1245 sf2=pol%dSFreqGrid(isf+1)/ryd 1246 1247 step2=sf2-sf1 1248 1249 c21=((sf2-fqt)*(-1.0*j_dpc)-eta)*(atan((fqt-sf1)/(eta))& 1250 & -atan((fqt-sf2)/(eta))) 1251 c22=0.50*(sf2-fqt+(-1.0*j_dpc)*eta)*log(((fqt-sf1)**2.0+eta*eta)& 1252 & /(((fqt-sf2)**2.0+eta*eta))) 1253 1254 c23=-((sf2+fqt)*j_dpc-eta)*(atan((fqt+sf1)/(eta))& 1255 & -atan((fqt+sf2)/(eta))) 1256 c24=0.50*(sf2+fqt+j_dpc*eta)*log(((fqt+sf1)**2.0+eta*eta)& 1257 & /(((fqt+sf2)**2.0+eta*eta))) 1258 1259 c2=c21+c22+c23+c24 1260 1261 c2=c2/step2 1262 endif 1263 1264 if (isf==1.or.isf==nsf) then 1265 htwR(jj,isf)=0.5d0*(c1/step1+c2/step2) 1266 else 1267 htwR(jj,isf)=1.0d0*(c1+c2)/(step1+step2) 1268 endif 1269 1270 enddo 1271 enddo 1272 1273 if(peinf%inode .eq. 0) then 1274 call timing%stop(timing%chi_sum_ht_nb) 1275 endif 1276 1277 endif!(pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1) 1278 1279 if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then 1280 pol%chiTDyn=(0.0,0.0) 1281 endif 1282 1283 call progress_init(prog_info, 'building polarizability matrix', 'blocks', & 1284 nrk*kp%nspin*(cwfn%nband-vwfn%nband)) 1285 nwarn=0 1286 do irk=1,nrk 1287 if (peinf%verb_debug .and. peinf%inode==0) then 1288 write(6,'(A,i8,6x,A,i8,A)') '### irk=',irk,'(nrk=',nrk,')' 1289 endif 1290#ifdef MPI 1291 call MPI_barrier(MPI_COMM_WORLD,mpierr) 1292#endif 1293 do ispin=1,kp%nspin 1294 if(peinf%inode .eq. 0) then 1295 call timing%start(timing%chi_sum_array_alloc) 1296 endif 1297 iowna(:)=1 1298 ntot=(vwfn%nband+pol%ncrit)*nst(irk) 1299 if (pol%freq_dep .eq. 0) then 1300!JRD XXX Should thread 1301 this%chilocal=0 1302 SAFE_ALLOCATE(this%gmetempr, (scal%npr,ntot)) 1303 SAFE_ALLOCATE(this%gmetempc, (ntot,scal%npc)) 1304 endif 1305 if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0 .or. pol%freq_dep_method .eq. 2)) then 1306!JRD XXX Should thread 1307 this%chilocalRDyn=0 1308 SAFE_ALLOCATE(this%gmeRDyntempr2, (scal%npr,ntot,pol%nfreq_in_group)) 1309 SAFE_ALLOCATE(this%gmeRDyntempc, (ntot,scal%npc)) 1310 endif 1311 1312 if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then 1313!JRD XXX Should thread 1314 this%chilocalRDyn=(0.0,0.0) 1315 1316 max_nv=0 1317 do ic=1,cwfn%nband-vwfn%nband 1318 count_v=0 1319 do iv=1,(vwfn%nband+pol%ncrit) 1320 1321 isend=peinf%global_pairowner(iv,ic)-1 1322 if (isend .lt. 0) then 1323 write(0,*) 'Illegal value for mpi proc, isend: ',iv,ic 1324 call die("internal error in chi_summation") 1325 endif 1326 if (isend .eq. peinf%inode) then 1327 zvalue=-pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irk,1) 1328 iowna(iv)=iowna(iv)+1 1329 endif 1330 1331#ifdef MPI 1332 call MPI_Bcast(zvalue,1,MPI_REAL_DP,isend,MPI_COMM_WORLD,mpierr) 1333 call MPI_Bcast(pol%dSFreqGrid,pol%os_nsfreq_para,MPI_REAL_DP,isend,MPI_COMM_WORLD,mpierr) 1334#endif 1335 1336 if (scal%npr*scal%npc .ne. 0) then 1337 do it=1, nst(irk) 1338 1339 if (abs(zvalue) .gt. Tol_Zero) then 1340 isfreql=-1 1341 do jj=pol%nsfreq,1,-1 1342 if ((pol%dSFreqGrid(jj)/ryd)<zvalue) then 1343 isfreql=jj 1344 EXIT 1345 endif 1346 enddo 1347 1348 if (isfreql.eq.pol%nsfreq) then 1349 nwarn=nwarn+1 1350 if (nwarn==1.and.peinf%inode.eq.0) then 1351 write(0,*) 'WARNING: for accuracy, sfrequency_high_cutoff should be ' 1352 write(0,*) 'larger than energy difference between highest unoccupied ' 1353 write(0,*) 'state and lowest occupied state.' 1354 endif 1355 cycle 1356 endif 1357 1358 isfreqr=isfreql+1 1359 1360 count_v(isfreql)=count_v(isfreql)+1 1361 count_v(isfreqr)=count_v(isfreqr)+1 1362 endif 1363 enddo !it 1364 endif 1365 enddo !iv 1366 1367 if (max_nv<maxval(count_v(:))) then 1368 max_nv=maxval(count_v(:)) 1369 endif 1370 enddo !ic 1371 1372 SAFE_ALLOCATE(this%gmeRDyntempr2, (scal%npr,max_nv,pol%os_nsfreq_para)) 1373 SAFE_ALLOCATE(this%gmeRDyntempc, (ntot,scal%npc)) 1374 SAFE_ALLOCATE(this%gmeRDyntempcs, (max_nv,scal%npc)) 1375 1376 this%gmeRDyntempr2=(0.0,0.0) 1377 this%gmeRDyntempc=(0.0,0.0) 1378 this%gmeRDyntempcs=(0.0,0.0) 1379 1380 SAFE_ALLOCATE(ind_v, (pol%os_nsfreq_para, max_nv)) 1381 1382 this%gmeRDyntempr2=(0.0,0.0) 1383 iowna(:)=1 1384 endif!(pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1) 1385 1386 if(peinf%inode .eq. 0) then 1387 call timing%stop(timing%chi_sum_array_alloc) 1388 endif 1389 1390 do ic=1,cwfn%nband-vwfn%nband 1391 call progress_step(prog_info) 1392 1393 ! We do two giant loops here for freq_dep cases 1394 if (pol%freq_dep .eq. 0) then 1395 itot=0 1396 if(peinf%inode .eq. 0) then 1397 call timing%start(timing%chi_sum_comm) 1398 endif 1399 do iv=1,(vwfn%nband+pol%ncrit) 1400 isend=peinf%global_pairowner(iv,ic)-1 1401 if (isend .lt. 0) then 1402 write(0,*) 'Illegal value for mpi proc, isend:',& 1403 peinf%inode,iv,ic,peinf%global_pairowner(iv,ic) 1404 call die("internal error in chi_summation") 1405 endif 1406 if (isend .eq. peinf%inode) then 1407 if (iowna(iv) .gt. peinf%ncownactual) call die('iowna(iv) bigger than ncownactual') 1408 this%gmetempn(:) = pol%gme(:,iowna(iv),peinf%indexv(iv), & 1409 ispin,irk,1) * sqrt(this%fact) 1410 iowna(iv)=iowna(iv)+1 1411 endif 1412#ifdef MPI 1413 call MPI_Bcast(this%gmetempn,pol%nmtx,MPI_SCALAR,isend,MPI_COMM_WORLD,mpierr) 1414#endif 1415 if (scal%npr*scal%npc .ne. 0) then 1416 1417 do it =1, nst(irk) 1418 itot = itot + 1 1419 1420 do icurr=1,scal%npr 1421 this%gmetempr(icurr,itot)=this%gmetempn(indt(scal%imyrow(icurr),it,irk)) * & 1422 pht(scal%imyrow(icurr),it,irk) 1423 enddo 1424 1425 do icurr=1,scal%npc 1426 temp_gme = this%gmetempn(indt(scal%imycol(icurr),it,irk)) 1427 this%gmetempc(itot,icurr)=MYCONJG(temp_gme * pht(scal%imycol(icurr),it,irk) ) 1428 enddo 1429 enddo ! it 1430 endif 1431 enddo ! iv 1432 if(peinf%inode .eq. 0) then 1433 call timing%stop(timing%chi_sum_comm) 1434 endif 1435 1436 1437 ! JRD: Using Level3 BLAS here for better performance 1438 1439 if (scal%npr*scal%npc .ne. 0 .and. ntot > 0) then 1440 if(peinf%inode .eq. 0) then 1441 call timing%start(timing%chi_sum_gemm) 1442 endif 1443 1444 call X(gemm)('n','n',scal%npr,scal%npc,ntot, & 1445 -ONE,this%gmetempr,scal%npr,this%gmetempc,ntot,ONE,this%chilocal,scal%npr) 1446 1447 if(peinf%inode .eq. 0) then 1448 call timing%stop(timing%chi_sum_gemm) 1449 endif 1450 1451 endif 1452 1453 endif ! pol%freq_dep .eq. 0 1454 1455 !--------------------- 1456 ! JRD: Full Frequency Be Here 1457 1458 if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0 .or. pol%freq_dep_method .eq. 2)) then 1459 1460 if(peinf%inode .eq. 0) then 1461 call timing%start(timing%chi_sum_array_alloc) 1462 endif 1463 SAFE_ALLOCATE(edenDRtemp, (pol%nfreq)) 1464 if(peinf%inode .eq. 0) then 1465 call timing%stop(timing%chi_sum_array_alloc) 1466 endif 1467 1468 itot=0 1469!JRD XXX Should thread 1470 do iv=1,(vwfn%nband+pol%ncrit) 1471 if(peinf%inode .eq. 0) then 1472 call timing%start(timing%chi_sum_comm) 1473 endif 1474 1475 isend=peinf%global_pairowner(iv,ic)-1 1476 if (isend .lt. 0) then 1477 write(0,*) 'Illegal value for mpi proc, isend: ',iv,ic 1478 call die("internal error in chi_summation") 1479 endif 1480 if (isend .eq. peinf%inode) then 1481 this%gmeRDyntempn(:) = pol%gme(:,iowna(iv),peinf%indexv(iv), & 1482 ispin,irk,1) * sqrt(this%fact) 1483 if (abs(pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irk,1)) .gt. Tol_Zero) then 1484 do jj=1,pol%nfreq 1485 edenDRtemp(jj)= -0.50d0 * ( 1d0/(pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irk,1) & 1486 -(pol%dFreqGrid(jj)+pol%dFreqBrd(jj))/ryd)+& 1487 1d0/(pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irk,1)+& 1488 (pol%dFreqGrid(jj)+pol%dFreqBrd(jj))/ryd)) 1489 enddo 1490 else 1491 edenDRtemp(:)=0D0 1492 endif 1493 iowna(iv)=iowna(iv)+1 1494 endif 1495 1496#ifdef MPI 1497 call MPI_Bcast(this%gmeRDyntempn,pol%nmtx,MPI_COMPLEX_DPC,isend,MPI_COMM_WORLD,mpierr) 1498 call MPI_Bcast(edenDRtemp,pol%nfreq,MPI_COMPLEX_DPC,isend,MPI_COMM_WORLD,mpierr) 1499#endif 1500 if(peinf%inode .eq. 0) then 1501 call timing%stop(timing%chi_sum_comm) 1502 endif 1503 1504 if (scal%npr*scal%npc .ne. 0) then 1505 do it =1, nst(irk) 1506 if(peinf%inode .eq. 0) then 1507 call timing%start(timing%chi_sum_row) 1508 endif 1509 1510 itot = itot + 1 1511 do icurr=1,scal%npr 1512 do jj =1+peinf%rank_mtxel+(peinf%inode)/peinf%npes*100000,pol%nfreq,pol%nfreq_group 1513 ifreq_para=(jj+pol%nfreq_group-1)/pol%nfreq_group 1514 this%gmeRDyntempr2(icurr,itot,ifreq_para)= & 1515 (this%gmeRDyntempn(indt(scal%imyrow(icurr),it,irk))*pht(scal%imyrow(icurr),it,irk))*edenDRtemp(jj) 1516 enddo 1517 enddo 1518 1519 if(peinf%inode .eq. 0) then 1520 call timing%stop(timing%chi_sum_row) 1521 endif 1522 1523 if(peinf%inode .eq. 0) then 1524 call timing%start(timing%chi_sum_column) 1525 endif 1526 1527 do icurr=1,scal%npc 1528 this%gmeRDyntempc(itot,icurr) = & 1529 CONJG(this%gmeRDyntempn(indt(scal%imycol(icurr),it,irk))*pht(scal%imycol(icurr),it,irk)) 1530 enddo 1531 1532 if(peinf%inode .eq. 0) then 1533 call timing%stop(timing%chi_sum_column) 1534 endif 1535 1536 enddo ! it 1537 endif 1538 1539 enddo ! iv 1540 1541 1542 ! JRD: Using Level3 BLAS here for better performance 1543 1544 if (scal%npr*scal%npc .ne. 0 .and. ntot > 0) then 1545 do jj =1+peinf%rank_mtxel+(peinf%inode)/peinf%npes*100000,pol%nfreq,pol%nfreq_group 1546 if(peinf%inode .eq. 0) then 1547 call timing%start(timing%chi_sum_gemm) 1548 endif 1549 1550 ifreq_para=(jj+pol%nfreq_group-1)/pol%nfreq_group 1551 call zgemm('n','n',scal%npr,scal%npc,ntot,(-1D0,0D0),this%gmeRDyntempr2(:,:,ifreq_para),scal%npr, & 1552 this%gmeRDyntempc(:,:),ntot,(1D0,0D0),this%chilocalRDyn(:,:,ifreq_para),scal%npr) 1553 if(peinf%inode .eq. 0) then 1554 call timing%stop(timing%chi_sum_gemm) 1555 endif 1556 1557 enddo 1558 endif 1559 1560 if(peinf%inode .eq. 0) then 1561 call timing%start(timing%chi_sum_array_alloc) 1562 endif 1563 1564 SAFE_DEALLOCATE(edenDRtemp) 1565 1566 if(peinf%inode .eq. 0) then 1567 call timing%stop(timing%chi_sum_array_alloc) 1568 endif 1569 1570 endif ! (pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0) 1571 1572 !--------------------- 1573 ! Full Frequency Be Here(shishkin and Kresse 2006) 1574 1575 if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then 1576 count_v=0 1577 ind_v=0 1578 ind_sf=0 1579 nsftot=0 1580 itot=0 1581 1582 do iv=1,(vwfn%nband+pol%ncrit) 1583 1584 if(peinf%inode .eq. 0) then 1585 call timing%start(timing%chi_sum_comm) 1586 endif 1587 1588 isend=peinf%global_pairowner(iv,ic)-1 1589 if (isend .lt. 0) then 1590 write(0,*) 'Illegal value for mpi proc, isend: ',iv,ic 1591 call die("internal error in chi_summation") 1592 endif 1593 if (isend .eq. peinf%inode) then 1594 this%gmeRDyntempn(:) = pol%gme(:,iowna(iv),peinf%indexv(iv), & 1595 ispin,irk,1) * sqrt(this%fact) 1596 zvalue=-pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irk,1) 1597 iowna(iv)=iowna(iv)+1 1598 endif 1599 1600#ifdef MPI 1601 call MPI_Bcast(this%gmeRDyntempn,pol%nmtx,MPI_COMPLEX_DPC,isend,MPI_COMM_WORLD,mpierr) 1602 call MPI_Bcast(zvalue,1,MPI_REAL_DP,isend,MPI_COMM_WORLD,mpierr) 1603 call MPI_Bcast(pol%dSFreqGrid,pol%os_nsfreq_para,MPI_REAL_DP,isend,MPI_COMM_WORLD,mpierr) 1604#endif 1605 1606 if(peinf%inode .eq. 0) then 1607 call timing%stop(timing%chi_sum_comm) 1608 endif 1609 ! compute spectral functions of the polarizability 1610 1611 if (scal%npr*scal%npc .ne. 0) then 1612 do it=1, nst(irk) 1613 1614 if (abs(zvalue) .gt. Tol_Zero) then 1615 1616 if(peinf%inode .eq. 0) then 1617 call timing%start(timing%chi_sum_row) 1618 endif 1619 1620 itot=itot+1 1621 isfreql=-1 1622 do jj=pol%nsfreq,1,-1 1623 if ((pol%dSFreqGrid(jj)/ryd)<zvalue) then 1624 isfreql=jj 1625 EXIT 1626 endif 1627 enddo 1628 1629 if (isfreql.eq.pol%nsfreq) then 1630 cycle 1631 endif 1632 1633 isfreqr=isfreql+1 1634 1635 count_v(isfreql)=count_v(isfreql)+1 1636 count_v(isfreqr)=count_v(isfreqr)+1 1637 1638 il=count_v(isfreql) 1639 ir=count_v(isfreqr) 1640 1641 ind_v(isfreql,il)=itot 1642 ind_v(isfreqr,ir)=itot 1643 1644 sfreql=pol%dSFreqGrid(isfreql)/ryd 1645 sfreqr=pol%dSFreqGrid(isfreqr)/ryd 1646 1647 wl=-(zvalue-sfreqr)/(sfreqr-sfreql) 1648 wr=(zvalue-sfreql)/(sfreqr-sfreql) 1649 1650 do icurr=1,scal%npr 1651 this%gmeRDyntempr2(icurr,il,isfreql)=this%gmeRDyntempn( & 1652 indt(scal%imyrow(icurr),it,irk))*pht(scal%imyrow(icurr),it,irk)*wl 1653 this%gmeRDyntempr2(icurr,ir,isfreqr)=this%gmeRDyntempn( & 1654 indt(scal%imyrow(icurr),it,irk))*pht(scal%imyrow(icurr),it,irk)*wr 1655 enddo 1656 1657 if(peinf%inode .eq. 0) then 1658 call timing%stop(timing%chi_sum_row) 1659 endif 1660 1661 if(peinf%inode .eq. 0) then 1662 call timing%start(timing%chi_sum_column) 1663 endif 1664 1665 do icurr=1,scal%npc 1666 this%gmeRDyntempc(itot,icurr) = & 1667 CONJG(this%gmeRDyntempn(indt(scal%imycol(icurr),it,irk))*pht(scal%imycol(icurr),it,irk)) 1668 enddo 1669 1670 if(peinf%inode .eq. 0) then 1671 call timing%stop(timing%chi_sum_column) 1672 endif 1673 1674 endif 1675 1676 enddo ! it 1677 endif 1678 1679 enddo ! iv 1680 1681 if(peinf%inode .eq. 0) then 1682 call timing%start(timing%chi_sum_array_alloc) 1683 endif 1684 1685 jj=0 1686 do ii=1+peinf%rank_mtxel,pol%nsfreq,pol%nfreq_group 1687 if (count_v(ii)>0) then 1688 jj=jj+1 1689 ind_sf(jj)=ii 1690 endif 1691 enddo 1692 nsftot=jj 1693 if(peinf%inode .eq. 0) then 1694 call timing%stop(timing%chi_sum_array_alloc) 1695 endif 1696 1697 1698 if (scal%npr*scal%npc .ne. 0 .and. ntot > 0) then 1699 do ii=1, nsftot 1700 if(peinf%inode .eq. 0) then 1701 call timing%start(timing%chi_sum_gemm) 1702 endif 1703 1704 n1=ind_sf(ii) 1705 n2=count_v(n1) 1706 1707 do jj=1,n2 1708 n3=ind_v(n1,jj) 1709 this%gmeRDyntempcs(jj,:)=this%gmeRDyntempc(n3,:) 1710 enddo 1711 1712 call zgemm('n','n',scal%npr,scal%npc,n2,(-1D0,0D0),this%gmeRDyntempr2(:,:,n1),scal%npr, & 1713 this%gmeRDyntempcs(:,:),max_nv,(1D0,0D0),this%chilocalRDyn(:,:,n1),scal%npr) 1714 1715 if(peinf%inode .eq. 0) then 1716 call timing%stop(timing%chi_sum_gemm) 1717 endif 1718 1719 enddo 1720 endif 1721 endif ! (pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1) 1722 1723 enddo ! ic (loop over conduction bands) 1724 1725 1726 if(peinf%inode .eq. 0) then 1727 call timing%start(timing%chi_sum_array_alloc) 1728 endif 1729 if (pol%freq_dep .eq. 0) then 1730 if (scal%npr*scal%npc .ne. 0) then 1731 pol%chi(:,:,ispin) = pol%chi(:,:,ispin) + this%chilocal(:,:) 1732 endif 1733 SAFE_DEALLOCATE(this%gmetempr) 1734 SAFE_DEALLOCATE(this%gmetempc) 1735 endif 1736 1737 if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0 .or. pol%freq_dep_method .eq. 2)) then 1738 if (scal%npr*scal%npc .ne. 0) then 1739 do jj = 1+peinf%rank_mtxel+(peinf%inode)/peinf%npes*100000,pol%nfreq,pol%nfreq_group 1740 ifreq_para=(jj+pol%nfreq_group-1)/pol%nfreq_group 1741! JRD XXX This copy is now a waste of time. Should set pol%chi directly above in the zgemm 1742 pol%chiRDyn(:,:,ifreq_para,ispin) = pol%chiRDyn(:,:,ifreq_para,ispin) + this%chilocalRDyn(:,:,ifreq_para) 1743 enddo 1744 endif 1745 SAFE_DEALLOCATE(this%gmeRDyntempr2) 1746 SAFE_DEALLOCATE(this%gmeRDyntempc) 1747 endif 1748 if(peinf%inode .eq. 0) then 1749 call timing%stop(timing%chi_sum_array_alloc) 1750 endif 1751 1752 1753 if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then 1754 1755 if(peinf%inode .eq. 0) then 1756 call timing%start(timing%chi_sum_array_alloc) 1757 endif 1758 1759 if (scal%npr*scal%npc .ne. 0) then 1760 do jj = 1+peinf%rank_mtxel, pol%nsfreq,pol%nfreq_group 1761 pol%chiTDyn(jj,:,:,ispin) = pol%chiTDyn(jj,:,:,ispin) + this%chilocalRDyn(:,:,jj) 1762 enddo 1763 endif 1764 SAFE_DEALLOCATE(this%gmeRDyntempr2) 1765 SAFE_DEALLOCATE(this%gmeRDyntempc) 1766 SAFE_DEALLOCATE(this%gmeRDyntempcs) 1767 SAFE_DEALLOCATE(ind_v) 1768 if(peinf%inode .eq. 0) then 1769 call timing%stop(timing%chi_sum_array_alloc) 1770 endif 1771 1772 endif 1773 1774 enddo ! ispin (loop over spins) 1775 enddo ! irk (loop over k-points in set rk) 1776 call progress_free(prog_info) 1777 1778 if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then 1779 1780 ! ------------------------- 1781 ! Hilbert transform 1782 ! ------------------------- 1783 1784 if(peinf%inode .eq. 0) then 1785 call timing%start(timing%chi_sum_ht_nb) 1786 endif 1787 1788 1789! JRD XXX pol%chi arrays out of order below 1790 call zgemm('n','n',pol%nfreq,scal%npr*scal%npc*kp%nspin,pol%os_nsfreq_para, & 1791 (-1D0,0D0),htwR(:,:),pol%nfreq,pol%chiTDyn(:,:,:,:),pol%os_nsfreq_para, & 1792 (0D0,0D0),this%chiRDynOld(:,:,:,:),pol%nfreq) 1793 1794 do ispin =1, kp%nspin 1795 do jj=1,pol%nfreq_in_group 1796 pol%chiRDyn(:,:,jj,ispin) = this%chiRDynOld(jj,:,:,ispin) 1797 enddo ! jj 1798 enddo ! ispin 1799 1800 if(peinf%inode .eq. 0) then 1801 call timing%stop(timing%chi_sum_ht_nb) 1802 endif 1803 1804 1805 endif !(pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1) 1806 1807 if(peinf%inode .eq. 0) then 1808 call timing%start(timing%chi_sum_array_alloc) 1809 endif 1810 1811 if (pol%freq_dep .eq. 0) then 1812 SAFE_DEALLOCATE(this%chilocal) 1813 endif 1814 if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0 .or. pol%freq_dep_method .eq. 2)) then 1815 SAFE_DEALLOCATE(this%chilocalRDyn) 1816 endif 1817 if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then 1818 SAFE_DEALLOCATE(this%chilocalRDyn) 1819 SAFE_DEALLOCATE(count_v) 1820 SAFE_DEALLOCATE(ind_sf) 1821 endif 1822 1823 SAFE_DEALLOCATE(iowna) 1824 !call free_summation_buffers(pol) 1825 1826 if(peinf%inode .eq. 0) then 1827 call timing%stop(timing%chi_sum_array_alloc) 1828 endif 1829 1830 1831 POP_SUB(chi_summation_comm_elements) 1832 return 1833 1834 end subroutine chi_summation_comm_elements 1835 1836#if defined MPI && defined USESCALAPACK 1837 subroutine chi_summation_sub_trunc(this,pol,scal,kp,kpq,vwfn,cwfn,& 1838 nst,nrk,indt,pht,gvec,crys,q0,iq) 1839 use inversion_m 1840 type(chi_summator_t), intent(INOUT) :: this 1841 type(polarizability), intent(INOUT) :: pol 1842 type(scalapack), intent(in) :: scal 1843 type(kpoints), intent(IN) :: kp, kpq 1844 type(valence_wfns), intent(IN) :: vwfn 1845 type(conduction_wfns), intent(IN) :: cwfn 1846 1847 integer, intent(IN) :: nst(:) 1848 integer, intent(IN) :: nrk 1849 integer, intent(INOUT) :: indt(:,:,:) 1850 SCALAR, intent(INOUT) :: pht(:,:,:) 1851 type (gspace), intent(in) :: gvec 1852 type (crystal), intent(in) :: crys 1853 real(DP), intent(in) :: q0(3) 1854 integer, intent(in) :: iq 1855 real(DP) :: zvalue, cv_energy 1856 1857 SCALAR, ALLOCATABLE :: chilocal(:,:), chiRDyn_local(:,:) 1858 SCALAR, allocatable :: chi_omega0_save(:,:,:) 1859 INTEGER :: ntot, ntot2 1860 INTEGER :: neig_sub 1861 INTEGER :: irk, ipe, ispin, itot, it, iv, i, j, ii, jj 1862 INTEGER :: irow, icol, irowm, icolm, icurr, ilimit, freq_idx, & 1863 i_myband, mytot 1864 integer :: ig_l, ig_g, igp_l, igp_g 1865 INTEGER :: ipe_real 1866 1867 integer, allocatable :: tmprowindex(:),tmpcolindex(:) 1868 SCALAR, allocatable :: tmprowph(:),tmpcolph(:) 1869 type(cvpair_info) :: cvpair_temp 1870 SCALAR :: negfact 1871 SCALAR :: edenDRtemp 1872 1873 SCALAR :: epsheaddummy, wcoul0 1874 type (twork_scell) :: work_scell 1875 real(DP), allocatable :: vcoul(:) 1876 integer, allocatable :: isorti(:) 1877 real(DP) :: vc, oneoverq, avgcut 1878 integer :: iscreen, nfk, iparallel 1879 integer :: qgrid(3) 1880 real(DP) :: q0vec(3) 1881 1882 ! variables for non-blocking cyclic scheme 1883 integer :: isend_static, irec_static 1884 integer :: actual_send, actual_rec 1885 integer :: nsend_row, nsend_col, nrec_row, nrec_col 1886 integer :: req_send, tag_send, req_rec, tag_rec 1887 integer :: stat(MPI_STATUS_SIZE) 1888 SCALAR, allocatable :: buf_send(:,:,:), buf_rec(:,:,:), buf_temp(:,:,:) 1889 1890 type(progress_info) :: prog_info 1891 ! variables for fixed buffer size 1892 logical :: keep_fix_buf 1893 integer :: size_N, size_M, size_K 1894 ! variables for matrix dumping 1895 logical :: do_read_dump_matrix, do_dump_matrix 1896 1897 PUSH_SUB(chi_summation_sub_trunc) 1898 1899 ! Keep these variables hardcoded, can be used by advance 1900 ! user to dump the block cyclic distributed polarizability 1901 ! as check point, for specific format see the 2 routines 1902 ! at the bottom of the module (dump_matrix and read_dump_matrix) 1903 do_dump_matrix = .false. 1904 do_read_dump_matrix = .false. 1905 ! if we read dumped matrix make sure we don't write, so we don't 1906 ! override the results 1907 if ( do_read_dump_matrix ) do_dump_matrix = .false. 1908 1909 ! this is the new communication scheme which avoid continuos allocation 1910 ! deallocation of the communication / computation buffers. it will be 1911 ! set default to .true. and turn off by using the correspondinf input key 1912 keep_fix_buf = .false. 1913 ! keep_fix_buf only work for pol%nonblocking_cyclic 1914 if( pol%nonblocking_cyclic ) then 1915 keep_fix_buf = .true. 1916 if( pol%dont_keep_fix_buffers ) then 1917 keep_fix_buf = .false. 1918 end if 1919 end if 1920 1921 !XXXXXXXXX 1922 ! WRITE(2000,*) pol%gme 1923 ! WRITE(2000,*) 'XXXX' 1924 ! WRITE(2000,*) pht 1925 ! WRITE(*,*) SIZE(pol%gme,1) 1926 ! WRITE(*,*) SIZE(pol%gme,2) 1927 ! WRITE(*,*) SIZE(pol%gme,3) 1928 ! WRITE(*,*) SIZE(pol%gme,4) 1929 ! WRITE(*,*) SIZE(pol%gme,5) 1930 ! WRITE(*,*) SIZE(pol%gme,6) 1931 ! DO irk = 1, nrk 1932 ! DO j = 1, SIZE(pol%gme,3) 1933 ! DO i = 1, SIZE(pol%gme,2) 1934 ! WRITE(2000,*) pht(i,j,irk) 1935 ! WRITE(2000,*) pol%gme(:,i,j,1,irk,1) 1936 ! END DO 1937 ! END DO 1938 ! END DO 1939 !XXXXXXXXX 1940 1941 ! first calculate polarizability at omega=0 using all processes (similar to static case) 1942 ntot=0 1943 ntot2=0 1944 ntot = peinf%nvownactual*peinf%ncownactual 1945 do irk = 1, nrk 1946 ntot2=ntot2 + nst(irk) 1947 enddo 1948 ntot=ntot*ntot2 1949 1950 SAFE_ALLOCATE(this%chilocal2, (scal%npr,scal%npc,kp%nspin)) 1951 this%chilocal2=0 1952 1953 freq_idx = pol%nfreq_group 1954 IF(pol%nfreq_group .gt. 1) THEN 1955 freq_idx = peinf%rank_mtxel+1 1956 END IF 1957 1958 negfact = -1D0*this%fact 1959 1960 ! trace the time for chi omega=0 1961 if(peinf%inode .eq. 0) then 1962 call timing%start(timing%chi_sum_sub_omega_0) 1963 endif 1964 1965 1966 ! here we want to use all processes for computing omega=0, we 1967 ! do not take care of the parallelization over frequencies 1968 call progress_init(prog_info, 'building polarizability matrix omega=0', 'processor', & 1969 peinf%npes) 1970 1971 if(pol%nonblocking_cyclic) then 1972 ! initialize buffer for non-blocking cyclic scheme 1973 ! static process for the communication 1974 isend_static = MOD(peinf%inode + 1 + peinf%npes, peinf%npes) 1975 irec_static = MOD(peinf%inode - 1 + peinf%npes, peinf%npes) 1976 ! allocate my size for the first send 1977 nsend_row = scal%nprd(peinf%inode+1) 1978 nsend_col = scal%npcd(peinf%inode+1) 1979 1980 if(keep_fix_buf) then 1981 ! precompute max sizes 1982 size_N = MAXVAL(scal%nprd(:)) 1983 size_M = MAXVAL(scal%npcd(:)) 1984 size_K = ntot 1985 call mpi_allreduce(MPI_IN_PLACE, size_K, 1, & 1986 MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, mpierr) 1987 ! allocate only once 1988 SAFE_ALLOCATE(buf_send, (size_N, size_M, kp%nspin)) 1989 SAFE_ALLOCATE(buf_rec, (size_N, size_M, kp%nspin)) 1990 SAFE_ALLOCATE(buf_temp, (size_N, size_M, kp%nspin)) 1991 do ispin = 1 , kp%nspin 1992 !$OMP PARALLEL DO collapse(2) 1993 do ii = 1, size_M 1994 do jj = 1, size_N 1995 buf_send(jj,ii,ispin) = ZERO 1996 enddo 1997 enddo 1998 !$OMP PARALLEL DO collapse(2) 1999 do ii = 1, size_M 2000 do jj = 1, size_N 2001 buf_rec(jj,ii,ispin) = ZERO 2002 end do 2003 end do 2004 !$OMP PARALLEL DO collapse(2) 2005 do ii = 1, size_M 2006 do jj = 1, size_N 2007 buf_temp(jj,ii,ispin) = ZERO 2008 end do 2009 end do 2010 end do 2011 ! allocate for index mapping 2012 SAFE_ALLOCATE(tmprowindex, (size_N)) ! (scal%nprd(ipe_real))) 2013 SAFE_ALLOCATE(tmpcolindex, (size_M)) ! (scal%npcd(ipe_real))) 2014 SAFE_ALLOCATE(tmprowph, (size_N)) ! (scal%nprd(ipe_real))) 2015 SAFE_ALLOCATE(tmpcolph, (size_M)) ! (scal%npcd(ipe_real))) 2016 ! allocate for matrix multiplication 2017 SAFE_ALLOCATE(this%chilocal, (size_N, size_M)) ! (scal%nprd(ipe),scal%npcd(ipe)) 2018 SAFE_ALLOCATE(this%gmetempr, (size_N, size_K)) ! (scal%nprd(ipe),ntot) 2019 SAFE_ALLOCATE(this%gmetempc, (size_K, size_M)) ! (ntot,scal%npcd(ipe)) 2020 !$OMP PARALLEL DO collapse(2) 2021 do ii = 1, size_M 2022 do jj = 1, size_N 2023 this%chilocal(jj,ii) = ZERO 2024 end do 2025 end do 2026 !$OMP PARALLEL DO collapse(2) 2027 do ii = 1, size_K 2028 do jj = 1, size_N 2029 this%gmetempr(jj,ii) = ZERO 2030 end do 2031 end do 2032 !$OMP PARALLEL DO collapse(2) 2033 do ii = 1, size_M 2034 do jj = 1, size_K 2035 this%gmetempc(jj,ii) = ZERO 2036 end do 2037 end do 2038 ! 2039 else ! keep_fix_buf 2040 ! 2041 ! standard case 2042 SAFE_ALLOCATE(buf_send, (nsend_row, nsend_col, kp%nspin)) 2043 do ispin = 1 , kp%nspin 2044 !$OMP PARALLEL DO collapse(2) 2045 do ii = 1, nsend_col 2046 do jj = 1, nsend_row 2047 buf_send(jj,ii,ispin) = ZERO 2048 enddo 2049 enddo 2050 end do 2051 ! 2052 end if ! keep_fix_buf 2053 end if 2054 2055 do ipe = 1, peinf%npes 2056 call progress_step(prog_info) 2057 2058 if ( do_read_dump_matrix ) cycle 2059 2060 if(pol%nonblocking_cyclic) then 2061 ! calculate the actual process we have to send and we are receiving 2062 actual_send = MOD(peinf%inode + ipe + peinf%npes, peinf%npes) 2063 actual_rec = MOD(peinf%inode - ipe + peinf%npes, peinf%npes) 2064 nrec_row = scal%nprd(actual_rec+1) 2065 nrec_col = scal%npcd(actual_rec+1) 2066 ! 2067 if( keep_fix_buf ) then 2068 ! just make sure sizes will always be the same 2069 nsend_row = size_N 2070 nrec_row = size_N 2071 nsend_col = size_M 2072 nrec_col = size_M 2073 else 2074 ! allocate reciving buffer 2075 SAFE_ALLOCATE(buf_rec, (nrec_row,nrec_col,kp%nspin)) 2076 end if 2077 ! 2078 do ispin = 1 , kp%nspin 2079 !$OMP PARALLEL DO collapse(2) 2080 do ii = 1, nrec_col 2081 do jj = 1, nrec_row 2082 buf_rec(jj,ii,ispin) = ZERO 2083 enddo 2084 enddo 2085 end do 2086 ! post message 2087 if(peinf%inode .eq. 0) then 2088 call timing%start(timing%chi_sum_comm) 2089 endif 2090 tag_rec = 1 2091 tag_send = 1 2092 req_rec = MPI_REQUEST_NULL 2093 req_send = MPI_REQUEST_NULL 2094 CALL MPI_Irecv(buf_rec, nrec_row*nrec_col*kp%nspin,MPI_SCALAR,irec_static,& 2095 tag_rec, MPI_COMM_WORLD, req_rec, mpierr) 2096 CALL MPI_Isend(buf_send, nsend_row*nsend_col*kp%nspin,MPI_SCALAR,isend_static,& 2097 tag_send, MPI_COMM_WORLD, req_send, mpierr) 2098 if(peinf%inode .eq. 0) then 2099 call timing%stop(timing%chi_sum_comm) 2100 endif 2101 2102 ! allocate working array 2103 if(peinf%inode .eq. 0) then 2104 call timing%start(timing%chi_sum_array_alloc) 2105 endif 2106 ! 2107 if( .not. keep_fix_buf ) then 2108 SAFE_ALLOCATE(this%chilocal, (nrec_row, nrec_col)) 2109 end if 2110 ! 2111!$OMP PARALLEL DO collapse(2) 2112 do ii = 1, nrec_col 2113 do jj = 1, nrec_row 2114 this%chilocal(jj,ii)=0D0 2115 enddo 2116 enddo 2117 ! 2118 if( .not. keep_fix_buf ) then 2119 SAFE_ALLOCATE(buf_temp, (nrec_row, nrec_col, kp%nspin)) 2120 end if 2121 ! 2122 do ispin = 1 , kp%nspin 2123!$OMP PARALLEL DO collapse(2) 2124 do ii = 1, nrec_col 2125 do jj = 1, nrec_row 2126 buf_temp(jj,ii,ispin) = ZERO 2127 enddo 2128 enddo 2129 end do 2130 ! 2131 if( keep_fix_buf ) then 2132 ! make sure to put buffer to zero 2133 ! 2134 !$OMP PARALLEL DO collapse(2) 2135 do ii = 1, size_K 2136 do jj = 1, size_N 2137 this%gmetempr(jj,ii) = ZERO 2138 end do 2139 end do 2140 !$OMP PARALLEL DO collapse(2) 2141 do ii = 1, size_M 2142 do jj = 1, size_K 2143 this%gmetempc(jj,ii) = ZERO 2144 end do 2145 end do 2146 ! 2147 else 2148 SAFE_ALLOCATE(this%gmetempr, (nrec_row, ntot)) 2149 SAFE_ALLOCATE(this%gmetempc, (ntot, nrec_col)) 2150 end if 2151 if(peinf%inode .eq. 0) then 2152 call timing%stop(timing%chi_sum_array_alloc) 2153 endif 2154 2155 ! in this case the ipe is the real process from which we should receive 2156 ipe_real = actual_rec+1 2157 else 2158 ! standard calculation 2159 SAFE_ALLOCATE(this%chilocal, (scal%nprd(ipe),scal%npcd(ipe))) 2160 this%chilocal=0D0 2161 SAFE_ALLOCATE(this%gmetempr, (scal%nprd(ipe),ntot)) 2162 SAFE_ALLOCATE(this%gmetempc, (ntot,scal%npcd(ipe))) 2163 ! this is the standard case 2164 ipe_real = ipe 2165 end if 2166 2167 do ispin = 1 , kp%nspin 2168 2169 itot = 0 2170 2171 if(peinf%inode .eq. 0) then 2172 call timing%start(timing%chi_sum_prep) 2173 endif 2174 2175 if ( .not. keep_fix_buf ) then 2176 SAFE_ALLOCATE(tmprowindex,(scal%nprd(ipe_real))) 2177 SAFE_ALLOCATE(tmpcolindex,(scal%npcd(ipe_real))) 2178 SAFE_ALLOCATE(tmprowph,(scal%nprd(ipe_real))) 2179 SAFE_ALLOCATE(tmpcolph,(scal%npcd(ipe_real))) 2180 end if 2181 2182 do irk = 1, nrk 2183 do it = 1, nst(irk) 2184 2185 do icurr=1,scal%nprd(ipe_real) 2186 tmprowindex(icurr) = indt(scal%imyrowd(icurr,ipe_real),it,irk) 2187 tmprowph(icurr) = pht(scal%imyrowd(icurr,ipe_real),it,irk) 2188 enddo 2189 do icurr=1,scal%npcd(ipe_real) 2190 tmpcolindex(icurr) = indt(scal%imycold(icurr,ipe_real),it,irk) 2191 tmpcolph(icurr) = pht(scal%imycold(icurr,ipe_real),it,irk) 2192 enddo 2193 2194 do iv = 1, vwfn%nband+pol%ncrit 2195 ! check if I own this partuicular band 2196 if (peinf%doiownv(iv)) then 2197 ilimit = peinf%ncownactual 2198 else 2199 ilimit = 0 2200 endif 2201 2202 !$OMP PARALLEL private (mytot, zvalue, edenDRtemp, icurr, & 2203 !$OMP cvpair_temp, cv_energy ) 2204 !$OMP DO 2205 do i_myband = 1, ilimit 2206 mytot = itot + i_myband 2207 2208 zvalue = pol%edenDyn(peinf%indexv(iv),i_myband,ispin,irk,freq_idx) 2209 if(pol%lin_denominator<TOL_Zero) then 2210 ! this is when the lin_denominator mode is not active. 2211 if(abs(zvalue) .gt. Tol_Zero) then 2212 edenDRtemp = -0.5D+00 * ( & 2213 1d0/(zvalue - pol%dFreqBrd(1)/ryd)+ & 2214 1d0/(zvalue + pol%dFreqBrd(1)/ryd)) 2215 else 2216 edenDRtemp = 0.0D+00 2217 end if 2218 else 2219 !this is when lin_denominator mode is active 2220 cvpair_temp = pol%lin_edenDyn(peinf%indexv(iv),i_myband,ispin,irk,freq_idx) 2221 cv_energy = cvpair_temp%ev - cvpair_temp%ec 2222 2223 if(abs(cv_energy) - pol%lin_denominator > (-TOL_Zero)) then 2224 ! 2225 if (abs(zvalue) .gt. Tol_Zero) then 2226 edenDRtemp = -0.5d0*( & 2227 1d0/(zvalue-pol%dFreqBrd(1)/ryd)+ & 2228 1d0/(zvalue+pol%dFreqBrd(1)/ryd)) 2229 else 2230 edenDRtemp = 0D0 2231 endif 2232 ! 2233 else ! if denominator is small, do linearized energy denominator 2234 ! 2235 if(cvpair_temp%vltc) then 2236 edenDRtemp = -0.5d0*(& 2237 2238 conjg(integrate_mbz(kp,kpq%rk(1:3,cvpair_temp%idx_kp),& 2239 cvpair_temp%ev,cvpair_temp%ec,& 2240 2241 cvpair_temp%vv,cvpair_temp%vc,& 2242 -pol%dFreqGrid(1)/ryd,pol%efermi/ryd))& 2243 2244 +integrate_mbz(kp,kpq%rk(1:3,cvpair_temp%idx_kp),& 2245 cvpair_temp%ev,cvpair_temp%ec,& 2246 cvpair_temp%vv,cvpair_temp%vc,& 2247 pol%dFreqGrid(1)/ryd,pol%efermi/ryd)) 2248 else 2249 edenDRtemp = 0D0 2250 endif 2251 ! 2252 end if 2253 ! 2254 end if ! lin denominator 2255 2256 ! divede the matrix elements by the corresponding energy denominator 2257 ! right hand side matrix 2258 do icurr=1,scal%nprd(ipe_real) 2259 this%gmetempr(icurr,mytot)=pol%gme(tmprowindex(icurr), & 2260 i_myband,peinf%indexv(iv),ispin,irk,freq_idx) * tmprowph(icurr) * edenDRtemp 2261 enddo 2262 ! left hand side matrix 2263 do icurr=1,scal%npcd(ipe_real) 2264 this%gmetempc(mytot,icurr) = & 2265 MYCONJG( pol%gme(tmpcolindex(icurr),i_myband,peinf%indexv(iv),ispin,irk,freq_idx) * tmpcolph(icurr) ) 2266 enddo 2267 2268 end do ! i_myband 2269 !$OMP END DO 2270 !$OMP END PARALLEL 2271 itot = itot+ilimit 2272 2273 end do ! iv 2274 2275 end do ! it 2276 end do ! irk 2277 2278 if ( .not. keep_fix_buf ) then 2279 SAFE_DEALLOCATE(tmprowindex) 2280 SAFE_DEALLOCATE(tmpcolindex) 2281 SAFE_DEALLOCATE(tmprowph) 2282 SAFE_DEALLOCATE(tmpcolph) 2283 end if 2284 2285 if(peinf%inode .eq. 0) then 2286 call timing%stop(timing%chi_sum_prep) 2287 endif 2288 2289 2290 ! Go with matrix multiplication 2291 ! if we keep fixed buffer size we always multiply 2292 if( keep_fix_buf ) then 2293 ! 2294 if(peinf%inode .eq. 0) then 2295 call timing%start(timing%chi_sum_gemm) 2296 endif 2297 2298#ifdef CPLX 2299 call zgemm('n','n', size_N, size_M, size_K, & 2300 negfact,this%gmetempr(:,:), size_N, this%gmetempc(:,:), size_K,& 2301 (0D0,0D0), this%chilocal(:,:), size_N ) 2302#else 2303 call dgemm('n','n', size_N, size_M, size_K, & 2304 negfact,this%gmetempr(:,:), size_N, this%gmetempc(:,:), size_K, & 2305 0D0,this%chilocal(:,:), size_N ) 2306#endif 2307 if(peinf%inode .eq. 0) then 2308 call timing%stop(timing%chi_sum_gemm) 2309 endif 2310 2311 ! 2312 else 2313 ! 2314 if(ntot > 0) then 2315 if(peinf%inode .eq. 0) then 2316 call timing%start(timing%chi_sum_gemm) 2317 endif 2318 2319 ! in the case pol%nonblocking_cyclic, ipe is the shift wrt inode 2320 ! position, ipe_real is the absolute position of the ipe from which 2321 ! we are receiving. in the standard case ipe == ipe_real 2322#ifdef CPLX 2323 call zgemm('n','n',scal%nprd(ipe_real),scal%npcd(ipe_real),ntot, & 2324 negfact,this%gmetempr(:,:),scal%nprd(ipe_real),this%gmetempc(:,:),ntot,& 2325 (0D0,0D0),this%chilocal(:,:),scal%nprd(ipe_real)) 2326#else 2327 call dgemm('n','n',scal%nprd(ipe_real),scal%npcd(ipe_real),ntot, & 2328 negfact,this%gmetempr(:,:),scal%nprd(ipe_real),this%gmetempc(:,:),ntot,& 2329 0D0,this%chilocal(:,:),scal%nprd(ipe_real)) 2330#endif 2331 if(peinf%inode .eq. 0) then 2332 call timing%stop(timing%chi_sum_gemm) 2333 endif 2334 2335 endif 2336 ! 2337 end if !keep_fix_buf 2338 2339 if(pol%nonblocking_cyclic) then 2340 ! just copy the result 2341!$OMP PARALLEL DO collapse(2) 2342 do ii = 1, nrec_col 2343 do jj = 1, nrec_row 2344 buf_temp(jj,ii,ispin) = this%chilocal(jj,ii) 2345 enddo 2346 enddo 2347 else 2348 if(peinf%inode .eq. 0) then 2349 call timing%start(timing%chi_sum_comm) 2350 endif 2351 2352#ifdef MPI 2353 ! here ipe == ipe_real 2354 call MPI_Reduce(this%chilocal(1,1),this%chilocal2(1,1,ispin),scal%npcd(ipe)*scal%nprd(ipe),MPI_SCALAR, & 2355 MPI_SUM,ipe-1,MPI_COMM_WORLD,mpierr) 2356#else 2357 this%chilocal2(:,:,ispin)=this%chilocal(:,:) 2358#endif 2359 if(peinf%inode .eq. 0) then 2360 call timing%stop(timing%chi_sum_comm) 2361 endif 2362 2363 end if 2364 2365 enddo ! ispin 2366 if(peinf%inode .eq. 0) then 2367 call timing%start(timing%chi_sum_array_alloc) 2368 endif 2369 2370 ! 2371 if( .not. keep_fix_buf ) then 2372 SAFE_DEALLOCATE(this%chilocal) 2373 SAFE_DEALLOCATE(this%gmetempr) 2374 SAFE_DEALLOCATE(this%gmetempc) 2375 end if 2376 ! 2377 if(peinf%inode .eq. 0) then 2378 call timing%stop(timing%chi_sum_array_alloc) 2379 endif 2380 2381 2382 ! finalize non-blocking cyclic communication 2383 if(pol%nonblocking_cyclic) then 2384 if(peinf%inode .eq. 0) then 2385 call timing%start(timing%chi_sum_comm) 2386 endif 2387 2388 ! make sure the buffer is received 2389 CALL MPI_Wait(req_rec,stat,mpierr) 2390 ! accumulate contribution into receiving buffer 2391 ! buf_rec(:,:,:) = buf_rec(:,:,:) + buf_temp 2392 do ispin = 1 , kp%nspin 2393!$OMP PARALLEL DO collapse(2) 2394 do ii = 1, nrec_col 2395 do jj = 1, nrec_row 2396 buf_rec(jj,ii,ispin) = buf_rec(jj,ii,ispin) + buf_temp(jj,ii,ispin) 2397 enddo 2398 enddo 2399 end do 2400 ! 2401 if( .not. keep_fix_buf ) then 2402 SAFE_DEALLOCATE(buf_temp) 2403 end if 2404 ! 2405 ! wait for the massage to be sent 2406 CALL MPI_Wait(req_send,stat,mpierr) 2407 if(peinf%inode .eq. 0) then 2408 call timing%stop(timing%chi_sum_comm) 2409 endif 2410 2411 if(peinf%inode .eq. 0) then 2412 call timing%start(timing%chi_sum_array_alloc) 2413 endif 2414 2415 ! copy the messega to the sending buffer for the next cycle 2416 ! 2417 if( .not. keep_fix_buf ) then 2418 SAFE_DEALLOCATE(buf_send) 2419 SAFE_ALLOCATE(buf_send, (nrec_row, nrec_col, kp%nspin)) 2420 end if 2421 ! 2422 do ispin = 1 , kp%nspin 2423!$OMP PARALLEL DO collapse(2) 2424 do ii = 1, nrec_col 2425 do jj = 1, nrec_row 2426 buf_send(jj,ii,ispin) = buf_rec(jj,ii,ispin) 2427 enddo 2428 enddo 2429 end do 2430 ! 2431 !XXX buf_send = buf_rec 2432 ! 2433 nsend_row = nrec_row 2434 nsend_col = nrec_col 2435 ! deallocate receiving buffer 2436 if( .not. keep_fix_buf ) then 2437 SAFE_DEALLOCATE(buf_rec) 2438 end if 2439 ! 2440 if(peinf%inode .eq. 0) then 2441 call timing%stop(timing%chi_sum_array_alloc) 2442 endif 2443 2444 end if 2445 2446 enddo ! ipe 2447 call progress_free(prog_info) 2448 2449 ! time for chi omega = 0 2450 if(peinf%inode .eq. 0) then 2451 call timing%stop(timing%chi_sum_sub_omega_0) 2452 endif 2453 2454 if(pol%nonblocking_cyclic) then 2455 ! done 2456 do ispin = 1 , kp%nspin 2457 !$OMP PARALLEL DO collapse(2) 2458 do ii = 1, scal%npc 2459 do jj = 1, scal%npr 2460 this%chilocal2(jj,ii,ispin) = buf_send(jj,ii,ispin) 2461 enddo 2462 enddo 2463 end do 2464 !XXXXX this%chilocal2(:,:,:) = buf_send(:,:,:) 2465 ! 2466 if(keep_fix_buf) then 2467 ! clean-up 2468 SAFE_DEALLOCATE(buf_send) 2469 SAFE_DEALLOCATE(buf_rec) 2470 SAFE_DEALLOCATE(buf_temp) 2471 ! 2472 SAFE_DEALLOCATE(tmprowindex) 2473 SAFE_DEALLOCATE(tmpcolindex) 2474 SAFE_DEALLOCATE(tmprowph) 2475 SAFE_DEALLOCATE(tmpcolph) 2476 ! 2477 SAFE_DEALLOCATE(this%chilocal) 2478 SAFE_DEALLOCATE(this%gmetempr) 2479 SAFE_DEALLOCATE(this%gmetempc) 2480 else 2481 ! standard case 2482 SAFE_DEALLOCATE(buf_send) 2483 end if 2484 end if 2485 2486 SAFE_ALLOCATE(chiRDyn_local, (scal%npr,scal%npc)) 2487 chiRDyn_local(:,:) = this%chilocal2(:,:,1) 2488 if(kp%nspin.eq.2) chiRDyn_local(:,:) = chiRDyn_local(:,:) + this%chilocal2(:,:,2) 2489 2490 ! make a copy of chi at omega zero (only in case full chi is needed) 2491 if(pol%need_full_chi) then 2492 SAFE_ALLOCATE(chi_omega0_save, (scal%npr,scal%npc,kp%nspin)) 2493 chi_omega0_save(:,:,:) = this%chilocal2(:,:,:) 2494 end if 2495 2496 SAFE_DEALLOCATE(this%chilocal2) 2497 2498 ! Generator Coulomb Interaction Array Vcoul 2499 SAFE_ALLOCATE(isorti, (gvec%ng)) 2500 SAFE_ALLOCATE(vcoul, (pol%nmtx)) 2501 2502 vcoul(:)=0.0d0 2503 do i=1,gvec%ng 2504 isorti(pol%isrtx(i)) = i 2505 end do 2506 2507 avgcut=TOL_ZERO 2508 iscreen=0 2509 nfk=product(kp%kgrid(1:3)) 2510 q0vec=0d0 2511 iparallel=1 2512 2513 if(peinf%inode .eq. 0) then 2514 call timing%start(timing%chi_sum_sub_vcoul) 2515 endif 2516 2517 2518 qgrid(:)=1 2519 2520 epsheaddummy=0.0d0 2521 wcoul0=0.0d0 2522 2523 if(pol%nfreq_group .eq. 1) then 2524 call vcoul_generator(pol%icutv,pol%truncval,gvec,crys%bdot,crys%celvol, & 2525 nfk,pol%nmtx,pol%isrtx,iscreen,q0,q0vec,vcoul, & 2526 pol%iwritecoul,iparallel,avgcut,oneoverq,qgrid,epsheaddummy, & 2527 work_scell,.false.,wcoul0) 2528 else 2529 call vcoul_generator(pol%icutv,pol%truncval,gvec,crys%bdot,crys%celvol, & 2530 nfk,pol%nmtx,pol%isrtx,iscreen,q0,q0vec,vcoul, & 2531 pol%iwritecoul,iparallel,avgcut,oneoverq,qgrid,epsheaddummy, & 2532 work_scell,.false.,wcoul0,nfreq_group=pol%nfreq_group) 2533 endif 2534 2535 if(peinf%inode .eq. 0) then 2536 call timing%stop(timing%chi_sum_sub_vcoul) 2537 endif 2538 2539 2540 ! calculate symmetrized polarizability 2541 do igp_l = 1, scal%npc 2542 igp_g = indxl2g(igp_l, scal%nbl, scal%mypcol, 0, scal%npcol) 2543 vc = sqrt(vcoul(igp_g)) 2544 do ig_l = 1, scal%npr 2545 ig_g = indxl2g(ig_l, scal%nbl, scal%myprow, 0, scal%nprow) 2546 chiRDyn_local(ig_l,igp_l) = chiRDyn_local(ig_l,igp_l) * sqrt(vcoul(ig_g)) * vc 2547 enddo 2548 enddo 2549 2550 ! Dump write/read matrices if needed 2551 if( do_dump_matrix ) then 2552 call dump_matrix(scal, chiRDyn_local, peinf%inode, 100000*iq) 2553 else 2554 if( do_read_dump_matrix ) then 2555 chiRDyn_local = ZERO 2556 call read_dump_matrix(scal, chiRDyn_local, peinf%inode, 100000*iq) 2557 end if 2558 end if 2559 2560 ! copy the symmetrized chi for omega = 0 only if we work within the subspace 2561 IF(.NOT.pol%need_full_chi) THEN 2562 SAFE_ALLOCATE(pol%chiRDyn_sym_omega0, (scal%npr,scal%npc)) 2563 pol%chiRDyn_sym_omega0(:,:) = chiRDyn_local(:,:) 2564 END IF 2565 2566 !XXXXXXXXXXXXXXXXx 2567 ! ALLOCATE(chilocal(pol%nmtx,pol%nmtx)) 2568 ! chilocal = ZERO 2569 ! icurr=0 2570 ! do i=1,pol%nmtx 2571 ! irow=MOD(INT(((i-1)/scal%nbl)+TOL_SMALL),scal%nprow) 2572 ! if(irow.ne.scal%myprow) cycle 2573 ! vc = SQRT(vcoul(i)) 2574 ! do j=1,pol%nmtx 2575 ! icol=MOD(INT(((j-1)/scal%nbl)+TOL_SMALL),scal%npcol) 2576 ! if(icol .eq. scal%mypcol) then 2577 ! icurr=icurr+1 2578 ! irowm=INT((icurr-1)/scal%npc+TOL_SMALL)+1 2579 ! icolm=MOD((icurr-1),scal%npc)+1 2580 2581 ! chilocal(i,j) = chiRDyn_local(irowm,icolm) 2582 ! endif 2583 ! end do 2584 ! end do 2585 ! CALL MPI_ALLREDUCE(MPI_IN_PLACE,chilocal(1,1),pol%nmtx*pol%nmtx,MPI_SCALAR,MPI_SUM,MPI_COMM_WORLD,mpierr) 2586 ! IF(peinf%inode.eq.0) THEN 2587 ! WRITE(3000,*) pol%nmtx 2588 ! DO j = 1, pol%nmtx 2589 ! DO i = 1, pol%nmtx 2590 ! WRITE(3000,*) i, j, chilocal(i,j) 2591 ! END DO 2592 ! END DO 2593 ! END IF 2594 ! DEALLOCATE(chilocal) 2595 !XXXXXXXXXXXXXXXXx 2596 2597 ! diagonalize 2598 if(peinf%inode .eq. 0) then 2599 call timing%start(timing%chi_sum_sub_diag) 2600 endif 2601 2602 SAFE_ALLOCATE(pol%eigenvect_omega0, (scal%npr,scal%npc)) 2603 SAFE_ALLOCATE(pol%eigenval_omega0, (pol%nmtx)) 2604 2605#ifdef USEELPA 2606#ifdef CPLX 2607 IF(pol%use_elpa) THEN 2608 CALL diagonalize_elpa(pol%nmtx, scal, chiRDyn_local, & 2609 pol%chi_eigenvalue_cutoff, pol%neig_sub_input, neig_sub, & 2610 pol%eigenvect_omega0, pol%eigenval_omega0) 2611 ELSE 2612#endif 2613#endif 2614 CALL diagonalize_scalapack(pol%nmtx, scal, chiRDyn_local, & 2615 pol%chi_eigenvalue_cutoff, pol%neig_sub_input, neig_sub, & 2616 pol%eigenvect_omega0, pol%eigenval_omega0) 2617#ifdef USEELPA 2618#ifdef CPLX 2619 END IF 2620#endif 2621#endif 2622 2623 SAFE_DEALLOCATE(chiRDyn_local) 2624 if(peinf%inode .eq. 0) then 2625 call timing%stop(timing%chi_sum_sub_diag) 2626 endif 2627 2628 2629#ifdef MPI 2630 call MPI_barrier(MPI_COMM_WORLD,mpierr) 2631#endif 2632 2633 ! here we have the eigenvector at omega=0, redistribute gme into scalapack format 2634 ! easy communication scheme 2635 CALL calc_pol_sub_trunc_easy(this,pol,scal,kp,kpq,vwfn,cwfn,& 2636 nst,nrk,indt,pht,gvec,crys,q0,& 2637 neig_sub, pol%eigenvect_omega0, pol%eigenval_omega0, vcoul) 2638 2639 ! replace the polarizability at omega zero with the non-truncated chi 2640 ! (only in case full chi is needed) 2641 if(pol%need_full_chi) then 2642 DO ispin = 1, kp%nspin 2643 pol%chiRDyn(:,:,1,ispin) = chi_omega0_save(:,:,ispin) 2644 END DO 2645 SAFE_DEALLOCATE(chi_omega0_save) 2646 ! in this case we don`t need eigenvector and eigenvalues 2647 SAFE_DEALLOCATE(pol%eigenvect_omega0) 2648 SAFE_DEALLOCATE(pol%eigenval_omega0) 2649 else 2650 ! this is the case in which we work only within the subspace (except for omega = 0) 2651 ! leave the possibility to do something 2652 ! save coulomb potential to be reused in epsinv 2653 SAFE_ALLOCATE(pol%vcoul_sub, (pol%nmtx)) 2654 pol%vcoul_sub(:) = vcoul(:) 2655 end if 2656#ifdef MPI 2657 call MPI_barrier(MPI_COMM_WORLD,mpierr) 2658#endif 2659 2660 POP_SUB(chi_summation_sub_trunc) 2661 2662 end subroutine chi_summation_sub_trunc 2663 2664#ifdef USEELPA 2665#ifdef CPLX 2666 subroutine diagonalize_elpa(nmtx, scal, matrix, chi_eigenvalue_cutoff, neig_sub_input,& 2667 neig_sub_out, eigen_vect, eigenval) 2668 integer, intent(in) :: nmtx 2669 type (scalapack), intent(in) :: scal 2670 SCALAR, intent(inout) :: matrix(scal%npr,scal%npc) 2671 SCALAR, intent(out) :: eigen_vect(scal%npr,scal%npc) 2672 real(DP), intent(out) :: eigenval(nmtx) 2673 real(DP) :: chi_eigenvalue_cutoff, max_abs_eigen 2674 INTEGER :: neig_sub_input, neig_sub_out 2675 2676 real(DP) :: scaling 2677 integer :: neig_sub, neig_sub_temp, i 2678 2679 PUSH_SUB(diagonalize_elpa) 2680 2681 ! if not given in input use 25% of eigenvectors (fixed number, move in input?) 2682 scaling = 0.25D+00 2683 neig_sub = MIN(nmtx,INT(nmtx/scaling)+1) 2684 IF(neig_sub_input > 0) THEN 2685 neig_sub = MIN(neig_sub_input,nmtx) 2686 END IF 2687 IF(peinf%inode .eq. 0) then 2688 write(6,'(1x,A)') 'Diagonalization with ELPA' 2689 write(6,'(1x,A,e8.2)') 'EPS Truncation = ', chi_eigenvalue_cutoff 2690 write(6,'(1x,A,F8.2)') 'Scaling = ', scaling 2691 write(6,'(1x,A,I8)') 'Scaled basis = ', neig_sub 2692 END IF 2693 2694 CALL diagonalize_elpa_complex(nmtx, neig_sub, scal, matrix, eigen_vect, eigenval) 2695 2696 ! truncate 2697 neig_sub_temp = 0 2698 max_abs_eigen = MAXVAL(ABS(eigenval)) 2699 DO i = 1, neig_sub 2700 IF(ABS(eigenval(i))/max_abs_eigen >= chi_eigenvalue_cutoff) neig_sub_temp = neig_sub_temp + 1 2701 ! IF(ABS(eigen(i)) >= chi_eigenvalue_cutoff) neig_sub_temp = neig_sub_temp + 1 2702 ! IF(peinf%inode .eq. 0) WRITE(2000,*) eigen(i) 2703 END DO 2704 neig_sub = neig_sub_temp 2705 IF(peinf%inode .eq. 0) then 2706 write(6,*) 'N eigen with relative MAX ABS value >= chi_eigenvalue_cutoff', neig_sub 2707 END IF 2708 neig_sub_out = neig_sub 2709 2710 POP_SUB(diagonalize_elpa) 2711 2712 end subroutine diagonalize_elpa 2713#endif 2714#endif 2715 2716 subroutine diagonalize_scalapack(nmtx, scal, matrix, chi_eigenvalue_cutoff, neig_sub_input,& 2717 neig_sub_out, eigen_vect, eigenval) 2718 integer, intent(in) :: nmtx 2719 type (scalapack), intent(in) :: scal 2720 SCALAR, intent(inout) :: matrix(scal%npr,scal%npc) 2721 complex(DPC), intent(out) :: eigen_vect(scal%npr,scal%npc) 2722 real(DP), intent(out) :: eigenval(nmtx) 2723 real(DP) :: chi_eigenvalue_cutoff 2724 INTEGER :: neig_sub_input, neig_sub_out 2725 2726 integer :: info, desca(9), lwork, lrwork, liwork, ipiv(scal%npr+scal%nbl) 2727 integer :: descb(9) 2728 integer, allocatable :: iwork(:) 2729 SCALAR, allocatable :: work(:) 2730 2731 SCALAR, allocatable :: work_2(:,:) 2732 DOUBLE PRECISION, ALLOCATABLE :: eigen(:), rwork(:) 2733 DOUBLE PRECISION :: vl, vu 2734 INTEGER :: iii, jjj, jj, i, j, icurr, irow, icol, irowm, icolm 2735 INTEGER :: neig_sub, neig_sub_temp 2736 DOUBLE PRECISION :: scaling, max_abs_eigen 2737 DOUBLE PRECISION :: ABSTOL, ORFAC 2738 DOUBLE PRECISION, ALLOCATABLE :: gap(:) 2739 INTEGER, ALLOCATABLE :: iclustr(:), ifail(:) 2740 INTEGER :: Neigen_found, Neigenvect_found, clustersize, locsize, nbc, nbr 2741 INTEGER :: nn, nnp, np0, mq0, nq0 2742 character(len=100) :: tmpstr 2743 logical :: find_eigen_in_range 2744 2745 PUSH_SUB(diagonalize_scalapack) 2746 2747 call descinit(desca, nmtx, nmtx, scal%nbl, scal%nbl, 0, 0, & 2748 scal%icntxt, max(1,scal%npr), info) 2749 if(info < 0) then 2750 write(0,'(a,i3,a)') 'Argument number ', -info, ' had an illegal value on entry.' 2751 call die("descinit error for descaA in inversion") 2752 else if(info > 0) then 2753 write(0,*) 'info = ', info 2754 call die("descinit error for descaA in inversion") 2755 endif 2756 2757 SAFE_ALLOCATE(work_2, (scal%npr,scal%npc)) 2758 work_2 = matrix 2759 2760 find_eigen_in_range = .TRUE. 2761 2762 scaling = 1.0D+00 2763 neig_sub = MIN(nmtx,INT(nmtx/scaling)+1) 2764 IF(neig_sub_input > 0) THEN 2765 neig_sub = MIN(neig_sub_input,nmtx) 2766 find_eigen_in_range = .FALSE. 2767 END IF 2768 IF(peinf%inode .eq. 0) then 2769 write(6,'(1x,A)') 'Diagonalization with Scalapack' 2770 write(6,'(1x,A,e8.2)') 'EPS Truncation = ', chi_eigenvalue_cutoff 2771 write(6,'(1x,A,F8.2)') 'Scaling = ', scaling 2772 write(6,'(1x,A,I8)') 'Scaled basis = ', neig_sub 2773 END IF 2774 2775 vl = -100.0D+00 2776 vu = -ABS(chi_eigenvalue_cutoff) 2777 ABSTOL = 0.0D+00 2778 ORFAC = 5.0D-7 2779 ! ABSTOL = 1.0D-7 2780 ! ORFAC = 5.0D-7 2781 Neigen_found = 0 2782 Neigenvect_found = 0 2783 2784 clustersize = MIN(50,neig_sub) 2785 ! clustersize = nmtx/MAX(INT(SQRT(DBLE(scal%npcol*scal%nprow))),1) 2786 nbc = nmtx/(scal%nbl * scal%npcol) + 1 2787 nbr = nmtx/(scal%nbl * scal%nprow) + 1 2788 locsize = scal%nbl * scal%nbl * nbc * nbr 2789 nn = MAX(nmtx, scal%nbl, 2) 2790 nnp = MAX(nmtx, scal%nprow*scal%npcol+1, 4) 2791 np0 = numroc(nn, scal%nbl, 0, 0, scal%nprow) 2792 mq0 = numroc(MAX(neig_sub,scal%nbl,2), scal%nbl, 0, 0, scal%npcol) 2793 2794#ifdef CPLX 2795 nq0 = numroc(nn, scal%nbl, 0, 0, scal%npcol) 2796 lwork = nmtx + (np0+nq0+scal%nbl)*scal%nbl 2797 lrwork = 4*nmtx + MAX(5*nn,np0*mq0) + iceil(neig_sub,scal%nprow*scal%npcol)*nn + & 2798 (clustersize-1)*nmtx 2799#else 2800 lwork = 5*nmtx + MAX(5*nn,np0*mq0+2*scal%nbl*scal%nbl) + & 2801 iceil(neig_sub,scal%nprow*scal%npcol)*nn + (clustersize-1)*nmtx 2802 lrwork = 0 2803#endif 2804 liwork = 6*nnp 2805 2806 SAFE_ALLOCATE(work, (lwork)) 2807#ifdef CPLX 2808 SAFE_ALLOCATE(rwork, (lrwork)) 2809#endif 2810 SAFE_ALLOCATE(iwork, (liwork)) 2811 SAFE_ALLOCATE(iclustr, (2*scal%nprow*scal%npcol)) 2812 SAFE_ALLOCATE(gap, (scal%nprow*scal%npcol)) 2813 SAFE_ALLOCATE(ifail, (nmtx)) 2814 SAFE_ALLOCATE(eigen, (nmtx)) 2815 eigen = 0.0D+00 2816 2817 info = 0 2818#ifdef CPLX 2819 IF(find_eigen_in_range) THEN 2820 CALL pzheevx('V', 'V', 'U', nmtx, work_2, 1, 1, desca, vl, vu, 1, neig_sub, ABSTOL, Neigen_found, Neigenvect_found, & 2821 eigen, ORFAC, matrix, 1, 1, desca, work, lwork, rwork, lrwork, iwork, liwork, ifail, iclustr, & 2822 gap, info) 2823 ELSE 2824 CALL pzheevx('V', 'I', 'U', nmtx, work_2, 1, 1, desca, vl, vu, 1, neig_sub, ABSTOL, Neigen_found, Neigenvect_found, & 2825 eigen, ORFAC, matrix, 1, 1, desca, work, lwork, rwork, lrwork, iwork, liwork, ifail, iclustr, & 2826 gap, info) 2827 END IF 2828#else 2829 CALL pdsyevx('V', 'I', 'U', nmtx, work_2, 1, 1, desca, vl, vu, 1, neig_sub, ABSTOL, Neigen_found, Neigenvect_found, & 2830 eigen, ORFAC, matrix, 1, 1, desca, work, lwork, iwork, liwork, ifail, iclustr, & 2831 gap, info) 2832#endif 2833 2834 IF(info/=0) THEN 2835 IF(info < 0) THEN 2836 call die(" error in parameters for pzheevx/pdsyevx") 2837 ELSE 2838 ! Source: http://www.netlib.org/lapack-dev/Patch/SRC/pzheevx.f 2839 ! if (MOD(INFO,2).NE.0), then one or more eigenvectors failed to converge. Their indices are stored 2840 ! in IFAIL. Ensure ABSTOL=2.0*PDLAMCH( 'U' ). Send e-mail to scalapack@cs.utk.edu 2841 if(mod(info, 2) /= 0) then 2842 if(peinf%inode == 0) then 2843 write(0,*) " info = ", info 2844 write(0,'(a)',advance='no') "Unconverged eigenvalues from ifail: " 2845 do jj = 1, size(ifail) 2846 if(ifail(jj) == 0) exit 2847 write(0,'(i8)',advance='no') ifail(jj) 2848 enddo 2849 endif 2850 call die("Convergence problems in pzheevx/pdsyevx.") 2851 endif 2852 2853 ! if (MOD(INFO/2,2).NE.0),then eigenvectors corresponding to one or more clusters of eigenvalues could not be 2854 ! reorthogonalized because of insufficient workspace. The indices of the clusters are stored in the array ICLUSTR. 2855 if(mod(info / 2, 2) /= 0) then 2856 if(peinf%inode == 0) then 2857 write(0,*) " info = ", info 2858 do jj = 1, size(iclustr) 2859 if(iclustr(jj) == 0) exit 2860 write(0,'(i8)',advance='no') iclustr(jj) 2861 enddo 2862 endif 2863 call die("Could not reorthogonalize due to insufficient workspace in pzheevx/pdsyevx.") 2864 endif 2865 2866 ! if (MOD(INFO/4,2).NE.0), then space limit prevented PZHEEVX from computing all of the eigenvectors 2867 ! between VL and VU. The number of eigenvectors computed is returned in NZ. 2868 if(mod(info / 4, 2) /= 0) then 2869 if(peinf%inode == 0) then 2870 write(0,*) " info = ", info 2871 write(0,*) " number eigenvalues found = ", Neigenvect_found 2872 endif 2873 call die("Space limit prevented computing all eigenvectors in pzheevx/pdsyevx.") 2874 endif 2875 2876 ! if (MOD(INFO/8,2).NE.0), then PZSTEBZ failed to compute eigenvalues. Ensure ABSTOL=2.0*PDLAMCH( 'U' ) 2877 ! Send e-mail to scalapack@cs.utk.edu 2878 if(mod(info / 8, 2) /= 0) then 2879 if(peinf%inode == 0) then 2880 write(0,*) " info = ", info 2881 endif 2882 call die("PZSTEBZ failed to compute eigenvalues in pzheevx/pdsyevx.") 2883 endif 2884 END IF 2885 END IF 2886 2887 SAFE_DEALLOCATE(work) 2888#ifdef CPLX 2889 SAFE_DEALLOCATE(rwork) 2890#endif 2891 SAFE_DEALLOCATE(iwork) 2892 SAFE_DEALLOCATE(iclustr) 2893 SAFE_DEALLOCATE(gap) 2894 SAFE_DEALLOCATE(ifail) 2895 2896 SAFE_DEALLOCATE(work_2) 2897 2898 IF(Neigen_found/=neig_sub) THEN 2899 neig_sub = Neigen_found 2900 ! print a warning 2901 IF(peinf%inode .eq. 0) then 2902 write(6,*) 'Eigenvalue found = ', neig_sub 2903 END IF 2904 END IF 2905 IF(Neigenvect_found < neig_sub) THEN 2906 write(tmpstr,'(a, i10, a, i10, a)') 'Diagonalization with pzheevx/pdsyevx failed: only ', & 2907 Neigenvect_found, ' of ', neig_sub, ' eigenvectors found.' 2908 call die(tmpstr) 2909 END IF 2910 2911 ! truncate 2912 neig_sub_temp = 0 2913 max_abs_eigen = MAXVAL(ABS(eigen)) 2914 DO i = 1, neig_sub 2915 IF(ABS(eigen(i))/max_abs_eigen >= chi_eigenvalue_cutoff) neig_sub_temp = neig_sub_temp + 1 2916 ! IF(ABS(eigen(i)) >= chi_eigenvalue_cutoff) neig_sub_temp = neig_sub_temp + 1 2917 ! IF(peinf%inode .eq. 0) WRITE(2000,*) eigen(i) 2918 END DO 2919 neig_sub = neig_sub_temp 2920 2921 IF(peinf%inode .eq. 0) then 2922 write(6,*) 'N eigen with relative MAX ABS value >= chi_eigenvalue_cutoff', neig_sub 2923 END IF 2924 neig_sub_out = neig_sub 2925 2926 ! matrix at this point contains the eigenvectors contain here the e 2927 ! make a copy into eigenvect (eigenvectors are defined complex) 2928 eigen_vect = (0D0,0D0) 2929 eigen_vect = matrix 2930 eigenval = eigen 2931 2932 SAFE_DEALLOCATE(eigen) 2933 2934 POP_SUB(diagonalize_scalapack) 2935 2936 end subroutine diagonalize_scalapack 2937 2938 ! This is going to be the same communication scheme as comm_matrix 2939 ! for which the eigenvector of the symmetrized polarizability are 2940 ! replicated on each MPI task 2941 subroutine calc_pol_sub_trunc_easy(this,pol,scal,kp,kpq,vwfn,cwfn,& 2942 nst,nrk,indt,pht,gvec,crys,q0,& 2943 neig_sub, eigen_vect, eigenval, vcoul) 2944 type(chi_summator_t), intent(INOUT) :: this 2945 type(polarizability), intent(INOUT) :: pol 2946 type(scalapack), intent(in) :: scal 2947 type(kpoints), intent(IN) :: kp, kpq 2948 type(valence_wfns), intent(IN) :: vwfn 2949 type(conduction_wfns), intent(IN) :: cwfn 2950 INTEGER :: neig_sub 2951 complex(DPC), allocatable :: eigen_vect(:,:) 2952 real(DP), allocatable :: eigenval(:) 2953 real(DP), allocatable :: vcoul(:) 2954 2955 integer, intent(IN) :: nst(:) 2956 integer, intent(IN) :: nrk 2957 integer, intent(INOUT) :: indt(:,:,:) 2958 SCALAR, intent(INOUT) :: pht(:,:,:) 2959 type (gspace), intent(in) :: gvec 2960 type (crystal), intent(in) :: crys 2961 real(DP), intent(in) :: q0(3) 2962 real(DP) :: zvalue, cv_energy 2963 complex(DPC), allocatable :: edenDRtemp(:) 2964 2965 INTEGER :: ntot, ntot2 2966 INTEGER :: irk, ipe, ipe_row, ipe_col, ipe_real, ispin, itot, it, iv, ic, i, j, iv_local 2967 INTEGER :: irow, icol, irowm, icolm, icurr, ilimit, freq_idx, & 2968 i_myband, mytot 2969 2970 integer, allocatable :: tmprowindex(:),tmpcolindex(:) 2971 SCALAR, allocatable :: tmprowph(:),tmpcolph(:) 2972 type(cvpair_info) :: cvpair_temp 2973 complex(DPC) :: negfact 2974 2975 SCALAR :: epsheaddummy, wcoul0 2976 type (twork_scell) :: work_scell 2977 integer, allocatable :: isorti(:) 2978 real(DP) :: vc, oneoverq, avgcut 2979 integer :: iscreen, nfk, iparallel 2980 integer :: qgrid(3) 2981 real(DP) :: q0vec(3) 2982 2983 type(progress_info) :: prog_info 2984 2985 INTEGER :: n_vc_tot, n_v_tot, n_c_tot, nmtx, kk, jj, ii, info 2986 INTEGER :: my_sub_nlocal_col, my_sub_nlocal_row, itot_rk2, & 2987 irow_global, icol_global 2988 2989 complex(DPC), allocatable :: C_aux(:,:), gme_sub(:,:,:,:,:), gme_temp(:,:,:) 2990 complex(DPC), allocatable :: C_Pgemm(:,:) 2991 integer, allocatable :: grid2D_2inode(:,:), row_col_loc(:,:), inode_2grid2D(:,:) 2992 INTEGER :: pe_col_size, pe_row_size, desc_sub(9), desca(9) 2993 INTEGER, ALLOCATABLE :: row_indx(:), col_indx(:) 2994 ! variables for block transformation 2995 integer :: iproc_dum 2996 integer :: nmpinode 2997 integer :: iblock, block_size_max, ib_start, ib_end, ib_size, Nblocks 2998 real(DP) :: mem, avail_mem, mem_offset, mem_single_block 2999 ! variables for non-blocking cyclic scheme 3000 integer :: isend_static, irec_static 3001 integer :: actual_send, actual_rec 3002 integer :: nsend_row, nsend_col, nrec_row, nrec_col 3003 integer :: req_send, tag_send, req_rec, tag_rec 3004 integer :: stat(MPI_STATUS_SIZE) 3005 complex(DPC), allocatable :: buf_send(:,:,:,:), buf_rec(:,:,:,:), buf_temp(:,:,:,:) 3006 integer :: size_N, size_M, size_K 3007 ! variables for fast eigenvectors communication 3008 logical :: fast_eig_comm 3009 complex(DPC), allocatable :: C_elem(:), C_elem_send(:), C_elem_rec(:) 3010 real(DP), allocatable :: sqrt_vcoul_local(:) 3011 integer, allocatable :: elem_per_block(:) 3012 integer, allocatable :: glob_C_indec(:,:) 3013 integer, allocatable :: glob_C_indec_send(:,:), glob_C_indec_rec(:,:) 3014 integer :: num_loc_col_within_Neig, max_num_elem, iii, jjj 3015 integer :: req_send_info, tag_send_info, req_rec_info, tag_rec_info 3016 3017 PUSH_SUB(calc_pol_sub_trunc_easy) 3018 3019 fast_eig_comm = .true. 3020 if( pol%sub_collective_eigen_redistr ) then 3021 ! slow redistribution scheme, for debug only 3022 fast_eig_comm = .false. 3023 end if 3024 3025 ! precompute the dimensions of the different objects 3026 ntot=0 3027 ntot2=0 3028 ntot = peinf%nvownactual*peinf%ncownactual 3029 do irk = 1, nrk 3030 ntot2=ntot2 + nst(irk) 3031 enddo 3032 ntot=ntot*ntot2 3033 3034 ! WRITE(*,*) ntot2, nrk, maxval(nst), minval(nst) 3035 3036 n_c_tot = cwfn%nband 3037 n_v_tot = vwfn%nband+pol%ncrit 3038 n_vc_tot = n_c_tot * n_v_tot 3039 nmtx = pol%nmtx 3040 3041 ! first replicate the eigenvector of the symmetrized polarizability at omega=0 3042 ! on all processes (do it by blocks) 3043 ! calculate block size according to available memory 3044 call procmem(mem,nmpinode) 3045 avail_mem = mem 3046 mem_offset = dble(neig_sub) * dble(peinf%ncownmax) * dble(peinf%nvownmax) * & 3047 dble(ntot2) * dble(kp%nspin) * 16D+00 3048 mem_single_block = MAX(dble(neig_sub)*2.0D+00, dble(neig_sub + peinf%ncownmax*peinf%nvownmax)) * 16D+00 3049 !XXXXX factor 2 (probably the allreduce inplace allocate internaly an additional buffer) 3050 mem_single_block = mem_single_block * 2.0D+00 3051 !XXXXX 3052 block_size_max = int((avail_mem - mem_offset) / mem_single_block) 3053 block_size_max = MAX(block_size_max,1) 3054 block_size_max = MIN(block_size_max,nmtx) 3055 if(peinf%inode .eq. 0) then 3056 write(6,*) 3057 write(6,'(1x,"Basis Transformation info:")') 3058 write(6,'(1x,"Memory available: ",f0.1," MB per PE")') avail_mem / 1024**2 3059 write(6,'(1x,"Memory offset: ",f0.1," MB")') mem_offset / 1024**2 3060 write(6,'(1x,"Memory for a sigle block: ",f0.1," kB")') mem_single_block / 1024 3061 write(6,'(1x,"Max Block Size: ",i0)') block_size_max 3062 end if 3063 ! broadcast the result to make sure all processes have same value 3064#ifdef MPI 3065 call MPI_Bcast(block_size_max, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, mpierr) 3066#endif 3067 Nblocks = (nmtx-1) / block_size_max + 1 3068 SAFE_ALLOCATE(gme_sub, (neig_sub,peinf%ncownmax,peinf%nvownmax,ntot2,kp%nspin)) 3069 gme_sub = ZERO 3070 3071 ! initialize some stuff for fast_eig_comm 3072 if ( fast_eig_comm ) then 3073 ! calculate how many local elements have a global index smaller than Neig 3074 num_loc_col_within_Neig = 0 3075 do icolm = 1, scal%npc 3076 j = INDXL2G (icolm, scal%nbl, scal%mypcol, 0, scal%npcol) 3077 if ( j <= neig_sub ) then 3078 num_loc_col_within_Neig = num_loc_col_within_Neig + 1 3079 end if 3080 end do 3081 ! figure the biggest number of elements will be exchanged and the actual 3082 ! number of elements for each block 3083 SAFE_ALLOCATE(elem_per_block, (Nblocks)) 3084 elem_per_block = 0 3085 iii = 0 3086 do iblock = 1, nmtx, block_size_max 3087 ib_start = iblock 3088 ib_end = ib_start + block_size_max - 1 3089 ib_end = MIN(ib_end, nmtx) 3090 ib_size = ib_end - ib_start + 1 3091 ! increase block index 3092 iii = iii + 1 3093 ! compute how many rows are in range and [ib_start,ib_end] and 3094 ! accumulate relative number of cols 3095 do i = ib_start, ib_end 3096 irow = INDXG2P( i, scal%nbl, iproc_dum, 0, scal%nprow) 3097 if ( irow .ne. scal%myprow) cycle 3098 elem_per_block(iii) = elem_per_block(iii) + num_loc_col_within_Neig 3099 end do 3100 end do 3101 max_num_elem = MAXVAL(elem_per_block(:)) 3102 call mpi_allreduce(MPI_IN_PLACE, max_num_elem, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, mpierr) 3103 ! allocate communication buffers 3104 SAFE_ALLOCATE(C_elem, (max_num_elem)) 3105 SAFE_ALLOCATE(C_elem_send, (max_num_elem)) 3106 SAFE_ALLOCATE(C_elem_rec, (max_num_elem)) 3107 SAFE_ALLOCATE(glob_C_indec, (2,max_num_elem+1)) 3108 SAFE_ALLOCATE(glob_C_indec_send, (2,max_num_elem+1)) 3109 SAFE_ALLOCATE(glob_C_indec_rec, (2,max_num_elem+1)) 3110 ! precoumpute SQRT of coulomb potential 3111 SAFE_ALLOCATE(sqrt_vcoul_local, (scal%npr)) 3112 do irowm = 1, scal%npr 3113 i = INDXL2G (irowm, scal%nbl, scal%myprow, 0, scal%nprow) 3114 sqrt_vcoul_local(irowm) = SQRT(vcoul(i)) 3115 end do 3116 ! get process for communication 3117 isend_static = MOD(peinf%inode + 1 + peinf%npes, peinf%npes) 3118 irec_static = MOD(peinf%inode - 1 + peinf%npes, peinf%npes) 3119 end if 3120 3121 call progress_init(prog_info, 'transforming basis', 'block', Nblocks) 3122 icurr=0 3123 iii = 0 3124 do iblock = 1, nmtx, block_size_max 3125 call progress_step(prog_info) 3126 ! actual block indeces 3127 ib_start = iblock 3128 ib_end = ib_start + block_size_max - 1 3129 ib_end = MIN(ib_end, nmtx) 3130 ib_size = ib_end - ib_start + 1 3131 ! allocate C_aux 3132 SAFE_ALLOCATE(C_aux, (ib_size,neig_sub)) 3133 C_aux = ZERO 3134 3135 if ( fast_eig_comm ) then 3136 ! increase block index 3137 iii = iii + 1 3138 ! copy information into communication buffers 3139 jjj = 0 3140 do i = ib_start, ib_end 3141 irow = INDXG2P( i, scal%nbl, iproc_dum, 0, scal%nprow) 3142 if ( irow .ne. scal%myprow) cycle 3143 irowm = INDXG2L(i, scal%nbl, scal%myprow, 0, scal%nprow) 3144 vc = sqrt_vcoul_local(irowm) 3145 do icolm = 1, scal%npc 3146 j = INDXL2G (icolm, scal%nbl, scal%mypcol, 0, scal%npcol) 3147 if ( j <= neig_sub ) then 3148 jjj = jjj + 1 3149 C_elem(jjj) = eigen_vect(irowm,icolm) * vc 3150 glob_C_indec(1,jjj) = i 3151 glob_C_indec(2,jjj) = j 3152 end if 3153 end do 3154 end do 3155 glob_C_indec(1,max_num_elem+1) = elem_per_block(iii) 3156 3157 !XXX if ( jjj .ne. elem_per_block(iii) ) STOP 3158 3159 ! loop over process and start ring communication 3160 glob_C_indec_send(:,:) = glob_C_indec(:,:) 3161 C_elem_send(:) = C_elem(:) 3162 3163 if(peinf%inode .eq. 0) then 3164 call timing%start(timing%chi_sum_sub_eigvet_comm) 3165 endif 3166 3167 do ipe = 1, peinf%npes 3168 ! 3169 tag_rec_info = 1 3170 tag_send_info = 1 3171 req_rec_info = MPI_REQUEST_NULL 3172 req_send_info = MPI_REQUEST_NULL 3173 CALL MPI_Irecv(glob_C_indec_rec, 2*(max_num_elem+1), & 3174 MPI_INTEGER, irec_static, & 3175 tag_rec_info, MPI_COMM_WORLD, req_rec_info, mpierr) 3176 CALL MPI_Isend(glob_C_indec_send, 2*(max_num_elem+1), & 3177 MPI_INTEGER, isend_static, & 3178 tag_send_info, MPI_COMM_WORLD, req_send_info, mpierr) 3179 ! 3180 tag_rec = 1 3181 tag_send = 1 3182 req_rec = MPI_REQUEST_NULL 3183 req_send = MPI_REQUEST_NULL 3184 CALL MPI_Irecv(C_elem_rec, max_num_elem, & 3185 MPI_COMPLEX_DPC, irec_static, & 3186 tag_rec, MPI_COMM_WORLD, req_rec, mpierr) 3187 CALL MPI_Isend(C_elem_send, max_num_elem, & 3188 MPI_COMPLEX_DPC, isend_static, & 3189 tag_send, MPI_COMM_WORLD, req_send, mpierr) 3190 3191 3192 do jjj = 1, glob_C_indec(1,max_num_elem+1) 3193 i = glob_C_indec(1,jjj) 3194 j = glob_C_indec(2,jjj) 3195 C_aux(i-ib_start+1,j) = C_elem(jjj) 3196 end do 3197 3198 ! make sure the buffer is received/send 3199 CALL MPI_Wait(req_rec_info,stat,mpierr) 3200 CALL MPI_Wait(req_send_info,stat,mpierr) 3201 ! 3202 glob_C_indec_send(:,:) = glob_C_indec_rec(:,:) 3203 glob_C_indec(:,:) = glob_C_indec_rec(:,:) 3204 ! 3205 CALL MPI_Wait(req_rec,stat,mpierr) 3206 CALL MPI_Wait(req_send,stat,mpierr) 3207 ! 3208 C_elem_send(:) = C_elem_rec(:) 3209 C_elem(:) = C_elem_rec(:) 3210 ! 3211 end do 3212 if(peinf%inode .eq. 0) then 3213 call timing%stop(timing%chi_sum_sub_eigvet_comm) 3214 endif 3215 3216 else ! fast_eig_comm 3217 ! 3218 if ( .true. ) then 3219 ! 3220 do i = ib_start, ib_end 3221 irow = INDXG2P( i, scal%nbl, iproc_dum, 0, scal%nprow) 3222 if ( irow .ne. scal%myprow) cycle 3223 vc = SQRT(vcoul(i)) 3224 irowm = INDXG2L(i, scal%nbl, scal%myprow, 0, scal%nprow) 3225 ! 3226 do icolm = 1, scal%npc 3227 j = INDXL2G (icolm, scal%nbl, scal%mypcol, 0, scal%npcol) 3228 if ( j <= neig_sub ) then 3229 C_aux(i-ib_start+1,j) = eigen_vect(irowm,icolm) * vc 3230 end if 3231 end do 3232 ! 3233 end do 3234 ! 3235 else 3236 ! 3237 ! old and slow 3238 do i = ib_start, ib_end 3239 irow=MOD(INT(((i-1)/scal%nbl)+TOL_SMALL),scal%nprow) 3240 if(irow.ne.scal%myprow) cycle 3241 vc = SQRT(vcoul(i)) 3242 do j=1, nmtx 3243 icol=MOD(INT(((j-1)/scal%nbl)+TOL_SMALL),scal%npcol) 3244 if(icol .eq. scal%mypcol) then 3245 icurr=icurr+1 3246 irowm=INT((icurr-1)/scal%npc+TOL_SMALL)+1 3247 icolm=MOD((icurr-1),scal%npc)+1 3248 3249 IF(j<=neig_sub) THEN 3250 C_aux(i-ib_start+1,j) = eigen_vect(irowm,icolm) * vc 3251 END IF 3252 3253 endif 3254 end do 3255 end do 3256 ! 3257 end if 3258 ! WRITE(*,*) nmtx,neig_sub 3259 if(peinf%inode .eq. 0) then 3260 call timing%start(timing%chi_sum_sub_eigvet_comm) 3261 endif 3262 3263#ifdef MPI 3264 CALL MPI_ALLREDUCE(MPI_IN_PLACE,C_aux(1,1),ib_size*neig_sub,MPI_COMPLEX_DPC,MPI_SUM,MPI_COMM_WORLD,mpierr) 3265#endif 3266 if(peinf%inode .eq. 0) then 3267 call timing%stop(timing%chi_sum_sub_eigvet_comm) 3268 endif 3269 3270 ! 3271 end if ! fast_eig_comm 3272 3273 ! transform basis 3274 if(peinf%inode .eq. 0) then 3275 call timing%start(timing%chi_sum_sub_transf) 3276 endif 3277 3278 SAFE_ALLOCATE(gme_temp, (ib_size,peinf%ncownmax,peinf%nvownmax)) 3279 gme_temp = ZERO 3280 do ispin = 1 , kp%nspin 3281 itot_rk2 = 0 3282 do irk = 1, nrk 3283 do it = 1, nst(irk) 3284 itot_rk2 = itot_rk2 + 1 3285 3286 gme_temp = ZERO 3287 !$OMP PARALLEL DO 3288 do i = ib_start, ib_end 3289 gme_temp(i-ib_start+1,1:peinf%ncownactual,1:peinf%nvownactual) = & 3290 pol%gme(indt(i,it,irk),1:peinf%ncownactual,1:peinf%nvownactual,ispin,irk,1) * & 3291 pht(i,it,irk) 3292 end do 3293 !$OMP END PARALLEL DO 3294 3295 do iv = 1, vwfn%nband+pol%ncrit 3296 ! check if I own this partuicular band 3297 if (peinf%doiownv(iv)) then 3298 ilimit = peinf%ncownactual 3299 else 3300 cycle 3301 endif 3302 iv_local = peinf%indexv(iv) 3303 3304 call zgemm('c','n',neig_sub,peinf%ncownactual,ib_size, & 3305 (1D0,0D0),C_aux(1:ib_size,1:neig_sub),ib_size,& 3306 gme_temp(1:ib_size,1:peinf%ncownactual,peinf%indexv(iv)),ib_size,& 3307 (1D0,0D0),gme_sub(1:neig_sub,1:peinf%ncownactual,peinf%indexv(iv),itot_rk2,ispin),neig_sub) 3308 3309 end do 3310 3311 end do 3312 end do 3313 end do 3314 SAFE_DEALLOCATE(gme_temp) 3315 SAFE_DEALLOCATE(C_aux) 3316 if(peinf%inode .eq. 0) then 3317 call timing%stop(timing%chi_sum_sub_transf) 3318 endif 3319 3320 end do !iblock 3321 call progress_free(prog_info) 3322 SAFE_DEALLOCATE_P(pol%gme) 3323 ! this is going to be deallocated outside 3324 SAFE_ALLOCATE(pol%gme, (1,1,1,1,1,1)) 3325 3326 if ( fast_eig_comm ) then 3327 SAFE_DEALLOCATE(elem_per_block) 3328 SAFE_DEALLOCATE(C_elem) 3329 SAFE_DEALLOCATE(C_elem_send) 3330 SAFE_DEALLOCATE(C_elem_rec) 3331 SAFE_DEALLOCATE(glob_C_indec) 3332 SAFE_DEALLOCATE(glob_C_indec_send) 3333 SAFE_DEALLOCATE(glob_C_indec_rec) 3334 SAFE_DEALLOCATE(sqrt_vcoul_local) 3335 end if 3336 3337 3338 !XXXXX 3339 ! initialize the descriptor for the subspace matrix 3340 my_sub_nlocal_row = numroc(neig_sub, scal%nbl, scal%myprow, 0, scal%nprow) 3341 my_sub_nlocal_col = numroc(neig_sub, scal%nbl, scal%mypcol, 0, scal%npcol) 3342 call descinit(desc_sub, neig_sub, neig_sub, scal%nbl, scal%nbl, 0, 0, & 3343 scal%icntxt, max(1,my_sub_nlocal_row), info) 3344 if(info < 0) then 3345 write(0,'(a,i3,a)') 'Argument number ', -info, ' had an illegal value on entry.' 3346 call die("descinit error for desca sub in sub_chi_sum") 3347 end if 3348 ! exchange some info 3349 SAFE_ALLOCATE(row_col_loc, (2,peinf%npes)) 3350 SAFE_ALLOCATE(grid2D_2inode, (scal%nprow,scal%npcol)) 3351 SAFE_ALLOCATE(inode_2grid2D, (2,peinf%npes)) 3352 row_col_loc = 0 3353 grid2D_2inode = 0 3354 inode_2grid2D = 0 3355 row_col_loc(1,peinf%inode+1) = my_sub_nlocal_row 3356 row_col_loc(2,peinf%inode+1) = my_sub_nlocal_col 3357 grid2D_2inode(scal%myprow+1,scal%mypcol+1) = peinf%inode 3358 inode_2grid2D(1,peinf%inode+1) = scal%myprow 3359 inode_2grid2D(2,peinf%inode+1) = scal%mypcol 3360#ifdef MPI 3361 CALL MPI_ALLREDUCE(MPI_IN_PLACE,row_col_loc(1,1),2*peinf%npes,MPI_INTEGER,MPI_SUM,MPI_COMM_WORLD,mpierr) 3362 CALL MPI_ALLREDUCE(MPI_IN_PLACE,grid2D_2inode(1,1),scal%nprow*scal%npcol,MPI_INTEGER,MPI_SUM,MPI_COMM_WORLD,mpierr) 3363 CALL MPI_ALLREDUCE(MPI_IN_PLACE,inode_2grid2D(1,1),2*peinf%npes,MPI_INTEGER,MPI_SUM,MPI_COMM_WORLD,mpierr) 3364#endif 3365 !XXXXX 3366 3367 freq_idx = pol%nfreq_group 3368 IF(pol%nfreq_group .gt. 1) THEN 3369 freq_idx = peinf%rank_mtxel+1 3370 END IF 3371 3372 ! avoid allocate zero size array (for mpi reduce?) 3373 SAFE_ALLOCATE(this%chilocal2RDyn, (MAX(1,my_sub_nlocal_row),MAX(1,my_sub_nlocal_col),pol%nfreq_in_group,kp%nspin)) 3374 3375 negfact = -1D0*this%fact 3376 3377 ! setup non-blocking cyclic communication (directly keep fix allocated buffers) 3378 if(pol%nonblocking_cyclic) then 3379 ! initialize buffer for non-blocking cyclic scheme 3380 ! static process for the communication 3381 isend_static = MOD(peinf%inode + 1 + peinf%npes, peinf%npes) 3382 irec_static = MOD(peinf%inode - 1 + peinf%npes, peinf%npes) 3383 ! get sizes 3384 size_N = my_sub_nlocal_row 3385 size_M = my_sub_nlocal_col 3386 size_K = ntot 3387 call mpi_allreduce(MPI_IN_PLACE, size_N, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, mpierr) 3388 call mpi_allreduce(MPI_IN_PLACE, size_M, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, mpierr) 3389 call mpi_allreduce(MPI_IN_PLACE, size_K, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, mpierr) 3390 ! allocate only once 3391 SAFE_ALLOCATE(buf_send, (size_N, size_M, pol%nfreq_in_group, kp%nspin)) 3392 SAFE_ALLOCATE(buf_rec, (size_N, size_M, pol%nfreq_in_group, kp%nspin)) 3393 SAFE_ALLOCATE(buf_temp, (size_N, size_M, pol%nfreq_in_group, kp%nspin)) 3394 ! 3395 do ispin = 1 , kp%nspin 3396 do kk = 1, pol%nfreq_in_group 3397 ! 3398 !$OMP PARALLEL DO collapse(2) 3399 do ii = 1, size_M 3400 do jj = 1, size_N 3401 buf_send(jj,ii,kk,ispin) = (0.0D+00,0.0D+00) 3402 enddo 3403 enddo 3404 !$OMP PARALLEL DO collapse(2) 3405 do ii = 1, size_M 3406 do jj = 1, size_N 3407 buf_rec(jj,ii,kk,ispin) = (0.0D+00,0.0D+00) 3408 end do 3409 end do 3410 !$OMP PARALLEL DO collapse(2) 3411 do ii = 1, size_M 3412 do jj = 1, size_N 3413 buf_temp(jj,ii,kk,ispin) = (0.0D+00,0.0D+00) 3414 end do 3415 end do 3416 ! 3417 end do 3418 end do 3419 ! 3420 ! allocate for matrix multiplication 3421 SAFE_ALLOCATE(this%chilocalRDyn, (size_N, size_M, pol%nfreq_in_group)) ! (pe_row_size,pe_col_size,pol%nfreq_in_group)) 3422 SAFE_ALLOCATE(this%gmeRDyntempr2, (size_N, size_K, pol%nfreq_in_group)) ! (pe_row_size,ntot,pol%nfreq_in_group)) 3423 SAFE_ALLOCATE(this%gmeRDyntempc, (size_K, size_M)) ! (ntot,pe_col_size)) 3424 ! 3425 do kk = 1, pol%nfreq_in_group 3426 !$OMP PARALLEL DO collapse(2) 3427 do ii = 1, size_M 3428 do jj = 1, size_N 3429 this%chilocalRDyn(jj,ii,kk) = (0.0D+00,0.0D+00) 3430 end do 3431 end do 3432 !$OMP PARALLEL DO collapse(2) 3433 do ii = 1, size_K 3434 do jj = 1, size_N 3435 this%gmeRDyntempr2(jj,ii,kk) = (0.0D+00,0.0D+00) 3436 end do 3437 end do 3438 end do 3439 !$OMP PARALLEL DO collapse(2) 3440 do ii = 1, size_M 3441 do jj = 1, size_K 3442 this%gmeRDyntempc(jj,ii) = (0.0D+00,0.0D+00) 3443 end do 3444 end do 3445 ! 3446 end if 3447 3448 3449 ! total time for omega neq zero 3450 if(peinf%inode .eq. 0) then 3451 call timing%start(timing%chi_sum_sub_omega_neq_0) 3452 endif 3453 3454 3455 call progress_init(prog_info, 'building polarizability matrix within the subspace of omega=0', 'processor', & 3456 peinf%npes) 3457 ! go with the big loop over processes calculations 3458 DO ipe = 1, peinf%npes 3459 call progress_step(prog_info) 3460 ! 3461 ! in the standard case there is no difference between ipe_real and ipe 3462 ! this only matters for pol%nonblocking_cyclic 3463 ipe_real = ipe 3464 ! 3465 if(pol%nonblocking_cyclic) then 3466 ! calculate the actual process we have to send and we are receiving 3467 actual_send = MOD(peinf%inode + ipe + peinf%npes, peinf%npes) 3468 actual_rec = MOD(peinf%inode - ipe + peinf%npes, peinf%npes) 3469 ! fixed size 3470 nsend_row = size_N 3471 nrec_row = size_N 3472 nsend_col = size_M 3473 nrec_col = size_M 3474 ! 3475 do ispin = 1 , kp%nspin 3476 do kk = 1, pol%nfreq_in_group 3477 !$OMP PARALLEL DO collapse(2) 3478 do ii = 1, nrec_col 3479 do jj = 1, nrec_row 3480 buf_rec(jj,ii,kk,ispin) = ZERO 3481 enddo 3482 enddo 3483 end do 3484 end do 3485 ! 3486 ! post message 3487 if(peinf%inode .eq. 0) then 3488 call timing%start(timing%chi_sum_comm) 3489 endif 3490 3491 tag_rec = 1 3492 tag_send = 1 3493 req_rec = MPI_REQUEST_NULL 3494 req_send = MPI_REQUEST_NULL 3495 CALL MPI_Irecv(buf_rec, nrec_row*nrec_col*pol%nfreq_in_group*kp%nspin, & 3496 MPI_COMPLEX_DPC, irec_static, & 3497 tag_rec, MPI_COMM_WORLD, req_rec, mpierr) 3498 CALL MPI_Isend(buf_send, nsend_row*nsend_col*pol%nfreq_in_group*kp%nspin, & 3499 MPI_COMPLEX_DPC, isend_static, & 3500 tag_send, MPI_COMM_WORLD, req_send, mpierr) 3501 if(peinf%inode .eq. 0) then 3502 call timing%stop(timing%chi_sum_comm) 3503 endif 3504 3505 ! make sure to put buffer to zero 3506 do kk = 1, pol%nfreq_in_group 3507 !$OMP PARALLEL DO collapse(2) 3508 do ii = 1, size_M 3509 do jj = 1, size_N 3510 this%chilocalRDyn(jj,ii,kk) = (0.0D+00,0.0D+00) 3511 end do 3512 end do 3513 !$OMP PARALLEL DO collapse(2) 3514 do ii = 1, size_K 3515 do jj = 1, size_N 3516 this%gmeRDyntempr2(jj,ii,kk) = (0.0D+00,0.0D+00) 3517 end do 3518 end do 3519 end do 3520 !$OMP PARALLEL DO collapse(2) 3521 do ii = 1, size_M 3522 do jj = 1, size_K 3523 this%gmeRDyntempc(jj,ii) = (0.0D+00,0.0D+00) 3524 end do 3525 end do 3526 ! 3527 ! in this case the ipe is the real process from which we should receive 3528 ipe_real = actual_rec+1 3529 end if ! pol%nonblocking_cyclic 3530 ! 3531 3532 ! 3533 pe_row_size = row_col_loc(1,ipe_real) 3534 pe_col_size = row_col_loc(2,ipe_real) 3535 IF(pe_row_size==0 .OR. pe_col_size==0) CYCLE 3536 ! precompute indices 3537 ! row 3538 SAFE_ALLOCATE(row_indx, (pe_row_size)) 3539 ipe_row = inode_2grid2D(1,ipe_real) 3540 row_indx = 0 3541 DO irow = 1, pe_row_size 3542 row_indx(irow) = INDXL2G( irow, scal%nbl, ipe_row, 0, scal%nprow) 3543 END DO 3544 ! col 3545 SAFE_ALLOCATE(col_indx, (pe_col_size)) 3546 ipe_col = inode_2grid2D(2,ipe_real) 3547 col_indx = 0 3548 DO icol = 1, pe_col_size 3549 col_indx(icol) = INDXL2G( icol, scal%nbl, ipe_col, 0, scal%npcol) 3550 END DO 3551 3552 ! IF(peinf%inode==0) WRITE(*,*) ipe, ipe_real-1 3553 ! IF(peinf%inode==0) WRITE(*,*) row_indx 3554 ! IF(peinf%inode==0) WRITE(*,*) col_indx 3555 ! IF(peinf%inode==0) WRITE(*,*) 3556 3557 do ispin = 1, kp%nspin 3558 if(peinf%inode .eq. 0) then 3559 call timing%start(timing%chi_sum_prep) 3560 endif 3561 3562 if ( .not. pol%nonblocking_cyclic ) then 3563 ! standard calculation 3564 SAFE_ALLOCATE(this%chilocalRDyn, (pe_row_size,pe_col_size,pol%nfreq_in_group)) 3565 SAFE_ALLOCATE(this%gmeRDyntempr2, (pe_row_size,ntot,pol%nfreq_in_group)) 3566 SAFE_ALLOCATE(this%gmeRDyntempc, (ntot,pe_col_size)) 3567 this%chilocalRDyn = ZERO 3568 this%gmeRDyntempr2 = ZERO 3569 this%gmeRDyntempc = ZERO 3570 ! 3571 end if 3572 3573 itot = 0 3574 itot_rk2 = 0 3575 do irk = 1, nrk 3576 do it = 1, nst(irk) 3577 itot_rk2 = itot_rk2 + 1 3578 do iv = 1,vwfn%nband+pol%ncrit 3579 if (peinf%doiownv(iv)) then 3580 ilimit = peinf%ncownactual 3581 else 3582 ilimit = 0 3583 endif 3584 3585 !$OMP PARALLEL private (mytot, zvalue, edenDRtemp, jj, icurr, & 3586 !$OMP cvpair_temp, cv_energy, irow, irow_global, icol, icol_global) 3587 SAFE_ALLOCATE(edenDRtemp, (pol%nfreq_in_group)) 3588 !$OMP DO 3589 do i_myband = 1, ilimit 3590 mytot = itot + i_myband 3591 3592 zvalue = pol%edenDyn(peinf%indexv(iv),i_myband,ispin,irk,freq_idx) 3593 if(pol%lin_denominator<TOL_Zero) then 3594 ! this is when the lin_denominator mode is not active. 3595 3596 do jj=1, pol%nfreq_in_group 3597 if (abs(zvalue) .gt. Tol_Zero) then 3598 edenDRtemp(jj)= -0.5d0*( & 3599 1d0/(zvalue-(pol%dFreqBrd(jj)+pol%dFreqGrid(jj))/ryd)+ & 3600 1d0/(zvalue+(pol%dFreqBrd(jj)+pol%dFreqGrid(jj))/ryd)) 3601 else 3602 edenDRtemp(jj)= 0D0 3603 endif 3604 enddo 3605 3606 else 3607 !this is when lin_denominator mode is active 3608 3609 cvpair_temp = pol%lin_edenDyn(peinf%indexv(iv),i_myband,ispin,irk,freq_idx) 3610 cv_energy = cvpair_temp%ev - cvpair_temp%ec 3611 do jj=1, pol%nfreq_in_group 3612 !check if denominator is not small. If so, do the usual thing 3613 3614 if(abs(cv_energy+pol%dFreqGrid(jj)/ryd)-pol%lin_denominator>-TOL_Zero .and. & 3615 abs(cv_energy-pol%dFreqGrid(jj)/ryd)-pol%lin_denominator>-TOL_Zero) then 3616 3617 if (abs(zvalue) .gt. Tol_Zero) then 3618 edenDRtemp(jj)= -0.5d0*( & 3619 1d0/(zvalue-(pol%dFreqBrd(jj)+pol%dFreqGrid(jj))/ryd)+ & 3620 1d0/(zvalue+(pol%dFreqBrd(jj)+pol%dFreqGrid(jj))/ryd)) 3621 else 3622 edenDRtemp(jj)= 0D0 3623 endif 3624 3625 else ! if denominator is small, do linearized energy denominator 3626 if(cvpair_temp%vltc) then 3627 edenDRtemp(jj) = -0.5d0*(& 3628 3629 conjg(integrate_mbz(kp,kpq%rk(1:3,cvpair_temp%idx_kp),& 3630 cvpair_temp%ev,cvpair_temp%ec,& 3631 3632 cvpair_temp%vv,cvpair_temp%vc,& 3633 -pol%dFreqGrid(jj)/ryd,pol%efermi/ryd))& 3634 3635 +integrate_mbz(kp,kpq%rk(1:3,cvpair_temp%idx_kp),& 3636 cvpair_temp%ev,cvpair_temp%ec,& 3637 cvpair_temp%vv,cvpair_temp%vc,& 3638 pol%dFreqGrid(jj)/ryd,pol%efermi/ryd)) 3639 else 3640 edenDRtemp(jj)= 0D0 3641 endif 3642 endif 3643 end do 3644 3645 end if 3646 3647 ! copy matrix elements 3648 DO irow = 1, pe_row_size 3649 irow_global = row_indx(irow) 3650 DO jj = 1, pol%nfreq_in_group 3651 this%gmeRDyntempr2(irow,mytot,jj) = gme_sub(irow_global,i_myband,peinf%indexv(iv),itot_rk2,ispin)*& 3652 edenDRtemp(jj) 3653 END DO 3654 END DO 3655 DO icol = 1, pe_col_size 3656 icol_global = col_indx(icol) 3657 this%gmeRDyntempc(mytot,icol) = MYCONJG(gme_sub(icol_global,i_myband,peinf%indexv(iv),itot_rk2,ispin)) 3658 END DO 3659 3660 end do ! i_band 3661 !$OMP END DO 3662 SAFE_DEALLOCATE(edenDRtemp) 3663 !$OMP END PARALLEL 3664 itot = itot+ilimit 3665 end do ! iv 3666 end do ! it 3667 end do ! irk 3668 3669 if(peinf%inode .eq. 0) then 3670 call timing%stop(timing%chi_sum_prep) 3671 endif 3672 3673 3674 if( pol%nonblocking_cyclic ) then 3675 ! 3676 if(peinf%inode .eq. 0) then 3677 call timing%start(timing%chi_sum_gemm) 3678 endif 3679 do jj = 1, pol%nfreq_in_group 3680 call zgemm('n','n', size_N, size_M, size_K, & 3681 negfact,this%gmeRDyntempr2(:,:,jj), size_N, & 3682 this%gmeRDyntempc(:,:), size_K,& 3683 (0D0,0D0),this%chilocalRDyn(:,:,jj), size_N) 3684 end do 3685 if(peinf%inode .eq. 0) then 3686 call timing%stop(timing%chi_sum_gemm) 3687 endif 3688 ! 3689 else ! pol%nonblocking_cyclic 3690 ! 3691 ! go with zgemm (standard calculation) 3692 if(ntot > 0) then 3693 do jj =1, pol%nfreq_in_group 3694 if(peinf%inode .eq. 0) then 3695 call timing%start(timing%chi_sum_gemm) 3696 endif 3697 call zgemm('n','n',pe_row_size,pe_col_size,ntot, & 3698 negfact,this%gmeRDyntempr2(:,:,jj),pe_row_size,this%gmeRDyntempc(:,:),ntot,& 3699 (0D0,0D0),this%chilocalRDyn(:,:,jj),pe_row_size) 3700 if(peinf%inode .eq. 0) then 3701 call timing%stop(timing%chi_sum_gemm) 3702 endif 3703 enddo 3704 endif 3705 ! 3706 end if ! pol%nonblocking_cyclic 3707 3708 if( pol%nonblocking_cyclic ) then 3709 ! 3710 ! just copy the result 3711 do kk = 1, pol%nfreq_in_group 3712!$OMP PARALLEL DO collapse(2) 3713 do ii = 1, nrec_col 3714 do jj = 1, nrec_row 3715 buf_temp(jj,ii,kk,ispin) = this%chilocalRDyn(jj,ii,kk) 3716 enddo 3717 enddo 3718 end do 3719 ! 3720 else ! pol%nonblocking_cyclic 3721 ! 3722 ! standard calculation 3723 if(peinf%inode .eq. 0) then 3724 call timing%start(timing%chi_sum_comm) 3725 endif 3726 3727#ifdef MPI 3728 ! all reduce 3729 call MPI_Reduce(this%chilocalRDyn(1,1,1),this%chilocal2RDyn(1,1,1,ispin), & 3730 pol%nfreq_in_group*pe_row_size*pe_col_size,MPI_COMPLEX_DPC,& 3731 MPI_SUM,ipe_real-1,MPI_COMM_WORLD,mpierr) 3732#endif 3733 if(peinf%inode .eq. 0) then 3734 call timing%stop(timing%chi_sum_comm) 3735 endif 3736 3737#ifndef MPI 3738 this%chilocal2RDyn(:,:,:,ispin)=this%chilocalRDyn(:,:,:) 3739#endif 3740 ! 3741 end if ! pol%nonblocking_cyclic 3742 3743 3744 if(peinf%inode .eq. 0) then 3745 call timing%start(timing%chi_sum_array_alloc) 3746 endif 3747 if ( .not. pol%nonblocking_cyclic ) then 3748 ! deallocate 3749 SAFE_DEALLOCATE(this%chilocalRDyn) 3750 SAFE_DEALLOCATE(this%gmeRDyntempr2) 3751 SAFE_DEALLOCATE(this%gmeRDyntempc) 3752 end if 3753 if(peinf%inode .eq. 0) then 3754 call timing%stop(timing%chi_sum_array_alloc) 3755 endif 3756 3757 end do ! ispin 3758 3759 SAFE_DEALLOCATE(row_indx) 3760 SAFE_DEALLOCATE(col_indx) 3761 3762 ! finalize non-blocking cyclic communication 3763 if(pol%nonblocking_cyclic) then 3764 if(peinf%inode .eq. 0) then 3765 call timing%start(timing%chi_sum_comm) 3766 endif 3767 3768 ! make sure the buffer is received 3769 CALL MPI_Wait(req_rec,stat,mpierr) 3770 ! accumulate contribution into receiving buffer 3771 !XXXX buf_rec(:,:,:) = buf_rec(:,:,:) + buf_temp 3772 do ispin = 1 , kp%nspin 3773 do kk = 1, pol%nfreq_in_group 3774!$OMP PARALLEL DO collapse(2) 3775 do ii = 1, nrec_col 3776 do jj = 1, nrec_row 3777 buf_rec(jj,ii,kk,ispin) = buf_rec(jj,ii,kk,ispin) + buf_temp(jj,ii,kk,ispin) 3778 enddo 3779 enddo 3780 end do 3781 end do 3782 ! wait for the massage to be sent 3783 CALL MPI_Wait(req_send,stat,mpierr) 3784 if(peinf%inode .eq. 0) then 3785 call timing%stop(timing%chi_sum_comm) 3786 endif 3787 3788 if(peinf%inode .eq. 0) then 3789 call timing%start(timing%chi_sum_array_alloc) 3790 endif 3791 3792 ! copy the messega to the sending buffer for the next cycle 3793 ! 3794 do ispin = 1 , kp%nspin 3795 do kk = 1, pol%nfreq_in_group 3796!$OMP PARALLEL DO collapse(2) 3797 do ii = 1, nrec_col 3798 do jj = 1, nrec_row 3799 buf_send(jj,ii,kk,ispin) = buf_rec(jj,ii,kk,ispin) 3800 enddo 3801 enddo 3802 end do 3803 end do 3804 3805 !XXXX buf_send = buf_rec 3806 ! 3807 nsend_row = nrec_row 3808 nsend_col = nrec_col 3809 ! 3810 if(peinf%inode .eq. 0) then 3811 call timing%stop(timing%chi_sum_array_alloc) 3812 endif 3813 3814 end if ! pol%nonblocking_cyclic 3815 3816 END DO ! ipe 3817 call progress_free(prog_info) 3818 ! 3819#ifdef MPI 3820 call MPI_barrier(MPI_COMM_WORLD,mpierr) 3821#endif 3822 ! 3823 if(peinf%inode .eq. 0) then 3824 call timing%stop(timing%chi_sum_sub_omega_neq_0) 3825 endif 3826 3827 3828 if(pol%nonblocking_cyclic) then 3829 ! done 3830 do ispin = 1 , kp%nspin 3831 do kk = 1, pol%nfreq_in_group 3832 !$OMP PARALLEL DO collapse(2) 3833 do ii = 1, my_sub_nlocal_col 3834 do jj = 1, my_sub_nlocal_row 3835 this%chilocal2RDyn(jj,ii,kk,ispin) = buf_send(jj,ii,kk,ispin) 3836 enddo 3837 enddo 3838 end do 3839 end do 3840 !XXXX this%chilocal2RDyn(:,:,:,:) = buf_send(:,:,:,:) 3841 ! 3842 ! clean-up 3843 SAFE_DEALLOCATE(buf_send) 3844 SAFE_DEALLOCATE(buf_rec) 3845 SAFE_DEALLOCATE(buf_temp) 3846 SAFE_DEALLOCATE(this%chilocalRDyn) 3847 SAFE_DEALLOCATE(this%gmeRDyntempr2) 3848 SAFE_DEALLOCATE(this%gmeRDyntempc) 3849 ! 3850 end if 3851 3852 ! deallocate stuff 3853 SAFE_DEALLOCATE(gme_sub) 3854 SAFE_DEALLOCATE(row_col_loc) 3855 SAFE_DEALLOCATE(grid2D_2inode) 3856 3857 IF(pol%need_full_chi) THEN 3858 ! here allocate chiRdyson (case for which we need full chi) 3859 SAFE_ALLOCATE(pol%chiRDyn, (scal%npr,scal%npc,pol%nfreq_in_group,kp%nspin)) 3860 pol%chiRDyn = ZERO 3861 3862 ! initialize descriptor for the eigenvector matrix 3863 call descinit(desca, nmtx, nmtx, scal%nbl, scal%nbl, 0, 0, & 3864 scal%icntxt, max(1,scal%npr), info) 3865 if(info < 0) then 3866 write(0,'(a,i3,a)') 'Argument number ', -info, ' had an illegal value on entry.' 3867 call die("descinit error for descaA in sub_chi_sum") 3868 end if 3869 3870 ! transform to original basis use C_Pgemm as temporary array 3871 SAFE_ALLOCATE(C_Pgemm, (scal%npr,scal%npc)) 3872 C_Pgemm = ZERO 3873 do ispin = 1, kp%nspin 3874 do jj = 1, pol%nfreq_in_group 3875 if(peinf%inode .eq. 0) then 3876 call timing%start(timing%subspace_pgemm) 3877 endif 3878 3879 CALL pzgemm('N','N', nmtx, neig_sub, neig_sub, (1.0d0,0.0d0), eigen_vect, 1, 1, desca, & 3880 this%chilocal2RDyn(:,:,jj,ispin), 1, 1, desc_sub, (0.0d0,0.0d0), & 3881 C_Pgemm, 1, 1, desca) 3882 if(peinf%inode .eq. 0) then 3883 call timing%stop(timing%subspace_pgemm) 3884 endif 3885 if(peinf%inode .eq. 0) then 3886 call timing%start(timing%subspace_pgemm) 3887 endif 3888 CALL pzgemm('N','C', nmtx, nmtx, neig_sub, (1.0d0,0.0d0), C_Pgemm, 1, 1, desca, & 3889 eigen_vect, 1, 1, desca, (0.0d0,0.0d0), & 3890 pol%chiRDyn(:,:,jj,ispin), 1, 1, desca) 3891 if(peinf%inode .eq. 0) then 3892 call timing%stop(timing%subspace_pgemm) 3893 endif 3894 end do 3895 end do 3896 SAFE_DEALLOCATE(C_Pgemm) 3897 SAFE_DEALLOCATE(this%chilocal2RDyn) 3898 3899 ! remove coulomb potential 3900 do ispin = 1, kp%nspin 3901 ! go back to polarizability (remove v^{1/2}) 3902 icurr=0 3903 do i=1, nmtx 3904 irow=MOD(INT(((i-1)/scal%nbl)+TOL_SMALL),scal%nprow) 3905 if(irow.ne.scal%myprow) cycle 3906 3907 vc = SQRT(vcoul(i)) 3908 3909 do j=1, nmtx 3910 icol=MOD(INT(((j-1)/scal%nbl)+TOL_SMALL),scal%npcol) 3911 if(icol .eq. scal%mypcol) then 3912 icurr=icurr+1 3913 irowm=INT((icurr-1)/scal%npc+TOL_SMALL)+1 3914 icolm=MOD((icurr-1),scal%npc)+1 3915 3916 pol%chiRDyn(irowm,icolm,1:pol%nfreq,ispin) = pol%chiRDyn(irowm,icolm,1:pol%nfreq,ispin) / vc / SQRT(vcoul(j)) 3917 3918 endif 3919 end do 3920 end do 3921 end do 3922 ELSE 3923 ! save information for epsinv_sub 3924 pol%nrow_local_sub = my_sub_nlocal_row 3925 pol%ncol_local_sub = my_sub_nlocal_col 3926 pol%neig_sub = neig_sub 3927 3928 SAFE_ALLOCATE(pol%chiRDyn, (MAX(1,my_sub_nlocal_row),MAX(1,my_sub_nlocal_col),pol%nfreq_in_group,kp%nspin)) 3929 pol%chiRDyn = ZERO 3930 pol%chiRDyn(:,:,:,:) = this%chilocal2RDyn(:,:,:,:) 3931 SAFE_DEALLOCATE(this%chilocal2RDyn) 3932 END IF 3933 3934 POP_SUB(calc_pol_sub_trunc_easy) 3935 3936 end subroutine calc_pol_sub_trunc_easy 3937 3938 subroutine dump_matrix(scal, matrix, inode, unit) 3939 type (scalapack), intent(in) :: scal 3940 SCALAR, intent(in) :: matrix(scal%npr,scal%npc) 3941 integer :: inode, unit 3942 character(len=7) :: nam 3943 integer :: jjj 3944 3945 PUSH_SUB(dump_matrix) 3946 3947 ! write(nam,'(I5)') unit+inode 3948 ! open(unit+inode,file='chi.'//TRIM(nam),form='unformatted',status='replace') 3949 do jjj = 1, scal%npc 3950 write(unit+inode) matrix(1:scal%npr,jjj) 3951 end do 3952 flush(unit+inode) 3953 ! close(unit+inode) 3954 3955 POP_SUB(dump_matrix) 3956 3957 end subroutine dump_matrix 3958 3959 subroutine read_dump_matrix(scal, matrix, inode, unit) 3960 type (scalapack), intent(in) :: scal 3961 SCALAR, intent(inout) :: matrix(scal%npr,scal%npc) 3962 integer :: inode, unit 3963 character(len=7) :: nam 3964 integer :: jjj 3965 3966 PUSH_SUB(read_dump_matrix) 3967 3968 ! write(nam,'(I5)') unit+inode 3969 ! open(unit+inode,file='chi.'//TRIM(nam),form='unformatted',status='old') 3970 do jjj = 1, scal%npc 3971 read(unit+inode) matrix(1:scal%npr,jjj) 3972 end do 3973 ! close(unit+inode) 3974 3975 POP_SUB(read_dump_matrix) 3976 3977 end subroutine 3978 3979#endif 3980! close the condition ( if defined MPI && defined USESCALAPACK) for subspace method 3981 3982end module chi_summation_m 3983