!=================================================================================
!
! Routines:
!
! (1) epscopy() Last Modified 2014 (FHJ)
!
! Copy dielectric matrices from eps0mat(.h5)/epsmat(.h5) files to memory.
!
! This routine reads the eps0mat and epsmat files (units 10 and 11)
! and copies the relevant information about those dielectric matrices
! to memory (epsmpi). Processor 0 reads the data and redistributes it
! to the other processors. The eps0mat file can contain more than one q-point,
! which is useful when performing Voronoi averages for the q->0 point.
!
!==================================================================================

#include "f_defs.h"

module epscopy_m

  use global_m
  use misc_m
  use io_utils_m
  use epsread_hdf5_m
  use scalapack_m
  use timing_m, only: timing => sigma_timing

  implicit none

  private

  public :: epscopy_init, epscopy

  type :: comm_buffer
    complex(DPC), allocatable :: msg(:,:)
    integer, allocatable :: col_global_indx(:)
    integer :: nrow, ncol
    integer :: proc
  end type

contains

!> Determines the number of q-points, the maximum rank of the dielectric matrix,
!! and the number of frequencies
subroutine epscopy_init(sig, neps)
  type (siginfo), intent(inout) :: sig
  integer, intent(out) :: neps !< Max. rank of dielectric matrix

  integer :: ngarbage1, nmtx, itrash, ig, igp, iq
  integer :: nFreq, nfreq_imag, freq_dep_flag
  real(DP), allocatable :: dFreqGrid(:)
  complex(DPC), allocatable :: dFreqBrd(:)
  logical, allocatable :: is_imag(:)
  logical :: file_exists
  logical :: subtrunc_flags(6)
  integer :: neig_sub, ierr
  real(DP) :: chi_eigenvalue_cutoff
#ifdef HDF5
  integer :: ngarbage3(3), nmtxmax
  real(DP) :: dgarbage1
#endif

  PUSH_SUB(epscopy_init)

  sig%subspace_q0 = .FALSE.
  sig%matrix_in_subspace_basis_q0 = .FALSE.
  sig%keep_full_eps_static_q0 = .FALSE.
  sig%subspace = .FALSE.
  sig%matrix_in_subspace_basis = .FALSE.
  sig%keep_full_eps_static = .FALSE.
  subtrunc_flags(:) = .FALSE.
  sig%neig_sub_max = 0

  if (sig%freq_dep>=0.and.sig%freq_dep<=3) then
#ifdef HDF5
    if (sig%use_hdf5) then
      if (peinf%inode==0) then
        call read_eps_grid_sizes_hdf5(ngarbage1, sig%nq0, dgarbage1, sig%nFreq, &
          sig%nfreq_imag, nmtxmax, ngarbage3, freq_dep_flag, 'eps0mat.h5')
        neps = nmtxmax

        ! Consistency check, before continuing
        if (freq_dep_flag==2.and.sig%freq_dep/=2) &
          call die('eps0mat is frequency-dependent, but this Sigma calculation is not.')
        if (freq_dep_flag==0.and.(sig%freq_dep==2.or.sig%freq_dep==3)) then
          call die('This Sigma calculation is frequency-dependent, but eps0mat is not.')
        endif
        if (freq_dep_flag==3.and.sig%freq_dep/=3) then
          call die('This Sigma calculation is for GN GPP, but eps0mat is not.')
        endif

        ! Check if subspace method has been used
        neig_sub = 0
        call read_eps_subspace_info(sig%subspace_q0, &
                                    sig%matrix_in_subspace_basis_q0, &
                                    sig%keep_full_eps_static_q0, &
                                    neig_sub, chi_eigenvalue_cutoff, 'eps0mat.h5')
        sig%neig_sub_max = MAX(neig_sub, sig%neig_sub_max)
        !
Consistency check for subspace 100 101 if ( sig%subspace_q0 ) then 102 if ( sig%matrix_in_subspace_basis_q0 ) then 103#if !defined MPI || !defined USESCALAPACK 104 call die('Subspace method only works with MPI and SCALAPACK') 105#endif 106 end if 107 if ( sig%freq_dep/=2 ) then 108 call die('Subspace approximation implemented only for freq_dep = 2') 109 end if 110 end if 111 112 inquire(file="epsmat.h5", exist=file_exists) 113 if (file_exists) then 114 sig%igamma = 0 115 else 116 sig%igamma = 1 117 endif 118 119 if(sig%igamma/=0) then ! Gamma-only calculation 120 sig%nq1 = 0 121 else 122 call read_eps_grid_sizes_hdf5(ngarbage1, sig%nq1, dgarbage1, nfreq, nfreq_imag, & 123 nmtxmax, ngarbage3, freq_dep_flag, 'epsmat.h5') 124 if (nFreq/=sig%nFreq) then 125 call die('nFreq mismatch between eps0mat.h5 and epsmat.h5') 126 endif 127 if (nfreq_imag/=sig%nfreq_imag) then 128 call die('nfreq_imag mismatch between eps0mat.h5 and epsmat.h5') 129 endif 130 if (nmtxmax>neps) neps = nmtxmax 131 132 ! check subspace also for epsmat.h5 133 neig_sub = 0 134 call read_eps_subspace_info(sig%subspace, & 135 sig%matrix_in_subspace_basis, & 136 sig%keep_full_eps_static, & 137 neig_sub, chi_eigenvalue_cutoff, 'epsmat.h5') 138 sig%neig_sub_max = MAX(neig_sub, sig%neig_sub_max) 139 ! Consistency check for subspace 140 141 if ( sig%subspace ) then 142 if ( sig%matrix_in_subspace_basis ) then 143#if !defined MPI || !defined USESCALAPACK 144 call die('Subspace method only works with MPI and SCALAPACK') 145#endif 146 end if 147 if ( sig%freq_dep/=2 ) then 148 call die('Subspace approximation implemented only for freq_dep = 2') 149 end if 150 end if ! sig%subspace 151 152 endif ! .not.sig%igamma/=0 153 endif !peinf%inode==0 154 else 155#endif 156 if (peinf%inode==0) then 157 call open_file(unit=10,file='eps0mat',form='unformatted',status='old') 158 read(10) 159 ierr = 0 160 read(10,IOSTAT=ierr) freq_dep_flag, sig%nFreq, & 161 sig%subspace_q0, sig%matrix_in_subspace_basis_q0, sig%keep_full_eps_static_q0 162 IF(ierr .NE. 0) THEN 163 ! probably this is due because of the absence of the subspace flags 164 sig%subspace_q0 = .FALSE. 165 sig%matrix_in_subspace_basis_q0 = .FALSE. 166 sig%keep_full_eps_static_q0 = .FALSE. 167 END IF 168 169! make sure that in the case of subspace MPI and scalapack are linked 170#if !defined MPI || !defined USESCALAPACK 171 IF(sig%subspace_q0) THEN 172 call die('Subspace method only works with MPI and SCALAPACK') 173 END IF 174#endif 175 176 ! Consistency check, before continuing 177 if (freq_dep_flag==2.and.sig%freq_dep/=2) & 178 call die('eps0mat is frequency-dependent, but this Sigma calculation is not.') 179 if (freq_dep_flag==0.and.sig%freq_dep==2) then 180 call die('This Sigma calculation is frequency-dependent, but eps0mat is not.') 181 endif 182 if (sig%freq_dep/=2 .and. sig%subspace_q0) then 183 call die('Subspace approximation implemented only for freq_dep = 2') 184 end if 185 186 read(10) 187 if (sig%freq_dep==2.or.sig%freq_dep==3) then 188 SAFE_ALLOCATE(dFreqGrid, (sig%nFreq)) 189 SAFE_ALLOCATE(dFreqBrd, (sig%nFreq)) 190 SAFE_ALLOCATE(is_imag, (sig%nFreq)) 191 read(10) dFreqGrid, dFreqBrd 192 is_imag = (dabs(dFreqGrid)<TOL_ZERO) 193 is_imag(1) = .false. 194 sig%nfreq_imag = count(is_imag) 195 SAFE_DEALLOCATE(dFreqGrid) 196 SAFE_DEALLOCATE(dFreqBrd) 197 SAFE_DEALLOCATE(is_imag) 198 else 199 read(10) 200 endif 201 read(10) 202 read(10) 203 read(10) 204 read(10) sig%nq0 205 read(10) 206 !XXXXXXXX 207 IF(sig%subspace_q0 .AND. 
sig%matrix_in_subspace_basis_q0) THEN 208 do iq=1,sig%nq0 209 read(10) itrash, neps 210 read(10) ! ekin 211 read(10) ! q 212 read(10) ! vcoul 213 ! full epsilon omega zero 214 IF(sig%keep_full_eps_static_q0) THEN 215 do igp=1, neps 216 read(10) ! For epsRDyn only 217 end do 218 END IF 219 ! eigenvectors 220 read(10) neig_sub 221 IF(sig%neig_sub_max < neig_sub) sig%neig_sub_max = neig_sub 222 do igp=1, neps 223 read(10) ! For epsRDyn only 224 end do 225 ! subspace matrices 226 do igp=1, neig_sub 227 do ig=1, neig_sub 228 read(10) 229 end do 230#ifdef CPLX 231 do ig=1, neig_sub 232 read(10) ! For epsADyn (empty line...) 233 enddo 234#endif 235 end do 236 end do ! iq 237 ELSE 238 read(10) itrash, neps 239 END IF 240 !XXXXXXXX 241 call close_file(10) 242 call open_file(unit=10,file='eps0mat',form='unformatted',status='old') 243 244 inquire(file="epsmat", exist=file_exists) 245 if (file_exists) then 246 sig%igamma = 0 247 else 248 sig%igamma = 1 249 endif 250 if(sig%igamma/=0) then ! Gamma-only calculation 251 sig%nq1 = 0 252 else 253 call open_file(unit=11,file='epsmat',form='unformatted',status='old') 254 read(11) 255 ierr = 0 256 read(11,IOSTAT=ierr) ngarbage1, nFreq, & 257 sig%subspace, sig%matrix_in_subspace_basis, sig%keep_full_eps_static 258 IF(ierr .NE. 0) THEN 259 sig%subspace = .FALSE. 260 sig%matrix_in_subspace_basis = .FALSE. 261 sig%keep_full_eps_static = .FALSE. 262 END IF 263 264! make sure that in the case of subspace MPI and scalapack are linked 265#if !defined MPI || !defined USESCALAPACK 266 IF(sig%subspace) THEN 267 call die('Subspace method only works with MPI and SCALAPACK') 268 END IF 269#endif 270 271 if (nFreq/=sig%nFreq) then 272 call die('nFreq mismatch between eps0mat and epsmat') 273 endif 274 if (sig%freq_dep/=2 .and. sig%subspace) then 275 call die('Subspace approximation implemented only for freq_dep = 2') 276 end if 277 278 read(11) 279 if (sig%freq_dep==2.or.sig%freq_dep==3) then 280 SAFE_ALLOCATE(dFreqGrid, (sig%nFreq)) 281 SAFE_ALLOCATE(dFreqBrd, (sig%nFreq)) 282 SAFE_ALLOCATE(is_imag, (sig%nFreq)) 283 read(11) dFreqGrid, dFreqBrd 284 is_imag = (dabs(dFreqGrid)<TOL_ZERO) 285 is_imag(1) = .false. 286 if (sig%nfreq_imag/=count(is_imag)) then 287 call die('nfreq_imag mismatch between eps0mat and epsmat') 288 endif 289 SAFE_DEALLOCATE(dFreqGrid) 290 SAFE_DEALLOCATE(dFreqBrd) 291 SAFE_DEALLOCATE(is_imag) 292 else 293 read(11) 294 endif 295 read(11) 296 read(11) 297 read(11) 298 read(11) sig%nq1 299 read(11) 300 301 do iq=1,sig%nq1 302 read(11) itrash, nmtx 303 read(11) 304 read(11) 305 306 IF(sig%subspace .AND. sig%matrix_in_subspace_basis) THEN 307 read(11) ! vcoul 308 ! full epsilon omega zero 309 IF(sig%keep_full_eps_static) THEN 310 do igp=1, nmtx 311 read(11) ! For epsRDyn only 312 end do 313 END IF 314 ! eigenvectors 315 read(11) neig_sub 316 do igp=1, nmtx 317 read(11) ! For epsRDyn only 318 end do 319 ! subspace matrices 320 do igp=1, neig_sub 321 do ig=1, neig_sub 322 read(11) 323 end do 324#ifdef CPLX 325 do ig=1, neig_sub 326 read(11) ! For epsADyn (empty line...) 327 enddo 328#endif 329 end do 330 ! write(*,*) neig_sub, nmtx 331 if(sig%neig_sub_max < neig_sub) sig%neig_sub_max=neig_sub 332 ELSE 333 if (sig%freq_dep==0.or.sig%freq_dep==1) then 334 do ig=1,nmtx 335 read(11) 336 enddo 337 endif 338 if (sig%freq_dep==2.or.sig%freq_dep==3) then 339 do igp=1,nmtx 340 do ig=1,nmtx 341 read(11) ! For epsRDyn 342 enddo 343#ifdef CPLX 344 do ig=1,nmtx 345 read(11) ! 
For epsADyn 346 enddo 347#endif 348 enddo 349 endif 350 END IF 351 if(neps<nmtx) neps = nmtx 352 enddo 353 call close_file(11) 354 call open_file(unit=11,file='epsmat',form='unformatted',status='old') 355 endif 356 endif ! peinf%inode==0 357#ifdef HDF5 358 endif 359#endif 360#ifdef MPI 361 call MPI_Bcast(sig%igamma, 1, MPI_INTEGER, 0 ,MPI_COMM_WORLD, mpierr) 362 call MPI_Bcast(sig%nq0, 1, MPI_INTEGER, 0 ,MPI_COMM_WORLD, mpierr) 363 call MPI_Bcast(sig%nq1, 1, MPI_INTEGER, 0 ,MPI_COMM_WORLD, mpierr) 364 call MPI_Bcast(neps, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, mpierr) 365 call MPI_Bcast(sig%nFreq, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, mpierr) 366 call MPI_Bcast(sig%nfreq_imag, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, mpierr) 367!XXXX 368 subtrunc_flags(1) = sig%subspace_q0 369 subtrunc_flags(2) = sig%matrix_in_subspace_basis_q0 370 subtrunc_flags(3) = sig%keep_full_eps_static_q0 371 subtrunc_flags(4) = sig%subspace 372 subtrunc_flags(5) = sig%matrix_in_subspace_basis 373 subtrunc_flags(6) = sig%keep_full_eps_static 374 call MPI_Bcast(subtrunc_flags, 6, MPI_LOGICAL, 0, MPI_COMM_WORLD, mpierr) 375 sig%subspace_q0 = subtrunc_flags(1) 376 sig%matrix_in_subspace_basis_q0 = subtrunc_flags(2) 377 sig%keep_full_eps_static_q0 = subtrunc_flags(3) 378 sig%subspace = subtrunc_flags(4) 379 sig%matrix_in_subspace_basis = subtrunc_flags(5) 380 sig%keep_full_eps_static = subtrunc_flags(6) 381 call MPI_Bcast(sig%neig_sub_max, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, mpierr) 382!XXXX 383#endif 384 if (sig%freq_dep==2.or.sig%freq_dep==3) then 385 SAFE_ALLOCATE(sig%dFreqGrid,(sig%nFreq)) 386 SAFE_ALLOCATE(sig%dFreqBrd,(sig%nFreq)) 387 endif 388 else ! sig%freq_dep>=0.and.sig_freq_dep<=2 389 neps = 0 390 sig%nFreq = 0 391 endif 392 if (sig%nq0/=1.and..not.sig%subsample) call die('epscopy_init: nq0/=1') 393 if (peinf%inode==0) then 394 write(6,'(/1x,a,i0)') 'Total number of frequencies in the dielectric matrix: ', sig%nfreq 395 write(6,'(1x,a,i0/)') 'Number of imag. frequencies in the dielectric matrix: ', sig%nfreq_imag 396 !XXXXX 397 if(sig%subspace_q0) then 398 write(6,'(1x,a)') 'Eps0mat calculated with subspace truncation method' 399 IF(sig%matrix_in_subspace_basis_q0) THEN 400 write(6,'(1x,a)') 'Eps0mat matrices written within the subspace of omega 0' 401 END IF 402 IF(sig%keep_full_eps_static_q0) THEN 403 write(6,'(1x,a)') 'Eps0mat retains the full EpsInv for omega=0' 404 END IF 405 write(6,*) 406 end if 407 if(sig%subspace .AND. sig%igamma==0) then 408 write(6,'(1x,a)') 'Epsmat calculated with subspace truncation method' 409 IF(sig%matrix_in_subspace_basis) THEN 410 write(6,'(1x,a)') 'Epsmat matrices written within the subspace of omega 0' 411 END IF 412 IF(sig%keep_full_eps_static) THEN 413 write(6,'(1x,a)') 'Epsmat retains the full EpsInv for omega=0' 414 END IF 415 write(6,*) 416 end if 417 !XXXXX 418 endif 419 420 ! check if we can perform the calc in the subspace basis 421 if(sig%do_sigma_subspace) then 422 if(sig%igamma==0) then 423 ! k-point, we need both 424 if(sig%matrix_in_subspace_basis_q0 .AND. sig%matrix_in_subspace_basis) then 425 if (peinf%inode==0) write(6,'(1x,a)') 'DIRECT evalution of sigma in subspace basis' 426 else 427 if (peinf%inode==0) write(6,'(1x,a)') 'Epsilon matrices not in subspace basis!' 428 if (peinf%inode==0) write(6,'(1x,a)') 'Direct evalution of sigma in subspace basis turned off' 429 sig%do_sigma_subspace = .false. 
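          ! Note: when the direct subspace evaluation is turned off here, the
          ! calculation falls back to the standard full-matrix path: any epsilon
          ! matrices that were written in the subspace basis are expanded back to
          ! the full G-vector basis when they are read in (see epscopy_subspace
          ! below), at the cost of extra memory and communication.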
430 end if 431 else 432 if(sig%matrix_in_subspace_basis_q0) then 433 if (peinf%inode==0) write(6,'(1x,a)') 'DIRECT evalution of sigma in subspace basis' 434 else 435 if (peinf%inode==0) write(6,'(1x,a)') 'Epsilon matrix not in subspace basis!' 436 if (peinf%inode==0) write(6,'(1x,a)') 'Direct evalution of sigma in subspace basis turned off' 437 sig%do_sigma_subspace = .false. 438 end if 439 end if 440 end if 441 442 POP_SUB(epscopy_init) 443 444end subroutine epscopy_init 445 446 447subroutine epscopy(crys,gvec,sig,neps,epsmpi,epshead,iunit_eps,fne) 448 type (crystal), intent(in) :: crys 449 type (gspace), intent(in) :: gvec 450 type (siginfo), intent(inout) :: sig 451 integer, intent(in) :: neps !< number of G-vectors up to epsilon cutoff defined in sigma.inp 452 type (epsmpiinfo), intent(inout) :: epsmpi 453 SCALAR, intent(out) :: epshead !< for full frequency, this is the retarded static head. 454 integer, intent(in) :: iunit_eps 455 character*20, intent(in) :: fne 456 457 SCALAR, allocatable :: eps(:) 458 complex(DPC), allocatable :: epsRDyn(:,:) 459 complex(DPC), allocatable :: epsADyn(:,:) 460 logical :: is_static 461#ifdef HDF5 462 integer, allocatable :: sub_neig_of_q(:) 463#endif 464 465 PUSH_SUB(epscopy) 466 467 epshead = ZERO 468 is_static = sig%freq_dep/=2 .AND. sig%freq_dep/=3 469 470 ! FHJ: Initialize buffers 471 if (is_static) then 472 SAFE_ALLOCATE(epsmpi%eps, (neps,epsmpi%ngpown,sig%nq)) 473 else 474 if(sig%do_sigma_subspace) then 475 ! if doing calcultion within subspace allocate stuff 476 SAFE_ALLOCATE(sig%epssub%eps_sub, (sig%neig_sub_max, sig%epssub%Nbas_own, sig%nFreq, sig%nq)) 477 !XXXX SAFE_ALLOCATE(sig%epssub%eigenvec_sub, (sig%epssub%ngpown_sub, sig%neig_sub_max, sig%nq)) 478 SAFE_ALLOCATE(sig%epssub%eigenvec_sub, (neps, sig%epssub%Nbas_own, sig%nq)) 479 SAFE_ALLOCATE(sig%epssub%eps_wings_rows, (neps, sig%nFreq, sig%nq)) 480 SAFE_ALLOCATE(sig%epssub%eps_wings_cols, (neps, sig%nFreq, sig%nq)) 481 SAFE_ALLOCATE(sig%epssub%eps_wings_correction_rows, (neps, sig%nFreq)) 482 SAFE_ALLOCATE(sig%epssub%eps_wings_correction_cols, (neps, sig%nFreq)) 483 sig%epssub%eps_sub = (0.0d0,0.0d0) 484 sig%epssub%eigenvec_sub = (0.0d0,0.0d0) 485 sig%epssub%eps_wings_rows = (0.0d0,0.0d0) 486 sig%epssub%eps_wings_cols = (0.0d0,0.0d0) 487 sig%epssub%eps_wings_correction_rows = (0.0d0,0.0d0) 488 sig%epssub%eps_wings_correction_cols = (0.0d0,0.0d0) 489 SAFE_ALLOCATE(sig%epssub%eps_sub_info, (3, 2, sig%nq)) 490 SAFE_ALLOCATE(sig%epssub%wing_pos, (sig%nq)) 491 sig%epssub%wing_pos = 0 492 SAFE_ALLOCATE(sig%epssub%vcoul_sub, (neps, sig%nq)) 493 sig%epssub%vcoul_sub = 0.0d0 494 !XXX deallocate and reallocate epsmpi%epsR with a single frequency 495 !XXX SAFE_DEALLOCATE_P(epsmpi%epsR) 496 SAFE_ALLOCATE(epsmpi%epsR, (neps,epsmpi%ngpown,1,sig%nq)) 497 else 498 ! standard case 499 SAFE_ALLOCATE(epsmpi%epsR, (neps,epsmpi%ngpown,sig%nFreq,sig%nq)) 500 if (sig%need_advanced) then 501 SAFE_ALLOCATE(epsmpi%epsA, (neps,epsmpi%ngpown,sig%nFreq,sig%nq)) 502 endif 503 end if 504 endif 505 506 ! FHJ: Temp buffers. 507 if (is_static) then 508 SAFE_ALLOCATE(eps, (neps)) 509 else 510 SAFE_ALLOCATE(epsRDyn, (neps,sig%nFreq)) 511 if (sig%need_advanced) then 512 SAFE_ALLOCATE(epsADyn, (neps,sig%nFreq)) 513 endif 514 endif 515 !------------------------------------------------ 516 ! FHJ: Read dielectric matrices from eps0mat(.h5) 517 !------------------------------------------------ 518 call read_epsmat(.true.) 
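  ! epshead is the head (G=G'=0 element) of the static eps^-1 just read for the
  ! q->0 point; it is set only on the node that did the reading, so it is
  ! broadcast below before the semiconductor-screening sanity check uses it.
  ! As a rough guide to the memory footprint of the buffers allocated at the top
  ! of this routine, the full-frequency buffer epsmpi%epsR holds about
  !   neps * epsmpi%ngpown * sig%nFreq * sig%nq
  ! complex(DPC) numbers per MPI task (fewer in the static or subspace cases).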
519#ifdef MPI 520 call MPI_Bcast(epshead, 1, MPI_SCALAR, 0, MPI_COMM_WORLD, mpierr) 521#endif 522 if (dble(epshead)<1d-3 .and. sig%iscreen==SCREEN_SEMICOND .and. peinf%inode==0) then 523 write(0,'(a)') 'WARNING: You are using semiconductor screening, but the' 524 write(0,'(a)') 'head of epsilon inverse is very small and seems metallic.' 525 endif 526 !------------------------------------------------ 527 ! FHJ: Read dielectric matrices from epsmat(.h5) 528 !------------------------------------------------ 529 if (sig%igamma==0) call read_epsmat(.false.) 530 531#ifdef MPI 532 call MPI_Bcast(sig%qgrid,3,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr) 533#endif 534 epsmpi%qk(:,:) = sig%qpt(:,:) 535 536 ! FHJ: Free temp. buffers 537 if (is_static) then 538 SAFE_DEALLOCATE(eps) 539 else 540 SAFE_DEALLOCATE(epsRDyn) 541 if (sig%need_advanced) then 542 SAFE_DEALLOCATE(epsADyn) 543 endif 544 endif 545 546 POP_SUB(epscopy) 547 548 return 549 550contains 551 552 553 subroutine read_epsmat(is_q0) 554 logical, intent(in) :: is_q0 555 556 character(len=16) :: fname 557 integer :: qoffset, nq_file, iunit 558 integer :: freq_dep_flag, nFreq, nfreq_imag, ng_old, ngq, nmtx 559 integer :: ii, ig, igp, nq_tmp, iq, qgrid(3), gg(3), iout, iw, dest, igp_loc 560 real(DP) :: ecuts, qk(3), ekin 561 real(DP), allocatable :: ekold(:) 562#ifdef HDF5 563 integer :: nmtxmax 564 integer, allocatable :: nmtx_of_q(:), isrtinvdummy(:) 565#endif 566 integer, allocatable :: isrtq(:), isrtqi(:), isrtold(:), gvecs_old(:,:) 567 type(progress_info) :: prog_info !< a user-friendly progress report 568 character(len=6) :: sname 569 character(len=11) :: sdate 570 logical :: matrix_in_subspace_basis, keep_full_eps_static 571 real(DP), allocatable :: vcoul(:) 572 573 PUSH_SUB(epscopy.read_epsmat) 574 575 if (is_q0) then 576 fname = 'eps0mat' 577 qoffset = 0 578 nq_file = sig%nq0 579 iunit = 10 580 else 581 fname = 'epsmat' 582 qoffset = sig%nq0 583 nq_file = sig%nq1 584 iunit = 11 585 endif 586 if (sig%use_hdf5) fname = TRUNC(fname) // '.h5' 587 588 if (peinf%inode==0) then 589 SAFE_ALLOCATE(ekold, (gvec%ng)) 590 SAFE_ALLOCATE(isrtold, (gvec%ng)) 591 SAFE_ALLOCATE(isrtq, (gvec%ng)) 592 SAFE_ALLOCATE(isrtqi, (gvec%ng)) 593 endif 594 595 matrix_in_subspace_basis = .FALSE. 596 keep_full_eps_static = .FALSE. 597 if (is_q0) then 598 if(sig%subspace_q0) then 599 matrix_in_subspace_basis = sig%matrix_in_subspace_basis_q0 600 keep_full_eps_static = sig%keep_full_eps_static_q0 601 end if 602 else 603 if(sig%subspace) then 604 matrix_in_subspace_basis = sig%matrix_in_subspace_basis 605 keep_full_eps_static = sig%keep_full_eps_static 606 end if 607 end if 608 609!------------------------------------------------------------------------------ 610! Read q-grid, q-points, G-vectors and freq. grid 611!------------------------------------------------------------------------------ 612 if (peinf%inode==0) then 613#ifdef HDF5 614 if (sig%use_hdf5) then 615 sname = 'chiGG0' 616 sdate = 'nodate' 617 call read_eps_grid_sizes_hdf5(ng_old, nq_tmp, ecuts, nfreq, nfreq_imag, & 618 nmtxmax, qgrid, freq_dep_flag, TRUNC(fname)) 619 if (.not.is_static.and.is_q0) then 620 call read_eps_freqgrid_hdf5(nFreq, sig%dFreqGrid, sig%dFreqBrd, TRUNC(fname)) 621 endif 622 623 SAFE_ALLOCATE(nmtx_of_q, (nq_file)) 624 call read_eps_qgrid_hdf5(nq_tmp, sig%qpt(:,qoffset+1:), nmtx_of_q, TRUNC(fname)) 625 ! Note: it seems like the old format stores isort up to ngq, and the 626 ! 
HDF5 stores up to ng_old 627 SAFE_ALLOCATE(gvecs_old, (3,ng_old)) 628 call read_eps_old_gvecs_hdf5(ng_old, gvecs_old, TRUNC(fname)) 629 ! subspace: read number of eigenvectors 630 if(matrix_in_subspace_basis) then 631 SAFE_ALLOCATE(sub_neig_of_q, (nq_file)) 632 sub_neig_of_q = 0 633 call read_eps_sub_neig_of_q(nq_tmp, sub_neig_of_q, TRUNC(fname)) 634 end if 635 else 636#endif 637 read(iunit) sname, sdate 638 read(iunit) freq_dep_flag, nFreq 639 read(iunit) qgrid(1:3) 640 if (.not.is_static.and.is_q0) then 641 read(iunit) sig%dFreqGrid, sig%dFreqBrd 642 else 643 read(iunit) 644 endif 645 read(iunit) 646 read(iunit) 647 read(iunit) ecuts 648 read(iunit) nq_tmp, ((sig%qpt(ii,qoffset+iq), ii=1,3), iq=1,nq_tmp) 649 read(iunit) ng_old 650 backspace(iunit) 651 SAFE_ALLOCATE(gvecs_old, (3, ng_old)) 652 read(iunit) ng_old, ((gvecs_old(ii,iq), ii=1,3), iq=1,ng_old) 653#ifdef HDF5 654 endif 655#endif 656 if (nq_tmp/=nq_file) call die('nq mismatch for '//TRUNC(fname)) 657 ! FHJ: only substitute sig%qgrid (used in voronoi averages) if sig%qgrid 658 ! is empty and this is epsmat(.h5), if this file is available 659 if (all(sig%qgrid(1:3)==0).and.((.not.is_q0).or.(sig%igamma==1))) then 660 sig%qgrid(:) = qgrid(:) 661 endif 662 endif ! peinf%inode==0 663 664 665!------------------------------------------------------------------------------ 666! Bcast q-points and allocate/bcast full-frequency and epsilon buffers 667!------------------------------------------------------------------------------ 668#ifdef MPI 669 call MPI_Bcast(sig%qpt(1,qoffset+1), 3*nq_file, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpierr) 670#endif 671 if (is_q0) then 672 sig%q0vec(:) = sig%qpt(:,1) 673 ! We need to manually set the q0 point in sig%qpt to zero 674 if (.not.sig%subsample) sig%qpt(:,:sig%nq0) = 0d0 675 if (.not.is_static) then 676#ifdef MPI 677 call MPI_Bcast(sig%dFreqGrid,sig%nFreq,MPI_DOUBLE_PRECISION,0,MPI_COMM_WORLD,mpierr) 678 call MPI_Bcast(sig%dFreqBrd,sig%nFreq,MPI_DOUBLE_COMPLEX,0,MPI_COMM_WORLD,mpierr) 679#endif 680 endif 681 endif !is_q0 682 683 684!------------------------------------------------------------------------------ 685! Loop over all q-points, map G-vectors and read/store dielectric matrices 686!------------------------------------------------------------------------------ 687 call progress_init(prog_info, 'reading '//TRUNC(fname), 'q-point', nq_file) 688 do iq=1,nq_file 689 call progress_step(prog_info, iq) 690 call logit('Storing eps to memory') 691 692 ! FHJ: Map old isrt (from epsilon gspace) to new isrt (to gvec gspace) 693 !--------------------------------------------------------=------------ 694 if (peinf%inode==0) then 695#ifdef HDF5 696 if (sig%use_hdf5) then 697 SAFE_ALLOCATE(isrtinvdummy, (ng_old)) 698 call read_eps_gvecsofq_hdf5(ng_old,isrtold,isrtinvdummy,ekold,iq,TRUNC(fname)) 699 SAFE_DEALLOCATE(isrtinvdummy) 700 nmtx = nmtx_of_q(iq) 701 ngq = ng_old 702 ! in case of subspace read the coulomb potential 703 if(matrix_in_subspace_basis) then 704 SAFE_ALLOCATE(vcoul, (nmtx)) 705 call read_vcoul_hdf5(vcoul, iq, TRUNC(fname)) 706 end if 707 else 708#endif 709 read(iunit) ngq, nmtx, (isrtold(ig),ii,ig=1,ngq) 710 read(iunit) (ekold(ig),ig=1,ngq) 711 read(iunit) (qk(ii),ii=1,3) 712 ! 
in case of subspace read the coulomb potential 713 if(matrix_in_subspace_basis) then 714 SAFE_ALLOCATE(vcoul, (nmtx)) 715 read(iunit) (vcoul(ig),ig=1,nmtx) 716 end if 717#ifdef HDF5 718 endif 719#endif 720 isrtq = 0 721 isrtqi = 0 722 qk = sig%qpt(:,iq+qoffset) 723 if (is_q0) qk(:) = 0d0 724 do ig=1,ngq 725 if (ekold(isrtold(ig))<=sig%ecutb) then 726 gg(:) = gvecs_old(:, isrtold(ig)) 727 call findvector(iout, gg, gvec) 728 isrtq(ig) = iout 729 isrtqi(iout) = ig 730 if (ig==1) then 731 ! just check the first so we do not waste time 732 ekin = DOT_PRODUCT(gvec%components(:,iout)+qk(:), MATMUL(crys%bdot, gvec%components(:,iout)+qk(:))) 733 if (abs(ekin-ekold(isrtold(ig))) > TOL_Zero) then 734 write(0,*) 'iq = ', iq, ' ig = ',ig, ' qk = ', qk 735 write(0,*) 'epsmat: ekold(isrtold(i)) = ', ekold(isrtold(ig)), ' ekin = ', ekin 736 call die("Incorrect kinetic energies in epsmat.") 737 endif 738 endif 739 endif ! if (ekold(isrtold(i))<=sig%ecutb) 740 enddo ! do ig=1,ngq 741 endif ! if (peinf%inode==0) 742#ifdef MPI 743 call MPI_Bcast(nmtx,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr) 744#endif 745 if (peinf%inode==0) then 746 epsmpi%isrtq(:,iq+qoffset) = isrtq(:) 747 epsmpi%isrtqi(:,iq+qoffset) = isrtqi(:) 748 endif 749 epsmpi%nmtx(iq+qoffset) = nmtx 750 751 IF(matrix_in_subspace_basis) THEN 752 ! broadcast coulomb potential 753 IF(peinf%inode/=0) THEN 754 SAFE_ALLOCATE(vcoul, (nmtx)) 755 vcoul = 0.0_DP 756 END IF 757#ifdef MPI 758 call MPI_Bcast(vcoul,nmtx,MPI_REAL_DP,0,MPI_COMM_WORLD,mpierr) 759#endif 760 761#if defined MPI && defined USESCALAPACK 762 call timing%start(timing%epscopy_sub) 763 call epscopy_subspace(nmtx,vcoul,keep_full_eps_static,iq,qoffset,iunit,fname) 764 call timing%stop(timing%epscopy_sub) 765#endif 766 767 IF(sig%need_advanced) THEN 768 write(6,'(1x,a)') & 769 'Subspace truncation method only implemented for sig%freq_dep==2 and sig%freq_dep_method==2' 770 call die('need_advanced is an invalid option for subspace + cd_integration_method.') 771 END IF 772 773 if (is_q0.and.iq==1.and.peinf%inode==0) then 774 epshead = epsmpi%epsR(1,1,1,1) 775 endif 776 777 SAFE_DEALLOCATE(vcoul) 778 ELSE ! subspace read 779#ifdef HDF5 780 if (sig%use_hdf5) then 781 ! FHJ: Read & store epsinv === HDF5 === 782 !-------------------------------------- 783 if (is_static) then 784 call timing%start(timing%epscopy_io) 785 call read_eps_matrix_par_hdf5(epsmpi%eps(:,:,iq+qoffset), & 786 epsmpi%nb, peinf%pool_rank, peinf%npes_pool, nmtx, & 787 iq, 1, fname) 788 call timing%stop(timing%epscopy_io) 789 else 790 call timing%start(timing%epscopy_io) 791 if (sig%need_advanced) then 792 call read_eps_matrix_par_f_hdf5(epsmpi%epsR(:,:,:,iq+qoffset), & 793 epsmpi%nb, peinf%pool_comm, peinf%my_pool, peinf%npools, nmtx, & 794 sig%nFreq, iq, 1, fname, advanced=epsmpi%epsA(:,:,:,iq+qoffset)) 795 else 796 call read_eps_matrix_par_f_hdf5(epsmpi%epsR(:,:,:,iq+qoffset), & 797 epsmpi%nb, peinf%pool_comm, peinf%my_pool, peinf%npools, nmtx, & 798 sig%nFreq, iq, 1, fname) 799 endif 800 call timing%stop(timing%epscopy_io) 801 endif 802 if (is_q0.and.iq==1.and.peinf%inode==0) then 803 if (is_static) then 804 epshead = epsmpi%eps(1,1,1) 805 else 806 epshead = epsmpi%epsR(1,1,1,1) 807 endif 808 endif 809 else !HDF5 810#endif 811 ! 
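          ! The isrtq/isrtqi arrays filled above map between the epsilon file's
          ! energy-ordered G list and the master gvec list: isrtq(ig) is the gvec
          ! index of the ig-th epsilon G-vector, and isrtqi is its inverse.
          !
          ! In the Fortran-binary branch that follows, rank 0 reads one column igp
          ! of eps^-1 at a time and broadcasts it; only the pool rank that owns
          ! that column in the Sigma layout keeps it. The columns are distributed
          ! block-cyclically with block size epsmpi%nb; schematically, for a
          ! hypothetical nb=2 and npes_pool=3:
          !   global igp :  1  2  3  4  5  6  7  8 ...
          !   owner rank :  0  0  1  1  2  2  0  0 ...
          ! The owner and the local column index are obtained with
          !   dest    = INDXG2P(igp, epsmpi%nb, pool_rank, 0, npes_pool)
          !   igp_loc = INDXG2L(igp, epsmpi%nb, pool_rank, 0, npes_pool)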
FHJ: Read & store epsinv === Fortran binary === 812 !------------------------------------------------- 813 do igp=1,nmtx 814 if (is_static) then 815 if (peinf%inode==0) then 816 call timing%start(timing%epscopy_io) 817 read(iunit) (eps(ig),ig=1,nmtx) 818 call timing%stop(timing%epscopy_io) 819 if (is_q0.and.iq==1.and.igp==1) epshead = eps(1) 820 call timing%start(timing%epscopy_comm) 821 call timing%stop(timing%epscopy_comm) 822 endif 823 824 call timing%start(timing%epscopy_comm) 825#ifdef MPI 826 call MPI_Bcast(eps, neps, MPI_SCALAR, 0, MPI_COMM_WORLD, mpierr) 827#endif 828 call timing%stop(timing%epscopy_comm) 829 else ! is_static 830 if (peinf%inode==0) then 831 do ig=1,nmtx 832 call timing%start(timing%epscopy_io) 833 read(iunit) (epsRDyn(ig,iw),iw=1,sig%nFreq) ! Retarded part 834 call timing%stop(timing%epscopy_io) 835 if (is_q0.and.iq==1.and.igp==1) epshead = epsRDyn(1,1) 836 call timing%start(timing%epscopy_comm) 837 call timing%stop(timing%epscopy_comm) 838 enddo 839#ifdef CPLX 840 do ig=1,nmtx 841 call timing%start(timing%epscopy_io) 842 if (sig%need_advanced) then 843 read(iunit) (epsADyn(ig,iw),iw=1,sig%nFreq) ! Advanced part 844 else 845 read(iunit) ! Advanced part 846 endif 847 call timing%stop(timing%epscopy_io) 848 enddo 849#endif 850 endif 851 852 call timing%start(timing%epscopy_comm) 853#ifdef MPI 854 call MPI_Bcast(epsRDyn, sig%nFreq*neps, MPI_COMPLEX_DPC, 0, MPI_COMM_WORLD, mpierr) 855 if (sig%need_advanced) then 856 call MPI_Bcast(epsADyn, sig%nFreq*neps, MPI_COMPLEX_DPC, 0, MPI_COMM_WORLD, mpierr) 857 endif 858#endif 859 call timing%stop(timing%epscopy_comm) 860 endif ! is_static 861 862 dest = INDXG2P(igp, epsmpi%nb, peinf%pool_rank, 0, peinf%npes_pool) 863 igp_loc = INDXG2L(igp, epsmpi%nb, peinf%pool_rank, 0, peinf%npes_pool) 864 call timing%start(timing%epscopy_comm) 865 if (peinf%pool_rank==dest) then 866 if (is_static) then 867 epsmpi%eps(:,igp_loc,iq+qoffset) = eps(:) 868 else 869 epsmpi%epsR(:,igp_loc,:,iq+qoffset) = epsRDyn(:,:) 870 if (sig%need_advanced) then 871 epsmpi%epsA(:,igp_loc,:,iq+qoffset) = epsADyn(:,:) 872 endif 873 endif ! is_static 874 endif ! if (peinf%pool_rank==dest) 875 call timing%stop(timing%epscopy_comm) 876 enddo ! do ig=1,nmtx 877#ifdef HDF5 878 endif !HDF5 879#endif 880 END IF ! subspace read 881 enddo ! iq 882 call progress_free(prog_info) 883 884!------------------------------------------------------------------------------ 885! Done! 
886!------------------------------------------------------------------------------ 887 888 if (peinf%inode==0) then 889#ifdef HDF5 890 if (sig%use_hdf5) then 891 SAFE_DEALLOCATE(nmtx_of_q) 892 if(matrix_in_subspace_basis) then 893 SAFE_DEALLOCATE(sub_neig_of_q) 894 end if 895 else 896#endif 897 call close_file(iunit) 898#ifdef HDF5 899 endif 900#endif 901 SAFE_DEALLOCATE(ekold) 902 SAFE_DEALLOCATE(isrtold) 903 SAFE_DEALLOCATE(isrtq) 904 SAFE_DEALLOCATE(isrtqi) 905 endif 906 907 POP_SUB(epscopy.read_epsmat) 908 909 end subroutine read_epsmat 910 911#if defined MPI && defined USESCALAPACK 912 subroutine epscopy_subspace(nmtx,vcoul,keep_full_eps_static,iq,qoffset,iunit,fname) 913 integer :: nmtx 914 real(DP) :: vcoul(nmtx) 915 logical :: keep_full_eps_static 916 integer :: iq, qoffset, iunit 917 character(len=16), intent(in) :: fname 918 919 complex(DPC), allocatable :: eigenvect(:,:), C_aux(:,:), buffer(:,:) 920 complex(DPC), allocatable :: eps_sub_f(:,:,:), E_aux(:,:,:), buffer_f(:,:,:) 921 complex(DPC), allocatable :: eps1Aux(:,:), C_Pgemm(:,:) 922 complex(DPC) :: C_send(nmtx), C_recv(nmtx) 923 real(DP) :: vc 924 type (scalapack) :: scal, scal_sub, scal_1d, scal_aux 925 integer, allocatable :: row_indices(:,:), col_indices(:,:) 926 integer, allocatable :: row_indices_sub(:,:), col_indices_sub(:,:) 927 integer :: desca(9), desc_sub(9), info 928 integer :: max_npr, max_npc, tag 929 integer :: nrow_loc_1d, ncol_loc_1d 930 integer :: nrow_loc_sub, ncol_loc_sub, max_npr_sub, max_npc_sub 931 integer :: igp, ig, ipe, neig_sub, Nfreq, ifreq, ig_l, ig_g, igp_l, igp_g 932 integer :: i_local, i_global, j_local, j_global 933 integer :: icurr, i, irow, j, icol, irowm, icolm 934 935 integer :: Nmsg_to_send, Nmsg_to_recv 936 type(comm_buffer), allocatable :: buf_send(:), buf_recv(:) 937 integer, allocatable :: pe2grid(:,:), grid2pe(:,:), pool2pe(:,:) 938 integer, allocatable :: num_col_to_send(:), num_col_to_recv(:) 939 integer, allocatable :: proc2msg_send(:), local_col_count(:) 940 integer, allocatable :: req_send(:), req_recv(:) 941 integer, allocatable :: my_status(:,:) 942 integer :: iproc_dum, iproc_send, iproc_rec, pool_send, iproc_row_rec 943 integer :: Nbuf_send, Nbuf_recv, imsg_recv, imsg_send, my_col 944 ! variables for sigma-subspace 945 complex(DPC), allocatable :: buffer_sub_eigen(:,:), buffer_sub_eps(:,:,:) 946 integer :: indx_start, indx_end, sub_size 947 integer :: iproc_pool, iproc_row, iproc_col 948 integer :: max_head, gg(3), iout 949 integer :: ipe_wing, my_pos_loc_wing 950 ! variables for fast vcoul scaling 951 real(DP), allocatable :: row_vcoul_aux(:), col_vcoul_aux(:) 952 integer, allocatable :: my_diag_indeces(:,:) 953 integer :: idiag, my_num_diag_elem 954 ! variables for cyclic redistribution of eigenvectors and subspace epsilon 955 logical :: cyclic_redistr 956 complex(DPC), allocatable :: rec_buffer_sub_eigen(:,:), send_buffer_sub_eigen(:,:) 957 complex(DPC), allocatable :: rec_buffer_sub_eps(:,:,:), send_buffer_sub_eps(:,:,:) 958 integer :: max_npc_lt_Neig, npc_lt_Neig 959 integer :: req_send_singl, tag_send, req_rec_singl, tag_rec 960 integer :: irec_static, isend_static 961 integer :: actual_send, actual_rec, ipe_real 962 logical :: do_copy 963 integer :: ncol_to_copy 964 integer, allocatable :: copy_col_inx(:) 965 966 PUSH_SUB(epscopy.epscopy_subspace) 967 968 ! set default cyclic_redistr flag 969 cyclic_redistr = .true. 970 if( sig%sub_collective_redistr ) cyclic_redistr = .false. 971 972 ! 
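    ! Overview of this routine: for the current q-point it reads (1) optionally
    ! the full omega=0 eps^-1 (if keep_full_eps_static), (2) the neig_sub basis
    ! eigenvectors V (nmtx x neig_sub), and (3) the subspace matrices
    ! E_sub(omega) (neig_sub x neig_sub, one per frequency), all handled in
    ! block-cyclic ScaLAPACK layouts. If sig%do_sigma_subspace is false, the
    ! subspace data are expanded back to the full G basis, roughly
    !   eps^-1(omega) ~ V * E_sub(omega) * V^H + 1
    ! (followed by the sqrt(vcoul) rescaling), and stored in epsmpi%epsR;
    ! otherwise the subspace quantities, plus the head/wings of the full matrix,
    ! are kept for the direct subspace evaluation of Sigma.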
set up the blas env for the full matrices 973 call blacs_setup(scal, nmtx, .true.) 974 975 ! exchange info 976 SAFE_ALLOCATE(scal%nprd, (peinf%npes)) 977 SAFE_ALLOCATE(scal%npcd, (peinf%npes)) 978 scal%nprd = 0 979 scal%npcd = 0 980 scal%nprd(peinf%inode + 1) = scal%npr 981 scal%npcd(peinf%inode + 1) = scal%npc 982 ! 983 call MPI_ALLREDUCE(MPI_IN_PLACE,scal%nprd,peinf%npes, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD,mpierr) 984 call MPI_ALLREDUCE(MPI_IN_PLACE,scal%npcd,peinf%npes, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD,mpierr) 985 ! get max number of row/col 986 max_npr = MAXVAL(scal%nprd) 987 max_npc = MAXVAL(scal%npcd) 988 989 ! precompute indices 990 SAFE_ALLOCATE(row_indices, (max_npr, peinf%npes)) 991 row_indices = 0 992 DO i_local = 1, scal%npr 993 i_global = indxl2g(i_local, scal%nbl, scal%myprow, 0, scal%nprow) 994 row_indices(i_local,peinf%inode + 1) = i_global 995 END DO 996 SAFE_ALLOCATE(col_indices, (max_npc, peinf%npes)) 997 col_indices = 0 998 DO i_local = 1, scal%npc 999 i_global = indxl2g(i_local, scal%nbl, scal%mypcol, 0, scal%npcol) 1000 col_indices(i_local,peinf%inode + 1) = i_global 1001 END DO 1002 call MPI_ALLREDUCE(MPI_IN_PLACE,row_indices,peinf%npes*max_npr, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD,mpierr) 1003 call MPI_ALLREDUCE(MPI_IN_PLACE,col_indices,peinf%npes*max_npc, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD,mpierr) 1004 1005 !XXXXXXXXXX Here we allocate the full matrix on the first inode=0 (can be problematic in term of memory...) 1006 IF(keep_full_eps_static) THEN 1007 ! exchange info (low memory communication scheme) 1008 SAFE_ALLOCATE(pool2pe, (peinf%npools,peinf%npes_pool)) 1009 pool2pe = 0 1010 IF(peinf%my_pool >= 0 .AND. peinf%pool_rank >= 0) THEN 1011 pool2pe(peinf%my_pool+1, peinf%pool_rank+1) = peinf%inode 1012 END IF 1013 call MPI_ALLREDUCE(MPI_IN_PLACE, pool2pe, peinf%npools*peinf%npes_pool, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, mpierr) 1014 ! 1015 ! go with h5 1016#ifdef HDF5 1017 if (sig%use_hdf5) then 1018 ! read from .h5 1019 ! here we try to read only for a single pool of process using a 1d-cyclic 1020 ! scalapack layout 1021 scal_1d%nprow = 1 1022 scal_1d%npcol = peinf%npes_pool 1023 scal_1d%myprow = 0 1024 scal_1d%mypcol = peinf%pool_rank 1025 scal_1d%nbl = epsmpi%nb 1026 scal_1d%icntxt = scal%icntxt 1027 nrow_loc_1d = numroc(nmtx, scal_1d%nbl, scal_1d%myprow, 0, scal_1d%nprow) 1028 ncol_loc_1d = numroc(nmtx, scal_1d%nbl, scal_1d%mypcol, 0, scal_1d%npcol) 1029 scal_1d%npr = nrow_loc_1d 1030 scal_1d%npc = ncol_loc_1d 1031 ! write(*,*) peinf%inode, peinf%pool_rank, peinf%my_pool, nrow_loc_1d, ncol_loc_1d, epsmpi%ngpown 1032 1033 ! temporary buffer 1034 SAFE_ALLOCATE(eps1Aux, (scal_1d%npr, scal_1d%npc)) 1035 eps1Aux = (0.0d0, 0.0d0) 1036 if( peinf%my_pool == 0) then 1037 ! 1038 call timing%start(timing%epscopy_io) 1039 call read_eps_matrix_par_hdf5_general(scal_1d, eps1Aux, nmtx, nmtx, iq, 1, fname, & 1040 'mats/matrix_fulleps0', peinf%pool_comm) 1041 call timing%stop(timing%epscopy_io) 1042 ! 1043 call timing%start(timing%epscopy_comm) 1044 do iproc_dum = 1, peinf%npools - 1 1045 iproc_send = pool2pe(iproc_dum+1, peinf%pool_rank+1) 1046 tag = 0 1047 CALL MPI_SEND(eps1Aux, scal_1d%npr*scal_1d%npc, & 1048 MPI_COMPLEX_DPC, iproc_send, tag, MPI_COMM_WORLD, mpierr) 1049 end do 1050 call timing%stop(timing%epscopy_comm) 1051 ! nrow_loc_1d should be equal to nmtx 1052 epsmpi%epsR(1:nrow_loc_1d,1:ncol_loc_1d,1,iq+qoffset) = eps1Aux(:,:) 1053 else 1054 iproc_dum = pool2pe(1, peinf%pool_rank+1) 1055 ! 
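          ! The 1 x npes_pool grid with block size epsmpi%nb used above makes the
          ! ScaLAPACK column distribution of eps1Aux match the epsmpi column
          ! layout used by Sigma (nrow_loc_1d = nmtx), so each pool rank can copy
          ! its block straight into epsmpi%epsR; pool 0 performs the actual HDF5
          ! read and forwards the block to the matching rank of every other pool.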
this condition should ensure that the processes left over will not be 1056 ! involved in communication 1057 if(peinf%my_pool >= 0 .AND. peinf%pool_rank >= 0) then 1058 tag = 0 1059 CALL MPI_RECV(eps1Aux, scal_1d%npr*scal_1d%npc, & 1060 MPI_COMPLEX_DPC, iproc_dum, tag, MPI_COMM_WORLD, mpistatus, mpierr) 1061 end if 1062 epsmpi%epsR(1:nrow_loc_1d,1:ncol_loc_1d,1,iq+qoffset) = eps1Aux(:,:) 1063 end if 1064 SAFE_DEALLOCATE(eps1Aux) 1065 1066 else ! sig%use_hdf5 1067#endif 1068 ! standard binary read 1069 ! 1070 IF(peinf%inode == 0) THEN 1071 DO j_global = 1, nmtx 1072 ! read full column 1073 read(iunit) (C_send(ig), ig=1, nmtx) 1074 ! 1075 ! global to pool_rank (in the new sigma distribution) 1076 pool_send = INDXG2P( j_global, epsmpi%nb, iproc_dum, 0, peinf%npes_pool) 1077 DO iproc_dum = 0, peinf%npools - 1 1078 ! select the process that I will have to send 1079 iproc_send = pool2pe(iproc_dum+1,pool_send+1) 1080 IF(iproc_send == 0) THEN 1081 j_local = INDXG2L( j_global, epsmpi%nb, peinf%pool_rank, 0, peinf%npes_pool) 1082 epsmpi%epsR(1:nmtx,j_local,1,iq+qoffset) = C_send(1:nmtx) 1083 ELSE 1084 tag = 0 1085 CALL MPI_SEND(C_send, nmtx, & 1086 MPI_COMPLEX_DPC, iproc_send, tag, MPI_COMM_WORLD, mpierr) 1087 END IF 1088 END DO 1089 END DO 1090 ELSE 1091 DO j_global = 1, nmtx 1092 ! global to pool_rank (in the new sigma distribution) 1093 pool_send = INDXG2P( j_global, epsmpi%nb, iproc_dum, 0, peinf%npes_pool) 1094 DO iproc_dum = 0, peinf%npools - 1 1095 ! select the process that I will have to send 1096 iproc_send = pool2pe(iproc_dum+1,pool_send+1) 1097 IF(iproc_send == peinf%inode) THEN 1098 j_local = INDXG2L( j_global, epsmpi%nb, peinf%pool_rank, 0, peinf%npes_pool) 1099 tag = 0 1100 CALL MPI_RECV(C_recv, nmtx, & 1101 MPI_COMPLEX_DPC, 0, tag, MPI_COMM_WORLD, mpistatus, mpierr) 1102 epsmpi%epsR(1:nmtx,j_local,1,iq+qoffset) = C_recv 1103 END IF 1104 END DO 1105 END DO 1106 END IF 1107 ! 1108#ifdef HDF5 1109 end if ! h5 1110#endif 1111 ! 1112 SAFE_DEALLOCATE(pool2pe) 1113 END IF ! keep_full_eps_static 1114 1115 SAFE_ALLOCATE(eigenvect, (scal%npr, scal%npc)) 1116 eigenvect = (0.0d0,0.0d0) 1117 1118#ifdef HDF5 1119 if (sig%use_hdf5) then 1120 ! read from .h5 1121 ! get number of eigenvectors 1122 neig_sub = 0 1123 IF(peinf%inode==0) THEN 1124 neig_sub = sub_neig_of_q(iq) 1125 END IF 1126 call MPI_Bcast(neig_sub,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr) 1127 1128 scal_aux%nprow = scal%nprow 1129 scal_aux%npcol = scal%npcol 1130 scal_aux%myprow = scal%myprow 1131 scal_aux%mypcol = scal%mypcol 1132 scal_aux%nbl = scal%nbl 1133 scal_aux%icntxt = scal%icntxt 1134 scal_aux%npr = scal%npr 1135 scal_aux%npc = numroc(neig_sub, scal%nbl, scal%mypcol, 0, scal%npcol) 1136 ! 1137 call timing%start(timing%epscopy_io) 1138 if(.true.) then 1139 call read_eps_matrix_par_hdf5_general(scal_aux, eigenvect, nmtx, neig_sub, iq, 1, fname, & 1140 'mats/matrix_eigenvec', MPI_COMM_WORLD) 1141 else 1142 call read_eps_matrix_par_hdf5_general(scal, eigenvect, nmtx, nmtx, iq, 1, fname, & 1143 'mats/matrix_eigenvec', MPI_COMM_WORLD) 1144 end if 1145 call timing%stop(timing%epscopy_io) 1146 ! 1147 else ! sig%use_hdf5 1148#endif 1149 ! standard binary read 1150 ! 1151 neig_sub = 0 1152 IF(peinf%inode==0) THEN 1153 read(iunit) neig_sub 1154 END IF 1155 call MPI_Bcast(neig_sub,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr) 1156 1157 SAFE_ALLOCATE(buffer, (max_npr,max_npc)) 1158 buffer = (0.0d0,0.0d0) 1159 1160 ! 
read eigenvector and redistribute 1161 IF(peinf%inode==0) THEN 1162 call timing%start(timing%sub_io_vec) 1163 SAFE_ALLOCATE(C_aux, (nmtx,neig_sub)) 1164 C_aux = (0.0d0,0.0d0) 1165 do igp=1, neig_sub 1166 read(iunit) (C_aux(ig,igp),ig=1,nmtx) 1167 end do 1168 do igp=neig_sub + 1, nmtx 1169 read(iunit) 1170 end do 1171 call timing%stop(timing%sub_io_vec) 1172 ! prepare the messages and send them 1173 DO ipe = 1, peinf%npes - 1 1174 call timing%start(timing%sub_prep_vec) 1175 buffer = (0.0d0,0.0d0) 1176 DO j_local=1, scal%npcd(ipe + 1) 1177 j_global = col_indices(j_local, ipe + 1) 1178 IF(j_global > neig_sub) CYCLE 1179 DO i_local=1, scal%nprd(ipe + 1) 1180 i_global = row_indices(i_local, ipe + 1) 1181 buffer(i_local, j_local) = C_aux(i_global,j_global) 1182 END DO 1183 END DO 1184 call timing%stop(timing%sub_prep_vec) 1185 ! send 1186 call timing%start(timing%sub_comm_vec) 1187 tag = 0 1188 CALL MPI_SEND(buffer, max_npr*max_npc, MPI_COMPLEX_DPC, ipe, tag, MPI_COMM_WORLD,mpierr) 1189 call timing%stop(timing%sub_comm_vec) 1190 END DO 1191 ! myself 1192 DO j_local=1, scal%npc 1193 j_global = col_indices(j_local, peinf%inode + 1) 1194 IF(j_global > neig_sub) CYCLE 1195 DO i_local=1, scal%npr 1196 i_global = row_indices(i_local, peinf%inode + 1) 1197 eigenvect(i_local, j_local) = C_aux(i_global,j_global) 1198 END DO 1199 END DO 1200 ! done 1201 1202 !XXX if(peinf%inode == 0) then 1203 !XXX do ig = 1, neig_sub 1204 !XXX do igp = 1, nmtx 1205 !XXX write(2000,*) ig, igp, C_aux(igp,ig) 1206 !XXX end do 1207 !XXX end do 1208 !XXX write(2000,*) 1209 !XXX end if 1210 1211 SAFE_DEALLOCATE(C_aux) 1212 ELSE 1213 ! receive my stuff 1214 tag = 0 1215 CALL MPI_RECV(buffer, max_npr*max_npc, MPI_COMPLEX_DPC, 0, tag, MPI_COMM_WORLD, mpistatus, mpierr) 1216 eigenvect(1:scal%npr, 1:scal%npc) = buffer(1:scal%npr, 1:scal%npc) 1217 END IF 1218 call MPI_Barrier(MPI_COMM_WORLD, mpierr) 1219 SAFE_DEALLOCATE(buffer) 1220 ! 1221#ifdef HDF5 1222 end if ! h5 1223#endif 1224 1225 !XXX do j_local = 1, scal%npc 1226 !XXX do i_local = 1, scal%npr 1227 !XXX write(1000+peinf%inode,*) i_local, j_local, eigenvect(i_local,j_local) 1228 !XXX end do 1229 !XXX end do 1230 !XXX flush(1000+peinf%inode) 1231 !XXX write(*,*) 'AAAAA' 1232 1233 ! All the same for the subspace epsilon (all frequencies in one) 1234 ! First create the scalapack env, retain the block/grid-layout as that 1235 ! of the eigenvalue matrix 1236 ! get the row/col size for the subspace matrices 1237 ! call MPI_Bcast(neig_sub,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr) 1238 nrow_loc_sub = numroc(neig_sub, scal%nbl, scal%myprow, 0, scal%nprow) 1239 ncol_loc_sub = numroc(neig_sub, scal%nbl, scal%mypcol, 0, scal%npcol) 1240 scal_sub%nprow = scal%nprow 1241 scal_sub%npcol = scal%npcol 1242 scal_sub%myprow = scal%myprow 1243 scal_sub%mypcol = scal%mypcol 1244 scal_sub%nbl = scal%nbl 1245 scal_sub%icntxt = scal%icntxt 1246 scal_sub%npr = nrow_loc_sub 1247 scal_sub%npc = ncol_loc_sub 1248 ! 1249 SAFE_ALLOCATE(scal_sub%nprd, (peinf%npes)) 1250 SAFE_ALLOCATE(scal_sub%npcd, (peinf%npes)) 1251 scal_sub%nprd = 0 1252 scal_sub%npcd = 0 1253 scal_sub%nprd(peinf%inode + 1) = scal_sub%npr 1254 scal_sub%npcd(peinf%inode + 1) = scal_sub%npc 1255 ! 1256 call MPI_ALLREDUCE(MPI_IN_PLACE,scal_sub%nprd,peinf%npes, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD,mpierr) 1257 call MPI_ALLREDUCE(MPI_IN_PLACE,scal_sub%npcd,peinf%npes, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD,mpierr) 1258 ! get max number of row/col 1259 max_npr_sub = MAXVAL(scal_sub%nprd) 1260 max_npc_sub = MAXVAL(scal_sub%npcd) 1261 1262 ! 
precompute indices for subspace matrices 1263 SAFE_ALLOCATE(row_indices_sub, (max_npr_sub, peinf%npes)) 1264 row_indices_sub = 0 1265 DO i_local = 1, scal_sub%npr 1266 i_global = indxl2g(i_local, scal_sub%nbl, scal_sub%myprow, 0, scal_sub%nprow) 1267 row_indices_sub(i_local, peinf%inode + 1) = i_global 1268 END DO 1269 SAFE_ALLOCATE(col_indices_sub, (max_npc_sub, peinf%npes)) 1270 col_indices_sub = 0 1271 DO i_local = 1, scal_sub%npc 1272 i_global = indxl2g(i_local, scal_sub%nbl, scal_sub%mypcol, 0, scal_sub%npcol) 1273 col_indices_sub(i_local, peinf%inode + 1) = i_global 1274 END DO 1275 call MPI_ALLREDUCE(MPI_IN_PLACE, row_indices_sub,peinf%npes*max_npr_sub, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD,mpierr) 1276 call MPI_ALLREDUCE(MPI_IN_PLACE, col_indices_sub,peinf%npes*max_npc_sub, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD,mpierr) 1277 1278 ! here we go with copy/redistribution 1279 Nfreq = sig%Nfreq 1280 SAFE_ALLOCATE(eps_sub_f, (MAX(1,scal_sub%npr), MAX(1,scal_sub%npc), Nfreq)) 1281 eps_sub_f = (0.0d0,0.0d0) 1282#ifdef HDF5 1283 if (sig%use_hdf5) then 1284 ! read from .h5 1285 call timing%start(timing%epscopy_io) 1286 call read_eps_matrix_par_hdf5_general_f(scal_sub, eps_sub_f, neig_sub, neig_sub, Nfreq, iq, 1, fname, & 1287 'mats/matrix_subspace', MPI_COMM_WORLD) 1288 call timing%stop(timing%epscopy_io) 1289 ! 1290 else ! sig%use_hdf5 1291#endif 1292 ! binary case 1293 SAFE_ALLOCATE(buffer_f, (Nfreq, max_npr_sub, max_npc_sub)) 1294 buffer_f = (0.0d0,0.0d0) 1295 IF(peinf%inode==0) THEN 1296 call timing%start(timing%sub_io_eps) 1297 SAFE_ALLOCATE(E_aux, (Nfreq,neig_sub,neig_sub)) 1298 E_aux = (0.0d0,0.0d0) 1299 do igp= 1, neig_sub 1300 do ig = 1, neig_sub 1301 read(iunit) (E_aux(ifreq,ig,igp),ifreq=1,Nfreq) 1302 end do 1303#ifdef CPLX 1304 do ig=1, neig_sub 1305 read(iunit) ! For epsADyn (empty line...) 1306 enddo 1307#endif 1308 end do 1309 call timing%stop(timing%sub_io_eps) 1310 ! prepare the messages and send them 1311 DO ipe = 1, peinf%npes - 1 1312 call timing%start(timing%sub_prep_eps) 1313 buffer_f = (0.0d0,0.0d0) 1314 DO j_local=1, scal_sub%npcd(ipe + 1) 1315 j_global = col_indices_sub(j_local, ipe + 1) 1316 DO i_local=1, scal_sub%nprd(ipe + 1) 1317 i_global = row_indices_sub(i_local, ipe + 1) 1318 buffer_f(1:Nfreq, i_local, j_local) = E_aux(1:Nfreq, i_global, j_global) 1319 END DO 1320 END DO 1321 call timing%stop(timing%sub_prep_eps) 1322 ! send 1323 call timing%start(timing%sub_comm_eps) 1324 tag = 0 1325 CALL MPI_SEND(buffer_f, Nfreq*max_npr_sub*max_npc_sub, MPI_COMPLEX_DPC, ipe, tag, MPI_COMM_WORLD, mpierr) 1326 call timing%stop(timing%sub_comm_eps) 1327 END DO 1328 ! myself 1329 DO j_local=1, scal_sub%npc 1330 j_global = col_indices_sub(j_local, peinf%inode + 1) 1331 DO i_local=1, scal_sub%npr 1332 i_global = row_indices_sub(i_local, peinf%inode + 1) 1333 eps_sub_f(i_local, j_local, 1:Nfreq) = E_aux(1:Nfreq, i_global, j_global) 1334 END DO 1335 END DO 1336 ! done 1337 SAFE_DEALLOCATE(E_aux) 1338 ELSE 1339 ! receive my stuff 1340 tag = 0 1341 CALL MPI_RECV(buffer_f, max_npr_sub*max_npc_sub*Nfreq, MPI_COMPLEX_DPC, 0, tag, MPI_COMM_WORLD, mpistatus, mpierr) 1342 DO ifreq = 1, Nfreq 1343 eps_sub_f(1:scal_sub%npr, 1:scal_sub%npc, ifreq) = buffer_f(ifreq, 1:scal_sub%npr, 1:scal_sub%npc) 1344 END DO 1345 END IF 1346 SAFE_DEALLOCATE(buffer_f) 1347 ! 1348#ifdef HDF5 1349 end if ! sig%use_hdf5 1350#endif 1351 1352 ! 
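    ! The two array descriptors initialized below describe, on the same process
    ! grid and with the same block size, the full nmtx x nmtx matrices (desca)
    ! and the neig_sub x neig_sub subspace matrices (desc_sub); having both lets
    ! a single pzgemm combine a full-basis operand with a subspace operand in the
    ! expansion step further down. For reference, the ScaLAPACK call is
    !   descinit(desc, M, N, MB, NB, irsrc, icsrc, icntxt, lld, info)
    ! with M = N = nmtx (or neig_sub), MB = NB = scal%nbl, and
    ! lld = max(1, local rows).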
initialize descriptors 1353 call descinit(desca, nmtx, nmtx, scal%nbl, scal%nbl, 0, 0, & 1354 scal%icntxt, max(1,scal%npr), info) 1355 if(info < 0) then 1356 write(0,'(a,i3,a)') 'Argument number ', -info, ' had an illegal value on entry.' 1357 call die("descinit error for descaA in subspace epscopy") 1358 else if(info > 0) then 1359 write(0,*) 'info = ', info 1360 call die("descinit error for descaA in subspace epscopy") 1361 endif 1362 1363 call descinit(desc_sub, neig_sub, neig_sub, scal_sub%nbl, scal_sub%nbl, 0, 0, & 1364 scal_sub%icntxt, max(1,nrow_loc_sub), info) 1365 if(info < 0) then 1366 write(0,'(a,i3,a)') 'Argument number ', -info, ' had an illegal value on entry.' 1367 call die("descinit error for desca_sub in subspace epscopy") 1368 else if(info > 0) then 1369 write(0,*) 'info = ', info 1370 call die("descinit error for desca_sub in subspace epscopy") 1371 endif 1372 1373 ! 0) initialize, map proc to rank_pool 1374 SAFE_ALLOCATE(grid2pe, (scal%nprow, scal%npcol)) 1375 grid2pe = 0 1376 grid2pe(scal%myprow+1, scal%mypcol+1) = peinf%inode 1377 call MPI_ALLREDUCE(MPI_IN_PLACE, grid2pe, scal%nprow*scal%npcol , MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, mpierr) 1378 ! 1379 SAFE_ALLOCATE(pe2grid, (2,peinf%npes)) 1380 pe2grid = 0 1381 pe2grid(1, peinf%inode+1) = scal%myprow 1382 pe2grid(2, peinf%inode+1) = scal%mypcol 1383 call MPI_ALLREDUCE(MPI_IN_PLACE, pe2grid, 2*peinf%npes, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, mpierr) 1384 ! 1385 SAFE_ALLOCATE(pool2pe, (peinf%npools,peinf%npes_pool)) 1386 pool2pe = 0 1387 IF(peinf%my_pool >= 0 .AND. peinf%pool_rank >= 0) THEN 1388 pool2pe(peinf%my_pool+1, peinf%pool_rank+1) = peinf%inode 1389 END IF 1390 call MPI_ALLREDUCE(MPI_IN_PLACE, pool2pe, peinf%npools*peinf%npes_pool, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, mpierr) 1391 ! loop over my local column and calculate the how many of them I have to send to all other proc 1392 SAFE_ALLOCATE(num_col_to_send, (peinf%npes)) 1393 num_col_to_send = 0 1394 do j_local = 1, scal%npc 1395 ! local to global (in the epsilon distribution) 1396 j_global = col_indices(j_local, peinf%inode + 1) 1397 IF(j_global > nmtx) CYCLE 1398 ! global to pool_rank (in the new sigma distribution) 1399 pool_send = INDXG2P( j_global, epsmpi%nb, iproc_dum, 0, peinf%npes_pool) 1400 DO iproc_dum = 0, peinf%npools - 1 1401 ! select the process that I will have to send 1402 iproc_send = pool2pe(iproc_dum+1,pool_send+1) 1403 num_col_to_send(iproc_send+1) = num_col_to_send(iproc_send+1) + 1 1404 END DO 1405 end do 1406 ! loop over my local column (in the new sigma distribution) and precompute how many columns I have to receive from 1407 ! all other processes, this will happen only if peinf%my_pool >= 0 (I hope...) 1408 SAFE_ALLOCATE(num_col_to_recv, (peinf%npes)) 1409 num_col_to_recv = 0 1410 IF(peinf%my_pool >= 0) THEN 1411 do j_local = 1, epsmpi%ngpown 1412 ! local to global (in the sigma distribution) 1413 j_global = INDXL2G(j_local, epsmpi%nb, peinf%pool_rank, 0, peinf%npes_pool) 1414 IF(j_global > nmtx) CYCLE 1415 ! get the process in the epsilon distribution that hold this column 1416 iproc_row_rec = INDXG2P( j_global, scal%nbl, iproc_dum, 0, scal%npcol) 1417 DO iproc_dum = 0, scal%nprow - 1 1418 iproc_rec = grid2pe(iproc_dum+1, iproc_row_rec+1) 1419 num_col_to_recv(iproc_rec+1) = num_col_to_recv(iproc_rec+1) + 1 1420 END DO 1421 end do 1422 END IF 1423 1424 ! 
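    ! Bookkeeping for the epsilon -> Sigma redistribution: each global column is
    ! owned by one pool rank in the 1D Sigma layout, and every pool holds a full
    ! copy, so a rank owning a piece of that column in the 2D epsilon layout
    ! sends it once to the corresponding rank of every pool (num_col_to_send),
    ! while each destination rank expects pieces from every process row that
    ! holds part of the column (num_col_to_recv).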
allocate receiving buffer 1425 Nbuf_recv = 0 1426 DO ipe = 1, peinf%npes 1427 IF(num_col_to_recv(ipe) > 0) Nbuf_recv = Nbuf_recv + 1 1428 END DO 1429 SAFE_ALLOCATE(buf_recv, (Nbuf_recv)) 1430 imsg_recv = 0 1431 DO ipe = 1, peinf%npes 1432 IF(num_col_to_recv(ipe) > 0) THEN 1433 imsg_recv = imsg_recv + 1 1434 buf_recv(imsg_recv)%proc = ipe - 1 1435 buf_recv(imsg_recv)%ncol = num_col_to_recv(ipe) 1436 buf_recv(imsg_recv)%nrow = scal%nprd(ipe) 1437 SAFE_ALLOCATE(buf_recv(imsg_recv)%msg, (buf_recv(imsg_recv)%nrow,buf_recv(imsg_recv)%ncol)) 1438 SAFE_ALLOCATE(buf_recv(imsg_recv)%col_global_indx, (buf_recv(imsg_recv)%ncol)) 1439 buf_recv(imsg_recv)%col_global_indx = -100 1440 END IF 1441 END DO 1442 1443 ! allocate sending buffer 1444 Nbuf_send = 0 1445 DO ipe = 1, peinf%npes 1446 IF(num_col_to_send(ipe) > 0) Nbuf_send = Nbuf_send + 1 1447 END DO 1448 SAFE_ALLOCATE(proc2msg_send, (peinf%npes)) 1449 proc2msg_send = 0 1450 SAFE_ALLOCATE(buf_send, (Nbuf_send)) 1451 imsg_send = 0 1452 DO ipe = 1, peinf%npes 1453 IF(num_col_to_send(ipe) > 0) THEN 1454 imsg_send = imsg_send + 1 1455 buf_send(imsg_send)%proc = ipe - 1 1456 buf_send(imsg_send)%ncol = num_col_to_send(ipe) 1457 buf_send(imsg_send)%nrow = scal%npr 1458 SAFE_ALLOCATE(buf_send(imsg_send)%msg, (buf_send(imsg_send)%nrow, buf_send(imsg_send)%ncol)) 1459 SAFE_ALLOCATE(buf_send(imsg_send)%col_global_indx, (buf_send(imsg_send)%ncol)) 1460 buf_send(imsg_send)%col_global_indx = -100 1461 ! 1462 proc2msg_send(ipe) = imsg_send 1463 END IF 1464 END DO 1465 SAFE_ALLOCATE(local_col_count, (Nbuf_send)) 1466 SAFE_ALLOCATE(req_send, (2 * Nbuf_send)) 1467 SAFE_ALLOCATE(my_status, (MPI_STATUS_SIZE, 2*Nbuf_send)) 1468 SAFE_ALLOCATE(req_recv, (2 * Nbuf_recv)) 1469 1470 ! in the case of sigma subspace find which is the head/wings position for this 1471 ! specific q-point 1472 if(sig%do_sigma_subspace) then 1473 max_head = 1 1474 if((peinf%inode==0)) then 1475 gg = 0 1476 call findvector(iout, gg, gvec) 1477 max_head = epsmpi%isrtqi(iout,iq+qoffset) 1478 call MPI_Bcast(max_head,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr) 1479 else 1480 call MPI_Bcast(max_head,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr) 1481 end if 1482 sig%epssub%wing_pos(iq+qoffset) = max_head 1483 end if 1484 1485 ! prepare the coul_Aux to avoid recomputing many times the SQRT(vcoul) 1486 call timing%start(timing%epscopy_vcoul) 1487 SAFE_ALLOCATE(row_vcoul_aux, (scal%npr)) ! row -> * SQRT(vc) 1488 SAFE_ALLOCATE(col_vcoul_aux, (scal%npc)) ! col -> * 1/SQRT(vc) 1489 ! initialize to one since it will be a scaling factor 1490 row_vcoul_aux = 1.0d+00 1491 do ig_l = 1, scal%npr 1492 ig_g = indxl2g(ig_l, scal%nbl, scal%myprow, 0, scal%nprow) 1493 row_vcoul_aux(ig_l) = sqrt(vcoul(ig_g)) 1494 enddo 1495 col_vcoul_aux = 1.0d+00 1496 do igp_l = 1, scal%npc 1497 igp_g = indxl2g(igp_l, scal%nbl, scal%mypcol, 0, scal%npcol) 1498 vc = 1.0d+00 / sqrt(vcoul(igp_g)) 1499 col_vcoul_aux(igp_l) = vc 1500 end do 1501 ! diagonal elements 1502 my_num_diag_elem = 0 1503 do igp_l = 1, scal%npc 1504 igp_g = indxl2g(igp_l, scal%nbl, scal%mypcol, 0, scal%npcol) 1505 do ig_l = 1, scal%npr 1506 ig_g = indxl2g(ig_l, scal%nbl, scal%myprow, 0, scal%nprow) 1507 if (ig_g==igp_g) my_num_diag_elem = my_num_diag_elem + 1 1508 enddo 1509 enddo 1510 ! 
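          ! max_head, located above via findvector of G=(0,0,0), is the position
          ! of the G=0 vector in this q-point's epsilon ordering; it marks the
          ! head/wing row and column that are stored separately when
          ! sig%do_sigma_subspace is used. The row/col prefactors computed above
          ! implement the "unsymmetrization"
          !   eps^-1(G,G') -> sqrt(v(G)) / sqrt(v(G')) * eps^-1(G,G')
          ! applied after the subspace matrices are expanded below.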
allocate index array for diagonal elements 1511 SAFE_ALLOCATE(my_diag_indeces, (2,my_num_diag_elem)) 1512 my_diag_indeces = 0 1513 idiag = 0 1514 do igp_l = 1, scal%npc 1515 igp_g = indxl2g(igp_l, scal%nbl, scal%mypcol, 0, scal%npcol) 1516 vc = sqrt(vcoul(igp_g)) 1517 do ig_l = 1, scal%npr 1518 ig_g = indxl2g(ig_l, scal%nbl, scal%myprow, 0, scal%nprow) 1519 if (ig_g==igp_g) then 1520 idiag = idiag + 1 1521 my_diag_indeces(1, idiag) = ig_l 1522 my_diag_indeces(2, idiag) = igp_l 1523 end if 1524 enddo 1525 enddo 1526 call timing%stop(timing%epscopy_vcoul) 1527 1528 ! for the subspace epsinv ONE has been already scaled from the diagonal... 1529 ! expand the subspace inverse epsilon 1530 ! allocate temporary matrices 1531 SAFE_ALLOCATE(eps1Aux, (scal%npr, scal%npc)) 1532 SAFE_ALLOCATE(C_Pgemm, (scal%npr, scal%npc)) 1533 ! loop over frequencies 1534 DO ifreq = 1, Nfreq 1535 call timing%start(timing%epscopy_pgemm) 1536 CALL pzgemm('N','N', nmtx, neig_sub, neig_sub, (1.0d0,0.0d0), eigenvect, 1, 1, desca, & 1537 eps_sub_f(:,:,ifreq), 1, 1, desc_sub, (0.0d0,0.0d0), & 1538 C_Pgemm, 1, 1, desca) 1539 if((.not.sig%do_sigma_subspace) .or. ifreq == 1) then 1540 ! here we always calculate the full epsilon^{-1} for the first frequency, this 1541 ! is apparently important if we need to calculate the static CH/SH 1542 CALL pzgemm('N','C', nmtx, nmtx, neig_sub, (1.0d0,0.0d0), C_Pgemm, 1, 1, desca, & 1543 eigenvect, 1, 1, desca, (0.0d0,0.0d0), & 1544 eps1Aux, 1, 1, desca) 1545 else 1546 CALL pzgemm('N','C', max_head, nmtx, neig_sub, (1.0d0,0.0d0), C_Pgemm, 1, 1, desca, & 1547 eigenvect, 1, 1, desca, (0.0d0,0.0d0), & 1548 eps1Aux, 1, 1, desca) 1549 do j_local = 1, scal%npc 1550 j_global = col_indices(j_local, peinf%inode + 1) 1551 if( j_global <= max_head) eps1Aux(:,j_local) = (0.0d0,0.0d0) 1552 end do 1553 CALL pzgemm('N','C', nmtx, max_head, neig_sub, (1.0d0,0.0d0), C_Pgemm, 1, 1, desca, & 1554 eigenvect, 1, 1, desca, (1.0d0,0.0d0), & 1555 eps1Aux, 1, 1, desca) 1556 end if 1557 call timing%stop(timing%epscopy_pgemm) 1558 ! here we have the full matrix for frequency ifreq, restore one on diagonal 1559 ! and "unsymmetrize" with vcoul 1560 ! FHJ: WARNING: never perform a nested loop over the global rows and 1561 ! columns, as these dimensions may be huge, and the code will spend 1562 ! a long time doing nothing. Instead, always loop over the local rows 1563 ! and columns and use indxl2g to get the corresponding global index. 1564 call timing%start(timing%epscopy_vcoul) 1565 if(.true.) then 1566 ! new fast scheme 1567 ! sum one on diagonal 1568 do idiag = 1, my_num_diag_elem 1569 i_local = my_diag_indeces(1, idiag) 1570 j_local = my_diag_indeces(2, idiag) 1571 eps1Aux(i_local, j_local) = eps1Aux(i_local, j_local) + ONE 1572 end do 1573 ! scale with coulomb operator 1574 do igp_l = 1, scal%npc 1575 vc = col_vcoul_aux(igp_l) 1576 do ig_l = 1, scal%npr 1577 eps1Aux(ig_l,igp_l) = eps1Aux(ig_l,igp_l) * row_vcoul_aux(ig_l) * vc 1578 end do 1579 end do 1580 ! 1581 else 1582 ! keep old slow scheme for debug 1583 do igp_l = 1, scal%npc 1584 igp_g = indxl2g(igp_l, scal%nbl, scal%mypcol, 0, scal%npcol) 1585 vc = sqrt(vcoul(igp_g)) 1586 do ig_l = 1, scal%npr 1587 ig_g = indxl2g(ig_l, scal%nbl, scal%myprow, 0, scal%nprow) 1588 if (ig_g==igp_g) eps1Aux(ig_l,igp_l) = eps1Aux(ig_l,igp_l) + ONE 1589 eps1Aux(ig_l,igp_l) = eps1Aux(ig_l,igp_l) * sqrt(vcoul(ig_g)) / vc 1590 enddo 1591 enddo 1592 ! 1593 end if 1594 call timing%stop(timing%epscopy_vcoul) 1595 1596 ! 
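      ! At this point eps1Aux holds, in the 2D block-cyclic layout, the expanded
      ! inverse dielectric matrix for frequency ifreq, roughly
      !   eps1Aux(G,G') ~ sqrt(v(G))/sqrt(v(G')) * [ V E_sub(ifreq) V^H + 1 ](G,G')
      ! For the subspace-Sigma path with ifreq > 1, only the head rows/columns
      ! (global index <= max_head) were actually computed above, since only the
      ! wings of the full matrix are needed in that case.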
in case of subspace in sigma, we just save in the fully expanded 1597 ! form the static case (first frequency, equal to 0) 1598 IF((.not.sig%do_sigma_subspace) .OR. ifreq == 1) THEN 1599 ! here we have the epsinv matrix as output from a standard epsilon calculation 1600 ! redistribute according to the new layout as required in sigma (for omega=0 there 1601 ! is the possibility to keep the full epsinv) 1602 IF((.NOT.(keep_full_eps_static)) .OR. ifreq > 1) THEN 1603 ! post all messages to be received 1604 DO imsg_recv = 1, Nbuf_recv 1605 ! WRITE(2000+peinf%inode,*) peinf%inode, imsg_recv, buf_recv(imsg_recv)%proc 1606 tag = 0 1607 CALL MPI_IRECV(buf_recv(imsg_recv)%msg, buf_recv(imsg_recv)%ncol*buf_recv(imsg_recv)%nrow, & 1608 MPI_COMPLEX_DPC, buf_recv(imsg_recv)%proc, tag, MPI_COMM_WORLD, req_recv((imsg_recv-1)*2+1), & 1609 mpierr) 1610 tag = 0 1611 CALL MPI_IRECV(buf_recv(imsg_recv)%col_global_indx, buf_recv(imsg_recv)%ncol, & 1612 MPI_INTEGER, buf_recv(imsg_recv)%proc, tag, MPI_COMM_WORLD, req_recv(imsg_recv*2), & 1613 mpierr) 1614 END DO 1615 ! fill send buffer 1616 ! loop over local column 1617 local_col_count = 0 1618 do j_local = 1, scal%npc 1619 j_global = col_indices(j_local, peinf%inode + 1) 1620 IF(j_global > nmtx) CYCLE 1621 pool_send = INDXG2P( j_global, epsmpi%nb, iproc_dum, 0, peinf%npes_pool) 1622 DO iproc_dum = 0, peinf%npools - 1 1623 iproc_send = pool2pe(iproc_dum+1,pool_send+1) 1624 imsg_send = proc2msg_send(iproc_send+1) 1625 local_col_count(imsg_send) = local_col_count(imsg_send) + 1 1626 buf_send(imsg_send)%msg(1:scal%npr,local_col_count(imsg_send)) = eps1Aux(1:scal%npr,j_local) 1627 buf_send(imsg_send)%col_global_indx(local_col_count(imsg_send)) = j_global 1628 END DO 1629 end do 1630 ! send messeges 1631 DO imsg_send = 1, Nbuf_send 1632 ! WRITE(3000+peinf%inode,*) peinf%inode, imsg_send, buf_send(imsg_send)%proc 1633 tag = 0 1634 CALL MPI_ISEND(buf_send(imsg_send)%msg, buf_send(imsg_send)%ncol*buf_send(imsg_send)%nrow, & 1635 MPI_COMPLEX_DPC, buf_send(imsg_send)%proc, tag, MPI_COMM_WORLD, req_send((imsg_send-1)*2+1), & 1636 mpierr) 1637 tag = 0 1638 CALL MPI_ISEND(buf_send(imsg_send)%col_global_indx, buf_send(imsg_send)%ncol, & 1639 MPI_INTEGER, buf_send(imsg_send)%proc, tag, MPI_COMM_WORLD, req_send(imsg_send*2), & 1640 mpierr) 1641 END DO 1642 ! collect messages 1643 DO imsg_recv = 1, Nbuf_recv 1644 CALL MPI_WAIT(req_recv((imsg_recv-1)*2+1), mpistatus, mpierr) 1645 CALL MPI_WAIT(req_recv(imsg_recv*2), mpistatus, mpierr) 1646 ! fill in epsmpi 1647 ! iproc_row_rec = pe2grid(1, buf_recv(imsg_recv)%proc+1) 1648 DO j_local = 1, buf_recv(imsg_recv)%ncol 1649 j_global = buf_recv(imsg_recv)%col_global_indx(j_local) 1650 my_col = INDXG2L(j_global, epsmpi%nb, peinf%pool_rank, 0, peinf%npes_pool) 1651 DO i_local = 1, buf_recv(imsg_recv)%nrow 1652 i_global = row_indices(i_local, buf_recv(imsg_recv)%proc+1) 1653 epsmpi%epsR(i_global,my_col,ifreq,iq+qoffset) = buf_recv(imsg_recv)%msg(i_local,j_local) 1654 END DO 1655 END DO 1656 END DO 1657 1658 ! wait for all 1659 CALL MPI_WAITALL(Nbuf_send*2,req_send,my_status,mpierr) 1660 END IF 1661 call MPI_Barrier(MPI_COMM_WORLD, mpierr) 1662 END IF ! .not.subspace and ifreq>1 1663 1664 IF(sig%do_sigma_subspace) THEN 1665 ! in case of subspace here we need to save the wings of the full matrix 1666 ! 
      IF(sig%do_sigma_subspace) THEN
        ! in case of subspace, here we need to save the wings of the full matrix;
        ! for simplicity we always read from eps1Aux, even in the case (keep_full_eps_static and ifreq==1)

        sig%epssub%eps_wings_cols(:,ifreq,iq+qoffset) = (0.0d0,0.0d0)
        ipe_wing = INDXG2P(max_head, scal%nbl, iproc_dum, 0, scal%npcol)
        if (ipe_wing == scal%mypcol) then
          my_pos_loc_wing = INDXG2L(max_head, scal%nbl, scal%mypcol, 0, scal%npcol)
          do i_local = 1, scal%npr
            i_global = row_indices(i_local, peinf%inode + 1)
            sig%epssub%eps_wings_cols(i_global,ifreq,iq+qoffset) = eps1Aux(i_local,my_pos_loc_wing)
          end do
        end if

        sig%epssub%eps_wings_rows(:,ifreq,iq+qoffset) = (0.0d0,0.0d0)
        ipe_wing = INDXG2P(max_head, scal%nbl, iproc_dum, 0, scal%nprow)
        if (ipe_wing == scal%myprow) then
          my_pos_loc_wing = INDXG2L(max_head, scal%nbl, scal%myprow, 0, scal%nprow)
          do j_local = 1, scal%npc
            j_global = col_indices(j_local, peinf%inode + 1)
            sig%epssub%eps_wings_rows(j_global,ifreq,iq+qoffset) = eps1Aux(my_pos_loc_wing,j_local)
            if (j_global == max_head) then
              sig%epssub%eps_wings_rows(max_head,ifreq,iq+qoffset) = &
                sig%epssub%eps_wings_rows(max_head,ifreq,iq+qoffset) - 1.0d0
            end if
          end do
        end if

      END IF

    END DO ! ifreq

    SAFE_DEALLOCATE(row_vcoul_aux)
    SAFE_DEALLOCATE(col_vcoul_aux)
    SAFE_DEALLOCATE(my_diag_indeces)

    ! check if we have to redistribute eigenvectors for the sigma-subspace calc
    if(sig%do_sigma_subspace) then
      call timing%start(timing%epscopy_redstr)
      !
      ! sum up the wings from the previous step
      call MPI_Allreduce(MPI_IN_PLACE, sig%epssub%eps_wings_rows(1:Neps,1:sig%nFreq,iq+qoffset), &
                         sig%epssub%neps * sig%nFreq, MPI_COMPLEX_DPC, MPI_SUM, MPI_COMM_WORLD, mpierr)
      call MPI_Allreduce(MPI_IN_PLACE, sig%epssub%eps_wings_cols(1:Neps,1:sig%nFreq,iq+qoffset), &
                         sig%epssub%neps * sig%nFreq, MPI_COMPLEX_DPC, MPI_SUM, MPI_COMM_WORLD, mpierr)

      if ( cyclic_redistr ) then
        ! cyclic-style redistribution (not the most efficient, but easier than other methods)
        ! start allocating buffers and initialize
        ! eigenvectors
        ! figure out how many columns this process owns with global index less than Neig
        ! (assuming that if j_local < j_local' then j_global < j_global')
        npc_lt_Neig = 0
        do j_local = 1, scal%npc
          j_global = col_indices(j_local, peinf%inode + 1)
          if ( j_global > neig_sub ) exit
          npc_lt_Neig = npc_lt_Neig + 1
        end do
        max_npc_lt_Neig = npc_lt_Neig
        call mpi_allreduce(MPI_IN_PLACE, max_npc_lt_Neig, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, mpierr)
        !
        SAFE_ALLOCATE(buffer_sub_eigen, (max_npr, max_npc_lt_Neig))
        SAFE_ALLOCATE(rec_buffer_sub_eigen, (max_npr, max_npc_lt_Neig))
        SAFE_ALLOCATE(send_buffer_sub_eigen, (max_npr, max_npc_lt_Neig))
!$OMP PARALLEL do private (j_local, i_local) collapse(2)
        do j_local = 1, max_npc_lt_Neig
          do i_local = 1, max_npr
            buffer_sub_eigen(i_local, j_local) = (0.0d0,0.0d0)
            rec_buffer_sub_eigen(i_local, j_local) = (0.0d0,0.0d0)
            send_buffer_sub_eigen(i_local, j_local) = (0.0d0,0.0d0)
          end do
        end do
!$OMP END PARALLEL DO
        !
!$OMP PARALLEL do private (j_local, i_local) collapse(2)
        do j_local = 1, npc_lt_Neig
          do i_local = 1, scal%npr
            buffer_sub_eigen(i_local, j_local) = eigenvect(i_local, j_local)
            send_buffer_sub_eigen(i_local, j_local) = eigenvect(i_local, j_local)
          end do
        end do
!$OMP END PARALLEL DO

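        ! Ring-pass sketch (comments only, not executed): the cyclic exchange below moves the
        ! block-cyclic pieces around a ring of peinf%npes processes,
        !
        !   isend_static = mod(inode+1, npes)    ! always send to the right-hand neighbor
        !   irec_static  = mod(inode-1, npes)    ! always receive from the left-hand neighbor
        !   ipe_real     = inode + 1             ! owner of the block currently in the buffer
        !   do ipe = 1, npes
        !     ! copy the columns of block "ipe_real" falling in my [indx_start, indx_end] range,
        !     ! pass the buffer one step around the ring, then ipe_real = mod(inode-ipe, npes) + 1
        !   enddo
        !
        ! so after npes steps every process has seen every block exactly once.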
        ! calculate my index range and save info
        do_copy = .true.
        sig%epssub%eps_sub_info(:,2,iq+qoffset) = 0
        indx_start = peinf%pool_rank * sig%epssub%Nbas_own_max + 1
        if(indx_start <= neig_sub) then
          indx_end = MIN((peinf%pool_rank + 1) * sig%epssub%Nbas_own_max, neig_sub)
          sub_size = indx_end - indx_start + 1
          sig%epssub%eps_sub_info(1,2,iq+qoffset) = indx_start
          sig%epssub%eps_sub_info(2,2,iq+qoffset) = indx_end
        else
          indx_start = 0
          indx_end = 0
          sub_size = 0
          do_copy = .false.
        end if
        sig%epssub%eps_sub_info(3,2,iq+qoffset) = nmtx

        ! keep track of the indices to copy
        SAFE_ALLOCATE(copy_col_inx, (max_npc_lt_Neig))
        ! get the processes for communication
        isend_static = MOD(peinf%inode + 1 + peinf%npes, peinf%npes)
        irec_static = MOD(peinf%inode - 1 + peinf%npes, peinf%npes)
        ! start cyclic loop (starting with myself)
        ipe_real = peinf%inode + 1
        do ipe = 1, peinf%npes
          !
          actual_send = MOD(peinf%inode + ipe + peinf%npes, peinf%npes)
          actual_rec = MOD(peinf%inode - ipe + peinf%npes, peinf%npes)
          !
          ! post messages
          tag_rec = 1
          tag_send = 1
          req_rec_singl = MPI_REQUEST_NULL
          req_send_singl = MPI_REQUEST_NULL
          CALL MPI_Irecv(rec_buffer_sub_eigen, max_npr*max_npc_lt_Neig, &
                         MPI_COMPLEX_DPC, irec_static, &
                         tag_rec, MPI_COMM_WORLD, req_rec_singl, mpierr)
          CALL MPI_Isend(send_buffer_sub_eigen, max_npr*max_npc_lt_Neig, &
                         MPI_COMPLEX_DPC, isend_static, &
                         tag_send, MPI_COMM_WORLD, req_send_singl, mpierr)

          if ( do_copy ) then
            ! precompute the number of columns to copy
            ncol_to_copy = 0
            copy_col_inx = 0
            do j_local = 1, MIN(max_npc_lt_Neig, scal%npcd(ipe_real))
              j_global = col_indices(j_local, ipe_real)
              if ( j_global >= indx_start .and. j_global <= indx_end) then
                ncol_to_copy = ncol_to_copy + 1
                copy_col_inx( ncol_to_copy ) = j_local
              end if
              if (j_global > indx_end) exit
            end do
            ! copy the message into the local buffer
!$OMP PARALLEL do private (j, j_local, j_global, i_local, i_global)
            do j = 1, ncol_to_copy
              j_local = copy_col_inx( j )
              j_global = col_indices(j_local, ipe_real) - indx_start + 1
              do i_local = 1, scal%nprd(ipe_real)
                i_global = row_indices(i_local, ipe_real)
                sig%epssub%eigenvec_sub(i_global, j_global, iq+qoffset) = buffer_sub_eigen(i_local,j_local)
              end do
            end do
!$OMP END PARALLEL DO
            !
          end if

          ! finalize communication
          CALL MPI_Wait(req_rec_singl, mpistatus, mpierr)
          ! swap buffers
!$OMP PARALLEL do private (j_local, i_local) collapse(2)
          do j_local = 1, max_npc_lt_Neig
            do i_local = 1, max_npr
              buffer_sub_eigen(i_local, j_local) = rec_buffer_sub_eigen(i_local, j_local)
            end do
          end do
!$OMP END PARALLEL DO
          ! the same for the sending buffer
          CALL MPI_Wait(req_send_singl, mpistatus, mpierr)
          ! swap buffers
!$OMP PARALLEL do private (j_local, i_local) collapse(2)
          do j_local = 1, max_npc_lt_Neig
            do i_local = 1, max_npr
              send_buffer_sub_eigen(i_local, j_local) = rec_buffer_sub_eigen(i_local, j_local)
            end do
          end do
!$OMP END PARALLEL DO

          ! be ready for the next cycle
          ipe_real = actual_rec + 1

        end do ! ipe
        ! deallocate stuff
        SAFE_DEALLOCATE(buffer_sub_eigen)
        SAFE_DEALLOCATE(rec_buffer_sub_eigen)
        SAFE_DEALLOCATE(send_buffer_sub_eigen)
        SAFE_DEALLOCATE(copy_col_inx)

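        ! Ownership sketch (comments only, with hypothetical numbers): every pool rank owns a
        ! contiguous slice of the neig_sub subspace vectors,
        !
        !   indx_start = pool_rank*Nbas_own_max + 1
        !   indx_end   = min((pool_rank+1)*Nbas_own_max, neig_sub)
        !
        ! e.g. if Nbas_own_max = 4 and neig_sub = 10, rank 0 owns 1-4, rank 1 owns 5-8,
        ! rank 2 owns 9-10, and any higher rank owns nothing (do_copy = .false.).
        ! The same slicing is applied again to the subspace epsilon right below.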
        ! the same for the subspace epsilon
        SAFE_ALLOCATE(buffer_sub_eps, (max_npr_sub, max_npc_sub, Nfreq))
        SAFE_ALLOCATE(rec_buffer_sub_eps, (max_npr_sub, max_npc_sub, Nfreq))
        SAFE_ALLOCATE(send_buffer_sub_eps, (max_npr_sub, max_npc_sub, Nfreq))
!$OMP PARALLEL do private (j_local, i_local, ifreq) collapse(2)
        do ifreq = 1, Nfreq
          do j_local = 1, max_npc_sub
            do i_local = 1, max_npr_sub
              buffer_sub_eps(i_local, j_local, ifreq) = (0.0d0,0.0d0)
              rec_buffer_sub_eps(i_local, j_local, ifreq) = (0.0d0,0.0d0)
              send_buffer_sub_eps(i_local, j_local, ifreq) = (0.0d0,0.0d0)
            end do
          end do
        end do
!$OMP END PARALLEL DO
!$OMP PARALLEL do private (j_local, i_local, ifreq) collapse(2)
        do ifreq = 1, Nfreq
          do j_local = 1, scal_sub%npc
            do i_local = 1, scal_sub%npr
              buffer_sub_eps(i_local, j_local, ifreq) = eps_sub_f(i_local, j_local, ifreq)
              send_buffer_sub_eps(i_local, j_local, ifreq) = eps_sub_f(i_local, j_local, ifreq)
            end do
          end do
        end do
!$OMP END PARALLEL DO

        ! calculate my index range and save info
        do_copy = .true.
        sig%epssub%eps_sub_info(:,1,iq+qoffset) = 0
        indx_start = peinf%pool_rank * sig%epssub%Nbas_own_max + 1
        if(indx_start <= neig_sub) then
          indx_end = MIN((peinf%pool_rank + 1) * sig%epssub%Nbas_own_max, neig_sub)
          sub_size = indx_end - indx_start + 1
          sig%epssub%eps_sub_info(1,1,iq+qoffset) = indx_start
          sig%epssub%eps_sub_info(2,1,iq+qoffset) = indx_end
        else
          indx_start = 0
          indx_end = 0
          sub_size = 0
          do_copy = .false.
        end if
        sig%epssub%eps_sub_info(3,1,iq+qoffset) = neig_sub

        ! keep track of the indices to copy
        SAFE_ALLOCATE(copy_col_inx, (max_npc_sub))
        ! get the processes for communication
        isend_static = MOD(peinf%inode + 1 + peinf%npes, peinf%npes)
        irec_static = MOD(peinf%inode - 1 + peinf%npes, peinf%npes)
        ! start cyclic loop (starting with myself)
        ipe_real = peinf%inode + 1
        do ipe = 1, peinf%npes
          !
          actual_send = MOD(peinf%inode + ipe + peinf%npes, peinf%npes)
          actual_rec = MOD(peinf%inode - ipe + peinf%npes, peinf%npes)
          !
          ! post messages
          tag_rec = 1
          tag_send = 1
          req_rec_singl = MPI_REQUEST_NULL
          req_send_singl = MPI_REQUEST_NULL
          CALL MPI_Irecv(rec_buffer_sub_eps, max_npr_sub*max_npc_sub*Nfreq, &
                         MPI_COMPLEX_DPC, irec_static, &
                         tag_rec, MPI_COMM_WORLD, req_rec_singl, mpierr)
          CALL MPI_Isend(send_buffer_sub_eps, max_npr_sub*max_npc_sub*Nfreq, &
                         MPI_COMPLEX_DPC, isend_static, &
                         tag_send, MPI_COMM_WORLD, req_send_singl, mpierr)

          if ( do_copy ) then
            ! precompute the number of columns to copy
            ncol_to_copy = 0
            copy_col_inx = 0
            do j_local = 1, scal_sub%npcd(ipe_real)
              j_global = col_indices_sub(j_local, ipe_real)
              if ( j_global >= indx_start .and. j_global <= indx_end) then
                ncol_to_copy = ncol_to_copy + 1
                copy_col_inx( ncol_to_copy ) = j_local
              end if
              if (j_global > indx_end) exit
            end do
            ! copy the message into the local buffer
!$OMP PARALLEL do private (j, j_local, j_global, i_local, i_global, ifreq)
            do ifreq = 1, Nfreq
              do j = 1, ncol_to_copy
                j_local = copy_col_inx( j )
                j_global = col_indices_sub(j_local, ipe_real) - indx_start + 1
                do i_local = 1, scal_sub%nprd(ipe_real)
                  i_global = row_indices_sub(i_local, ipe_real)
                  sig%epssub%eps_sub(i_global, j_global, ifreq, iq+qoffset) = &
                    buffer_sub_eps(i_local, j_local, ifreq)
                end do
              end do
            end do
!$OMP END PARALLEL DO
          end if ! do_copy

          ! finalize communication
          CALL MPI_Wait(req_rec_singl, mpistatus, mpierr)
          ! swap buffers
!$OMP PARALLEL do private (j_local, i_local, ifreq) collapse(2)
          do ifreq = 1, Nfreq
            do j_local = 1, max_npc_sub
              do i_local = 1, max_npr_sub
                buffer_sub_eps(i_local, j_local, ifreq) = rec_buffer_sub_eps(i_local, j_local, ifreq)
              end do
            end do
          end do
!$OMP END PARALLEL DO
          ! the same for the sending buffer
          CALL MPI_Wait(req_send_singl, mpistatus, mpierr)
          ! swap buffers
!$OMP PARALLEL do private (j_local, i_local, ifreq) collapse(2)
          do ifreq = 1, Nfreq
            do j_local = 1, max_npc_sub
              do i_local = 1, max_npr_sub
                send_buffer_sub_eps(i_local, j_local, ifreq) = rec_buffer_sub_eps(i_local, j_local, ifreq)
              end do
            end do
          end do
!$OMP END PARALLEL DO

          ! be ready for the next cycle
          ipe_real = actual_rec + 1

        end do ! ipe
        !
        SAFE_DEALLOCATE(copy_col_inx)
        SAFE_DEALLOCATE(buffer_sub_eps)
        SAFE_DEALLOCATE(rec_buffer_sub_eps)
        SAFE_DEALLOCATE(send_buffer_sub_eps)
        ! DONE
      else ! cyclic_redistr
        ! redistribution based on collective operations
        if (peinf%inode==0) write(6,'(1x,a)') 'Redistribute using collective operations!'
        !
        SAFE_ALLOCATE(buffer_sub_eps, (neig_sub, sig%epssub%Nbas_own_max, Nfreq))
        !XXX SAFE_ALLOCATE(buffer_sub_eigen, (sig%epssub%ngpown_sub_max, neig_sub))
        SAFE_ALLOCATE(buffer_sub_eigen, (nmtx, sig%epssub%Nbas_own_max))

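        ! Collective-redistribution sketch (comments only, not executed): for each pool rank
        ! iproc_pool, every process scatters the elements it owns of that rank's column slice
        ! into a zero-initialized, globally sized buffer, and a sum-reduction assembles the
        ! slice on its owner:
        !
        !   buffer_sub_eigen = 0
        !   buffer_sub_eigen(i_global, j_global - indx_start + 1) = <locally owned element>
        !   call MPI_Reduce(..., MPI_SUM, root = pool2pe(1,iproc_pool), ...)
        !
        ! The root passes MPI_IN_PLACE as the send buffer; on the other ranks the receive
        ! argument of MPI_Reduce is not significant, which is presumably why the code can get
        ! away with passing the same array twice there.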
        do iproc_pool = 1, peinf%npes_pool

          ! eigenvectors
          indx_start = (iproc_pool-1) * sig%epssub%Nbas_own_max + 1
          if(indx_start <= neig_sub) then
            indx_end = MIN(iproc_pool * sig%epssub%Nbas_own_max, neig_sub)
            sub_size = indx_end - indx_start + 1

            buffer_sub_eigen = (0.0d0,0.0d0)
            icurr = 0
            do j_global = indx_start, indx_end
              icurr = icurr + 1
              iproc_col = INDXG2P( j_global, scal%nbl, iproc_dum, 0, scal%npcol)
              if(iproc_col == scal%mypcol) then
                j_local = INDXG2L(j_global, scal%nbl, scal%mypcol, 0, scal%npcol)
                do i_local = 1, scal%npr
                  i_global = row_indices(i_local, peinf%inode + 1)
                  if(i_global > nmtx) cycle
                  buffer_sub_eigen(i_global, icurr) = eigenvect(i_local, j_local)
                end do
              end if
            end do

            iproc_send = pool2pe(1,iproc_pool)
            if(iproc_send == peinf%inode) then
              call MPI_Reduce(MPI_IN_PLACE, buffer_sub_eigen, &
                              nmtx*sig%epssub%Nbas_own_max, MPI_COMPLEX_DPC,&
                              MPI_SUM, iproc_send, MPI_COMM_WORLD, mpierr)
              ! copy data
              sig%epssub%eigenvec_sub(1:nmtx, 1:sig%epssub%Nbas_own, iq+qoffset) = &
                buffer_sub_eigen(1:nmtx, 1:sig%epssub%Nbas_own)
            else
              call MPI_Reduce(buffer_sub_eigen, buffer_sub_eigen, &
                              nmtx*sig%epssub%Nbas_own_max, MPI_COMPLEX_DPC,&
                              MPI_SUM, iproc_send, MPI_COMM_WORLD, mpierr)
            end if
          end if

          ! save information
          if(iproc_pool-1 == peinf%pool_rank) then
            sig%epssub%eps_sub_info(:,2,iq+qoffset) = 0
            if(indx_start <= neig_sub) then
              sig%epssub%eps_sub_info(1,2,iq+qoffset) = indx_start
              sig%epssub%eps_sub_info(2,2,iq+qoffset) = indx_end
            end if
            sig%epssub%eps_sub_info(3,2,iq+qoffset) = nmtx
          end if

          ! subspace epsilon
          indx_start = (iproc_pool-1) * sig%epssub%Nbas_own_max + 1
          if(indx_start <= neig_sub) then
            indx_end = MIN(iproc_pool * sig%epssub%Nbas_own_max, neig_sub)
            sub_size = indx_end - indx_start + 1

            buffer_sub_eps = (0.0d0,0.0d0)
            icurr = 0
            do j_global = indx_start, indx_end
              icurr = icurr + 1
              iproc_col = INDXG2P( j_global, scal_sub%nbl, iproc_dum, 0, scal_sub%npcol)
              if(iproc_col == scal_sub%mypcol) then
                j_local = INDXG2L(j_global, scal_sub%nbl, scal_sub%mypcol, 0, scal_sub%npcol)
                do i_local = 1, scal_sub%npr
                  i_global = row_indices_sub(i_local, peinf%inode + 1)
                  if(i_global > neig_sub) cycle
                  buffer_sub_eps(i_global, icurr, :) = eps_sub_f(i_local, j_local, :)
                end do
              end if
            end do

            iproc_send = pool2pe(1,iproc_pool)
            if(iproc_send == peinf%inode) then
              call MPI_Reduce(MPI_IN_PLACE, buffer_sub_eps, &
                              neig_sub * sig%epssub%Nbas_own_max * Nfreq, MPI_COMPLEX_DPC,&
                              MPI_SUM, iproc_send, MPI_COMM_WORLD, mpierr)
              ! copy data
              sig%epssub%eps_sub(1:neig_sub, 1:sig%epssub%Nbas_own, 1:Nfreq, iq+qoffset) = &
                buffer_sub_eps(1:neig_sub, 1:sig%epssub%Nbas_own, 1:Nfreq)
            else
              call MPI_Reduce(buffer_sub_eps, buffer_sub_eps, &
                              neig_sub * sig%epssub%Nbas_own_max * Nfreq, MPI_COMPLEX_DPC,&
                              MPI_SUM, iproc_send, MPI_COMM_WORLD, mpierr)
            end if
          end if

          ! save information
          if(iproc_pool-1 == peinf%pool_rank) then
            sig%epssub%eps_sub_info(:,1,iq+qoffset) = 0
            if(indx_start <= neig_sub) then
              sig%epssub%eps_sub_info(1,1,iq+qoffset) = indx_start
              sig%epssub%eps_sub_info(2,1,iq+qoffset) = indx_end
            end if
            sig%epssub%eps_sub_info(3,1,iq+qoffset) = neig_sub
          end if

        end do ! iproc_pool
        SAFE_DEALLOCATE(buffer_sub_eigen)
        SAFE_DEALLOCATE(buffer_sub_eps)

        ! replicate over pools
        if(peinf%inode == pool2pe(1, peinf%pool_rank + 1)) then
          do pool_send = 2, peinf%npools
            iproc_send = pool2pe(pool_send, peinf%pool_rank + 1)
            tag = 0
            CALL MPI_SEND(sig%epssub%eps_sub(:,:,:,iq+qoffset), &
                          sig%neig_sub_max * sig%epssub%Nbas_own * sig%nFreq, &
                          MPI_COMPLEX_DPC, iproc_send, tag, MPI_COMM_WORLD, mpierr)
            tag = 0
            CALL MPI_SEND(sig%epssub%eigenvec_sub(:,:,iq+qoffset), &
                          sig%epssub%neps * sig%epssub%Nbas_own, &
                          MPI_COMPLEX_DPC, iproc_send, tag, MPI_COMM_WORLD, mpierr)
          end do
        else
          if(peinf%my_pool >= 0) then
            iproc_rec = pool2pe(1, peinf%pool_rank + 1)
            tag = 0
            CALL MPI_RECV(sig%epssub%eps_sub(:,:,:,iq+qoffset), &
                          sig%neig_sub_max * sig%epssub%Nbas_own * sig%nFreq, &
                          MPI_COMPLEX_DPC, iproc_rec, tag, MPI_COMM_WORLD, mpistatus, mpierr)
            tag = 0
            CALL MPI_RECV(sig%epssub%eigenvec_sub(:,:,iq+qoffset), &
                          sig%epssub%neps * sig%epssub%Nbas_own, &
                          MPI_COMPLEX_DPC, iproc_rec, tag, MPI_COMM_WORLD, mpistatus, mpierr)
          end if
        end if
        !
      end if ! cyclic_redistr

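      ! Replication sketch (comments only, not executed): in the collective branch the assembled
      ! slices end up only on the members of the first pool, so they are replicated "by hand":
      !
      !   if (I am pool2pe(1, my_pool_rank+1)) then
      !     do ipool = 2, npools
      !       call MPI_SEND(my slice, ..., pool2pe(ipool, my_pool_rank+1), ...)
      !     enddo
      !   else if (my_pool >= 0) then
      !     call MPI_RECV(my slice, ..., pool2pe(1, my_pool_rank+1), ...)
      !   endif
      !
      ! The cyclic branch does not need this step, since there every process copies its own
      ! slice directly as the buffers travel around the ring.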
      ! copy the Coulomb operator
      sig%epssub%vcoul_sub(1:nmtx,iq+qoffset) = vcoul(1:nmtx)
      !
      call timing%stop(timing%epscopy_redstr)
    end if ! do_sigma_subspace

    SAFE_DEALLOCATE(eps1Aux)
    SAFE_DEALLOCATE(C_Pgemm)
    SAFE_DEALLOCATE(eigenvect)
    SAFE_DEALLOCATE(eps_sub_f)

    ! deallocate buffers
    DO imsg_recv = 1, Nbuf_recv
      SAFE_DEALLOCATE( buf_recv(imsg_recv)%msg )
      SAFE_DEALLOCATE( buf_recv(imsg_recv)%col_global_indx )
    END DO
    SAFE_DEALLOCATE(buf_recv)
    DO imsg_send = 1, Nbuf_send
      SAFE_DEALLOCATE( buf_send(imsg_send)%msg )
      SAFE_DEALLOCATE( buf_send(imsg_send)%col_global_indx )
    END DO
    SAFE_DEALLOCATE(buf_send)
    SAFE_DEALLOCATE(proc2msg_send)
    SAFE_DEALLOCATE(local_col_count)
    SAFE_DEALLOCATE(req_send)
    SAFE_DEALLOCATE(my_status)
    SAFE_DEALLOCATE(req_recv)

    !WRITE(*,*) sig%need_advanced

    POP_SUB(epscopy.epscopy_subspace)

  end subroutine
#endif

end subroutine epscopy

end module epscopy_m