1! 2! Copyright (C) 2007-2016 Quantum ESPRESSO group 3! This file is distributed under the terms of the 4! GNU General Public License. See the file `License' 5! in the root directory of the present distribution, 6! or http://www.gnu.org/copyleft/gpl.txt . 7! 8!---------------------------------------------------------------------------- 9SUBROUTINE memory_report() 10 !---------------------------------------------------------------------------- 11 ! 12 ! Rough estimate of the dynamical memory allocated by the pw.x code 13 ! Should be called after the first steps of initialization are done, 14 ! but before large arrays are actually allocated. 15 ! Not guaranteed to be accurate for all cases (especially exotic ones). 16 ! Originally written by PG, much improved by Pietro Bonfa' with: 17 ! * Detailed memory report in verbose mode 18 ! * Memory buffers for LDA+U projectors now included. 19 ! * Local potential and structure factors now included 20 ! (small but sometimes not negligible). 21 ! * Q functions (qrad) now included. 22 ! * Initial estimate of memory used in runs with real space augmentation. 23 ! * Added psi and vc (or vr) in diagonalization routines. 24 ! * Added memory allocated during initialization (wfc_init + rotate_wfc). 25 ! * Added memory used for force evaluation. 26 ! * Added a few comments. 27 ! 28 USE io_global, ONLY : stdout 29 USE kinds, ONLY : dp 30 USE constants, ONLY : tpi, fpi, pi, eps16 31 USE wvfct, ONLY : nbnd, nbndx 32 USE basis, ONLY : natomwfc, starting_wfc 33 USE cell_base, ONLY : omega, bg, alat 34 USE exx, ONLY : ecutfock, use_ace 35 USE exx_base, ONLY : nkqs 36 USE fft_base, ONLY : dffts, dfftp 37 USE gvect, ONLY : ngm, ngl, ngm_g, g, ecutrho 38 USE gvecs, ONLY : ngms, doublegrid 39 USE gvecw, ONLY : ecutwfc, gcutw 40 USE klist, ONLY : nks, nkstot, xk, qnorm 41 USE cellmd, ONLY : cell_factor 42 USE uspp, ONLY : nkb, okvan 43 USE atom, ONLY : rgrid 44 USE funct, ONLY : dft_is_meta, dft_is_hybrid 45 USE ldaU, ONLY : lda_plus_u, U_projection, nwfcU 46 USE fixed_occ, ONLY : one_atom_occupations 47 USE wannier_new,ONLY: use_wannier 48 USE lsda_mod, ONLY : nspin 49 USE uspp_param,ONLY : lmaxkb, upf, nh, nbetam 50 USE us, ONLY : dq 51 USE noncollin_module, ONLY : npol, nspin_mag 52 USE control_flags, ONLY: isolve, nmix, imix, gamma_only, lscf, io_level, & 53 lxdm, smallmem, tqr, iverbosity 54 USE force_mod, ONLY : lforce, lstres 55 USE ions_base, ONLY : nat, ntyp => nsp, ityp 56 USE mp_bands, ONLY : nproc_bgrp, nbgrp 57 USE mp_pools, ONLY : npool 58 USE mp_images, ONLY : nproc_image 59 ! 60 IMPLICIT NONE 61 ! 62 INCLUDE 'laxlib.fh' 63 ! 64 INTEGER, PARAMETER :: MB=1024*1024 65 INTEGER, PARAMETER :: GB=1024*MB 66 INTEGER :: g_fact, mix_type_size, scf_type_size 67 INTEGER :: nk, nbnd_l, npwx_g, npwx_l, ngxx_g, nexx_l, nexx_g, ngm_l 68 INTEGER :: maxbnd, maxnab, maxnij, nab, na, nij, nt, lmaxq, nqxq 69 INTEGER :: indm, ijv, roughestimate 70 REAL(DP):: mbr, mbx, mby, mbz, dmbx, dmby, dmbz 71 ! 72 ! these quantities are real in order to prevent integer overflow 73 ! 74 REAL(dp), PARAMETER :: complex_size=16_dp, real_size=8_dp, int_size=4_dp 75 REAL(dp) :: ram, ram_, ram1, ram2, maxram, totram, add 76 INTEGER :: np_ortho(2) 77 ! 78 IF ( gamma_only) THEN 79 g_fact = 2 ! use half plane waves or G-vectors 80 ELSE 81 g_fact = 1 ! use all plane waves or G-vectors 82 END IF 83 ! 84 ! npwx (max number of plane waves) is roughly estimated from the volume 85 ! of the G-vector sphere, divided by the volume of the Brillouin Zone 86 ! (the exact value can be computed only after G-vectors are computed) 87 ! npwx_l is npwx on this processor (local) 88 ! npwx_g is npwx summed over all processors (global) 89 ! 90 npwx_g = NINT ( fpi/3.0_dp * SQRT(ecutwfc)**3 / (tpi**3/omega) / g_fact ) 91 npwx_l = npwx_g/nproc_bgrp 92 ! 93 ! ram = dynamically (permanently) allocated RAM, per process 94 ! maxram= "high watermark": max ram needed during a run 95 ! totram= max ram needed summed over all processors 96 ! 97 !===================================================================== 98 ! Wavefunctions (including buffer space) 99 ! 100 ! files: allocate_wfc.f90:28,30,32 101 ! orthoatwfc.f90:102 102 !===================================================================== 103 IF ( io_level > 0 .OR. nks == 1) THEN 104 nk = 1 ! store just one wavefunction array at the time 105 ELSE 106 nk = nks+1 ! store nks wavefunctions in memory (buffers) 107 END IF 108 ram = complex_size * nbnd * npol * npwx_l * nk 109 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'wfc', ram/nk/MB 110 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'wfc (w. buffer)', ram/MB 111 ! atomic wavefunctions 112 IF ( one_atom_occupations .OR. use_wannier ) THEN 113 add = complex_size * natomwfc * npol * npwx_l 114 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'atomic wfc', add/MB 115 ram = ram + add 116 END IF 117 ! Hubbard wavefunctions 118 IF ( lda_plus_u .AND. U_projection .NE. 'pseudo' ) THEN 119 add = complex_size * nwfcU * npol * npwx_l * nk ! also buffer 120 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'U proj.', add/nk/MB 121 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'U proj. (w. buff.)', add/MB 122 ram = ram + add 123 END IF 124 ! 125 ! hybrid functionals 126 IF ( dft_is_hybrid () ) THEN 127 ! ngxx_g = estimated global number of G-vectors used in V_x\psi products 128 ! nexx_g = estimated global size of the FFT grid used in V_x\psi products 129 ! nexx_l = estimated local size of the FFT grid used in V_x\psi products 130 ngxx_g = NINT ( fpi/3.0_dp * SQRT(ecutfock)**3 / (tpi**3/omega) ) 131 nexx_g = 2*ngxx_g 132 nexx_l = nexx_g/nproc_bgrp 133 ! nbnd_l : estimated number of bands per proc with band parallelization 134 nbnd_l = NINT( DBLE(nbnd) / nbgrp ) 135 ! Stored wavefunctions in real space 136 add = complex_size/g_fact * nexx_l * npol * nbnd_l * nkqs 137 ! ACE Projectors 138 IF (use_ace) add = add + complex_size * npwx_l * npol * nbnd * nks 139#if defined (__MPI) 140 ! scratch space needed for symmetrization 141 add = add + complex_size * nexx_g*npol*2 142#endif 143 add = add + complex_size * nexx_l*npol*2 144 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'EXX', add/MB 145 ram = ram + add 146 END IF 147 !===================================================================== 148 ! 149 !===================================================================== 150 ! Local pseudopotential, allocate_locpot.f90 151 !===================================================================== 152 add = complex_size * ngm * ntyp ! structure factor 153 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'str. fact', add/MB 154 ram = ram + add 155 add = real_size * ngl * ntyp ! local 156 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'local pot', add/MB 157 ram = ram + add 158 !===================================================================== 159 ! 160 !===================================================================== 161 ! Nonlocal pseudopotentials V_NL (beta functions), reciprocal space 162 !===================================================================== 163 add = complex_size * nkb * npwx_l ! allocate_wfc.f90:62 vkb 164 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'nlocal pot', add/MB 165 ram = ram + add 166 ! other (possibly minor) data loads 167 lmaxq = 2*lmaxkb+1 168 IF (lmaxq > 0) THEN 169 ! not accurate if spline_ps .and. cell_factor <= 1.1d0 170 nqxq = int( ( (sqrt(ecutrho) + qnorm) / dq + 4) * cell_factor ) 171 ! allocate_nlpot.f90:87 qrad 172 add = real_size * nqxq * nbetam*(nbetam+1)/2 * lmaxq * ntyp 173 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'qrad', add/MB 174 ram = ram + add 175 END IF 176 !===================================================================== 177 ! 178 !===================================================================== 179 ! Charge density and potentials - see scf_type in scf_mod 180 !===================================================================== 181 scf_type_size = (complex_size * ngm + real_size * dfftp%nnr ) * nspin ! scf_mod.f90:94-95 182 IF ( dft_is_meta() .or. lxdm ) scf_type_size = 2 * scf_type_size 183 ! rho, v, vnew (allocate_fft.f90:56) 184 add = 3 * scf_type_size 185 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'rho,v,vnew', add/MB 186 ram = ram + add 187 188 ! vltot, vrs, rho_core, rhog_core, psic, strf, kedtau if needed 189 ram = ram + complex_size * ( dfftp%nnr + ngm *( 1 + ntyp ) ) + & 190 real_size * dfftp%nnr*(2+nspin) 191 IF ( dft_is_meta() ) ram = ram + real_size * dfftp%nnr*nspin 192 ! arrays for rho mixing 193 IF ( lscf ) THEN 194 ! rhoin (electrons.f90:439) 195 ram = ram + scf_type_size 196 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'rhoin', DBLE(scf_type_size)/MB 197 ! see mix_type in scf_mod 198 mix_type_size = complex_size * ngm * nspin 199 IF ( dft_is_meta() .or. lxdm ) mix_type_size = 2 * mix_type_size 200 ! df, dv (if kept in memory) 201 IF ( io_level < 2 ) THEN 202 add = mix_type_size * 2 * nmix 203 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'rho*nmix', add/MB 204 ram = ram + add 205 END IF 206 END IF 207 !===================================================================== 208 ! 209 !===================================================================== 210 ! G-vectors: g, gg, mill, nl, nlm, ig_l2g, igtongl 211 !===================================================================== 212 add = real_size * ngm * 4 + int_size * ngm * 7 213 ! double grid: nls, nlsm 214 add = add + int_size * ngms * 2 215 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'G-vectors', add/MB 216 ram = ram + add 217 !===================================================================== 218 ! 219 !===================================================================== 220 ! Real treatment of augmentation charge. 221 !===================================================================== 222 IF ( okvan .and. tqr ) THEN 223 ! realus.f90:422 224 mbr = 0.d0 225 DO nt = 1, ntyp 226 IF ( .not. upf(nt)%tvanp ) CYCLE 227 DO ijv = 1, upf(nt)%nbeta*(upf(nt)%nbeta+1)/2 228 DO indm = upf(nt)%mesh,1,-1 229 ! 230 IF ( maxval(abs( upf(nt)%qfuncl(indm,ijv,:) )) > eps16 ) THEN 231 mbr = max( mbr, rgrid(nt)%r(indm) ) 232 exit 233 ENDIF 234 ! 235 ENDDO 236 ENDDO 237 END DO 238 mbr = mbr / alat 239 240 mbx = mbr*sqrt( bg(1,1)**2 + bg(1,2)**2 + bg(1,3)**2 ) 241 mby = mbr*sqrt( bg(2,1)**2 + bg(2,2)**2 + bg(2,3)**2 ) 242 mbz = mbr*sqrt( bg(3,1)**2 + bg(3,2)**2 + bg(3,3)**2 ) 243 ! 244 dmbx = 2*anint( mbx*dfftp%nr1x ) + 2 245 dmby = 2*anint( mby*dfftp%my_nr2p ) + 2 ! approximation! 246 dmbz = 2*anint( mbz*dfftp%my_nr3p ) + 2 ! approximation! 247 ! 248 roughestimate = anint( dble( dmbx*dmby*dmbz ) * pi / 6.D0 ) 249 add = 0.d0 250 DO na = 1, nat 251 nt = ityp(na) 252 IF ( .not. upf(nt)%tvanp ) CYCLE 253 ! mbia nfuncs 254 add = add + real_size*roughestimate*(nh(nt)*(nh(nt)+1)/2) 255 END DO 256 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'qr (very rough)', add/MB 257 ram = ram + add 258 END IF 259 !===================================================================== 260 ! 261 ! compute ram_: scratch space that raises the "high watermark" 262 ! 263 !===================================================================== 264 ! ram1: scratch space allocated in iterative diagonalization 265 ! hpsi, spsi, hr and sr matrices, scalar products 266 ! nbnd_l is the estimated dimension of distributed matrices 267 ! 268 CALL laxlib_getval( np_ortho = np_ortho ) 269 ! 270 nbnd_l = nbndx/np_ortho(1) 271 ram1 = complex_size/g_fact * ( 3*nbnd_l**2 ) ! hr,sr,vr/hc,sc,vc 272 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'h,s,v(r/c)', ram1/MB 273 add = complex_size/g_fact * ( nkb*npol*nbnd ) ! <psi|beta>, becmod.f90:353 274 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) '<psi|beta>', add/MB 275 ram1 = ram1 + add 276 ! 277 ! 278 IF ( isolve == 0 ) THEN 279 add = complex_size * nbndx * npol * npwx_l ! hpsi 280 ram1 = ram1 + add 281 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'psi', add/MB 282 ram1 = ram1 + add 283 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'hpsi', add/MB 284 IF ( okvan ) THEN 285 add = complex_size * nbndx * npol * npwx_l ! spsi 286 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'spsi', add/MB 287 ram1 = ram1 + add 288 END IF 289 END IF 290 ram_ = ram1 291 !===================================================================== 292 ! 293 !===================================================================== 294 ! arrays allocated in approx_screening2 during charge mixing 295 !===================================================================== 296 IF ( lscf .AND. imix > 1 ) & 297 ram_ = MAX( ram_, complex_size * ngm * 27 + real_size * dffts%nnr ) 298 !===================================================================== 299 ! 300 !===================================================================== 301 ! ram1: scratch space allocated in initialization 302 ! wfcinit + rotatewfcgamma can be larger than regterg cegterg if 303 ! starting with many atomic wavefunctions 304 ! files: wfcinit.f90:246 305 ! rotate_wfc_gamma.f90:45 306 ! rotate_wfc_k.f90:55 307 !===================================================================== 308 IF ( starting_wfc(1:6) == 'atomic' ) THEN 309 maxbnd = MAX( natomwfc, nbnd ) 310 ELSE 311 maxbnd = nbnd 312 END IF 313 ram1 = complex_size * npwx_l * maxbnd * ( npol + 1) + & 314 real_size * maxbnd*(3*maxbnd + 1) 315 ! 316 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'wfcinit/wfcrot', ram1/MB 317 ram_ = MAX( ram_, ram1 ) 318 !===================================================================== 319 ! 320 !===================================================================== 321 ! ram1: arrays allocated in addusdens_g and addusforce & addusstress 322 ! for ultrasoft/paw pp. 323 ! 324 ! N.B: newq is always smaller than addusdens_g 325 ! 326 ! files: addusdens.f90 : 92,93,114 327 ! addusforce.f90 : 78,102,105,117,132-133 328 ! addusstress.f90 329 !===================================================================== 330 IF ( okvan ) THEN 331 IF ( .not. tqr ) THEN 332 ngm_l = ngm/npool 333 maxnab = 0 334 maxnij = 0 335 DO nt = 1, ntyp 336 IF ( upf(nt)%tvanp ) THEN 337 ! 338 ! nij = max number of (ih,jh) pairs per atom type nt 339 ! 340 nij = nh(nt)*(nh(nt)+1)/2 341 IF ( nij > maxnij ) maxnij = nij 342 ! 343 ! count max number of atoms of type nt 344 ! 345 nab = 0 346 DO na = 1, nat 347 IF ( ityp(na) == nt ) nab = nab + 1 348 ENDDO 349 IF ( nab > maxnab ) maxnab = nab 350 END IF 351 END DO 352 ! ylmk0 qmod 353 ram1 = real_size * ngm_l * ( lmaxq * lmaxq + 1 ) 354 ! aux skk aux2 qgm 355 ram1 = ram1 + complex_size * ( ngm*nspin_mag + ngm_l*(maxnab+maxnij+1) ) 356 ! 357 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'addusdens', ram1/MB 358 ram_ = MAX ( ram_, ram1 ) 359 ! 360 ! forces 361 ! 362 IF (lforce) THEN 363 ! vg ylmk0 qmod 364 ram1 = real_size * (ngm*nspin_mag + ngm_l*( lmaxq*lmaxq + 1 ) ) 365 ! qgm aux1 366 ram1 = ram1 + complex_size * ngm_l * ( maxnij + 3*maxnab ) 367 ! ddeeq 368 ram1 = ram1 + real_size * ( maxnij * maxnab * 3 * nspin_mag ) 369 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'addusforce', ram1/MB 370 ! 371 ram_ = MAX ( ram_, ram1 ) 372 END IF 373 ! 374 ! stress 375 ! 376 IF (lstres) THEN 377 ! vg ylmk0,dylmk0 qmod 378 ram1 = real_size * (ngm*nspin_mag + ngm_l*( 2*lmaxq*lmaxq + 1 ) ) 379 ! qgm aux1 aux2 380 ram1 = ram1 + complex_size * ngm_l * ( maxnij + 3 + nspin ) 381 ! ddeeq 382 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'addusstress', ram1/MB 383 ! 384 ram_ = MAX ( ram_, ram1 ) 385 END IF 386 ELSE 387 ! nothing allocated in addusdens_r, hurray! 388 IF (lforce) THEN 389 ram1 = real_size*roughestimate*(nh(nt)*(nh(nt)+1)/2)*3 390 IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'addusforce_r', ram1/MB 391 ram_ = MAX ( ram_, ram1 ) 392 END IF 393 END IF 394 END IF 395 ! 396 maxram = ram + ram_ 397 ! 398 ! arrays used for global sorting in ggen: 399 ! igsrt, g2l, g2sort_g, total dimensions: 400 ! 401 IF ( .NOT. smallmem ) maxram = MAX ( maxram, & 402 int_size * 2 * ngm_g + real_size * ngm_g ) 403 ! 404 totram = maxram * nproc_image 405 IF ( iverbosity > 0 ) THEN 406 IF ( ram .lt. GB ) WRITE( stdout, 1010 ) ram/MB, ' MB' 407 IF ( ram .ge. GB ) WRITE( stdout, 1010 ) ram/GB, ' GB' 408 END IF 409 410 IF ( maxram .lt. GB ) WRITE( stdout, 1011 ) maxram/MB, ' MB' 411 IF ( maxram .ge. GB ) WRITE( stdout, 1011 ) maxram/GB, ' GB' 412 413 IF ( nproc_image > 1) THEN 414 IF ( totram .lt. GB ) WRITE( stdout, 1012 ) totram/MB, ' MB' 415 IF ( totram .ge. GB ) WRITE( stdout, 1012 ) totram/GB, ' GB' 416 END IF 417 ! 418 ! check: more bands than plane waves? not good 419 ! 420 IF ( npwx_g < nbndx ) CALL errore('memory_report','more bands than PWs!',1) 421 ! 422 1010 format (/5x,'Estimated static dynamical RAM per process > ', F10.2, A3) 423 1011 format (/5x,'Estimated max dynamical RAM per process > ', F10.2, A3) 424 1012 format (/5x,'Estimated total dynamical RAM > ', F10.2, A3) 425 1013 format (/5x,'Dynamical RAM for ', A19, ': ', F10.2, ' MB') 426 1014 format (/5x,'Dynamical RAM for ', A19, ': ', '?? MB (not yet implemented') 427END subroutine memory_report 428