1!
2! Copyright (C) 2007-2016 Quantum ESPRESSO group
3! This file is distributed under the terms of the
4! GNU General Public License. See the file `License'
5! in the root directory of the present distribution,
6! or http://www.gnu.org/copyleft/gpl.txt .
7!
8!----------------------------------------------------------------------------
9SUBROUTINE memory_report()
10  !----------------------------------------------------------------------------
11  !
12  ! Rough estimate of the dynamical memory allocated by the pw.x code
13  ! Should be called after the first steps of initialization are done,
14  ! but before large arrays are actually allocated.
15  ! Not guaranteed to be accurate for all cases (especially exotic ones).
16  ! Originally written by PG, much improved by Pietro Bonfa' with:
17  ! * Detailed memory report in verbose mode
18  ! * Memory buffers for LDA+U projectors now included.
19  ! * Local potential and structure factors now included
20  !  (small but sometimes not negligible).
21  ! * Q functions (qrad) now included.
22  ! * Initial estimate of memory used in runs with real space augmentation.
23  ! * Added psi and vc (or vr) in diagonalization routines.
24  ! * Added memory allocated during initialization (wfc_init + rotate_wfc).
25  ! * Added memory used for force evaluation.
26  ! * Added a few comments.
27  !
28  USE io_global, ONLY : stdout
29  USE kinds,     ONLY : dp
30  USE constants, ONLY : tpi, fpi, pi, eps16
31  USE wvfct,     ONLY : nbnd, nbndx
32  USE basis,     ONLY : natomwfc, starting_wfc
33  USE cell_base, ONLY : omega, bg, alat
34  USE exx,       ONLY : ecutfock, use_ace
35  USE exx_base,  ONLY : nkqs
36  USE fft_base,  ONLY : dffts, dfftp
37  USE gvect,     ONLY : ngm, ngl, ngm_g, g, ecutrho
38  USE gvecs,     ONLY : ngms, doublegrid
39  USE gvecw,     ONLY : ecutwfc, gcutw
40  USE klist,     ONLY : nks, nkstot, xk, qnorm
41  USE cellmd,    ONLY : cell_factor
42  USE uspp,      ONLY : nkb, okvan
43  USE atom,      ONLY : rgrid
44  USE funct,     ONLY : dft_is_meta, dft_is_hybrid
45  USE ldaU,      ONLY : lda_plus_u, U_projection, nwfcU
46  USE fixed_occ, ONLY : one_atom_occupations
47  USE wannier_new,ONLY: use_wannier
48  USE lsda_mod,  ONLY : nspin
49  USE uspp_param,ONLY : lmaxkb, upf, nh, nbetam
50  USE us,        ONLY : dq
51  USE noncollin_module, ONLY : npol, nspin_mag
52  USE control_flags,    ONLY: isolve, nmix, imix, gamma_only, lscf, io_level, &
53       lxdm, smallmem, tqr, iverbosity
54  USE force_mod, ONLY : lforce, lstres
55  USE ions_base, ONLY : nat, ntyp => nsp, ityp
56  USE mp_bands,  ONLY : nproc_bgrp, nbgrp
57  USE mp_pools,  ONLY : npool
58  USE mp_images, ONLY : nproc_image
59  !
60  IMPLICIT NONE
61  !
62  INCLUDE 'laxlib.fh'
63  !
64  INTEGER, PARAMETER :: MB=1024*1024
65  INTEGER, PARAMETER :: GB=1024*MB
66  INTEGER :: g_fact, mix_type_size, scf_type_size
67  INTEGER :: nk, nbnd_l, npwx_g, npwx_l, ngxx_g, nexx_l, nexx_g, ngm_l
68  INTEGER :: maxbnd, maxnab, maxnij, nab, na, nij, nt, lmaxq, nqxq
69  INTEGER :: indm, ijv, roughestimate
70  REAL(DP):: mbr, mbx, mby, mbz, dmbx, dmby, dmbz
71  !
72  ! these quantities are real in order to prevent integer overflow
73  !
74  REAL(dp), PARAMETER :: complex_size=16_dp, real_size=8_dp, int_size=4_dp
75  REAL(dp) :: ram, ram_, ram1, ram2, maxram, totram, add
76  INTEGER :: np_ortho(2)
77  !
78  IF ( gamma_only) THEN
79     g_fact = 2  ! use half plane waves or G-vectors
80  ELSE
81     g_fact = 1  ! use  all plane waves or G-vectors
82  END IF
83  !
84  ! npwx (max number of plane waves) is roughly estimated from the volume
85  ! of the G-vector sphere, divided by the volume of the Brillouin Zone
86  ! (the exact value can be computed only after G-vectors are computed)
87  ! npwx_l is npwx on this processor (local)
88  ! npwx_g is npwx summed over all processors (global)
89  !
90  npwx_g = NINT ( fpi/3.0_dp * SQRT(ecutwfc)**3 / (tpi**3/omega) / g_fact )
91  npwx_l = npwx_g/nproc_bgrp
92  !
93  ! ram   = dynamically (permanently) allocated RAM, per process
94  ! maxram= "high watermark": max ram needed during a run
95  ! totram= max ram needed summed over all processors
96  !
97  !=====================================================================
98  ! Wavefunctions (including buffer space)
99  !
100  !                files: allocate_wfc.f90:28,30,32
101  !                       orthoatwfc.f90:102
102  !=====================================================================
103  IF ( io_level > 0 .OR. nks == 1) THEN
104     nk = 1      ! store just one wavefunction array at the time
105  ELSE
106     nk = nks+1  ! store nks wavefunctions in memory (buffers)
107  END IF
108  ram =  complex_size * nbnd * npol * npwx_l * nk
109  IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'wfc', ram/nk/MB
110  IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'wfc (w. buffer)', ram/MB
111  ! atomic wavefunctions
112  IF ( one_atom_occupations .OR. use_wannier ) THEN
113     add = complex_size * natomwfc * npol * npwx_l
114     IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'atomic wfc', add/MB
115     ram = ram + add
116  END IF
117  ! Hubbard wavefunctions
118  IF ( lda_plus_u .AND. U_projection .NE. 'pseudo' ) THEN
119     add = complex_size * nwfcU * npol * npwx_l * nk ! also buffer
120     IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'U proj.', add/nk/MB
121     IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'U proj. (w. buff.)', add/MB
122     ram = ram + add
123  END IF
124  !
125  ! hybrid functionals
126  IF ( dft_is_hybrid () ) THEN
127     ! ngxx_g = estimated global number of G-vectors used in V_x\psi products
128     ! nexx_g = estimated global size of the FFT grid used in V_x\psi products
129     ! nexx_l = estimated local size of the FFT grid used in V_x\psi products
130     ngxx_g = NINT ( fpi/3.0_dp * SQRT(ecutfock)**3 / (tpi**3/omega) )
131     nexx_g = 2*ngxx_g
132     nexx_l = nexx_g/nproc_bgrp
133     ! nbnd_l : estimated number of bands per proc with band parallelization
134     nbnd_l = NINT( DBLE(nbnd) / nbgrp )
135     ! Stored wavefunctions in real space
136     add = complex_size/g_fact * nexx_l * npol * nbnd_l * nkqs
137     ! ACE Projectors
138     IF (use_ace) add = add + complex_size * npwx_l * npol * nbnd * nks
139#if defined (__MPI)
140     ! scratch space needed for symmetrization
141     add = add + complex_size * nexx_g*npol*2
142#endif
143     add = add + complex_size * nexx_l*npol*2
144     IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'EXX', add/MB
145     ram = ram + add
146  END IF
147  !=====================================================================
148  !
149  !=====================================================================
150  ! Local pseudopotential, allocate_locpot.f90
151  !=====================================================================
152  add = complex_size * ngm * ntyp ! structure factor
153  IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'str. fact', add/MB
154  ram = ram + add
155  add = real_size * ngl * ntyp    ! local
156  IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'local pot', add/MB
157  ram = ram + add
158  !=====================================================================
159  !
160  !=====================================================================
161  ! Nonlocal pseudopotentials V_NL (beta functions), reciprocal space
162  !=====================================================================
163  add = complex_size * nkb * npwx_l ! allocate_wfc.f90:62 vkb
164  IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'nlocal pot', add/MB
165  ram = ram + add
166  ! other (possibly minor) data loads
167  lmaxq = 2*lmaxkb+1
168  IF (lmaxq > 0) THEN
169     ! not accurate if spline_ps .and. cell_factor <= 1.1d0
170     nqxq = int( ( (sqrt(ecutrho) + qnorm) / dq + 4) * cell_factor )
171     ! allocate_nlpot.f90:87 qrad
172     add = real_size * nqxq * nbetam*(nbetam+1)/2 * lmaxq * ntyp
173     IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'qrad', add/MB
174     ram = ram + add
175  END IF
176  !=====================================================================
177  !
178  !=====================================================================
179  ! Charge density and potentials - see scf_type in scf_mod
180  !=====================================================================
181  scf_type_size =  (complex_size * ngm + real_size * dfftp%nnr ) * nspin ! scf_mod.f90:94-95
182  IF ( dft_is_meta() .or. lxdm ) scf_type_size =  2 * scf_type_size
183  ! rho, v, vnew (allocate_fft.f90:56)
184  add = 3 * scf_type_size
185  IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'rho,v,vnew', add/MB
186  ram =  ram + add
187
188  ! vltot, vrs, rho_core, rhog_core, psic, strf, kedtau if needed
189  ram =  ram + complex_size * ( dfftp%nnr + ngm *( 1 + ntyp ) ) + &
190       real_size * dfftp%nnr*(2+nspin)
191  IF ( dft_is_meta() ) ram = ram + real_size * dfftp%nnr*nspin
192  ! arrays for rho mixing
193  IF ( lscf ) THEN
194     ! rhoin (electrons.f90:439)
195     ram =  ram + scf_type_size
196     IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'rhoin', DBLE(scf_type_size)/MB
197     ! see mix_type in scf_mod
198     mix_type_size =  complex_size * ngm * nspin
199     IF ( dft_is_meta() .or. lxdm ) mix_type_size =  2 * mix_type_size
200     ! df, dv (if kept in memory)
201     IF ( io_level < 2 ) THEN
202        add = mix_type_size * 2 * nmix
203        IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'rho*nmix', add/MB
204        ram = ram + add
205     END IF
206  END IF
207  !=====================================================================
208  !
209  !=====================================================================
210  ! G-vectors: g, gg, mill, nl, nlm, ig_l2g, igtongl
211  !=====================================================================
212  add = real_size * ngm * 4 + int_size * ngm * 7
213  ! double grid: nls, nlsm
214  add = add + int_size * ngms * 2
215  IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'G-vectors', add/MB
216  ram = ram + add
217  !=====================================================================
218  !
219  !=====================================================================
220  ! Real treatment of augmentation charge.
221  !=====================================================================
222  IF ( okvan .and. tqr ) THEN
223    ! realus.f90:422
224    mbr = 0.d0
225    DO nt = 1, ntyp
226        IF ( .not. upf(nt)%tvanp ) CYCLE
227        DO ijv = 1, upf(nt)%nbeta*(upf(nt)%nbeta+1)/2
228          DO indm = upf(nt)%mesh,1,-1
229              !
230              IF ( maxval(abs( upf(nt)%qfuncl(indm,ijv,:) )) > eps16 ) THEN
231                mbr = max( mbr,  rgrid(nt)%r(indm) )
232                exit
233              ENDIF
234              !
235          ENDDO
236        ENDDO
237    END DO
238    mbr = mbr / alat
239
240    mbx = mbr*sqrt( bg(1,1)**2 + bg(1,2)**2 + bg(1,3)**2 )
241    mby = mbr*sqrt( bg(2,1)**2 + bg(2,2)**2 + bg(2,3)**2 )
242    mbz = mbr*sqrt( bg(3,1)**2 + bg(3,2)**2 + bg(3,3)**2 )
243    !
244    dmbx = 2*anint( mbx*dfftp%nr1x ) + 2
245    dmby = 2*anint( mby*dfftp%my_nr2p ) + 2  ! approximation!
246    dmbz = 2*anint( mbz*dfftp%my_nr3p ) + 2  ! approximation!
247    !
248    roughestimate = anint( dble( dmbx*dmby*dmbz ) * pi / 6.D0 )
249    add  = 0.d0
250    DO na = 1, nat
251      nt = ityp(na)
252      IF ( .not. upf(nt)%tvanp ) CYCLE
253      !                          mbia             nfuncs
254      add = add + real_size*roughestimate*(nh(nt)*(nh(nt)+1)/2)
255    END DO
256    IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'qr (very rough)', add/MB
257    ram = ram + add
258  END IF
259  !=====================================================================
260  !
261  ! compute ram_: scratch space that raises the "high watermark"
262  !
263  !=====================================================================
264  ! ram1:  scratch space allocated in iterative diagonalization
265  !        hpsi, spsi, hr and sr matrices, scalar products
266  !        nbnd_l is the estimated dimension of distributed matrices
267  !
268  CALL laxlib_getval( np_ortho = np_ortho )
269  !
270  nbnd_l = nbndx/np_ortho(1)
271  ram1 = complex_size/g_fact * ( 3*nbnd_l**2 ) ! hr,sr,vr/hc,sc,vc
272  IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'h,s,v(r/c)', ram1/MB
273  add = complex_size/g_fact * ( nkb*npol*nbnd ) ! <psi|beta>, becmod.f90:353
274  IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) '<psi|beta>', add/MB
275  ram1 = ram1 + add
276  !
277  !
278  IF ( isolve == 0 ) THEN
279     add = complex_size * nbndx * npol * npwx_l              ! hpsi
280     ram1 = ram1 + add
281     IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'psi', add/MB
282     ram1 = ram1 + add
283     IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'hpsi', add/MB
284     IF ( okvan ) THEN
285        add = complex_size * nbndx * npol * npwx_l ! spsi
286        IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'spsi', add/MB
287        ram1 = ram1 + add
288     END IF
289  END IF
290  ram_ = ram1
291  !=====================================================================
292  !
293  !=====================================================================
294  ! arrays allocated in approx_screening2 during charge mixing
295  !=====================================================================
296  IF ( lscf .AND. imix > 1 ) &
297     ram_ = MAX( ram_, complex_size * ngm * 27 + real_size * dffts%nnr )
298  !=====================================================================
299  !
300  !=====================================================================
301  ! ram1: scratch space allocated in initialization
302  !       wfcinit + rotatewfcgamma can be larger than regterg cegterg if
303  !       starting with many atomic wavefunctions
304  !       files: wfcinit.f90:246
305  !              rotate_wfc_gamma.f90:45
306  !              rotate_wfc_k.f90:55
307  !=====================================================================
308  IF ( starting_wfc(1:6) == 'atomic' ) THEN
309     maxbnd = MAX( natomwfc, nbnd )
310  ELSE
311     maxbnd = nbnd
312  END IF
313  ram1 = complex_size * npwx_l * maxbnd  * ( npol + 1) + &
314         real_size * maxbnd*(3*maxbnd + 1)
315  !
316  IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'wfcinit/wfcrot', ram1/MB
317  ram_ = MAX( ram_,  ram1 )
318  !=====================================================================
319  !
320  !=====================================================================
321  ! ram1: arrays allocated in addusdens_g and addusforce & addusstress
322  !       for ultrasoft/paw pp.
323  !
324  !       N.B: newq is always smaller than addusdens_g
325  !
326  !       files: addusdens.f90  : 92,93,114
327  !              addusforce.f90 : 78,102,105,117,132-133
328  !              addusstress.f90
329  !=====================================================================
330  IF ( okvan ) THEN
331     IF ( .not. tqr ) THEN
332        ngm_l  = ngm/npool
333        maxnab = 0
334        maxnij = 0
335        DO nt = 1, ntyp
336            IF ( upf(nt)%tvanp ) THEN
337              !
338              ! nij = max number of (ih,jh) pairs per atom type nt
339              !
340              nij = nh(nt)*(nh(nt)+1)/2
341              IF ( nij > maxnij ) maxnij = nij
342              !
343              ! count max number of atoms of type nt
344              !
345              nab = 0
346              DO na = 1, nat
347                  IF ( ityp(na) == nt ) nab = nab + 1
348              ENDDO
349              IF ( nab > maxnab ) maxnab = nab
350            END IF
351        END DO
352        !                               ylmk0      qmod
353        ram1 = real_size * ngm_l * ( lmaxq * lmaxq + 1 )
354        !                                aux                   skk   aux2   qgm
355        ram1 = ram1 + complex_size * ( ngm*nspin_mag + ngm_l*(maxnab+maxnij+1) )
356        !
357        IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'addusdens', ram1/MB
358        ram_ = MAX ( ram_, ram1 )
359        !
360        ! forces
361        !
362        IF (lforce) THEN
363           !                      vg                       ylmk0     qmod
364           ram1 = real_size * (ngm*nspin_mag + ngm_l*( lmaxq*lmaxq + 1 ) )
365           !                                    qgm      aux1
366           ram1 = ram1 + complex_size * ngm_l * ( maxnij + 3*maxnab )
367           !                           ddeeq
368           ram1 = ram1 + real_size * ( maxnij * maxnab * 3 * nspin_mag )
369           IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'addusforce', ram1/MB
370           !
371           ram_ = MAX ( ram_, ram1 )
372        END IF
373        !
374        ! stress
375        !
376        IF (lstres) THEN
377           !                      vg                      ylmk0,dylmk0  qmod
378           ram1 = real_size *  (ngm*nspin_mag + ngm_l*( 2*lmaxq*lmaxq + 1 ) )
379           !                                    qgm      aux1  aux2
380           ram1 = ram1 + complex_size * ngm_l * ( maxnij + 3 + nspin )
381           !                           ddeeq
382           IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'addusstress', ram1/MB
383           !
384           ram_ = MAX ( ram_, ram1 )
385        END IF
386     ELSE
387        ! nothing allocated in addusdens_r, hurray!
388        IF (lforce) THEN
389           ram1 = real_size*roughestimate*(nh(nt)*(nh(nt)+1)/2)*3
390           IF ( iverbosity > 0 ) WRITE( stdout, 1013 ) 'addusforce_r', ram1/MB
391           ram_ = MAX ( ram_, ram1 )
392        END IF
393     END IF
394  END IF
395  !
396  maxram = ram + ram_
397  !
398  ! arrays used for global sorting in ggen:
399  !    igsrt, g2l, g2sort_g, total dimensions:
400  !
401  IF ( .NOT. smallmem ) maxram = MAX ( maxram, &
402       int_size * 2 * ngm_g + real_size * ngm_g )
403  !
404  totram = maxram * nproc_image
405  IF ( iverbosity > 0 ) THEN
406     IF ( ram .lt. GB ) WRITE( stdout, 1010 ) ram/MB, ' MB'
407     IF ( ram .ge. GB ) WRITE( stdout, 1010 ) ram/GB, ' GB'
408  END IF
409
410  IF ( maxram .lt. GB ) WRITE( stdout, 1011 ) maxram/MB, ' MB'
411  IF ( maxram .ge. GB ) WRITE( stdout, 1011 ) maxram/GB, ' GB'
412
413  IF ( nproc_image > 1) THEN
414     IF ( totram .lt. GB ) WRITE( stdout, 1012 ) totram/MB, ' MB'
415     IF ( totram .ge. GB ) WRITE( stdout, 1012 ) totram/GB, ' GB'
416  END IF
417  !
418  ! check: more bands than plane waves? not good
419  !
420  IF ( npwx_g < nbndx ) CALL errore('memory_report','more bands than PWs!',1)
421  !
422 1010 format (/5x,'Estimated static dynamical RAM per process > ', F10.2, A3)
423 1011 format (/5x,'Estimated max dynamical RAM per process > ', F10.2, A3)
424 1012 format (/5x,'Estimated total dynamical RAM > ', F10.2, A3)
425 1013 format (/5x,'Dynamical RAM for ', A19, ': ', F10.2, ' MB')
426 1014 format (/5x,'Dynamical RAM for ', A19, ': ', '?? MB (not yet implemented')
427END subroutine memory_report
428