1!--------------------------------------------------------------------------------------------------!
2!   CP2K: A general program to perform molecular dynamics simulations                              !
3!   Copyright (C) 2000 - 2019  CP2K developers group                                               !
4!--------------------------------------------------------------------------------------------------!
5
6! **************************************************************************************************
7!> \brief Rountines for optimizing load balance between processes in HFX calculations
8!> \par History
9!>      04.2008 created [Manuel Guidon]
10!> \author Manuel Guidon
11! **************************************************************************************************
12MODULE hfx_load_balance_methods
13   USE cell_types,                      ONLY: cell_type
14   USE cp_files,                        ONLY: close_file,&
15                                              open_file
16   USE cp_para_types,                   ONLY: cp_para_env_type
17   USE hfx_pair_list_methods,           ONLY: build_atomic_pair_list,&
18                                              build_pair_list
19   USE hfx_types,                       ONLY: &
20        hfx_basis_type, hfx_block_range_type, hfx_distribution, hfx_load_balance_type, hfx_p_kind, &
21        hfx_screen_coeff_type, hfx_set_distr_energy, hfx_set_distr_forces, hfx_type, &
22        pair_list_type, pair_set_list_type
23   USE input_constants,                 ONLY: hfx_do_eval_energy,&
24                                              hfx_do_eval_forces
25   USE kinds,                           ONLY: dp,&
26                                              int_8
27   USE message_passing,                 ONLY: mp_allgather,&
28                                              mp_gatherv,&
29                                              mp_isendrecv,&
30                                              mp_max,&
31                                              mp_sum,&
32                                              mp_sync,&
33                                              mp_waitall
34   USE parallel_rng_types,              ONLY: UNIFORM,&
35                                              create_rng_stream,&
36                                              delete_rng_stream,&
37                                              next_random_number,&
38                                              rng_stream_type
39   USE particle_types,                  ONLY: particle_type
40   USE util,                            ONLY: sort
41#include "./base/base_uses.f90"
42
43   IMPLICIT NONE
44   PRIVATE
45
46   PUBLIC :: hfx_load_balance, &
47             hfx_update_load_balance, &
48             collect_load_balance_info, cost_model, p1_energy, p2_energy, p3_energy
49
50   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'hfx_load_balance_methods'
51
52   REAL(KIND=dp), PARAMETER :: p1_energy(12) = (/2.9461408209700424_dp, 1.0624718662999657_dp, &
53                                                 -1.91570128356921242E-002_dp, -1.6668495454436603_dp, &
54                                                 1.7512639006523709_dp, -9.76074323945336081E-002_dp, &
55                                                 2.6230786127311889_dp, -0.31870737623014189_dp, &
56                                                 7.9588203912690973_dp, 1.8331423413134813_dp, &
57                                                 -0.15427618665346299_dp, 0.19749436090711650_dp/)
58   REAL(KIND=dp), PARAMETER :: p2_energy(12) = (/2.3104682960662593_dp, 1.8744052737304417_dp, &
59                                                 -9.36564055598656797E-002_dp, 0.64284973765086939_dp, &
60                                                 1.0137565430060556_dp, -6.80088178288954567E-003_dp, &
61                                                 1.1692629207374552_dp, -2.6314710080507573_dp, &
62                                                 19.237814781880786_dp, 1.0505934173661349_dp, &
63                                                 0.80382371955699250_dp, 0.49903401991818103_dp/)
64   REAL(KIND=dp), PARAMETER :: p3_energy(2) = (/7.82336287670072350E-002_dp, 0.38073304105744837_dp/)
65   REAL(KIND=dp), PARAMETER :: p1_forces(12) = (/2.5746279948798874_dp, 1.3420575378609276_dp, &
66                                                 -9.41673106447732111E-002_dp, 0.94568006899317825_dp, &
67                                                 -1.4511897117448544_dp, 0.59178934677316952_dp, &
68                                                 2.7291149361757236_dp, -0.50555512044800210_dp, &
69                                                 8.3508180969609871_dp, 1.6829982496141809_dp, &
70                                                 -0.74895370472152600_dp, 0.43801726744197500_dp/)
71   REAL(KIND=dp), PARAMETER :: p2_forces(12) = (/2.6398568961569020_dp, 2.3024918834564101_dp, &
72                                                 5.33216585432061581E-003_dp, 0.45572145697283628_dp, &
73                                                 1.8119743851500618_dp, -0.12533918548421166_dp, &
74                                                 -1.4040312084552751_dp, -4.5331650463917859_dp, &
75                                                 12.593431549069477_dp, 1.1311978374487595_dp, &
76                                                 1.4245996087624646_dp, 1.1425350529853495_dp/)
77   REAL(KIND=dp), PARAMETER :: p3_forces(2) = (/0.12051930516830946_dp, 1.3828051586144336_dp/)
78
79!***
80
81CONTAINS
82
83! **************************************************************************************************
84!> \brief Distributes the computation of eri's to all available processes.
85!> \param x_data Object that stores the indices array
86!> \param eps_schwarz screening parameter
87!> \param particle_set , atomic_kind_set, para_env ...
88!> \param max_set Maximum number of set to be considered
89!> \param para_env para_env
90!> \param coeffs_set screening functions
91!> \param coeffs_kind screening functions
92!> \param is_assoc_atomic_block_global KS-matrix sparsity
93!> \param do_periodic flag for periodicity
94!> \param load_balance_parameter Paramters for Monte-Carlo routines
95!> \param kind_of helper array for mapping
96!> \param basis_parameter Basis set parameters
97!> \param pmax_set Initial screening matrix
98!> \param pmax_atom ...
99!> \param i_thread Process ID of current Thread
100!> \param n_threads Total Number of Threads
101!> \param cell cell
102!> \param do_p_screening Flag for initial p screening
103!> \param map_atom_to_kind_atom ...
104!> \param nkind ...
105!> \param eval_type ...
106!> \param pmax_block ...
107!> \param use_virial ...
108!> \par History
109!>      06.2007 created [Manuel Guidon]
110!>      08.2007 new parallel scheme [Manuel Guidon]
111!>      09.2007 new 'modulo' parellel scheme and Monte Carlo step [Manuel Guidon]
112!>      11.2007 parallelize load balance on box_idx1 [Manuel Guidon]
113!>      02.2009 completely refactored [Manuel Guidon]
114!> \author Manuel Guidon
115!> \note
116!>      The optimization is done via a binning procedure followed by simple
117!>      Monte Carlo procedure:
118!>      In a first step the total amount of integrals in the system is calculated,
119!>      taking into account the sparsity of the KS-matrix , the screening based
120!>      on near/farfield approximations and if desired the screening on an initial
121!>      density matrix.
122!>      In a second step, bins are generate that contain approximately the same number
123!>      of integrals, and a cost for these bins is estimated (currently the number of integrals)
124!>      In a third step, a Monte Carlo procedure optimizes the assignment
125!>      of the different loads to each process
126!>      At the end each process owns an unique array of *atomic* indices-ranges
127!>      that are used to decide whether a process has to calculate a certain
128!>      bunch of integrals or not
129! **************************************************************************************************
130   SUBROUTINE hfx_load_balance(x_data, eps_schwarz, particle_set, max_set, para_env, &
131                               coeffs_set, coeffs_kind, &
132                               is_assoc_atomic_block_global, do_periodic, &
133                               load_balance_parameter, kind_of, basis_parameter, pmax_set, &
134                               pmax_atom, i_thread, n_threads, cell, &
135                               do_p_screening, map_atom_to_kind_atom, nkind, eval_type, &
136                               pmax_block, use_virial)
137      TYPE(hfx_type), POINTER                            :: x_data
138      REAL(dp), INTENT(IN)                               :: eps_schwarz
139      TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
140      INTEGER, INTENT(IN)                                :: max_set
141      TYPE(cp_para_env_type), POINTER                    :: para_env
142      TYPE(hfx_screen_coeff_type), &
143         DIMENSION(:, :, :, :), POINTER                  :: coeffs_set
144      TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
145         POINTER                                         :: coeffs_kind
146      INTEGER, DIMENSION(:, :)                           :: is_assoc_atomic_block_global
147      LOGICAL                                            :: do_periodic
148      TYPE(hfx_load_balance_type), POINTER               :: load_balance_parameter
149      INTEGER                                            :: kind_of(*)
150      TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter
151      TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: pmax_set
152      REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_atom
153      INTEGER, INTENT(IN)                                :: i_thread, n_threads
154      TYPE(cell_type), POINTER                           :: cell
155      LOGICAL, INTENT(IN)                                :: do_p_screening
156      INTEGER, DIMENSION(:), POINTER                     :: map_atom_to_kind_atom
157      INTEGER, INTENT(IN)                                :: nkind, eval_type
158      REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_block
159      LOGICAL, INTENT(IN)                                :: use_virial
160
161      CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_load_balance', &
162         routineP = moduleN//':'//routineN
163
164      CHARACTER(LEN=512)                                 :: error_msg
165      INTEGER :: block_size, current_block_id, data_from, dest, handle, handle_inner, &
166         handle_range, i, iatom_block, iatom_end, iatom_start, ibin, icpu, j, jatom_block, &
167         jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, &
168         latom_start, mepos, my_process_id, n_processes, natom, nbins, nblocks, ncpu, &
169         new_iatom_end, new_iatom_start, new_jatom_end, new_jatom_start, non_empty_blocks, &
170         objective_block_size, objective_nblocks, req(2), source, total_blocks
171      INTEGER(int_8) :: atom_block, cost_per_bin, cost_per_core, current_cost, &
172         distribution_counter_end, distribution_counter_start, global_quartet_counter, &
173         local_quartet_counter, self_cost_per_block, tmp_block, total_block_self_cost
174      INTEGER(int_8), ALLOCATABLE, DIMENSION(:)          :: buffer_in, buffer_out
175      INTEGER(int_8), DIMENSION(:), POINTER              :: local_cost_matrix, recbuffer, &
176                                                            sendbuffer, swapbuffer
177      INTEGER(int_8), DIMENSION(:), POINTER, SAVE        :: cost_matrix
178      INTEGER(int_8), SAVE                               :: shm_global_quartet_counter, &
179                                                            shm_local_quartet_counter
180      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: rcount, rdispl, tmp_index, tmp_pos, &
181                                                            to_be_sorted
182      INTEGER, DIMENSION(:), POINTER, SAVE               :: shm_distribution_vector
183      INTEGER, SAVE                                      :: shm_nblocks
184      LOGICAL                                            :: changed, last_bin_needs_to_be_filled, &
185                                                            optimized
186      LOGICAL, DIMENSION(:, :), POINTER, SAVE            :: atomic_pair_list
187      REAL(dp)                                           :: coeffs_kind_max0, log10_eps_schwarz, &
188                                                            log_2, pmax_blocks
189      TYPE(hfx_block_range_type), DIMENSION(:), POINTER  :: blocks_guess, tmp_blocks, tmp_blocks2
190      TYPE(hfx_block_range_type), DIMENSION(:), &
191         POINTER, SAVE                                   :: shm_blocks
192      TYPE(hfx_distribution), DIMENSION(:), POINTER      :: binned_dist, ptr_to_tmp_dist, tmp_dist
193      TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
194         SAVE                                            :: full_dist
195      TYPE(pair_list_type)                               :: list_ij, list_kl
196      TYPE(pair_set_list_type), ALLOCATABLE, &
197         DIMENSION(:)                                    :: set_list_ij, set_list_kl
198
199!$OMP BARRIER
200!$OMP MASTER
201      CALL timeset(routineN, handle)
202!$OMP END MASTER
203!$OMP BARRIER
204
205      log10_eps_schwarz = LOG10(eps_schwarz)
206      log_2 = LOG10(2.0_dp)
207      coeffs_kind_max0 = MAXVAL(coeffs_kind(:, :)%x(2))
208      ncpu = para_env%num_pe
209      n_processes = ncpu*n_threads
210      natom = SIZE(particle_set)
211
212      block_size = load_balance_parameter%block_size
213      ALLOCATE (set_list_ij((max_set*block_size)**2))
214      ALLOCATE (set_list_kl((max_set*block_size)**2))
215
216      IF (.NOT. load_balance_parameter%blocks_initialized) THEN
217!$OMP BARRIER
218!$OMP MASTER
219         CALL timeset(routineN//"_range", handle_range)
220
221         nblocks = MAX((natom + block_size - 1)/block_size, 1)
222         ALLOCATE (blocks_guess(nblocks))
223         ALLOCATE (tmp_blocks(natom))
224         ALLOCATE (tmp_blocks2(natom))
225
226         pmax_blocks = 0.0_dp
227         SELECT CASE (eval_type)
228         CASE (hfx_do_eval_energy)
229            atomic_pair_list => x_data%atomic_pair_list
230         CASE (hfx_do_eval_forces)
231            atomic_pair_list => x_data%atomic_pair_list_forces
232         END SELECT
233         atomic_pair_list = .TRUE.
234         CALL init_blocks(nkind, para_env, natom, block_size, nblocks, blocks_guess, &
235                          list_ij, list_kl, set_list_ij, set_list_kl, &
236                          particle_set, &
237                          coeffs_set, coeffs_kind, &
238                          is_assoc_atomic_block_global, do_periodic, &
239                          kind_of, basis_parameter, pmax_set, pmax_atom, &
240                          pmax_blocks, cell, &
241                          do_p_screening, map_atom_to_kind_atom, eval_type, &
242                          log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
243
244         total_block_self_cost = 0
245
246         DO i = 1, nblocks
247            total_block_self_cost = total_block_self_cost + blocks_guess(i)%cost
248         END DO
249
250         CALL mp_sum(total_block_self_cost, para_env%group)
251
252         objective_block_size = load_balance_parameter%block_size
253         objective_nblocks = MAX((natom + objective_block_size - 1)/objective_block_size, 1)
254
255         self_cost_per_block = (total_block_self_cost + objective_nblocks - 1)/(objective_nblocks)
256
257         DO i = 1, nblocks
258            tmp_blocks2(i) = blocks_guess(i)
259         END DO
260
261         optimized = .FALSE.
262         i = 0
263         DO WHILE (.NOT. optimized)
264            i = i + 1
265            current_block_id = 0
266            changed = .FALSE.
267            DO atom_block = 1, nblocks
268               current_block_id = current_block_id + 1
269               iatom_start = tmp_blocks2(atom_block)%istart
270               iatom_end = tmp_blocks2(atom_block)%iend
271               IF (tmp_blocks2(atom_block)%cost > 1.5_dp*self_cost_per_block .AND. iatom_end - iatom_start > 0) THEN
272                  changed = .TRUE.
273                  new_iatom_start = iatom_start
274                  new_iatom_end = (iatom_end - iatom_start + 1)/2 + iatom_start - 1
275                  new_jatom_start = new_iatom_end + 1
276                  new_jatom_end = iatom_end
277                  tmp_blocks(current_block_id)%istart = new_iatom_start
278                  tmp_blocks(current_block_id)%iend = new_iatom_end
279                  tmp_blocks(current_block_id)%cost = estimate_block_cost( &
280                                                      natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
281                                                      new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
282                                                      new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
283                                                      particle_set, &
284                                                      coeffs_set, coeffs_kind, &
285                                                      is_assoc_atomic_block_global, do_periodic, &
286                                                      kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
287                                                      cell, &
288                                                      do_p_screening, map_atom_to_kind_atom, eval_type, &
289                                                      log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
290                  current_block_id = current_block_id + 1
291                  tmp_blocks(current_block_id)%istart = new_jatom_start
292                  tmp_blocks(current_block_id)%iend = new_jatom_end
293                  tmp_blocks(current_block_id)%cost = estimate_block_cost( &
294                                                      natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
295                                                      new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
296                                                      new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
297                                                      particle_set, &
298                                                      coeffs_set, coeffs_kind, &
299                                                      is_assoc_atomic_block_global, do_periodic, &
300                                                      kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
301                                                      cell, &
302                                                      do_p_screening, map_atom_to_kind_atom, eval_type, &
303                                                      log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
304               ELSE
305                  tmp_blocks(current_block_id)%istart = iatom_start
306                  tmp_blocks(current_block_id)%iend = iatom_end
307                  tmp_blocks(current_block_id)%cost = tmp_blocks2(atom_block)%cost
308               END IF
309            END DO
310            IF (.NOT. changed) optimized = .TRUE.
311            IF (i > 20) optimized = .TRUE.
312            nblocks = current_block_id
313            DO atom_block = 1, nblocks
314               tmp_blocks2(atom_block) = tmp_blocks(atom_block)
315            END DO
316         END DO
317
318         DEALLOCATE (tmp_blocks2)
319
320         ! ** count number of non empty blocks on each node
321         non_empty_blocks = 0
322         DO atom_block = 1, nblocks
323            IF (tmp_blocks(atom_block)%istart == 0) CYCLE
324            non_empty_blocks = non_empty_blocks + 1
325         END DO
326
327         ALLOCATE (rcount(ncpu))
328         rcount = 0
329         rcount(para_env%mepos + 1) = non_empty_blocks
330         CALL mp_sum(rcount, para_env%group)
331
332         ! ** sum all non_empty_blocks
333         total_blocks = 0
334         DO i = 1, ncpu
335            total_blocks = total_blocks + rcount(i)
336         END DO
337
338         ! ** calculate offsets
339         ALLOCATE (rdispl(ncpu))
340         rcount(:) = rcount(:)*3
341         rdispl(1) = 0
342         DO i = 2, ncpu
343            rdispl(i) = rdispl(i - 1) + rcount(i - 1)
344         END DO
345
346         ALLOCATE (buffer_in(3*non_empty_blocks))
347
348         non_empty_blocks = 0
349         DO atom_block = 1, nblocks
350            IF (tmp_blocks(atom_block)%istart == 0) CYCLE
351            buffer_in(non_empty_blocks*3 + 1) = tmp_blocks(atom_block)%istart
352            buffer_in(non_empty_blocks*3 + 2) = tmp_blocks(atom_block)%iend
353            buffer_in(non_empty_blocks*3 + 3) = tmp_blocks(atom_block)%cost
354            non_empty_blocks = non_empty_blocks + 1
355         END DO
356
357         nblocks = total_blocks
358
359         ALLOCATE (tmp_blocks2(nblocks))
360
361         ALLOCATE (buffer_out(3*nblocks))
362
363         ! ** Gather all three arrays
364         CALL mp_allgather(buffer_in, buffer_out, rcount, rdispl, para_env%group)
365
366         DO i = 1, nblocks
367            tmp_blocks2(i)%istart = INT(buffer_out((i - 1)*3 + 1))
368            tmp_blocks2(i)%iend = INT(buffer_out((i - 1)*3 + 2))
369            tmp_blocks2(i)%cost = buffer_out((i - 1)*3 + 3)
370         END DO
371
372         ! ** Now we sort the blocks
373         ALLOCATE (to_be_sorted(nblocks))
374         ALLOCATE (tmp_index(nblocks))
375
376         DO atom_block = 1, nblocks
377            to_be_sorted(atom_block) = tmp_blocks2(atom_block)%istart
378         END DO
379
380         CALL sort(to_be_sorted, nblocks, tmp_index)
381
382         ALLOCATE (x_data%blocks(nblocks))
383
384         DO atom_block = 1, nblocks
385            x_data%blocks(atom_block) = tmp_blocks2(tmp_index(atom_block))
386         END DO
387
388         shm_blocks => x_data%blocks
389         shm_nblocks = nblocks
390
391         ! ** Set nblocks in structure
392         load_balance_parameter%nblocks = nblocks
393
394         DEALLOCATE (blocks_guess, tmp_blocks, tmp_blocks2)
395
396         DEALLOCATE (rcount, rdispl, buffer_in, buffer_out, to_be_sorted, tmp_index)
397
398         load_balance_parameter%blocks_initialized = .TRUE.
399
400         x_data%blocks = shm_blocks
401         load_balance_parameter%nblocks = shm_nblocks
402         load_balance_parameter%blocks_initialized = .TRUE.
403
404         ALLOCATE (x_data%pmax_block(shm_nblocks, shm_nblocks))
405         x_data%pmax_block = 0.0_dp
406         pmax_block => x_data%pmax_block
407         CALL timestop(handle_range)
408!$OMP END MASTER
409!$OMP BARRIER
410
411         IF (.NOT. load_balance_parameter%blocks_initialized) THEN
412            ALLOCATE (x_data%blocks(shm_nblocks))
413            x_data%blocks = shm_blocks
414            load_balance_parameter%nblocks = shm_nblocks
415            load_balance_parameter%blocks_initialized = .TRUE.
416         END IF
417         !! ** precalculate maximum density matrix elements in blocks
418!$OMP BARRIER
419      END IF
420
421!$OMP BARRIER
422!$OMP MASTER
423      pmax_block => x_data%pmax_block
424      pmax_block = 0.0_dp
425      IF (do_p_screening) THEN
426         DO iatom_block = 1, shm_nblocks
427            iatom_start = x_data%blocks(iatom_block)%istart
428            iatom_end = x_data%blocks(iatom_block)%iend
429            DO jatom_block = 1, shm_nblocks
430               jatom_start = x_data%blocks(jatom_block)%istart
431               jatom_end = x_data%blocks(jatom_block)%iend
432               pmax_block(iatom_block, jatom_block) = MAXVAL(pmax_atom(iatom_start:iatom_end, jatom_start:jatom_end))
433            END DO
434         END DO
435      END IF
436
437      SELECT CASE (eval_type)
438      CASE (hfx_do_eval_energy)
439         atomic_pair_list => x_data%atomic_pair_list
440      CASE (hfx_do_eval_forces)
441         atomic_pair_list => x_data%atomic_pair_list_forces
442      END SELECT
443      CALL build_atomic_pair_list(natom, atomic_pair_list, kind_of, basis_parameter, particle_set, &
444                                  do_periodic, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, &
445                                  x_data%blocks)
446
447!$OMP END MASTER
448!$OMP BARRIER
449
450      !! If there is only 1 cpu skip the binning
451      IF (n_processes == 1) THEN
452         ALLOCATE (tmp_dist(1))
453         tmp_dist(1)%number_of_atom_quartets = HUGE(tmp_dist(1)%number_of_atom_quartets)
454         tmp_dist(1)%istart = 0_int_8
455         ptr_to_tmp_dist => tmp_dist(:)
456         SELECT CASE (eval_type)
457         CASE (hfx_do_eval_energy)
458            CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
459         CASE (hfx_do_eval_forces)
460            CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
461         END SELECT
462         DEALLOCATE (tmp_dist)
463      ELSE
464         !! Calculate total numbers of integrals that have to be calculated (wrt screening and symmetry)
465!$OMP BARRIER
466!$OMP MASTER
467         CALL timeset(routineN//"_count", handle_inner)
468!$OMP END MASTER
469!$OMP BARRIER
470
471         cost_per_core = 0_int_8
472         my_process_id = para_env%mepos*n_threads + i_thread
473         nblocks = load_balance_parameter%nblocks
474
475         DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes
476
477            latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1
478            tmp_block = atom_block/nblocks
479            katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
480            IF (latom_block < katom_block) CYCLE
481            tmp_block = tmp_block/nblocks
482            jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
483            tmp_block = tmp_block/nblocks
484            iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
485            IF (jatom_block < iatom_block) CYCLE
486
487            iatom_start = x_data%blocks(iatom_block)%istart
488            iatom_end = x_data%blocks(iatom_block)%iend
489            jatom_start = x_data%blocks(jatom_block)%istart
490            jatom_end = x_data%blocks(jatom_block)%iend
491            katom_start = x_data%blocks(katom_block)%istart
492            katom_end = x_data%blocks(katom_block)%iend
493            latom_start = x_data%blocks(latom_block)%istart
494            latom_end = x_data%blocks(latom_block)%iend
495
496            SELECT CASE (eval_type)
497            CASE (hfx_do_eval_energy)
498               pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
499                                 pmax_block(latom_block, jatom_block), &
500                                 pmax_block(latom_block, iatom_block), &
501                                 pmax_block(katom_block, jatom_block))
502            CASE (hfx_do_eval_forces)
503               pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
504                                 pmax_block(latom_block, jatom_block), &
505                                 pmax_block(latom_block, iatom_block) + &
506                                 pmax_block(katom_block, jatom_block))
507            END SELECT
508
509            IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE
510
511            cost_per_core = cost_per_core &
512                            + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
513                                                  iatom_start, iatom_end, jatom_start, jatom_end, &
514                                                  katom_start, katom_end, latom_start, latom_end, &
515                                                  particle_set, &
516                                                  coeffs_set, coeffs_kind, &
517                                                  is_assoc_atomic_block_global, do_periodic, &
518                                                  kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
519                                                  cell, &
520                                                  do_p_screening, map_atom_to_kind_atom, eval_type, &
521                                                  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
522
523         END DO ! atom_block
524
525         nbins = load_balance_parameter%nbins
526         cost_per_bin = (cost_per_core + nbins - 1)/(nbins)
527
528!$OMP BARRIER
529!$OMP MASTER
530         CALL timestop(handle_inner)
531!$OMP END MASTER
532!$OMP BARRIER
533
534! new load balancing test
535         IF (.FALSE.) THEN
536            CALL hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
537                                            natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
538                                            particle_set, &
539                                            coeffs_set, coeffs_kind, &
540                                            is_assoc_atomic_block_global, do_periodic, &
541                                            kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
542                                            cell, x_data, para_env, pmax_block, &
543                                            do_p_screening, map_atom_to_kind_atom, eval_type, &
544                                            log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
545         END IF
546
547!$OMP BARRIER
548!$OMP MASTER
549         CALL timeset(routineN//"_bin", handle_inner)
550!$OMP END MASTER
551!$OMP BARRIER
552
553         ALLOCATE (binned_dist(nbins))
554         binned_dist(:)%istart = -1_int_8
555         binned_dist(:)%number_of_atom_quartets = 0_int_8
556         binned_dist(:)%cost = 0_int_8
557         binned_dist(:)%time_first_scf = 0.0_dp
558         binned_dist(:)%time_other_scf = 0.0_dp
559         binned_dist(:)%time_forces = 0.0_dp
560
561         current_cost = 0
562         mepos = 1
563         distribution_counter_start = 1
564         distribution_counter_end = 0
565         ibin = 1
566
567         global_quartet_counter = 0
568         local_quartet_counter = 0
569         last_bin_needs_to_be_filled = .FALSE.
570         DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes
571            latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1
572            tmp_block = atom_block/nblocks
573            katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
574            IF (latom_block < katom_block) CYCLE
575            tmp_block = tmp_block/nblocks
576            jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
577            tmp_block = tmp_block/nblocks
578            iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
579            IF (jatom_block < iatom_block) CYCLE
580
581            distribution_counter_end = distribution_counter_end + 1
582            global_quartet_counter = global_quartet_counter + 1
583            last_bin_needs_to_be_filled = .TRUE.
584
585            IF (binned_dist(ibin)%istart == -1_int_8) binned_dist(ibin)%istart = atom_block
586
587            iatom_start = x_data%blocks(iatom_block)%istart
588            iatom_end = x_data%blocks(iatom_block)%iend
589            jatom_start = x_data%blocks(jatom_block)%istart
590            jatom_end = x_data%blocks(jatom_block)%iend
591            katom_start = x_data%blocks(katom_block)%istart
592            katom_end = x_data%blocks(katom_block)%iend
593            latom_start = x_data%blocks(latom_block)%istart
594            latom_end = x_data%blocks(latom_block)%iend
595
596            SELECT CASE (eval_type)
597            CASE (hfx_do_eval_energy)
598               pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
599                                 pmax_block(latom_block, jatom_block), &
600                                 pmax_block(latom_block, iatom_block), &
601                                 pmax_block(katom_block, jatom_block))
602            CASE (hfx_do_eval_forces)
603               pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
604                                 pmax_block(latom_block, jatom_block), &
605                                 pmax_block(latom_block, iatom_block) + &
606                                 pmax_block(katom_block, jatom_block))
607            END SELECT
608
609            IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE
610
611            current_cost = current_cost &
612                           + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
613                                                 iatom_start, iatom_end, jatom_start, jatom_end, &
614                                                 katom_start, katom_end, latom_start, latom_end, &
615                                                 particle_set, &
616                                                 coeffs_set, coeffs_kind, &
617                                                 is_assoc_atomic_block_global, do_periodic, &
618                                                 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
619                                                 cell, &
620                                                 do_p_screening, map_atom_to_kind_atom, eval_type, &
621                                                 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
622
623            IF (current_cost >= cost_per_bin) THEN
624               IF (ibin == nbins) THEN
625                  binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
626                                                              distribution_counter_end - distribution_counter_start + 1
627               ELSE
628                  binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
629               END IF
630               binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
631               ibin = MIN(ibin + 1, nbins)
632               distribution_counter_start = distribution_counter_end + 1
633               current_cost = 0
634               last_bin_needs_to_be_filled = .FALSE.
635            END IF
636         ENDDO
637
638!$OMP BARRIER
639!$OMP MASTER
640         CALL timestop(handle_inner)
641         CALL timeset(routineN//"_dist", handle_inner)
642!$OMP END MASTER
643!$OMP BARRIER
644         !! Fill the last bin if necessary
645         IF (last_bin_needs_to_be_filled) THEN
646            binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
647            IF (ibin == nbins) THEN
648               binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
649                                                           distribution_counter_end - distribution_counter_start + 1
650            ELSE
651               binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
652            END IF
653         END IF
654
655         !! Sanity-Check
656         DO ibin = 1, nbins
657            local_quartet_counter = local_quartet_counter + binned_dist(ibin)%number_of_atom_quartets
658         END DO
659!$OMP BARRIER
660!$OMP MASTER
661         shm_local_quartet_counter = 0
662         shm_global_quartet_counter = 0
663!$OMP END MASTER
664!$OMP BARRIER
665!$OMP ATOMIC
666         shm_local_quartet_counter = shm_local_quartet_counter + local_quartet_counter
667!$OMP ATOMIC
668         shm_global_quartet_counter = shm_global_quartet_counter + global_quartet_counter
669
670!$OMP BARRIER
671!$OMP MASTER
672         CALL mp_sum(shm_local_quartet_counter, para_env%group)
673         CALL mp_sum(shm_global_quartet_counter, para_env%group)
674         IF (para_env%ionode) THEN
675            IF (shm_local_quartet_counter /= shm_global_quartet_counter) THEN
676               WRITE (error_msg, '(A,I0,A,I0,A)') "HFX Sanity check for parallel distribution failed. "// &
677                  "Number of local quartets (", shm_local_quartet_counter, &
678                  ") and number of global quartets (", shm_global_quartet_counter, &
679                  ") are different. Please send in a bug report."
680               CPABORT(error_msg)
681            END IF
682         END IF
683!$OMP END MASTER
684
685!$OMP BARRIER
686!$OMP MASTER
687         ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
688         cost_matrix = 0
689!$OMP END MASTER
690!$OMP BARRIER
691         icpu = para_env%mepos + 1
692         DO i = 1, nbins
693            cost_matrix((icpu - 1)*nbins*n_threads + i_thread*nbins + i) = binned_dist(i)%cost
694         END DO
695         mepos = para_env%mepos
696!$OMP BARRIER
697
698!$OMP MASTER
699         ! sync before/after ring of isendrecv
700         CALL mp_sync(para_env%group)
701
702         ALLOCATE (sendbuffer(nbins*n_threads))
703         ALLOCATE (recbuffer(nbins*n_threads))
704
705         sendbuffer = cost_matrix(mepos*nbins*n_threads + 1:mepos*nbins*n_threads + nbins*n_threads)
706
707         dest = MODULO(mepos + 1, ncpu)
708         source = MODULO(mepos - 1, ncpu)
709         DO icpu = 0, ncpu - 1
710            IF (icpu .NE. ncpu - 1) THEN
711               CALL mp_isendrecv(sendbuffer, dest, recbuffer, source, &
712                                 para_env%group, req(1), req(2), 13)
713            ENDIF
714            data_from = MODULO(mepos - icpu, ncpu)
715            cost_matrix(data_from*nbins*n_threads + 1:data_from*nbins*n_threads + nbins*n_threads) = sendbuffer
716            IF (icpu .NE. ncpu - 1) THEN
717               CALL mp_waitall(req)
718            ENDIF
719            swapbuffer => sendbuffer
720            sendbuffer => recbuffer
721            recbuffer => swapbuffer
722         ENDDO
723         DEALLOCATE (recbuffer, sendbuffer)
724!$OMP END MASTER
725!$OMP BARRIER
726
727!$OMP BARRIER
728!$OMP MASTER
729         CALL timestop(handle_inner)
730         CALL timeset(routineN//"_opt", handle_inner)
731!$OMP END MASTER
732!$OMP BARRIER
733
734         !! Find an optimal distribution i.e. assign each element of the cost matrix to a certain process
735!$OMP BARRIER
736         ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
737         local_cost_matrix = cost_matrix
738!$OMP MASTER
739         ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
740
741         CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
742                                    shm_distribution_vector, x_data%load_balance_parameter%do_randomize)
743
744         CALL timestop(handle_inner)
745         CALL timeset(routineN//"_redist", handle_inner)
746         !! Collect local data to global array
747         ALLOCATE (full_dist(ncpu*n_threads, nbins))
748
749         full_dist(:, :)%istart = 0_int_8
750         full_dist(:, :)%number_of_atom_quartets = 0_int_8
751         full_dist(:, :)%cost = 0_int_8
752         full_dist(:, :)%time_first_scf = 0.0_dp
753         full_dist(:, :)%time_other_scf = 0.0_dp
754         full_dist(:, :)%time_forces = 0.0_dp
755!$OMP END MASTER
756!$OMP BARRIER
757         mepos = para_env%mepos + 1
758         full_dist((mepos - 1)*n_threads + i_thread + 1, :) = binned_dist(:)
759
760!$OMP BARRIER
761!$OMP MASTER
762         ALLOCATE (sendbuffer(3*nbins*n_threads))
763         ALLOCATE (recbuffer(3*nbins*n_threads))
764         mepos = para_env%mepos
765         DO j = 1, n_threads
766            DO i = 1, nbins
767               sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
768               sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
769               sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
770            END DO
771         END DO
772
773         ! sync before/after ring of isendrecv
774         CALL mp_sync(para_env%group)
775         dest = MODULO(mepos + 1, ncpu)
776         source = MODULO(mepos - 1, ncpu)
777         DO icpu = 0, ncpu - 1
778            IF (icpu .NE. ncpu - 1) THEN
779               CALL mp_isendrecv(sendbuffer, dest, recbuffer, source, &
780                                 para_env%group, req(1), req(2), 13)
781            ENDIF
782            data_from = MODULO(mepos - icpu, ncpu)
783            DO j = 1, n_threads
784               DO i = 1, nbins
785                  full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1)
786                  full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2)
787                  full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3)
788               END DO
789            END DO
790
791            IF (icpu .NE. ncpu - 1) THEN
792               CALL mp_waitall(req)
793            ENDIF
794            swapbuffer => sendbuffer
795            sendbuffer => recbuffer
796            recbuffer => swapbuffer
797         ENDDO
798         DEALLOCATE (recbuffer, sendbuffer)
799
800         ! sync before/after ring of isendrecv
801         CALL mp_sync(para_env%group)
802!$OMP END MASTER
803!$OMP BARRIER
804         !! reorder the distribution according to the distribution vector
805         ALLOCATE (tmp_pos(ncpu*n_threads))
806         tmp_pos = 1
807         ALLOCATE (tmp_dist(nbins*ncpu*n_threads))
808
809         tmp_dist(:)%istart = 0_int_8
810         tmp_dist(:)%number_of_atom_quartets = 0_int_8
811         tmp_dist(:)%cost = 0_int_8
812         tmp_dist(:)%time_first_scf = 0.0_dp
813         tmp_dist(:)%time_other_scf = 0.0_dp
814         tmp_dist(:)%time_forces = 0.0_dp
815
816         DO icpu = 1, n_processes
817            DO i = 1, nbins
818               mepos = my_process_id + 1
819               IF (shm_distribution_vector((icpu - 1)*nbins + i) == mepos) THEN
820                  tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
821                  tmp_pos(mepos) = tmp_pos(mepos) + 1
822               END IF
823            END DO
824         END DO
825
826         !! Assign the load to each process
827         NULLIFY (ptr_to_tmp_dist)
828         mepos = my_process_id + 1
829         ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
830         SELECT CASE (eval_type)
831         CASE (hfx_do_eval_energy)
832            CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
833         CASE (hfx_do_eval_forces)
834            CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
835         END SELECT
836
837!$OMP BARRIER
838!$OMP MASTER
839         DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
840!$OMP END MASTER
841!$OMP BARRIER
842         DEALLOCATE (tmp_dist, tmp_pos)
843         DEALLOCATE (binned_dist, local_cost_matrix)
844         DEALLOCATE (set_list_ij, set_list_kl)
845
846!$OMP BARRIER
847!$OMP MASTER
848         CALL timestop(handle_inner)
849!$OMP END MASTER
850!$OMP BARRIER
851      END IF
852!$OMP BARRIER
853!$OMP MASTER
854      CALL timestop(handle)
855!$OMP END MASTER
856!$OMP BARRIER
857   END SUBROUTINE hfx_load_balance
858
859! **************************************************************************************************
860!> \brief Reference implementation of new recursive load balancing routine
861!>        Computes a local list of atom_blocks (p_atom_blocks,q_atom_blocks) for
862!>        each process in a P-Q grid such that every process has more or less the
863!>        same amount of work. Has no output at the moment (not used) but writes
864!>        its computed load balance values into a file. Possible output is ready
865!>        to use in the two arrays p_atom_blocks & q_atom_blocks
866!> \param n_processes ...
867!> \param my_process_id ...
868!> \param nblocks ...
869!> \param natom ...
870!> \param nkind ...
871!> \param list_ij ...
872!> \param list_kl ...
873!> \param set_list_ij ...
874!> \param set_list_kl ...
875!> \param particle_set ...
876!> \param coeffs_set ...
877!> \param coeffs_kind ...
878!> \param is_assoc_atomic_block_global ...
879!> \param do_periodic ...
880!> \param kind_of ...
881!> \param basis_parameter ...
882!> \param pmax_set ...
883!> \param pmax_atom ...
884!> \param pmax_blocks ...
885!> \param cell ...
886!> \param x_data ...
887!> \param para_env ...
888!> \param pmax_block ...
889!> \param do_p_screening ...
890!> \param map_atom_to_kind_atom ...
891!> \param eval_type ...
892!> \param log10_eps_schwarz ...
893!> \param log_2 ...
894!> \param coeffs_kind_max0 ...
895!> \param use_virial ...
896!> \param atomic_pair_list ...
897!> \par History
898!>      03.2011 created [Michael Steinlechner]
899!> \author Michael Steinlechner
900! **************************************************************************************************
901
902   SUBROUTINE hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
903                                         natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
904                                         particle_set, &
905                                         coeffs_set, coeffs_kind, &
906                                         is_assoc_atomic_block_global, do_periodic, &
907                                         kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
908                                         cell, x_data, para_env, pmax_block, &
909                                         do_p_screening, map_atom_to_kind_atom, eval_type, &
910                                         log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
911
912! input variables:
913      INTEGER, INTENT(IN)                                :: n_processes, my_process_id, nblocks, &
914                                                            natom, nkind
915      TYPE(pair_list_type), INTENT(IN)                   :: list_ij, list_kl
916      TYPE(pair_set_list_type), ALLOCATABLE, &
917         DIMENSION(:), INTENT(IN)                        :: set_list_ij, set_list_kl
918      TYPE(particle_type), DIMENSION(:), INTENT(IN), &
919         POINTER                                         :: particle_set
920      TYPE(hfx_screen_coeff_type), &
921         DIMENSION(:, :, :, :), INTENT(IN), POINTER      :: coeffs_set
922      TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
923         INTENT(IN), POINTER                             :: coeffs_kind
924      INTEGER, DIMENSION(:, :), INTENT(IN)               :: is_assoc_atomic_block_global
925      LOGICAL, INTENT(IN)                                :: do_periodic
926      INTEGER, INTENT(IN)                                :: kind_of(*)
927      TYPE(hfx_basis_type), DIMENSION(:), INTENT(IN), &
928         POINTER                                         :: basis_parameter
929      TYPE(hfx_p_kind), DIMENSION(:), INTENT(IN), &
930         POINTER                                         :: pmax_set
931      REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER     :: pmax_atom
932      REAL(dp)                                           :: pmax_blocks
933      TYPE(cell_type), INTENT(IN), POINTER               :: cell
934      TYPE(hfx_type), INTENT(IN), POINTER                :: x_data
935      TYPE(cp_para_env_type), INTENT(IN), POINTER        :: para_env
936      REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER     :: pmax_block
937      LOGICAL, INTENT(IN)                                :: do_p_screening
938      INTEGER, DIMENSION(:), INTENT(IN), POINTER         :: map_atom_to_kind_atom
939      INTEGER, INTENT(IN)                                :: eval_type
940      REAL(dp), INTENT(IN)                               :: log10_eps_schwarz, log_2, &
941                                                            coeffs_kind_max0
942      LOGICAL, INTENT(IN)                                :: use_virial
943      LOGICAL, DIMENSION(:, :), INTENT(IN), POINTER      :: atomic_pair_list
944
945      CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_recursive_load_balance', &
946         routineP = moduleN//':'//routineN
947
948      INTEGER :: handle, i, iatom_block, iatom_end, iatom_start, j, jatom_block, jatom_end, &
949         jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, latom_start, &
950         nP, nQ, numBins, p, q, sizeP, sizeQ, unit_nr
951      INTEGER(int_8)                                     :: local_cost, pidx, qidx, sumP, sumQ
952      INTEGER(int_8), ALLOCATABLE, DIMENSION(:)          :: local_cost_vector
953      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: blocksize, p_atom_blocks, permute, &
954                                                            q_atom_blocks
955      REAL(dp)                                           :: maximum, mean
956
957! internal variables:
958
959!$OMP BARRIER
960!$OMP MASTER
961      CALL timeset(routineN, handle)
962!$OMP END MASTER
963!$OMP BARRIER
964
965      ! calculate best p/q distribution grid for the n_processes
966      CALL hfx_calculate_PQ(p, q, numBins, n_processes)
967
968      ALLOCATE (blocksize(numBins))
969      ALLOCATE (permute(nblocks**2))
970      DO i = 1, nblocks**2
971         permute(i) = i
972      END DO
973
974      ! call the main recursive permutation routine.
975      ! Output:
976      !   blocksize :: vector (size numBins) with the sizes for each column/row block
977      !   permute   :: permutation vector
978      CALL hfx_recursive_permute(blocksize, 1, nblocks**2, numBins, &
979                                 permute, 1, &
980                                 my_process_id, n_processes, nblocks, &
981                                 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
982                                 particle_set, &
983                                 coeffs_set, coeffs_kind, &
984                                 is_assoc_atomic_block_global, do_periodic, &
985                                 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
986                                 cell, x_data, para_env, pmax_block, &
987                                 do_p_screening, map_atom_to_kind_atom, eval_type, &
988                                 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
989
990      ! number of blocks per processor in p-direction (vertical)
991      nP = numBins/p
992      ! number of blocks per processor in q-direction (horizontal)
993      nQ = numBins/q
994
995      ! calc own position in P-Q-processor grid (PQ-grid is column-major)
996      pidx = MODULO(INT(my_process_id), INT(p)) + 1
997      qidx = my_process_id/p + 1
998
999      sizeP = SUM(blocksize((nP*(pidx - 1) + 1):(nP*pidx)))
1000      sizeQ = SUM(blocksize((nQ*(qidx - 1) + 1):(nQ*qidx)))
1001
1002      sumP = SUM(blocksize(1:(nP*(pidx - 1))))
1003      sumQ = SUM(blocksize(1:(nQ*(qidx - 1))))
1004
1005      ALLOCATE (p_atom_blocks(sizeP))
1006      ALLOCATE (q_atom_blocks(sizeQ))
1007
1008      p_atom_blocks(:) = permute((sumP + 1):(sumP + sizeP))
1009      q_atom_blocks(:) = permute((sumQ + 1):(sumQ + sizeQ))
1010
1011      ! from here on, we are actually finished, each process has been
1012      ! assigned a (p_atom_blocks,q_atom_blocks) pair list.
1013      ! what follows is just a small routine to calculate the local cost
1014      ! for each processor which is then written to a file.
1015
1016      ! calculate local cost for each processor!
1017      ! ****************************************
1018      local_cost = 0
1019      DO i = 1, sizeP
1020         DO j = 1, sizeQ
1021
1022            !       get corresponding 4D block indices out of our own P-Q-block
1023            latom_block = MODULO(q_atom_blocks(j), nblocks)
1024            iatom_block = q_atom_blocks(j)/nblocks + 1
1025            jatom_block = MODULO(p_atom_blocks(i), nblocks)
1026            katom_block = p_atom_blocks(i)/nblocks + 1
1027
1028            !       symmetry checks.
1029            IF (latom_block < katom_block) CYCLE
1030            IF (jatom_block < iatom_block) CYCLE
1031
1032            iatom_start = x_data%blocks(iatom_block)%istart
1033            iatom_end = x_data%blocks(iatom_block)%iend
1034            jatom_start = x_data%blocks(jatom_block)%istart
1035            jatom_end = x_data%blocks(jatom_block)%iend
1036            katom_start = x_data%blocks(katom_block)%istart
1037            katom_end = x_data%blocks(katom_block)%iend
1038            latom_start = x_data%blocks(latom_block)%istart
1039            latom_end = x_data%blocks(latom_block)%iend
1040
1041            !       whatever.
1042            SELECT CASE (eval_type)
1043            CASE (hfx_do_eval_energy)
1044               pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
1045                                 pmax_block(latom_block, jatom_block), &
1046                                 pmax_block(latom_block, iatom_block), &
1047                                 pmax_block(katom_block, jatom_block))
1048            CASE (hfx_do_eval_forces)
1049               pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
1050                                 pmax_block(latom_block, jatom_block), &
1051                                 pmax_block(latom_block, iatom_block) + &
1052                                 pmax_block(katom_block, jatom_block))
1053            END SELECT
1054
1055            !       screening.
1056            IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE
1057
1058            !       estimate the cost of this atom_block.
1059            local_cost = local_cost + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
1060                                                          set_list_kl, &
1061                                                          iatom_start, iatom_end, jatom_start, jatom_end, &
1062                                                          katom_start, katom_end, latom_start, latom_end, &
1063                                                          particle_set, &
1064                                                          coeffs_set, coeffs_kind, &
1065                                                          is_assoc_atomic_block_global, do_periodic, &
1066                                                          kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1067                                                          cell, &
1068                                                          do_p_screening, map_atom_to_kind_atom, eval_type, &
1069                                                          log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1070         END DO
1071      END DO
1072
1073      ALLOCATE (local_cost_vector(n_processes))
1074      local_cost_vector = 0
1075      local_cost_vector(my_process_id + 1) = local_cost
1076      CALL mp_sum(local_cost_vector, para_env%group)
1077
1078      mean = SUM(local_cost_vector)/n_processes
1079      maximum = MAXVAL(local_cost_vector)
1080
1081!$OMP     BARRIER
1082!$OMP     MASTER
1083      ! only output once
1084      IF (my_process_id == 0) THEN
1085         CALL open_file(unit_number=unit_nr, file_name="loads.dat")
1086         WRITE (unit_nr, *) 'maximum cost:', maximum
1087         WRITE (unit_nr, *) 'mean cost:', mean
1088         WRITE (unit_nr, *) 'load balance ratio max/mean: ', maximum/mean
1089         WRITE (unit_nr, *) '-------- detailed per-process costs ---------'
1090         DO i = 1, n_processes
1091            WRITE (unit_nr, *) local_cost_vector(i)
1092         END DO
1093         CALL close_file(unit_nr)
1094      END IF
1095!$OMP     END MASTER
1096!$OMP     BARRIER
1097
1098      DEALLOCATE (local_cost_vector)
1099      DEALLOCATE (p_atom_blocks, q_atom_blocks)
1100      DEALLOCATE (blocksize, permute)
1101
1102!$OMP BARRIER
1103!$OMP MASTER
1104      CALL timestop(handle)
1105!$OMP END MASTER
1106!$OMP BARRIER
1107
1108   END SUBROUTINE hfx_recursive_load_balance
1109
1110! **************************************************************************************************
1111!> \brief Small routine to calculate the optimal P-Q-processor grid distribution
1112!>        for a given number of processors N
1113!>        and the corresponding number of Bins for the load balancing routine
1114!> \param p     number of rows on P-Q process grid (output)
1115!> \param q     number of columns on P-Q process grid (output)
1116!> \param nBins number of Bins (output)
1117!> \param N     number of processes (input)
1118!> \par History
1119!>      03.2011 created [Michael Steinlechner]
1120!> \author Michael Steinlechner
1121! **************************************************************************************************
1122   SUBROUTINE hfx_calculate_PQ(p, q, nBins, N)
1123
1124      INTEGER, INTENT(OUT)                               :: p, q, nBins
1125      INTEGER, INTENT(IN)                                :: N
1126
1127      INTEGER                                            :: a, b, k
1128      REAL(dp)                                           :: sqN
1129
1130      k = 2
1131      sqN = SQRT(REAL(N, KIND=dp))
1132      p = 1
1133
1134      DO WHILE (REAL(k, KIND=dp) <= sqN)
1135         IF (MODULO(N, k) == 0) THEN
1136            p = k
1137         END IF
1138         k = k + 1
1139      END DO
1140      q = N/p
1141
1142      ! now compute the least common multiple of p & q to get the number of necessary bins
1143      ! compute using the relation LCM(p,q) = abs(p*q) / GCD(p,q)
1144      ! and use euclid's algorithm for GCD computation.
1145      a = p
1146      b = q
1147
1148      DO WHILE (b .NE. 0)
1149         IF (a > b) THEN
1150            a = a - b
1151         ELSE
1152            b = b - a
1153         END IF
1154      END DO
1155      ! gcd(p,q) is now saved in a
1156
1157      nBins = p*q/a
1158
1159   END SUBROUTINE
1160
1161! **************************************************************************************************
1162!> \brief Recursive permutation routine for the load balancing of the integral
1163!>       computation
1164!> \param blocksize     vector of blocksizes, size(nProc), which contains for
1165!>                      each process the local blocksize (OUTPUT)
1166!> \param blockstart    starting row/column idx of the block which is to be examined
1167!>                      at this point (INPUT)
1168!> \param blockend      ending row/column idx of the block which is to be examined
1169!>                      (INPUT)
1170!> \param nProc_in      number of bins into which the current block has to be divided
1171!>                      (INPUT)
1172!> \param permute       permutation vector which balances column/row cost
1173!>                      size(nblocks^2). (OUTPUT)
1174!> \param step ...
1175!> \param my_process_id ...
1176!> \param n_processes ...
1177!> \param nblocks ...
1178!> \param natom ...
1179!> \param nkind ...
1180!> \param list_ij ...
1181!> \param list_kl ...
1182!> \param set_list_ij ...
1183!> \param set_list_kl ...
1184!> \param particle_set ...
1185!> \param coeffs_set ...
1186!> \param coeffs_kind ...
1187!> \param is_assoc_atomic_block_global ...
1188!> \param do_periodic ...
1189!> \param kind_of ...
1190!> \param basis_parameter ...
1191!> \param pmax_set ...
1192!> \param pmax_atom ...
1193!> \param pmax_blocks ...
1194!> \param cell ...
1195!> \param x_data ...
1196!> \param para_env ...
1197!> \param pmax_block ...
1198!> \param do_p_screening ...
1199!> \param map_atom_to_kind_atom ...
1200!> \param eval_type ...
1201!> \param log10_eps_schwarz ...
1202!> \param log_2 ...
1203!> \param coeffs_kind_max0 ...
1204!> \param use_virial ...
1205!> \param atomic_pair_list ...
1206!> \par History
1207!>      03.2011 created [Michael Steinlechner]
1208!> \author Michael Steinlechner
1209! **************************************************************************************************
1210   RECURSIVE SUBROUTINE hfx_recursive_permute(blocksize, blockstart, blockend, nProc_in, &
1211                                              permute, step, &
1212                                              my_process_id, n_processes, nblocks, &
1213                                              natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1214                                              particle_set, &
1215                                              coeffs_set, coeffs_kind, &
1216                                              is_assoc_atomic_block_global, do_periodic, &
1217                                              kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1218                                              cell, x_data, para_env, pmax_block, &
1219                                              do_p_screening, map_atom_to_kind_atom, eval_type, &
1220                                              log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1221
1222      INTEGER                                            :: nProc_in, blockend, blockstart
1223      INTEGER, DIMENSION(nProc_in)                       :: blocksize
1224      INTEGER                                            :: nblocks, n_processes, my_process_id
1225      INTEGER, INTENT(IN)                                :: step
1226      INTEGER, DIMENSION(nblocks*nblocks)                :: permute
1227      INTEGER                                            :: natom
1228      INTEGER, INTENT(IN)                                :: nkind
1229      TYPE(pair_list_type)                               :: list_ij, list_kl
1230      TYPE(pair_set_list_type), ALLOCATABLE, &
1231         DIMENSION(:)                                    :: set_list_ij, set_list_kl
1232      TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
1233      TYPE(hfx_screen_coeff_type), &
1234         DIMENSION(:, :, :, :), POINTER                  :: coeffs_set
1235      TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
1236         POINTER                                         :: coeffs_kind
1237      INTEGER, DIMENSION(:, :)                           :: is_assoc_atomic_block_global
1238      LOGICAL                                            :: do_periodic
1239      INTEGER                                            :: kind_of(*)
1240      TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter
1241      TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: pmax_set
1242      REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_atom
1243      REAL(dp)                                           :: pmax_blocks
1244      TYPE(cell_type), POINTER                           :: cell
1245      TYPE(hfx_type), POINTER                            :: x_data
1246      TYPE(cp_para_env_type), POINTER                    :: para_env
1247      REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_block
1248      LOGICAL, INTENT(IN)                                :: do_p_screening
1249      INTEGER, DIMENSION(:), POINTER                     :: map_atom_to_kind_atom
1250      INTEGER, INTENT(IN)                                :: eval_type
1251      REAL(dp)                                           :: log10_eps_schwarz, log_2, &
1252                                                            coeffs_kind_max0
1253      LOGICAL, INTENT(IN)                                :: use_virial
1254      LOGICAL, DIMENSION(:, :), POINTER                  :: atomic_pair_list
1255
1256      INTEGER :: col, endoffset, i, iatom_block, iatom_end, iatom_start, idx, inv_perm, &
1257         jatom_block, jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, &
1258         latom_end, latom_start, nbins, nProc, row, startoffset
1259      INTEGER(int_8)                                     :: atom_block, tmp_block
1260      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: ithblocksize, localblocksize
1261      INTEGER, DIMENSION(blockend-blockstart+1)          :: bin_perm, tmp_perm
1262      REAL(dp)                                           :: partialcost
1263      REAL(dp), DIMENSION(nblocks*nblocks)               :: cost_vector
1264
1265      nProc = nProc_in
1266      cost_vector = 0.0_dp
1267
1268!   loop over local atom_blocks.
1269      DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes
1270
1271!       get corresponding 4D block indices
1272         latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1
1273         tmp_block = atom_block/nblocks
1274         katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
1275         IF (latom_block < katom_block) CYCLE
1276         tmp_block = tmp_block/nblocks
1277         jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
1278         tmp_block = tmp_block/nblocks
1279         iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
1280         IF (jatom_block < iatom_block) CYCLE
1281
1282!       get 2D indices of this atom_block (with permutation applied)
1283!       for this, we need to invert the permutation, this means
1284!       find position in permutation vector where value==idx
1285
1286         row = (katom_block - 1)*nblocks + jatom_block
1287         inv_perm = 1
1288         DO WHILE (permute(inv_perm) .NE. row)
1289            inv_perm = inv_perm + 1
1290         END DO
1291         row = inv_perm
1292
1293         col = (iatom_block - 1)*nblocks + latom_block
1294         inv_perm = 1
1295         DO WHILE (permute(inv_perm) .NE. col)
1296            inv_perm = inv_perm + 1
1297         END DO
1298         col = inv_perm
1299
1300!       if row/col outside our current diagonal block, skip calculation.
1301         IF (col < blockstart .OR. col > blockend) CYCLE
1302         IF (row < blockstart .OR. row > blockend) CYCLE
1303
1304         iatom_start = x_data%blocks(iatom_block)%istart
1305         iatom_end = x_data%blocks(iatom_block)%iend
1306         jatom_start = x_data%blocks(jatom_block)%istart
1307         jatom_end = x_data%blocks(jatom_block)%iend
1308         katom_start = x_data%blocks(katom_block)%istart
1309         katom_end = x_data%blocks(katom_block)%iend
1310         latom_start = x_data%blocks(latom_block)%istart
1311         latom_end = x_data%blocks(latom_block)%iend
1312
1313!       whatever.
1314         SELECT CASE (eval_type)
1315         CASE (hfx_do_eval_energy)
1316            pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
1317                              pmax_block(latom_block, jatom_block), &
1318                              pmax_block(latom_block, iatom_block), &
1319                              pmax_block(katom_block, jatom_block))
1320         CASE (hfx_do_eval_forces)
1321            pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
1322                              pmax_block(latom_block, jatom_block), &
1323                              pmax_block(latom_block, iatom_block) + &
1324                              pmax_block(katom_block, jatom_block))
1325         END SELECT
1326
1327!       screening.
1328         IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE
1329
1330!       every second recursion step, compute row sum instead of column sum
1331
1332         IF (MODULO(step, 2) .EQ. 0) THEN
1333            idx = row
1334         ELSE
1335            idx = col
1336         END IF
1337
1338!       estimate the cost of this atom_block.
1339         partialcost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
1340                                           set_list_kl, &
1341                                           iatom_start, iatom_end, jatom_start, jatom_end, &
1342                                           katom_start, katom_end, latom_start, latom_end, &
1343                                           particle_set, &
1344                                           coeffs_set, coeffs_kind, &
1345                                           is_assoc_atomic_block_global, do_periodic, &
1346                                           kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1347                                           cell, &
1348                                           do_p_screening, map_atom_to_kind_atom, eval_type, &
1349                                           log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1350
1351         cost_vector(idx) = cost_vector(idx) + partialcost
1352      END DO ! atom_block
1353
1354!   sum costvector over all processes
1355      CALL mp_sum(cost_vector, para_env%group)
1356
1357!   calculate next prime factor of nProc
1358      nBins = 2
1359      DO WHILE (MODULO(INT(nProc), INT(nBins)) .NE. 0)
1360         nBins = nBins + 1
1361      END DO
1362
1363      nProc = nProc/nBins
1364
1365! ... do the binning...
1366
1367      ALLOCATE (localblocksize(nBins))
1368      CALL hfx_permute_binning(nBins, cost_vector(blockstart:blockend), blockend - blockstart + 1, bin_perm, localblocksize)
1369
1370!... and update the permutation vector
1371
1372      tmp_perm = permute(blockstart:blockend)
1373      permute(blockstart:blockend) = tmp_perm(bin_perm)
1374
1375!   split recursion into the nBins Bins
1376      IF (nProc > 1) THEN
1377         ALLOCATE (ithblocksize(nProc))
1378         DO i = 1, nBins
1379            startoffset = SUM(localblocksize(1:(i - 1)))
1380            endoffset = SUM(localblocksize(1:i)) - 1
1381
1382            CALL hfx_recursive_permute(ithblocksize, blockstart + startoffset, blockstart + endoffset, nProc, &
1383                                       permute, step + 1, &
1384                                       my_process_id, n_processes, nblocks, &
1385                                       natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1386                                       particle_set, &
1387                                       coeffs_set, coeffs_kind, &
1388                                       is_assoc_atomic_block_global, do_periodic, &
1389                                       kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1390                                       cell, x_data, para_env, pmax_block, &
1391                                       do_p_screening, map_atom_to_kind_atom, eval_type, &
1392                                       log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1393            blocksize(((i - 1)*nProc + 1):(i*nProc)) = ithblocksize
1394         END DO
1395         DEALLOCATE (ithblocksize)
1396      ELSE
1397         DO i = 1, nBins
1398            blocksize(i) = localblocksize(i)
1399         END DO
1400      END IF
1401
1402      DEALLOCATE (localblocksize)
1403
1404   END SUBROUTINE hfx_recursive_permute
1405
1406! **************************************************************************************************
1407!> \brief small binning routine for the recursive load balancing
1408!>
1409!> \param nBins         number of Bins (INPUT)
1410!> \param costvector    vector of current row/column costs which have to be binned (INPUT)
1411!> \param maxbinsize    upper bound for bin size (INPUT)
1412!> \param perm          resulting permutation due to be binning routine (OUTPUT)
1413!> \param block_count   vector of size(nbins) which contains the size of each bin (OUTPUT)
1414!> \par History
1415!>      03.2011 created [Michael Steinlechner]
1416!> \author Michael Steinlechner
1417! **************************************************************************************************
1418   SUBROUTINE hfx_permute_binning(nBins, costvector, maxbinsize, perm, block_count)
1419
1420      INTEGER, INTENT(IN)                                :: nBins, maxbinsize
1421      REAL(dp), DIMENSION(maxbinsize), INTENT(IN)        :: costvector
1422      INTEGER, DIMENSION(maxbinsize), INTENT(OUT)        :: perm
1423      INTEGER, DIMENSION(nBins), INTENT(OUT)             :: block_count
1424
1425      INTEGER                                            :: i, j, mod_idx, offset
1426      INTEGER, DIMENSION(nBins, maxbinsize)              :: bin
1427      INTEGER, DIMENSION(nBins)                          :: bin_idx
1428      INTEGER, DIMENSION(maxbinsize)                     :: idx
1429      REAL(dp), DIMENSION(maxbinsize)                    :: vec
1430      REAL(dp), DIMENSION(nBins)                         :: bincosts
1431
1432! be careful not to change costvector (copy it!)
1433
1434      vec = costvector
1435      block_count = 0
1436      bincosts = 0
1437
1438      !sort the array (ascending)
1439      CALL sort(vec, maxbinsize, idx)
1440
1441      ! count the loop down to distribute the largest cols/rows first
1442      DO i = maxbinsize, 1, -1
1443         IF (vec(i) == 0) THEN
1444            ! spread zero-cost col/rows evenly among procs
1445            mod_idx = MODULO(i, nBins) + 1 !(note the fortran offset by one!)
1446            block_count(mod_idx) = block_count(mod_idx) + 1
1447            bin(mod_idx, block_count(mod_idx)) = idx(i)
1448         ELSE
1449            ! sort the bins so that the one with the lowest cost is at the
1450            ! first place, where we then assign the current col/row
1451            CALL sort(bincosts, nBins, bin_idx)
1452            block_count = block_count(bin_idx)
1453            bin = bin(bin_idx, :)
1454
1455            bincosts(1) = bincosts(1) + vec(i)
1456            block_count(1) = block_count(1) + 1
1457            bin(1, block_count(1)) = idx(i)
1458         END IF
1459      END DO
1460
1461      ! construct permutation vector from the binning
1462      offset = 0
1463      DO i = 1, nBins
1464         DO j = 1, block_count(i)
1465            perm(offset + j) = bin(i, j)
1466         END DO
1467         offset = offset + block_count(i)
1468      END DO
1469
1470   END SUBROUTINE hfx_permute_binning
1471
1472! **************************************************************************************************
1473!> \brief Cheap way of redistributing the eri's
1474!> \param x_data Object that stores the indices array
1475!> \param para_env para_env
1476!> \param load_balance_parameter contains parmameter for Monte-Carlo routines
1477!> \param i_thread current thread ID
1478!> \param n_threads Total Number of threads
1479!> \param eval_type ...
1480!> \par History
1481!>      12.2007 created [Manuel Guidon]
1482!>      02.2009 optimize Memory Usage [Manuel Guidon]
1483!> \author Manuel Guidon
1484!> \note
1485!>      The cost matrix is given by the walltime for each bin that is measured
1486!>      during the calculation
1487! **************************************************************************************************
1488   SUBROUTINE hfx_update_load_balance(x_data, para_env, &
1489                                      load_balance_parameter, &
1490                                      i_thread, n_threads, eval_type)
1491
1492      TYPE(hfx_type), POINTER                            :: x_data
1493      TYPE(cp_para_env_type), POINTER                    :: para_env
1494      TYPE(hfx_load_balance_type)                        :: load_balance_parameter
1495      INTEGER, INTENT(IN)                                :: i_thread, n_threads, eval_type
1496
1497      CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_update_load_balance', &
1498         routineP = moduleN//':'//routineN
1499
1500      INTEGER :: data_from, dest, end_idx, handle, i, ibin, icpu, iprocess, j, mepos, my_bin_size, &
1501         my_global_start_idx, my_process_id, n_processes, nbins, ncpu, req(2), source, start_idx
1502      INTEGER(int_8), DIMENSION(:), POINTER              :: local_cost_matrix, recbuffer, &
1503                                                            sendbuffer, swapbuffer
1504      INTEGER(int_8), DIMENSION(:), POINTER, SAVE        :: cost_matrix
1505      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: tmp_pos
1506      INTEGER, ALLOCATABLE, DIMENSION(:), SAVE           :: bins_per_rank
1507      INTEGER, ALLOCATABLE, DIMENSION(:, :), SAVE        :: bin_histogram
1508      INTEGER, DIMENSION(:), POINTER, SAVE               :: shm_distribution_vector
1509      INTEGER, SAVE                                      :: max_bin_size
1510      TYPE(hfx_distribution), DIMENSION(:), POINTER      :: binned_dist, ptr_to_tmp_dist, tmp_dist
1511      TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
1512         SAVE                                            :: full_dist
1513
1514!$OMP BARRIER
1515!$OMP MASTER
1516      CALL timeset(routineN, handle)
1517!$OMP END MASTER
1518!$OMP BARRIER
1519
1520      ncpu = para_env%num_pe
1521      n_processes = ncpu*n_threads
1522      !! If there is only 1 cpu skip the binning
1523      IF (n_processes == 1) THEN
1524         ALLOCATE (tmp_dist(1))
1525         tmp_dist(1)%number_of_atom_quartets = HUGE(tmp_dist(1)%number_of_atom_quartets)
1526         tmp_dist(1)%istart = 0_int_8
1527         ptr_to_tmp_dist => tmp_dist(:)
1528         SELECT CASE (eval_type)
1529         CASE (hfx_do_eval_energy)
1530            CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
1531         CASE (hfx_do_eval_forces)
1532            CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
1533         END SELECT
1534         DEALLOCATE (tmp_dist)
1535      ELSE
1536         mepos = para_env%mepos
1537         my_process_id = para_env%mepos*n_threads + i_thread
1538         nbins = load_balance_parameter%nbins
1539!$OMP MASTER
1540         ALLOCATE (bin_histogram(n_processes, 2))
1541         bin_histogram = 0
1542!$OMP END MASTER
1543!$OMP BARRIER
1544         SELECT CASE (eval_type)
1545         CASE (hfx_do_eval_energy)
1546            my_bin_size = SIZE(x_data%distribution_energy)
1547         CASE (hfx_do_eval_forces)
1548            my_bin_size = SIZE(x_data%distribution_forces)
1549         END SELECT
1550         bin_histogram(my_process_id + 1, 1) = my_bin_size
1551!$OMP BARRIER
1552!$OMP MASTER
1553         CALL mp_sum(bin_histogram(:, 1), para_env%group)
1554         bin_histogram(1, 2) = bin_histogram(1, 1)
1555         DO iprocess = 2, n_processes
1556            bin_histogram(iprocess, 2) = bin_histogram(iprocess - 1, 2) + bin_histogram(iprocess, 1)
1557         END DO
1558
1559         max_bin_size = MAXVAL(bin_histogram(para_env%mepos*n_threads + 1:para_env%mepos*n_threads + n_threads, 1))
1560         CALL mp_max(max_bin_size, para_env%group)
1561!$OMP END MASTER
1562!$OMP BARRIER
1563         ALLOCATE (binned_dist(my_bin_size))
1564         !! Use old binned_dist, but with timings cost
1565         SELECT CASE (eval_type)
1566         CASE (hfx_do_eval_energy)
1567            binned_dist = x_data%distribution_energy
1568         CASE (hfx_do_eval_forces)
1569            binned_dist = x_data%distribution_forces
1570         END SELECT
1571
1572         DO ibin = 1, my_bin_size
1573            IF (binned_dist(ibin)%number_of_atom_quartets == 0) THEN
1574               binned_dist(ibin)%cost = 0
1575            ELSE
1576               SELECT CASE (eval_type)
1577               CASE (hfx_do_eval_energy)
1578                  IF (.NOT. load_balance_parameter%rtp_redistribute) THEN
1579                     binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_first_scf + &
1580                                                   binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
1581                  ELSE
1582                     binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
1583                  END IF
1584               CASE (hfx_do_eval_forces)
1585                  binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_forces)*10000.0_dp, int_8)
1586               END SELECT
1587            END IF
1588         END DO
1589!$OMP BARRIER
1590!$OMP MASTER
1591         !! store all local results in a big cost matrix
1592         ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
1593         cost_matrix = 0
1594         ALLOCATE (sendbuffer(max_bin_size*n_threads))
1595         ALLOCATE (recbuffer(max_bin_size*n_threads))
1596!$OMP END MASTER
1597!$OMP BARRIER
1598         my_global_start_idx = bin_histogram(my_process_id + 1, 2) - my_bin_size
1599         icpu = para_env%mepos + 1
1600         DO i = 1, my_bin_size
1601            cost_matrix(my_global_start_idx + i) = binned_dist(i)%cost
1602         END DO
1603
1604         mepos = para_env%mepos
1605!$OMP BARRIER
1606!$OMP MASTER
1607         ALLOCATE (bins_per_rank(ncpu))
1608         bins_per_rank = 0
1609         DO icpu = 1, ncpu
1610            bins_per_rank(icpu) = SUM(bin_histogram((icpu - 1)*n_threads + 1:(icpu - 1)*n_threads + n_threads, 1))
1611         END DO
1612         sendbuffer(1:bins_per_rank(para_env%mepos + 1)) = &
1613            cost_matrix(my_global_start_idx + 1:my_global_start_idx + bins_per_rank(para_env%mepos + 1))
1614
1615         dest = MODULO(mepos + 1, ncpu)
1616         source = MODULO(mepos - 1, ncpu)
1617         ! sync before/after ring of isendrecv
1618         CALL mp_sync(para_env%group)
1619         DO icpu = 0, ncpu - 1
1620            IF (icpu .NE. ncpu - 1) THEN
1621               CALL mp_isendrecv(sendbuffer, dest, recbuffer, source, &
1622                                 para_env%group, req(1), req(2), 13)
1623            ENDIF
1624            data_from = MODULO(mepos - icpu, ncpu)
1625            start_idx = SUM(bins_per_rank(1:data_from + 1)) - bins_per_rank(data_from + 1) + 1
1626            end_idx = start_idx + bins_per_rank(data_from + 1) - 1
1627            cost_matrix(start_idx:end_idx) = sendbuffer(1:end_idx - start_idx + 1)
1628
1629            IF (icpu .NE. ncpu - 1) THEN
1630               CALL mp_waitall(req)
1631            ENDIF
1632            swapbuffer => sendbuffer
1633            sendbuffer => recbuffer
1634            recbuffer => swapbuffer
1635         ENDDO
1636         DEALLOCATE (recbuffer, sendbuffer)
1637         ! sync before/after ring of isendrecv
1638         CALL mp_sync(para_env%group)
1639!$OMP END MASTER
1640!$OMP BARRIER
1641         ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
1642         local_cost_matrix = cost_matrix
1643!$OMP MASTER
1644         ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
1645         CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
1646                                    shm_distribution_vector, x_data%load_balance_parameter%do_randomize)
1647
1648         ALLOCATE (full_dist(ncpu*n_threads, max_bin_size))
1649
1650         full_dist(:, :)%istart = 0_int_8
1651         full_dist(:, :)%number_of_atom_quartets = 0_int_8
1652         full_dist(:, :)%cost = 0_int_8
1653         full_dist(:, :)%time_first_scf = 0.0_dp
1654         full_dist(:, :)%time_other_scf = 0.0_dp
1655         full_dist(:, :)%time_forces = 0.0_dp
1656!$OMP END MASTER
1657
1658!$OMP BARRIER
1659         mepos = para_env%mepos + 1
1660         full_dist((mepos - 1)*n_threads + i_thread + 1, 1:my_bin_size) = binned_dist(1:my_bin_size)
1661!$OMP BARRIER
1662!$OMP MASTER
1663         ALLOCATE (sendbuffer(3*max_bin_size*n_threads))
1664         ALLOCATE (recbuffer(3*max_bin_size*n_threads))
1665         mepos = para_env%mepos
1666         DO j = 1, n_threads
1667            DO i = 1, max_bin_size
1668               sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
1669               sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
1670               sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
1671            END DO
1672         END DO
1673         dest = MODULO(mepos + 1, ncpu)
1674         source = MODULO(mepos - 1, ncpu)
1675         ! sync before/after ring of isendrecv
1676         CALL mp_sync(para_env%group)
1677         DO icpu = 0, ncpu - 1
1678            IF (icpu .NE. ncpu - 1) THEN
1679               CALL mp_isendrecv(sendbuffer, dest, recbuffer, source, &
1680                                 para_env%group, req(1), req(2), 13)
1681            ENDIF
1682            data_from = MODULO(mepos - icpu, ncpu)
1683            DO j = 1, n_threads
1684               DO i = 1, max_bin_size
1685                  full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1)
1686                  full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2)
1687                  full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3)
1688               END DO
1689            END DO
1690
1691            IF (icpu .NE. ncpu - 1) THEN
1692               CALL mp_waitall(req)
1693            ENDIF
1694            swapbuffer => sendbuffer
1695            sendbuffer => recbuffer
1696            recbuffer => swapbuffer
1697         ENDDO
1698         ! sync before/after ring of isendrecv
1699         DEALLOCATE (recbuffer, sendbuffer)
1700         CALL mp_sync(para_env%group)
1701!$OMP END MASTER
1702!$OMP BARRIER
1703         !! reorder the distribution according to the distribution vector
1704         ALLOCATE (tmp_pos(ncpu*n_threads))
1705         tmp_pos = 1
1706         ALLOCATE (tmp_dist(nbins*ncpu*n_threads))
1707
1708         tmp_dist(:)%istart = 0_int_8
1709         tmp_dist(:)%number_of_atom_quartets = 0_int_8
1710         tmp_dist(:)%cost = 0_int_8
1711         tmp_dist(:)%time_first_scf = 0.0_dp
1712         tmp_dist(:)%time_other_scf = 0.0_dp
1713         tmp_dist(:)%time_forces = 0.0_dp
1714
1715         mepos = my_process_id + 1
1716         DO icpu = 1, n_processes
1717            DO i = 1, bin_histogram(icpu, 1)
1718               IF (shm_distribution_vector(bin_histogram(icpu, 2) - bin_histogram(icpu, 1) + i) == mepos) THEN
1719                  tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
1720                  tmp_pos(mepos) = tmp_pos(mepos) + 1
1721               END IF
1722            END DO
1723         END DO
1724
1725         !! Assign the load to each process
1726         NULLIFY (ptr_to_tmp_dist)
1727         mepos = my_process_id + 1
1728         ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
1729         SELECT CASE (eval_type)
1730         CASE (hfx_do_eval_energy)
1731            CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
1732         CASE (hfx_do_eval_forces)
1733            CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
1734         END SELECT
1735
1736!$OMP BARRIER
1737!$OMP MASTER
1738         DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
1739         DEALLOCATE (bins_per_rank, bin_histogram)
1740!$OMP END MASTER
1741!$OMP BARRIER
1742         DEALLOCATE (tmp_dist, tmp_pos)
1743         DEALLOCATE (binned_dist, local_cost_matrix)
1744      END IF
1745!$OMP BARRIER
1746!$OMP MASTER
1747      CALL timestop(handle)
1748!$OMP END MASTER
1749!$OMP BARRIER
1750
1751   END SUBROUTINE hfx_update_load_balance
1752
1753! **************************************************************************************************
1754!> \brief estimates the cost of a set quartet with info available at load balance time
1755!>        i.e. without much info on the primitives primitives
1756!> \param nsa ...
1757!> \param nsb ...
1758!> \param nsc ...
1759!> \param nsd ...
1760!> \param npgfa ...
1761!> \param npgfb ...
1762!> \param npgfc ...
1763!> \param npgfd ...
1764!> \param ratio ...
1765!> \param p1 ...
1766!> \param p2 ...
1767!> \param p3 ...
1768!> \return ...
1769!> \par History
1770!>      08.2009 created Joost VandeVondele
1771!> \author Joost VandeVondele
1772! **************************************************************************************************
1773   FUNCTION cost_model(nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd, ratio, p1, p2, p3) RESULT(res)
1774      IMPLICIT NONE
1775      REAL(KIND=dp) :: estimate1, estimate2, estimate, ratio, switch, mu, sigma
1776      INTEGER(KIND=int_8) :: res
1777      REAL(KIND=dp), INTENT(IN) :: p1(12), p2(12), p3(2)
1778
1779      INTEGER   :: nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd
1780
1781      estimate1 = estimate_basic(p1)
1782      estimate2 = estimate_basic(p2)
1783      mu = LOG(ABS(1.0E6_dp*p3(1)) + 1)
1784      sigma = p3(2)*0.1_dp*mu
1785      switch = 1.0_dp/(1.0_dp + EXP((LOG(estimate1) - mu)/sigma))
1786      estimate = estimate1*(1.0_dp - switch) + estimate2*switch
1787      res = INT(estimate*0.001_dp, KIND=int_8) + 1
1788
1789   CONTAINS
1790
1791! **************************************************************************************************
1792!> \brief ...
1793!> \param p ...
1794!> \return ...
1795! **************************************************************************************************
1796      REAL(KIND=dp) FUNCTION estimate_basic(p) RESULT(res)
1797      REAL(KIND=dp)                                      :: p(12)
1798
1799      REAL(KIND=dp)                                      :: p1, p10, p11, p12, p2, p3, p4, p5, p6, &
1800                                                            p7, p8, p9
1801
1802         p1 = p(1); p2 = p(2); p3 = p(3); p4 = p(4)
1803         p5 = p(5); p6 = p(6); p7 = p(7); p8 = p(8)
1804         p9 = p(9); p10 = p(10); p11 = p(11); p12 = p(12)
1805         res = poly2(nsa, p1, p2, p3)*poly2(nsb, p1, p2, p3)*poly2(nsc, p1, p2, p3)*poly2(nsd, p1, p2, p3)* &
1806               poly2(npgfa, p4, p5, p6)*poly2(npgfb, p4, p5, p6)*poly2(npgfc, p4, p5, p6)* &
1807               poly2(npgfd, p4, p5, p6)*EXP(-p7*ratio + p8*ratio**2) + &
1808              1000.0_dp*p9 + poly2(nsa, p10, p11, p12)*poly2(nsb, p10, p11, p12)*poly2(nsc, p10, p11, p12)*poly2(nsd, p10, p11, p12)
1809         res = 1 + ABS(res)
1810      END FUNCTION estimate_basic
1811
1812! **************************************************************************************************
1813!> \brief ...
1814!> \param x ...
1815!> \param a0 ...
1816!> \param a1 ...
1817!> \param a2 ...
1818!> \return ...
1819! **************************************************************************************************
1820      REAL(KIND=dp) FUNCTION poly2(x, a0, a1, a2)
1821      INTEGER                                            :: x
1822      REAL(KIND=dp)                                      :: a0, a1, a2
1823
1824         poly2 = a0 + a1*x + a2*x*x
1825      END FUNCTION poly2
1826
1827   END FUNCTION cost_model
1828! **************************************************************************************************
1829!> \brief Minimizes the maximum cost per cpu by shuffling around all bins
1830!> \param total_number_of_bins ...
1831!> \param number_of_processes ...
1832!> \param bin_costs costs per bin
1833!> \param distribution_vector will contain the final distribution
1834!> \param do_randomize ...
1835!> \par History
1836!>      03.2009 created from a hack by Joost [Manuel Guidon]
1837!> \author Manuel Guidon
1838! **************************************************************************************************
1839   SUBROUTINE optimize_distribution(total_number_of_bins, number_of_processes, bin_costs, &
1840                                    distribution_vector, do_randomize)
1841      INTEGER                                            :: total_number_of_bins, number_of_processes
1842      INTEGER(int_8), DIMENSION(:), POINTER              :: bin_costs
1843      INTEGER, DIMENSION(:), POINTER                     :: distribution_vector
1844      LOGICAL, INTENT(IN)                                :: do_randomize
1845
1846      CHARACTER(LEN=*), PARAMETER :: routineN = 'optimize_distribution', &
1847                                     routineP = moduleN//':'//routineN
1848
1849      INTEGER                                            :: i, itmp, j, nstep
1850      INTEGER(int_8), DIMENSION(:), POINTER              :: my_cost_cpu, tmp_cost, tmp_cpu_cost
1851      INTEGER, DIMENSION(:), POINTER                     :: tmp_cpu_index, tmp_index
1852      TYPE(rng_stream_type), POINTER                     :: rng_stream
1853
1854      nstep = MAX(1, INT(number_of_processes)/2)
1855
1856      ALLOCATE (tmp_cost(total_number_of_bins))
1857      ALLOCATE (tmp_index(total_number_of_bins))
1858      ALLOCATE (tmp_cpu_cost(number_of_processes))
1859      ALLOCATE (tmp_cpu_index(number_of_processes))
1860      ALLOCATE (my_cost_cpu(number_of_processes))
1861      tmp_cost = bin_costs
1862
1863      CALL sort(tmp_cost, total_number_of_bins, tmp_index)
1864      my_cost_cpu = 0
1865      !
1866      ! assign the largest remaining bin to the CPU with the smallest load
1867      ! gives near perfect distributions for a sufficient number of bins ...
1868      ! doing this in chunks of nstep (where nstep ~ number_of_processes) makes this n log n and gives
1869      ! each cpu a similar number of tasks.
1870      ! it also avoids degenerate cases where thousands of zero sized tasks
1871      ! are assigned to the same (least loaded) cpu
1872      !
1873      IF (do_randomize) THEN
1874         NULLIFY (rng_stream)
1875         CALL create_rng_stream(rng_stream=rng_stream, &
1876                                name="uniform_rng", &
1877                                distribution_type=UNIFORM)
1878      END IF
1879
1880      DO i = total_number_of_bins, 1, -nstep
1881         tmp_cpu_cost = my_cost_cpu
1882         CALL sort(tmp_cpu_cost, INT(number_of_processes), tmp_cpu_index)
1883         IF (do_randomize) THEN
1884            CALL reshuffle(MIN(i, nstep), tmp_cpu_index(1:MIN(i, nstep)), rng_stream)
1885         END IF
1886         DO j = 1, MIN(i, nstep)
1887            itmp = tmp_cpu_index(j)
1888            distribution_vector(tmp_index(i - j + 1)) = itmp
1889            my_cost_cpu(itmp) = my_cost_cpu(itmp) + bin_costs(tmp_index(i - j + 1))
1890         ENDDO
1891      ENDDO
1892
1893      IF (do_randomize) THEN
1894         CALL delete_rng_stream(rng_stream)
1895      END IF
1896
1897      DEALLOCATE (tmp_cost, tmp_index, tmp_cpu_cost)
1898      DEALLOCATE (tmp_cpu_index, my_cost_cpu)
1899   END SUBROUTINE optimize_distribution
1900
1901! **************************************************************************************************
1902!> \brief Given a 2d index pair, this function returns a 1d index pair for
1903!>        a symmetric upper triangle NxN matrix
1904!>        The compiler should inline this function, therefore it appears in
1905!>        several modules
1906!> \param i 2d index
1907!> \param j 2d index
1908!> \param N matrix size
1909!> \return ...
1910!> \par History
1911!>      03.2009 created [Manuel Guidon]
1912!> \author Manuel Guidon
1913! **************************************************************************************************
1914   PURE FUNCTION get_1D_idx(i, j, N)
1915      INTEGER, INTENT(IN)                                :: i, j
1916      INTEGER(int_8), INTENT(IN)                         :: N
1917      INTEGER(int_8)                                     :: get_1D_idx
1918
1919      INTEGER(int_8)                                     :: min_ij
1920
1921      min_ij = MIN(i, j)
1922      get_1D_idx = min_ij*N + MAX(i, j) - (min_ij - 1)*min_ij/2 - N
1923
1924   END FUNCTION get_1D_idx
1925
1926! **************************************************************************************************
1927!> \brief ...
1928!> \param natom ...
1929!> \param nkind ...
1930!> \param list_ij ...
1931!> \param list_kl ...
1932!> \param set_list_ij ...
1933!> \param set_list_kl ...
1934!> \param iatom_start ...
1935!> \param iatom_end ...
1936!> \param jatom_start ...
1937!> \param jatom_end ...
1938!> \param katom_start ...
1939!> \param katom_end ...
1940!> \param latom_start ...
1941!> \param latom_end ...
1942!> \param particle_set ...
1943!> \param coeffs_set ...
1944!> \param coeffs_kind ...
1945!> \param is_assoc_atomic_block_global ...
1946!> \param do_periodic ...
1947!> \param kind_of ...
1948!> \param basis_parameter ...
1949!> \param pmax_set ...
1950!> \param pmax_atom ...
1951!> \param pmax_blocks ...
1952!> \param cell ...
1953!> \param do_p_screening ...
1954!> \param map_atom_to_kind_atom ...
1955!> \param eval_type ...
1956!> \param log10_eps_schwarz ...
1957!> \param log_2 ...
1958!> \param coeffs_kind_max0 ...
1959!> \param use_virial ...
1960!> \param atomic_pair_list ...
1961!> \return ...
1962! **************************************************************************************************
1963   FUNCTION estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1964                                iatom_start, iatom_end, jatom_start, jatom_end, &
1965                                katom_start, katom_end, latom_start, latom_end, &
1966                                particle_set, &
1967                                coeffs_set, coeffs_kind, &
1968                                is_assoc_atomic_block_global, do_periodic, &
1969                                kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1970                                cell, &
1971                                do_p_screening, map_atom_to_kind_atom, eval_type, &
1972                                log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
1973                                atomic_pair_list)
1974
1975      INTEGER, INTENT(IN)                                :: natom, nkind
1976      TYPE(pair_list_type)                               :: list_ij, list_kl
1977      TYPE(pair_set_list_type), DIMENSION(:)             :: set_list_ij, set_list_kl
1978      INTEGER, INTENT(IN)                                :: iatom_start, iatom_end, jatom_start, &
1979                                                            jatom_end, katom_start, katom_end, &
1980                                                            latom_start, latom_end
1981      TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
1982      TYPE(hfx_screen_coeff_type), &
1983         DIMENSION(:, :, :, :), POINTER                  :: coeffs_set
1984      TYPE(hfx_screen_coeff_type), &
1985         DIMENSION(nkind, nkind)                         :: coeffs_kind
1986      INTEGER, DIMENSION(:, :)                           :: is_assoc_atomic_block_global
1987      LOGICAL                                            :: do_periodic
1988      INTEGER                                            :: kind_of(*)
1989      TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter
1990      TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: pmax_set
1991      REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_atom
1992      REAL(dp)                                           :: pmax_blocks
1993      TYPE(cell_type), POINTER                           :: cell
1994      LOGICAL, INTENT(IN)                                :: do_p_screening
1995      INTEGER, DIMENSION(:), POINTER                     :: map_atom_to_kind_atom
1996      INTEGER, INTENT(IN)                                :: eval_type
1997      REAL(dp)                                           :: log10_eps_schwarz, log_2, &
1998                                                            coeffs_kind_max0
1999      LOGICAL, INTENT(IN)                                :: use_virial
2000      LOGICAL, DIMENSION(natom, natom)                   :: atomic_pair_list
2001      INTEGER(int_8)                                     :: estimate_block_cost
2002
2003      INTEGER :: i_list_ij, i_list_kl, i_set_list_ij, i_set_list_ij_start, i_set_list_ij_stop, &
2004         i_set_list_kl, i_set_list_kl_start, i_set_list_kl_stop, iatom, ikind, iset, jatom, jkind, &
2005         jset, katom, kind_kind_idx, kkind, kset, latom, lkind, lset, swap_id
2006      INTEGER, DIMENSION(:), POINTER                     :: npgfa, npgfb, npgfc, npgfd, nsgfa, &
2007                                                            nsgfb, nsgfc, nsgfd
2008      REAL(dp)                                           :: actual_pmax_atom, cost_tmp, max_val1, &
2009                                                            max_val2, pmax_entry, rab2, rcd2, &
2010                                                            screen_kind_ij, screen_kind_kl
2011      REAL(dp), DIMENSION(:, :), POINTER                 :: ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4
2012
2013      estimate_block_cost = 0_int_8
2014
2015      CALL build_pair_list(natom, list_ij, set_list_ij, iatom_start, iatom_end, jatom_start, jatom_end, &
2016                           kind_of, basis_parameter, particle_set, &
2017                           do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
2018                           log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
2019
2020      CALL build_pair_list(natom, list_kl, set_list_kl, katom_start, katom_end, latom_start, latom_end, &
2021                           kind_of, basis_parameter, particle_set, &
2022                           do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
2023                           log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
2024
2025      DO i_list_ij = 1, list_ij%n_element
2026         iatom = list_ij%elements(i_list_ij)%pair(1)
2027         jatom = list_ij%elements(i_list_ij)%pair(2)
2028         i_set_list_ij_start = list_ij%elements(i_list_ij)%set_bounds(1)
2029         i_set_list_ij_stop = list_ij%elements(i_list_ij)%set_bounds(2)
2030         ikind = list_ij%elements(i_list_ij)%kind_pair(1)
2031         jkind = list_ij%elements(i_list_ij)%kind_pair(2)
2032         rab2 = list_ij%elements(i_list_ij)%dist2
2033
2034         nsgfa => basis_parameter(ikind)%nsgf
2035         nsgfb => basis_parameter(jkind)%nsgf
2036         npgfa => basis_parameter(ikind)%npgf
2037         npgfb => basis_parameter(jkind)%npgf
2038
2039         DO i_list_kl = 1, list_kl%n_element
2040
2041            katom = list_kl%elements(i_list_kl)%pair(1)
2042            latom = list_kl%elements(i_list_kl)%pair(2)
2043
2044            IF (.NOT. (katom + latom <= iatom + jatom)) CYCLE
2045            IF (((iatom + jatom) .EQ. (katom + latom)) .AND. (katom < iatom)) CYCLE
2046
2047            IF (eval_type == hfx_do_eval_forces) THEN
2048               IF (.NOT. use_virial) THEN
2049                  IF ((iatom == jatom .AND. iatom == katom .AND. iatom == latom)) CYCLE
2050               END IF
2051            END IF
2052
2053            i_set_list_kl_start = list_kl%elements(i_list_kl)%set_bounds(1)
2054            i_set_list_kl_stop = list_kl%elements(i_list_kl)%set_bounds(2)
2055            kkind = list_kl%elements(i_list_kl)%kind_pair(1)
2056            lkind = list_kl%elements(i_list_kl)%kind_pair(2)
2057            rcd2 = list_kl%elements(i_list_kl)%dist2
2058
2059            nsgfc => basis_parameter(kkind)%nsgf
2060            nsgfd => basis_parameter(lkind)%nsgf
2061            npgfc => basis_parameter(kkind)%npgf
2062            npgfd => basis_parameter(lkind)%npgf
2063
2064            IF (do_p_screening) THEN
2065               actual_pmax_atom = MAX(pmax_atom(katom, iatom), &
2066                                      pmax_atom(latom, jatom), &
2067                                      pmax_atom(latom, iatom), &
2068                                      pmax_atom(katom, jatom))
2069            ELSE
2070               actual_pmax_atom = 0.0_dp
2071            END IF
2072
2073            screen_kind_ij = coeffs_kind(jkind, ikind)%x(1)*rab2 + &
2074                             coeffs_kind(jkind, ikind)%x(2)
2075            screen_kind_kl = coeffs_kind(lkind, kkind)%x(1)*rcd2 + &
2076                             coeffs_kind(lkind, kkind)%x(2)
2077            IF (screen_kind_ij + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) CYCLE
2078
2079            IF (.NOT. (is_assoc_atomic_block_global(latom, iatom) >= 1 .AND. &
2080                       is_assoc_atomic_block_global(katom, iatom) >= 1 .AND. &
2081                       is_assoc_atomic_block_global(katom, jatom) >= 1 .AND. &
2082                       is_assoc_atomic_block_global(latom, jatom) >= 1)) CYCLE
2083
2084            IF (do_p_screening) THEN
2085               SELECT CASE (eval_type)
2086               CASE (hfx_do_eval_energy)
2087                  swap_id = 0
2088                  kind_kind_idx = INT(get_1D_idx(kkind, ikind, INT(nkind, int_8)))
2089                  IF (ikind >= kkind) THEN
2090                     ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2091                                                               map_atom_to_kind_atom(katom), &
2092                                                               map_atom_to_kind_atom(iatom))
2093                  ELSE
2094                     ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2095                                                               map_atom_to_kind_atom(iatom), &
2096                                                               map_atom_to_kind_atom(katom))
2097                     swap_id = swap_id + 1
2098                  END IF
2099                  kind_kind_idx = INT(get_1D_idx(lkind, jkind, INT(nkind, int_8)))
2100                  IF (jkind >= lkind) THEN
2101                     ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2102                                                               map_atom_to_kind_atom(latom), &
2103                                                               map_atom_to_kind_atom(jatom))
2104                  ELSE
2105                     ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2106                                                               map_atom_to_kind_atom(jatom), &
2107                                                               map_atom_to_kind_atom(latom))
2108                     swap_id = swap_id + 2
2109                  END IF
2110                  kind_kind_idx = INT(get_1D_idx(lkind, ikind, INT(nkind, int_8)))
2111                  IF (ikind >= lkind) THEN
2112                     ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2113                                                               map_atom_to_kind_atom(latom), &
2114                                                               map_atom_to_kind_atom(iatom))
2115                  ELSE
2116                     ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2117                                                               map_atom_to_kind_atom(iatom), &
2118                                                               map_atom_to_kind_atom(latom))
2119                     swap_id = swap_id + 4
2120                  END IF
2121                  kind_kind_idx = INT(get_1D_idx(kkind, jkind, INT(nkind, int_8)))
2122                  IF (jkind >= kkind) THEN
2123                     ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2124                                                               map_atom_to_kind_atom(katom), &
2125                                                               map_atom_to_kind_atom(jatom))
2126                  ELSE
2127                     ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2128                                                               map_atom_to_kind_atom(jatom), &
2129                                                               map_atom_to_kind_atom(katom))
2130                     swap_id = swap_id + 8
2131                  END IF
2132               CASE (hfx_do_eval_forces)
2133                  swap_id = 16
2134                  kind_kind_idx = INT(get_1D_idx(kkind, ikind, INT(nkind, int_8)))
2135                  IF (ikind >= kkind) THEN
2136                     ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2137                                                               map_atom_to_kind_atom(katom), &
2138                                                               map_atom_to_kind_atom(iatom))
2139                  ELSE
2140                     ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2141                                                               map_atom_to_kind_atom(iatom), &
2142                                                               map_atom_to_kind_atom(katom))
2143                     swap_id = swap_id + 1
2144                  END IF
2145                  kind_kind_idx = INT(get_1D_idx(lkind, jkind, INT(nkind, int_8)))
2146                  IF (jkind >= lkind) THEN
2147                     ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2148                                                               map_atom_to_kind_atom(latom), &
2149                                                               map_atom_to_kind_atom(jatom))
2150                  ELSE
2151                     ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2152                                                               map_atom_to_kind_atom(jatom), &
2153                                                               map_atom_to_kind_atom(latom))
2154                     swap_id = swap_id + 2
2155                  END IF
2156                  kind_kind_idx = INT(get_1D_idx(lkind, ikind, INT(nkind, int_8)))
2157                  IF (ikind >= lkind) THEN
2158                     ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2159                                                               map_atom_to_kind_atom(latom), &
2160                                                               map_atom_to_kind_atom(iatom))
2161                  ELSE
2162                     ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2163                                                               map_atom_to_kind_atom(iatom), &
2164                                                               map_atom_to_kind_atom(latom))
2165                     swap_id = swap_id + 4
2166                  END IF
2167                  kind_kind_idx = INT(get_1D_idx(kkind, jkind, INT(nkind, int_8)))
2168                  IF (jkind >= kkind) THEN
2169                     ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2170                                                               map_atom_to_kind_atom(katom), &
2171                                                               map_atom_to_kind_atom(jatom))
2172                  ELSE
2173                     ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2174                                                               map_atom_to_kind_atom(jatom), &
2175                                                               map_atom_to_kind_atom(katom))
2176                     swap_id = swap_id + 8
2177                  END IF
2178               END SELECT
2179            END IF
2180
2181            DO i_set_list_ij = i_set_list_ij_start, i_set_list_ij_stop
2182               iset = set_list_ij(i_set_list_ij)%pair(1)
2183               jset = set_list_ij(i_set_list_ij)%pair(2)
2184
2185               max_val1 = coeffs_set(jset, iset, jkind, ikind)%x(1)*rab2 + &
2186                          coeffs_set(jset, iset, jkind, ikind)%x(2)
2187
2188               IF (max_val1 + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) CYCLE
2189               DO i_set_list_kl = i_set_list_kl_start, i_set_list_kl_stop
2190                  kset = set_list_kl(i_set_list_kl)%pair(1)
2191                  lset = set_list_kl(i_set_list_kl)%pair(2)
2192
2193                  max_val2 = max_val1 + (coeffs_set(lset, kset, lkind, kkind)%x(1)*rcd2 + &
2194                                         coeffs_set(lset, kset, lkind, kkind)%x(2))
2195
2196                  IF (max_val2 + actual_pmax_atom < log10_eps_schwarz) CYCLE
2197                  IF (do_p_screening) THEN
2198                     CALL get_pmax_val(ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, &
2199                                       iset, jset, kset, lset, &
2200                                       pmax_entry, swap_id)
2201                     IF (eval_type == hfx_do_eval_forces) THEN
2202                        pmax_entry = log_2 + pmax_entry
2203                     END IF
2204                  ELSE
2205                     pmax_entry = 0.0_dp
2206                  END IF
2207                  max_val2 = max_val2 + pmax_entry
2208                  IF (max_val2 < log10_eps_schwarz) CYCLE
2209                  SELECT CASE (eval_type)
2210                  CASE (hfx_do_eval_energy)
2211                     cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
2212                                           npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
2213                                           max_val2/log10_eps_schwarz, &
2214                                           p1_energy, p2_energy, p3_energy)
2215                     estimate_block_cost = estimate_block_cost + INT(cost_tmp, KIND=int_8)
2216                  CASE (hfx_do_eval_forces)
2217                     cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
2218                                           npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
2219                                           max_val2/log10_eps_schwarz, &
2220                                           p1_forces, p2_forces, p3_forces)
2221                     estimate_block_cost = estimate_block_cost + INT(cost_tmp, KIND=int_8)
2222                  END SELECT
2223               ENDDO ! i_set_list_kl
2224            ENDDO ! i_set_list_ij
2225         ENDDO ! i_list_kl
2226      ENDDO ! i_list_ij
2227
2228   END FUNCTION estimate_block_cost
2229
2230! **************************************************************************************************
2231!> \brief ...
2232!> \param nkind ...
2233!> \param para_env ...
2234!> \param natom ...
2235!> \param block_size ...
2236!> \param nblock ...
2237!> \param blocks ...
2238!> \param list_ij ...
2239!> \param list_kl ...
2240!> \param set_list_ij ...
2241!> \param set_list_kl ...
2242!> \param particle_set ...
2243!> \param coeffs_set ...
2244!> \param coeffs_kind ...
2245!> \param is_assoc_atomic_block_global ...
2246!> \param do_periodic ...
2247!> \param kind_of ...
2248!> \param basis_parameter ...
2249!> \param pmax_set ...
2250!> \param pmax_atom ...
2251!> \param pmax_blocks ...
2252!> \param cell ...
2253!> \param do_p_screening ...
2254!> \param map_atom_to_kind_atom ...
2255!> \param eval_type ...
2256!> \param log10_eps_schwarz ...
2257!> \param log_2 ...
2258!> \param coeffs_kind_max0 ...
2259!> \param use_virial ...
2260!> \param atomic_pair_list ...
2261! **************************************************************************************************
2262   SUBROUTINE init_blocks(nkind, para_env, natom, block_size, nblock, blocks, &
2263                          list_ij, list_kl, set_list_ij, set_list_kl, &
2264                          particle_set, &
2265                          coeffs_set, coeffs_kind, &
2266                          is_assoc_atomic_block_global, do_periodic, &
2267                          kind_of, basis_parameter, pmax_set, pmax_atom, &
2268                          pmax_blocks, cell, &
2269                          do_p_screening, map_atom_to_kind_atom, eval_type, &
2270                          log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
2271                          atomic_pair_list)
2272
2273      INTEGER, INTENT(IN)                                :: nkind
2274      TYPE(cp_para_env_type), POINTER                    :: para_env
2275      INTEGER                                            :: natom, block_size, nblock
2276      TYPE(hfx_block_range_type), DIMENSION(1:nblock)    :: blocks
2277      TYPE(pair_list_type)                               :: list_ij, list_kl
2278      TYPE(pair_set_list_type), DIMENSION(:)             :: set_list_ij, set_list_kl
2279      TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
2280      TYPE(hfx_screen_coeff_type), &
2281         DIMENSION(:, :, :, :), POINTER                  :: coeffs_set
2282      TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
2283         POINTER                                         :: coeffs_kind
2284      INTEGER, DIMENSION(:, :)                           :: is_assoc_atomic_block_global
2285      LOGICAL                                            :: do_periodic
2286      INTEGER                                            :: kind_of(*)
2287      TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter
2288      TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: pmax_set
2289      REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_atom
2290      REAL(dp)                                           :: pmax_blocks
2291      TYPE(cell_type), POINTER                           :: cell
2292      LOGICAL, INTENT(IN)                                :: do_p_screening
2293      INTEGER, DIMENSION(:), POINTER                     :: map_atom_to_kind_atom
2294      INTEGER, INTENT(IN)                                :: eval_type
2295      REAL(dp)                                           :: log10_eps_schwarz, log_2, &
2296                                                            coeffs_kind_max0
2297      LOGICAL, INTENT(IN)                                :: use_virial
2298      LOGICAL, DIMENSION(natom, natom)                   :: atomic_pair_list
2299
2300      INTEGER                                            :: atom_block, i, iatom_block, iatom_end, &
2301                                                            iatom_start, my_cpu_rank, ncpus
2302
2303      DO atom_block = 0, nblock - 1
2304         iatom_block = MODULO(atom_block, nblock) + 1
2305         iatom_start = (iatom_block - 1)*block_size + 1
2306         iatom_end = MIN(iatom_block*block_size, natom)
2307         blocks(atom_block + 1)%istart = iatom_start
2308         blocks(atom_block + 1)%iend = iatom_end
2309         blocks(atom_block + 1)%cost = 0_int_8
2310      END DO
2311
2312      ncpus = para_env%num_pe
2313      my_cpu_rank = para_env%mepos
2314      DO i = 1, nblock
2315         IF (MODULO(i, ncpus) /= my_cpu_rank) THEN
2316            blocks(i)%istart = 0
2317            blocks(i)%iend = 0
2318            CYCLE
2319         END IF
2320         iatom_start = blocks(i)%istart
2321         iatom_end = blocks(i)%iend
2322         blocks(i)%cost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
2323                                              iatom_start, iatom_end, iatom_start, iatom_end, &
2324                                              iatom_start, iatom_end, iatom_start, iatom_end, &
2325                                              particle_set, &
2326                                              coeffs_set, coeffs_kind, &
2327                                              is_assoc_atomic_block_global, do_periodic, &
2328                                              kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
2329                                              cell, &
2330                                              do_p_screening, map_atom_to_kind_atom, eval_type, &
2331                                              log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
2332
2333      END DO
2334   END SUBROUTINE init_blocks
2335
2336! **************************************************************************************************
2337!> \brief ...
2338!> \param para_env ...
2339!> \param x_data ...
2340!> \param iw ...
2341!> \param n_threads ...
2342!> \param i_thread ...
2343!> \param eval_type ...
2344! **************************************************************************************************
2345   SUBROUTINE collect_load_balance_info(para_env, x_data, iw, n_threads, i_thread, &
2346                                        eval_type)
2347
2348      TYPE(cp_para_env_type), POINTER                    :: para_env
2349      TYPE(hfx_type), POINTER                            :: x_data
2350      INTEGER, INTENT(IN)                                :: iw, n_threads, i_thread, eval_type
2351
2352      CHARACTER(LEN=*), PARAMETER :: routineN = 'collect_load_balance_info', &
2353         routineP = moduleN//':'//routineN
2354
2355      INTEGER                                            :: i, j, k, my_rank, nbins, nranks, &
2356                                                            total_bins
2357      INTEGER(int_8)                                     :: avg_bin, avg_rank, max_bin, max_rank, &
2358                                                            min_bin, min_rank, sum_bin, sum_rank
2359      INTEGER(int_8), ALLOCATABLE, DIMENSION(:)          :: buffer, buffer_in, buffer_out, summary
2360      INTEGER(int_8), ALLOCATABLE, DIMENSION(:), SAVE    :: shm_cost_vector
2361      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: bins_per_rank, rdispl, sort_idx
2362      INTEGER, ALLOCATABLE, DIMENSION(:), SAVE           :: shm_bins_per_rank, shm_displ
2363
2364      SELECT CASE (eval_type)
2365      CASE (hfx_do_eval_energy)
2366         nbins = SIZE(x_data%distribution_energy)
2367      CASE (hfx_do_eval_forces)
2368         nbins = SIZE(x_data%distribution_forces)
2369      END SELECT
2370
2371!$OMP MASTER
2372      ALLOCATE (shm_bins_per_rank(n_threads))
2373      ALLOCATE (shm_displ(n_threads + 1))
2374!$OMP END MASTER
2375!$OMP BARRIER
2376
2377      shm_bins_per_rank(i_thread + 1) = nbins
2378!$OMP BARRIER
2379      nbins = 0
2380      DO i = 1, n_threads
2381         nbins = nbins + shm_bins_per_rank(i)
2382      END DO
2383      my_rank = para_env%mepos
2384      nranks = para_env%num_pe
2385
2386!$OMP BARRIER
2387!$OMP MASTER
2388      ALLOCATE (bins_per_rank(nranks))
2389      bins_per_rank = 0
2390
2391      bins_per_rank(my_rank + 1) = nbins
2392
2393      CALL mp_sum(bins_per_rank, para_env%group)
2394
2395      total_bins = 0
2396      DO i = 1, nranks
2397         total_bins = total_bins + bins_per_rank(i)
2398      END DO
2399
2400      ALLOCATE (shm_cost_vector(2*total_bins))
2401      shm_cost_vector = -1_int_8
2402      shm_displ(1) = 1
2403      DO i = 2, n_threads
2404         shm_displ(i) = shm_displ(i - 1) + shm_bins_per_rank(i - 1)
2405      END DO
2406      shm_displ(n_threads + 1) = nbins + 1
2407!$OMP END MASTER
2408!$OMP BARRIER
2409      j = 0
2410      SELECT CASE (eval_type)
2411      CASE (hfx_do_eval_energy)
2412         DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
2413            j = j + 1
2414            shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_energy(j)%cost
2415            shm_cost_vector(2*i) = INT(x_data%distribution_energy(j)%time_first_scf*10000.0_dp, KIND=int_8)
2416         END DO
2417      CASE (hfx_do_eval_forces)
2418         DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
2419            j = j + 1
2420            shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_forces(j)%cost
2421            shm_cost_vector(2*i) = INT(x_data%distribution_forces(j)%time_forces*10000.0_dp, KIND=int_8)
2422         END DO
2423      END SELECT
2424!$OMP BARRIER
2425!$OMP MASTER
2426      ! ** calculate offsets
2427      ALLOCATE (rdispl(nranks))
2428      bins_per_rank(:) = bins_per_rank(:)*2
2429      rdispl(1) = 0
2430      DO i = 2, nranks
2431         rdispl(i) = rdispl(i - 1) + bins_per_rank(i - 1)
2432      END DO
2433
2434      ALLOCATE (buffer_in(2*nbins))
2435      ALLOCATE (buffer_out(2*total_bins))
2436
2437      DO i = 1, nbins
2438         buffer_in(2*(i - 1) + 1) = shm_cost_vector(2*(i - 1) + 1)
2439         buffer_in(2*i) = shm_cost_vector(2*i)
2440      END DO
2441
2442      CALL mp_gatherv(buffer_in, buffer_out, bins_per_rank, rdispl, para_env%source, para_env%group)
2443
2444      IF (iw > 0) THEN
2445
2446         ALLOCATE (summary(2*nranks))
2447         summary = 0_int_8
2448
2449         WRITE (iw, '( /, 1X, 79("-") )')
2450         WRITE (iw, '( " -", 77X, "-" )')
2451         SELECT CASE (eval_type)
2452         CASE (hfx_do_eval_energy)
2453            WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - ENERGY '
2454         CASE (hfx_do_eval_forces)
2455            WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - FORCES '
2456         END SELECT
2457         WRITE (iw, '( " -", 77X, "-" )')
2458         WRITE (iw, '( 1X, 79("-") )')
2459
2460         WRITE (iw, FMT="(T3,A,T15,A,T35,A,T55,A)") "MPI RANK", "BIN #", "EST cost", "Processing time [s]"
2461         WRITE (iw, '( 1X, 79("-"), / )')
2462         k = 0
2463         DO i = 1, nranks
2464            DO j = 1, bins_per_rank(i)/2
2465               k = k + 1
2466               WRITE (iw, FMT="(T6,I5,T15,I5,T27,I16,T55,F19.8)") &
2467                  i - 1, j, buffer_out(2*(k - 1) + 1), REAL(buffer_out(2*k), dp)/10000.0_dp
2468               summary(2*(i - 1) + 1) = summary(2*(i - 1) + 1) + buffer_out(2*(k - 1) + 1)
2469               summary(2*i) = summary(2*i) + buffer_out(2*k)
2470            END DO
2471         END DO
2472
2473         !** Summary
2474         max_bin = 0_int_8
2475         min_bin = HUGE(min_bin)
2476         sum_bin = 0_int_8
2477         DO i = 1, total_bins
2478            sum_bin = sum_bin + buffer_out(2*i)
2479            max_bin = MAX(max_bin, buffer_out(2*i))
2480            min_bin = MIN(min_bin, buffer_out(2*i))
2481         END DO
2482         avg_bin = sum_bin/total_bins
2483
2484         max_rank = 0_int_8
2485         min_rank = HUGE(min_rank)
2486         sum_rank = 0_int_8
2487         DO i = 1, nranks
2488            sum_rank = sum_rank + summary(2*i)
2489            max_rank = MAX(max_rank, summary(2*i))
2490            min_rank = MIN(min_rank, summary(2*i))
2491         END DO
2492         avg_rank = sum_rank/nranks
2493
2494         WRITE (iw, FMT='(/,T3,A,/)') "SUMMARY:"
2495         WRITE (iw, FMT="(T3,A,T35,F19.8)") "Max bin", REAL(max_bin, dp)/10000.0_dp
2496         WRITE (iw, FMT="(T3,A,T35,F19.8)") "Min bin", REAL(min_bin, dp)/10000.0_dp
2497         WRITE (iw, FMT="(T3,A,T35,F19.8)") "Sum bin", REAL(sum_bin, dp)/10000.0_dp
2498         WRITE (iw, FMT="(T3,A,T35,F19.8,/)") "Avg bin", REAL(avg_bin, dp)/10000.0_dp
2499         WRITE (iw, FMT="(T3,A,T35,F19.8)") "Max rank", REAL(max_rank, dp)/10000.0_dp
2500         WRITE (iw, FMT="(T3,A,T35,F19.8)") "Min rank", REAL(min_rank, dp)/10000.0_dp
2501         WRITE (iw, FMT="(T3,A,T35,F19.8)") "Sum rank", REAL(sum_rank, dp)/10000.0_dp
2502         WRITE (iw, FMT="(T3,A,T35,F19.8,/)") "Avg rank", REAL(avg_rank, dp)/10000.0_dp
2503
2504         ALLOCATE (buffer(nranks))
2505         ALLOCATE (sort_idx(nranks))
2506
2507         DO i = 1, nranks
2508            buffer(i) = summary(2*i)
2509         END DO
2510
2511         CALL sort(buffer, nranks, sort_idx)
2512
2513         WRITE (iw, FMT="(T3,A,T35,A,T55,A,/)") "MPI RANK", "EST cost", "Processing time [s]"
2514         DO i = nranks, 1, -1
2515       WRITE (iw, FMT="(T6,I5,T27,I16,T55,F19.8)") sort_idx(i) - 1, summary(2*(sort_idx(i) - 1) + 1), REAL(buffer(i), dp)/10000.0_dp
2516         END DO
2517
2518         DEALLOCATE (summary, buffer, sort_idx)
2519
2520      END IF
2521
2522      DEALLOCATE (buffer_in, buffer_out, rdispl)
2523
2524      CALL mp_sync(para_env%group)
2525
2526      DEALLOCATE (shm_bins_per_rank, shm_displ, shm_cost_vector)
2527!$OMP END MASTER
2528!$OMP BARRIER
2529
2530   END SUBROUTINE collect_load_balance_info
2531
2532! **************************************************************************************************
2533!> \brief ...
2534!> \param size ...
2535!> \param array ...
2536!> \param rng_stream ...
2537! **************************************************************************************************
2538   SUBROUTINE reshuffle(size, array, rng_stream)
2539      INTEGER                                            :: size
2540      INTEGER, DIMENSION(size)                           :: array
2541      TYPE(rng_stream_type), POINTER                     :: rng_stream
2542
2543      INTEGER                                            :: i, idx1, idx2, tmp
2544      REAL(dp)                                           :: x
2545
2546      DO i = 1, size*10
2547         x = next_random_number(rng_stream)
2548         idx1 = INT(x*(size + 1 - 1)) + 1
2549         x = next_random_number(rng_stream)
2550         idx2 = INT(x*(size + 1 - 1)) + 1
2551
2552         tmp = array(idx1)
2553         array(idx1) = array(idx2)
2554         array(idx2) = tmp
2555      END DO
2556
2557   END SUBROUTINE
2558
2559#include "hfx_get_pmax_val.f90"
2560
2561END MODULE hfx_load_balance_methods
2562