!--------------------------------------------------------------------------------------------------!
!   CP2K: A general program to perform molecular dynamics simulations                              !
!   Copyright (C) 2000 - 2019  CP2K developers group                                               !
!--------------------------------------------------------------------------------------------------!

! **************************************************************************************************
!> \brief Rountines for optimizing load balance between processes in HFX calculations
!> \par History
!>      04.2008 created [Manuel Guidon]
!> \author Manuel Guidon
! **************************************************************************************************
MODULE hfx_load_balance_methods
   USE cell_types,                      ONLY: cell_type
   USE cp_files,                        ONLY: close_file,&
                                              open_file
   USE cp_para_types,                   ONLY: cp_para_env_type
   USE hfx_pair_list_methods,           ONLY: build_atomic_pair_list,&
                                              build_pair_list
   USE hfx_types,                       ONLY: &
        hfx_basis_type, hfx_block_range_type, hfx_distribution, hfx_load_balance_type, hfx_p_kind, &
        hfx_screen_coeff_type, hfx_set_distr_energy, hfx_set_distr_forces, hfx_type, &
        pair_list_type, pair_set_list_type
   USE input_constants,                 ONLY: hfx_do_eval_energy,&
                                              hfx_do_eval_forces
   USE kinds,                           ONLY: dp,&
                                              int_8
   USE message_passing,                 ONLY: mp_allgather,&
                                              mp_gatherv,&
                                              mp_isendrecv,&
                                              mp_max,&
                                              mp_sum,&
                                              mp_sync,&
                                              mp_waitall
   USE parallel_rng_types,              ONLY: UNIFORM,&
                                              create_rng_stream,&
                                              delete_rng_stream,&
                                              next_random_number,&
                                              rng_stream_type
   USE particle_types,                  ONLY: particle_type
   USE util,                            ONLY: sort
#include "./base/base_uses.f90"

   IMPLICIT NONE
   PRIVATE

   PUBLIC :: hfx_load_balance, &
             hfx_update_load_balance, &
             collect_load_balance_info, cost_model, p1_energy, p2_energy, p3_energy

   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'hfx_load_balance_methods'

   REAL(KIND=dp), PARAMETER :: p1_energy(12) = (/2.9461408209700424_dp, 1.0624718662999657_dp, &
                                                 -1.91570128356921242E-002_dp, -1.6668495454436603_dp, &
                                                 1.7512639006523709_dp, -9.76074323945336081E-002_dp, &
                                                 2.6230786127311889_dp, -0.31870737623014189_dp, &
                                                 7.9588203912690973_dp, 1.8331423413134813_dp, &
                                                 -0.15427618665346299_dp, 0.19749436090711650_dp/)
   REAL(KIND=dp), PARAMETER :: p2_energy(12) = (/2.3104682960662593_dp, 1.8744052737304417_dp, &
                                                 -9.36564055598656797E-002_dp, 0.64284973765086939_dp, &
                                                 1.0137565430060556_dp, -6.80088178288954567E-003_dp, &
                                                 1.1692629207374552_dp, -2.6314710080507573_dp, &
                                                 19.237814781880786_dp, 1.0505934173661349_dp, &
                                                 0.80382371955699250_dp, 0.49903401991818103_dp/)
   REAL(KIND=dp), PARAMETER :: p3_energy(2) = (/7.82336287670072350E-002_dp, 0.38073304105744837_dp/)
   REAL(KIND=dp), PARAMETER :: p1_forces(12) = (/2.5746279948798874_dp, 1.3420575378609276_dp, &
                                                 -9.41673106447732111E-002_dp, 0.94568006899317825_dp, &
                                                 -1.4511897117448544_dp, 0.59178934677316952_dp, &
                                                 2.7291149361757236_dp, -0.50555512044800210_dp, &
                                                 8.3508180969609871_dp, 1.6829982496141809_dp, &
                                                 -0.74895370472152600_dp, 0.43801726744197500_dp/)
   REAL(KIND=dp), PARAMETER :: p2_forces(12) = (/2.6398568961569020_dp, 2.3024918834564101_dp, &
                                                 5.33216585432061581E-003_dp, 0.45572145697283628_dp, &
                                                 1.8119743851500618_dp, -0.12533918548421166_dp, &
                                                 -1.4040312084552751_dp, -4.5331650463917859_dp, &
                                                 12.593431549069477_dp, 1.1311978374487595_dp, &
                                                 1.4245996087624646_dp, 1.1425350529853495_dp/)
   REAL(KIND=dp), PARAMETER :: p3_forces(2) = (/0.12051930516830946_dp, 1.3828051586144336_dp/)

!***

CONTAINS

! **************************************************************************************************
!> \brief Distributes the computation of eri's to all available processes.
!> \param x_data Object that stores the indices array
!> \param eps_schwarz screening parameter
!> \param particle_set , atomic_kind_set, para_env ...
!> \param max_set Maximum number of set to be considered
!> \param para_env para_env
!> \param coeffs_set screening functions
!> \param coeffs_kind screening functions
!> \param is_assoc_atomic_block_global KS-matrix sparsity
!> \param do_periodic flag for periodicity
!> \param load_balance_parameter Paramters for Monte-Carlo routines
!> \param kind_of helper array for mapping
!> \param basis_parameter Basis set parameters
!> \param pmax_set Initial screening matrix
!> \param pmax_atom ...
!> \param i_thread Process ID of current Thread
!> \param n_threads Total Number of Threads
!> \param cell cell
!> \param do_p_screening Flag for initial p screening
!> \param map_atom_to_kind_atom ...
!> \param nkind ...
!> \param eval_type ...
!> \param pmax_block ...
!> \param use_virial ...
!> \par History
!>      06.2007 created [Manuel Guidon]
!>      08.2007 new parallel scheme [Manuel Guidon]
!>      09.2007 new 'modulo' parellel scheme and Monte Carlo step [Manuel Guidon]
!>      11.2007 parallelize load balance on box_idx1 [Manuel Guidon]
!>      02.2009 completely refactored [Manuel Guidon]
!> \author Manuel Guidon
!> \note
!>      The optimization is done via a binning procedure followed by simple
!>      Monte Carlo procedure:
!>      In a first step the total amount of integrals in the system is calculated,
!>      taking into account the sparsity of the KS-matrix , the screening based
!>      on near/farfield approximations and if desired the screening on an initial
!>      density matrix.
!>      In a second step, bins are generate that contain approximately the same number
!>      of integrals, and a cost for these bins is estimated (currently the number of integrals)
!>      In a third step, a Monte Carlo procedure optimizes the assignment
!>      of the different loads to each process
!>      At the end each process owns an unique array of *atomic* indices-ranges
!>      that are used to decide whether a process has to calculate a certain
!>      bunch of integrals or not
! **************************************************************************************************
   SUBROUTINE hfx_load_balance(x_data, eps_schwarz, particle_set, max_set, para_env, &
                               coeffs_set, coeffs_kind, &
                               is_assoc_atomic_block_global, do_periodic, &
                               load_balance_parameter, kind_of, basis_parameter, pmax_set, &
                               pmax_atom, i_thread, n_threads, cell, &
                               do_p_screening, map_atom_to_kind_atom, nkind, eval_type, &
                               pmax_block, use_virial)
      TYPE(hfx_type), POINTER                            :: x_data
      REAL(dp), INTENT(IN)                               :: eps_schwarz
      TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
      INTEGER, INTENT(IN)                                :: max_set
      TYPE(cp_para_env_type), POINTER                    :: para_env
      TYPE(hfx_screen_coeff_type), &
         DIMENSION(:, :, :, :), POINTER                  :: coeffs_set
      TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
         POINTER                                         :: coeffs_kind
      INTEGER, DIMENSION(:, :)                           :: is_assoc_atomic_block_global
      LOGICAL                                            :: do_periodic
      TYPE(hfx_load_balance_type), POINTER               :: load_balance_parameter
      INTEGER                                            :: kind_of(*)
      TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter
      TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: pmax_set
      REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_atom
      INTEGER, INTENT(IN)                                :: i_thread, n_threads
      TYPE(cell_type), POINTER                           :: cell
      LOGICAL, INTENT(IN)                                :: do_p_screening
      INTEGER, DIMENSION(:), POINTER                     :: map_atom_to_kind_atom
      INTEGER, INTENT(IN)                                :: nkind, eval_type
      REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_block
      LOGICAL, INTENT(IN)                                :: use_virial

      CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_load_balance', &
         routineP = moduleN//':'//routineN

      CHARACTER(LEN=512)                                 :: error_msg
      INTEGER :: block_size, current_block_id, data_from, dest, handle, handle_inner, &
         handle_range, i, iatom_block, iatom_end, iatom_start, ibin, icpu, j, jatom_block, &
         jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, &
         latom_start, mepos, my_process_id, n_processes, natom, nbins, nblocks, ncpu, &
         new_iatom_end, new_iatom_start, new_jatom_end, new_jatom_start, non_empty_blocks, &
         objective_block_size, objective_nblocks, req(2), source, total_blocks
      INTEGER(int_8) :: atom_block, cost_per_bin, cost_per_core, current_cost, &
         distribution_counter_end, distribution_counter_start, global_quartet_counter, &
         local_quartet_counter, self_cost_per_block, tmp_block, total_block_self_cost
      INTEGER(int_8), ALLOCATABLE, DIMENSION(:)          :: buffer_in, buffer_out
      INTEGER(int_8), DIMENSION(:), POINTER              :: local_cost_matrix, recbuffer, &
                                                            sendbuffer, swapbuffer
      INTEGER(int_8), DIMENSION(:), POINTER, SAVE        :: cost_matrix
      INTEGER(int_8), SAVE                               :: shm_global_quartet_counter, &
                                                            shm_local_quartet_counter
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: rcount, rdispl, tmp_index, tmp_pos, &
                                                            to_be_sorted
      INTEGER, DIMENSION(:), POINTER, SAVE               :: shm_distribution_vector
      INTEGER, SAVE                                      :: shm_nblocks
      LOGICAL                                            :: changed, last_bin_needs_to_be_filled, &
                                                            optimized
      LOGICAL, DIMENSION(:, :), POINTER, SAVE            :: atomic_pair_list
      REAL(dp)                                           :: coeffs_kind_max0, log10_eps_schwarz, &
                                                            log_2, pmax_blocks
      TYPE(hfx_block_range_type), DIMENSION(:), POINTER  :: blocks_guess, tmp_blocks, tmp_blocks2
      TYPE(hfx_block_range_type), DIMENSION(:), &
         POINTER, SAVE                                   :: shm_blocks
      TYPE(hfx_distribution), DIMENSION(:), POINTER      :: binned_dist, ptr_to_tmp_dist, tmp_dist
      TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
         SAVE                                            :: full_dist
      TYPE(pair_list_type)                               :: list_ij, list_kl
      TYPE(pair_set_list_type), ALLOCATABLE, &
         DIMENSION(:)                                    :: set_list_ij, set_list_kl

!$OMP BARRIER
!$OMP MASTER
      CALL timeset(routineN, handle)
!$OMP END MASTER
!$OMP BARRIER

      log10_eps_schwarz = LOG10(eps_schwarz)
      log_2 = LOG10(2.0_dp)
      coeffs_kind_max0 = MAXVAL(coeffs_kind(:, :)%x(2))
      ncpu = para_env%num_pe
      n_processes = ncpu*n_threads
      natom = SIZE(particle_set)

      block_size = load_balance_parameter%block_size
      ALLOCATE (set_list_ij((max_set*block_size)**2))
      ALLOCATE (set_list_kl((max_set*block_size)**2))

      IF (.NOT. load_balance_parameter%blocks_initialized) THEN
!$OMP BARRIER
!$OMP MASTER
         CALL timeset(routineN//"_range", handle_range)

         nblocks = MAX((natom + block_size - 1)/block_size, 1)
         ALLOCATE (blocks_guess(nblocks))
         ALLOCATE (tmp_blocks(natom))
         ALLOCATE (tmp_blocks2(natom))

         pmax_blocks = 0.0_dp
         SELECT CASE (eval_type)
         CASE (hfx_do_eval_energy)
            atomic_pair_list => x_data%atomic_pair_list
         CASE (hfx_do_eval_forces)
            atomic_pair_list => x_data%atomic_pair_list_forces
         END SELECT
         atomic_pair_list = .TRUE.
         CALL init_blocks(nkind, para_env, natom, block_size, nblocks, blocks_guess, &
                          list_ij, list_kl, set_list_ij, set_list_kl, &
                          particle_set, &
                          coeffs_set, coeffs_kind, &
                          is_assoc_atomic_block_global, do_periodic, &
                          kind_of, basis_parameter, pmax_set, pmax_atom, &
                          pmax_blocks, cell, &
                          do_p_screening, map_atom_to_kind_atom, eval_type, &
                          log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)

         total_block_self_cost = 0

         DO i = 1, nblocks
            total_block_self_cost = total_block_self_cost + blocks_guess(i)%cost
         END DO

         CALL mp_sum(total_block_self_cost, para_env%group)

         objective_block_size = load_balance_parameter%block_size
         objective_nblocks = MAX((natom + objective_block_size - 1)/objective_block_size, 1)

         self_cost_per_block = (total_block_self_cost + objective_nblocks - 1)/(objective_nblocks)

         DO i = 1, nblocks
            tmp_blocks2(i) = blocks_guess(i)
         END DO

         optimized = .FALSE.
         i = 0
         DO WHILE (.NOT. optimized)
            i = i + 1
            current_block_id = 0
            changed = .FALSE.
            DO atom_block = 1, nblocks
               current_block_id = current_block_id + 1
               iatom_start = tmp_blocks2(atom_block)%istart
               iatom_end = tmp_blocks2(atom_block)%iend
               IF (tmp_blocks2(atom_block)%cost > 1.5_dp*self_cost_per_block .AND. iatom_end - iatom_start > 0) THEN
                  changed = .TRUE.
                  new_iatom_start = iatom_start
                  new_iatom_end = (iatom_end - iatom_start + 1)/2 + iatom_start - 1
                  new_jatom_start = new_iatom_end + 1
                  new_jatom_end = iatom_end
                  tmp_blocks(current_block_id)%istart = new_iatom_start
                  tmp_blocks(current_block_id)%iend = new_iatom_end
                  tmp_blocks(current_block_id)%cost = estimate_block_cost( &
                                                      natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
                                                      new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
                                                      new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
                                                      particle_set, &
                                                      coeffs_set, coeffs_kind, &
                                                      is_assoc_atomic_block_global, do_periodic, &
                                                      kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
                                                      cell, &
                                                      do_p_screening, map_atom_to_kind_atom, eval_type, &
                                                      log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
                  current_block_id = current_block_id + 1
                  tmp_blocks(current_block_id)%istart = new_jatom_start
                  tmp_blocks(current_block_id)%iend = new_jatom_end
                  tmp_blocks(current_block_id)%cost = estimate_block_cost( &
                                                      natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
                                                      new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
                                                      new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
                                                      particle_set, &
                                                      coeffs_set, coeffs_kind, &
                                                      is_assoc_atomic_block_global, do_periodic, &
                                                      kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
                                                      cell, &
                                                      do_p_screening, map_atom_to_kind_atom, eval_type, &
                                                      log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
               ELSE
                  tmp_blocks(current_block_id)%istart = iatom_start
                  tmp_blocks(current_block_id)%iend = iatom_end
                  tmp_blocks(current_block_id)%cost = tmp_blocks2(atom_block)%cost
               END IF
            END DO
            IF (.NOT. changed) optimized = .TRUE.
            IF (i > 20) optimized = .TRUE.
            nblocks = current_block_id
            DO atom_block = 1, nblocks
               tmp_blocks2(atom_block) = tmp_blocks(atom_block)
            END DO
         END DO

         DEALLOCATE (tmp_blocks2)

         ! ** count number of non empty blocks on each node
         non_empty_blocks = 0
         DO atom_block = 1, nblocks
            IF (tmp_blocks(atom_block)%istart == 0) CYCLE
            non_empty_blocks = non_empty_blocks + 1
         END DO

         ALLOCATE (rcount(ncpu))
         rcount = 0
         rcount(para_env%mepos + 1) = non_empty_blocks
         CALL mp_sum(rcount, para_env%group)

         ! ** sum all non_empty_blocks
         total_blocks = 0
         DO i = 1, ncpu
            total_blocks = total_blocks + rcount(i)
         END DO

         ! ** calculate offsets
         ALLOCATE (rdispl(ncpu))
         rcount(:) = rcount(:)*3
         rdispl(1) = 0
         DO i = 2, ncpu
            rdispl(i) = rdispl(i - 1) + rcount(i - 1)
         END DO

         ALLOCATE (buffer_in(3*non_empty_blocks))

         non_empty_blocks = 0
         DO atom_block = 1, nblocks
            IF (tmp_blocks(atom_block)%istart == 0) CYCLE
            buffer_in(non_empty_blocks*3 + 1) = tmp_blocks(atom_block)%istart
            buffer_in(non_empty_blocks*3 + 2) = tmp_blocks(atom_block)%iend
            buffer_in(non_empty_blocks*3 + 3) = tmp_blocks(atom_block)%cost
            non_empty_blocks = non_empty_blocks + 1
         END DO

         nblocks = total_blocks

         ALLOCATE (tmp_blocks2(nblocks))

         ALLOCATE (buffer_out(3*nblocks))

         ! ** Gather all three arrays
         CALL mp_allgather(buffer_in, buffer_out, rcount, rdispl, para_env%group)

         DO i = 1, nblocks
            tmp_blocks2(i)%istart = INT(buffer_out((i - 1)*3 + 1))
            tmp_blocks2(i)%iend = INT(buffer_out((i - 1)*3 + 2))
            tmp_blocks2(i)%cost = buffer_out((i - 1)*3 + 3)
         END DO

         ! ** Now we sort the blocks
         ALLOCATE (to_be_sorted(nblocks))
         ALLOCATE (tmp_index(nblocks))

         DO atom_block = 1, nblocks
            to_be_sorted(atom_block) = tmp_blocks2(atom_block)%istart
         END DO

         CALL sort(to_be_sorted, nblocks, tmp_index)

         ALLOCATE (x_data%blocks(nblocks))

         DO atom_block = 1, nblocks
            x_data%blocks(atom_block) = tmp_blocks2(tmp_index(atom_block))
         END DO

         shm_blocks => x_data%blocks
         shm_nblocks = nblocks

         ! ** Set nblocks in structure
         load_balance_parameter%nblocks = nblocks

         DEALLOCATE (blocks_guess, tmp_blocks, tmp_blocks2)

         DEALLOCATE (rcount, rdispl, buffer_in, buffer_out, to_be_sorted, tmp_index)

         load_balance_parameter%blocks_initialized = .TRUE.

         x_data%blocks = shm_blocks
         load_balance_parameter%nblocks = shm_nblocks
         load_balance_parameter%blocks_initialized = .TRUE.

         ALLOCATE (x_data%pmax_block(shm_nblocks, shm_nblocks))
         x_data%pmax_block = 0.0_dp
         pmax_block => x_data%pmax_block
         CALL timestop(handle_range)
!$OMP END MASTER
!$OMP BARRIER

         IF (.NOT. load_balance_parameter%blocks_initialized) THEN
            ALLOCATE (x_data%blocks(shm_nblocks))
            x_data%blocks = shm_blocks
            load_balance_parameter%nblocks = shm_nblocks
            load_balance_parameter%blocks_initialized = .TRUE.
         END IF
         !! ** precalculate maximum density matrix elements in blocks
!$OMP BARRIER
      END IF

!$OMP BARRIER
!$OMP MASTER
      pmax_block => x_data%pmax_block
      pmax_block = 0.0_dp
      IF (do_p_screening) THEN
         DO iatom_block = 1, shm_nblocks
            iatom_start = x_data%blocks(iatom_block)%istart
            iatom_end = x_data%blocks(iatom_block)%iend
            DO jatom_block = 1, shm_nblocks
               jatom_start = x_data%blocks(jatom_block)%istart
               jatom_end = x_data%blocks(jatom_block)%iend
               pmax_block(iatom_block, jatom_block) = MAXVAL(pmax_atom(iatom_start:iatom_end, jatom_start:jatom_end))
            END DO
         END DO
      END IF

      SELECT CASE (eval_type)
      CASE (hfx_do_eval_energy)
         atomic_pair_list => x_data%atomic_pair_list
      CASE (hfx_do_eval_forces)
         atomic_pair_list => x_data%atomic_pair_list_forces
      END SELECT
      CALL build_atomic_pair_list(natom, atomic_pair_list, kind_of, basis_parameter, particle_set, &
                                  do_periodic, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, &
                                  x_data%blocks)

!$OMP END MASTER
!$OMP BARRIER

      !! If there is only 1 cpu skip the binning
      IF (n_processes == 1) THEN
         ALLOCATE (tmp_dist(1))
         tmp_dist(1)%number_of_atom_quartets = HUGE(tmp_dist(1)%number_of_atom_quartets)
         tmp_dist(1)%istart = 0_int_8
         ptr_to_tmp_dist => tmp_dist(:)
         SELECT CASE (eval_type)
         CASE (hfx_do_eval_energy)
            CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
         CASE (hfx_do_eval_forces)
            CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
         END SELECT
         DEALLOCATE (tmp_dist)
      ELSE
         !! Calculate total numbers of integrals that have to be calculated (wrt screening and symmetry)
!$OMP BARRIER
!$OMP MASTER
         CALL timeset(routineN//"_count", handle_inner)
!$OMP END MASTER
!$OMP BARRIER

         cost_per_core = 0_int_8
         my_process_id = para_env%mepos*n_threads + i_thread
         nblocks = load_balance_parameter%nblocks

         DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes

            latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1
            tmp_block = atom_block/nblocks
            katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
            IF (latom_block < katom_block) CYCLE
            tmp_block = tmp_block/nblocks
            jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
            tmp_block = tmp_block/nblocks
            iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
            IF (jatom_block < iatom_block) CYCLE

            iatom_start = x_data%blocks(iatom_block)%istart
            iatom_end = x_data%blocks(iatom_block)%iend
            jatom_start = x_data%blocks(jatom_block)%istart
            jatom_end = x_data%blocks(jatom_block)%iend
            katom_start = x_data%blocks(katom_block)%istart
            katom_end = x_data%blocks(katom_block)%iend
            latom_start = x_data%blocks(latom_block)%istart
            latom_end = x_data%blocks(latom_block)%iend

            SELECT CASE (eval_type)
            CASE (hfx_do_eval_energy)
               pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
                                 pmax_block(latom_block, jatom_block), &
                                 pmax_block(latom_block, iatom_block), &
                                 pmax_block(katom_block, jatom_block))
            CASE (hfx_do_eval_forces)
               pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
                                 pmax_block(latom_block, jatom_block), &
                                 pmax_block(latom_block, iatom_block) + &
                                 pmax_block(katom_block, jatom_block))
            END SELECT

            IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE

            cost_per_core = cost_per_core &
                            + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
                                                  iatom_start, iatom_end, jatom_start, jatom_end, &
                                                  katom_start, katom_end, latom_start, latom_end, &
                                                  particle_set, &
                                                  coeffs_set, coeffs_kind, &
                                                  is_assoc_atomic_block_global, do_periodic, &
                                                  kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
                                                  cell, &
                                                  do_p_screening, map_atom_to_kind_atom, eval_type, &
                                                  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)

         END DO ! atom_block

         nbins = load_balance_parameter%nbins
         cost_per_bin = (cost_per_core + nbins - 1)/(nbins)

!$OMP BARRIER
!$OMP MASTER
         CALL timestop(handle_inner)
!$OMP END MASTER
!$OMP BARRIER

! new load balancing test
         IF (.FALSE.) THEN
            CALL hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
                                            natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
                                            particle_set, &
                                            coeffs_set, coeffs_kind, &
                                            is_assoc_atomic_block_global, do_periodic, &
                                            kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
                                            cell, x_data, para_env, pmax_block, &
                                            do_p_screening, map_atom_to_kind_atom, eval_type, &
                                            log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
         END IF

!$OMP BARRIER
!$OMP MASTER
         CALL timeset(routineN//"_bin", handle_inner)
!$OMP END MASTER
!$OMP BARRIER

         ALLOCATE (binned_dist(nbins))
         binned_dist(:)%istart = -1_int_8
         binned_dist(:)%number_of_atom_quartets = 0_int_8
         binned_dist(:)%cost = 0_int_8
         binned_dist(:)%time_first_scf = 0.0_dp
         binned_dist(:)%time_other_scf = 0.0_dp
         binned_dist(:)%time_forces = 0.0_dp

         current_cost = 0
         mepos = 1
         distribution_counter_start = 1
         distribution_counter_end = 0
         ibin = 1

         global_quartet_counter = 0
         local_quartet_counter = 0
         last_bin_needs_to_be_filled = .FALSE.
         DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes
            latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1
            tmp_block = atom_block/nblocks
            katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
            IF (latom_block < katom_block) CYCLE
            tmp_block = tmp_block/nblocks
            jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
            tmp_block = tmp_block/nblocks
            iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
            IF (jatom_block < iatom_block) CYCLE

            distribution_counter_end = distribution_counter_end + 1
            global_quartet_counter = global_quartet_counter + 1
            last_bin_needs_to_be_filled = .TRUE.

            IF (binned_dist(ibin)%istart == -1_int_8) binned_dist(ibin)%istart = atom_block

            iatom_start = x_data%blocks(iatom_block)%istart
            iatom_end = x_data%blocks(iatom_block)%iend
            jatom_start = x_data%blocks(jatom_block)%istart
            jatom_end = x_data%blocks(jatom_block)%iend
            katom_start = x_data%blocks(katom_block)%istart
            katom_end = x_data%blocks(katom_block)%iend
            latom_start = x_data%blocks(latom_block)%istart
            latom_end = x_data%blocks(latom_block)%iend

            SELECT CASE (eval_type)
            CASE (hfx_do_eval_energy)
               pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
                                 pmax_block(latom_block, jatom_block), &
                                 pmax_block(latom_block, iatom_block), &
                                 pmax_block(katom_block, jatom_block))
            CASE (hfx_do_eval_forces)
               pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
                                 pmax_block(latom_block, jatom_block), &
                                 pmax_block(latom_block, iatom_block) + &
                                 pmax_block(katom_block, jatom_block))
            END SELECT

            IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE

            current_cost = current_cost &
                           + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
                                                 iatom_start, iatom_end, jatom_start, jatom_end, &
                                                 katom_start, katom_end, latom_start, latom_end, &
                                                 particle_set, &
                                                 coeffs_set, coeffs_kind, &
                                                 is_assoc_atomic_block_global, do_periodic, &
                                                 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
                                                 cell, &
                                                 do_p_screening, map_atom_to_kind_atom, eval_type, &
                                                 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)

            IF (current_cost >= cost_per_bin) THEN
               IF (ibin == nbins) THEN
                  binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
                                                              distribution_counter_end - distribution_counter_start + 1
               ELSE
                  binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
               END IF
               binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
               ibin = MIN(ibin + 1, nbins)
               distribution_counter_start = distribution_counter_end + 1
               current_cost = 0
               last_bin_needs_to_be_filled = .FALSE.
            END IF
         ENDDO

!$OMP BARRIER
!$OMP MASTER
         CALL timestop(handle_inner)
         CALL timeset(routineN//"_dist", handle_inner)
!$OMP END MASTER
!$OMP BARRIER
         !! Fill the last bin if necessary
         IF (last_bin_needs_to_be_filled) THEN
            binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
            IF (ibin == nbins) THEN
               binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
                                                           distribution_counter_end - distribution_counter_start + 1
            ELSE
               binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
            END IF
         END IF

         !! Sanity-Check
         DO ibin = 1, nbins
            local_quartet_counter = local_quartet_counter + binned_dist(ibin)%number_of_atom_quartets
         END DO
!$OMP BARRIER
!$OMP MASTER
         shm_local_quartet_counter = 0
         shm_global_quartet_counter = 0
!$OMP END MASTER
!$OMP BARRIER
!$OMP ATOMIC
         shm_local_quartet_counter = shm_local_quartet_counter + local_quartet_counter
!$OMP ATOMIC
         shm_global_quartet_counter = shm_global_quartet_counter + global_quartet_counter

!$OMP BARRIER
!$OMP MASTER
         CALL mp_sum(shm_local_quartet_counter, para_env%group)
         CALL mp_sum(shm_global_quartet_counter, para_env%group)
         IF (para_env%ionode) THEN
            IF (shm_local_quartet_counter /= shm_global_quartet_counter) THEN
               WRITE (error_msg, '(A,I0,A,I0,A)') "HFX Sanity check for parallel distribution failed. "// &
                  "Number of local quartets (", shm_local_quartet_counter, &
                  ") and number of global quartets (", shm_global_quartet_counter, &
                  ") are different. Please send in a bug report."
               CPABORT(error_msg)
            END IF
         END IF
!$OMP END MASTER

!$OMP BARRIER
!$OMP MASTER
         ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
         cost_matrix = 0
!$OMP END MASTER
!$OMP BARRIER
         icpu = para_env%mepos + 1
         DO i = 1, nbins
            cost_matrix((icpu - 1)*nbins*n_threads + i_thread*nbins + i) = binned_dist(i)%cost
         END DO
         mepos = para_env%mepos
!$OMP BARRIER

!$OMP MASTER
         ! sync before/after ring of isendrecv
         CALL mp_sync(para_env%group)

         ALLOCATE (sendbuffer(nbins*n_threads))
         ALLOCATE (recbuffer(nbins*n_threads))

         sendbuffer = cost_matrix(mepos*nbins*n_threads + 1:mepos*nbins*n_threads + nbins*n_threads)

         dest = MODULO(mepos + 1, ncpu)
         source = MODULO(mepos - 1, ncpu)
         DO icpu = 0, ncpu - 1
            IF (icpu .NE. ncpu - 1) THEN
               CALL mp_isendrecv(sendbuffer, dest, recbuffer, source, &
                                 para_env%group, req(1), req(2), 13)
            ENDIF
            data_from = MODULO(mepos - icpu, ncpu)
            cost_matrix(data_from*nbins*n_threads + 1:data_from*nbins*n_threads + nbins*n_threads) = sendbuffer
            IF (icpu .NE. ncpu - 1) THEN
               CALL mp_waitall(req)
            ENDIF
            swapbuffer => sendbuffer
            sendbuffer => recbuffer
            recbuffer => swapbuffer
         ENDDO
         DEALLOCATE (recbuffer, sendbuffer)
!$OMP END MASTER
!$OMP BARRIER

!$OMP BARRIER
!$OMP MASTER
         CALL timestop(handle_inner)
         CALL timeset(routineN//"_opt", handle_inner)
!$OMP END MASTER
!$OMP BARRIER

         !! Find an optimal distribution i.e. assign each element of the cost matrix to a certain process
!$OMP BARRIER
         ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
         local_cost_matrix = cost_matrix
!$OMP MASTER
         ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))

         CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
                                    shm_distribution_vector, x_data%load_balance_parameter%do_randomize)

         CALL timestop(handle_inner)
         CALL timeset(routineN//"_redist", handle_inner)
         !! Collect local data to global array
         ALLOCATE (full_dist(ncpu*n_threads, nbins))

         full_dist(:, :)%istart = 0_int_8
         full_dist(:, :)%number_of_atom_quartets = 0_int_8
         full_dist(:, :)%cost = 0_int_8
         full_dist(:, :)%time_first_scf = 0.0_dp
         full_dist(:, :)%time_other_scf = 0.0_dp
         full_dist(:, :)%time_forces = 0.0_dp
!$OMP END MASTER
!$OMP BARRIER
         mepos = para_env%mepos + 1
         full_dist((mepos - 1)*n_threads + i_thread + 1, :) = binned_dist(:)

!$OMP BARRIER
!$OMP MASTER
         ALLOCATE (sendbuffer(3*nbins*n_threads))
         ALLOCATE (recbuffer(3*nbins*n_threads))
         mepos = para_env%mepos
         DO j = 1, n_threads
            DO i = 1, nbins
               sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
               sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
               sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
            END DO
         END DO

         ! sync before/after ring of isendrecv
         CALL mp_sync(para_env%group)
         dest = MODULO(mepos + 1, ncpu)
         source = MODULO(mepos - 1, ncpu)
         DO icpu = 0, ncpu - 1
            IF (icpu .NE. ncpu - 1) THEN
               CALL mp_isendrecv(sendbuffer, dest, recbuffer, source, &
                                 para_env%group, req(1), req(2), 13)
            ENDIF
            data_from = MODULO(mepos - icpu, ncpu)
            DO j = 1, n_threads
               DO i = 1, nbins
                  full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1)
                  full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2)
                  full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3)
               END DO
            END DO

            IF (icpu .NE. ncpu - 1) THEN
               CALL mp_waitall(req)
            ENDIF
            swapbuffer => sendbuffer
            sendbuffer => recbuffer
            recbuffer => swapbuffer
         ENDDO
         DEALLOCATE (recbuffer, sendbuffer)

         ! sync before/after ring of isendrecv
         CALL mp_sync(para_env%group)
!$OMP END MASTER
!$OMP BARRIER
         !! reorder the distribution according to the distribution vector
         ALLOCATE (tmp_pos(ncpu*n_threads))
         tmp_pos = 1
         ALLOCATE (tmp_dist(nbins*ncpu*n_threads))

         tmp_dist(:)%istart = 0_int_8
         tmp_dist(:)%number_of_atom_quartets = 0_int_8
         tmp_dist(:)%cost = 0_int_8
         tmp_dist(:)%time_first_scf = 0.0_dp
         tmp_dist(:)%time_other_scf = 0.0_dp
         tmp_dist(:)%time_forces = 0.0_dp

         DO icpu = 1, n_processes
            DO i = 1, nbins
               mepos = my_process_id + 1
               IF (shm_distribution_vector((icpu - 1)*nbins + i) == mepos) THEN
                  tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
                  tmp_pos(mepos) = tmp_pos(mepos) + 1
               END IF
            END DO
         END DO

         !! Assign the load to each process
         NULLIFY (ptr_to_tmp_dist)
         mepos = my_process_id + 1
         ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
         SELECT CASE (eval_type)
         CASE (hfx_do_eval_energy)
            CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
         CASE (hfx_do_eval_forces)
            CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
         END SELECT

!$OMP BARRIER
!$OMP MASTER
         DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
!$OMP END MASTER
!$OMP BARRIER
         DEALLOCATE (tmp_dist, tmp_pos)
         DEALLOCATE (binned_dist, local_cost_matrix)
         DEALLOCATE (set_list_ij, set_list_kl)

!$OMP BARRIER
!$OMP MASTER
         CALL timestop(handle_inner)
!$OMP END MASTER
!$OMP BARRIER
      END IF
!$OMP BARRIER
!$OMP MASTER
      CALL timestop(handle)
!$OMP END MASTER
!$OMP BARRIER
   END SUBROUTINE hfx_load_balance

! **************************************************************************************************
!> \brief Reference implementation of new recursive load balancing routine
!>        Computes a local list of atom_blocks (p_atom_blocks,q_atom_blocks) for
!>        each process in a P-Q grid such that every process has more or less the
!>        same amount of work. Has no output at the moment (not used) but writes
!>        its computed load balance values into a file. Possible output is ready
!>        to use in the two arrays p_atom_blocks & q_atom_blocks
!> \param n_processes ...
!> \param my_process_id ...
!> \param nblocks ...
!> \param natom ...
!> \param nkind ...
!> \param list_ij ...
!> \param list_kl ...
!> \param set_list_ij ...
!> \param set_list_kl ...
!> \param particle_set ...
!> \param coeffs_set ...
!> \param coeffs_kind ...
!> \param is_assoc_atomic_block_global ...
!> \param do_periodic ...
!> \param kind_of ...
!> \param basis_parameter ...
!> \param pmax_set ...
!> \param pmax_atom ...
!> \param pmax_blocks ...
!> \param cell ...
!> \param x_data ...
!> \param para_env ...
!> \param pmax_block ...
!> \param do_p_screening ...
!> \param map_atom_to_kind_atom ...
!> \param eval_type ...
!> \param log10_eps_schwarz ...
!> \param log_2 ...
!> \param coeffs_kind_max0 ...
!> \param use_virial ...
!> \param atomic_pair_list ...
!> \par History
!>      03.2011 created [Michael Steinlechner]
!> \author Michael Steinlechner
! **************************************************************************************************

   SUBROUTINE hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
                                         natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
                                         particle_set, &
                                         coeffs_set, coeffs_kind, &
                                         is_assoc_atomic_block_global, do_periodic, &
                                         kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
                                         cell, x_data, para_env, pmax_block, &
                                         do_p_screening, map_atom_to_kind_atom, eval_type, &
                                         log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)

! input variables:
      INTEGER, INTENT(IN)                                :: n_processes, my_process_id, nblocks, &
                                                            natom, nkind
      TYPE(pair_list_type), INTENT(IN)                   :: list_ij, list_kl
      TYPE(pair_set_list_type), ALLOCATABLE, &
         DIMENSION(:), INTENT(IN)                        :: set_list_ij, set_list_kl
      TYPE(particle_type), DIMENSION(:), INTENT(IN), &
         POINTER                                         :: particle_set
      TYPE(hfx_screen_coeff_type), &
         DIMENSION(:, :, :, :), INTENT(IN), POINTER      :: coeffs_set
      TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
         INTENT(IN), POINTER                             :: coeffs_kind
      INTEGER, DIMENSION(:, :), INTENT(IN)               :: is_assoc_atomic_block_global
      LOGICAL, INTENT(IN)                                :: do_periodic
      INTEGER, INTENT(IN)                                :: kind_of(*)
      TYPE(hfx_basis_type), DIMENSION(:), INTENT(IN), &
         POINTER                                         :: basis_parameter
      TYPE(hfx_p_kind), DIMENSION(:), INTENT(IN), &
         POINTER                                         :: pmax_set
      REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER     :: pmax_atom
      REAL(dp)                                           :: pmax_blocks
      TYPE(cell_type), INTENT(IN), POINTER               :: cell
      TYPE(hfx_type), INTENT(IN), POINTER                :: x_data
      TYPE(cp_para_env_type), INTENT(IN), POINTER        :: para_env
      REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER     :: pmax_block
      LOGICAL, INTENT(IN)                                :: do_p_screening
      INTEGER, DIMENSION(:), INTENT(IN), POINTER         :: map_atom_to_kind_atom
      INTEGER, INTENT(IN)                                :: eval_type
      REAL(dp), INTENT(IN)                               :: log10_eps_schwarz, log_2, &
                                                            coeffs_kind_max0
      LOGICAL, INTENT(IN)                                :: use_virial
      LOGICAL, DIMENSION(:, :), INTENT(IN), POINTER      :: atomic_pair_list

      CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_recursive_load_balance', &
         routineP = moduleN//':'//routineN

      INTEGER :: handle, i, iatom_block, iatom_end, iatom_start, j, jatom_block, jatom_end, &
         jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, latom_start, &
         nP, nQ, numBins, p, q, sizeP, sizeQ, unit_nr
      INTEGER(int_8)                                     :: local_cost, pidx, qidx, sumP, sumQ
      INTEGER(int_8), ALLOCATABLE, DIMENSION(:)          :: local_cost_vector
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: blocksize, p_atom_blocks, permute, &
                                                            q_atom_blocks
      REAL(dp)                                           :: maximum, mean

! internal variables:

!$OMP BARRIER
!$OMP MASTER
      CALL timeset(routineN, handle)
!$OMP END MASTER
!$OMP BARRIER

      ! calculate best p/q distribution grid for the n_processes
      CALL hfx_calculate_PQ(p, q, numBins, n_processes)

      ALLOCATE (blocksize(numBins))
      ALLOCATE (permute(nblocks**2))
      DO i = 1, nblocks**2
         permute(i) = i
      END DO

      ! call the main recursive permutation routine.
      ! Output:
      !   blocksize :: vector (size numBins) with the sizes for each column/row block
      !   permute   :: permutation vector
      CALL hfx_recursive_permute(blocksize, 1, nblocks**2, numBins, &
                                 permute, 1, &
                                 my_process_id, n_processes, nblocks, &
                                 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
                                 particle_set, &
                                 coeffs_set, coeffs_kind, &
                                 is_assoc_atomic_block_global, do_periodic, &
                                 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
                                 cell, x_data, para_env, pmax_block, &
                                 do_p_screening, map_atom_to_kind_atom, eval_type, &
                                 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)

      ! number of blocks per processor in p-direction (vertical)
      nP = numBins/p
      ! number of blocks per processor in q-direction (horizontal)
      nQ = numBins/q

      ! calc own position in P-Q-processor grid (PQ-grid is column-major)
      pidx = MODULO(INT(my_process_id), INT(p)) + 1
      qidx = my_process_id/p + 1

      sizeP = SUM(blocksize((nP*(pidx - 1) + 1):(nP*pidx)))
      sizeQ = SUM(blocksize((nQ*(qidx - 1) + 1):(nQ*qidx)))

      sumP = SUM(blocksize(1:(nP*(pidx - 1))))
      sumQ = SUM(blocksize(1:(nQ*(qidx - 1))))

      ALLOCATE (p_atom_blocks(sizeP))
      ALLOCATE (q_atom_blocks(sizeQ))

      p_atom_blocks(:) = permute((sumP + 1):(sumP + sizeP))
      q_atom_blocks(:) = permute((sumQ + 1):(sumQ + sizeQ))

      ! from here on, we are actually finished, each process has been
      ! assigned a (p_atom_blocks,q_atom_blocks) pair list.
      ! what follows is just a small routine to calculate the local cost
      ! for each processor which is then written to a file.

      ! calculate local cost for each processor!
      ! ****************************************
      local_cost = 0
      DO i = 1, sizeP
         DO j = 1, sizeQ

            !       get corresponding 4D block indices out of our own P-Q-block
            latom_block = MODULO(q_atom_blocks(j), nblocks)
            iatom_block = q_atom_blocks(j)/nblocks + 1
            jatom_block = MODULO(p_atom_blocks(i), nblocks)
            katom_block = p_atom_blocks(i)/nblocks + 1

            !       symmetry checks.
            IF (latom_block < katom_block) CYCLE
            IF (jatom_block < iatom_block) CYCLE

            iatom_start = x_data%blocks(iatom_block)%istart
            iatom_end = x_data%blocks(iatom_block)%iend
            jatom_start = x_data%blocks(jatom_block)%istart
            jatom_end = x_data%blocks(jatom_block)%iend
            katom_start = x_data%blocks(katom_block)%istart
            katom_end = x_data%blocks(katom_block)%iend
            latom_start = x_data%blocks(latom_block)%istart
            latom_end = x_data%blocks(latom_block)%iend

            !       whatever.
            SELECT CASE (eval_type)
            CASE (hfx_do_eval_energy)
               pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
                                 pmax_block(latom_block, jatom_block), &
                                 pmax_block(latom_block, iatom_block), &
                                 pmax_block(katom_block, jatom_block))
            CASE (hfx_do_eval_forces)
               pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
                                 pmax_block(latom_block, jatom_block), &
                                 pmax_block(latom_block, iatom_block) + &
                                 pmax_block(katom_block, jatom_block))
            END SELECT

            !       screening.
            IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE

            !       estimate the cost of this atom_block.
            local_cost = local_cost + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
                                                          set_list_kl, &
                                                          iatom_start, iatom_end, jatom_start, jatom_end, &
                                                          katom_start, katom_end, latom_start, latom_end, &
                                                          particle_set, &
                                                          coeffs_set, coeffs_kind, &
                                                          is_assoc_atomic_block_global, do_periodic, &
                                                          kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
                                                          cell, &
                                                          do_p_screening, map_atom_to_kind_atom, eval_type, &
                                                          log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
         END DO
      END DO

      ALLOCATE (local_cost_vector(n_processes))
      local_cost_vector = 0
      local_cost_vector(my_process_id + 1) = local_cost
      CALL mp_sum(local_cost_vector, para_env%group)

      mean = SUM(local_cost_vector)/n_processes
      maximum = MAXVAL(local_cost_vector)

!$OMP     BARRIER
!$OMP     MASTER
      ! only output once
      IF (my_process_id == 0) THEN
         CALL open_file(unit_number=unit_nr, file_name="loads.dat")
         WRITE (unit_nr, *) 'maximum cost:', maximum
         WRITE (unit_nr, *) 'mean cost:', mean
         WRITE (unit_nr, *) 'load balance ratio max/mean: ', maximum/mean
         WRITE (unit_nr, *) '-------- detailed per-process costs ---------'
         DO i = 1, n_processes
            WRITE (unit_nr, *) local_cost_vector(i)
         END DO
         CALL close_file(unit_nr)
      END IF
!$OMP     END MASTER
!$OMP     BARRIER

      DEALLOCATE (local_cost_vector)
      DEALLOCATE (p_atom_blocks, q_atom_blocks)
      DEALLOCATE (blocksize, permute)

!$OMP BARRIER
!$OMP MASTER
      CALL timestop(handle)
!$OMP END MASTER
!$OMP BARRIER

   END SUBROUTINE hfx_recursive_load_balance

! **************************************************************************************************
!> \brief Small routine to calculate the optimal P-Q-processor grid distribution
!>        for a given number of processors N
!>        and the corresponding number of Bins for the load balancing routine
!> \param p     number of rows on P-Q process grid (output)
!> \param q     number of columns on P-Q process grid (output)
!> \param nBins number of Bins (output)
!> \param N     number of processes (input)
!> \par History
!>      03.2011 created [Michael Steinlechner]
!> \author Michael Steinlechner
! **************************************************************************************************
   SUBROUTINE hfx_calculate_PQ(p, q, nBins, N)

      INTEGER, INTENT(OUT)                               :: p, q, nBins
      INTEGER, INTENT(IN)                                :: N

      INTEGER                                            :: a, b, k
      REAL(dp)                                           :: sqN

      k = 2
      sqN = SQRT(REAL(N, KIND=dp))
      p = 1

      DO WHILE (REAL(k, KIND=dp) <= sqN)
         IF (MODULO(N, k) == 0) THEN
            p = k
         END IF
         k = k + 1
      END DO
      q = N/p

      ! now compute the least common multiple of p & q to get the number of necessary bins
      ! compute using the relation LCM(p,q) = abs(p*q) / GCD(p,q)
      ! and use euclid's algorithm for GCD computation.
      a = p
      b = q

      DO WHILE (b .NE. 0)
         IF (a > b) THEN
            a = a - b
         ELSE
            b = b - a
         END IF
      END DO
      ! gcd(p,q) is now saved in a

      nBins = p*q/a

   END SUBROUTINE

! **************************************************************************************************
!> \brief Recursive permutation routine for the load balancing of the integral
!>       computation
!> \param blocksize     vector of blocksizes, size(nProc), which contains for
!>                      each process the local blocksize (OUTPUT)
!> \param blockstart    starting row/column idx of the block which is to be examined
!>                      at this point (INPUT)
!> \param blockend      ending row/column idx of the block which is to be examined
!>                      (INPUT)
!> \param nProc_in      number of bins into which the current block has to be divided
!>                      (INPUT)
!> \param permute       permutation vector which balances column/row cost
!>                      size(nblocks^2). (OUTPUT)
!> \param step ...
!> \param my_process_id ...
!> \param n_processes ...
!> \param nblocks ...
!> \param natom ...
!> \param nkind ...
!> \param list_ij ...
!> \param list_kl ...
!> \param set_list_ij ...
!> \param set_list_kl ...
!> \param particle_set ...
!> \param coeffs_set ...
!> \param coeffs_kind ...
!> \param is_assoc_atomic_block_global ...
!> \param do_periodic ...
!> \param kind_of ...
!> \param basis_parameter ...
!> \param pmax_set ...
!> \param pmax_atom ...
!> \param pmax_blocks ...
!> \param cell ...
!> \param x_data ...
!> \param para_env ...
!> \param pmax_block ...
!> \param do_p_screening ...
!> \param map_atom_to_kind_atom ...
!> \param eval_type ...
!> \param log10_eps_schwarz ...
!> \param log_2 ...
!> \param coeffs_kind_max0 ...
!> \param use_virial ...
!> \param atomic_pair_list ...
!> \par History
!>      03.2011 created [Michael Steinlechner]
!> \author Michael Steinlechner
! **************************************************************************************************
   RECURSIVE SUBROUTINE hfx_recursive_permute(blocksize, blockstart, blockend, nProc_in, &
                                              permute, step, &
                                              my_process_id, n_processes, nblocks, &
                                              natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
                                              particle_set, &
                                              coeffs_set, coeffs_kind, &
                                              is_assoc_atomic_block_global, do_periodic, &
                                              kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
                                              cell, x_data, para_env, pmax_block, &
                                              do_p_screening, map_atom_to_kind_atom, eval_type, &
                                              log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)

      INTEGER                                            :: nProc_in, blockend, blockstart
      INTEGER, DIMENSION(nProc_in)                       :: blocksize
      INTEGER                                            :: nblocks, n_processes, my_process_id
      INTEGER, INTENT(IN)                                :: step
      INTEGER, DIMENSION(nblocks*nblocks)                :: permute
      INTEGER                                            :: natom
      INTEGER, INTENT(IN)                                :: nkind
      TYPE(pair_list_type)                               :: list_ij, list_kl
      TYPE(pair_set_list_type), ALLOCATABLE, &
         DIMENSION(:)                                    :: set_list_ij, set_list_kl
      TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
      TYPE(hfx_screen_coeff_type), &
         DIMENSION(:, :, :, :), POINTER                  :: coeffs_set
      TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
         POINTER                                         :: coeffs_kind
      INTEGER, DIMENSION(:, :)                           :: is_assoc_atomic_block_global
      LOGICAL                                            :: do_periodic
      INTEGER                                            :: kind_of(*)
      TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter
      TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: pmax_set
      REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_atom
      REAL(dp)                                           :: pmax_blocks
      TYPE(cell_type), POINTER                           :: cell
      TYPE(hfx_type), POINTER                            :: x_data
      TYPE(cp_para_env_type), POINTER                    :: para_env
      REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_block
      LOGICAL, INTENT(IN)                                :: do_p_screening
      INTEGER, DIMENSION(:), POINTER                     :: map_atom_to_kind_atom
      INTEGER, INTENT(IN)                                :: eval_type
      REAL(dp)                                           :: log10_eps_schwarz, log_2, &
                                                            coeffs_kind_max0
      LOGICAL, INTENT(IN)                                :: use_virial
      LOGICAL, DIMENSION(:, :), POINTER                  :: atomic_pair_list

      INTEGER :: col, endoffset, i, iatom_block, iatom_end, iatom_start, idx, inv_perm, &
         jatom_block, jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, &
         latom_end, latom_start, nbins, nProc, row, startoffset
      INTEGER(int_8)                                     :: atom_block, tmp_block
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: ithblocksize, localblocksize
      INTEGER, DIMENSION(blockend-blockstart+1)          :: bin_perm, tmp_perm
      REAL(dp)                                           :: partialcost
      REAL(dp), DIMENSION(nblocks*nblocks)               :: cost_vector

      nProc = nProc_in
      cost_vector = 0.0_dp

!   loop over local atom_blocks.
      DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes

!       get corresponding 4D block indices
         latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1
         tmp_block = atom_block/nblocks
         katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
         IF (latom_block < katom_block) CYCLE
         tmp_block = tmp_block/nblocks
         jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
         tmp_block = tmp_block/nblocks
         iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
         IF (jatom_block < iatom_block) CYCLE

!       get 2D indices of this atom_block (with permutation applied)
!       for this, we need to invert the permutation, this means
!       find position in permutation vector where value==idx

         row = (katom_block - 1)*nblocks + jatom_block
         inv_perm = 1
         DO WHILE (permute(inv_perm) .NE. row)
            inv_perm = inv_perm + 1
         END DO
         row = inv_perm

         col = (iatom_block - 1)*nblocks + latom_block
         inv_perm = 1
         DO WHILE (permute(inv_perm) .NE. col)
            inv_perm = inv_perm + 1
         END DO
         col = inv_perm

!       if row/col outside our current diagonal block, skip calculation.
         IF (col < blockstart .OR. col > blockend) CYCLE
         IF (row < blockstart .OR. row > blockend) CYCLE

         iatom_start = x_data%blocks(iatom_block)%istart
         iatom_end = x_data%blocks(iatom_block)%iend
         jatom_start = x_data%blocks(jatom_block)%istart
         jatom_end = x_data%blocks(jatom_block)%iend
         katom_start = x_data%blocks(katom_block)%istart
         katom_end = x_data%blocks(katom_block)%iend
         latom_start = x_data%blocks(latom_block)%istart
         latom_end = x_data%blocks(latom_block)%iend

!       whatever.
         SELECT CASE (eval_type)
         CASE (hfx_do_eval_energy)
            pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
                              pmax_block(latom_block, jatom_block), &
                              pmax_block(latom_block, iatom_block), &
                              pmax_block(katom_block, jatom_block))
         CASE (hfx_do_eval_forces)
            pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
                              pmax_block(latom_block, jatom_block), &
                              pmax_block(latom_block, iatom_block) + &
                              pmax_block(katom_block, jatom_block))
         END SELECT

!       screening.
         IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE

!       every second recursion step, compute row sum instead of column sum

         IF (MODULO(step, 2) .EQ. 0) THEN
            idx = row
         ELSE
            idx = col
         END IF

!       estimate the cost of this atom_block.
         partialcost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
                                           set_list_kl, &
                                           iatom_start, iatom_end, jatom_start, jatom_end, &
                                           katom_start, katom_end, latom_start, latom_end, &
                                           particle_set, &
                                           coeffs_set, coeffs_kind, &
                                           is_assoc_atomic_block_global, do_periodic, &
                                           kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
                                           cell, &
                                           do_p_screening, map_atom_to_kind_atom, eval_type, &
                                           log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)

         cost_vector(idx) = cost_vector(idx) + partialcost
      END DO ! atom_block

!   sum costvector over all processes
      CALL mp_sum(cost_vector, para_env%group)

!   calculate next prime factor of nProc
      nBins = 2
      DO WHILE (MODULO(INT(nProc), INT(nBins)) .NE. 0)
         nBins = nBins + 1
      END DO

      nProc = nProc/nBins

! ... do the binning...

      ALLOCATE (localblocksize(nBins))
      CALL hfx_permute_binning(nBins, cost_vector(blockstart:blockend), blockend - blockstart + 1, bin_perm, localblocksize)

!... and update the permutation vector

      tmp_perm = permute(blockstart:blockend)
      permute(blockstart:blockend) = tmp_perm(bin_perm)

!   split recursion into the nBins Bins
      IF (nProc > 1) THEN
         ALLOCATE (ithblocksize(nProc))
         DO i = 1, nBins
            startoffset = SUM(localblocksize(1:(i - 1)))
            endoffset = SUM(localblocksize(1:i)) - 1

            CALL hfx_recursive_permute(ithblocksize, blockstart + startoffset, blockstart + endoffset, nProc, &
                                       permute, step + 1, &
                                       my_process_id, n_processes, nblocks, &
                                       natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
                                       particle_set, &
                                       coeffs_set, coeffs_kind, &
                                       is_assoc_atomic_block_global, do_periodic, &
                                       kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
                                       cell, x_data, para_env, pmax_block, &
                                       do_p_screening, map_atom_to_kind_atom, eval_type, &
                                       log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
            blocksize(((i - 1)*nProc + 1):(i*nProc)) = ithblocksize
         END DO
         DEALLOCATE (ithblocksize)
      ELSE
         DO i = 1, nBins
            blocksize(i) = localblocksize(i)
         END DO
      END IF

      DEALLOCATE (localblocksize)

   END SUBROUTINE hfx_recursive_permute

! **************************************************************************************************
!> \brief small binning routine for the recursive load balancing
!>
!> \param nBins         number of Bins (INPUT)
!> \param costvector    vector of current row/column costs which have to be binned (INPUT)
!> \param maxbinsize    upper bound for bin size (INPUT)
!> \param perm          resulting permutation due to be binning routine (OUTPUT)
!> \param block_count   vector of size(nbins) which contains the size of each bin (OUTPUT)
!> \par History
!>      03.2011 created [Michael Steinlechner]
!> \author Michael Steinlechner
! **************************************************************************************************
   SUBROUTINE hfx_permute_binning(nBins, costvector, maxbinsize, perm, block_count)

      INTEGER, INTENT(IN)                                :: nBins, maxbinsize
      REAL(dp), DIMENSION(maxbinsize), INTENT(IN)        :: costvector
      INTEGER, DIMENSION(maxbinsize), INTENT(OUT)        :: perm
      INTEGER, DIMENSION(nBins), INTENT(OUT)             :: block_count

      INTEGER                                            :: i, j, mod_idx, offset
      INTEGER, DIMENSION(nBins, maxbinsize)              :: bin
      INTEGER, DIMENSION(nBins)                          :: bin_idx
      INTEGER, DIMENSION(maxbinsize)                     :: idx
      REAL(dp), DIMENSION(maxbinsize)                    :: vec
      REAL(dp), DIMENSION(nBins)                         :: bincosts

! be careful not to change costvector (copy it!)

      vec = costvector
      block_count = 0
      bincosts = 0

      !sort the array (ascending)
      CALL sort(vec, maxbinsize, idx)

      ! count the loop down to distribute the largest cols/rows first
      DO i = maxbinsize, 1, -1
         IF (vec(i) == 0) THEN
            ! spread zero-cost col/rows evenly among procs
            mod_idx = MODULO(i, nBins) + 1 !(note the fortran offset by one!)
            block_count(mod_idx) = block_count(mod_idx) + 1
            bin(mod_idx, block_count(mod_idx)) = idx(i)
         ELSE
            ! sort the bins so that the one with the lowest cost is at the
            ! first place, where we then assign the current col/row
            CALL sort(bincosts, nBins, bin_idx)
            block_count = block_count(bin_idx)
            bin = bin(bin_idx, :)

            bincosts(1) = bincosts(1) + vec(i)
            block_count(1) = block_count(1) + 1
            bin(1, block_count(1)) = idx(i)
         END IF
      END DO

      ! construct permutation vector from the binning
      offset = 0
      DO i = 1, nBins
         DO j = 1, block_count(i)
            perm(offset + j) = bin(i, j)
         END DO
         offset = offset + block_count(i)
      END DO

   END SUBROUTINE hfx_permute_binning

! **************************************************************************************************
!> \brief Cheap way of redistributing the eri's
!> \param x_data Object that stores the indices array
!> \param para_env para_env
!> \param load_balance_parameter contains parmameter for Monte-Carlo routines
!> \param i_thread current thread ID
!> \param n_threads Total Number of threads
!> \param eval_type ...
!> \par History
!>      12.2007 created [Manuel Guidon]
!>      02.2009 optimize Memory Usage [Manuel Guidon]
!> \author Manuel Guidon
!> \note
!>      The cost matrix is given by the walltime for each bin that is measured
!>      during the calculation
! **************************************************************************************************
   SUBROUTINE hfx_update_load_balance(x_data, para_env, &
                                      load_balance_parameter, &
                                      i_thread, n_threads, eval_type)

      TYPE(hfx_type), POINTER                            :: x_data
      TYPE(cp_para_env_type), POINTER                    :: para_env
      TYPE(hfx_load_balance_type)                        :: load_balance_parameter
      INTEGER, INTENT(IN)                                :: i_thread, n_threads, eval_type

      CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_update_load_balance', &
         routineP = moduleN//':'//routineN

      INTEGER :: data_from, dest, end_idx, handle, i, ibin, icpu, iprocess, j, mepos, my_bin_size, &
         my_global_start_idx, my_process_id, n_processes, nbins, ncpu, req(2), source, start_idx
      INTEGER(int_8), DIMENSION(:), POINTER              :: local_cost_matrix, recbuffer, &
                                                            sendbuffer, swapbuffer
      INTEGER(int_8), DIMENSION(:), POINTER, SAVE        :: cost_matrix
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: tmp_pos
      INTEGER, ALLOCATABLE, DIMENSION(:), SAVE           :: bins_per_rank
      INTEGER, ALLOCATABLE, DIMENSION(:, :), SAVE        :: bin_histogram
      INTEGER, DIMENSION(:), POINTER, SAVE               :: shm_distribution_vector
      INTEGER, SAVE                                      :: max_bin_size
      TYPE(hfx_distribution), DIMENSION(:), POINTER      :: binned_dist, ptr_to_tmp_dist, tmp_dist
      TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
         SAVE                                            :: full_dist

!$OMP BARRIER
!$OMP MASTER
      CALL timeset(routineN, handle)
!$OMP END MASTER
!$OMP BARRIER

      ncpu = para_env%num_pe
      n_processes = ncpu*n_threads
      !! If there is only 1 cpu skip the binning
      IF (n_processes == 1) THEN
         ALLOCATE (tmp_dist(1))
         tmp_dist(1)%number_of_atom_quartets = HUGE(tmp_dist(1)%number_of_atom_quartets)
         tmp_dist(1)%istart = 0_int_8
         ptr_to_tmp_dist => tmp_dist(:)
         SELECT CASE (eval_type)
         CASE (hfx_do_eval_energy)
            CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
         CASE (hfx_do_eval_forces)
            CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
         END SELECT
         DEALLOCATE (tmp_dist)
      ELSE
         mepos = para_env%mepos
         my_process_id = para_env%mepos*n_threads + i_thread
         nbins = load_balance_parameter%nbins
!$OMP MASTER
         ALLOCATE (bin_histogram(n_processes, 2))
         bin_histogram = 0
!$OMP END MASTER
!$OMP BARRIER
         SELECT CASE (eval_type)
         CASE (hfx_do_eval_energy)
            my_bin_size = SIZE(x_data%distribution_energy)
         CASE (hfx_do_eval_forces)
            my_bin_size = SIZE(x_data%distribution_forces)
         END SELECT
         bin_histogram(my_process_id + 1, 1) = my_bin_size
!$OMP BARRIER
!$OMP MASTER
         CALL mp_sum(bin_histogram(:, 1), para_env%group)
         bin_histogram(1, 2) = bin_histogram(1, 1)
         DO iprocess = 2, n_processes
            bin_histogram(iprocess, 2) = bin_histogram(iprocess - 1, 2) + bin_histogram(iprocess, 1)
         END DO

         max_bin_size = MAXVAL(bin_histogram(para_env%mepos*n_threads + 1:para_env%mepos*n_threads + n_threads, 1))
         CALL mp_max(max_bin_size, para_env%group)
!$OMP END MASTER
!$OMP BARRIER
         ALLOCATE (binned_dist(my_bin_size))
         !! Use old binned_dist, but with timings cost
         SELECT CASE (eval_type)
         CASE (hfx_do_eval_energy)
            binned_dist = x_data%distribution_energy
         CASE (hfx_do_eval_forces)
            binned_dist = x_data%distribution_forces
         END SELECT

         DO ibin = 1, my_bin_size
            IF (binned_dist(ibin)%number_of_atom_quartets == 0) THEN
               binned_dist(ibin)%cost = 0
            ELSE
               SELECT CASE (eval_type)
               CASE (hfx_do_eval_energy)
                  IF (.NOT. load_balance_parameter%rtp_redistribute) THEN
                     binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_first_scf + &
                                                   binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
                  ELSE
                     binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
                  END IF
               CASE (hfx_do_eval_forces)
                  binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_forces)*10000.0_dp, int_8)
               END SELECT
            END IF
         END DO
!$OMP BARRIER
!$OMP MASTER
         !! store all local results in a big cost matrix
         ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
         cost_matrix = 0
         ALLOCATE (sendbuffer(max_bin_size*n_threads))
         ALLOCATE (recbuffer(max_bin_size*n_threads))
!$OMP END MASTER
!$OMP BARRIER
         my_global_start_idx = bin_histogram(my_process_id + 1, 2) - my_bin_size
         icpu = para_env%mepos + 1
         DO i = 1, my_bin_size
            cost_matrix(my_global_start_idx + i) = binned_dist(i)%cost
         END DO

         mepos = para_env%mepos
!$OMP BARRIER
!$OMP MASTER
         ALLOCATE (bins_per_rank(ncpu))
         bins_per_rank = 0
         DO icpu = 1, ncpu
            bins_per_rank(icpu) = SUM(bin_histogram((icpu - 1)*n_threads + 1:(icpu - 1)*n_threads + n_threads, 1))
         END DO
         sendbuffer(1:bins_per_rank(para_env%mepos + 1)) = &
            cost_matrix(my_global_start_idx + 1:my_global_start_idx + bins_per_rank(para_env%mepos + 1))

         dest = MODULO(mepos + 1, ncpu)
         source = MODULO(mepos - 1, ncpu)
         ! sync before/after ring of isendrecv
         CALL mp_sync(para_env%group)
         DO icpu = 0, ncpu - 1
            IF (icpu .NE. ncpu - 1) THEN
               CALL mp_isendrecv(sendbuffer, dest, recbuffer, source, &
                                 para_env%group, req(1), req(2), 13)
            ENDIF
            data_from = MODULO(mepos - icpu, ncpu)
            start_idx = SUM(bins_per_rank(1:data_from + 1)) - bins_per_rank(data_from + 1) + 1
            end_idx = start_idx + bins_per_rank(data_from + 1) - 1
            cost_matrix(start_idx:end_idx) = sendbuffer(1:end_idx - start_idx + 1)

            IF (icpu .NE. ncpu - 1) THEN
               CALL mp_waitall(req)
            ENDIF
            swapbuffer => sendbuffer
            sendbuffer => recbuffer
            recbuffer => swapbuffer
         ENDDO
         DEALLOCATE (recbuffer, sendbuffer)
         ! sync before/after ring of isendrecv
         CALL mp_sync(para_env%group)
!$OMP END MASTER
!$OMP BARRIER
         ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
         local_cost_matrix = cost_matrix
!$OMP MASTER
         ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
         CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
                                    shm_distribution_vector, x_data%load_balance_parameter%do_randomize)

         ALLOCATE (full_dist(ncpu*n_threads, max_bin_size))

         full_dist(:, :)%istart = 0_int_8
         full_dist(:, :)%number_of_atom_quartets = 0_int_8
         full_dist(:, :)%cost = 0_int_8
         full_dist(:, :)%time_first_scf = 0.0_dp
         full_dist(:, :)%time_other_scf = 0.0_dp
         full_dist(:, :)%time_forces = 0.0_dp
!$OMP END MASTER

!$OMP BARRIER
         mepos = para_env%mepos + 1
         full_dist((mepos - 1)*n_threads + i_thread + 1, 1:my_bin_size) = binned_dist(1:my_bin_size)
!$OMP BARRIER
!$OMP MASTER
         ALLOCATE (sendbuffer(3*max_bin_size*n_threads))
         ALLOCATE (recbuffer(3*max_bin_size*n_threads))
         mepos = para_env%mepos
         DO j = 1, n_threads
            DO i = 1, max_bin_size
               sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
               sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
               sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
            END DO
         END DO
         dest = MODULO(mepos + 1, ncpu)
         source = MODULO(mepos - 1, ncpu)
         ! sync before/after ring of isendrecv
         CALL mp_sync(para_env%group)
         DO icpu = 0, ncpu - 1
            IF (icpu .NE. ncpu - 1) THEN
               CALL mp_isendrecv(sendbuffer, dest, recbuffer, source, &
                                 para_env%group, req(1), req(2), 13)
            ENDIF
            data_from = MODULO(mepos - icpu, ncpu)
            DO j = 1, n_threads
               DO i = 1, max_bin_size
                  full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1)
                  full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2)
                  full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3)
               END DO
            END DO

            IF (icpu .NE. ncpu - 1) THEN
               CALL mp_waitall(req)
            ENDIF
            swapbuffer => sendbuffer
            sendbuffer => recbuffer
            recbuffer => swapbuffer
         ENDDO
         ! sync before/after ring of isendrecv
         DEALLOCATE (recbuffer, sendbuffer)
         CALL mp_sync(para_env%group)
!$OMP END MASTER
!$OMP BARRIER
         !! reorder the distribution according to the distribution vector
         ALLOCATE (tmp_pos(ncpu*n_threads))
         tmp_pos = 1
         ALLOCATE (tmp_dist(nbins*ncpu*n_threads))

         tmp_dist(:)%istart = 0_int_8
         tmp_dist(:)%number_of_atom_quartets = 0_int_8
         tmp_dist(:)%cost = 0_int_8
         tmp_dist(:)%time_first_scf = 0.0_dp
         tmp_dist(:)%time_other_scf = 0.0_dp
         tmp_dist(:)%time_forces = 0.0_dp

         mepos = my_process_id + 1
         DO icpu = 1, n_processes
            DO i = 1, bin_histogram(icpu, 1)
               IF (shm_distribution_vector(bin_histogram(icpu, 2) - bin_histogram(icpu, 1) + i) == mepos) THEN
                  tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
                  tmp_pos(mepos) = tmp_pos(mepos) + 1
               END IF
            END DO
         END DO

         !! Assign the load to each process
         NULLIFY (ptr_to_tmp_dist)
         mepos = my_process_id + 1
         ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
         SELECT CASE (eval_type)
         CASE (hfx_do_eval_energy)
            CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
         CASE (hfx_do_eval_forces)
            CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
         END SELECT

!$OMP BARRIER
!$OMP MASTER
         DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
         DEALLOCATE (bins_per_rank, bin_histogram)
!$OMP END MASTER
!$OMP BARRIER
         DEALLOCATE (tmp_dist, tmp_pos)
         DEALLOCATE (binned_dist, local_cost_matrix)
      END IF
!$OMP BARRIER
!$OMP MASTER
      CALL timestop(handle)
!$OMP END MASTER
!$OMP BARRIER

   END SUBROUTINE hfx_update_load_balance

! **************************************************************************************************
!> \brief estimates the cost of a set quartet with info available at load balance time
!>        i.e. without much info on the primitives primitives
!> \param nsa ...
!> \param nsb ...
!> \param nsc ...
!> \param nsd ...
!> \param npgfa ...
!> \param npgfb ...
!> \param npgfc ...
!> \param npgfd ...
!> \param ratio ...
!> \param p1 ...
!> \param p2 ...
!> \param p3 ...
!> \return ...
!> \par History
!>      08.2009 created Joost VandeVondele
!> \author Joost VandeVondele
! **************************************************************************************************
   FUNCTION cost_model(nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd, ratio, p1, p2, p3) RESULT(res)
      IMPLICIT NONE
      REAL(KIND=dp) :: estimate1, estimate2, estimate, ratio, switch, mu, sigma
      INTEGER(KIND=int_8) :: res
      REAL(KIND=dp), INTENT(IN) :: p1(12), p2(12), p3(2)

      INTEGER   :: nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd

      estimate1 = estimate_basic(p1)
      estimate2 = estimate_basic(p2)
      mu = LOG(ABS(1.0E6_dp*p3(1)) + 1)
      sigma = p3(2)*0.1_dp*mu
      switch = 1.0_dp/(1.0_dp + EXP((LOG(estimate1) - mu)/sigma))
      estimate = estimate1*(1.0_dp - switch) + estimate2*switch
      res = INT(estimate*0.001_dp, KIND=int_8) + 1

   CONTAINS

! **************************************************************************************************
!> \brief ...
!> \param p ...
!> \return ...
! **************************************************************************************************
      REAL(KIND=dp) FUNCTION estimate_basic(p) RESULT(res)
      REAL(KIND=dp)                                      :: p(12)

      REAL(KIND=dp)                                      :: p1, p10, p11, p12, p2, p3, p4, p5, p6, &
                                                            p7, p8, p9

         p1 = p(1); p2 = p(2); p3 = p(3); p4 = p(4)
         p5 = p(5); p6 = p(6); p7 = p(7); p8 = p(8)
         p9 = p(9); p10 = p(10); p11 = p(11); p12 = p(12)
         res = poly2(nsa, p1, p2, p3)*poly2(nsb, p1, p2, p3)*poly2(nsc, p1, p2, p3)*poly2(nsd, p1, p2, p3)* &
               poly2(npgfa, p4, p5, p6)*poly2(npgfb, p4, p5, p6)*poly2(npgfc, p4, p5, p6)* &
               poly2(npgfd, p4, p5, p6)*EXP(-p7*ratio + p8*ratio**2) + &
              1000.0_dp*p9 + poly2(nsa, p10, p11, p12)*poly2(nsb, p10, p11, p12)*poly2(nsc, p10, p11, p12)*poly2(nsd, p10, p11, p12)
         res = 1 + ABS(res)
      END FUNCTION estimate_basic

! **************************************************************************************************
!> \brief ...
!> \param x ...
!> \param a0 ...
!> \param a1 ...
!> \param a2 ...
!> \return ...
! **************************************************************************************************
      REAL(KIND=dp) FUNCTION poly2(x, a0, a1, a2)
      INTEGER                                            :: x
      REAL(KIND=dp)                                      :: a0, a1, a2

         poly2 = a0 + a1*x + a2*x*x
      END FUNCTION poly2

   END FUNCTION cost_model
! **************************************************************************************************
!> \brief Minimizes the maximum cost per cpu by shuffling around all bins
!> \param total_number_of_bins ...
!> \param number_of_processes ...
!> \param bin_costs costs per bin
!> \param distribution_vector will contain the final distribution
!> \param do_randomize ...
!> \par History
!>      03.2009 created from a hack by Joost [Manuel Guidon]
!> \author Manuel Guidon
! **************************************************************************************************
   SUBROUTINE optimize_distribution(total_number_of_bins, number_of_processes, bin_costs, &
                                    distribution_vector, do_randomize)
      INTEGER                                            :: total_number_of_bins, number_of_processes
      INTEGER(int_8), DIMENSION(:), POINTER              :: bin_costs
      INTEGER, DIMENSION(:), POINTER                     :: distribution_vector
      LOGICAL, INTENT(IN)                                :: do_randomize

      CHARACTER(LEN=*), PARAMETER :: routineN = 'optimize_distribution', &
                                     routineP = moduleN//':'//routineN

      INTEGER                                            :: i, itmp, j, nstep
      INTEGER(int_8), DIMENSION(:), POINTER              :: my_cost_cpu, tmp_cost, tmp_cpu_cost
      INTEGER, DIMENSION(:), POINTER                     :: tmp_cpu_index, tmp_index
      TYPE(rng_stream_type), POINTER                     :: rng_stream

      nstep = MAX(1, INT(number_of_processes)/2)

      ALLOCATE (tmp_cost(total_number_of_bins))
      ALLOCATE (tmp_index(total_number_of_bins))
      ALLOCATE (tmp_cpu_cost(number_of_processes))
      ALLOCATE (tmp_cpu_index(number_of_processes))
      ALLOCATE (my_cost_cpu(number_of_processes))
      tmp_cost = bin_costs

      CALL sort(tmp_cost, total_number_of_bins, tmp_index)
      my_cost_cpu = 0
      !
      ! assign the largest remaining bin to the CPU with the smallest load
      ! gives near perfect distributions for a sufficient number of bins ...
      ! doing this in chunks of nstep (where nstep ~ number_of_processes) makes this n log n and gives
      ! each cpu a similar number of tasks.
      ! it also avoids degenerate cases where thousands of zero sized tasks
      ! are assigned to the same (least loaded) cpu
      !
      IF (do_randomize) THEN
         NULLIFY (rng_stream)
         CALL create_rng_stream(rng_stream=rng_stream, &
                                name="uniform_rng", &
                                distribution_type=UNIFORM)
      END IF

      DO i = total_number_of_bins, 1, -nstep
         tmp_cpu_cost = my_cost_cpu
         CALL sort(tmp_cpu_cost, INT(number_of_processes), tmp_cpu_index)
         IF (do_randomize) THEN
            CALL reshuffle(MIN(i, nstep), tmp_cpu_index(1:MIN(i, nstep)), rng_stream)
         END IF
         DO j = 1, MIN(i, nstep)
            itmp = tmp_cpu_index(j)
            distribution_vector(tmp_index(i - j + 1)) = itmp
            my_cost_cpu(itmp) = my_cost_cpu(itmp) + bin_costs(tmp_index(i - j + 1))
         ENDDO
      ENDDO

      IF (do_randomize) THEN
         CALL delete_rng_stream(rng_stream)
      END IF

      DEALLOCATE (tmp_cost, tmp_index, tmp_cpu_cost)
      DEALLOCATE (tmp_cpu_index, my_cost_cpu)
   END SUBROUTINE optimize_distribution

! **************************************************************************************************
!> \brief Given a 2d index pair, this function returns a 1d index pair for
!>        a symmetric upper triangle NxN matrix
!>        The compiler should inline this function, therefore it appears in
!>        several modules
!> \param i 2d index
!> \param j 2d index
!> \param N matrix size
!> \return ...
!> \par History
!>      03.2009 created [Manuel Guidon]
!> \author Manuel Guidon
! **************************************************************************************************
   PURE FUNCTION get_1D_idx(i, j, N)
      INTEGER, INTENT(IN)                                :: i, j
      INTEGER(int_8), INTENT(IN)                         :: N
      INTEGER(int_8)                                     :: get_1D_idx

      INTEGER(int_8)                                     :: min_ij

      min_ij = MIN(i, j)
      get_1D_idx = min_ij*N + MAX(i, j) - (min_ij - 1)*min_ij/2 - N

   END FUNCTION get_1D_idx

! **************************************************************************************************
!> \brief ...
!> \param natom ...
!> \param nkind ...
!> \param list_ij ...
!> \param list_kl ...
!> \param set_list_ij ...
!> \param set_list_kl ...
!> \param iatom_start ...
!> \param iatom_end ...
!> \param jatom_start ...
!> \param jatom_end ...
!> \param katom_start ...
!> \param katom_end ...
!> \param latom_start ...
!> \param latom_end ...
!> \param particle_set ...
!> \param coeffs_set ...
!> \param coeffs_kind ...
!> \param is_assoc_atomic_block_global ...
!> \param do_periodic ...
!> \param kind_of ...
!> \param basis_parameter ...
!> \param pmax_set ...
!> \param pmax_atom ...
!> \param pmax_blocks ...
!> \param cell ...
!> \param do_p_screening ...
!> \param map_atom_to_kind_atom ...
!> \param eval_type ...
!> \param log10_eps_schwarz ...
!> \param log_2 ...
!> \param coeffs_kind_max0 ...
!> \param use_virial ...
!> \param atomic_pair_list ...
!> \return ...
! **************************************************************************************************
   FUNCTION estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
                                iatom_start, iatom_end, jatom_start, jatom_end, &
                                katom_start, katom_end, latom_start, latom_end, &
                                particle_set, &
                                coeffs_set, coeffs_kind, &
                                is_assoc_atomic_block_global, do_periodic, &
                                kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
                                cell, &
                                do_p_screening, map_atom_to_kind_atom, eval_type, &
                                log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
                                atomic_pair_list)

      INTEGER, INTENT(IN)                                :: natom, nkind
      TYPE(pair_list_type)                               :: list_ij, list_kl
      TYPE(pair_set_list_type), DIMENSION(:)             :: set_list_ij, set_list_kl
      INTEGER, INTENT(IN)                                :: iatom_start, iatom_end, jatom_start, &
                                                            jatom_end, katom_start, katom_end, &
                                                            latom_start, latom_end
      TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
      TYPE(hfx_screen_coeff_type), &
         DIMENSION(:, :, :, :), POINTER                  :: coeffs_set
      TYPE(hfx_screen_coeff_type), &
         DIMENSION(nkind, nkind)                         :: coeffs_kind
      INTEGER, DIMENSION(:, :)                           :: is_assoc_atomic_block_global
      LOGICAL                                            :: do_periodic
      INTEGER                                            :: kind_of(*)
      TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter
      TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: pmax_set
      REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_atom
      REAL(dp)                                           :: pmax_blocks
      TYPE(cell_type), POINTER                           :: cell
      LOGICAL, INTENT(IN)                                :: do_p_screening
      INTEGER, DIMENSION(:), POINTER                     :: map_atom_to_kind_atom
      INTEGER, INTENT(IN)                                :: eval_type
      REAL(dp)                                           :: log10_eps_schwarz, log_2, &
                                                            coeffs_kind_max0
      LOGICAL, INTENT(IN)                                :: use_virial
      LOGICAL, DIMENSION(natom, natom)                   :: atomic_pair_list
      INTEGER(int_8)                                     :: estimate_block_cost

      INTEGER :: i_list_ij, i_list_kl, i_set_list_ij, i_set_list_ij_start, i_set_list_ij_stop, &
         i_set_list_kl, i_set_list_kl_start, i_set_list_kl_stop, iatom, ikind, iset, jatom, jkind, &
         jset, katom, kind_kind_idx, kkind, kset, latom, lkind, lset, swap_id
      INTEGER, DIMENSION(:), POINTER                     :: npgfa, npgfb, npgfc, npgfd, nsgfa, &
                                                            nsgfb, nsgfc, nsgfd
      REAL(dp)                                           :: actual_pmax_atom, cost_tmp, max_val1, &
                                                            max_val2, pmax_entry, rab2, rcd2, &
                                                            screen_kind_ij, screen_kind_kl
      REAL(dp), DIMENSION(:, :), POINTER                 :: ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4

      estimate_block_cost = 0_int_8

      CALL build_pair_list(natom, list_ij, set_list_ij, iatom_start, iatom_end, jatom_start, jatom_end, &
                           kind_of, basis_parameter, particle_set, &
                           do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
                           log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)

      CALL build_pair_list(natom, list_kl, set_list_kl, katom_start, katom_end, latom_start, latom_end, &
                           kind_of, basis_parameter, particle_set, &
                           do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
                           log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)

      DO i_list_ij = 1, list_ij%n_element
         iatom = list_ij%elements(i_list_ij)%pair(1)
         jatom = list_ij%elements(i_list_ij)%pair(2)
         i_set_list_ij_start = list_ij%elements(i_list_ij)%set_bounds(1)
         i_set_list_ij_stop = list_ij%elements(i_list_ij)%set_bounds(2)
         ikind = list_ij%elements(i_list_ij)%kind_pair(1)
         jkind = list_ij%elements(i_list_ij)%kind_pair(2)
         rab2 = list_ij%elements(i_list_ij)%dist2

         nsgfa => basis_parameter(ikind)%nsgf
         nsgfb => basis_parameter(jkind)%nsgf
         npgfa => basis_parameter(ikind)%npgf
         npgfb => basis_parameter(jkind)%npgf

         DO i_list_kl = 1, list_kl%n_element

            katom = list_kl%elements(i_list_kl)%pair(1)
            latom = list_kl%elements(i_list_kl)%pair(2)

            IF (.NOT. (katom + latom <= iatom + jatom)) CYCLE
            IF (((iatom + jatom) .EQ. (katom + latom)) .AND. (katom < iatom)) CYCLE

            IF (eval_type == hfx_do_eval_forces) THEN
               IF (.NOT. use_virial) THEN
                  IF ((iatom == jatom .AND. iatom == katom .AND. iatom == latom)) CYCLE
               END IF
            END IF

            i_set_list_kl_start = list_kl%elements(i_list_kl)%set_bounds(1)
            i_set_list_kl_stop = list_kl%elements(i_list_kl)%set_bounds(2)
            kkind = list_kl%elements(i_list_kl)%kind_pair(1)
            lkind = list_kl%elements(i_list_kl)%kind_pair(2)
            rcd2 = list_kl%elements(i_list_kl)%dist2

            nsgfc => basis_parameter(kkind)%nsgf
            nsgfd => basis_parameter(lkind)%nsgf
            npgfc => basis_parameter(kkind)%npgf
            npgfd => basis_parameter(lkind)%npgf

            IF (do_p_screening) THEN
               actual_pmax_atom = MAX(pmax_atom(katom, iatom), &
                                      pmax_atom(latom, jatom), &
                                      pmax_atom(latom, iatom), &
                                      pmax_atom(katom, jatom))
            ELSE
               actual_pmax_atom = 0.0_dp
            END IF

            screen_kind_ij = coeffs_kind(jkind, ikind)%x(1)*rab2 + &
                             coeffs_kind(jkind, ikind)%x(2)
            screen_kind_kl = coeffs_kind(lkind, kkind)%x(1)*rcd2 + &
                             coeffs_kind(lkind, kkind)%x(2)
            IF (screen_kind_ij + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) CYCLE

            IF (.NOT. (is_assoc_atomic_block_global(latom, iatom) >= 1 .AND. &
                       is_assoc_atomic_block_global(katom, iatom) >= 1 .AND. &
                       is_assoc_atomic_block_global(katom, jatom) >= 1 .AND. &
                       is_assoc_atomic_block_global(latom, jatom) >= 1)) CYCLE

            IF (do_p_screening) THEN
               SELECT CASE (eval_type)
               CASE (hfx_do_eval_energy)
                  swap_id = 0
                  kind_kind_idx = INT(get_1D_idx(kkind, ikind, INT(nkind, int_8)))
                  IF (ikind >= kkind) THEN
                     ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
                                                               map_atom_to_kind_atom(katom), &
                                                               map_atom_to_kind_atom(iatom))
                  ELSE
                     ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
                                                               map_atom_to_kind_atom(iatom), &
                                                               map_atom_to_kind_atom(katom))
                     swap_id = swap_id + 1
                  END IF
                  kind_kind_idx = INT(get_1D_idx(lkind, jkind, INT(nkind, int_8)))
                  IF (jkind >= lkind) THEN
                     ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
                                                               map_atom_to_kind_atom(latom), &
                                                               map_atom_to_kind_atom(jatom))
                  ELSE
                     ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
                                                               map_atom_to_kind_atom(jatom), &
                                                               map_atom_to_kind_atom(latom))
                     swap_id = swap_id + 2
                  END IF
                  kind_kind_idx = INT(get_1D_idx(lkind, ikind, INT(nkind, int_8)))
                  IF (ikind >= lkind) THEN
                     ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
                                                               map_atom_to_kind_atom(latom), &
                                                               map_atom_to_kind_atom(iatom))
                  ELSE
                     ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
                                                               map_atom_to_kind_atom(iatom), &
                                                               map_atom_to_kind_atom(latom))
                     swap_id = swap_id + 4
                  END IF
                  kind_kind_idx = INT(get_1D_idx(kkind, jkind, INT(nkind, int_8)))
                  IF (jkind >= kkind) THEN
                     ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
                                                               map_atom_to_kind_atom(katom), &
                                                               map_atom_to_kind_atom(jatom))
                  ELSE
                     ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
                                                               map_atom_to_kind_atom(jatom), &
                                                               map_atom_to_kind_atom(katom))
                     swap_id = swap_id + 8
                  END IF
               CASE (hfx_do_eval_forces)
                  swap_id = 16
                  kind_kind_idx = INT(get_1D_idx(kkind, ikind, INT(nkind, int_8)))
                  IF (ikind >= kkind) THEN
                     ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
                                                               map_atom_to_kind_atom(katom), &
                                                               map_atom_to_kind_atom(iatom))
                  ELSE
                     ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
                                                               map_atom_to_kind_atom(iatom), &
                                                               map_atom_to_kind_atom(katom))
                     swap_id = swap_id + 1
                  END IF
                  kind_kind_idx = INT(get_1D_idx(lkind, jkind, INT(nkind, int_8)))
                  IF (jkind >= lkind) THEN
                     ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
                                                               map_atom_to_kind_atom(latom), &
                                                               map_atom_to_kind_atom(jatom))
                  ELSE
                     ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
                                                               map_atom_to_kind_atom(jatom), &
                                                               map_atom_to_kind_atom(latom))
                     swap_id = swap_id + 2
                  END IF
                  kind_kind_idx = INT(get_1D_idx(lkind, ikind, INT(nkind, int_8)))
                  IF (ikind >= lkind) THEN
                     ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
                                                               map_atom_to_kind_atom(latom), &
                                                               map_atom_to_kind_atom(iatom))
                  ELSE
                     ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
                                                               map_atom_to_kind_atom(iatom), &
                                                               map_atom_to_kind_atom(latom))
                     swap_id = swap_id + 4
                  END IF
                  kind_kind_idx = INT(get_1D_idx(kkind, jkind, INT(nkind, int_8)))
                  IF (jkind >= kkind) THEN
                     ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
                                                               map_atom_to_kind_atom(katom), &
                                                               map_atom_to_kind_atom(jatom))
                  ELSE
                     ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
                                                               map_atom_to_kind_atom(jatom), &
                                                               map_atom_to_kind_atom(katom))
                     swap_id = swap_id + 8
                  END IF
               END SELECT
            END IF

            DO i_set_list_ij = i_set_list_ij_start, i_set_list_ij_stop
               iset = set_list_ij(i_set_list_ij)%pair(1)
               jset = set_list_ij(i_set_list_ij)%pair(2)

               max_val1 = coeffs_set(jset, iset, jkind, ikind)%x(1)*rab2 + &
                          coeffs_set(jset, iset, jkind, ikind)%x(2)

               IF (max_val1 + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) CYCLE
               DO i_set_list_kl = i_set_list_kl_start, i_set_list_kl_stop
                  kset = set_list_kl(i_set_list_kl)%pair(1)
                  lset = set_list_kl(i_set_list_kl)%pair(2)

                  max_val2 = max_val1 + (coeffs_set(lset, kset, lkind, kkind)%x(1)*rcd2 + &
                                         coeffs_set(lset, kset, lkind, kkind)%x(2))

                  IF (max_val2 + actual_pmax_atom < log10_eps_schwarz) CYCLE
                  IF (do_p_screening) THEN
                     CALL get_pmax_val(ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, &
                                       iset, jset, kset, lset, &
                                       pmax_entry, swap_id)
                     IF (eval_type == hfx_do_eval_forces) THEN
                        pmax_entry = log_2 + pmax_entry
                     END IF
                  ELSE
                     pmax_entry = 0.0_dp
                  END IF
                  max_val2 = max_val2 + pmax_entry
                  IF (max_val2 < log10_eps_schwarz) CYCLE
                  SELECT CASE (eval_type)
                  CASE (hfx_do_eval_energy)
                     cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
                                           npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
                                           max_val2/log10_eps_schwarz, &
                                           p1_energy, p2_energy, p3_energy)
                     estimate_block_cost = estimate_block_cost + INT(cost_tmp, KIND=int_8)
                  CASE (hfx_do_eval_forces)
                     cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
                                           npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
                                           max_val2/log10_eps_schwarz, &
                                           p1_forces, p2_forces, p3_forces)
                     estimate_block_cost = estimate_block_cost + INT(cost_tmp, KIND=int_8)
                  END SELECT
               ENDDO ! i_set_list_kl
            ENDDO ! i_set_list_ij
         ENDDO ! i_list_kl
      ENDDO ! i_list_ij

   END FUNCTION estimate_block_cost

! **************************************************************************************************
!> \brief ...
!> \param nkind ...
!> \param para_env ...
!> \param natom ...
!> \param block_size ...
!> \param nblock ...
!> \param blocks ...
!> \param list_ij ...
!> \param list_kl ...
!> \param set_list_ij ...
!> \param set_list_kl ...
!> \param particle_set ...
!> \param coeffs_set ...
!> \param coeffs_kind ...
!> \param is_assoc_atomic_block_global ...
!> \param do_periodic ...
!> \param kind_of ...
!> \param basis_parameter ...
!> \param pmax_set ...
!> \param pmax_atom ...
!> \param pmax_blocks ...
!> \param cell ...
!> \param do_p_screening ...
!> \param map_atom_to_kind_atom ...
!> \param eval_type ...
!> \param log10_eps_schwarz ...
!> \param log_2 ...
!> \param coeffs_kind_max0 ...
!> \param use_virial ...
!> \param atomic_pair_list ...
! **************************************************************************************************
   SUBROUTINE init_blocks(nkind, para_env, natom, block_size, nblock, blocks, &
                          list_ij, list_kl, set_list_ij, set_list_kl, &
                          particle_set, &
                          coeffs_set, coeffs_kind, &
                          is_assoc_atomic_block_global, do_periodic, &
                          kind_of, basis_parameter, pmax_set, pmax_atom, &
                          pmax_blocks, cell, &
                          do_p_screening, map_atom_to_kind_atom, eval_type, &
                          log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
                          atomic_pair_list)

      INTEGER, INTENT(IN)                                :: nkind
      TYPE(cp_para_env_type), POINTER                    :: para_env
      INTEGER                                            :: natom, block_size, nblock
      TYPE(hfx_block_range_type), DIMENSION(1:nblock)    :: blocks
      TYPE(pair_list_type)                               :: list_ij, list_kl
      TYPE(pair_set_list_type), DIMENSION(:)             :: set_list_ij, set_list_kl
      TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
      TYPE(hfx_screen_coeff_type), &
         DIMENSION(:, :, :, :), POINTER                  :: coeffs_set
      TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
         POINTER                                         :: coeffs_kind
      INTEGER, DIMENSION(:, :)                           :: is_assoc_atomic_block_global
      LOGICAL                                            :: do_periodic
      INTEGER                                            :: kind_of(*)
      TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter
      TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: pmax_set
      REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_atom
      REAL(dp)                                           :: pmax_blocks
      TYPE(cell_type), POINTER                           :: cell
      LOGICAL, INTENT(IN)                                :: do_p_screening
      INTEGER, DIMENSION(:), POINTER                     :: map_atom_to_kind_atom
      INTEGER, INTENT(IN)                                :: eval_type
      REAL(dp)                                           :: log10_eps_schwarz, log_2, &
                                                            coeffs_kind_max0
      LOGICAL, INTENT(IN)                                :: use_virial
      LOGICAL, DIMENSION(natom, natom)                   :: atomic_pair_list

      INTEGER                                            :: atom_block, i, iatom_block, iatom_end, &
                                                            iatom_start, my_cpu_rank, ncpus

      DO atom_block = 0, nblock - 1
         iatom_block = MODULO(atom_block, nblock) + 1
         iatom_start = (iatom_block - 1)*block_size + 1
         iatom_end = MIN(iatom_block*block_size, natom)
         blocks(atom_block + 1)%istart = iatom_start
         blocks(atom_block + 1)%iend = iatom_end
         blocks(atom_block + 1)%cost = 0_int_8
      END DO

      ncpus = para_env%num_pe
      my_cpu_rank = para_env%mepos
      DO i = 1, nblock
         IF (MODULO(i, ncpus) /= my_cpu_rank) THEN
            blocks(i)%istart = 0
            blocks(i)%iend = 0
            CYCLE
         END IF
         iatom_start = blocks(i)%istart
         iatom_end = blocks(i)%iend
         blocks(i)%cost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
                                              iatom_start, iatom_end, iatom_start, iatom_end, &
                                              iatom_start, iatom_end, iatom_start, iatom_end, &
                                              particle_set, &
                                              coeffs_set, coeffs_kind, &
                                              is_assoc_atomic_block_global, do_periodic, &
                                              kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
                                              cell, &
                                              do_p_screening, map_atom_to_kind_atom, eval_type, &
                                              log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)

      END DO
   END SUBROUTINE init_blocks

! **************************************************************************************************
!> \brief ...
!> \param para_env ...
!> \param x_data ...
!> \param iw ...
!> \param n_threads ...
!> \param i_thread ...
!> \param eval_type ...
! **************************************************************************************************
   SUBROUTINE collect_load_balance_info(para_env, x_data, iw, n_threads, i_thread, &
                                        eval_type)

      TYPE(cp_para_env_type), POINTER                    :: para_env
      TYPE(hfx_type), POINTER                            :: x_data
      INTEGER, INTENT(IN)                                :: iw, n_threads, i_thread, eval_type

      CHARACTER(LEN=*), PARAMETER :: routineN = 'collect_load_balance_info', &
         routineP = moduleN//':'//routineN

      INTEGER                                            :: i, j, k, my_rank, nbins, nranks, &
                                                            total_bins
      INTEGER(int_8)                                     :: avg_bin, avg_rank, max_bin, max_rank, &
                                                            min_bin, min_rank, sum_bin, sum_rank
      INTEGER(int_8), ALLOCATABLE, DIMENSION(:)          :: buffer, buffer_in, buffer_out, summary
      INTEGER(int_8), ALLOCATABLE, DIMENSION(:), SAVE    :: shm_cost_vector
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: bins_per_rank, rdispl, sort_idx
      INTEGER, ALLOCATABLE, DIMENSION(:), SAVE           :: shm_bins_per_rank, shm_displ

      SELECT CASE (eval_type)
      CASE (hfx_do_eval_energy)
         nbins = SIZE(x_data%distribution_energy)
      CASE (hfx_do_eval_forces)
         nbins = SIZE(x_data%distribution_forces)
      END SELECT

!$OMP MASTER
      ALLOCATE (shm_bins_per_rank(n_threads))
      ALLOCATE (shm_displ(n_threads + 1))
!$OMP END MASTER
!$OMP BARRIER

      shm_bins_per_rank(i_thread + 1) = nbins
!$OMP BARRIER
      nbins = 0
      DO i = 1, n_threads
         nbins = nbins + shm_bins_per_rank(i)
      END DO
      my_rank = para_env%mepos
      nranks = para_env%num_pe

!$OMP BARRIER
!$OMP MASTER
      ALLOCATE (bins_per_rank(nranks))
      bins_per_rank = 0

      bins_per_rank(my_rank + 1) = nbins

      CALL mp_sum(bins_per_rank, para_env%group)

      total_bins = 0
      DO i = 1, nranks
         total_bins = total_bins + bins_per_rank(i)
      END DO

      ALLOCATE (shm_cost_vector(2*total_bins))
      shm_cost_vector = -1_int_8
      shm_displ(1) = 1
      DO i = 2, n_threads
         shm_displ(i) = shm_displ(i - 1) + shm_bins_per_rank(i - 1)
      END DO
      shm_displ(n_threads + 1) = nbins + 1
!$OMP END MASTER
!$OMP BARRIER
      j = 0
      SELECT CASE (eval_type)
      CASE (hfx_do_eval_energy)
         DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
            j = j + 1
            shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_energy(j)%cost
            shm_cost_vector(2*i) = INT(x_data%distribution_energy(j)%time_first_scf*10000.0_dp, KIND=int_8)
         END DO
      CASE (hfx_do_eval_forces)
         DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
            j = j + 1
            shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_forces(j)%cost
            shm_cost_vector(2*i) = INT(x_data%distribution_forces(j)%time_forces*10000.0_dp, KIND=int_8)
         END DO
      END SELECT
!$OMP BARRIER
!$OMP MASTER
      ! ** calculate offsets
      ALLOCATE (rdispl(nranks))
      bins_per_rank(:) = bins_per_rank(:)*2
      rdispl(1) = 0
      DO i = 2, nranks
         rdispl(i) = rdispl(i - 1) + bins_per_rank(i - 1)
      END DO

      ALLOCATE (buffer_in(2*nbins))
      ALLOCATE (buffer_out(2*total_bins))

      DO i = 1, nbins
         buffer_in(2*(i - 1) + 1) = shm_cost_vector(2*(i - 1) + 1)
         buffer_in(2*i) = shm_cost_vector(2*i)
      END DO

      CALL mp_gatherv(buffer_in, buffer_out, bins_per_rank, rdispl, para_env%source, para_env%group)

      IF (iw > 0) THEN

         ALLOCATE (summary(2*nranks))
         summary = 0_int_8

         WRITE (iw, '( /, 1X, 79("-") )')
         WRITE (iw, '( " -", 77X, "-" )')
         SELECT CASE (eval_type)
         CASE (hfx_do_eval_energy)
            WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - ENERGY '
         CASE (hfx_do_eval_forces)
            WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - FORCES '
         END SELECT
         WRITE (iw, '( " -", 77X, "-" )')
         WRITE (iw, '( 1X, 79("-") )')

         WRITE (iw, FMT="(T3,A,T15,A,T35,A,T55,A)") "MPI RANK", "BIN #", "EST cost", "Processing time [s]"
         WRITE (iw, '( 1X, 79("-"), / )')
         k = 0
         DO i = 1, nranks
            DO j = 1, bins_per_rank(i)/2
               k = k + 1
               WRITE (iw, FMT="(T6,I5,T15,I5,T27,I16,T55,F19.8)") &
                  i - 1, j, buffer_out(2*(k - 1) + 1), REAL(buffer_out(2*k), dp)/10000.0_dp
               summary(2*(i - 1) + 1) = summary(2*(i - 1) + 1) + buffer_out(2*(k - 1) + 1)
               summary(2*i) = summary(2*i) + buffer_out(2*k)
            END DO
         END DO

         !** Summary
         max_bin = 0_int_8
         min_bin = HUGE(min_bin)
         sum_bin = 0_int_8
         DO i = 1, total_bins
            sum_bin = sum_bin + buffer_out(2*i)
            max_bin = MAX(max_bin, buffer_out(2*i))
            min_bin = MIN(min_bin, buffer_out(2*i))
         END DO
         avg_bin = sum_bin/total_bins

         max_rank = 0_int_8
         min_rank = HUGE(min_rank)
         sum_rank = 0_int_8
         DO i = 1, nranks
            sum_rank = sum_rank + summary(2*i)
            max_rank = MAX(max_rank, summary(2*i))
            min_rank = MIN(min_rank, summary(2*i))
         END DO
         avg_rank = sum_rank/nranks

         WRITE (iw, FMT='(/,T3,A,/)') "SUMMARY:"
         WRITE (iw, FMT="(T3,A,T35,F19.8)") "Max bin", REAL(max_bin, dp)/10000.0_dp
         WRITE (iw, FMT="(T3,A,T35,F19.8)") "Min bin", REAL(min_bin, dp)/10000.0_dp
         WRITE (iw, FMT="(T3,A,T35,F19.8)") "Sum bin", REAL(sum_bin, dp)/10000.0_dp
         WRITE (iw, FMT="(T3,A,T35,F19.8,/)") "Avg bin", REAL(avg_bin, dp)/10000.0_dp
         WRITE (iw, FMT="(T3,A,T35,F19.8)") "Max rank", REAL(max_rank, dp)/10000.0_dp
         WRITE (iw, FMT="(T3,A,T35,F19.8)") "Min rank", REAL(min_rank, dp)/10000.0_dp
         WRITE (iw, FMT="(T3,A,T35,F19.8)") "Sum rank", REAL(sum_rank, dp)/10000.0_dp
         WRITE (iw, FMT="(T3,A,T35,F19.8,/)") "Avg rank", REAL(avg_rank, dp)/10000.0_dp

         ALLOCATE (buffer(nranks))
         ALLOCATE (sort_idx(nranks))

         DO i = 1, nranks
            buffer(i) = summary(2*i)
         END DO

         CALL sort(buffer, nranks, sort_idx)

         WRITE (iw, FMT="(T3,A,T35,A,T55,A,/)") "MPI RANK", "EST cost", "Processing time [s]"
         DO i = nranks, 1, -1
       WRITE (iw, FMT="(T6,I5,T27,I16,T55,F19.8)") sort_idx(i) - 1, summary(2*(sort_idx(i) - 1) + 1), REAL(buffer(i), dp)/10000.0_dp
         END DO

         DEALLOCATE (summary, buffer, sort_idx)

      END IF

      DEALLOCATE (buffer_in, buffer_out, rdispl)

      CALL mp_sync(para_env%group)

      DEALLOCATE (shm_bins_per_rank, shm_displ, shm_cost_vector)
!$OMP END MASTER
!$OMP BARRIER

   END SUBROUTINE collect_load_balance_info

! **************************************************************************************************
!> \brief ...
!> \param size ...
!> \param array ...
!> \param rng_stream ...
! **************************************************************************************************
   SUBROUTINE reshuffle(size, array, rng_stream)
      INTEGER                                            :: size
      INTEGER, DIMENSION(size)                           :: array
      TYPE(rng_stream_type), POINTER                     :: rng_stream

      INTEGER                                            :: i, idx1, idx2, tmp
      REAL(dp)                                           :: x

      DO i = 1, size*10
         x = next_random_number(rng_stream)
         idx1 = INT(x*(size + 1 - 1)) + 1
         x = next_random_number(rng_stream)
         idx2 = INT(x*(size + 1 - 1)) + 1

         tmp = array(idx1)
         array(idx1) = array(idx2)
         array(idx2) = tmp
      END DO

   END SUBROUTINE

#include "hfx_get_pmax_val.f90"

END MODULE hfx_load_balance_methods